diff --git a/.astylerc b/.astylerc deleted file mode 100644 index 862e35f20a..0000000000 --- a/.astylerc +++ /dev/null @@ -1,12 +0,0 @@ -# libCEED formatting options for Artistic Style ---style=google ---indent=spaces=2 ---max-code-length=80 ---keep-one-line-statements ---keep-one-line-blocks ---indent-preproc-cond ---lineend=linux ---suffix=none ---preserve-date ---formatted ---align-pointer=name diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000000..cb780d6802 --- /dev/null +++ b/.clang-format @@ -0,0 +1,14 @@ +BasedOnStyle: "Google" + +AlignAfterOpenBracket: Align +AlignArrayOfStructures: Left +AlignConsecutiveAssignments: Consecutive +AlignConsecutiveDeclarations: Consecutive +AlignEscapedNewlines: true +AlignOperands: Align +AlignTrailingComments: true +AllowShortIfStatementsOnASingleLine: AllIfsAndElse +ColumnLimit: 150 +CommentPragmas: 'TESTARGS' +TabWidth: 4 +UseTab: Never diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 0000000000..b1c73be48c --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,2 @@ +Checks: "clang-diagnostic-*,clang-analyzer-*,readability-inconsistent-declaration-parameter-name" +HeaderFilterRegex: .* diff --git a/.github/workflows/c-fortran-test-style.yml b/.github/workflows/c-fortran-test-style.yml index a9b0ec3cda..3a0fec5250 100644 --- a/.github/workflows/c-fortran-test-style.yml +++ b/.github/workflows/c-fortran-test-style.yml @@ -15,12 +15,12 @@ jobs: steps: - name: Environment setup uses: actions/checkout@v3 - - name: Install astyle - run: sudo apt-get install astyle + - name: Install clang-format + run: sudo apt install clang-format-14 - name: C style env: CC: ${{ matrix.compiler }} FC: gfortran-11 run: | make info - make style-c -j2 && git diff --exit-code + make format-c -j2 && git diff --exit-code diff --git a/.github/workflows/python-test-with-style.yml b/.github/workflows/python-test-with-style.yml index e682cf79bd..5e4e71ccd2 100644 --- a/.github/workflows/python-test-with-style.yml +++ 
b/.github/workflows/python-test-with-style.yml @@ -41,4 +41,4 @@ jobs: CC: ${{ matrix.compiler }} FC: gfortran-11 run: | - make style-py && git diff --exit-code + make format-py && git diff --exit-code diff --git a/Makefile b/Makefile index 725daa1d31..70d5854f8a 100644 --- a/Makefile +++ b/Makefile @@ -676,7 +676,7 @@ install : $(libceed) $(OBJDIR)/ceed.pc $(INSTALL_DATA) $(wildcard include/ceed/jit-source/hip/*.h) "$(DESTDIR)$(includedir)/ceed/jit-source/hip/" $(INSTALL_DATA) $(wildcard include/ceed/jit-source/gallery/*.h) "$(DESTDIR)$(includedir)/ceed/jit-source/gallery/" -.PHONY : all cln clean doxygen doc lib install par print test tst prove prv prove-all junit examples style style-c style-py tidy iwyu info info-backends info-backends-all +.PHONY : all cln clean doxygen doc format lib install par print test tst prove prv prove-all junit examples tidy iwyu info info-backends info-backends-all cln clean : $(RM) -r $(OBJDIR) $(LIBDIR) dist *egg* .pytest_cache *cffi* @@ -687,6 +687,7 @@ cln clean : distclean : clean $(RM) -r doc/html doc/sphinx/build $(CONFIG) +# Documentation DOXYGEN ?= doxygen doxygen : $(DOXYGEN) -q Doxyfile @@ -696,19 +697,24 @@ doc-html doc-latexpdf doc-epub doc-livehtml : doc-% : doxygen doc : doc-html -style-c : - @astyle --options=.astylerc \ - $(filter-out include/ceedf.h $(wildcard tests/t*-f.h), \ - $(wildcard include/*.h interface/*.[ch] tests/*.[ch] backends/*/*.[ch] \ - examples/*/*/*.[ch] examples/*/*.[ch] examples/*/*.[ch]pp gallery/*/*.[ch])) +# Style/Format +CLANG_FORMAT ?= clang-format + +FORMAT_OPTS += -style=file -i + +format.ch := $(filter-out include/ceedf.h $(wildcard tests/t*-f.h), $(shell git ls-files *.[ch]pp *.[ch])) + +format-c : + $(CLANG_FORMAT) $(FORMAT_OPTS) $(format.ch) AUTOPEP8 = autopep8 -style-py : AUTOPEP8_ARGS = --in-place --aggressive -style-py : +format-py : AUTOPEP8_ARGS = --in-place --aggressive +format-py : @$(AUTOPEP8) $(AUTOPEP8_ARGS) $(wildcard *.py python**/*.py python/tests/*.py examples**/*.py 
doc/sphinx/source**/*.py benchmarks/*.py) -style : style-c style-py +format : format-c format-py +# Tidy CLANG_TIDY ?= clang-tidy %.c.tidy : %.c @@ -717,16 +723,17 @@ CLANG_TIDY ?= clang-tidy %.cpp.tidy : %.cpp $(CLANG_TIDY) $(TIDY_OPTS) $^ -- $(CPPFLAGS) --std=c++11 -I$(CUDA_DIR)/include -I$(OCCA_DIR)/include -I$(HIP_DIR)/include -tidy_c : $(libceed.c:%=%.tidy) -tidy_cpp : $(libceed.cpp:%=%.tidy) +tidy-c : $(libceed.c:%=%.tidy) +tidy-cpp : $(libceed.cpp:%=%.tidy) -tidy : tidy_c tidy_cpp +tidy : tidy-c tidy-cpp ifneq ($(wildcard ../iwyu/*),) IWYU_DIR ?= ../iwyu IWYU_CC ?= $(IWYU_DIR)/build/bin/include-what-you-use endif +# IWYU iwyu : CC=$(IWYU_CC) iwyu : lib diff --git a/backends/avx/ceed-avx-blocked.c b/backends/avx/ceed-avx-blocked.c index 5663d313aa..c6f592a4eb 100644 --- a/backends/avx/ceed-avx-blocked.c +++ b/backends/avx/ceed-avx-blocked.c @@ -5,39 +5,34 @@ // // This file is part of CEED: http://github.com/ceed -#include #include +#include #include #include + #include "ceed-avx.h" //------------------------------------------------------------------------------ // Backend Init //------------------------------------------------------------------------------ static int CeedInit_Avx(const char *resource, Ceed ceed) { - int ierr; - if (strcmp(resource, "/cpu/self") && strcmp(resource, "/cpu/self/avx") && - strcmp(resource, "/cpu/self/avx/blocked")) + if (strcmp(resource, "/cpu/self") && strcmp(resource, "/cpu/self/avx") && strcmp(resource, "/cpu/self/avx/blocked")) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "AVX backend cannot use resource: %s", resource); - // LCOV_EXCL_STOP - ierr = CeedSetDeterministic(ceed, true); CeedChkBackend(ierr); + return CeedError(ceed, CEED_ERROR_BACKEND, "AVX backend cannot use resource: %s", resource); + // LCOV_EXCL_STOP + } + CeedCallBackend(CeedSetDeterministic(ceed, true)); // Create reference CEED that implementation will be dispatched // through unless overridden Ceed ceed_ref; - 
CeedInit("/cpu/self/opt/blocked", &ceed_ref); - ierr = CeedSetDelegate(ceed, ceed_ref); CeedChkBackend(ierr); + CeedCallBackend(CeedInit("/cpu/self/opt/blocked", &ceed_ref)); + CeedCallBackend(CeedSetDelegate(ceed, ceed_ref)); if (CEED_SCALAR_TYPE == CEED_SCALAR_FP64) { - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", - CeedTensorContractCreate_f64_Avx); - CeedChkBackend(ierr); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_f64_Avx)); } else { - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", - CeedTensorContractCreate_f32_Avx); - CeedChkBackend(ierr); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_f32_Avx)); } return CEED_ERROR_SUCCESS; @@ -46,7 +41,5 @@ static int CeedInit_Avx(const char *resource, Ceed ceed) { //------------------------------------------------------------------------------ // Backend Register //------------------------------------------------------------------------------ -CEED_INTERN int CeedRegister_Avx_Blocked(void) { - return CeedRegister("/cpu/self/avx/blocked", CeedInit_Avx, 30); -} +CEED_INTERN int CeedRegister_Avx_Blocked(void) { return CeedRegister("/cpu/self/avx/blocked", CeedInit_Avx, 30); } //------------------------------------------------------------------------------ diff --git a/backends/avx/ceed-avx-serial.c b/backends/avx/ceed-avx-serial.c index 8ea4ab76be..156d9f8294 100644 --- a/backends/avx/ceed-avx-serial.c +++ b/backends/avx/ceed-avx-serial.c @@ -5,39 +5,34 @@ // // This file is part of CEED: http://github.com/ceed -#include #include +#include #include #include + #include "ceed-avx.h" //------------------------------------------------------------------------------ // Backend Init //------------------------------------------------------------------------------ static int CeedInit_Avx(const char *resource, Ceed ceed) { - int ierr; - if 
(strcmp(resource, "/cpu/self") - && strcmp(resource, "/cpu/self/avx/serial")) + if (strcmp(resource, "/cpu/self") && strcmp(resource, "/cpu/self/avx/serial")) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "AVX backend cannot use resource: %s", resource); - // LCOV_EXCL_STOP - ierr = CeedSetDeterministic(ceed, true); CeedChkBackend(ierr); + return CeedError(ceed, CEED_ERROR_BACKEND, "AVX backend cannot use resource: %s", resource); + // LCOV_EXCL_STOP + } + CeedCallBackend(CeedSetDeterministic(ceed, true)); // Create reference CEED that implementation will be dispatched // through unless overridden Ceed ceed_ref; - CeedInit("/cpu/self/opt/serial", &ceed_ref); - ierr = CeedSetDelegate(ceed, ceed_ref); CeedChkBackend(ierr); + CeedCallBackend(CeedInit("/cpu/self/opt/serial", &ceed_ref)); + CeedCallBackend(CeedSetDelegate(ceed, ceed_ref)); if (CEED_SCALAR_TYPE == CEED_SCALAR_FP64) { - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", - CeedTensorContractCreate_f64_Avx); - CeedChkBackend(ierr); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_f64_Avx)); } else { - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", - CeedTensorContractCreate_f32_Avx); - CeedChkBackend(ierr); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_f32_Avx)); } return CEED_ERROR_SUCCESS; @@ -46,7 +41,5 @@ static int CeedInit_Avx(const char *resource, Ceed ceed) { //------------------------------------------------------------------------------ // Backend Register //------------------------------------------------------------------------------ -CEED_INTERN int CeedRegister_Avx_Serial(void) { - return CeedRegister("/cpu/self/avx/serial", CeedInit_Avx, 35); -} +CEED_INTERN int CeedRegister_Avx_Serial(void) { return CeedRegister("/cpu/self/avx/serial", CeedInit_Avx, 35); } 
//------------------------------------------------------------------------------ diff --git a/backends/avx/ceed-avx-tensor-f32.c b/backends/avx/ceed-avx-tensor-f32.c index 8d998e65b4..9c5774a265 100644 --- a/backends/avx/ceed-avx-tensor-f32.c +++ b/backends/avx/ceed-avx-tensor-f32.c @@ -5,71 +5,74 @@ // // This file is part of CEED: http://github.com/ceed -#include #include +#include #include #include + #include "ceed-avx.h" // c += a * b #ifdef __FMA__ -# define fmadd(c,a,b) (c) = _mm_fmadd_ps((a), (b), (c)) +#define fmadd(c, a, b) (c) = _mm_fmadd_ps((a), (b), (c)) #else -# define fmadd(c,a,b) (c) += _mm_mul_ps((a), (b)) +#define fmadd(c, a, b) (c) += _mm_mul_ps((a), (b)) #endif //------------------------------------------------------------------------------ // Blocked Tensor Contract //------------------------------------------------------------------------------ -static inline int CeedTensorContract_Avx_Blocked(CeedTensorContract contract, - CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t, - CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, - float *restrict v, const CeedInt JJ, const CeedInt CC) { +static inline int CeedTensorContract_Avx_Blocked(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t, + CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v, + const CeedInt JJ, const CeedInt CC) { CeedInt t_stride_0 = B, t_stride_1 = 1; if (t_mode == CEED_TRANSPOSE) { - t_stride_0 = 1; t_stride_1 = J; + t_stride_0 = 1; + t_stride_1 = J; } - for (CeedInt a=0; a= blk_size) - CeedTensorContract_Avx_Blocked_4_8(contract, A, B, C, J, t, t_mode, true, - u, v); + if (C >= blk_size) CeedTensorContract_Avx_Blocked_4_8(contract, A, B, C, J, t, t_mode, true, u, v); // Remainder of columns - if (C % blk_size) - CeedTensorContract_Avx_Remainder_8_8(contract, A, B, C, J, t, t_mode, true, - u, v); + if (C % blk_size) CeedTensorContract_Avx_Remainder_8_8(contract, 
A, B, C, J, t, t_mode, true, u, v); } return CEED_ERROR_SUCCESS; @@ -291,14 +262,11 @@ static int CeedTensorContractApply_Avx(CeedTensorContract contract, CeedInt A, //------------------------------------------------------------------------------ // Tensor Contract Create //------------------------------------------------------------------------------ -int CeedTensorContractCreate_f32_Avx(CeedBasis basis, - CeedTensorContract contract) { - int ierr; +int CeedTensorContractCreate_f32_Avx(CeedBasis basis, CeedTensorContract contract) { Ceed ceed; - ierr = CeedTensorContractGetCeed(contract, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedTensorContractGetCeed(contract, &ceed)); - ierr = CeedSetBackendFunction(ceed, "TensorContract", contract, "Apply", - CeedTensorContractApply_Avx); CeedChkBackend(ierr); + CeedCallBackend(CeedSetBackendFunction(ceed, "TensorContract", contract, "Apply", CeedTensorContractApply_Avx)); return CEED_ERROR_SUCCESS; } diff --git a/backends/avx/ceed-avx-tensor-f64.c b/backends/avx/ceed-avx-tensor-f64.c index 38fda70564..00806e61fc 100644 --- a/backends/avx/ceed-avx-tensor-f64.c +++ b/backends/avx/ceed-avx-tensor-f64.c @@ -5,71 +5,74 @@ // // This file is part of CEED: http://github.com/ceed -#include #include +#include #include #include + #include "ceed-avx.h" // c += a * b #ifdef __FMA__ -# define fmadd(c,a,b) (c) = _mm256_fmadd_pd((a), (b), (c)) +#define fmadd(c, a, b) (c) = _mm256_fmadd_pd((a), (b), (c)) #else -# define fmadd(c,a,b) (c) += _mm256_mul_pd((a), (b)) +#define fmadd(c, a, b) (c) += _mm256_mul_pd((a), (b)) #endif //------------------------------------------------------------------------------ // Blocked Tensor Contract //------------------------------------------------------------------------------ -static inline int CeedTensorContract_Avx_Blocked(CeedTensorContract contract, - CeedInt A, CeedInt B, CeedInt C, CeedInt J, const double *restrict t, - CeedTransposeMode t_mode, const CeedInt add, const double *restrict u, - 
double *restrict v, const CeedInt JJ, const CeedInt CC) { +static inline int CeedTensorContract_Avx_Blocked(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const double *restrict t, + CeedTransposeMode t_mode, const CeedInt add, const double *restrict u, double *restrict v, + const CeedInt JJ, const CeedInt CC) { CeedInt t_stride_0 = B, t_stride_1 = 1; if (t_mode == CEED_TRANSPOSE) { - t_stride_0 = 1; t_stride_1 = J; + t_stride_0 = 1; + t_stride_1 = J; } - for (CeedInt a=0; a= blk_size) - CeedTensorContract_Avx_Blocked_4_8(contract, A, B, C, J, t, t_mode, true, - u, v); + if (C >= blk_size) CeedTensorContract_Avx_Blocked_4_8(contract, A, B, C, J, t, t_mode, true, u, v); // Remainder of columns - if (C % blk_size) - CeedTensorContract_Avx_Remainder_8_8(contract, A, B, C, J, t, t_mode, true, - u, v); + if (C % blk_size) CeedTensorContract_Avx_Remainder_8_8(contract, A, B, C, J, t, t_mode, true, u, v); } return CEED_ERROR_SUCCESS; @@ -291,14 +261,11 @@ static int CeedTensorContractApply_Avx(CeedTensorContract contract, CeedInt A, //------------------------------------------------------------------------------ // Tensor Contract Create //------------------------------------------------------------------------------ -int CeedTensorContractCreate_f64_Avx(CeedBasis basis, - CeedTensorContract contract) { - int ierr; +int CeedTensorContractCreate_f64_Avx(CeedBasis basis, CeedTensorContract contract) { Ceed ceed; - ierr = CeedTensorContractGetCeed(contract, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedTensorContractGetCeed(contract, &ceed)); - ierr = CeedSetBackendFunction(ceed, "TensorContract", contract, "Apply", - CeedTensorContractApply_Avx); CeedChkBackend(ierr); + CeedCallBackend(CeedSetBackendFunction(ceed, "TensorContract", contract, "Apply", CeedTensorContractApply_Avx)); return CEED_ERROR_SUCCESS; } diff --git a/backends/avx/ceed-avx.h b/backends/avx/ceed-avx.h index 935a9beddf..4d2ec08f35 100644 --- a/backends/avx/ceed-avx.h +++ 
b/backends/avx/ceed-avx.h @@ -8,12 +8,10 @@ #ifndef _ceed_avx_h #define _ceed_avx_h -#include #include +#include -CEED_INTERN int CeedTensorContractCreate_f32_Avx(CeedBasis basis, - CeedTensorContract contract); -CEED_INTERN int CeedTensorContractCreate_f64_Avx(CeedBasis basis, - CeedTensorContract contract); +CEED_INTERN int CeedTensorContractCreate_f32_Avx(CeedBasis basis, CeedTensorContract contract); +CEED_INTERN int CeedTensorContractCreate_f64_Avx(CeedBasis basis, CeedTensorContract contract); -#endif // _ceed_avx_h +#endif // _ceed_avx_h diff --git a/backends/blocked/ceed-blocked-operator.c b/backends/blocked/ceed-blocked-operator.c index e26a95b021..e41495400d 100644 --- a/backends/blocked/ceed-blocked-operator.c +++ b/backends/blocked/ceed-blocked-operator.c @@ -5,114 +5,97 @@ // // This file is part of CEED: http://github.com/ceed -#include #include +#include #include #include #include + #include "ceed-blocked.h" //------------------------------------------------------------------------------ // Setup Input/Output Fields //------------------------------------------------------------------------------ -static int CeedOperatorSetupFields_Blocked(CeedQFunction qf, - CeedOperator op, bool is_input, CeedElemRestriction *blk_restr, - CeedVector *e_vecs_full, CeedVector *e_vecs, CeedVector *q_vecs, - CeedInt start_e, CeedInt num_fields, CeedInt Q) { - CeedInt ierr, num_comp, size, P; +static int CeedOperatorSetupFields_Blocked(CeedQFunction qf, CeedOperator op, bool is_input, CeedElemRestriction *blk_restr, CeedVector *e_vecs_full, + CeedVector *e_vecs, CeedVector *q_vecs, CeedInt start_e, CeedInt num_fields, CeedInt Q) { + CeedInt num_comp, size, P; CeedSize e_size, q_size; - Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); - CeedBasis basis; + Ceed ceed; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedBasis basis; CeedElemRestriction r; - CeedOperatorField *op_fields; + CeedOperatorField *op_fields; CeedQFunctionField 
*qf_fields; if (is_input) { - ierr = CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL); - CeedChkBackend(ierr); - ierr = CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL)); } else { - ierr = CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields); - CeedChkBackend(ierr); - ierr = CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields)); } const CeedInt blk_size = 8; // Loop over fields - for (CeedInt i=0; iis_identity_qf); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); + CeedCallBackend(CeedQFunctionIsIdentity(qf, &impl->is_identity_qf)); CeedOperatorField *op_input_fields, *op_output_fields; - ierr = CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, - &num_output_fields, &op_output_fields); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); CeedQFunctionField *qf_input_fields, *qf_output_fields; - ierr = CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, - &qf_output_fields); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); // Allocate - ierr = CeedCalloc(num_input_fields + num_output_fields, &impl->blk_restr); - CeedChkBackend(ierr); - ierr = CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs_full); - CeedChkBackend(ierr); - - ierr = CeedCalloc(CEED_FIELD_MAX, &impl->input_states); CeedChkBackend(ierr); - ierr = CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_in); CeedChkBackend(ierr); - ierr = CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_out); CeedChkBackend(ierr); - ierr = 
CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in); CeedChkBackend(ierr); - ierr = CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out); CeedChkBackend(ierr); - - impl->num_inputs = num_input_fields; + CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->blk_restr)); + CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs_full)); + + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states)); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_in)); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_out)); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in)); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out)); + + impl->num_inputs = num_input_fields; impl->num_outputs = num_output_fields; // Set up infield and outfield pointer arrays // Infields - ierr = CeedOperatorSetupFields_Blocked(qf, op, true, impl->blk_restr, - impl->e_vecs_full, impl->e_vecs_in, - impl->q_vecs_in, 0, - num_input_fields, Q); - CeedChkBackend(ierr); + CeedCallBackend( + CeedOperatorSetupFields_Blocked(qf, op, true, impl->blk_restr, impl->e_vecs_full, impl->e_vecs_in, impl->q_vecs_in, 0, num_input_fields, Q)); // Outfields - ierr = CeedOperatorSetupFields_Blocked(qf, op, false, impl->blk_restr, - impl->e_vecs_full, impl->e_vecs_out, - impl->q_vecs_out, num_input_fields, - num_output_fields, Q); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorSetupFields_Blocked(qf, op, false, impl->blk_restr, impl->e_vecs_full, impl->e_vecs_out, impl->q_vecs_out, + num_input_fields, num_output_fields, Q)); // Identity QFunctions if (impl->is_identity_qf) { - CeedEvalMode in_mode, out_mode; + CeedEvalMode in_mode, out_mode; CeedQFunctionField *in_fields, *out_fields; - ierr = CeedQFunctionGetFields(qf, NULL, &in_fields, NULL, &out_fields); - CeedChkBackend(ierr); - ierr = CeedQFunctionFieldGetEvalMode(in_fields[0], &in_mode); - CeedChkBackend(ierr); - ierr = CeedQFunctionFieldGetEvalMode(out_fields[0], &out_mode); - 
CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &in_fields, NULL, &out_fields)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(in_fields[0], &in_mode)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(out_fields[0], &out_mode)); if (in_mode == CEED_EVAL_NONE && out_mode == CEED_EVAL_NONE) { impl->is_identity_restr_op = true; } else { - ierr = CeedVectorDestroy(&impl->q_vecs_out[0]); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_out[0])); impl->q_vecs_out[0] = impl->q_vecs_in[0]; - ierr = CeedVectorAddReference(impl->q_vecs_in[0]); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorAddReference(impl->q_vecs_in[0])); } } - ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorSetSetupDone(op)); return CEED_ERROR_SUCCESS; } @@ -201,42 +168,32 @@ static int CeedOperatorSetup_Blocked(CeedOperator op) { //------------------------------------------------------------------------------ // Setup Operator Inputs //------------------------------------------------------------------------------ -static inline int CeedOperatorSetupInputs_Blocked(CeedInt num_input_fields, - CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, - CeedVector in_vec, bool skip_active, CeedScalar *e_data_full[2*CEED_FIELD_MAX], - CeedOperator_Blocked *impl, CeedRequest *request) { - CeedInt ierr; +static inline int CeedOperatorSetupInputs_Blocked(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, + CeedVector in_vec, bool skip_active, CeedScalar *e_data_full[2 * CEED_FIELD_MAX], + CeedOperator_Blocked *impl, CeedRequest *request) { CeedEvalMode eval_mode; - CeedVector vec; - uint64_t state; + CeedVector vec; + uint64_t state; - for (CeedInt i=0; iinput_states[i] || vec == in_vec) { - ierr = CeedElemRestrictionApply(impl->blk_restr[i], CEED_NOTRANSPOSE, - vec, impl->e_vecs_full[i], request); - CeedChkBackend(ierr); + 
CeedCallBackend(CeedElemRestrictionApply(impl->blk_restr[i], CEED_NOTRANSPOSE, vec, impl->e_vecs_full[i], request)); impl->input_states[i] = state; } // Get evec - ierr = CeedVectorGetArrayRead(impl->e_vecs_full[i], CEED_MEM_HOST, - (const CeedScalar **) &e_data_full[i]); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs_full[i], CEED_MEM_HOST, (const CeedScalar **)&e_data_full[i])); } } return CEED_ERROR_SUCCESS; @@ -245,76 +202,54 @@ static inline int CeedOperatorSetupInputs_Blocked(CeedInt num_input_fields, //------------------------------------------------------------------------------ // Input Basis Action //------------------------------------------------------------------------------ -static inline int CeedOperatorInputBasis_Blocked(CeedInt e, CeedInt Q, - CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, - CeedInt num_input_fields, CeedInt blk_size, bool skip_active, - CeedScalar *e_data_full[2*CEED_FIELD_MAX], CeedOperator_Blocked *impl) { - CeedInt ierr; - CeedInt dim, elem_size, size; +static inline int CeedOperatorInputBasis_Blocked(CeedInt e, CeedInt Q, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, + CeedInt num_input_fields, CeedInt blk_size, bool skip_active, + CeedScalar *e_data_full[2 * CEED_FIELD_MAX], CeedOperator_Blocked *impl) { + CeedInt dim, elem_size, size; CeedElemRestriction elem_restr; - CeedEvalMode eval_mode; - CeedBasis basis; + CeedEvalMode eval_mode; + CeedBasis basis; - for (CeedInt i=0; iq_vecs_in[i], CEED_MEM_HOST, - CEED_USE_POINTER, &e_data_full[i][e*Q*size]); - CeedChkBackend(ierr); - break; - case CEED_EVAL_INTERP: - ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); - CeedChkBackend(ierr); - ierr = CeedVectorSetArray(impl->e_vecs_in[i], CEED_MEM_HOST, - CEED_USE_POINTER, &e_data_full[i][e*elem_size*size]); - CeedChkBackend(ierr); - ierr = CeedBasisApply(basis, blk_size, CEED_NOTRANSPOSE, - CEED_EVAL_INTERP, impl->e_vecs_in[i], - 
impl->q_vecs_in[i]); CeedChkBackend(ierr); - break; - case CEED_EVAL_GRAD: - ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); - CeedChkBackend(ierr); - ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); - ierr = CeedVectorSetArray(impl->e_vecs_in[i], CEED_MEM_HOST, - CEED_USE_POINTER, &e_data_full[i][e*elem_size*size/dim]); - CeedChkBackend(ierr); - ierr = CeedBasisApply(basis, blk_size, CEED_NOTRANSPOSE, - CEED_EVAL_GRAD, impl->e_vecs_in[i], - impl->q_vecs_in[i]); CeedChkBackend(ierr); - break; - case CEED_EVAL_WEIGHT: - break; // No action - // LCOV_EXCL_START - case CEED_EVAL_DIV: - case CEED_EVAL_CURL: { - ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); - CeedChkBackend(ierr); - Ceed ceed; - ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); - return CeedError(ceed, CEED_ERROR_BACKEND, - "Ceed evaluation mode not implemented"); - // LCOV_EXCL_STOP - } + switch (eval_mode) { + case CEED_EVAL_NONE: + CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data_full[i][e * Q * size])); + break; + case CEED_EVAL_INTERP: + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); + CeedCallBackend(CeedVectorSetArray(impl->e_vecs_in[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data_full[i][e * elem_size * size])); + CeedCallBackend(CeedBasisApply(basis, blk_size, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, impl->e_vecs_in[i], impl->q_vecs_in[i])); + break; + case CEED_EVAL_GRAD: + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); + CeedCallBackend(CeedBasisGetDimension(basis, &dim)); + CeedCallBackend(CeedVectorSetArray(impl->e_vecs_in[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data_full[i][e * elem_size * size / dim])); + CeedCallBackend(CeedBasisApply(basis, blk_size, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, impl->e_vecs_in[i], impl->q_vecs_in[i])); + break; + case CEED_EVAL_WEIGHT: + break; // No action + // LCOV_EXCL_START + case CEED_EVAL_DIV: + case 
CEED_EVAL_CURL: { + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); + Ceed ceed; + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); + return CeedError(ceed, CEED_ERROR_BACKEND, "Ceed evaluation mode not implemented"); + // LCOV_EXCL_STOP + } } } return CEED_ERROR_SUCCESS; @@ -323,68 +258,50 @@ static inline int CeedOperatorInputBasis_Blocked(CeedInt e, CeedInt Q, //------------------------------------------------------------------------------ // Output Basis Action //------------------------------------------------------------------------------ -static inline int CeedOperatorOutputBasis_Blocked(CeedInt e, CeedInt Q, - CeedQFunctionField *qf_output_fields, CeedOperatorField *op_output_fields, - CeedInt blk_size, CeedInt num_input_fields, CeedInt num_output_fields, - CeedOperator op, CeedScalar *e_data_full[2*CEED_FIELD_MAX], - CeedOperator_Blocked *impl) { - CeedInt ierr; - CeedInt dim, elem_size, size; +static inline int CeedOperatorOutputBasis_Blocked(CeedInt e, CeedInt Q, CeedQFunctionField *qf_output_fields, CeedOperatorField *op_output_fields, + CeedInt blk_size, CeedInt num_input_fields, CeedInt num_output_fields, CeedOperator op, + CeedScalar *e_data_full[2 * CEED_FIELD_MAX], CeedOperator_Blocked *impl) { + CeedInt dim, elem_size, size; CeedElemRestriction elem_restr; - CeedEvalMode eval_mode; - CeedBasis basis; + CeedEvalMode eval_mode; + CeedBasis basis; - for (CeedInt i=0; ie_vecs_out[i], CEED_MEM_HOST, - CEED_USE_POINTER, &e_data_full[i + num_input_fields][e*elem_size*size]); - CeedChkBackend(ierr); - ierr = CeedBasisApply(basis, blk_size, CEED_TRANSPOSE, - CEED_EVAL_INTERP, impl->q_vecs_out[i], - impl->e_vecs_out[i]); CeedChkBackend(ierr); - break; - case CEED_EVAL_GRAD: - ierr = CeedOperatorFieldGetBasis(op_output_fields[i], &basis); - CeedChkBackend(ierr); - ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); - ierr = CeedVectorSetArray(impl->e_vecs_out[i], CEED_MEM_HOST, - CEED_USE_POINTER, &e_data_full[i + 
num_input_fields][e*elem_size*size/dim]); - CeedChkBackend(ierr); - ierr = CeedBasisApply(basis, blk_size, CEED_TRANSPOSE, - CEED_EVAL_GRAD, impl->q_vecs_out[i], - impl->e_vecs_out[i]); CeedChkBackend(ierr); - break; - // LCOV_EXCL_START - case CEED_EVAL_WEIGHT: { - Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); - return CeedError(ceed, CEED_ERROR_BACKEND, - "CEED_EVAL_WEIGHT cannot be an output " - "evaluation mode"); - } - case CEED_EVAL_DIV: - case CEED_EVAL_CURL: { - Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); - return CeedError(ceed, CEED_ERROR_BACKEND, - "Ceed evaluation mode not implemented"); - // LCOV_EXCL_STOP - } + switch (eval_mode) { + case CEED_EVAL_NONE: + break; // No action + case CEED_EVAL_INTERP: + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); + CeedCallBackend( + CeedVectorSetArray(impl->e_vecs_out[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data_full[i + num_input_fields][e * elem_size * size])); + CeedCallBackend(CeedBasisApply(basis, blk_size, CEED_TRANSPOSE, CEED_EVAL_INTERP, impl->q_vecs_out[i], impl->e_vecs_out[i])); + break; + case CEED_EVAL_GRAD: + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); + CeedCallBackend(CeedBasisGetDimension(basis, &dim)); + CeedCallBackend( + CeedVectorSetArray(impl->e_vecs_out[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data_full[i + num_input_fields][e * elem_size * size / dim])); + CeedCallBackend(CeedBasisApply(basis, blk_size, CEED_TRANSPOSE, CEED_EVAL_GRAD, impl->q_vecs_out[i], impl->e_vecs_out[i])); + break; + // LCOV_EXCL_START + case CEED_EVAL_WEIGHT: { + Ceed ceed; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); + } + case CEED_EVAL_DIV: + case CEED_EVAL_CURL: { + Ceed ceed; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + return CeedError(ceed, CEED_ERROR_BACKEND, "Ceed evaluation mode not 
implemented"); + // LCOV_EXCL_STOP + } } } return CEED_ERROR_SUCCESS; @@ -393,29 +310,21 @@ static inline int CeedOperatorOutputBasis_Blocked(CeedInt e, CeedInt Q, //------------------------------------------------------------------------------ // Restore Input Vectors //------------------------------------------------------------------------------ -static inline int CeedOperatorRestoreInputs_Blocked(CeedInt num_input_fields, - CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, - bool skip_active, CeedScalar *e_data_full[2*CEED_FIELD_MAX], - CeedOperator_Blocked *impl) { - CeedInt ierr; +static inline int CeedOperatorRestoreInputs_Blocked(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, + bool skip_active, CeedScalar *e_data_full[2 * CEED_FIELD_MAX], CeedOperator_Blocked *impl) { CeedEvalMode eval_mode; - for (CeedInt i=0; ie_vecs_full[i], - (const CeedScalar **) &e_data_full[i]); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArrayRead(impl->e_vecs_full[i], (const CeedScalar **)&e_data_full[i])); } } return CEED_ERROR_SUCCESS; @@ -424,110 +333,81 @@ static inline int CeedOperatorRestoreInputs_Blocked(CeedInt num_input_fields, //------------------------------------------------------------------------------ // Operator Apply //------------------------------------------------------------------------------ -static int CeedOperatorApplyAdd_Blocked(CeedOperator op, CeedVector in_vec, - CeedVector out_vec, - CeedRequest *request) { - int ierr; +static int CeedOperatorApplyAdd_Blocked(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) { CeedOperator_Blocked *impl; - ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetData(op, &impl)); const CeedInt blk_size = 8; - CeedInt Q, num_input_fields, num_output_fields, num_elem, size; - ierr = CeedOperatorGetNumElements(op, &num_elem); CeedChkBackend(ierr); - ierr = 
CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr); - CeedInt num_blks = (num_elem/blk_size) + !!(num_elem%blk_size); + CeedInt Q, num_input_fields, num_output_fields, num_elem, size; + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); + CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); + CeedInt num_blks = (num_elem / blk_size) + !!(num_elem % blk_size); CeedQFunction qf; - ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); CeedOperatorField *op_input_fields, *op_output_fields; - ierr = CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, - &num_output_fields, &op_output_fields); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); CeedQFunctionField *qf_input_fields, *qf_output_fields; - ierr = CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, - &qf_output_fields); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); CeedEvalMode eval_mode; - CeedVector vec; - CeedScalar *e_data_full[2*CEED_FIELD_MAX] = {0}; + CeedVector vec; + CeedScalar *e_data_full[2 * CEED_FIELD_MAX] = {0}; // Setup - ierr = CeedOperatorSetup_Blocked(op); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorSetup_Blocked(op)); // Restriction only operator if (impl->is_identity_restr_op) { - ierr = CeedElemRestrictionApply(impl->blk_restr[0], CEED_NOTRANSPOSE, in_vec, - impl->e_vecs_full[0], request); CeedChkBackend(ierr); - ierr = CeedElemRestrictionApply(impl->blk_restr[1], CEED_TRANSPOSE, - impl->e_vecs_full[0], out_vec, request); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionApply(impl->blk_restr[0], CEED_NOTRANSPOSE, in_vec, impl->e_vecs_full[0], request)); + CeedCallBackend(CeedElemRestrictionApply(impl->blk_restr[1], CEED_TRANSPOSE, impl->e_vecs_full[0], out_vec, request)); return CEED_ERROR_SUCCESS; 
} // Input Evecs and Restriction - ierr = CeedOperatorSetupInputs_Blocked(num_input_fields, qf_input_fields, - op_input_fields, in_vec, false, e_data_full, - impl, request); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorSetupInputs_Blocked(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data_full, impl, request)); // Output Evecs - for (CeedInt i=0; ie_vecs_full[i+impl->num_inputs], - CEED_MEM_HOST, &e_data_full[i + num_input_fields]); - CeedChkBackend(ierr); + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_full[i + impl->num_inputs], CEED_MEM_HOST, &e_data_full[i + num_input_fields])); } // Loop through elements - for (CeedInt e=0; eq_vecs_out[i], CEED_MEM_HOST, - CEED_USE_POINTER, &e_data_full[i + num_input_fields][e*Q*size]); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size)); + CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data_full[i + num_input_fields][e * Q * size])); } } // Input basis apply - ierr = CeedOperatorInputBasis_Blocked(e, Q, qf_input_fields, op_input_fields, - num_input_fields, blk_size, false, e_data_full, - impl); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorInputBasis_Blocked(e, Q, qf_input_fields, op_input_fields, num_input_fields, blk_size, false, e_data_full, impl)); // Q function if (!impl->is_identity_qf) { - ierr = CeedQFunctionApply(qf, Q*blk_size, impl->q_vecs_in, impl->q_vecs_out); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionApply(qf, Q * blk_size, impl->q_vecs_in, impl->q_vecs_out)); } // Output basis apply - ierr = CeedOperatorOutputBasis_Blocked(e, Q, qf_output_fields, op_output_fields, - blk_size, num_input_fields, - num_output_fields, op, e_data_full, impl); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorOutputBasis_Blocked(e, Q, qf_output_fields, op_output_fields, blk_size, num_input_fields, num_output_fields, op, + e_data_full, 
impl)); } // Output restriction - for (CeedInt i=0; ie_vecs_full[i+impl->num_inputs], - &e_data_full[i + num_input_fields]); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_full[i + impl->num_inputs], &e_data_full[i + num_input_fields])); // Get output vector - ierr = CeedOperatorFieldGetVector(op_output_fields[i], &vec); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); // Active - if (vec == CEED_VECTOR_ACTIVE) - vec = out_vec; + if (vec == CEED_VECTOR_ACTIVE) vec = out_vec; // Restrict - ierr = CeedElemRestrictionApply(impl->blk_restr[i+impl->num_inputs], - CEED_TRANSPOSE, impl->e_vecs_full[i+impl->num_inputs], - vec, request); CeedChkBackend(ierr); - + CeedCallBackend( + CeedElemRestrictionApply(impl->blk_restr[i + impl->num_inputs], CEED_TRANSPOSE, impl->e_vecs_full[i + impl->num_inputs], vec, request)); } // Restore input arrays - ierr = CeedOperatorRestoreInputs_Blocked(num_input_fields, qf_input_fields, - op_input_fields, false, e_data_full, impl); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorRestoreInputs_Blocked(num_input_fields, qf_input_fields, op_input_fields, false, e_data_full, impl)); return CEED_ERROR_SUCCESS; } @@ -535,92 +415,75 @@ static int CeedOperatorApplyAdd_Blocked(CeedOperator op, CeedVector in_vec, //------------------------------------------------------------------------------ // Core code for assembling linear QFunction //------------------------------------------------------------------------------ -static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked( - CeedOperator op, bool build_objects, CeedVector *assembled, - CeedElemRestriction *rstr, CeedRequest *request) { - int ierr; +static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(CeedOperator op, bool build_objects, CeedVector *assembled, + CeedElemRestriction *rstr, CeedRequest *request) { CeedOperator_Blocked *impl; - ierr = CeedOperatorGetData(op, &impl); 
CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetData(op, &impl)); const CeedInt blk_size = 8; - CeedInt Q, num_input_fields, num_output_fields, num_elem, size; - ierr = CeedOperatorGetNumElements(op, &num_elem); CeedChkBackend(ierr); - ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr); - CeedInt num_blks = (num_elem/blk_size) + !!(num_elem%blk_size); + CeedInt Q, num_input_fields, num_output_fields, num_elem, size; + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); + CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); + CeedInt num_blks = (num_elem / blk_size) + !!(num_elem % blk_size); CeedQFunction qf; - ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); CeedOperatorField *op_input_fields, *op_output_fields; - ierr = CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, - &num_output_fields, &op_output_fields); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); CeedQFunctionField *qf_input_fields, *qf_output_fields; - ierr = CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, - &qf_output_fields); - CeedChkBackend(ierr); - CeedVector vec, l_vec = impl->qf_l_vec; - CeedInt num_active_in = impl->num_active_in, - num_active_out = impl->num_active_out; - CeedSize q_size; + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); + CeedVector vec, l_vec = impl->qf_l_vec; + CeedInt num_active_in = impl->num_active_in, num_active_out = impl->num_active_out; + CeedSize q_size; CeedVector *active_in = impl->qf_active_in; CeedScalar *a, *tmp; - Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); - CeedScalar *e_data_full[2*CEED_FIELD_MAX] = {0}; + Ceed ceed; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedScalar *e_data_full[2 * CEED_FIELD_MAX] = {0}; // Setup - ierr = 
CeedOperatorSetup_Blocked(op); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorSetup_Blocked(op)); // Check for identity - if (impl->is_identity_qf) + if (impl->is_identity_qf) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Assembling identity QFunctions not supported"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_BACKEND, "Assembling identity QFunctions not supported"); + // LCOV_EXCL_STOP + } // Input Evecs and Restriction - ierr = CeedOperatorSetupInputs_Blocked(num_input_fields, qf_input_fields, - op_input_fields, NULL, true, e_data_full, - impl, request); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorSetupInputs_Blocked(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data_full, impl, request)); // Count number of active input fields if (!num_active_in) { - for (CeedInt i=0; iq_vecs_in[i], 0.0); CeedChkBackend(ierr); - ierr = CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &tmp); - CeedChkBackend(ierr); - ierr = CeedRealloc(num_active_in + size, &active_in); CeedChkBackend(ierr); - for (CeedInt field=0; fieldq_vecs_in[i], 0.0)); + CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &tmp)); + CeedCallBackend(CeedRealloc(num_active_in + size, &active_in)); + for (CeedInt field = 0; field < size; field++) { + q_size = (CeedSize)Q * blk_size; + CeedCallBackend(CeedVectorCreate(ceed, q_size, &active_in[num_active_in + field])); + CeedCallBackend(CeedVectorSetArray(active_in[num_active_in + field], CEED_MEM_HOST, CEED_USE_POINTER, &tmp[field * Q * blk_size])); } num_active_in += size; - ierr = CeedVectorRestoreArray(impl->q_vecs_in[i], &tmp); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &tmp)); } } impl->num_active_in = num_active_in; - impl->qf_active_in = active_in; + impl->qf_active_in = active_in; } // Count number of active output fields if (!num_active_out) { - for (CeedInt i=0; iqf_l_vec = l_vec; } - ierr = CeedVectorGetArrayWrite(l_vec, 
CEED_MEM_HOST, &a); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayWrite(l_vec, CEED_MEM_HOST, &a)); // Build objects if needed - CeedInt strides[3] = {1, Q, num_active_in *num_active_out*Q}; + CeedInt strides[3] = {1, Q, num_active_in * num_active_out * Q}; if (build_objects) { // Create output restriction - ierr = CeedElemRestrictionCreateStrided(ceed, num_elem, Q, - num_active_in*num_active_out, - num_active_in*num_active_out*num_elem*Q, - strides, rstr); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionCreateStrided(ceed, num_elem, Q, num_active_in * num_active_out, num_active_in * num_active_out * num_elem * Q, + strides, rstr)); // Create assembled vector - CeedSize l_size = (CeedSize)num_elem*Q*num_active_in*num_active_out; - ierr = CeedVectorCreate(ceed, l_size, assembled); CeedChkBackend(ierr); + CeedSize l_size = (CeedSize)num_elem * Q * num_active_in * num_active_out; + CeedCallBackend(CeedVectorCreate(ceed, l_size, assembled)); } // Loop through elements - for (CeedInt e=0; e 1) { - ierr = CeedVectorSetValue(active_in[(in+num_active_in-1)%num_active_in], - 0.0); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorSetValue(active_in[(in + num_active_in - 1) % num_active_in], 0.0)); } // Set Outputs - for (CeedInt out=0; outq_vecs_out[out], CEED_MEM_HOST, - CEED_USE_POINTER, a); CeedChkBackend(ierr); - ierr = CeedQFunctionFieldGetSize(qf_output_fields[out], &size); - CeedChkBackend(ierr); - a += size*Q*blk_size; // Advance the pointer by the size of the output + CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST, CEED_USE_POINTER, a)); + CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &size)); + a += size * Q * blk_size; // Advance the pointer by the size of the output } } // Apply QFunction - ierr = CeedQFunctionApply(qf, Q*blk_size, impl->q_vecs_in, impl->q_vecs_out); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionApply(qf, Q * blk_size, impl->q_vecs_in, impl->q_vecs_out)); } } // Un-set 
output Qvecs to prevent accidental overwrite of Assembled - for (CeedInt out=0; outq_vecs_out[out], CEED_MEM_HOST, CEED_COPY_VALUES, - NULL); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST, CEED_COPY_VALUES, NULL)); } } // Restore input arrays - ierr = CeedOperatorRestoreInputs_Blocked(num_input_fields, qf_input_fields, - op_input_fields, true, e_data_full, impl); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorRestoreInputs_Blocked(num_input_fields, qf_input_fields, op_input_fields, true, e_data_full, impl)); // Output blocked restriction - ierr = CeedVectorRestoreArray(l_vec, &a); CeedChkBackend(ierr); - ierr = CeedVectorSetValue(*assembled, 0.0); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArray(l_vec, &a)); + CeedCallBackend(CeedVectorSetValue(*assembled, 0.0)); CeedElemRestriction blk_rstr = impl->qf_blk_rstr; if (!impl->qf_blk_rstr) { - ierr = CeedElemRestrictionCreateBlockedStrided(ceed, num_elem, Q, blk_size, - num_active_in*num_active_out, num_active_in*num_active_out*num_elem*Q, - strides, &blk_rstr); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionCreateBlockedStrided(ceed, num_elem, Q, blk_size, num_active_in * num_active_out, + num_active_in * num_active_out * num_elem * Q, strides, &blk_rstr)); impl->qf_blk_rstr = blk_rstr; } - ierr = CeedElemRestrictionApply(blk_rstr, CEED_TRANSPOSE, l_vec, *assembled, - request); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionApply(blk_rstr, CEED_TRANSPOSE, l_vec, *assembled, request)); return CEED_ERROR_SUCCESS; } @@ -726,60 +574,55 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked( //------------------------------------------------------------------------------ // Assemble Linear QFunction //------------------------------------------------------------------------------ -static int CeedOperatorLinearAssembleQFunction_Blocked(CeedOperator op, - CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest 
*request) { - return CeedOperatorLinearAssembleQFunctionCore_Blocked(op, true, assembled, - rstr, request); +static int CeedOperatorLinearAssembleQFunction_Blocked(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) { + return CeedOperatorLinearAssembleQFunctionCore_Blocked(op, true, assembled, rstr, request); } //------------------------------------------------------------------------------ // Update Assembled Linear QFunction //------------------------------------------------------------------------------ -static int CeedOperatorLinearAssembleQFunctionUpdate_Blocked(CeedOperator op, - CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) { - return CeedOperatorLinearAssembleQFunctionCore_Blocked(op, false, &assembled, - &rstr, request); +static int CeedOperatorLinearAssembleQFunctionUpdate_Blocked(CeedOperator op, CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) { + return CeedOperatorLinearAssembleQFunctionCore_Blocked(op, false, &assembled, &rstr, request); } //------------------------------------------------------------------------------ // Operator Destroy //------------------------------------------------------------------------------ static int CeedOperatorDestroy_Blocked(CeedOperator op) { - int ierr; CeedOperator_Blocked *impl; - ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetData(op, &impl)); - for (CeedInt i=0; inum_inputs+impl->num_outputs; i++) { - ierr = CeedElemRestrictionDestroy(&impl->blk_restr[i]); CeedChkBackend(ierr); - ierr = CeedVectorDestroy(&impl->e_vecs_full[i]); CeedChkBackend(ierr); + for (CeedInt i = 0; i < impl->num_inputs + impl->num_outputs; i++) { + CeedCallBackend(CeedElemRestrictionDestroy(&impl->blk_restr[i])); + CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_full[i])); } - ierr = CeedFree(&impl->blk_restr); CeedChkBackend(ierr); - ierr = CeedFree(&impl->e_vecs_full); CeedChkBackend(ierr); - ierr = 
CeedFree(&impl->input_states); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl->blk_restr)); + CeedCallBackend(CeedFree(&impl->e_vecs_full)); + CeedCallBackend(CeedFree(&impl->input_states)); - for (CeedInt i=0; inum_inputs; i++) { - ierr = CeedVectorDestroy(&impl->e_vecs_in[i]); CeedChkBackend(ierr); - ierr = CeedVectorDestroy(&impl->q_vecs_in[i]); CeedChkBackend(ierr); + for (CeedInt i = 0; i < impl->num_inputs; i++) { + CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_in[i])); + CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_in[i])); } - ierr = CeedFree(&impl->e_vecs_in); CeedChkBackend(ierr); - ierr = CeedFree(&impl->q_vecs_in); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl->e_vecs_in)); + CeedCallBackend(CeedFree(&impl->q_vecs_in)); - for (CeedInt i=0; inum_outputs; i++) { - ierr = CeedVectorDestroy(&impl->e_vecs_out[i]); CeedChkBackend(ierr); - ierr = CeedVectorDestroy(&impl->q_vecs_out[i]); CeedChkBackend(ierr); + for (CeedInt i = 0; i < impl->num_outputs; i++) { + CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_out[i])); + CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_out[i])); } - ierr = CeedFree(&impl->e_vecs_out); CeedChkBackend(ierr); - ierr = CeedFree(&impl->q_vecs_out); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl->e_vecs_out)); + CeedCallBackend(CeedFree(&impl->q_vecs_out)); // QFunction assembly data - for (CeedInt i=0; inum_active_in; i++) { - ierr = CeedVectorDestroy(&impl->qf_active_in[i]); CeedChkBackend(ierr); + for (CeedInt i = 0; i < impl->num_active_in; i++) { + CeedCallBackend(CeedVectorDestroy(&impl->qf_active_in[i])); } - ierr = CeedFree(&impl->qf_active_in); CeedChkBackend(ierr); - ierr = CeedVectorDestroy(&impl->qf_l_vec); CeedChkBackend(ierr); - ierr = CeedElemRestrictionDestroy(&impl->qf_blk_rstr); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl->qf_active_in)); + CeedCallBackend(CeedVectorDestroy(&impl->qf_l_vec)); + CeedCallBackend(CeedElemRestrictionDestroy(&impl->qf_blk_rstr)); - ierr = 
CeedFree(&impl); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl)); return CEED_ERROR_SUCCESS; } @@ -787,25 +630,17 @@ static int CeedOperatorDestroy_Blocked(CeedOperator op) { // Operator Create //------------------------------------------------------------------------------ int CeedOperatorCreate_Blocked(CeedOperator op) { - int ierr; Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedOperator_Blocked *impl; - ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr); - ierr = CeedOperatorSetData(op, impl); CeedChkBackend(ierr); - - ierr = CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", - CeedOperatorLinearAssembleQFunction_Blocked); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Operator", op, - "LinearAssembleQFunctionUpdate", - CeedOperatorLinearAssembleQFunctionUpdate_Blocked); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", - CeedOperatorApplyAdd_Blocked); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Operator", op, "Destroy", - CeedOperatorDestroy_Blocked); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(1, &impl)); + CeedCallBackend(CeedOperatorSetData(op, impl)); + + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Blocked)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Blocked)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Blocked)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Blocked)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ diff --git a/backends/blocked/ceed-blocked.c b/backends/blocked/ceed-blocked.c index 8286b11bb9..dd48269734 100644 --- 
a/backends/blocked/ceed-blocked.c +++ b/backends/blocked/ceed-blocked.c @@ -5,38 +5,35 @@ // // This file is part of CEED: http://github.com/ceed -#include +#include "ceed-blocked.h" + #include +#include #include #include -#include "ceed-blocked.h" //------------------------------------------------------------------------------ // Backend Init //------------------------------------------------------------------------------ CEED_INTERN int CeedInit_Blocked(const char *resource, Ceed ceed) { - int ierr; - if (strcmp(resource, "/cpu/self") - && strcmp(resource, "/cpu/self/ref/blocked")) + if (strcmp(resource, "/cpu/self") && strcmp(resource, "/cpu/self/ref/blocked")) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Blocked backend cannot use resource: %s", resource); - // LCOV_EXCL_STOP - ierr = CeedSetDeterministic(ceed, true); CeedChkBackend(ierr); + return CeedError(ceed, CEED_ERROR_BACKEND, "Blocked backend cannot use resource: %s", resource); + // LCOV_EXCL_STOP + } + CeedCallBackend(CeedSetDeterministic(ceed, true)); // Create reference CEED that implementation will be dispatched // through unless overridden Ceed ceed_ref; - CeedInit("/cpu/self/ref/serial", &ceed_ref); - ierr = CeedSetDelegate(ceed, ceed_ref); CeedChkBackend(ierr); + CeedCallBackend(CeedInit("/cpu/self/ref/serial", &ceed_ref)); + CeedCallBackend(CeedSetDelegate(ceed, ceed_ref)); // Set fallback CEED resource for advanced operator functionality const char fallbackresource[] = "/cpu/self/ref/serial"; - ierr = CeedSetOperatorFallbackResource(ceed, fallbackresource); - CeedChkBackend(ierr); + CeedCallBackend(CeedSetOperatorFallbackResource(ceed, fallbackresource)); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", - CeedOperatorCreate_Blocked); CeedChkBackend(ierr); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Blocked)); return CEED_ERROR_SUCCESS; } @@ -44,7 +41,5 @@ CEED_INTERN int CeedInit_Blocked(const 
char *resource, Ceed ceed) { //------------------------------------------------------------------------------ // Backend Register //------------------------------------------------------------------------------ -CEED_INTERN int CeedRegister_Ref_Blocked(void) { - return CeedRegister("/cpu/self/ref/blocked", CeedInit_Blocked, 55); -} +CEED_INTERN int CeedRegister_Ref_Blocked(void) { return CeedRegister("/cpu/self/ref/blocked", CeedInit_Blocked, 55); } //------------------------------------------------------------------------------ diff --git a/backends/blocked/ceed-blocked.h b/backends/blocked/ceed-blocked.h index 85eb6df1a6..65c11e404e 100644 --- a/backends/blocked/ceed-blocked.h +++ b/backends/blocked/ceed-blocked.h @@ -8,8 +8,8 @@ #ifndef _ceed_blocked_h #define _ceed_blocked_h -#include #include +#include #include #include @@ -18,21 +18,21 @@ typedef struct { } CeedBasis_Blocked; typedef struct { - bool is_identity_qf, is_identity_restr_op; - CeedElemRestriction *blk_restr; /* Blocked versions of restrictions */ - CeedVector *e_vecs_full; /* Full E-vectors, inputs followed by outputs */ - uint64_t *input_states; /* State counter of inputs */ - CeedVector *e_vecs_in; /* Element block input E-vectors */ - CeedVector *e_vecs_out; /* Element block output E-vectors */ - CeedVector *q_vecs_in; /* Element block input Q-vectors */ - CeedVector *q_vecs_out; /* Element block output Q-vectors */ - CeedInt num_inputs, num_outputs; - CeedInt num_active_in, num_active_out; - CeedVector *qf_active_in; - CeedVector qf_l_vec; - CeedElemRestriction qf_blk_rstr; + bool is_identity_qf, is_identity_restr_op; + CeedElemRestriction *blk_restr; /* Blocked versions of restrictions */ + CeedVector *e_vecs_full; /* Full E-vectors, inputs followed by outputs */ + uint64_t *input_states; /* State counter of inputs */ + CeedVector *e_vecs_in; /* Element block input E-vectors */ + CeedVector *e_vecs_out; /* Element block output E-vectors */ + CeedVector *q_vecs_in; /* Element block input 
Q-vectors */ + CeedVector *q_vecs_out; /* Element block output Q-vectors */ + CeedInt num_inputs, num_outputs; + CeedInt num_active_in, num_active_out; + CeedVector *qf_active_in; + CeedVector qf_l_vec; + CeedElemRestriction qf_blk_rstr; } CeedOperator_Blocked; CEED_INTERN int CeedOperatorCreate_Blocked(CeedOperator op); -#endif // _ceed_blocked_h +#endif // _ceed_blocked_h diff --git a/backends/ceed-backend-weak.c b/backends/ceed-backend-weak.c index fb9d352118..6f191a7dd3 100644 --- a/backends/ceed-backend-weak.c +++ b/backends/ceed-backend-weak.c @@ -5,8 +5,8 @@ // // This file is part of CEED: http://github.com/ceed -#include #include +#include // LCOV_EXCL_START // This function provides improved error messages for uncompiled backends @@ -22,11 +22,11 @@ static int CeedRegister_Weak(const char *name, int num_prefixes, ...) { va_list prefixes; va_start(prefixes, num_prefixes); int ierr; - for (int i=0; i #include +#include #include #include + #include #include -#include "ceed-cuda-gen.h" -#include "../cuda/ceed-cuda-compile.h" + #include "../cuda-ref/ceed-cuda-ref.h" #include "../cuda-shared/ceed-cuda-shared.h" +#include "../cuda/ceed-cuda-compile.h" +#include "ceed-cuda-gen.h" //------------------------------------------------------------------------------ // Build singe operator kernel //------------------------------------------------------------------------------ extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) { - using std::ostringstream; using std::string; - int ierr; bool is_setup_done; - ierr = CeedOperatorIsSetupDone(op, &is_setup_done); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); if (is_setup_done) return CEED_ERROR_SUCCESS; Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedOperator_Cuda_gen *data; - ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr); - CeedQFunction qf; + CeedCallBackend(CeedOperatorGetData(op, 
&data)); + CeedQFunction qf; CeedQFunction_Cuda_gen *qf_data; - ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); - ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); CeedSize lsize; - CeedInt Q, P_1d = 0, Q_1d = 0, elem_size, num_input_fields, - num_output_fields, num_comp, dim = 1; - ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr); + CeedInt Q, P_1d = 0, Q_1d = 0, elem_size, num_input_fields, num_output_fields, num_comp, dim = 1; + CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); Q_1d = Q; CeedOperatorField *op_input_fields, *op_output_fields; - ierr = CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); CeedQFunctionField *qf_input_fields, *qf_output_fields; - ierr = CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields); - CeedChkBackend(ierr); - CeedEvalMode eval_mode; - CeedBasis basis; - CeedBasis_Cuda_shared *basis_data; - CeedElemRestriction Erestrict; + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); + CeedEvalMode eval_mode; + CeedBasis basis; + CeedBasis_Cuda_shared *basis_data; + CeedElemRestriction Erestrict; CeedElemRestriction_Cuda *restr_data; // TODO: put in a function? 
// Check for restriction only identity operator bool is_identity_qf; - ierr = CeedQFunctionIsIdentity(qf, &is_identity_qf); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf)); if (is_identity_qf) { CeedEvalMode eval_mode_in, eval_mode_out; - ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in); CeedChkBackend(ierr); - ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out)); if (eval_mode_in == CEED_EVAL_NONE && eval_mode_out == CEED_EVAL_NONE) // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Backend does not implement restriction only identity operators"); + return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement restriction only identity operators"); // LCOV_EXCL_STOP } @@ -74,47 +70,38 @@ extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) { // TODO: put in a function? 
// Add atomicAdd function for old NVidia architectures struct cudaDeviceProp prop; - Ceed_Cuda *ceed_data; - ierr = CeedGetData(ceed, &ceed_data); CeedChkBackend(ierr); CeedChkBackend(ierr); - ierr = cudaGetDeviceProperties(&prop, ceed_data->device_id); CeedChkBackend(ierr); - if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)){ + Ceed_Cuda *ceed_data; + CeedCallBackend(CeedGetData(ceed, &ceed_data)); + CeedCallBackend(cudaGetDeviceProperties(&prop, ceed_data->device_id)); + if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)) { char *atomic_add_path, *atomic_add_source; - ierr = CeedGetJitAbsolutePath(ceed, - "ceed/jit-source/cuda/cuda-atomic-add-fallback.h", - &atomic_add_path); CeedChkBackend(ierr); + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-atomic-add-fallback.h", &atomic_add_path)); CeedDebug256(ceed, 2, "----- Loading Atomic Add Source -----\n"); - ierr = CeedLoadSourceToBuffer(ceed, atomic_add_path, &atomic_add_source); - CeedChkBackend(ierr); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, atomic_add_path, &atomic_add_source)); code << atomic_add_source; - ierr = CeedFree(&atomic_add_path); CeedChkBackend(ierr); - ierr = CeedFree(&atomic_add_source); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&atomic_add_path)); + CeedCallBackend(CeedFree(&atomic_add_source)); } // Load basis source files // TODO: generalize to accept different device functions? 
{ char *tensor_basis_kernel_path, *tensor_basis_kernel_source; - ierr = CeedGetJitAbsolutePath(ceed, - "ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h", - &tensor_basis_kernel_path); CeedChkBackend(ierr); + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h", &tensor_basis_kernel_path)); CeedDebug256(ceed, 2, "----- Loading Tensor Basis Kernel Source -----\n"); - ierr = CeedLoadSourceToBuffer(ceed, tensor_basis_kernel_path, &tensor_basis_kernel_source); - CeedChkBackend(ierr); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, tensor_basis_kernel_path, &tensor_basis_kernel_source)); code << tensor_basis_kernel_source; - ierr = CeedFree(&tensor_basis_kernel_path); CeedChkBackend(ierr); - ierr = CeedFree(&tensor_basis_kernel_source); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&tensor_basis_kernel_path)); + CeedCallBackend(CeedFree(&tensor_basis_kernel_source)); } { char *cuda_gen_template_path, *cuda_gen_template_source; - ierr = CeedGetJitAbsolutePath(ceed, - "ceed/jit-source/cuda/cuda-gen-templates.h", - &cuda_gen_template_path); CeedChkBackend(ierr); + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-gen-templates.h", &cuda_gen_template_path)); CeedDebug256(ceed, 2, "----- Loading Cuda-Gen Template Source -----\n"); - ierr = CeedLoadSourceToBuffer(ceed, cuda_gen_template_path, &cuda_gen_template_source); - CeedChkBackend(ierr); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, cuda_gen_template_path, &cuda_gen_template_source)); code << cuda_gen_template_source; - ierr = CeedFree(&cuda_gen_template_path); CeedChkBackend(ierr); - ierr = CeedFree(&cuda_gen_template_source); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&cuda_gen_template_path)); + CeedCallBackend(CeedFree(&cuda_gen_template_source)); } // Get QFunction source and name @@ -126,19 +113,18 @@ extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) { // Find dim, P_1d, Q_1d data->max_P_1d = 0; for (CeedInt i = 0; i 
< num_input_fields; i++) { - ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); if (basis != CEED_BASIS_COLLOCATED) { - ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); - ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode); - CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetData(basis, &basis_data)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); // Collect dim, P_1d, and Q_1d - ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetDimension(basis, &dim)); bool isTensor; - ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisIsTensor(basis, &isTensor)); if (isTensor) { - ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); - ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); if (P_1d > data->max_P_1d) data->max_P_1d = P_1d; } else { // LCOV_EXCL_START @@ -150,19 +136,18 @@ extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) { // Check output bases for Q_1d, dim as well // The only input basis might be CEED_BASIS_COLLOCATED for (CeedInt i = 0; i < num_output_fields; i++) { - ierr = CeedOperatorFieldGetBasis(op_output_fields[i], &basis); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); if (basis != CEED_BASIS_COLLOCATED) { - ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); - ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); - CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetData(basis, &basis_data)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); // Collect Q_1d - ierr = CeedBasisGetDimension(basis, &dim); 
CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetDimension(basis, &dim)); bool isTensor; - ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisIsTensor(basis, &isTensor)); if (isTensor) { - ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); } else { // LCOV_EXCL_START return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); @@ -170,7 +155,7 @@ extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) { } } } - data->dim = dim; + data->dim = dim; data->Q_1d = Q_1d; // Only use 3D collocated gradient parallelization strategy when gradient is computed @@ -179,21 +164,21 @@ extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) { if (dim == 3) { bool was_grad_found = false; for (CeedInt i = 0; i < num_input_fields; i++) { - ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); if (eval_mode == CEED_EVAL_GRAD) { - ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); CeedChkBackend(ierr); - ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); + CeedCallBackend(CeedBasisGetData(basis, &basis_data)); use_collograd_parallelization = !!basis_data->d_collo_grad_1d && (was_grad_found ? 
use_collograd_parallelization : true); - was_grad_found = true; + was_grad_found = true; } } for (CeedInt i = 0; i < num_output_fields; i++) { - ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); if (eval_mode == CEED_EVAL_GRAD) { - ierr = CeedOperatorFieldGetBasis(op_output_fields[i], &basis); CeedChkBackend(ierr); - ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); + CeedCallBackend(CeedBasisGetData(basis, &basis_data)); use_collograd_parallelization = !!basis_data->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true); - was_grad_found = true; + was_grad_found = true; } } } @@ -203,28 +188,28 @@ extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) { if (dim != 3 || use_collograd_parallelization) { code << "#define CEED_Q_VLA 1\n\n"; } else { - code << "#define CEED_Q_VLA "<1?"*T_1D":"")<<";\n"; + code << " data.slice = slice+data.t_id_z*T_1D" << (dim > 1 ? "*T_1D" : "") << ";\n"; code << "\n // -- Input field constants and basis data --\n"; // TODO: Put in a function? - //Initialize constants, and matrices B and G + // Initialize constants, and matrices B and G for (CeedInt i = 0; i < num_input_fields; i++) { - code << " // ---- Input field "<B.inputs[i] = basis_data->d_interp_1d; - code << " __shared__ CeedScalar s_B_in_"<(data, B.inputs["<B.inputs[i] = basis_data->d_interp_1d; - code << " __shared__ CeedScalar s_B_in_"<(data, B.inputs["<G.inputs[i] = basis_data->d_collo_grad_1d; - code << " __shared__ CeedScalar s_G_in_"<(data, G.inputs["<d_collo_grad_1d; - data->G.inputs[i] = has_collo_grad ? 
basis_data->d_collo_grad_1d : basis_data->d_grad_1d; - code << " __shared__ CeedScalar s_G_in_"<(data, G.inputs["<B.inputs[i] = basis_data->d_interp_1d; + code << " __shared__ CeedScalar s_B_in_" << i << "[" << P_1d * Q_1d << "];\n"; + code << " loadMatrix(data, B.inputs[" << i << "], s_B_in_" << i << ");\n"; + break; + case CEED_EVAL_GRAD: + CeedCallBackend(CeedBasisGetData(basis, &basis_data)); + data->B.inputs[i] = basis_data->d_interp_1d; + code << " __shared__ CeedScalar s_B_in_" << i << "[" << P_1d * Q_1d << "];\n"; + code << " loadMatrix(data, B.inputs[" << i << "], s_B_in_" << i << ");\n"; + if (use_collograd_parallelization) { + data->G.inputs[i] = basis_data->d_collo_grad_1d; + code << " __shared__ CeedScalar s_G_in_" << i << "[" << Q_1d * Q_1d << "];\n"; + code << " loadMatrix(data, G.inputs[" << i << "], s_G_in_" << i << ");\n"; + } else { + bool has_collo_grad = !!basis_data->d_collo_grad_1d; + data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d; + code << " __shared__ CeedScalar s_G_in_" << i << "[" << Q_1d * (has_collo_grad ? Q_1d : P_1d) << "];\n"; + code << " loadMatrix<" << (has_collo_grad ? "Q_1d" : ("P_in_" + std::to_string(i))) << ",Q_1d>(data, G.inputs[" << i << "], s_G_in_" << i + << ");\n"; + } + break; + case CEED_EVAL_WEIGHT: + break; // No action + case CEED_EVAL_DIV: + break; // TODO: Not implemented + case CEED_EVAL_CURL: + break; // TODO: Not implemented } } code << "\n // -- Output field constants and basis data --\n"; for (CeedInt i = 0; i < num_output_fields; i++) { - code << " // ---- Output field "<B.outputs[i] = basis_data->d_interp_1d; - code << " __shared__ CeedScalar s_B_out_"<(data, B.outputs["<B.outputs[i] = basis_data->d_interp_1d; - code << " __shared__ CeedScalar s_B_out_"<(data, B.outputs["<G.outputs[i] = basis_data->d_collo_grad_1d; - code << " __shared__ CeedScalar s_G_out_"<(data, G.outputs["<d_collo_grad_1d; - data->G.outputs[i] = has_collo_grad ? 
basis_data->d_collo_grad_1d : basis_data->d_grad_1d; - code << " __shared__ CeedScalar s_G_out_"<(data, G.outputs["<B.outputs[i] = basis_data->d_interp_1d; + code << " __shared__ CeedScalar s_B_out_" << i << "[" << P_1d * Q_1d << "];\n"; + code << " loadMatrix(data, B.outputs[" << i << "], s_B_out_" << i << ");\n"; + break; + case CEED_EVAL_GRAD: + CeedCallBackend(CeedBasisGetData(basis, &basis_data)); + data->B.outputs[i] = basis_data->d_interp_1d; + code << " __shared__ CeedScalar s_B_out_" << i << "[" << P_1d * Q_1d << "];\n"; + code << " loadMatrix(data, B.outputs[" << i << "], s_B_out_" << i << ");\n"; + if (use_collograd_parallelization) { + data->G.outputs[i] = basis_data->d_collo_grad_1d; + code << " __shared__ CeedScalar s_G_out_" << i << "[" << Q_1d * Q_1d << "];\n"; + code << " loadMatrix(data, G.outputs[" << i << "], s_G_out_" << i << ");\n"; + } else { + bool has_collo_grad = !!basis_data->d_collo_grad_1d; + data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d; + code << " __shared__ CeedScalar s_G_out_" << i << "[" << Q_1d * (has_collo_grad ? Q_1d : P_1d) << "];\n"; + code << " loadMatrix<" << (has_collo_grad ? 
"Q_1d" : ("P_out_" + std::to_string(i))) << ",Q_1d>(data, G.outputs[" << i << "], s_G_out_" + << i << ");\n"; + } + break; + // LCOV_EXCL_START + case CEED_EVAL_WEIGHT: { + Ceed ceed; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); + break; // Should not occur } - break; - // LCOV_EXCL_START - case CEED_EVAL_WEIGHT: { - Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); - return CeedError(ceed, CEED_ERROR_BACKEND, - "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); - break; // Should not occur - } - case CEED_EVAL_DIV: - break; // TODO: Not implemented - case CEED_EVAL_CURL: - break; // TODO: Not implemented - // LCOV_EXCL_STOP + case CEED_EVAL_DIV: + break; // TODO: Not implemented + case CEED_EVAL_CURL: + break; // TODO: Not implemented + // LCOV_EXCL_STOP } } code << "\n // -- Element loop --\n"; @@ -370,88 +348,84 @@ extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) { // Generate the correct eval mode code for each input code << " // -- Input field restrictions and basis actions --\n"; for (CeedInt i = 0; i < num_input_fields; i++) { - code << " // ---- Input field "<indices.inputs[i] = restr_data->d_ind; - code << " readDofsOffset"<(data, lsize_in_"<(data, lsize_in_" << i + << ", elem, indices.inputs[" << i << "], d_u_" << i << ", r_u_" << i << ");\n"; } else { bool backendstrides; - ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); - CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides)); CeedInt num_elem; - ierr = CeedElemRestrictionGetNumElements(Erestrict, &num_elem); - CeedChkBackend(ierr); - CeedInt strides[3] = {1, elem_size*num_elem, elem_size}; + CeedCallBackend(CeedElemRestrictionGetNumElements(Erestrict, &num_elem)); + CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; if (!backendstrides) { - ierr = 
CeedElemRestrictionGetStrides(Erestrict, &strides); - CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetStrides(Erestrict, &strides)); } - code << " // Strides: {"<(data, elem, d_u_"<(data, elem, d_u_" << i << ", r_u_" << i << ");\n"; } } // TODO: put in a function? // Basis action - code << " // EvalMode: "<1?"Tensor":"")<(data, r_u_"<1?"Tensor":"")<(data, r_u_"<1?"Tensor":"")<<(dim==3&&Q_1d>=P_1d?"Collocated":"")<(data, r_u_"<W = basis_data->d_q_weight_1d; - code << " Weight"<<(dim>1?"Tensor":"")<(data, W, r_t_"< 1 ? "Tensor" : "") << dim << "d(data, r_u_" << i << ", s_B_in_" + << i << ", r_t_" << i << ");\n"; + break; + case CEED_EVAL_GRAD: + if (use_collograd_parallelization) { + code << " CeedScalar r_t_" << i << "[num_comp_in_" << i << "*Q_1d];\n"; + code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d(data, r_u_" << i + << ", s_B_in_" << i << ", r_t_" << i << ");\n"; + } else { + CeedInt P_1d; + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); + code << " CeedScalar r_t_" << i << "[num_comp_in_" << i << "*dim*Q_1d];\n"; + code << " Grad" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d(data, r_u_" << i << ", s_B_in_" << i << ", s_G_in_" << i << ", r_t_" << i << ");\n"; + } + break; + case CEED_EVAL_WEIGHT: + code << " CeedScalar r_t_" << i << "[Q_1d];\n"; + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); + CeedCallBackend(CeedBasisGetData(basis, &basis_data)); + data->W = basis_data->d_q_weight_1d; + code << " Weight" << (dim > 1 ? 
"Tensor" : "") << dim << "d(data, W, r_t_" << i << ");\n"; + break; // No action + case CEED_EVAL_DIV: + break; // TODO: Not implemented + case CEED_EVAL_CURL: + break; // TODO: Not implemented } } @@ -459,26 +433,23 @@ extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) { // Q function code << "\n // -- Output field setup --\n"; for (CeedInt i = 0; i < num_output_fields; i++) { - code << "\n // ---- Output field "<indices.inputs[i] = restr_data->d_ind; - code << " readSliceQuadsOffset"<<"3d(data, lsize_in_"<indices.inputs[i] = restr_data->d_ind; + code << " readSliceQuadsOffset" + << "3d(data, lsize_in_" << i << ", elem, q, indices.inputs[" << i << "], d_u_" + << i << ", r_q_" << i << ");\n"; + } else { + CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elem_size)); + bool backendstrides; + CeedCallBackend(CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides)); + CeedInt num_elem; + CeedCallBackend(CeedElemRestrictionGetNumElements(Erestrict, &num_elem)); + CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; + if (!backendstrides) { + CeedCallBackend(CeedElemRestrictionGetStrides(Erestrict, &strides)); + } + code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; + code << " readSliceQuadsStrided" + << "3d(data, elem, q, d_u_" << i << ", r_q_" << i << ");\n"; } - code << " // Strides: {"<(data, elem, q, d_u_"<(data, q, r_t_"<(data, q, r_t_" << i << ", s_G_in_" << i << ", r_q_" << i << ");\n"; + break; + case CEED_EVAL_WEIGHT: + code << " CeedScalar r_q_" << i << "[1];\n"; + code << " r_q_" << i << "[0] = r_t_" << i << "[q];\n"; + break; // No action + case CEED_EVAL_DIV: + break; // TODO: Not implemented + case CEED_EVAL_CURL: + break; // TODO: Not implemented } } code << "\n // -- Output fields --\n"; for (CeedInt i = 0; i < num_output_fields; i++) { - code << " // ---- Output field "<(data, q, r_qq_"<(data, q, r_qq_" << i << ", s_G_out_" << i << ", r_tt_" << i << ");\n"; + break; + case 
CEED_EVAL_WEIGHT: + break; // Should not occur + case CEED_EVAL_DIV: + break; // TODO: Not implemented + case CEED_EVAL_CURL: + break; // TODO: Not implemented } } code << " }\n"; @@ -641,80 +611,76 @@ extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) { // Generate the correct eval mode code for each output code << "\n // -- Output field basis action and restrictions --\n"; for (CeedInt i = 0; i < num_output_fields; i++) { - code << " // ---- Output field "<1?"Tensor":"")<(data, r_tt_"<1?"Tensor":"")<(data, r_tt_"<1?"Tensor":"")<<(dim==3&&Q_1d>=P_1d?"Collocated":"")<(data, r_tt_"< 1 ? "Tensor" : "") << dim << "d(data, r_tt_" << i + << ", s_B_out_" << i << ", r_v_" << i << ");\n"; + break; + case CEED_EVAL_GRAD: + code << " CeedScalar r_v_" << i << "[num_comp_out_" << i << "*P_out_" << i << "];\n"; + if (use_collograd_parallelization) { + code << " InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d(data, r_tt_" << i + << ", s_B_out_" << i << ", r_v_" << i << ");\n"; + } else { + CeedInt P_1d; + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); + code << " GradTranspose" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? 
"Collocated" : "") << dim << "d(data, r_tt_" << i << ", s_B_out_" << i << ", s_G_out_" << i << ", r_v_" << i << ");\n"; + } + break; + // LCOV_EXCL_START + case CEED_EVAL_WEIGHT: { + Ceed ceed; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); + break; // Should not occur } - break; - // LCOV_EXCL_START - case CEED_EVAL_WEIGHT: { - Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); - return CeedError(ceed, CEED_ERROR_BACKEND, - "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); - break; // Should not occur - } - case CEED_EVAL_DIV: - break; // TODO: Not implemented - case CEED_EVAL_CURL: - break; // TODO: Not implemented - // LCOV_EXCL_STOP + case CEED_EVAL_DIV: + break; // TODO: Not implemented + case CEED_EVAL_CURL: + break; // TODO: Not implemented + // LCOV_EXCL_STOP } // TODO put in a function // Restriction - bool is_strided; - ierr = CeedElemRestrictionIsStrided(Erestrict, &is_strided); CeedChkBackend(ierr); + bool is_strided; + CeedCallBackend(CeedElemRestrictionIsStrided(Erestrict, &is_strided)); if (!is_strided) { - ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); - CeedChkBackend(ierr); - code << " const CeedInt lsize_out_"<indices.outputs[i] = restr_data->d_ind; - code << " writeDofsOffset"<(data, lsize_out_"<(data, lsize_out_" << i + << ", elem, indices.outputs[" << i << "], r_v_" << i << ", d_v_" << i << ");\n"; } else { bool has_backend_strides; - ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides); - CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides)); CeedInt num_elem; - ierr = CeedElemRestrictionGetNumElements(Erestrict, &num_elem); - CeedChkBackend(ierr); - CeedInt strides[3] = {1, elem_size*num_elem, elem_size}; + CeedCallBackend(CeedElemRestrictionGetNumElements(Erestrict, &num_elem)); + CeedInt strides[3] = {1, 
elem_size * num_elem, elem_size}; if (!has_backend_strides) { - ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); - CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetStrides(Erestrict, &strides)); } - code << " // Strides: {"<(data, elem, r_v_"<(data, elem, r_v_" << i << ", d_v_" << i << ");\n"; } } @@ -726,13 +692,10 @@ extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) { CeedDebug256(ceed, 2, "Generated Operator Kernels:\n"); CeedDebug(ceed, code.str().c_str()); - ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 1, - "T_1D", CeedIntMax(Q_1d, data->max_P_1d)); - CeedChkBackend(ierr); - ierr = CeedGetKernelCuda(ceed, data->module, operator_name.c_str(), &data->op); - CeedChkBackend(ierr); + CeedCallBackend(CeedCompileCuda(ceed, code.str().c_str(), &data->module, 1, "T_1D", CeedIntMax(Q_1d, data->max_P_1d))); + CeedCallBackend(CeedGetKernelCuda(ceed, data->module, operator_name.c_str(), &data->op)); - ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorSetSetupDone(op)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.h b/backends/cuda-gen/ceed-cuda-gen-operator-build.h index 13f025ce2e..62c0cd22db 100644 --- a/backends/cuda-gen/ceed-cuda-gen-operator-build.h +++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.h @@ -10,4 +10,4 @@ CEED_INTERN int CeedCudaGenOperatorBuild(CeedOperator op); -#endif // _ceed_cuda_gen_operator_build_h +#endif // _ceed_cuda_gen_operator_build_h diff --git a/backends/cuda-gen/ceed-cuda-gen-operator.c b/backends/cuda-gen/ceed-cuda-gen-operator.c index 89b64772a3..868ecfee51 100644 --- a/backends/cuda-gen/ceed-cuda-gen-operator.c +++ b/backends/cuda-gen/ceed-cuda-gen-operator.c @@ -5,30 +5,28 @@ // // This file is part of CEED: http://github.com/ceed -#include #include +#include #include -#include "ceed-cuda-gen.h" -#include 
"ceed-cuda-gen-operator-build.h" + #include "../cuda/ceed-cuda-compile.h" +#include "ceed-cuda-gen-operator-build.h" +#include "ceed-cuda-gen.h" //------------------------------------------------------------------------------ // Destroy operator //------------------------------------------------------------------------------ static int CeedOperatorDestroy_Cuda_gen(CeedOperator op) { - int ierr; CeedOperator_Cuda_gen *impl; - ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); - ierr = CeedFree(&impl); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetData(op, &impl)); + CeedCallBackend(CeedFree(&impl)); return CEED_ERROR_SUCCESS; } -static int Waste(int threads_per_sm, int warp_size, int threads_per_elem, - int elems_per_block) { +static int Waste(int threads_per_sm, int warp_size, int threads_per_elem, int elems_per_block) { int useful_threads_per_block = threads_per_elem * elems_per_block; // round up to nearest multiple of warp_size - int block_size = ((useful_threads_per_block + warp_size - 1) / warp_size) * - warp_size; + int block_size = ((useful_threads_per_block + warp_size - 1) / warp_size) * warp_size; int blocks_per_sm = threads_per_sm / block_size; return threads_per_sm - useful_threads_per_block * blocks_per_sm; } @@ -71,30 +69,27 @@ static int Waste(int threads_per_sm, int warp_size, int threads_per_elem, // pack a single block of 7 elements (2*49=343 useful threads) into the 354 // slots. The latter has the least "waste", but __syncthreads() // over-synchronizes and it might not pay off relative to smaller blocks. 
-static int BlockGridCalculate(CeedInt num_elem, int blocks_per_sm, - int max_threads_per_block, int max_threads_z, - int warp_size, int block[3], int *grid) { - const int threads_per_sm = blocks_per_sm * max_threads_per_block; +static int BlockGridCalculate(CeedInt num_elem, int blocks_per_sm, int max_threads_per_block, int max_threads_z, int warp_size, int block[3], + int *grid) { + const int threads_per_sm = blocks_per_sm * max_threads_per_block; const int threads_per_elem = block[0] * block[1]; - int elems_per_block = 1; - int waste = Waste(threads_per_sm, warp_size, threads_per_elem, 1); - for (int i = 2; - i <= CeedIntMin(max_threads_per_block / threads_per_elem, num_elem); - i++) { + int elems_per_block = 1; + int waste = Waste(threads_per_sm, warp_size, threads_per_elem, 1); + for (int i = 2; i <= CeedIntMin(max_threads_per_block / threads_per_elem, num_elem); i++) { int i_waste = Waste(threads_per_sm, warp_size, threads_per_elem, i); // We want to minimize waste, but smaller kernels have lower latency and // less __syncthreads() overhead so when a larger block size has the same // waste as a smaller one, go ahead and prefer the smaller block. if (i_waste < waste || (i_waste == waste && threads_per_elem * i <= 128)) { elems_per_block = i; - waste = i_waste; + waste = i_waste; } } // In low-order elements, threads_per_elem may be sufficiently low to give // an elems_per_block greater than allowable for the device, so we must check // before setting the z-dimension size of the block. 
block[2] = CeedIntMin(elems_per_block, max_threads_z); - *grid = (num_elem + elems_per_block - 1) / elems_per_block; + *grid = (num_elem + elems_per_block - 1) / elems_per_block; return CEED_ERROR_SUCCESS; } @@ -105,61 +100,51 @@ static size_t dynamicSMemSize(int threads) { return threads * sizeof(CeedScalar) //------------------------------------------------------------------------------ // Apply and add to output //------------------------------------------------------------------------------ -static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, - CeedVector output_vec, CeedRequest *request) { - int ierr; +static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) { Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); Ceed_Cuda *cuda_data; - ierr = CeedGetData(ceed, &cuda_data); CeedChkBackend(ierr); + CeedCallBackend(CeedGetData(ceed, &cuda_data)); CeedOperator_Cuda_gen *data; - ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr); - CeedQFunction qf; + CeedCallBackend(CeedOperatorGetData(op, &data)); + CeedQFunction qf; CeedQFunction_Cuda_gen *qf_data; - ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); - ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); CeedInt num_elem, num_input_fields, num_output_fields; - ierr = CeedOperatorGetNumElements(op, &num_elem); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); CeedOperatorField *op_input_fields, *op_output_fields; - ierr = CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, - &num_output_fields, &op_output_fields); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); 
CeedQFunctionField *qf_input_fields, *qf_output_fields; - ierr = CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, - &qf_output_fields); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); CeedEvalMode eval_mode; - CeedVector vec, output_vecs[CEED_FIELD_MAX] = {}; + CeedVector vec, output_vecs[CEED_FIELD_MAX] = {}; // Creation of the operator - ierr = CeedCudaGenOperatorBuild(op); CeedChkBackend(ierr); + CeedCallBackend(CeedCudaGenOperatorBuild(op)); // Input vectors for (CeedInt i = 0; i < num_input_fields; i++) { - ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode); - CeedChkBackend(ierr); - if (eval_mode == CEED_EVAL_WEIGHT) { // Skip + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + if (eval_mode == CEED_EVAL_WEIGHT) { // Skip data->fields.inputs[i] = NULL; } else { // Get input vector - ierr = CeedOperatorFieldGetVector(op_input_fields[i], &vec); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) vec = input_vec; - ierr = CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i]); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i])); } } // Output vectors + for (CeedInt i = 0; i < num_output_fields; i++) { - ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); - CeedChkBackend(ierr); - if (eval_mode == CEED_EVAL_WEIGHT) { // Skip + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + if (eval_mode == CEED_EVAL_WEIGHT) { // Skip data->fields.outputs[i] = NULL; } else { // Get output vector - ierr = CeedOperatorFieldGetVector(op_output_fields[i], &vec); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) vec = output_vec; output_vecs[i] = vec; // Check for multiple 
output modes @@ -171,8 +156,7 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, } } if (index == -1) { - ierr = CeedVectorGetArray(vec, CEED_MEM_DEVICE, &data->fields.outputs[i]); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArray(vec, CEED_MEM_DEVICE, &data->fields.outputs[i])); } else { data->fields.outputs[i] = data->fields.outputs[index]; } @@ -180,53 +164,46 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, } // Get context data - ierr = CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c)); // Apply operator - void *opargs[] = {(void *) &num_elem, &qf_data->d_c, &data->indices, - &data->fields, &data->B, &data->G, &data->W - }; - const CeedInt dim = data->dim; - const CeedInt Q_1d = data->Q_1d; - const CeedInt P_1d = data->max_P_1d; + + void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W}; + const CeedInt dim = data->dim; + const CeedInt Q_1d = data->Q_1d; + const CeedInt P_1d = data->max_P_1d; const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); - int max_threads_per_block, min_grid_size; - CeedChk_Cu(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, - &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000)); - int block[3] = {thread_1d, dim < 2 ? 1 : thread_1d, -1,}, grid; - CeedChkBackend(BlockGridCalculate(num_elem, - min_grid_size/ cuda_data->device_prop.multiProcessorCount, - max_threads_per_block, - cuda_data->device_prop.maxThreadsDim[2], - cuda_data->device_prop.warpSize, block, &grid)); + int max_threads_per_block, min_grid_size; + CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000)); + int block[3] = + { + thread_1d, + dim < 2 ? 
1 : thread_1d, + -1, + }, + grid; + CeedChkBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, max_threads_per_block, + cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid)); CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar); - ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, block[0], block[1], - block[2], shared_mem, opargs); - CeedChkBackend(ierr); + CeedCallBackend(CeedRunKernelDimSharedCuda(ceed, data->op, grid, block[0], block[1], block[2], shared_mem, opargs)); // Restore input arrays for (CeedInt i = 0; i < num_input_fields; i++) { - ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode); - CeedChkBackend(ierr); - if (eval_mode == CEED_EVAL_WEIGHT) { // Skip + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + if (eval_mode == CEED_EVAL_WEIGHT) { // Skip } else { - ierr = CeedOperatorFieldGetVector(op_input_fields[i], &vec); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) vec = input_vec; - ierr = CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i]); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i])); } } // Restore output arrays for (CeedInt i = 0; i < num_output_fields; i++) { - ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); - CeedChkBackend(ierr); - if (eval_mode == CEED_EVAL_WEIGHT) { // Skip + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + if (eval_mode == CEED_EVAL_WEIGHT) { // Skip } else { - ierr = CeedOperatorFieldGetVector(op_output_fields[i], &vec); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) vec = output_vec; // Check for multiple output modes CeedInt index = -1; @@ -237,15 +214,13 @@ static int 
CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, } } if (index == -1) { - ierr = CeedVectorRestoreArray(vec, &data->fields.outputs[i]); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArray(vec, &data->fields.outputs[i])); } } } // Restore context data - ierr = CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c)); return CEED_ERROR_SUCCESS; } @@ -254,18 +229,15 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, // Create operator //------------------------------------------------------------------------------ int CeedOperatorCreate_Cuda_gen(CeedOperator op) { - int ierr; Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedOperator_Cuda_gen *impl; - ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr); - ierr = CeedOperatorSetData(op, impl); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(1, &impl)); + CeedCallBackend(CeedOperatorSetData(op, impl)); - ierr = CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", - CeedOperatorApplyAdd_Cuda_gen); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Operator", op, "Destroy", - CeedOperatorDestroy_Cuda_gen); CeedChkBackend(ierr); + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Cuda_gen)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Cuda_gen)); return CEED_ERROR_SUCCESS; } diff --git a/backends/cuda-gen/ceed-cuda-gen-qfunction.c b/backends/cuda-gen/ceed-cuda-gen-qfunction.c index f06e25e613..36e634064a 100644 --- a/backends/cuda-gen/ceed-cuda-gen-qfunction.c +++ b/backends/cuda-gen/ceed-cuda-gen-qfunction.c @@ -5,37 +5,34 @@ // // This file is part of CEED: http://github.com/ceed -#include #include +#include #include #include #include + #include "ceed-cuda-gen.h" 
//------------------------------------------------------------------------------ // Apply QFunction //------------------------------------------------------------------------------ -static int CeedQFunctionApply_Cuda_gen(CeedQFunction qf, CeedInt Q, - CeedVector *U, CeedVector *V) { - int ierr; +static int CeedQFunctionApply_Cuda_gen(CeedQFunction qf, CeedInt Q, CeedVector *U, CeedVector *V) { Ceed ceed; - ierr = CeedQFunctionGetCeed(qf, &ceed); CeedChkBackend(ierr); - return CeedError(ceed, CEED_ERROR_BACKEND, - "Backend does not implement QFunctionApply"); + CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); + return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement QFunctionApply"); } //------------------------------------------------------------------------------ // Destroy QFunction //------------------------------------------------------------------------------ static int CeedQFunctionDestroy_Cuda_gen(CeedQFunction qf) { - int ierr; CeedQFunction_Cuda_gen *data; - ierr = CeedQFunctionGetData(qf, &data); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetData(qf, &data)); Ceed ceed; - ierr = CeedQFunctionGetCeed(qf, &ceed); CeedChkBackend(ierr); - ierr = cudaFree(data->d_c); CeedChk_Cu(ceed, ierr); - ierr = CeedFree(&data->q_function_source); CeedChkBackend(ierr); - ierr = CeedFree(&data); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); + CeedCallCuda(ceed, cudaFree(data->d_c)); + CeedCallBackend(CeedFree(&data->q_function_source)); + CeedCallBackend(CeedFree(&data)); return CEED_ERROR_SUCCESS; } @@ -43,30 +40,25 @@ static int CeedQFunctionDestroy_Cuda_gen(CeedQFunction qf) { // Create QFunction //------------------------------------------------------------------------------ int CeedQFunctionCreate_Cuda_gen(CeedQFunction qf) { - int ierr; Ceed ceed; CeedQFunctionGetCeed(qf, &ceed); CeedQFunction_Cuda_gen *data; - ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); - ierr = CeedQFunctionSetData(qf, data); 
CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(1, &data)); + CeedCallBackend(CeedQFunctionSetData(qf, data)); // Read QFunction source - ierr = CeedQFunctionGetKernelName(qf, &data->q_function_name); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetKernelName(qf, &data->q_function_name)); CeedDebug256(ceed, 2, "----- Loading QFunction User Source -----\n"); - ierr = CeedQFunctionLoadSourceToBuffer(qf, &data->q_function_source); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionLoadSourceToBuffer(qf, &data->q_function_source)); CeedDebug256(ceed, 2, "----- Loading QFunction User Source Complete! -----\n"); - if (!data->q_function_source) + if (!data->q_function_source) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, - "/gpu/cuda/gen backend requires QFunction source code file"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "/gpu/cuda/gen backend requires QFunction source code file"); + // LCOV_EXCL_STOP + } - ierr = CeedSetBackendFunction(ceed, "QFunction", qf, "Apply", - CeedQFunctionApply_Cuda_gen); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "QFunction", qf, "Destroy", - CeedQFunctionDestroy_Cuda_gen); CeedChkBackend(ierr); + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Cuda_gen)); + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Cuda_gen)); return CEED_ERROR_SUCCESS; } diff --git a/backends/cuda-gen/ceed-cuda-gen.c b/backends/cuda-gen/ceed-cuda-gen.c index 6038602879..7321936ccf 100644 --- a/backends/cuda-gen/ceed-cuda-gen.c +++ b/backends/cuda-gen/ceed-cuda-gen.c @@ -5,54 +5,45 @@ // // This file is part of CEED: http://github.com/ceed -#include +#include "ceed-cuda-gen.h" + #include +#include #include -#include "ceed-cuda-gen.h" //------------------------------------------------------------------------------ // Backend init 
//------------------------------------------------------------------------------ static int CeedInit_Cuda_gen(const char *resource, Ceed ceed) { - int ierr; - char *resource_root; - ierr = CeedCudaGetResourceRoot(ceed, resource, &resource_root); - CeedChkBackend(ierr); - if (strcmp(resource_root, "/gpu/cuda") - && strcmp(resource_root, "/gpu/cuda/gen")) + CeedCallBackend(CeedCudaGetResourceRoot(ceed, resource, &resource_root)); + if (strcmp(resource_root, "/gpu/cuda") && strcmp(resource_root, "/gpu/cuda/gen")) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Cuda backend cannot use resource: %s", resource); - // LCOV_EXCL_STOP - ierr = CeedFree(&resource_root); CeedChkBackend(ierr); + return CeedError(ceed, CEED_ERROR_BACKEND, "Cuda backend cannot use resource: %s", resource); + // LCOV_EXCL_STOP + } + CeedCallBackend(CeedFree(&resource_root)); Ceed_Cuda *data; - ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); - ierr = CeedSetData(ceed, data); CeedChkBackend(ierr); - ierr = CeedCudaInit(ceed, resource); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(1, &data)); + CeedCallBackend(CeedSetData(ceed, data)); + CeedCallBackend(CeedCudaInit(ceed, resource)); Ceed ceedshared; - CeedInit("/gpu/cuda/shared", &ceedshared); - ierr = CeedSetDelegate(ceed, ceedshared); CeedChkBackend(ierr); + CeedCall(CeedInit("/gpu/cuda/shared", &ceedshared)); + CeedCallBackend(CeedSetDelegate(ceed, ceedshared)); const char fallbackresource[] = "/gpu/cuda/ref"; - ierr = CeedSetOperatorFallbackResource(ceed, fallbackresource); - CeedChkBackend(ierr); - - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", - CeedQFunctionCreate_Cuda_gen); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", - CeedOperatorCreate_Cuda_gen); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", - CeedDestroy_Cuda); CeedChkBackend(ierr); + CeedCallBackend(CeedSetOperatorFallbackResource(ceed, 
fallbackresource)); + + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Cuda_gen)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Cuda_gen)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Cuda)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Register backend //------------------------------------------------------------------------------ -CEED_INTERN int CeedRegister_Cuda_Gen(void) { - return CeedRegister("/gpu/cuda/gen", CeedInit_Cuda_gen, 20); -} +CEED_INTERN int CeedRegister_Cuda_Gen(void) { return CeedRegister("/gpu/cuda/gen", CeedInit_Cuda_gen, 20); } //------------------------------------------------------------------------------ diff --git a/backends/cuda-gen/ceed-cuda-gen.h b/backends/cuda-gen/ceed-cuda-gen.h index 8d942c63d7..7db9b0f967 100644 --- a/backends/cuda-gen/ceed-cuda-gen.h +++ b/backends/cuda-gen/ceed-cuda-gen.h @@ -8,22 +8,23 @@ #ifndef _ceed_cuda_gen_h #define _ceed_cuda_gen_h -#include #include +#include #include + #include "../cuda/ceed-cuda-common.h" typedef struct { - CeedInt dim; - CeedInt Q_1d; - CeedInt max_P_1d; - CUmodule module; - CUfunction op; + CeedInt dim; + CeedInt Q_1d; + CeedInt max_P_1d; + CUmodule module; + CUfunction op; FieldsInt_Cuda indices; - Fields_Cuda fields; - Fields_Cuda B; - Fields_Cuda G; - CeedScalar *W; + Fields_Cuda fields; + Fields_Cuda B; + Fields_Cuda G; + CeedScalar *W; } CeedOperator_Cuda_gen; typedef struct { @@ -36,4 +37,4 @@ CEED_INTERN int CeedQFunctionCreate_Cuda_gen(CeedQFunction qf); CEED_INTERN int CeedOperatorCreate_Cuda_gen(CeedOperator op); -#endif // _ceed_cuda_gen_h +#endif // _ceed_cuda_gen_h diff --git a/backends/cuda-ref/ceed-cuda-ref-basis.c b/backends/cuda-ref/ceed-cuda-ref-basis.c index d9343a7839..de7bf0088d 100644 --- a/backends/cuda-ref/ceed-cuda-ref-basis.c +++ 
b/backends/cuda-ref/ceed-cuda-ref-basis.c @@ -5,185 +5,159 @@ // // This file is part of CEED: http://github.com/ceed -#include #include +#include #include #include #include -#include "ceed-cuda-ref.h" + #include "../cuda/ceed-cuda-compile.h" +#include "ceed-cuda-ref.h" //------------------------------------------------------------------------------ // Basis apply - tensor //------------------------------------------------------------------------------ -int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, - CeedTransposeMode t_mode, CeedEvalMode eval_mode, - CeedVector u, CeedVector v) { - int ierr; +int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { Ceed ceed; - ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); Ceed_Cuda *ceed_Cuda; - ierr = CeedGetData(ceed, &ceed_Cuda); CeedChkBackend(ierr); + CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); CeedBasis_Cuda *data; - ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); - const CeedInt transpose = t_mode == CEED_TRANSPOSE; - const int max_block_size = 32; + CeedCallBackend(CeedBasisGetData(basis, &data)); + const CeedInt transpose = t_mode == CEED_TRANSPOSE; + const int max_block_size = 32; // Read vectors const CeedScalar *d_u; - CeedScalar *d_v; + CeedScalar *d_v; if (eval_mode != CEED_EVAL_WEIGHT) { - ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); } - ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); // Clear v for transpose operation if (t_mode == CEED_TRANSPOSE) { CeedSize length; - ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr); - ierr = cudaMemset(d_v, 0, length * sizeof(CeedScalar)); - CeedChk_Cu(ceed, ierr); + 
CeedCallBackend(CeedVectorGetLength(v, &length)); + CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar))); } CeedInt Q_1d, dim; - ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); - ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); + CeedCallBackend(CeedBasisGetDimension(basis, &dim)); // Basis action switch (eval_mode) { - case CEED_EVAL_INTERP: { - void *interp_args[] = {(void *) &num_elem, (void *) &transpose, - &data->d_interp_1d, &d_u, &d_v - }; - CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); - - ierr = CeedRunKernelCuda(ceed, data->Interp, num_elem, block_size, interp_args); - CeedChkBackend(ierr); - } break; - case CEED_EVAL_GRAD: { - void *grad_args[] = {(void *) &num_elem, (void *) &transpose, &data->d_interp_1d, - &data->d_grad_1d, &d_u, &d_v - }; - CeedInt block_size = max_block_size; - - ierr = CeedRunKernelCuda(ceed, data->Grad, num_elem, block_size, grad_args); - CeedChkBackend(ierr); - } break; - case CEED_EVAL_WEIGHT: { - void *weight_args[] = {(void *) &num_elem, (void *) &data->d_q_weight_1d, &d_v}; - const int grid_size = num_elem; - ierr = CeedRunKernelDimCuda(ceed, data->Weight, grid_size, - Q_1d, dim >= 2 ? 
Q_1d : 1, 1, - weight_args); CeedChkBackend(ierr); - } break; - // LCOV_EXCL_START - // Evaluate the divergence to/from the quadrature points - case CEED_EVAL_DIV: - return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); - // Evaluate the curl to/from the quadrature points - case CEED_EVAL_CURL: - return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); - // Take no action, BasisApply should not have been called - case CEED_EVAL_NONE: - return CeedError(ceed, CEED_ERROR_BACKEND, - "CEED_EVAL_NONE does not make sense in this context"); - // LCOV_EXCL_STOP + case CEED_EVAL_INTERP: { + void *interp_args[] = {(void *)&num_elem, (void *)&transpose, &data->d_interp_1d, &d_u, &d_v}; + CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); + + CeedCallBackend(CeedRunKernelCuda(ceed, data->Interp, num_elem, block_size, interp_args)); + } break; + case CEED_EVAL_GRAD: { + void *grad_args[] = {(void *)&num_elem, (void *)&transpose, &data->d_interp_1d, &data->d_grad_1d, &d_u, &d_v}; + CeedInt block_size = max_block_size; + + CeedCallBackend(CeedRunKernelCuda(ceed, data->Grad, num_elem, block_size, grad_args)); + } break; + case CEED_EVAL_WEIGHT: { + void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; + const int grid_size = num_elem; + CeedCallBackend(CeedRunKernelDimCuda(ceed, data->Weight, grid_size, Q_1d, dim >= 2 ? 
Q_1d : 1, 1, weight_args)); + } break; + // LCOV_EXCL_START + // Evaluate the divergence to/from the quadrature points + case CEED_EVAL_DIV: + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); + // Evaluate the curl to/from the quadrature points + case CEED_EVAL_CURL: + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); + // Take no action, BasisApply should not have been called + case CEED_EVAL_NONE: + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context"); + // LCOV_EXCL_STOP } // Restore vectors if (eval_mode != CEED_EVAL_WEIGHT) { - ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); } - ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Basis apply - non-tensor //------------------------------------------------------------------------------ -int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, - CeedTransposeMode t_mode, CeedEvalMode eval_mode, - CeedVector u, CeedVector v) { - int ierr; +int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, + CeedVector v) { Ceed ceed; - ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); Ceed_Cuda *ceed_Cuda; - ierr = CeedGetData(ceed, &ceed_Cuda); CeedChkBackend(ierr); + CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); CeedBasisNonTensor_Cuda *data; - ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetData(basis, &data)); CeedInt num_nodes, num_qpts; - ierr = CeedBasisGetNumQuadraturePoints(basis, &num_qpts); CeedChkBackend(ierr); - ierr = CeedBasisGetNumNodes(basis, &num_nodes); 
CeedChkBackend(ierr); - const CeedInt transpose = t_mode == CEED_TRANSPOSE; - int elems_per_block = 1; - int grid = num_elem / elems_per_block + - ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); + CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); + const CeedInt transpose = t_mode == CEED_TRANSPOSE; + int elems_per_block = 1; + int grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); // Read vectors const CeedScalar *d_u; - CeedScalar *d_v; + CeedScalar *d_v; if (eval_mode != CEED_EVAL_WEIGHT) { - ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); } - ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); // Clear v for transpose operation if (t_mode == CEED_TRANSPOSE) { CeedSize length; - ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr); - ierr = cudaMemset(d_v, 0, length * sizeof(CeedScalar)); - CeedChk_Cu(ceed, ierr); + CeedCallBackend(CeedVectorGetLength(v, &length)); + CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar))); } // Apply basis operation switch (eval_mode) { - case CEED_EVAL_INTERP: { - void *interp_args[] = {(void *) &num_elem, (void *) &transpose, - &data->d_interp, &d_u, &d_v - }; - if (transpose) { - ierr = CeedRunKernelDimCuda(ceed, data->Interp, grid, num_nodes, 1, - elems_per_block, interp_args); CeedChkBackend(ierr); - } else { - ierr = CeedRunKernelDimCuda(ceed, data->Interp, grid, num_qpts, 1, - elems_per_block, interp_args); CeedChkBackend(ierr); - } - } break; - case CEED_EVAL_GRAD: { - void *grad_args[] = {(void *) &num_elem, (void *) &transpose, &data->d_grad, - &d_u, &d_v - }; - if (transpose) { - ierr = CeedRunKernelDimCuda(ceed, data->Grad, grid, num_nodes, 1, - 
elems_per_block, grad_args); CeedChkBackend(ierr); - } else { - ierr = CeedRunKernelDimCuda(ceed, data->Grad, grid, num_qpts, 1, - elems_per_block, grad_args); CeedChkBackend(ierr); - } - } break; - case CEED_EVAL_WEIGHT: { - void *weight_args[] = {(void *) &num_elem, (void *) &data->d_q_weight, &d_v}; - ierr = CeedRunKernelDimCuda(ceed, data->Weight, grid, num_qpts, 1, - elems_per_block, weight_args); CeedChkBackend(ierr); - } break; - // LCOV_EXCL_START - // Evaluate the divergence to/from the quadrature points - case CEED_EVAL_DIV: - return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); - // Evaluate the curl to/from the quadrature points - case CEED_EVAL_CURL: - return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); - // Take no action, BasisApply should not have been called - case CEED_EVAL_NONE: - return CeedError(ceed, CEED_ERROR_BACKEND, - "CEED_EVAL_NONE does not make sense in this context"); - // LCOV_EXCL_STOP + case CEED_EVAL_INTERP: { + void *interp_args[] = {(void *)&num_elem, (void *)&transpose, &data->d_interp, &d_u, &d_v}; + if (transpose) { + CeedCallBackend(CeedRunKernelDimCuda(ceed, data->Interp, grid, num_nodes, 1, elems_per_block, interp_args)); + } else { + CeedCallBackend(CeedRunKernelDimCuda(ceed, data->Interp, grid, num_qpts, 1, elems_per_block, interp_args)); + } + } break; + case CEED_EVAL_GRAD: { + void *grad_args[] = {(void *)&num_elem, (void *)&transpose, &data->d_grad, &d_u, &d_v}; + if (transpose) { + CeedCallBackend(CeedRunKernelDimCuda(ceed, data->Grad, grid, num_nodes, 1, elems_per_block, grad_args)); + } else { + CeedCallBackend(CeedRunKernelDimCuda(ceed, data->Grad, grid, num_qpts, 1, elems_per_block, grad_args)); + } + } break; + case CEED_EVAL_WEIGHT: { + void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight, &d_v}; + CeedCallBackend(CeedRunKernelDimCuda(ceed, data->Weight, grid, num_qpts, 1, elems_per_block, weight_args)); + } break; + // LCOV_EXCL_START + // Evaluate 
the divergence to/from the quadrature points + case CEED_EVAL_DIV: + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); + // Evaluate the curl to/from the quadrature points + case CEED_EVAL_CURL: + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); + // Take no action, BasisApply should not have been called + case CEED_EVAL_NONE: + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context"); + // LCOV_EXCL_STOP } // Restore vectors if (eval_mode != CEED_EVAL_WEIGHT) { - ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); } - ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); return CEED_ERROR_SUCCESS; } @@ -191,19 +165,18 @@ int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, // Destroy tensor basis //------------------------------------------------------------------------------ static int CeedBasisDestroy_Cuda(CeedBasis basis) { - int ierr; Ceed ceed; - ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); CeedBasis_Cuda *data; - ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetData(basis, &data)); - CeedChk_Cu(ceed, cuModuleUnload(data->module)); + CeedCallCuda(ceed, cuModuleUnload(data->module)); - ierr = cudaFree(data->d_q_weight_1d); CeedChk_Cu(ceed, ierr); - ierr = cudaFree(data->d_interp_1d); CeedChk_Cu(ceed, ierr); - ierr = cudaFree(data->d_grad_1d); CeedChk_Cu(ceed, ierr); - ierr = CeedFree(&data); CeedChkBackend(ierr); + CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d)); + CeedCallCuda(ceed, cudaFree(data->d_interp_1d)); + CeedCallCuda(ceed, cudaFree(data->d_grad_1d)); + CeedCallBackend(CeedFree(&data)); return CEED_ERROR_SUCCESS; } @@ -212,19 +185,18 @@ static int CeedBasisDestroy_Cuda(CeedBasis basis) { // Destroy 
non-tensor basis //------------------------------------------------------------------------------ static int CeedBasisDestroyNonTensor_Cuda(CeedBasis basis) { - int ierr; Ceed ceed; - ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); CeedBasisNonTensor_Cuda *data; - ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetData(basis, &data)); - CeedChk_Cu(ceed, cuModuleUnload(data->module)); + CeedCallCuda(ceed, cuModuleUnload(data->module)); - ierr = cudaFree(data->d_q_weight); CeedChk_Cu(ceed, ierr); - ierr = cudaFree(data->d_interp); CeedChk_Cu(ceed, ierr); - ierr = cudaFree(data->d_grad); CeedChk_Cu(ceed, ierr); - ierr = CeedFree(&data); CeedChkBackend(ierr); + CeedCallCuda(ceed, cudaFree(data->d_q_weight)); + CeedCallCuda(ceed, cudaFree(data->d_interp)); + CeedCallCuda(ceed, cudaFree(data->d_grad)); + CeedCallBackend(CeedFree(&data)); return CEED_ERROR_SUCCESS; } @@ -232,139 +204,93 @@ static int CeedBasisDestroyNonTensor_Cuda(CeedBasis basis) { //------------------------------------------------------------------------------ // Create tensor //------------------------------------------------------------------------------ -int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, - const CeedScalar *interp_1d, - const CeedScalar *grad_1d, - const CeedScalar *q_ref_1d, - const CeedScalar *q_weight_1d, - CeedBasis basis) { - int ierr; +int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, + const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { Ceed ceed; - ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); CeedBasis_Cuda *data; - ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(1, &data)); // Copy data to GPU const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); - 
ierr = cudaMalloc((void **)&data->d_q_weight_1d, q_bytes); - CeedChk_Cu(ceed, ierr); - ierr = cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, - cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); + CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes)); + CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, cudaMemcpyHostToDevice)); const CeedInt interp_bytes = q_bytes * P_1d; - ierr = cudaMalloc((void **)&data->d_interp_1d, interp_bytes); - CeedChk_Cu(ceed, ierr); - ierr = cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, - cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); + CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes)); + CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, cudaMemcpyHostToDevice)); - ierr = cudaMalloc((void **)&data->d_grad_1d, interp_bytes); - CeedChk_Cu(ceed, ierr); - ierr = cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, - cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); + CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, interp_bytes)); + CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, cudaMemcpyHostToDevice)); // Complie basis kernels CeedInt num_comp; - ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); char *basis_kernel_path, *basis_kernel_source; - ierr = CeedGetJitAbsolutePath(ceed, - "ceed/jit-source/cuda/cuda-ref-basis-tensor.h", - &basis_kernel_path); CeedChkBackend(ierr); + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-tensor.h", &basis_kernel_path)); CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n"); - ierr = CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source); - CeedChkBackend(ierr); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete! 
-----\n"); - ierr = CeedCompileCuda(ceed, basis_kernel_source, &data->module, 7, - "BASIS_Q_1D", Q_1d, - "BASIS_P_1D", P_1d, - "BASIS_BUF_LEN", num_comp * CeedIntPow(Q_1d > P_1d ? - Q_1d : P_1d, dim), - "BASIS_DIM", dim, - "BASIS_NUM_COMP", num_comp, - "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), - "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim) - ); CeedChkBackend(ierr); - ierr = CeedGetKernelCuda(ceed, data->module, "Interp", &data->Interp); - CeedChkBackend(ierr); - ierr = CeedGetKernelCuda(ceed, data->module, "Grad", &data->Grad); - CeedChkBackend(ierr); - ierr = CeedGetKernelCuda(ceed, data->module, "Weight", &data->Weight); - CeedChkBackend(ierr); - ierr = CeedFree(&basis_kernel_path); CeedChkBackend(ierr); - ierr = CeedFree(&basis_kernel_source); CeedChkBackend(ierr); - - ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr); - - ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", - CeedBasisApply_Cuda); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", - CeedBasisDestroy_Cuda); CeedChkBackend(ierr); + CeedCallBackend(CeedCompileCuda(ceed, basis_kernel_source, &data->module, 7, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN", + num_comp * CeedIntPow(Q_1d > P_1d ? 
Q_1d : P_1d, dim), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, + "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim))); + CeedCallBackend(CeedGetKernelCuda(ceed, data->module, "Interp", &data->Interp)); + CeedCallBackend(CeedGetKernelCuda(ceed, data->module, "Grad", &data->Grad)); + CeedCallBackend(CeedGetKernelCuda(ceed, data->module, "Weight", &data->Weight)); + CeedCallBackend(CeedFree(&basis_kernel_path)); + CeedCallBackend(CeedFree(&basis_kernel_source)); + + CeedCallBackend(CeedBasisSetData(basis, data)); + + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Create non-tensor //------------------------------------------------------------------------------ -int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, - CeedInt num_nodes, - CeedInt num_qpts, const CeedScalar *interp, - const CeedScalar *grad, const CeedScalar *qref, - const CeedScalar *q_weight, CeedBasis basis) { - int ierr; +int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad, + const CeedScalar *qref, const CeedScalar *q_weight, CeedBasis basis) { Ceed ceed; - ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); CeedBasisNonTensor_Cuda *data; - ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(1, &data)); // Copy basis data to GPU const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); - ierr = cudaMalloc((void **)&data->d_q_weight, q_bytes); CeedChk_Cu(ceed, ierr); - ierr = cudaMemcpy(data->d_q_weight, q_weight, q_bytes, - cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); + CeedCallCuda(ceed, cudaMalloc((void 
**)&data->d_q_weight, q_bytes)); + CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); const CeedInt interp_bytes = q_bytes * num_nodes; - ierr = cudaMalloc((void **)&data->d_interp, interp_bytes); - CeedChk_Cu(ceed, ierr); - ierr = cudaMemcpy(data->d_interp, interp, interp_bytes, - cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); + CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes)); + CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice)); const CeedInt grad_bytes = q_bytes * num_nodes * dim; - ierr = cudaMalloc((void **)&data->d_grad, grad_bytes); CeedChk_Cu(ceed, ierr); - ierr = cudaMemcpy(data->d_grad, grad, grad_bytes, - cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); + CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad, grad_bytes)); + CeedCallCuda(ceed, cudaMemcpy(data->d_grad, grad, grad_bytes, cudaMemcpyHostToDevice)); // Compile basis kernels CeedInt num_comp; - ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); char *basis_kernel_path, *basis_kernel_source; - ierr = CeedGetJitAbsolutePath(ceed, - "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", - &basis_kernel_path); CeedChkBackend(ierr); + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path)); CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n"); - ierr = CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source); - CeedChkBackend(ierr); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete! 
-----\n"); - ierr = CeedCompileCuda(ceed, basis_kernel_source, &data->module, 4, - "BASIS_Q", num_qpts, - "BASIS_P", num_nodes, - "BASIS_DIM", dim, - "BASIS_NUM_COMP", num_comp - ); CeedChk_Cu(ceed, ierr); - ierr = CeedGetKernelCuda(ceed, data->module, "Interp", &data->Interp); - CeedChk_Cu(ceed, ierr); - ierr = CeedGetKernelCuda(ceed, data->module, "Grad", &data->Grad); - CeedChk_Cu(ceed, ierr); - ierr = CeedGetKernelCuda(ceed, data->module, "Weight", &data->Weight); - CeedChk_Cu(ceed, ierr); - ierr = CeedFree(&basis_kernel_path); CeedChkBackend(ierr); - ierr = CeedFree(&basis_kernel_source); CeedChkBackend(ierr); - - ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr); + CeedCallBackend(CeedCompileCuda(ceed, basis_kernel_source, &data->module, 4, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_DIM", dim, + "BASIS_NUM_COMP", num_comp)); + CeedCallBackend(CeedGetKernelCuda(ceed, data->module, "Interp", &data->Interp)); + CeedCallBackend(CeedGetKernelCuda(ceed, data->module, "Grad", &data->Grad)); + CeedCallBackend(CeedGetKernelCuda(ceed, data->module, "Weight", &data->Weight)); + CeedCallBackend(CeedFree(&basis_kernel_path)); + CeedCallBackend(CeedFree(&basis_kernel_source)); + + CeedCallBackend(CeedBasisSetData(basis, data)); // Register backend functions - ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", - CeedBasisApplyNonTensor_Cuda); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", - CeedBasisDestroyNonTensor_Cuda); CeedChkBackend(ierr); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c index 928dceb4ec..dbae242406 100644
--- a/backends/cuda-ref/ceed-cuda-ref-operator.c +++ b/backends/cuda-ref/ceed-cuda-ref-operator.c @@ -5,123 +5,114 @@ // // This file is part of CEED: http://github.com/ceed -#include +#include #include +#include #include -#include #include #include #include #include -#include "ceed-cuda-ref.h" + #include "../cuda/ceed-cuda-compile.h" +#include "ceed-cuda-ref.h" //------------------------------------------------------------------------------ // Destroy operator //------------------------------------------------------------------------------ static int CeedOperatorDestroy_Cuda(CeedOperator op) { - int ierr; CeedOperator_Cuda *impl; - ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetData(op, &impl)); // Apply data for (CeedInt i = 0; i < impl->numein + impl->numeout; i++) { - ierr = CeedVectorDestroy(&impl->evecs[i]); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorDestroy(&impl->evecs[i])); } - ierr = CeedFree(&impl->evecs); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl->evecs)); for (CeedInt i = 0; i < impl->numein; i++) { - ierr = CeedVectorDestroy(&impl->qvecsin[i]); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorDestroy(&impl->qvecsin[i])); } - ierr = CeedFree(&impl->qvecsin); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl->qvecsin)); for (CeedInt i = 0; i < impl->numeout; i++) { - ierr = CeedVectorDestroy(&impl->qvecsout[i]); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorDestroy(&impl->qvecsout[i])); } - ierr = CeedFree(&impl->qvecsout); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl->qvecsout)); // QFunction assembly data - for (CeedInt i=0; iqfnumactivein; i++) { - ierr = CeedVectorDestroy(&impl->qfactivein[i]); CeedChkBackend(ierr); + for (CeedInt i = 0; i < impl->qfnumactivein; i++) { + CeedCallBackend(CeedVectorDestroy(&impl->qfactivein[i])); } - ierr = CeedFree(&impl->qfactivein); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl->qfactivein)); // Diag data if (impl->diag) 
{ Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); - CeedChk_Cu(ceed, cuModuleUnload(impl->diag->module)); - ierr = CeedFree(&impl->diag->h_emodein); CeedChkBackend(ierr); - ierr = CeedFree(&impl->diag->h_emodeout); CeedChkBackend(ierr); - ierr = cudaFree(impl->diag->d_emodein); CeedChk_Cu(ceed, ierr); - ierr = cudaFree(impl->diag->d_emodeout); CeedChk_Cu(ceed, ierr); - ierr = cudaFree(impl->diag->d_identity); CeedChk_Cu(ceed, ierr); - ierr = cudaFree(impl->diag->d_interpin); CeedChk_Cu(ceed, ierr); - ierr = cudaFree(impl->diag->d_interpout); CeedChk_Cu(ceed, ierr); - ierr = cudaFree(impl->diag->d_gradin); CeedChk_Cu(ceed, ierr); - ierr = cudaFree(impl->diag->d_gradout); CeedChk_Cu(ceed, ierr); - ierr = CeedElemRestrictionDestroy(&impl->diag->pbdiagrstr); - CeedChkBackend(ierr); - ierr = CeedVectorDestroy(&impl->diag->elemdiag); CeedChkBackend(ierr); - ierr = CeedVectorDestroy(&impl->diag->pbelemdiag); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedCallCuda(ceed, cuModuleUnload(impl->diag->module)); + CeedCallBackend(CeedFree(&impl->diag->h_emodein)); + CeedCallBackend(CeedFree(&impl->diag->h_emodeout)); + CeedCallCuda(ceed, cudaFree(impl->diag->d_emodein)); + CeedCallCuda(ceed, cudaFree(impl->diag->d_emodeout)); + CeedCallCuda(ceed, cudaFree(impl->diag->d_identity)); + CeedCallCuda(ceed, cudaFree(impl->diag->d_interpin)); + CeedCallCuda(ceed, cudaFree(impl->diag->d_interpout)); + CeedCallCuda(ceed, cudaFree(impl->diag->d_gradin)); + CeedCallCuda(ceed, cudaFree(impl->diag->d_gradout)); + CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->pbdiagrstr)); + CeedCallBackend(CeedVectorDestroy(&impl->diag->elemdiag)); + CeedCallBackend(CeedVectorDestroy(&impl->diag->pbelemdiag)); } - ierr = CeedFree(&impl->diag); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl->diag)); if (impl->asmb) { Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); - CeedChk_Cu(ceed, 
cuModuleUnload(impl->asmb->module)); - ierr = cudaFree(impl->asmb->d_B_in); CeedChk_Cu(ceed, ierr); - ierr = cudaFree(impl->asmb->d_B_out); CeedChk_Cu(ceed, ierr); + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedCallCuda(ceed, cuModuleUnload(impl->asmb->module)); + CeedCallCuda(ceed, cudaFree(impl->asmb->d_B_in)); + CeedCallCuda(ceed, cudaFree(impl->asmb->d_B_out)); } - ierr = CeedFree(&impl->asmb); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl->asmb)); - ierr = CeedFree(&impl); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Setup infields or outfields //------------------------------------------------------------------------------ -static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, - bool isinput, CeedVector *evecs, - CeedVector *qvecs, CeedInt starte, - CeedInt numfields, CeedInt Q, - CeedInt numelements) { - CeedInt dim, ierr, size; +static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool isinput, CeedVector *evecs, CeedVector *qvecs, CeedInt starte, + CeedInt numfields, CeedInt Q, CeedInt numelements) { + CeedInt dim, size; CeedSize q_size; - Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); - CeedBasis basis; + Ceed ceed; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedBasis basis; CeedElemRestriction Erestrict; - CeedOperatorField *opfields; + CeedOperatorField *opfields; CeedQFunctionField *qffields; - CeedVector fieldvec; - bool strided; - bool skiprestrict; + CeedVector fieldvec; + bool strided; + bool skiprestrict; if (isinput) { - ierr = CeedOperatorGetFields(op, NULL, &opfields, NULL, NULL); - CeedChkBackend(ierr); - ierr = CeedQFunctionGetFields(qf, NULL, &qffields, NULL, NULL); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetFields(op, NULL, &opfields, NULL, NULL)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, 
&qffields, NULL, NULL)); } else { - ierr = CeedOperatorGetFields(op, NULL, NULL, NULL, &opfields); - CeedChkBackend(ierr); - ierr = CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qffields); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &opfields)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qffields)); } // Loop over fields for (CeedInt i = 0; i < numfields; i++) { CeedEvalMode emode; - ierr = CeedQFunctionFieldGetEvalMode(qffields[i], &emode); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qffields[i], &emode)); - strided = false; + strided = false; skiprestrict = false; if (emode != CEED_EVAL_WEIGHT) { - ierr = CeedOperatorFieldGetElemRestriction(opfields[i], &Erestrict); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(opfields[i], &Erestrict)); // Check whether this field can skip the element restriction: // must be passive input, with emode NONE, and have a strided restriction with @@ -130,17 +121,15 @@ static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, // First, check whether the field is input or output: if (isinput) { // Check for passive input: - ierr = CeedOperatorFieldGetVector(opfields[i], &fieldvec); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorFieldGetVector(opfields[i], &fieldvec)); if (fieldvec != CEED_VECTOR_ACTIVE) { // Check emode if (emode == CEED_EVAL_NONE) { // Check for strided restriction - ierr = CeedElemRestrictionIsStrided(Erestrict, &strided); - CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionIsStrided(Erestrict, &strided)); if (strided) { // Check if vector is already in preferred backend ordering - ierr = CeedElemRestrictionHasBackendStrides(Erestrict, - &skiprestrict); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionHasBackendStrides(Erestrict, &skiprestrict)); } } } @@ -150,41 +139,38 @@ static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, // 
directly in the operator application. evecs[i + starte] = NULL; } else { - ierr = CeedElemRestrictionCreateVector(Erestrict, NULL, - &evecs[i + starte]); - CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionCreateVector(Erestrict, NULL, &evecs[i + starte])); } } switch (emode) { - case CEED_EVAL_NONE: - ierr = CeedQFunctionFieldGetSize(qffields[i], &size); CeedChkBackend(ierr); - q_size = (CeedSize)numelements * Q * size; - ierr = CeedVectorCreate(ceed, q_size, &qvecs[i]); CeedChkBackend(ierr); - break; - case CEED_EVAL_INTERP: - ierr = CeedQFunctionFieldGetSize(qffields[i], &size); CeedChkBackend(ierr); - q_size = (CeedSize)numelements * Q * size; - ierr = CeedVectorCreate(ceed, q_size, &qvecs[i]); CeedChkBackend(ierr); - break; - case CEED_EVAL_GRAD: - ierr = CeedOperatorFieldGetBasis(opfields[i], &basis); CeedChkBackend(ierr); - ierr = CeedQFunctionFieldGetSize(qffields[i], &size); CeedChkBackend(ierr); - ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); - q_size = (CeedSize)numelements * Q * size; - ierr = CeedVectorCreate(ceed, q_size, &qvecs[i]); CeedChkBackend(ierr); - break; - case CEED_EVAL_WEIGHT: // Only on input fields - ierr = CeedOperatorFieldGetBasis(opfields[i], &basis); CeedChkBackend(ierr); - q_size = (CeedSize)numelements * Q; - ierr = CeedVectorCreate(ceed, q_size, &qvecs[i]); CeedChkBackend(ierr); - ierr = CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE, - CEED_EVAL_WEIGHT, NULL, qvecs[i]); CeedChkBackend(ierr); - break; - case CEED_EVAL_DIV: - break; // TODO: Not implemented - case CEED_EVAL_CURL: - break; // TODO: Not implemented + case CEED_EVAL_NONE: + CeedCallBackend(CeedQFunctionFieldGetSize(qffields[i], &size)); + q_size = (CeedSize)numelements * Q * size; + CeedCallBackend(CeedVectorCreate(ceed, q_size, &qvecs[i])); + break; + case CEED_EVAL_INTERP: + CeedCallBackend(CeedQFunctionFieldGetSize(qffields[i], &size)); + q_size = (CeedSize)numelements * Q * size; + CeedCallBackend(CeedVectorCreate(ceed, q_size, 
&qvecs[i])); + break; + case CEED_EVAL_GRAD: + CeedCallBackend(CeedOperatorFieldGetBasis(opfields[i], &basis)); + CeedCallBackend(CeedQFunctionFieldGetSize(qffields[i], &size)); + CeedCallBackend(CeedBasisGetDimension(basis, &dim)); + q_size = (CeedSize)numelements * Q * size; + CeedCallBackend(CeedVectorCreate(ceed, q_size, &qvecs[i])); + break; + case CEED_EVAL_WEIGHT: // Only on input fields + CeedCallBackend(CeedOperatorFieldGetBasis(opfields[i], &basis)); + q_size = (CeedSize)numelements * Q; + CeedCallBackend(CeedVectorCreate(ceed, q_size, &qvecs[i])); + CeedCallBackend(CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, NULL, qvecs[i])); + break; + case CEED_EVAL_DIV: + break; // TODO: Not implemented + case CEED_EVAL_CURL: + break; // TODO: Not implemented } } return CEED_ERROR_SUCCESS; @@ -195,101 +181,77 @@ static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, // to the named inputs and outputs of its CeedQFunction. //------------------------------------------------------------------------------ static int CeedOperatorSetup_Cuda(CeedOperator op) { - int ierr; bool setupdone; - ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChkBackend(ierr); - if (setupdone) - return CEED_ERROR_SUCCESS; + CeedCallBackend(CeedOperatorIsSetupDone(op, &setupdone)); + if (setupdone) return CEED_ERROR_SUCCESS; Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedOperator_Cuda *impl; - ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetData(op, &impl)); CeedQFunction qf; - ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); CeedInt Q, numelements, numinputfields, numoutputfields; - ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr); - ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr); - CeedChkBackend(ierr); 
+ CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); + CeedCallBackend(CeedOperatorGetNumElements(op, &numelements)); CeedOperatorField *opinputfields, *opoutputfields; - ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, - &numoutputfields, &opoutputfields); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields)); CeedQFunctionField *qfinputfields, *qfoutputfields; - ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields)); // Allocate - ierr = CeedCalloc(numinputfields + numoutputfields, &impl->evecs); - CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(numinputfields + numoutputfields, &impl->evecs)); - ierr = CeedCalloc(CEED_FIELD_MAX, &impl->qvecsin); CeedChkBackend(ierr); - ierr = CeedCalloc(CEED_FIELD_MAX, &impl->qvecsout); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->qvecsin)); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->qvecsout)); - impl->numein = numinputfields; impl->numeout = numoutputfields; + impl->numein = numinputfields; + impl->numeout = numoutputfields; // Set up infield and outfield evecs and qvecs // Infields - ierr = CeedOperatorSetupFields_Cuda(qf, op, true, - impl->evecs, impl->qvecsin, 0, - numinputfields, Q, numelements); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, true, impl->evecs, impl->qvecsin, 0, numinputfields, Q, numelements)); // Outfields - ierr = CeedOperatorSetupFields_Cuda(qf, op, false, - impl->evecs, impl->qvecsout, - numinputfields, numoutputfields, Q, - numelements); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, false, impl->evecs, impl->qvecsout, numinputfields, numoutputfields, Q, numelements)); - ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr); + 
CeedCallBackend(CeedOperatorSetSetupDone(op)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Setup Operator Inputs //------------------------------------------------------------------------------ -static inline int CeedOperatorSetupInputs_Cuda(CeedInt numinputfields, - CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields, - CeedVector invec, const bool skipactive, CeedScalar *edata[2*CEED_FIELD_MAX], - CeedOperator_Cuda *impl, CeedRequest *request) { - CeedInt ierr; - CeedEvalMode emode; - CeedVector vec; +static inline int CeedOperatorSetupInputs_Cuda(CeedInt numinputfields, CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields, + CeedVector invec, const bool skipactive, CeedScalar *edata[2 * CEED_FIELD_MAX], + CeedOperator_Cuda *impl, CeedRequest *request) { + CeedEvalMode emode; + CeedVector vec; CeedElemRestriction Erestrict; for (CeedInt i = 0; i < numinputfields; i++) { // Get input vector - ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { - if (skipactive) - continue; - else - vec = invec; + if (skipactive) continue; + else vec = invec; } - ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); - CeedChkBackend(ierr); - if (emode == CEED_EVAL_WEIGHT) { // Skip + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode)); + if (emode == CEED_EVAL_WEIGHT) { // Skip } else { // Get input vector - ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec)); // Get input element restriction - ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); - CeedChkBackend(ierr); - if (vec == CEED_VECTOR_ACTIVE) - vec = invec; + CeedCallBackend(CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict)); + 
if (vec == CEED_VECTOR_ACTIVE) vec = invec; // Restrict, if necessary if (!impl->evecs[i]) { // No restriction for this field; read data directly from vec. - ierr = CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, - (const CeedScalar **) &edata[i]); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, (const CeedScalar **)&edata[i])); } else { - ierr = CeedElemRestrictionApply(Erestrict, CEED_NOTRANSPOSE, vec, - impl->evecs[i], request); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionApply(Erestrict, CEED_NOTRANSPOSE, vec, impl->evecs[i], request)); // Get evec - ierr = CeedVectorGetArrayRead(impl->evecs[i], CEED_MEM_DEVICE, - (const CeedScalar **) &edata[i]); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayRead(impl->evecs[i], CEED_MEM_DEVICE, (const CeedScalar **)&edata[i])); } } } @@ -299,58 +261,45 @@ static inline int CeedOperatorSetupInputs_Cuda(CeedInt numinputfields, //------------------------------------------------------------------------------ // Input Basis Action //------------------------------------------------------------------------------ -static inline int CeedOperatorInputBasis_Cuda(CeedInt numelements, - CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields, - CeedInt numinputfields, const bool skipactive, - CeedScalar *edata[2*CEED_FIELD_MAX],CeedOperator_Cuda *impl) { - CeedInt ierr; - CeedInt elemsize, size; +static inline int CeedOperatorInputBasis_Cuda(CeedInt numelements, CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields, + CeedInt numinputfields, const bool skipactive, CeedScalar *edata[2 * CEED_FIELD_MAX], + CeedOperator_Cuda *impl) { + CeedInt elemsize, size; CeedElemRestriction Erestrict; - CeedEvalMode emode; - CeedBasis basis; + CeedEvalMode emode; + CeedBasis basis; - for (CeedInt i=0; iqvecsin[i], CEED_MEM_DEVICE, - CEED_USE_POINTER, edata[i]); CeedChkBackend(ierr); - break; - case CEED_EVAL_INTERP: - ierr = 
CeedOperatorFieldGetBasis(opinputfields[i], &basis); - CeedChkBackend(ierr); - ierr = CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE, - CEED_EVAL_INTERP, impl->evecs[i], - impl->qvecsin[i]); CeedChkBackend(ierr); - break; - case CEED_EVAL_GRAD: - ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); - CeedChkBackend(ierr); - ierr = CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE, - CEED_EVAL_GRAD, impl->evecs[i], - impl->qvecsin[i]); CeedChkBackend(ierr); - break; - case CEED_EVAL_WEIGHT: - break; // No action - case CEED_EVAL_DIV: - break; // TODO: Not implemented - case CEED_EVAL_CURL: - break; // TODO: Not implemented + case CEED_EVAL_NONE: + CeedCallBackend(CeedVectorSetArray(impl->qvecsin[i], CEED_MEM_DEVICE, CEED_USE_POINTER, edata[i])); + break; + case CEED_EVAL_INTERP: + CeedCallBackend(CeedOperatorFieldGetBasis(opinputfields[i], &basis)); + CeedCallBackend(CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, impl->evecs[i], impl->qvecsin[i])); + break; + case CEED_EVAL_GRAD: + CeedCallBackend(CeedOperatorFieldGetBasis(opinputfields[i], &basis)); + CeedCallBackend(CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, impl->evecs[i], impl->qvecsin[i])); + break; + case CEED_EVAL_WEIGHT: + break; // No action + case CEED_EVAL_DIV: + break; // TODO: Not implemented + case CEED_EVAL_CURL: + break; // TODO: Not implemented } } return CEED_ERROR_SUCCESS; @@ -359,34 +308,25 @@ static inline int CeedOperatorInputBasis_Cuda(CeedInt numelements, //------------------------------------------------------------------------------ // Restore Input Vectors //------------------------------------------------------------------------------ -static inline int CeedOperatorRestoreInputs_Cuda(CeedInt numinputfields, - CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields, - const bool skipactive, CeedScalar *edata[2*CEED_FIELD_MAX], - CeedOperator_Cuda *impl) { - CeedInt ierr; +static inline int 
CeedOperatorRestoreInputs_Cuda(CeedInt numinputfields, CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields, + const bool skipactive, CeedScalar *edata[2 * CEED_FIELD_MAX], CeedOperator_Cuda *impl) { CeedEvalMode emode; - CeedVector vec; + CeedVector vec; for (CeedInt i = 0; i < numinputfields; i++) { // Skip active input if (skipactive) { - ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr); - if (vec == CEED_VECTOR_ACTIVE) - continue; + CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec)); + if (vec == CEED_VECTOR_ACTIVE) continue; } - ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); - CeedChkBackend(ierr); - if (emode == CEED_EVAL_WEIGHT) { // Skip + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode)); + if (emode == CEED_EVAL_WEIGHT) { // Skip } else { if (!impl->evecs[i]) { // This was a skiprestrict case - ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr); - ierr = CeedVectorRestoreArrayRead(vec, - (const CeedScalar **)&edata[i]); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec)); + CeedCallBackend(CeedVectorRestoreArrayRead(vec, (const CeedScalar **)&edata[i])); } else { - ierr = CeedVectorRestoreArrayRead(impl->evecs[i], - (const CeedScalar **) &edata[i]); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArrayRead(impl->evecs[i], (const CeedScalar **)&edata[i])); } } } @@ -396,227 +336,176 @@ static inline int CeedOperatorRestoreInputs_Cuda(CeedInt numinputfields, //------------------------------------------------------------------------------ // Apply and add to output //------------------------------------------------------------------------------ -static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector invec, - CeedVector outvec, CeedRequest *request) { - int ierr; +static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector invec, CeedVector outvec, CeedRequest 
*request) { CeedOperator_Cuda *impl; - ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetData(op, &impl)); CeedQFunction qf; - ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); CeedInt Q, numelements, elemsize, numinputfields, numoutputfields, size; - ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr); - ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); + CeedCallBackend(CeedOperatorGetNumElements(op, &numelements)); CeedOperatorField *opinputfields, *opoutputfields; - ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, - &numoutputfields, &opoutputfields); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields)); CeedQFunctionField *qfinputfields, *qfoutputfields; - ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields); - CeedChkBackend(ierr); - CeedEvalMode emode; - CeedVector vec; - CeedBasis basis; + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields)); + CeedEvalMode emode; + CeedVector vec; + CeedBasis basis; CeedElemRestriction Erestrict; - CeedScalar *edata[2*CEED_FIELD_MAX] = {0}; + CeedScalar *edata[2 * CEED_FIELD_MAX] = {0}; // Setup - ierr = CeedOperatorSetup_Cuda(op); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorSetup_Cuda(op)); // Input Evecs and Restriction - ierr = CeedOperatorSetupInputs_Cuda(numinputfields, qfinputfields, - opinputfields, invec, false, edata, - impl, request); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorSetupInputs_Cuda(numinputfields, qfinputfields, opinputfields, invec, false, edata, impl, request)); // Input basis apply if needed - ierr = CeedOperatorInputBasis_Cuda(numelements, qfinputfields, opinputfields, - 
numinputfields, false, edata, impl); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorInputBasis_Cuda(numelements, qfinputfields, opinputfields, numinputfields, false, edata, impl)); // Output pointers, as necessary for (CeedInt i = 0; i < numoutputfields; i++) { - ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode)); if (emode == CEED_EVAL_NONE) { // Set the output Q-Vector to use the E-Vector data directly. - ierr = CeedVectorGetArrayWrite(impl->evecs[i + impl->numein], CEED_MEM_DEVICE, - &edata[i + numinputfields]); CeedChkBackend(ierr); - ierr = CeedVectorSetArray(impl->qvecsout[i], CEED_MEM_DEVICE, - CEED_USE_POINTER, edata[i + numinputfields]); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayWrite(impl->evecs[i + impl->numein], CEED_MEM_DEVICE, &edata[i + numinputfields])); + CeedCallBackend(CeedVectorSetArray(impl->qvecsout[i], CEED_MEM_DEVICE, CEED_USE_POINTER, edata[i + numinputfields])); } } // Q function - ierr = CeedQFunctionApply(qf, numelements * Q, impl->qvecsin, impl->qvecsout); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionApply(qf, numelements * Q, impl->qvecsin, impl->qvecsout)); // Output basis apply if needed for (CeedInt i = 0; i < numoutputfields; i++) { // Get elemsize, emode, size - ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); - CeedChkBackend(ierr); - ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); - CeedChkBackend(ierr); - ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); - CeedChkBackend(ierr); - ierr = CeedQFunctionFieldGetSize(qfoutputfields[i], &size); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict)); + CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elemsize)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode)); + 
CeedCallBackend(CeedQFunctionFieldGetSize(qfoutputfields[i], &size)); // Basis action switch (emode) { - case CEED_EVAL_NONE: - break; - case CEED_EVAL_INTERP: - ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); - CeedChkBackend(ierr); - ierr = CeedBasisApply(basis, numelements, CEED_TRANSPOSE, - CEED_EVAL_INTERP, impl->qvecsout[i], - impl->evecs[i + impl->numein]); CeedChkBackend(ierr); - break; - case CEED_EVAL_GRAD: - ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); - CeedChkBackend(ierr); - ierr = CeedBasisApply(basis, numelements, CEED_TRANSPOSE, - CEED_EVAL_GRAD, impl->qvecsout[i], - impl->evecs[i + impl->numein]); CeedChkBackend(ierr); - break; - // LCOV_EXCL_START - case CEED_EVAL_WEIGHT: { - Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); - return CeedError(ceed, CEED_ERROR_BACKEND, - "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); - break; // Should not occur - } - case CEED_EVAL_DIV: - break; // TODO: Not implemented - case CEED_EVAL_CURL: - break; // TODO: Not implemented - // LCOV_EXCL_STOP + case CEED_EVAL_NONE: + break; + case CEED_EVAL_INTERP: + CeedCallBackend(CeedOperatorFieldGetBasis(opoutputfields[i], &basis)); + CeedCallBackend(CeedBasisApply(basis, numelements, CEED_TRANSPOSE, CEED_EVAL_INTERP, impl->qvecsout[i], impl->evecs[i + impl->numein])); + break; + case CEED_EVAL_GRAD: + CeedCallBackend(CeedOperatorFieldGetBasis(opoutputfields[i], &basis)); + CeedCallBackend(CeedBasisApply(basis, numelements, CEED_TRANSPOSE, CEED_EVAL_GRAD, impl->qvecsout[i], impl->evecs[i + impl->numein])); + break; + // LCOV_EXCL_START + case CEED_EVAL_WEIGHT: { + Ceed ceed; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); + break; // Should not occur + } + case CEED_EVAL_DIV: + break; // TODO: Not implemented + case CEED_EVAL_CURL: + break; // TODO: Not implemented + // LCOV_EXCL_STOP } } // Output 
restriction for (CeedInt i = 0; i < numoutputfields; i++) { // Restore evec - ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode)); if (emode == CEED_EVAL_NONE) { - ierr = CeedVectorRestoreArray(impl->evecs[i+impl->numein], - &edata[i + numinputfields]); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArray(impl->evecs[i + impl->numein], &edata[i + numinputfields])); } // Get output vector - ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorFieldGetVector(opoutputfields[i], &vec)); // Restrict - ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict)); // Active - if (vec == CEED_VECTOR_ACTIVE) - vec = outvec; + if (vec == CEED_VECTOR_ACTIVE) vec = outvec; - ierr = CeedElemRestrictionApply(Erestrict, CEED_TRANSPOSE, - impl->evecs[i + impl->numein], vec, - request); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionApply(Erestrict, CEED_TRANSPOSE, impl->evecs[i + impl->numein], vec, request)); } // Restore input arrays - ierr = CeedOperatorRestoreInputs_Cuda(numinputfields, qfinputfields, - opinputfields, false, edata, impl); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorRestoreInputs_Cuda(numinputfields, qfinputfields, opinputfields, false, edata, impl)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Core code for assembling linear QFunction //------------------------------------------------------------------------------ -static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op, - bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr, - CeedRequest *request) { - int ierr; +static inline int 
CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr, + CeedRequest *request) { CeedOperator_Cuda *impl; - ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetData(op, &impl)); CeedQFunction qf; - ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); - CeedInt Q, numelements, numinputfields, numoutputfields, size; + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedInt Q, numelements, numinputfields, numoutputfields, size; CeedSize q_size; - ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr); - ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); + CeedCallBackend(CeedOperatorGetNumElements(op, &numelements)); CeedOperatorField *opinputfields, *opoutputfields; - ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, - &numoutputfields, &opoutputfields); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields)); CeedQFunctionField *qfinputfields, *qfoutputfields; - ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields); - CeedChkBackend(ierr); - CeedVector vec; - CeedInt numactivein = impl->qfnumactivein, numactiveout = impl->qfnumactiveout; + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields)); + CeedVector vec; + CeedInt numactivein = impl->qfnumactivein, numactiveout = impl->qfnumactiveout; CeedVector *activein = impl->qfactivein; CeedScalar *a, *tmp; - Ceed ceed, ceedparent; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); - ierr = CeedGetOperatorFallbackParentCeed(ceed, &ceedparent); - CeedChkBackend(ierr); + Ceed ceed, ceedparent; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + 
CeedCallBackend(CeedGetOperatorFallbackParentCeed(ceed, &ceedparent)); ceedparent = ceedparent ? ceedparent : ceed; - CeedScalar *edata[2*CEED_FIELD_MAX]; + CeedScalar *edata[2 * CEED_FIELD_MAX]; // Setup - ierr = CeedOperatorSetup_Cuda(op); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorSetup_Cuda(op)); // Check for identity bool identityqf; - ierr = CeedQFunctionIsIdentity(qf, &identityqf); CeedChkBackend(ierr); - if (identityqf) + CeedCallBackend(CeedQFunctionIsIdentity(qf, &identityqf)); + if (identityqf) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Assembling identity QFunctions not supported"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_BACKEND, "Assembling identity QFunctions not supported"); + // LCOV_EXCL_STOP + } // Input Evecs and Restriction - ierr = CeedOperatorSetupInputs_Cuda(numinputfields, qfinputfields, - opinputfields, NULL, true, edata, - impl, request); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorSetupInputs_Cuda(numinputfields, qfinputfields, opinputfields, NULL, true, edata, impl, request)); // Count number of active input fields if (!numactivein) { - for (CeedInt i=0; iqvecsin[i], 0.0); CeedChkBackend(ierr); - ierr = CeedVectorGetArray(impl->qvecsin[i], CEED_MEM_DEVICE, &tmp); - CeedChkBackend(ierr); - ierr = CeedRealloc(numactivein + size, &activein); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionFieldGetSize(qfinputfields[i], &size)); + CeedCallBackend(CeedVectorSetValue(impl->qvecsin[i], 0.0)); + CeedCallBackend(CeedVectorGetArray(impl->qvecsin[i], CEED_MEM_DEVICE, &tmp)); + CeedCallBackend(CeedRealloc(numactivein + size, &activein)); for (CeedInt field = 0; field < size; field++) { - q_size = (CeedSize)Q*numelements; - ierr = CeedVectorCreate(ceed, q_size, &activein[numactivein+field]); - CeedChkBackend(ierr); - ierr = CeedVectorSetArray(activein[numactivein+field], CEED_MEM_DEVICE, - CEED_USE_POINTER, &tmp[field*Q*numelements]); - CeedChkBackend(ierr); + q_size = (CeedSize)Q * 
numelements; + CeedCallBackend(CeedVectorCreate(ceed, q_size, &activein[numactivein + field])); + CeedCallBackend(CeedVectorSetArray(activein[numactivein + field], CEED_MEM_DEVICE, CEED_USE_POINTER, &tmp[field * Q * numelements])); } numactivein += size; - ierr = CeedVectorRestoreArray(impl->qvecsin[i], &tmp); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArray(impl->qvecsin[i], &tmp)); } } impl->qfnumactivein = numactivein; - impl->qfactivein = activein; + impl->qfactivein = activein; } // Count number of active output fields if (!numactiveout) { - for (CeedInt i=0; i 1) { - ierr = CeedVectorSetValue(activein[(in+numactivein-1)%numactivein], - 0.0); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorSetValue(activein[(in + numactivein - 1) % numactivein], 0.0)); } // Set Outputs - for (CeedInt out=0; outqvecsout[out], CEED_MEM_DEVICE, - CEED_USE_POINTER, a); CeedChkBackend(ierr); - ierr = CeedQFunctionFieldGetSize(qfoutputfields[out], &size); - CeedChkBackend(ierr); - a += size*Q*numelements; // Advance the pointer by the size of the output + CeedCallBackend(CeedVectorSetArray(impl->qvecsout[out], CEED_MEM_DEVICE, CEED_USE_POINTER, a)); + CeedCallBackend(CeedQFunctionFieldGetSize(qfoutputfields[out], &size)); + a += size * Q * numelements; // Advance the pointer by the size of the output } } // Apply QFunction - ierr = CeedQFunctionApply(qf, Q*numelements, impl->qvecsin, impl->qvecsout); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionApply(qf, Q * numelements, impl->qvecsin, impl->qvecsout)); } // Un-set output Qvecs to prevent accidental overwrite of Assembled - for (CeedInt out=0; outqvecsout[out], CEED_MEM_DEVICE, NULL); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorTakeArray(impl->qvecsout[out], CEED_MEM_DEVICE, NULL)); } } // Restore input arrays - ierr = CeedOperatorRestoreInputs_Cuda(numinputfields, qfinputfields, - opinputfields, true, edata, impl); - CeedChkBackend(ierr); + 
CeedCallBackend(CeedOperatorRestoreInputs_Cuda(numinputfields, qfinputfields, opinputfields, true, edata, impl)); // Restore output - ierr = CeedVectorRestoreArray(*assembled, &a); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArray(*assembled, &a)); return CEED_ERROR_SUCCESS; } @@ -705,58 +579,47 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op, //------------------------------------------------------------------------------ // Assemble Linear QFunction //------------------------------------------------------------------------------ -static int CeedOperatorLinearAssembleQFunction_Cuda(CeedOperator op, - CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) { - return CeedOperatorLinearAssembleQFunctionCore_Cuda(op, true, assembled, rstr, - request); +static int CeedOperatorLinearAssembleQFunction_Cuda(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) { + return CeedOperatorLinearAssembleQFunctionCore_Cuda(op, true, assembled, rstr, request); } //------------------------------------------------------------------------------ // Update Assembled Linear QFunction //------------------------------------------------------------------------------ -static int CeedOperatorLinearAssembleQFunctionUpdate_Cuda(CeedOperator op, - CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) { - return CeedOperatorLinearAssembleQFunctionCore_Cuda(op, false, &assembled, - &rstr, request); +static int CeedOperatorLinearAssembleQFunctionUpdate_Cuda(CeedOperator op, CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) { + return CeedOperatorLinearAssembleQFunctionCore_Cuda(op, false, &assembled, &rstr, request); } //------------------------------------------------------------------------------ // Create point block restriction //------------------------------------------------------------------------------ -static int CreatePBRestriction(CeedElemRestriction 
rstr, - CeedElemRestriction *pbRstr) { - int ierr; +static int CreatePBRestriction(CeedElemRestriction rstr, CeedElemRestriction *pbRstr) { Ceed ceed; - ierr = CeedElemRestrictionGetCeed(rstr, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); const CeedInt *offsets; - ierr = CeedElemRestrictionGetOffsets(rstr, CEED_MEM_HOST, &offsets); - CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetOffsets(rstr, CEED_MEM_HOST, &offsets)); // Expand offsets - CeedInt nelem, ncomp, elemsize, compstride, *pbOffsets; + CeedInt nelem, ncomp, elemsize, compstride, *pbOffsets; CeedSize l_size; - ierr = CeedElemRestrictionGetNumElements(rstr, &nelem); CeedChkBackend(ierr); - ierr = CeedElemRestrictionGetNumComponents(rstr, &ncomp); CeedChkBackend(ierr); - ierr = CeedElemRestrictionGetElementSize(rstr, &elemsize); CeedChkBackend(ierr); - ierr = CeedElemRestrictionGetCompStride(rstr, &compstride); - CeedChkBackend(ierr); - ierr = CeedElemRestrictionGetLVectorSize(rstr, &l_size); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &nelem)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &ncomp)); + CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elemsize)); + CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &compstride)); + CeedCallBackend(CeedElemRestrictionGetLVectorSize(rstr, &l_size)); CeedInt shift = ncomp; - if (compstride != 1) - shift *= ncomp; - ierr = CeedCalloc(nelem*elemsize, &pbOffsets); CeedChkBackend(ierr); - for (CeedInt i = 0; i < nelem*elemsize; i++) { - pbOffsets[i] = offsets[i]*shift; + if (compstride != 1) shift *= ncomp; + CeedCallBackend(CeedCalloc(nelem * elemsize, &pbOffsets)); + for (CeedInt i = 0; i < nelem * elemsize; i++) { + pbOffsets[i] = offsets[i] * shift; } // Create new restriction - ierr = CeedElemRestrictionCreate(ceed, nelem, elemsize, ncomp*ncomp, 1, - l_size * ncomp, CEED_MEM_HOST, - CEED_OWN_POINTER, pbOffsets, pbRstr); - 
CeedChkBackend(ierr); + CeedCallBackend( + CeedElemRestrictionCreate(ceed, nelem, elemsize, ncomp * ncomp, 1, l_size * ncomp, CEED_MEM_HOST, CEED_OWN_POINTER, pbOffsets, pbRstr)); // Cleanup - ierr = CeedElemRestrictionRestoreOffsets(rstr, &offsets); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionRestoreOffsets(rstr, &offsets)); return CEED_ERROR_SUCCESS; } @@ -764,208 +627,171 @@ static int CreatePBRestriction(CeedElemRestriction rstr, //------------------------------------------------------------------------------ // Assemble diagonal setup //------------------------------------------------------------------------------ -static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op, - const bool pointBlock) { - int ierr; +static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op, const bool pointBlock) { Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedQFunction qf; - ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); CeedInt numinputfields, numoutputfields; - ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields)); // Determine active input basis - CeedOperatorField *opfields; + CeedOperatorField *opfields; CeedQFunctionField *qffields; - ierr = CeedOperatorGetFields(op, NULL, &opfields, NULL, NULL); - CeedChkBackend(ierr); - ierr = CeedQFunctionGetFields(qf, NULL, &qffields, NULL, NULL); - CeedChkBackend(ierr); - CeedInt numemodein = 0, ncomp = 0, dim = 1; - CeedEvalMode *emodein = NULL; - CeedBasis basisin = NULL; - CeedElemRestriction rstrin = NULL; + CeedCallBackend(CeedOperatorGetFields(op, NULL, &opfields, NULL, NULL)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qffields, NULL, NULL)); + CeedInt numemodein = 0, ncomp = 0, dim = 1; + CeedEvalMode 
*emodein = NULL; + CeedBasis basisin = NULL; + CeedElemRestriction rstrin = NULL; for (CeedInt i = 0; i < numinputfields; i++) { CeedVector vec; - ierr = CeedOperatorFieldGetVector(opfields[i], &vec); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorFieldGetVector(opfields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { CeedElemRestriction rstr; - ierr = CeedOperatorFieldGetBasis(opfields[i], &basisin); CeedChkBackend(ierr); - ierr = CeedBasisGetNumComponents(basisin, &ncomp); CeedChkBackend(ierr); - ierr = CeedBasisGetDimension(basisin, &dim); CeedChkBackend(ierr); - ierr = CeedOperatorFieldGetElemRestriction(opfields[i], &rstr); - CeedChkBackend(ierr); - if (rstrin && rstrin != rstr) + CeedCallBackend(CeedOperatorFieldGetBasis(opfields[i], &basisin)); + CeedCallBackend(CeedBasisGetNumComponents(basisin, &ncomp)); + CeedCallBackend(CeedBasisGetDimension(basisin, &dim)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(opfields[i], &rstr)); + if (rstrin && rstrin != rstr) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Backend does not implement multi-field non-composite operator diagonal assembly"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement multi-field non-composite operator diagonal assembly"); + // LCOV_EXCL_STOP + } rstrin = rstr; CeedEvalMode emode; - ierr = CeedQFunctionFieldGetEvalMode(qffields[i], &emode); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qffields[i], &emode)); switch (emode) { - case CEED_EVAL_NONE: - case CEED_EVAL_INTERP: - ierr = CeedRealloc(numemodein + 1, &emodein); CeedChkBackend(ierr); - emodein[numemodein] = emode; - numemodein += 1; - break; - case CEED_EVAL_GRAD: - ierr = CeedRealloc(numemodein + dim, &emodein); CeedChkBackend(ierr); - for (CeedInt d = 0; d < dim; d++) - emodein[numemodein+d] = emode; - numemodein += dim; - break; - case CEED_EVAL_WEIGHT: - case CEED_EVAL_DIV: - case CEED_EVAL_CURL: - break; // Caught by QF Assembly 
+ case CEED_EVAL_NONE: + case CEED_EVAL_INTERP: + CeedCallBackend(CeedRealloc(numemodein + 1, &emodein)); + emodein[numemodein] = emode; + numemodein += 1; + break; + case CEED_EVAL_GRAD: + CeedCallBackend(CeedRealloc(numemodein + dim, &emodein)); + for (CeedInt d = 0; d < dim; d++) emodein[numemodein + d] = emode; + numemodein += dim; + break; + case CEED_EVAL_WEIGHT: + case CEED_EVAL_DIV: + case CEED_EVAL_CURL: + break; // Caught by QF Assembly } } } // Determine active output basis - ierr = CeedOperatorGetFields(op, NULL, NULL, NULL, &opfields); - CeedChkBackend(ierr); - ierr = CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qffields); - CeedChkBackend(ierr); - CeedInt numemodeout = 0; - CeedEvalMode *emodeout = NULL; - CeedBasis basisout = NULL; - CeedElemRestriction rstrout = NULL; + CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &opfields)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qffields)); + CeedInt numemodeout = 0; + CeedEvalMode *emodeout = NULL; + CeedBasis basisout = NULL; + CeedElemRestriction rstrout = NULL; for (CeedInt i = 0; i < numoutputfields; i++) { CeedVector vec; - ierr = CeedOperatorFieldGetVector(opfields[i], &vec); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorFieldGetVector(opfields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { CeedElemRestriction rstr; - ierr = CeedOperatorFieldGetBasis(opfields[i], &basisout); CeedChkBackend(ierr); - ierr = CeedOperatorFieldGetElemRestriction(opfields[i], &rstr); - CeedChkBackend(ierr); - if (rstrout && rstrout != rstr) + CeedCallBackend(CeedOperatorFieldGetBasis(opfields[i], &basisout)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(opfields[i], &rstr)); + if (rstrout && rstrout != rstr) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Backend does not implement multi-field non-composite operator diagonal assembly"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement multi-field non-composite 
operator diagonal assembly"); + // LCOV_EXCL_STOP + } rstrout = rstr; CeedEvalMode emode; - ierr = CeedQFunctionFieldGetEvalMode(qffields[i], &emode); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qffields[i], &emode)); switch (emode) { - case CEED_EVAL_NONE: - case CEED_EVAL_INTERP: - ierr = CeedRealloc(numemodeout + 1, &emodeout); CeedChkBackend(ierr); - emodeout[numemodeout] = emode; - numemodeout += 1; - break; - case CEED_EVAL_GRAD: - ierr = CeedRealloc(numemodeout + dim, &emodeout); CeedChkBackend(ierr); - for (CeedInt d = 0; d < dim; d++) - emodeout[numemodeout+d] = emode; - numemodeout += dim; - break; - case CEED_EVAL_WEIGHT: - case CEED_EVAL_DIV: - case CEED_EVAL_CURL: - break; // Caught by QF Assembly + case CEED_EVAL_NONE: + case CEED_EVAL_INTERP: + CeedCallBackend(CeedRealloc(numemodeout + 1, &emodeout)); + emodeout[numemodeout] = emode; + numemodeout += 1; + break; + case CEED_EVAL_GRAD: + CeedCallBackend(CeedRealloc(numemodeout + dim, &emodeout)); + for (CeedInt d = 0; d < dim; d++) emodeout[numemodeout + d] = emode; + numemodeout += dim; + break; + case CEED_EVAL_WEIGHT: + case CEED_EVAL_DIV: + case CEED_EVAL_CURL: + break; // Caught by QF Assembly } } } // Operator data struct CeedOperator_Cuda *impl; - ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); - ierr = CeedCalloc(1, &impl->diag); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetData(op, &impl)); + CeedCallBackend(CeedCalloc(1, &impl->diag)); CeedOperatorDiag_Cuda *diag = impl->diag; - diag->basisin = basisin; - diag->basisout = basisout; - diag->h_emodein = emodein; - diag->h_emodeout = emodeout; - diag->numemodein = numemodein; - diag->numemodeout = numemodeout; + diag->basisin = basisin; + diag->basisout = basisout; + diag->h_emodein = emodein; + diag->h_emodeout = emodeout; + diag->numemodein = numemodein; + diag->numemodeout = numemodeout; // Assemble kernel char *diagonal_kernel_path, *diagonal_kernel_source; - ierr = 
CeedGetJitAbsolutePath(ceed, - "ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h", - &diagonal_kernel_path); CeedChkBackend(ierr); + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h", &diagonal_kernel_path)); CeedDebug256(ceed, 2, "----- Loading Diagonal Assembly Kernel Source -----\n"); - ierr = CeedLoadSourceToBuffer(ceed, diagonal_kernel_path, - &diagonal_kernel_source); - CeedChkBackend(ierr); - CeedDebug256(ceed, 2, - "----- Loading Diagonal Assembly Source Complete! -----\n"); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, diagonal_kernel_path, &diagonal_kernel_source)); + CeedDebug256(ceed, 2, "----- Loading Diagonal Assembly Source Complete! -----\n"); CeedInt nnodes, nqpts; - ierr = CeedBasisGetNumNodes(basisin, &nnodes); CeedChkBackend(ierr); - ierr = CeedBasisGetNumQuadraturePoints(basisin, &nqpts); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetNumNodes(basisin, &nnodes)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints(basisin, &nqpts)); diag->nnodes = nnodes; - ierr = CeedCompileCuda(ceed, diagonal_kernel_source, &diag->module, 5, - "NUMEMODEIN", numemodein, - "NUMEMODEOUT", numemodeout, - "NNODES", nnodes, - "NQPTS", nqpts, - "NCOMP", ncomp - ); CeedChk_Cu(ceed, ierr); - ierr = CeedGetKernelCuda(ceed, diag->module, "linearDiagonal", - &diag->linearDiagonal); CeedChk_Cu(ceed, ierr); - ierr = CeedGetKernelCuda(ceed, diag->module, "linearPointBlockDiagonal", - &diag->linearPointBlock); - CeedChk_Cu(ceed, ierr); - ierr = CeedFree(&diagonal_kernel_path); CeedChkBackend(ierr); - ierr = CeedFree(&diagonal_kernel_source); CeedChkBackend(ierr); + CeedCallCuda(ceed, CeedCompileCuda(ceed, diagonal_kernel_source, &diag->module, 5, "NUMEMODEIN", numemodein, "NUMEMODEOUT", numemodeout, "NNODES", + nnodes, "NQPTS", nqpts, "NCOMP", ncomp)); + CeedCallCuda(ceed, CeedGetKernelCuda(ceed, diag->module, "linearDiagonal", &diag->linearDiagonal)); + CeedCallCuda(ceed, CeedGetKernelCuda(ceed, 
diag->module, "linearPointBlockDiagonal", &diag->linearPointBlock)); + CeedCallBackend(CeedFree(&diagonal_kernel_path)); + CeedCallBackend(CeedFree(&diagonal_kernel_source)); // Basis matrices - const CeedInt qBytes = nqpts * sizeof(CeedScalar); - const CeedInt iBytes = qBytes * nnodes; - const CeedInt gBytes = qBytes * nnodes * dim; - const CeedInt eBytes = sizeof(CeedEvalMode); + const CeedInt qBytes = nqpts * sizeof(CeedScalar); + const CeedInt iBytes = qBytes * nnodes; + const CeedInt gBytes = qBytes * nnodes * dim; + const CeedInt eBytes = sizeof(CeedEvalMode); const CeedScalar *interpin, *interpout, *gradin, *gradout; // CEED_EVAL_NONE CeedScalar *identity = NULL; - bool evalNone = false; - for (CeedInt i=0; id_identity, iBytes); CeedChk_Cu(ceed, ierr); - ierr = cudaMemcpy(diag->d_identity, identity, iBytes, - cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); + CeedCallBackend(CeedCalloc(nqpts * nnodes, &identity)); + for (CeedInt i = 0; i < (nnodes < nqpts ? nnodes : nqpts); i++) identity[i * nnodes + i] = 1.0; + CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_identity, iBytes)); + CeedCallCuda(ceed, cudaMemcpy(diag->d_identity, identity, iBytes, cudaMemcpyHostToDevice)); } // CEED_EVAL_INTERP - ierr = CeedBasisGetInterp(basisin, &interpin); CeedChkBackend(ierr); - ierr = cudaMalloc((void **)&diag->d_interpin, iBytes); CeedChk_Cu(ceed, ierr); - ierr = cudaMemcpy(diag->d_interpin, interpin, iBytes, - cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); - ierr = CeedBasisGetInterp(basisout, &interpout); CeedChkBackend(ierr); - ierr = cudaMalloc((void **)&diag->d_interpout, iBytes); CeedChk_Cu(ceed, ierr); - ierr = cudaMemcpy(diag->d_interpout, interpout, iBytes, - cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); + CeedCallBackend(CeedBasisGetInterp(basisin, &interpin)); + CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_interpin, iBytes)); + CeedCallCuda(ceed, cudaMemcpy(diag->d_interpin, interpin, iBytes, cudaMemcpyHostToDevice)); + 
CeedCallBackend(CeedBasisGetInterp(basisout, &interpout)); + CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_interpout, iBytes)); + CeedCallCuda(ceed, cudaMemcpy(diag->d_interpout, interpout, iBytes, cudaMemcpyHostToDevice)); // CEED_EVAL_GRAD - ierr = CeedBasisGetGrad(basisin, &gradin); CeedChkBackend(ierr); - ierr = cudaMalloc((void **)&diag->d_gradin, gBytes); CeedChk_Cu(ceed, ierr); - ierr = cudaMemcpy(diag->d_gradin, gradin, gBytes, - cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); - ierr = CeedBasisGetGrad(basisout, &gradout); CeedChkBackend(ierr); - ierr = cudaMalloc((void **)&diag->d_gradout, gBytes); CeedChk_Cu(ceed, ierr); - ierr = cudaMemcpy(diag->d_gradout, gradout, gBytes, - cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); + CeedCallBackend(CeedBasisGetGrad(basisin, &gradin)); + CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_gradin, gBytes)); + CeedCallCuda(ceed, cudaMemcpy(diag->d_gradin, gradin, gBytes, cudaMemcpyHostToDevice)); + CeedCallBackend(CeedBasisGetGrad(basisout, &gradout)); + CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_gradout, gBytes)); + CeedCallCuda(ceed, cudaMemcpy(diag->d_gradout, gradout, gBytes, cudaMemcpyHostToDevice)); // Arrays of emodes - ierr = cudaMalloc((void **)&diag->d_emodein, numemodein * eBytes); - CeedChk_Cu(ceed, ierr); - ierr = cudaMemcpy(diag->d_emodein, emodein, numemodein * eBytes, - cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); - ierr = cudaMalloc((void **)&diag->d_emodeout, numemodeout * eBytes); - CeedChk_Cu(ceed, ierr); - ierr = cudaMemcpy(diag->d_emodeout, emodeout, numemodeout * eBytes, - cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); + CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_emodein, numemodein * eBytes)); + CeedCallCuda(ceed, cudaMemcpy(diag->d_emodein, emodein, numemodein * eBytes, cudaMemcpyHostToDevice)); + CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_emodeout, numemodeout * eBytes)); + CeedCallCuda(ceed, cudaMemcpy(diag->d_emodeout, emodeout, numemodeout * eBytes, 
cudaMemcpyHostToDevice)); // Restriction diag->diagrstr = rstrout; @@ -976,25 +802,21 @@ static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op, //------------------------------------------------------------------------------ // Assemble diagonal common code //------------------------------------------------------------------------------ -static inline int CeedOperatorAssembleDiagonalCore_Cuda(CeedOperator op, - CeedVector assembled, CeedRequest *request, const bool pointBlock) { - int ierr; +static inline int CeedOperatorAssembleDiagonalCore_Cuda(CeedOperator op, CeedVector assembled, CeedRequest *request, const bool pointBlock) { Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedOperator_Cuda *impl; - ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetData(op, &impl)); // Assemble QFunction - CeedVector assembledqf; + CeedVector assembledqf; CeedElemRestriction rstr; - ierr = CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembledqf, - &rstr, request); CeedChkBackend(ierr); - ierr = CeedElemRestrictionDestroy(&rstr); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembledqf, &rstr, request)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr)); // Setup if (!impl->diag) { - ierr = CeedOperatorAssembleDiagonalSetup_Cuda(op, pointBlock); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorAssembleDiagonalSetup_Cuda(op, pointBlock)); } CeedOperatorDiag_Cuda *diag = impl->diag; assert(diag != NULL); @@ -1002,7 +824,7 @@ static inline int CeedOperatorAssembleDiagonalCore_Cuda(CeedOperator op, // Restriction if (pointBlock && !diag->pbdiagrstr) { CeedElemRestriction pbdiagrstr; - ierr = CreatePBRestriction(diag->diagrstr, &pbdiagrstr); CeedChkBackend(ierr); + CeedCallBackend(CreatePBRestriction(diag->diagrstr, &pbdiagrstr)); diag->pbdiagrstr = pbdiagrstr; } 
CeedElemRestriction diagrstr = pointBlock ? diag->pbdiagrstr : diag->diagrstr; @@ -1010,55 +832,40 @@ static inline int CeedOperatorAssembleDiagonalCore_Cuda(CeedOperator op, // Create diagonal vector CeedVector elemdiag = pointBlock ? diag->pbelemdiag : diag->elemdiag; if (!elemdiag) { - ierr = CeedElemRestrictionCreateVector(diagrstr, NULL, &elemdiag); - CeedChkBackend(ierr); - if (pointBlock) - diag->pbelemdiag = elemdiag; - else - diag->elemdiag = elemdiag; + CeedCallBackend(CeedElemRestrictionCreateVector(diagrstr, NULL, &elemdiag)); + if (pointBlock) diag->pbelemdiag = elemdiag; + else diag->elemdiag = elemdiag; } - ierr = CeedVectorSetValue(elemdiag, 0.0); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorSetValue(elemdiag, 0.0)); // Assemble element operator diagonals - CeedScalar *elemdiagarray; + CeedScalar *elemdiagarray; const CeedScalar *assembledqfarray; - ierr = CeedVectorGetArray(elemdiag, CEED_MEM_DEVICE, &elemdiagarray); - CeedChkBackend(ierr); - ierr = CeedVectorGetArrayRead(assembledqf, CEED_MEM_DEVICE, &assembledqfarray); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArray(elemdiag, CEED_MEM_DEVICE, &elemdiagarray)); + CeedCallBackend(CeedVectorGetArrayRead(assembledqf, CEED_MEM_DEVICE, &assembledqfarray)); CeedInt nelem; - ierr = CeedElemRestrictionGetNumElements(diagrstr, &nelem); - CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetNumElements(diagrstr, &nelem)); // Compute the diagonal of B^T D B - int elemsPerBlock = 1; - int grid = nelem/elemsPerBlock+((nelem/elemsPerBlock*elemsPerBlockd_identity, - &diag->d_interpin, &diag->d_gradin, &diag->d_interpout, - &diag->d_gradout, &diag->d_emodein, &diag->d_emodeout, - &assembledqfarray, &elemdiagarray - }; + int elemsPerBlock = 1; + int grid = nelem / elemsPerBlock + ((nelem / elemsPerBlock * elemsPerBlock < nelem) ? 
1 : 0); + void *args[] = {(void *)&nelem, &diag->d_identity, &diag->d_interpin, &diag->d_gradin, &diag->d_interpout, + &diag->d_gradout, &diag->d_emodein, &diag->d_emodeout, &assembledqfarray, &elemdiagarray}; if (pointBlock) { - ierr = CeedRunKernelDimCuda(ceed, diag->linearPointBlock, grid, - diag->nnodes, 1, elemsPerBlock, args); - CeedChkBackend(ierr); + CeedCallBackend(CeedRunKernelDimCuda(ceed, diag->linearPointBlock, grid, diag->nnodes, 1, elemsPerBlock, args)); } else { - ierr = CeedRunKernelDimCuda(ceed, diag->linearDiagonal, grid, - diag->nnodes, 1, elemsPerBlock, args); - CeedChkBackend(ierr); + CeedCallBackend(CeedRunKernelDimCuda(ceed, diag->linearDiagonal, grid, diag->nnodes, 1, elemsPerBlock, args)); } // Restore arrays - ierr = CeedVectorRestoreArray(elemdiag, &elemdiagarray); CeedChkBackend(ierr); - ierr = CeedVectorRestoreArrayRead(assembledqf, &assembledqfarray); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArray(elemdiag, &elemdiagarray)); + CeedCallBackend(CeedVectorRestoreArrayRead(assembledqf, &assembledqfarray)); // Assemble local operator diagonal - ierr = CeedElemRestrictionApply(diagrstr, CEED_TRANSPOSE, elemdiag, - assembled, request); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionApply(diagrstr, CEED_TRANSPOSE, elemdiag, assembled, request)); // Cleanup - ierr = CeedVectorDestroy(&assembledqf); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorDestroy(&assembledqf)); return CEED_ERROR_SUCCESS; } @@ -1066,20 +873,16 @@ static inline int CeedOperatorAssembleDiagonalCore_Cuda(CeedOperator op, //------------------------------------------------------------------------------ // Assemble Linear Diagonal //------------------------------------------------------------------------------ -static int CeedOperatorLinearAssembleAddDiagonal_Cuda(CeedOperator op, - CeedVector assembled, CeedRequest *request) { - int ierr = CeedOperatorAssembleDiagonalCore_Cuda(op, assembled, request, false); - CeedChkBackend(ierr); +static int 
CeedOperatorLinearAssembleAddDiagonal_Cuda(CeedOperator op, CeedVector assembled, CeedRequest *request) { + CeedCallBackend(CeedOperatorAssembleDiagonalCore_Cuda(op, assembled, request, false)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Assemble Linear Point Block Diagonal //------------------------------------------------------------------------------ -static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda(CeedOperator op, - CeedVector assembled, CeedRequest *request) { - int ierr = CeedOperatorAssembleDiagonalCore_Cuda(op, assembled, request, true); - CeedChkBackend(ierr); +static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda(CeedOperator op, CeedVector assembled, CeedRequest *request) { + CeedCallBackend(CeedOperatorAssembleDiagonalCore_Cuda(op, assembled, request, true)); return CEED_ERROR_SUCCESS; } @@ -1087,59 +890,52 @@ static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda(CeedOperator op, // Single operator assembly setup //------------------------------------------------------------------------------ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op) { - int ierr; Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedOperator_Cuda *impl; - ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetData(op, &impl)); // Get intput and output fields - CeedInt num_input_fields, num_output_fields; + CeedInt num_input_fields, num_output_fields; CeedOperatorField *input_fields; CeedOperatorField *output_fields; - ierr = CeedOperatorGetFields(op, &num_input_fields, &input_fields, - &num_output_fields, &output_fields); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &input_fields, &num_output_fields, &output_fields)); // Determine active input basis eval mode CeedQFunction qf; - ierr = 
CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); CeedQFunctionField *qf_fields; - ierr = CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL)); // Note that the kernel will treat each dimension of a gradient action separately; // i.e., when an active input has a CEED_EVAL_GRAD mode, num_emode_in will increment // by dim. However, for the purposes of loading the B matrices, it will be treated // as one mode, and we will load/copy the entire gradient matrix at once, so // num_B_in_mats_to_load will be incremented by 1. - CeedInt num_emode_in = 0, dim = 1, num_B_in_mats_to_load = 0, size_B_in = 0; - CeedEvalMode *eval_mode_in = NULL; //will be of size num_B_in_mats_load - CeedBasis basis_in = NULL; - CeedInt nqpts = 0, esize = 0; + CeedInt num_emode_in = 0, dim = 1, num_B_in_mats_to_load = 0, size_B_in = 0; + CeedEvalMode *eval_mode_in = NULL; // will be of size num_B_in_mats_load + CeedBasis basis_in = NULL; + CeedInt nqpts = 0, esize = 0; CeedElemRestriction rstr_in = NULL; - for (CeedInt i=0; iasmb); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(1, &impl->asmb)); CeedOperatorAssemble_Cuda *asmb = impl->asmb; - asmb->nelem = nelem; + asmb->nelem = nelem; // Compile kernels - int elemsPerBlock = 1; - asmb->elemsPerBlock = elemsPerBlock; - CeedInt block_size = esize * esize * elemsPerBlock; + int elemsPerBlock = 1; + asmb->elemsPerBlock = elemsPerBlock; + CeedInt block_size = esize * esize * elemsPerBlock; Ceed_Cuda *cuda_data; - ierr = CeedGetData(ceed, &cuda_data); CeedChkBackend(ierr); + CeedCallBackend(CeedGetData(ceed, &cuda_data)); char *assembly_kernel_path, *assembly_kernel_source; - ierr = CeedGetJitAbsolutePath(ceed, - "ceed/jit-source/cuda/cuda-ref-operator-assemble.h", - &assembly_kernel_path); CeedChkBackend(ierr); + CeedCallBackend(CeedGetJitAbsolutePath(ceed, 
"ceed/jit-source/cuda/cuda-ref-operator-assemble.h", &assembly_kernel_path)); CeedDebug256(ceed, 2, "----- Loading Assembly Kernel Source -----\n"); - ierr = CeedLoadSourceToBuffer(ceed, assembly_kernel_path, - &assembly_kernel_source); - CeedChkBackend(ierr); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, assembly_kernel_path, &assembly_kernel_source)); CeedDebug256(ceed, 2, "----- Loading Assembly Source Complete! -----\n"); bool fallback = block_size > cuda_data->device_prop.maxThreadsPerBlock; if (fallback) { // Use fallback kernel with 1D threadblock - block_size = esize * elemsPerBlock; + block_size = esize * elemsPerBlock; asmb->block_size_x = esize; asmb->block_size_y = 1; } else { // Use kernel with 2D threadblock asmb->block_size_x = esize; asmb->block_size_y = esize; } - ierr = CeedCompileCuda(ceed, assembly_kernel_source, &asmb->module, 7, - "NELEM", nelem, - "NUMEMODEIN", num_emode_in, - "NUMEMODEOUT", num_emode_out, - "NQPTS", nqpts, - "NNODES", esize, - "BLOCK_SIZE", block_size, - "NCOMP", ncomp - ); CeedChk_Cu(ceed, ierr); - ierr = CeedGetKernelCuda(ceed, asmb->module, - fallback ? "linearAssembleFallback" : "linearAssemble", - &asmb->linearAssemble); CeedChk_Cu(ceed, ierr); - ierr = CeedFree(&assembly_kernel_path); CeedChkBackend(ierr); - ierr = CeedFree(&assembly_kernel_source); CeedChkBackend(ierr); + CeedCallCuda(ceed, CeedCompileCuda(ceed, assembly_kernel_source, &asmb->module, 7, "NELEM", nelem, "NUMEMODEIN", num_emode_in, "NUMEMODEOUT", + num_emode_out, "NQPTS", nqpts, "NNODES", esize, "BLOCK_SIZE", block_size, "NCOMP", ncomp)); + CeedCallCuda(ceed, CeedGetKernelCuda(ceed, asmb->module, fallback ? 
"linearAssembleFallback" : "linearAssemble", &asmb->linearAssemble)); + CeedCallBackend(CeedFree(&assembly_kernel_path)); + CeedCallBackend(CeedFree(&assembly_kernel_source)); // Build 'full' B matrices (not 1D arrays used for tensor-product matrices) const CeedScalar *interp_in, *grad_in; - ierr = CeedBasisGetInterp(basis_in, &interp_in); CeedChkBackend(ierr); - ierr = CeedBasisGetGrad(basis_in, &grad_in); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetInterp(basis_in, &interp_in)); + CeedCallBackend(CeedBasisGetGrad(basis_in, &grad_in)); // Load into B_in, in order that they will be used in eval_mode - const CeedInt inBytes = size_B_in * sizeof(CeedScalar); - CeedInt mat_start = 0; - ierr = cudaMalloc((void **) &asmb->d_B_in, inBytes); CeedChk_Cu(ceed, ierr); + const CeedInt inBytes = size_B_in * sizeof(CeedScalar); + CeedInt mat_start = 0; + CeedCallCuda(ceed, cudaMalloc((void **)&asmb->d_B_in, inBytes)); for (int i = 0; i < num_B_in_mats_to_load; i++) { CeedEvalMode eval_mode = eval_mode_in[i]; if (eval_mode == CEED_EVAL_INTERP) { - ierr = cudaMemcpy(&asmb->d_B_in[mat_start], interp_in, - esize * nqpts * sizeof(CeedScalar), - cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); + CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_in[mat_start], interp_in, esize * nqpts * sizeof(CeedScalar), cudaMemcpyHostToDevice)); mat_start += esize * nqpts; } else if (eval_mode == CEED_EVAL_GRAD) { - ierr = cudaMemcpy(&asmb->d_B_in[mat_start], grad_in, - dim * esize * nqpts * sizeof(CeedScalar), - cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); + CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_in[mat_start], grad_in, dim * esize * nqpts * sizeof(CeedScalar), cudaMemcpyHostToDevice)); mat_start += dim * esize * nqpts; } } @@ -1269,27 +1042,23 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op) { // for now if (basis_out == basis_in) { interp_out = interp_in; - grad_out = grad_in; + grad_out = grad_in; } else { - ierr = CeedBasisGetInterp(basis_out, &interp_out); 
CeedChkBackend(ierr); - ierr = CeedBasisGetGrad(basis_out, &grad_out); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetInterp(basis_out, &interp_out)); + CeedCallBackend(CeedBasisGetGrad(basis_out, &grad_out)); } // Load into B_out, in order that they will be used in eval_mode const CeedInt outBytes = size_B_out * sizeof(CeedScalar); - mat_start = 0; - ierr = cudaMalloc((void **) &asmb->d_B_out, outBytes); CeedChk_Cu(ceed, ierr); + mat_start = 0; + CeedCallCuda(ceed, cudaMalloc((void **)&asmb->d_B_out, outBytes)); for (int i = 0; i < num_B_out_mats_to_load; i++) { CeedEvalMode eval_mode = eval_mode_out[i]; if (eval_mode == CEED_EVAL_INTERP) { - ierr = cudaMemcpy(&asmb->d_B_out[mat_start], interp_out, - esize * nqpts * sizeof(CeedScalar), - cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); + CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_out[mat_start], interp_out, esize * nqpts * sizeof(CeedScalar), cudaMemcpyHostToDevice)); mat_start += esize * nqpts; } else if (eval_mode == CEED_EVAL_GRAD) { - ierr = cudaMemcpy(&asmb->d_B_out[mat_start], grad_out, - dim * esize * nqpts * sizeof(CeedScalar), - cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); + CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_out[mat_start], grad_out, dim * esize * nqpts * sizeof(CeedScalar), cudaMemcpyHostToDevice)); mat_start += dim * esize * nqpts; } } @@ -1305,57 +1074,43 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op) { // modes). 
// TODO: allow multiple active input restrictions/basis objects //------------------------------------------------------------------------------ -static int CeedSingleOperatorAssemble_Cuda(CeedOperator op, CeedInt offset, - CeedVector values) { - - int ierr; +static int CeedSingleOperatorAssemble_Cuda(CeedOperator op, CeedInt offset, CeedVector values) { Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedOperator_Cuda *impl; - ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetData(op, &impl)); // Setup if (!impl->asmb) { - ierr = CeedSingleOperatorAssembleSetup_Cuda(op); - CeedChkBackend(ierr); + CeedCallBackend(CeedSingleOperatorAssembleSetup_Cuda(op)); assert(impl->asmb != NULL); } // Assemble QFunction - CeedVector assembled_qf; + CeedVector assembled_qf; CeedElemRestriction rstr_q; - ierr = CeedOperatorLinearAssembleQFunctionBuildOrUpdate( - op, &assembled_qf, &rstr_q, CEED_REQUEST_IMMEDIATE); CeedChkBackend(ierr); - ierr = CeedElemRestrictionDestroy(&rstr_q); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &rstr_q, CEED_REQUEST_IMMEDIATE)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_q)); CeedScalar *values_array; - ierr = CeedVectorGetArrayWrite(values, CEED_MEM_DEVICE, &values_array); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayWrite(values, CEED_MEM_DEVICE, &values_array)); values_array += offset; const CeedScalar *qf_array; - ierr = CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &qf_array); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &qf_array)); // Compute B^T D B - const CeedInt nelem = impl->asmb->nelem; + const CeedInt nelem = impl->asmb->nelem; const CeedInt elemsPerBlock = impl->asmb->elemsPerBlock; - const CeedInt grid = nelem/elemsPerBlock+(( - 
nelem/elemsPerBlock*elemsPerBlock<nelem)?1:0); - void *args[] = {&impl->asmb->d_B_in, &impl->asmb->d_B_out, - &qf_array, &values_array - }; - ierr = CeedRunKernelDimCuda(ceed, impl->asmb->linearAssemble, grid, - impl->asmb->block_size_x, impl->asmb->block_size_y, - elemsPerBlock, args); - CeedChkBackend(ierr); - + const CeedInt grid = nelem / elemsPerBlock + ((nelem / elemsPerBlock * elemsPerBlock < nelem) ? 1 : 0); + void *args[] = {&impl->asmb->d_B_in, &impl->asmb->d_B_out, &qf_array, &values_array}; + CeedCallBackend( + CeedRunKernelDimCuda(ceed, impl->asmb->linearAssemble, grid, impl->asmb->block_size_x, impl->asmb->block_size_y, elemsPerBlock, args)); // Restore arrays - ierr = CeedVectorRestoreArray(values, &values_array); CeedChkBackend(ierr); - ierr = CeedVectorRestoreArrayRead(assembled_qf, &qf_array); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArray(values, &values_array)); + CeedCallBackend(CeedVectorRestoreArrayRead(assembled_qf, &qf_array)); // Cleanup - ierr = CeedVectorDestroy(&assembled_qf); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorDestroy(&assembled_qf)); return CEED_ERROR_SUCCESS; } @@ -1364,35 +1119,21 @@ static int CeedSingleOperatorAssemble_Cuda(CeedOperator op, CeedInt offset, // Create operator //------------------------------------------------------------------------------ int CeedOperatorCreate_Cuda(CeedOperator op) { - int ierr; Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedOperator_Cuda *impl; - ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr); - ierr = CeedOperatorSetData(op, impl); CeedChkBackend(ierr); - - ierr = CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", - CeedOperatorLinearAssembleQFunction_Cuda); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Operator", op, - "LinearAssembleQFunctionUpdate", - CeedOperatorLinearAssembleQFunctionUpdate_Cuda); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Operator", op, 
"LinearAssembleAddDiagonal", - CeedOperatorLinearAssembleAddDiagonal_Cuda); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Operator", op, - "LinearAssembleAddPointBlockDiagonal", - CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Operator", op, - "LinearAssembleSingle", CeedSingleOperatorAssemble_Cuda); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", - CeedOperatorApplyAdd_Cuda); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Operator", op, "Destroy", - CeedOperatorDestroy_Cuda); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(1, &impl)); + CeedCallBackend(CeedOperatorSetData(op, impl)); + + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonal_Cuda)); + CeedCallBackend( + CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddPointBlockDiagonal", CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedSingleOperatorAssemble_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Cuda)); return CEED_ERROR_SUCCESS; } diff --git a/backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp b/backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp index 780285cb17..b5714ec974 100644 --- a/backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp +++ b/backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp @@ -5,56 +5,52 @@ // // This file is part of CEED: 
http://github.com/ceed -#include #include +#include #include +#include + #include #include -#include -#include "ceed-cuda-ref.h" + #include "../cuda/ceed-cuda-compile.h" +#include "ceed-cuda-ref.h" //------------------------------------------------------------------------------ // Build QFunction kernel //------------------------------------------------------------------------------ extern "C" int CeedCudaBuildQFunction(CeedQFunction qf) { - CeedInt ierr; using std::ostringstream; using std::string; Ceed ceed; CeedQFunctionGetCeed(qf, &ceed); CeedQFunction_Cuda *data; - ierr = CeedQFunctionGetData(qf, (void **)&data); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetData(qf, (void **)&data)); // QFunction is built - if (data->QFunction) - return CEED_ERROR_SUCCESS; + if (data->QFunction) return CEED_ERROR_SUCCESS; - if (!data->qfunction_source) + if (!data->qfunction_source) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "No QFunction source or CUfunction provided."); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_BACKEND, "No QFunction source or CUfunction provided."); + // LCOV_EXCL_STOP + } // QFunction kernel generation - CeedInt num_input_fields, num_output_fields, size; + CeedInt num_input_fields, num_output_fields, size; CeedQFunctionField *input_fields, *output_fields; - ierr = CeedQFunctionGetFields(qf, &num_input_fields, &input_fields, - &num_output_fields, &output_fields); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetFields(qf, &num_input_fields, &input_fields, &num_output_fields, &output_fields)); // Build strings for final kernel char *read_write_kernel_path, *read_write_kernel_source; - ierr = CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-qfunction.h", - &read_write_kernel_path); CeedChkBackend(ierr); + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-qfunction.h", &read_write_kernel_path)); CeedDebug256(ceed, 2, "----- Loading QFunction Read/Write Kernel Source 
-----\n"); - ierr = CeedLoadSourceToBuffer(ceed, read_write_kernel_path, &read_write_kernel_source); - CeedChkBackend(ierr); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, read_write_kernel_path, &read_write_kernel_source)); CeedDebug256(ceed, 2, "----- Loading QFunction Read/Write Kernel Source Complete! -----\n"); - string qfunction_source(data->qfunction_source); - string qfunction_name(data->qfunction_name); - string read_write(read_write_kernel_source); - string kernel_name = "CeedKernelCudaRefQFunction_" + qfunction_name; + string qfunction_source(data->qfunction_source); + string qfunction_name(data->qfunction_name); + string read_write(read_write_kernel_source); + string kernel_name = "CeedKernelCudaRefQFunction_" + qfunction_name; ostringstream code; // Defintions @@ -66,7 +62,7 @@ extern "C" int CeedCudaBuildQFunction(CeedQFunction qf) { // Inputs code << " // Input fields\n"; for (CeedInt i = 0; i < num_input_fields; i++) { - ierr = CeedQFunctionFieldGetSize(input_fields[i], &size); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionFieldGetSize(input_fields[i], &size)); code << " const CeedInt size_input_" << i << " = " << size << ";\n"; code << " CeedScalar input_" << i << "[size_input_" << i << "];\n"; } @@ -79,7 +75,7 @@ extern "C" int CeedCudaBuildQFunction(CeedQFunction qf) { // Outputs code << " // Output fields\n"; for (CeedInt i = 0; i < num_output_fields; i++) { - ierr = CeedQFunctionFieldGetSize(output_fields[i], &size); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionFieldGetSize(output_fields[i], &size)); code << " const CeedInt size_output_" << i << " = " << size << ";\n"; code << " CeedScalar output_" << i << "[size_output_" << i << "];\n"; } @@ -117,15 +113,13 @@ extern "C" int CeedCudaBuildQFunction(CeedQFunction qf) { CeedDebug(ceed, code.str().c_str()); // Compile kernel - ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 0); - CeedChkBackend(ierr); - ierr = CeedGetKernelCuda(ceed, data->module, 
kernel_name.c_str(), &data->QFunction); - CeedChkBackend(ierr); + CeedCallBackend(CeedCompileCuda(ceed, code.str().c_str(), &data->module, 0)); + CeedCallBackend(CeedGetKernelCuda(ceed, data->module, kernel_name.c_str(), &data->QFunction)); // Cleanup - ierr = CeedFree(&data->qfunction_source); CeedChkBackend(ierr); - ierr = CeedFree(&read_write_kernel_path); CeedChkBackend(ierr); - ierr = CeedFree(&read_write_kernel_source); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&data->qfunction_source)); + CeedCallBackend(CeedFree(&read_write_kernel_path)); + CeedCallBackend(CeedFree(&read_write_kernel_source)); return CEED_ERROR_SUCCESS; } diff --git a/backends/cuda-ref/ceed-cuda-ref-qfunction-load.h b/backends/cuda-ref/ceed-cuda-ref-qfunction-load.h index 565a024528..00912a6e96 100644 --- a/backends/cuda-ref/ceed-cuda-ref-qfunction-load.h +++ b/backends/cuda-ref/ceed-cuda-ref-qfunction-load.h @@ -10,4 +10,4 @@ CEED_INTERN int CeedCudaBuildQFunction(CeedQFunction qf); -#endif // _ceed_cuda_qfunction_load_h +#endif // _ceed_cuda_qfunction_load_h diff --git a/backends/cuda-ref/ceed-cuda-ref-qfunction.c b/backends/cuda-ref/ceed-cuda-ref-qfunction.c index 9d5129b23d..b806057a70 100644 --- a/backends/cuda-ref/ceed-cuda-ref-qfunction.c +++ b/backends/cuda-ref/ceed-cuda-ref-qfunction.c @@ -5,67 +5,58 @@ // // This file is part of CEED: http://github.com/ceed -#include #include +#include #include #include #include -#include "ceed-cuda-ref.h" -#include "ceed-cuda-ref-qfunction-load.h" + #include "../cuda/ceed-cuda-compile.h" +#include "ceed-cuda-ref-qfunction-load.h" +#include "ceed-cuda-ref.h" //------------------------------------------------------------------------------ // Apply QFunction //------------------------------------------------------------------------------ -static int CeedQFunctionApply_Cuda(CeedQFunction qf, CeedInt Q, - CeedVector *U, CeedVector *V) { - int ierr; +static int CeedQFunctionApply_Cuda(CeedQFunction qf, CeedInt Q, CeedVector *U, CeedVector *V) { 
Ceed ceed; - ierr = CeedQFunctionGetCeed(qf, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); // Build and compile kernel, if not done - ierr = CeedCudaBuildQFunction(qf); CeedChkBackend(ierr); + CeedCallBackend(CeedCudaBuildQFunction(qf)); CeedQFunction_Cuda *data; - ierr = CeedQFunctionGetData(qf, &data); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetData(qf, &data)); Ceed_Cuda *ceed_Cuda; - ierr = CeedGetData(ceed, &ceed_Cuda); CeedChkBackend(ierr); + CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); CeedInt num_input_fields, num_output_fields; - ierr = CeedQFunctionGetNumArgs(qf, &num_input_fields, &num_output_fields); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetNumArgs(qf, &num_input_fields, &num_output_fields)); // Read vectors for (CeedInt i = 0; i < num_input_fields; i++) { - ierr = CeedVectorGetArrayRead(U[i], CEED_MEM_DEVICE, &data->fields.inputs[i]); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayRead(U[i], CEED_MEM_DEVICE, &data->fields.inputs[i])); } for (CeedInt i = 0; i < num_output_fields; i++) { - ierr = CeedVectorGetArrayWrite(V[i], CEED_MEM_DEVICE, &data->fields.outputs[i]); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayWrite(V[i], CEED_MEM_DEVICE, &data->fields.outputs[i])); } // Get context data - ierr = CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &data->d_c); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &data->d_c)); // Run kernel - void *args[] = {&data->d_c, (void *) &Q, &data->fields}; - ierr = CeedRunKernelAutoblockCuda(ceed, data->QFunction, Q, args); - CeedChkBackend(ierr); + void *args[] = {&data->d_c, (void *)&Q, &data->fields}; + CeedCallBackend(CeedRunKernelAutoblockCuda(ceed, data->QFunction, Q, args)); // Restore vectors for (CeedInt i = 0; i < num_input_fields; i++) { - ierr = CeedVectorRestoreArrayRead(U[i], &data->fields.inputs[i]); - CeedChkBackend(ierr); + 
CeedCallBackend(CeedVectorRestoreArrayRead(U[i], &data->fields.inputs[i])); } for (CeedInt i = 0; i < num_output_fields; i++) { - ierr = CeedVectorRestoreArray(V[i], &data->fields.outputs[i]); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArray(V[i], &data->fields.outputs[i])); } // Restore context - ierr = CeedQFunctionRestoreInnerContextData(qf, &data->d_c); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &data->d_c)); return CEED_ERROR_SUCCESS; } @@ -74,14 +65,12 @@ static int CeedQFunctionApply_Cuda(CeedQFunction qf, CeedInt Q, // Destroy QFunction //------------------------------------------------------------------------------ static int CeedQFunctionDestroy_Cuda(CeedQFunction qf) { - int ierr; CeedQFunction_Cuda *data; - ierr = CeedQFunctionGetData(qf, &data); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetData(qf, &data)); Ceed ceed; - ierr = CeedQFunctionGetCeed(qf, &ceed); CeedChkBackend(ierr); - if (data->module) - CeedChk_Cu(ceed, cuModuleUnload(data->module)); - ierr = CeedFree(&data); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); + if (data->module) CeedCallCuda(ceed, cuModuleUnload(data->module)); + CeedCallBackend(CeedFree(&data)); return CEED_ERROR_SUCCESS; } @@ -89,13 +78,10 @@ static int CeedQFunctionDestroy_Cuda(CeedQFunction qf) { //------------------------------------------------------------------------------ // Set User QFunction //------------------------------------------------------------------------------ -static int CeedQFunctionSetCUDAUserFunction_Cuda(CeedQFunction qf, - CUfunction f) { - int ierr; +static int CeedQFunctionSetCUDAUserFunction_Cuda(CeedQFunction qf, CUfunction f) { CeedQFunction_Cuda *data; - ierr = CeedQFunctionGetData(qf, &data); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetData(qf, &data)); data->QFunction = f; - return CEED_ERROR_SUCCESS; } @@ -103,29 +89,22 @@ static int CeedQFunctionSetCUDAUserFunction_Cuda(CeedQFunction 
qf, // Create QFunction //------------------------------------------------------------------------------ int CeedQFunctionCreate_Cuda(CeedQFunction qf) { - int ierr; Ceed ceed; CeedQFunctionGetCeed(qf, &ceed); CeedQFunction_Cuda *data; - ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); - ierr = CeedQFunctionSetData(qf, data); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(1, &data)); + CeedCallBackend(CeedQFunctionSetData(qf, data)); // Read QFunction source - ierr = CeedQFunctionGetKernelName(qf, &data->qfunction_name); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetKernelName(qf, &data->qfunction_name)); CeedDebug256(ceed, 2, "----- Loading QFunction User Source -----\n"); - ierr = CeedQFunctionLoadSourceToBuffer(qf, &data->qfunction_source); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionLoadSourceToBuffer(qf, &data->qfunction_source)); CeedDebug256(ceed, 2, "----- Loading QFunction User Source Complete! -----\n"); // Register backend functions - ierr = CeedSetBackendFunction(ceed, "QFunction", qf, "Apply", - CeedQFunctionApply_Cuda); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "QFunction", qf, "Destroy", - CeedQFunctionDestroy_Cuda); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "QFunction", qf, "SetCUDAUserFunction", - CeedQFunctionSetCUDAUserFunction_Cuda); - CeedChkBackend(ierr); + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "SetCUDAUserFunction", CeedQFunctionSetCUDAUserFunction_Cuda)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ diff --git a/backends/cuda-ref/ceed-cuda-ref-qfunctioncontext.c b/backends/cuda-ref/ceed-cuda-ref-qfunctioncontext.c index 271666ebca..b7fb8a3fa6 100644 --- 
a/backends/cuda-ref/ceed-cuda-ref-qfunctioncontext.c +++ b/backends/cuda-ref/ceed-cuda-ref-qfunctioncontext.c @@ -5,44 +5,41 @@ // // This file is part of CEED: http://github.com/ceed -#include #include +#include #include #include + #include "ceed-cuda-ref.h" //------------------------------------------------------------------------------ // Sync host to device //------------------------------------------------------------------------------ -static inline int CeedQFunctionContextSyncH2D_Cuda( - const CeedQFunctionContext ctx) { - int ierr; +static inline int CeedQFunctionContextSyncH2D_Cuda(const CeedQFunctionContext ctx) { Ceed ceed; - ierr = CeedQFunctionContextGetCeed(ctx, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedQFunctionContext_Cuda *impl; - ierr = CeedQFunctionContextGetBackendData(ctx, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); - if (!impl->h_data) + if (!impl->h_data) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "No valid host data to sync to device"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_BACKEND, "No valid host data to sync to device"); + // LCOV_EXCL_STOP + } size_t ctxsize; - ierr = CeedQFunctionContextGetContextSize(ctx, &ctxsize); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctxsize)); if (impl->d_data_borrowed) { impl->d_data = impl->d_data_borrowed; } else if (impl->d_data_owned) { impl->d_data = impl->d_data_owned; } else { - ierr = cudaMalloc((void **)&impl->d_data_owned, ctxsize); - CeedChk_Cu(ceed, ierr); + CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_data_owned, ctxsize)); impl->d_data = impl->d_data_owned; } - ierr = cudaMemcpy(impl->d_data, impl->h_data, ctxsize, - cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); + CeedCallCuda(ceed, cudaMemcpy(impl->d_data, impl->h_data, ctxsize, cudaMemcpyHostToDevice)); return CEED_ERROR_SUCCESS; } @@ -50,35 +47,31 @@ 
static inline int CeedQFunctionContextSyncH2D_Cuda( //------------------------------------------------------------------------------ // Sync device to host //------------------------------------------------------------------------------ -static inline int CeedQFunctionContextSyncD2H_Cuda( - const CeedQFunctionContext ctx) { - int ierr; +static inline int CeedQFunctionContextSyncD2H_Cuda(const CeedQFunctionContext ctx) { Ceed ceed; - ierr = CeedQFunctionContextGetCeed(ctx, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedQFunctionContext_Cuda *impl; - ierr = CeedQFunctionContextGetBackendData(ctx, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); - if (!impl->d_data) + if (!impl->d_data) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "No valid device data to sync to host"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_BACKEND, "No valid device data to sync to host"); + // LCOV_EXCL_STOP + } size_t ctxsize; - ierr = CeedQFunctionContextGetContextSize(ctx, &ctxsize); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctxsize)); if (impl->h_data_borrowed) { impl->h_data = impl->h_data_borrowed; } else if (impl->h_data_owned) { impl->h_data = impl->h_data_owned; } else { - ierr = CeedMallocArray(1, ctxsize, &impl->h_data_owned); - CeedChkBackend(ierr); + CeedCallBackend(CeedMallocArray(1, ctxsize, &impl->h_data_owned)); impl->h_data = impl->h_data_owned; } - ierr = cudaMemcpy(impl->h_data, impl->d_data, ctxsize, - cudaMemcpyDeviceToHost); CeedChk_Cu(ceed, ierr); + CeedCallCuda(ceed, cudaMemcpy(impl->h_data, impl->d_data, ctxsize, cudaMemcpyDeviceToHost)); return CEED_ERROR_SUCCESS; } @@ -86,11 +79,12 @@ static inline int CeedQFunctionContextSyncD2H_Cuda( //------------------------------------------------------------------------------ // Sync data of type 
//------------------------------------------------------------------------------ -static inline int CeedQFunctionContextSync_Cuda( - const CeedQFunctionContext ctx, CeedMemType mem_type) { +static inline int CeedQFunctionContextSync_Cuda(const CeedQFunctionContext ctx, CeedMemType mem_type) { switch (mem_type) { - case CEED_MEM_HOST: return CeedQFunctionContextSyncD2H_Cuda(ctx); - case CEED_MEM_DEVICE: return CeedQFunctionContextSyncH2D_Cuda(ctx); + case CEED_MEM_HOST: + return CeedQFunctionContextSyncD2H_Cuda(ctx); + case CEED_MEM_DEVICE: + return CeedQFunctionContextSyncH2D_Cuda(ctx); } return CEED_ERROR_UNSUPPORTED; } @@ -98,11 +92,9 @@ static inline int CeedQFunctionContextSync_Cuda( //------------------------------------------------------------------------------ // Set all pointers as invalid //------------------------------------------------------------------------------ -static inline int CeedQFunctionContextSetAllInvalid_Cuda( - const CeedQFunctionContext ctx) { - int ierr; +static inline int CeedQFunctionContextSetAllInvalid_Cuda(const CeedQFunctionContext ctx) { CeedQFunctionContext_Cuda *impl; - ierr = CeedQFunctionContextGetBackendData(ctx, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); impl->h_data = NULL; impl->d_data = NULL; @@ -113,11 +105,9 @@ static inline int CeedQFunctionContextSetAllInvalid_Cuda( //------------------------------------------------------------------------------ // Check if ctx has valid data //------------------------------------------------------------------------------ -static inline int CeedQFunctionContextHasValidData_Cuda( - const CeedQFunctionContext ctx, bool *has_valid_data) { - int ierr; +static inline int CeedQFunctionContextHasValidData_Cuda(const CeedQFunctionContext ctx, bool *has_valid_data) { CeedQFunctionContext_Cuda *impl; - ierr = CeedQFunctionContextGetBackendData(ctx, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, 
&impl)); *has_valid_data = impl && (!!impl->h_data || !!impl->d_data); @@ -127,20 +117,18 @@ static inline int CeedQFunctionContextHasValidData_Cuda( //------------------------------------------------------------------------------ // Check if ctx has borrowed data //------------------------------------------------------------------------------ -static inline int CeedQFunctionContextHasBorrowedDataOfType_Cuda( - const CeedQFunctionContext ctx, CeedMemType mem_type, - bool *has_borrowed_data_of_type) { - int ierr; +static inline int CeedQFunctionContextHasBorrowedDataOfType_Cuda(const CeedQFunctionContext ctx, CeedMemType mem_type, + bool *has_borrowed_data_of_type) { CeedQFunctionContext_Cuda *impl; - ierr = CeedQFunctionContextGetBackendData(ctx, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); switch (mem_type) { - case CEED_MEM_HOST: - *has_borrowed_data_of_type = !!impl->h_data_borrowed; - break; - case CEED_MEM_DEVICE: - *has_borrowed_data_of_type = !!impl->d_data_borrowed; - break; + case CEED_MEM_HOST: + *has_borrowed_data_of_type = !!impl->h_data_borrowed; + break; + case CEED_MEM_DEVICE: + *has_borrowed_data_of_type = !!impl->d_data_borrowed; + break; } return CEED_ERROR_SUCCESS; @@ -149,22 +137,19 @@ static inline int CeedQFunctionContextHasBorrowedDataOfType_Cuda( //------------------------------------------------------------------------------ // Check if data of given type needs sync //------------------------------------------------------------------------------ -static inline int CeedQFunctionContextNeedSync_Cuda( - const CeedQFunctionContext ctx, CeedMemType mem_type, bool *need_sync) { - int ierr; +static inline int CeedQFunctionContextNeedSync_Cuda(const CeedQFunctionContext ctx, CeedMemType mem_type, bool *need_sync) { CeedQFunctionContext_Cuda *impl; - ierr = CeedQFunctionContextGetBackendData(ctx, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); bool 
has_valid_data = true; - ierr = CeedQFunctionContextHasValidData(ctx, &has_valid_data); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextHasValidData(ctx, &has_valid_data)); switch (mem_type) { - case CEED_MEM_HOST: - *need_sync = has_valid_data && !impl->h_data; - break; - case CEED_MEM_DEVICE: - *need_sync = has_valid_data && !impl->d_data; - break; + case CEED_MEM_HOST: + *need_sync = has_valid_data && !impl->h_data; + break; + case CEED_MEM_DEVICE: + *need_sync = has_valid_data && !impl->d_data; + break; } return CEED_ERROR_SUCCESS; @@ -173,32 +158,29 @@ static inline int CeedQFunctionContextNeedSync_Cuda( //------------------------------------------------------------------------------ // Set data from host //------------------------------------------------------------------------------ -static int CeedQFunctionContextSetDataHost_Cuda(const CeedQFunctionContext ctx, - const CeedCopyMode copy_mode, void *data) { - int ierr; +static int CeedQFunctionContextSetDataHost_Cuda(const CeedQFunctionContext ctx, const CeedCopyMode copy_mode, void *data) { CeedQFunctionContext_Cuda *impl; - ierr = CeedQFunctionContextGetBackendData(ctx, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); - ierr = CeedFree(&impl->h_data_owned); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl->h_data_owned)); switch (copy_mode) { - case CEED_COPY_VALUES: { - size_t ctxsize; - ierr = CeedQFunctionContextGetContextSize(ctx, &ctxsize); CeedChkBackend(ierr); - ierr = CeedMallocArray(1, ctxsize, &impl->h_data_owned); - CeedChkBackend(ierr); - impl->h_data_borrowed = NULL; - impl->h_data = impl->h_data_owned; - memcpy(impl->h_data, data, ctxsize); - } break; - case CEED_OWN_POINTER: - impl->h_data_owned = data; - impl->h_data_borrowed = NULL; - impl->h_data = data; - break; - case CEED_USE_POINTER: - impl->h_data_borrowed = data; - impl->h_data = data; - break; + case CEED_COPY_VALUES: { + size_t ctxsize; + 
CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctxsize)); + CeedCallBackend(CeedMallocArray(1, ctxsize, &impl->h_data_owned)); + impl->h_data_borrowed = NULL; + impl->h_data = impl->h_data_owned; + memcpy(impl->h_data, data, ctxsize); + } break; + case CEED_OWN_POINTER: + impl->h_data_owned = data; + impl->h_data_borrowed = NULL; + impl->h_data = data; + break; + case CEED_USE_POINTER: + impl->h_data_borrowed = data; + impl->h_data = data; + break; } return CEED_ERROR_SUCCESS; @@ -207,37 +189,33 @@ static int CeedQFunctionContextSetDataHost_Cuda(const CeedQFunctionContext ctx, //------------------------------------------------------------------------------ // Set data from device //------------------------------------------------------------------------------ -static int CeedQFunctionContextSetDataDevice_Cuda( - const CeedQFunctionContext ctx, const CeedCopyMode copy_mode, void *data) { - int ierr; +static int CeedQFunctionContextSetDataDevice_Cuda(const CeedQFunctionContext ctx, const CeedCopyMode copy_mode, void *data) { Ceed ceed; - ierr = CeedQFunctionContextGetCeed(ctx, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedQFunctionContext_Cuda *impl; - ierr = CeedQFunctionContextGetBackendData(ctx, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); - ierr = cudaFree(impl->d_data_owned); CeedChk_Cu(ceed, ierr); + CeedCallCuda(ceed, cudaFree(impl->d_data_owned)); impl->d_data_owned = NULL; switch (copy_mode) { - case CEED_COPY_VALUES: { - size_t ctxsize; - ierr = CeedQFunctionContextGetContextSize(ctx, &ctxsize); CeedChkBackend(ierr); - ierr = cudaMalloc((void **)&impl->d_data_owned, ctxsize); - CeedChk_Cu(ceed, ierr); - impl->d_data_borrowed = NULL; - impl->d_data = impl->d_data_owned; - ierr = cudaMemcpy(impl->d_data, data, ctxsize, - cudaMemcpyDeviceToDevice); CeedChk_Cu(ceed, ierr); - } break; - case CEED_OWN_POINTER: - impl->d_data_owned = data; - 
impl->d_data_borrowed = NULL; - impl->d_data = data; - break; - case CEED_USE_POINTER: - impl->d_data_owned = NULL; - impl->d_data_borrowed = data; - impl->d_data = data; - break; + case CEED_COPY_VALUES: { + size_t ctxsize; + CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctxsize)); + CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_data_owned, ctxsize)); + impl->d_data_borrowed = NULL; + impl->d_data = impl->d_data_owned; + CeedCallCuda(ceed, cudaMemcpy(impl->d_data, data, ctxsize, cudaMemcpyDeviceToDevice)); + } break; + case CEED_OWN_POINTER: + impl->d_data_owned = data; + impl->d_data_borrowed = NULL; + impl->d_data = data; + break; + case CEED_USE_POINTER: + impl->d_data_owned = NULL; + impl->d_data_borrowed = data; + impl->d_data = data; + break; } return CEED_ERROR_SUCCESS; @@ -247,18 +225,16 @@ static int CeedQFunctionContextSetDataDevice_Cuda( // Set the data used by a user context, // freeing any previously allocated data if applicable //------------------------------------------------------------------------------ -static int CeedQFunctionContextSetData_Cuda(const CeedQFunctionContext ctx, - const CeedMemType mem_type, const CeedCopyMode copy_mode, void *data) { - int ierr; +static int CeedQFunctionContextSetData_Cuda(const CeedQFunctionContext ctx, const CeedMemType mem_type, const CeedCopyMode copy_mode, void *data) { Ceed ceed; - ierr = CeedQFunctionContextGetCeed(ctx, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); - ierr = CeedQFunctionContextSetAllInvalid_Cuda(ctx); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextSetAllInvalid_Cuda(ctx)); switch (mem_type) { - case CEED_MEM_HOST: - return CeedQFunctionContextSetDataHost_Cuda(ctx, copy_mode, data); - case CEED_MEM_DEVICE: - return CeedQFunctionContextSetDataDevice_Cuda(ctx, copy_mode, data); + case CEED_MEM_HOST: + return CeedQFunctionContextSetDataHost_Cuda(ctx, copy_mode, data); + case CEED_MEM_DEVICE: + return 
CeedQFunctionContextSetDataDevice_Cuda(ctx, copy_mode, data); } return CEED_ERROR_UNSUPPORTED; @@ -267,34 +243,29 @@ static int CeedQFunctionContextSetData_Cuda(const CeedQFunctionContext ctx, //------------------------------------------------------------------------------ // Take data //------------------------------------------------------------------------------ -static int CeedQFunctionContextTakeData_Cuda(const CeedQFunctionContext ctx, - const CeedMemType mem_type, void *data) { - int ierr; +static int CeedQFunctionContextTakeData_Cuda(const CeedQFunctionContext ctx, const CeedMemType mem_type, void *data) { Ceed ceed; - ierr = CeedQFunctionContextGetCeed(ctx, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedQFunctionContext_Cuda *impl; - ierr = CeedQFunctionContextGetBackendData(ctx, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); // Sync data to requested mem_type bool need_sync = false; - ierr = CeedQFunctionContextNeedSync_Cuda(ctx, mem_type, &need_sync); - CeedChkBackend(ierr); - if (need_sync) { - ierr = CeedQFunctionContextSync_Cuda(ctx, mem_type); CeedChkBackend(ierr); - } + CeedCallBackend(CeedQFunctionContextNeedSync_Cuda(ctx, mem_type, &need_sync)); + if (need_sync) CeedCallBackend(CeedQFunctionContextSync_Cuda(ctx, mem_type)); // Update pointer switch (mem_type) { - case CEED_MEM_HOST: - *(void **)data = impl->h_data_borrowed; - impl->h_data_borrowed = NULL; - impl->h_data = NULL; - break; - case CEED_MEM_DEVICE: - *(void **)data = impl->d_data_borrowed; - impl->d_data_borrowed = NULL; - impl->d_data = NULL; - break; + case CEED_MEM_HOST: + *(void **)data = impl->h_data_borrowed; + impl->h_data_borrowed = NULL; + impl->h_data = NULL; + break; + case CEED_MEM_DEVICE: + *(void **)data = impl->d_data_borrowed; + impl->d_data_borrowed = NULL; + impl->d_data = NULL; + break; } return CEED_ERROR_SUCCESS; @@ -304,30 +275,25 @@ static int 
CeedQFunctionContextTakeData_Cuda(const CeedQFunctionContext ctx, // Core logic for GetData. // If a different memory type is most up to date, this will perform a copy //------------------------------------------------------------------------------ -static int CeedQFunctionContextGetDataCore_Cuda(const CeedQFunctionContext ctx, - const CeedMemType mem_type, void *data) { - int ierr; +static int CeedQFunctionContextGetDataCore_Cuda(const CeedQFunctionContext ctx, const CeedMemType mem_type, void *data) { Ceed ceed; - ierr = CeedQFunctionContextGetCeed(ctx, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedQFunctionContext_Cuda *impl; - ierr = CeedQFunctionContextGetBackendData(ctx, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); // Sync data to requested mem_type bool need_sync = false; - ierr = CeedQFunctionContextNeedSync_Cuda(ctx, mem_type, &need_sync); - CeedChkBackend(ierr); - if (need_sync) { - ierr = CeedQFunctionContextSync_Cuda(ctx, mem_type); CeedChkBackend(ierr); - } + CeedCallBackend(CeedQFunctionContextNeedSync_Cuda(ctx, mem_type, &need_sync)); + if (need_sync) CeedCallBackend(CeedQFunctionContextSync_Cuda(ctx, mem_type)); // Update pointer switch (mem_type) { - case CEED_MEM_HOST: - *(void **)data = impl->h_data; - break; - case CEED_MEM_DEVICE: - *(void **)data = impl->d_data; - break; + case CEED_MEM_HOST: + *(void **)data = impl->h_data; + break; + case CEED_MEM_DEVICE: + *(void **)data = impl->d_data; + break; } return CEED_ERROR_SUCCESS; @@ -336,32 +302,28 @@ static int CeedQFunctionContextGetDataCore_Cuda(const CeedQFunctionContext ctx, //------------------------------------------------------------------------------ // Get read-only access to the data //------------------------------------------------------------------------------ -static int CeedQFunctionContextGetDataRead_Cuda(const CeedQFunctionContext ctx, - const CeedMemType mem_type, void *data) { 
+static int CeedQFunctionContextGetDataRead_Cuda(const CeedQFunctionContext ctx, const CeedMemType mem_type, void *data) { return CeedQFunctionContextGetDataCore_Cuda(ctx, mem_type, data); } //------------------------------------------------------------------------------ // Get read/write access to the data //------------------------------------------------------------------------------ -static int CeedQFunctionContextGetData_Cuda(const CeedQFunctionContext ctx, - const CeedMemType mem_type, void *data) { - int ierr; +static int CeedQFunctionContextGetData_Cuda(const CeedQFunctionContext ctx, const CeedMemType mem_type, void *data) { CeedQFunctionContext_Cuda *impl; - ierr = CeedQFunctionContextGetBackendData(ctx, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); - ierr = CeedQFunctionContextGetDataCore_Cuda(ctx, mem_type, data); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetDataCore_Cuda(ctx, mem_type, data)); // Mark only pointer for requested memory as valid - ierr = CeedQFunctionContextSetAllInvalid_Cuda(ctx); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextSetAllInvalid_Cuda(ctx)); switch (mem_type) { - case CEED_MEM_HOST: - impl->h_data = *(void **)data; - break; - case CEED_MEM_DEVICE: - impl->d_data = *(void **)data; - break; + case CEED_MEM_HOST: + impl->h_data = *(void **)data; + break; + case CEED_MEM_DEVICE: + impl->d_data = *(void **)data; + break; } return CEED_ERROR_SUCCESS; @@ -371,15 +333,14 @@ static int CeedQFunctionContextGetData_Cuda(const CeedQFunctionContext ctx, // Destroy the user context //------------------------------------------------------------------------------ static int CeedQFunctionContextDestroy_Cuda(const CeedQFunctionContext ctx) { - int ierr; Ceed ceed; - ierr = CeedQFunctionContextGetCeed(ctx, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedQFunctionContext_Cuda *impl; - ierr = 
CeedQFunctionContextGetBackendData(ctx, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); - ierr = cudaFree(impl->d_data_owned); CeedChk_Cu(ceed, ierr); - ierr = CeedFree(&impl->h_data_owned); CeedChkBackend(ierr); - ierr = CeedFree(&impl); CeedChkBackend(ierr); + CeedCallCuda(ceed, cudaFree(impl->d_data_owned)); + CeedCallBackend(CeedFree(&impl->h_data_owned)); + CeedCallBackend(CeedFree(&impl)); return CEED_ERROR_SUCCESS; } @@ -388,31 +349,20 @@ static int CeedQFunctionContextDestroy_Cuda(const CeedQFunctionContext ctx) { // QFunctionContext Create //------------------------------------------------------------------------------ int CeedQFunctionContextCreate_Cuda(CeedQFunctionContext ctx) { - int ierr; CeedQFunctionContext_Cuda *impl; - Ceed ceed; - ierr = CeedQFunctionContextGetCeed(ctx, &ceed); CeedChkBackend(ierr); - - ierr = CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "HasValidData", - CeedQFunctionContextHasValidData_Cuda); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "QFunctionContext", ctx, - "HasBorrowedDataOfType", - CeedQFunctionContextHasBorrowedDataOfType_Cuda); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "SetData", - CeedQFunctionContextSetData_Cuda); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "TakeData", - CeedQFunctionContextTakeData_Cuda); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "GetData", - CeedQFunctionContextGetData_Cuda); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "GetDataRead", - CeedQFunctionContextGetDataRead_Cuda); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "Destroy", - CeedQFunctionContextDestroy_Cuda); CeedChkBackend(ierr); - - ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr); - ierr = CeedQFunctionContextSetBackendData(ctx, impl); CeedChkBackend(ierr); 
+ Ceed ceed; + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); + + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "HasValidData", CeedQFunctionContextHasValidData_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "HasBorrowedDataOfType", CeedQFunctionContextHasBorrowedDataOfType_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "SetData", CeedQFunctionContextSetData_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "TakeData", CeedQFunctionContextTakeData_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "GetData", CeedQFunctionContextGetData_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "GetDataRead", CeedQFunctionContextGetDataRead_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "Destroy", CeedQFunctionContextDestroy_Cuda)); + + CeedCallBackend(CeedCalloc(1, &impl)); + CeedCallBackend(CeedQFunctionContextSetBackendData(ctx, impl)); return CEED_ERROR_SUCCESS; } diff --git a/backends/cuda-ref/ceed-cuda-ref.c b/backends/cuda-ref/ceed-cuda-ref.c index 443ab2d02e..6bd7b78b6b 100644 --- a/backends/cuda-ref/ceed-cuda-ref.c +++ b/backends/cuda-ref/ceed-cuda-ref.c @@ -5,13 +5,14 @@ // // This file is part of CEED: http://github.com/ceed -#include +#include "ceed-cuda-ref.h" + #include +#include #include #include #include #include -#include "ceed-cuda-ref.h" //------------------------------------------------------------------------------ // CUDA preferred MemType @@ -25,13 +26,10 @@ static int CeedGetPreferredMemType_Cuda(CeedMemType *mem_type) { // Get CUBLAS handle //------------------------------------------------------------------------------ int CeedCudaGetCublasHandle(Ceed ceed, cublasHandle_t *handle) { - int ierr; Ceed_Cuda *data; - ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr); + CeedCallBackend(CeedGetData(ceed, &data)); - if 
(!data->cublas_handle) { - ierr = cublasCreate(&data->cublas_handle); CeedChk_Cublas(ceed, ierr); - } + if (!data->cublas_handle) CeedCallCublas(ceed, cublasCreate(&data->cublas_handle)); *handle = data->cublas_handle; return CEED_ERROR_SUCCESS; } @@ -40,53 +38,36 @@ int CeedCudaGetCublasHandle(Ceed ceed, cublasHandle_t *handle) { // Backend Init //------------------------------------------------------------------------------ static int CeedInit_Cuda(const char *resource, Ceed ceed) { - int ierr; - char *resource_root; - ierr = CeedCudaGetResourceRoot(ceed, resource, &resource_root); - CeedChkBackend(ierr); - if (strcmp(resource_root, "/gpu/cuda/ref")) + CeedCallBackend(CeedCudaGetResourceRoot(ceed, resource, &resource_root)); + if (strcmp(resource_root, "/gpu/cuda/ref")) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Cuda backend cannot use resource: %s", resource); - // LCOV_EXCL_STOP - ierr = CeedFree(&resource_root); CeedChkBackend(ierr); - ierr = CeedSetDeterministic(ceed, true); CeedChkBackend(ierr); + return CeedError(ceed, CEED_ERROR_BACKEND, "Cuda backend cannot use resource: %s", resource); + // LCOV_EXCL_STOP + } + CeedCallBackend(CeedFree(&resource_root)); + CeedCallBackend(CeedSetDeterministic(ceed, true)); Ceed_Cuda *data; - ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); - ierr = CeedSetData(ceed, data); CeedChkBackend(ierr); - ierr = CeedCudaInit(ceed, resource); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(1, &data)); + CeedCallBackend(CeedSetData(ceed, data)); + CeedCallBackend(CeedCudaInit(ceed, resource)); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "GetPreferredMemType", - CeedGetPreferredMemType_Cuda); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "VectorCreate", - CeedVectorCreate_Cuda); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateTensorH1", - CeedBasisCreateTensorH1_Cuda); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, 
"Ceed", ceed, "BasisCreateH1", - CeedBasisCreateH1_Cuda); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "ElemRestrictionCreate", - CeedElemRestrictionCreate_Cuda); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, - "ElemRestrictionCreateBlocked", - CeedElemRestrictionCreateBlocked_Cuda); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", - CeedQFunctionCreate_Cuda); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionContextCreate", - CeedQFunctionContextCreate_Cuda); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", - CeedOperatorCreate_Cuda); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", - CeedDestroy_Cuda); CeedChkBackend(ierr); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "GetPreferredMemType", CeedGetPreferredMemType_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "VectorCreate", CeedVectorCreate_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateTensorH1", CeedBasisCreateTensorH1_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateH1", CeedBasisCreateH1_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "ElemRestrictionCreate", CeedElemRestrictionCreate_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "ElemRestrictionCreateBlocked", CeedElemRestrictionCreateBlocked_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionContextCreate", CeedQFunctionContextCreate_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Cuda)); return CEED_ERROR_SUCCESS; } 
//------------------------------------------------------------------------------ // Backend Register //------------------------------------------------------------------------------ -CEED_INTERN int CeedRegister_Cuda(void) { - return CeedRegister("/gpu/cuda/ref", CeedInit_Cuda, 40); -} +CEED_INTERN int CeedRegister_Cuda(void) { return CeedRegister("/gpu/cuda/ref", CeedInit_Cuda, 40); } //------------------------------------------------------------------------------ diff --git a/backends/cuda-ref/ceed-cuda-ref.h b/backends/cuda-ref/ceed-cuda-ref.h index 70e860f0d1..2c4e83ac77 100644 --- a/backends/cuda-ref/ceed-cuda-ref.h +++ b/backends/cuda-ref/ceed-cuda-ref.h @@ -8,9 +8,10 @@ #ifndef _ceed_cuda_h #define _ceed_cuda_h -#include #include +#include #include + #include "../cuda/ceed-cuda-common.h" typedef struct { @@ -23,48 +24,48 @@ typedef struct { } CeedVector_Cuda; typedef struct { - CUmodule module; + CUmodule module; CUfunction StridedTranspose; CUfunction StridedNoTranspose; CUfunction OffsetTranspose; CUfunction OffsetNoTranspose; - CeedInt num_nodes; - CeedInt *h_ind; - CeedInt *h_ind_allocated; - CeedInt *d_ind; - CeedInt *d_ind_allocated; - CeedInt *d_t_offsets; - CeedInt *d_t_indices; - CeedInt *d_l_vec_indices; + CeedInt num_nodes; + CeedInt *h_ind; + CeedInt *h_ind_allocated; + CeedInt *d_ind; + CeedInt *d_ind_allocated; + CeedInt *d_t_offsets; + CeedInt *d_t_indices; + CeedInt *d_l_vec_indices; } CeedElemRestriction_Cuda; typedef struct { - CUmodule module; - CUfunction Interp; - CUfunction Grad; - CUfunction Weight; + CUmodule module; + CUfunction Interp; + CUfunction Grad; + CUfunction Weight; CeedScalar *d_interp_1d; CeedScalar *d_grad_1d; CeedScalar *d_q_weight_1d; } CeedBasis_Cuda; typedef struct { - CUmodule module; - CUfunction Interp; - CUfunction Grad; - CUfunction Weight; + CUmodule module; + CUfunction Interp; + CUfunction Grad; + CUfunction Weight; CeedScalar *d_interp; CeedScalar *d_grad; CeedScalar *d_q_weight; } CeedBasisNonTensor_Cuda; 
typedef struct { - CUmodule module; - char *qfunction_name; - char *qfunction_source; - CUfunction QFunction; + CUmodule module; + char *qfunction_name; + char *qfunction_source; + CUfunction QFunction; Fields_Cuda fields; - void *d_c; + void *d_c; } CeedQFunction_Cuda; typedef struct { @@ -77,34 +78,34 @@ typedef struct { } CeedQFunctionContext_Cuda; typedef struct { - CUmodule module; - CUfunction linearDiagonal; - CUfunction linearPointBlock; - CeedBasis basisin, basisout; + CUmodule module; + CUfunction linearDiagonal; + CUfunction linearPointBlock; + CeedBasis basisin, basisout; CeedElemRestriction diagrstr, pbdiagrstr; - CeedVector elemdiag, pbelemdiag; - CeedInt numemodein, numemodeout, nnodes; - CeedEvalMode *h_emodein, *h_emodeout; - CeedEvalMode *d_emodein, *d_emodeout; - CeedScalar *d_identity, *d_interpin, *d_interpout, *d_gradin, *d_gradout; + CeedVector elemdiag, pbelemdiag; + CeedInt numemodein, numemodeout, nnodes; + CeedEvalMode *h_emodein, *h_emodeout; + CeedEvalMode *d_emodein, *d_emodeout; + CeedScalar *d_identity, *d_interpin, *d_interpout, *d_gradin, *d_gradout; } CeedOperatorDiag_Cuda; typedef struct { - CUmodule module; - CUfunction linearAssemble; - CeedInt nelem, block_size_x, block_size_y, elemsPerBlock; + CUmodule module; + CUfunction linearAssemble; + CeedInt nelem, block_size_x, block_size_y, elemsPerBlock; CeedScalar *d_B_in, *d_B_out; } CeedOperatorAssemble_Cuda; typedef struct { - CeedVector *evecs; // E-vectors, inputs followed by outputs - CeedVector *qvecsin; // Input Q-vectors needed to apply operator - CeedVector *qvecsout; // Output Q-vectors needed to apply operator - CeedInt numein; - CeedInt numeout; - CeedInt qfnumactivein, qfnumactiveout; - CeedVector *qfactivein; - CeedOperatorDiag_Cuda *diag; + CeedVector *evecs; // E-vectors, inputs followed by outputs + CeedVector *qvecsin; // Input Q-vectors needed to apply operator + CeedVector *qvecsout; // Output Q-vectors needed to apply operator + CeedInt numein; + CeedInt 
numeout; + CeedInt qfnumactivein, qfnumactiveout; + CeedVector *qfactivein; + CeedOperatorDiag_Cuda *diag; CeedOperatorAssemble_Cuda *asmb; } CeedOperator_Cuda; @@ -112,29 +113,20 @@ CEED_INTERN int CeedCudaGetCublasHandle(Ceed ceed, cublasHandle_t *handle); CEED_INTERN int CeedVectorCreate_Cuda(CeedSize n, CeedVector vec); -CEED_INTERN int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, - CeedCopyMode copy_mode, const CeedInt *indices, CeedElemRestriction r); +CEED_INTERN int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *indices, CeedElemRestriction r); -CEED_INTERN int CeedElemRestrictionCreateBlocked_Cuda(const CeedMemType - mem_type, - const CeedCopyMode copy_mode, const CeedInt *indices, - const CeedElemRestriction res); +CEED_INTERN int CeedElemRestrictionCreateBlocked_Cuda(const CeedMemType mem_type, const CeedCopyMode copy_mode, const CeedInt *indices, + const CeedElemRestriction res); -CEED_INTERN int CeedBasisApplyElems_Cuda(CeedBasis basis, - const CeedInt num_elem, - CeedTransposeMode t_mode, CeedEvalMode eval_mode, const CeedVector u, - CeedVector v); +CEED_INTERN int CeedBasisApplyElems_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, + const CeedVector u, CeedVector v); -CEED_INTERN int CeedQFunctionApplyElems_Cuda(CeedQFunction qf, const CeedInt Q, - const CeedVector *const u, const CeedVector *v); +CEED_INTERN int CeedQFunctionApplyElems_Cuda(CeedQFunction qf, const CeedInt Q, const CeedVector *const u, const CeedVector *v); -CEED_INTERN int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, - CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, - const CeedScalar *qref_1d, const CeedScalar *qweight_1d, CeedBasis basis); +CEED_INTERN int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, + const CeedScalar *qref_1d, const CeedScalar *qweight_1d, CeedBasis 
basis); -CEED_INTERN int CeedBasisCreateH1_Cuda(CeedElemTopology, CeedInt, CeedInt, - CeedInt, const CeedScalar *, - const CeedScalar *, const CeedScalar *, +CEED_INTERN int CeedBasisCreateH1_Cuda(CeedElemTopology, CeedInt, CeedInt, CeedInt, const CeedScalar *, const CeedScalar *, const CeedScalar *, const CeedScalar *, CeedBasis); CEED_INTERN int CeedQFunctionCreate_Cuda(CeedQFunction qf); diff --git a/backends/cuda-ref/ceed-cuda-restriction.c b/backends/cuda-ref/ceed-cuda-restriction.c index 9f90613299..c5f9736bce 100644 --- a/backends/cuda-ref/ceed-cuda-restriction.c +++ b/backends/cuda-ref/ceed-cuda-restriction.c @@ -5,47 +5,46 @@ // // This file is part of CEED: http://github.com/ceed -#include #include +#include #include #include #include #include #include #include -#include "ceed-cuda-ref.h" + #include "../cuda/ceed-cuda-compile.h" +#include "ceed-cuda-ref.h" //------------------------------------------------------------------------------ // Apply restriction //------------------------------------------------------------------------------ -static int CeedElemRestrictionApply_Cuda(CeedElemRestriction r, - CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { - int ierr; +static int CeedElemRestrictionApply_Cuda(CeedElemRestriction r, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { CeedElemRestriction_Cuda *impl; - ierr = CeedElemRestrictionGetData(r, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); Ceed ceed; - ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); Ceed_Cuda *data; - ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr); + CeedCallBackend(CeedGetData(ceed, &data)); const CeedInt warp_size = 32; const CeedInt block_size = warp_size; - const CeedInt num_nodes = impl->num_nodes; - CeedInt num_elem, elem_size; + const CeedInt num_nodes = impl->num_nodes; + CeedInt num_elem, elem_size; 
CeedElemRestrictionGetNumElements(r, &num_elem); - ierr = CeedElemRestrictionGetElementSize(r, &elem_size); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); CUfunction kernel; // Get vectors const CeedScalar *d_u; - CeedScalar *d_v; - ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr); + CeedScalar *d_v; + CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); if (t_mode == CEED_TRANSPOSE) { // Sum into for transpose mode, e-vec to l-vec - ierr = CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); } else { // Overwrite for notranspose mode, l-vec to e-vec - ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); } // Restrict @@ -53,80 +52,66 @@ static int CeedElemRestrictionApply_Cuda(CeedElemRestriction r, // L-vector -> E-vector if (impl->d_ind) { // -- Offsets provided - kernel = impl->OffsetNoTranspose; - void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; - CeedInt block_size = elem_size < 1024 ? (elem_size > 32 ? elem_size : 32) : - 1024; - ierr = CeedRunKernelCuda(ceed, kernel, CeedDivUpInt(num_nodes, block_size), - block_size, args); CeedChkBackend(ierr); + kernel = impl->OffsetNoTranspose; + void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; + CeedInt block_size = elem_size < 1024 ? (elem_size > 32 ? elem_size : 32) : 1024; + CeedCallBackend(CeedRunKernelCuda(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args)); } else { // -- Strided restriction - kernel = impl->StridedNoTranspose; - void *args[] = {&num_elem, &d_u, &d_v}; - CeedInt block_size = elem_size < 1024 ? (elem_size > 32 ? 
elem_size : 32) : - 1024; - ierr = CeedRunKernelCuda(ceed, kernel, CeedDivUpInt(num_nodes, block_size), - block_size, args); CeedChkBackend(ierr); + kernel = impl->StridedNoTranspose; + void *args[] = {&num_elem, &d_u, &d_v}; + CeedInt block_size = elem_size < 1024 ? (elem_size > 32 ? elem_size : 32) : 1024; + CeedCallBackend(CeedRunKernelCuda(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args)); } } else { // E-vector -> L-vector if (impl->d_ind) { // -- Offsets provided - kernel = impl->OffsetTranspose; - void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, - &impl->d_t_offsets, &d_u, &d_v - }; - ierr = CeedRunKernelCuda(ceed, kernel, CeedDivUpInt(num_nodes, block_size), - block_size, args); CeedChkBackend(ierr); + kernel = impl->OffsetTranspose; + void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; + CeedCallBackend(CeedRunKernelCuda(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args)); } else { // -- Strided restriction - kernel = impl->StridedTranspose; + kernel = impl->StridedTranspose; void *args[] = {&num_elem, &d_u, &d_v}; - ierr = CeedRunKernelCuda(ceed, kernel, CeedDivUpInt(num_nodes, block_size), - block_size, args); CeedChkBackend(ierr); + CeedCallBackend(CeedRunKernelCuda(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args)); } } - if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED) - *request = NULL; + if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED) *request = NULL; // Restore arrays - ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr); - ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); + CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Blocked not supported 
//------------------------------------------------------------------------------ -int CeedElemRestrictionApplyBlock_Cuda(CeedElemRestriction r, CeedInt block, - CeedTransposeMode t_mode, CeedVector u, - CeedVector v, CeedRequest *request) { +int CeedElemRestrictionApplyBlock_Cuda(CeedElemRestriction r, CeedInt block, CeedTransposeMode t_mode, CeedVector u, CeedVector v, + CeedRequest *request) { // LCOV_EXCL_START - int ierr; Ceed ceed; - ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); - return CeedError(ceed, CEED_ERROR_BACKEND, - "Backend does not implement blocked restrictions"); + CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); + return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement blocked restrictions"); // LCOV_EXCL_STOP } //------------------------------------------------------------------------------ // Get offsets //------------------------------------------------------------------------------ -static int CeedElemRestrictionGetOffsets_Cuda(CeedElemRestriction rstr, - CeedMemType m_type, const CeedInt **offsets) { - int ierr; +static int CeedElemRestrictionGetOffsets_Cuda(CeedElemRestriction rstr, CeedMemType m_type, const CeedInt **offsets) { CeedElemRestriction_Cuda *impl; - ierr = CeedElemRestrictionGetData(rstr, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); switch (m_type) { - case CEED_MEM_HOST: - *offsets = impl->h_ind; - break; - case CEED_MEM_DEVICE: - *offsets = impl->d_ind; - break; + case CEED_MEM_HOST: + *offsets = impl->h_ind; + break; + case CEED_MEM_DEVICE: + *offsets = impl->d_ind; + break; } return CEED_ERROR_SUCCESS; } @@ -135,19 +120,18 @@ static int CeedElemRestrictionGetOffsets_Cuda(CeedElemRestriction rstr, // Destroy restriction //------------------------------------------------------------------------------ static int CeedElemRestrictionDestroy_Cuda(CeedElemRestriction r) { - int ierr; CeedElemRestriction_Cuda *impl; - ierr = 
CeedElemRestrictionGetData(r, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); Ceed ceed; - ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); - ierr = cuModuleUnload(impl->module); CeedChk_Cu(ceed, ierr); - ierr = CeedFree(&impl->h_ind_allocated); CeedChkBackend(ierr); - ierr = cudaFree(impl->d_ind_allocated); CeedChk_Cu(ceed, ierr); - ierr = cudaFree(impl->d_t_offsets); CeedChk_Cu(ceed, ierr); - ierr = cudaFree(impl->d_t_indices); CeedChk_Cu(ceed, ierr); - ierr = cudaFree(impl->d_l_vec_indices); CeedChk_Cu(ceed, ierr); - ierr = CeedFree(&impl); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); + CeedCallCuda(ceed, cuModuleUnload(impl->module)); + CeedCallBackend(CeedFree(&impl->h_ind_allocated)); + CeedCallCuda(ceed, cudaFree(impl->d_ind_allocated)); + CeedCallCuda(ceed, cudaFree(impl->d_t_offsets)); + CeedCallCuda(ceed, cudaFree(impl->d_t_indices)); + CeedCallCuda(ceed, cudaFree(impl->d_l_vec_indices)); + CeedCallBackend(CeedFree(&impl)); return CEED_ERROR_SUCCESS; } @@ -155,92 +139,80 @@ static int CeedElemRestrictionDestroy_Cuda(CeedElemRestriction r) { //------------------------------------------------------------------------------ // Create transpose offsets and indices //------------------------------------------------------------------------------ -static int CeedElemRestrictionOffset_Cuda(const CeedElemRestriction r, - const CeedInt *indices) { - int ierr; +static int CeedElemRestrictionOffset_Cuda(const CeedElemRestriction r, const CeedInt *indices) { Ceed ceed; - ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); CeedElemRestriction_Cuda *impl; - ierr = CeedElemRestrictionGetData(r, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); CeedSize l_size; - CeedInt num_elem, elem_size, num_comp; - ierr = CeedElemRestrictionGetNumElements(r, &num_elem); 
CeedChkBackend(ierr); - ierr = CeedElemRestrictionGetElementSize(r, &elem_size); CeedChkBackend(ierr); - ierr = CeedElemRestrictionGetLVectorSize(r, &l_size); CeedChkBackend(ierr); - ierr = CeedElemRestrictionGetNumComponents(r, &num_comp); CeedChkBackend(ierr); + CeedInt num_elem, elem_size, num_comp; + CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem)); + CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); + CeedCallBackend(CeedElemRestrictionGetLVectorSize(r, &l_size)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); // Count num_nodes bool *is_node; - ierr = CeedCalloc(l_size, &is_node); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(l_size, &is_node)); const CeedInt size_indices = num_elem * elem_size; - for (CeedInt i = 0; i < size_indices; i++) - is_node[indices[i]] = 1; + for (CeedInt i = 0; i < size_indices; i++) is_node[indices[i]] = 1; CeedInt num_nodes = 0; - for (CeedInt i = 0; i < l_size; i++) - num_nodes += is_node[i]; + for (CeedInt i = 0; i < l_size; i++) num_nodes += is_node[i]; impl->num_nodes = num_nodes; // L-vector offsets array CeedInt *ind_to_offset, *l_vec_indices; - ierr = CeedCalloc(l_size, &ind_to_offset); CeedChkBackend(ierr); - ierr = CeedCalloc(num_nodes, &l_vec_indices); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(l_size, &ind_to_offset)); + CeedCallBackend(CeedCalloc(num_nodes, &l_vec_indices)); CeedInt j = 0; - for (CeedInt i = 0; i < l_size; i++) + for (CeedInt i = 0; i < l_size; i++) { if (is_node[i]) { l_vec_indices[j] = i; ind_to_offset[i] = j++; } - ierr = CeedFree(&is_node); CeedChkBackend(ierr); + } + CeedCallBackend(CeedFree(&is_node)); // Compute transpose offsets and indices const CeedInt size_offsets = num_nodes + 1; - CeedInt *t_offsets; - ierr = CeedCalloc(size_offsets, &t_offsets); CeedChkBackend(ierr); + CeedInt *t_offsets; + CeedCallBackend(CeedCalloc(size_offsets, &t_offsets)); CeedInt *t_indices; - ierr = CeedMalloc(size_indices, &t_indices); 
CeedChkBackend(ierr); + CeedCallBackend(CeedMalloc(size_indices, &t_indices)); // Count node multiplicity - for (CeedInt e = 0; e < num_elem; ++e) - for (CeedInt i = 0; i < elem_size; ++i) - ++t_offsets[ind_to_offset[indices[elem_size*e + i]] + 1]; + for (CeedInt e = 0; e < num_elem; ++e) { + for (CeedInt i = 0; i < elem_size; ++i) ++t_offsets[ind_to_offset[indices[elem_size * e + i]] + 1]; + } // Convert to running sum - for (CeedInt i = 1; i < size_offsets; ++i) - t_offsets[i] += t_offsets[i-1]; + for (CeedInt i = 1; i < size_offsets; ++i) t_offsets[i] += t_offsets[i - 1]; // List all E-vec indices associated with L-vec node for (CeedInt e = 0; e < num_elem; ++e) { for (CeedInt i = 0; i < elem_size; ++i) { - const CeedInt lid = elem_size*e + i; - const CeedInt gid = indices[lid]; + const CeedInt lid = elem_size * e + i; + const CeedInt gid = indices[lid]; t_indices[t_offsets[ind_to_offset[gid]]++] = lid; } } // Reset running sum - for (int i = size_offsets - 1; i > 0; --i) - t_offsets[i] = t_offsets[i - 1]; + for (int i = size_offsets - 1; i > 0; --i) t_offsets[i] = t_offsets[i - 1]; t_offsets[0] = 0; // Copy data to device // -- L-vector indices - ierr = cudaMalloc((void **)&impl->d_l_vec_indices, num_nodes*sizeof(CeedInt)); - CeedChk_Cu(ceed, ierr); - ierr = cudaMemcpy(impl->d_l_vec_indices, l_vec_indices, - num_nodes*sizeof(CeedInt), cudaMemcpyHostToDevice); - CeedChk_Cu(ceed, ierr); + CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_l_vec_indices, num_nodes * sizeof(CeedInt))); + CeedCallCuda(ceed, cudaMemcpy(impl->d_l_vec_indices, l_vec_indices, num_nodes * sizeof(CeedInt), cudaMemcpyHostToDevice)); // -- Transpose offsets - ierr = cudaMalloc((void **)&impl->d_t_offsets, size_offsets*sizeof(CeedInt)); - CeedChk_Cu(ceed, ierr); - ierr = cudaMemcpy(impl->d_t_offsets, t_offsets, size_offsets*sizeof(CeedInt), - cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); + CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_t_offsets, size_offsets * sizeof(CeedInt))); + 
CeedCallCuda(ceed, cudaMemcpy(impl->d_t_offsets, t_offsets, size_offsets * sizeof(CeedInt), cudaMemcpyHostToDevice)); // -- Transpose indices - ierr = cudaMalloc((void **)&impl->d_t_indices, size_indices*sizeof(CeedInt)); - CeedChk_Cu(ceed, ierr); - ierr = cudaMemcpy(impl->d_t_indices, t_indices, size_indices*sizeof(CeedInt), - cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); + CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_t_indices, size_indices * sizeof(CeedInt))); + CeedCallCuda(ceed, cudaMemcpy(impl->d_t_indices, t_indices, size_indices * sizeof(CeedInt), cudaMemcpyHostToDevice)); // Cleanup - ierr = CeedFree(&ind_to_offset); CeedChkBackend(ierr); - ierr = CeedFree(&l_vec_indices); CeedChkBackend(ierr); - ierr = CeedFree(&t_offsets); CeedChkBackend(ierr); - ierr = CeedFree(&t_indices); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&ind_to_offset)); + CeedCallBackend(CeedFree(&l_vec_indices)); + CeedCallBackend(CeedFree(&t_offsets)); + CeedCallBackend(CeedFree(&t_indices)); return CEED_ERROR_SUCCESS; } @@ -248,33 +220,30 @@ static int CeedElemRestrictionOffset_Cuda(const CeedElemRestriction r, //------------------------------------------------------------------------------ // Create restriction //------------------------------------------------------------------------------ -int CeedElemRestrictionCreate_Cuda(CeedMemType m_type, CeedCopyMode copy_mode, - const CeedInt *indices, CeedElemRestriction r) { - int ierr; +int CeedElemRestrictionCreate_Cuda(CeedMemType m_type, CeedCopyMode copy_mode, const CeedInt *indices, CeedElemRestriction r) { Ceed ceed; - ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); CeedElemRestriction_Cuda *impl; - ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(1, &impl)); CeedInt num_elem, num_comp, elem_size; - ierr = CeedElemRestrictionGetNumElements(r, &num_elem); CeedChkBackend(ierr); - ierr = 
CeedElemRestrictionGetNumComponents(r, &num_comp); CeedChkBackend(ierr); - ierr = CeedElemRestrictionGetElementSize(r, &elem_size); CeedChkBackend(ierr); - CeedInt size = num_elem * elem_size; - CeedInt strides[3] = {1, size, elem_size}; + CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); + CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); + CeedInt size = num_elem * elem_size; + CeedInt strides[3] = {1, size, elem_size}; CeedInt comp_stride = 1; // Stride data bool is_strided; - ierr = CeedElemRestrictionIsStrided(r, &is_strided); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionIsStrided(r, &is_strided)); if (is_strided) { bool has_backend_strides; - ierr = CeedElemRestrictionHasBackendStrides(r, &has_backend_strides); - CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionHasBackendStrides(r, &has_backend_strides)); if (!has_backend_strides) { - ierr = CeedElemRestrictionGetStrides(r, &strides); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetStrides(r, &strides)); } } else { - ierr = CeedElemRestrictionGetCompStride(r, &comp_stride); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetCompStride(r, &comp_stride)); } impl->h_ind = NULL; @@ -283,131 +252,94 @@ int CeedElemRestrictionCreate_Cuda(CeedMemType m_type, CeedCopyMode copy_mode, impl->d_ind_allocated = NULL; impl->d_t_indices = NULL; impl->d_t_offsets = NULL; - impl->num_nodes = size; - ierr = CeedElemRestrictionSetData(r, impl); CeedChkBackend(ierr); - CeedInt layout[3] = {1, elem_size*num_elem, elem_size}; - ierr = CeedElemRestrictionSetELayout(r, layout); CeedChkBackend(ierr); + impl->num_nodes = size; + CeedCallBackend(CeedElemRestrictionSetData(r, impl)); + CeedInt layout[3] = {1, elem_size * num_elem, elem_size}; + CeedCallBackend(CeedElemRestrictionSetELayout(r, layout)); // Set up device indices/offset arrays if (m_type == CEED_MEM_HOST) { switch 
(copy_mode) { - case CEED_OWN_POINTER: - impl->h_ind_allocated = (CeedInt *)indices; - impl->h_ind = (CeedInt *)indices; - break; - case CEED_USE_POINTER: - impl->h_ind = (CeedInt *)indices; - break; - case CEED_COPY_VALUES: - if (indices != NULL) { - ierr = CeedMalloc(elem_size * num_elem, &impl->h_ind_allocated); - CeedChkBackend(ierr); - memcpy(impl->h_ind_allocated, indices, elem_size * num_elem * sizeof(CeedInt)); - impl->h_ind = impl->h_ind_allocated; - } - break; + case CEED_OWN_POINTER: + impl->h_ind_allocated = (CeedInt *)indices; + impl->h_ind = (CeedInt *)indices; + break; + case CEED_USE_POINTER: + impl->h_ind = (CeedInt *)indices; + break; + case CEED_COPY_VALUES: + if (indices != NULL) { + CeedCallBackend(CeedMalloc(elem_size * num_elem, &impl->h_ind_allocated)); + memcpy(impl->h_ind_allocated, indices, elem_size * num_elem * sizeof(CeedInt)); + impl->h_ind = impl->h_ind_allocated; + } + break; } if (indices != NULL) { - ierr = cudaMalloc( (void **)&impl->d_ind, size * sizeof(CeedInt)); - CeedChk_Cu(ceed, ierr); - impl->d_ind_allocated = impl->d_ind; // We own the device memory - ierr = cudaMemcpy(impl->d_ind, indices, size * sizeof(CeedInt), - cudaMemcpyHostToDevice); - CeedChk_Cu(ceed, ierr); - ierr = CeedElemRestrictionOffset_Cuda(r, indices); CeedChkBackend(ierr); + CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_ind, size * sizeof(CeedInt))); + impl->d_ind_allocated = impl->d_ind; // We own the device memory + CeedCallCuda(ceed, cudaMemcpy(impl->d_ind, indices, size * sizeof(CeedInt), cudaMemcpyHostToDevice)); + CeedCallBackend(CeedElemRestrictionOffset_Cuda(r, indices)); } } else if (m_type == CEED_MEM_DEVICE) { switch (copy_mode) { - case CEED_COPY_VALUES: - if (indices != NULL) { - ierr = cudaMalloc( (void **)&impl->d_ind, size * sizeof(CeedInt)); - CeedChk_Cu(ceed, ierr); - impl->d_ind_allocated = impl->d_ind; // We own the device memory - ierr = cudaMemcpy(impl->d_ind, indices, size * sizeof(CeedInt), - cudaMemcpyDeviceToDevice); - 
CeedChk_Cu(ceed, ierr); - } - break; - case CEED_OWN_POINTER: - impl->d_ind = (CeedInt *)indices; - impl->d_ind_allocated = impl->d_ind; - break; - case CEED_USE_POINTER: - impl->d_ind = (CeedInt *)indices; + case CEED_COPY_VALUES: + if (indices != NULL) { + CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_ind, size * sizeof(CeedInt))); + impl->d_ind_allocated = impl->d_ind; // We own the device memory + CeedCallCuda(ceed, cudaMemcpy(impl->d_ind, indices, size * sizeof(CeedInt), cudaMemcpyDeviceToDevice)); + } + break; + case CEED_OWN_POINTER: + impl->d_ind = (CeedInt *)indices; + impl->d_ind_allocated = impl->d_ind; + break; + case CEED_USE_POINTER: + impl->d_ind = (CeedInt *)indices; } if (indices != NULL) { - ierr = CeedMalloc(elem_size * num_elem, &impl->h_ind_allocated); - CeedChkBackend(ierr); - ierr = cudaMemcpy(impl->h_ind_allocated, impl->d_ind, - elem_size * num_elem * sizeof(CeedInt), cudaMemcpyDeviceToHost); - CeedChk_Cu(ceed, ierr); + CeedCallBackend(CeedMalloc(elem_size * num_elem, &impl->h_ind_allocated)); + CeedCallCuda(ceed, cudaMemcpy(impl->h_ind_allocated, impl->d_ind, elem_size * num_elem * sizeof(CeedInt), cudaMemcpyDeviceToHost)); impl->h_ind = impl->h_ind_allocated; - ierr = CeedElemRestrictionOffset_Cuda(r, indices); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionOffset_Cuda(r, indices)); } } else { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Only MemType = HOST or DEVICE supported"); + return CeedError(ceed, CEED_ERROR_BACKEND, "Only MemType = HOST or DEVICE supported"); // LCOV_EXCL_STOP } // Compile CUDA kernels CeedInt num_nodes = impl->num_nodes; - char *restriction_kernel_path, *restriction_kernel_source; - ierr = CeedGetJitAbsolutePath(ceed, - "ceed/jit-source/cuda/cuda-ref-restriction.h", - &restriction_kernel_path); CeedChkBackend(ierr); + char *restriction_kernel_path, *restriction_kernel_source; + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction.h", 
&restriction_kernel_path)); CeedDebug256(ceed, 2, "----- Loading Restriction Kernel Source -----\n"); - ierr = CeedLoadSourceToBuffer(ceed, restriction_kernel_path, - &restriction_kernel_source); - CeedChkBackend(ierr); - CeedDebug256(ceed, 2, - "----- Loading Restriction Kernel Source Complete! -----\n"); - ierr = CeedCompileCuda(ceed, restriction_kernel_source, &impl->module, 8, - "RESTR_ELEM_SIZE", elem_size, - "RESTR_NUM_ELEM", num_elem, - "RESTR_NUM_COMP", num_comp, - "RESTR_NUM_NODES", num_nodes, - "RESTR_COMP_STRIDE", comp_stride, - "RESTR_STRIDE_NODES", strides[0], - "RESTR_STRIDE_COMP", strides[1], - "RESTR_STRIDE_ELEM", strides[2]); CeedChkBackend(ierr); - ierr = CeedGetKernelCuda(ceed, impl->module, "StridedTranspose", - &impl->StridedTranspose); CeedChkBackend(ierr); - ierr = CeedGetKernelCuda(ceed, impl->module, "StridedNoTranspose", - &impl->StridedNoTranspose); CeedChkBackend(ierr); - ierr = CeedGetKernelCuda(ceed, impl->module, "OffsetTranspose", - &impl->OffsetTranspose); CeedChkBackend(ierr); - ierr = CeedGetKernelCuda(ceed, impl->module, "OffsetNoTranspose", - &impl->OffsetNoTranspose); CeedChkBackend(ierr); - ierr = CeedFree(&restriction_kernel_path); CeedChkBackend(ierr); - ierr = CeedFree(&restriction_kernel_source); CeedChkBackend(ierr); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); + CeedDebug256(ceed, 2, "----- Loading Restriction Kernel Source Complete! 
-----\n"); + CeedCallBackend(CeedCompileCuda(ceed, restriction_kernel_source, &impl->module, 8, "RESTR_ELEM_SIZE", elem_size, "RESTR_NUM_ELEM", num_elem, + "RESTR_NUM_COMP", num_comp, "RESTR_NUM_NODES", num_nodes, "RESTR_COMP_STRIDE", comp_stride, "RESTR_STRIDE_NODES", + strides[0], "RESTR_STRIDE_COMP", strides[1], "RESTR_STRIDE_ELEM", strides[2])); + CeedCallBackend(CeedGetKernelCuda(ceed, impl->module, "StridedTranspose", &impl->StridedTranspose)); + CeedCallBackend(CeedGetKernelCuda(ceed, impl->module, "StridedNoTranspose", &impl->StridedNoTranspose)); + CeedCallBackend(CeedGetKernelCuda(ceed, impl->module, "OffsetTranspose", &impl->OffsetTranspose)); + CeedCallBackend(CeedGetKernelCuda(ceed, impl->module, "OffsetNoTranspose", &impl->OffsetNoTranspose)); + CeedCallBackend(CeedFree(&restriction_kernel_path)); + CeedCallBackend(CeedFree(&restriction_kernel_source)); // Register backend functions - ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "Apply", - CeedElemRestrictionApply_Cuda); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "ApplyBlock", - CeedElemRestrictionApplyBlock_Cuda); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "GetOffsets", - CeedElemRestrictionGetOffsets_Cuda); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "Destroy", - CeedElemRestrictionDestroy_Cuda); - CeedChkBackend(ierr); + CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "Apply", CeedElemRestrictionApply_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "ApplyBlock", CeedElemRestrictionApplyBlock_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "GetOffsets", CeedElemRestrictionGetOffsets_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "Destroy", CeedElemRestrictionDestroy_Cuda)); return CEED_ERROR_SUCCESS; } 
//------------------------------------------------------------------------------ // Blocked not supported //------------------------------------------------------------------------------ -int CeedElemRestrictionCreateBlocked_Cuda(const CeedMemType m_type, - const CeedCopyMode copy_mode, const CeedInt *indices, CeedElemRestriction r) { - int ierr; +int CeedElemRestrictionCreateBlocked_Cuda(const CeedMemType m_type, const CeedCopyMode copy_mode, const CeedInt *indices, CeedElemRestriction r) { Ceed ceed; - ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); - return CeedError(ceed, CEED_ERROR_BACKEND, - "Backend does not implement blocked restrictions"); + CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); + return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement blocked restrictions"); } //------------------------------------------------------------------------------ diff --git a/backends/cuda-ref/ceed-cuda-vector.c b/backends/cuda-ref/ceed-cuda-vector.c index a6dcb2462c..bcccade430 100644 --- a/backends/cuda-ref/ceed-cuda-vector.c +++ b/backends/cuda-ref/ceed-cuda-vector.c @@ -5,33 +5,31 @@ // // This file is part of CEED: http://github.com/ceed -#include #include +#include #include #include #include #include -#include "ceed-cuda-ref.h" +#include "ceed-cuda-ref.h" //------------------------------------------------------------------------------ // Check if host/device sync is needed //------------------------------------------------------------------------------ -static inline int CeedVectorNeedSync_Cuda(const CeedVector vec, - CeedMemType mem_type, bool *need_sync) { - int ierr; +static inline int CeedVectorNeedSync_Cuda(const CeedVector vec, CeedMemType mem_type, bool *need_sync) { CeedVector_Cuda *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(vec, &impl)); bool has_valid_array = false; - ierr = CeedVectorHasValidArray(vec, &has_valid_array); CeedChkBackend(ierr); + 
CeedCallBackend(CeedVectorHasValidArray(vec, &has_valid_array)); switch (mem_type) { - case CEED_MEM_HOST: - *need_sync = has_valid_array && !impl->h_array; - break; - case CEED_MEM_DEVICE: - *need_sync = has_valid_array && !impl->d_array; - break; + case CEED_MEM_HOST: + *need_sync = has_valid_array && !impl->h_array; + break; + case CEED_MEM_DEVICE: + *need_sync = has_valid_array && !impl->d_array; + break; } return CEED_ERROR_SUCCESS; @@ -41,20 +39,19 @@ static inline int CeedVectorNeedSync_Cuda(const CeedVector vec, // Sync host to device //------------------------------------------------------------------------------ static inline int CeedVectorSyncH2D_Cuda(const CeedVector vec) { - int ierr; Ceed ceed; - ierr = CeedVectorGetCeed(vec, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedVector_Cuda *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(vec, &impl)); - if (!impl->h_array) + if (!impl->h_array) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "No valid host data to sync to device"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_BACKEND, "No valid host data to sync to device"); + // LCOV_EXCL_STOP + } CeedSize length; - ierr = CeedVectorGetLength(vec, &length); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetLength(vec, &length)); size_t bytes = length * sizeof(CeedScalar); if (impl->d_array_borrowed) { @@ -62,13 +59,11 @@ static inline int CeedVectorSyncH2D_Cuda(const CeedVector vec) { } else if (impl->d_array_owned) { impl->d_array = impl->d_array_owned; } else { - ierr = cudaMalloc((void **)&impl->d_array_owned, bytes); - CeedChk_Cu(ceed, ierr); + CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_array_owned, bytes)); impl->d_array = impl->d_array_owned; } - ierr = cudaMemcpy(impl->d_array, impl->h_array, bytes, - cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); + CeedCallCuda(ceed, cudaMemcpy(impl->d_array, impl->h_array, bytes, 
cudaMemcpyHostToDevice)); return CEED_ERROR_SUCCESS; } @@ -77,17 +72,16 @@ static inline int CeedVectorSyncH2D_Cuda(const CeedVector vec) { // Sync device to host //------------------------------------------------------------------------------ static inline int CeedVectorSyncD2H_Cuda(const CeedVector vec) { - int ierr; Ceed ceed; - ierr = CeedVectorGetCeed(vec, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedVector_Cuda *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(vec, &impl)); - if (!impl->d_array) + if (!impl->d_array) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "No valid device data to sync to host"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_BACKEND, "No valid device data to sync to host"); + // LCOV_EXCL_STOP + } if (impl->h_array_borrowed) { impl->h_array = impl->h_array_borrowed; @@ -95,16 +89,15 @@ static inline int CeedVectorSyncD2H_Cuda(const CeedVector vec) { impl->h_array = impl->h_array_owned; } else { CeedSize length; - ierr = CeedVectorGetLength(vec, &length); CeedChkBackend(ierr); - ierr = CeedCalloc(length, &impl->h_array_owned); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetLength(vec, &length)); + CeedCallBackend(CeedCalloc(length, &impl->h_array_owned)); impl->h_array = impl->h_array_owned; } CeedSize length; - ierr = CeedVectorGetLength(vec, &length); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetLength(vec, &length)); size_t bytes = length * sizeof(CeedScalar); - ierr = cudaMemcpy(impl->h_array, impl->d_array, bytes, - cudaMemcpyDeviceToHost); CeedChk_Cu(ceed, ierr); + CeedCallCuda(ceed, cudaMemcpy(impl->h_array, impl->d_array, bytes, cudaMemcpyDeviceToHost)); return CEED_ERROR_SUCCESS; } @@ -112,19 +105,17 @@ static inline int CeedVectorSyncD2H_Cuda(const CeedVector vec) { //------------------------------------------------------------------------------ // Sync arrays 
//------------------------------------------------------------------------------ -static int CeedVectorSyncArray_Cuda(const CeedVector vec, - CeedMemType mem_type) { - int ierr; +static int CeedVectorSyncArray_Cuda(const CeedVector vec, CeedMemType mem_type) { // Check whether device/host sync is needed bool need_sync = false; - ierr = CeedVectorNeedSync_Cuda(vec, mem_type, &need_sync); - CeedChkBackend(ierr); - if (!need_sync) - return CEED_ERROR_SUCCESS; + CeedCallBackend(CeedVectorNeedSync_Cuda(vec, mem_type, &need_sync)); + if (!need_sync) return CEED_ERROR_SUCCESS; switch (mem_type) { - case CEED_MEM_HOST: return CeedVectorSyncD2H_Cuda(vec); - case CEED_MEM_DEVICE: return CeedVectorSyncH2D_Cuda(vec); + case CEED_MEM_HOST: + return CeedVectorSyncD2H_Cuda(vec); + case CEED_MEM_DEVICE: + return CeedVectorSyncH2D_Cuda(vec); } return CEED_ERROR_UNSUPPORTED; } @@ -133,9 +124,8 @@ static int CeedVectorSyncArray_Cuda(const CeedVector vec, // Set all pointers as invalid //------------------------------------------------------------------------------ static inline int CeedVectorSetAllInvalid_Cuda(const CeedVector vec) { - int ierr; CeedVector_Cuda *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(vec, &impl)); impl->h_array = NULL; impl->d_array = NULL; @@ -146,11 +136,9 @@ static inline int CeedVectorSetAllInvalid_Cuda(const CeedVector vec) { //------------------------------------------------------------------------------ // Check if CeedVector has any valid pointer //------------------------------------------------------------------------------ -static inline int CeedVectorHasValidArray_Cuda(const CeedVector vec, - bool *has_valid_array) { - int ierr; +static inline int CeedVectorHasValidArray_Cuda(const CeedVector vec, bool *has_valid_array) { CeedVector_Cuda *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(vec, &impl)); *has_valid_array = !!impl->h_array || 
!!impl->d_array; @@ -160,19 +148,17 @@ static inline int CeedVectorHasValidArray_Cuda(const CeedVector vec, //------------------------------------------------------------------------------ // Check if has array of given type //------------------------------------------------------------------------------ -static inline int CeedVectorHasArrayOfType_Cuda(const CeedVector vec, - CeedMemType mem_type, bool *has_array_of_type) { - int ierr; +static inline int CeedVectorHasArrayOfType_Cuda(const CeedVector vec, CeedMemType mem_type, bool *has_array_of_type) { CeedVector_Cuda *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(vec, &impl)); switch (mem_type) { - case CEED_MEM_HOST: - *has_array_of_type = !!impl->h_array_borrowed || !!impl->h_array_owned; - break; - case CEED_MEM_DEVICE: - *has_array_of_type = !!impl->d_array_borrowed || !!impl->d_array_owned; - break; + case CEED_MEM_HOST: + *has_array_of_type = !!impl->h_array_borrowed || !!impl->h_array_owned; + break; + case CEED_MEM_DEVICE: + *has_array_of_type = !!impl->d_array_borrowed || !!impl->d_array_owned; + break; } return CEED_ERROR_SUCCESS; @@ -181,19 +167,17 @@ static inline int CeedVectorHasArrayOfType_Cuda(const CeedVector vec, //------------------------------------------------------------------------------ // Check if has borrowed array of given type //------------------------------------------------------------------------------ -static inline int CeedVectorHasBorrowedArrayOfType_Cuda(const CeedVector vec, - CeedMemType mem_type, bool *has_borrowed_array_of_type) { - int ierr; +static inline int CeedVectorHasBorrowedArrayOfType_Cuda(const CeedVector vec, CeedMemType mem_type, bool *has_borrowed_array_of_type) { CeedVector_Cuda *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(vec, &impl)); switch (mem_type) { - case CEED_MEM_HOST: - *has_borrowed_array_of_type = !!impl->h_array_borrowed; - break; 
- case CEED_MEM_DEVICE: - *has_borrowed_array_of_type = !!impl->d_array_borrowed; - break; + case CEED_MEM_HOST: + *has_borrowed_array_of_type = !!impl->h_array_borrowed; + break; + case CEED_MEM_DEVICE: + *has_borrowed_array_of_type = !!impl->d_array_borrowed; + break; } return CEED_ERROR_SUCCESS; @@ -202,40 +186,37 @@ static inline int CeedVectorHasBorrowedArrayOfType_Cuda(const CeedVector vec, //------------------------------------------------------------------------------ // Set array from host //------------------------------------------------------------------------------ -static int CeedVectorSetArrayHost_Cuda(const CeedVector vec, - const CeedCopyMode copy_mode, CeedScalar *array) { - int ierr; +static int CeedVectorSetArrayHost_Cuda(const CeedVector vec, const CeedCopyMode copy_mode, CeedScalar *array) { CeedVector_Cuda *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); - + CeedCallBackend(CeedVectorGetData(vec, &impl)); switch (copy_mode) { - case CEED_COPY_VALUES: { - CeedSize length; - if (!impl->h_array_owned) { - ierr = CeedVectorGetLength(vec, &length); CeedChkBackend(ierr); - ierr = CeedMalloc(length, &impl->h_array_owned); CeedChkBackend(ierr); - } - impl->h_array_borrowed = NULL; - impl->h_array = impl->h_array_owned; - if (array) { + case CEED_COPY_VALUES: { CeedSize length; - ierr = CeedVectorGetLength(vec, &length); CeedChkBackend(ierr); - size_t bytes = length * sizeof(CeedScalar); - memcpy(impl->h_array, array, bytes); - } - } break; - case CEED_OWN_POINTER: - ierr = CeedFree(&impl->h_array_owned); CeedChkBackend(ierr); - impl->h_array_owned = array; - impl->h_array_borrowed = NULL; - impl->h_array = array; - break; - case CEED_USE_POINTER: - ierr = CeedFree(&impl->h_array_owned); CeedChkBackend(ierr); - impl->h_array_borrowed = array; - impl->h_array = array; - break; + if (!impl->h_array_owned) { + CeedCallBackend(CeedVectorGetLength(vec, &length)); + CeedCallBackend(CeedMalloc(length, &impl->h_array_owned)); + } + 
impl->h_array_borrowed = NULL; + impl->h_array = impl->h_array_owned; + if (array) { + CeedSize length; + CeedCallBackend(CeedVectorGetLength(vec, &length)); + size_t bytes = length * sizeof(CeedScalar); + memcpy(impl->h_array, array, bytes); + } + } break; + case CEED_OWN_POINTER: + CeedCallBackend(CeedFree(&impl->h_array_owned)); + impl->h_array_owned = array; + impl->h_array_borrowed = NULL; + impl->h_array = array; + break; + case CEED_USE_POINTER: + CeedCallBackend(CeedFree(&impl->h_array_owned)); + impl->h_array_borrowed = array; + impl->h_array = array; + break; } return CEED_ERROR_SUCCESS; @@ -244,42 +225,35 @@ static int CeedVectorSetArrayHost_Cuda(const CeedVector vec, //------------------------------------------------------------------------------ // Set array from device //------------------------------------------------------------------------------ -static int CeedVectorSetArrayDevice_Cuda(const CeedVector vec, - const CeedCopyMode copy_mode, CeedScalar *array) { - int ierr; +static int CeedVectorSetArrayDevice_Cuda(const CeedVector vec, const CeedCopyMode copy_mode, CeedScalar *array) { Ceed ceed; - ierr = CeedVectorGetCeed(vec, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedVector_Cuda *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); - + CeedCallBackend(CeedVectorGetData(vec, &impl)); switch (copy_mode) { - case CEED_COPY_VALUES: { - CeedSize length; - ierr = CeedVectorGetLength(vec, &length); CeedChkBackend(ierr); - size_t bytes = length * sizeof(CeedScalar); - if (!impl->d_array_owned) { - ierr = cudaMalloc((void **)&impl->d_array_owned, bytes); - CeedChk_Cu(ceed, ierr); - impl->d_array = impl->d_array_owned; - } - if (array) { - ierr = cudaMemcpy(impl->d_array, array, bytes, - cudaMemcpyDeviceToDevice); CeedChk_Cu(ceed, ierr); - } - } break; - case CEED_OWN_POINTER: - ierr = cudaFree(impl->d_array_owned); CeedChk_Cu(ceed, ierr); - impl->d_array_owned = array; - impl->d_array_borrowed = 
NULL; - impl->d_array = array; - break; - case CEED_USE_POINTER: - ierr = cudaFree(impl->d_array_owned); CeedChk_Cu(ceed, ierr); - impl->d_array_owned = NULL; - impl->d_array_borrowed = array; - impl->d_array = array; - break; + case CEED_COPY_VALUES: { + CeedSize length; + CeedCallBackend(CeedVectorGetLength(vec, &length)); + size_t bytes = length * sizeof(CeedScalar); + if (!impl->d_array_owned) { + CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_array_owned, bytes)); + impl->d_array = impl->d_array_owned; + } + if (array) CeedCallCuda(ceed, cudaMemcpy(impl->d_array, array, bytes, cudaMemcpyDeviceToDevice)); + } break; + case CEED_OWN_POINTER: + CeedCallCuda(ceed, cudaFree(impl->d_array_owned)); + impl->d_array_owned = array; + impl->d_array_borrowed = NULL; + impl->d_array = array; + break; + case CEED_USE_POINTER: + CeedCallCuda(ceed, cudaFree(impl->d_array_owned)); + impl->d_array_owned = NULL; + impl->d_array_borrowed = array; + impl->d_array = array; + break; } return CEED_ERROR_SUCCESS; @@ -289,21 +263,18 @@ static int CeedVectorSetArrayDevice_Cuda(const CeedVector vec, // Set the array used by a vector, // freeing any previously allocated array if applicable //------------------------------------------------------------------------------ -static int CeedVectorSetArray_Cuda(const CeedVector vec, - const CeedMemType mem_type, - const CeedCopyMode copy_mode, CeedScalar *array) { - int ierr; +static int CeedVectorSetArray_Cuda(const CeedVector vec, const CeedMemType mem_type, const CeedCopyMode copy_mode, CeedScalar *array) { Ceed ceed; - ierr = CeedVectorGetCeed(vec, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedVector_Cuda *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(vec, &impl)); - ierr = CeedVectorSetAllInvalid_Cuda(vec); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorSetAllInvalid_Cuda(vec)); switch (mem_type) { - case CEED_MEM_HOST: - return 
CeedVectorSetArrayHost_Cuda(vec, copy_mode, array); - case CEED_MEM_DEVICE: - return CeedVectorSetArrayDevice_Cuda(vec, copy_mode, array); + case CEED_MEM_HOST: + return CeedVectorSetArrayHost_Cuda(vec, copy_mode, array); + case CEED_MEM_DEVICE: + return CeedVectorSetArrayDevice_Cuda(vec, copy_mode, array); } return CEED_ERROR_UNSUPPORTED; @@ -312,30 +283,26 @@ static int CeedVectorSetArray_Cuda(const CeedVector vec, //------------------------------------------------------------------------------ // Set host array to value //------------------------------------------------------------------------------ -static int CeedHostSetValue_Cuda(CeedScalar *h_array, CeedInt length, - CeedScalar val) { - for (int i = 0; i < length; i++) - h_array[i] = val; +static int CeedHostSetValue_Cuda(CeedScalar *h_array, CeedInt length, CeedScalar val) { + for (int i = 0; i < length; i++) h_array[i] = val; return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Set device array to value (impl in .cu file) //------------------------------------------------------------------------------ -int CeedDeviceSetValue_Cuda(CeedScalar *d_array, CeedInt length, - CeedScalar val); +int CeedDeviceSetValue_Cuda(CeedScalar *d_array, CeedInt length, CeedScalar val); //------------------------------------------------------------------------------ // Set a vector to a value, //------------------------------------------------------------------------------ static int CeedVectorSetValue_Cuda(CeedVector vec, CeedScalar val) { - int ierr; Ceed ceed; - ierr = CeedVectorGetCeed(vec, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedVector_Cuda *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedSize length; - ierr = CeedVectorGetLength(vec, &length); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetLength(vec, &length)); // Set value for synced 
device/host array if (!impl->d_array && !impl->h_array) { @@ -348,17 +315,15 @@ static int CeedVectorSetValue_Cuda(CeedVector vec, CeedScalar val) { } else if (impl->h_array_owned) { impl->h_array = impl->h_array_owned; } else { - ierr = CeedVectorSetArray(vec, CEED_MEM_DEVICE, CEED_COPY_VALUES, NULL); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorSetArray(vec, CEED_MEM_DEVICE, CEED_COPY_VALUES, NULL)); } } if (impl->d_array) { - ierr = CeedDeviceSetValue_Cuda(impl->d_array, length, val); - CeedChkBackend(ierr); + CeedCallBackend(CeedDeviceSetValue_Cuda(impl->d_array, length, val)); impl->h_array = NULL; } if (impl->h_array) { - ierr = CeedHostSetValue_Cuda(impl->h_array, length, val); CeedChkBackend(ierr); + CeedCallBackend(CeedHostSetValue_Cuda(impl->h_array, length, val)); impl->d_array = NULL; } @@ -368,29 +333,27 @@ static int CeedVectorSetValue_Cuda(CeedVector vec, CeedScalar val) { //------------------------------------------------------------------------------ // Vector Take Array //------------------------------------------------------------------------------ -static int CeedVectorTakeArray_Cuda(CeedVector vec, CeedMemType mem_type, - CeedScalar **array) { - int ierr; +static int CeedVectorTakeArray_Cuda(CeedVector vec, CeedMemType mem_type, CeedScalar **array) { Ceed ceed; - ierr = CeedVectorGetCeed(vec, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedVector_Cuda *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(vec, &impl)); // Sync array to requested mem_type - ierr = CeedVectorSyncArray(vec, mem_type); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorSyncArray(vec, mem_type)); // Update pointer switch (mem_type) { - case CEED_MEM_HOST: - (*array) = impl->h_array_borrowed; - impl->h_array_borrowed = NULL; - impl->h_array = NULL; - break; - case CEED_MEM_DEVICE: - (*array) = impl->d_array_borrowed; - impl->d_array_borrowed = NULL; - impl->d_array = NULL; - 
break; + case CEED_MEM_HOST: + (*array) = impl->h_array_borrowed; + impl->h_array_borrowed = NULL; + impl->h_array = NULL; + break; + case CEED_MEM_DEVICE: + (*array) = impl->d_array_borrowed; + impl->d_array_borrowed = NULL; + impl->d_array = NULL; + break; } return CEED_ERROR_SUCCESS; @@ -400,25 +363,23 @@ static int CeedVectorTakeArray_Cuda(CeedVector vec, CeedMemType mem_type, // Core logic for array syncronization for GetArray. // If a different memory type is most up to date, this will perform a copy //------------------------------------------------------------------------------ -static int CeedVectorGetArrayCore_Cuda(const CeedVector vec, - const CeedMemType mem_type, CeedScalar **array) { - int ierr; +static int CeedVectorGetArrayCore_Cuda(const CeedVector vec, const CeedMemType mem_type, CeedScalar **array) { Ceed ceed; - ierr = CeedVectorGetCeed(vec, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedVector_Cuda *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(vec, &impl)); // Sync array to requested mem_type - ierr = CeedVectorSyncArray(vec, mem_type); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorSyncArray(vec, mem_type)); // Update pointer switch (mem_type) { - case CEED_MEM_HOST: - *array = impl->h_array; - break; - case CEED_MEM_DEVICE: - *array = impl->d_array; - break; + case CEED_MEM_HOST: + *array = impl->h_array; + break; + case CEED_MEM_DEVICE: + *array = impl->d_array; + break; } return CEED_ERROR_SUCCESS; @@ -426,30 +387,27 @@ static int CeedVectorGetArrayCore_Cuda(const CeedVector vec, //------------------------------------------------------------------------------ // Get read-only access to a vector via the specified mem_type //------------------------------------------------------------------------------ -static int CeedVectorGetArrayRead_Cuda(const CeedVector vec, - const CeedMemType mem_type, const CeedScalar **array) { +static int 
CeedVectorGetArrayRead_Cuda(const CeedVector vec, const CeedMemType mem_type, const CeedScalar **array) { return CeedVectorGetArrayCore_Cuda(vec, mem_type, (CeedScalar **)array); } //------------------------------------------------------------------------------ // Get read/write access to a vector via the specified mem_type //------------------------------------------------------------------------------ -static int CeedVectorGetArray_Cuda(const CeedVector vec, - const CeedMemType mem_type, CeedScalar **array) { - int ierr; +static int CeedVectorGetArray_Cuda(const CeedVector vec, const CeedMemType mem_type, CeedScalar **array) { CeedVector_Cuda *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(vec, &impl)); - ierr = CeedVectorGetArrayCore_Cuda(vec, mem_type, array); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayCore_Cuda(vec, mem_type, array)); - ierr = CeedVectorSetAllInvalid_Cuda(vec); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorSetAllInvalid_Cuda(vec)); switch (mem_type) { - case CEED_MEM_HOST: - impl->h_array = *array; - break; - case CEED_MEM_DEVICE: - impl->d_array = *array; - break; + case CEED_MEM_HOST: + impl->h_array = *array; + break; + case CEED_MEM_DEVICE: + impl->d_array = *array; + break; } return CEED_ERROR_SUCCESS; @@ -458,33 +416,25 @@ static int CeedVectorGetArray_Cuda(const CeedVector vec, //------------------------------------------------------------------------------ // Get write access to a vector via the specified mem_type //------------------------------------------------------------------------------ -static int CeedVectorGetArrayWrite_Cuda(const CeedVector vec, - const CeedMemType mem_type, CeedScalar **array) { - int ierr; +static int CeedVectorGetArrayWrite_Cuda(const CeedVector vec, const CeedMemType mem_type, CeedScalar **array) { CeedVector_Cuda *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(vec, &impl)); 
bool has_array_of_type = true; - ierr = CeedVectorHasArrayOfType_Cuda(vec, mem_type, &has_array_of_type); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorHasArrayOfType_Cuda(vec, mem_type, &has_array_of_type)); if (!has_array_of_type) { // Allocate if array is not yet allocated - ierr = CeedVectorSetArray(vec, mem_type, CEED_COPY_VALUES, NULL); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorSetArray(vec, mem_type, CEED_COPY_VALUES, NULL)); } else { // Select dirty array switch (mem_type) { - case CEED_MEM_HOST: - if (impl->h_array_borrowed) - impl->h_array = impl->h_array_borrowed; - else - impl->h_array = impl->h_array_owned; - break; - case CEED_MEM_DEVICE: - if (impl->d_array_borrowed) - impl->d_array = impl->d_array_borrowed; - else - impl->d_array = impl->d_array_owned; + case CEED_MEM_HOST: + if (impl->h_array_borrowed) impl->h_array = impl->h_array_borrowed; + else impl->h_array = impl->h_array_owned; + break; + case CEED_MEM_DEVICE: + if (impl->d_array_borrowed) impl->d_array = impl->d_array_borrowed; + else impl->d_array = impl->d_array_owned; } } @@ -494,57 +444,50 @@ static int CeedVectorGetArrayWrite_Cuda(const CeedVector vec, //------------------------------------------------------------------------------ // Get the norm of a CeedVector //------------------------------------------------------------------------------ -static int CeedVectorNorm_Cuda(CeedVector vec, CeedNormType type, - CeedScalar *norm) { - int ierr; +static int CeedVectorNorm_Cuda(CeedVector vec, CeedNormType type, CeedScalar *norm) { Ceed ceed; - ierr = CeedVectorGetCeed(vec, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedVector_Cuda *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedSize length; - ierr = CeedVectorGetLength(vec, &length); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetLength(vec, &length)); cublasHandle_t handle; - ierr = 
CeedCudaGetCublasHandle(ceed, &handle); CeedChkBackend(ierr); + CeedCallBackend(CeedCudaGetCublasHandle(ceed, &handle)); // Compute norm const CeedScalar *d_array; - ierr = CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &d_array); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &d_array)); switch (type) { - case CEED_NORM_1: { - if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { - ierr = cublasSasum(handle, length, (float *) d_array, 1, (float *) norm); - } else { - ierr = cublasDasum(handle, length, (double *) d_array, 1, (double *) norm); + case CEED_NORM_1: { + if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { + CeedCallCublas(ceed, cublasSasum(handle, length, (float *)d_array, 1, (float *)norm)); + } else { + CeedCallCublas(ceed, cublasDasum(handle, length, (double *)d_array, 1, (double *)norm)); + } + break; } - CeedChk_Cublas(ceed, ierr); - break; - } - case CEED_NORM_2: { - if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { - ierr = cublasSnrm2(handle, length, (float *) d_array, 1, (float *) norm); - } else { - ierr = cublasDnrm2(handle, length, (double *) d_array, 1, (double *) norm); + case CEED_NORM_2: { + if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { + CeedCallCublas(ceed, cublasSnrm2(handle, length, (float *)d_array, 1, (float *)norm)); + } else { + CeedCallCublas(ceed, cublasDnrm2(handle, length, (double *)d_array, 1, (double *)norm)); + } + break; } - CeedChk_Cublas(ceed, ierr); - break; - } - case CEED_NORM_MAX: { - CeedInt indx; - if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { - ierr = cublasIsamax(handle, length, (float *) d_array, 1, &indx); - } else { - ierr = cublasIdamax(handle, length, (double *) d_array, 1, &indx); + case CEED_NORM_MAX: { + CeedInt indx; + if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { + CeedCallCublas(ceed, cublasIsamax(handle, length, (float *)d_array, 1, &indx)); + } else { + CeedCallCublas(ceed, cublasIdamax(handle, length, (double *)d_array, 1, &indx)); + } + CeedScalar normNoAbs; + CeedCallCuda(ceed, 
cudaMemcpy(&normNoAbs, impl->d_array + indx - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost)); + *norm = fabs(normNoAbs); + break; } - CeedChk_Cublas(ceed, ierr); - CeedScalar normNoAbs; - ierr = cudaMemcpy(&normNoAbs, impl->d_array+indx-1, sizeof(CeedScalar), - cudaMemcpyDeviceToHost); CeedChk_Cu(ceed, ierr); - *norm = fabs(normNoAbs); - break; } - } - ierr = CeedVectorRestoreArrayRead(vec, &d_array); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArrayRead(vec, &d_array)); return CEED_ERROR_SUCCESS; } @@ -553,9 +496,9 @@ static int CeedVectorNorm_Cuda(CeedVector vec, CeedNormType type, // Take reciprocal of a vector on host //------------------------------------------------------------------------------ static int CeedHostReciprocal_Cuda(CeedScalar *h_array, CeedInt length) { - for (int i = 0; i < length; i++) - if (fabs(h_array[i]) > CEED_EPSILON) - h_array[i] = 1./h_array[i]; + for (int i = 0; i < length; i++) { + if (fabs(h_array[i]) > CEED_EPSILON) h_array[i] = 1. / h_array[i]; + } return CEED_ERROR_SUCCESS; } @@ -568,21 +511,16 @@ int CeedDeviceReciprocal_Cuda(CeedScalar *d_array, CeedInt length); // Take reciprocal of a vector //------------------------------------------------------------------------------ static int CeedVectorReciprocal_Cuda(CeedVector vec) { - int ierr; Ceed ceed; - ierr = CeedVectorGetCeed(vec, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedVector_Cuda *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedSize length; - ierr = CeedVectorGetLength(vec, &length); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetLength(vec, &length)); // Set value for synced device/host array - if (impl->d_array) { - ierr = CeedDeviceReciprocal_Cuda(impl->d_array, length); CeedChkBackend(ierr); - } - if (impl->h_array) { - ierr = CeedHostReciprocal_Cuda(impl->h_array, length); CeedChkBackend(ierr); - } + if (impl->d_array) 
CeedCallBackend(CeedDeviceReciprocal_Cuda(impl->d_array, length)); + if (impl->h_array) CeedCallBackend(CeedHostReciprocal_Cuda(impl->h_array, length)); return CEED_ERROR_SUCCESS; } @@ -590,39 +528,30 @@ static int CeedVectorReciprocal_Cuda(CeedVector vec) { //------------------------------------------------------------------------------ // Compute x = alpha x on the host //------------------------------------------------------------------------------ -static int CeedHostScale_Cuda(CeedScalar *x_array, CeedScalar alpha, - CeedInt length) { - for (int i = 0; i < length; i++) - x_array[i] *= alpha; +static int CeedHostScale_Cuda(CeedScalar *x_array, CeedScalar alpha, CeedInt length) { + for (int i = 0; i < length; i++) x_array[i] *= alpha; return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Compute x = alpha x on device (impl in .cu file) //------------------------------------------------------------------------------ -int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha, - CeedInt length); +int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha, CeedInt length); //------------------------------------------------------------------------------ // Compute x = alpha x //------------------------------------------------------------------------------ static int CeedVectorScale_Cuda(CeedVector x, CeedScalar alpha) { - int ierr; Ceed ceed; - ierr = CeedVectorGetCeed(x, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetCeed(x, &ceed)); CeedVector_Cuda *x_impl; - ierr = CeedVectorGetData(x, &x_impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(x, &x_impl)); CeedSize length; - ierr = CeedVectorGetLength(x, &length); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetLength(x, &length)); // Set value for synced device/host array - if (x_impl->d_array) { - ierr = CeedDeviceScale_Cuda(x_impl->d_array, alpha, length); - CeedChkBackend(ierr); - } - if (x_impl->h_array) { - ierr 
= CeedHostScale_Cuda(x_impl->h_array, alpha, length); CeedChkBackend(ierr); - } + if (x_impl->d_array) CeedCallBackend(CeedDeviceScale_Cuda(x_impl->d_array, alpha, length)); + if (x_impl->h_array) CeedCallBackend(CeedHostScale_Cuda(x_impl->h_array, alpha, length)); return CEED_ERROR_SUCCESS; } @@ -630,42 +559,36 @@ static int CeedVectorScale_Cuda(CeedVector x, CeedScalar alpha) { //------------------------------------------------------------------------------ // Compute y = alpha x + y on the host //------------------------------------------------------------------------------ -static int CeedHostAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha, - CeedScalar *x_array, CeedInt length) { - for (int i = 0; i < length; i++) - y_array[i] += alpha * x_array[i]; +static int CeedHostAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedInt length) { + for (int i = 0; i < length; i++) y_array[i] += alpha * x_array[i]; return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Compute y = alpha x + y on device (impl in .cu file) //------------------------------------------------------------------------------ -int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha, - CeedScalar *x_array, CeedInt length); +int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedInt length); //------------------------------------------------------------------------------ // Compute y = alpha x + y //------------------------------------------------------------------------------ static int CeedVectorAXPY_Cuda(CeedVector y, CeedScalar alpha, CeedVector x) { - int ierr; Ceed ceed; - ierr = CeedVectorGetCeed(y, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetCeed(y, &ceed)); CeedVector_Cuda *y_impl, *x_impl; - ierr = CeedVectorGetData(y, &y_impl); CeedChkBackend(ierr); - ierr = CeedVectorGetData(x, &x_impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(y, 
&y_impl)); + CeedCallBackend(CeedVectorGetData(x, &x_impl)); CeedSize length; - ierr = CeedVectorGetLength(y, &length); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetLength(y, &length)); // Set value for synced device/host array if (y_impl->d_array) { - ierr = CeedVectorSyncArray(x, CEED_MEM_DEVICE); CeedChkBackend(ierr); - ierr = CeedDeviceAXPY_Cuda(y_impl->d_array, alpha, x_impl->d_array, length); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_DEVICE)); + CeedCallBackend(CeedDeviceAXPY_Cuda(y_impl->d_array, alpha, x_impl->d_array, length)); } if (y_impl->h_array) { - ierr = CeedVectorSyncArray(x, CEED_MEM_HOST); CeedChkBackend(ierr); - ierr = CeedHostAXPY_Cuda(y_impl->h_array, alpha, x_impl->h_array, length); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_HOST)); + CeedCallBackend(CeedHostAXPY_Cuda(y_impl->h_array, alpha, x_impl->h_array, length)); } return CEED_ERROR_SUCCESS; @@ -674,51 +597,42 @@ static int CeedVectorAXPY_Cuda(CeedVector y, CeedScalar alpha, CeedVector x) { //------------------------------------------------------------------------------ // Compute the pointwise multiplication w = x .* y on the host //------------------------------------------------------------------------------ -static int CeedHostPointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array, - CeedScalar *y_array, CeedInt length) { - for (int i = 0; i < length; i++) - w_array[i] = x_array[i] * y_array[i]; +static int CeedHostPointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedInt length) { + for (int i = 0; i < length; i++) w_array[i] = x_array[i] * y_array[i]; return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Compute the pointwise multiplication w = x .* y on device (impl in .cu file) //------------------------------------------------------------------------------ -int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, 
CeedScalar *x_array, - CeedScalar *y_array, CeedInt length); +int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedInt length); //------------------------------------------------------------------------------ // Compute the pointwise multiplication w = x .* y //------------------------------------------------------------------------------ -static int CeedVectorPointwiseMult_Cuda(CeedVector w, CeedVector x, - CeedVector y) { - int ierr; +static int CeedVectorPointwiseMult_Cuda(CeedVector w, CeedVector x, CeedVector y) { Ceed ceed; - ierr = CeedVectorGetCeed(w, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetCeed(w, &ceed)); CeedVector_Cuda *w_impl, *x_impl, *y_impl; - ierr = CeedVectorGetData(w, &w_impl); CeedChkBackend(ierr); - ierr = CeedVectorGetData(x, &x_impl); CeedChkBackend(ierr); - ierr = CeedVectorGetData(y, &y_impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(w, &w_impl)); + CeedCallBackend(CeedVectorGetData(x, &x_impl)); + CeedCallBackend(CeedVectorGetData(y, &y_impl)); CeedSize length; - ierr = CeedVectorGetLength(w, &length); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetLength(w, &length)); // Set value for synced device/host array if (!w_impl->d_array && !w_impl->h_array) { - ierr = CeedVectorSetValue(w, 0.0); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorSetValue(w, 0.0)); } if (w_impl->d_array) { - ierr = CeedVectorSyncArray(x, CEED_MEM_DEVICE); CeedChkBackend(ierr); - ierr = CeedVectorSyncArray(y, CEED_MEM_DEVICE); CeedChkBackend(ierr); - ierr = CeedDevicePointwiseMult_Cuda(w_impl->d_array, x_impl->d_array, - y_impl->d_array, length); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_DEVICE)); + CeedCallBackend(CeedVectorSyncArray(y, CEED_MEM_DEVICE)); + CeedCallBackend(CeedDevicePointwiseMult_Cuda(w_impl->d_array, x_impl->d_array, y_impl->d_array, length)); } if (w_impl->h_array) { - ierr = CeedVectorSyncArray(x, CEED_MEM_HOST); 
CeedChkBackend(ierr); - ierr = CeedVectorSyncArray(y, CEED_MEM_HOST); CeedChkBackend(ierr); - ierr = CeedHostPointwiseMult_Cuda(w_impl->h_array, x_impl->h_array, - y_impl->h_array, length); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_HOST)); + CeedCallBackend(CeedVectorSyncArray(y, CEED_MEM_HOST)); + CeedCallBackend(CeedHostPointwiseMult_Cuda(w_impl->h_array, x_impl->h_array, y_impl->h_array, length)); } return CEED_ERROR_SUCCESS; @@ -728,15 +642,14 @@ static int CeedVectorPointwiseMult_Cuda(CeedVector w, CeedVector x, // Destroy the vector //------------------------------------------------------------------------------ static int CeedVectorDestroy_Cuda(const CeedVector vec) { - int ierr; Ceed ceed; - ierr = CeedVectorGetCeed(vec, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedVector_Cuda *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(vec, &impl)); - ierr = cudaFree(impl->d_array_owned); CeedChk_Cu(ceed, ierr); - ierr = CeedFree(&impl->h_array_owned); CeedChkBackend(ierr); - ierr = CeedFree(&impl); CeedChkBackend(ierr); + CeedCallCuda(ceed, cudaFree(impl->d_array_owned)); + CeedCallBackend(CeedFree(&impl->h_array_owned)); + CeedCallBackend(CeedFree(&impl)); return CEED_ERROR_SUCCESS; } @@ -746,45 +659,27 @@ static int CeedVectorDestroy_Cuda(const CeedVector vec) { //------------------------------------------------------------------------------ int CeedVectorCreate_Cuda(CeedSize n, CeedVector vec) { CeedVector_Cuda *impl; - int ierr; - Ceed ceed; - ierr = CeedVectorGetCeed(vec, &ceed); CeedChkBackend(ierr); - - ierr = CeedSetBackendFunction(ceed, "Vector", vec, "HasValidArray", - CeedVectorHasValidArray_Cuda); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Vector", vec, "HasBorrowedArrayOfType", - CeedVectorHasBorrowedArrayOfType_Cuda); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Vector", vec, "SetArray", - 
CeedVectorSetArray_Cuda); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Vector", vec, "TakeArray", - CeedVectorTakeArray_Cuda); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Vector", vec, "SetValue", - (int (*)())(CeedVectorSetValue_Cuda)); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Vector", vec, "SyncArray", - CeedVectorSyncArray_Cuda); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Vector", vec, "GetArray", - CeedVectorGetArray_Cuda); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayRead", - CeedVectorGetArrayRead_Cuda); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayWrite", - CeedVectorGetArrayWrite_Cuda); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Vector", vec, "Norm", - CeedVectorNorm_Cuda); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Vector", vec, "Reciprocal", - CeedVectorReciprocal_Cuda); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Vector", vec, "AXPY", - (int (*)())(CeedVectorAXPY_Cuda)); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Vector", vec, "Scale", - (int (*)())(CeedVectorScale_Cuda)); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Vector", vec, "PointwiseMult", - CeedVectorPointwiseMult_Cuda); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Vector", vec, "Destroy", - CeedVectorDestroy_Cuda); CeedChkBackend(ierr); - - ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr); - ierr = CeedVectorSetData(vec, impl); CeedChkBackend(ierr); + Ceed ceed; + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasValidArray", CeedVectorHasValidArray_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasBorrowedArrayOfType", CeedVectorHasBorrowedArrayOfType_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetArray", 
CeedVectorSetArray_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "TakeArray", CeedVectorTakeArray_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetValue", (int (*)())(CeedVectorSetValue_Cuda))); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SyncArray", CeedVectorSyncArray_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArray", CeedVectorGetArray_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayRead", CeedVectorGetArrayRead_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayWrite", CeedVectorGetArrayWrite_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Norm", CeedVectorNorm_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Reciprocal", CeedVectorReciprocal_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPY", (int (*)())(CeedVectorAXPY_Cuda))); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Scale", (int (*)())(CeedVectorScale_Cuda))); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "PointwiseMult", CeedVectorPointwiseMult_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Destroy", CeedVectorDestroy_Cuda)); + + CeedCallBackend(CeedCalloc(1, &impl)); + CeedCallBackend(CeedVectorSetData(vec, impl)); return CEED_ERROR_SUCCESS; } diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c index cd32868cbd..8fc32ec1be 100644 --- a/backends/cuda-shared/ceed-cuda-shared-basis.c +++ b/backends/cuda-shared/ceed-cuda-shared-basis.c @@ -5,241 +5,170 @@ // // This file is part of CEED: http://github.com/ceed -#include #include +#include #include #include #include #include -#include "ceed-cuda-shared.h" -#include "../cuda/ceed-cuda-compile.h" +#include "../cuda/ceed-cuda-compile.h" +#include "ceed-cuda-shared.h" 
//------------------------------------------------------------------------------ // Device initalization //------------------------------------------------------------------------------ -int CeedCudaInitInterp(CeedScalar *d_B, CeedInt P_1d, CeedInt Q_1d, - CeedScalar **c_B); -int CeedCudaInitGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, - CeedInt Q_1d, CeedScalar **c_B_ptr, - CeedScalar **c_G_ptr); -int CeedCudaInitCollocatedGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, - CeedInt Q_1d, CeedScalar **c_B_ptr, - CeedScalar **c_G_ptr); +int CeedCudaInitInterp(CeedScalar *d_B, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B); +int CeedCudaInitGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr); +int CeedCudaInitCollocatedGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr); //------------------------------------------------------------------------------ // Apply basis //------------------------------------------------------------------------------ -int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, - CeedTransposeMode t_mode, - CeedEvalMode eval_mode, CeedVector u, +int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { - int ierr; Ceed ceed; - ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); Ceed_Cuda *ceed_Cuda; - CeedGetData(ceed, &ceed_Cuda); CeedChkBackend(ierr); + CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); CeedBasis_Cuda_shared *data; - CeedBasisGetData(basis, &data); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetData(basis, &data)); CeedInt dim, num_comp; - ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); - ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr); + 
CeedCallBackend(CeedBasisGetDimension(basis, &dim)); + CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); // Read vectors const CeedScalar *d_u; - CeedScalar *d_v; + CeedScalar *d_v; if (eval_mode != CEED_EVAL_WEIGHT) { - ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); } - ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); // Apply basis operation switch (eval_mode) { - case CEED_EVAL_INTERP: { - CeedInt P_1d, Q_1d; - ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); - ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); - CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); - ierr = CeedCudaInitInterp(data->d_interp_1d, P_1d, Q_1d, &data->c_B); - CeedChkBackend(ierr); - void *interp_args[] = {(void *) &num_elem, &data->c_B, - &d_u, &d_v - }; - if (dim == 1) { - CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], - CeedIntMax(512 / thread_1d, - 1)); // avoid >512 total threads - CeedInt grid = num_elem/elems_per_block + - ((num_elem / elems_per_block*elems_per_block < num_elem) ? 
1 : 0 ); - CeedInt shared_mem = elems_per_block*thread_1d*sizeof(CeedScalar); - if (t_mode == CEED_TRANSPOSE) { - ierr = CeedRunKernelDimSharedCuda(ceed, data->InterpTranspose, grid, thread_1d, - 1, - elems_per_block, shared_mem, - interp_args); CeedChkBackend(ierr); - } else { - ierr = CeedRunKernelDimSharedCuda(ceed, data->Interp, grid, thread_1d, 1, - elems_per_block, shared_mem, - interp_args); CeedChkBackend(ierr); + case CEED_EVAL_INTERP: { + CeedInt P_1d, Q_1d; + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); + CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); + CeedCallBackend(CeedCudaInitInterp(data->d_interp_1d, P_1d, Q_1d, &data->c_B)); + void *interp_args[] = {(void *)&num_elem, &data->c_B, &d_u, &d_v}; + if (dim == 1) { + CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, + 1)); // avoid >512 total threads + CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); + if (t_mode == CEED_TRANSPOSE) { + CeedCallBackend(CeedRunKernelDimSharedCuda(ceed, data->InterpTranspose, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args)); + } else { + CeedCallBackend(CeedRunKernelDimSharedCuda(ceed, data->Interp, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args)); + } + } else if (dim == 2) { + const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; + // elems_per_block must be at least 1 + CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); + CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 
1 : 0); + CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); + if (t_mode == CEED_TRANSPOSE) { + CeedCallBackend( + CeedRunKernelDimSharedCuda(ceed, data->InterpTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); + } else { + CeedCallBackend(CeedRunKernelDimSharedCuda(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); + } + } else if (dim == 3) { + CeedInt elems_per_block = 1; + CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); + if (t_mode == CEED_TRANSPOSE) { + CeedCallBackend( + CeedRunKernelDimSharedCuda(ceed, data->InterpTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); + } else { + CeedCallBackend(CeedRunKernelDimSharedCuda(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); + } } - } else if (dim == 2) { - const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; - // elems_per_block must be at least 1 - CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / - num_comp : 1, 1); - CeedInt grid = num_elem / elems_per_block + - ((num_elem / elems_per_block*elems_per_block < num_elem) ? 
1 : 0 ); - CeedInt shared_mem = elems_per_block*thread_1d*thread_1d*sizeof( - CeedScalar); - if (t_mode == CEED_TRANSPOSE) { - ierr = CeedRunKernelDimSharedCuda(ceed, data->InterpTranspose, grid, thread_1d, - thread_1d, - elems_per_block, shared_mem, - interp_args); CeedChkBackend(ierr); + } break; + case CEED_EVAL_GRAD: { + CeedInt P_1d, Q_1d; + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); + CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); + if (data->d_collo_grad_1d) { + CeedCallBackend(CeedCudaInitCollocatedGrad(data->d_interp_1d, data->d_collo_grad_1d, P_1d, Q_1d, &data->c_B, &data->c_G)); } else { - ierr = CeedRunKernelDimSharedCuda(ceed, data->Interp, grid, thread_1d, - thread_1d, - elems_per_block, shared_mem, - interp_args); CeedChkBackend(ierr); + CeedCallBackend(CeedCudaInitGrad(data->d_interp_1d, data->d_grad_1d, P_1d, Q_1d, &data->c_B, &data->c_G)); } - } else if (dim == 3) { - CeedInt elems_per_block = 1; - CeedInt grid = num_elem / elems_per_block + - ((num_elem / elems_per_block*elems_per_block < num_elem) ? 
1 : 0 ); - CeedInt shared_mem = elems_per_block*thread_1d*thread_1d*sizeof( - CeedScalar); - if (t_mode == CEED_TRANSPOSE) { - ierr = CeedRunKernelDimSharedCuda(ceed, data->InterpTranspose, grid, thread_1d, - thread_1d, - elems_per_block, shared_mem, - interp_args); CeedChkBackend(ierr); - } else { - ierr = CeedRunKernelDimSharedCuda(ceed, data->Interp, grid, thread_1d, - thread_1d, - elems_per_block, shared_mem, - interp_args); CeedChkBackend(ierr); - } - } - } break; - case CEED_EVAL_GRAD: { - CeedInt P_1d, Q_1d; - ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); - ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); - CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); - if (data->d_collo_grad_1d) { - ierr = CeedCudaInitCollocatedGrad(data->d_interp_1d, data->d_collo_grad_1d, - P_1d, - Q_1d, &data->c_B, &data->c_G); - CeedChkBackend(ierr); - } else { - ierr = CeedCudaInitGrad(data->d_interp_1d, data->d_grad_1d, P_1d, - Q_1d, &data->c_B, &data->c_G); - CeedChkBackend(ierr); - } - void *grad_args[] = {(void *) &num_elem, &data->c_B, - &data->c_G, &d_u, &d_v - }; - if (dim == 1) { - CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], - CeedIntMax(512 / thread_1d, - 1)); // avoid >512 total threads - CeedInt grid = num_elem / elems_per_block + - ((num_elem / elems_per_block*elems_per_blockGradTranspose, grid, thread_1d, 1, - elems_per_block, shared_mem, grad_args); - CeedChkBackend(ierr); - } else { - ierr = CeedRunKernelDimSharedCuda(ceed, data->Grad, grid, thread_1d, 1, - elems_per_block, shared_mem, grad_args); - CeedChkBackend(ierr); + void *grad_args[] = {(void *)&num_elem, &data->c_B, &data->c_G, &d_u, &d_v}; + if (dim == 1) { + CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, + 1)); // avoid >512 total threads + CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 
1 : 0); + CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); + if (t_mode == CEED_TRANSPOSE) { + CeedCallBackend(CeedRunKernelDimSharedCuda(ceed, data->GradTranspose, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args)); + } else { + CeedCallBackend(CeedRunKernelDimSharedCuda(ceed, data->Grad, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args)); + } + } else if (dim == 2) { + const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; + // elems_per_block must be at least 1 + CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); + CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); + if (t_mode == CEED_TRANSPOSE) { + CeedCallBackend(CeedRunKernelDimSharedCuda(ceed, data->GradTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); + } else { + CeedCallBackend(CeedRunKernelDimSharedCuda(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); + } + } else if (dim == 3) { + CeedInt elems_per_block = 1; + CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); + if (t_mode == CEED_TRANSPOSE) { + CeedCallBackend(CeedRunKernelDimSharedCuda(ceed, data->GradTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); + } else { + CeedCallBackend(CeedRunKernelDimSharedCuda(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); + } } - } else if (dim == 2) { - const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; - // elems_per_block must be at least 1 - CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? 
opt_elems[thread_1d] / - num_comp : 1, 1); - CeedInt grid = num_elem / elems_per_block + - ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); - CeedInt shared_mem = elems_per_block*thread_1d*thread_1d*sizeof( - CeedScalar); - if (t_mode == CEED_TRANSPOSE) { - ierr = CeedRunKernelDimSharedCuda(ceed, data->GradTranspose, grid, thread_1d, - thread_1d, - elems_per_block, shared_mem, - grad_args); CeedChkBackend(ierr); - } else { - ierr = CeedRunKernelDimSharedCuda(ceed, data->Grad, grid, thread_1d, thread_1d, - elems_per_block, shared_mem, - grad_args); CeedChkBackend(ierr); - } - } else if (dim == 3) { - CeedInt elems_per_block = 1; - CeedInt grid = num_elem / elems_per_block + - ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); - CeedInt shared_mem = elems_per_block*thread_1d*thread_1d*sizeof( - CeedScalar); - if (t_mode == CEED_TRANSPOSE) { - ierr = CeedRunKernelDimSharedCuda(ceed, data->GradTranspose, grid, thread_1d, - thread_1d, - elems_per_block, shared_mem, - grad_args); CeedChkBackend(ierr); - } else { - ierr = CeedRunKernelDimSharedCuda(ceed, data->Grad, grid, thread_1d, thread_1d, - elems_per_block, shared_mem, - grad_args); CeedChkBackend(ierr); + } break; + case CEED_EVAL_WEIGHT: { + CeedInt Q_1d; + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); + void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; + if (dim == 1) { + const CeedInt elems_per_block = 32 / Q_1d; + const CeedInt gridsize = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + CeedCallBackend(CeedRunKernelDimCuda(ceed, data->Weight, gridsize, Q_1d, elems_per_block, 1, weight_args)); + } else if (dim == 2) { + const CeedInt opt_elems = 32 / (Q_1d * Q_1d); + const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; + const CeedInt gridsize = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 
1 : 0); + CeedCallBackend(CeedRunKernelDimCuda(ceed, data->Weight, gridsize, Q_1d, Q_1d, elems_per_block, weight_args)); + } else if (dim == 3) { + const CeedInt opt_elems = 32 / (Q_1d * Q_1d); + const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; + const CeedInt gridsize = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + CeedCallBackend(CeedRunKernelDimCuda(ceed, data->Weight, gridsize, Q_1d, Q_1d, elems_per_block, weight_args)); } - } - } break; - case CEED_EVAL_WEIGHT: { - CeedInt Q_1d; - ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); - void *weight_args[] = {(void *) &num_elem, (void *) &data->d_q_weight_1d, &d_v}; - if (dim == 1) { - const CeedInt elems_per_block = 32 / Q_1d; - const CeedInt gridsize = num_elem / elems_per_block + - ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); - ierr = CeedRunKernelDimCuda(ceed, data->Weight, gridsize, Q_1d, - elems_per_block, 1, weight_args); - CeedChkBackend(ierr); - } else if (dim == 2) { - const CeedInt opt_elems = 32 / (Q_1d * Q_1d); - const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; - const CeedInt gridsize = num_elem / elems_per_block + - ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); - ierr = CeedRunKernelDimCuda(ceed, data->Weight, gridsize, Q_1d, Q_1d, - elems_per_block, weight_args); - CeedChkBackend(ierr); - } else if (dim == 3) { - const CeedInt opt_elems = 32 / (Q_1d * Q_1d); - const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; - const CeedInt gridsize = num_elem / elems_per_block + - ((num_elem / elems_per_block*elems_per_block < num_elem) ? 
1 : 0 ); - ierr = CeedRunKernelDimCuda(ceed, data->Weight, gridsize, Q_1d, Q_1d, - elems_per_block, weight_args); - CeedChkBackend(ierr); - } - } break; - // LCOV_EXCL_START - // Evaluate the divergence to/from the quadrature points - case CEED_EVAL_DIV: - return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); - // Evaluate the curl to/from the quadrature points - case CEED_EVAL_CURL: - return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); - // Take no action, BasisApply should not have been called - case CEED_EVAL_NONE: - return CeedError(ceed, CEED_ERROR_BACKEND, - "CEED_EVAL_NONE does not make sense in this context"); - // LCOV_EXCL_STOP + } break; + // LCOV_EXCL_START + // Evaluate the divergence to/from the quadrature points + case CEED_EVAL_DIV: + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); + // Evaluate the curl to/from the quadrature points + case CEED_EVAL_CURL: + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); + // Take no action, BasisApply should not have been called + case CEED_EVAL_NONE: + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context"); + // LCOV_EXCL_STOP } // Restore vectors if (eval_mode != CEED_EVAL_WEIGHT) { - ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); } - ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); return CEED_ERROR_SUCCESS; } @@ -247,21 +176,20 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, // Destroy basis //------------------------------------------------------------------------------ static int CeedBasisDestroy_Cuda_shared(CeedBasis basis) { - int ierr; Ceed ceed; - ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); CeedBasis_Cuda_shared *data; - ierr = 
CeedBasisGetData(basis, &data); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetData(basis, &data)); - CeedChk_Cu(ceed, cuModuleUnload(data->module)); + CeedCallCuda(ceed, cuModuleUnload(data->module)); - ierr = cudaFree(data->d_q_weight_1d); CeedChk_Cu(ceed, ierr); - ierr = cudaFree(data->d_interp_1d); CeedChk_Cu(ceed, ierr); - ierr = cudaFree(data->d_grad_1d); CeedChk_Cu(ceed, ierr); - ierr = cudaFree(data->d_collo_grad_1d); CeedChk_Cu(ceed, ierr); + CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d)); + CeedCallCuda(ceed, cudaFree(data->d_interp_1d)); + CeedCallCuda(ceed, cudaFree(data->d_grad_1d)); + CeedCallCuda(ceed, cudaFree(data->d_collo_grad_1d)); - ierr = CeedFree(&data); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&data)); return CEED_ERROR_SUCCESS; } @@ -269,94 +197,61 @@ static int CeedBasisDestroy_Cuda_shared(CeedBasis basis) { //------------------------------------------------------------------------------ // Create tensor basis //------------------------------------------------------------------------------ -int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, - const CeedScalar *interp_1d, - const CeedScalar *grad_1d, - const CeedScalar *q_ref_1d, - const CeedScalar *q_weight_1d, - CeedBasis basis) { - int ierr; +int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, + const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { Ceed ceed; - ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); CeedBasis_Cuda_shared *data; - ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(1, &data)); // Copy basis data to GPU const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); - ierr = cudaMalloc((void **)&data->d_q_weight_1d, q_bytes); - CeedChk_Cu(ceed, ierr); - ierr = cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, - 
cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); + CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes)); + CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, cudaMemcpyHostToDevice)); const CeedInt interp_bytes = q_bytes * P_1d; - ierr = cudaMalloc((void **)&data->d_interp_1d, interp_bytes); - CeedChk_Cu(ceed, ierr); - ierr = cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, - cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); + CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes)); + CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, cudaMemcpyHostToDevice)); - ierr = cudaMalloc((void **)&data->d_grad_1d, interp_bytes); - CeedChk_Cu(ceed, ierr); - ierr = cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, - cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); + CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, interp_bytes)); + CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, cudaMemcpyHostToDevice)); // Compute collocated gradient and copy to GPU - data->d_collo_grad_1d = NULL; + data->d_collo_grad_1d = NULL; bool has_collocated_grad = dim == 3 && Q_1d >= P_1d; if (has_collocated_grad) { CeedScalar *collo_grad_1d; - ierr = CeedMalloc(Q_1d*Q_1d, &collo_grad_1d); CeedChkBackend(ierr); - ierr = CeedBasisGetCollocatedGrad(basis, collo_grad_1d); CeedChkBackend(ierr); - ierr = cudaMalloc((void **)&data->d_collo_grad_1d, q_bytes * Q_1d); - CeedChk_Cu(ceed, ierr); - ierr = cudaMemcpy(data->d_collo_grad_1d, collo_grad_1d, q_bytes * Q_1d, - cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); - ierr = CeedFree(&collo_grad_1d); CeedChkBackend(ierr); + CeedCallBackend(CeedMalloc(Q_1d * Q_1d, &collo_grad_1d)); + CeedCallBackend(CeedBasisGetCollocatedGrad(basis, collo_grad_1d)); + CeedCallCuda(ceed, cudaMalloc((void **)&data->d_collo_grad_1d, q_bytes * Q_1d)); + CeedCallCuda(ceed, cudaMemcpy(data->d_collo_grad_1d, collo_grad_1d, q_bytes * Q_1d, cudaMemcpyHostToDevice)); + 
CeedCallBackend(CeedFree(&collo_grad_1d)); } // Compile basis kernels CeedInt num_comp; - ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); char *basis_kernel_path, *basis_kernel_source; - ierr = CeedGetJitAbsolutePath(ceed, - "ceed/jit-source/cuda/cuda-shared-basis-tensor.h", - &basis_kernel_path); CeedChkBackend(ierr); + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-shared-basis-tensor.h", &basis_kernel_path)); CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n"); - ierr = CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source); - CeedChkBackend(ierr); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete -----\n"); - ierr = CeedCompileCuda(ceed, basis_kernel_source, &data->module, 8, - "BASIS_Q_1D", Q_1d, - "BASIS_P_1D", P_1d, - "T_1D", CeedIntMax(Q_1d, P_1d), - "BASIS_DIM", dim, - "BASIS_NUM_COMP", num_comp, - "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), - "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), - "BASIS_HAS_COLLOCATED_GRAD", has_collocated_grad - ); CeedChkBackend(ierr); - ierr = CeedGetKernelCuda(ceed, data->module, "Interp", &data->Interp); - CeedChkBackend(ierr); - ierr = CeedGetKernelCuda(ceed, data->module, "InterpTranspose", - &data->InterpTranspose); - CeedChkBackend(ierr); - ierr = CeedGetKernelCuda(ceed, data->module, "Grad", &data->Grad); - CeedChkBackend(ierr); - ierr = CeedGetKernelCuda(ceed, data->module, "GradTranspose", - &data->GradTranspose); - CeedChkBackend(ierr); - ierr = CeedGetKernelCuda(ceed, data->module, "Weight", &data->Weight); - CeedChkBackend(ierr); - ierr = CeedFree(&basis_kernel_path); CeedChkBackend(ierr); - ierr = CeedFree(&basis_kernel_source); CeedChkBackend(ierr); + CeedCallBackend(CeedCompileCuda(ceed, basis_kernel_source, &data->module, 8, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, 
"T_1D", CeedIntMax(Q_1d, P_1d), + "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", + CeedIntPow(Q_1d, dim), "BASIS_HAS_COLLOCATED_GRAD", has_collocated_grad)); + CeedCallBackend(CeedGetKernelCuda(ceed, data->module, "Interp", &data->Interp)); + CeedCallBackend(CeedGetKernelCuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); + CeedCallBackend(CeedGetKernelCuda(ceed, data->module, "Grad", &data->Grad)); + CeedCallBackend(CeedGetKernelCuda(ceed, data->module, "GradTranspose", &data->GradTranspose)); + CeedCallBackend(CeedGetKernelCuda(ceed, data->module, "Weight", &data->Weight)); + CeedCallBackend(CeedFree(&basis_kernel_path)); + CeedCallBackend(CeedFree(&basis_kernel_source)); - ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisSetData(basis, data)); // Register backend functions - ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", - CeedBasisApplyTensor_Cuda_shared); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", - CeedBasisDestroy_Cuda_shared); CeedChkBackend(ierr); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyTensor_Cuda_shared)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda_shared)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ diff --git a/backends/cuda-shared/ceed-cuda-shared.c b/backends/cuda-shared/ceed-cuda-shared.c index a606f2c495..19d5407a27 100644 --- a/backends/cuda-shared/ceed-cuda-shared.c +++ b/backends/cuda-shared/ceed-cuda-shared.c @@ -5,49 +5,41 @@ // // This file is part of CEED: http://github.com/ceed -#include +#include "ceed-cuda-shared.h" + #include +#include #include -#include "ceed-cuda-shared.h" //------------------------------------------------------------------------------ // Backend init 
//------------------------------------------------------------------------------ static int CeedInit_Cuda_shared(const char *resource, Ceed ceed) { - int ierr; - char *resource_root; - ierr = CeedCudaGetResourceRoot(ceed, resource, &resource_root); - CeedChkBackend(ierr); - if (strcmp(resource_root, "/gpu/cuda/shared")) + CeedCallBackend(CeedCudaGetResourceRoot(ceed, resource, &resource_root)); + if (strcmp(resource_root, "/gpu/cuda/shared")) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Cuda backend cannot use resource: %s", resource); - // LCOV_EXCL_STOP - ierr = CeedSetDeterministic(ceed, true); CeedChkBackend(ierr); + return CeedError(ceed, CEED_ERROR_BACKEND, "Cuda backend cannot use resource: %s", resource); + // LCOV_EXCL_STOP + } + CeedCallBackend(CeedSetDeterministic(ceed, true)); Ceed_Cuda *data; - ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); - ierr = CeedSetData(ceed, data); CeedChkBackend(ierr); - ierr = CeedCudaInit(ceed, resource); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(1, &data)); + CeedCallBackend(CeedSetData(ceed, data)); + CeedCallBackend(CeedCudaInit(ceed, resource)); Ceed ceed_ref; - CeedInit("/gpu/cuda/ref", &ceed_ref); - ierr = CeedSetDelegate(ceed, ceed_ref); CeedChkBackend(ierr); + CeedCallBackend(CeedInit("/gpu/cuda/ref", &ceed_ref)); + CeedCallBackend(CeedSetDelegate(ceed, ceed_ref)); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateTensorH1", - CeedBasisCreateTensorH1_Cuda_shared); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", - CeedDestroy_Cuda); CeedChkBackend(ierr); - CeedChkBackend(ierr); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateTensorH1", CeedBasisCreateTensorH1_Cuda_shared)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Cuda)); return 0; } //------------------------------------------------------------------------------ // Register backend 
//------------------------------------------------------------------------------ -CEED_INTERN int CeedRegister_Cuda_Shared(void) { - return CeedRegister("/gpu/cuda/shared", CeedInit_Cuda_shared, 25); -} +CEED_INTERN int CeedRegister_Cuda_Shared(void) { return CeedRegister("/gpu/cuda/shared", CeedInit_Cuda_shared, 25); } //------------------------------------------------------------------------------ diff --git a/backends/cuda-shared/ceed-cuda-shared.h b/backends/cuda-shared/ceed-cuda-shared.h index 5f9644741e..0802d04988 100644 --- a/backends/cuda-shared/ceed-cuda-shared.h +++ b/backends/cuda-shared/ceed-cuda-shared.h @@ -8,18 +8,19 @@ #ifndef _ceed_cuda_shared_h #define _ceed_cuda_shared_h -#include #include +#include #include + #include "../cuda/ceed-cuda-common.h" typedef struct { - CUmodule module; - CUfunction Interp; - CUfunction InterpTranspose; - CUfunction Grad; - CUfunction GradTranspose; - CUfunction Weight; + CUmodule module; + CUfunction Interp; + CUfunction InterpTranspose; + CUfunction Grad; + CUfunction GradTranspose; + CUfunction Weight; CeedScalar *d_interp_1d; CeedScalar *d_grad_1d; CeedScalar *d_collo_grad_1d; @@ -28,8 +29,7 @@ typedef struct { CeedScalar *c_G; } CeedBasis_Cuda_shared; -CEED_INTERN int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, - CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, - const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis); +CEED_INTERN int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, + const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis); -#endif // _ceed_cuda_shared_h +#endif // _ceed_cuda_shared_h diff --git a/backends/cuda/ceed-cuda-common.c b/backends/cuda/ceed-cuda-common.c index e38aee07f5..a52f94af22 100644 --- a/backends/cuda/ceed-cuda-common.c +++ b/backends/cuda/ceed-cuda-common.c @@ -5,21 +5,17 @@ // // This file is part of CEED: 
http://github.com/ceed -#include #include "ceed-cuda-common.h" +#include + //------------------------------------------------------------------------------ // Get root resource without device spec //------------------------------------------------------------------------------ -int CeedCudaGetResourceRoot(Ceed ceed, const char *resource, - char **resource_root) { - int ierr; - - char *device_spec = strstr(resource, ":device_id="); - size_t resource_root_len = device_spec - ? (size_t)(device_spec - resource) + 1 - : strlen(resource) + 1; - ierr = CeedCalloc(resource_root_len, resource_root); CeedChkBackend(ierr); +int CeedCudaGetResourceRoot(Ceed ceed, const char *resource, char **resource_root) { + char *device_spec = strstr(resource, ":device_id="); + size_t resource_root_len = device_spec ? (size_t)(device_spec - resource) + 1 : strlen(resource) + 1; + CeedCallBackend(CeedCalloc(resource_root_len, resource_root)); memcpy(*resource_root, resource, resource_root_len - 1); return CEED_ERROR_SUCCESS; @@ -29,21 +25,19 @@ int CeedCudaGetResourceRoot(Ceed ceed, const char *resource, // Device information backend init //------------------------------------------------------------------------------ int CeedCudaInit(Ceed ceed, const char *resource) { - int ierr; const char *device_spec = strstr(resource, ":device_id="); - const int device_id = (device_spec) ? atoi(device_spec + 11) : -1; + const int device_id = (device_spec) ? 
atoi(device_spec + 11) : -1; int current_device_id; - ierr = cudaGetDevice(&current_device_id); CeedChk_Cu(ceed, ierr); + CeedCallCuda(ceed, cudaGetDevice(&current_device_id)); if (device_id >= 0 && current_device_id != device_id) { - ierr = cudaSetDevice(device_id); CeedChk_Cu(ceed, ierr); + CeedCallCuda(ceed, cudaSetDevice(device_id)); current_device_id = device_id; } Ceed_Cuda *data; - ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr); + CeedCallBackend(CeedGetData(ceed, &data)); data->device_id = current_device_id; - ierr = cudaGetDeviceProperties(&data->device_prop, current_device_id); - CeedChk_Cu(ceed, ierr); + CeedCallCuda(ceed, cudaGetDeviceProperties(&data->device_prop, current_device_id)); return CEED_ERROR_SUCCESS; } @@ -51,13 +45,10 @@ int CeedCudaInit(Ceed ceed, const char *resource) { // Backend destroy //------------------------------------------------------------------------------ int CeedDestroy_Cuda(Ceed ceed) { - int ierr; Ceed_Cuda *data; - ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr); - if (data->cublas_handle) { - ierr = cublasDestroy(data->cublas_handle); CeedChk_Cublas(ceed, ierr); - } - ierr = CeedFree(&data); CeedChkBackend(ierr); + CeedCallBackend(CeedGetData(ceed, &data)); + if (data->cublas_handle) CeedCallCublas(ceed, cublasDestroy(data->cublas_handle)); + CeedCallBackend(CeedFree(&data)); return CEED_ERROR_SUCCESS; } diff --git a/backends/cuda/ceed-cuda-common.h b/backends/cuda/ceed-cuda-common.h index 9499805ce4..2ed6eb2a5c 100644 --- a/backends/cuda/ceed-cuda-common.h +++ b/backends/cuda/ceed-cuda-common.h @@ -8,7 +8,6 @@ #ifndef _ceed_cuda_common_h #define _ceed_cuda_common_h -#include #include #include #include @@ -17,26 +16,40 @@ #define QUOTE(...) 
#__VA_ARGS__ -#define CeedChk_Cu(ceed, x) \ -do { \ - CUresult cuda_result = (CUresult)x; \ - if (cuda_result != CUDA_SUCCESS) { \ - const char *msg; \ - cuGetErrorName(cuda_result, &msg); \ - return CeedError((ceed), CEED_ERROR_BACKEND, msg); \ - } \ -} while (0) +#define CeedChk_Cu(ceed, x) \ + do { \ + CUresult cuda_result = (CUresult)x; \ + if (cuda_result != CUDA_SUCCESS) { \ + const char *msg; \ + cuGetErrorName(cuda_result, &msg); \ + return CeedError((ceed), CEED_ERROR_BACKEND, msg); \ + } \ + } while (0) -#define CeedChk_Cublas(ceed, x) \ -do { \ - cublasStatus_t cublas_result = x; \ - if (cublas_result != CUBLAS_STATUS_SUCCESS) { \ - const char *msg = cublasGetErrorName(cublas_result); \ - return CeedError((ceed), CEED_ERROR_BACKEND, msg); \ - } \ -} while (0) +#define CeedChk_Cublas(ceed, x) \ + do { \ + cublasStatus_t cublas_result = x; \ + if (cublas_result != CUBLAS_STATUS_SUCCESS) { \ + const char *msg = cublasGetErrorName(cublas_result); \ + return CeedError((ceed), CEED_ERROR_BACKEND, msg); \ + } \ + } while (0) -#define CASE(name) case name: return #name +#define CeedCallCuda(ceed, ...) \ + do { \ + int ierr_q_ = __VA_ARGS__; \ + CeedChk_Cu(ceed, ierr_q_); \ + } while (0); + +#define CeedCallCublas(ceed, ...) 
\ + do { \ + int ierr_q_ = __VA_ARGS__; \ + CeedChk_Cublas(ceed, ierr_q_); \ + } while (0); + +#define CASE(name) \ + case name: \ + return #name // LCOV_EXCL_START static const char *cublasGetErrorName(cublasStatus_t error) { switch (error) { @@ -48,22 +61,22 @@ static const char *cublasGetErrorName(cublasStatus_t error) { CASE(CUBLAS_STATUS_MAPPING_ERROR); CASE(CUBLAS_STATUS_EXECUTION_FAILED); CASE(CUBLAS_STATUS_INTERNAL_ERROR); - default: return "CUBLAS_STATUS_UNKNOWN_ERROR"; + default: + return "CUBLAS_STATUS_UNKNOWN_ERROR"; } } // LCOV_EXCL_STOP typedef struct { - int device_id; - cublasHandle_t cublas_handle; + int device_id; + cublasHandle_t cublas_handle; struct cudaDeviceProp device_prop; } Ceed_Cuda; -CEED_INTERN int CeedCudaGetResourceRoot(Ceed ceed, const char *resource, - char **resource_root); +CEED_INTERN int CeedCudaGetResourceRoot(Ceed ceed, const char *resource, char **resource_root); CEED_INTERN int CeedCudaInit(Ceed ceed, const char *resource); CEED_INTERN int CeedDestroy_Cuda(Ceed ceed); -#endif // _ceed_cuda_common_h \ No newline at end of file +#endif // _ceed_cuda_common_h \ No newline at end of file diff --git a/backends/cuda/ceed-cuda-compile.cpp b/backends/cuda/ceed-cuda-compile.cpp index 7c5175830e..80e9b8d7d2 100644 --- a/backends/cuda/ceed-cuda-compile.cpp +++ b/backends/cuda/ceed-cuda-compile.cpp @@ -5,32 +5,38 @@ // // This file is part of CEED: http://github.com/ceed -#include +#include "ceed-cuda-compile.h" + #include +#include #include #include #include #include -#include #include #include + +#include + #include "ceed-cuda-common.h" -#include "ceed-cuda-compile.h" -#define CeedChk_Nvrtc(ceed, x) \ -do { \ - nvrtcResult result = static_cast<nvrtcResult>(x); \ - if (result != NVRTC_SUCCESS) \ - return CeedError((ceed), CEED_ERROR_BACKEND, nvrtcGetErrorString(result)); \ -} while (0) +#define CeedChk_Nvrtc(ceed, x) \ + do { \ + nvrtcResult result = static_cast<nvrtcResult>(x); \ + if (result != NVRTC_SUCCESS) return CeedError((ceed), CEED_ERROR_BACKEND, 
nvrtcGetErrorString(result)); \ + } while (0) + +#define CeedCallNvrtc(ceed, ...) \ + do { \ + int ierr_q_ = __VA_ARGS__; \ + CeedChk_Nvrtc(ceed, ierr_q_); \ + } while (0); //------------------------------------------------------------------------------ // Compile CUDA kernel //------------------------------------------------------------------------------ -int CeedCompileCuda(Ceed ceed, const char *source, CUmodule *module, - const CeedInt num_defines, ...) { - int ierr; - cudaFree(0); // Make sure a Context exists for nvrtc +int CeedCompileCuda(Ceed ceed, const char *source, CUmodule *module, const CeedInt num_defines, ...) { + cudaFree(0); // Make sure a Context exists for nvrtc nvrtcProgram prog; std::ostringstream code; @@ -40,10 +46,10 @@ int CeedCompileCuda(Ceed ceed, const char *source, CUmodule *module, va_list args; va_start(args, num_defines); char *name; - int val; + int val; for (int i = 0; i < num_defines; i++) { name = va_arg(args, char *); - val = va_arg(args, int); + val = va_arg(args, int); code << "#define " << name << " " << val << "\n"; } va_end(args); @@ -51,124 +57,103 @@ int CeedCompileCuda(Ceed ceed, const char *source, CUmodule *module, // Standard libCEED definitions for CUDA backends char *jit_defs_path, *jit_defs_source; - ierr = CeedGetJitAbsolutePath(ceed, - "ceed/jit-source/cuda/cuda-jit.h", - &jit_defs_path); CeedChkBackend(ierr); - ierr = CeedLoadSourceToBuffer(ceed, jit_defs_path, &jit_defs_source); - CeedChkBackend(ierr); + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-jit.h", &jit_defs_path)); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, jit_defs_path, &jit_defs_source)); code << jit_defs_source; code << "\n\n"; - ierr = CeedFree(&jit_defs_path); CeedChkBackend(ierr); - ierr = CeedFree(&jit_defs_source); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&jit_defs_path)); + CeedCallBackend(CeedFree(&jit_defs_source)); // Non-macro options - const int num_opts = 3; + const int num_opts = 3; const char 
*opts[num_opts]; opts[0] = "-default-device"; struct cudaDeviceProp prop; - Ceed_Cuda *ceed_data; - ierr = CeedGetData(ceed, &ceed_data); CeedChkBackend(ierr); - ierr = cudaGetDeviceProperties(&prop, ceed_data->device_id); - CeedChk_Cu(ceed, ierr); - std::string arch_arg = "-arch=compute_" + std::to_string(prop.major) + std::to_string(prop.minor); - opts[1] = arch_arg.c_str(); - opts[2] = "-Dint32_t=int"; + Ceed_Cuda *ceed_data; + CeedCallBackend(CeedGetData(ceed, &ceed_data)); + CeedCallCuda(ceed, cudaGetDeviceProperties(&prop, ceed_data->device_id)); + std::string arch_arg = "-arch=compute_" + std::to_string(prop.major) + std::to_string(prop.minor); + opts[1] = arch_arg.c_str(); + opts[2] = "-Dint32_t=int"; // Add string source argument provided in call code << source; // Create Program - CeedChk_Nvrtc(ceed, nvrtcCreateProgram(&prog, code.str().c_str(), NULL, 0, NULL, NULL)); + CeedCallNvrtc(ceed, nvrtcCreateProgram(&prog, code.str().c_str(), NULL, 0, NULL, NULL)); // Compile kernel nvrtcResult result = nvrtcCompileProgram(prog, num_opts, opts); if (result != NVRTC_SUCCESS) { size_t log_size; - CeedChk_Nvrtc(ceed, nvrtcGetProgramLogSize(prog, &log_size)); + CeedCallNvrtc(ceed, nvrtcGetProgramLogSize(prog, &log_size)); char *log; - ierr = CeedMalloc(log_size, &log); CeedChkBackend(ierr); - CeedChk_Nvrtc(ceed, nvrtcGetProgramLog(prog, log)); - return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", - nvrtcGetErrorString(result), log); + CeedCallBackend(CeedMalloc(log_size, &log)); + CeedCallNvrtc(ceed, nvrtcGetProgramLog(prog, log)); + return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", nvrtcGetErrorString(result), log); } size_t ptx_size; - CeedChk_Nvrtc(ceed, nvrtcGetPTXSize(prog, &ptx_size)); + CeedCallNvrtc(ceed, nvrtcGetPTXSize(prog, &ptx_size)); char *ptx; - ierr = CeedMalloc(ptx_size, &ptx); CeedChkBackend(ierr); - CeedChk_Nvrtc(ceed, nvrtcGetPTX(prog, ptx)); - CeedChk_Nvrtc(ceed, nvrtcDestroyProgram(&prog)); + CeedCallBackend(CeedMalloc(ptx_size, &ptx)); + 
CeedCallNvrtc(ceed, nvrtcGetPTX(prog, ptx)); + CeedCallNvrtc(ceed, nvrtcDestroyProgram(&prog)); - CeedChk_Cu(ceed, cuModuleLoadData(module, ptx)); - ierr = CeedFree(&ptx); CeedChkBackend(ierr); + CeedCallCuda(ceed, cuModuleLoadData(module, ptx)); + CeedCallBackend(CeedFree(&ptx)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Get CUDA kernel //------------------------------------------------------------------------------ -int CeedGetKernelCuda(Ceed ceed, CUmodule module, const char *name, - CUfunction *kernel) { - CeedChk_Cu(ceed, cuModuleGetFunction(kernel, module, name)); +int CeedGetKernelCuda(Ceed ceed, CUmodule module, const char *name, CUfunction *kernel) { + CeedCallCuda(ceed, cuModuleGetFunction(kernel, module, name)); return CEED_ERROR_SUCCESS; } // Run kernel with block size selected automatically based on the kernel (which // may use enough registers to require a smaller block size than the hardware is // capable). 
-int CeedRunKernelAutoblockCuda(Ceed ceed, CUfunction kernel, size_t points, - void **args) { +int CeedRunKernelAutoblockCuda(Ceed ceed, CUfunction kernel, size_t points, void **args) { int min_grid_size, max_block_size; - CeedChk_Cu(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, - &max_block_size, kernel, NULL, 0, 0x10000)); - CeedChkBackend(CeedRunKernelCuda(ceed, kernel, CeedDivUpInt(points, - max_block_size), max_block_size, args)); + CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_block_size, kernel, NULL, 0, 0x10000)); + CeedCallBackend(CeedRunKernelCuda(ceed, kernel, CeedDivUpInt(points, max_block_size), max_block_size, args)); return 0; } //------------------------------------------------------------------------------ // Run CUDA kernel //------------------------------------------------------------------------------ -int CeedRunKernelCuda(Ceed ceed, CUfunction kernel, const int grid_size, - const int block_size, void **args) { - CeedChkBackend(CeedRunKernelDimSharedCuda(ceed, kernel, grid_size, - block_size, 1, 1, 0, args)); +int CeedRunKernelCuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size, void **args) { + CeedCallBackend(CeedRunKernelDimSharedCuda(ceed, kernel, grid_size, block_size, 1, 1, 0, args)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Run CUDA kernel for spatial dimension //------------------------------------------------------------------------------ -int CeedRunKernelDimCuda(Ceed ceed, CUfunction kernel, const int grid_size, - const int block_size_x, const int block_size_y, - const int block_size_z, void **args) { - CeedChkBackend(CeedRunKernelDimSharedCuda(ceed, kernel, grid_size, - block_size_x, block_size_y, block_size_z, - 0, args)); +int CeedRunKernelDimCuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y, const int block_size_z, + void **args) { + 
CeedCallBackend(CeedRunKernelDimSharedCuda(ceed, kernel, grid_size, block_size_x, block_size_y, block_size_z, 0, args)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Run CUDA kernel for spatial dimension with sharde memory //------------------------------------------------------------------------------ -int CeedRunKernelDimSharedCuda(Ceed ceed, CUfunction kernel, - const int grid_size, const int block_size_x, - const int block_size_y, const int block_size_z, - const int shared_mem_size, void **args) { - CUresult result = cuLaunchKernel(kernel, grid_size, 1, 1, - block_size_x, block_size_y, block_size_z, - shared_mem_size, NULL, args, NULL); +int CeedRunKernelDimSharedCuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y, + const int block_size_z, const int shared_mem_size, void **args) { + CUresult result = cuLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, NULL, args, NULL); if (result == CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES) { int max_threads_per_block, shared_size_bytes, num_regs; - cuFuncGetAttribute(&max_threads_per_block, - CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel); - cuFuncGetAttribute(&shared_size_bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, - kernel); + cuFuncGetAttribute(&max_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel); + cuFuncGetAttribute(&shared_size_bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel); cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, kernel); return CeedError(ceed, CEED_ERROR_BACKEND, "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: max_threads_per_block %d on block size (%d,%d,%d), shared_size %d, num_regs %d", - max_threads_per_block, block_size_x, block_size_y, block_size_z, - shared_size_bytes, num_regs); + max_threads_per_block, block_size_x, block_size_y, block_size_z, shared_size_bytes, num_regs); } else CeedChk_Cu(ceed, result); return 
CEED_ERROR_SUCCESS; } diff --git a/backends/cuda/ceed-cuda-compile.h b/backends/cuda/ceed-cuda-compile.h index b6c465b383..d67eecb725 100644 --- a/backends/cuda/ceed-cuda-compile.h +++ b/backends/cuda/ceed-cuda-compile.h @@ -8,34 +8,25 @@ #ifndef _ceed_cuda_compile_h #define _ceed_cuda_compile_h +#include #include #include #include -static inline CeedInt CeedDivUpInt(CeedInt numerator, CeedInt denominator) { - return (numerator + denominator - 1) / denominator; -} +static inline CeedInt CeedDivUpInt(CeedInt numerator, CeedInt denominator) { return (numerator + denominator - 1) / denominator; } -CEED_INTERN int CeedCompileCuda(Ceed ceed, const char *source, CUmodule *module, - const CeedInt num_defines, ...); +CEED_INTERN int CeedCompileCuda(Ceed ceed, const char *source, CUmodule *module, const CeedInt num_defines, ...); -CEED_INTERN int CeedGetKernelCuda(Ceed ceed, CUmodule module, const char *name, - CUfunction *kernel); +CEED_INTERN int CeedGetKernelCuda(Ceed ceed, CUmodule module, const char *name, CUfunction *kernel); -CEED_INTERN int CeedRunKernelCuda(Ceed ceed, CUfunction kernel, - const int grid_size, - const int block_size, void **args); +CEED_INTERN int CeedRunKernelCuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size, void **args); -CEED_INTERN int CeedRunKernelAutoblockCuda(Ceed ceed, CUfunction kernel, - size_t size, void **args); +CEED_INTERN int CeedRunKernelAutoblockCuda(Ceed ceed, CUfunction kernel, size_t size, void **args); -CEED_INTERN int CeedRunKernelDimCuda(Ceed ceed, CUfunction kernel, - const int grid_size, - const int block_size_x, const int block_size_y, +CEED_INTERN int CeedRunKernelDimCuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y, const int block_size_z, void **args); -CEED_INTERN int CeedRunKernelDimSharedCuda(Ceed ceed, CUfunction kernel, - const int grid_size, const int block_size_x, const int block_size_y, - const int block_size_z, const int 
shared_mem_size, void **args); +CEED_INTERN int CeedRunKernelDimSharedCuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y, + const int block_size_z, const int shared_mem_size, void **args); -#endif // _ceed_cuda_compile_h +#endif // _ceed_cuda_compile_h diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp index 5e41df189f..c808b05986 100644 --- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp +++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp @@ -7,42 +7,40 @@ #define CEED_DEBUG_COLOR 12 -#include #include +#include #include + #include -#include #include -#include "ceed-hip-gen.h" +#include + #include "../hip-ref/ceed-hip-ref.h" #include "../hip-shared/ceed-hip-shared.h" #include "../hip/ceed-hip-compile.h" - +#include "ceed-hip-gen.h" //------------------------------------------------------------------------------ -// Calculate the block size used for launching the operator kernel +// Calculate the block size used for launching the operator kernel //------------------------------------------------------------------------------ -extern "C" int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt num_elem, - const CeedInt P_1d, const CeedInt Q_1d, - CeedInt *block_sizes) { - +extern "C" int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt num_elem, const CeedInt P_1d, const CeedInt Q_1d, CeedInt *block_sizes) { const CeedInt thread1d = CeedIntMax(Q_1d, P_1d); - if (dim==1) { - CeedInt elems_per_block = 64*thread1d > 256? 256/thread1d : 64; - elems_per_block = elems_per_block>0?elems_per_block:1; - block_sizes[0] = thread1d; - block_sizes[1] = 1; - block_sizes[2] = elems_per_block; - } else if (dim==2) { - const CeedInt elems_per_block = thread1d<4? 16 : 2; - block_sizes[0] = thread1d; - block_sizes[1] = thread1d; - block_sizes[2] = elems_per_block; - } else if (dim==3) { - const CeedInt elems_per_block = thread1d<6? 4 : (thread1d<8? 
2 : 1); - block_sizes[0] = thread1d; - block_sizes[1] = thread1d; - block_sizes[2] = elems_per_block; + if (dim == 1) { + CeedInt elems_per_block = 64 * thread1d > 256 ? 256 / thread1d : 64; + elems_per_block = elems_per_block > 0 ? elems_per_block : 1; + block_sizes[0] = thread1d; + block_sizes[1] = 1; + block_sizes[2] = elems_per_block; + } else if (dim == 2) { + const CeedInt elems_per_block = thread1d < 4 ? 16 : 2; + block_sizes[0] = thread1d; + block_sizes[1] = thread1d; + block_sizes[2] = elems_per_block; + } else if (dim == 3) { + const CeedInt elems_per_block = thread1d < 6 ? 4 : (thread1d < 8 ? 2 : 1); + block_sizes[0] = thread1d; + block_sizes[1] = thread1d; + block_sizes[2] = elems_per_block; } return CEED_ERROR_SUCCESS; } @@ -51,49 +49,43 @@ extern "C" int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt num_e // Build single operator kernel //------------------------------------------------------------------------------ extern "C" int CeedHipGenOperatorBuild(CeedOperator op) { - using std::ostringstream; using std::string; - int ierr; bool is_setup_done; - ierr = CeedOperatorIsSetupDone(op, &is_setup_done); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); if (is_setup_done) return CEED_ERROR_SUCCESS; Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedOperator_Hip_gen *data; - ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr); - CeedQFunction qf; + CeedCallBackend(CeedOperatorGetData(op, &data)); + CeedQFunction qf; CeedQFunction_Hip_gen *qf_data; - ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); - ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); CeedSize lsize; - CeedInt Q, P_1d = 0, Q_1d = 0, elem_size, num_input_fields, - num_output_fields, num_comp, dim = 1; - ierr = 
CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr); + CeedInt Q, P_1d = 0, Q_1d = 0, elem_size, num_input_fields, num_output_fields, num_comp, dim = 1; + CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); Q_1d = Q; CeedOperatorField *op_input_fields, *op_output_fields; - ierr = CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); CeedQFunctionField *qf_input_fields, *qf_output_fields; - ierr = CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields); - CeedChkBackend(ierr); - CeedEvalMode eval_mode; - CeedBasis basis; - CeedBasis_Hip_shared *basis_data; - CeedElemRestriction Erestrict; + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); + CeedEvalMode eval_mode; + CeedBasis basis; + CeedBasis_Hip_shared *basis_data; + CeedElemRestriction Erestrict; CeedElemRestriction_Hip *restr_data; // Check for restriction only identity operator bool is_identity_qf; - ierr = CeedQFunctionIsIdentity(qf, &is_identity_qf); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf)); if (is_identity_qf) { CeedEvalMode eval_mode_in, eval_mode_out; - ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in); CeedChkBackend(ierr); - ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out)); if (eval_mode_in == CEED_EVAL_NONE && eval_mode_out == CEED_EVAL_NONE) // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Backend does not implement restriction only identity operators"); + return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement 
restriction only identity operators"); // LCOV_EXCL_STOP } @@ -101,27 +93,21 @@ extern "C" int CeedHipGenOperatorBuild(CeedOperator op) { // TODO: generalize to accept different device functions? { char *tensor_basis_kernel_path, *tensor_basis_kernel_source; - ierr = CeedGetJitAbsolutePath(ceed, - "ceed/jit-source/hip/hip-shared-basis-tensor-templates.h", - &tensor_basis_kernel_path); CeedChkBackend(ierr); + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-shared-basis-tensor-templates.h", &tensor_basis_kernel_path)); CeedDebug256(ceed, 2, "----- Loading Tensor Basis Kernel Source -----\n"); - ierr = CeedLoadSourceToBuffer(ceed, tensor_basis_kernel_path, &tensor_basis_kernel_source); - CeedChkBackend(ierr); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, tensor_basis_kernel_path, &tensor_basis_kernel_source)); code << tensor_basis_kernel_source; - ierr = CeedFree(&tensor_basis_kernel_path); CeedChkBackend(ierr); - ierr = CeedFree(&tensor_basis_kernel_source); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&tensor_basis_kernel_path)); + CeedCallBackend(CeedFree(&tensor_basis_kernel_source)); } { char *hip_gen_template_path, *hip_gen_template_source; - ierr = CeedGetJitAbsolutePath(ceed, - "ceed/jit-source/hip/hip-gen-templates.h", - &hip_gen_template_path); CeedChkBackend(ierr); + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-gen-templates.h", &hip_gen_template_path)); CeedDebug256(ceed, 2, "----- Loading Hip-Gen Template Source -----\n"); - ierr = CeedLoadSourceToBuffer(ceed, hip_gen_template_path, &hip_gen_template_source); - CeedChkBackend(ierr); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, hip_gen_template_path, &hip_gen_template_source)); code << hip_gen_template_source; - ierr = CeedFree(&hip_gen_template_path); CeedChkBackend(ierr); - ierr = CeedFree(&hip_gen_template_source); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&hip_gen_template_path)); + CeedCallBackend(CeedFree(&hip_gen_template_source)); } 
string q_function_source(qf_data->q_function_source); @@ -132,19 +118,18 @@ extern "C" int CeedHipGenOperatorBuild(CeedOperator op) { // Find dim, P_1d, Q_1d data->max_P_1d = 0; for (CeedInt i = 0; i < num_input_fields; i++) { - ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); if (basis != CEED_BASIS_COLLOCATED) { - ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); - ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode); - CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetData(basis, &basis_data)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); // Collect dim, P_1d, and Q_1d - ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetDimension(basis, &dim)); bool isTensor; - ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisIsTensor(basis, &isTensor)); if (isTensor) { - ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); - ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); if (P_1d > data->max_P_1d) data->max_P_1d = P_1d; } else { // LCOV_EXCL_START @@ -156,19 +141,18 @@ extern "C" int CeedHipGenOperatorBuild(CeedOperator op) { // Check output bases for Q_1d, dim as well // The only input basis might be CEED_BASIS_COLLOCATED for (CeedInt i = 0; i < num_output_fields; i++) { - ierr = CeedOperatorFieldGetBasis(op_output_fields[i], &basis); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); if (basis != CEED_BASIS_COLLOCATED) { - ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); - ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); - CeedChkBackend(ierr); + 
CeedCallBackend(CeedBasisGetData(basis, &basis_data)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); // Collect Q_1d - ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetDimension(basis, &dim)); bool isTensor; - ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisIsTensor(basis, &isTensor)); if (isTensor) { - ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); } else { // LCOV_EXCL_START return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); @@ -176,7 +160,7 @@ extern "C" int CeedHipGenOperatorBuild(CeedOperator op) { } } } - data->dim = dim; + data->dim = dim; data->Q_1d = Q_1d; // Only use 3D collocated gradient parallelization strategy when gradient is computed @@ -185,21 +169,21 @@ extern "C" int CeedHipGenOperatorBuild(CeedOperator op) { if (dim == 3) { bool was_grad_found = false; for (CeedInt i = 0; i < num_input_fields; i++) { - ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); if (eval_mode == CEED_EVAL_GRAD) { - ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); CeedChkBackend(ierr); - ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); + CeedCallBackend(CeedBasisGetData(basis, &basis_data)); use_collograd_parallelization = !!basis_data->d_collo_grad_1d && (was_grad_found ? 
use_collograd_parallelization : true); - was_grad_found = true; + was_grad_found = true; } } for (CeedInt i = 0; i < num_output_fields; i++) { - ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); if (eval_mode == CEED_EVAL_GRAD) { - ierr = CeedOperatorFieldGetBasis(op_output_fields[i], &basis); CeedChkBackend(ierr); - ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); + CeedCallBackend(CeedBasisGetData(basis, &basis_data)); use_collograd_parallelization = !!basis_data->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true); - was_grad_found = true; + was_grad_found = true; } } } @@ -209,7 +193,7 @@ extern "C" int CeedHipGenOperatorBuild(CeedOperator op) { if (dim != 3 || use_collograd_parallelization) { code << "#define CEED_Q_VLA 1\n\n"; } else { - code << "#define CEED_Q_VLA "<1?"*T_1D":"")<<";\n"; + code << " data.slice = slice+data.t_id_z*T_1D" << (dim > 1 ? "*T_1D" : "") << ";\n"; code << "\n // -- Input field constants and basis data --\n"; - //Initialize constants, and matrices B and G + // Initialize constants, and matrices B and G for (CeedInt i = 0; i < num_input_fields; i++) { - code << " // ---- Input field "<B.inputs[i] = basis_data->d_interp_1d; - code << " __shared__ CeedScalar s_B_in_"<(data, B.inputs["<B.inputs[i] = basis_data->d_interp_1d; - code << " __shared__ CeedScalar s_B_in_"<(data, B.inputs["<G.inputs[i] = basis_data->d_collo_grad_1d; - code << " __shared__ CeedScalar s_G_in_"<(data, G.inputs["<d_collo_grad_1d; - data->G.inputs[i] = has_collo_grad ? 
basis_data->d_collo_grad_1d : basis_data->d_grad_1d; - code << " __shared__ CeedScalar s_G_in_"<(data, G.inputs["<B.inputs[i] = basis_data->d_interp_1d; + code << " __shared__ CeedScalar s_B_in_" << i << "[" << P_1d * Q_1d << "];\n"; + code << " loadMatrix(data, B.inputs[" << i << "], s_B_in_" << i << ");\n"; + break; + case CEED_EVAL_GRAD: + CeedCallBackend(CeedBasisGetData(basis, &basis_data)); + data->B.inputs[i] = basis_data->d_interp_1d; + code << " __shared__ CeedScalar s_B_in_" << i << "[" << P_1d * Q_1d << "];\n"; + code << " loadMatrix(data, B.inputs[" << i << "], s_B_in_" << i << ");\n"; + if (use_collograd_parallelization) { + data->G.inputs[i] = basis_data->d_collo_grad_1d; + code << " __shared__ CeedScalar s_G_in_" << i << "[" << Q_1d * Q_1d << "];\n"; + code << " loadMatrix(data, G.inputs[" << i << "], s_G_in_" << i << ");\n"; + } else { + bool has_collo_grad = !!basis_data->d_collo_grad_1d; + data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d; + code << " __shared__ CeedScalar s_G_in_" << i << "[" << Q_1d * (has_collo_grad ? Q_1d : P_1d) << "];\n"; + code << " loadMatrix<" << (has_collo_grad ? "Q_1d" : ("P_in_" + std::to_string(i))) << ",Q_1d>(data, G.inputs[" << i << "], s_G_in_" << i + << ");\n"; + } + break; + case CEED_EVAL_WEIGHT: + break; // No action + case CEED_EVAL_DIV: + break; // TODO: Not implemented + case CEED_EVAL_CURL: + break; // TODO: Not implemented } } code << "\n // -- Output field constants and basis data --\n"; for (CeedInt i = 0; i < num_output_fields; i++) { - code << " // ---- Output field "<B.outputs[i] = basis_data->d_interp_1d; - code << " __shared__ CeedScalar s_B_out_"<(data, B.outputs["<B.outputs[i] = basis_data->d_interp_1d; - code << " __shared__ CeedScalar s_B_out_"<(data, B.outputs["<G.outputs[i] = basis_data->d_collo_grad_1d; - code << " __shared__ CeedScalar s_G_out_"<(data, G.outputs["<d_collo_grad_1d; - data->G.outputs[i] = has_collo_grad ? 
basis_data->d_collo_grad_1d : basis_data->d_grad_1d; - code << " __shared__ CeedScalar s_G_out_"<(data, G.outputs["<B.outputs[i] = basis_data->d_interp_1d; + code << " __shared__ CeedScalar s_B_out_" << i << "[" << P_1d * Q_1d << "];\n"; + code << " loadMatrix(data, B.outputs[" << i << "], s_B_out_" << i << ");\n"; + break; + case CEED_EVAL_GRAD: + CeedCallBackend(CeedBasisGetData(basis, &basis_data)); + data->B.outputs[i] = basis_data->d_interp_1d; + code << " __shared__ CeedScalar s_B_out_" << i << "[" << P_1d * Q_1d << "];\n"; + code << " loadMatrix(data, B.outputs[" << i << "], s_B_out_" << i << ");\n"; + if (use_collograd_parallelization) { + data->G.outputs[i] = basis_data->d_collo_grad_1d; + code << " __shared__ CeedScalar s_G_out_" << i << "[" << Q_1d * Q_1d << "];\n"; + code << " loadMatrix(data, G.outputs[" << i << "], s_G_out_" << i << ");\n"; + } else { + bool has_collo_grad = !!basis_data->d_collo_grad_1d; + data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d; + code << " __shared__ CeedScalar s_G_out_" << i << "[" << Q_1d * (has_collo_grad ? Q_1d : P_1d) << "];\n"; + code << " loadMatrix<" << (has_collo_grad ? 
"Q_1d" : ("P_out_" + std::to_string(i))) << ",Q_1d>(data, G.outputs[" << i << "], s_G_out_" + << i << ");\n"; + } + break; + // LCOV_EXCL_START + case CEED_EVAL_WEIGHT: { + Ceed ceed; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); + break; // Should not occur } - break; - // LCOV_EXCL_START - case CEED_EVAL_WEIGHT: { - Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); - return CeedError(ceed, CEED_ERROR_BACKEND, - "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); - break; // Should not occur - } - case CEED_EVAL_DIV: - break; // TODO: Not implemented - case CEED_EVAL_CURL: - break; // TODO: Not implemented - // LCOV_EXCL_STOP + case CEED_EVAL_DIV: + break; // TODO: Not implemented + case CEED_EVAL_CURL: + break; // TODO: Not implemented + // LCOV_EXCL_STOP } } code << "\n // -- Element loop --\n"; @@ -375,112 +352,105 @@ extern "C" int CeedHipGenOperatorBuild(CeedOperator op) { // Generate the correct eval mode code for each input code << " // -- Input field restrictions and basis actions --\n"; for (CeedInt i = 0; i < num_input_fields; i++) { - code << " // ---- Input field "<indices.inputs[i] = restr_data->d_ind; - code << " readDofsOffset"<(data, lsize_in_"<(data, lsize_in_" << i + << ", elem, indices.inputs[" << i << "], d_u_" << i << ", r_u_" << i << ");\n"; } else { bool has_backend_strides; - ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides); - CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides)); CeedInt num_elem; - ierr = CeedElemRestrictionGetNumElements(Erestrict, &num_elem); - CeedChkBackend(ierr); - CeedInt strides[3] = {1, elem_size*num_elem, elem_size}; + CeedCallBackend(CeedElemRestrictionGetNumElements(Erestrict, &num_elem)); + CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; if (!has_backend_strides) { - ierr = 
CeedElemRestrictionGetStrides(Erestrict, &strides); - CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetStrides(Erestrict, &strides)); } - code << " // Strides: {"<(data, elem, d_u_"<(data, elem, d_u_" << i << ", r_u_" << i << ");\n"; } } // Basis action - code << " // EvalMode: "<1?"Tensor":"")<(data, r_u_"<1?"Tensor":"")<(data, r_u_"<1?"Tensor":"")<<(dim==3&&Q_1d>=P_1d?"Collocated":"")<(data, r_u_"<W = basis_data->d_q_weight_1d; - code << " Weight"<<(dim>1?"Tensor":"")<(data, W, r_t_"< 1 ? "Tensor" : "") << dim << "d(data, r_u_" << i << ", s_B_in_" + << i << ", r_t_" << i << ");\n"; + break; + case CEED_EVAL_GRAD: + if (use_collograd_parallelization) { + code << " CeedScalar r_t_" << i << "[num_comp_in_" << i << "*Q_1d];\n"; + code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d(data, r_u_" << i + << ", s_B_in_" << i << ", r_t_" << i << ");\n"; + } else { + CeedInt P_1d; + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); + code << " CeedScalar r_t_" << i << "[num_comp_in_" << i << "*dim*Q_1d];\n"; + code << " Grad" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d(data, r_u_" << i << ", s_B_in_" << i << ", s_G_in_" << i << ", r_t_" << i << ");\n"; + } + break; + case CEED_EVAL_WEIGHT: + code << " CeedScalar r_t_" << i << "[Q_1d];\n"; + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); + CeedCallBackend(CeedBasisGetData(basis, &basis_data)); + data->W = basis_data->d_q_weight_1d; + code << " Weight" << (dim > 1 ? 
"Tensor" : "") << dim << "d(data, W, r_t_" << i << ");\n"; + break; // No action + case CEED_EVAL_DIV: + break; // TODO: Not implemented + case CEED_EVAL_CURL: + break; // TODO: Not implemented } } // Q function code << "\n // -- Output field setup --\n"; for (CeedInt i = 0; i < num_output_fields; i++) { - code << "\n // ---- Output field "<indices.inputs[i] = restr_data->d_ind; - code << " readSliceQuadsOffset"<<"3d(data, lsize_in_"<indices.inputs[i] = restr_data->d_ind; + code << " readSliceQuadsOffset" + << "3d(data, lsize_in_" << i << ", elem, q, indices.inputs[" << i << "], d_u_" + << i << ", r_q_" << i << ");\n"; + } else { + CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elem_size)); + bool has_backend_strides; + CeedCallBackend(CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides)); + CeedInt num_elem; + CeedCallBackend(CeedElemRestrictionGetNumElements(Erestrict, &num_elem)); + CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; + if (!has_backend_strides) { + CeedCallBackend(CeedElemRestrictionGetStrides(Erestrict, &strides)); + } + code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; + code << " readSliceQuadsStrided" + << "3d(data, elem, q, d_u_" << i << ", r_q_" << i << ");\n"; } - code << " // Strides: {"<(data, elem, q, d_u_"<(data, q, r_t_"<(data, q, r_t_" << i << ", s_G_in_" << i << ", r_q_" << i << ");\n"; + break; + case CEED_EVAL_WEIGHT: + code << " CeedScalar r_q_" << i << "[1];\n"; + code << " r_q_" << i << "[0] = r_t_" << i << "[q];\n"; + break; // No action + case CEED_EVAL_DIV: + break; // TODO: Not implemented + case CEED_EVAL_CURL: + break; // TODO: Not implemented } } code << "\n // -- Output fields --\n"; for (CeedInt i = 0; i < num_output_fields; i++) { - code << " // ---- Output field "<(data, q, r_qq_"<(data, q, r_qq_" << i << ", s_G_out_" << i << ", r_tt_" << i << ");\n"; + break; + case CEED_EVAL_WEIGHT: + break; // Should not occur + case CEED_EVAL_DIV: 
+ break; // TODO: Not implemented + case CEED_EVAL_CURL: + break; // TODO: Not implemented } } code << " }\n"; @@ -643,78 +612,74 @@ extern "C" int CeedHipGenOperatorBuild(CeedOperator op) { // Generate the correct eval mode code for each output code << "\n // -- Output field basis action and restrictions --\n"; for (CeedInt i = 0; i < num_output_fields; i++) { - code << " // ---- Output field "<1?"Tensor":"")<(data, r_tt_"<1?"Tensor":"")<(data, r_tt_"<1?"Tensor":"")<<(dim==3&&Q_1d>=P_1d?"Collocated":"")<(data, r_tt_"< 1 ? "Tensor" : "") << dim << "d(data, r_tt_" << i + << ", s_B_out_" << i << ", r_v_" << i << ");\n"; + break; + case CEED_EVAL_GRAD: + code << " CeedScalar r_v_" << i << "[num_comp_out_" << i << "*P_out_" << i << "];\n"; + if (use_collograd_parallelization) { + code << " InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d(data, r_tt_" << i + << ", s_B_out_" << i << ", r_v_" << i << ");\n"; + } else { + CeedInt P_1d; + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); + code << " GradTranspose" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? 
"Collocated" : "") << dim << "d(data, r_tt_" << i << ", s_B_out_" << i << ", s_G_out_" << i << ", r_v_" << i << ");\n"; + } + break; + // LCOV_EXCL_START + case CEED_EVAL_WEIGHT: { + Ceed ceed; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); + break; // Should not occur } - break; - // LCOV_EXCL_START - case CEED_EVAL_WEIGHT: { - Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); - return CeedError(ceed, CEED_ERROR_BACKEND, - "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); - break; // Should not occur - } - case CEED_EVAL_DIV: - break; // TODO: Not implemented - case CEED_EVAL_CURL: - break; // TODO: Not implemented - // LCOV_EXCL_STOP + case CEED_EVAL_DIV: + break; // TODO: Not implemented + case CEED_EVAL_CURL: + break; // TODO: Not implemented + // LCOV_EXCL_STOP } // Restriction - bool is_strided; - ierr = CeedElemRestrictionIsStrided(Erestrict, &is_strided); CeedChkBackend(ierr); + bool is_strided; + CeedCallBackend(CeedElemRestrictionIsStrided(Erestrict, &is_strided)); if (!is_strided) { - ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); - CeedChkBackend(ierr); - code << " const CeedInt lsize_out_"<indices.outputs[i] = restr_data->d_ind; - code << " writeDofsOffset"<(data, lsize_out_"<(data, lsize_out_" << i + << ", elem, indices.outputs[" << i << "], r_v_" << i << ", d_v_" << i << ");\n"; } else { bool has_backend_strides; - ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides); - CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides)); CeedInt num_elem; - ierr = CeedElemRestrictionGetNumElements(Erestrict, &num_elem); - CeedChkBackend(ierr); - CeedInt strides[3] = {1, elem_size*num_elem, elem_size}; + CeedCallBackend(CeedElemRestrictionGetNumElements(Erestrict, &num_elem)); + CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; 
if (!has_backend_strides) { - ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); - CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetStrides(Erestrict, &strides)); } - code << " // Strides: {"<(data, elem, r_v_"<(data, elem, r_v_" << i << ", d_v_" << i << ");\n"; } } @@ -728,17 +693,13 @@ extern "C" int CeedHipGenOperatorBuild(CeedOperator op) { CeedInt block_sizes[3] = {0, 0, 0}; CeedInt num_elem; - ierr = CeedOperatorGetNumElements(op, &num_elem); CeedChkBackend(ierr); - ierr = BlockGridCalculate_Hip_gen(dim, num_elem, data->max_P_1d, Q_1d, block_sizes); - CeedChkBackend(ierr); - ierr = CeedCompileHip(ceed, code.str().c_str(), &data->module, 2, - "T_1D", block_sizes[0], - "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2]); - CeedChkBackend(ierr); - ierr = CeedGetKernelHip(ceed, data->module, operator_name.c_str(), &data->op); - CeedChkBackend(ierr); - - ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); + CeedCallBackend(BlockGridCalculate_Hip_gen(dim, num_elem, data->max_P_1d, Q_1d, block_sizes)); + CeedCallBackend(CeedCompileHip(ceed, code.str().c_str(), &data->module, 2, "T_1D", block_sizes[0], "BLOCK_SIZE", + block_sizes[0] * block_sizes[1] * block_sizes[2])); + CeedCallBackend(CeedGetKernelHip(ceed, data->module, operator_name.c_str(), &data->op)); + + CeedCallBackend(CeedOperatorSetSetupDone(op)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.h b/backends/hip-gen/ceed-hip-gen-operator-build.h index 598f2de106..5e72045fc2 100644 --- a/backends/hip-gen/ceed-hip-gen-operator-build.h +++ b/backends/hip-gen/ceed-hip-gen-operator-build.h @@ -8,10 +8,7 @@ #ifndef _ceed_hip_gen_operator_build_h #define _ceed_hip_gen_operator_build_h -CEED_INTERN int BlockGridCalculate_Hip_gen(const CeedInt dim, - const CeedInt num_elem, - const CeedInt P_1d, 
const CeedInt Q_1d, - CeedInt *block_sizes); +CEED_INTERN int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt num_elem, const CeedInt P_1d, const CeedInt Q_1d, CeedInt *block_sizes); CEED_INTERN int CeedHipGenOperatorBuild(CeedOperator op); -#endif // _ceed_hip_gen_operator_build_h +#endif // _ceed_hip_gen_operator_build_h diff --git a/backends/hip-gen/ceed-hip-gen-operator.c b/backends/hip-gen/ceed-hip-gen-operator.c index 8d3b8d204a..d0b1030155 100644 --- a/backends/hip-gen/ceed-hip-gen-operator.c +++ b/backends/hip-gen/ceed-hip-gen-operator.c @@ -5,80 +5,69 @@ // // This file is part of CEED: http://github.com/ceed -#include #include +#include #include -#include "ceed-hip-gen.h" -#include "ceed-hip-gen-operator-build.h" + #include "../hip/ceed-hip-compile.h" +#include "ceed-hip-gen-operator-build.h" +#include "ceed-hip-gen.h" //------------------------------------------------------------------------------ // Destroy operator //------------------------------------------------------------------------------ static int CeedOperatorDestroy_Hip_gen(CeedOperator op) { - int ierr; CeedOperator_Hip_gen *impl; - ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); - ierr = CeedFree(&impl); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetData(op, &impl)); + CeedCallBackend(CeedFree(&impl)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Apply and add to output //------------------------------------------------------------------------------ -static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, - CeedVector output_vec, CeedRequest *request) { - int ierr; +static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) { Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedOperator_Hip_gen *data; - ierr = 
CeedOperatorGetData(op, &data); CeedChkBackend(ierr); - CeedQFunction qf; + CeedCallBackend(CeedOperatorGetData(op, &data)); + CeedQFunction qf; CeedQFunction_Hip_gen *qf_data; - ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); - ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); CeedInt num_elem, num_input_fields, num_output_fields; - ierr = CeedOperatorGetNumElements(op, &num_elem); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); CeedOperatorField *op_input_fields, *op_output_fields; - ierr = CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, - &num_output_fields, &op_output_fields); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); CeedQFunctionField *qf_input_fields, *qf_output_fields; - ierr = CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, - &qf_output_fields); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); CeedEvalMode eval_mode; - CeedVector vec, output_vecs[CEED_FIELD_MAX] = {}; + CeedVector vec, output_vecs[CEED_FIELD_MAX] = {}; - //Creation of the operator - ierr = CeedHipGenOperatorBuild(op); CeedChkBackend(ierr); + // Creation of the operator + CeedCallBackend(CeedHipGenOperatorBuild(op)); // Input vectors for (CeedInt i = 0; i < num_input_fields; i++) { - ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode); - CeedChkBackend(ierr); - if (eval_mode == CEED_EVAL_WEIGHT) { // Skip + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + if (eval_mode == CEED_EVAL_WEIGHT) { // Skip data->fields.inputs[i] = NULL; } else { // Get input vector - ierr = CeedOperatorFieldGetVector(op_input_fields[i], &vec); - CeedChkBackend(ierr); + 
CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) vec = input_vec; - ierr = CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i]); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.inputs[i])); } } // Output vectors for (CeedInt i = 0; i < num_output_fields; i++) { - ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); - CeedChkBackend(ierr); - if (eval_mode == CEED_EVAL_WEIGHT) { // Skip + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + if (eval_mode == CEED_EVAL_WEIGHT) { // Skip data->fields.outputs[i] = NULL; } else { // Get output vector - ierr = CeedOperatorFieldGetVector(op_output_fields[i], &vec); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) vec = output_vec; output_vecs[i] = vec; // Check for multiple output modes @@ -90,8 +79,7 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, } } if (index == -1) { - ierr = CeedVectorGetArray(vec, CEED_MEM_DEVICE, &data->fields.outputs[i]); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArray(vec, CEED_MEM_DEVICE, &data->fields.outputs[i])); } else { data->fields.outputs[i] = data->fields.outputs[index]; } @@ -99,69 +87,47 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, } // Get context data - ierr = CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c)); // Apply operator - void *opargs[] = {(void *) &num_elem, &qf_data->d_c, &data->indices, - &data->fields, &data->B, &data->G, &data->W - }; - const CeedInt dim = data->dim; - const CeedInt Q_1d = data->Q_1d; - const CeedInt P_1d = data->max_P_1d; + void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, 
&data->fields, &data->B, &data->G, &data->W}; + const CeedInt dim = data->dim; + const CeedInt Q_1d = data->Q_1d; + const CeedInt P_1d = data->max_P_1d; const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); - CeedInt block_sizes[3]; - ierr = BlockGridCalculate_Hip_gen(dim, num_elem, P_1d, Q_1d, block_sizes); - CeedChkBackend(ierr); - if (dim==1) { - CeedInt grid = num_elem/block_sizes[2] + ( ( - num_elem/block_sizes[2]*block_sizes[2]op, grid, block_sizes[0], - block_sizes[1], - block_sizes[2], sharedMem, opargs); - } else if (dim==2) { - CeedInt grid = num_elem/block_sizes[2] + ( ( - num_elem/block_sizes[2]*block_sizes[2]op, grid, block_sizes[0], - block_sizes[1], - block_sizes[2], sharedMem, opargs); - } else if (dim==3) { - CeedInt grid = num_elem/block_sizes[2] + ( ( - num_elem/block_sizes[2]*block_sizes[2]op, grid, block_sizes[0], - block_sizes[1], - block_sizes[2], sharedMem, opargs); + CeedInt block_sizes[3]; + CeedCallBackend(BlockGridCalculate_Hip_gen(dim, num_elem, P_1d, Q_1d, block_sizes)); + if (dim == 1) { + CeedInt grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0); + CeedInt sharedMem = block_sizes[2] * thread_1d * sizeof(CeedScalar); + CeedCallBackend(CeedRunKernelDimSharedHip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs)); + } else if (dim == 2) { + CeedInt grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0); + CeedInt sharedMem = block_sizes[2] * thread_1d * thread_1d * sizeof(CeedScalar); + CeedCallBackend(CeedRunKernelDimSharedHip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs)); + } else if (dim == 3) { + CeedInt grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 
1 : 0); + CeedInt sharedMem = block_sizes[2] * thread_1d * thread_1d * sizeof(CeedScalar); + CeedCallBackend(CeedRunKernelDimSharedHip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs)); } - CeedChkBackend(ierr); // Restore input arrays for (CeedInt i = 0; i < num_input_fields; i++) { - ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode); - CeedChkBackend(ierr); - if (eval_mode == CEED_EVAL_WEIGHT) { // Skip + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + if (eval_mode == CEED_EVAL_WEIGHT) { // Skip } else { - ierr = CeedOperatorFieldGetVector(op_input_fields[i], &vec); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) vec = input_vec; - ierr = CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i]); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i])); } } // Restore output arrays for (CeedInt i = 0; i < num_output_fields; i++) { - ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); - CeedChkBackend(ierr); - if (eval_mode == CEED_EVAL_WEIGHT) { // Skip + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + if (eval_mode == CEED_EVAL_WEIGHT) { // Skip } else { - ierr = CeedOperatorFieldGetVector(op_output_fields[i], &vec); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) vec = output_vec; // Check for multiple output modes CeedInt index = -1; @@ -172,15 +138,13 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, } } if (index == -1) { - ierr = CeedVectorRestoreArray(vec, &data->fields.outputs[i]); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArray(vec, &data->fields.outputs[i])); } } } // Restore context data - ierr = CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c); - 
CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c)); return CEED_ERROR_SUCCESS; } @@ -189,18 +153,15 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, // Create operator //------------------------------------------------------------------------------ int CeedOperatorCreate_Hip_gen(CeedOperator op) { - int ierr; Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedOperator_Hip_gen *impl; - ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr); - ierr = CeedOperatorSetData(op, impl); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(1, &impl)); + CeedCallBackend(CeedOperatorSetData(op, impl)); - ierr = CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", - CeedOperatorApplyAdd_Hip_gen); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Operator", op, "Destroy", - CeedOperatorDestroy_Hip_gen); CeedChkBackend(ierr); + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Hip_gen)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Hip_gen)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ diff --git a/backends/hip-gen/ceed-hip-gen-qfunction.c b/backends/hip-gen/ceed-hip-gen-qfunction.c index 70b621bc0c..cfa44de430 100644 --- a/backends/hip-gen/ceed-hip-gen-qfunction.c +++ b/backends/hip-gen/ceed-hip-gen-qfunction.c @@ -5,37 +5,34 @@ // // This file is part of CEED: http://github.com/ceed -#include #include +#include #include #include #include + #include "ceed-hip-gen.h" //------------------------------------------------------------------------------ // Apply QFunction //------------------------------------------------------------------------------ -static int CeedQFunctionApply_Hip_gen(CeedQFunction qf, CeedInt Q, - CeedVector *U, CeedVector *V) { - int ierr; 
+static int CeedQFunctionApply_Hip_gen(CeedQFunction qf, CeedInt Q, CeedVector *U, CeedVector *V) { Ceed ceed; - ierr = CeedQFunctionGetCeed(qf, &ceed); CeedChkBackend(ierr); - return CeedError(ceed, CEED_ERROR_BACKEND, - "Backend does not implement QFunctionApply"); + CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); + return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement QFunctionApply"); } //------------------------------------------------------------------------------ // Destroy QFunction //------------------------------------------------------------------------------ static int CeedQFunctionDestroy_Hip_gen(CeedQFunction qf) { - int ierr; CeedQFunction_Hip_gen *data; - ierr = CeedQFunctionGetData(qf, &data); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetData(qf, &data)); Ceed ceed; - ierr = CeedQFunctionGetCeed(qf, &ceed); CeedChkBackend(ierr); - ierr = hipFree(data->d_c); CeedChk_Hip(ceed, ierr); - ierr = CeedFree(&data->q_function_source); CeedChkBackend(ierr); - ierr = CeedFree(&data); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); + CeedCallHip(ceed, hipFree(data->d_c)); + CeedCallBackend(CeedFree(&data->q_function_source)); + CeedCallBackend(CeedFree(&data)); return CEED_ERROR_SUCCESS; } @@ -43,30 +40,25 @@ static int CeedQFunctionDestroy_Hip_gen(CeedQFunction qf) { // Create QFunction //------------------------------------------------------------------------------ int CeedQFunctionCreate_Hip_gen(CeedQFunction qf) { - int ierr; Ceed ceed; CeedQFunctionGetCeed(qf, &ceed); CeedQFunction_Hip_gen *data; - ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); - ierr = CeedQFunctionSetData(qf, data); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(1, &data)); + CeedCallBackend(CeedQFunctionSetData(qf, data)); // Read QFunction source - ierr = CeedQFunctionGetKernelName(qf, &data->q_function_name); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetKernelName(qf, &data->q_function_name)); 
CeedDebug256(ceed, 2, "----- Loading QFunction User Source -----\n"); - ierr = CeedQFunctionLoadSourceToBuffer(qf, &data->q_function_source); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionLoadSourceToBuffer(qf, &data->q_function_source)); CeedDebug256(ceed, 2, "----- Loading QFunction User Source Complete! -----\n"); - if (!data->q_function_source) + if (!data->q_function_source) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, - "/gpu/hip/gen backend requires QFunction source code file"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file"); + // LCOV_EXCL_STOP + } - ierr = CeedSetBackendFunction(ceed, "QFunction", qf, "Apply", - CeedQFunctionApply_Hip_gen); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "QFunction", qf, "Destroy", - CeedQFunctionDestroy_Hip_gen); CeedChkBackend(ierr); + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Hip_gen)); + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Hip_gen)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ diff --git a/backends/hip-gen/ceed-hip-gen.c b/backends/hip-gen/ceed-hip-gen.c index 2811dc506b..8a83912e0b 100644 --- a/backends/hip-gen/ceed-hip-gen.c +++ b/backends/hip-gen/ceed-hip-gen.c @@ -5,53 +5,45 @@ // // This file is part of CEED: http://github.com/ceed -#include +#include "ceed-hip-gen.h" + #include +#include #include -#include "ceed-hip-gen.h" //------------------------------------------------------------------------------ // Backend init //------------------------------------------------------------------------------ static int CeedInit_Hip_gen(const char *resource, Ceed ceed) { - int ierr; - char *resource_root; - ierr = CeedHipGetResourceRoot(ceed, resource, &resource_root); - CeedChkBackend(ierr); - if (strcmp(resource_root, 
"/gpu/hip") && strcmp(resource_root, "/gpu/hip/gen")) + CeedCallBackend(CeedHipGetResourceRoot(ceed, resource, &resource_root)); + if (strcmp(resource_root, "/gpu/hip") && strcmp(resource_root, "/gpu/hip/gen")) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Hip backend cannot use resource: %s", resource); - // LCOV_EXCL_STOP - ierr = CeedFree(&resource_root); CeedChkBackend(ierr); + return CeedError(ceed, CEED_ERROR_BACKEND, "Hip backend cannot use resource: %s", resource); + // LCOV_EXCL_STOP + } + CeedCallBackend(CeedFree(&resource_root)); Ceed_Hip *data; - ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); - ierr = CeedSetData(ceed, data); CeedChkBackend(ierr); - ierr = CeedHipInit(ceed, resource); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(1, &data)); + CeedCallBackend(CeedSetData(ceed, data)); + CeedCallBackend(CeedHipInit(ceed, resource)); Ceed ceedshared; - CeedInit("/gpu/hip/shared", &ceedshared); - ierr = CeedSetDelegate(ceed, ceedshared); CeedChkBackend(ierr); + CeedCallBackend(CeedInit("/gpu/hip/shared", &ceedshared)); + CeedCallBackend(CeedSetDelegate(ceed, ceedshared)); const char fallbackresource[] = "/gpu/hip/ref"; - ierr = CeedSetOperatorFallbackResource(ceed, fallbackresource); - CeedChkBackend(ierr); - - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", - CeedQFunctionCreate_Hip_gen); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", - CeedOperatorCreate_Hip_gen); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", - CeedDestroy_Hip); CeedChkBackend(ierr); + CeedCallBackend(CeedSetOperatorFallbackResource(ceed, fallbackresource)); + + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Hip_gen)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Hip_gen)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", 
CeedDestroy_Hip)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Register backend //------------------------------------------------------------------------------ -CEED_INTERN int CeedRegister_Hip_Gen(void) { - return CeedRegister("/gpu/hip/gen", CeedInit_Hip_gen, 20); -} +CEED_INTERN int CeedRegister_Hip_Gen(void) { return CeedRegister("/gpu/hip/gen", CeedInit_Hip_gen, 20); } //------------------------------------------------------------------------------ diff --git a/backends/hip-gen/ceed-hip-gen.h b/backends/hip-gen/ceed-hip-gen.h index 81ddc27059..4c392e831f 100644 --- a/backends/hip-gen/ceed-hip-gen.h +++ b/backends/hip-gen/ceed-hip-gen.h @@ -8,22 +8,23 @@ #ifndef _ceed_hip_gen_h #define _ceed_hip_gen_h -#include #include +#include #include + #include "../hip/ceed-hip-common.h" typedef struct { - CeedInt dim; - CeedInt Q_1d; - CeedInt max_P_1d; - hipModule_t module; + CeedInt dim; + CeedInt Q_1d; + CeedInt max_P_1d; + hipModule_t module; hipFunction_t op; FieldsInt_Hip indices; - Fields_Hip fields; - Fields_Hip B; - Fields_Hip G; - CeedScalar *W; + Fields_Hip fields; + Fields_Hip B; + Fields_Hip G; + CeedScalar *W; } CeedOperator_Hip_gen; typedef struct { @@ -36,4 +37,4 @@ CEED_INTERN int CeedQFunctionCreate_Hip_gen(CeedQFunction qf); CEED_INTERN int CeedOperatorCreate_Hip_gen(CeedOperator op); -#endif // _ceed_hip_gen_h +#endif // _ceed_hip_gen_h diff --git a/backends/hip-ref/ceed-hip-ref-basis.c b/backends/hip-ref/ceed-hip-ref-basis.c index 88184db967..d82d5e7358 100644 --- a/backends/hip-ref/ceed-hip-ref-basis.c +++ b/backends/hip-ref/ceed-hip-ref-basis.c @@ -5,187 +5,155 @@ // // This file is part of CEED: http://github.com/ceed -#include #include +#include #include #include -#include "ceed-hip-ref.h" + #include "../hip/ceed-hip-compile.h" +#include "ceed-hip-ref.h" //------------------------------------------------------------------------------ // Basis apply - tensor 
//------------------------------------------------------------------------------ -int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, - CeedTransposeMode t_mode, - CeedEvalMode eval_mode, CeedVector u, CeedVector v) { - int ierr; +int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { Ceed ceed; - ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); Ceed_Hip *ceed_Hip; - ierr = CeedGetData(ceed, &ceed_Hip); CeedChkBackend(ierr); + CeedCallBackend(CeedGetData(ceed, &ceed_Hip)); CeedBasis_Hip *data; - ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); - const CeedInt transpose = t_mode == CEED_TRANSPOSE; - const int max_block_size = 64; + CeedCallBackend(CeedBasisGetData(basis, &data)); + const CeedInt transpose = t_mode == CEED_TRANSPOSE; + const int max_block_size = 64; // Read vectors const CeedScalar *d_u; - CeedScalar *d_v; + CeedScalar *d_v; if (eval_mode != CEED_EVAL_WEIGHT) { - ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); } - ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); // Clear v for transpose operation if (t_mode == CEED_TRANSPOSE) { CeedSize length; - ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr); - ierr = hipMemset(d_v, 0, length * sizeof(CeedScalar)); - CeedChk_Hip(ceed, ierr); + CeedCallBackend(CeedVectorGetLength(v, &length)); + CeedCallHip(ceed, hipMemset(d_v, 0, length * sizeof(CeedScalar))); } // Basis action switch (eval_mode) { - case CEED_EVAL_INTERP: { - void *interp_args[] = {(void *) &num_elem, (void *) &transpose, - &data->d_interp_1d, &d_u, &d_v - }; - CeedInt Q_1d, dim; - ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); - ierr 
= CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); - CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); - - ierr = CeedRunKernelHip(ceed, data->Interp, num_elem, block_size, interp_args); - CeedChkBackend(ierr); - } break; - case CEED_EVAL_GRAD: { - void *grad_args[] = {(void *) &num_elem, (void *) &transpose, &data->d_interp_1d, - &data->d_grad_1d, &d_u, &d_v - }; - CeedInt block_size = max_block_size; - - ierr = CeedRunKernelHip(ceed, data->Grad, num_elem, block_size, grad_args); - CeedChkBackend(ierr); - } break; - case CEED_EVAL_WEIGHT: { - void *weight_args[] = {(void *) &num_elem, (void *) &data->d_q_weight_1d, &d_v}; - const int block_size = 64; - int grid_size = num_elem / block_size; - if (block_size * grid_size < num_elem) - grid_size += 1; - - ierr = CeedRunKernelHip(ceed, data->Weight, grid_size, block_size, - weight_args); CeedChkBackend(ierr); - } break; - // LCOV_EXCL_START - // Evaluate the divergence to/from the quadrature points - case CEED_EVAL_DIV: - return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); - // Evaluate the curl to/from the quadrature points - case CEED_EVAL_CURL: - return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); - // Take no action, BasisApply should not have been called - case CEED_EVAL_NONE: - return CeedError(ceed, CEED_ERROR_BACKEND, - "CEED_EVAL_NONE does not make sense in this context"); - // LCOV_EXCL_STOP + case CEED_EVAL_INTERP: { + void *interp_args[] = {(void *)&num_elem, (void *)&transpose, &data->d_interp_1d, &d_u, &d_v}; + CeedInt Q_1d, dim; + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); + CeedCallBackend(CeedBasisGetDimension(basis, &dim)); + CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); + + CeedCallBackend(CeedRunKernelHip(ceed, data->Interp, num_elem, block_size, interp_args)); + } break; + case CEED_EVAL_GRAD: { + void *grad_args[] = {(void *)&num_elem, (void *)&transpose, 
&data->d_interp_1d, &data->d_grad_1d, &d_u, &d_v}; + CeedInt block_size = max_block_size; + + CeedCallBackend(CeedRunKernelHip(ceed, data->Grad, num_elem, block_size, grad_args)); + } break; + case CEED_EVAL_WEIGHT: { + void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; + const int block_size = 64; + int grid_size = num_elem / block_size; + if (block_size * grid_size < num_elem) grid_size += 1; + + CeedCallBackend(CeedRunKernelHip(ceed, data->Weight, grid_size, block_size, weight_args)); + } break; + // LCOV_EXCL_START + // Evaluate the divergence to/from the quadrature points + case CEED_EVAL_DIV: + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); + // Evaluate the curl to/from the quadrature points + case CEED_EVAL_CURL: + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); + // Take no action, BasisApply should not have been called + case CEED_EVAL_NONE: + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context"); + // LCOV_EXCL_STOP } // Restore vectors if (eval_mode != CEED_EVAL_WEIGHT) { - ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); } - ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Basis apply - non-tensor //------------------------------------------------------------------------------ -int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, - CeedTransposeMode t_mode, CeedEvalMode eval_mode, - CeedVector u, CeedVector v) { - int ierr; +int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, + CeedVector v) { Ceed ceed; - ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); + 
CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); Ceed_Hip *ceed_Hip; - ierr = CeedGetData(ceed, &ceed_Hip); CeedChkBackend(ierr); + CeedCallBackend(CeedGetData(ceed, &ceed_Hip)); CeedBasisNonTensor_Hip *data; - ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetData(basis, &data)); CeedInt num_nodes, num_qpts; - ierr = CeedBasisGetNumQuadraturePoints(basis, &num_qpts); CeedChkBackend(ierr); - ierr = CeedBasisGetNumNodes(basis, &num_nodes); CeedChkBackend(ierr); - const CeedInt transpose = t_mode == CEED_TRANSPOSE; - int elemsPerBlock = 1; - int grid = num_elem/elemsPerBlock+(( - num_elem/elemsPerBlock*elemsPerBlockd_interp, &d_u, &d_v - }; - if (transpose) { - ierr = CeedRunKernelDimHip(ceed, data->Interp, grid, num_nodes, 1, - elemsPerBlock, interp_args); CeedChkBackend(ierr); - } else { - ierr = CeedRunKernelDimHip(ceed, data->Interp, grid, num_qpts, 1, - elemsPerBlock, interp_args); CeedChkBackend(ierr); - } - } break; - case CEED_EVAL_GRAD: { - void *grad_args[] = {(void *) &num_elem, (void *) &transpose, &data->d_grad, - &d_u, &d_v - }; - if (transpose) { - ierr = CeedRunKernelDimHip(ceed, data->Grad, grid, num_nodes, 1, - elemsPerBlock, grad_args); CeedChkBackend(ierr); - } else { - ierr = CeedRunKernelDimHip(ceed, data->Grad, grid, num_qpts, 1, - elemsPerBlock, grad_args); CeedChkBackend(ierr); - } - } break; - case CEED_EVAL_WEIGHT: { - void *weight_args[] = {(void *) &num_elem, (void *) &data->d_q_weight, &d_v}; - ierr = CeedRunKernelDimHip(ceed, data->Weight, grid, num_qpts, 1, - elemsPerBlock, weight_args); CeedChkBackend(ierr); - } break; - // LCOV_EXCL_START - // Evaluate the divergence to/from the quadrature points - case CEED_EVAL_DIV: - return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); - // Evaluate the curl to/from the quadrature points - case CEED_EVAL_CURL: - return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); - // Take no action, BasisApply should not have 
been called - case CEED_EVAL_NONE: - return CeedError(ceed, CEED_ERROR_BACKEND, - "CEED_EVAL_NONE does not make sense in this context"); - // LCOV_EXCL_STOP + case CEED_EVAL_INTERP: { + void *interp_args[] = {(void *)&num_elem, (void *)&transpose, &data->d_interp, &d_u, &d_v}; + const int block_size_x = transpose ? num_nodes : num_qpts; + CeedCallBackend(CeedRunKernelDimHip(ceed, data->Interp, grid, block_size_x, 1, elemsPerBlock, interp_args)); + } break; + case CEED_EVAL_GRAD: { + void *grad_args[] = {(void *)&num_elem, (void *)&transpose, &data->d_grad, &d_u, &d_v}; + const int block_size_x = transpose ? num_nodes : num_qpts; + CeedCallBackend(CeedRunKernelDimHip(ceed, data->Grad, grid, block_size_x, 1, elemsPerBlock, grad_args)); + } break; + case CEED_EVAL_WEIGHT: { + void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight, &d_v}; + CeedCallBackend(CeedRunKernelDimHip(ceed, data->Weight, grid, num_qpts, 1, elemsPerBlock, weight_args)); + } break; + // LCOV_EXCL_START + // Evaluate the divergence to/from the quadrature points + case CEED_EVAL_DIV: + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); + // Evaluate the curl to/from the quadrature points + case CEED_EVAL_CURL: + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); + // Take no action, BasisApply should not have been called + case CEED_EVAL_NONE: + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context"); + // LCOV_EXCL_STOP } // Restore vectors if (eval_mode != CEED_EVAL_WEIGHT) { - ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); } - ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); return CEED_ERROR_SUCCESS; } @@ -193,19 +161,18 @@ int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, // Destroy tensor basis 
//------------------------------------------------------------------------------ static int CeedBasisDestroy_Hip(CeedBasis basis) { - int ierr; Ceed ceed; - ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); CeedBasis_Hip *data; - ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetData(basis, &data)); - CeedChk_Hip(ceed, hipModuleUnload(data->module)); + CeedCallHip(ceed, hipModuleUnload(data->module)); - ierr = hipFree(data->d_q_weight_1d); CeedChk_Hip(ceed, ierr); - ierr = hipFree(data->d_interp_1d); CeedChk_Hip(ceed, ierr); - ierr = hipFree(data->d_grad_1d); CeedChk_Hip(ceed, ierr); - ierr = CeedFree(&data); CeedChkBackend(ierr); + CeedCallHip(ceed, hipFree(data->d_q_weight_1d)); + CeedCallHip(ceed, hipFree(data->d_interp_1d)); + CeedCallHip(ceed, hipFree(data->d_grad_1d)); + CeedCallBackend(CeedFree(&data)); return CEED_ERROR_SUCCESS; } @@ -214,19 +181,18 @@ static int CeedBasisDestroy_Hip(CeedBasis basis) { // Destroy non-tensor basis //------------------------------------------------------------------------------ static int CeedBasisDestroyNonTensor_Hip(CeedBasis basis) { - int ierr; Ceed ceed; - ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); CeedBasisNonTensor_Hip *data; - ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetData(basis, &data)); - CeedChk_Hip(ceed, hipModuleUnload(data->module)); + CeedCallHip(ceed, hipModuleUnload(data->module)); - ierr = hipFree(data->d_q_weight); CeedChk_Hip(ceed, ierr); - ierr = hipFree(data->d_interp); CeedChk_Hip(ceed, ierr); - ierr = hipFree(data->d_grad); CeedChk_Hip(ceed, ierr); - ierr = CeedFree(&data); CeedChkBackend(ierr); + CeedCallHip(ceed, hipFree(data->d_q_weight)); + CeedCallHip(ceed, hipFree(data->d_interp)); + CeedCallHip(ceed, hipFree(data->d_grad)); + CeedCallBackend(CeedFree(&data)); return 
CEED_ERROR_SUCCESS; } @@ -234,138 +200,92 @@ static int CeedBasisDestroyNonTensor_Hip(CeedBasis basis) { //------------------------------------------------------------------------------ // Create tensor //------------------------------------------------------------------------------ -int CeedBasisCreateTensorH1_Hip(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, - const CeedScalar *interp_1d, - const CeedScalar *grad_1d, - const CeedScalar *qref1d, - const CeedScalar *q_weight_1d, - CeedBasis basis) { - int ierr; +int CeedBasisCreateTensorH1_Hip(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, + const CeedScalar *qref1d, const CeedScalar *q_weight_1d, CeedBasis basis) { Ceed ceed; - ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); CeedBasis_Hip *data; - ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(1, &data)); // Copy data to GPU const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); - ierr = hipMalloc((void **)&data->d_q_weight_1d, q_bytes); - CeedChk_Hip(ceed, ierr); - ierr = hipMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, - hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); + CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight_1d, q_bytes)); + CeedCallHip(ceed, hipMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, hipMemcpyHostToDevice)); const CeedInt interp_bytes = q_bytes * P_1d; - ierr = hipMalloc((void **)&data->d_interp_1d, interp_bytes); - CeedChk_Hip(ceed, ierr); - ierr = hipMemcpy(data->d_interp_1d, interp_1d, interp_bytes, - hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); + CeedCallHip(ceed, hipMalloc((void **)&data->d_interp_1d, interp_bytes)); + CeedCallHip(ceed, hipMemcpy(data->d_interp_1d, interp_1d, interp_bytes, hipMemcpyHostToDevice)); - ierr = hipMalloc((void **)&data->d_grad_1d, interp_bytes); - CeedChk_Hip(ceed, ierr); - ierr = hipMemcpy(data->d_grad_1d, grad_1d, interp_bytes, - hipMemcpyHostToDevice); 
CeedChk_Hip(ceed, ierr); + CeedCallHip(ceed, hipMalloc((void **)&data->d_grad_1d, interp_bytes)); + CeedCallHip(ceed, hipMemcpy(data->d_grad_1d, grad_1d, interp_bytes, hipMemcpyHostToDevice)); // Complie basis kernels CeedInt ncomp; - ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetNumComponents(basis, &ncomp)); char *basis_kernel_path, *basis_kernel_source; - ierr = CeedGetJitAbsolutePath(ceed, - "ceed/jit-source/hip/hip-ref-basis-tensor.h", - &basis_kernel_path); CeedChkBackend(ierr); + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-basis-tensor.h", &basis_kernel_path)); CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n"); - ierr = CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source); - CeedChkBackend(ierr); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete! -----\n"); - ierr = CeedCompileHip(ceed, basis_kernel_source, &data->module, 7, - "BASIS_Q_1D", Q_1d, - "BASIS_P_1D", P_1d, - "BASIS_BUF_LEN", ncomp * CeedIntPow(Q_1d > P_1d ? 
- Q_1d : P_1d, dim), - "BASIS_DIM", dim, - "BASIS_NUM_COMP", ncomp, - "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), - "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim) - ); CeedChkBackend(ierr); - ierr = CeedGetKernelHip(ceed, data->module, "Interp", &data->Interp); - CeedChkBackend(ierr); - ierr = CeedGetKernelHip(ceed, data->module, "Grad", &data->Grad); - CeedChkBackend(ierr); - ierr = CeedGetKernelHip(ceed, data->module, "Weight", &data->Weight); - CeedChkBackend(ierr); - ierr = CeedFree(&basis_kernel_path); CeedChkBackend(ierr); - ierr = CeedFree(&basis_kernel_source); CeedChkBackend(ierr); - - ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr); - - ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", - CeedBasisApply_Hip); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", - CeedBasisDestroy_Hip); CeedChkBackend(ierr); + CeedCallBackend(CeedCompileHip(ceed, basis_kernel_source, &data->module, 7, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN", + ncomp * CeedIntPow(Q_1d > P_1d ? 
Q_1d : P_1d, dim), "BASIS_DIM", dim, "BASIS_NUM_COMP", ncomp, "BASIS_NUM_NODES", + CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim))); + CeedCallBackend(CeedGetKernelHip(ceed, data->module, "Interp", &data->Interp)); + CeedCallBackend(CeedGetKernelHip(ceed, data->module, "Grad", &data->Grad)); + CeedCallBackend(CeedGetKernelHip(ceed, data->module, "Weight", &data->Weight)); + CeedCallBackend(CeedFree(&basis_kernel_path)); + CeedCallBackend(CeedFree(&basis_kernel_source)); + + CeedCallBackend(CeedBasisSetData(basis, data)); + + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Hip)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Create non-tensor //------------------------------------------------------------------------------ -int CeedBasisCreateH1_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, - CeedInt num_qpts, const CeedScalar *interp, - const CeedScalar *grad, const CeedScalar *qref, - const CeedScalar *q_weight, CeedBasis basis) { - int ierr; +int CeedBasisCreateH1_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad, + const CeedScalar *qref, const CeedScalar *q_weight, CeedBasis basis) { Ceed ceed; - ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); CeedBasisNonTensor_Hip *data; - ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(1, &data)); // Copy basis data to GPU const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); - ierr = hipMalloc((void **)&data->d_q_weight, q_bytes); CeedChk_Hip(ceed, ierr); - ierr = hipMemcpy(data->d_q_weight, q_weight, q_bytes, - hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); + CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes)); 
+ CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice)); const CeedInt interp_bytes = q_bytes * num_nodes; - ierr = hipMalloc((void **)&data->d_interp, interp_bytes); - CeedChk_Hip(ceed, ierr); - ierr = hipMemcpy(data->d_interp, interp, interp_bytes, - hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); + CeedCallHip(ceed, hipMalloc((void **)&data->d_interp, interp_bytes)); + CeedCallHip(ceed, hipMemcpy(data->d_interp, interp, interp_bytes, hipMemcpyHostToDevice)); const CeedInt grad_bytes = q_bytes * num_nodes * dim; - ierr = hipMalloc((void **)&data->d_grad, grad_bytes); CeedChk_Hip(ceed, ierr); - ierr = hipMemcpy(data->d_grad, grad, grad_bytes, - hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); + CeedCallHip(ceed, hipMalloc((void **)&data->d_grad, grad_bytes)); + CeedCallHip(ceed, hipMemcpy(data->d_grad, grad, grad_bytes, hipMemcpyHostToDevice)); // Compile basis kernels CeedInt ncomp; - ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetNumComponents(basis, &ncomp)); char *basis_kernel_path, *basis_kernel_source; - ierr = CeedGetJitAbsolutePath(ceed, - "ceed/jit-source/hip/hip-ref-basis-nontensor.h", - &basis_kernel_path); CeedChkBackend(ierr); + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-basis-nontensor.h", &basis_kernel_path)); CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n"); - ierr = CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source); - CeedChkBackend(ierr); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete! 
-----\n"); - ierr = CeedCompileHip(ceed, basis_kernel_source, &data->module, 4, - "BASIS_Q", num_qpts, - "BASIS_P", num_nodes, - "BASIS_DIM", dim, - "BASIS_NUM_COMP", ncomp - ); CeedChk_Hip(ceed, ierr); - ierr = CeedGetKernelHip(ceed, data->module, "Interp", &data->Interp); - CeedChk_Hip(ceed, ierr); - ierr = CeedGetKernelHip(ceed, data->module, "Grad", &data->Grad); - CeedChk_Hip(ceed, ierr); - ierr = CeedGetKernelHip(ceed, data->module, "Weight", &data->Weight); - CeedChk_Hip(ceed, ierr); - ierr = CeedFree(&basis_kernel_path); CeedChkBackend(ierr); - ierr = CeedFree(&basis_kernel_source); CeedChkBackend(ierr); - - ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr); + CeedCallBackend(CeedCompileHip(ceed, basis_kernel_source, &data->module, 4, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_DIM", dim, + "BASIS_NUM_COMP", ncomp)); + CeedCallBackend(CeedGetKernelHip(ceed, data->module, "Interp", &data->Interp)); + CeedCallBackend(CeedGetKernelHip(ceed, data->module, "Grad", &data->Grad)); + CeedCallBackend(CeedGetKernelHip(ceed, data->module, "Weight", &data->Weight)); + CeedCallBackend(CeedFree(&basis_kernel_path)); + CeedCallBackend(CeedFree(&basis_kernel_source)); + CeedCallBackend(CeedBasisSetData(basis, data)); // Register backend functions - ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", - CeedBasisApplyNonTensor_Hip); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", - CeedBasisDestroyNonTensor_Hip); CeedChkBackend(ierr); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Hip)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c index daeb77d74e..16caa4a5c4 100644 --- 
a/backends/hip-ref/ceed-hip-ref-operator.c +++ b/backends/hip-ref/ceed-hip-ref-operator.c @@ -5,122 +5,113 @@ // // This file is part of CEED: http://github.com/ceed -#include +#include #include +#include #include #include -#include #include #include -#include "ceed-hip-ref.h" + #include "../hip/ceed-hip-compile.h" +#include "ceed-hip-ref.h" //------------------------------------------------------------------------------ // Destroy operator //------------------------------------------------------------------------------ static int CeedOperatorDestroy_Hip(CeedOperator op) { - int ierr; CeedOperator_Hip *impl; - ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetData(op, &impl)); // Apply data for (CeedInt i = 0; i < impl->numein + impl->numeout; i++) { - ierr = CeedVectorDestroy(&impl->evecs[i]); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorDestroy(&impl->evecs[i])); } - ierr = CeedFree(&impl->evecs); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl->evecs)); for (CeedInt i = 0; i < impl->numein; i++) { - ierr = CeedVectorDestroy(&impl->qvecsin[i]); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorDestroy(&impl->qvecsin[i])); } - ierr = CeedFree(&impl->qvecsin); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl->qvecsin)); for (CeedInt i = 0; i < impl->numeout; i++) { - ierr = CeedVectorDestroy(&impl->qvecsout[i]); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorDestroy(&impl->qvecsout[i])); } - ierr = CeedFree(&impl->qvecsout); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl->qvecsout)); // QFunction diagonal assembly data - for (CeedInt i=0; iqfnumactivein; i++) { - ierr = CeedVectorDestroy(&impl->qfactivein[i]); CeedChkBackend(ierr); + for (CeedInt i = 0; i < impl->qfnumactivein; i++) { + CeedCallBackend(CeedVectorDestroy(&impl->qfactivein[i])); } - ierr = CeedFree(&impl->qfactivein); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl->qfactivein)); // Diag data if (impl->diag) { Ceed ceed; 
- ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); - CeedChk_Hip(ceed, hipModuleUnload(impl->diag->module)); - ierr = CeedFree(&impl->diag->h_emodein); CeedChkBackend(ierr); - ierr = CeedFree(&impl->diag->h_emodeout); CeedChkBackend(ierr); - ierr = hipFree(impl->diag->d_emodein); CeedChk_Hip(ceed, ierr); - ierr = hipFree(impl->diag->d_emodeout); CeedChk_Hip(ceed, ierr); - ierr = hipFree(impl->diag->d_identity); CeedChk_Hip(ceed, ierr); - ierr = hipFree(impl->diag->d_interpin); CeedChk_Hip(ceed, ierr); - ierr = hipFree(impl->diag->d_interpout); CeedChk_Hip(ceed, ierr); - ierr = hipFree(impl->diag->d_gradin); CeedChk_Hip(ceed, ierr); - ierr = hipFree(impl->diag->d_gradout); CeedChk_Hip(ceed, ierr); - ierr = CeedElemRestrictionDestroy(&impl->diag->pbdiagrstr); - CeedChkBackend(ierr); - ierr = CeedVectorDestroy(&impl->diag->elemdiag); CeedChkBackend(ierr); - ierr = CeedVectorDestroy(&impl->diag->pbelemdiag); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedCallHip(ceed, hipModuleUnload(impl->diag->module)); + CeedCallBackend(CeedFree(&impl->diag->h_emodein)); + CeedCallBackend(CeedFree(&impl->diag->h_emodeout)); + CeedCallHip(ceed, hipFree(impl->diag->d_emodein)); + CeedCallHip(ceed, hipFree(impl->diag->d_emodeout)); + CeedCallHip(ceed, hipFree(impl->diag->d_identity)); + CeedCallHip(ceed, hipFree(impl->diag->d_interpin)); + CeedCallHip(ceed, hipFree(impl->diag->d_interpout)); + CeedCallHip(ceed, hipFree(impl->diag->d_gradin)); + CeedCallHip(ceed, hipFree(impl->diag->d_gradout)); + CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->pbdiagrstr)); + CeedCallBackend(CeedVectorDestroy(&impl->diag->elemdiag)); + CeedCallBackend(CeedVectorDestroy(&impl->diag->pbelemdiag)); } - ierr = CeedFree(&impl->diag); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl->diag)); if (impl->asmb) { Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); - CeedChk_Hip(ceed, hipModuleUnload(impl->asmb->module)); - ierr = 
hipFree(impl->asmb->d_B_in); CeedChk_Hip(ceed, ierr); - ierr = hipFree(impl->asmb->d_B_out); CeedChk_Hip(ceed, ierr); + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedCallHip(ceed, hipModuleUnload(impl->asmb->module)); + CeedCallHip(ceed, hipFree(impl->asmb->d_B_in)); + CeedCallHip(ceed, hipFree(impl->asmb->d_B_out)); } - ierr = CeedFree(&impl->asmb); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl->asmb)); - ierr = CeedFree(&impl); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Setup infields or outfields //------------------------------------------------------------------------------ -static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, - bool isinput, CeedVector *evecs, - CeedVector *qvecs, CeedInt starte, - CeedInt numfields, CeedInt Q, - CeedInt numelements) { - CeedInt dim, ierr, size; +static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool isinput, CeedVector *evecs, CeedVector *qvecs, CeedInt starte, + CeedInt numfields, CeedInt Q, CeedInt numelements) { + CeedInt dim, size; CeedSize q_size; - Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); - CeedBasis basis; + Ceed ceed; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedBasis basis; CeedElemRestriction Erestrict; - CeedOperatorField *opfields; + CeedOperatorField *opfields; CeedQFunctionField *qffields; - CeedVector fieldvec; - bool strided; - bool skiprestrict; + CeedVector fieldvec; + bool strided; + bool skiprestrict; if (isinput) { - ierr = CeedOperatorGetFields(op, NULL, &opfields, NULL, NULL); - CeedChkBackend(ierr); - ierr = CeedQFunctionGetFields(qf, NULL, &qffields, NULL, NULL); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetFields(op, NULL, &opfields, NULL, NULL)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qffields, NULL, NULL)); } else { - ierr = 
CeedOperatorGetFields(op, NULL, NULL, NULL, &opfields); - CeedChkBackend(ierr); - ierr = CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qffields); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &opfields)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qffields)); } // Loop over fields for (CeedInt i = 0; i < numfields; i++) { CeedEvalMode emode; - ierr = CeedQFunctionFieldGetEvalMode(qffields[i], &emode); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qffields[i], &emode)); - strided = false; + strided = false; skiprestrict = false; if (emode != CEED_EVAL_WEIGHT) { - ierr = CeedOperatorFieldGetElemRestriction(opfields[i], &Erestrict); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(opfields[i], &Erestrict)); // Check whether this field can skip the element restriction: // must be passive input, with emode NONE, and have a strided restriction with @@ -129,17 +120,15 @@ static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, // First, check whether the field is input or output: if (isinput) { // Check for passive input: - ierr = CeedOperatorFieldGetVector(opfields[i], &fieldvec); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorFieldGetVector(opfields[i], &fieldvec)); if (fieldvec != CEED_VECTOR_ACTIVE) { // Check emode if (emode == CEED_EVAL_NONE) { // Check for strided restriction - ierr = CeedElemRestrictionIsStrided(Erestrict, &strided); - CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionIsStrided(Erestrict, &strided)); if (strided) { // Check if vector is already in preferred backend ordering - ierr = CeedElemRestrictionHasBackendStrides(Erestrict, - &skiprestrict); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionHasBackendStrides(Erestrict, &skiprestrict)); } } } @@ -149,41 +138,38 @@ static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, // directly in the operator application. 
evecs[i + starte] = NULL; } else { - ierr = CeedElemRestrictionCreateVector(Erestrict, NULL, - &evecs[i + starte]); - CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionCreateVector(Erestrict, NULL, &evecs[i + starte])); } } switch (emode) { - case CEED_EVAL_NONE: - ierr = CeedQFunctionFieldGetSize(qffields[i], &size); CeedChkBackend(ierr); - q_size = (CeedSize)numelements * Q * size; - ierr = CeedVectorCreate(ceed, q_size, &qvecs[i]); CeedChkBackend(ierr); - break; - case CEED_EVAL_INTERP: - ierr = CeedQFunctionFieldGetSize(qffields[i], &size); CeedChkBackend(ierr); - q_size = (CeedSize)numelements * Q * size; - ierr = CeedVectorCreate(ceed, q_size, &qvecs[i]); CeedChkBackend(ierr); - break; - case CEED_EVAL_GRAD: - ierr = CeedOperatorFieldGetBasis(opfields[i], &basis); CeedChkBackend(ierr); - ierr = CeedQFunctionFieldGetSize(qffields[i], &size); CeedChkBackend(ierr); - ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); - q_size = (CeedSize)numelements * Q * size; - ierr = CeedVectorCreate(ceed, q_size, &qvecs[i]); CeedChkBackend(ierr); - break; - case CEED_EVAL_WEIGHT: // Only on input fields - ierr = CeedOperatorFieldGetBasis(opfields[i], &basis); CeedChkBackend(ierr); - q_size = (CeedSize)numelements * Q; - ierr = CeedVectorCreate(ceed, q_size, &qvecs[i]); CeedChkBackend(ierr); - ierr = CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE, - CEED_EVAL_WEIGHT, NULL, qvecs[i]); CeedChkBackend(ierr); - break; - case CEED_EVAL_DIV: - break; // TODO: Not implemented - case CEED_EVAL_CURL: - break; // TODO: Not implemented + case CEED_EVAL_NONE: + CeedCallBackend(CeedQFunctionFieldGetSize(qffields[i], &size)); + q_size = (CeedSize)numelements * Q * size; + CeedCallBackend(CeedVectorCreate(ceed, q_size, &qvecs[i])); + break; + case CEED_EVAL_INTERP: + CeedCallBackend(CeedQFunctionFieldGetSize(qffields[i], &size)); + q_size = (CeedSize)numelements * Q * size; + CeedCallBackend(CeedVectorCreate(ceed, q_size, &qvecs[i])); + break; + case 
CEED_EVAL_GRAD: + CeedCallBackend(CeedOperatorFieldGetBasis(opfields[i], &basis)); + CeedCallBackend(CeedQFunctionFieldGetSize(qffields[i], &size)); + CeedCallBackend(CeedBasisGetDimension(basis, &dim)); + q_size = (CeedSize)numelements * Q * size; + CeedCallBackend(CeedVectorCreate(ceed, q_size, &qvecs[i])); + break; + case CEED_EVAL_WEIGHT: // Only on input fields + CeedCallBackend(CeedOperatorFieldGetBasis(opfields[i], &basis)); + q_size = (CeedSize)numelements * Q; + CeedCallBackend(CeedVectorCreate(ceed, q_size, &qvecs[i])); + CeedCallBackend(CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, NULL, qvecs[i])); + break; + case CEED_EVAL_DIV: + break; // TODO: Not implemented + case CEED_EVAL_CURL: + break; // TODO: Not implemented } } return CEED_ERROR_SUCCESS; @@ -194,100 +180,77 @@ static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, // to the named inputs and outputs of its CeedQFunction. //------------------------------------------------------------------------------ static int CeedOperatorSetup_Hip(CeedOperator op) { - int ierr; bool setupdone; - ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChkBackend(ierr); - if (setupdone) - return CEED_ERROR_SUCCESS; + CeedCallBackend(CeedOperatorIsSetupDone(op, &setupdone)); + if (setupdone) return CEED_ERROR_SUCCESS; Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedOperator_Hip *impl; - ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetData(op, &impl)); CeedQFunction qf; - ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); CeedInt Q, numelements, numinputfields, numoutputfields; - ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr); - ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, 
&Q)); + CeedCallBackend(CeedOperatorGetNumElements(op, &numelements)); CeedOperatorField *opinputfields, *opoutputfields; - ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, - &numoutputfields, &opoutputfields); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields)); CeedQFunctionField *qfinputfields, *qfoutputfields; - ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields)); // Allocate - ierr = CeedCalloc(numinputfields + numoutputfields, &impl->evecs); - CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(numinputfields + numoutputfields, &impl->evecs)); - ierr = CeedCalloc(CEED_FIELD_MAX, &impl->qvecsin); CeedChkBackend(ierr); - ierr = CeedCalloc(CEED_FIELD_MAX, &impl->qvecsout); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->qvecsin)); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->qvecsout)); - impl->numein = numinputfields; impl->numeout = numoutputfields; + impl->numein = numinputfields; + impl->numeout = numoutputfields; // Set up infield and outfield evecs and qvecs // Infields - ierr = CeedOperatorSetupFields_Hip(qf, op, true, - impl->evecs, impl->qvecsin, 0, - numinputfields, Q, numelements); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, true, impl->evecs, impl->qvecsin, 0, numinputfields, Q, numelements)); // Outfields - ierr = CeedOperatorSetupFields_Hip(qf, op, false, - impl->evecs, impl->qvecsout, - numinputfields, numoutputfields, Q, - numelements); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, false, impl->evecs, impl->qvecsout, numinputfields, numoutputfields, Q, numelements)); - ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorSetSetupDone(op)); return CEED_ERROR_SUCCESS; } 
//------------------------------------------------------------------------------ // Setup Operator Inputs //------------------------------------------------------------------------------ -static inline int CeedOperatorSetupInputs_Hip(CeedInt numinputfields, - CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields, - CeedVector invec, const bool skipactive, CeedScalar *edata[2*CEED_FIELD_MAX], - CeedOperator_Hip *impl, CeedRequest *request) { - CeedInt ierr; - CeedEvalMode emode; - CeedVector vec; +static inline int CeedOperatorSetupInputs_Hip(CeedInt numinputfields, CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields, + CeedVector invec, const bool skipactive, CeedScalar *edata[2 * CEED_FIELD_MAX], CeedOperator_Hip *impl, + CeedRequest *request) { + CeedEvalMode emode; + CeedVector vec; CeedElemRestriction Erestrict; for (CeedInt i = 0; i < numinputfields; i++) { // Get input vector - ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { - if (skipactive) - continue; - else - vec = invec; + if (skipactive) continue; + else vec = invec; } - ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); - CeedChkBackend(ierr); - if (emode == CEED_EVAL_WEIGHT) { // Skip + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode)); + if (emode == CEED_EVAL_WEIGHT) { // Skip } else { // Get input vector - ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec)); // Get input element restriction - ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); - CeedChkBackend(ierr); - if (vec == CEED_VECTOR_ACTIVE) - vec = invec; + CeedCallBackend(CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict)); + if (vec == CEED_VECTOR_ACTIVE) vec = invec; // Restrict, if necessary if 
(!impl->evecs[i]) { // No restriction for this field; read data directly from vec. - ierr = CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, - (const CeedScalar **) &edata[i]); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, (const CeedScalar **)&edata[i])); } else { - ierr = CeedElemRestrictionApply(Erestrict, CEED_NOTRANSPOSE, vec, - impl->evecs[i], request); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionApply(Erestrict, CEED_NOTRANSPOSE, vec, impl->evecs[i], request)); // Get evec - ierr = CeedVectorGetArrayRead(impl->evecs[i], CEED_MEM_DEVICE, - (const CeedScalar **) &edata[i]); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayRead(impl->evecs[i], CEED_MEM_DEVICE, (const CeedScalar **)&edata[i])); } } } @@ -297,58 +260,45 @@ static inline int CeedOperatorSetupInputs_Hip(CeedInt numinputfields, //------------------------------------------------------------------------------ // Input Basis Action //------------------------------------------------------------------------------ -static inline int CeedOperatorInputBasis_Hip(CeedInt numelements, - CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields, - CeedInt numinputfields, const bool skipactive, - CeedScalar *edata[2*CEED_FIELD_MAX], CeedOperator_Hip *impl) { - CeedInt ierr; - CeedInt elemsize, size; +static inline int CeedOperatorInputBasis_Hip(CeedInt numelements, CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields, + CeedInt numinputfields, const bool skipactive, CeedScalar *edata[2 * CEED_FIELD_MAX], + CeedOperator_Hip *impl) { + CeedInt elemsize, size; CeedElemRestriction Erestrict; - CeedEvalMode emode; - CeedBasis basis; + CeedEvalMode emode; + CeedBasis basis; - for (CeedInt i=0; iqvecsin[i], CEED_MEM_DEVICE, - CEED_USE_POINTER, edata[i]); CeedChkBackend(ierr); - break; - case CEED_EVAL_INTERP: - ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); - CeedChkBackend(ierr); - ierr = CeedBasisApply(basis, 
numelements, CEED_NOTRANSPOSE, - CEED_EVAL_INTERP, impl->evecs[i], - impl->qvecsin[i]); CeedChkBackend(ierr); - break; - case CEED_EVAL_GRAD: - ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); - CeedChkBackend(ierr); - ierr = CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE, - CEED_EVAL_GRAD, impl->evecs[i], - impl->qvecsin[i]); CeedChkBackend(ierr); - break; - case CEED_EVAL_WEIGHT: - break; // No action - case CEED_EVAL_DIV: - break; // TODO: Not implemented - case CEED_EVAL_CURL: - break; // TODO: Not implemented + case CEED_EVAL_NONE: + CeedCallBackend(CeedVectorSetArray(impl->qvecsin[i], CEED_MEM_DEVICE, CEED_USE_POINTER, edata[i])); + break; + case CEED_EVAL_INTERP: + CeedCallBackend(CeedOperatorFieldGetBasis(opinputfields[i], &basis)); + CeedCallBackend(CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, impl->evecs[i], impl->qvecsin[i])); + break; + case CEED_EVAL_GRAD: + CeedCallBackend(CeedOperatorFieldGetBasis(opinputfields[i], &basis)); + CeedCallBackend(CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, impl->evecs[i], impl->qvecsin[i])); + break; + case CEED_EVAL_WEIGHT: + break; // No action + case CEED_EVAL_DIV: + break; // TODO: Not implemented + case CEED_EVAL_CURL: + break; // TODO: Not implemented } } return CEED_ERROR_SUCCESS; @@ -357,34 +307,25 @@ static inline int CeedOperatorInputBasis_Hip(CeedInt numelements, //------------------------------------------------------------------------------ // Restore Input Vectors //------------------------------------------------------------------------------ -static inline int CeedOperatorRestoreInputs_Hip(CeedInt numinputfields, - CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields, - const bool skipactive, CeedScalar *edata[2*CEED_FIELD_MAX], - CeedOperator_Hip *impl) { - CeedInt ierr; +static inline int CeedOperatorRestoreInputs_Hip(CeedInt numinputfields, CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields, + const bool 
skipactive, CeedScalar *edata[2 * CEED_FIELD_MAX], CeedOperator_Hip *impl) { CeedEvalMode emode; - CeedVector vec; + CeedVector vec; for (CeedInt i = 0; i < numinputfields; i++) { // Skip active input if (skipactive) { - ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr); - if (vec == CEED_VECTOR_ACTIVE) - continue; + CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec)); + if (vec == CEED_VECTOR_ACTIVE) continue; } - ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); - CeedChkBackend(ierr); - if (emode == CEED_EVAL_WEIGHT) { // Skip + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode)); + if (emode == CEED_EVAL_WEIGHT) { // Skip } else { if (!impl->evecs[i]) { // This was a skiprestrict case - ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChkBackend(ierr); - ierr = CeedVectorRestoreArrayRead(vec, - (const CeedScalar **)&edata[i]); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec)); + CeedCallBackend(CeedVectorRestoreArrayRead(vec, (const CeedScalar **)&edata[i])); } else { - ierr = CeedVectorRestoreArrayRead(impl->evecs[i], - (const CeedScalar **) &edata[i]); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArrayRead(impl->evecs[i], (const CeedScalar **)&edata[i])); } } } @@ -394,225 +335,176 @@ static inline int CeedOperatorRestoreInputs_Hip(CeedInt numinputfields, //------------------------------------------------------------------------------ // Apply and add to output //------------------------------------------------------------------------------ -static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector invec, - CeedVector outvec, CeedRequest *request) { - int ierr; +static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector invec, CeedVector outvec, CeedRequest *request) { CeedOperator_Hip *impl; - ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetData(op, 
&impl)); CeedQFunction qf; - ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); CeedInt Q, numelements, elemsize, numinputfields, numoutputfields, size; - ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr); - ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); + CeedCallBackend(CeedOperatorGetNumElements(op, &numelements)); CeedOperatorField *opinputfields, *opoutputfields; - ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, - &numoutputfields, &opoutputfields); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields)); CeedQFunctionField *qfinputfields, *qfoutputfields; - ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields); - CeedChkBackend(ierr); - CeedEvalMode emode; - CeedVector vec; - CeedBasis basis; + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields)); + CeedEvalMode emode; + CeedVector vec; + CeedBasis basis; CeedElemRestriction Erestrict; - CeedScalar *edata[2*CEED_FIELD_MAX]; + CeedScalar *edata[2 * CEED_FIELD_MAX]; // Setup - ierr = CeedOperatorSetup_Hip(op); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorSetup_Hip(op)); // Input Evecs and Restriction - ierr = CeedOperatorSetupInputs_Hip(numinputfields, qfinputfields, - opinputfields, invec, false, edata, - impl, request); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorSetupInputs_Hip(numinputfields, qfinputfields, opinputfields, invec, false, edata, impl, request)); // Input basis apply if needed - ierr = CeedOperatorInputBasis_Hip(numelements, qfinputfields, opinputfields, - numinputfields, false, edata, impl); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorInputBasis_Hip(numelements, qfinputfields, opinputfields, numinputfields, false, edata, impl)); // 
Output pointers, as necessary for (CeedInt i = 0; i < numoutputfields; i++) { - ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode)); if (emode == CEED_EVAL_NONE) { // Set the output Q-Vector to use the E-Vector data directly. - ierr = CeedVectorGetArrayWrite(impl->evecs[i + impl->numein], CEED_MEM_DEVICE, - &edata[i + numinputfields]); CeedChkBackend(ierr); - ierr = CeedVectorSetArray(impl->qvecsout[i], CEED_MEM_DEVICE, - CEED_USE_POINTER, edata[i + numinputfields]); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayWrite(impl->evecs[i + impl->numein], CEED_MEM_DEVICE, &edata[i + numinputfields])); + CeedCallBackend(CeedVectorSetArray(impl->qvecsout[i], CEED_MEM_DEVICE, CEED_USE_POINTER, edata[i + numinputfields])); } } // Q function - ierr = CeedQFunctionApply(qf, numelements * Q, impl->qvecsin, impl->qvecsout); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionApply(qf, numelements * Q, impl->qvecsin, impl->qvecsout)); // Output basis apply if needed for (CeedInt i = 0; i < numoutputfields; i++) { // Get elemsize, emode, size - ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); - CeedChkBackend(ierr); - ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); - CeedChkBackend(ierr); - ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); - CeedChkBackend(ierr); - ierr = CeedQFunctionFieldGetSize(qfoutputfields[i], &size); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict)); + CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elemsize)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode)); + CeedCallBackend(CeedQFunctionFieldGetSize(qfoutputfields[i], &size)); // Basis action switch (emode) { - case CEED_EVAL_NONE: - break; - case CEED_EVAL_INTERP: - ierr = CeedOperatorFieldGetBasis(opoutputfields[i], 
&basis); - CeedChkBackend(ierr); - ierr = CeedBasisApply(basis, numelements, CEED_TRANSPOSE, - CEED_EVAL_INTERP, impl->qvecsout[i], - impl->evecs[i + impl->numein]); CeedChkBackend(ierr); - break; - case CEED_EVAL_GRAD: - ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); - CeedChkBackend(ierr); - ierr = CeedBasisApply(basis, numelements, CEED_TRANSPOSE, - CEED_EVAL_GRAD, impl->qvecsout[i], - impl->evecs[i + impl->numein]); CeedChkBackend(ierr); - break; - // LCOV_EXCL_START - case CEED_EVAL_WEIGHT: { - Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); - return CeedError(ceed, CEED_ERROR_BACKEND, - "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); - break; // Should not occur - } - case CEED_EVAL_DIV: - break; // TODO: Not implemented - case CEED_EVAL_CURL: - break; // TODO: Not implemented - // LCOV_EXCL_STOP + case CEED_EVAL_NONE: + break; + case CEED_EVAL_INTERP: + CeedCallBackend(CeedOperatorFieldGetBasis(opoutputfields[i], &basis)); + CeedCallBackend(CeedBasisApply(basis, numelements, CEED_TRANSPOSE, CEED_EVAL_INTERP, impl->qvecsout[i], impl->evecs[i + impl->numein])); + break; + case CEED_EVAL_GRAD: + CeedCallBackend(CeedOperatorFieldGetBasis(opoutputfields[i], &basis)); + CeedCallBackend(CeedBasisApply(basis, numelements, CEED_TRANSPOSE, CEED_EVAL_GRAD, impl->qvecsout[i], impl->evecs[i + impl->numein])); + break; + // LCOV_EXCL_START + case CEED_EVAL_WEIGHT: { + Ceed ceed; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); + break; // Should not occur + } + case CEED_EVAL_DIV: + break; // TODO: Not implemented + case CEED_EVAL_CURL: + break; // TODO: Not implemented + // LCOV_EXCL_STOP } } // Output restriction for (CeedInt i = 0; i < numoutputfields; i++) { // Restore evec - ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); - CeedChkBackend(ierr); + 
CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode)); if (emode == CEED_EVAL_NONE) { - ierr = CeedVectorRestoreArray(impl->evecs[i+impl->numein], - &edata[i + numinputfields]); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArray(impl->evecs[i + impl->numein], &edata[i + numinputfields])); } // Get output vector - ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorFieldGetVector(opoutputfields[i], &vec)); // Restrict - ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict)); // Active - if (vec == CEED_VECTOR_ACTIVE) - vec = outvec; + if (vec == CEED_VECTOR_ACTIVE) vec = outvec; - ierr = CeedElemRestrictionApply(Erestrict, CEED_TRANSPOSE, - impl->evecs[i + impl->numein], vec, - request); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionApply(Erestrict, CEED_TRANSPOSE, impl->evecs[i + impl->numein], vec, request)); } // Restore input arrays - ierr = CeedOperatorRestoreInputs_Hip(numinputfields, qfinputfields, - opinputfields, false, edata, impl); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorRestoreInputs_Hip(numinputfields, qfinputfields, opinputfields, false, edata, impl)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Core code for assembling linear QFunction //------------------------------------------------------------------------------ -static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, - bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr, - CeedRequest *request) { - int ierr; +static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr, + CeedRequest *request) { CeedOperator_Hip *impl; - ierr = CeedOperatorGetData(op, 
&impl); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetData(op, &impl)); CeedQFunction qf; - ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); - CeedInt Q, numelements, numinputfields, numoutputfields, size; + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedInt Q, numelements, numinputfields, numoutputfields, size; CeedSize q_size; - ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr); - ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); + CeedCallBackend(CeedOperatorGetNumElements(op, &numelements)); CeedOperatorField *opinputfields, *opoutputfields; - ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, - &numoutputfields, &opoutputfields); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields)); CeedQFunctionField *qfinputfields, *qfoutputfields; - ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields); - CeedChkBackend(ierr); - CeedVector vec; - CeedInt numactivein = impl->qfnumactivein, numactiveout = impl->qfnumactiveout; + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields)); + CeedVector vec; + CeedInt numactivein = impl->qfnumactivein, numactiveout = impl->qfnumactiveout; CeedVector *activein = impl->qfactivein; CeedScalar *a, *tmp; - Ceed ceed, ceedparent; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); - ierr = CeedGetOperatorFallbackParentCeed(ceed, &ceedparent); - CeedChkBackend(ierr); + Ceed ceed, ceedparent; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedCallBackend(CeedGetOperatorFallbackParentCeed(ceed, &ceedparent)); ceedparent = ceedparent ? 
ceedparent : ceed; - CeedScalar *edata[2*CEED_FIELD_MAX]; + CeedScalar *edata[2 * CEED_FIELD_MAX]; // Setup - ierr = CeedOperatorSetup_Hip(op); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorSetup_Hip(op)); // Check for identity bool identityqf; - ierr = CeedQFunctionIsIdentity(qf, &identityqf); CeedChkBackend(ierr); - if (identityqf) + CeedCallBackend(CeedQFunctionIsIdentity(qf, &identityqf)); + if (identityqf) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Assembling identity QFunctions not supported"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_BACKEND, "Assembling identity QFunctions not supported"); + // LCOV_EXCL_STOP + } // Input Evecs and Restriction - ierr = CeedOperatorSetupInputs_Hip(numinputfields, qfinputfields, - opinputfields, NULL, true, edata, - impl, request); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorSetupInputs_Hip(numinputfields, qfinputfields, opinputfields, NULL, true, edata, impl, request)); // Count number of active input fields if (!numactivein) { - for (CeedInt i=0; iqvecsin[i], 0.0); CeedChkBackend(ierr); - ierr = CeedVectorGetArray(impl->qvecsin[i], CEED_MEM_DEVICE, &tmp); - CeedChkBackend(ierr); - ierr = CeedRealloc(numactivein + size, &activein); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionFieldGetSize(qfinputfields[i], &size)); + CeedCallBackend(CeedVectorSetValue(impl->qvecsin[i], 0.0)); + CeedCallBackend(CeedVectorGetArray(impl->qvecsin[i], CEED_MEM_DEVICE, &tmp)); + CeedCallBackend(CeedRealloc(numactivein + size, &activein)); for (CeedInt field = 0; field < size; field++) { - q_size = (CeedSize)Q*numelements; - ierr = CeedVectorCreate(ceed, q_size, &activein[numactivein+field]); - CeedChkBackend(ierr); - ierr = CeedVectorSetArray(activein[numactivein+field], CEED_MEM_DEVICE, - CEED_USE_POINTER, &tmp[field*Q*numelements]); - CeedChkBackend(ierr); + q_size = (CeedSize)Q * numelements; + CeedCallBackend(CeedVectorCreate(ceed, q_size, &activein[numactivein + field])); + 
CeedCallBackend(CeedVectorSetArray(activein[numactivein + field], CEED_MEM_DEVICE, CEED_USE_POINTER, &tmp[field * Q * numelements])); } numactivein += size; - ierr = CeedVectorRestoreArray(impl->qvecsin[i], &tmp); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArray(impl->qvecsin[i], &tmp)); } } impl->qfnumactivein = numactivein; - impl->qfactivein = activein; + impl->qfactivein = activein; } // Count number of active output fields if (!numactiveout) { - for (CeedInt i=0; i 1) { - ierr = CeedVectorSetValue(activein[(in+numactivein-1)%numactivein], - 0.0); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorSetValue(activein[(in + numactivein - 1) % numactivein], 0.0)); } // Set Outputs - for (CeedInt out=0; outqvecsout[out], CEED_MEM_DEVICE, - CEED_USE_POINTER, a); CeedChkBackend(ierr); - ierr = CeedQFunctionFieldGetSize(qfoutputfields[out], &size); - CeedChkBackend(ierr); - a += size*Q*numelements; // Advance the pointer by the size of the output + CeedCallBackend(CeedVectorSetArray(impl->qvecsout[out], CEED_MEM_DEVICE, CEED_USE_POINTER, a)); + CeedCallBackend(CeedQFunctionFieldGetSize(qfoutputfields[out], &size)); + a += size * Q * numelements; // Advance the pointer by the size of the output } } // Apply QFunction - ierr = CeedQFunctionApply(qf, Q*numelements, impl->qvecsin, impl->qvecsout); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionApply(qf, Q * numelements, impl->qvecsin, impl->qvecsout)); } // Un-set output Qvecs to prevent accidental overwrite of Assembled - for (CeedInt out=0; outqvecsout[out], CEED_MEM_DEVICE, NULL); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorTakeArray(impl->qvecsout[out], CEED_MEM_DEVICE, NULL)); } } // Restore input arrays - ierr = CeedOperatorRestoreInputs_Hip(numinputfields, qfinputfields, - opinputfields, true, edata, impl); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorRestoreInputs_Hip(numinputfields, qfinputfields, opinputfields, true, edata, impl)); // Restore output - ierr = 
CeedVectorRestoreArray(*assembled, &a); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArray(*assembled, &a)); return CEED_ERROR_SUCCESS; } @@ -701,58 +578,47 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, //------------------------------------------------------------------------------ // Assemble Linear QFunction //------------------------------------------------------------------------------ -static int CeedOperatorLinearAssembleQFunction_Hip(CeedOperator op, - CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) { - return CeedOperatorLinearAssembleQFunctionCore_Hip(op, true, assembled, rstr, - request); +static int CeedOperatorLinearAssembleQFunction_Hip(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) { + return CeedOperatorLinearAssembleQFunctionCore_Hip(op, true, assembled, rstr, request); } //------------------------------------------------------------------------------ // Assemble Linear QFunction //------------------------------------------------------------------------------ -static int CeedOperatorLinearAssembleQFunctionUpdate_Hip(CeedOperator op, - CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) { - return CeedOperatorLinearAssembleQFunctionCore_Hip(op, false, &assembled, &rstr, - request); +static int CeedOperatorLinearAssembleQFunctionUpdate_Hip(CeedOperator op, CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) { + return CeedOperatorLinearAssembleQFunctionCore_Hip(op, false, &assembled, &rstr, request); } //------------------------------------------------------------------------------ // Create point block restriction //------------------------------------------------------------------------------ -static int CreatePBRestriction(CeedElemRestriction rstr, - CeedElemRestriction *pbRstr) { - int ierr; +static int CreatePBRestriction(CeedElemRestriction rstr, CeedElemRestriction *pbRstr) { Ceed ceed; - ierr = 
CeedElemRestrictionGetCeed(rstr, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); const CeedInt *offsets; - ierr = CeedElemRestrictionGetOffsets(rstr, CEED_MEM_HOST, &offsets); - CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetOffsets(rstr, CEED_MEM_HOST, &offsets)); // Expand offsets - CeedInt nelem, ncomp, elemsize, compstride, *pbOffsets; + CeedInt nelem, ncomp, elemsize, compstride, *pbOffsets; CeedSize l_size; - ierr = CeedElemRestrictionGetNumElements(rstr, &nelem); CeedChkBackend(ierr); - ierr = CeedElemRestrictionGetNumComponents(rstr, &ncomp); CeedChkBackend(ierr); - ierr = CeedElemRestrictionGetElementSize(rstr, &elemsize); CeedChkBackend(ierr); - ierr = CeedElemRestrictionGetCompStride(rstr, &compstride); - CeedChkBackend(ierr); - ierr = CeedElemRestrictionGetLVectorSize(rstr, &l_size); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &nelem)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &ncomp)); + CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elemsize)); + CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &compstride)); + CeedCallBackend(CeedElemRestrictionGetLVectorSize(rstr, &l_size)); CeedInt shift = ncomp; - if (compstride != 1) - shift *= ncomp; - ierr = CeedCalloc(nelem*elemsize, &pbOffsets); CeedChkBackend(ierr); - for (CeedInt i = 0; i < nelem*elemsize; i++) { - pbOffsets[i] = offsets[i]*shift; + if (compstride != 1) shift *= ncomp; + CeedCallBackend(CeedCalloc(nelem * elemsize, &pbOffsets)); + for (CeedInt i = 0; i < nelem * elemsize; i++) { + pbOffsets[i] = offsets[i] * shift; } // Create new restriction - ierr = CeedElemRestrictionCreate(ceed, nelem, elemsize, ncomp*ncomp, 1, - l_size * ncomp, CEED_MEM_HOST, - CEED_OWN_POINTER, pbOffsets, pbRstr); - CeedChkBackend(ierr); + CeedCallBackend( + CeedElemRestrictionCreate(ceed, nelem, elemsize, ncomp * ncomp, 1, l_size * ncomp, CEED_MEM_HOST, CEED_OWN_POINTER, pbOffsets, pbRstr)); 
// Cleanup - ierr = CeedElemRestrictionRestoreOffsets(rstr, &offsets); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionRestoreOffsets(rstr, &offsets)); return CEED_ERROR_SUCCESS; } @@ -760,209 +626,172 @@ static int CreatePBRestriction(CeedElemRestriction rstr, //------------------------------------------------------------------------------ // Assemble diagonal setup //------------------------------------------------------------------------------ -static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op, - const bool pointBlock) { - int ierr; +static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op, const bool pointBlock) { Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedQFunction qf; - ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); CeedInt numinputfields, numoutputfields; - ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields)); // Determine active input basis - CeedOperatorField *opfields; + CeedOperatorField *opfields; CeedQFunctionField *qffields; - ierr = CeedOperatorGetFields(op, NULL, &opfields, NULL, NULL); - CeedChkBackend(ierr); - ierr = CeedQFunctionGetFields(qf, NULL, &qffields, NULL, NULL); - CeedChkBackend(ierr); - CeedInt numemodein = 0, ncomp = 0, dim = 1; - CeedEvalMode *emodein = NULL; - CeedBasis basisin = NULL; - CeedElemRestriction rstrin = NULL; + CeedCallBackend(CeedOperatorGetFields(op, NULL, &opfields, NULL, NULL)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qffields, NULL, NULL)); + CeedInt numemodein = 0, ncomp = 0, dim = 1; + CeedEvalMode *emodein = NULL; + CeedBasis basisin = NULL; + CeedElemRestriction rstrin = NULL; for (CeedInt i = 0; i < numinputfields; i++) { CeedVector vec; - ierr = 
CeedOperatorFieldGetVector(opfields[i], &vec); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorFieldGetVector(opfields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { CeedElemRestriction rstr; - ierr = CeedOperatorFieldGetBasis(opfields[i], &basisin); CeedChkBackend(ierr); - ierr = CeedBasisGetNumComponents(basisin, &ncomp); CeedChkBackend(ierr); - ierr = CeedBasisGetDimension(basisin, &dim); CeedChkBackend(ierr); - ierr = CeedOperatorFieldGetElemRestriction(opfields[i], &rstr); - CeedChkBackend(ierr); - if (rstrin && rstrin != rstr) + CeedCallBackend(CeedOperatorFieldGetBasis(opfields[i], &basisin)); + CeedCallBackend(CeedBasisGetNumComponents(basisin, &ncomp)); + CeedCallBackend(CeedBasisGetDimension(basisin, &dim)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(opfields[i], &rstr)); + if (rstrin && rstrin != rstr) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Backend does not implement multi-field non-composite operator diagonal assembly"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement multi-field non-composite operator diagonal assembly"); + // LCOV_EXCL_STOP + } rstrin = rstr; CeedEvalMode emode; - ierr = CeedQFunctionFieldGetEvalMode(qffields[i], &emode); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qffields[i], &emode)); switch (emode) { - case CEED_EVAL_NONE: - case CEED_EVAL_INTERP: - ierr = CeedRealloc(numemodein + 1, &emodein); CeedChkBackend(ierr); - emodein[numemodein] = emode; - numemodein += 1; - break; - case CEED_EVAL_GRAD: - ierr = CeedRealloc(numemodein + dim, &emodein); CeedChkBackend(ierr); - for (CeedInt d = 0; d < dim; d++) - emodein[numemodein+d] = emode; - numemodein += dim; - break; - case CEED_EVAL_WEIGHT: - case CEED_EVAL_DIV: - case CEED_EVAL_CURL: - break; // Caught by QF Assembly + case CEED_EVAL_NONE: + case CEED_EVAL_INTERP: + CeedCallBackend(CeedRealloc(numemodein + 1, &emodein)); + emodein[numemodein] = emode; + numemodein += 
1; + break; + case CEED_EVAL_GRAD: + CeedCallBackend(CeedRealloc(numemodein + dim, &emodein)); + for (CeedInt d = 0; d < dim; d++) emodein[numemodein + d] = emode; + numemodein += dim; + break; + case CEED_EVAL_WEIGHT: + case CEED_EVAL_DIV: + case CEED_EVAL_CURL: + break; // Caught by QF Assembly } } } // Determine active output basis - ierr = CeedOperatorGetFields(op, NULL, NULL, NULL, &opfields); - CeedChkBackend(ierr); - ierr = CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qffields); - CeedChkBackend(ierr); - CeedInt numemodeout = 0; - CeedEvalMode *emodeout = NULL; - CeedBasis basisout = NULL; - CeedElemRestriction rstrout = NULL; + CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &opfields)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qffields)); + CeedInt numemodeout = 0; + CeedEvalMode *emodeout = NULL; + CeedBasis basisout = NULL; + CeedElemRestriction rstrout = NULL; for (CeedInt i = 0; i < numoutputfields; i++) { CeedVector vec; - ierr = CeedOperatorFieldGetVector(opfields[i], &vec); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorFieldGetVector(opfields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { CeedElemRestriction rstr; - ierr = CeedOperatorFieldGetBasis(opfields[i], &basisout); CeedChkBackend(ierr); - ierr = CeedOperatorFieldGetElemRestriction(opfields[i], &rstr); - CeedChkBackend(ierr); - if (rstrout && rstrout != rstr) + CeedCallBackend(CeedOperatorFieldGetBasis(opfields[i], &basisout)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(opfields[i], &rstr)); + if (rstrout && rstrout != rstr) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Backend does not implement multi-field non-composite operator diagonal assembly"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement multi-field non-composite operator diagonal assembly"); + // LCOV_EXCL_STOP + } rstrout = rstr; CeedEvalMode emode; - ierr = CeedQFunctionFieldGetEvalMode(qffields[i], &emode); 
CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qffields[i], &emode)); switch (emode) { - case CEED_EVAL_NONE: - case CEED_EVAL_INTERP: - ierr = CeedRealloc(numemodeout + 1, &emodeout); CeedChkBackend(ierr); - emodeout[numemodeout] = emode; - numemodeout += 1; - break; - case CEED_EVAL_GRAD: - ierr = CeedRealloc(numemodeout + dim, &emodeout); CeedChkBackend(ierr); - for (CeedInt d = 0; d < dim; d++) - emodeout[numemodeout+d] = emode; - numemodeout += dim; - break; - case CEED_EVAL_WEIGHT: - case CEED_EVAL_DIV: - case CEED_EVAL_CURL: - break; // Caught by QF Assembly + case CEED_EVAL_NONE: + case CEED_EVAL_INTERP: + CeedCallBackend(CeedRealloc(numemodeout + 1, &emodeout)); + emodeout[numemodeout] = emode; + numemodeout += 1; + break; + case CEED_EVAL_GRAD: + CeedCallBackend(CeedRealloc(numemodeout + dim, &emodeout)); + for (CeedInt d = 0; d < dim; d++) emodeout[numemodeout + d] = emode; + numemodeout += dim; + break; + case CEED_EVAL_WEIGHT: + case CEED_EVAL_DIV: + case CEED_EVAL_CURL: + break; // Caught by QF Assembly } } } // Operator data struct CeedOperator_Hip *impl; - ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); - ierr = CeedCalloc(1, &impl->diag); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetData(op, &impl)); + CeedCallBackend(CeedCalloc(1, &impl->diag)); CeedOperatorDiag_Hip *diag = impl->diag; - diag->basisin = basisin; - diag->basisout = basisout; - diag->h_emodein = emodein; - diag->h_emodeout = emodeout; - diag->numemodein = numemodein; - diag->numemodeout = numemodeout; + diag->basisin = basisin; + diag->basisout = basisout; + diag->h_emodein = emodein; + diag->h_emodeout = emodeout; + diag->numemodein = numemodein; + diag->numemodeout = numemodeout; // Assemble kernel char *diagonal_kernel_path, *diagonal_kernel_source; - ierr = CeedGetJitAbsolutePath(ceed, - "ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h", - &diagonal_kernel_path); CeedChkBackend(ierr); + 
CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h", &diagonal_kernel_path)); CeedDebug256(ceed, 2, "----- Loading Diagonal Assembly Kernel Source -----\n"); - ierr = CeedLoadSourceToBuffer(ceed, diagonal_kernel_path, - &diagonal_kernel_source); - CeedChkBackend(ierr); - CeedDebug256(ceed, 2, - "----- Loading Diagonal Assembly Source Complete! -----\n"); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, diagonal_kernel_path, &diagonal_kernel_source)); + CeedDebug256(ceed, 2, "----- Loading Diagonal Assembly Source Complete! -----\n"); CeedInt nnodes, nqpts; - ierr = CeedBasisGetNumNodes(basisin, &nnodes); CeedChkBackend(ierr); - ierr = CeedBasisGetNumQuadraturePoints(basisin, &nqpts); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetNumNodes(basisin, &nnodes)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints(basisin, &nqpts)); diag->nnodes = nnodes; - ierr = CeedCompileHip(ceed, diagonal_kernel_source, &diag->module, 5, - "NUMEMODEIN", numemodein, - "NUMEMODEOUT", numemodeout, - "NNODES", nnodes, - "NQPTS", nqpts, - "NCOMP", ncomp - ); CeedChk_Hip(ceed, ierr); - ierr = CeedGetKernelHip(ceed, diag->module, "linearDiagonal", - &diag->linearDiagonal); CeedChk_Hip(ceed, ierr); - ierr = CeedGetKernelHip(ceed, diag->module, "linearPointBlockDiagonal", - &diag->linearPointBlock); - CeedChk_Hip(ceed, ierr); - ierr = CeedFree(&diagonal_kernel_path); CeedChkBackend(ierr); - ierr = CeedFree(&diagonal_kernel_source); CeedChkBackend(ierr); + CeedCallBackend(CeedCompileHip(ceed, diagonal_kernel_source, &diag->module, 5, "NUMEMODEIN", numemodein, "NUMEMODEOUT", numemodeout, "NNODES", + nnodes, "NQPTS", nqpts, "NCOMP", ncomp)); + CeedCallBackend(CeedGetKernelHip(ceed, diag->module, "linearDiagonal", &diag->linearDiagonal)); + CeedCallBackend(CeedGetKernelHip(ceed, diag->module, "linearPointBlockDiagonal", &diag->linearPointBlock)); + CeedCallBackend(CeedFree(&diagonal_kernel_path)); + 
CeedCallBackend(CeedFree(&diagonal_kernel_source)); // Basis matrices - const CeedInt qBytes = nqpts * sizeof(CeedScalar); - const CeedInt iBytes = qBytes * nnodes; - const CeedInt gBytes = qBytes * nnodes * dim; - const CeedInt eBytes = sizeof(CeedEvalMode); + const CeedInt qBytes = nqpts * sizeof(CeedScalar); + const CeedInt iBytes = qBytes * nnodes; + const CeedInt gBytes = qBytes * nnodes * dim; + const CeedInt eBytes = sizeof(CeedEvalMode); const CeedScalar *interpin, *interpout, *gradin, *gradout; // CEED_EVAL_NONE CeedScalar *identity = NULL; - bool evalNone = false; - for (CeedInt i=0; id_identity, iBytes); CeedChk_Hip(ceed, ierr); - ierr = hipMemcpy(diag->d_identity, identity, iBytes, - hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); + CeedCallBackend(CeedCalloc(nqpts * nnodes, &identity)); + for (CeedInt i = 0; i < (nnodes < nqpts ? nnodes : nqpts); i++) identity[i * nnodes + i] = 1.0; + CeedCallHip(ceed, hipMalloc((void **)&diag->d_identity, iBytes)); + CeedCallHip(ceed, hipMemcpy(diag->d_identity, identity, iBytes, hipMemcpyHostToDevice)); } // CEED_EVAL_INTERP - ierr = CeedBasisGetInterp(basisin, &interpin); CeedChkBackend(ierr); - ierr = hipMalloc((void **)&diag->d_interpin, iBytes); CeedChk_Hip(ceed, ierr); - ierr = hipMemcpy(diag->d_interpin, interpin, iBytes, - hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); - ierr = CeedBasisGetInterp(basisout, &interpout); CeedChkBackend(ierr); - ierr = hipMalloc((void **)&diag->d_interpout, iBytes); CeedChk_Hip(ceed, ierr); - ierr = hipMemcpy(diag->d_interpout, interpout, iBytes, - hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); + CeedCallBackend(CeedBasisGetInterp(basisin, &interpin)); + CeedCallHip(ceed, hipMalloc((void **)&diag->d_interpin, iBytes)); + CeedCallHip(ceed, hipMemcpy(diag->d_interpin, interpin, iBytes, hipMemcpyHostToDevice)); + CeedCallBackend(CeedBasisGetInterp(basisout, &interpout)); + CeedCallHip(ceed, hipMalloc((void **)&diag->d_interpout, iBytes)); + CeedCallHip(ceed, 
hipMemcpy(diag->d_interpout, interpout, iBytes, hipMemcpyHostToDevice)); // CEED_EVAL_GRAD - ierr = CeedBasisGetGrad(basisin, &gradin); CeedChkBackend(ierr); - ierr = hipMalloc((void **)&diag->d_gradin, gBytes); CeedChk_Hip(ceed, ierr); - ierr = hipMemcpy(diag->d_gradin, gradin, gBytes, - hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); - ierr = CeedBasisGetGrad(basisout, &gradout); CeedChkBackend(ierr); - ierr = hipMalloc((void **)&diag->d_gradout, gBytes); CeedChk_Hip(ceed, ierr); - ierr = hipMemcpy(diag->d_gradout, gradout, gBytes, - hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); + CeedCallBackend(CeedBasisGetGrad(basisin, &gradin)); + CeedCallHip(ceed, hipMalloc((void **)&diag->d_gradin, gBytes)); + CeedCallHip(ceed, hipMemcpy(diag->d_gradin, gradin, gBytes, hipMemcpyHostToDevice)); + CeedCallBackend(CeedBasisGetGrad(basisout, &gradout)); + CeedCallHip(ceed, hipMalloc((void **)&diag->d_gradout, gBytes)); + CeedCallHip(ceed, hipMemcpy(diag->d_gradout, gradout, gBytes, hipMemcpyHostToDevice)); // Arrays of emodes - ierr = hipMalloc((void **)&diag->d_emodein, numemodein * eBytes); - CeedChk_Hip(ceed, ierr); - ierr = hipMemcpy(diag->d_emodein, emodein, numemodein * eBytes, - hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); - ierr = hipMalloc((void **)&diag->d_emodeout, numemodeout * eBytes); - CeedChk_Hip(ceed, ierr); - ierr = hipMemcpy(diag->d_emodeout, emodeout, numemodeout * eBytes, - hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); + CeedCallHip(ceed, hipMalloc((void **)&diag->d_emodein, numemodein * eBytes)); + CeedCallHip(ceed, hipMemcpy(diag->d_emodein, emodein, numemodein * eBytes, hipMemcpyHostToDevice)); + CeedCallHip(ceed, hipMalloc((void **)&diag->d_emodeout, numemodeout * eBytes)); + CeedCallHip(ceed, hipMemcpy(diag->d_emodeout, emodeout, numemodeout * eBytes, hipMemcpyHostToDevice)); // Restriction diag->diagrstr = rstrout; @@ -973,33 +802,27 @@ static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op, 
//------------------------------------------------------------------------------ // Assemble diagonal common code //------------------------------------------------------------------------------ -static inline int CeedOperatorAssembleDiagonalCore_Hip(CeedOperator op, - CeedVector assembled, CeedRequest *request, const bool pointBlock) { - int ierr; +static inline int CeedOperatorAssembleDiagonalCore_Hip(CeedOperator op, CeedVector assembled, CeedRequest *request, const bool pointBlock) { Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedOperator_Hip *impl; - ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetData(op, &impl)); // Assemble QFunction - CeedVector assembledqf; + CeedVector assembledqf; CeedElemRestriction rstr; - ierr = CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembledqf, - &rstr, request); CeedChkBackend(ierr); - ierr = CeedElemRestrictionDestroy(&rstr); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembledqf, &rstr, request)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr)); // Setup - if (!impl->diag) { - ierr = CeedOperatorAssembleDiagonalSetup_Hip(op, pointBlock); - CeedChkBackend(ierr); - } + if (!impl->diag) CeedCallBackend(CeedOperatorAssembleDiagonalSetup_Hip(op, pointBlock)); CeedOperatorDiag_Hip *diag = impl->diag; assert(diag != NULL); // Restriction if (pointBlock && !diag->pbdiagrstr) { CeedElemRestriction pbdiagrstr; - ierr = CreatePBRestriction(diag->diagrstr, &pbdiagrstr); CeedChkBackend(ierr); + CeedCallBackend(CreatePBRestriction(diag->diagrstr, &pbdiagrstr)); diag->pbdiagrstr = pbdiagrstr; } CeedElemRestriction diagrstr = pointBlock ? diag->pbdiagrstr : diag->diagrstr; @@ -1008,55 +831,40 @@ static inline int CeedOperatorAssembleDiagonalCore_Hip(CeedOperator op, CeedVector elemdiag = pointBlock ? 
diag->pbelemdiag : diag->elemdiag; if (!elemdiag) { // Element diagonal vector - ierr = CeedElemRestrictionCreateVector(diagrstr, NULL, &elemdiag); - CeedChkBackend(ierr); - if (pointBlock) - diag->pbelemdiag = elemdiag; - else - diag->elemdiag = elemdiag; + CeedCallBackend(CeedElemRestrictionCreateVector(diagrstr, NULL, &elemdiag)); + if (pointBlock) diag->pbelemdiag = elemdiag; + else diag->elemdiag = elemdiag; } - ierr = CeedVectorSetValue(elemdiag, 0.0); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorSetValue(elemdiag, 0.0)); // Assemble element operator diagonals - CeedScalar *elemdiagarray; + CeedScalar *elemdiagarray; const CeedScalar *assembledqfarray; - ierr = CeedVectorGetArray(elemdiag, CEED_MEM_DEVICE, &elemdiagarray); - CeedChkBackend(ierr); - ierr = CeedVectorGetArrayRead(assembledqf, CEED_MEM_DEVICE, &assembledqfarray); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArray(elemdiag, CEED_MEM_DEVICE, &elemdiagarray)); + CeedCallBackend(CeedVectorGetArrayRead(assembledqf, CEED_MEM_DEVICE, &assembledqfarray)); CeedInt nelem; - ierr = CeedElemRestrictionGetNumElements(diagrstr, &nelem); - CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetNumElements(diagrstr, &nelem)); // Compute the diagonal of B^T D B - int elemsPerBlock = 1; - int grid = nelem/elemsPerBlock+((nelem/elemsPerBlock*elemsPerBlockd_identity, - &diag->d_interpin, &diag->d_gradin, &diag->d_interpout, - &diag->d_gradout, &diag->d_emodein, &diag->d_emodeout, - &assembledqfarray, &elemdiagarray - }; + int elemsPerBlock = 1; + int grid = nelem / elemsPerBlock + ((nelem / elemsPerBlock * elemsPerBlock < nelem) ? 
1 : 0); + void *args[] = {(void *)&nelem, &diag->d_identity, &diag->d_interpin, &diag->d_gradin, &diag->d_interpout, + &diag->d_gradout, &diag->d_emodein, &diag->d_emodeout, &assembledqfarray, &elemdiagarray}; if (pointBlock) { - ierr = CeedRunKernelDimHip(ceed, diag->linearPointBlock, grid, - diag->nnodes, 1, elemsPerBlock, args); - CeedChkBackend(ierr); + CeedCallBackend(CeedRunKernelDimHip(ceed, diag->linearPointBlock, grid, diag->nnodes, 1, elemsPerBlock, args)); } else { - ierr = CeedRunKernelDimHip(ceed, diag->linearDiagonal, grid, - diag->nnodes, 1, elemsPerBlock, args); - CeedChkBackend(ierr); + CeedCallBackend(CeedRunKernelDimHip(ceed, diag->linearDiagonal, grid, diag->nnodes, 1, elemsPerBlock, args)); } // Restore arrays - ierr = CeedVectorRestoreArray(elemdiag, &elemdiagarray); CeedChkBackend(ierr); - ierr = CeedVectorRestoreArrayRead(assembledqf, &assembledqfarray); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArray(elemdiag, &elemdiagarray)); + CeedCallBackend(CeedVectorRestoreArrayRead(assembledqf, &assembledqfarray)); // Assemble local operator diagonal - ierr = CeedElemRestrictionApply(diagrstr, CEED_TRANSPOSE, elemdiag, - assembled, request); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionApply(diagrstr, CEED_TRANSPOSE, elemdiag, assembled, request)); // Cleanup - ierr = CeedVectorDestroy(&assembledqf); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorDestroy(&assembledqf)); return CEED_ERROR_SUCCESS; } @@ -1064,20 +872,16 @@ static inline int CeedOperatorAssembleDiagonalCore_Hip(CeedOperator op, //------------------------------------------------------------------------------ // Assemble Linear Diagonal //------------------------------------------------------------------------------ -static int CeedOperatorLinearAssembleAddDiagonal_Hip(CeedOperator op, - CeedVector assembled, CeedRequest *request) { - int ierr = CeedOperatorAssembleDiagonalCore_Hip(op, assembled, request, false); - CeedChkBackend(ierr); +static int 
CeedOperatorLinearAssembleAddDiagonal_Hip(CeedOperator op, CeedVector assembled, CeedRequest *request) { + CeedCallBackend(CeedOperatorAssembleDiagonalCore_Hip(op, assembled, request, false)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Assemble Linear Point Block Diagonal //------------------------------------------------------------------------------ -static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Hip(CeedOperator op, - CeedVector assembled, CeedRequest *request) { - int ierr = CeedOperatorAssembleDiagonalCore_Hip(op, assembled, request, true); - CeedChkBackend(ierr); +static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Hip(CeedOperator op, CeedVector assembled, CeedRequest *request) { + CeedCallBackend(CeedOperatorAssembleDiagonalCore_Hip(op, assembled, request, true)); return CEED_ERROR_SUCCESS; } @@ -1085,59 +889,52 @@ static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Hip(CeedOperator op, // Single operator assembly setup //------------------------------------------------------------------------------ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op) { - int ierr; Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedOperator_Hip *impl; - ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetData(op, &impl)); // Get intput and output fields - CeedInt num_input_fields, num_output_fields; + CeedInt num_input_fields, num_output_fields; CeedOperatorField *input_fields; CeedOperatorField *output_fields; - ierr = CeedOperatorGetFields(op, &num_input_fields, &input_fields, - &num_output_fields, &output_fields); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &input_fields, &num_output_fields, &output_fields)); // Determine active input basis eval mode CeedQFunction qf; - ierr = CeedOperatorGetQFunction(op, 
&qf); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); CeedQFunctionField *qf_fields; - ierr = CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL)); // Note that the kernel will treat each dimension of a gradient action separately; // i.e., when an active input has a CEED_EVAL_GRAD mode, num_emode_in will increment // by dim. However, for the purposes of loading the B matrices, it will be treated // as one mode, and we will load/copy the entire gradient matrix at once, so // num_B_in_mats_to_load will be incremented by 1. - CeedInt num_emode_in = 0, dim = 1, num_B_in_mats_to_load = 0, size_B_in = 0; - CeedEvalMode *eval_mode_in = NULL; //will be of size num_B_in_mats_load - CeedBasis basis_in = NULL; - CeedInt nqpts = 0, esize = 0; + CeedInt num_emode_in = 0, dim = 1, num_B_in_mats_to_load = 0, size_B_in = 0; + CeedEvalMode *eval_mode_in = NULL; // will be of size num_B_in_mats_load + CeedBasis basis_in = NULL; + CeedInt nqpts = 0, esize = 0; CeedElemRestriction rstr_in = NULL; - for (CeedInt i=0; iasmb); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(1, &impl->asmb)); CeedOperatorAssemble_Hip *asmb = impl->asmb; - asmb->nelem = nelem; + asmb->nelem = nelem; // Compile kernels - int elemsPerBlock = 1; + int elemsPerBlock = 1; asmb->elemsPerBlock = elemsPerBlock; - CeedInt block_size = esize * esize * elemsPerBlock; - char *assembly_kernel_path, *assembly_kernel_source; - ierr = CeedGetJitAbsolutePath(ceed, - "ceed/jit-source/hip/hip-ref-operator-assemble.h", - &assembly_kernel_path); CeedChkBackend(ierr); + CeedInt block_size = esize * esize * elemsPerBlock; + char *assembly_kernel_path, *assembly_kernel_source; + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-operator-assemble.h", &assembly_kernel_path)); CeedDebug256(ceed, 2, "----- Loading Assembly Kernel Source -----\n"); - ierr = 
CeedLoadSourceToBuffer(ceed, assembly_kernel_path, - &assembly_kernel_source); - CeedChkBackend(ierr); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, assembly_kernel_path, &assembly_kernel_source)); CeedDebug256(ceed, 2, "----- Loading Assembly Source Complete! -----\n"); bool fallback = block_size > 1024; - if (fallback) { // Use fallback kernel with 1D threadblock - block_size = esize * elemsPerBlock; + if (fallback) { // Use fallback kernel with 1D threadblock + block_size = esize * elemsPerBlock; asmb->block_size_x = esize; asmb->block_size_y = 1; } else { // Use kernel with 2D threadblock asmb->block_size_x = esize; asmb->block_size_y = esize; } - ierr = CeedCompileHip(ceed, assembly_kernel_source, &asmb->module, 7, - "NELEM", nelem, - "NUMEMODEIN", num_emode_in, - "NUMEMODEOUT", num_emode_out, - "NQPTS", nqpts, - "NNODES", esize, - "BLOCK_SIZE", block_size, - "NCOMP", ncomp - ); CeedChk_Hip(ceed, ierr); - ierr = CeedGetKernelHip(ceed, asmb->module, - fallback ? "linearAssembleFallback" : "linearAssemble", - &asmb->linearAssemble); CeedChk_Hip(ceed, ierr); - ierr = CeedFree(&assembly_kernel_path); CeedChkBackend(ierr); - ierr = CeedFree(&assembly_kernel_source); CeedChkBackend(ierr); + CeedCallBackend(CeedCompileHip(ceed, assembly_kernel_source, &asmb->module, 7, "NELEM", nelem, "NUMEMODEIN", num_emode_in, "NUMEMODEOUT", + num_emode_out, "NQPTS", nqpts, "NNODES", esize, "BLOCK_SIZE", block_size, "NCOMP", ncomp)); + CeedCallBackend(CeedGetKernelHip(ceed, asmb->module, fallback ? 
"linearAssembleFallback" : "linearAssemble", &asmb->linearAssemble)); + CeedCallBackend(CeedFree(&assembly_kernel_path)); + CeedCallBackend(CeedFree(&assembly_kernel_source)); // Build 'full' B matrices (not 1D arrays used for tensor-product matrices) const CeedScalar *interp_in, *grad_in; - ierr = CeedBasisGetInterp(basis_in, &interp_in); CeedChkBackend(ierr); - ierr = CeedBasisGetGrad(basis_in, &grad_in); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetInterp(basis_in, &interp_in)); + CeedCallBackend(CeedBasisGetGrad(basis_in, &grad_in)); // Load into B_in, in order that they will be used in eval_mode - const CeedInt inBytes = size_B_in * sizeof(CeedScalar); - CeedInt mat_start = 0; - ierr = hipMalloc((void **) &asmb->d_B_in, inBytes); CeedChk_Hip(ceed, ierr); + const CeedInt inBytes = size_B_in * sizeof(CeedScalar); + CeedInt mat_start = 0; + CeedCallHip(ceed, hipMalloc((void **)&asmb->d_B_in, inBytes)); for (int i = 0; i < num_B_in_mats_to_load; i++) { CeedEvalMode eval_mode = eval_mode_in[i]; if (eval_mode == CEED_EVAL_INTERP) { - ierr = hipMemcpy(&asmb->d_B_in[mat_start], interp_in, - esize * nqpts * sizeof(CeedScalar), - hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); + CeedCallHip(ceed, hipMemcpy(&asmb->d_B_in[mat_start], interp_in, esize * nqpts * sizeof(CeedScalar), hipMemcpyHostToDevice)); mat_start += esize * nqpts; } else if (eval_mode == CEED_EVAL_GRAD) { - ierr = hipMemcpy(&asmb->d_B_in[mat_start], grad_in, - dim * esize * nqpts * sizeof(CeedScalar), - hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); + CeedCallHip(ceed, hipMemcpy(&asmb->d_B_in[mat_start], grad_in, dim * esize * nqpts * sizeof(CeedScalar), hipMemcpyHostToDevice)); mat_start += dim * esize * nqpts; } } @@ -1264,27 +1038,23 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op) { // for now if (basis_out == basis_in) { interp_out = interp_in; - grad_out = grad_in; + grad_out = grad_in; } else { - ierr = CeedBasisGetInterp(basis_out, &interp_out); CeedChkBackend(ierr); - 
ierr = CeedBasisGetGrad(basis_out, &grad_out); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetInterp(basis_out, &interp_out)); + CeedCallBackend(CeedBasisGetGrad(basis_out, &grad_out)); } // Load into B_out, in order that they will be used in eval_mode const CeedInt outBytes = size_B_out * sizeof(CeedScalar); - mat_start = 0; - ierr = hipMalloc((void **) &asmb->d_B_out, outBytes); CeedChk_Hip(ceed, ierr); + mat_start = 0; + CeedCallHip(ceed, hipMalloc((void **)&asmb->d_B_out, outBytes)); for (int i = 0; i < num_B_out_mats_to_load; i++) { CeedEvalMode eval_mode = eval_mode_out[i]; if (eval_mode == CEED_EVAL_INTERP) { - ierr = hipMemcpy(&asmb->d_B_out[mat_start], interp_out, - esize * nqpts * sizeof(CeedScalar), - hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); + CeedCallHip(ceed, hipMemcpy(&asmb->d_B_out[mat_start], interp_out, esize * nqpts * sizeof(CeedScalar), hipMemcpyHostToDevice)); mat_start += esize * nqpts; } else if (eval_mode == CEED_EVAL_GRAD) { - ierr = hipMemcpy(&asmb->d_B_out[mat_start], grad_out, - dim * esize * nqpts * sizeof(CeedScalar), - hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); + CeedCallHip(ceed, hipMemcpy(&asmb->d_B_out[mat_start], grad_out, dim * esize * nqpts * sizeof(CeedScalar), hipMemcpyHostToDevice)); mat_start += dim * esize * nqpts; } } @@ -1300,57 +1070,43 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op) { // modes). 
// TODO: allow multiple active input restrictions/basis objects //------------------------------------------------------------------------------ -static int CeedSingleOperatorAssemble_Hip(CeedOperator op, CeedInt offset, - CeedVector values) { - - int ierr; +static int CeedSingleOperatorAssemble_Hip(CeedOperator op, CeedInt offset, CeedVector values) { Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedOperator_Hip *impl; - ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetData(op, &impl)); // Setup if (!impl->asmb) { - ierr = CeedSingleOperatorAssembleSetup_Hip(op); - CeedChkBackend(ierr); + CeedCallBackend(CeedSingleOperatorAssembleSetup_Hip(op)); assert(impl->asmb != NULL); } // Assemble QFunction - CeedVector assembled_qf; + CeedVector assembled_qf; CeedElemRestriction rstr_q; - ierr = CeedOperatorLinearAssembleQFunctionBuildOrUpdate( - op, &assembled_qf, &rstr_q, CEED_REQUEST_IMMEDIATE); CeedChkBackend(ierr); - ierr = CeedElemRestrictionDestroy(&rstr_q); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &rstr_q, CEED_REQUEST_IMMEDIATE)); + CeedCallBackend(CeedElemRestrictionDestroy(&rstr_q)); CeedScalar *values_array; - ierr = CeedVectorGetArrayWrite(values, CEED_MEM_DEVICE, &values_array); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayWrite(values, CEED_MEM_DEVICE, &values_array)); values_array += offset; const CeedScalar *qf_array; - ierr = CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &qf_array); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &qf_array)); // Compute B^T D B - const CeedInt nelem = impl->asmb->nelem; // to satisfy clang-tidy + const CeedInt nelem = impl->asmb->nelem; // to satisfy clang-tidy const CeedInt elemsPerBlock = impl->asmb->elemsPerBlock; - const CeedInt grid = nelem/elemsPerBlock+(( - 
nelem/elemsPerBlock*elemsPerBlockasmb->d_B_in, &impl->asmb->d_B_out, - &qf_array, &values_array - }; - ierr = CeedRunKernelDimHip(ceed, impl->asmb->linearAssemble, grid, - impl->asmb->block_size_x, impl->asmb->block_size_y, - elemsPerBlock, args); - CeedChkBackend(ierr); - + const CeedInt grid = nelem / elemsPerBlock + ((nelem / elemsPerBlock * elemsPerBlock < nelem) ? 1 : 0); + void *args[] = {&impl->asmb->d_B_in, &impl->asmb->d_B_out, &qf_array, &values_array}; + CeedCallBackend( + CeedRunKernelDimHip(ceed, impl->asmb->linearAssemble, grid, impl->asmb->block_size_x, impl->asmb->block_size_y, elemsPerBlock, args)); // Restore arrays - ierr = CeedVectorRestoreArray(values, &values_array); CeedChkBackend(ierr); - ierr = CeedVectorRestoreArrayRead(assembled_qf, &qf_array); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArray(values, &values_array)); + CeedCallBackend(CeedVectorRestoreArrayRead(assembled_qf, &qf_array)); // Cleanup - ierr = CeedVectorDestroy(&assembled_qf); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorDestroy(&assembled_qf)); return CEED_ERROR_SUCCESS; } @@ -1359,35 +1115,21 @@ static int CeedSingleOperatorAssemble_Hip(CeedOperator op, CeedInt offset, // Create operator //------------------------------------------------------------------------------ int CeedOperatorCreate_Hip(CeedOperator op) { - int ierr; Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedOperator_Hip *impl; - ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr); - ierr = CeedOperatorSetData(op, impl); CeedChkBackend(ierr); - - ierr = CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", - CeedOperatorLinearAssembleQFunction_Hip); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Operator", op, - "LinearAssembleQFunctionUpdate", - CeedOperatorLinearAssembleQFunctionUpdate_Hip); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Operator", op, 
"LinearAssembleAddDiagonal", - CeedOperatorLinearAssembleAddDiagonal_Hip); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Operator", op, - "LinearAssembleAddPointBlockDiagonal", - CeedOperatorLinearAssembleAddPointBlockDiagonal_Hip); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Operator", op, - "LinearAssembleSingle", CeedSingleOperatorAssemble_Hip); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", - CeedOperatorApplyAdd_Hip); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Operator", op, "Destroy", - CeedOperatorDestroy_Hip); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(1, &impl)); + CeedCallBackend(CeedOperatorSetData(op, impl)); + + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonal_Hip)); + CeedCallBackend( + CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddPointBlockDiagonal", CeedOperatorLinearAssembleAddPointBlockDiagonal_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleSingle", CeedSingleOperatorAssemble_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Hip)); return CEED_ERROR_SUCCESS; } diff --git a/backends/hip-ref/ceed-hip-ref-qfunction-load.cpp b/backends/hip-ref/ceed-hip-ref-qfunction-load.cpp index 09ef88d280..14c7eae7c0 100644 --- a/backends/hip-ref/ceed-hip-ref-qfunction-load.cpp +++ b/backends/hip-ref/ceed-hip-ref-qfunction-load.cpp @@ -5,58 +5,54 @@ // // This file is part of CEED: http://github.com/ceed 
-#include #include +#include #include +#include + #include #include -#include -#include "ceed-hip-ref.h" + #include "../hip/ceed-hip-compile.h" +#include "ceed-hip-ref.h" //------------------------------------------------------------------------------ // Build QFunction kernel //------------------------------------------------------------------------------ extern "C" int CeedHipBuildQFunction(CeedQFunction qf) { - CeedInt ierr; using std::ostringstream; using std::string; Ceed ceed; CeedQFunctionGetCeed(qf, &ceed); Ceed_Hip *ceed_Hip; - ierr = CeedGetData(ceed, &ceed_Hip); CeedChkBackend(ierr); + CeedCallBackend(CeedGetData(ceed, &ceed_Hip)); CeedQFunction_Hip *data; - ierr = CeedQFunctionGetData(qf, (void **)&data); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetData(qf, (void **)&data)); // QFunction is built - if (data->QFunction) - return CEED_ERROR_SUCCESS; + if (data->QFunction) return CEED_ERROR_SUCCESS; - if (!data->qfunction_source) + if (!data->qfunction_source) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "No QFunction source or hipFunction_t provided."); - // LCOV_EXCL_STOP - + return CeedError(ceed, CEED_ERROR_BACKEND, "No QFunction source or hipFunction_t provided."); + // LCOV_EXCL_STOP + } + // QFunction kernel generation - CeedInt num_input_fields, num_output_fields, size; + CeedInt num_input_fields, num_output_fields, size; CeedQFunctionField *input_fields, *output_fields; - ierr = CeedQFunctionGetFields(qf, &num_input_fields, &input_fields, - &num_output_fields, &output_fields); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetFields(qf, &num_input_fields, &input_fields, &num_output_fields, &output_fields)); // Build strings for final kernel char *read_write_kernel_path, *read_write_kernel_source; - ierr = CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-qfunction.h", - &read_write_kernel_path); CeedChkBackend(ierr); + CeedCallBackend(CeedGetJitAbsolutePath(ceed, 
"ceed/jit-source/hip/hip-ref-qfunction.h", &read_write_kernel_path)); CeedDebug256(ceed, 2, "----- Loading QFunction Read/Write Kernel Source -----\n"); - ierr = CeedLoadSourceToBuffer(ceed, read_write_kernel_path, &read_write_kernel_source); - CeedChkBackend(ierr); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, read_write_kernel_path, &read_write_kernel_source)); CeedDebug256(ceed, 2, "----- Loading QFunction Read/Write Kernel Source Complete! -----\n"); - string qfunction_source(data->qfunction_source); - string qfunction_name(data->qfunction_name); - string read_write(read_write_kernel_source); - string kernel_name = "CeedKernelHipRefQFunction_" + qfunction_name; + string qfunction_source(data->qfunction_source); + string qfunction_name(data->qfunction_name); + string read_write(read_write_kernel_source); + string kernel_name = "CeedKernelHipRefQFunction_" + qfunction_name; ostringstream code; // Defintions @@ -65,11 +61,11 @@ extern "C" int CeedHipBuildQFunction(CeedQFunction qf) { code << "\n"; code << "extern \"C\" __launch_bounds__(BLOCK_SIZE)\n"; code << "__global__ void " << kernel_name << "(void *ctx, CeedInt Q, Fields_Hip fields) {\n"; - + // Inputs code << " // Input fields\n"; for (CeedInt i = 0; i < num_input_fields; i++) { - ierr = CeedQFunctionFieldGetSize(input_fields[i], &size); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionFieldGetSize(input_fields[i], &size)); code << " const CeedInt size_input_" << i << " = " << size << ";\n"; code << " CeedScalar input_" << i << "[size_input_" << i << "];\n"; } @@ -82,7 +78,7 @@ extern "C" int CeedHipBuildQFunction(CeedQFunction qf) { // Outputs code << " // Output fields\n"; for (CeedInt i = 0; i < num_output_fields; i++) { - ierr = CeedQFunctionFieldGetSize(output_fields[i], &size); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionFieldGetSize(output_fields[i], &size)); code << " const CeedInt size_output_" << i << " = " << size << ";\n"; code << " CeedScalar output_" << i << "[size_output_" << i 
<< "];\n"; } @@ -118,18 +114,15 @@ extern "C" int CeedHipBuildQFunction(CeedQFunction qf) { // View kernel for debugging CeedDebug256(ceed, 2, "Generated QFunction Kernels:\n"); CeedDebug(ceed, code.str().c_str()); - + // Compile kernel - ierr = CeedCompileHip(ceed, code.str().c_str(), &data->module, - 1, "BLOCK_SIZE", ceed_Hip->opt_block_size); - CeedChkBackend(ierr); - ierr = CeedGetKernelHip(ceed, data->module, kernel_name.c_str(), &data->QFunction); - CeedChkBackend(ierr); + CeedCallBackend(CeedCompileHip(ceed, code.str().c_str(), &data->module, 1, "BLOCK_SIZE", ceed_Hip->opt_block_size)); + CeedCallBackend(CeedGetKernelHip(ceed, data->module, kernel_name.c_str(), &data->QFunction)); // Cleanup - ierr = CeedFree(&data->qfunction_source); CeedChkBackend(ierr); - ierr = CeedFree(&read_write_kernel_path); CeedChkBackend(ierr); - ierr = CeedFree(&read_write_kernel_source); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&data->qfunction_source)); + CeedCallBackend(CeedFree(&read_write_kernel_path)); + CeedCallBackend(CeedFree(&read_write_kernel_source)); return CEED_ERROR_SUCCESS; } diff --git a/backends/hip-ref/ceed-hip-ref-qfunction-load.h b/backends/hip-ref/ceed-hip-ref-qfunction-load.h index 3f00f26bbc..df1060d584 100644 --- a/backends/hip-ref/ceed-hip-ref-qfunction-load.h +++ b/backends/hip-ref/ceed-hip-ref-qfunction-load.h @@ -10,4 +10,4 @@ CEED_INTERN int CeedHipBuildQFunction(CeedQFunction qf); -#endif // _ceed_hip_qfunction_load_h +#endif // _ceed_hip_qfunction_load_h diff --git a/backends/hip-ref/ceed-hip-ref-qfunction.c b/backends/hip-ref/ceed-hip-ref-qfunction.c index ddebbf3785..824db6458e 100644 --- a/backends/hip-ref/ceed-hip-ref-qfunction.c +++ b/backends/hip-ref/ceed-hip-ref-qfunction.c @@ -5,68 +5,59 @@ // // This file is part of CEED: http://github.com/ceed -#include #include +#include #include #include #include -#include "ceed-hip-ref.h" -#include "ceed-hip-ref-qfunction-load.h" + #include "../hip/ceed-hip-compile.h" +#include 
"ceed-hip-ref-qfunction-load.h" +#include "ceed-hip-ref.h" //------------------------------------------------------------------------------ // Apply QFunction //------------------------------------------------------------------------------ -static int CeedQFunctionApply_Hip(CeedQFunction qf, CeedInt Q, - CeedVector *U, CeedVector *V) { - int ierr; +static int CeedQFunctionApply_Hip(CeedQFunction qf, CeedInt Q, CeedVector *U, CeedVector *V) { Ceed ceed; - ierr = CeedQFunctionGetCeed(qf, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); // Build and compile kernel, if not done - ierr = CeedHipBuildQFunction(qf); CeedChkBackend(ierr); + CeedCallBackend(CeedHipBuildQFunction(qf)); CeedQFunction_Hip *data; - ierr = CeedQFunctionGetData(qf, &data); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetData(qf, &data)); Ceed_Hip *ceed_Hip; - ierr = CeedGetData(ceed, &ceed_Hip); CeedChkBackend(ierr); + CeedCallBackend(CeedGetData(ceed, &ceed_Hip)); CeedInt num_input_fields, num_output_fields; - ierr = CeedQFunctionGetNumArgs(qf, &num_input_fields, &num_output_fields); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetNumArgs(qf, &num_input_fields, &num_output_fields)); const int blocksize = ceed_Hip->opt_block_size; // Read vectors for (CeedInt i = 0; i < num_input_fields; i++) { - ierr = CeedVectorGetArrayRead(U[i], CEED_MEM_DEVICE, &data->fields.inputs[i]); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayRead(U[i], CEED_MEM_DEVICE, &data->fields.inputs[i])); } for (CeedInt i = 0; i < num_output_fields; i++) { - ierr = CeedVectorGetArrayWrite(V[i], CEED_MEM_DEVICE, &data->fields.outputs[i]); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayWrite(V[i], CEED_MEM_DEVICE, &data->fields.outputs[i])); } // Get context data - ierr = CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &data->d_c); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &data->d_c)); // Run 
kernel - void *args[] = {&data->d_c, (void *) &Q, &data->fields}; - ierr = CeedRunKernelHip(ceed, data->QFunction, CeedDivUpInt(Q, blocksize), - blocksize, args); CeedChkBackend(ierr); + void *args[] = {&data->d_c, (void *)&Q, &data->fields}; + CeedCallBackend(CeedRunKernelHip(ceed, data->QFunction, CeedDivUpInt(Q, blocksize), blocksize, args)); // Restore vectors for (CeedInt i = 0; i < num_input_fields; i++) { - ierr = CeedVectorRestoreArrayRead(U[i], &data->fields.inputs[i]); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArrayRead(U[i], &data->fields.inputs[i])); } for (CeedInt i = 0; i < num_output_fields; i++) { - ierr = CeedVectorRestoreArray(V[i], &data->fields.outputs[i]); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArray(V[i], &data->fields.outputs[i])); } // Restore context - ierr = CeedQFunctionRestoreInnerContextData(qf, &data->d_c); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &data->d_c)); return CEED_ERROR_SUCCESS; } @@ -75,14 +66,12 @@ static int CeedQFunctionApply_Hip(CeedQFunction qf, CeedInt Q, // Destroy QFunction //------------------------------------------------------------------------------ static int CeedQFunctionDestroy_Hip(CeedQFunction qf) { - int ierr; CeedQFunction_Hip *data; - ierr = CeedQFunctionGetData(qf, &data); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetData(qf, &data)); Ceed ceed; - ierr = CeedQFunctionGetCeed(qf, &ceed); CeedChkBackend(ierr); - if (data->module) - CeedChk_Hip(ceed, hipModuleUnload(data->module)); - ierr = CeedFree(&data); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); + if (data->module) CeedCallHip(ceed, hipModuleUnload(data->module)); + CeedCallBackend(CeedFree(&data)); return CEED_ERROR_SUCCESS; } @@ -91,29 +80,23 @@ static int CeedQFunctionDestroy_Hip(CeedQFunction qf) { // Create QFunction //------------------------------------------------------------------------------ int 
CeedQFunctionCreate_Hip(CeedQFunction qf) { - int ierr; Ceed ceed; CeedQFunctionGetCeed(qf, &ceed); CeedQFunction_Hip *data; - ierr = CeedCalloc(1,&data); CeedChkBackend(ierr); - ierr = CeedQFunctionSetData(qf, data); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(1, &data)); + CeedCallBackend(CeedQFunctionSetData(qf, data)); CeedInt num_input_fields, num_output_fields; - ierr = CeedQFunctionGetNumArgs(qf, &num_input_fields, &num_output_fields); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetNumArgs(qf, &num_input_fields, &num_output_fields)); // Read QFunction source - ierr = CeedQFunctionGetKernelName(qf, &data->qfunction_name); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetKernelName(qf, &data->qfunction_name)); CeedDebug256(ceed, 2, "----- Loading QFunction User Source -----\n"); - ierr = CeedQFunctionLoadSourceToBuffer(qf, &data->qfunction_source); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionLoadSourceToBuffer(qf, &data->qfunction_source)); CeedDebug256(ceed, 2, "----- Loading QFunction User Source Complete! 
-----\n"); // Register backend functions - ierr = CeedSetBackendFunction(ceed, "QFunction", qf, "Apply", - CeedQFunctionApply_Hip); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "QFunction", qf, "Destroy", - CeedQFunctionDestroy_Hip); CeedChkBackend(ierr); + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Hip)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ diff --git a/backends/hip-ref/ceed-hip-ref-qfunctioncontext.c b/backends/hip-ref/ceed-hip-ref-qfunctioncontext.c index e4c71e21c1..cfc250ba9c 100644 --- a/backends/hip-ref/ceed-hip-ref-qfunctioncontext.c +++ b/backends/hip-ref/ceed-hip-ref-qfunctioncontext.c @@ -5,44 +5,41 @@ // // This file is part of CEED: http://github.com/ceed -#include #include +#include #include #include + #include "ceed-hip-ref.h" //------------------------------------------------------------------------------ // Sync host to device //------------------------------------------------------------------------------ -static inline int CeedQFunctionContextSyncH2D_Hip( - const CeedQFunctionContext ctx) { - int ierr; +static inline int CeedQFunctionContextSyncH2D_Hip(const CeedQFunctionContext ctx) { Ceed ceed; - ierr = CeedQFunctionContextGetCeed(ctx, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedQFunctionContext_Hip *impl; - ierr = CeedQFunctionContextGetBackendData(ctx, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); - if (!impl->h_data) + if (!impl->h_data) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "No valid host data to sync to device"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_BACKEND, "No valid host data to sync to device"); + // LCOV_EXCL_STOP + } size_t ctxsize; - ierr = 
CeedQFunctionContextGetContextSize(ctx, &ctxsize); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctxsize)); if (impl->d_data_borrowed) { impl->d_data = impl->d_data_borrowed; } else if (impl->d_data_owned) { impl->d_data = impl->d_data_owned; } else { - ierr = hipMalloc((void **)&impl->d_data_owned, ctxsize); - CeedChk_Hip(ceed, ierr); + CeedCallHip(ceed, hipMalloc((void **)&impl->d_data_owned, ctxsize)); impl->d_data = impl->d_data_owned; } - ierr = hipMemcpy(impl->d_data, impl->h_data, ctxsize, - hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); + CeedCallHip(ceed, hipMemcpy(impl->d_data, impl->h_data, ctxsize, hipMemcpyHostToDevice)); return CEED_ERROR_SUCCESS; } @@ -50,35 +47,31 @@ static inline int CeedQFunctionContextSyncH2D_Hip( //------------------------------------------------------------------------------ // Sync device to host //------------------------------------------------------------------------------ -static inline int CeedQFunctionContextSyncD2H_Hip( - const CeedQFunctionContext ctx) { - int ierr; +static inline int CeedQFunctionContextSyncD2H_Hip(const CeedQFunctionContext ctx) { Ceed ceed; - ierr = CeedQFunctionContextGetCeed(ctx, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedQFunctionContext_Hip *impl; - ierr = CeedQFunctionContextGetBackendData(ctx, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); - if (!impl->d_data) + if (!impl->d_data) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "No valid device data to sync to host"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_BACKEND, "No valid device data to sync to host"); + // LCOV_EXCL_STOP + } size_t ctxsize; - ierr = CeedQFunctionContextGetContextSize(ctx, &ctxsize); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctxsize)); if (impl->h_data_borrowed) { impl->h_data = impl->h_data_borrowed; } else if 
(impl->h_data_owned) { impl->h_data = impl->h_data_owned; } else { - ierr = CeedMallocArray(1, ctxsize, &impl->h_data_owned); - CeedChkBackend(ierr); + CeedCallBackend(CeedMallocArray(1, ctxsize, &impl->h_data_owned)); impl->h_data = impl->h_data_owned; } - ierr = hipMemcpy(impl->h_data, impl->d_data, ctxsize, - hipMemcpyDeviceToHost); CeedChk_Hip(ceed, ierr); + CeedCallHip(ceed, hipMemcpy(impl->h_data, impl->d_data, ctxsize, hipMemcpyDeviceToHost)); return CEED_ERROR_SUCCESS; } @@ -86,11 +79,12 @@ static inline int CeedQFunctionContextSyncD2H_Hip( //------------------------------------------------------------------------------ // Sync data of type //------------------------------------------------------------------------------ -static inline int CeedQFunctionContextSync_Hip(const CeedQFunctionContext ctx, - CeedMemType mem_type) { +static inline int CeedQFunctionContextSync_Hip(const CeedQFunctionContext ctx, CeedMemType mem_type) { switch (mem_type) { - case CEED_MEM_HOST: return CeedQFunctionContextSyncD2H_Hip(ctx); - case CEED_MEM_DEVICE: return CeedQFunctionContextSyncH2D_Hip(ctx); + case CEED_MEM_HOST: + return CeedQFunctionContextSyncD2H_Hip(ctx); + case CEED_MEM_DEVICE: + return CeedQFunctionContextSyncH2D_Hip(ctx); } return CEED_ERROR_UNSUPPORTED; } @@ -98,11 +92,9 @@ static inline int CeedQFunctionContextSync_Hip(const CeedQFunctionContext ctx, //------------------------------------------------------------------------------ // Set all pointers as invalid //------------------------------------------------------------------------------ -static inline int CeedQFunctionContextSetAllInvalid_Hip( - const CeedQFunctionContext ctx) { - int ierr; +static inline int CeedQFunctionContextSetAllInvalid_Hip(const CeedQFunctionContext ctx) { CeedQFunctionContext_Hip *impl; - ierr = CeedQFunctionContextGetBackendData(ctx, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); impl->h_data = NULL; impl->d_data = NULL; @@ -113,11 
+105,9 @@ static inline int CeedQFunctionContextSetAllInvalid_Hip( //------------------------------------------------------------------------------ // Check for valid data //------------------------------------------------------------------------------ -static inline int CeedQFunctionContextHasValidData_Hip( - const CeedQFunctionContext ctx, bool *has_valid_data) { - int ierr; +static inline int CeedQFunctionContextHasValidData_Hip(const CeedQFunctionContext ctx, bool *has_valid_data) { CeedQFunctionContext_Hip *impl; - ierr = CeedQFunctionContextGetBackendData(ctx, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); *has_valid_data = !!impl->h_data || !!impl->d_data; @@ -127,20 +117,18 @@ static inline int CeedQFunctionContextHasValidData_Hip( //------------------------------------------------------------------------------ // Check if ctx has borrowed data //------------------------------------------------------------------------------ -static inline int CeedQFunctionContextHasBorrowedDataOfType_Hip( - const CeedQFunctionContext ctx, CeedMemType mem_type, - bool *has_borrowed_data_of_type) { - int ierr; +static inline int CeedQFunctionContextHasBorrowedDataOfType_Hip(const CeedQFunctionContext ctx, CeedMemType mem_type, + bool *has_borrowed_data_of_type) { CeedQFunctionContext_Hip *impl; - ierr = CeedQFunctionContextGetBackendData(ctx, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); switch (mem_type) { - case CEED_MEM_HOST: - *has_borrowed_data_of_type = !!impl->h_data_borrowed; - break; - case CEED_MEM_DEVICE: - *has_borrowed_data_of_type = !!impl->d_data_borrowed; - break; + case CEED_MEM_HOST: + *has_borrowed_data_of_type = !!impl->h_data_borrowed; + break; + case CEED_MEM_DEVICE: + *has_borrowed_data_of_type = !!impl->d_data_borrowed; + break; } return CEED_ERROR_SUCCESS; @@ -149,22 +137,19 @@ static inline int CeedQFunctionContextHasBorrowedDataOfType_Hip( 
//------------------------------------------------------------------------------ // Check if data of given type needs sync //------------------------------------------------------------------------------ -static inline int CeedQFunctionContextNeedSync_Hip( - const CeedQFunctionContext ctx, CeedMemType mem_type, bool *need_sync) { - int ierr; +static inline int CeedQFunctionContextNeedSync_Hip(const CeedQFunctionContext ctx, CeedMemType mem_type, bool *need_sync) { CeedQFunctionContext_Hip *impl; - ierr = CeedQFunctionContextGetBackendData(ctx, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); bool has_valid_data = true; - ierr = CeedQFunctionContextHasValidData_Hip(ctx, &has_valid_data); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextHasValidData_Hip(ctx, &has_valid_data)); switch (mem_type) { - case CEED_MEM_HOST: - *need_sync = has_valid_data && !impl->h_data; - break; - case CEED_MEM_DEVICE: - *need_sync = has_valid_data && !impl->d_data; - break; + case CEED_MEM_HOST: + *need_sync = has_valid_data && !impl->h_data; + break; + case CEED_MEM_DEVICE: + *need_sync = has_valid_data && !impl->d_data; + break; } return CEED_ERROR_SUCCESS; @@ -173,32 +158,29 @@ static inline int CeedQFunctionContextNeedSync_Hip( //------------------------------------------------------------------------------ // Set data from host //------------------------------------------------------------------------------ -static int CeedQFunctionContextSetDataHost_Hip(const CeedQFunctionContext ctx, - const CeedCopyMode copy_mode, void *data) { - int ierr; +static int CeedQFunctionContextSetDataHost_Hip(const CeedQFunctionContext ctx, const CeedCopyMode copy_mode, void *data) { CeedQFunctionContext_Hip *impl; - ierr = CeedQFunctionContextGetBackendData(ctx, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); - ierr = CeedFree(&impl->h_data_owned); CeedChkBackend(ierr); + 
CeedCallBackend(CeedFree(&impl->h_data_owned)); switch (copy_mode) { - case CEED_COPY_VALUES: { - size_t ctxsize; - ierr = CeedQFunctionContextGetContextSize(ctx, &ctxsize); CeedChkBackend(ierr); - ierr = CeedMallocArray(1, ctxsize, &impl->h_data_owned); - CeedChkBackend(ierr); - impl->h_data_borrowed = NULL; - impl->h_data = impl->h_data_owned; - memcpy(impl->h_data, data, ctxsize); - } break; - case CEED_OWN_POINTER: - impl->h_data_owned = data; - impl->h_data_borrowed = NULL; - impl->h_data = data; - break; - case CEED_USE_POINTER: - impl->h_data_borrowed = data; - impl->h_data = data; - break; + case CEED_COPY_VALUES: { + size_t ctxsize; + CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctxsize)); + CeedCallBackend(CeedMallocArray(1, ctxsize, &impl->h_data_owned)); + impl->h_data_borrowed = NULL; + impl->h_data = impl->h_data_owned; + memcpy(impl->h_data, data, ctxsize); + } break; + case CEED_OWN_POINTER: + impl->h_data_owned = data; + impl->h_data_borrowed = NULL; + impl->h_data = data; + break; + case CEED_USE_POINTER: + impl->h_data_borrowed = data; + impl->h_data = data; + break; } return CEED_ERROR_SUCCESS; @@ -207,37 +189,33 @@ static int CeedQFunctionContextSetDataHost_Hip(const CeedQFunctionContext ctx, //------------------------------------------------------------------------------ // Set data from device //------------------------------------------------------------------------------ -static int CeedQFunctionContextSetDataDevice_Hip(const CeedQFunctionContext ctx, - const CeedCopyMode copy_mode, void *data) { - int ierr; +static int CeedQFunctionContextSetDataDevice_Hip(const CeedQFunctionContext ctx, const CeedCopyMode copy_mode, void *data) { Ceed ceed; - ierr = CeedQFunctionContextGetCeed(ctx, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedQFunctionContext_Hip *impl; - ierr = CeedQFunctionContextGetBackendData(ctx, &impl); CeedChkBackend(ierr); + 
CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); - ierr = hipFree(impl->d_data_owned); CeedChk_Hip(ceed, ierr); + CeedCallHip(ceed, hipFree(impl->d_data_owned)); impl->d_data_owned = NULL; switch (copy_mode) { - case CEED_COPY_VALUES: { - size_t ctxsize; - ierr = CeedQFunctionContextGetContextSize(ctx, &ctxsize); CeedChkBackend(ierr); - ierr = hipMalloc((void **)&impl->d_data_owned, ctxsize); - CeedChk_Hip(ceed, ierr); - impl->d_data_borrowed = NULL; - impl->d_data = impl->d_data_owned; - ierr = hipMemcpy(impl->d_data, data, ctxsize, - hipMemcpyDeviceToDevice); CeedChk_Hip(ceed, ierr); - } break; - case CEED_OWN_POINTER: - impl->d_data_owned = data; - impl->d_data_borrowed = NULL; - impl->d_data = data; - break; - case CEED_USE_POINTER: - impl->d_data_owned = NULL; - impl->d_data_borrowed = data; - impl->d_data = data; - break; + case CEED_COPY_VALUES: { + size_t ctxsize; + CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctxsize)); + CeedCallHip(ceed, hipMalloc((void **)&impl->d_data_owned, ctxsize)); + impl->d_data_borrowed = NULL; + impl->d_data = impl->d_data_owned; + CeedCallHip(ceed, hipMemcpy(impl->d_data, data, ctxsize, hipMemcpyDeviceToDevice)); + } break; + case CEED_OWN_POINTER: + impl->d_data_owned = data; + impl->d_data_borrowed = NULL; + impl->d_data = data; + break; + case CEED_USE_POINTER: + impl->d_data_owned = NULL; + impl->d_data_borrowed = data; + impl->d_data = data; + break; } return CEED_ERROR_SUCCESS; @@ -247,18 +225,16 @@ static int CeedQFunctionContextSetDataDevice_Hip(const CeedQFunctionContext ctx, // Set the data used by a user context, // freeing any previously allocated data if applicable //------------------------------------------------------------------------------ -static int CeedQFunctionContextSetData_Hip(const CeedQFunctionContext ctx, - const CeedMemType mem_type, const CeedCopyMode copy_mode, void *data) { - int ierr; +static int CeedQFunctionContextSetData_Hip(const CeedQFunctionContext ctx, const 
CeedMemType mem_type, const CeedCopyMode copy_mode, void *data) { Ceed ceed; - ierr = CeedQFunctionContextGetCeed(ctx, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); - ierr = CeedQFunctionContextSetAllInvalid_Hip(ctx); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextSetAllInvalid_Hip(ctx)); switch (mem_type) { - case CEED_MEM_HOST: - return CeedQFunctionContextSetDataHost_Hip(ctx, copy_mode, data); - case CEED_MEM_DEVICE: - return CeedQFunctionContextSetDataDevice_Hip(ctx, copy_mode, data); + case CEED_MEM_HOST: + return CeedQFunctionContextSetDataHost_Hip(ctx, copy_mode, data); + case CEED_MEM_DEVICE: + return CeedQFunctionContextSetDataDevice_Hip(ctx, copy_mode, data); } return CEED_ERROR_UNSUPPORTED; @@ -267,34 +243,31 @@ static int CeedQFunctionContextSetData_Hip(const CeedQFunctionContext ctx, //------------------------------------------------------------------------------ // Take data //------------------------------------------------------------------------------ -static int CeedQFunctionContextTakeData_Hip(const CeedQFunctionContext ctx, - const CeedMemType mem_type, void *data) { - int ierr; +static int CeedQFunctionContextTakeData_Hip(const CeedQFunctionContext ctx, const CeedMemType mem_type, void *data) { Ceed ceed; - ierr = CeedQFunctionContextGetCeed(ctx, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedQFunctionContext_Hip *impl; - ierr = CeedQFunctionContextGetBackendData(ctx, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); // Sync data to requested mem_type bool need_sync = false; - ierr = CeedQFunctionContextNeedSync_Hip(ctx, mem_type, &need_sync); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextNeedSync_Hip(ctx, mem_type, &need_sync)); if (need_sync) { - ierr = CeedQFunctionContextSync_Hip(ctx, mem_type); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextSync_Hip(ctx, 
mem_type)); } // Update pointer switch (mem_type) { - case CEED_MEM_HOST: - *(void **)data = impl->h_data_borrowed; - impl->h_data_borrowed = NULL; - impl->h_data = NULL; - break; - case CEED_MEM_DEVICE: - *(void **)data = impl->d_data_borrowed; - impl->d_data_borrowed = NULL; - impl->d_data = NULL; - break; + case CEED_MEM_HOST: + *(void **)data = impl->h_data_borrowed; + impl->h_data_borrowed = NULL; + impl->h_data = NULL; + break; + case CEED_MEM_DEVICE: + *(void **)data = impl->d_data_borrowed; + impl->d_data_borrowed = NULL; + impl->d_data = NULL; + break; } return CEED_ERROR_SUCCESS; @@ -304,30 +277,25 @@ static int CeedQFunctionContextTakeData_Hip(const CeedQFunctionContext ctx, // Core logic for GetData. // If a different memory type is most up to date, this will perform a copy //------------------------------------------------------------------------------ -static int CeedQFunctionContextGetDataCore_Hip(const CeedQFunctionContext ctx, - const CeedMemType mem_type, void *data) { - int ierr; +static int CeedQFunctionContextGetDataCore_Hip(const CeedQFunctionContext ctx, const CeedMemType mem_type, void *data) { Ceed ceed; - ierr = CeedQFunctionContextGetCeed(ctx, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedQFunctionContext_Hip *impl; - ierr = CeedQFunctionContextGetBackendData(ctx, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); // Sync data to requested mem_type bool need_sync = false; - ierr = CeedQFunctionContextNeedSync_Hip(ctx, mem_type, &need_sync); - CeedChkBackend(ierr); - if (need_sync) { - ierr = CeedQFunctionContextSync_Hip(ctx, mem_type); CeedChkBackend(ierr); - } + CeedCallBackend(CeedQFunctionContextNeedSync_Hip(ctx, mem_type, &need_sync)); + if (need_sync) CeedCallBackend(CeedQFunctionContextSync_Hip(ctx, mem_type)); // Sync data to requested mem_type and update pointer switch (mem_type) { - case CEED_MEM_HOST: - *(void **)data = impl->h_data; - 
break; - case CEED_MEM_DEVICE: - *(void **)data = impl->d_data; - break; + case CEED_MEM_HOST: + *(void **)data = impl->h_data; + break; + case CEED_MEM_DEVICE: + *(void **)data = impl->d_data; + break; } return CEED_ERROR_SUCCESS; @@ -336,32 +304,28 @@ static int CeedQFunctionContextGetDataCore_Hip(const CeedQFunctionContext ctx, //------------------------------------------------------------------------------ // Get read-only access to the data //------------------------------------------------------------------------------ -static int CeedQFunctionContextGetDataRead_Hip(const CeedQFunctionContext ctx, - const CeedMemType mem_type, void *data) { +static int CeedQFunctionContextGetDataRead_Hip(const CeedQFunctionContext ctx, const CeedMemType mem_type, void *data) { return CeedQFunctionContextGetDataCore_Hip(ctx, mem_type, data); } //------------------------------------------------------------------------------ // Get read/write access to the data //------------------------------------------------------------------------------ -static int CeedQFunctionContextGetData_Hip(const CeedQFunctionContext ctx, - const CeedMemType mem_type, void *data) { - int ierr; +static int CeedQFunctionContextGetData_Hip(const CeedQFunctionContext ctx, const CeedMemType mem_type, void *data) { CeedQFunctionContext_Hip *impl; - ierr = CeedQFunctionContextGetBackendData(ctx, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); - ierr = CeedQFunctionContextGetDataCore_Hip(ctx, mem_type, data); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetDataCore_Hip(ctx, mem_type, data)); // Mark only pointer for requested memory as valid - ierr = CeedQFunctionContextSetAllInvalid_Hip(ctx); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextSetAllInvalid_Hip(ctx)); switch (mem_type) { - case CEED_MEM_HOST: - impl->h_data = *(void **)data; - break; - case CEED_MEM_DEVICE: - impl->d_data = *(void **)data; - break; + case CEED_MEM_HOST: 
+ impl->h_data = *(void **)data; + break; + case CEED_MEM_DEVICE: + impl->d_data = *(void **)data; + break; } return CEED_ERROR_SUCCESS; @@ -371,15 +335,14 @@ static int CeedQFunctionContextGetData_Hip(const CeedQFunctionContext ctx, // Destroy the user context //------------------------------------------------------------------------------ static int CeedQFunctionContextDestroy_Hip(const CeedQFunctionContext ctx) { - int ierr; Ceed ceed; - ierr = CeedQFunctionContextGetCeed(ctx, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedQFunctionContext_Hip *impl; - ierr = CeedQFunctionContextGetBackendData(ctx, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); - ierr = hipFree(impl->d_data_owned); CeedChk_Hip(ceed, ierr); - ierr = CeedFree(&impl->h_data_owned); CeedChkBackend(ierr); - ierr = CeedFree(&impl); CeedChkBackend(ierr); + CeedCallHip(ceed, hipFree(impl->d_data_owned)); + CeedCallBackend(CeedFree(&impl->h_data_owned)); + CeedCallBackend(CeedFree(&impl)); return CEED_ERROR_SUCCESS; } @@ -388,31 +351,20 @@ static int CeedQFunctionContextDestroy_Hip(const CeedQFunctionContext ctx) { // QFunctionContext Create //------------------------------------------------------------------------------ int CeedQFunctionContextCreate_Hip(CeedQFunctionContext ctx) { - int ierr; CeedQFunctionContext_Hip *impl; - Ceed ceed; - ierr = CeedQFunctionContextGetCeed(ctx, &ceed); CeedChkBackend(ierr); - - ierr = CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "HasValidData", - CeedQFunctionContextHasValidData_Hip); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "QFunctionContext", ctx, - "HasBorrowedDataOfType", - CeedQFunctionContextHasBorrowedDataOfType_Hip); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "SetData", - CeedQFunctionContextSetData_Hip); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "QFunctionContext", ctx, 
"TakeData", - CeedQFunctionContextTakeData_Hip); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "GetData", - CeedQFunctionContextGetData_Hip); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "GetDataRead", - CeedQFunctionContextGetDataRead_Hip); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "Destroy", - CeedQFunctionContextDestroy_Hip); CeedChkBackend(ierr); - - ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr); - ierr = CeedQFunctionContextSetBackendData(ctx, impl); CeedChkBackend(ierr); + Ceed ceed; + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); + + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "HasValidData", CeedQFunctionContextHasValidData_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "HasBorrowedDataOfType", CeedQFunctionContextHasBorrowedDataOfType_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "SetData", CeedQFunctionContextSetData_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "TakeData", CeedQFunctionContextTakeData_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "GetData", CeedQFunctionContextGetData_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "GetDataRead", CeedQFunctionContextGetDataRead_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "Destroy", CeedQFunctionContextDestroy_Hip)); + + CeedCallBackend(CeedCalloc(1, &impl)); + CeedCallBackend(CeedQFunctionContextSetBackendData(ctx, impl)); return CEED_ERROR_SUCCESS; } diff --git a/backends/hip-ref/ceed-hip-ref-restriction.c b/backends/hip-ref/ceed-hip-ref-restriction.c index 4ff56a3bbf..18841a518d 100644 --- a/backends/hip-ref/ceed-hip-ref-restriction.c +++ b/backends/hip-ref/ceed-hip-ref-restriction.c @@ -5,46 +5,44 @@ // // This file is part of CEED: 
http://github.com/ceed -#include #include +#include #include #include #include #include #include -#include "ceed-hip-ref.h" + #include "../hip/ceed-hip-compile.h" +#include "ceed-hip-ref.h" //------------------------------------------------------------------------------ // Apply restriction //------------------------------------------------------------------------------ -static int CeedElemRestrictionApply_Hip(CeedElemRestriction r, - CeedTransposeMode t_mode, CeedVector u, - CeedVector v, CeedRequest *request) { - int ierr; +static int CeedElemRestrictionApply_Hip(CeedElemRestriction r, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { CeedElemRestriction_Hip *impl; - ierr = CeedElemRestrictionGetData(r, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); Ceed ceed; - ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); Ceed_Hip *data; - ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr); + CeedCallBackend(CeedGetData(ceed, &data)); const CeedInt block_size = 64; - const CeedInt num_nodes = impl->num_nodes; - CeedInt num_elem, elem_size; + const CeedInt num_nodes = impl->num_nodes; + CeedInt num_elem, elem_size; CeedElemRestrictionGetNumElements(r, &num_elem); - ierr = CeedElemRestrictionGetElementSize(r, &elem_size); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); hipFunction_t kernel; // Get vectors const CeedScalar *d_u; - CeedScalar *d_v; - ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr); + CeedScalar *d_v; + CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); if (t_mode == CEED_TRANSPOSE) { // Sum into for transpose mode, e-vec to l-vec - ierr = CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); } else { // Overwrite for notranspose mode, l-vec to e-vec - 
ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); } // Restrict @@ -52,78 +50,66 @@ static int CeedElemRestrictionApply_Hip(CeedElemRestriction r, // L-vector -> E-vector if (impl->d_ind) { // -- Offsets provided - kernel = impl->OffsetNoTranspose; - void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; + kernel = impl->OffsetNoTranspose; + void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; CeedInt block_size = elem_size < 256 ? (elem_size > 64 ? elem_size : 64) : 256; - ierr = CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size), - block_size, args); CeedChkBackend(ierr); + CeedCallBackend(CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args)); } else { // -- Strided restriction - kernel = impl->StridedNoTranspose; - void *args[] = {&num_elem, &d_u, &d_v}; + kernel = impl->StridedNoTranspose; + void *args[] = {&num_elem, &d_u, &d_v}; CeedInt block_size = elem_size < 256 ? (elem_size > 64 ? 
elem_size : 64) : 256; - ierr = CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size), - block_size, args); CeedChkBackend(ierr); + CeedCallBackend(CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args)); } } else { // E-vector -> L-vector if (impl->d_ind) { // -- Offsets provided - kernel = impl->OffsetTranspose; - void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, - &impl->d_t_offsets, &d_u, &d_v - }; - ierr = CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size), - block_size, args); CeedChkBackend(ierr); + kernel = impl->OffsetTranspose; + void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; + CeedCallBackend(CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args)); } else { // -- Strided restriction - kernel = impl->StridedTranspose; + kernel = impl->StridedTranspose; void *args[] = {&num_elem, &d_u, &d_v}; - ierr = CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size), - block_size, args); CeedChkBackend(ierr); + CeedCallBackend(CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args)); } } - if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED) - *request = NULL; + if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED) *request = NULL; // Restore arrays - ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr); - ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); + CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Blocked not supported //------------------------------------------------------------------------------ -int CeedElemRestrictionApplyBlock_Hip(CeedElemRestriction r, CeedInt block, - CeedTransposeMode t_mode, CeedVector u, - CeedVector v, CeedRequest 
*request) { +int CeedElemRestrictionApplyBlock_Hip(CeedElemRestriction r, CeedInt block, CeedTransposeMode t_mode, CeedVector u, CeedVector v, + CeedRequest *request) { // LCOV_EXCL_START - int ierr; Ceed ceed; - ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); - return CeedError(ceed, CEED_ERROR_BACKEND, - "Backend does not implement blocked restrictions"); + CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); + return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement blocked restrictions"); // LCOV_EXCL_STOP } //------------------------------------------------------------------------------ // Get offsets //------------------------------------------------------------------------------ -static int CeedElemRestrictionGetOffsets_Hip(CeedElemRestriction rstr, - CeedMemType mtype, const CeedInt **offsets) { - int ierr; +static int CeedElemRestrictionGetOffsets_Hip(CeedElemRestriction rstr, CeedMemType mtype, const CeedInt **offsets) { CeedElemRestriction_Hip *impl; - ierr = CeedElemRestrictionGetData(rstr, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); switch (mtype) { - case CEED_MEM_HOST: - *offsets = impl->h_ind; - break; - case CEED_MEM_DEVICE: - *offsets = impl->d_ind; - break; + case CEED_MEM_HOST: + *offsets = impl->h_ind; + break; + case CEED_MEM_DEVICE: + *offsets = impl->d_ind; + break; } return CEED_ERROR_SUCCESS; } @@ -132,19 +118,18 @@ static int CeedElemRestrictionGetOffsets_Hip(CeedElemRestriction rstr, // Destroy restriction //------------------------------------------------------------------------------ static int CeedElemRestrictionDestroy_Hip(CeedElemRestriction r) { - int ierr; CeedElemRestriction_Hip *impl; - ierr = CeedElemRestrictionGetData(r, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); Ceed ceed; - ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); - ierr = hipModuleUnload(impl->module); CeedChk_Hip(ceed, 
ierr); - ierr = CeedFree(&impl->h_ind_allocated); CeedChkBackend(ierr); - ierr = hipFree(impl->d_ind_allocated); CeedChk_Hip(ceed, ierr); - ierr = hipFree(impl->d_t_offsets); CeedChk_Hip(ceed, ierr); - ierr = hipFree(impl->d_t_indices); CeedChk_Hip(ceed, ierr); - ierr = hipFree(impl->d_l_vec_indices); CeedChk_Hip(ceed, ierr); - ierr = CeedFree(&impl); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); + CeedCallHip(ceed, hipModuleUnload(impl->module)); + CeedCallBackend(CeedFree(&impl->h_ind_allocated)); + CeedCallHip(ceed, hipFree(impl->d_ind_allocated)); + CeedCallHip(ceed, hipFree(impl->d_t_offsets)); + CeedCallHip(ceed, hipFree(impl->d_t_indices)); + CeedCallHip(ceed, hipFree(impl->d_l_vec_indices)); + CeedCallBackend(CeedFree(&impl)); return CEED_ERROR_SUCCESS; } @@ -152,92 +137,80 @@ static int CeedElemRestrictionDestroy_Hip(CeedElemRestriction r) { //------------------------------------------------------------------------------ // Create transpose offsets and indices //------------------------------------------------------------------------------ -static int CeedElemRestrictionOffset_Hip(const CeedElemRestriction r, - const CeedInt *indices) { - int ierr; +static int CeedElemRestrictionOffset_Hip(const CeedElemRestriction r, const CeedInt *indices) { Ceed ceed; - ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); CeedElemRestriction_Hip *impl; - ierr = CeedElemRestrictionGetData(r, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); CeedSize l_size; - CeedInt num_elem, elem_size, num_comp; - ierr = CeedElemRestrictionGetNumElements(r, &num_elem); CeedChkBackend(ierr); - ierr = CeedElemRestrictionGetElementSize(r, &elem_size); CeedChkBackend(ierr); - ierr = CeedElemRestrictionGetLVectorSize(r, &l_size); CeedChkBackend(ierr); - ierr = CeedElemRestrictionGetNumComponents(r, &num_comp); CeedChkBackend(ierr); + CeedInt num_elem, 
elem_size, num_comp; + CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem)); + CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); + CeedCallBackend(CeedElemRestrictionGetLVectorSize(r, &l_size)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); // Count num_nodes bool *is_node; - ierr = CeedCalloc(l_size, &is_node); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(l_size, &is_node)); const CeedInt size_indices = num_elem * elem_size; - for (CeedInt i = 0; i < size_indices; i++) - is_node[indices[i]] = 1; + for (CeedInt i = 0; i < size_indices; i++) is_node[indices[i]] = 1; CeedInt num_nodes = 0; - for (CeedInt i = 0; i < l_size; i++) - num_nodes += is_node[i]; + for (CeedInt i = 0; i < l_size; i++) num_nodes += is_node[i]; impl->num_nodes = num_nodes; // L-vector offsets array CeedInt *ind_to_offset, *l_vec_indices; - ierr = CeedCalloc(l_size, &ind_to_offset); CeedChkBackend(ierr); - ierr = CeedCalloc(num_nodes, &l_vec_indices); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(l_size, &ind_to_offset)); + CeedCallBackend(CeedCalloc(num_nodes, &l_vec_indices)); CeedInt j = 0; - for (CeedInt i = 0; i < l_size; i++) + for (CeedInt i = 0; i < l_size; i++) { if (is_node[i]) { l_vec_indices[j] = i; ind_to_offset[i] = j++; } - ierr = CeedFree(&is_node); CeedChkBackend(ierr); + } + CeedCallBackend(CeedFree(&is_node)); // Compute transpose offsets and indices const CeedInt size_offsets = num_nodes + 1; - CeedInt *t_offsets; - ierr = CeedCalloc(size_offsets, &t_offsets); CeedChkBackend(ierr); + CeedInt *t_offsets; + CeedCallBackend(CeedCalloc(size_offsets, &t_offsets)); CeedInt *t_indices; - ierr = CeedMalloc(size_indices, &t_indices); CeedChkBackend(ierr); + CeedCallBackend(CeedMalloc(size_indices, &t_indices)); // Count node multiplicity - for (CeedInt e = 0; e < num_elem; ++e) - for (CeedInt i = 0; i < elem_size; ++i) - ++t_offsets[ind_to_offset[indices[elem_size*e + i]] + 1]; + for (CeedInt e = 0; e < num_elem; ++e) { + 
for (CeedInt i = 0; i < elem_size; ++i) ++t_offsets[ind_to_offset[indices[elem_size * e + i]] + 1]; + } // Convert to running sum - for (CeedInt i = 1; i < size_offsets; ++i) - t_offsets[i] += t_offsets[i-1]; + for (CeedInt i = 1; i < size_offsets; ++i) t_offsets[i] += t_offsets[i - 1]; // List all E-vec indices associated with L-vec node for (CeedInt e = 0; e < num_elem; ++e) { for (CeedInt i = 0; i < elem_size; ++i) { - const CeedInt lid = elem_size*e + i; - const CeedInt gid = indices[lid]; + const CeedInt lid = elem_size * e + i; + const CeedInt gid = indices[lid]; t_indices[t_offsets[ind_to_offset[gid]]++] = lid; } } // Reset running sum - for (int i = size_offsets - 1; i > 0; --i) - t_offsets[i] = t_offsets[i - 1]; + for (int i = size_offsets - 1; i > 0; --i) t_offsets[i] = t_offsets[i - 1]; t_offsets[0] = 0; // Copy data to device // -- L-vector indices - ierr = hipMalloc((void **)&impl->d_l_vec_indices, num_nodes*sizeof(CeedInt)); - CeedChk_Hip(ceed, ierr); - ierr = hipMemcpy(impl->d_l_vec_indices, l_vec_indices, - num_nodes*sizeof(CeedInt), hipMemcpyHostToDevice); - CeedChk_Hip(ceed, ierr); + CeedCallHip(ceed, hipMalloc((void **)&impl->d_l_vec_indices, num_nodes * sizeof(CeedInt))); + CeedCallHip(ceed, hipMemcpy(impl->d_l_vec_indices, l_vec_indices, num_nodes * sizeof(CeedInt), hipMemcpyHostToDevice)); // -- Transpose offsets - ierr = hipMalloc((void **)&impl->d_t_offsets, size_offsets*sizeof(CeedInt)); - CeedChk_Hip(ceed, ierr); - ierr = hipMemcpy(impl->d_t_offsets, t_offsets, size_offsets*sizeof(CeedInt), - hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); + CeedCallHip(ceed, hipMalloc((void **)&impl->d_t_offsets, size_offsets * sizeof(CeedInt))); + CeedCallHip(ceed, hipMemcpy(impl->d_t_offsets, t_offsets, size_offsets * sizeof(CeedInt), hipMemcpyHostToDevice)); // -- Transpose indices - ierr = hipMalloc((void **)&impl->d_t_indices, size_indices*sizeof(CeedInt)); - CeedChk_Hip(ceed, ierr); - ierr = hipMemcpy(impl->d_t_indices, t_indices, 
size_indices*sizeof(CeedInt), - hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); + CeedCallHip(ceed, hipMalloc((void **)&impl->d_t_indices, size_indices * sizeof(CeedInt))); + CeedCallHip(ceed, hipMemcpy(impl->d_t_indices, t_indices, size_indices * sizeof(CeedInt), hipMemcpyHostToDevice)); // Cleanup - ierr = CeedFree(&ind_to_offset); CeedChkBackend(ierr); - ierr = CeedFree(&l_vec_indices); CeedChkBackend(ierr); - ierr = CeedFree(&t_offsets); CeedChkBackend(ierr); - ierr = CeedFree(&t_indices); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&ind_to_offset)); + CeedCallBackend(CeedFree(&l_vec_indices)); + CeedCallBackend(CeedFree(&t_offsets)); + CeedCallBackend(CeedFree(&t_indices)); return CEED_ERROR_SUCCESS; } @@ -245,34 +218,30 @@ static int CeedElemRestrictionOffset_Hip(const CeedElemRestriction r, //------------------------------------------------------------------------------ // Create restriction //------------------------------------------------------------------------------ -int CeedElemRestrictionCreate_Hip(CeedMemType mtype, CeedCopyMode cmode, - const CeedInt *indices, - CeedElemRestriction r) { - int ierr; +int CeedElemRestrictionCreate_Hip(CeedMemType mtype, CeedCopyMode cmode, const CeedInt *indices, CeedElemRestriction r) { Ceed ceed; - ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); CeedElemRestriction_Hip *impl; - ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(1, &impl)); CeedInt num_elem, num_comp, elem_size; - ierr = CeedElemRestrictionGetNumElements(r, &num_elem); CeedChkBackend(ierr); - ierr = CeedElemRestrictionGetNumComponents(r, &num_comp); CeedChkBackend(ierr); - ierr = CeedElemRestrictionGetElementSize(r, &elem_size); CeedChkBackend(ierr); - CeedInt size = num_elem * elem_size; - CeedInt strides[3] = {1, size, elem_size}; + CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem)); + 
CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); + CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); + CeedInt size = num_elem * elem_size; + CeedInt strides[3] = {1, size, elem_size}; CeedInt comp_stride = 1; // Stride data bool is_strided; - ierr = CeedElemRestrictionIsStrided(r, &is_strided); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionIsStrided(r, &is_strided)); if (is_strided) { bool has_backend_strides; - ierr = CeedElemRestrictionHasBackendStrides(r, &has_backend_strides); - CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionHasBackendStrides(r, &has_backend_strides)); if (!has_backend_strides) { - ierr = CeedElemRestrictionGetStrides(r, &strides); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetStrides(r, &strides)); } } else { - ierr = CeedElemRestrictionGetCompStride(r, &comp_stride); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetCompStride(r, &comp_stride)); } impl->h_ind = NULL; @@ -281,131 +250,94 @@ int CeedElemRestrictionCreate_Hip(CeedMemType mtype, CeedCopyMode cmode, impl->d_ind_allocated = NULL; impl->d_t_indices = NULL; impl->d_t_offsets = NULL; - impl->num_nodes = size; - ierr = CeedElemRestrictionSetData(r, impl); CeedChkBackend(ierr); - CeedInt layout[3] = {1, elem_size*num_elem, elem_size}; - ierr = CeedElemRestrictionSetELayout(r, layout); CeedChkBackend(ierr); + impl->num_nodes = size; + CeedCallBackend(CeedElemRestrictionSetData(r, impl)); + CeedInt layout[3] = {1, elem_size * num_elem, elem_size}; + CeedCallBackend(CeedElemRestrictionSetELayout(r, layout)); // Set up device indices/offset arrays if (mtype == CEED_MEM_HOST) { switch (cmode) { - case CEED_OWN_POINTER: - impl->h_ind_allocated = (CeedInt *)indices; - impl->h_ind = (CeedInt *)indices; - break; - case CEED_USE_POINTER: - impl->h_ind = (CeedInt *)indices; - break; - case CEED_COPY_VALUES: - if (indices != NULL) { - ierr = CeedMalloc(elem_size * num_elem, &impl->h_ind_allocated); - 
CeedChkBackend(ierr); - memcpy(impl->h_ind_allocated, indices, elem_size * num_elem * sizeof(CeedInt)); - impl->h_ind = impl->h_ind_allocated; - } - break; + case CEED_OWN_POINTER: + impl->h_ind_allocated = (CeedInt *)indices; + impl->h_ind = (CeedInt *)indices; + break; + case CEED_USE_POINTER: + impl->h_ind = (CeedInt *)indices; + break; + case CEED_COPY_VALUES: + if (indices != NULL) { + CeedCallBackend(CeedMalloc(elem_size * num_elem, &impl->h_ind_allocated)); + memcpy(impl->h_ind_allocated, indices, elem_size * num_elem * sizeof(CeedInt)); + impl->h_ind = impl->h_ind_allocated; + } + break; } if (indices != NULL) { - ierr = hipMalloc( (void **)&impl->d_ind, size * sizeof(CeedInt)); - CeedChk_Hip(ceed, ierr); - impl->d_ind_allocated = impl->d_ind; // We own the device memory - ierr = hipMemcpy(impl->d_ind, indices, size * sizeof(CeedInt), - hipMemcpyHostToDevice); - CeedChk_Hip(ceed, ierr); - ierr = CeedElemRestrictionOffset_Hip(r, indices); CeedChkBackend(ierr); + CeedCallHip(ceed, hipMalloc((void **)&impl->d_ind, size * sizeof(CeedInt))); + impl->d_ind_allocated = impl->d_ind; // We own the device memory + CeedCallHip(ceed, hipMemcpy(impl->d_ind, indices, size * sizeof(CeedInt), hipMemcpyHostToDevice)); + CeedCallBackend(CeedElemRestrictionOffset_Hip(r, indices)); } } else if (mtype == CEED_MEM_DEVICE) { switch (cmode) { - case CEED_COPY_VALUES: - if (indices != NULL) { - ierr = hipMalloc( (void **)&impl->d_ind, size * sizeof(CeedInt)); - CeedChk_Hip(ceed, ierr); - impl->d_ind_allocated = impl->d_ind; // We own the device memory - ierr = hipMemcpy(impl->d_ind, indices, size * sizeof(CeedInt), - hipMemcpyDeviceToDevice); - CeedChk_Hip(ceed, ierr); - } - break; - case CEED_OWN_POINTER: - impl->d_ind = (CeedInt *)indices; - impl->d_ind_allocated = impl->d_ind; - break; - case CEED_USE_POINTER: - impl->d_ind = (CeedInt *)indices; + case CEED_COPY_VALUES: + if (indices != NULL) { + CeedCallHip(ceed, hipMalloc((void **)&impl->d_ind, size * sizeof(CeedInt))); + 
impl->d_ind_allocated = impl->d_ind; // We own the device memory + CeedCallHip(ceed, hipMemcpy(impl->d_ind, indices, size * sizeof(CeedInt), hipMemcpyDeviceToDevice)); + } + break; + case CEED_OWN_POINTER: + impl->d_ind = (CeedInt *)indices; + impl->d_ind_allocated = impl->d_ind; + break; + case CEED_USE_POINTER: + impl->d_ind = (CeedInt *)indices; } if (indices != NULL) { - ierr = CeedMalloc(elem_size * num_elem, &impl->h_ind_allocated); - CeedChkBackend(ierr); - ierr = hipMemcpy(impl->h_ind_allocated, impl->d_ind, - elem_size * num_elem * sizeof(CeedInt), hipMemcpyDeviceToHost); - CeedChk_Hip(ceed, ierr); + CeedCallBackend(CeedMalloc(elem_size * num_elem, &impl->h_ind_allocated)); + CeedCallHip(ceed, hipMemcpy(impl->h_ind_allocated, impl->d_ind, elem_size * num_elem * sizeof(CeedInt), hipMemcpyDeviceToHost)); impl->h_ind = impl->h_ind_allocated; - ierr = CeedElemRestrictionOffset_Hip(r, indices); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionOffset_Hip(r, indices)); } } else { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Only MemType = HOST or DEVICE supported"); + return CeedError(ceed, CEED_ERROR_BACKEND, "Only MemType = HOST or DEVICE supported"); // LCOV_EXCL_STOP } // Compile HIP kernels CeedInt num_nodes = impl->num_nodes; - char *restriction_kernel_path, *restriction_kernel_source; - ierr = CeedGetJitAbsolutePath(ceed, - "ceed/jit-source/hip/hip-ref-restriction.h", - &restriction_kernel_path); CeedChkBackend(ierr); + char *restriction_kernel_path, *restriction_kernel_source; + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction.h", &restriction_kernel_path)); CeedDebug256(ceed, 2, "----- Loading Restriction Kernel Source -----\n"); - ierr = CeedLoadSourceToBuffer(ceed, restriction_kernel_path, - &restriction_kernel_source); - CeedChkBackend(ierr); - CeedDebug256(ceed, 2, - "----- Loading Restriction Kernel Source Complete! 
-----\n"); - ierr = CeedCompileHip(ceed, restriction_kernel_source, &impl->module, 8, - "RESTR_ELEM_SIZE", elem_size, - "RESTR_NUM_ELEM", num_elem, - "RESTR_NUM_COMP", num_comp, - "RESTR_NUM_NODES", num_nodes, - "RESTR_COMP_STRIDE", comp_stride, - "RESTR_STRIDE_NODES", strides[0], - "RESTR_STRIDE_COMP", strides[1], - "RESTR_STRIDE_ELEM", strides[2]); CeedChkBackend(ierr); - ierr = CeedGetKernelHip(ceed, impl->module, "StridedNoTranspose", - &impl->StridedNoTranspose); CeedChkBackend(ierr); - ierr = CeedGetKernelHip(ceed, impl->module, "OffsetNoTranspose", - &impl->OffsetNoTranspose); CeedChkBackend(ierr); - ierr = CeedGetKernelHip(ceed, impl->module, "StridedTranspose", - &impl->StridedTranspose); CeedChkBackend(ierr); - ierr = CeedGetKernelHip(ceed, impl->module, "OffsetTranspose", - &impl->OffsetTranspose); CeedChkBackend(ierr); - ierr = CeedFree(&restriction_kernel_path); CeedChkBackend(ierr); - ierr = CeedFree(&restriction_kernel_source); CeedChkBackend(ierr); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); + CeedDebug256(ceed, 2, "----- Loading Restriction Kernel Source Complete! 
-----\n"); + CeedCallBackend(CeedCompileHip(ceed, restriction_kernel_source, &impl->module, 8, "RESTR_ELEM_SIZE", elem_size, "RESTR_NUM_ELEM", num_elem, + "RESTR_NUM_COMP", num_comp, "RESTR_NUM_NODES", num_nodes, "RESTR_COMP_STRIDE", comp_stride, "RESTR_STRIDE_NODES", + strides[0], "RESTR_STRIDE_COMP", strides[1], "RESTR_STRIDE_ELEM", strides[2])); + CeedCallBackend(CeedGetKernelHip(ceed, impl->module, "StridedNoTranspose", &impl->StridedNoTranspose)); + CeedCallBackend(CeedGetKernelHip(ceed, impl->module, "OffsetNoTranspose", &impl->OffsetNoTranspose)); + CeedCallBackend(CeedGetKernelHip(ceed, impl->module, "StridedTranspose", &impl->StridedTranspose)); + CeedCallBackend(CeedGetKernelHip(ceed, impl->module, "OffsetTranspose", &impl->OffsetTranspose)); + CeedCallBackend(CeedFree(&restriction_kernel_path)); + CeedCallBackend(CeedFree(&restriction_kernel_source)); // Register backend functions - ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "Apply", - CeedElemRestrictionApply_Hip); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "ApplyBlock", - CeedElemRestrictionApplyBlock_Hip); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "GetOffsets", - CeedElemRestrictionGetOffsets_Hip); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "Destroy", - CeedElemRestrictionDestroy_Hip); - CeedChkBackend(ierr); + CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "Apply", CeedElemRestrictionApply_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "ApplyBlock", CeedElemRestrictionApplyBlock_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "GetOffsets", CeedElemRestrictionGetOffsets_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "Destroy", CeedElemRestrictionDestroy_Hip)); return CEED_ERROR_SUCCESS; } 
//------------------------------------------------------------------------------ // Blocked not supported //------------------------------------------------------------------------------ -int CeedElemRestrictionCreateBlocked_Hip(const CeedMemType mtype, - const CeedCopyMode cmode, const CeedInt *indices, CeedElemRestriction r) { - int ierr; +int CeedElemRestrictionCreateBlocked_Hip(const CeedMemType mtype, const CeedCopyMode cmode, const CeedInt *indices, CeedElemRestriction r) { Ceed ceed; - ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); - return CeedError(ceed, CEED_ERROR_BACKEND, - "Backend does not implement blocked restrictions"); + CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); + return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement blocked restrictions"); } //------------------------------------------------------------------------------ diff --git a/backends/hip-ref/ceed-hip-ref-vector.c b/backends/hip-ref/ceed-hip-ref-vector.c index c86a9dd1fc..c91f769bff 100644 --- a/backends/hip-ref/ceed-hip-ref-vector.c +++ b/backends/hip-ref/ceed-hip-ref-vector.c @@ -5,32 +5,30 @@ // // This file is part of CEED: http://github.com/ceed -#include #include +#include #include #include #include -#include "ceed-hip-ref.h" +#include "ceed-hip-ref.h" //------------------------------------------------------------------------------ // Check if host/device sync is needed //------------------------------------------------------------------------------ -static inline int CeedVectorNeedSync_Hip(const CeedVector vec, - CeedMemType mem_type, bool *need_sync) { - int ierr; +static inline int CeedVectorNeedSync_Hip(const CeedVector vec, CeedMemType mem_type, bool *need_sync) { CeedVector_Hip *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(vec, &impl)); bool has_valid_array = false; - ierr = CeedVectorHasValidArray(vec, &has_valid_array); CeedChkBackend(ierr); + 
CeedCallBackend(CeedVectorHasValidArray(vec, &has_valid_array)); switch (mem_type) { - case CEED_MEM_HOST: - *need_sync = has_valid_array && !impl->h_array; - break; - case CEED_MEM_DEVICE: - *need_sync = has_valid_array && !impl->d_array; - break; + case CEED_MEM_HOST: + *need_sync = has_valid_array && !impl->h_array; + break; + case CEED_MEM_DEVICE: + *need_sync = has_valid_array && !impl->d_array; + break; } return CEED_ERROR_SUCCESS; @@ -40,34 +38,31 @@ static inline int CeedVectorNeedSync_Hip(const CeedVector vec, // Sync host to device //------------------------------------------------------------------------------ static inline int CeedVectorSyncH2D_Hip(const CeedVector vec) { - int ierr; Ceed ceed; - ierr = CeedVectorGetCeed(vec, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedVector_Hip *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedSize length; - ierr = CeedVectorGetLength(vec, &length); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetLength(vec, &length)); size_t bytes = length * sizeof(CeedScalar); - if (!impl->h_array) + if (!impl->h_array) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "No valid host data to sync to device"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_BACKEND, "No valid host data to sync to device"); + // LCOV_EXCL_STOP + } if (impl->d_array_borrowed) { impl->d_array = impl->d_array_borrowed; } else if (impl->d_array_owned) { impl->d_array = impl->d_array_owned; } else { - ierr = hipMalloc((void **)&impl->d_array_owned, bytes); - CeedChk_Hip(ceed, ierr); + CeedCallHip(ceed, hipMalloc((void **)&impl->d_array_owned, bytes)); impl->d_array = impl->d_array_owned; } - ierr = hipMemcpy(impl->d_array, impl->h_array, bytes, - hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); + CeedCallHip(ceed, hipMemcpy(impl->d_array, impl->h_array, bytes, hipMemcpyHostToDevice)); return CEED_ERROR_SUCCESS; } 
@@ -76,17 +71,16 @@ static inline int CeedVectorSyncH2D_Hip(const CeedVector vec) { // Sync device to host //------------------------------------------------------------------------------ static inline int CeedVectorSyncD2H_Hip(const CeedVector vec) { - int ierr; Ceed ceed; - ierr = CeedVectorGetCeed(vec, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedVector_Hip *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(vec, &impl)); - if (!impl->d_array) + if (!impl->d_array) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "No valid device data to sync to host"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_BACKEND, "No valid device data to sync to host"); + // LCOV_EXCL_STOP + } if (impl->h_array_borrowed) { impl->h_array = impl->h_array_borrowed; @@ -94,16 +88,15 @@ static inline int CeedVectorSyncD2H_Hip(const CeedVector vec) { impl->h_array = impl->h_array_owned; } else { CeedSize length; - ierr = CeedVectorGetLength(vec, &length); CeedChkBackend(ierr); - ierr = CeedCalloc(length, &impl->h_array_owned); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetLength(vec, &length)); + CeedCallBackend(CeedCalloc(length, &impl->h_array_owned)); impl->h_array = impl->h_array_owned; } CeedSize length; - ierr = CeedVectorGetLength(vec, &length); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetLength(vec, &length)); size_t bytes = length * sizeof(CeedScalar); - ierr = hipMemcpy(impl->h_array, impl->d_array, bytes, - hipMemcpyDeviceToHost); CeedChk_Hip(ceed, ierr); + CeedCallHip(ceed, hipMemcpy(impl->h_array, impl->d_array, bytes, hipMemcpyDeviceToHost)); return CEED_ERROR_SUCCESS; } @@ -111,19 +104,17 @@ static inline int CeedVectorSyncD2H_Hip(const CeedVector vec) { //------------------------------------------------------------------------------ // Sync arrays //------------------------------------------------------------------------------ -static int 
CeedVectorSyncArray_Hip(const CeedVector vec, - CeedMemType mem_type) { - int ierr; +static int CeedVectorSyncArray_Hip(const CeedVector vec, CeedMemType mem_type) { // Check whether device/host sync is needed bool need_sync = false; - ierr = CeedVectorNeedSync_Hip(vec, mem_type, &need_sync); - CeedChkBackend(ierr); - if (!need_sync) - return CEED_ERROR_SUCCESS; + CeedCallBackend(CeedVectorNeedSync_Hip(vec, mem_type, &need_sync)); + if (!need_sync) return CEED_ERROR_SUCCESS; switch (mem_type) { - case CEED_MEM_HOST: return CeedVectorSyncD2H_Hip(vec); - case CEED_MEM_DEVICE: return CeedVectorSyncH2D_Hip(vec); + case CEED_MEM_HOST: + return CeedVectorSyncD2H_Hip(vec); + case CEED_MEM_DEVICE: + return CeedVectorSyncH2D_Hip(vec); } return CEED_ERROR_UNSUPPORTED; } @@ -132,9 +123,8 @@ static int CeedVectorSyncArray_Hip(const CeedVector vec, // Set all pointers as invalid //------------------------------------------------------------------------------ static inline int CeedVectorSetAllInvalid_Hip(const CeedVector vec) { - int ierr; CeedVector_Hip *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(vec, &impl)); impl->h_array = NULL; impl->d_array = NULL; @@ -145,11 +135,9 @@ static inline int CeedVectorSetAllInvalid_Hip(const CeedVector vec) { //------------------------------------------------------------------------------ // Check if CeedVector has any valid pointers //------------------------------------------------------------------------------ -static inline int CeedVectorHasValidArray_Hip(const CeedVector vec, - bool *has_valid_array) { - int ierr; +static inline int CeedVectorHasValidArray_Hip(const CeedVector vec, bool *has_valid_array) { CeedVector_Hip *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(vec, &impl)); *has_valid_array = !!impl->h_array || !!impl->d_array; @@ -159,19 +147,17 @@ static inline int CeedVectorHasValidArray_Hip(const CeedVector vec, 
//------------------------------------------------------------------------------ // Check if has any array of given type //------------------------------------------------------------------------------ -static inline int CeedVectorHasArrayOfType_Hip(const CeedVector vec, - CeedMemType mem_type, bool *has_array_of_type) { - int ierr; +static inline int CeedVectorHasArrayOfType_Hip(const CeedVector vec, CeedMemType mem_type, bool *has_array_of_type) { CeedVector_Hip *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(vec, &impl)); switch (mem_type) { - case CEED_MEM_HOST: - *has_array_of_type = !!impl->h_array_borrowed || !!impl->h_array_owned; - break; - case CEED_MEM_DEVICE: - *has_array_of_type = !!impl->d_array_borrowed || !!impl->d_array_owned; - break; + case CEED_MEM_HOST: + *has_array_of_type = !!impl->h_array_borrowed || !!impl->h_array_owned; + break; + case CEED_MEM_DEVICE: + *has_array_of_type = !!impl->d_array_borrowed || !!impl->d_array_owned; + break; } return CEED_ERROR_SUCCESS; @@ -180,19 +166,17 @@ static inline int CeedVectorHasArrayOfType_Hip(const CeedVector vec, //------------------------------------------------------------------------------ // Check if has borrowed array of given type //------------------------------------------------------------------------------ -static inline int CeedVectorHasBorrowedArrayOfType_Hip(const CeedVector vec, - CeedMemType mem_type, bool *has_borrowed_array_of_type) { - int ierr; +static inline int CeedVectorHasBorrowedArrayOfType_Hip(const CeedVector vec, CeedMemType mem_type, bool *has_borrowed_array_of_type) { CeedVector_Hip *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(vec, &impl)); switch (mem_type) { - case CEED_MEM_HOST: - *has_borrowed_array_of_type = !!impl->h_array_borrowed; - break; - case CEED_MEM_DEVICE: - *has_borrowed_array_of_type = !!impl->d_array_borrowed; - break; + case 
CEED_MEM_HOST: + *has_borrowed_array_of_type = !!impl->h_array_borrowed; + break; + case CEED_MEM_DEVICE: + *has_borrowed_array_of_type = !!impl->d_array_borrowed; + break; } return CEED_ERROR_SUCCESS; @@ -201,39 +185,37 @@ static inline int CeedVectorHasBorrowedArrayOfType_Hip(const CeedVector vec, //------------------------------------------------------------------------------ // Set array from host //------------------------------------------------------------------------------ -static int CeedVectorSetArrayHost_Hip(const CeedVector vec, - const CeedCopyMode copy_mode, CeedScalar *array) { - int ierr; +static int CeedVectorSetArrayHost_Hip(const CeedVector vec, const CeedCopyMode copy_mode, CeedScalar *array) { CeedVector_Hip *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(vec, &impl)); switch (copy_mode) { - case CEED_COPY_VALUES: { - CeedSize length; - if (!impl->h_array_owned) { - ierr = CeedVectorGetLength(vec, &length); CeedChkBackend(ierr); - ierr = CeedMalloc(length, &impl->h_array_owned); CeedChkBackend(ierr); - } - impl->h_array_borrowed = NULL; - impl->h_array = impl->h_array_owned; - if (array) { + case CEED_COPY_VALUES: { CeedSize length; - ierr = CeedVectorGetLength(vec, &length); CeedChkBackend(ierr); - size_t bytes = length * sizeof(CeedScalar); - memcpy(impl->h_array, array, bytes); - } - } break; - case CEED_OWN_POINTER: - ierr = CeedFree(&impl->h_array_owned); CeedChkBackend(ierr); - impl->h_array_owned = array; - impl->h_array_borrowed = NULL; - impl->h_array = array; - break; - case CEED_USE_POINTER: - ierr = CeedFree(&impl->h_array_owned); CeedChkBackend(ierr); - impl->h_array_borrowed = array; - impl->h_array = array; - break; + if (!impl->h_array_owned) { + CeedCallBackend(CeedVectorGetLength(vec, &length)); + CeedCallBackend(CeedMalloc(length, &impl->h_array_owned)); + } + impl->h_array_borrowed = NULL; + impl->h_array = impl->h_array_owned; + if (array) { + CeedSize length; + 
CeedCallBackend(CeedVectorGetLength(vec, &length)); + size_t bytes = length * sizeof(CeedScalar); + memcpy(impl->h_array, array, bytes); + } + } break; + case CEED_OWN_POINTER: + CeedCallBackend(CeedFree(&impl->h_array_owned)); + impl->h_array_owned = array; + impl->h_array_borrowed = NULL; + impl->h_array = array; + break; + case CEED_USE_POINTER: + CeedCallBackend(CeedFree(&impl->h_array_owned)); + impl->h_array_borrowed = array; + impl->h_array = array; + break; } return CEED_ERROR_SUCCESS; @@ -242,42 +224,38 @@ static int CeedVectorSetArrayHost_Hip(const CeedVector vec, //------------------------------------------------------------------------------ // Set array from device //------------------------------------------------------------------------------ -static int CeedVectorSetArrayDevice_Hip(const CeedVector vec, - const CeedCopyMode copy_mode, CeedScalar *array) { - int ierr; +static int CeedVectorSetArrayDevice_Hip(const CeedVector vec, const CeedCopyMode copy_mode, CeedScalar *array) { Ceed ceed; - ierr = CeedVectorGetCeed(vec, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedVector_Hip *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(vec, &impl)); switch (copy_mode) { - case CEED_COPY_VALUES: { - CeedSize length; - ierr = CeedVectorGetLength(vec, &length); CeedChkBackend(ierr); - size_t bytes = length * sizeof(CeedScalar); - if (!impl->d_array_owned) { - ierr = hipMalloc((void **)&impl->d_array_owned, bytes); - CeedChk_Hip(ceed, ierr); - } - impl->d_array_borrowed = NULL; - impl->d_array = impl->d_array_owned; - if (array) { - ierr = hipMemcpy(impl->d_array, array, bytes, - hipMemcpyDeviceToDevice); CeedChk_Hip(ceed, ierr); - } - } break; - case CEED_OWN_POINTER: - ierr = hipFree(impl->d_array_owned); CeedChk_Hip(ceed, ierr); - impl->d_array_owned = array; - impl->d_array_borrowed = NULL; - impl->d_array = array; - break; - case CEED_USE_POINTER: - ierr = 
hipFree(impl->d_array_owned); CeedChk_Hip(ceed, ierr); - impl->d_array_owned = NULL; - impl->d_array_borrowed = array; - impl->d_array = array; - break; + case CEED_COPY_VALUES: { + CeedSize length; + CeedCallBackend(CeedVectorGetLength(vec, &length)); + size_t bytes = length * sizeof(CeedScalar); + if (!impl->d_array_owned) { + CeedCallHip(ceed, hipMalloc((void **)&impl->d_array_owned, bytes)); + } + impl->d_array_borrowed = NULL; + impl->d_array = impl->d_array_owned; + if (array) { + CeedCallHip(ceed, hipMemcpy(impl->d_array, array, bytes, hipMemcpyDeviceToDevice)); + } + } break; + case CEED_OWN_POINTER: + CeedCallHip(ceed, hipFree(impl->d_array_owned)); + impl->d_array_owned = array; + impl->d_array_borrowed = NULL; + impl->d_array = array; + break; + case CEED_USE_POINTER: + CeedCallHip(ceed, hipFree(impl->d_array_owned)); + impl->d_array_owned = NULL; + impl->d_array_borrowed = array; + impl->d_array = array; + break; } return CEED_ERROR_SUCCESS; @@ -287,21 +265,18 @@ static int CeedVectorSetArrayDevice_Hip(const CeedVector vec, // Set the array used by a vector, // freeing any previously allocated array if applicable //------------------------------------------------------------------------------ -static int CeedVectorSetArray_Hip(const CeedVector vec, - const CeedMemType mem_type, - const CeedCopyMode copy_mode, CeedScalar *array) { - int ierr; +static int CeedVectorSetArray_Hip(const CeedVector vec, const CeedMemType mem_type, const CeedCopyMode copy_mode, CeedScalar *array) { Ceed ceed; - ierr = CeedVectorGetCeed(vec, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedVector_Hip *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(vec, &impl)); - ierr = CeedVectorSetAllInvalid_Hip(vec); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorSetAllInvalid_Hip(vec)); switch (mem_type) { - case CEED_MEM_HOST: - return CeedVectorSetArrayHost_Hip(vec, copy_mode, array); - case 
CEED_MEM_DEVICE: - return CeedVectorSetArrayDevice_Hip(vec, copy_mode, array); + case CEED_MEM_HOST: + return CeedVectorSetArrayHost_Hip(vec, copy_mode, array); + case CEED_MEM_DEVICE: + return CeedVectorSetArrayDevice_Hip(vec, copy_mode, array); } return CEED_ERROR_UNSUPPORTED; @@ -310,10 +285,8 @@ static int CeedVectorSetArray_Hip(const CeedVector vec, //------------------------------------------------------------------------------ // Set host array to value //------------------------------------------------------------------------------ -static int CeedHostSetValue_Hip(CeedScalar *h_array, CeedInt length, - CeedScalar val) { - for (int i = 0; i < length; i++) - h_array[i] = val; +static int CeedHostSetValue_Hip(CeedScalar *h_array, CeedInt length, CeedScalar val) { + for (int i = 0; i < length; i++) h_array[i] = val; return CEED_ERROR_SUCCESS; } @@ -326,13 +299,12 @@ int CeedDeviceSetValue_Hip(CeedScalar *d_array, CeedInt length, CeedScalar val); // Set a vector to a value, //------------------------------------------------------------------------------ static int CeedVectorSetValue_Hip(CeedVector vec, CeedScalar val) { - int ierr; Ceed ceed; - ierr = CeedVectorGetCeed(vec, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedVector_Hip *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedSize length; - ierr = CeedVectorGetLength(vec, &length); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetLength(vec, &length)); // Set value for synced device/host array if (!impl->d_array && !impl->h_array) { @@ -345,15 +317,14 @@ static int CeedVectorSetValue_Hip(CeedVector vec, CeedScalar val) { } else if (impl->h_array_owned) { impl->h_array = impl->h_array_owned; } else { - ierr = CeedVectorSetArray(vec, CEED_MEM_DEVICE, CEED_COPY_VALUES, NULL); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorSetArray(vec, CEED_MEM_DEVICE, CEED_COPY_VALUES, NULL)); } } if 
(impl->d_array) { - ierr = CeedDeviceSetValue_Hip(impl->d_array, length, val); CeedChkBackend(ierr); + CeedCallBackend(CeedDeviceSetValue_Hip(impl->d_array, length, val)); } if (impl->h_array) { - ierr = CeedHostSetValue_Hip(impl->h_array, length, val); CeedChkBackend(ierr); + CeedCallBackend(CeedHostSetValue_Hip(impl->h_array, length, val)); } return CEED_ERROR_SUCCESS; @@ -362,29 +333,27 @@ static int CeedVectorSetValue_Hip(CeedVector vec, CeedScalar val) { //------------------------------------------------------------------------------ // Vector Take Array //------------------------------------------------------------------------------ -static int CeedVectorTakeArray_Hip(CeedVector vec, CeedMemType mem_type, - CeedScalar **array) { - int ierr; +static int CeedVectorTakeArray_Hip(CeedVector vec, CeedMemType mem_type, CeedScalar **array) { Ceed ceed; - ierr = CeedVectorGetCeed(vec, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedVector_Hip *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(vec, &impl)); // Sync array to requested mem_type - ierr = CeedVectorSyncArray(vec, mem_type); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorSyncArray(vec, mem_type)); // Update pointer switch (mem_type) { - case CEED_MEM_HOST: - (*array) = impl->h_array_borrowed; - impl->h_array_borrowed = NULL; - impl->h_array = NULL; - break; - case CEED_MEM_DEVICE: - (*array) = impl->d_array_borrowed; - impl->d_array_borrowed = NULL; - impl->d_array = NULL; - break; + case CEED_MEM_HOST: + (*array) = impl->h_array_borrowed; + impl->h_array_borrowed = NULL; + impl->h_array = NULL; + break; + case CEED_MEM_DEVICE: + (*array) = impl->d_array_borrowed; + impl->d_array_borrowed = NULL; + impl->d_array = NULL; + break; } return CEED_ERROR_SUCCESS; @@ -394,25 +363,23 @@ static int CeedVectorTakeArray_Hip(CeedVector vec, CeedMemType mem_type, // Core logic for array syncronization for GetArray. 
// If a different memory type is most up to date, this will perform a copy //------------------------------------------------------------------------------ -static int CeedVectorGetArrayCore_Hip(const CeedVector vec, - const CeedMemType mem_type, CeedScalar **array) { - int ierr; +static int CeedVectorGetArrayCore_Hip(const CeedVector vec, const CeedMemType mem_type, CeedScalar **array) { Ceed ceed; - ierr = CeedVectorGetCeed(vec, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedVector_Hip *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(vec, &impl)); // Sync array to requested mem_type - ierr = CeedVectorSyncArray(vec, mem_type); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorSyncArray(vec, mem_type)); // Update pointer switch (mem_type) { - case CEED_MEM_HOST: - *array = impl->h_array; - break; - case CEED_MEM_DEVICE: - *array = impl->d_array; - break; + case CEED_MEM_HOST: + *array = impl->h_array; + break; + case CEED_MEM_DEVICE: + *array = impl->d_array; + break; } return CEED_ERROR_SUCCESS; @@ -421,31 +388,27 @@ static int CeedVectorGetArrayCore_Hip(const CeedVector vec, //------------------------------------------------------------------------------ // Get read-only access to a vector via the specified mem_type //------------------------------------------------------------------------------ -static int CeedVectorGetArrayRead_Hip(const CeedVector vec, - const CeedMemType mem_type, const CeedScalar **array) { +static int CeedVectorGetArrayRead_Hip(const CeedVector vec, const CeedMemType mem_type, const CeedScalar **array) { return CeedVectorGetArrayCore_Hip(vec, mem_type, (CeedScalar **)array); } //------------------------------------------------------------------------------ // Get read/write access to a vector via the specified mem_type //------------------------------------------------------------------------------ -static int CeedVectorGetArray_Hip(const 
CeedVector vec, - const CeedMemType mem_type, - CeedScalar **array) { - int ierr; +static int CeedVectorGetArray_Hip(const CeedVector vec, const CeedMemType mem_type, CeedScalar **array) { CeedVector_Hip *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(vec, &impl)); - ierr = CeedVectorGetArrayCore_Hip(vec, mem_type, array); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayCore_Hip(vec, mem_type, array)); - ierr = CeedVectorSetAllInvalid_Hip(vec); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorSetAllInvalid_Hip(vec)); switch (mem_type) { - case CEED_MEM_HOST: - impl->h_array = *array; - break; - case CEED_MEM_DEVICE: - impl->d_array = *array; - break; + case CEED_MEM_HOST: + impl->h_array = *array; + break; + case CEED_MEM_DEVICE: + impl->d_array = *array; + break; } return CEED_ERROR_SUCCESS; @@ -454,33 +417,25 @@ static int CeedVectorGetArray_Hip(const CeedVector vec, //------------------------------------------------------------------------------ // Get write access to a vector via the specified mem_type //------------------------------------------------------------------------------ -static int CeedVectorGetArrayWrite_Hip(const CeedVector vec, - const CeedMemType mem_type, CeedScalar **array) { - int ierr; +static int CeedVectorGetArrayWrite_Hip(const CeedVector vec, const CeedMemType mem_type, CeedScalar **array) { CeedVector_Hip *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(vec, &impl)); bool has_array_of_type = true; - ierr = CeedVectorHasArrayOfType_Hip(vec, mem_type, &has_array_of_type); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorHasArrayOfType_Hip(vec, mem_type, &has_array_of_type)); if (!has_array_of_type) { // Allocate if array is not yet allocated - ierr = CeedVectorSetArray(vec, mem_type, CEED_COPY_VALUES, NULL); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorSetArray(vec, mem_type, CEED_COPY_VALUES, NULL)); } 
else { // Select dirty array switch (mem_type) { - case CEED_MEM_HOST: - if (impl->h_array_borrowed) - impl->h_array = impl->h_array_borrowed; - else - impl->h_array = impl->h_array_owned; - break; - case CEED_MEM_DEVICE: - if (impl->d_array_borrowed) - impl->d_array = impl->d_array_borrowed; - else - impl->d_array = impl->d_array_owned; + case CEED_MEM_HOST: + if (impl->h_array_borrowed) impl->h_array = impl->h_array_borrowed; + else impl->h_array = impl->h_array_owned; + break; + case CEED_MEM_DEVICE: + if (impl->d_array_borrowed) impl->d_array = impl->d_array_borrowed; + else impl->d_array = impl->d_array_owned; } } @@ -490,57 +445,50 @@ static int CeedVectorGetArrayWrite_Hip(const CeedVector vec, //------------------------------------------------------------------------------ // Get the norm of a CeedVector //------------------------------------------------------------------------------ -static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, - CeedScalar *norm) { - int ierr; +static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *norm) { Ceed ceed; - ierr = CeedVectorGetCeed(vec, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedVector_Hip *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedSize length; - ierr = CeedVectorGetLength(vec, &length); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetLength(vec, &length)); hipblasHandle_t handle; - ierr = CeedHipGetHipblasHandle(ceed, &handle); CeedChkBackend(ierr); + CeedCallBackend(CeedHipGetHipblasHandle(ceed, &handle)); // Compute norm const CeedScalar *d_array; - ierr = CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &d_array); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &d_array)); switch (type) { - case CEED_NORM_1: { - if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { - ierr = hipblasSasum(handle, length, (float *) d_array, 1, (float *) 
norm); - } else { - ierr = hipblasDasum(handle, length, (double *) d_array, 1, (double *) norm); + case CEED_NORM_1: { + if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { + CeedCallHipblas(ceed, hipblasSasum(handle, length, (float *)d_array, 1, (float *)norm)); + } else { + CeedCallHipblas(ceed, hipblasDasum(handle, length, (double *)d_array, 1, (double *)norm)); + } + break; } - CeedChk_Hipblas(ceed, ierr); - break; - } - case CEED_NORM_2: { - if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { - ierr = hipblasSnrm2(handle, length, (float *) d_array, 1, (float *) norm); - } else { - ierr = hipblasDnrm2(handle, length, (double *) d_array, 1, (double *) norm); + case CEED_NORM_2: { + if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { + CeedCallHipblas(ceed, hipblasSnrm2(handle, length, (float *)d_array, 1, (float *)norm)); + } else { + CeedCallHipblas(ceed, hipblasDnrm2(handle, length, (double *)d_array, 1, (double *)norm)); + } + break; } - CeedChk_Hipblas(ceed, ierr); - break; - } - case CEED_NORM_MAX: { - CeedInt indx; - if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { - ierr = hipblasIsamax(handle, length, (float *) d_array, 1, &indx); - } else { - ierr = hipblasIdamax(handle, length, (double *) d_array, 1, &indx); + case CEED_NORM_MAX: { + CeedInt indx; + if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { + CeedCallHipblas(ceed, hipblasIsamax(handle, length, (float *)d_array, 1, &indx)); + } else { + CeedCallHipblas(ceed, hipblasIdamax(handle, length, (double *)d_array, 1, &indx)); + } + CeedScalar normNoAbs; + CeedCallHip(ceed, hipMemcpy(&normNoAbs, impl->d_array + indx - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost)); + *norm = fabs(normNoAbs); + break; } - CeedChk_Hipblas(ceed, ierr); - CeedScalar normNoAbs; - ierr = hipMemcpy(&normNoAbs, impl->d_array+indx-1, sizeof(CeedScalar), - hipMemcpyDeviceToHost); CeedChk_Hip(ceed, ierr); - *norm = fabs(normNoAbs); - break; - } } - ierr = CeedVectorRestoreArrayRead(vec, &d_array); CeedChkBackend(ierr); + 
CeedCallBackend(CeedVectorRestoreArrayRead(vec, &d_array)); return CEED_ERROR_SUCCESS; } @@ -549,9 +497,9 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, // Take reciprocal of a vector on host //------------------------------------------------------------------------------ static int CeedHostReciprocal_Hip(CeedScalar *h_array, CeedInt length) { - for (int i = 0; i < length; i++) - if (fabs(h_array[i]) > CEED_EPSILON) - h_array[i] = 1./h_array[i]; + for (int i = 0; i < length; i++) { + if (fabs(h_array[i]) > CEED_EPSILON) h_array[i] = 1. / h_array[i]; + } return CEED_ERROR_SUCCESS; } @@ -564,21 +512,16 @@ int CeedDeviceReciprocal_Hip(CeedScalar *d_array, CeedInt length); // Take reciprocal of a vector //------------------------------------------------------------------------------ static int CeedVectorReciprocal_Hip(CeedVector vec) { - int ierr; Ceed ceed; - ierr = CeedVectorGetCeed(vec, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedVector_Hip *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedSize length; - ierr = CeedVectorGetLength(vec, &length); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetLength(vec, &length)); // Set value for synced device/host array - if (impl->d_array) { - ierr = CeedDeviceReciprocal_Hip(impl->d_array, length); CeedChkBackend(ierr); - } - if (impl->h_array) { - ierr = CeedHostReciprocal_Hip(impl->h_array, length); CeedChkBackend(ierr); - } + if (impl->d_array) CeedCallBackend(CeedDeviceReciprocal_Hip(impl->d_array, length)); + if (impl->h_array) CeedCallBackend(CeedHostReciprocal_Hip(impl->h_array, length)); return CEED_ERROR_SUCCESS; } @@ -586,39 +529,30 @@ static int CeedVectorReciprocal_Hip(CeedVector vec) { //------------------------------------------------------------------------------ // Compute x = alpha x on the host //------------------------------------------------------------------------------ 
-static int CeedHostScale_Hip(CeedScalar *x_array, CeedScalar alpha, - CeedInt length) { - for (int i = 0; i < length; i++) - x_array[i] *= alpha; +static int CeedHostScale_Hip(CeedScalar *x_array, CeedScalar alpha, CeedInt length) { + for (int i = 0; i < length; i++) x_array[i] *= alpha; return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Compute x = alpha x on device (impl in .cu file) //------------------------------------------------------------------------------ -int CeedDeviceScale_Hip(CeedScalar *x_array, CeedScalar alpha, - CeedInt length); +int CeedDeviceScale_Hip(CeedScalar *x_array, CeedScalar alpha, CeedInt length); //------------------------------------------------------------------------------ // Compute x = alpha x //------------------------------------------------------------------------------ static int CeedVectorScale_Hip(CeedVector x, CeedScalar alpha) { - int ierr; Ceed ceed; - ierr = CeedVectorGetCeed(x, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetCeed(x, &ceed)); CeedVector_Hip *x_impl; - ierr = CeedVectorGetData(x, &x_impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(x, &x_impl)); CeedSize length; - ierr = CeedVectorGetLength(x, &length); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetLength(x, &length)); // Set value for synced device/host array - if (x_impl->d_array) { - ierr = CeedDeviceScale_Hip(x_impl->d_array, alpha, length); - CeedChkBackend(ierr); - } - if (x_impl->h_array) { - ierr = CeedHostScale_Hip(x_impl->h_array, alpha, length); CeedChkBackend(ierr); - } + if (x_impl->d_array) CeedCallBackend(CeedDeviceScale_Hip(x_impl->d_array, alpha, length)); + if (x_impl->h_array) CeedCallBackend(CeedHostScale_Hip(x_impl->h_array, alpha, length)); return CEED_ERROR_SUCCESS; } @@ -626,42 +560,36 @@ static int CeedVectorScale_Hip(CeedVector x, CeedScalar alpha) { //------------------------------------------------------------------------------ // 
Compute y = alpha x + y on the host //------------------------------------------------------------------------------ -static int CeedHostAXPY_Hip(CeedScalar *y_array, CeedScalar alpha, - CeedScalar *x_array, CeedInt length) { - for (int i = 0; i < length; i++) - y_array[i] += alpha * x_array[i]; +static int CeedHostAXPY_Hip(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedInt length) { + for (int i = 0; i < length; i++) y_array[i] += alpha * x_array[i]; return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Compute y = alpha x + y on device (impl in .cu file) //------------------------------------------------------------------------------ -int CeedDeviceAXPY_Hip(CeedScalar *y_array, CeedScalar alpha, - CeedScalar *x_array, CeedInt length); +int CeedDeviceAXPY_Hip(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedInt length); //------------------------------------------------------------------------------ // Compute y = alpha x + y //------------------------------------------------------------------------------ static int CeedVectorAXPY_Hip(CeedVector y, CeedScalar alpha, CeedVector x) { - int ierr; Ceed ceed; - ierr = CeedVectorGetCeed(y, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetCeed(y, &ceed)); CeedVector_Hip *y_impl, *x_impl; - ierr = CeedVectorGetData(y, &y_impl); CeedChkBackend(ierr); - ierr = CeedVectorGetData(x, &x_impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(y, &y_impl)); + CeedCallBackend(CeedVectorGetData(x, &x_impl)); CeedSize length; - ierr = CeedVectorGetLength(y, &length); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetLength(y, &length)); // Set value for synced device/host array if (y_impl->d_array) { - ierr = CeedVectorSyncArray(x, CEED_MEM_DEVICE); CeedChkBackend(ierr); - ierr = CeedDeviceAXPY_Hip(y_impl->d_array, alpha, x_impl->d_array, length); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorSyncArray(x, 
CEED_MEM_DEVICE)); + CeedCallBackend(CeedDeviceAXPY_Hip(y_impl->d_array, alpha, x_impl->d_array, length)); } if (y_impl->h_array) { - ierr = CeedVectorSyncArray(x, CEED_MEM_HOST); CeedChkBackend(ierr); - ierr = CeedHostAXPY_Hip(y_impl->h_array, alpha, x_impl->h_array, length); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_HOST)); + CeedCallBackend(CeedHostAXPY_Hip(y_impl->h_array, alpha, x_impl->h_array, length)); } return CEED_ERROR_SUCCESS; @@ -670,51 +598,42 @@ static int CeedVectorAXPY_Hip(CeedVector y, CeedScalar alpha, CeedVector x) { //------------------------------------------------------------------------------ // Compute the pointwise multiplication w = x .* y on the host //------------------------------------------------------------------------------ -static int CeedHostPointwiseMult_Hip(CeedScalar *w_array, CeedScalar *x_array, - CeedScalar *y_array, CeedInt length) { - for (int i = 0; i < length; i++) - w_array[i] = x_array[i] * y_array[i]; +static int CeedHostPointwiseMult_Hip(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedInt length) { + for (int i = 0; i < length; i++) w_array[i] = x_array[i] * y_array[i]; return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Compute the pointwise multiplication w = x .* y on device (impl in .cu file) //------------------------------------------------------------------------------ -int CeedDevicePointwiseMult_Hip(CeedScalar *w_array, CeedScalar *x_array, - CeedScalar *y_array, CeedInt length); +int CeedDevicePointwiseMult_Hip(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedInt length); //------------------------------------------------------------------------------ // Compute the pointwise multiplication w = x .* y //------------------------------------------------------------------------------ -static int CeedVectorPointwiseMult_Hip(CeedVector w, CeedVector x, - CeedVector y) { - int ierr; 
+static int CeedVectorPointwiseMult_Hip(CeedVector w, CeedVector x, CeedVector y) { Ceed ceed; - ierr = CeedVectorGetCeed(w, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetCeed(w, &ceed)); CeedVector_Hip *w_impl, *x_impl, *y_impl; - ierr = CeedVectorGetData(w, &w_impl); CeedChkBackend(ierr); - ierr = CeedVectorGetData(x, &x_impl); CeedChkBackend(ierr); - ierr = CeedVectorGetData(y, &y_impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(w, &w_impl)); + CeedCallBackend(CeedVectorGetData(x, &x_impl)); + CeedCallBackend(CeedVectorGetData(y, &y_impl)); CeedSize length; - ierr = CeedVectorGetLength(w, &length); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetLength(w, &length)); // Set value for synced device/host array if (!w_impl->d_array && !w_impl->h_array) { - ierr = CeedVectorSetValue(w, 0.0); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorSetValue(w, 0.0)); } if (w_impl->d_array) { - ierr = CeedVectorSyncArray(x, CEED_MEM_DEVICE); CeedChkBackend(ierr); - ierr = CeedVectorSyncArray(y, CEED_MEM_DEVICE); CeedChkBackend(ierr); - ierr = CeedDevicePointwiseMult_Hip(w_impl->d_array, x_impl->d_array, - y_impl->d_array, length); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_DEVICE)); + CeedCallBackend(CeedVectorSyncArray(y, CEED_MEM_DEVICE)); + CeedCallBackend(CeedDevicePointwiseMult_Hip(w_impl->d_array, x_impl->d_array, y_impl->d_array, length)); } if (w_impl->h_array) { - ierr = CeedVectorSyncArray(x, CEED_MEM_HOST); CeedChkBackend(ierr); - ierr = CeedVectorSyncArray(y, CEED_MEM_HOST); CeedChkBackend(ierr); - ierr = CeedHostPointwiseMult_Hip(w_impl->h_array, x_impl->h_array, - y_impl->h_array, length); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_HOST)); + CeedCallBackend(CeedVectorSyncArray(y, CEED_MEM_HOST)); + CeedCallBackend(CeedHostPointwiseMult_Hip(w_impl->h_array, x_impl->h_array, y_impl->h_array, length)); } return CEED_ERROR_SUCCESS; @@ -724,15 +643,14 @@ static int 
CeedVectorPointwiseMult_Hip(CeedVector w, CeedVector x, // Destroy the vector //------------------------------------------------------------------------------ static int CeedVectorDestroy_Hip(const CeedVector vec) { - int ierr; Ceed ceed; - ierr = CeedVectorGetCeed(vec, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedVector_Hip *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(vec, &impl)); - ierr = hipFree(impl->d_array_owned); CeedChk_Hip(ceed, ierr); - ierr = CeedFree(&impl->h_array_owned); CeedChkBackend(ierr); - ierr = CeedFree(&impl); CeedChkBackend(ierr); + CeedCallHip(ceed, hipFree(impl->d_array_owned)); + CeedCallBackend(CeedFree(&impl->h_array_owned)); + CeedCallBackend(CeedFree(&impl)); return CEED_ERROR_SUCCESS; } @@ -742,44 +660,27 @@ static int CeedVectorDestroy_Hip(const CeedVector vec) { //------------------------------------------------------------------------------ int CeedVectorCreate_Hip(CeedSize n, CeedVector vec) { CeedVector_Hip *impl; - int ierr; - Ceed ceed; - ierr = CeedVectorGetCeed(vec, &ceed); CeedChkBackend(ierr); - - ierr = CeedSetBackendFunction(ceed, "Vector", vec, "HasValidArray", - CeedVectorHasValidArray_Hip); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Vector", vec, "HasBorrowedArrayOfType", - CeedVectorHasBorrowedArrayOfType_Hip); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Vector", vec, "SetArray", - CeedVectorSetArray_Hip); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Vector", vec, "TakeArray", - CeedVectorTakeArray_Hip); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Vector", vec, "SetValue", - (int (*)())(CeedVectorSetValue_Hip)); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Vector", vec, "SyncArray", - CeedVectorSyncArray_Hip); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Vector", vec, "GetArray", - CeedVectorGetArray_Hip); 
CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayRead", - CeedVectorGetArrayRead_Hip); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayWrite", - CeedVectorGetArrayWrite_Hip); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Vector", vec, "Norm", - CeedVectorNorm_Hip); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Vector", vec, "Reciprocal", - CeedVectorReciprocal_Hip); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Vector", vec, "Scale", - (int (*)())(CeedVectorScale_Hip)); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Vector", vec, "AXPY", - (int (*)())(CeedVectorAXPY_Hip)); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Vector", vec, "PointwiseMult", - CeedVectorPointwiseMult_Hip); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Vector", vec, "Destroy", - CeedVectorDestroy_Hip); CeedChkBackend(ierr); - - ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr); - ierr = CeedVectorSetData(vec, impl); CeedChkBackend(ierr); + Ceed ceed; + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasValidArray", CeedVectorHasValidArray_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasBorrowedArrayOfType", CeedVectorHasBorrowedArrayOfType_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetArray", CeedVectorSetArray_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "TakeArray", CeedVectorTakeArray_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetValue", (int (*)())(CeedVectorSetValue_Hip))); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SyncArray", CeedVectorSyncArray_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArray", CeedVectorGetArray_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayRead", 
CeedVectorGetArrayRead_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayWrite", CeedVectorGetArrayWrite_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Norm", CeedVectorNorm_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Reciprocal", CeedVectorReciprocal_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Scale", (int (*)())(CeedVectorScale_Hip))); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPY", (int (*)())(CeedVectorAXPY_Hip))); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "PointwiseMult", CeedVectorPointwiseMult_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Destroy", CeedVectorDestroy_Hip)); + + CeedCallBackend(CeedCalloc(1, &impl)); + CeedCallBackend(CeedVectorSetData(vec, impl)); return CEED_ERROR_SUCCESS; } diff --git a/backends/hip-ref/ceed-hip-ref.c b/backends/hip-ref/ceed-hip-ref.c index 0750078d96..accd6efb3f 100644 --- a/backends/hip-ref/ceed-hip-ref.c +++ b/backends/hip-ref/ceed-hip-ref.c @@ -5,11 +5,12 @@ // // This file is part of CEED: http://github.com/ceed -#include +#include "ceed-hip-ref.h" + #include -#include +#include #include -#include "ceed-hip-ref.h" +#include //------------------------------------------------------------------------------ // HIP preferred MemType @@ -23,13 +24,10 @@ static int CeedGetPreferredMemType_Hip(CeedMemType *type) { // Get hipBLAS handle //------------------------------------------------------------------------------ int CeedHipGetHipblasHandle(Ceed ceed, hipblasHandle_t *handle) { - int ierr; Ceed_Hip *data; - ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr); + CeedCallBackend(CeedGetData(ceed, &data)); - if (!data->hipblas_handle) { - ierr = hipblasCreate(&data->hipblas_handle); CeedChk_Hipblas(ceed, ierr); - } + if (!data->hipblas_handle) CeedCallHipblas(ceed, hipblasCreate(&data->hipblas_handle)); *handle = data->hipblas_handle; return 
CEED_ERROR_SUCCESS; } @@ -38,53 +36,36 @@ int CeedHipGetHipblasHandle(Ceed ceed, hipblasHandle_t *handle) { // Backend Init //------------------------------------------------------------------------------ static int CeedInit_Hip(const char *resource, Ceed ceed) { - int ierr; - char *resource_root; - ierr = CeedHipGetResourceRoot(ceed, resource, &resource_root); - CeedChkBackend(ierr); - if (strcmp(resource_root, "/gpu/hip/ref")) + CeedCallBackend(CeedHipGetResourceRoot(ceed, resource, &resource_root)); + if (strcmp(resource_root, "/gpu/hip/ref")) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Hip backend cannot use resource: %s", resource); - // LCOV_EXCL_STOP - ierr = CeedFree(&resource_root); CeedChkBackend(ierr); - ierr = CeedSetDeterministic(ceed, true); CeedChkBackend(ierr); + return CeedError(ceed, CEED_ERROR_BACKEND, "Hip backend cannot use resource: %s", resource); + // LCOV_EXCL_STOP + } + CeedCallBackend(CeedFree(&resource_root)); + CeedCallBackend(CeedSetDeterministic(ceed, true)); Ceed_Hip *data; - ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); - ierr = CeedSetData(ceed, data); CeedChkBackend(ierr); - ierr = CeedHipInit(ceed, resource); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(1, &data)); + CeedCallBackend(CeedSetData(ceed, data)); + CeedCallBackend(CeedHipInit(ceed, resource)); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "GetPreferredMemType", - CeedGetPreferredMemType_Hip); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "VectorCreate", - CeedVectorCreate_Hip); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateTensorH1", - CeedBasisCreateTensorH1_Hip); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateH1", - CeedBasisCreateH1_Hip); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "ElemRestrictionCreate", - CeedElemRestrictionCreate_Hip); CeedChkBackend(ierr); - ierr = 
CeedSetBackendFunction(ceed, "Ceed", ceed, - "ElemRestrictionCreateBlocked", - CeedElemRestrictionCreateBlocked_Hip); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", - CeedQFunctionCreate_Hip); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionContextCreate", - CeedQFunctionContextCreate_Hip); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", - CeedOperatorCreate_Hip); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", - CeedDestroy_Hip); CeedChkBackend(ierr); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "GetPreferredMemType", CeedGetPreferredMemType_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "VectorCreate", CeedVectorCreate_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateTensorH1", CeedBasisCreateTensorH1_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateH1", CeedBasisCreateH1_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "ElemRestrictionCreate", CeedElemRestrictionCreate_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "ElemRestrictionCreateBlocked", CeedElemRestrictionCreateBlocked_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionContextCreate", CeedQFunctionContextCreate_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Hip)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Backend Register //------------------------------------------------------------------------------ -CEED_INTERN int CeedRegister_Hip(void) { - return 
CeedRegister("/gpu/hip/ref", CeedInit_Hip, 40); -} +CEED_INTERN int CeedRegister_Hip(void) { return CeedRegister("/gpu/hip/ref", CeedInit_Hip, 40); } //------------------------------------------------------------------------------ diff --git a/backends/hip-ref/ceed-hip-ref.h b/backends/hip-ref/ceed-hip-ref.h index ff55d43150..d51f49aa87 100644 --- a/backends/hip-ref/ceed-hip-ref.h +++ b/backends/hip-ref/ceed-hip-ref.h @@ -8,9 +8,10 @@ #ifndef _ceed_hip_h #define _ceed_hip_h -#include #include +#include #include + #include "../hip/ceed-hip-common.h" typedef struct { @@ -23,48 +24,48 @@ typedef struct { } CeedVector_Hip; typedef struct { - hipModule_t module; + hipModule_t module; hipFunction_t StridedTranspose; hipFunction_t StridedNoTranspose; hipFunction_t OffsetTranspose; hipFunction_t OffsetNoTranspose; - CeedInt num_nodes; - CeedInt *h_ind; - CeedInt *h_ind_allocated; - CeedInt *d_ind; - CeedInt *d_ind_allocated; - CeedInt *d_t_offsets; - CeedInt *d_t_indices; - CeedInt *d_l_vec_indices; + CeedInt num_nodes; + CeedInt *h_ind; + CeedInt *h_ind_allocated; + CeedInt *d_ind; + CeedInt *d_ind_allocated; + CeedInt *d_t_offsets; + CeedInt *d_t_indices; + CeedInt *d_l_vec_indices; } CeedElemRestriction_Hip; typedef struct { - hipModule_t module; + hipModule_t module; hipFunction_t Interp; hipFunction_t Grad; hipFunction_t Weight; - CeedScalar *d_interp_1d; - CeedScalar *d_grad_1d; - CeedScalar *d_q_weight_1d; + CeedScalar *d_interp_1d; + CeedScalar *d_grad_1d; + CeedScalar *d_q_weight_1d; } CeedBasis_Hip; typedef struct { - hipModule_t module; + hipModule_t module; hipFunction_t Interp; hipFunction_t Grad; hipFunction_t Weight; - CeedScalar *d_interp; - CeedScalar *d_grad; - CeedScalar *d_q_weight; + CeedScalar *d_interp; + CeedScalar *d_grad; + CeedScalar *d_q_weight; } CeedBasisNonTensor_Hip; typedef struct { - hipModule_t module; - char *qfunction_name; - char *qfunction_source; + hipModule_t module; + char *qfunction_name; + char *qfunction_source; hipFunction_t 
QFunction; - Fields_Hip fields; - void *d_c; + Fields_Hip fields; + void *d_c; } CeedQFunction_Hip; typedef struct { @@ -77,34 +78,34 @@ typedef struct { } CeedQFunctionContext_Hip; typedef struct { - hipModule_t module; - hipFunction_t linearDiagonal; - hipFunction_t linearPointBlock; - CeedBasis basisin, basisout; + hipModule_t module; + hipFunction_t linearDiagonal; + hipFunction_t linearPointBlock; + CeedBasis basisin, basisout; CeedElemRestriction diagrstr, pbdiagrstr; - CeedVector elemdiag, pbelemdiag; - CeedInt numemodein, numemodeout, nnodes; - CeedEvalMode *h_emodein, *h_emodeout; - CeedEvalMode *d_emodein, *d_emodeout; - CeedScalar *d_identity, *d_interpin, *d_interpout, *d_gradin, *d_gradout; + CeedVector elemdiag, pbelemdiag; + CeedInt numemodein, numemodeout, nnodes; + CeedEvalMode *h_emodein, *h_emodeout; + CeedEvalMode *d_emodein, *d_emodeout; + CeedScalar *d_identity, *d_interpin, *d_interpout, *d_gradin, *d_gradout; } CeedOperatorDiag_Hip; typedef struct { - hipModule_t module; + hipModule_t module; hipFunction_t linearAssemble; - CeedInt nelem, block_size_x, block_size_y, elemsPerBlock; - CeedScalar *d_B_in, *d_B_out; + CeedInt nelem, block_size_x, block_size_y, elemsPerBlock; + CeedScalar *d_B_in, *d_B_out; } CeedOperatorAssemble_Hip; typedef struct { - CeedVector *evecs; // E-vectors, inputs followed by outputs - CeedVector *qvecsin; // Input Q-vectors needed to apply operator - CeedVector *qvecsout; // Output Q-vectors needed to apply operator - CeedInt numein; - CeedInt numeout; - CeedInt qfnumactivein, qfnumactiveout; - CeedVector *qfactivein; - CeedOperatorDiag_Hip *diag; + CeedVector *evecs; // E-vectors, inputs followed by outputs + CeedVector *qvecsin; // Input Q-vectors needed to apply operator + CeedVector *qvecsout; // Output Q-vectors needed to apply operator + CeedInt numein; + CeedInt numeout; + CeedInt qfnumactivein, qfnumactiveout; + CeedVector *qfactivein; + CeedOperatorDiag_Hip *diag; CeedOperatorAssemble_Hip *asmb; } 
CeedOperator_Hip; @@ -112,30 +113,20 @@ CEED_INTERN int CeedHipGetHipblasHandle(Ceed ceed, hipblasHandle_t *handle); CEED_INTERN int CeedVectorCreate_Hip(CeedSize n, CeedVector vec); -CEED_INTERN int CeedElemRestrictionCreate_Hip(CeedMemType mtype, - CeedCopyMode cmode, const CeedInt *indices, CeedElemRestriction r); +CEED_INTERN int CeedElemRestrictionCreate_Hip(CeedMemType mtype, CeedCopyMode cmode, const CeedInt *indices, CeedElemRestriction r); -CEED_INTERN int CeedElemRestrictionCreateBlocked_Hip(const CeedMemType mtype, - const CeedCopyMode cmode, const CeedInt *indices, - const CeedElemRestriction res); +CEED_INTERN int CeedElemRestrictionCreateBlocked_Hip(const CeedMemType mtype, const CeedCopyMode cmode, const CeedInt *indices, + const CeedElemRestriction res); -CEED_INTERN int CeedBasisApplyElems_Hip(CeedBasis basis, const CeedInt nelem, - CeedTransposeMode tmode, CeedEvalMode emode, const CeedVector u, CeedVector v); +CEED_INTERN int CeedBasisApplyElems_Hip(CeedBasis basis, const CeedInt nelem, CeedTransposeMode tmode, CeedEvalMode emode, const CeedVector u, + CeedVector v); -CEED_INTERN int CeedQFunctionApplyElems_Hip(CeedQFunction qf, const CeedInt Q, - const CeedVector *const u, const CeedVector *v); +CEED_INTERN int CeedQFunctionApplyElems_Hip(CeedQFunction qf, const CeedInt Q, const CeedVector *const u, const CeedVector *v); -CEED_INTERN int CeedBasisCreateTensorH1_Hip(CeedInt dim, CeedInt P1d, - CeedInt Q1d, - const CeedScalar *interp1d, - const CeedScalar *grad1d, - const CeedScalar *qref1d, - const CeedScalar *qweight1d, - CeedBasis basis); +CEED_INTERN int CeedBasisCreateTensorH1_Hip(CeedInt dim, CeedInt P1d, CeedInt Q1d, const CeedScalar *interp1d, const CeedScalar *grad1d, + const CeedScalar *qref1d, const CeedScalar *qweight1d, CeedBasis basis); -CEED_INTERN int CeedBasisCreateH1_Hip(CeedElemTopology, CeedInt, CeedInt, - CeedInt, const CeedScalar *, - const CeedScalar *, const CeedScalar *, +CEED_INTERN int 
CeedBasisCreateH1_Hip(CeedElemTopology, CeedInt, CeedInt, CeedInt, const CeedScalar *, const CeedScalar *, const CeedScalar *, const CeedScalar *, CeedBasis); CEED_INTERN int CeedQFunctionCreate_Hip(CeedQFunction qf); diff --git a/backends/hip-ref/kernels/hip-ref-vector.hip.cpp b/backends/hip-ref/kernels/hip-ref-vector.hip.cpp index 7c5525b953..4381f917f3 100644 --- a/backends/hip-ref/kernels/hip-ref-vector.hip.cpp +++ b/backends/hip-ref/kernels/hip-ref-vector.hip.cpp @@ -11,25 +11,21 @@ //------------------------------------------------------------------------------ // Kernel for set value on device //------------------------------------------------------------------------------ -__global__ static void setValueK(CeedScalar * __restrict__ vec, CeedInt size, - CeedScalar val) { +__global__ static void setValueK(CeedScalar *__restrict__ vec, CeedInt size, CeedScalar val) { int idx = threadIdx.x + blockDim.x * blockIdx.x; - if (idx >= size) - return; + if (idx >= size) return; vec[idx] = val; } //------------------------------------------------------------------------------ // Set value on device memory //------------------------------------------------------------------------------ -extern "C" int CeedDeviceSetValue_Hip(CeedScalar* d_array, CeedInt length, - CeedScalar val) { - const int bsize = 512; - const int vecsize = length; - int gridsize = vecsize / bsize; +extern "C" int CeedDeviceSetValue_Hip(CeedScalar *d_array, CeedInt length, CeedScalar val) { + const int bsize = 512; + const int vecsize = length; + int gridsize = vecsize / bsize; - if (bsize * gridsize < vecsize) - gridsize += 1; + if (bsize * gridsize < vecsize) gridsize += 1; hipLaunchKernelGGL(setValueK, dim3(gridsize), dim3(bsize), 0, 0, d_array, length, val); return 0; } @@ -37,24 +33,21 @@ extern "C" int CeedDeviceSetValue_Hip(CeedScalar* d_array, CeedInt length, //------------------------------------------------------------------------------ // Kernel for taking reciprocal 
//------------------------------------------------------------------------------ -__global__ static void rcpValueK(CeedScalar * __restrict__ vec, CeedInt size) { +__global__ static void rcpValueK(CeedScalar *__restrict__ vec, CeedInt size) { int idx = threadIdx.x + blockDim.x * blockIdx.x; - if (idx >= size) - return; - if (fabs(vec[idx]) > 1E-16) - vec[idx] = 1./vec[idx]; + if (idx >= size) return; + if (fabs(vec[idx]) > 1E-16) vec[idx] = 1. / vec[idx]; } //------------------------------------------------------------------------------ // Take vector reciprocal in device memory //------------------------------------------------------------------------------ -extern "C" int CeedDeviceReciprocal_Hip(CeedScalar* d_array, CeedInt length) { - const int bsize = 512; - const int vecsize = length; - int gridsize = vecsize / bsize; +extern "C" int CeedDeviceReciprocal_Hip(CeedScalar *d_array, CeedInt length) { + const int bsize = 512; + const int vecsize = length; + int gridsize = vecsize / bsize; - if (bsize * gridsize < vecsize) - gridsize += 1; + if (bsize * gridsize < vecsize) gridsize += 1; hipLaunchKernelGGL(rcpValueK, dim3(gridsize), dim3(bsize), 0, 0, d_array, length); return 0; } @@ -62,80 +55,65 @@ extern "C" int CeedDeviceReciprocal_Hip(CeedScalar* d_array, CeedInt length) { //------------------------------------------------------------------------------ // Kernel for scale //------------------------------------------------------------------------------ -__global__ static void scaleValueK(CeedScalar * __restrict__ x, CeedScalar alpha, - CeedInt size) { +__global__ static void scaleValueK(CeedScalar *__restrict__ x, CeedScalar alpha, CeedInt size) { int idx = threadIdx.x + blockDim.x * blockIdx.x; - if (idx >= size) - return; + if (idx >= size) return; x[idx] *= alpha; } //------------------------------------------------------------------------------ // Compute x = alpha x on device //------------------------------------------------------------------------------ 
-extern "C" int CeedDeviceScale_Hip(CeedScalar *x_array, CeedScalar alpha, - CeedInt length) { - const int bsize = 512; - const int vecsize = length; - int gridsize = vecsize / bsize; +extern "C" int CeedDeviceScale_Hip(CeedScalar *x_array, CeedScalar alpha, CeedInt length) { + const int bsize = 512; + const int vecsize = length; + int gridsize = vecsize / bsize; - if (bsize * gridsize < vecsize) - gridsize += 1; - hipLaunchKernelGGL(scaleValueK, dim3(gridsize), dim3(bsize), 0, 0, x_array, alpha, - length); + if (bsize * gridsize < vecsize) gridsize += 1; + hipLaunchKernelGGL(scaleValueK, dim3(gridsize), dim3(bsize), 0, 0, x_array, alpha, length); return 0; } //------------------------------------------------------------------------------ // Kernel for axpy //------------------------------------------------------------------------------ -__global__ static void axpyValueK(CeedScalar * __restrict__ y, CeedScalar alpha, - CeedScalar * __restrict__ x, CeedInt size) { +__global__ static void axpyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar *__restrict__ x, CeedInt size) { int idx = threadIdx.x + blockDim.x * blockIdx.x; - if (idx >= size) - return; + if (idx >= size) return; y[idx] += alpha * x[idx]; } //------------------------------------------------------------------------------ // Compute y = alpha x + y on device //------------------------------------------------------------------------------ -extern "C" int CeedDeviceAXPY_Hip(CeedScalar *y_array, CeedScalar alpha, - CeedScalar *x_array, CeedInt length) { - const int bsize = 512; - const int vecsize = length; - int gridsize = vecsize / bsize; +extern "C" int CeedDeviceAXPY_Hip(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedInt length) { + const int bsize = 512; + const int vecsize = length; + int gridsize = vecsize / bsize; - if (bsize * gridsize < vecsize) - gridsize += 1; - hipLaunchKernelGGL(axpyValueK, dim3(gridsize), dim3(bsize), 0, 0, y_array, alpha, - x_array, length); + 
if (bsize * gridsize < vecsize) gridsize += 1; + hipLaunchKernelGGL(axpyValueK, dim3(gridsize), dim3(bsize), 0, 0, y_array, alpha, x_array, length); return 0; } //------------------------------------------------------------------------------ // Kernel for pointwise mult //------------------------------------------------------------------------------ -__global__ static void pointwiseMultValueK(CeedScalar * __restrict__ w, - CeedScalar * x, CeedScalar * __restrict__ y, CeedInt size) { +__global__ static void pointwiseMultValueK(CeedScalar *__restrict__ w, CeedScalar *x, CeedScalar *__restrict__ y, CeedInt size) { int idx = threadIdx.x + blockDim.x * blockIdx.x; - if (idx >= size) - return; + if (idx >= size) return; w[idx] = x[idx] * y[idx]; } //------------------------------------------------------------------------------ // Compute the pointwise multiplication w = x .* y on device //------------------------------------------------------------------------------ -extern "C" int CeedDevicePointwiseMult_Hip(CeedScalar *w_array, CeedScalar *x_array, - CeedScalar *y_array, CeedInt length) { - const int bsize = 512; - const int vecsize = length; - int gridsize = vecsize / bsize; +extern "C" int CeedDevicePointwiseMult_Hip(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedInt length) { + const int bsize = 512; + const int vecsize = length; + int gridsize = vecsize / bsize; - if (bsize * gridsize < vecsize) - gridsize += 1; - hipLaunchKernelGGL(pointwiseMultValueK, dim3(gridsize), dim3(bsize), 0, 0, w_array, - x_array, y_array, length); + if (bsize * gridsize < vecsize) gridsize += 1; + hipLaunchKernelGGL(pointwiseMultValueK, dim3(gridsize), dim3(bsize), 0, 0, w_array, x_array, y_array, length); return 0; } diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c index a497ec2755..af645825e2 100644 --- a/backends/hip-shared/ceed-hip-shared-basis.c +++ b/backends/hip-shared/ceed-hip-shared-basis.c @@ -5,27 +5,26 
@@ // // This file is part of CEED: http://github.com/ceed -#include #include +#include #include #include #include -#include "ceed-hip-shared.h" + #include "../hip/ceed-hip-common.h" #include "../hip/ceed-hip-compile.h" +#include "ceed-hip-shared.h" //------------------------------------------------------------------------------ // Compute a block size based on required minimum threads //------------------------------------------------------------------------------ static CeedInt ComputeBlockSizeFromRequirement(const CeedInt required) { - CeedInt maxSize = 1024; // Max total threads per block - CeedInt currentSize = 64; // Start with one group + CeedInt maxSize = 1024; // Max total threads per block + CeedInt currentSize = 64; // Start with one group - while(currentSize < maxSize) { - if (currentSize > required) - break; - else - currentSize = currentSize * 2; + while (currentSize < maxSize) { + if (currentSize > required) break; + else currentSize = currentSize * 2; } return currentSize; } @@ -35,52 +34,48 @@ static CeedInt ComputeBlockSizeFromRequirement(const CeedInt required) { // num_comp (num_comp not currently used, but may be again in other basis // parallelization options) //------------------------------------------------------------------------------ -static int ComputeBasisThreadBlockSizes(const CeedInt dim, const CeedInt P_1d, - const CeedInt Q_1d, - const CeedInt num_comp, CeedInt *block_sizes) { - +static int ComputeBasisThreadBlockSizes(const CeedInt dim, const CeedInt P_1d, const CeedInt Q_1d, const CeedInt num_comp, CeedInt *block_sizes) { // Note that this will use the same block sizes for all dimensions when compiling, // but as each basis object is defined for a particular dimension, we will never // call any kernels except the ones for the dimension for which we have computed the // block sizes. 
const CeedInt thread_1d = CeedIntMax(P_1d, Q_1d); switch (dim) { - case 1: { - // Interp kernels: - block_sizes[0] = 256; - - // Grad kernels: - block_sizes[1] = 256; - - // Weight kernels: - block_sizes[2] = 256; - - } break; - case 2: { - // Interp kernels: - CeedInt required = thread_1d * thread_1d; - block_sizes[0] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required)); - - // Grad kernels: currently use same required minimum threads - block_sizes[1] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required)); - - // Weight kernels: - required = CeedIntMax(64, Q_1d * Q_1d); - block_sizes[2] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required)); - - } break; - case 3: { - // Interp kernels: - CeedInt required = thread_1d * thread_1d; - block_sizes[0] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required)); - - // Grad kernels: currently use same required minimum threads - block_sizes[1] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required)); - - // Weight kernels: - required = Q_1d * Q_1d * Q_1d; - block_sizes[2] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required)); - } + case 1: { + // Interp kernels: + block_sizes[0] = 256; + + // Grad kernels: + block_sizes[1] = 256; + + // Weight kernels: + block_sizes[2] = 256; + } break; + case 2: { + // Interp kernels: + CeedInt required = thread_1d * thread_1d; + block_sizes[0] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required)); + + // Grad kernels: currently use same required minimum threads + block_sizes[1] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required)); + + // Weight kernels: + required = CeedIntMax(64, Q_1d * Q_1d); + block_sizes[2] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required)); + + } break; + case 3: { + // Interp kernels: + CeedInt required = thread_1d * thread_1d; + block_sizes[0] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required)); + + // Grad kernels: currently use same required minimum threads + block_sizes[1] = CeedIntMax(256, 
ComputeBlockSizeFromRequirement(required)); + + // Weight kernels: + required = Q_1d * Q_1d * Q_1d; + block_sizes[2] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required)); + } } return CEED_ERROR_SUCCESS; @@ -89,207 +84,150 @@ static int ComputeBasisThreadBlockSizes(const CeedInt dim, const CeedInt P_1d, //------------------------------------------------------------------------------ // Apply basis //------------------------------------------------------------------------------ -int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, - CeedTransposeMode t_mode, - CeedEvalMode eval_mode, CeedVector u, +int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { - int ierr; Ceed ceed; - ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); Ceed_Hip *ceed_Hip; - CeedGetData(ceed, &ceed_Hip); CeedChkBackend(ierr); + CeedCallBackend(CeedGetData(ceed, &ceed_Hip)); CeedBasis_Hip_shared *data; - CeedBasisGetData(basis, &data); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetData(basis, &data)); CeedInt dim, num_comp; - ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); - ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetDimension(basis, &dim)); + CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); // Read vectors const CeedScalar *d_u; - CeedScalar *d_v; + CeedScalar *d_v; if (eval_mode != CEED_EVAL_WEIGHT) { - ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); } - ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); // Apply basis operation switch (eval_mode) { - case CEED_EVAL_INTERP: { - CeedInt P_1d, Q_1d; - CeedInt 
block_size = data->block_sizes[0]; - ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); - ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); - CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); - void *interp_args[] = {(void *) &num_elem, &data->d_interp_1d, - &d_u, &d_v - }; - if (dim == 1) { - CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; - elems_per_block = elems_per_block > 0 ? elems_per_block : 1; - CeedInt grid = num_elem / elems_per_block + - ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); - CeedInt shared_mem = elems_per_block*thread_1d*sizeof(CeedScalar); - if (t_mode == CEED_TRANSPOSE) { - ierr = CeedRunKernelDimSharedHip(ceed, data->InterpTranspose, grid, thread_1d, - 1, - elems_per_block, shared_mem, - interp_args); CeedChkBackend(ierr); - } else { - ierr = CeedRunKernelDimSharedHip(ceed, data->Interp, grid, thread_1d, 1, - elems_per_block, shared_mem, - interp_args); CeedChkBackend(ierr); + case CEED_EVAL_INTERP: { + CeedInt P_1d, Q_1d; + CeedInt block_size = data->block_sizes[0]; + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); + CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); + void *interp_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_u, &d_v}; + if (dim == 1) { + CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; + elems_per_block = elems_per_block > 0 ? elems_per_block : 1; + CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 
1 : 0); + CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); + if (t_mode == CEED_TRANSPOSE) { + CeedCallBackend(CeedRunKernelDimSharedHip(ceed, data->InterpTranspose, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args)); + } else { + CeedCallBackend(CeedRunKernelDimSharedHip(ceed, data->Interp, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args)); + } + } else if (dim == 2) { + // Check if required threads is small enough to do multiple elems + const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); + CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); + if (t_mode == CEED_TRANSPOSE) { + CeedCallBackend( + CeedRunKernelDimSharedHip(ceed, data->InterpTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); + } else { + CeedCallBackend(CeedRunKernelDimSharedHip(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); + } + } else if (dim == 3) { + const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); + CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 
1 : 0); + CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); + if (t_mode == CEED_TRANSPOSE) { + CeedCallBackend( + CeedRunKernelDimSharedHip(ceed, data->InterpTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); + } else { + CeedCallBackend(CeedRunKernelDimSharedHip(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); + } } - } else if (dim == 2) { - // Check if required threads is small enough to do multiple elems - const CeedInt elems_per_block = CeedIntMax(block_size / - (thread_1d*thread_1d), 1); - CeedInt grid = num_elem / elems_per_block + - ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); - CeedInt shared_mem = elems_per_block*thread_1d*thread_1d*sizeof( - CeedScalar); - if (t_mode == CEED_TRANSPOSE) { - ierr = CeedRunKernelDimSharedHip(ceed, data->InterpTranspose, grid, thread_1d, - thread_1d, - elems_per_block, shared_mem, - interp_args); CeedChkBackend(ierr); - } else { - ierr = CeedRunKernelDimSharedHip(ceed, data->Interp, grid, thread_1d, thread_1d, - elems_per_block, shared_mem, - interp_args); CeedChkBackend(ierr); + } break; + case CEED_EVAL_GRAD: { + CeedInt P_1d, Q_1d; + CeedInt block_size = data->block_sizes[1]; + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); + CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); + CeedScalar *d_grad_1d = data->d_grad_1d; + if (data->d_collo_grad_1d) { + d_grad_1d = data->d_collo_grad_1d; } - } else if (dim == 3) { - const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d*thread_1d), - 1); - CeedInt grid = num_elem / elems_per_block + - ((num_elem / elems_per_block*elems_per_block < num_elem) ? 
1 : 0 ); - CeedInt shared_mem = elems_per_block*thread_1d*thread_1d*sizeof( - CeedScalar); - if (t_mode == CEED_TRANSPOSE) { - ierr = CeedRunKernelDimSharedHip(ceed, data->InterpTranspose, grid, thread_1d, - thread_1d, - elems_per_block, shared_mem, - interp_args); CeedChkBackend(ierr); - } else { - ierr = CeedRunKernelDimSharedHip(ceed, data->Interp, grid, thread_1d, thread_1d, - elems_per_block, shared_mem, - interp_args); CeedChkBackend(ierr); + void *grad_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_grad_1d, &d_u, &d_v}; + if (dim == 1) { + CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; + elems_per_block = elems_per_block > 0 ? elems_per_block : 1; + CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); + if (t_mode == CEED_TRANSPOSE) { + CeedCallBackend(CeedRunKernelDimSharedHip(ceed, data->GradTranspose, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args)); + } else { + CeedCallBackend(CeedRunKernelDimSharedHip(ceed, data->Grad, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args)); + } + } else if (dim == 2) { + // Check if required threads is small enough to do multiple elems + const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); + CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 
1 : 0); + CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); + if (t_mode == CEED_TRANSPOSE) { + CeedCallBackend(CeedRunKernelDimSharedHip(ceed, data->GradTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); + } else { + CeedCallBackend(CeedRunKernelDimSharedHip(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); + } + } else if (dim == 3) { + const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); + CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); + if (t_mode == CEED_TRANSPOSE) { + CeedCallBackend(CeedRunKernelDimSharedHip(ceed, data->GradTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); + } else { + CeedCallBackend(CeedRunKernelDimSharedHip(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); + } } - } - } break; - case CEED_EVAL_GRAD: { - CeedInt P_1d, Q_1d; - CeedInt block_size = data->block_sizes[1]; - ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); - ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); - CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); - CeedScalar *d_grad_1d = data->d_grad_1d; - if (data->d_collo_grad_1d) { - d_grad_1d = data->d_collo_grad_1d; - } - void *grad_args[] = {(void *) &num_elem, &data->d_interp_1d, - &d_grad_1d, &d_u, &d_v - }; - if (dim == 1) { - CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; - elems_per_block = elems_per_block > 0 ? elems_per_block : 1; - CeedInt grid = num_elem / elems_per_block + - ((num_elem / elems_per_block*elems_per_block < num_elem) ? 
1 : 0 ); - CeedInt shared_mem = elems_per_block*thread_1d*sizeof(CeedScalar); - if (t_mode == CEED_TRANSPOSE) { - ierr = CeedRunKernelDimSharedHip(ceed, data->GradTranspose, grid, thread_1d, 1, - elems_per_block, shared_mem, grad_args); - } else { - ierr = CeedRunKernelDimSharedHip(ceed, data->Grad, grid, thread_1d, 1, - elems_per_block, shared_mem, grad_args); + } break; + case CEED_EVAL_WEIGHT: { + CeedInt Q_1d; + CeedInt block_size = data->block_sizes[2]; + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); + void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; + if (dim == 1) { + const CeedInt opt_elems = block_size / Q_1d; + const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; + const CeedInt grid_size = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + CeedCallBackend(CeedRunKernelDimHip(ceed, data->Weight, grid_size, Q_1d, elems_per_block, 1, weight_args)); + } else if (dim == 2) { + const CeedInt opt_elems = block_size / (Q_1d * Q_1d); + const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; + const CeedInt grid_size = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + CeedCallBackend(CeedRunKernelDimHip(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args)); + } else if (dim == 3) { + const CeedInt opt_elems = block_size / (Q_1d * Q_1d); + const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; + const CeedInt grid_size = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 
1 : 0); + CeedCallBackend(CeedRunKernelDimHip(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args)); } - CeedChkBackend(ierr); - } else if (dim == 2) { - // Check if required threads is small enough to do multiple elems - const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d*thread_1d), - 1); - CeedInt grid = num_elem / elems_per_block + - ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); - CeedInt shared_mem = elems_per_block*thread_1d*thread_1d*sizeof( - CeedScalar); - if (t_mode == CEED_TRANSPOSE) { - ierr = CeedRunKernelDimSharedHip(ceed, data->GradTranspose, grid, thread_1d, - thread_1d, - elems_per_block, shared_mem, - grad_args); CeedChkBackend(ierr); - } else { - ierr = CeedRunKernelDimSharedHip(ceed, data->Grad, grid, thread_1d, thread_1d, - elems_per_block, shared_mem, - grad_args); CeedChkBackend(ierr); - } - } else if (dim == 3) { - const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d*thread_1d), - 1); - CeedInt grid = num_elem / elems_per_block + - ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); - CeedInt shared_mem = elems_per_block*thread_1d*thread_1d*sizeof( - CeedScalar); - if (t_mode == CEED_TRANSPOSE) { - ierr = CeedRunKernelDimSharedHip(ceed, data->GradTranspose, grid, thread_1d, - thread_1d, - elems_per_block, shared_mem, - grad_args); CeedChkBackend(ierr); - } else { - ierr = CeedRunKernelDimSharedHip(ceed, data->Grad, grid, thread_1d, thread_1d, - elems_per_block, shared_mem, - grad_args); CeedChkBackend(ierr); - } - } - } break; - case CEED_EVAL_WEIGHT: { - CeedInt Q_1d; - CeedInt block_size = data->block_sizes[2]; - ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); - void *weight_args[] = {(void *) &num_elem, (void *) &data->d_q_weight_1d, &d_v}; - if (dim == 1) { - const CeedInt opt_elems = block_size / Q_1d; - const CeedInt elems_per_block = opt_elems > 0 ? 
opt_elems : 1; - const CeedInt grid_size = num_elem / elems_per_block + - ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); - ierr = CeedRunKernelDimHip(ceed, data->Weight, grid_size, Q_1d, - elems_per_block, 1, weight_args); - CeedChkBackend(ierr); - } else if (dim == 2) { - const CeedInt opt_elems = block_size / (Q_1d * Q_1d); - const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; - const CeedInt grid_size = num_elem / elems_per_block + - ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); - ierr = CeedRunKernelDimHip(ceed, data->Weight, grid_size, Q_1d, Q_1d, - elems_per_block, weight_args); - CeedChkBackend(ierr); - } else if (dim == 3) { - const CeedInt opt_elems = block_size / (Q_1d * Q_1d); - const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; - const CeedInt grid_size = num_elem / elems_per_block + - ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); - ierr = CeedRunKernelDimHip(ceed, data->Weight, grid_size, Q_1d, Q_1d, - elems_per_block, weight_args); - CeedChkBackend(ierr); - } - } break; - // LCOV_EXCL_START - // Evaluate the divergence to/from the quadrature points - case CEED_EVAL_DIV: - return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); - // Evaluate the curl to/from the quadrature points - case CEED_EVAL_CURL: - return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); - // Take no action, BasisApply should not have been called - case CEED_EVAL_NONE: - return CeedError(ceed, CEED_ERROR_BACKEND, - "CEED_EVAL_NONE does not make sense in this context"); - // LCOV_EXCL_STOP + } break; + // LCOV_EXCL_START + // Evaluate the divergence to/from the quadrature points + case CEED_EVAL_DIV: + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); + // Evaluate the curl to/from the quadrature points + case CEED_EVAL_CURL: + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); + // Take no action, 
BasisApply should not have been called + case CEED_EVAL_NONE: + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context"); + // LCOV_EXCL_STOP } // Restore vectors if (eval_mode != CEED_EVAL_WEIGHT) { - ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); } - ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); return CEED_ERROR_SUCCESS; } @@ -297,20 +235,19 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, // Destroy basis //------------------------------------------------------------------------------ static int CeedBasisDestroy_Hip_shared(CeedBasis basis) { - int ierr; Ceed ceed; - ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); CeedBasis_Hip_shared *data; - ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetData(basis, &data)); - CeedChk_Hip(ceed, hipModuleUnload(data->module)); + CeedCallHip(ceed, hipModuleUnload(data->module)); - ierr = hipFree(data->d_q_weight_1d); CeedChk_Hip(ceed, ierr); - ierr = hipFree(data->d_interp_1d); CeedChk_Hip(ceed, ierr); - ierr = hipFree(data->d_grad_1d); CeedChk_Hip(ceed, ierr); - ierr = hipFree(data->d_collo_grad_1d); CeedChk_Hip(ceed, ierr); - ierr = CeedFree(&data); CeedChkBackend(ierr); + CeedCallHip(ceed, hipFree(data->d_q_weight_1d)); + CeedCallHip(ceed, hipFree(data->d_interp_1d)); + CeedCallHip(ceed, hipFree(data->d_grad_1d)); + CeedCallHip(ceed, hipFree(data->d_collo_grad_1d)); + CeedCallBackend(CeedFree(&data)); return CEED_ERROR_SUCCESS; } @@ -318,100 +255,66 @@ static int CeedBasisDestroy_Hip_shared(CeedBasis basis) { //------------------------------------------------------------------------------ // Create tensor basis //------------------------------------------------------------------------------ -int 
CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, - const CeedScalar *interp_1d, - const CeedScalar *grad_1d, - const CeedScalar *q_ref1d, - const CeedScalar *q_weight_1d, - CeedBasis basis) { - int ierr; +int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, + const CeedScalar *q_ref1d, const CeedScalar *q_weight_1d, CeedBasis basis) { Ceed ceed; - ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); CeedBasis_Hip_shared *data; - ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(1, &data)); // Copy basis data to GPU const CeedInt qBytes = Q_1d * sizeof(CeedScalar); - ierr = hipMalloc((void **)&data->d_q_weight_1d, qBytes); - CeedChk_Hip(ceed, ierr); - ierr = hipMemcpy(data->d_q_weight_1d, q_weight_1d, qBytes, - hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); + CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight_1d, qBytes)); + CeedCallHip(ceed, hipMemcpy(data->d_q_weight_1d, q_weight_1d, qBytes, hipMemcpyHostToDevice)); const CeedInt iBytes = qBytes * P_1d; - ierr = hipMalloc((void **)&data->d_interp_1d, iBytes); CeedChk_Hip(ceed, ierr); - ierr = hipMemcpy(data->d_interp_1d, interp_1d, iBytes, - hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); + CeedCallHip(ceed, hipMalloc((void **)&data->d_interp_1d, iBytes)); + CeedCallHip(ceed, hipMemcpy(data->d_interp_1d, interp_1d, iBytes, hipMemcpyHostToDevice)); - ierr = hipMalloc((void **)&data->d_grad_1d, iBytes); CeedChk_Hip(ceed, ierr); - ierr = hipMemcpy(data->d_grad_1d, grad_1d, iBytes, - hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); + CeedCallHip(ceed, hipMalloc((void **)&data->d_grad_1d, iBytes)); + CeedCallHip(ceed, hipMemcpy(data->d_grad_1d, grad_1d, iBytes, hipMemcpyHostToDevice)); // Compute collocated gradient and copy to GPU - data->d_collo_grad_1d = NULL; + data->d_collo_grad_1d = NULL; bool 
has_collocated_grad = dim == 3 && Q_1d >= P_1d; if (has_collocated_grad) { CeedScalar *collo_grad_1d; - ierr = CeedMalloc(Q_1d*Q_1d, &collo_grad_1d); CeedChkBackend(ierr); - ierr = CeedBasisGetCollocatedGrad(basis, collo_grad_1d); CeedChkBackend(ierr); - ierr = hipMalloc((void **)&data->d_collo_grad_1d, qBytes * Q_1d); - CeedChk_Hip(ceed, ierr); - ierr = hipMemcpy(data->d_collo_grad_1d, collo_grad_1d, qBytes * Q_1d, - hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); - ierr = CeedFree(&collo_grad_1d); CeedChkBackend(ierr); + CeedCallBackend(CeedMalloc(Q_1d * Q_1d, &collo_grad_1d)); + CeedCallBackend(CeedBasisGetCollocatedGrad(basis, collo_grad_1d)); + CeedCallHip(ceed, hipMalloc((void **)&data->d_collo_grad_1d, qBytes * Q_1d)); + CeedCallHip(ceed, hipMemcpy(data->d_collo_grad_1d, collo_grad_1d, qBytes * Q_1d, hipMemcpyHostToDevice)); + CeedCallBackend(CeedFree(&collo_grad_1d)); } // Set number of threads per block for basis kernels CeedInt num_comp; - ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr); - ierr = ComputeBasisThreadBlockSizes(dim, P_1d, Q_1d, num_comp, - data->block_sizes); - CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); + CeedCallBackend(ComputeBasisThreadBlockSizes(dim, P_1d, Q_1d, num_comp, data->block_sizes)); // Compile basis kernels char *basis_kernel_path, *basis_kernel_source; - ierr = CeedGetJitAbsolutePath(ceed, - "ceed/jit-source/hip/hip-shared-basis-tensor.h", - &basis_kernel_path); CeedChkBackend(ierr); + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-shared-basis-tensor.h", &basis_kernel_path)); CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n"); - ierr = CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source); - CeedChkBackend(ierr); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete! 
-----\n"); - ierr = CeedCompileHip(ceed, basis_kernel_source, &data->module, 11, - "BASIS_Q_1D", Q_1d, - "BASIS_P_1D", P_1d, - "T_1D", CeedIntMax(Q_1d, P_1d), - "BASIS_DIM", dim, - "BASIS_NUM_COMP", num_comp, - "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), - "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), - "BASIS_INTERP_BLOCK_SIZE", data->block_sizes[0], - "BASIS_GRAD_BLOCK_SIZE", data->block_sizes[1], - "BASIS_WEIGHT_BLOCK_SIZE", data->block_sizes[2], - "BASIS_HAS_COLLOCATED_GRAD", has_collocated_grad - ); CeedChkBackend(ierr); - ierr = CeedGetKernelHip(ceed, data->module, "Interp", &data->Interp); - CeedChkBackend(ierr); - ierr = CeedGetKernelHip(ceed, data->module, "InterpTranspose", - &data->InterpTranspose); - CeedChkBackend(ierr); - ierr = CeedGetKernelHip(ceed, data->module, "Grad", &data->Grad); - CeedChkBackend(ierr); - ierr = CeedGetKernelHip(ceed, data->module, "GradTranspose", - &data->GradTranspose); - CeedChkBackend(ierr); - ierr = CeedGetKernelHip(ceed, data->module, "Weight", &data->Weight); - CeedChkBackend(ierr); - ierr = CeedFree(&basis_kernel_path); CeedChkBackend(ierr); - ierr = CeedFree(&basis_kernel_source); CeedChkBackend(ierr); - - ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr); + CeedCallBackend(CeedCompileHip(ceed, basis_kernel_source, &data->module, 11, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "T_1D", CeedIntMax(Q_1d, P_1d), + "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", + CeedIntPow(Q_1d, dim), "BASIS_INTERP_BLOCK_SIZE", data->block_sizes[0], "BASIS_GRAD_BLOCK_SIZE", + data->block_sizes[1], "BASIS_WEIGHT_BLOCK_SIZE", data->block_sizes[2], "BASIS_HAS_COLLOCATED_GRAD", + has_collocated_grad)); + CeedCallBackend(CeedGetKernelHip(ceed, data->module, "Interp", &data->Interp)); + CeedCallBackend(CeedGetKernelHip(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); + CeedCallBackend(CeedGetKernelHip(ceed, data->module, "Grad", &data->Grad)); + 
CeedCallBackend(CeedGetKernelHip(ceed, data->module, "GradTranspose", &data->GradTranspose)); + CeedCallBackend(CeedGetKernelHip(ceed, data->module, "Weight", &data->Weight)); + CeedCallBackend(CeedFree(&basis_kernel_path)); + CeedCallBackend(CeedFree(&basis_kernel_source)); + + CeedCallBackend(CeedBasisSetData(basis, data)); // Register backend functions - ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", - CeedBasisApplyTensor_Hip_shared); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", - CeedBasisDestroy_Hip_shared); CeedChkBackend(ierr); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyTensor_Hip_shared)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Hip_shared)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ diff --git a/backends/hip-shared/ceed-hip-shared.c b/backends/hip-shared/ceed-hip-shared.c index 2925b1bf5c..9f8fdbdc32 100644 --- a/backends/hip-shared/ceed-hip-shared.c +++ b/backends/hip-shared/ceed-hip-shared.c @@ -5,51 +5,43 @@ // // This file is part of CEED: http://github.com/ceed -#include +#include "ceed-hip-shared.h" + #include +#include #include #include -#include "ceed-hip-shared.h" //------------------------------------------------------------------------------ // Backend init //------------------------------------------------------------------------------ static int CeedInit_Hip_shared(const char *resource, Ceed ceed) { - int ierr; - char *resource_root; - ierr = CeedHipGetResourceRoot(ceed, resource, &resource_root); - CeedChkBackend(ierr); - if (strcmp(resource_root, "/gpu/hip/shared")) + CeedCallBackend(CeedHipGetResourceRoot(ceed, resource, &resource_root)); + if (strcmp(resource_root, "/gpu/hip/shared")) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Hip backend cannot use resource: %s", resource); - // LCOV_EXCL_STOP - 
ierr = CeedFree(&resource_root); CeedChkBackend(ierr); - ierr = CeedSetDeterministic(ceed, true); CeedChkBackend(ierr); + return CeedError(ceed, CEED_ERROR_BACKEND, "Hip backend cannot use resource: %s", resource); + // LCOV_EXCL_STOP + } + CeedCallBackend(CeedFree(&resource_root)); + CeedCallBackend(CeedSetDeterministic(ceed, true)); Ceed_Hip *data; - ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); - ierr = CeedSetData(ceed, data); CeedChkBackend(ierr); - ierr = CeedHipInit(ceed, resource); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(1, &data)); + CeedCallBackend(CeedSetData(ceed, data)); + CeedCallBackend(CeedHipInit(ceed, resource)); Ceed ceed_ref; - CeedInit("/gpu/hip/ref", &ceed_ref); - ierr = CeedSetDelegate(ceed, ceed_ref); CeedChkBackend(ierr); + CeedCallBackend(CeedInit("/gpu/hip/ref", &ceed_ref)); + CeedCallBackend(CeedSetDelegate(ceed, ceed_ref)); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateTensorH1", - CeedBasisCreateTensorH1_Hip_shared); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", - CeedDestroy_Hip); CeedChkBackend(ierr); - CeedChkBackend(ierr); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateTensorH1", CeedBasisCreateTensorH1_Hip_shared)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Hip)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Register backend //------------------------------------------------------------------------------ -CEED_INTERN int CeedRegister_Hip_Shared(void) { - return CeedRegister("/gpu/hip/shared", CeedInit_Hip_shared, 25); -} +CEED_INTERN int CeedRegister_Hip_Shared(void) { return CeedRegister("/gpu/hip/shared", CeedInit_Hip_shared, 25); } //------------------------------------------------------------------------------ diff --git a/backends/hip-shared/ceed-hip-shared.h b/backends/hip-shared/ceed-hip-shared.h index 
567ef63bc0..89f190e328 100644 --- a/backends/hip-shared/ceed-hip-shared.h +++ b/backends/hip-shared/ceed-hip-shared.h @@ -8,27 +8,27 @@ #ifndef _ceed_hip_shared_h #define _ceed_hip_shared_h -#include #include +#include #include + #include "../hip/ceed-hip-common.h" typedef struct { - hipModule_t module; + hipModule_t module; hipFunction_t Interp; hipFunction_t InterpTranspose; hipFunction_t Grad; hipFunction_t GradTranspose; hipFunction_t Weight; - CeedInt block_sizes[3]; // interp, grad, weight thread block sizes - CeedScalar *d_interp_1d; - CeedScalar *d_grad_1d; - CeedScalar *d_collo_grad_1d; - CeedScalar *d_q_weight_1d; + CeedInt block_sizes[3]; // interp, grad, weight thread block sizes + CeedScalar *d_interp_1d; + CeedScalar *d_grad_1d; + CeedScalar *d_collo_grad_1d; + CeedScalar *d_q_weight_1d; } CeedBasis_Hip_shared; -CEED_INTERN int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P1d, - CeedInt Q1d, const CeedScalar *interp1d, const CeedScalar *grad1d, - const CeedScalar *qref1d, const CeedScalar *qweight1d, CeedBasis basis); +CEED_INTERN int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P1d, CeedInt Q1d, const CeedScalar *interp1d, const CeedScalar *grad1d, + const CeedScalar *qref1d, const CeedScalar *qweight1d, CeedBasis basis); -#endif // _ceed_hip_shared_h +#endif // _ceed_hip_shared_h diff --git a/backends/hip/ceed-hip-common.c b/backends/hip/ceed-hip-common.c index 221198b9da..9b8c6e0a18 100644 --- a/backends/hip/ceed-hip-common.c +++ b/backends/hip/ceed-hip-common.c @@ -5,24 +5,20 @@ // // This file is part of CEED: http://github.com/ceed -#include +#include "ceed-hip-common.h" + #include -#include +#include #include -#include "ceed-hip-common.h" +#include //------------------------------------------------------------------------------ // Get root resource without device spec //------------------------------------------------------------------------------ -int CeedHipGetResourceRoot(Ceed ceed, const char *resource, - char 
**resource_root) { - int ierr; - - char *device_spec = strstr(resource, ":device_id="); - size_t resource_root_len = device_spec - ? (size_t)(device_spec - resource) + 1 - : strlen(resource) + 1; - ierr = CeedCalloc(resource_root_len, resource_root); CeedChkBackend(ierr); +int CeedHipGetResourceRoot(Ceed ceed, const char *resource, char **resource_root) { + char *device_spec = strstr(resource, ":device_id="); + size_t resource_root_len = device_spec ? (size_t)(device_spec - resource) + 1 : strlen(resource) + 1; + CeedCallBackend(CeedCalloc(resource_root_len, resource_root)); memcpy(*resource_root, resource, resource_root_len - 1); return CEED_ERROR_SUCCESS; @@ -32,24 +28,22 @@ int CeedHipGetResourceRoot(Ceed ceed, const char *resource, // Device information backend init //------------------------------------------------------------------------------ int CeedHipInit(Ceed ceed, const char *resource) { - int ierr; const char *device_spec = strstr(resource, ":device_id="); - const int device_id = (device_spec) ? atoi(device_spec + 11) : -1; + const int device_id = (device_spec) ? 
atoi(device_spec + 11) : -1; int current_device_id; - ierr = hipGetDevice(&current_device_id); CeedChk_Hip(ceed, ierr); + CeedCallHip(ceed, hipGetDevice(&current_device_id)); if (device_id >= 0 && current_device_id != device_id) { - ierr = hipSetDevice(device_id); CeedChk_Hip(ceed, ierr); + CeedCallHip(ceed, hipSetDevice(device_id)); current_device_id = device_id; } struct hipDeviceProp_t device_prop; - ierr = hipGetDeviceProperties(&device_prop, current_device_id); - CeedChk_Hip(ceed, ierr); + CeedCallHip(ceed, hipGetDeviceProperties(&device_prop, current_device_id)); Ceed_Hip *data; - ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr); - data->device_id = current_device_id; + CeedCallBackend(CeedGetData(ceed, &data)); + data->device_id = current_device_id; data->opt_block_size = 256; return CEED_ERROR_SUCCESS; } @@ -58,13 +52,12 @@ int CeedHipInit(Ceed ceed, const char *resource) { // Backend Destroy //------------------------------------------------------------------------------ int CeedDestroy_Hip(Ceed ceed) { - int ierr; Ceed_Hip *data; - ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr); + CeedCallBackend(CeedGetData(ceed, &data)); if (data->hipblas_handle) { - ierr = hipblasDestroy(data->hipblas_handle); CeedChk_Hipblas(ceed, ierr); + CeedCallHipblas(ceed, hipblasDestroy(data->hipblas_handle)); } - ierr = CeedFree(&data); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&data)); return CEED_ERROR_SUCCESS; } diff --git a/backends/hip/ceed-hip-common.h b/backends/hip/ceed-hip-common.h index f3650d4615..33386ef1cf 100644 --- a/backends/hip/ceed-hip-common.h +++ b/backends/hip/ceed-hip-common.h @@ -8,7 +8,6 @@ #ifndef _ceed_common_hip_h #define _ceed_common_hip_h -#include #include #include #include @@ -20,25 +19,39 @@ #define QUOTE(...) 
#__VA_ARGS__ -#define CeedChk_Hip(ceed, x) \ -do { \ - hipError_t hip_result = x; \ - if (hip_result != hipSuccess) { \ - const char *msg = hipGetErrorName(hip_result); \ - return CeedError((ceed), CEED_ERROR_BACKEND, msg); \ - } \ -} while (0) +#define CeedChk_Hip(ceed, x) \ + do { \ + hipError_t hip_result = x; \ + if (hip_result != hipSuccess) { \ + const char *msg = hipGetErrorName(hip_result); \ + return CeedError((ceed), CEED_ERROR_BACKEND, msg); \ + } \ + } while (0) -#define CeedChk_Hipblas(ceed, x) \ -do { \ - hipblasStatus_t hipblas_result = x; \ - if (hipblas_result != HIPBLAS_STATUS_SUCCESS) { \ - const char *msg = hipblasGetErrorName(hipblas_result); \ - return CeedError((ceed), CEED_ERROR_BACKEND, msg); \ - } \ -} while (0) +#define CeedChk_Hipblas(ceed, x) \ + do { \ + hipblasStatus_t hipblas_result = x; \ + if (hipblas_result != HIPBLAS_STATUS_SUCCESS) { \ + const char *msg = hipblasGetErrorName(hipblas_result); \ + return CeedError((ceed), CEED_ERROR_BACKEND, msg); \ + } \ + } while (0) -#define CASE(name) case name: return #name +#define CeedCallHip(ceed, ...) \ + do { \ + hipError_t ierr_q_ = __VA_ARGS__; \ + CeedChk_Hip(ceed, ierr_q_); \ + } while (0); + +#define CeedCallHipblas(ceed, ...) 
\ + do { \ + hipblasStatus_t ierr_q_ = __VA_ARGS__; \ + CeedChk_Hipblas(ceed, ierr_q_); \ + } while (0); + +#define CASE(name) \ + case name: \ + return #name // LCOV_EXCL_START CEED_UNUSED static const char *hipblasGetErrorName(hipblasStatus_t error) { switch (error) { @@ -50,22 +63,22 @@ CEED_UNUSED static const char *hipblasGetErrorName(hipblasStatus_t error) { CASE(HIPBLAS_STATUS_MAPPING_ERROR); CASE(HIPBLAS_STATUS_EXECUTION_FAILED); CASE(HIPBLAS_STATUS_INTERNAL_ERROR); - default: return "HIPBLAS_STATUS_UNKNOWN_ERROR"; + default: + return "HIPBLAS_STATUS_UNKNOWN_ERROR"; } } // LCOV_EXCL_STOP typedef struct { - int opt_block_size; - int device_id; + int opt_block_size; + int device_id; hipblasHandle_t hipblas_handle; } Ceed_Hip; -CEED_INTERN int CeedHipGetResourceRoot(Ceed ceed, const char *resource, - char **resource_root); +CEED_INTERN int CeedHipGetResourceRoot(Ceed ceed, const char *resource, char **resource_root); CEED_INTERN int CeedHipInit(Ceed ceed, const char *resource); CEED_INTERN int CeedDestroy_Hip(Ceed ceed); -#endif // _ceed_hip_common_h +#endif // _ceed_hip_common_h diff --git a/backends/hip/ceed-hip-compile.cpp b/backends/hip/ceed-hip-compile.cpp index 250ab855c5..3f63d334a9 100644 --- a/backends/hip/ceed-hip-compile.cpp +++ b/backends/hip/ceed-hip-compile.cpp @@ -5,30 +5,36 @@ // // This file is part of CEED: http://github.com/ceed -#include +#include "ceed-hip-compile.h" + #include +#include #include -#include +#include #include #include -#include + +#include + #include "ceed-hip-common.h" -#include "ceed-hip-compile.h" -#define CeedChk_hiprtc(ceed, x) \ -do { \ - hiprtcResult result = static_cast<hiprtcResult>(x); \ - if (result != HIPRTC_SUCCESS) \ - return CeedError((ceed), CEED_ERROR_BACKEND, hiprtcGetErrorString(result)); \ -} while (0) +#define CeedChk_hiprtc(ceed, x) \ + do { \ + hiprtcResult result = static_cast<hiprtcResult>(x); \ + if (result != HIPRTC_SUCCESS) return CeedError((ceed), CEED_ERROR_BACKEND, hiprtcGetErrorString(result)); \ + } while (0) + 
+#define CeedCallHiprtc(ceed, ...) \ + do { \ + int ierr_q_ = __VA_ARGS__; \ + CeedChk_hiprtc(ceed, ierr_q_); \ + } while (0); //------------------------------------------------------------------------------ // Compile HIP kernel //------------------------------------------------------------------------------ -int CeedCompileHip(Ceed ceed, const char *source, hipModule_t *module, - const CeedInt num_defines, ...) { - int ierr; - hipFree(0); // Make sure a Context exists for hiprtc +int CeedCompileHip(Ceed ceed, const char *source, hipModule_t *module, const CeedInt num_defines, ...) { + hipFree(0); // Make sure a Context exists for hiprtc hiprtcProgram prog; std::ostringstream code; @@ -36,7 +42,7 @@ int CeedCompileHip(Ceed ceed, const char *source, hipModule_t *module, // Add hip runtime include statement for generation if runtime < 40400000 // (implies ROCm < 4.5) int runtime_version; - CeedChk_Hip(ceed, hipRuntimeGetVersion(&runtime_version)); + CeedCallHip(ceed, hipRuntimeGetVersion(&runtime_version)); if (runtime_version < 40400000) { code << "\n#include \n"; } @@ -53,10 +59,10 @@ int CeedCompileHip(Ceed ceed, const char *source, hipModule_t *module, va_list args; va_start(args, num_defines); char *name; - int val; + int val; for (int i = 0; i < num_defines; i++) { name = va_arg(args, char *); - val = va_arg(args, int); + val = va_arg(args, int); code << "#define " << name << " " << val << "\n"; } va_end(args); @@ -64,33 +70,30 @@ int CeedCompileHip(Ceed ceed, const char *source, hipModule_t *module, // Standard libCEED definitions for HIP backends char *jit_defs_path, *jit_defs_source; - ierr = CeedGetJitAbsolutePath(ceed, - "ceed/jit-source/hip/hip-jit.h", - &jit_defs_path); CeedChkBackend(ierr); - ierr = CeedLoadSourceToBuffer(ceed, jit_defs_path, &jit_defs_source); - CeedChkBackend(ierr); + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-jit.h", &jit_defs_path)); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, jit_defs_path, 
&jit_defs_source)); code << jit_defs_source; code << "\n\n"; - ierr = CeedFree(&jit_defs_path); CeedChkBackend(ierr); - ierr = CeedFree(&jit_defs_source); CeedChkBackend(ierr); - + CeedCallBackend(CeedFree(&jit_defs_path)); + CeedCallBackend(CeedFree(&jit_defs_source)); + // Non-macro options - const int num_opts = 3; + const int num_opts = 3; const char *opts[num_opts]; opts[0] = "-default-device"; struct hipDeviceProp_t prop; - Ceed_Hip *ceed_data; - ierr = CeedGetData(ceed, (void **)&ceed_data); CeedChkBackend(ierr); - CeedChk_Hip(ceed, hipGetDeviceProperties(&prop, ceed_data->device_id)); - std::string arch_arg = "--gpu-architecture=" + std::string(prop.gcnArchName); - opts[1] = arch_arg.c_str(); - opts[2] = "-munsafe-fp-atomics"; + Ceed_Hip *ceed_data; + CeedCallBackend(CeedGetData(ceed, (void **)&ceed_data)); + CeedCallHip(ceed, hipGetDeviceProperties(&prop, ceed_data->device_id)); + std::string arch_arg = "--gpu-architecture=" + std::string(prop.gcnArchName); + opts[1] = arch_arg.c_str(); + opts[2] = "-munsafe-fp-atomics"; // Add string source argument provided in call code << source; // Create Program - CeedChk_hiprtc(ceed, hiprtcCreateProgram(&prog, code.str().c_str(), NULL, 0, NULL, NULL)); + CeedCallHiprtc(ceed, hiprtcCreateProgram(&prog, code.str().c_str(), NULL, 0, NULL, NULL)); // Compile kernel hiprtcResult result = hiprtcCompileProgram(prog, num_opts, opts); @@ -98,21 +101,20 @@ int CeedCompileHip(Ceed ceed, const char *source, hipModule_t *module, size_t log_size; CeedChk_hiprtc(ceed, hiprtcGetProgramLogSize(prog, &log_size)); char *log; - ierr = CeedMalloc(log_size, &log); CeedChkBackend(ierr); - CeedChk_hiprtc(ceed, hiprtcGetProgramLog(prog, log)); - return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", - hiprtcGetErrorString(result), log); + CeedCallBackend(CeedMalloc(log_size, &log)); + CeedCallHiprtc(ceed, hiprtcGetProgramLog(prog, log)); + return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", hiprtcGetErrorString(result), log); } size_t 
ptx_size; - CeedChk_hiprtc(ceed, hiprtcGetCodeSize(prog, &ptx_size)); + CeedCallHiprtc(ceed, hiprtcGetCodeSize(prog, &ptx_size)); char *ptx; - ierr = CeedMalloc(ptx_size, &ptx); CeedChkBackend(ierr); - CeedChk_hiprtc(ceed, hiprtcGetCode(prog, ptx)); - CeedChk_hiprtc(ceed, hiprtcDestroyProgram(&prog)); + CeedCallBackend(CeedMalloc(ptx_size, &ptx)); + CeedCallHiprtc(ceed, hiprtcGetCode(prog, ptx)); + CeedCallHiprtc(ceed, hiprtcDestroyProgram(&prog)); - CeedChk_Hip(ceed, hipModuleLoadData(module, ptx)); - ierr = CeedFree(&ptx); CeedChkBackend(ierr); + CeedCallHip(ceed, hipModuleLoadData(module, ptx)); + CeedCallBackend(CeedFree(&ptx)); return CEED_ERROR_SUCCESS; } @@ -120,44 +122,33 @@ int CeedCompileHip(Ceed ceed, const char *source, hipModule_t *module, //------------------------------------------------------------------------------ // Get HIP kernel //------------------------------------------------------------------------------ -int CeedGetKernelHip(Ceed ceed, hipModule_t module, const char *name, - hipFunction_t *kernel) { - - CeedChk_Hip(ceed, hipModuleGetFunction(kernel, module, name)); +int CeedGetKernelHip(Ceed ceed, hipModule_t module, const char *name, hipFunction_t *kernel) { + CeedCallHip(ceed, hipModuleGetFunction(kernel, module, name)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Run HIP kernel //------------------------------------------------------------------------------ -int CeedRunKernelHip(Ceed ceed, hipFunction_t kernel, const int grid_size, - const int block_size, void **args) { - CeedChk_Hip(ceed, hipModuleLaunchKernel(kernel, grid_size, 1, 1, block_size, 1, - 1, 0, NULL, args, NULL)); +int CeedRunKernelHip(Ceed ceed, hipFunction_t kernel, const int grid_size, const int block_size, void **args) { + CeedCallHip(ceed, hipModuleLaunchKernel(kernel, grid_size, 1, 1, block_size, 1, 1, 0, NULL, args, NULL)); return CEED_ERROR_SUCCESS; } 
//------------------------------------------------------------------------------ // Run HIP kernel for spatial dimension //------------------------------------------------------------------------------ -int CeedRunKernelDimHip(Ceed ceed, hipFunction_t kernel, const int grid_size, - const int block_size_x, const int block_size_y, - const int block_size_z, void **args) { - CeedChk_Hip(ceed, hipModuleLaunchKernel(kernel, grid_size, 1, 1, - block_size_x, block_size_y, block_size_z, - 0, NULL, args, NULL)); +int CeedRunKernelDimHip(Ceed ceed, hipFunction_t kernel, const int grid_size, const int block_size_x, const int block_size_y, const int block_size_z, + void **args) { + CeedCallHip(ceed, hipModuleLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, 0, NULL, args, NULL)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Run HIP kernel for spatial dimension with shared memory //------------------------------------------------------------------------------ -int CeedRunKernelDimSharedHip(Ceed ceed, hipFunction_t kernel, const int grid_size, - const int block_size_x, const int block_size_y, - const int block_size_z, const int shared_mem_size, - void **args) { - CeedChk_Hip(ceed, hipModuleLaunchKernel(kernel, grid_size, 1, 1, - block_size_x, block_size_y, block_size_z, - shared_mem_size, NULL, args, NULL)); +int CeedRunKernelDimSharedHip(Ceed ceed, hipFunction_t kernel, const int grid_size, const int block_size_x, const int block_size_y, + const int block_size_z, const int shared_mem_size, void **args) { + CeedCallHip(ceed, hipModuleLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, NULL, args, NULL)); return CEED_ERROR_SUCCESS; } diff --git a/backends/hip/ceed-hip-compile.h b/backends/hip/ceed-hip-compile.h index abcb260270..7a110ed5a2 100644 --- a/backends/hip/ceed-hip-compile.h +++ b/backends/hip/ceed-hip-compile.h @@ -8,30 +8,22 @@ 
#ifndef _ceed_hip_compile_h #define _ceed_hip_compile_h +#include #include #include -static inline CeedInt CeedDivUpInt(CeedInt numerator, CeedInt denominator) { - return (numerator + denominator - 1) / denominator; -} +static inline CeedInt CeedDivUpInt(CeedInt numerator, CeedInt denominator) { return (numerator + denominator - 1) / denominator; } -CEED_INTERN int CeedCompileHip(Ceed ceed, const char *source, - hipModule_t *module, const CeedInt num_defines, ...); +CEED_INTERN int CeedCompileHip(Ceed ceed, const char *source, hipModule_t *module, const CeedInt num_defines, ...); -CEED_INTERN int CeedGetKernelHip(Ceed ceed, hipModule_t module, - const char *name, hipFunction_t *kernel); +CEED_INTERN int CeedGetKernelHip(Ceed ceed, hipModule_t module, const char *name, hipFunction_t *kernel); -CEED_INTERN int CeedRunKernelHip(Ceed ceed, hipFunction_t kernel, - const int grid_size, - const int block_size, void **args); +CEED_INTERN int CeedRunKernelHip(Ceed ceed, hipFunction_t kernel, const int grid_size, const int block_size, void **args); -CEED_INTERN int CeedRunKernelDimHip(Ceed ceed, hipFunction_t kernel, - const int grid_size, - const int block_size_x, const int block_size_y, +CEED_INTERN int CeedRunKernelDimHip(Ceed ceed, hipFunction_t kernel, const int grid_size, const int block_size_x, const int block_size_y, const int block_size_z, void **args); -CEED_INTERN int CeedRunKernelDimSharedHip(Ceed ceed, hipFunction_t kernel, - const int grid_size, const int block_size_x, const int block_size_y, - const int block_size_z, const int shared_mem_size, void **args); +CEED_INTERN int CeedRunKernelDimSharedHip(Ceed ceed, hipFunction_t kernel, const int grid_size, const int block_size_x, const int block_size_y, + const int block_size_z, const int shared_mem_size, void **args); -#endif // _ceed_hip_compile_h +#endif // _ceed_hip_compile_h diff --git a/backends/magma/ceed-magma-basis.c b/backends/magma/ceed-magma-basis.c index 4c55667a34..43cfc6d81e 100644 --- 
a/backends/magma/ceed-magma-basis.c +++ b/backends/magma/ceed-magma-basis.c @@ -5,10 +5,11 @@ // // This file is part of CEED: http://github.com/ceed -#include #include +#include #include #include + #include "ceed-magma.h" #ifdef CEED_MAGMA_USE_HIP #include "../hip/ceed-hip-common.h" @@ -21,561 +22,477 @@ #ifdef __cplusplus CEED_INTERN "C" #endif -int CeedBasisApply_Magma(CeedBasis basis, CeedInt nelem, - CeedTransposeMode tmode, CeedEvalMode emode, - CeedVector U, CeedVector V) { - int ierr; + int + CeedBasisApply_Magma(CeedBasis basis, CeedInt nelem, CeedTransposeMode tmode, CeedEvalMode emode, CeedVector U, CeedVector V) { Ceed ceed; - ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); CeedInt dim, ncomp, ndof; - ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); - ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr); - ierr = CeedBasisGetNumNodes(basis, &ndof); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetDimension(basis, &dim)); + CeedCallBackend(CeedBasisGetNumComponents(basis, &ncomp)); + CeedCallBackend(CeedBasisGetNumNodes(basis, &ndof)); Ceed_Magma *data; - ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr); + CeedCallBackend(CeedGetData(ceed, &data)); const CeedScalar *u; - CeedScalar *v; + CeedScalar *v; if (emode != CEED_EVAL_WEIGHT) { - ierr = CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &u); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &u)); } else if (emode != CEED_EVAL_WEIGHT) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "An input vector is required for this CeedEvalMode"); + return CeedError(ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); // LCOV_EXCL_STOP } - ierr = CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &v); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &v)); CeedBasis_Magma *impl; - ierr = CeedBasisGetData(basis, 
&impl); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetData(basis, &impl)); CeedInt P1d, Q1d; - ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr); - ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P1d)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q1d)); - CeedDebug256(ceed, 4, "[CeedBasisApply_Magma] vsize=%" CeedInt_FMT - ", comp = %" CeedInt_FMT, ncomp*CeedIntPow(P1d, dim), ncomp); + CeedDebug256(ceed, 4, "[CeedBasisApply_Magma] vsize=%" CeedInt_FMT ", comp = %" CeedInt_FMT, ncomp * CeedIntPow(P1d, dim), ncomp); if (tmode == CEED_TRANSPOSE) { CeedSize length; - ierr = CeedVectorGetLength(V, &length); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetLength(V, &length)); if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { - magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *) v, length, - data->queue); + magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *)v, length, data->queue); } else { - magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *) v, length, - data->queue); + magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *)v, length, data->queue); } - ceed_magma_queue_sync( data->queue ); + ceed_magma_queue_sync(data->queue); } switch (emode) { - case CEED_EVAL_INTERP: { - CeedInt P = P1d, Q = Q1d; - if (tmode == CEED_TRANSPOSE) { - P = Q1d; Q = P1d; - } - - // Define element sizes for dofs/quad - CeedInt elquadsize = CeedIntPow(Q1d, dim); - CeedInt eldofssize = CeedIntPow(P1d, dim); + case CEED_EVAL_INTERP: { + CeedInt P = P1d, Q = Q1d; + if (tmode == CEED_TRANSPOSE) { + P = Q1d; + Q = P1d; + } - // E-vector ordering -------------- Q-vector ordering - // component component - // elem elem - // node node + // Define element sizes for dofs/quad + CeedInt elquadsize = CeedIntPow(Q1d, dim); + CeedInt eldofssize = CeedIntPow(P1d, dim); - // --- Define strides for NOTRANSPOSE mode: --- - // Input (u) is E-vector, output (v) is Q-vector + // 
E-vector ordering -------------- Q-vector ordering + // component component + // elem elem + // node node - // Element strides - CeedInt u_elstride = eldofssize; - CeedInt v_elstride = elquadsize; - // Component strides - CeedInt u_compstride = nelem * eldofssize; - CeedInt v_compstride = nelem * elquadsize; + // --- Define strides for NOTRANSPOSE mode: --- + // Input (u) is E-vector, output (v) is Q-vector - // --- Swap strides for TRANSPOSE mode: --- - if (tmode == CEED_TRANSPOSE) { - // Input (u) is Q-vector, output (v) is E-vector // Element strides - v_elstride = eldofssize; - u_elstride = elquadsize; + CeedInt u_elstride = eldofssize; + CeedInt v_elstride = elquadsize; // Component strides - v_compstride = nelem * eldofssize; - u_compstride = nelem * elquadsize; - } + CeedInt u_compstride = nelem * eldofssize; + CeedInt v_compstride = nelem * elquadsize; + + // --- Swap strides for TRANSPOSE mode: --- + if (tmode == CEED_TRANSPOSE) { + // Input (u) is Q-vector, output (v) is E-vector + // Element strides + v_elstride = eldofssize; + u_elstride = elquadsize; + // Component strides + v_compstride = nelem * eldofssize; + u_compstride = nelem * elquadsize; + } - CeedInt nthreads = 1; - CeedInt ntcol = 1; - CeedInt shmem = 0; - CeedInt maxPQ = CeedIntMax(P, Q); + CeedInt nthreads = 1; + CeedInt ntcol = 1; + CeedInt shmem = 0; + CeedInt maxPQ = CeedIntMax(P, Q); + + switch (dim) { + case 1: + nthreads = maxPQ; + ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_1D); + shmem += sizeof(CeedScalar) * ntcol * (ncomp * (1 * P + 1 * Q)); + shmem += sizeof(CeedScalar) * (P * Q); + break; + case 2: + nthreads = maxPQ; + ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_2D); + shmem += P * Q * sizeof(CeedScalar); // for sT + shmem += ntcol * (P * maxPQ * sizeof(CeedScalar)); // for reforming rU we need PxP, and for the intermediate output we need PxQ + break; + case 3: + nthreads = maxPQ * maxPQ; + ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_3D); + shmem += 
sizeof(CeedScalar) * (P * Q); // for sT + shmem += sizeof(CeedScalar) * ntcol * + (CeedIntMax(P * P * maxPQ, + P * Q * Q)); // rU needs P^2xP, the intermediate output needs max(P^2xQ,PQ^2) + } + CeedInt grid = (nelem + ntcol - 1) / ntcol; + void *args[] = {&impl->dinterp1d, &u, &u_elstride, &u_compstride, &v, &v_elstride, &v_compstride, &nelem}; - switch (dim) { - case 1: - nthreads = maxPQ; - ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_1D); - shmem += sizeof(CeedScalar) * ntcol * ( ncomp * (1*P + 1*Q) ); - shmem += sizeof(CeedScalar) * (P*Q); - break; - case 2: - nthreads = maxPQ; - ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_2D); - shmem += P*Q *sizeof(CeedScalar); // for sT - shmem += ntcol * ( P*maxPQ*sizeof( - CeedScalar) ); // for reforming rU we need PxP, and for the intermediate output we need PxQ - break; - case 3: - nthreads = maxPQ*maxPQ; - ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_3D); - shmem += sizeof(CeedScalar)* (P*Q); // for sT - shmem += sizeof(CeedScalar)* ntcol * (CeedIntMax(P*P*maxPQ, - P*Q*Q)); // rU needs P^2xP, the intermediate output needs max(P^2xQ,PQ^2) - } - CeedInt grid = (nelem + ntcol-1) / ntcol; - void *args[] = {&impl->dinterp1d, - &u, &u_elstride, &u_compstride, - &v, &v_elstride, &v_compstride, - &nelem - }; - - if (tmode == CEED_TRANSPOSE) { - ierr = CeedRunKernelDimSharedMagma(ceed, impl->magma_interp_tr, grid, - nthreads, ntcol, 1, shmem, - args); CeedChkBackend(ierr); - } else { - ierr = CeedRunKernelDimSharedMagma(ceed, impl->magma_interp, grid, - nthreads, ntcol, 1, shmem, - args); CeedChkBackend(ierr); - } - } - break; - case CEED_EVAL_GRAD: { - CeedInt P = P1d, Q = Q1d; - // In CEED_NOTRANSPOSE mode: - // u is (P^dim x nc), column-major layout (nc = ncomp) - // v is (Q^dim x nc x dim), column-major layout (nc = ncomp) - // In CEED_TRANSPOSE mode, the sizes of u and v are switched. 
- if (tmode == CEED_TRANSPOSE) { - P = Q1d, Q = P1d; - } + if (tmode == CEED_TRANSPOSE) { + CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_interp_tr, grid, nthreads, ntcol, 1, shmem, args)); + } else { + CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_interp, grid, nthreads, ntcol, 1, shmem, args)); + } + } break; + case CEED_EVAL_GRAD: { + CeedInt P = P1d, Q = Q1d; + // In CEED_NOTRANSPOSE mode: + // u is (P^dim x nc), column-major layout (nc = ncomp) + // v is (Q^dim x nc x dim), column-major layout (nc = ncomp) + // In CEED_TRANSPOSE mode, the sizes of u and v are switched. + if (tmode == CEED_TRANSPOSE) { + P = Q1d, Q = P1d; + } - // Define element sizes for dofs/quad - CeedInt elquadsize = CeedIntPow(Q1d, dim); - CeedInt eldofssize = CeedIntPow(P1d, dim); - - // E-vector ordering -------------- Q-vector ordering - // dim - // component component - // elem elem - // node node - - // --- Define strides for NOTRANSPOSE mode: --- - // Input (u) is E-vector, output (v) is Q-vector - - // Element strides - CeedInt u_elstride = eldofssize; - CeedInt v_elstride = elquadsize; - // Component strides - CeedInt u_compstride = nelem * eldofssize; - CeedInt v_compstride = nelem * elquadsize; - // Dimension strides - CeedInt u_dimstride = 0; - CeedInt v_dimstride = nelem * elquadsize * ncomp; - - // --- Swap strides for TRANSPOSE mode: --- - if (tmode == CEED_TRANSPOSE) { - // Input (u) is Q-vector, output (v) is E-vector - // Element strides - v_elstride = eldofssize; - u_elstride = elquadsize; - // Component strides - v_compstride = nelem * eldofssize; - u_compstride = nelem * elquadsize; - // Dimension strides - v_dimstride = 0; - u_dimstride = nelem * elquadsize * ncomp; + // Define element sizes for dofs/quad + CeedInt elquadsize = CeedIntPow(Q1d, dim); + CeedInt eldofssize = CeedIntPow(P1d, dim); - } + // E-vector ordering -------------- Q-vector ordering + // dim + // component component + // elem elem + // node node - CeedInt nthreads = 
1; - CeedInt ntcol = 1; - CeedInt shmem = 0; - CeedInt maxPQ = CeedIntMax(P, Q); + // --- Define strides for NOTRANSPOSE mode: --- + // Input (u) is E-vector, output (v) is Q-vector - switch (dim) { - case 1: - nthreads = maxPQ; - ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_1D); - shmem += sizeof(CeedScalar) * ntcol * (ncomp * (1*P + 1*Q)); - shmem += sizeof(CeedScalar) * (P*Q); - break; - case 2: - nthreads = maxPQ; - ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_2D); - shmem += sizeof(CeedScalar) * 2*P*Q; // for sTinterp and sTgrad - shmem += sizeof(CeedScalar) * ntcol * - (P*maxPQ); // for reforming rU we need PxP, and for the intermediate output we need PxQ - break; - case 3: - nthreads = maxPQ * maxPQ; - ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_3D); - shmem += sizeof(CeedScalar) * 2*P*Q; // for sTinterp and sTgrad - shmem += sizeof(CeedScalar) * ntcol * CeedIntMax(P*P*P, - (P*P*Q) + - (P*Q*Q)); // rU needs P^2xP, the intermediate outputs need (P^2.Q + P.Q^2) - } - CeedInt grid = (nelem + ntcol-1) / ntcol; - void *args[] = {&impl->dinterp1d, &impl->dgrad1d, - &u, &u_elstride, &u_compstride, &u_dimstride, - &v, &v_elstride, &v_compstride, &v_dimstride, - &nelem - }; - - if (tmode == CEED_TRANSPOSE) { - ierr = CeedRunKernelDimSharedMagma(ceed, impl->magma_grad_tr, grid, - nthreads, ntcol, 1, shmem, - args); CeedChkBackend(ierr); - } else { - ierr = CeedRunKernelDimSharedMagma(ceed, impl->magma_grad, grid, - nthreads, ntcol, 1, shmem, - args); CeedChkBackend(ierr); - } - } - break; - case CEED_EVAL_WEIGHT: { - if (tmode == CEED_TRANSPOSE) - // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); - // LCOV_EXCL_STOP - CeedInt Q = Q1d; - CeedInt eldofssize = CeedIntPow(Q, dim); - CeedInt nthreads = 1; - CeedInt ntcol = 1; - CeedInt shmem = 0; + // Element strides + CeedInt u_elstride = eldofssize; + CeedInt v_elstride = elquadsize; + // Component strides + CeedInt 
u_compstride = nelem * eldofssize; + CeedInt v_compstride = nelem * elquadsize; + // Dimension strides + CeedInt u_dimstride = 0; + CeedInt v_dimstride = nelem * elquadsize * ncomp; + + // --- Swap strides for TRANSPOSE mode: --- + if (tmode == CEED_TRANSPOSE) { + // Input (u) is Q-vector, output (v) is E-vector + // Element strides + v_elstride = eldofssize; + u_elstride = elquadsize; + // Component strides + v_compstride = nelem * eldofssize; + u_compstride = nelem * elquadsize; + // Dimension strides + v_dimstride = 0; + u_dimstride = nelem * elquadsize * ncomp; + } - switch (dim) { - case 1: - nthreads = Q; - ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_1D); - shmem += sizeof(CeedScalar) * Q; // for dqweight1d - shmem += sizeof(CeedScalar) * ntcol * Q; // for output - break; - case 2: - nthreads = Q; - ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_2D); - shmem += sizeof(CeedScalar) * Q; // for dqweight1d - break; - case 3: - nthreads = Q * Q; - ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_3D); - shmem += sizeof(CeedScalar) * Q; // for dqweight1d - } - CeedInt grid = (nelem + ntcol-1) / ntcol; - void *args[] = {&impl->dqweight1d, &v, &eldofssize, &nelem}; + CeedInt nthreads = 1; + CeedInt ntcol = 1; + CeedInt shmem = 0; + CeedInt maxPQ = CeedIntMax(P, Q); + + switch (dim) { + case 1: + nthreads = maxPQ; + ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_1D); + shmem += sizeof(CeedScalar) * ntcol * (ncomp * (1 * P + 1 * Q)); + shmem += sizeof(CeedScalar) * (P * Q); + break; + case 2: + nthreads = maxPQ; + ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_2D); + shmem += sizeof(CeedScalar) * 2 * P * Q; // for sTinterp and sTgrad + shmem += sizeof(CeedScalar) * ntcol * (P * maxPQ); // for reforming rU we need PxP, and for the intermediate output we need PxQ + break; + case 3: + nthreads = maxPQ * maxPQ; + ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_3D); + shmem += sizeof(CeedScalar) * 2 * P * Q; // for sTinterp and sTgrad + 
shmem += sizeof(CeedScalar) * ntcol * + CeedIntMax(P * P * P, + (P * P * Q) + (P * Q * Q)); // rU needs P^2xP, the intermediate outputs need (P^2.Q + P.Q^2) + } + CeedInt grid = (nelem + ntcol - 1) / ntcol; + void *args[] = {&impl->dinterp1d, &impl->dgrad1d, &u, &u_elstride, &u_compstride, &u_dimstride, &v, + &v_elstride, &v_compstride, &v_dimstride, &nelem}; + + if (tmode == CEED_TRANSPOSE) { + CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_grad_tr, grid, nthreads, ntcol, 1, shmem, args)); + } else { + CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_grad, grid, nthreads, ntcol, 1, shmem, args)); + } + } break; + case CEED_EVAL_WEIGHT: { + if (tmode == CEED_TRANSPOSE) + // LCOV_EXCL_START + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); + // LCOV_EXCL_STOP + CeedInt Q = Q1d; + CeedInt eldofssize = CeedIntPow(Q, dim); + CeedInt nthreads = 1; + CeedInt ntcol = 1; + CeedInt shmem = 0; + + switch (dim) { + case 1: + nthreads = Q; + ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_1D); + shmem += sizeof(CeedScalar) * Q; // for dqweight1d + shmem += sizeof(CeedScalar) * ntcol * Q; // for output + break; + case 2: + nthreads = Q; + ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_2D); + shmem += sizeof(CeedScalar) * Q; // for dqweight1d + break; + case 3: + nthreads = Q * Q; + ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_3D); + shmem += sizeof(CeedScalar) * Q; // for dqweight1d + } + CeedInt grid = (nelem + ntcol - 1) / ntcol; + void *args[] = {&impl->dqweight1d, &v, &eldofssize, &nelem}; - ierr = CeedRunKernelDimSharedMagma(ceed, impl->magma_weight, grid, - nthreads, ntcol, 1, shmem, - args); CeedChkBackend(ierr); - } - break; - // LCOV_EXCL_START - case CEED_EVAL_DIV: - return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); - case CEED_EVAL_CURL: - return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); - case CEED_EVAL_NONE: - return 
CeedError(ceed, CEED_ERROR_BACKEND, - "CEED_EVAL_NONE does not make sense in this context"); - // LCOV_EXCL_STOP + CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_weight, grid, nthreads, ntcol, 1, shmem, args)); + } break; + // LCOV_EXCL_START + case CEED_EVAL_DIV: + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); + case CEED_EVAL_CURL: + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); + case CEED_EVAL_NONE: + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context"); + // LCOV_EXCL_STOP } // must sync to ensure completeness - ceed_magma_queue_sync( data->queue ); + ceed_magma_queue_sync(data->queue); - if (emode!=CEED_EVAL_WEIGHT) { - ierr = CeedVectorRestoreArrayRead(U, &u); CeedChkBackend(ierr); + if (emode != CEED_EVAL_WEIGHT) { + CeedCallBackend(CeedVectorRestoreArrayRead(U, &u)); } - ierr = CeedVectorRestoreArray(V, &v); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArray(V, &v)); return CEED_ERROR_SUCCESS; } #ifdef __cplusplus CEED_INTERN "C" #endif -int CeedBasisApplyNonTensor_f64_Magma(CeedBasis basis, CeedInt nelem, - CeedTransposeMode tmode, CeedEvalMode emode, - CeedVector U, CeedVector V) { - int ierr; + int + CeedBasisApplyNonTensor_f64_Magma(CeedBasis basis, CeedInt nelem, CeedTransposeMode tmode, CeedEvalMode emode, CeedVector U, CeedVector V) { Ceed ceed; - ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); Ceed_Magma *data; - ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr); + CeedCallBackend(CeedGetData(ceed, &data)); CeedInt dim, ncomp, ndof, nqpt; - ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); - ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr); - ierr = CeedBasisGetNumNodes(basis, &ndof); CeedChkBackend(ierr); - ierr = CeedBasisGetNumQuadraturePoints(basis, &nqpt); CeedChkBackend(ierr); + 
CeedCallBackend(CeedBasisGetDimension(basis, &dim)); + CeedCallBackend(CeedBasisGetNumComponents(basis, &ncomp)); + CeedCallBackend(CeedBasisGetNumNodes(basis, &ndof)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &nqpt)); const CeedScalar *du; - CeedScalar *dv; + CeedScalar *dv; if (emode != CEED_EVAL_WEIGHT) { - ierr = CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &du); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &du)); } else if (emode != CEED_EVAL_WEIGHT) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "An input vector is required for this CeedEvalMode"); + return CeedError(ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); // LCOV_EXCL_STOP } - ierr = CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &dv); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &dv)); CeedBasisNonTensor_Magma *impl; - ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetData(basis, &impl)); - CeedDebug256(ceed, 4, "[CeedBasisApplyNonTensor_Magma] vsize=%" CeedInt_FMT - ", comp = %" CeedInt_FMT, ncomp*ndof, ncomp); + CeedDebug256(ceed, 4, "[CeedBasisApplyNonTensor_Magma] vsize=%" CeedInt_FMT ", comp = %" CeedInt_FMT, ncomp * ndof, ncomp); if (tmode == CEED_TRANSPOSE) { CeedSize length; - ierr = CeedVectorGetLength(V, &length); + CeedCallBackend(CeedVectorGetLength(V, &length)); if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { - magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *) dv, length, - data->queue); + magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *)dv, length, data->queue); } else { - magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *) dv, length, - data->queue); + magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *)dv, length, data->queue); } - ceed_magma_queue_sync( data->queue ); + ceed_magma_queue_sync(data->queue); } switch (emode) { - case CEED_EVAL_INTERP: { - CeedInt P = ndof, Q = nqpt; - 
if (tmode == CEED_TRANSPOSE) - magma_dgemm_nontensor(MagmaNoTrans, MagmaNoTrans, - P, nelem*ncomp, Q, - 1.0, (double *)impl->dinterp, P, - (double *)du, Q, - 0.0, (double *)dv, P, data->queue); - else - magma_dgemm_nontensor(MagmaTrans, MagmaNoTrans, - Q, nelem*ncomp, P, - 1.0, (double *)impl->dinterp, P, - (double *)du, P, - 0.0, (double *)dv, Q, data->queue); - } - break; - - case CEED_EVAL_GRAD: { - CeedInt P = ndof, Q = nqpt; - if (tmode == CEED_TRANSPOSE) { - CeedScalar beta = 0.0; - for(int d=0; d<dim; d++) { - if (d>0) - beta = 1.0; - magma_dgemm_nontensor(MagmaNoTrans, MagmaNoTrans, - P, nelem*ncomp, Q, - 1.0, (double *)(impl->dgrad + d*P*Q), P, - (double *)(du + d*nelem*ncomp*Q), Q, - beta, (double *)dv, P, data->queue); + case CEED_EVAL_INTERP: { + CeedInt P = ndof, Q = nqpt; + if (tmode == CEED_TRANSPOSE) + magma_dgemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, nelem * ncomp, Q, 1.0, (double *)impl->dinterp, P, (double *)du, Q, 0.0, (double *)dv, P, + data->queue); + else + magma_dgemm_nontensor(MagmaTrans, MagmaNoTrans, Q, nelem * ncomp, P, 1.0, (double *)impl->dinterp, P, (double *)du, P, 0.0, (double *)dv, Q, + data->queue); + } break; + + case CEED_EVAL_GRAD: { + CeedInt P = ndof, Q = nqpt; + if (tmode == CEED_TRANSPOSE) { + CeedScalar beta = 0.0; + for (int d = 0; d < dim; d++) { + if (d > 0) beta = 1.0; + magma_dgemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, nelem * ncomp, Q, 1.0, (double *)(impl->dgrad + d * P * Q), P, + (double *)(du + d * nelem * ncomp * Q), Q, beta, (double *)dv, P, data->queue); + } + } else { + for (int d = 0; d < dim; d++) + magma_dgemm_nontensor(MagmaTrans, MagmaNoTrans, Q, nelem * ncomp, P, 1.0, (double *)(impl->dgrad + d * P * Q), P, (double *)du, P, 0.0, + (double *)(dv + d * nelem * ncomp * Q), Q, data->queue); } - } else { - for(int d=0; d< dim; d++) - magma_dgemm_nontensor(MagmaTrans, MagmaNoTrans, - Q, nelem*ncomp, P, - 1.0, (double *)(impl->dgrad + d*P*Q), P, - (double *)du, P, - 0.0, (double *)(dv + d*nelem*ncomp*Q), Q, data->queue); - }
- } - break; + } break; - case CEED_EVAL_WEIGHT: { - if (tmode == CEED_TRANSPOSE) - // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); - // LCOV_EXCL_STOP + case CEED_EVAL_WEIGHT: { + if (tmode == CEED_TRANSPOSE) + // LCOV_EXCL_START + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); + // LCOV_EXCL_STOP - int elemsPerBlock = 1;//basis->Q1d < 7 ? optElems[basis->Q1d] : 1; - int grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)? - 1 : 0 ); - magma_weight_nontensor(grid, nqpt, nelem, nqpt, impl->dqweight, dv, - data->queue); - CeedChkBackend(ierr); - } - break; - - // LCOV_EXCL_START - case CEED_EVAL_DIV: - return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); - case CEED_EVAL_CURL: - return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); - case CEED_EVAL_NONE: - return CeedError(ceed, CEED_ERROR_BACKEND, - "CEED_EVAL_NONE does not make sense in this context"); - // LCOV_EXCL_STOP + int elemsPerBlock = 1; // basis->Q1d < 7 ? optElems[basis->Q1d] : 1; + int grid = nelem / elemsPerBlock + ((nelem / elemsPerBlock * elemsPerBlock < nelem) ?
1 : 0); + magma_weight_nontensor(grid, nqpt, nelem, nqpt, impl->dqweight, dv, data->queue); + } break; + + // LCOV_EXCL_START + case CEED_EVAL_DIV: + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); + case CEED_EVAL_CURL: + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); + case CEED_EVAL_NONE: + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context"); + // LCOV_EXCL_STOP } // must sync to ensure completeness - ceed_magma_queue_sync( data->queue ); + ceed_magma_queue_sync(data->queue); - if (emode!=CEED_EVAL_WEIGHT) { - ierr = CeedVectorRestoreArrayRead(U, &du); CeedChkBackend(ierr); + if (emode != CEED_EVAL_WEIGHT) { + CeedCallBackend(CeedVectorRestoreArrayRead(U, &du)); } - ierr = CeedVectorRestoreArray(V, &dv); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArray(V, &dv)); return CEED_ERROR_SUCCESS; } -int CeedBasisApplyNonTensor_f32_Magma(CeedBasis basis, CeedInt nelem, - CeedTransposeMode tmode, CeedEvalMode emode, - CeedVector U, CeedVector V) { - int ierr; +int CeedBasisApplyNonTensor_f32_Magma(CeedBasis basis, CeedInt nelem, CeedTransposeMode tmode, CeedEvalMode emode, CeedVector U, CeedVector V) { Ceed ceed; - ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); Ceed_Magma *data; - ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr); + CeedCallBackend(CeedGetData(ceed, &data)); CeedInt dim, ncomp, ndof, nqpt; - ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); - ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr); - ierr = CeedBasisGetNumNodes(basis, &ndof); CeedChkBackend(ierr); - ierr = CeedBasisGetNumQuadraturePoints(basis, &nqpt); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetDimension(basis, &dim)); + CeedCallBackend(CeedBasisGetNumComponents(basis, &ncomp)); + CeedCallBackend(CeedBasisGetNumNodes(basis, &ndof)); + 
CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &nqpt)); const CeedScalar *du; - CeedScalar *dv; + CeedScalar *dv; if (emode != CEED_EVAL_WEIGHT) { - ierr = CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &du); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &du)); } else if (emode != CEED_EVAL_WEIGHT) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "An input vector is required for this CeedEvalMode"); + return CeedError(ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); // LCOV_EXCL_STOP } - ierr = CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &dv); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &dv)); CeedBasisNonTensor_Magma *impl; - ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetData(basis, &impl)); - CeedDebug256(ceed, 4, "[CeedBasisApplyNonTensor_Magma] vsize=%" CeedInt_FMT - ", comp = %" CeedInt_FMT, ncomp*ndof, ncomp); + CeedDebug256(ceed, 4, "[CeedBasisApplyNonTensor_Magma] vsize=%" CeedInt_FMT ", comp = %" CeedInt_FMT, ncomp * ndof, ncomp); if (tmode == CEED_TRANSPOSE) { CeedSize length; - ierr = CeedVectorGetLength(V, &length); + CeedCallBackend(CeedVectorGetLength(V, &length)); if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { - magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *) dv, length, - data->queue); + magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *)dv, length, data->queue); } else { - magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *) dv, length, - data->queue); + magmablas_dlaset(MagmaFull, length, 1, 0., 0., (double *)dv, length, data->queue); } - ceed_magma_queue_sync( data->queue ); + ceed_magma_queue_sync(data->queue); } switch (emode) { - case CEED_EVAL_INTERP: { - CeedInt P = ndof, Q = nqpt; - if (tmode == CEED_TRANSPOSE) - magma_sgemm_nontensor(MagmaNoTrans, MagmaNoTrans, - P, nelem*ncomp, Q, - 1.0, (float *)impl->dinterp, P, - (float *)du, Q, - 0.0, (float 
*)dv, P, data->queue); - else - magma_sgemm_nontensor(MagmaTrans, MagmaNoTrans, - Q, nelem*ncomp, P, - 1.0, (float *)impl->dinterp, P, - (float *)du, P, - 0.0, (float *)dv, Q, data->queue); - } - break; - - case CEED_EVAL_GRAD: { - CeedInt P = ndof, Q = nqpt; - if (tmode == CEED_TRANSPOSE) { - CeedScalar beta = 0.0; - for(int d=0; d<dim; d++) { - if (d>0) - beta = 1.0; - magma_sgemm_nontensor(MagmaNoTrans, MagmaNoTrans, - P, nelem*ncomp, Q, - 1.0, (float *)(impl->dgrad + d*P*Q), P, - (float *)(du + d*nelem*ncomp*Q), Q, - beta, (float *)dv, P, data->queue); + case CEED_EVAL_INTERP: { + CeedInt P = ndof, Q = nqpt; + if (tmode == CEED_TRANSPOSE) + magma_sgemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, nelem * ncomp, Q, 1.0, (float *)impl->dinterp, P, (float *)du, Q, 0.0, (float *)dv, P, + data->queue); + else + magma_sgemm_nontensor(MagmaTrans, MagmaNoTrans, Q, nelem * ncomp, P, 1.0, (float *)impl->dinterp, P, (float *)du, P, 0.0, (float *)dv, Q, + data->queue); + } break; + + case CEED_EVAL_GRAD: { + CeedInt P = ndof, Q = nqpt; + if (tmode == CEED_TRANSPOSE) { + CeedScalar beta = 0.0; + for (int d = 0; d < dim; d++) { + if (d > 0) beta = 1.0; + magma_sgemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, nelem * ncomp, Q, 1.0, (float *)(impl->dgrad + d * P * Q), P, + (float *)(du + d * nelem * ncomp * Q), Q, beta, (float *)dv, P, data->queue); + } + } else { + for (int d = 0; d < dim; d++) + magma_sgemm_nontensor(MagmaTrans, MagmaNoTrans, Q, nelem * ncomp, P, 1.0, (float *)(impl->dgrad + d * P * Q), P, (float *)du, P, 0.0, + (float *)(dv + d * nelem * ncomp * Q), Q, data->queue); } - } else { - for(int d=0; d< dim; d++) - magma_sgemm_nontensor(MagmaTrans, MagmaNoTrans, - Q, nelem*ncomp, P, - 1.0, (float *)(impl->dgrad + d*P*Q), P, - (float *)du, P, - 0.0, (float *)(dv + d*nelem*ncomp*Q), Q, data->queue); - } - } - break; + } break; - case CEED_EVAL_WEIGHT: { - if (tmode == CEED_TRANSPOSE) - // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "CEED_EVAL_WEIGHT incompatible with
CEED_TRANSPOSE"); - // LCOV_EXCL_STOP + case CEED_EVAL_WEIGHT: { + if (tmode == CEED_TRANSPOSE) + // LCOV_EXCL_START + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); + // LCOV_EXCL_STOP - int elemsPerBlock = 1;//basis->Q1d < 7 ? optElems[basis->Q1d] : 1; - int grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)? - 1 : 0 ); - magma_weight_nontensor(grid, nqpt, nelem, nqpt, impl->dqweight, dv, - data->queue); - CeedChkBackend(ierr); - } - break; - - // LCOV_EXCL_START - case CEED_EVAL_DIV: - return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); - case CEED_EVAL_CURL: - return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); - case CEED_EVAL_NONE: - return CeedError(ceed, CEED_ERROR_BACKEND, - "CEED_EVAL_NONE does not make sense in this context"); - // LCOV_EXCL_STOP + int elemsPerBlock = 1; // basis->Q1d < 7 ? optElems[basis->Q1d] : 1; + int grid = nelem / elemsPerBlock + ((nelem / elemsPerBlock * elemsPerBlock < nelem) ? 1 : 0); + magma_weight_nontensor(grid, nqpt, nelem, nqpt, impl->dqweight, dv, data->queue); + } break; + + // LCOV_EXCL_START + case CEED_EVAL_DIV: + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); + case CEED_EVAL_CURL: + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); + case CEED_EVAL_NONE: + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context"); + // LCOV_EXCL_STOP } // must sync to ensure completeness - ceed_magma_queue_sync( data->queue ); + ceed_magma_queue_sync(data->queue); - if (emode!=CEED_EVAL_WEIGHT) { - ierr = CeedVectorRestoreArrayRead(U, &du); CeedChkBackend(ierr); + if (emode != CEED_EVAL_WEIGHT) { + CeedCallBackend(CeedVectorRestoreArrayRead(U, &du)); } - ierr = CeedVectorRestoreArray(V, &dv); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArray(V, &dv)); return CEED_ERROR_SUCCESS; } #ifdef __cplusplus CEED_INTERN "C" #endif -int CeedBasisDestroy_Magma(CeedBasis basis) { - int ierr; + int +
CeedBasisDestroy_Magma(CeedBasis basis) { CeedBasis_Magma *impl; - ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetData(basis, &impl)); - ierr = magma_free(impl->dqref1d); CeedChkBackend(ierr); - ierr = magma_free(impl->dinterp1d); CeedChkBackend(ierr); - ierr = magma_free(impl->dgrad1d); CeedChkBackend(ierr); - ierr = magma_free(impl->dqweight1d); CeedChkBackend(ierr); + CeedCallBackend(magma_free(impl->dqref1d)); + CeedCallBackend(magma_free(impl->dinterp1d)); + CeedCallBackend(magma_free(impl->dgrad1d)); + CeedCallBackend(magma_free(impl->dqweight1d)); Ceed ceed; - ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); - #ifdef CEED_MAGMA_USE_HIP - ierr = hipModuleUnload(impl->module); CeedChk_Hip(ceed, ierr); - #else - ierr = cuModuleUnload(impl->module); CeedChk_Cu(ceed, ierr); - #endif + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); +#ifdef CEED_MAGMA_USE_HIP + CeedCallHip(ceed, hipModuleUnload(impl->module)); +#else + CeedCallCuda(ceed, cuModuleUnload(impl->module)); +#endif - ierr = CeedFree(&impl); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl)); return CEED_ERROR_SUCCESS; } @@ -583,17 +500,17 @@ int CeedBasisDestroy_Magma(CeedBasis basis) { #ifdef __cplusplus CEED_INTERN "C" #endif -int CeedBasisDestroyNonTensor_Magma(CeedBasis basis) { - int ierr; + int + CeedBasisDestroyNonTensor_Magma(CeedBasis basis) { CeedBasisNonTensor_Magma *impl; - ierr = CeedBasisGetData(basis, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetData(basis, &impl)); - ierr = magma_free(impl->dqref); CeedChkBackend(ierr); - ierr = magma_free(impl->dinterp); CeedChkBackend(ierr); - ierr = magma_free(impl->dgrad); CeedChkBackend(ierr); - ierr = magma_free(impl->dqweight); CeedChkBackend(ierr); + CeedCallBackend(magma_free(impl->dqref)); + CeedCallBackend(magma_free(impl->dinterp)); + CeedCallBackend(magma_free(impl->dgrad)); + CeedCallBackend(magma_free(impl->dqweight)); - ierr = CeedFree(&impl); 
CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl)); return CEED_ERROR_SUCCESS; } @@ -601,167 +518,103 @@ int CeedBasisDestroyNonTensor_Magma(CeedBasis basis) { #ifdef __cplusplus CEED_INTERN "C" #endif -int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P1d, CeedInt Q1d, - const CeedScalar *interp1d, - const CeedScalar *grad1d, - const CeedScalar *qref1d, - const CeedScalar *qweight1d, CeedBasis basis) { - int ierr; + int + CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P1d, CeedInt Q1d, const CeedScalar *interp1d, const CeedScalar *grad1d, + const CeedScalar *qref1d, const CeedScalar *qweight1d, CeedBasis basis) { CeedBasis_Magma *impl; - ierr = CeedCalloc(1,&impl); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(1, &impl)); Ceed ceed; - ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); // Check for supported parameters CeedInt ncomp = 0; - ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetNumComponents(basis, &ncomp)); Ceed_Magma *data; - ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr); + CeedCallBackend(CeedGetData(ceed, &data)); // Compile kernels char *magma_common_path; char *interp_path, *grad_path, *weight_path; char *basis_kernel_source; - ierr = CeedGetJitAbsolutePath(ceed, - "ceed/jit-source/magma/magma_common_device.h", - &magma_common_path); CeedChkBackend(ierr); + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma_common_device.h", &magma_common_path)); CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n"); - ierr = CeedLoadSourceToBuffer(ceed, magma_common_path, - &basis_kernel_source); - CeedChkBackend(ierr); - char *interp_name_base = "ceed/jit-source/magma/interp"; - CeedInt interp_name_len = strlen(interp_name_base) + 6; - char interp_name[interp_name_len]; - snprintf(interp_name, interp_name_len, "%s-%" CeedInt_FMT "d.h", - interp_name_base, dim); - ierr = 
CeedGetJitAbsolutePath(ceed, interp_name, &interp_path); - CeedChkBackend(ierr); - ierr = CeedLoadSourceToInitializedBuffer(ceed, interp_path, - &basis_kernel_source); - CeedChkBackend(ierr); - char *grad_name_base = "ceed/jit-source/magma/grad"; - CeedInt grad_name_len = strlen(grad_name_base) + 6; - char grad_name[grad_name_len]; - snprintf(grad_name, grad_name_len, "%s-%" CeedInt_FMT "d.h", grad_name_base, - dim); - ierr = CeedGetJitAbsolutePath(ceed, grad_name, &grad_path); - CeedChkBackend(ierr); - ierr = CeedLoadSourceToInitializedBuffer(ceed, grad_path, - &basis_kernel_source); - CeedChkBackend(ierr); - char *weight_name_base = "ceed/jit-source/magma/weight"; - CeedInt weight_name_len = strlen(weight_name_base) + 6; - char weight_name[weight_name_len]; - snprintf(weight_name, weight_name_len, "%s-%" CeedInt_FMT "d.h", - weight_name_base, dim); - ierr = CeedGetJitAbsolutePath(ceed, weight_name, &weight_path); - CeedChkBackend(ierr); - ierr = CeedLoadSourceToInitializedBuffer(ceed, weight_path, - &basis_kernel_source); - CeedChkBackend(ierr); - CeedDebug256(ceed, 2, - "----- Loading Basis Kernel Source Complete! 
-----\n"); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, magma_common_path, &basis_kernel_source)); + char *interp_name_base = "ceed/jit-source/magma/interp"; + CeedInt interp_name_len = strlen(interp_name_base) + 6; + char interp_name[interp_name_len]; + snprintf(interp_name, interp_name_len, "%s-%" CeedInt_FMT "d.h", interp_name_base, dim); + CeedCallBackend(CeedGetJitAbsolutePath(ceed, interp_name, &interp_path)); + CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, interp_path, &basis_kernel_source)); + char *grad_name_base = "ceed/jit-source/magma/grad"; + CeedInt grad_name_len = strlen(grad_name_base) + 6; + char grad_name[grad_name_len]; + snprintf(grad_name, grad_name_len, "%s-%" CeedInt_FMT "d.h", grad_name_base, dim); + CeedCallBackend(CeedGetJitAbsolutePath(ceed, grad_name, &grad_path)); + CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, grad_path, &basis_kernel_source)); + char *weight_name_base = "ceed/jit-source/magma/weight"; + CeedInt weight_name_len = strlen(weight_name_base) + 6; + char weight_name[weight_name_len]; + snprintf(weight_name, weight_name_len, "%s-%" CeedInt_FMT "d.h", weight_name_base, dim); + CeedCallBackend(CeedGetJitAbsolutePath(ceed, weight_name, &weight_path)); + CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, weight_path, &basis_kernel_source)); + CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete! 
-----\n"); // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip // data Ceed delegate; - ierr = CeedGetDelegate(ceed, &delegate); CeedChkBackend(ierr); - ierr = CeedCompileMagma(delegate, basis_kernel_source, &impl->module, 5, - "DIM", dim, - "NCOMP", ncomp, - "P", P1d, - "Q", Q1d, - "MAXPQ", CeedIntMax(P1d, Q1d)); - CeedChkBackend(ierr); + CeedCallBackend(CeedGetDelegate(ceed, &delegate)); + CeedCallBackend(CeedCompileMagma(delegate, basis_kernel_source, &impl->module, 5, "DIM", dim, "NCOMP", ncomp, "P", P1d, "Q", Q1d, "MAXPQ", + CeedIntMax(P1d, Q1d))); // Kernel setup switch (dim) { - case 1: - ierr = CeedGetKernelMagma(ceed, impl->module, "magma_interpn_1d_kernel", - &impl->magma_interp); - CeedChkBackend(ierr); - ierr = CeedGetKernelMagma(ceed, impl->module, "magma_interpt_1d_kernel", - &impl->magma_interp_tr); - CeedChkBackend(ierr); - ierr = CeedGetKernelMagma(ceed, impl->module, "magma_gradn_1d_kernel", - &impl->magma_grad); - CeedChkBackend(ierr); - ierr = CeedGetKernelMagma(ceed, impl->module, "magma_gradt_1d_kernel", - &impl->magma_grad_tr); - CeedChkBackend(ierr); - ierr = CeedGetKernelMagma(ceed, impl->module, "magma_weight_1d_kernel", - &impl->magma_weight); - CeedChkBackend(ierr); - break; - case 2: - ierr = CeedGetKernelMagma(ceed, impl->module, "magma_interpn_2d_kernel", - &impl->magma_interp); - CeedChkBackend(ierr); - ierr = CeedGetKernelMagma(ceed, impl->module, "magma_interpt_2d_kernel", - &impl->magma_interp_tr); - CeedChkBackend(ierr); - ierr = CeedGetKernelMagma(ceed, impl->module, "magma_gradn_2d_kernel", - &impl->magma_grad); - CeedChkBackend(ierr); - ierr = CeedGetKernelMagma(ceed, impl->module, "magma_gradt_2d_kernel", - &impl->magma_grad_tr); - CeedChkBackend(ierr); - ierr = CeedGetKernelMagma(ceed, impl->module, "magma_weight_2d_kernel", - &impl->magma_weight); - CeedChkBackend(ierr); - break; - case 3: - ierr = CeedGetKernelMagma(ceed, impl->module, "magma_interpn_3d_kernel", - &impl->magma_interp); - 
CeedChkBackend(ierr); - ierr = CeedGetKernelMagma(ceed, impl->module, "magma_interpt_3d_kernel", - &impl->magma_interp_tr); - CeedChkBackend(ierr); - ierr = CeedGetKernelMagma(ceed, impl->module, "magma_gradn_3d_kernel", - &impl->magma_grad); - CeedChkBackend(ierr); - ierr = CeedGetKernelMagma(ceed, impl->module, "magma_gradt_3d_kernel", - &impl->magma_grad_tr); - CeedChkBackend(ierr); - ierr = CeedGetKernelMagma(ceed, impl->module, "magma_weight_3d_kernel", - &impl->magma_weight); - CeedChkBackend(ierr); + case 1: + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_1d_kernel", &impl->magma_interp)); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_1d_kernel", &impl->magma_interp_tr)); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_1d_kernel", &impl->magma_grad)); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_1d_kernel", &impl->magma_grad_tr)); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_1d_kernel", &impl->magma_weight)); + break; + case 2: + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_2d_kernel", &impl->magma_interp)); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_2d_kernel", &impl->magma_interp_tr)); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_2d_kernel", &impl->magma_grad)); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradt_2d_kernel", &impl->magma_grad_tr)); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_2d_kernel", &impl->magma_weight)); + break; + case 3: + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpn_3d_kernel", &impl->magma_interp)); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_interpt_3d_kernel", &impl->magma_interp_tr)); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_gradn_3d_kernel", &impl->magma_grad)); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, 
"magma_gradt_3d_kernel", &impl->magma_grad_tr)); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_weight_3d_kernel", &impl->magma_weight)); } - ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", - CeedBasisApply_Magma); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", - CeedBasisDestroy_Magma); CeedChkBackend(ierr); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Magma)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Magma)); // Copy qref1d to the GPU - ierr = magma_malloc((void **)&impl->dqref1d, Q1d*sizeof(qref1d[0])); - CeedChkBackend(ierr); - magma_setvector(Q1d, sizeof(qref1d[0]), qref1d, 1, impl->dqref1d, 1, - data->queue); + CeedCallBackend(magma_malloc((void **)&impl->dqref1d, Q1d * sizeof(qref1d[0]))); + magma_setvector(Q1d, sizeof(qref1d[0]), qref1d, 1, impl->dqref1d, 1, data->queue); // Copy interp1d to the GPU - ierr = magma_malloc((void **)&impl->dinterp1d, Q1d*P1d*sizeof(interp1d[0])); - CeedChkBackend(ierr); - magma_setvector(Q1d*P1d, sizeof(interp1d[0]), interp1d, 1, impl->dinterp1d, 1, - data->queue); + CeedCallBackend(magma_malloc((void **)&impl->dinterp1d, Q1d * P1d * sizeof(interp1d[0]))); + magma_setvector(Q1d * P1d, sizeof(interp1d[0]), interp1d, 1, impl->dinterp1d, 1, data->queue); // Copy grad1d to the GPU - ierr = magma_malloc((void **)&impl->dgrad1d, Q1d*P1d*sizeof(grad1d[0])); - CeedChkBackend(ierr); - magma_setvector(Q1d*P1d, sizeof(grad1d[0]), grad1d, 1, impl->dgrad1d, 1, - data->queue); + CeedCallBackend(magma_malloc((void **)&impl->dgrad1d, Q1d * P1d * sizeof(grad1d[0]))); + magma_setvector(Q1d * P1d, sizeof(grad1d[0]), grad1d, 1, impl->dgrad1d, 1, data->queue); // Copy qweight1d to the GPU - ierr = magma_malloc((void **)&impl->dqweight1d, Q1d*sizeof(qweight1d[0])); - CeedChkBackend(ierr); - magma_setvector(Q1d, sizeof(qweight1d[0]), qweight1d, 1, impl->dqweight1d, 1, - data->queue); - - 
ierr = CeedBasisSetData(basis, impl); CeedChkBackend(ierr); - ierr = CeedFree(&magma_common_path); CeedChkBackend(ierr); - ierr = CeedFree(&interp_path); CeedChkBackend(ierr); - ierr = CeedFree(&grad_path); CeedChkBackend(ierr); - ierr = CeedFree(&weight_path); CeedChkBackend(ierr); - ierr = CeedFree(&basis_kernel_source); CeedChkBackend(ierr); + CeedCallBackend(magma_malloc((void **)&impl->dqweight1d, Q1d * sizeof(qweight1d[0]))); + magma_setvector(Q1d, sizeof(qweight1d[0]), qweight1d, 1, impl->dqweight1d, 1, data->queue); + + CeedCallBackend(CeedBasisSetData(basis, impl)); + CeedCallBackend(CeedBasisSetData(basis, impl)); + CeedCallBackend(CeedFree(&magma_common_path)); + CeedCallBackend(CeedFree(&interp_path)); + CeedCallBackend(CeedFree(&grad_path)); + CeedCallBackend(CeedFree(&weight_path)); + CeedCallBackend(CeedFree(&basis_kernel_source)); return CEED_ERROR_SUCCESS; } @@ -769,55 +622,41 @@ int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P1d, CeedInt Q1d, #ifdef __cplusplus CEED_INTERN "C" #endif -int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt ndof, - CeedInt nqpts, const CeedScalar *interp, - const CeedScalar *grad, const CeedScalar *qref, - const CeedScalar *qweight, CeedBasis basis) { - int ierr; + int + CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt ndof, CeedInt nqpts, const CeedScalar *interp, const CeedScalar *grad, + const CeedScalar *qref, const CeedScalar *qweight, CeedBasis basis) { CeedBasisNonTensor_Magma *impl; - Ceed ceed; - ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); + Ceed ceed; + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); Ceed_Magma *data; - ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr); + CeedCallBackend(CeedGetData(ceed, &data)); if (CEED_SCALAR_TYPE == CEED_SCALAR_FP64) { - ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", - CeedBasisApplyNonTensor_f64_Magma); - CeedChkBackend(ierr); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, 
"Apply", CeedBasisApplyNonTensor_f64_Magma)); } else { - ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", - CeedBasisApplyNonTensor_f32_Magma); - CeedChkBackend(ierr); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_f32_Magma)); } - ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", - CeedBasisDestroyNonTensor_Magma); CeedChkBackend(ierr); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma)); - ierr = CeedCalloc(1,&impl); CeedChkBackend(ierr); - ierr = CeedBasisSetData(basis, impl); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(1, &impl)); + CeedCallBackend(CeedBasisSetData(basis, impl)); // Copy qref to the GPU - ierr = magma_malloc((void **)&impl->dqref, nqpts*sizeof(qref[0])); - CeedChkBackend(ierr); + CeedCallBackend(magma_malloc((void **)&impl->dqref, nqpts * sizeof(qref[0]))); magma_setvector(nqpts, sizeof(qref[0]), qref, 1, impl->dqref, 1, data->queue); // Copy interp to the GPU - ierr = magma_malloc((void **)&impl->dinterp, nqpts*ndof*sizeof(interp[0])); - CeedChkBackend(ierr); - magma_setvector(nqpts*ndof, sizeof(interp[0]), interp, 1, impl->dinterp, 1, - data->queue); + CeedCallBackend(magma_malloc((void **)&impl->dinterp, nqpts * ndof * sizeof(interp[0]))); + magma_setvector(nqpts * ndof, sizeof(interp[0]), interp, 1, impl->dinterp, 1, data->queue); // Copy grad to the GPU - ierr = magma_malloc((void **)&impl->dgrad, nqpts*ndof*dim*sizeof(grad[0])); - CeedChkBackend(ierr); - magma_setvector(nqpts*ndof*dim, sizeof(grad[0]), grad, 1, impl->dgrad, 1, - data->queue); + CeedCallBackend(magma_malloc((void **)&impl->dgrad, nqpts * ndof * dim * sizeof(grad[0]))); + magma_setvector(nqpts * ndof * dim, sizeof(grad[0]), grad, 1, impl->dgrad, 1, data->queue); // Copy qweight to the GPU - ierr = magma_malloc((void **)&impl->dqweight, nqpts*sizeof(qweight[0])); - CeedChkBackend(ierr); - magma_setvector(nqpts, sizeof(qweight[0]), qweight, 
1, impl->dqweight, 1, - data->queue); + CeedCallBackend(magma_malloc((void **)&impl->dqweight, nqpts * sizeof(qweight[0]))); + magma_setvector(nqpts, sizeof(qweight[0]), qweight, 1, impl->dqweight, 1, data->queue); return CEED_ERROR_SUCCESS; } diff --git a/backends/magma/ceed-magma-det.c b/backends/magma/ceed-magma-det.c index 27d2a42e33..2578b354fb 100644 --- a/backends/magma/ceed-magma-det.c +++ b/backends/magma/ceed-magma-det.c @@ -5,30 +5,29 @@ // // This file is part of CEED: http://github.com/ceed -#include #include -#include +#include #include +#include + #include "ceed-magma.h" CEED_INTERN int CeedInit_Magma_Det(const char *resource, Ceed ceed) { - int ierr; - const int nrc = 18; // number of characters in resource - if (strncmp(resource, "/gpu/cuda/magma/det", nrc) - && strncmp(resource, "/gpu/hip/magma/det", nrc)) + const int nrc = 18; // number of characters in resource + if (strncmp(resource, "/gpu/cuda/magma/det", nrc) && strncmp(resource, "/gpu/hip/magma/det", nrc)) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Magma backend cannot use resource: %s", resource); - // LCOV_EXCL_STOP - ierr = CeedSetDeterministic(ceed, true); CeedChkBackend(ierr); + return CeedError(ceed, CEED_ERROR_BACKEND, "Magma backend cannot use resource: %s", resource); + // LCOV_EXCL_STOP + } + CeedCallBackend(CeedSetDeterministic(ceed, true)); Ceed_Magma *data; - ierr = CeedCalloc(sizeof(Ceed_Magma), &data); CeedChkBackend(ierr); - ierr = CeedSetData(ceed, data); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(sizeof(Ceed_Magma), &data)); + CeedCallBackend(CeedSetData(ceed, data)); // get/set device ID const char *device_spec = strstr(resource, ":device_id="); - const int deviceID = (device_spec) ? atoi(device_spec+11) : -1; + const int deviceID = (device_spec) ? 
atoi(device_spec + 11) : -1; int currentDeviceID; magma_getdevice(¤tDeviceID); @@ -42,30 +41,29 @@ CEED_INTERN int CeedInit_Magma_Det(const char *resource, Ceed ceed) { // Create reference CEED that implementation will be dispatched // through unless overridden Ceed ceedref; - #ifdef CEED_MAGMA_USE_HIP - CeedInit("/gpu/hip/magma", &ceedref); - #else - CeedInit("/gpu/cuda/magma", &ceedref); - #endif - ierr = CeedSetDelegate(ceed, ceedref); CeedChkBackend(ierr); +#ifdef CEED_MAGMA_USE_HIP + CeedCallBackend(CeedInit("/gpu/hip/magma", &ceedref)); +#else + CeedCallBackend(CeedInit("/gpu/cuda/magma", &ceedref)); +#endif + CeedCallBackend(CeedSetDelegate(ceed, ceedref)); // Create reference CEED for restriction Ceed restrictionceedref; - #ifdef CEED_MAGMA_USE_HIP +#ifdef CEED_MAGMA_USE_HIP CeedInit("/gpu/hip/ref", &restrictionceedref); - #else +#else CeedInit("/gpu/cuda/ref", &restrictionceedref); - #endif - ierr = CeedSetObjectDelegate(ceed, restrictionceedref, "ElemRestriction"); - CeedChkBackend(ierr); +#endif + CeedCallBackend(CeedSetObjectDelegate(ceed, restrictionceedref, "ElemRestriction")); return CEED_ERROR_SUCCESS; } CEED_INTERN int CeedRegister_Magma_Det(void) { - #ifdef CEED_MAGMA_USE_HIP +#ifdef CEED_MAGMA_USE_HIP return CeedRegister("/gpu/hip/magma/det", CeedInit_Magma_Det, 125); - #else +#else return CeedRegister("/gpu/cuda/magma/det", CeedInit_Magma_Det, 125); - #endif +#endif } diff --git a/backends/magma/ceed-magma-restriction.c b/backends/magma/ceed-magma-restriction.c index 5e8ab8ef7b..1aa75472ea 100644 --- a/backends/magma/ceed-magma-restriction.c +++ b/backends/magma/ceed-magma-restriction.c @@ -5,10 +5,11 @@ // // This file is part of CEED: http://github.com/ceed -#include #include +#include #include #include + #include "ceed-magma.h" #ifdef CEED_MAGMA_USE_HIP #include "../hip/ceed-hip-common.h" @@ -18,18 +19,15 @@ #include "../cuda/ceed-cuda-compile.h" #endif -static int CeedElemRestrictionApply_Magma(CeedElemRestriction r, - CeedTransposeMode 
tmode, CeedVector u, CeedVector v, CeedRequest *request) { - - int ierr; +static int CeedElemRestrictionApply_Magma(CeedElemRestriction r, CeedTransposeMode tmode, CeedVector u, CeedVector v, CeedRequest *request) { CeedElemRestriction_Magma *impl; - ierr = CeedElemRestrictionGetData(r, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); Ceed ceed; - ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); Ceed_Magma *data; - ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr); + CeedCallBackend(CeedGetData(ceed, &data)); CeedInt nelem; CeedElemRestrictionGetNumElements(r, &nelem); @@ -41,300 +39,240 @@ static int CeedElemRestrictionApply_Magma(CeedElemRestriction r, CeedElemRestrictionGetNumComponents(r, &ncomp); const CeedScalar *du; - CeedScalar *dv; + CeedScalar *dv; - ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &du); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &du)); if (tmode == CEED_TRANSPOSE) { // Sum into for transpose mode, e-vec to l-vec - ierr = CeedVectorGetArray(v, CEED_MEM_DEVICE, &dv); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &dv)); } else { // Overwrite for notranspose mode, l-vec to e-vec - ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &dv); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &dv)); } bool isStrided; - ierr = CeedElemRestrictionIsStrided(r, &isStrided); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionIsStrided(r, &isStrided)); if (isStrided) { // Strided Restriction - CeedInt strides[3]; + CeedInt strides[3]; CeedInt *dstrides; - ierr = magma_malloc( (void **)&dstrides, - 3 * sizeof(CeedInt)); CeedChkBackend(ierr); + CeedCallBackend(magma_malloc((void **)&dstrides, 3 * sizeof(CeedInt))); // Check to see if we should use magma Q-/E-Vector layout // (dimension = slowest index, then component, then 
element, // then node) bool backendstrides; - ierr = CeedElemRestrictionHasBackendStrides(r, &backendstrides); - CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionHasBackendStrides(r, &backendstrides)); if (backendstrides) { - - strides[0] = 1; // node stride - strides[1] = esize * nelem; //component stride - strides[2] = esize; //element stride + strides[0] = 1; // node stride + strides[1] = esize * nelem; // component stride + strides[2] = esize; // element stride magma_setvector(3, sizeof(CeedInt), strides, 1, dstrides, 1, data->queue); } else { - // Get the new strides - ierr = CeedElemRestrictionGetStrides(r, &strides); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetStrides(r, &strides)); magma_setvector(3, sizeof(CeedInt), strides, 1, dstrides, 1, data->queue); } - void *args[] = {&ncomp, &esize, &nelem, &dstrides, &du, &dv}; - CeedInt grid = nelem; + void *args[] = {&ncomp, &esize, &nelem, &dstrides, &du, &dv}; + CeedInt grid = nelem; CeedInt blocksize = 256; // Perform strided restriction with dstrides if (tmode == CEED_TRANSPOSE) { - ierr = CeedRunKernelMagma(ceed, impl->StridedTranspose, - grid, blocksize, args); CeedChkBackend(ierr); + CeedCallBackend(CeedRunKernelMagma(ceed, impl->StridedTranspose, grid, blocksize, args)); } else { - ierr = CeedRunKernelMagma(ceed, impl->StridedNoTranspose, - grid, blocksize, args); CeedChkBackend(ierr); + CeedCallBackend(CeedRunKernelMagma(ceed, impl->StridedNoTranspose, grid, blocksize, args)); } - ierr = magma_free(dstrides); CeedChkBackend(ierr); + CeedCallBackend(magma_free(dstrides)); - } else { // Offsets array provided, standard restriction + } else { // Offsets array provided, standard restriction CeedInt compstride; - ierr = CeedElemRestrictionGetCompStride(r, &compstride); CeedChkBackend(ierr); - void *args[] = {&ncomp, &compstride, &esize, &nelem, &impl->doffsets, &du, &dv}; - CeedInt grid = nelem; + CeedCallBackend(CeedElemRestrictionGetCompStride(r, &compstride)); + void *args[] = 
{&ncomp, &compstride, &esize, &nelem, &impl->doffsets, &du, &dv}; + CeedInt grid = nelem; CeedInt blocksize = 256; if (tmode == CEED_TRANSPOSE) { - ierr = CeedRunKernelMagma(ceed, impl->OffsetTranspose, - grid, blocksize, args); CeedChkBackend(ierr); + CeedCallBackend(CeedRunKernelMagma(ceed, impl->OffsetTranspose, grid, blocksize, args)); } else { - ierr = CeedRunKernelMagma(ceed, impl->OffsetNoTranspose, - grid, blocksize, args); CeedChkBackend(ierr); + CeedCallBackend(CeedRunKernelMagma(ceed, impl->OffsetNoTranspose, grid, blocksize, args)); } - } - ierr = CeedVectorRestoreArrayRead(u, &du); CeedChkBackend(ierr); - ierr = CeedVectorRestoreArray(v, &dv); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArrayRead(u, &du)); + CeedCallBackend(CeedVectorRestoreArray(v, &dv)); return CEED_ERROR_SUCCESS; } -int CeedElemRestrictionApplyBlock_Magma(CeedElemRestriction r, CeedInt block, - CeedTransposeMode tmode, CeedVector u, - CeedVector v, CeedRequest *request) { - int ierr; +int CeedElemRestrictionApplyBlock_Magma(CeedElemRestriction r, CeedInt block, CeedTransposeMode tmode, CeedVector u, CeedVector v, + CeedRequest *request) { Ceed ceed; - ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Backend does not implement blocked restrictions"); + return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement blocked restrictions"); // LCOV_EXCL_STOP } -static int CeedElemRestrictionGetOffsets_Magma(CeedElemRestriction rstr, - CeedMemType mtype, const CeedInt **offsets) { - int ierr; +static int CeedElemRestrictionGetOffsets_Magma(CeedElemRestriction rstr, CeedMemType mtype, const CeedInt **offsets) { CeedElemRestriction_Magma *impl; - ierr = CeedElemRestrictionGetData(rstr, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); switch (mtype) { - case CEED_MEM_HOST: - *offsets = 
impl->offsets; - break; - case CEED_MEM_DEVICE: - *offsets = impl->doffsets; - break; + case CEED_MEM_HOST: + *offsets = impl->offsets; + break; + case CEED_MEM_DEVICE: + *offsets = impl->doffsets; + break; } return CEED_ERROR_SUCCESS; } static int CeedElemRestrictionDestroy_Magma(CeedElemRestriction r) { - int ierr; CeedElemRestriction_Magma *impl; - ierr = CeedElemRestrictionGetData(r, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); // Free if we own the data if (impl->own_) { if (impl->own_ == OWNED_PINNED) { - ierr = magma_free_pinned(impl->offsets); CeedChkBackend(ierr); + CeedCallBackend(magma_free_pinned(impl->offsets)); } else if (impl->own_ == OWNED_UNPINNED) { free(impl->offsets); } - ierr = magma_free(impl->doffsets); CeedChkBackend(ierr); + CeedCallBackend(magma_free(impl->doffsets)); } else if (impl->down_) { - ierr = magma_free(impl->doffsets); CeedChkBackend(ierr); + CeedCallBackend(magma_free(impl->doffsets)); } Ceed ceed; - ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); - #ifdef CEED_MAGMA_USE_HIP - ierr = hipModuleUnload(impl->module); CeedChk_Hip(ceed, ierr); - #else - ierr = cuModuleUnload(impl->module); CeedChk_Cu(ceed, ierr); - #endif - ierr = CeedFree(&impl); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); +#ifdef CEED_MAGMA_USE_HIP + CeedCallHip(ceed, hipModuleUnload(impl->module)); +#else + CeedCallCuda(ceed, cuModuleUnload(impl->module)); +#endif + CeedCallBackend(CeedFree(&impl)); return CEED_ERROR_SUCCESS; } -int CeedElemRestrictionCreate_Magma(CeedMemType mtype, CeedCopyMode cmode, - const CeedInt *offsets, CeedElemRestriction r) { - int ierr; +int CeedElemRestrictionCreate_Magma(CeedMemType mtype, CeedCopyMode cmode, const CeedInt *offsets, CeedElemRestriction r) { Ceed ceed; - ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); Ceed_Magma *data; - ierr = CeedGetData(ceed, 
&data); CeedChkBackend(ierr); + CeedCallBackend(CeedGetData(ceed, &data)); CeedInt elemsize, nelem; - ierr = CeedElemRestrictionGetNumElements(r, &nelem); CeedChkBackend(ierr); - ierr = CeedElemRestrictionGetElementSize(r, &elemsize); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetNumElements(r, &nelem)); + CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elemsize)); CeedInt size = elemsize * nelem; CeedElemRestriction_Magma *impl; - ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(1, &impl)); impl->doffsets = NULL; impl->offsets = NULL; - impl->own_ = OWNED_NONE; - impl->down_= 0; + impl->own_ = OWNED_NONE; + impl->down_ = 0; if (mtype == CEED_MEM_HOST) { // memory is on the host; own_ = 0 switch (cmode) { - case CEED_COPY_VALUES: - impl->own_ = OWNED_PINNED; - - if (offsets != NULL) { - - ierr = magma_malloc( (void **)&impl->doffsets, - size * sizeof(CeedInt)); CeedChkBackend(ierr); - ierr = magma_malloc_pinned( (void **)&impl->offsets, - size * sizeof(CeedInt)); CeedChkBackend(ierr); - memcpy(impl->offsets, offsets, size * sizeof(CeedInt)); - - magma_setvector(size, sizeof(CeedInt), offsets, 1, impl->doffsets, 1, - data->queue); - } - break; - case CEED_OWN_POINTER: - impl->own_ = OWNED_UNPINNED; - - if (offsets != NULL) { - ierr = magma_malloc( (void **)&impl->doffsets, - size * sizeof(CeedInt)); CeedChkBackend(ierr); + case CEED_COPY_VALUES: + impl->own_ = OWNED_PINNED; + + if (offsets != NULL) { + CeedCallBackend(magma_malloc((void **)&impl->doffsets, size * sizeof(CeedInt))); + CeedCallBackend(magma_malloc_pinned((void **)&impl->offsets, size * sizeof(CeedInt))); + memcpy(impl->offsets, offsets, size * sizeof(CeedInt)); + + magma_setvector(size, sizeof(CeedInt), offsets, 1, impl->doffsets, 1, data->queue); + } + break; + case CEED_OWN_POINTER: + impl->own_ = OWNED_UNPINNED; + + if (offsets != NULL) { + CeedCallBackend(magma_malloc((void **)&impl->doffsets, size * sizeof(CeedInt))); + impl->offsets = (CeedInt 
*)offsets; + + magma_setvector(size, sizeof(CeedInt), offsets, 1, impl->doffsets, 1, data->queue); + } + break; + case CEED_USE_POINTER: + if (offsets != NULL) { + CeedCallBackend(magma_malloc((void **)&impl->doffsets, size * sizeof(CeedInt))); + magma_setvector(size, sizeof(CeedInt), offsets, 1, impl->doffsets, 1, data->queue); + } + impl->down_ = 1; impl->offsets = (CeedInt *)offsets; - - magma_setvector(size, sizeof(CeedInt), offsets, 1, impl->doffsets, 1, - data->queue); - } - break; - case CEED_USE_POINTER: - if (offsets != NULL) { - ierr = magma_malloc( (void **)&impl->doffsets, - size * sizeof(CeedInt)); CeedChkBackend(ierr); - magma_setvector(size, sizeof(CeedInt), offsets, 1, impl->doffsets, 1, - data->queue); - } - impl->down_ = 1; - impl->offsets = (CeedInt *)offsets; } } else if (mtype == CEED_MEM_DEVICE) { // memory is on the device; own = 0 switch (cmode) { - case CEED_COPY_VALUES: - ierr = magma_malloc( (void **)&impl->doffsets, - size * sizeof(CeedInt)); CeedChkBackend(ierr); - ierr = magma_malloc_pinned( (void **)&impl->offsets, - size * sizeof(CeedInt)); CeedChkBackend(ierr); - impl->own_ = OWNED_PINNED; - - if (offsets) - magma_getvector(size, sizeof(CeedInt), impl->doffsets, 1, (void *)offsets, 1, - data->queue); - break; - case CEED_OWN_POINTER: - impl->doffsets = (CeedInt *)offsets; - ierr = magma_malloc_pinned( (void **)&impl->offsets, - size * sizeof(CeedInt)); CeedChkBackend(ierr); - impl->own_ = OWNED_PINNED; - - break; - case CEED_USE_POINTER: - impl->doffsets = (CeedInt *)offsets; - impl->offsets = NULL; + case CEED_COPY_VALUES: + CeedCallBackend(magma_malloc((void **)&impl->doffsets, size * sizeof(CeedInt))); + CeedCallBackend(magma_malloc_pinned((void **)&impl->offsets, size * sizeof(CeedInt))); + impl->own_ = OWNED_PINNED; + + if (offsets) magma_getvector(size, sizeof(CeedInt), impl->doffsets, 1, (void *)offsets, 1, data->queue); + break; + case CEED_OWN_POINTER: + impl->doffsets = (CeedInt *)offsets; + 
CeedCallBackend(magma_malloc_pinned((void **)&impl->offsets, size * sizeof(CeedInt))); + impl->own_ = OWNED_PINNED; + + break; + case CEED_USE_POINTER: + impl->doffsets = (CeedInt *)offsets; + impl->offsets = NULL; } - } else - return CeedError(ceed, CEED_ERROR_BACKEND, - "Only MemType = HOST or DEVICE supported"); + } else return CeedError(ceed, CEED_ERROR_BACKEND, "Only MemType = HOST or DEVICE supported"); // Compile kernels char *magma_common_path; char *restriction_kernel_path, *restriction_kernel_source; - ierr = CeedGetJitAbsolutePath(ceed, - "ceed/jit-source/magma/magma_common_device.h", - &magma_common_path); CeedChkBackend(ierr); + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma_common_device.h", &magma_common_path)); CeedDebug256(ceed, 2, "----- Loading Restriction Kernel Source -----\n"); - ierr = CeedLoadSourceToBuffer(ceed, magma_common_path, - &restriction_kernel_source); - CeedChkBackend(ierr); - ierr = CeedGetJitAbsolutePath(ceed, - "ceed/jit-source/magma/elem_restriction.h", - &restriction_kernel_path); CeedChkBackend(ierr); - ierr = CeedLoadSourceToInitializedBuffer(ceed, restriction_kernel_path, - &restriction_kernel_source); - CeedChkBackend(ierr); - CeedDebug256(ceed, 2, - "----- Loading Restriction Kernel Source Complete! -----\n"); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, magma_common_path, &restriction_kernel_source)); + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/elem_restriction.h", &restriction_kernel_path)); + CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); + CeedDebug256(ceed, 2, "----- Loading Restriction Kernel Source Complete! 
-----\n"); // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip // data Ceed delegate; - ierr = CeedGetDelegate(ceed, &delegate); CeedChkBackend(ierr); - ierr = CeedCompileMagma(delegate, restriction_kernel_source, &impl->module, 0); - CeedChkBackend(ierr); + CeedCallBackend(CeedGetDelegate(ceed, &delegate)); + CeedCallBackend(CeedCompileMagma(delegate, restriction_kernel_source, &impl->module, 0)); // Kernel setup - ierr = CeedGetKernelMagma(ceed, impl->module, "magma_readDofsStrided_kernel", - &impl->StridedNoTranspose); - CeedChkBackend(ierr); - ierr = CeedGetKernelMagma(ceed, impl->module, "magma_readDofsOffset_kernel", - &impl->OffsetNoTranspose); - CeedChkBackend(ierr); - ierr = CeedGetKernelMagma(ceed, impl->module, "magma_writeDofsStrided_kernel", - &impl->StridedTranspose); - CeedChkBackend(ierr); - ierr = CeedGetKernelMagma(ceed, impl->module, "magma_writeDofsOffset_kernel", - &impl->OffsetTranspose); - CeedChkBackend(ierr); - - ierr = CeedElemRestrictionSetData(r, impl); CeedChkBackend(ierr); - CeedInt layout[3] = {1, elemsize*nelem, elemsize}; - ierr = CeedElemRestrictionSetELayout(r, layout); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "Apply", - CeedElemRestrictionApply_Magma); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "ApplyBlock", - CeedElemRestrictionApplyBlock_Magma); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "GetOffsets", - CeedElemRestrictionGetOffsets_Magma); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "Destroy", - CeedElemRestrictionDestroy_Magma); CeedChkBackend(ierr); - ierr = CeedFree(&restriction_kernel_path); CeedChkBackend(ierr); - ierr = CeedFree(&restriction_kernel_source); CeedChkBackend(ierr); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_readDofsStrided_kernel", &impl->StridedNoTranspose)); + 
CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_readDofsOffset_kernel", &impl->OffsetNoTranspose)); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_writeDofsStrided_kernel", &impl->StridedTranspose)); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module, "magma_writeDofsOffset_kernel", &impl->OffsetTranspose)); + + CeedCallBackend(CeedElemRestrictionSetData(r, impl)); + CeedInt layout[3] = {1, elemsize * nelem, elemsize}; + CeedCallBackend(CeedElemRestrictionSetELayout(r, layout)); + CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "Apply", CeedElemRestrictionApply_Magma)); + CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "ApplyBlock", CeedElemRestrictionApplyBlock_Magma)); + CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "GetOffsets", CeedElemRestrictionGetOffsets_Magma)); + CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "Destroy", CeedElemRestrictionDestroy_Magma)); + CeedCallBackend(CeedFree(&restriction_kernel_path)); + CeedCallBackend(CeedFree(&restriction_kernel_source)); return CEED_ERROR_SUCCESS; } -int CeedElemRestrictionCreateBlocked_Magma(const CeedMemType mtype, - const CeedCopyMode cmode, const CeedInt *offsets, - const CeedElemRestriction r) { - int ierr; +int CeedElemRestrictionCreateBlocked_Magma(const CeedMemType mtype, const CeedCopyMode cmode, const CeedInt *offsets, const CeedElemRestriction r) { Ceed ceed; - ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Backend does not implement blocked restrictions"); + return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement blocked restrictions"); // LCOV_EXCL_STOP return CEED_ERROR_SUCCESS; diff --git a/backends/magma/ceed-magma.c b/backends/magma/ceed-magma.c index 5625235ad6..447878ed7a 100644 --- a/backends/magma/ceed-magma.c +++ 
b/backends/magma/ceed-magma.c @@ -5,47 +5,47 @@ // // This file is part of CEED: http://github.com/ceed -#include +#include "ceed-magma.h" + #include -#include +#include #include -#include "ceed-magma.h" +#include static int CeedDestroy_Magma(Ceed ceed) { - int ierr; Ceed_Magma *data; - ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr); - magma_queue_destroy( data->queue ); - ierr = CeedFree(&data); CeedChkBackend(ierr); + CeedCallBackend(CeedGetData(ceed, &data)); + magma_queue_destroy(data->queue); + CeedCallBackend(CeedFree(&data)); return CEED_ERROR_SUCCESS; } static int CeedInit_Magma(const char *resource, Ceed ceed) { - int ierr; - const int nrc = 14; // number of characters in resource - if (strncmp(resource, "/gpu/cuda/magma", nrc) - && strncmp(resource, "/gpu/hip/magma", nrc)) + int ierr; + const int nrc = 14; // number of characters in resource + if (strncmp(resource, "/gpu/cuda/magma", nrc) && strncmp(resource, "/gpu/hip/magma", nrc)) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Magma backend cannot use resource: %s", resource); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_BACKEND, "Magma backend cannot use resource: %s", resource); + // LCOV_EXCL_STOP + } ierr = magma_init(); - if (ierr) + if (ierr) { // LCOV_EXCL_START return CeedError(ceed, CEED_ERROR_BACKEND, "error in magma_init(): %d\n", ierr); - // LCOV_EXCL_STOP + // LCOV_EXCL_STOP + } Ceed_Magma *data; - ierr = CeedCalloc(sizeof(Ceed_Magma), &data); CeedChkBackend(ierr); - ierr = CeedSetData(ceed, data); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(sizeof(Ceed_Magma), &data)); + CeedCallBackend(CeedSetData(ceed, data)); // kernel selection data->basis_kernel_mode = MAGMA_KERNEL_DIM_SPECIFIC; // get/set device ID const char *device_spec = strstr(resource, ":device_id="); - const int deviceID = (device_spec) ? atoi(device_spec+11) : -1; + const int deviceID = (device_spec) ? 
atoi(device_spec + 11) : -1; int currentDeviceID; magma_getdevice(¤tDeviceID); @@ -55,40 +55,34 @@ static int CeedInit_Magma(const char *resource, Ceed ceed) { } // create a queue that uses the null stream data->device = currentDeviceID; - #ifdef CEED_MAGMA_USE_HIP +#ifdef CEED_MAGMA_USE_HIP magma_queue_create_from_hip(data->device, NULL, NULL, NULL, &(data->queue)); - #else +#else magma_queue_create_from_cuda(data->device, NULL, NULL, NULL, &(data->queue)); - #endif +#endif // Create reference CEED that implementation will be dispatched // through unless overridden Ceed ceedref; - #ifdef CEED_MAGMA_USE_HIP - CeedInit("/gpu/hip/ref", &ceedref); - #else - CeedInit("/gpu/cuda/ref", &ceedref); - #endif - ierr = CeedSetDelegate(ceed, ceedref); CeedChkBackend(ierr); +#ifdef CEED_MAGMA_USE_HIP + CeedCallBackend(CeedInit("/gpu/hip/ref", &ceedref)); +#else + CeedCallBackend(CeedInit("/gpu/cuda/ref", &ceedref)); +#endif + CeedCallBackend(CeedSetDelegate(ceed, ceedref)); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "ElemRestrictionCreate", - CeedElemRestrictionCreate_Magma); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, - "ElemRestrictionCreateBlocked", - CeedElemRestrictionCreateBlocked_Magma); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateTensorH1", - CeedBasisCreateTensorH1_Magma); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateH1", - CeedBasisCreateH1_Magma); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", - CeedDestroy_Magma); CeedChkBackend(ierr); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "ElemRestrictionCreate", CeedElemRestrictionCreate_Magma)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "ElemRestrictionCreateBlocked", CeedElemRestrictionCreateBlocked_Magma)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateTensorH1", CeedBasisCreateTensorH1_Magma)); + 
CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateH1", CeedBasisCreateH1_Magma)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Magma)); return CEED_ERROR_SUCCESS; } CEED_INTERN int CeedRegister_Magma(void) { - #ifdef CEED_MAGMA_USE_HIP +#ifdef CEED_MAGMA_USE_HIP return CeedRegister("/gpu/hip/magma", CeedInit_Magma, 120); - #else +#else return CeedRegister("/gpu/cuda/magma", CeedInit_Magma, 120); - #endif +#endif } diff --git a/backends/magma/ceed-magma.h b/backends/magma/ceed-magma.h index fd9b6c24ab..7b0d6aab83 100644 --- a/backends/magma/ceed-magma.h +++ b/backends/magma/ceed-magma.h @@ -9,8 +9,8 @@ #ifndef _ceed_magma_h #define _ceed_magma_h -#include #include +#include #include #define MAGMA_MAXTHREADS_1D 128 @@ -24,7 +24,7 @@ #define MAGMA_BASIS_BOUNDS(x, maxt) (x * MAGMA_BASIS_NTCOL(x, maxt)) #ifdef CEED_MAGMA_USE_HIP -typedef hipModule_t CeedMagmaModule; +typedef hipModule_t CeedMagmaModule; typedef hipFunction_t CeedMagmaFunction; #define CeedCompileMagma CeedCompileHip #define CeedGetKernelMagma CeedGetKernelHip @@ -32,7 +32,7 @@ typedef hipFunction_t CeedMagmaFunction; #define CeedRunKernelDimMagma CeedRunKernelDimHip #define CeedRunKernelDimSharedMagma CeedRunKernelDimSharedHip #else -typedef CUmodule CeedMagmaModule; +typedef CUmodule CeedMagmaModule; typedef CUfunction CeedMagmaFunction; #define CeedCompileMagma CeedCompileCuda #define CeedGetKernelMagma CeedGetKernelCuda @@ -41,28 +41,25 @@ typedef CUfunction CeedMagmaFunction; #define CeedRunKernelDimSharedMagma CeedRunKernelDimSharedCuda #endif -typedef enum { - MAGMA_KERNEL_DIM_GENERIC=101, - MAGMA_KERNEL_DIM_SPECIFIC=102 -} magma_kernel_mode_t; +typedef enum { MAGMA_KERNEL_DIM_GENERIC = 101, MAGMA_KERNEL_DIM_SPECIFIC = 102 } magma_kernel_mode_t; typedef struct { magma_kernel_mode_t basis_kernel_mode; - magma_device_t device; - magma_queue_t queue; + magma_device_t device; + magma_queue_t queue; } Ceed_Magma; typedef struct { - 
CeedMagmaModule module; + CeedMagmaModule module; CeedMagmaFunction magma_interp; CeedMagmaFunction magma_interp_tr; CeedMagmaFunction magma_grad; CeedMagmaFunction magma_grad_tr; CeedMagmaFunction magma_weight; - CeedScalar *dqref1d; - CeedScalar *dinterp1d; - CeedScalar *dgrad1d; - CeedScalar *dqweight1d; + CeedScalar *dqref1d; + CeedScalar *dinterp1d; + CeedScalar *dgrad1d; + CeedScalar *dqweight1d; } CeedBasis_Magma; typedef struct { @@ -79,21 +76,21 @@ typedef enum { } OwnershipMode; typedef struct { - CeedMagmaModule module; + CeedMagmaModule module; CeedMagmaFunction StridedTranspose; CeedMagmaFunction StridedNoTranspose; CeedMagmaFunction OffsetTranspose; CeedMagmaFunction OffsetNoTranspose; - CeedInt *offsets; - CeedInt *doffsets; - OwnershipMode own_; - int down_; // cover a case where we own Device memory + CeedInt *offsets; + CeedInt *doffsets; + OwnershipMode own_; + int down_; // cover a case where we own Device memory } CeedElemRestriction_Magma; typedef struct { const CeedScalar **inputs; - CeedScalar **outputs; - bool setupdone; + CeedScalar **outputs; + bool setupdone; } CeedQFunction_Magma; #define USE_MAGMA_BATCH @@ -101,75 +98,33 @@ typedef struct { #define USE_MAGMA_BATCH3 #define USE_MAGMA_BATCH4 -CEED_INTERN void -magma_weight_nontensor( - magma_int_t grid, magma_int_t threads, magma_int_t nelem, - magma_int_t Q, - CeedScalar *dqweight, CeedScalar *dv, magma_queue_t queue); - -CEED_INTERN int -magma_dgemm_nontensor( - magma_trans_t transA, magma_trans_t transB, - magma_int_t m, magma_int_t n, magma_int_t k, - double alpha, const double *dA, magma_int_t ldda, - const double *dB, magma_int_t lddb, - double beta, double *dC, magma_int_t lddc, - magma_queue_t queue ); - -CEED_INTERN int -magma_sgemm_nontensor( - magma_trans_t transA, magma_trans_t transB, - magma_int_t m, magma_int_t n, magma_int_t k, - float alpha, const float *dA, magma_int_t ldda, - const float *dB, magma_int_t lddb, - float beta, float *dC, magma_int_t lddc, - magma_queue_t 
queue ); - -CEED_INTERN void -gemm_selector( - int gpu_arch, - char precision, char transA, - int m, int n, int k, - int *nbatch, int *use_magma ); - -CEED_INTERN magma_int_t -magma_isdevptr(const void *A); - -CEED_INTERN int -CeedBasisCreateTensorH1_Magma( - CeedInt dim, CeedInt P1d, - CeedInt Q1d, - const CeedScalar *interp1d, - const CeedScalar *grad1d, - const CeedScalar *qref1d, - const CeedScalar *qweight1d, - CeedBasis basis); - -CEED_INTERN int -CeedBasisCreateH1_Magma( - CeedElemTopology topo, CeedInt dim, - CeedInt ndof, CeedInt nqpts, - const CeedScalar *interp, - const CeedScalar *grad, - const CeedScalar *qref, - const CeedScalar *qweight, - CeedBasis basis); - -CEED_INTERN int -CeedElemRestrictionCreate_Magma( - CeedMemType mtype, - CeedCopyMode cmode, - const CeedInt *offsets, - CeedElemRestriction r); - -CEED_INTERN int -CeedElemRestrictionCreateBlocked_Magma( - const CeedMemType mtype, - const CeedCopyMode cmode, - const CeedInt *offsets, - const CeedElemRestriction res); - -CEED_INTERN int CeedOperatorCreate_Magma(CeedOperator op); +CEED_INTERN void magma_weight_nontensor(magma_int_t grid, magma_int_t threads, magma_int_t nelem, magma_int_t Q, CeedScalar *dqweight, CeedScalar *dv, + magma_queue_t queue); + +CEED_INTERN int magma_dgemm_nontensor(magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, double alpha, + const double *dA, magma_int_t ldda, const double *dB, magma_int_t lddb, double beta, double *dC, + magma_int_t lddc, magma_queue_t queue); + +CEED_INTERN int magma_sgemm_nontensor(magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, float alpha, + const float *dA, magma_int_t ldda, const float *dB, magma_int_t lddb, float beta, float *dC, magma_int_t lddc, + magma_queue_t queue); + +CEED_INTERN void gemm_selector(int gpu_arch, char precision, char transA, int m, int n, int k, int *nbatch, int *use_magma); + +CEED_INTERN magma_int_t magma_isdevptr(const void *A); + 
+CEED_INTERN int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P1d, CeedInt Q1d, const CeedScalar *interp1d, const CeedScalar *grad1d, + const CeedScalar *qref1d, const CeedScalar *qweight1d, CeedBasis basis); + +CEED_INTERN int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt ndof, CeedInt nqpts, const CeedScalar *interp, + const CeedScalar *grad, const CeedScalar *qref, const CeedScalar *qweight, CeedBasis basis); + +CEED_INTERN int CeedElemRestrictionCreate_Magma(CeedMemType mtype, CeedCopyMode cmode, const CeedInt *offsets, CeedElemRestriction r); + +CEED_INTERN int CeedElemRestrictionCreateBlocked_Magma(const CeedMemType mtype, const CeedCopyMode cmode, const CeedInt *offsets, + const CeedElemRestriction res); + +CEED_INTERN int CeedOperatorCreate_Magma(CeedOperator op); // comment the line below to use the default magma_is_devptr function #define magma_is_devptr magma_isdevptr diff --git a/backends/magma/gemm_selector.cpp b/backends/magma/gemm_selector.cpp index e778f11115..e58a4d66fa 100644 --- a/backends/magma/gemm_selector.cpp +++ b/backends/magma/gemm_selector.cpp @@ -1,114 +1,102 @@ -#include -#include -#include -#include -#include -#include"ceed-magma.h" +#include +#include -#include"./gemm_tuning/indices.h" +#include +#include +#include + +#include "./gemm_tuning/indices.h" +#include "ceed-magma.h" #ifdef CEED_MAGMA_USE_HIP -#include"./gemm_tuning/mi100.h" -#include"./gemm_tuning/mi250x.h" +#include "./gemm_tuning/mi100.h" +#include "./gemm_tuning/mi250x.h" #else -#include"./gemm_tuning/a100.h" -#include"./gemm_tuning/v100.h" +#include "./gemm_tuning/a100.h" +#include "./gemm_tuning/v100.h" #endif //////////////////////////////////////////////////////////////////////////////// -static void* gemm_selector_get_data(int gpu_arch, char precision, char transA) -{ - // a default - #ifdef CEED_MAGMA_USE_HIP - void* data = (void*)&sgemm_nn_mi250x; - #else - void* data = (void*)&sgemm_nn_a100; - #endif +static void 
*gemm_selector_get_data(int gpu_arch, char precision, char transA) { +// a default +#ifdef CEED_MAGMA_USE_HIP + void *data = (void *)&sgemm_nn_mi250x; +#else + void *data = (void *)&sgemm_nn_a100; +#endif - #ifdef CEED_MAGMA_USE_HIP - if( gpu_arch >= 910 ) { +#ifdef CEED_MAGMA_USE_HIP + if (gpu_arch >= 910) { // gfx90a or newer - data = ( precision == 's' ) ? - (( transA == 'n') ? (void*)&sgemm_nn_mi250x : (void*)&sgemm_tn_mi250x ): - (( transA == 'n') ? (void*)&dgemm_nn_mi250x : (void*)&dgemm_tn_mi250x ); - } - else{ + data = (precision == 's') ? ((transA == 'n') ? (void *)&sgemm_nn_mi250x : (void *)&sgemm_tn_mi250x) + : ((transA == 'n') ? (void *)&dgemm_nn_mi250x : (void *)&dgemm_tn_mi250x); + } else { // gfx908 or older - data = ( precision == 's' ) ? - (( transA == 'n') ? (void*)&sgemm_nn_mi100 : (void*)&sgemm_tn_mi100 ): - (( transA == 'n') ? (void*)&dgemm_nn_mi100 : (void*)&dgemm_tn_mi100 ); + data = (precision == 's') ? ((transA == 'n') ? (void *)&sgemm_nn_mi100 : (void *)&sgemm_tn_mi100) + : ((transA == 'n') ? (void *)&dgemm_nn_mi100 : (void *)&dgemm_tn_mi100); } - #else - if( gpu_arch >= 800 ) { +#else + if (gpu_arch >= 800) { // sm80 or newer - data = ( precision == 's' ) ? - (( transA == 'n') ? (void*)&sgemm_nn_a100 : (void*)&sgemm_tn_a100 ): - (( transA == 'n') ? (void*)&dgemm_nn_a100 : (void*)&dgemm_tn_a100 ); - } - else { + data = (precision == 's') ? ((transA == 'n') ? (void *)&sgemm_nn_a100 : (void *)&sgemm_tn_a100) + : ((transA == 'n') ? (void *)&dgemm_nn_a100 : (void *)&dgemm_tn_a100); + } else { // sm70 or older - data = ( precision == 's' ) ? - (( transA == 'n') ? (void*)&sgemm_nn_v100 : (void*)&sgemm_tn_v100 ): - (( transA == 'n') ? (void*)&dgemm_nn_v100 : (void*)&dgemm_tn_v100 ); + data = (precision == 's') ? ((transA == 'n') ? (void *)&sgemm_nn_v100 : (void *)&sgemm_tn_v100) + : ((transA == 'n') ? 
(void *)&dgemm_nn_v100 : (void *)&dgemm_tn_v100); } - #endif +#endif return data; } //////////////////////////////////////////////////////////////////////////////// -void gemm_selector( - int gpu_arch, - char precision, char transA, - int m, int n, int k, - int *nbatch, int *use_magma ) -{ - // defaults - *nbatch = n; - *use_magma = 0; - std::vector< std::array > *data = NULL; - data = (std::vector< std::array >*) - gemm_selector_get_data(gpu_arch, precision, transA); +void gemm_selector(int gpu_arch, char precision, char transA, int m, int n, int k, int *nbatch, int *use_magma) { + // defaults + *nbatch = n; + *use_magma = 0; + std::vector > *data = NULL; + data = (std::vector > *)gemm_selector_get_data(gpu_arch, precision, transA); - int ir = -1; - double norm = std::numeric_limits::max(); - for(size_t i = 0; i < data->size(); i++) { - int im = (*data)[i][M_INDEX]; - int in = (*data)[i][N_INDEX]; - int ik = (*data)[i][K_INDEX]; + int ir = -1; + double norm = std::numeric_limits::max(); + for (size_t i = 0; i < data->size(); i++) { + int im = (*data)[i][M_INDEX]; + int in = (*data)[i][N_INDEX]; + int ik = (*data)[i][K_INDEX]; - double mdiff = (double)(im-m); - double ndiff = (double)(in-n); - double kdiff = (double)(ik-k); + double mdiff = (double)(im - m); + double ndiff = (double)(in - n); + double kdiff = (double)(ik - k); - double nrm = sqrt( mdiff*mdiff + ndiff*ndiff + kdiff*kdiff ); + double nrm = sqrt(mdiff * mdiff + ndiff * ndiff + kdiff * kdiff); - if( nrm < norm ) { - norm = nrm; - ir = i; - } + if (nrm < norm) { + norm = nrm; + ir = i; + } - if( nrm == 0 ) { - // the input (m, n, k) exactly matches a record in `data` - // no need to search further - break; - } + if (nrm == 0) { + // the input (m, n, k) exactly matches a record in `data` + // no need to search further + break; } + } - if( ir >= 0 ) { - #if 0 + if (ir >= 0) { +#if 0 printf("matching record {%3d, %3d, %3d, %3d, %3d}\n", (*data)[ir][M_INDEX], (*data)[ir][N_INDEX], (*data)[ir][K_INDEX], 
(*data)[ir][N_BATCH_INDEX], (*data)[ir][USE_MAGMA_INDEX] ); - #endif - *use_magma = (*data)[ir][USE_MAGMA_INDEX]; +#endif + *use_magma = (*data)[ir][USE_MAGMA_INDEX]; - // if the closest match indicates that n = nbatch, - // that means calling the regular non-batch gemm. - // So nbatch is set to n instead of the 'nbatch' - // entry of the matching record - int n_ = (*data)[ir][N_INDEX]; - int nbatch_ = (*data)[ir][N_BATCH_INDEX]; - *nbatch = (n_ == nbatch_) ? n : nbatch_; - } + // if the closest match indicates that n = nbatch, + // that means calling the regular non-batch gemm. + // So nbatch is set to n instead of the 'nbatch' + // entry of the matching record + int n_ = (*data)[ir][N_INDEX]; + int nbatch_ = (*data)[ir][N_BATCH_INDEX]; + *nbatch = (n_ == nbatch_) ? n : nbatch_; + } } diff --git a/backends/magma/gemm_tuning/a100.h b/backends/magma/gemm_tuning/a100.h index bf9ab10172..7eec47bc97 100644 --- a/backends/magma/gemm_tuning/a100.h +++ b/backends/magma/gemm_tuning/a100.h @@ -2,2454 +2,2449 @@ // auto-generated from data on a100-cuda11.2 //////////////////////////////////////////////////////////////////////////////// -std::vector< std::array > sgemm_nn_a100 = -{ - {3 , 512 , 1 , 256 , 0 }, - {3 , 1024 , 1 , 32 , 0 }, - {3 , 1536 , 1 , 32 , 0 }, - {3 , 2048 , 1 , 2048 , 0 }, - {3 , 2560 , 1 , 256 , 0 }, - {3 , 3072 , 1 , 32 , 0 }, - {3 , 3584 , 1 , 32 , 0 }, - {3 , 4096 , 1 , 512 , 0 }, - {3 , 4608 , 1 , 512 , 0 }, - {3 , 5120 , 1 , 64 , 0 }, - {3 , 5632 , 1 , 32 , 0 }, - {3 , 6144 , 1 , 128 , 0 }, - {3 , 6656 , 1 , 32 , 0 }, - {3 , 7168 , 1 , 512 , 0 }, - {3 , 7680 , 1 , 32 , 0 }, - {3 , 8192 , 1 , 64 , 0 }, - {3 , 8704 , 1 , 512 , 0 }, - {3 , 9216 , 1 , 32 , 0 }, - {3 , 9728 , 1 , 32 , 0 }, - {3 , 10240 , 1 , 128 , 0 }, - {3 , 20480 , 1 , 128 , 0 }, - {3 , 30720 , 1 , 256 , 0 }, - {3 , 40960 , 1 , 512 , 0 }, - {3 , 51200 , 1 , 2048 , 0 }, - {3 , 61440 , 1 , 32 , 0 }, - {3 , 71680 , 1 , 2048 , 0 }, - {3 , 81920 , 1 , 2048 , 0 }, - {3 , 92160 , 1 , 32 , 0 
}, - {3 , 102400 , 1 , 32 , 0 }, - {3 , 204800 , 1 , 256 , 0 }, - {3 , 307200 , 1 , 4096 , 0 }, - {3 , 409600 , 1 , 8192 , 1 }, - {3 , 512000 , 1 , 2048 , 1 }, - {3 , 614400 , 1 , 8192 , 1 }, - {3 , 716800 , 1 , 1024 , 1 }, - {3 , 819200 , 1 , 8192 , 1 }, - {3 , 921600 , 1 , 1024 , 1 }, - {3 , 1024000, 1 , 4096 , 1 }, - {4 , 512 , 1 , 64 , 0 }, - {4 , 1024 , 1 , 32 , 0 }, - {4 , 1536 , 1 , 64 , 0 }, - {4 , 2048 , 1 , 32 , 0 }, - {4 , 2560 , 1 , 64 , 0 }, - {4 , 3072 , 1 , 128 , 0 }, - {4 , 3584 , 1 , 512 , 0 }, - {4 , 4096 , 1 , 64 , 0 }, - {4 , 4608 , 1 , 512 , 0 }, - {4 , 5120 , 1 , 1024 , 0 }, - {4 , 5632 , 1 , 256 , 0 }, - {4 , 6144 , 1 , 2048 , 0 }, - {4 , 6656 , 1 , 256 , 0 }, - {4 , 7168 , 1 , 512 , 0 }, - {4 , 7680 , 1 , 128 , 0 }, - {4 , 8192 , 1 , 32 , 0 }, - {4 , 8704 , 1 , 32 , 0 }, - {4 , 9216 , 1 , 32 , 0 }, - {4 , 9728 , 1 , 32 , 0 }, - {4 , 10240 , 1 , 2048 , 0 }, - {4 , 20480 , 1 , 256 , 0 }, - {4 , 30720 , 1 , 32 , 0 }, - {4 , 40960 , 1 , 32 , 0 }, - {4 , 51200 , 1 , 128 , 0 }, - {4 , 61440 , 1 , 64 , 0 }, - {4 , 71680 , 1 , 1024 , 0 }, - {4 , 81920 , 1 , 1024 , 0 }, - {4 , 92160 , 1 , 512 , 0 }, - {4 , 102400 , 1 , 1024 , 0 }, - {4 , 204800 , 1 , 64 , 0 }, - {4 , 307200 , 1 , 1024 , 1 }, - {4 , 409600 , 1 , 16384 , 1 }, - {4 , 512000 , 1 , 2048 , 1 }, - {4 , 614400 , 1 , 4096 , 1 }, - {4 , 716800 , 1 , 4096 , 1 }, - {4 , 819200 , 1 , 32768 , 1 }, - {4 , 921600 , 1 , 2048 , 1 }, - {4 , 1024000, 1 , 8192 , 1 }, - {6 , 512 , 3 , 512 , 0 }, - {6 , 1024 , 3 , 1024 , 0 }, - {6 , 1536 , 3 , 256 , 1 }, - {6 , 2048 , 3 , 1024 , 1 }, - {6 , 2560 , 3 , 32 , 1 }, - {6 , 3072 , 3 , 512 , 1 }, - {6 , 3584 , 3 , 256 , 1 }, - {6 , 4096 , 3 , 256 , 1 }, - {6 , 4608 , 3 , 32 , 1 }, - {6 , 5120 , 3 , 64 , 1 }, - {6 , 5632 , 3 , 512 , 1 }, - {6 , 6144 , 3 , 1024 , 1 }, - {6 , 6656 , 3 , 64 , 1 }, - {6 , 7168 , 3 , 512 , 1 }, - {6 , 7680 , 3 , 32 , 1 }, - {6 , 8192 , 3 , 8192 , 0 }, - {6 , 8704 , 3 , 32 , 1 }, - {6 , 9216 , 3 , 512 , 1 }, - {6 , 9728 , 3 , 128 , 1 }, 
- {6 , 10240 , 3 , 1024 , 1 }, - {6 , 20480 , 3 , 2048 , 1 }, - {6 , 30720 , 3 , 1024 , 1 }, - {6 , 40960 , 3 , 128 , 1 }, - {6 , 51200 , 3 , 128 , 1 }, - {6 , 61440 , 3 , 512 , 1 }, - {6 , 71680 , 3 , 256 , 1 }, - {6 , 81920 , 3 , 1024 , 1 }, - {6 , 92160 , 3 , 1024 , 1 }, - {6 , 102400 , 3 , 1024 , 1 }, - {6 , 204800 , 3 , 4096 , 1 }, - {6 , 307200 , 3 , 512 , 1 }, - {6 , 409600 , 3 , 16384 , 1 }, - {6 , 512000 , 3 , 1024 , 1 }, - {6 , 614400 , 3 , 8192 , 1 }, - {6 , 716800 , 3 , 4096 , 1 }, - {6 , 819200 , 3 , 8192 , 1 }, - {6 , 921600 , 3 , 4096 , 1 }, - {6 , 1024000, 3 , 8192 , 1 }, - {10 , 512 , 4 , 512 , 0 }, - {10 , 1024 , 4 , 1024 , 0 }, - {10 , 1536 , 4 , 32 , 1 }, - {10 , 2048 , 4 , 512 , 1 }, - {10 , 2560 , 4 , 32 , 1 }, - {10 , 3072 , 4 , 64 , 1 }, - {10 , 3584 , 4 , 512 , 1 }, - {10 , 4096 , 4 , 4096 , 0 }, - {10 , 4608 , 4 , 512 , 1 }, - {10 , 5120 , 4 , 512 , 1 }, - {10 , 5632 , 4 , 512 , 1 }, - {10 , 6144 , 4 , 256 , 1 }, - {10 , 6656 , 4 , 32 , 1 }, - {10 , 7168 , 4 , 32 , 1 }, - {10 , 7680 , 4 , 256 , 1 }, - {10 , 8192 , 4 , 8192 , 0 }, - {10 , 8704 , 4 , 256 , 1 }, - {10 , 9216 , 4 , 128 , 1 }, - {10 , 9728 , 4 , 128 , 1 }, - {10 , 10240 , 4 , 32 , 1 }, - {10 , 20480 , 4 , 256 , 1 }, - {10 , 30720 , 4 , 2048 , 1 }, - {10 , 40960 , 4 , 256 , 1 }, - {10 , 51200 , 4 , 2048 , 1 }, - {10 , 61440 , 4 , 4096 , 1 }, - {10 , 71680 , 4 , 2048 , 1 }, - {10 , 81920 , 4 , 512 , 1 }, - {10 , 92160 , 4 , 256 , 1 }, - {10 , 102400 , 4 , 4096 , 1 }, - {10 , 204800 , 4 , 1024 , 1 }, - {10 , 307200 , 4 , 1024 , 1 }, - {10 , 409600 , 4 , 16384 , 1 }, - {10 , 512000 , 4 , 4096 , 1 }, - {10 , 614400 , 4 , 8192 , 1 }, - {10 , 716800 , 4 , 4096 , 1 }, - {10 , 819200 , 4 , 32768 , 1 }, - {10 , 921600 , 4 , 4096 , 1 }, - {10 , 1024000, 4 , 8192 , 1 }, - {10 , 512 , 6 , 512 , 0 }, - {10 , 1024 , 6 , 1024 , 0 }, - {10 , 1536 , 6 , 512 , 1 }, - {10 , 2048 , 6 , 2048 , 0 }, - {10 , 2560 , 6 , 512 , 1 }, - {10 , 3072 , 6 , 32 , 1 }, - {10 , 3584 , 6 , 256 , 1 }, - {10 , 4096 
, 6 , 4096 , 0 }, - {10 , 4608 , 6 , 512 , 1 }, - {10 , 5120 , 6 , 256 , 1 }, - {10 , 5632 , 6 , 512 , 1 }, - {10 , 6144 , 6 , 256 , 1 }, - {10 , 6656 , 6 , 512 , 1 }, - {10 , 7168 , 6 , 512 , 1 }, - {10 , 7680 , 6 , 512 , 1 }, - {10 , 8192 , 6 , 8192 , 0 }, - {10 , 8704 , 6 , 256 , 1 }, - {10 , 9216 , 6 , 256 , 1 }, - {10 , 9728 , 6 , 256 , 1 }, - {10 , 10240 , 6 , 512 , 1 }, - {10 , 20480 , 6 , 64 , 1 }, - {10 , 30720 , 6 , 1024 , 1 }, - {10 , 40960 , 6 , 4096 , 1 }, - {10 , 51200 , 6 , 512 , 1 }, - {10 , 61440 , 6 , 4096 , 1 }, - {10 , 71680 , 6 , 256 , 1 }, - {10 , 81920 , 6 , 16384 , 1 }, - {10 , 92160 , 6 , 2048 , 1 }, - {10 , 102400 , 6 , 2048 , 1 }, - {10 , 204800 , 6 , 2048 , 1 }, - {10 , 307200 , 6 , 2048 , 1 }, - {10 , 409600 , 6 , 2048 , 1 }, - {10 , 512000 , 6 , 4096 , 1 }, - {10 , 614400 , 6 , 512 , 1 }, - {10 , 716800 , 6 , 4096 , 1 }, - {10 , 819200 , 6 , 32768 , 1 }, - {10 , 921600 , 6 , 1024 , 1 }, - {10 , 1024000, 6 , 8192 , 1 }, - {15 , 512 , 12 , 512 , 0 }, - {15 , 1024 , 12 , 512 , 1 }, - {15 , 1536 , 12 , 32 , 1 }, - {15 , 2048 , 12 , 2048 , 0 }, - {15 , 2560 , 12 , 512 , 1 }, - {15 , 3072 , 12 , 256 , 1 }, - {15 , 3584 , 12 , 512 , 1 }, - {15 , 4096 , 12 , 128 , 1 }, - {15 , 4608 , 12 , 512 , 1 }, - {15 , 5120 , 12 , 256 , 1 }, - {15 , 5632 , 12 , 32 , 1 }, - {15 , 6144 , 12 , 64 , 1 }, - {15 , 6656 , 12 , 32 , 1 }, - {15 , 7168 , 12 , 512 , 1 }, - {15 , 7680 , 12 , 32 , 1 }, - {15 , 8192 , 12 , 128 , 1 }, - {15 , 8704 , 12 , 256 , 1 }, - {15 , 9216 , 12 , 512 , 1 }, - {15 , 9728 , 12 , 64 , 1 }, - {15 , 10240 , 12 , 512 , 1 }, - {15 , 20480 , 12 , 4096 , 1 }, - {15 , 30720 , 12 , 128 , 1 }, - {15 , 40960 , 12 , 8192 , 1 }, - {15 , 51200 , 12 , 2048 , 1 }, - {15 , 61440 , 12 , 1024 , 1 }, - {15 , 71680 , 12 , 2048 , 1 }, - {15 , 81920 , 12 , 16384 , 1 }, - {15 , 92160 , 12 , 2048 , 1 }, - {15 , 102400 , 12 , 1024 , 1 }, - {15 , 204800 , 12 , 8192 , 1 }, - {15 , 307200 , 12 , 1024 , 1 }, - {15 , 409600 , 12 , 16384 , 1 }, - {15 , 512000 , 12 
, 1024 , 1 }, - {15 , 614400 , 12 , 8192 , 1 }, - {15 , 716800 , 12 , 4096 , 1 }, - {15 , 819200 , 12 , 16384 , 1 }, - {15 , 921600 , 12 , 4096 , 1 }, - {15 , 1024000, 12 , 8192 , 1 }, - {20 , 512 , 11 , 64 , 1 }, - {20 , 1024 , 11 , 128 , 1 }, - {20 , 1536 , 11 , 128 , 1 }, - {20 , 2048 , 11 , 2048 , 0 }, - {20 , 2560 , 11 , 32 , 1 }, - {20 , 3072 , 11 , 512 , 1 }, - {20 , 3584 , 11 , 64 , 1 }, - {20 , 4096 , 11 , 4096 , 0 }, - {20 , 4608 , 11 , 128 , 1 }, - {20 , 5120 , 11 , 1024 , 1 }, - {20 , 5632 , 11 , 32 , 1 }, - {20 , 6144 , 11 , 256 , 1 }, - {20 , 6656 , 11 , 512 , 1 }, - {20 , 7168 , 11 , 64 , 1 }, - {20 , 7680 , 11 , 32 , 1 }, - {20 , 8192 , 11 , 8192 , 0 }, - {20 , 8704 , 11 , 512 , 1 }, - {20 , 9216 , 11 , 32 , 1 }, - {20 , 9728 , 11 , 128 , 1 }, - {20 , 10240 , 11 , 1024 , 1 }, - {20 , 20480 , 11 , 512 , 1 }, - {20 , 30720 , 11 , 128 , 1 }, - {20 , 40960 , 11 , 4096 , 1 }, - {20 , 51200 , 11 , 512 , 1 }, - {20 , 61440 , 11 , 4096 , 1 }, - {20 , 71680 , 11 , 2048 , 1 }, - {20 , 81920 , 11 , 8192 , 1 }, - {20 , 92160 , 11 , 2048 , 1 }, - {20 , 102400 , 11 , 512 , 1 }, - {20 , 204800 , 11 , 256 , 1 }, - {20 , 307200 , 11 , 4096 , 1 }, - {20 , 409600 , 11 , 16384 , 1 }, - {20 , 512000 , 11 , 512000 , 0 }, - {20 , 614400 , 11 , 614400 , 0 }, - {20 , 716800 , 11 , 716800 , 0 }, - {20 , 819200 , 11 , 819200 , 0 }, - {20 , 921600 , 11 , 921600 , 0 }, - {20 , 1024000, 11 , 1024000, 0 }, - {21 , 512 , 16 , 512 , 0 }, - {21 , 1024 , 16 , 1024 , 0 }, - {21 , 1536 , 16 , 256 , 1 }, - {21 , 2048 , 16 , 2048 , 0 }, - {21 , 2560 , 16 , 256 , 1 }, - {21 , 3072 , 16 , 64 , 1 }, - {21 , 3584 , 16 , 256 , 1 }, - {21 , 4096 , 16 , 4096 , 0 }, - {21 , 4608 , 16 , 64 , 1 }, - {21 , 5120 , 16 , 512 , 1 }, - {21 , 5632 , 16 , 256 , 1 }, - {21 , 6144 , 16 , 32 , 1 }, - {21 , 6656 , 16 , 512 , 1 }, - {21 , 7168 , 16 , 1024 , 1 }, - {21 , 7680 , 16 , 128 , 1 }, - {21 , 8192 , 16 , 32 , 1 }, - {21 , 8704 , 16 , 128 , 1 }, - {21 , 9216 , 16 , 512 , 1 }, - {21 , 9728 , 16 , 32 , 1 
}, - {21 , 10240 , 16 , 256 , 1 }, - {21 , 20480 , 16 , 256 , 1 }, - {21 , 30720 , 16 , 1024 , 1 }, - {21 , 40960 , 16 , 8192 , 1 }, - {21 , 51200 , 16 , 64 , 1 }, - {21 , 61440 , 16 , 4096 , 1 }, - {21 , 71680 , 16 , 2048 , 1 }, - {21 , 81920 , 16 , 2048 , 1 }, - {21 , 92160 , 16 , 512 , 1 }, - {21 , 102400 , 16 , 4096 , 1 }, - {21 , 204800 , 16 , 8192 , 1 }, - {21 , 307200 , 16 , 4096 , 1 }, - {21 , 409600 , 16 , 409600 , 0 }, - {21 , 512000 , 16 , 512000 , 0 }, - {21 , 614400 , 16 , 614400 , 0 }, - {21 , 716800 , 16 , 716800 , 0 }, - {21 , 819200 , 16 , 819200 , 0 }, - {21 , 921600 , 16 , 921600 , 0 }, - {21 , 1024000, 16 , 1024000, 0 }, - {28 , 512 , 25 , 64 , 1 }, - {28 , 1024 , 25 , 1024 , 0 }, - {28 , 1536 , 25 , 256 , 1 }, - {28 , 2048 , 25 , 2048 , 0 }, - {28 , 2560 , 25 , 64 , 1 }, - {28 , 3072 , 25 , 128 , 1 }, - {28 , 3584 , 25 , 32 , 1 }, - {28 , 4096 , 25 , 256 , 1 }, - {28 , 4608 , 25 , 256 , 1 }, - {28 , 5120 , 25 , 64 , 1 }, - {28 , 5632 , 25 , 128 , 1 }, - {28 , 6144 , 25 , 1024 , 1 }, - {28 , 6656 , 25 , 256 , 1 }, - {28 , 7168 , 25 , 64 , 1 }, - {28 , 7680 , 25 , 256 , 1 }, - {28 , 8192 , 25 , 8192 , 0 }, - {28 , 8704 , 25 , 256 , 1 }, - {28 , 9216 , 25 , 256 , 1 }, - {28 , 9728 , 25 , 128 , 1 }, - {28 , 10240 , 25 , 1024 , 1 }, - {28 , 20480 , 25 , 1024 , 1 }, - {28 , 30720 , 25 , 1024 , 1 }, - {28 , 40960 , 25 , 2048 , 1 }, - {28 , 51200 , 25 , 1024 , 1 }, - {28 , 61440 , 25 , 4096 , 1 }, - {28 , 71680 , 25 , 512 , 1 }, - {28 , 81920 , 25 , 512 , 1 }, - {28 , 92160 , 25 , 2048 , 1 }, - {28 , 102400 , 25 , 4096 , 1 }, - {28 , 204800 , 25 , 4096 , 1 }, - {28 , 307200 , 25 , 307200 , 0 }, - {28 , 409600 , 25 , 409600 , 0 }, - {28 , 512000 , 25 , 512000 , 0 }, - {28 , 614400 , 25 , 614400 , 0 }, - {28 , 716800 , 25 , 716800 , 0 }, - {28 , 819200 , 25 , 819200 , 0 }, - {28 , 921600 , 25 , 921600 , 0 }, - {28 , 1024000, 25 , 1024000, 0 }, - {35 , 512 , 24 , 512 , 0 }, - {35 , 1024 , 24 , 1024 , 0 }, - {35 , 1536 , 24 , 256 , 1 }, - {35 , 2048 , 24 , 
2048 , 0 }, - {35 , 2560 , 24 , 32 , 1 }, - {35 , 3072 , 24 , 512 , 1 }, - {35 , 3584 , 24 , 32 , 1 }, - {35 , 4096 , 24 , 4096 , 0 }, - {35 , 4608 , 24 , 256 , 1 }, - {35 , 5120 , 24 , 32 , 1 }, - {35 , 5632 , 24 , 128 , 1 }, - {35 , 6144 , 24 , 512 , 1 }, - {35 , 6656 , 24 , 64 , 1 }, - {35 , 7168 , 24 , 1024 , 1 }, - {35 , 7680 , 24 , 512 , 1 }, - {35 , 8192 , 24 , 512 , 1 }, - {35 , 8704 , 24 , 32 , 1 }, - {35 , 9216 , 24 , 64 , 1 }, - {35 , 9728 , 24 , 512 , 1 }, - {35 , 10240 , 24 , 1024 , 1 }, - {35 , 20480 , 24 , 32 , 1 }, - {35 , 30720 , 24 , 512 , 1 }, - {35 , 40960 , 24 , 2048 , 1 }, - {35 , 51200 , 24 , 1024 , 1 }, - {35 , 61440 , 24 , 64 , 1 }, - {35 , 71680 , 24 , 1024 , 1 }, - {35 , 81920 , 24 , 256 , 1 }, - {35 , 92160 , 24 , 512 , 1 }, - {35 , 102400 , 24 , 512 , 1 }, - {35 , 204800 , 24 , 4096 , 1 }, - {35 , 307200 , 24 , 307200 , 0 }, - {35 , 409600 , 24 , 409600 , 0 }, - {35 , 512000 , 24 , 512000 , 0 }, - {35 , 614400 , 24 , 614400 , 0 }, - {35 , 716800 , 24 , 716800 , 0 }, - {35 , 819200 , 24 , 819200 , 0 }, - {35 , 921600 , 24 , 921600 , 0 }, - {35 , 1024000, 24 , 1024000, 0 }, - {36 , 512 , 33 , 512 , 0 }, - {36 , 1024 , 33 , 1024 , 0 }, - {36 , 1536 , 33 , 128 , 1 }, - {36 , 2048 , 33 , 1024 , 1 }, - {36 , 2560 , 33 , 512 , 1 }, - {36 , 3072 , 33 , 256 , 1 }, - {36 , 3584 , 33 , 32 , 1 }, - {36 , 4096 , 33 , 4096 , 0 }, - {36 , 4608 , 33 , 64 , 1 }, - {36 , 5120 , 33 , 128 , 1 }, - {36 , 5632 , 33 , 128 , 1 }, - {36 , 6144 , 33 , 256 , 1 }, - {36 , 6656 , 33 , 128 , 1 }, - {36 , 7168 , 33 , 256 , 1 }, - {36 , 7680 , 33 , 256 , 1 }, - {36 , 8192 , 33 , 8192 , 1 }, - {36 , 8704 , 33 , 32 , 1 }, - {36 , 9216 , 33 , 256 , 1 }, - {36 , 9728 , 33 , 32 , 1 }, - {36 , 10240 , 33 , 512 , 1 }, - {36 , 20480 , 33 , 2048 , 1 }, - {36 , 30720 , 33 , 2048 , 1 }, - {36 , 40960 , 33 , 8192 , 1 }, - {36 , 51200 , 33 , 256 , 1 }, - {36 , 61440 , 33 , 4096 , 1 }, - {36 , 71680 , 33 , 512 , 1 }, - {36 , 81920 , 33 , 16384 , 1 }, - {36 , 92160 , 33 , 512 , 1 }, 
- {36 , 102400 , 33 , 2048 , 1 }, - {36 , 204800 , 33 , 8192 , 1 }, - {36 , 307200 , 33 , 512 , 1 }, - {36 , 409600 , 33 , 409600 , 0 }, - {36 , 512000 , 33 , 512000 , 0 }, - {36 , 614400 , 33 , 614400 , 0 }, - {36 , 716800 , 33 , 716800 , 0 }, - {36 , 819200 , 33 , 819200 , 0 }, - {36 , 921600 , 33 , 921600 , 0 }, - {36 , 1024000, 33 , 1024000, 0 }, - {45 , 512 , 42 , 512 , 0 }, - {45 , 1024 , 42 , 1024 , 0 }, - {45 , 1536 , 42 , 64 , 1 }, - {45 , 2048 , 42 , 2048 , 0 }, - {45 , 2560 , 42 , 32 , 1 }, - {45 , 3072 , 42 , 1024 , 1 }, - {45 , 3584 , 42 , 128 , 1 }, - {45 , 4096 , 42 , 2048 , 1 }, - {45 , 4608 , 42 , 64 , 1 }, - {45 , 5120 , 42 , 64 , 1 }, - {45 , 5632 , 42 , 32 , 1 }, - {45 , 6144 , 42 , 64 , 1 }, - {45 , 6656 , 42 , 512 , 1 }, - {45 , 7168 , 42 , 1024 , 1 }, - {45 , 7680 , 42 , 256 , 1 }, - {45 , 8192 , 42 , 4096 , 1 }, - {45 , 8704 , 42 , 64 , 1 }, - {45 , 9216 , 42 , 512 , 1 }, - {45 , 9728 , 42 , 256 , 1 }, - {45 , 10240 , 42 , 1024 , 1 }, - {45 , 20480 , 42 , 4096 , 1 }, - {45 , 30720 , 42 , 2048 , 1 }, - {45 , 40960 , 42 , 512 , 1 }, - {45 , 51200 , 42 , 2048 , 1 }, - {45 , 61440 , 42 , 4096 , 1 }, - {45 , 71680 , 42 , 1024 , 1 }, - {45 , 81920 , 42 , 16384 , 1 }, - {45 , 92160 , 42 , 1024 , 1 }, - {45 , 102400 , 42 , 1024 , 1 }, - {45 , 204800 , 42 , 8192 , 1 }, - {45 , 307200 , 42 , 307200 , 0 }, - {45 , 409600 , 42 , 409600 , 0 }, - {45 , 512000 , 42 , 512000 , 0 }, - {45 , 614400 , 42 , 614400 , 0 }, - {45 , 716800 , 42 , 716800 , 0 }, - {45 , 819200 , 42 , 819200 , 0 }, - {45 , 921600 , 42 , 921600 , 0 }, - {45 , 1024000, 42 , 1024000, 0 }, - {56 , 512 , 43 , 512 , 0 }, - {56 , 1024 , 43 , 64 , 1 }, - {56 , 1536 , 43 , 128 , 1 }, - {56 , 2048 , 43 , 2048 , 0 }, - {56 , 2560 , 43 , 32 , 1 }, - {56 , 3072 , 43 , 1024 , 1 }, - {56 , 3584 , 43 , 32 , 1 }, - {56 , 4096 , 43 , 512 , 1 }, - {56 , 4608 , 43 , 64 , 1 }, - {56 , 5120 , 43 , 64 , 1 }, - {56 , 5632 , 43 , 32 , 1 }, - {56 , 6144 , 43 , 256 , 1 }, - {56 , 6656 , 43 , 64 , 1 }, - {56 , 
7168 , 43 , 64 , 1 }, - {56 , 7680 , 43 , 32 , 1 }, - {56 , 8192 , 43 , 64 , 1 }, - {56 , 8704 , 43 , 512 , 1 }, - {56 , 9216 , 43 , 1024 , 1 }, - {56 , 9728 , 43 , 64 , 1 }, - {56 , 10240 , 43 , 1024 , 1 }, - {56 , 20480 , 43 , 1024 , 1 }, - {56 , 30720 , 43 , 2048 , 1 }, - {56 , 40960 , 43 , 512 , 1 }, - {56 , 51200 , 43 , 256 , 1 }, - {56 , 61440 , 43 , 4096 , 1 }, - {56 , 71680 , 43 , 2048 , 1 }, - {56 , 81920 , 43 , 16384 , 1 }, - {56 , 92160 , 43 , 256 , 1 }, - {56 , 102400 , 43 , 512 , 1 }, - {56 , 204800 , 43 , 204800 , 0 }, - {56 , 307200 , 43 , 307200 , 0 }, - {56 , 409600 , 43 , 409600 , 0 }, - {56 , 512000 , 43 , 512000 , 0 }, - {56 , 614400 , 43 , 614400 , 0 }, - {56 , 716800 , 43 , 716800 , 0 }, - {56 , 819200 , 43 , 819200 , 0 }, - {56 , 921600 , 43 , 921600 , 0 }, - {56 , 1024000, 43 , 1024000, 0 }, - {84 , 512 , 126 , 512 , 0 }, - {84 , 1024 , 126 , 1024 , 0 }, - {84 , 1536 , 126 , 64 , 1 }, - {84 , 2048 , 126 , 256 , 1 }, - {84 , 2560 , 126 , 512 , 1 }, - {84 , 3072 , 126 , 32 , 1 }, - {84 , 3584 , 126 , 256 , 1 }, - {84 , 4096 , 126 , 4096 , 0 }, - {84 , 4608 , 126 , 256 , 1 }, - {84 , 5120 , 126 , 1024 , 1 }, - {84 , 5632 , 126 , 512 , 1 }, - {84 , 6144 , 126 , 512 , 0 }, - {84 , 6656 , 126 , 512 , 1 }, - {84 , 7168 , 126 , 1024 , 1 }, - {84 , 7680 , 126 , 512 , 1 }, - {84 , 8192 , 126 , 4096 , 1 }, - {84 , 8704 , 126 , 512 , 1 }, - {84 , 9216 , 126 , 128 , 1 }, - {84 , 9728 , 126 , 256 , 1 }, - {84 , 10240 , 126 , 2048 , 1 }, - {84 , 20480 , 126 , 2048 , 1 }, - {84 , 30720 , 126 , 1024 , 1 }, - {84 , 40960 , 126 , 512 , 0 }, - {84 , 51200 , 126 , 51200 , 1 }, - {84 , 61440 , 126 , 61440 , 1 }, - {84 , 71680 , 126 , 71680 , 1 }, - {84 , 81920 , 126 , 81920 , 1 }, - {84 , 92160 , 126 , 92160 , 1 }, - {84 , 102400 , 126 , 102400 , 1 }, - {84 , 204800 , 126 , 204800 , 1 }, - {84 , 307200 , 126 , 307200 , 1 }, - {84 , 409600 , 126 , 409600 , 1 }, - {84 , 512000 , 126 , 512000 , 0 }, - {84 , 614400 , 126 , 614400 , 0 }, - {84 , 716800 , 126 , 716800 
, 0 }, - {84 , 819200 , 126 , 819200 , 0 }, - {84 , 921600 , 126 , 921600 , 0 }, - {84 , 1024000, 126 , 1024000, 0 }, - {120 , 512 , 210 , 512 , 0 }, - {120 , 1024 , 210 , 1024 , 0 }, - {120 , 1536 , 210 , 128 , 1 }, - {120 , 2048 , 210 , 2048 , 0 }, - {120 , 2560 , 210 , 256 , 1 }, - {120 , 3072 , 210 , 256 , 1 }, - {120 , 3584 , 210 , 512 , 1 }, - {120 , 4096 , 210 , 2048 , 1 }, - {120 , 4608 , 210 , 512 , 1 }, - {120 , 5120 , 210 , 512 , 1 }, - {120 , 5632 , 210 , 256 , 1 }, - {120 , 6144 , 210 , 2048 , 1 }, - {120 , 6656 , 210 , 128 , 1 }, - {120 , 7168 , 210 , 256 , 1 }, - {120 , 7680 , 210 , 256 , 1 }, - {120 , 8192 , 210 , 2048 , 1 }, - {120 , 8704 , 210 , 256 , 1 }, - {120 , 9216 , 210 , 128 , 1 }, - {120 , 9728 , 210 , 256 , 1 }, - {120 , 10240 , 210 , 512 , 1 }, - {120 , 20480 , 210 , 2048 , 1 }, - {120 , 30720 , 210 , 1024 , 1 }, - {120 , 40960 , 210 , 1024 , 0 }, - {120 , 51200 , 210 , 256 , 0 }, - {120 , 61440 , 210 , 512 , 0 }, - {120 , 71680 , 210 , 2048 , 1 }, - {120 , 81920 , 210 , 512 , 0 }, - {120 , 92160 , 210 , 512 , 0 }, - {120 , 102400 , 210 , 256 , 0 }, - {120 , 204800 , 210 , 4096 , 0 }, - {120 , 307200 , 210 , 1024 , 0 }, - {120 , 409600 , 210 , 1024 , 0 }, - {120 , 512000 , 210 , 512000 , 0 }, - {120 , 614400 , 210 , 256 , 0 }, - {120 , 716800 , 210 , 256 , 0 }, - {120 , 819200 , 210 , 128 , 0 }, - {120 , 921600 , 210 , 128 , 0 }, - {120 , 1024000, 210 , 128 , 0 }, - {165 , 512 , 330 , 512 , 0 }, - {165 , 1024 , 330 , 1024 , 0 }, - {165 , 1536 , 330 , 128 , 1 }, - {165 , 2048 , 330 , 512 , 1 }, - {165 , 2560 , 330 , 512 , 1 }, - {165 , 3072 , 330 , 256 , 1 }, - {165 , 3584 , 330 , 256 , 1 }, - {165 , 4096 , 330 , 4096 , 0 }, - {165 , 4608 , 330 , 512 , 1 }, - {165 , 5120 , 330 , 256 , 1 }, - {165 , 5632 , 330 , 512 , 1 }, - {165 , 6144 , 330 , 512 , 1 }, - {165 , 6656 , 330 , 512 , 1 }, - {165 , 7168 , 330 , 512 , 1 }, - {165 , 7680 , 330 , 512 , 1 }, - {165 , 8192 , 330 , 512 , 1 }, - {165 , 8704 , 330 , 512 , 1 }, - {165 , 9216 , 330 , 
512 , 1 }, - {165 , 9728 , 330 , 512 , 1 }, - {165 , 10240 , 330 , 1024 , 1 }, - {165 , 20480 , 330 , 2048 , 1 }, - {165 , 30720 , 330 , 1024 , 1 }, - {165 , 40960 , 330 , 40960 , 0 }, - {165 , 51200 , 330 , 51200 , 0 }, - {165 , 61440 , 330 , 61440 , 0 }, - {165 , 71680 , 330 , 71680 , 0 }, - {165 , 81920 , 330 , 81920 , 0 }, - {165 , 92160 , 330 , 92160 , 0 }, - {165 , 102400 , 330 , 102400 , 0 }, - {165 , 204800 , 330 , 204800 , 0 }, - {165 , 307200 , 330 , 307200 , 0 }, - {165 , 409600 , 330 , 409600 , 0 }, - {165 , 512000 , 330 , 512000 , 0 }, - {165 , 614400 , 330 , 614400 , 0 }, - {165 , 716800 , 330 , 716800 , 0 }, - {165 , 819200 , 330 , 819200 , 0 }, - {165 , 921600 , 330 , 921600 , 0 }, - {165 , 1024000, 330 , 1024000, 1 } +std::vector > sgemm_nn_a100 = { + {3, 512, 1, 256, 0}, + {3, 1024, 1, 32, 0}, + {3, 1536, 1, 32, 0}, + {3, 2048, 1, 2048, 0}, + {3, 2560, 1, 256, 0}, + {3, 3072, 1, 32, 0}, + {3, 3584, 1, 32, 0}, + {3, 4096, 1, 512, 0}, + {3, 4608, 1, 512, 0}, + {3, 5120, 1, 64, 0}, + {3, 5632, 1, 32, 0}, + {3, 6144, 1, 128, 0}, + {3, 6656, 1, 32, 0}, + {3, 7168, 1, 512, 0}, + {3, 7680, 1, 32, 0}, + {3, 8192, 1, 64, 0}, + {3, 8704, 1, 512, 0}, + {3, 9216, 1, 32, 0}, + {3, 9728, 1, 32, 0}, + {3, 10240, 1, 128, 0}, + {3, 20480, 1, 128, 0}, + {3, 30720, 1, 256, 0}, + {3, 40960, 1, 512, 0}, + {3, 51200, 1, 2048, 0}, + {3, 61440, 1, 32, 0}, + {3, 71680, 1, 2048, 0}, + {3, 81920, 1, 2048, 0}, + {3, 92160, 1, 32, 0}, + {3, 102400, 1, 32, 0}, + {3, 204800, 1, 256, 0}, + {3, 307200, 1, 4096, 0}, + {3, 409600, 1, 8192, 1}, + {3, 512000, 1, 2048, 1}, + {3, 614400, 1, 8192, 1}, + {3, 716800, 1, 1024, 1}, + {3, 819200, 1, 8192, 1}, + {3, 921600, 1, 1024, 1}, + {3, 1024000, 1, 4096, 1}, + {4, 512, 1, 64, 0}, + {4, 1024, 1, 32, 0}, + {4, 1536, 1, 64, 0}, + {4, 2048, 1, 32, 0}, + {4, 2560, 1, 64, 0}, + {4, 3072, 1, 128, 0}, + {4, 3584, 1, 512, 0}, + {4, 4096, 1, 64, 0}, + {4, 4608, 1, 512, 0}, + {4, 5120, 1, 1024, 0}, + {4, 5632, 1, 256, 0}, + {4, 6144, 1, 2048, 0}, 
+ {4, 6656, 1, 256, 0}, + {4, 7168, 1, 512, 0}, + {4, 7680, 1, 128, 0}, + {4, 8192, 1, 32, 0}, + {4, 8704, 1, 32, 0}, + {4, 9216, 1, 32, 0}, + {4, 9728, 1, 32, 0}, + {4, 10240, 1, 2048, 0}, + {4, 20480, 1, 256, 0}, + {4, 30720, 1, 32, 0}, + {4, 40960, 1, 32, 0}, + {4, 51200, 1, 128, 0}, + {4, 61440, 1, 64, 0}, + {4, 71680, 1, 1024, 0}, + {4, 81920, 1, 1024, 0}, + {4, 92160, 1, 512, 0}, + {4, 102400, 1, 1024, 0}, + {4, 204800, 1, 64, 0}, + {4, 307200, 1, 1024, 1}, + {4, 409600, 1, 16384, 1}, + {4, 512000, 1, 2048, 1}, + {4, 614400, 1, 4096, 1}, + {4, 716800, 1, 4096, 1}, + {4, 819200, 1, 32768, 1}, + {4, 921600, 1, 2048, 1}, + {4, 1024000, 1, 8192, 1}, + {6, 512, 3, 512, 0}, + {6, 1024, 3, 1024, 0}, + {6, 1536, 3, 256, 1}, + {6, 2048, 3, 1024, 1}, + {6, 2560, 3, 32, 1}, + {6, 3072, 3, 512, 1}, + {6, 3584, 3, 256, 1}, + {6, 4096, 3, 256, 1}, + {6, 4608, 3, 32, 1}, + {6, 5120, 3, 64, 1}, + {6, 5632, 3, 512, 1}, + {6, 6144, 3, 1024, 1}, + {6, 6656, 3, 64, 1}, + {6, 7168, 3, 512, 1}, + {6, 7680, 3, 32, 1}, + {6, 8192, 3, 8192, 0}, + {6, 8704, 3, 32, 1}, + {6, 9216, 3, 512, 1}, + {6, 9728, 3, 128, 1}, + {6, 10240, 3, 1024, 1}, + {6, 20480, 3, 2048, 1}, + {6, 30720, 3, 1024, 1}, + {6, 40960, 3, 128, 1}, + {6, 51200, 3, 128, 1}, + {6, 61440, 3, 512, 1}, + {6, 71680, 3, 256, 1}, + {6, 81920, 3, 1024, 1}, + {6, 92160, 3, 1024, 1}, + {6, 102400, 3, 1024, 1}, + {6, 204800, 3, 4096, 1}, + {6, 307200, 3, 512, 1}, + {6, 409600, 3, 16384, 1}, + {6, 512000, 3, 1024, 1}, + {6, 614400, 3, 8192, 1}, + {6, 716800, 3, 4096, 1}, + {6, 819200, 3, 8192, 1}, + {6, 921600, 3, 4096, 1}, + {6, 1024000, 3, 8192, 1}, + {10, 512, 4, 512, 0}, + {10, 1024, 4, 1024, 0}, + {10, 1536, 4, 32, 1}, + {10, 2048, 4, 512, 1}, + {10, 2560, 4, 32, 1}, + {10, 3072, 4, 64, 1}, + {10, 3584, 4, 512, 1}, + {10, 4096, 4, 4096, 0}, + {10, 4608, 4, 512, 1}, + {10, 5120, 4, 512, 1}, + {10, 5632, 4, 512, 1}, + {10, 6144, 4, 256, 1}, + {10, 6656, 4, 32, 1}, + {10, 7168, 4, 32, 1}, + {10, 7680, 4, 256, 1}, + {10, 8192, 
4, 8192, 0}, + {10, 8704, 4, 256, 1}, + {10, 9216, 4, 128, 1}, + {10, 9728, 4, 128, 1}, + {10, 10240, 4, 32, 1}, + {10, 20480, 4, 256, 1}, + {10, 30720, 4, 2048, 1}, + {10, 40960, 4, 256, 1}, + {10, 51200, 4, 2048, 1}, + {10, 61440, 4, 4096, 1}, + {10, 71680, 4, 2048, 1}, + {10, 81920, 4, 512, 1}, + {10, 92160, 4, 256, 1}, + {10, 102400, 4, 4096, 1}, + {10, 204800, 4, 1024, 1}, + {10, 307200, 4, 1024, 1}, + {10, 409600, 4, 16384, 1}, + {10, 512000, 4, 4096, 1}, + {10, 614400, 4, 8192, 1}, + {10, 716800, 4, 4096, 1}, + {10, 819200, 4, 32768, 1}, + {10, 921600, 4, 4096, 1}, + {10, 1024000, 4, 8192, 1}, + {10, 512, 6, 512, 0}, + {10, 1024, 6, 1024, 0}, + {10, 1536, 6, 512, 1}, + {10, 2048, 6, 2048, 0}, + {10, 2560, 6, 512, 1}, + {10, 3072, 6, 32, 1}, + {10, 3584, 6, 256, 1}, + {10, 4096, 6, 4096, 0}, + {10, 4608, 6, 512, 1}, + {10, 5120, 6, 256, 1}, + {10, 5632, 6, 512, 1}, + {10, 6144, 6, 256, 1}, + {10, 6656, 6, 512, 1}, + {10, 7168, 6, 512, 1}, + {10, 7680, 6, 512, 1}, + {10, 8192, 6, 8192, 0}, + {10, 8704, 6, 256, 1}, + {10, 9216, 6, 256, 1}, + {10, 9728, 6, 256, 1}, + {10, 10240, 6, 512, 1}, + {10, 20480, 6, 64, 1}, + {10, 30720, 6, 1024, 1}, + {10, 40960, 6, 4096, 1}, + {10, 51200, 6, 512, 1}, + {10, 61440, 6, 4096, 1}, + {10, 71680, 6, 256, 1}, + {10, 81920, 6, 16384, 1}, + {10, 92160, 6, 2048, 1}, + {10, 102400, 6, 2048, 1}, + {10, 204800, 6, 2048, 1}, + {10, 307200, 6, 2048, 1}, + {10, 409600, 6, 2048, 1}, + {10, 512000, 6, 4096, 1}, + {10, 614400, 6, 512, 1}, + {10, 716800, 6, 4096, 1}, + {10, 819200, 6, 32768, 1}, + {10, 921600, 6, 1024, 1}, + {10, 1024000, 6, 8192, 1}, + {15, 512, 12, 512, 0}, + {15, 1024, 12, 512, 1}, + {15, 1536, 12, 32, 1}, + {15, 2048, 12, 2048, 0}, + {15, 2560, 12, 512, 1}, + {15, 3072, 12, 256, 1}, + {15, 3584, 12, 512, 1}, + {15, 4096, 12, 128, 1}, + {15, 4608, 12, 512, 1}, + {15, 5120, 12, 256, 1}, + {15, 5632, 12, 32, 1}, + {15, 6144, 12, 64, 1}, + {15, 6656, 12, 32, 1}, + {15, 7168, 12, 512, 1}, + {15, 7680, 12, 32, 1}, + {15, 
8192, 12, 128, 1}, + {15, 8704, 12, 256, 1}, + {15, 9216, 12, 512, 1}, + {15, 9728, 12, 64, 1}, + {15, 10240, 12, 512, 1}, + {15, 20480, 12, 4096, 1}, + {15, 30720, 12, 128, 1}, + {15, 40960, 12, 8192, 1}, + {15, 51200, 12, 2048, 1}, + {15, 61440, 12, 1024, 1}, + {15, 71680, 12, 2048, 1}, + {15, 81920, 12, 16384, 1}, + {15, 92160, 12, 2048, 1}, + {15, 102400, 12, 1024, 1}, + {15, 204800, 12, 8192, 1}, + {15, 307200, 12, 1024, 1}, + {15, 409600, 12, 16384, 1}, + {15, 512000, 12, 1024, 1}, + {15, 614400, 12, 8192, 1}, + {15, 716800, 12, 4096, 1}, + {15, 819200, 12, 16384, 1}, + {15, 921600, 12, 4096, 1}, + {15, 1024000, 12, 8192, 1}, + {20, 512, 11, 64, 1}, + {20, 1024, 11, 128, 1}, + {20, 1536, 11, 128, 1}, + {20, 2048, 11, 2048, 0}, + {20, 2560, 11, 32, 1}, + {20, 3072, 11, 512, 1}, + {20, 3584, 11, 64, 1}, + {20, 4096, 11, 4096, 0}, + {20, 4608, 11, 128, 1}, + {20, 5120, 11, 1024, 1}, + {20, 5632, 11, 32, 1}, + {20, 6144, 11, 256, 1}, + {20, 6656, 11, 512, 1}, + {20, 7168, 11, 64, 1}, + {20, 7680, 11, 32, 1}, + {20, 8192, 11, 8192, 0}, + {20, 8704, 11, 512, 1}, + {20, 9216, 11, 32, 1}, + {20, 9728, 11, 128, 1}, + {20, 10240, 11, 1024, 1}, + {20, 20480, 11, 512, 1}, + {20, 30720, 11, 128, 1}, + {20, 40960, 11, 4096, 1}, + {20, 51200, 11, 512, 1}, + {20, 61440, 11, 4096, 1}, + {20, 71680, 11, 2048, 1}, + {20, 81920, 11, 8192, 1}, + {20, 92160, 11, 2048, 1}, + {20, 102400, 11, 512, 1}, + {20, 204800, 11, 256, 1}, + {20, 307200, 11, 4096, 1}, + {20, 409600, 11, 16384, 1}, + {20, 512000, 11, 512000, 0}, + {20, 614400, 11, 614400, 0}, + {20, 716800, 11, 716800, 0}, + {20, 819200, 11, 819200, 0}, + {20, 921600, 11, 921600, 0}, + {20, 1024000, 11, 1024000, 0}, + {21, 512, 16, 512, 0}, + {21, 1024, 16, 1024, 0}, + {21, 1536, 16, 256, 1}, + {21, 2048, 16, 2048, 0}, + {21, 2560, 16, 256, 1}, + {21, 3072, 16, 64, 1}, + {21, 3584, 16, 256, 1}, + {21, 4096, 16, 4096, 0}, + {21, 4608, 16, 64, 1}, + {21, 5120, 16, 512, 1}, + {21, 5632, 16, 256, 1}, + {21, 6144, 16, 32, 1}, + {21, 
6656, 16, 512, 1}, + {21, 7168, 16, 1024, 1}, + {21, 7680, 16, 128, 1}, + {21, 8192, 16, 32, 1}, + {21, 8704, 16, 128, 1}, + {21, 9216, 16, 512, 1}, + {21, 9728, 16, 32, 1}, + {21, 10240, 16, 256, 1}, + {21, 20480, 16, 256, 1}, + {21, 30720, 16, 1024, 1}, + {21, 40960, 16, 8192, 1}, + {21, 51200, 16, 64, 1}, + {21, 61440, 16, 4096, 1}, + {21, 71680, 16, 2048, 1}, + {21, 81920, 16, 2048, 1}, + {21, 92160, 16, 512, 1}, + {21, 102400, 16, 4096, 1}, + {21, 204800, 16, 8192, 1}, + {21, 307200, 16, 4096, 1}, + {21, 409600, 16, 409600, 0}, + {21, 512000, 16, 512000, 0}, + {21, 614400, 16, 614400, 0}, + {21, 716800, 16, 716800, 0}, + {21, 819200, 16, 819200, 0}, + {21, 921600, 16, 921600, 0}, + {21, 1024000, 16, 1024000, 0}, + {28, 512, 25, 64, 1}, + {28, 1024, 25, 1024, 0}, + {28, 1536, 25, 256, 1}, + {28, 2048, 25, 2048, 0}, + {28, 2560, 25, 64, 1}, + {28, 3072, 25, 128, 1}, + {28, 3584, 25, 32, 1}, + {28, 4096, 25, 256, 1}, + {28, 4608, 25, 256, 1}, + {28, 5120, 25, 64, 1}, + {28, 5632, 25, 128, 1}, + {28, 6144, 25, 1024, 1}, + {28, 6656, 25, 256, 1}, + {28, 7168, 25, 64, 1}, + {28, 7680, 25, 256, 1}, + {28, 8192, 25, 8192, 0}, + {28, 8704, 25, 256, 1}, + {28, 9216, 25, 256, 1}, + {28, 9728, 25, 128, 1}, + {28, 10240, 25, 1024, 1}, + {28, 20480, 25, 1024, 1}, + {28, 30720, 25, 1024, 1}, + {28, 40960, 25, 2048, 1}, + {28, 51200, 25, 1024, 1}, + {28, 61440, 25, 4096, 1}, + {28, 71680, 25, 512, 1}, + {28, 81920, 25, 512, 1}, + {28, 92160, 25, 2048, 1}, + {28, 102400, 25, 4096, 1}, + {28, 204800, 25, 4096, 1}, + {28, 307200, 25, 307200, 0}, + {28, 409600, 25, 409600, 0}, + {28, 512000, 25, 512000, 0}, + {28, 614400, 25, 614400, 0}, + {28, 716800, 25, 716800, 0}, + {28, 819200, 25, 819200, 0}, + {28, 921600, 25, 921600, 0}, + {28, 1024000, 25, 1024000, 0}, + {35, 512, 24, 512, 0}, + {35, 1024, 24, 1024, 0}, + {35, 1536, 24, 256, 1}, + {35, 2048, 24, 2048, 0}, + {35, 2560, 24, 32, 1}, + {35, 3072, 24, 512, 1}, + {35, 3584, 24, 32, 1}, + {35, 4096, 24, 4096, 0}, + {35, 4608, 
24, 256, 1}, + {35, 5120, 24, 32, 1}, + {35, 5632, 24, 128, 1}, + {35, 6144, 24, 512, 1}, + {35, 6656, 24, 64, 1}, + {35, 7168, 24, 1024, 1}, + {35, 7680, 24, 512, 1}, + {35, 8192, 24, 512, 1}, + {35, 8704, 24, 32, 1}, + {35, 9216, 24, 64, 1}, + {35, 9728, 24, 512, 1}, + {35, 10240, 24, 1024, 1}, + {35, 20480, 24, 32, 1}, + {35, 30720, 24, 512, 1}, + {35, 40960, 24, 2048, 1}, + {35, 51200, 24, 1024, 1}, + {35, 61440, 24, 64, 1}, + {35, 71680, 24, 1024, 1}, + {35, 81920, 24, 256, 1}, + {35, 92160, 24, 512, 1}, + {35, 102400, 24, 512, 1}, + {35, 204800, 24, 4096, 1}, + {35, 307200, 24, 307200, 0}, + {35, 409600, 24, 409600, 0}, + {35, 512000, 24, 512000, 0}, + {35, 614400, 24, 614400, 0}, + {35, 716800, 24, 716800, 0}, + {35, 819200, 24, 819200, 0}, + {35, 921600, 24, 921600, 0}, + {35, 1024000, 24, 1024000, 0}, + {36, 512, 33, 512, 0}, + {36, 1024, 33, 1024, 0}, + {36, 1536, 33, 128, 1}, + {36, 2048, 33, 1024, 1}, + {36, 2560, 33, 512, 1}, + {36, 3072, 33, 256, 1}, + {36, 3584, 33, 32, 1}, + {36, 4096, 33, 4096, 0}, + {36, 4608, 33, 64, 1}, + {36, 5120, 33, 128, 1}, + {36, 5632, 33, 128, 1}, + {36, 6144, 33, 256, 1}, + {36, 6656, 33, 128, 1}, + {36, 7168, 33, 256, 1}, + {36, 7680, 33, 256, 1}, + {36, 8192, 33, 8192, 1}, + {36, 8704, 33, 32, 1}, + {36, 9216, 33, 256, 1}, + {36, 9728, 33, 32, 1}, + {36, 10240, 33, 512, 1}, + {36, 20480, 33, 2048, 1}, + {36, 30720, 33, 2048, 1}, + {36, 40960, 33, 8192, 1}, + {36, 51200, 33, 256, 1}, + {36, 61440, 33, 4096, 1}, + {36, 71680, 33, 512, 1}, + {36, 81920, 33, 16384, 1}, + {36, 92160, 33, 512, 1}, + {36, 102400, 33, 2048, 1}, + {36, 204800, 33, 8192, 1}, + {36, 307200, 33, 512, 1}, + {36, 409600, 33, 409600, 0}, + {36, 512000, 33, 512000, 0}, + {36, 614400, 33, 614400, 0}, + {36, 716800, 33, 716800, 0}, + {36, 819200, 33, 819200, 0}, + {36, 921600, 33, 921600, 0}, + {36, 1024000, 33, 1024000, 0}, + {45, 512, 42, 512, 0}, + {45, 1024, 42, 1024, 0}, + {45, 1536, 42, 64, 1}, + {45, 2048, 42, 2048, 0}, + {45, 2560, 42, 32, 1}, + 
{45, 3072, 42, 1024, 1}, + {45, 3584, 42, 128, 1}, + {45, 4096, 42, 2048, 1}, + {45, 4608, 42, 64, 1}, + {45, 5120, 42, 64, 1}, + {45, 5632, 42, 32, 1}, + {45, 6144, 42, 64, 1}, + {45, 6656, 42, 512, 1}, + {45, 7168, 42, 1024, 1}, + {45, 7680, 42, 256, 1}, + {45, 8192, 42, 4096, 1}, + {45, 8704, 42, 64, 1}, + {45, 9216, 42, 512, 1}, + {45, 9728, 42, 256, 1}, + {45, 10240, 42, 1024, 1}, + {45, 20480, 42, 4096, 1}, + {45, 30720, 42, 2048, 1}, + {45, 40960, 42, 512, 1}, + {45, 51200, 42, 2048, 1}, + {45, 61440, 42, 4096, 1}, + {45, 71680, 42, 1024, 1}, + {45, 81920, 42, 16384, 1}, + {45, 92160, 42, 1024, 1}, + {45, 102400, 42, 1024, 1}, + {45, 204800, 42, 8192, 1}, + {45, 307200, 42, 307200, 0}, + {45, 409600, 42, 409600, 0}, + {45, 512000, 42, 512000, 0}, + {45, 614400, 42, 614400, 0}, + {45, 716800, 42, 716800, 0}, + {45, 819200, 42, 819200, 0}, + {45, 921600, 42, 921600, 0}, + {45, 1024000, 42, 1024000, 0}, + {56, 512, 43, 512, 0}, + {56, 1024, 43, 64, 1}, + {56, 1536, 43, 128, 1}, + {56, 2048, 43, 2048, 0}, + {56, 2560, 43, 32, 1}, + {56, 3072, 43, 1024, 1}, + {56, 3584, 43, 32, 1}, + {56, 4096, 43, 512, 1}, + {56, 4608, 43, 64, 1}, + {56, 5120, 43, 64, 1}, + {56, 5632, 43, 32, 1}, + {56, 6144, 43, 256, 1}, + {56, 6656, 43, 64, 1}, + {56, 7168, 43, 64, 1}, + {56, 7680, 43, 32, 1}, + {56, 8192, 43, 64, 1}, + {56, 8704, 43, 512, 1}, + {56, 9216, 43, 1024, 1}, + {56, 9728, 43, 64, 1}, + {56, 10240, 43, 1024, 1}, + {56, 20480, 43, 1024, 1}, + {56, 30720, 43, 2048, 1}, + {56, 40960, 43, 512, 1}, + {56, 51200, 43, 256, 1}, + {56, 61440, 43, 4096, 1}, + {56, 71680, 43, 2048, 1}, + {56, 81920, 43, 16384, 1}, + {56, 92160, 43, 256, 1}, + {56, 102400, 43, 512, 1}, + {56, 204800, 43, 204800, 0}, + {56, 307200, 43, 307200, 0}, + {56, 409600, 43, 409600, 0}, + {56, 512000, 43, 512000, 0}, + {56, 614400, 43, 614400, 0}, + {56, 716800, 43, 716800, 0}, + {56, 819200, 43, 819200, 0}, + {56, 921600, 43, 921600, 0}, + {56, 1024000, 43, 1024000, 0}, + {84, 512, 126, 512, 0}, + {84, 
1024, 126, 1024, 0}, + {84, 1536, 126, 64, 1}, + {84, 2048, 126, 256, 1}, + {84, 2560, 126, 512, 1}, + {84, 3072, 126, 32, 1}, + {84, 3584, 126, 256, 1}, + {84, 4096, 126, 4096, 0}, + {84, 4608, 126, 256, 1}, + {84, 5120, 126, 1024, 1}, + {84, 5632, 126, 512, 1}, + {84, 6144, 126, 512, 0}, + {84, 6656, 126, 512, 1}, + {84, 7168, 126, 1024, 1}, + {84, 7680, 126, 512, 1}, + {84, 8192, 126, 4096, 1}, + {84, 8704, 126, 512, 1}, + {84, 9216, 126, 128, 1}, + {84, 9728, 126, 256, 1}, + {84, 10240, 126, 2048, 1}, + {84, 20480, 126, 2048, 1}, + {84, 30720, 126, 1024, 1}, + {84, 40960, 126, 512, 0}, + {84, 51200, 126, 51200, 1}, + {84, 61440, 126, 61440, 1}, + {84, 71680, 126, 71680, 1}, + {84, 81920, 126, 81920, 1}, + {84, 92160, 126, 92160, 1}, + {84, 102400, 126, 102400, 1}, + {84, 204800, 126, 204800, 1}, + {84, 307200, 126, 307200, 1}, + {84, 409600, 126, 409600, 1}, + {84, 512000, 126, 512000, 0}, + {84, 614400, 126, 614400, 0}, + {84, 716800, 126, 716800, 0}, + {84, 819200, 126, 819200, 0}, + {84, 921600, 126, 921600, 0}, + {84, 1024000, 126, 1024000, 0}, + {120, 512, 210, 512, 0}, + {120, 1024, 210, 1024, 0}, + {120, 1536, 210, 128, 1}, + {120, 2048, 210, 2048, 0}, + {120, 2560, 210, 256, 1}, + {120, 3072, 210, 256, 1}, + {120, 3584, 210, 512, 1}, + {120, 4096, 210, 2048, 1}, + {120, 4608, 210, 512, 1}, + {120, 5120, 210, 512, 1}, + {120, 5632, 210, 256, 1}, + {120, 6144, 210, 2048, 1}, + {120, 6656, 210, 128, 1}, + {120, 7168, 210, 256, 1}, + {120, 7680, 210, 256, 1}, + {120, 8192, 210, 2048, 1}, + {120, 8704, 210, 256, 1}, + {120, 9216, 210, 128, 1}, + {120, 9728, 210, 256, 1}, + {120, 10240, 210, 512, 1}, + {120, 20480, 210, 2048, 1}, + {120, 30720, 210, 1024, 1}, + {120, 40960, 210, 1024, 0}, + {120, 51200, 210, 256, 0}, + {120, 61440, 210, 512, 0}, + {120, 71680, 210, 2048, 1}, + {120, 81920, 210, 512, 0}, + {120, 92160, 210, 512, 0}, + {120, 102400, 210, 256, 0}, + {120, 204800, 210, 4096, 0}, + {120, 307200, 210, 1024, 0}, + {120, 409600, 210, 1024, 0}, + 
{120, 512000, 210, 512000, 0}, + {120, 614400, 210, 256, 0}, + {120, 716800, 210, 256, 0}, + {120, 819200, 210, 128, 0}, + {120, 921600, 210, 128, 0}, + {120, 1024000, 210, 128, 0}, + {165, 512, 330, 512, 0}, + {165, 1024, 330, 1024, 0}, + {165, 1536, 330, 128, 1}, + {165, 2048, 330, 512, 1}, + {165, 2560, 330, 512, 1}, + {165, 3072, 330, 256, 1}, + {165, 3584, 330, 256, 1}, + {165, 4096, 330, 4096, 0}, + {165, 4608, 330, 512, 1}, + {165, 5120, 330, 256, 1}, + {165, 5632, 330, 512, 1}, + {165, 6144, 330, 512, 1}, + {165, 6656, 330, 512, 1}, + {165, 7168, 330, 512, 1}, + {165, 7680, 330, 512, 1}, + {165, 8192, 330, 512, 1}, + {165, 8704, 330, 512, 1}, + {165, 9216, 330, 512, 1}, + {165, 9728, 330, 512, 1}, + {165, 10240, 330, 1024, 1}, + {165, 20480, 330, 2048, 1}, + {165, 30720, 330, 1024, 1}, + {165, 40960, 330, 40960, 0}, + {165, 51200, 330, 51200, 0}, + {165, 61440, 330, 61440, 0}, + {165, 71680, 330, 71680, 0}, + {165, 81920, 330, 81920, 0}, + {165, 92160, 330, 92160, 0}, + {165, 102400, 330, 102400, 0}, + {165, 204800, 330, 204800, 0}, + {165, 307200, 330, 307200, 0}, + {165, 409600, 330, 409600, 0}, + {165, 512000, 330, 512000, 0}, + {165, 614400, 330, 614400, 0}, + {165, 716800, 330, 716800, 0}, + {165, 819200, 330, 819200, 0}, + {165, 921600, 330, 921600, 0}, + {165, 1024000, 330, 1024000, 1} }; //////////////////////////////////////////////////////////////////////////////// -std::vector< std::array > sgemm_tn_a100 = -{ - {1 , 512 , 3 , 512 , 0 }, - {1 , 1024 , 3 , 1024 , 0 }, - {1 , 1536 , 3 , 32 , 0 }, - {1 , 2048 , 3 , 2048 , 0 }, - {1 , 2560 , 3 , 32 , 0 }, - {1 , 3072 , 3 , 128 , 0 }, - {1 , 3584 , 3 , 128 , 0 }, - {1 , 4096 , 3 , 4096 , 0 }, - {1 , 4608 , 3 , 512 , 0 }, - {1 , 5120 , 3 , 256 , 0 }, - {1 , 5632 , 3 , 64 , 0 }, - {1 , 6144 , 3 , 1024 , 0 }, - {1 , 6656 , 3 , 512 , 0 }, - {1 , 7168 , 3 , 64 , 0 }, - {1 , 7680 , 3 , 128 , 0 }, - {1 , 8192 , 3 , 8192 , 0 }, - {1 , 8704 , 3 , 32 , 0 }, - {1 , 9216 , 3 , 128 , 0 }, - {1 , 9728 , 3 , 128 , 0 
}, - {1 , 10240 , 3 , 256 , 0 }, - {1 , 20480 , 3 , 4096 , 0 }, - {1 , 30720 , 3 , 2048 , 0 }, - {1 , 40960 , 3 , 4096 , 0 }, - {1 , 51200 , 3 , 512 , 1 }, - {1 , 61440 , 3 , 1024 , 1 }, - {1 , 71680 , 3 , 2048 , 1 }, - {1 , 81920 , 3 , 16384 , 1 }, - {1 , 92160 , 3 , 2048 , 1 }, - {1 , 102400 , 3 , 1024 , 1 }, - {1 , 204800 , 3 , 4096 , 1 }, - {1 , 307200 , 3 , 2048 , 1 }, - {1 , 409600 , 3 , 8192 , 1 }, - {1 , 512000 , 3 , 4096 , 1 }, - {1 , 614400 , 3 , 8192 , 1 }, - {1 , 716800 , 3 , 2048 , 1 }, - {1 , 819200 , 3 , 4096 , 1 }, - {1 , 921600 , 3 , 4096 , 1 }, - {1 , 1024000, 3 , 8192 , 1 }, - {1 , 512 , 4 , 64 , 0 }, - {1 , 1024 , 4 , 1024 , 0 }, - {1 , 1536 , 4 , 256 , 0 }, - {1 , 2048 , 4 , 2048 , 0 }, - {1 , 2560 , 4 , 64 , 0 }, - {1 , 3072 , 4 , 32 , 0 }, - {1 , 3584 , 4 , 64 , 0 }, - {1 , 4096 , 4 , 4096 , 0 }, - {1 , 4608 , 4 , 32 , 0 }, - {1 , 5120 , 4 , 256 , 0 }, - {1 , 5632 , 4 , 32 , 0 }, - {1 , 6144 , 4 , 1024 , 0 }, - {1 , 6656 , 4 , 64 , 0 }, - {1 , 7168 , 4 , 512 , 0 }, - {1 , 7680 , 4 , 32 , 0 }, - {1 , 8192 , 4 , 8192 , 0 }, - {1 , 8704 , 4 , 64 , 0 }, - {1 , 9216 , 4 , 32 , 0 }, - {1 , 9728 , 4 , 128 , 0 }, - {1 , 10240 , 4 , 2048 , 0 }, - {1 , 20480 , 4 , 256 , 0 }, - {1 , 30720 , 4 , 1024 , 0 }, - {1 , 40960 , 4 , 8192 , 0 }, - {1 , 51200 , 4 , 128 , 0 }, - {1 , 61440 , 4 , 2048 , 1 }, - {1 , 71680 , 4 , 512 , 1 }, - {1 , 81920 , 4 , 1024 , 1 }, - {1 , 92160 , 4 , 512 , 1 }, - {1 , 102400 , 4 , 4096 , 1 }, - {1 , 204800 , 4 , 512 , 1 }, - {1 , 307200 , 4 , 2048 , 1 }, - {1 , 409600 , 4 , 8192 , 1 }, - {1 , 512000 , 4 , 512 , 1 }, - {1 , 614400 , 4 , 8192 , 1 }, - {1 , 716800 , 4 , 1024 , 1 }, - {1 , 819200 , 4 , 16384 , 1 }, - {1 , 921600 , 4 , 4096 , 1 }, - {1 , 1024000, 4 , 2048 , 1 }, - {3 , 512 , 6 , 512 , 0 }, - {3 , 1024 , 6 , 1024 , 0 }, - {3 , 1536 , 6 , 256 , 1 }, - {3 , 2048 , 6 , 2048 , 0 }, - {3 , 2560 , 6 , 256 , 1 }, - {3 , 3072 , 6 , 128 , 1 }, - {3 , 3584 , 6 , 32 , 1 }, - {3 , 4096 , 6 , 4096 , 0 }, - {3 , 4608 , 6 , 512 , 1 
}, - {3 , 5120 , 6 , 256 , 1 }, - {3 , 5632 , 6 , 256 , 1 }, - {3 , 6144 , 6 , 32 , 1 }, - {3 , 6656 , 6 , 128 , 1 }, - {3 , 7168 , 6 , 32 , 1 }, - {3 , 7680 , 6 , 32 , 1 }, - {3 , 8192 , 6 , 8192 , 0 }, - {3 , 8704 , 6 , 512 , 1 }, - {3 , 9216 , 6 , 256 , 1 }, - {3 , 9728 , 6 , 512 , 1 }, - {3 , 10240 , 6 , 256 , 1 }, - {3 , 20480 , 6 , 64 , 1 }, - {3 , 30720 , 6 , 2048 , 1 }, - {3 , 40960 , 6 , 4096 , 1 }, - {3 , 51200 , 6 , 1024 , 1 }, - {3 , 61440 , 6 , 1024 , 1 }, - {3 , 71680 , 6 , 1024 , 1 }, - {3 , 81920 , 6 , 16384 , 1 }, - {3 , 92160 , 6 , 512 , 1 }, - {3 , 102400 , 6 , 256 , 1 }, - {3 , 204800 , 6 , 8192 , 1 }, - {3 , 307200 , 6 , 512 , 1 }, - {3 , 409600 , 6 , 16384 , 1 }, - {3 , 512000 , 6 , 2048 , 1 }, - {3 , 614400 , 6 , 8192 , 1 }, - {3 , 716800 , 6 , 1024 , 1 }, - {3 , 819200 , 6 , 16384 , 1 }, - {3 , 921600 , 6 , 4096 , 1 }, - {3 , 1024000, 6 , 8192 , 1 }, - {4 , 512 , 10 , 512 , 0 }, - {4 , 1024 , 10 , 1024 , 0 }, - {4 , 1536 , 10 , 64 , 1 }, - {4 , 2048 , 10 , 2048 , 0 }, - {4 , 2560 , 10 , 256 , 1 }, - {4 , 3072 , 10 , 32 , 1 }, - {4 , 3584 , 10 , 256 , 1 }, - {4 , 4096 , 10 , 4096 , 0 }, - {4 , 4608 , 10 , 64 , 1 }, - {4 , 5120 , 10 , 64 , 1 }, - {4 , 5632 , 10 , 64 , 1 }, - {4 , 6144 , 10 , 256 , 1 }, - {4 , 6656 , 10 , 64 , 1 }, - {4 , 7168 , 10 , 128 , 1 }, - {4 , 7680 , 10 , 64 , 1 }, - {4 , 8192 , 10 , 8192 , 0 }, - {4 , 8704 , 10 , 256 , 1 }, - {4 , 9216 , 10 , 32 , 1 }, - {4 , 9728 , 10 , 32 , 1 }, - {4 , 10240 , 10 , 512 , 1 }, - {4 , 20480 , 10 , 256 , 1 }, - {4 , 30720 , 10 , 512 , 1 }, - {4 , 40960 , 10 , 128 , 1 }, - {4 , 51200 , 10 , 512 , 1 }, - {4 , 61440 , 10 , 2048 , 1 }, - {4 , 71680 , 10 , 1024 , 1 }, - {4 , 81920 , 10 , 512 , 1 }, - {4 , 92160 , 10 , 1024 , 1 }, - {4 , 102400 , 10 , 512 , 1 }, - {4 , 204800 , 10 , 2048 , 1 }, - {4 , 307200 , 10 , 4096 , 1 }, - {4 , 409600 , 10 , 4096 , 1 }, - {4 , 512000 , 10 , 2048 , 1 }, - {4 , 614400 , 10 , 4096 , 1 }, - {4 , 716800 , 10 , 2048 , 1 }, - {4 , 819200 , 10 , 16384 , 1 }, - 
{4 , 921600 , 10 , 1024 , 1 }, - {4 , 1024000, 10 , 8192 , 1 }, - {6 , 512 , 10 , 512 , 0 }, - {6 , 1024 , 10 , 1024 , 0 }, - {6 , 1536 , 10 , 256 , 1 }, - {6 , 2048 , 10 , 2048 , 0 }, - {6 , 2560 , 10 , 32 , 1 }, - {6 , 3072 , 10 , 512 , 1 }, - {6 , 3584 , 10 , 256 , 1 }, - {6 , 4096 , 10 , 4096 , 0 }, - {6 , 4608 , 10 , 64 , 1 }, - {6 , 5120 , 10 , 32 , 1 }, - {6 , 5632 , 10 , 512 , 1 }, - {6 , 6144 , 10 , 256 , 1 }, - {6 , 6656 , 10 , 32 , 1 }, - {6 , 7168 , 10 , 1024 , 1 }, - {6 , 7680 , 10 , 256 , 1 }, - {6 , 8192 , 10 , 8192 , 0 }, - {6 , 8704 , 10 , 512 , 1 }, - {6 , 9216 , 10 , 64 , 1 }, - {6 , 9728 , 10 , 64 , 1 }, - {6 , 10240 , 10 , 128 , 1 }, - {6 , 20480 , 10 , 128 , 1 }, - {6 , 30720 , 10 , 1024 , 1 }, - {6 , 40960 , 10 , 8192 , 1 }, - {6 , 51200 , 10 , 256 , 1 }, - {6 , 61440 , 10 , 1024 , 1 }, - {6 , 71680 , 10 , 128 , 1 }, - {6 , 81920 , 10 , 16384 , 1 }, - {6 , 92160 , 10 , 512 , 1 }, - {6 , 102400 , 10 , 256 , 1 }, - {6 , 204800 , 10 , 4096 , 1 }, - {6 , 307200 , 10 , 512 , 1 }, - {6 , 409600 , 10 , 4096 , 1 }, - {6 , 512000 , 10 , 1024 , 1 }, - {6 , 614400 , 10 , 4096 , 1 }, - {6 , 716800 , 10 , 4096 , 1 }, - {6 , 819200 , 10 , 16384 , 1 }, - {6 , 921600 , 10 , 1024 , 1 }, - {6 , 1024000, 10 , 8192 , 1 }, - {12 , 512 , 15 , 512 , 0 }, - {12 , 1024 , 15 , 1024 , 0 }, - {12 , 1536 , 15 , 512 , 1 }, - {12 , 2048 , 15 , 2048 , 0 }, - {12 , 2560 , 15 , 512 , 1 }, - {12 , 3072 , 15 , 64 , 1 }, - {12 , 3584 , 15 , 256 , 1 }, - {12 , 4096 , 15 , 4096 , 0 }, - {12 , 4608 , 15 , 32 , 1 }, - {12 , 5120 , 15 , 256 , 1 }, - {12 , 5632 , 15 , 512 , 1 }, - {12 , 6144 , 15 , 512 , 1 }, - {12 , 6656 , 15 , 512 , 1 }, - {12 , 7168 , 15 , 512 , 1 }, - {12 , 7680 , 15 , 256 , 1 }, - {12 , 8192 , 15 , 512 , 1 }, - {12 , 8704 , 15 , 512 , 1 }, - {12 , 9216 , 15 , 1024 , 1 }, - {12 , 9728 , 15 , 32 , 1 }, - {12 , 10240 , 15 , 128 , 1 }, - {12 , 20480 , 15 , 32 , 1 }, - {12 , 30720 , 15 , 64 , 1 }, - {12 , 40960 , 15 , 2048 , 1 }, - {12 , 51200 , 15 , 1024 , 1 }, - {12 
, 61440 , 15 , 4096 , 1 }, - {12 , 71680 , 15 , 2048 , 1 }, - {12 , 81920 , 15 , 1024 , 1 }, - {12 , 92160 , 15 , 512 , 1 }, - {12 , 102400 , 15 , 1024 , 1 }, - {12 , 204800 , 15 , 4096 , 1 }, - {12 , 307200 , 15 , 1024 , 1 }, - {12 , 409600 , 15 , 8192 , 1 }, - {12 , 512000 , 15 , 2048 , 1 }, - {12 , 614400 , 15 , 8192 , 1 }, - {12 , 716800 , 15 , 4096 , 1 }, - {12 , 819200 , 15 , 32768 , 1 }, - {12 , 921600 , 15 , 4096 , 1 }, - {12 , 1024000, 15 , 8192 , 1 }, - {11 , 512 , 20 , 512 , 0 }, - {11 , 1024 , 20 , 256 , 1 }, - {11 , 1536 , 20 , 32 , 1 }, - {11 , 2048 , 20 , 2048 , 0 }, - {11 , 2560 , 20 , 256 , 1 }, - {11 , 3072 , 20 , 512 , 1 }, - {11 , 3584 , 20 , 512 , 1 }, - {11 , 4096 , 20 , 4096 , 0 }, - {11 , 4608 , 20 , 128 , 1 }, - {11 , 5120 , 20 , 32 , 1 }, - {11 , 5632 , 20 , 512 , 1 }, - {11 , 6144 , 20 , 512 , 1 }, - {11 , 6656 , 20 , 128 , 1 }, - {11 , 7168 , 20 , 256 , 1 }, - {11 , 7680 , 20 , 512 , 1 }, - {11 , 8192 , 20 , 8192 , 0 }, - {11 , 8704 , 20 , 512 , 1 }, - {11 , 9216 , 20 , 64 , 1 }, - {11 , 9728 , 20 , 64 , 1 }, - {11 , 10240 , 20 , 512 , 1 }, - {11 , 20480 , 20 , 2048 , 1 }, - {11 , 30720 , 20 , 256 , 1 }, - {11 , 40960 , 20 , 2048 , 1 }, - {11 , 51200 , 20 , 256 , 1 }, - {11 , 61440 , 20 , 4096 , 1 }, - {11 , 71680 , 20 , 1024 , 1 }, - {11 , 81920 , 20 , 2048 , 1 }, - {11 , 92160 , 20 , 1024 , 1 }, - {11 , 102400 , 20 , 4096 , 1 }, - {11 , 204800 , 20 , 2048 , 1 }, - {11 , 307200 , 20 , 4096 , 1 }, - {11 , 409600 , 20 , 8192 , 1 }, - {11 , 512000 , 20 , 2048 , 1 }, - {11 , 614400 , 20 , 8192 , 1 }, - {11 , 716800 , 20 , 4096 , 1 }, - {11 , 819200 , 20 , 32768 , 1 }, - {11 , 921600 , 20 , 4096 , 1 }, - {11 , 1024000, 20 , 8192 , 1 }, - {16 , 512 , 21 , 512 , 0 }, - {16 , 1024 , 21 , 32 , 1 }, - {16 , 1536 , 21 , 512 , 1 }, - {16 , 2048 , 21 , 2048 , 0 }, - {16 , 2560 , 21 , 32 , 1 }, - {16 , 3072 , 21 , 128 , 1 }, - {16 , 3584 , 21 , 512 , 1 }, - {16 , 4096 , 21 , 4096 , 0 }, - {16 , 4608 , 21 , 512 , 1 }, - {16 , 5120 , 21 , 32 , 1 }, - 
{16 , 5632 , 21 , 512 , 1 }, - {16 , 6144 , 21 , 2048 , 1 }, - {16 , 6656 , 21 , 512 , 1 }, - {16 , 7168 , 21 , 512 , 1 }, - {16 , 7680 , 21 , 64 , 1 }, - {16 , 8192 , 21 , 128 , 1 }, - {16 , 8704 , 21 , 256 , 1 }, - {16 , 9216 , 21 , 512 , 1 }, - {16 , 9728 , 21 , 64 , 1 }, - {16 , 10240 , 21 , 32 , 1 }, - {16 , 20480 , 21 , 256 , 1 }, - {16 , 30720 , 21 , 256 , 1 }, - {16 , 40960 , 21 , 4096 , 1 }, - {16 , 51200 , 21 , 512 , 1 }, - {16 , 61440 , 21 , 2048 , 1 }, - {16 , 71680 , 21 , 512 , 1 }, - {16 , 81920 , 21 , 4096 , 1 }, - {16 , 92160 , 21 , 2048 , 1 }, - {16 , 102400 , 21 , 4096 , 1 }, - {16 , 204800 , 21 , 4096 , 1 }, - {16 , 307200 , 21 , 4096 , 1 }, - {16 , 409600 , 21 , 8192 , 1 }, - {16 , 512000 , 21 , 4096 , 1 }, - {16 , 614400 , 21 , 8192 , 1 }, - {16 , 716800 , 21 , 4096 , 1 }, - {16 , 819200 , 21 , 32768 , 1 }, - {16 , 921600 , 21 , 512 , 1 }, - {16 , 1024000, 21 , 8192 , 1 }, - {25 , 512 , 28 , 32 , 1 }, - {25 , 1024 , 28 , 1024 , 0 }, - {25 , 1536 , 28 , 64 , 1 }, - {25 , 2048 , 28 , 2048 , 0 }, - {25 , 2560 , 28 , 64 , 1 }, - {25 , 3072 , 28 , 512 , 1 }, - {25 , 3584 , 28 , 256 , 1 }, - {25 , 4096 , 28 , 2048 , 1 }, - {25 , 4608 , 28 , 256 , 1 }, - {25 , 5120 , 28 , 64 , 1 }, - {25 , 5632 , 28 , 512 , 1 }, - {25 , 6144 , 28 , 128 , 1 }, - {25 , 6656 , 28 , 512 , 1 }, - {25 , 7168 , 28 , 256 , 1 }, - {25 , 7680 , 28 , 512 , 1 }, - {25 , 8192 , 28 , 8192 , 0 }, - {25 , 8704 , 28 , 128 , 1 }, - {25 , 9216 , 28 , 256 , 1 }, - {25 , 9728 , 28 , 64 , 1 }, - {25 , 10240 , 28 , 256 , 1 }, - {25 , 20480 , 28 , 512 , 1 }, - {25 , 30720 , 28 , 1024 , 1 }, - {25 , 40960 , 28 , 4096 , 1 }, - {25 , 51200 , 28 , 1024 , 1 }, - {25 , 61440 , 28 , 512 , 1 }, - {25 , 71680 , 28 , 2048 , 1 }, - {25 , 81920 , 28 , 2048 , 1 }, - {25 , 92160 , 28 , 512 , 1 }, - {25 , 102400 , 28 , 4096 , 1 }, - {25 , 204800 , 28 , 512 , 1 }, - {25 , 307200 , 28 , 307200 , 0 }, - {25 , 409600 , 28 , 409600 , 0 }, - {25 , 512000 , 28 , 512000 , 0 }, - {25 , 614400 , 28 , 614400 , 0 }, - 
{25 , 716800 , 28 , 716800 , 0 }, - {25 , 819200 , 28 , 819200 , 0 }, - {25 , 921600 , 28 , 921600 , 0 }, - {25 , 1024000, 28 , 1024000, 0 }, - {24 , 512 , 35 , 512 , 0 }, - {24 , 1024 , 35 , 1024 , 0 }, - {24 , 1536 , 35 , 64 , 1 }, - {24 , 2048 , 35 , 2048 , 0 }, - {24 , 2560 , 35 , 128 , 1 }, - {24 , 3072 , 35 , 256 , 1 }, - {24 , 3584 , 35 , 32 , 1 }, - {24 , 4096 , 35 , 64 , 1 }, - {24 , 4608 , 35 , 32 , 1 }, - {24 , 5120 , 35 , 32 , 1 }, - {24 , 5632 , 35 , 128 , 1 }, - {24 , 6144 , 35 , 256 , 1 }, - {24 , 6656 , 35 , 512 , 1 }, - {24 , 7168 , 35 , 128 , 1 }, - {24 , 7680 , 35 , 32 , 1 }, - {24 , 8192 , 35 , 8192 , 0 }, - {24 , 8704 , 35 , 512 , 1 }, - {24 , 9216 , 35 , 1024 , 1 }, - {24 , 9728 , 35 , 32 , 1 }, - {24 , 10240 , 35 , 2048 , 1 }, - {24 , 20480 , 35 , 4096 , 1 }, - {24 , 30720 , 35 , 1024 , 1 }, - {24 , 40960 , 35 , 512 , 1 }, - {24 , 51200 , 35 , 512 , 1 }, - {24 , 61440 , 35 , 4096 , 1 }, - {24 , 71680 , 35 , 1024 , 1 }, - {24 , 81920 , 35 , 2048 , 1 }, - {24 , 92160 , 35 , 2048 , 1 }, - {24 , 102400 , 35 , 2048 , 1 }, - {24 , 204800 , 35 , 4096 , 1 }, - {24 , 307200 , 35 , 4096 , 1 }, - {24 , 409600 , 35 , 409600 , 0 }, - {24 , 512000 , 35 , 512000 , 0 }, - {24 , 614400 , 35 , 614400 , 0 }, - {24 , 716800 , 35 , 716800 , 0 }, - {24 , 819200 , 35 , 819200 , 0 }, - {24 , 921600 , 35 , 921600 , 0 }, - {24 , 1024000, 35 , 1024000, 0 }, - {33 , 512 , 36 , 512 , 0 }, - {33 , 1024 , 36 , 1024 , 0 }, - {33 , 1536 , 36 , 512 , 1 }, - {33 , 2048 , 36 , 32 , 1 }, - {33 , 2560 , 36 , 32 , 1 }, - {33 , 3072 , 36 , 32 , 1 }, - {33 , 3584 , 36 , 128 , 1 }, - {33 , 4096 , 36 , 256 , 1 }, - {33 , 4608 , 36 , 64 , 1 }, - {33 , 5120 , 36 , 256 , 1 }, - {33 , 5632 , 36 , 256 , 1 }, - {33 , 6144 , 36 , 2048 , 1 }, - {33 , 6656 , 36 , 32 , 1 }, - {33 , 7168 , 36 , 1024 , 1 }, - {33 , 7680 , 36 , 512 , 1 }, - {33 , 8192 , 36 , 4096 , 1 }, - {33 , 8704 , 36 , 128 , 1 }, - {33 , 9216 , 36 , 128 , 1 }, - {33 , 9728 , 36 , 512 , 1 }, - {33 , 10240 , 36 , 32 , 1 }, - {33 
, 20480 , 36 , 256 , 1 }, - {33 , 30720 , 36 , 1024 , 1 }, - {33 , 40960 , 36 , 2048 , 1 }, - {33 , 51200 , 36 , 1024 , 1 }, - {33 , 61440 , 36 , 2048 , 1 }, - {33 , 71680 , 36 , 1024 , 1 }, - {33 , 81920 , 36 , 8192 , 1 }, - {33 , 92160 , 36 , 1024 , 1 }, - {33 , 102400 , 36 , 2048 , 1 }, - {33 , 204800 , 36 , 8192 , 1 }, - {33 , 307200 , 36 , 4096 , 1 }, - {33 , 409600 , 36 , 409600 , 0 }, - {33 , 512000 , 36 , 512000 , 0 }, - {33 , 614400 , 36 , 614400 , 0 }, - {33 , 716800 , 36 , 716800 , 0 }, - {33 , 819200 , 36 , 819200 , 0 }, - {33 , 921600 , 36 , 921600 , 0 }, - {33 , 1024000, 36 , 1024000, 0 }, - {42 , 512 , 45 , 512 , 0 }, - {42 , 1024 , 45 , 1024 , 0 }, - {42 , 1536 , 45 , 32 , 1 }, - {42 , 2048 , 45 , 2048 , 0 }, - {42 , 2560 , 45 , 64 , 1 }, - {42 , 3072 , 45 , 64 , 1 }, - {42 , 3584 , 45 , 64 , 1 }, - {42 , 4096 , 45 , 32 , 1 }, - {42 , 4608 , 45 , 512 , 1 }, - {42 , 5120 , 45 , 1024 , 1 }, - {42 , 5632 , 45 , 32 , 1 }, - {42 , 6144 , 45 , 64 , 1 }, - {42 , 6656 , 45 , 64 , 1 }, - {42 , 7168 , 45 , 1024 , 1 }, - {42 , 7680 , 45 , 256 , 1 }, - {42 , 8192 , 45 , 128 , 1 }, - {42 , 8704 , 45 , 512 , 1 }, - {42 , 9216 , 45 , 128 , 1 }, - {42 , 9728 , 45 , 128 , 1 }, - {42 , 10240 , 45 , 32 , 1 }, - {42 , 20480 , 45 , 1024 , 1 }, - {42 , 30720 , 45 , 2048 , 1 }, - {42 , 40960 , 45 , 2048 , 1 }, - {42 , 51200 , 45 , 1024 , 1 }, - {42 , 61440 , 45 , 2048 , 1 }, - {42 , 71680 , 45 , 2048 , 1 }, - {42 , 81920 , 45 , 8192 , 1 }, - {42 , 92160 , 45 , 2048 , 1 }, - {42 , 102400 , 45 , 4096 , 1 }, - {42 , 204800 , 45 , 512 , 1 }, - {42 , 307200 , 45 , 307200 , 0 }, - {42 , 409600 , 45 , 409600 , 0 }, - {42 , 512000 , 45 , 512000 , 0 }, - {42 , 614400 , 45 , 614400 , 0 }, - {42 , 716800 , 45 , 716800 , 0 }, - {42 , 819200 , 45 , 819200 , 0 }, - {42 , 921600 , 45 , 921600 , 0 }, - {42 , 1024000, 45 , 1024000, 0 }, - {43 , 512 , 56 , 512 , 0 }, - {43 , 1024 , 56 , 1024 , 0 }, - {43 , 1536 , 56 , 32 , 1 }, - {43 , 2048 , 56 , 2048 , 0 }, - {43 , 2560 , 56 , 32 , 1 }, 
- {43 , 3072 , 56 , 512 , 1 }, - {43 , 3584 , 56 , 256 , 1 }, - {43 , 4096 , 56 , 4096 , 0 }, - {43 , 4608 , 56 , 128 , 1 }, - {43 , 5120 , 56 , 512 , 1 }, - {43 , 5632 , 56 , 256 , 1 }, - {43 , 6144 , 56 , 512 , 1 }, - {43 , 6656 , 56 , 32 , 1 }, - {43 , 7168 , 56 , 128 , 1 }, - {43 , 7680 , 56 , 512 , 1 }, - {43 , 8192 , 56 , 2048 , 1 }, - {43 , 8704 , 56 , 64 , 1 }, - {43 , 9216 , 56 , 64 , 1 }, - {43 , 9728 , 56 , 128 , 1 }, - {43 , 10240 , 56 , 2048 , 1 }, - {43 , 20480 , 56 , 512 , 1 }, - {43 , 30720 , 56 , 2048 , 1 }, - {43 , 40960 , 56 , 4096 , 1 }, - {43 , 51200 , 56 , 2048 , 1 }, - {43 , 61440 , 56 , 4096 , 1 }, - {43 , 71680 , 56 , 1024 , 1 }, - {43 , 81920 , 56 , 16384 , 1 }, - {43 , 92160 , 56 , 2048 , 1 }, - {43 , 102400 , 56 , 4096 , 1 }, - {43 , 204800 , 56 , 1024 , 1 }, - {43 , 307200 , 56 , 307200 , 0 }, - {43 , 409600 , 56 , 409600 , 0 }, - {43 , 512000 , 56 , 512000 , 0 }, - {43 , 614400 , 56 , 614400 , 0 }, - {43 , 716800 , 56 , 716800 , 0 }, - {43 , 819200 , 56 , 819200 , 0 }, - {43 , 921600 , 56 , 921600 , 0 }, - {43 , 1024000, 56 , 1024000, 0 }, - {126 , 512 , 84 , 512 , 0 }, - {126 , 1024 , 84 , 1024 , 0 }, - {126 , 1536 , 84 , 256 , 1 }, - {126 , 2048 , 84 , 2048 , 0 }, - {126 , 2560 , 84 , 32 , 1 }, - {126 , 3072 , 84 , 256 , 1 }, - {126 , 3584 , 84 , 512 , 1 }, - {126 , 4096 , 84 , 4096 , 1 }, - {126 , 4608 , 84 , 256 , 1 }, - {126 , 5120 , 84 , 1024 , 1 }, - {126 , 5632 , 84 , 512 , 1 }, - {126 , 6144 , 84 , 1024 , 1 }, - {126 , 6656 , 84 , 256 , 1 }, - {126 , 7168 , 84 , 512 , 1 }, - {126 , 7680 , 84 , 256 , 1 }, - {126 , 8192 , 84 , 2048 , 1 }, - {126 , 8704 , 84 , 256 , 1 }, - {126 , 9216 , 84 , 1024 , 1 }, - {126 , 9728 , 84 , 128 , 1 }, - {126 , 10240 , 84 , 2048 , 1 }, - {126 , 20480 , 84 , 2048 , 1 }, - {126 , 30720 , 84 , 2048 , 1 }, - {126 , 40960 , 84 , 8192 , 1 }, - {126 , 51200 , 84 , 512 , 1 }, - {126 , 61440 , 84 , 1024 , 1 }, - {126 , 71680 , 84 , 512 , 1 }, - {126 , 81920 , 84 , 16384 , 1 }, - {126 , 92160 , 84 , 512 , 0 
}, - {126 , 102400 , 84 , 2048 , 0 }, - {126 , 204800 , 84 , 512 , 0 }, - {126 , 307200 , 84 , 307200 , 0 }, - {126 , 409600 , 84 , 409600 , 0 }, - {126 , 512000 , 84 , 512000 , 0 }, - {126 , 614400 , 84 , 614400 , 0 }, - {126 , 716800 , 84 , 716800 , 0 }, - {126 , 819200 , 84 , 32768 , 0 }, - {126 , 921600 , 84 , 921600 , 0 }, - {126 , 1024000, 84 , 256 , 0 }, - {210 , 512 , 120 , 512 , 0 }, - {210 , 1024 , 120 , 1024 , 0 }, - {210 , 1536 , 120 , 256 , 1 }, - {210 , 2048 , 120 , 2048 , 1 }, - {210 , 2560 , 120 , 512 , 1 }, - {210 , 3072 , 120 , 1024 , 1 }, - {210 , 3584 , 120 , 512 , 1 }, - {210 , 4096 , 120 , 512 , 1 }, - {210 , 4608 , 120 , 512 , 1 }, - {210 , 5120 , 120 , 1024 , 1 }, - {210 , 5632 , 120 , 512 , 1 }, - {210 , 6144 , 120 , 1024 , 1 }, - {210 , 6656 , 120 , 256 , 1 }, - {210 , 7168 , 120 , 512 , 1 }, - {210 , 7680 , 120 , 512 , 1 }, - {210 , 8192 , 120 , 4096 , 1 }, - {210 , 8704 , 120 , 256 , 1 }, - {210 , 9216 , 120 , 1024 , 1 }, - {210 , 9728 , 120 , 512 , 1 }, - {210 , 10240 , 120 , 512 , 1 }, - {210 , 20480 , 120 , 1024 , 0 }, - {210 , 30720 , 120 , 512 , 1 }, - {210 , 40960 , 120 , 8192 , 0 }, - {210 , 51200 , 120 , 51200 , 0 }, - {210 , 61440 , 120 , 1024 , 0 }, - {210 , 71680 , 120 , 71680 , 0 }, - {210 , 81920 , 120 , 81920 , 0 }, - {210 , 92160 , 120 , 92160 , 0 }, - {210 , 102400 , 120 , 102400 , 0 }, - {210 , 204800 , 120 , 204800 , 0 }, - {210 , 307200 , 120 , 307200 , 0 }, - {210 , 409600 , 120 , 409600 , 0 }, - {210 , 512000 , 120 , 512000 , 0 }, - {210 , 614400 , 120 , 614400 , 0 }, - {210 , 716800 , 120 , 128 , 0 }, - {210 , 819200 , 120 , 128 , 0 }, - {210 , 921600 , 120 , 128 , 0 }, - {210 , 1024000, 120 , 128 , 0 }, - {330 , 512 , 165 , 512 , 0 }, - {330 , 1024 , 165 , 1024 , 0 }, - {330 , 1536 , 165 , 256 , 1 }, - {330 , 2048 , 165 , 1024 , 1 }, - {330 , 2560 , 165 , 256 , 1 }, - {330 , 3072 , 165 , 256 , 1 }, - {330 , 3584 , 165 , 512 , 1 }, - {330 , 4096 , 165 , 1024 , 1 }, - {330 , 4608 , 165 , 512 , 1 }, - {330 , 5120 , 
165 , 512 , 1 }, - {330 , 5632 , 165 , 512 , 1 }, - {330 , 6144 , 165 , 1024 , 1 }, - {330 , 6656 , 165 , 512 , 1 }, - {330 , 7168 , 165 , 512 , 1 }, - {330 , 7680 , 165 , 512 , 1 }, - {330 , 8192 , 165 , 8192 , 1 }, - {330 , 8704 , 165 , 256 , 0 }, - {330 , 9216 , 165 , 128 , 0 }, - {330 , 9728 , 165 , 512 , 1 }, - {330 , 10240 , 165 , 512 , 1 }, - {330 , 20480 , 165 , 20480 , 0 }, - {330 , 30720 , 165 , 1024 , 0 }, - {330 , 40960 , 165 , 4096 , 0 }, - {330 , 51200 , 165 , 51200 , 0 }, - {330 , 61440 , 165 , 61440 , 0 }, - {330 , 71680 , 165 , 512 , 0 }, - {330 , 81920 , 165 , 512 , 0 }, - {330 , 92160 , 165 , 128 , 0 }, - {330 , 102400 , 165 , 102400 , 0 }, - {330 , 204800 , 165 , 204800 , 0 }, - {330 , 307200 , 165 , 307200 , 0 }, - {330 , 409600 , 165 , 128 , 0 }, - {330 , 512000 , 165 , 128 , 0 }, - {330 , 614400 , 165 , 128 , 0 }, - {330 , 716800 , 165 , 128 , 0 }, - {330 , 819200 , 165 , 128 , 0 }, - {330 , 921600 , 165 , 128 , 0 }, - {330 , 1024000, 165 , 128 , 0 } +std::vector > sgemm_tn_a100 = { + {1, 512, 3, 512, 0}, + {1, 1024, 3, 1024, 0}, + {1, 1536, 3, 32, 0}, + {1, 2048, 3, 2048, 0}, + {1, 2560, 3, 32, 0}, + {1, 3072, 3, 128, 0}, + {1, 3584, 3, 128, 0}, + {1, 4096, 3, 4096, 0}, + {1, 4608, 3, 512, 0}, + {1, 5120, 3, 256, 0}, + {1, 5632, 3, 64, 0}, + {1, 6144, 3, 1024, 0}, + {1, 6656, 3, 512, 0}, + {1, 7168, 3, 64, 0}, + {1, 7680, 3, 128, 0}, + {1, 8192, 3, 8192, 0}, + {1, 8704, 3, 32, 0}, + {1, 9216, 3, 128, 0}, + {1, 9728, 3, 128, 0}, + {1, 10240, 3, 256, 0}, + {1, 20480, 3, 4096, 0}, + {1, 30720, 3, 2048, 0}, + {1, 40960, 3, 4096, 0}, + {1, 51200, 3, 512, 1}, + {1, 61440, 3, 1024, 1}, + {1, 71680, 3, 2048, 1}, + {1, 81920, 3, 16384, 1}, + {1, 92160, 3, 2048, 1}, + {1, 102400, 3, 1024, 1}, + {1, 204800, 3, 4096, 1}, + {1, 307200, 3, 2048, 1}, + {1, 409600, 3, 8192, 1}, + {1, 512000, 3, 4096, 1}, + {1, 614400, 3, 8192, 1}, + {1, 716800, 3, 2048, 1}, + {1, 819200, 3, 4096, 1}, + {1, 921600, 3, 4096, 1}, + {1, 1024000, 3, 8192, 1}, + {1, 512, 4, 64, 
0}, + {1, 1024, 4, 1024, 0}, + {1, 1536, 4, 256, 0}, + {1, 2048, 4, 2048, 0}, + {1, 2560, 4, 64, 0}, + {1, 3072, 4, 32, 0}, + {1, 3584, 4, 64, 0}, + {1, 4096, 4, 4096, 0}, + {1, 4608, 4, 32, 0}, + {1, 5120, 4, 256, 0}, + {1, 5632, 4, 32, 0}, + {1, 6144, 4, 1024, 0}, + {1, 6656, 4, 64, 0}, + {1, 7168, 4, 512, 0}, + {1, 7680, 4, 32, 0}, + {1, 8192, 4, 8192, 0}, + {1, 8704, 4, 64, 0}, + {1, 9216, 4, 32, 0}, + {1, 9728, 4, 128, 0}, + {1, 10240, 4, 2048, 0}, + {1, 20480, 4, 256, 0}, + {1, 30720, 4, 1024, 0}, + {1, 40960, 4, 8192, 0}, + {1, 51200, 4, 128, 0}, + {1, 61440, 4, 2048, 1}, + {1, 71680, 4, 512, 1}, + {1, 81920, 4, 1024, 1}, + {1, 92160, 4, 512, 1}, + {1, 102400, 4, 4096, 1}, + {1, 204800, 4, 512, 1}, + {1, 307200, 4, 2048, 1}, + {1, 409600, 4, 8192, 1}, + {1, 512000, 4, 512, 1}, + {1, 614400, 4, 8192, 1}, + {1, 716800, 4, 1024, 1}, + {1, 819200, 4, 16384, 1}, + {1, 921600, 4, 4096, 1}, + {1, 1024000, 4, 2048, 1}, + {3, 512, 6, 512, 0}, + {3, 1024, 6, 1024, 0}, + {3, 1536, 6, 256, 1}, + {3, 2048, 6, 2048, 0}, + {3, 2560, 6, 256, 1}, + {3, 3072, 6, 128, 1}, + {3, 3584, 6, 32, 1}, + {3, 4096, 6, 4096, 0}, + {3, 4608, 6, 512, 1}, + {3, 5120, 6, 256, 1}, + {3, 5632, 6, 256, 1}, + {3, 6144, 6, 32, 1}, + {3, 6656, 6, 128, 1}, + {3, 7168, 6, 32, 1}, + {3, 7680, 6, 32, 1}, + {3, 8192, 6, 8192, 0}, + {3, 8704, 6, 512, 1}, + {3, 9216, 6, 256, 1}, + {3, 9728, 6, 512, 1}, + {3, 10240, 6, 256, 1}, + {3, 20480, 6, 64, 1}, + {3, 30720, 6, 2048, 1}, + {3, 40960, 6, 4096, 1}, + {3, 51200, 6, 1024, 1}, + {3, 61440, 6, 1024, 1}, + {3, 71680, 6, 1024, 1}, + {3, 81920, 6, 16384, 1}, + {3, 92160, 6, 512, 1}, + {3, 102400, 6, 256, 1}, + {3, 204800, 6, 8192, 1}, + {3, 307200, 6, 512, 1}, + {3, 409600, 6, 16384, 1}, + {3, 512000, 6, 2048, 1}, + {3, 614400, 6, 8192, 1}, + {3, 716800, 6, 1024, 1}, + {3, 819200, 6, 16384, 1}, + {3, 921600, 6, 4096, 1}, + {3, 1024000, 6, 8192, 1}, + {4, 512, 10, 512, 0}, + {4, 1024, 10, 1024, 0}, + {4, 1536, 10, 64, 1}, + {4, 2048, 10, 2048, 0}, + {4, 
2560, 10, 256, 1}, + {4, 3072, 10, 32, 1}, + {4, 3584, 10, 256, 1}, + {4, 4096, 10, 4096, 0}, + {4, 4608, 10, 64, 1}, + {4, 5120, 10, 64, 1}, + {4, 5632, 10, 64, 1}, + {4, 6144, 10, 256, 1}, + {4, 6656, 10, 64, 1}, + {4, 7168, 10, 128, 1}, + {4, 7680, 10, 64, 1}, + {4, 8192, 10, 8192, 0}, + {4, 8704, 10, 256, 1}, + {4, 9216, 10, 32, 1}, + {4, 9728, 10, 32, 1}, + {4, 10240, 10, 512, 1}, + {4, 20480, 10, 256, 1}, + {4, 30720, 10, 512, 1}, + {4, 40960, 10, 128, 1}, + {4, 51200, 10, 512, 1}, + {4, 61440, 10, 2048, 1}, + {4, 71680, 10, 1024, 1}, + {4, 81920, 10, 512, 1}, + {4, 92160, 10, 1024, 1}, + {4, 102400, 10, 512, 1}, + {4, 204800, 10, 2048, 1}, + {4, 307200, 10, 4096, 1}, + {4, 409600, 10, 4096, 1}, + {4, 512000, 10, 2048, 1}, + {4, 614400, 10, 4096, 1}, + {4, 716800, 10, 2048, 1}, + {4, 819200, 10, 16384, 1}, + {4, 921600, 10, 1024, 1}, + {4, 1024000, 10, 8192, 1}, + {6, 512, 10, 512, 0}, + {6, 1024, 10, 1024, 0}, + {6, 1536, 10, 256, 1}, + {6, 2048, 10, 2048, 0}, + {6, 2560, 10, 32, 1}, + {6, 3072, 10, 512, 1}, + {6, 3584, 10, 256, 1}, + {6, 4096, 10, 4096, 0}, + {6, 4608, 10, 64, 1}, + {6, 5120, 10, 32, 1}, + {6, 5632, 10, 512, 1}, + {6, 6144, 10, 256, 1}, + {6, 6656, 10, 32, 1}, + {6, 7168, 10, 1024, 1}, + {6, 7680, 10, 256, 1}, + {6, 8192, 10, 8192, 0}, + {6, 8704, 10, 512, 1}, + {6, 9216, 10, 64, 1}, + {6, 9728, 10, 64, 1}, + {6, 10240, 10, 128, 1}, + {6, 20480, 10, 128, 1}, + {6, 30720, 10, 1024, 1}, + {6, 40960, 10, 8192, 1}, + {6, 51200, 10, 256, 1}, + {6, 61440, 10, 1024, 1}, + {6, 71680, 10, 128, 1}, + {6, 81920, 10, 16384, 1}, + {6, 92160, 10, 512, 1}, + {6, 102400, 10, 256, 1}, + {6, 204800, 10, 4096, 1}, + {6, 307200, 10, 512, 1}, + {6, 409600, 10, 4096, 1}, + {6, 512000, 10, 1024, 1}, + {6, 614400, 10, 4096, 1}, + {6, 716800, 10, 4096, 1}, + {6, 819200, 10, 16384, 1}, + {6, 921600, 10, 1024, 1}, + {6, 1024000, 10, 8192, 1}, + {12, 512, 15, 512, 0}, + {12, 1024, 15, 1024, 0}, + {12, 1536, 15, 512, 1}, + {12, 2048, 15, 2048, 0}, + {12, 2560, 15, 512, 
1}, + {12, 3072, 15, 64, 1}, + {12, 3584, 15, 256, 1}, + {12, 4096, 15, 4096, 0}, + {12, 4608, 15, 32, 1}, + {12, 5120, 15, 256, 1}, + {12, 5632, 15, 512, 1}, + {12, 6144, 15, 512, 1}, + {12, 6656, 15, 512, 1}, + {12, 7168, 15, 512, 1}, + {12, 7680, 15, 256, 1}, + {12, 8192, 15, 512, 1}, + {12, 8704, 15, 512, 1}, + {12, 9216, 15, 1024, 1}, + {12, 9728, 15, 32, 1}, + {12, 10240, 15, 128, 1}, + {12, 20480, 15, 32, 1}, + {12, 30720, 15, 64, 1}, + {12, 40960, 15, 2048, 1}, + {12, 51200, 15, 1024, 1}, + {12, 61440, 15, 4096, 1}, + {12, 71680, 15, 2048, 1}, + {12, 81920, 15, 1024, 1}, + {12, 92160, 15, 512, 1}, + {12, 102400, 15, 1024, 1}, + {12, 204800, 15, 4096, 1}, + {12, 307200, 15, 1024, 1}, + {12, 409600, 15, 8192, 1}, + {12, 512000, 15, 2048, 1}, + {12, 614400, 15, 8192, 1}, + {12, 716800, 15, 4096, 1}, + {12, 819200, 15, 32768, 1}, + {12, 921600, 15, 4096, 1}, + {12, 1024000, 15, 8192, 1}, + {11, 512, 20, 512, 0}, + {11, 1024, 20, 256, 1}, + {11, 1536, 20, 32, 1}, + {11, 2048, 20, 2048, 0}, + {11, 2560, 20, 256, 1}, + {11, 3072, 20, 512, 1}, + {11, 3584, 20, 512, 1}, + {11, 4096, 20, 4096, 0}, + {11, 4608, 20, 128, 1}, + {11, 5120, 20, 32, 1}, + {11, 5632, 20, 512, 1}, + {11, 6144, 20, 512, 1}, + {11, 6656, 20, 128, 1}, + {11, 7168, 20, 256, 1}, + {11, 7680, 20, 512, 1}, + {11, 8192, 20, 8192, 0}, + {11, 8704, 20, 512, 1}, + {11, 9216, 20, 64, 1}, + {11, 9728, 20, 64, 1}, + {11, 10240, 20, 512, 1}, + {11, 20480, 20, 2048, 1}, + {11, 30720, 20, 256, 1}, + {11, 40960, 20, 2048, 1}, + {11, 51200, 20, 256, 1}, + {11, 61440, 20, 4096, 1}, + {11, 71680, 20, 1024, 1}, + {11, 81920, 20, 2048, 1}, + {11, 92160, 20, 1024, 1}, + {11, 102400, 20, 4096, 1}, + {11, 204800, 20, 2048, 1}, + {11, 307200, 20, 4096, 1}, + {11, 409600, 20, 8192, 1}, + {11, 512000, 20, 2048, 1}, + {11, 614400, 20, 8192, 1}, + {11, 716800, 20, 4096, 1}, + {11, 819200, 20, 32768, 1}, + {11, 921600, 20, 4096, 1}, + {11, 1024000, 20, 8192, 1}, + {16, 512, 21, 512, 0}, + {16, 1024, 21, 32, 1}, + {16, 
1536, 21, 512, 1}, + {16, 2048, 21, 2048, 0}, + {16, 2560, 21, 32, 1}, + {16, 3072, 21, 128, 1}, + {16, 3584, 21, 512, 1}, + {16, 4096, 21, 4096, 0}, + {16, 4608, 21, 512, 1}, + {16, 5120, 21, 32, 1}, + {16, 5632, 21, 512, 1}, + {16, 6144, 21, 2048, 1}, + {16, 6656, 21, 512, 1}, + {16, 7168, 21, 512, 1}, + {16, 7680, 21, 64, 1}, + {16, 8192, 21, 128, 1}, + {16, 8704, 21, 256, 1}, + {16, 9216, 21, 512, 1}, + {16, 9728, 21, 64, 1}, + {16, 10240, 21, 32, 1}, + {16, 20480, 21, 256, 1}, + {16, 30720, 21, 256, 1}, + {16, 40960, 21, 4096, 1}, + {16, 51200, 21, 512, 1}, + {16, 61440, 21, 2048, 1}, + {16, 71680, 21, 512, 1}, + {16, 81920, 21, 4096, 1}, + {16, 92160, 21, 2048, 1}, + {16, 102400, 21, 4096, 1}, + {16, 204800, 21, 4096, 1}, + {16, 307200, 21, 4096, 1}, + {16, 409600, 21, 8192, 1}, + {16, 512000, 21, 4096, 1}, + {16, 614400, 21, 8192, 1}, + {16, 716800, 21, 4096, 1}, + {16, 819200, 21, 32768, 1}, + {16, 921600, 21, 512, 1}, + {16, 1024000, 21, 8192, 1}, + {25, 512, 28, 32, 1}, + {25, 1024, 28, 1024, 0}, + {25, 1536, 28, 64, 1}, + {25, 2048, 28, 2048, 0}, + {25, 2560, 28, 64, 1}, + {25, 3072, 28, 512, 1}, + {25, 3584, 28, 256, 1}, + {25, 4096, 28, 2048, 1}, + {25, 4608, 28, 256, 1}, + {25, 5120, 28, 64, 1}, + {25, 5632, 28, 512, 1}, + {25, 6144, 28, 128, 1}, + {25, 6656, 28, 512, 1}, + {25, 7168, 28, 256, 1}, + {25, 7680, 28, 512, 1}, + {25, 8192, 28, 8192, 0}, + {25, 8704, 28, 128, 1}, + {25, 9216, 28, 256, 1}, + {25, 9728, 28, 64, 1}, + {25, 10240, 28, 256, 1}, + {25, 20480, 28, 512, 1}, + {25, 30720, 28, 1024, 1}, + {25, 40960, 28, 4096, 1}, + {25, 51200, 28, 1024, 1}, + {25, 61440, 28, 512, 1}, + {25, 71680, 28, 2048, 1}, + {25, 81920, 28, 2048, 1}, + {25, 92160, 28, 512, 1}, + {25, 102400, 28, 4096, 1}, + {25, 204800, 28, 512, 1}, + {25, 307200, 28, 307200, 0}, + {25, 409600, 28, 409600, 0}, + {25, 512000, 28, 512000, 0}, + {25, 614400, 28, 614400, 0}, + {25, 716800, 28, 716800, 0}, + {25, 819200, 28, 819200, 0}, + {25, 921600, 28, 921600, 0}, + {25, 
1024000, 28, 1024000, 0}, + {24, 512, 35, 512, 0}, + {24, 1024, 35, 1024, 0}, + {24, 1536, 35, 64, 1}, + {24, 2048, 35, 2048, 0}, + {24, 2560, 35, 128, 1}, + {24, 3072, 35, 256, 1}, + {24, 3584, 35, 32, 1}, + {24, 4096, 35, 64, 1}, + {24, 4608, 35, 32, 1}, + {24, 5120, 35, 32, 1}, + {24, 5632, 35, 128, 1}, + {24, 6144, 35, 256, 1}, + {24, 6656, 35, 512, 1}, + {24, 7168, 35, 128, 1}, + {24, 7680, 35, 32, 1}, + {24, 8192, 35, 8192, 0}, + {24, 8704, 35, 512, 1}, + {24, 9216, 35, 1024, 1}, + {24, 9728, 35, 32, 1}, + {24, 10240, 35, 2048, 1}, + {24, 20480, 35, 4096, 1}, + {24, 30720, 35, 1024, 1}, + {24, 40960, 35, 512, 1}, + {24, 51200, 35, 512, 1}, + {24, 61440, 35, 4096, 1}, + {24, 71680, 35, 1024, 1}, + {24, 81920, 35, 2048, 1}, + {24, 92160, 35, 2048, 1}, + {24, 102400, 35, 2048, 1}, + {24, 204800, 35, 4096, 1}, + {24, 307200, 35, 4096, 1}, + {24, 409600, 35, 409600, 0}, + {24, 512000, 35, 512000, 0}, + {24, 614400, 35, 614400, 0}, + {24, 716800, 35, 716800, 0}, + {24, 819200, 35, 819200, 0}, + {24, 921600, 35, 921600, 0}, + {24, 1024000, 35, 1024000, 0}, + {33, 512, 36, 512, 0}, + {33, 1024, 36, 1024, 0}, + {33, 1536, 36, 512, 1}, + {33, 2048, 36, 32, 1}, + {33, 2560, 36, 32, 1}, + {33, 3072, 36, 32, 1}, + {33, 3584, 36, 128, 1}, + {33, 4096, 36, 256, 1}, + {33, 4608, 36, 64, 1}, + {33, 5120, 36, 256, 1}, + {33, 5632, 36, 256, 1}, + {33, 6144, 36, 2048, 1}, + {33, 6656, 36, 32, 1}, + {33, 7168, 36, 1024, 1}, + {33, 7680, 36, 512, 1}, + {33, 8192, 36, 4096, 1}, + {33, 8704, 36, 128, 1}, + {33, 9216, 36, 128, 1}, + {33, 9728, 36, 512, 1}, + {33, 10240, 36, 32, 1}, + {33, 20480, 36, 256, 1}, + {33, 30720, 36, 1024, 1}, + {33, 40960, 36, 2048, 1}, + {33, 51200, 36, 1024, 1}, + {33, 61440, 36, 2048, 1}, + {33, 71680, 36, 1024, 1}, + {33, 81920, 36, 8192, 1}, + {33, 92160, 36, 1024, 1}, + {33, 102400, 36, 2048, 1}, + {33, 204800, 36, 8192, 1}, + {33, 307200, 36, 4096, 1}, + {33, 409600, 36, 409600, 0}, + {33, 512000, 36, 512000, 0}, + {33, 614400, 36, 614400, 0}, + {33, 
716800, 36, 716800, 0}, + {33, 819200, 36, 819200, 0}, + {33, 921600, 36, 921600, 0}, + {33, 1024000, 36, 1024000, 0}, + {42, 512, 45, 512, 0}, + {42, 1024, 45, 1024, 0}, + {42, 1536, 45, 32, 1}, + {42, 2048, 45, 2048, 0}, + {42, 2560, 45, 64, 1}, + {42, 3072, 45, 64, 1}, + {42, 3584, 45, 64, 1}, + {42, 4096, 45, 32, 1}, + {42, 4608, 45, 512, 1}, + {42, 5120, 45, 1024, 1}, + {42, 5632, 45, 32, 1}, + {42, 6144, 45, 64, 1}, + {42, 6656, 45, 64, 1}, + {42, 7168, 45, 1024, 1}, + {42, 7680, 45, 256, 1}, + {42, 8192, 45, 128, 1}, + {42, 8704, 45, 512, 1}, + {42, 9216, 45, 128, 1}, + {42, 9728, 45, 128, 1}, + {42, 10240, 45, 32, 1}, + {42, 20480, 45, 1024, 1}, + {42, 30720, 45, 2048, 1}, + {42, 40960, 45, 2048, 1}, + {42, 51200, 45, 1024, 1}, + {42, 61440, 45, 2048, 1}, + {42, 71680, 45, 2048, 1}, + {42, 81920, 45, 8192, 1}, + {42, 92160, 45, 2048, 1}, + {42, 102400, 45, 4096, 1}, + {42, 204800, 45, 512, 1}, + {42, 307200, 45, 307200, 0}, + {42, 409600, 45, 409600, 0}, + {42, 512000, 45, 512000, 0}, + {42, 614400, 45, 614400, 0}, + {42, 716800, 45, 716800, 0}, + {42, 819200, 45, 819200, 0}, + {42, 921600, 45, 921600, 0}, + {42, 1024000, 45, 1024000, 0}, + {43, 512, 56, 512, 0}, + {43, 1024, 56, 1024, 0}, + {43, 1536, 56, 32, 1}, + {43, 2048, 56, 2048, 0}, + {43, 2560, 56, 32, 1}, + {43, 3072, 56, 512, 1}, + {43, 3584, 56, 256, 1}, + {43, 4096, 56, 4096, 0}, + {43, 4608, 56, 128, 1}, + {43, 5120, 56, 512, 1}, + {43, 5632, 56, 256, 1}, + {43, 6144, 56, 512, 1}, + {43, 6656, 56, 32, 1}, + {43, 7168, 56, 128, 1}, + {43, 7680, 56, 512, 1}, + {43, 8192, 56, 2048, 1}, + {43, 8704, 56, 64, 1}, + {43, 9216, 56, 64, 1}, + {43, 9728, 56, 128, 1}, + {43, 10240, 56, 2048, 1}, + {43, 20480, 56, 512, 1}, + {43, 30720, 56, 2048, 1}, + {43, 40960, 56, 4096, 1}, + {43, 51200, 56, 2048, 1}, + {43, 61440, 56, 4096, 1}, + {43, 71680, 56, 1024, 1}, + {43, 81920, 56, 16384, 1}, + {43, 92160, 56, 2048, 1}, + {43, 102400, 56, 4096, 1}, + {43, 204800, 56, 1024, 1}, + {43, 307200, 56, 307200, 0}, + 
{43, 409600, 56, 409600, 0}, + {43, 512000, 56, 512000, 0}, + {43, 614400, 56, 614400, 0}, + {43, 716800, 56, 716800, 0}, + {43, 819200, 56, 819200, 0}, + {43, 921600, 56, 921600, 0}, + {43, 1024000, 56, 1024000, 0}, + {126, 512, 84, 512, 0}, + {126, 1024, 84, 1024, 0}, + {126, 1536, 84, 256, 1}, + {126, 2048, 84, 2048, 0}, + {126, 2560, 84, 32, 1}, + {126, 3072, 84, 256, 1}, + {126, 3584, 84, 512, 1}, + {126, 4096, 84, 4096, 1}, + {126, 4608, 84, 256, 1}, + {126, 5120, 84, 1024, 1}, + {126, 5632, 84, 512, 1}, + {126, 6144, 84, 1024, 1}, + {126, 6656, 84, 256, 1}, + {126, 7168, 84, 512, 1}, + {126, 7680, 84, 256, 1}, + {126, 8192, 84, 2048, 1}, + {126, 8704, 84, 256, 1}, + {126, 9216, 84, 1024, 1}, + {126, 9728, 84, 128, 1}, + {126, 10240, 84, 2048, 1}, + {126, 20480, 84, 2048, 1}, + {126, 30720, 84, 2048, 1}, + {126, 40960, 84, 8192, 1}, + {126, 51200, 84, 512, 1}, + {126, 61440, 84, 1024, 1}, + {126, 71680, 84, 512, 1}, + {126, 81920, 84, 16384, 1}, + {126, 92160, 84, 512, 0}, + {126, 102400, 84, 2048, 0}, + {126, 204800, 84, 512, 0}, + {126, 307200, 84, 307200, 0}, + {126, 409600, 84, 409600, 0}, + {126, 512000, 84, 512000, 0}, + {126, 614400, 84, 614400, 0}, + {126, 716800, 84, 716800, 0}, + {126, 819200, 84, 32768, 0}, + {126, 921600, 84, 921600, 0}, + {126, 1024000, 84, 256, 0}, + {210, 512, 120, 512, 0}, + {210, 1024, 120, 1024, 0}, + {210, 1536, 120, 256, 1}, + {210, 2048, 120, 2048, 1}, + {210, 2560, 120, 512, 1}, + {210, 3072, 120, 1024, 1}, + {210, 3584, 120, 512, 1}, + {210, 4096, 120, 512, 1}, + {210, 4608, 120, 512, 1}, + {210, 5120, 120, 1024, 1}, + {210, 5632, 120, 512, 1}, + {210, 6144, 120, 1024, 1}, + {210, 6656, 120, 256, 1}, + {210, 7168, 120, 512, 1}, + {210, 7680, 120, 512, 1}, + {210, 8192, 120, 4096, 1}, + {210, 8704, 120, 256, 1}, + {210, 9216, 120, 1024, 1}, + {210, 9728, 120, 512, 1}, + {210, 10240, 120, 512, 1}, + {210, 20480, 120, 1024, 0}, + {210, 30720, 120, 512, 1}, + {210, 40960, 120, 8192, 0}, + {210, 51200, 120, 51200, 0}, + 
{210, 61440, 120, 1024, 0}, + {210, 71680, 120, 71680, 0}, + {210, 81920, 120, 81920, 0}, + {210, 92160, 120, 92160, 0}, + {210, 102400, 120, 102400, 0}, + {210, 204800, 120, 204800, 0}, + {210, 307200, 120, 307200, 0}, + {210, 409600, 120, 409600, 0}, + {210, 512000, 120, 512000, 0}, + {210, 614400, 120, 614400, 0}, + {210, 716800, 120, 128, 0}, + {210, 819200, 120, 128, 0}, + {210, 921600, 120, 128, 0}, + {210, 1024000, 120, 128, 0}, + {330, 512, 165, 512, 0}, + {330, 1024, 165, 1024, 0}, + {330, 1536, 165, 256, 1}, + {330, 2048, 165, 1024, 1}, + {330, 2560, 165, 256, 1}, + {330, 3072, 165, 256, 1}, + {330, 3584, 165, 512, 1}, + {330, 4096, 165, 1024, 1}, + {330, 4608, 165, 512, 1}, + {330, 5120, 165, 512, 1}, + {330, 5632, 165, 512, 1}, + {330, 6144, 165, 1024, 1}, + {330, 6656, 165, 512, 1}, + {330, 7168, 165, 512, 1}, + {330, 7680, 165, 512, 1}, + {330, 8192, 165, 8192, 1}, + {330, 8704, 165, 256, 0}, + {330, 9216, 165, 128, 0}, + {330, 9728, 165, 512, 1}, + {330, 10240, 165, 512, 1}, + {330, 20480, 165, 20480, 0}, + {330, 30720, 165, 1024, 0}, + {330, 40960, 165, 4096, 0}, + {330, 51200, 165, 51200, 0}, + {330, 61440, 165, 61440, 0}, + {330, 71680, 165, 512, 0}, + {330, 81920, 165, 512, 0}, + {330, 92160, 165, 128, 0}, + {330, 102400, 165, 102400, 0}, + {330, 204800, 165, 204800, 0}, + {330, 307200, 165, 307200, 0}, + {330, 409600, 165, 128, 0}, + {330, 512000, 165, 128, 0}, + {330, 614400, 165, 128, 0}, + {330, 716800, 165, 128, 0}, + {330, 819200, 165, 128, 0}, + {330, 921600, 165, 128, 0}, + {330, 1024000, 165, 128, 0} }; //////////////////////////////////////////////////////////////////////////////// -std::vector< std::array > dgemm_nn_a100 = -{ - {3 , 512 , 1 , 64 , 0 }, - {3 , 1024 , 1 , 512 , 0 }, - {3 , 1536 , 1 , 512 , 0 }, - {3 , 2048 , 1 , 2048 , 0 }, - {3 , 2560 , 1 , 256 , 0 }, - {3 , 3072 , 1 , 64 , 0 }, - {3 , 3584 , 1 , 32 , 0 }, - {3 , 4096 , 1 , 512 , 0 }, - {3 , 4608 , 1 , 256 , 0 }, - {3 , 5120 , 1 , 1024 , 0 }, - {3 , 5632 , 1 , 256 , 0 
}, - {3 , 6144 , 1 , 512 , 0 }, - {3 , 6656 , 1 , 256 , 0 }, - {3 , 7168 , 1 , 512 , 0 }, - {3 , 7680 , 1 , 256 , 0 }, - {3 , 8192 , 1 , 128 , 0 }, - {3 , 8704 , 1 , 128 , 0 }, - {3 , 9216 , 1 , 1024 , 0 }, - {3 , 9728 , 1 , 32 , 0 }, - {3 , 10240 , 1 , 128 , 0 }, - {3 , 20480 , 1 , 1024 , 0 }, - {3 , 30720 , 1 , 64 , 0 }, - {3 , 40960 , 1 , 128 , 0 }, - {3 , 51200 , 1 , 2048 , 0 }, - {3 , 61440 , 1 , 1024 , 0 }, - {3 , 71680 , 1 , 128 , 0 }, - {3 , 81920 , 1 , 64 , 0 }, - {3 , 92160 , 1 , 512 , 0 }, - {3 , 102400 , 1 , 512 , 0 }, - {3 , 204800 , 1 , 128 , 1 }, - {3 , 307200 , 1 , 4096 , 1 }, - {3 , 409600 , 1 , 1024 , 1 }, - {3 , 512000 , 1 , 4096 , 1 }, - {3 , 614400 , 1 , 1024 , 0 }, - {3 , 716800 , 1 , 32 , 0 }, - {3 , 819200 , 1 , 16384 , 1 }, - {3 , 921600 , 1 , 2048 , 1 }, - {3 , 1024000, 1 , 8192 , 1 }, - {4 , 512 , 1 , 32 , 0 }, - {4 , 1024 , 1 , 1024 , 0 }, - {4 , 1536 , 1 , 64 , 0 }, - {4 , 2048 , 1 , 64 , 0 }, - {4 , 2560 , 1 , 512 , 0 }, - {4 , 3072 , 1 , 512 , 0 }, - {4 , 3584 , 1 , 32 , 0 }, - {4 , 4096 , 1 , 512 , 0 }, - {4 , 4608 , 1 , 64 , 0 }, - {4 , 5120 , 1 , 512 , 0 }, - {4 , 5632 , 1 , 128 , 0 }, - {4 , 6144 , 1 , 128 , 0 }, - {4 , 6656 , 1 , 32 , 0 }, - {4 , 7168 , 1 , 256 , 0 }, - {4 , 7680 , 1 , 512 , 0 }, - {4 , 8192 , 1 , 1024 , 0 }, - {4 , 8704 , 1 , 128 , 0 }, - {4 , 9216 , 1 , 512 , 0 }, - {4 , 9728 , 1 , 256 , 0 }, - {4 , 10240 , 1 , 1024 , 0 }, - {4 , 20480 , 1 , 1024 , 0 }, - {4 , 30720 , 1 , 512 , 0 }, - {4 , 40960 , 1 , 128 , 0 }, - {4 , 51200 , 1 , 32 , 0 }, - {4 , 61440 , 1 , 64 , 0 }, - {4 , 71680 , 1 , 64 , 0 }, - {4 , 81920 , 1 , 64 , 0 }, - {4 , 92160 , 1 , 64 , 0 }, - {4 , 102400 , 1 , 4096 , 0 }, - {4 , 204800 , 1 , 2048 , 1 }, - {4 , 307200 , 1 , 4096 , 1 }, - {4 , 409600 , 1 , 16384 , 1 }, - {4 , 512000 , 1 , 4096 , 1 }, - {4 , 614400 , 1 , 2048 , 0 }, - {4 , 716800 , 1 , 1024 , 1 }, - {4 , 819200 , 1 , 32768 , 1 }, - {4 , 921600 , 1 , 4096 , 1 }, - {4 , 1024000, 1 , 1024 , 0 }, - {6 , 512 , 3 , 512 , 0 }, - {6 , 1024 , 
3 , 1024 , 0 }, - {6 , 1536 , 3 , 32 , 1 }, - {6 , 2048 , 3 , 2048 , 0 }, - {6 , 2560 , 3 , 512 , 1 }, - {6 , 3072 , 3 , 64 , 1 }, - {6 , 3584 , 3 , 32 , 1 }, - {6 , 4096 , 3 , 4096 , 0 }, - {6 , 4608 , 3 , 256 , 1 }, - {6 , 5120 , 3 , 256 , 1 }, - {6 , 5632 , 3 , 256 , 1 }, - {6 , 6144 , 3 , 64 , 1 }, - {6 , 6656 , 3 , 128 , 1 }, - {6 , 7168 , 3 , 512 , 1 }, - {6 , 7680 , 3 , 512 , 1 }, - {6 , 8192 , 3 , 8192 , 0 }, - {6 , 8704 , 3 , 128 , 1 }, - {6 , 9216 , 3 , 256 , 1 }, - {6 , 9728 , 3 , 512 , 1 }, - {6 , 10240 , 3 , 1024 , 1 }, - {6 , 20480 , 3 , 1024 , 1 }, - {6 , 30720 , 3 , 128 , 1 }, - {6 , 40960 , 3 , 1024 , 1 }, - {6 , 51200 , 3 , 2048 , 1 }, - {6 , 61440 , 3 , 2048 , 1 }, - {6 , 71680 , 3 , 2048 , 1 }, - {6 , 81920 , 3 , 512 , 1 }, - {6 , 92160 , 3 , 256 , 1 }, - {6 , 102400 , 3 , 4096 , 1 }, - {6 , 204800 , 3 , 512 , 1 }, - {6 , 307200 , 3 , 512 , 1 }, - {6 , 409600 , 3 , 4096 , 1 }, - {6 , 512000 , 3 , 4096 , 1 }, - {6 , 614400 , 3 , 4096 , 1 }, - {6 , 716800 , 3 , 1024 , 1 }, - {6 , 819200 , 3 , 16384 , 1 }, - {6 , 921600 , 3 , 4096 , 1 }, - {6 , 1024000, 3 , 8192 , 1 }, - {10 , 512 , 4 , 512 , 0 }, - {10 , 1024 , 4 , 1024 , 0 }, - {10 , 1536 , 4 , 256 , 1 }, - {10 , 2048 , 4 , 2048 , 0 }, - {10 , 2560 , 4 , 64 , 1 }, - {10 , 3072 , 4 , 64 , 1 }, - {10 , 3584 , 4 , 32 , 1 }, - {10 , 4096 , 4 , 4096 , 0 }, - {10 , 4608 , 4 , 512 , 1 }, - {10 , 5120 , 4 , 512 , 1 }, - {10 , 5632 , 4 , 256 , 1 }, - {10 , 6144 , 4 , 32 , 1 }, - {10 , 6656 , 4 , 512 , 1 }, - {10 , 7168 , 4 , 1024 , 1 }, - {10 , 7680 , 4 , 512 , 1 }, - {10 , 8192 , 4 , 8192 , 0 }, - {10 , 8704 , 4 , 128 , 1 }, - {10 , 9216 , 4 , 64 , 1 }, - {10 , 9728 , 4 , 512 , 1 }, - {10 , 10240 , 4 , 512 , 1 }, - {10 , 20480 , 4 , 512 , 1 }, - {10 , 30720 , 4 , 128 , 1 }, - {10 , 40960 , 4 , 128 , 1 }, - {10 , 51200 , 4 , 2048 , 1 }, - {10 , 61440 , 4 , 4096 , 1 }, - {10 , 71680 , 4 , 128 , 1 }, - {10 , 81920 , 4 , 1024 , 1 }, - {10 , 92160 , 4 , 1024 , 1 }, - {10 , 102400 , 4 , 4096 , 1 }, - {10 , 
204800 , 4 , 4096 , 1 }, - {10 , 307200 , 4 , 2048 , 1 }, - {10 , 409600 , 4 , 16384 , 1 }, - {10 , 512000 , 4 , 4096 , 1 }, - {10 , 614400 , 4 , 8192 , 1 }, - {10 , 716800 , 4 , 4096 , 1 }, - {10 , 819200 , 4 , 32768 , 1 }, - {10 , 921600 , 4 , 4096 , 1 }, - {10 , 1024000, 4 , 8192 , 1 }, - {10 , 512 , 6 , 512 , 0 }, - {10 , 1024 , 6 , 1024 , 0 }, - {10 , 1536 , 6 , 512 , 1 }, - {10 , 2048 , 6 , 2048 , 0 }, - {10 , 2560 , 6 , 32 , 1 }, - {10 , 3072 , 6 , 512 , 1 }, - {10 , 3584 , 6 , 64 , 1 }, - {10 , 4096 , 6 , 4096 , 0 }, - {10 , 4608 , 6 , 512 , 1 }, - {10 , 5120 , 6 , 128 , 1 }, - {10 , 5632 , 6 , 64 , 1 }, - {10 , 6144 , 6 , 256 , 1 }, - {10 , 6656 , 6 , 512 , 1 }, - {10 , 7168 , 6 , 128 , 1 }, - {10 , 7680 , 6 , 512 , 1 }, - {10 , 8192 , 6 , 8192 , 0 }, - {10 , 8704 , 6 , 512 , 1 }, - {10 , 9216 , 6 , 256 , 1 }, - {10 , 9728 , 6 , 128 , 1 }, - {10 , 10240 , 6 , 256 , 1 }, - {10 , 20480 , 6 , 256 , 1 }, - {10 , 30720 , 6 , 128 , 1 }, - {10 , 40960 , 6 , 64 , 1 }, - {10 , 51200 , 6 , 1024 , 1 }, - {10 , 61440 , 6 , 512 , 1 }, - {10 , 71680 , 6 , 512 , 1 }, - {10 , 81920 , 6 , 16384 , 1 }, - {10 , 92160 , 6 , 1024 , 1 }, - {10 , 102400 , 6 , 1024 , 1 }, - {10 , 204800 , 6 , 4096 , 1 }, - {10 , 307200 , 6 , 2048 , 1 }, - {10 , 409600 , 6 , 8192 , 1 }, - {10 , 512000 , 6 , 1024 , 1 }, - {10 , 614400 , 6 , 8192 , 1 }, - {10 , 716800 , 6 , 4096 , 1 }, - {10 , 819200 , 6 , 32768 , 1 }, - {10 , 921600 , 6 , 4096 , 1 }, - {10 , 1024000, 6 , 8192 , 1 }, - {15 , 512 , 12 , 512 , 0 }, - {15 , 1024 , 12 , 1024 , 0 }, - {15 , 1536 , 12 , 64 , 1 }, - {15 , 2048 , 12 , 2048 , 0 }, - {15 , 2560 , 12 , 256 , 1 }, - {15 , 3072 , 12 , 512 , 1 }, - {15 , 3584 , 12 , 64 , 1 }, - {15 , 4096 , 12 , 4096 , 0 }, - {15 , 4608 , 12 , 256 , 1 }, - {15 , 5120 , 12 , 512 , 1 }, - {15 , 5632 , 12 , 512 , 0 }, - {15 , 6144 , 12 , 256 , 1 }, - {15 , 6656 , 12 , 32 , 1 }, - {15 , 7168 , 12 , 256 , 0 }, - {15 , 7680 , 12 , 512 , 1 }, - {15 , 8192 , 12 , 8192 , 0 }, - {15 , 8704 , 12 , 32 , 1 }, 
- {15 , 9216 , 12 , 64 , 1 }, - {15 , 9728 , 12 , 256 , 1 }, - {15 , 10240 , 12 , 32 , 1 }, - {15 , 20480 , 12 , 32 , 1 }, - {15 , 30720 , 12 , 2048 , 1 }, - {15 , 40960 , 12 , 2048 , 1 }, - {15 , 51200 , 12 , 64 , 1 }, - {15 , 61440 , 12 , 64 , 1 }, - {15 , 71680 , 12 , 512 , 1 }, - {15 , 81920 , 12 , 64 , 1 }, - {15 , 92160 , 12 , 512 , 1 }, - {15 , 102400 , 12 , 512 , 1 }, - {15 , 204800 , 12 , 8192 , 1 }, - {15 , 307200 , 12 , 128 , 1 }, - {15 , 409600 , 12 , 4096 , 1 }, - {15 , 512000 , 12 , 4096 , 1 }, - {15 , 614400 , 12 , 4096 , 1 }, - {15 , 716800 , 12 , 4096 , 1 }, - {15 , 819200 , 12 , 32768 , 1 }, - {15 , 921600 , 12 , 512 , 1 }, - {15 , 1024000, 12 , 8192 , 1 }, - {20 , 512 , 11 , 512 , 0 }, - {20 , 1024 , 11 , 1024 , 0 }, - {20 , 1536 , 11 , 256 , 0 }, - {20 , 2048 , 11 , 2048 , 0 }, - {20 , 2560 , 11 , 128 , 0 }, - {20 , 3072 , 11 , 64 , 1 }, - {20 , 3584 , 11 , 32 , 0 }, - {20 , 4096 , 11 , 4096 , 0 }, - {20 , 4608 , 11 , 256 , 1 }, - {20 , 5120 , 11 , 64 , 1 }, - {20 , 5632 , 11 , 256 , 1 }, - {20 , 6144 , 11 , 128 , 1 }, - {20 , 6656 , 11 , 32 , 1 }, - {20 , 7168 , 11 , 256 , 1 }, - {20 , 7680 , 11 , 64 , 1 }, - {20 , 8192 , 11 , 8192 , 0 }, - {20 , 8704 , 11 , 256 , 1 }, - {20 , 9216 , 11 , 256 , 1 }, - {20 , 9728 , 11 , 256 , 1 }, - {20 , 10240 , 11 , 64 , 1 }, - {20 , 20480 , 11 , 2048 , 1 }, - {20 , 30720 , 11 , 2048 , 1 }, - {20 , 40960 , 11 , 512 , 1 }, - {20 , 51200 , 11 , 128 , 1 }, - {20 , 61440 , 11 , 2048 , 1 }, - {20 , 71680 , 11 , 128 , 1 }, - {20 , 81920 , 11 , 16384 , 1 }, - {20 , 92160 , 11 , 64 , 1 }, - {20 , 102400 , 11 , 64 , 1 }, - {20 , 204800 , 11 , 2048 , 1 }, - {20 , 307200 , 11 , 256 , 1 }, - {20 , 409600 , 11 , 16384 , 1 }, - {20 , 512000 , 11 , 256 , 1 }, - {20 , 614400 , 11 , 8192 , 1 }, - {20 , 716800 , 11 , 512 , 1 }, - {20 , 819200 , 11 , 32768 , 1 }, - {20 , 921600 , 11 , 512 , 0 }, - {20 , 1024000, 11 , 8192 , 0 }, - {21 , 512 , 16 , 512 , 0 }, - {21 , 1024 , 16 , 1024 , 0 }, - {21 , 1536 , 16 , 256 , 0 }, - {21 , 
2048 , 16 , 2048 , 0 }, - {21 , 2560 , 16 , 128 , 0 }, - {21 , 3072 , 16 , 64 , 1 }, - {21 , 3584 , 16 , 64 , 0 }, - {21 , 4096 , 16 , 4096 , 0 }, - {21 , 4608 , 16 , 128 , 0 }, - {21 , 5120 , 16 , 256 , 0 }, - {21 , 5632 , 16 , 32 , 0 }, - {21 , 6144 , 16 , 128 , 0 }, - {21 , 6656 , 16 , 64 , 1 }, - {21 , 7168 , 16 , 128 , 1 }, - {21 , 7680 , 16 , 32 , 0 }, - {21 , 8192 , 16 , 4096 , 1 }, - {21 , 8704 , 16 , 64 , 0 }, - {21 , 9216 , 16 , 64 , 1 }, - {21 , 9728 , 16 , 512 , 1 }, - {21 , 10240 , 16 , 256 , 0 }, - {21 , 20480 , 16 , 512 , 1 }, - {21 , 30720 , 16 , 32 , 1 }, - {21 , 40960 , 16 , 64 , 1 }, - {21 , 51200 , 16 , 64 , 1 }, - {21 , 61440 , 16 , 32 , 1 }, - {21 , 71680 , 16 , 1024 , 1 }, - {21 , 81920 , 16 , 128 , 1 }, - {21 , 92160 , 16 , 512 , 1 }, - {21 , 102400 , 16 , 256 , 1 }, - {21 , 204800 , 16 , 256 , 1 }, - {21 , 307200 , 16 , 128 , 1 }, - {21 , 409600 , 16 , 16384 , 1 }, - {21 , 512000 , 16 , 256 , 1 }, - {21 , 614400 , 16 , 8192 , 1 }, - {21 , 716800 , 16 , 1024 , 1 }, - {21 , 819200 , 16 , 32768 , 1 }, - {21 , 921600 , 16 , 512 , 1 }, - {21 , 1024000, 16 , 1024 , 0 }, - {28 , 512 , 25 , 64 , 0 }, - {28 , 1024 , 25 , 256 , 0 }, - {28 , 1536 , 25 , 64 , 0 }, - {28 , 2048 , 25 , 2048 , 0 }, - {28 , 2560 , 25 , 32 , 0 }, - {28 , 3072 , 25 , 64 , 0 }, - {28 , 3584 , 25 , 64 , 0 }, - {28 , 4096 , 25 , 64 , 0 }, - {28 , 4608 , 25 , 128 , 0 }, - {28 , 5120 , 25 , 128 , 0 }, - {28 , 5632 , 25 , 128 , 1 }, - {28 , 6144 , 25 , 256 , 0 }, - {28 , 6656 , 25 , 512 , 0 }, - {28 , 7168 , 25 , 256 , 0 }, - {28 , 7680 , 25 , 128 , 0 }, - {28 , 8192 , 25 , 8192 , 0 }, - {28 , 8704 , 25 , 64 , 1 }, - {28 , 9216 , 25 , 1024 , 0 }, - {28 , 9728 , 25 , 64 , 0 }, - {28 , 10240 , 25 , 256 , 1 }, - {28 , 20480 , 25 , 128 , 1 }, - {28 , 30720 , 25 , 32 , 1 }, - {28 , 40960 , 25 , 2048 , 1 }, - {28 , 51200 , 25 , 2048 , 1 }, - {28 , 61440 , 25 , 4096 , 1 }, - {28 , 71680 , 25 , 512 , 1 }, - {28 , 81920 , 25 , 8192 , 1 }, - {28 , 92160 , 25 , 256 , 1 }, - {28 , 102400 , 25 
, 4096 , 1 }, - {28 , 204800 , 25 , 8192 , 1 }, - {28 , 307200 , 25 , 2048 , 0 }, - {28 , 409600 , 25 , 4096 , 0 }, - {28 , 512000 , 25 , 256 , 0 }, - {28 , 614400 , 25 , 2048 , 0 }, - {28 , 716800 , 25 , 512 , 0 }, - {28 , 819200 , 25 , 8192 , 0 }, - {28 , 921600 , 25 , 1024 , 0 }, - {28 , 1024000, 25 , 1024 , 0 }, - {35 , 512 , 24 , 512 , 0 }, - {35 , 1024 , 24 , 1024 , 0 }, - {35 , 1536 , 24 , 512 , 0 }, - {35 , 2048 , 24 , 2048 , 0 }, - {35 , 2560 , 24 , 128 , 0 }, - {35 , 3072 , 24 , 128 , 0 }, - {35 , 3584 , 24 , 64 , 0 }, - {35 , 4096 , 24 , 256 , 1 }, - {35 , 4608 , 24 , 64 , 1 }, - {35 , 5120 , 24 , 512 , 1 }, - {35 , 5632 , 24 , 32 , 1 }, - {35 , 6144 , 24 , 32 , 1 }, - {35 , 6656 , 24 , 32 , 1 }, - {35 , 7168 , 24 , 256 , 1 }, - {35 , 7680 , 24 , 128 , 1 }, - {35 , 8192 , 24 , 256 , 1 }, - {35 , 8704 , 24 , 64 , 1 }, - {35 , 9216 , 24 , 256 , 1 }, - {35 , 9728 , 24 , 64 , 1 }, - {35 , 10240 , 24 , 128 , 1 }, - {35 , 20480 , 24 , 1024 , 1 }, - {35 , 30720 , 24 , 32 , 1 }, - {35 , 40960 , 24 , 8192 , 1 }, - {35 , 51200 , 24 , 1024 , 1 }, - {35 , 61440 , 24 , 2048 , 1 }, - {35 , 71680 , 24 , 2048 , 1 }, - {35 , 81920 , 24 , 8192 , 1 }, - {35 , 92160 , 24 , 256 , 1 }, - {35 , 102400 , 24 , 4096 , 1 }, - {35 , 204800 , 24 , 8192 , 1 }, - {35 , 307200 , 24 , 1024 , 1 }, - {35 , 409600 , 24 , 16384 , 1 }, - {35 , 512000 , 24 , 1024 , 1 }, - {35 , 614400 , 24 , 2048 , 1 }, - {35 , 716800 , 24 , 716800 , 0 }, - {35 , 819200 , 24 , 32768 , 1 }, - {35 , 921600 , 24 , 921600 , 0 }, - {35 , 1024000, 24 , 1024000, 0 }, - {36 , 512 , 33 , 512 , 0 }, - {36 , 1024 , 33 , 1024 , 0 }, - {36 , 1536 , 33 , 128 , 0 }, - {36 , 2048 , 33 , 128 , 0 }, - {36 , 2560 , 33 , 64 , 0 }, - {36 , 3072 , 33 , 64 , 0 }, - {36 , 3584 , 33 , 128 , 0 }, - {36 , 4096 , 33 , 2048 , 1 }, - {36 , 4608 , 33 , 128 , 0 }, - {36 , 5120 , 33 , 1024 , 0 }, - {36 , 5632 , 33 , 512 , 1 }, - {36 , 6144 , 33 , 2048 , 0 }, - {36 , 6656 , 33 , 256 , 1 }, - {36 , 7168 , 33 , 512 , 1 }, - {36 , 7680 , 33 , 64 
, 1 }, - {36 , 8192 , 33 , 4096 , 1 }, - {36 , 8704 , 33 , 512 , 1 }, - {36 , 9216 , 33 , 1024 , 1 }, - {36 , 9728 , 33 , 256 , 1 }, - {36 , 10240 , 33 , 512 , 0 }, - {36 , 20480 , 33 , 4096 , 1 }, - {36 , 30720 , 33 , 128 , 1 }, - {36 , 40960 , 33 , 8192 , 1 }, - {36 , 51200 , 33 , 256 , 1 }, - {36 , 61440 , 33 , 2048 , 1 }, - {36 , 71680 , 33 , 256 , 1 }, - {36 , 81920 , 33 , 16384 , 1 }, - {36 , 92160 , 33 , 256 , 1 }, - {36 , 102400 , 33 , 4096 , 1 }, - {36 , 204800 , 33 , 2048 , 0 }, - {36 , 307200 , 33 , 2048 , 0 }, - {36 , 409600 , 33 , 128 , 0 }, - {36 , 512000 , 33 , 1024 , 0 }, - {36 , 614400 , 33 , 614400 , 0 }, - {36 , 716800 , 33 , 1024 , 0 }, - {36 , 819200 , 33 , 4096 , 0 }, - {36 , 921600 , 33 , 2048 , 0 }, - {36 , 1024000, 33 , 2048 , 0 }, - {45 , 512 , 42 , 512 , 0 }, - {45 , 1024 , 42 , 1024 , 0 }, - {45 , 1536 , 42 , 32 , 0 }, - {45 , 2048 , 42 , 64 , 0 }, - {45 , 2560 , 42 , 128 , 0 }, - {45 , 3072 , 42 , 32 , 0 }, - {45 , 3584 , 42 , 512 , 1 }, - {45 , 4096 , 42 , 128 , 0 }, - {45 , 4608 , 42 , 64 , 1 }, - {45 , 5120 , 42 , 128 , 0 }, - {45 , 5632 , 42 , 512 , 0 }, - {45 , 6144 , 42 , 1024 , 1 }, - {45 , 6656 , 42 , 512 , 1 }, - {45 , 7168 , 42 , 512 , 1 }, - {45 , 7680 , 42 , 512 , 1 }, - {45 , 8192 , 42 , 8192 , 1 }, - {45 , 8704 , 42 , 64 , 1 }, - {45 , 9216 , 42 , 1024 , 1 }, - {45 , 9728 , 42 , 64 , 1 }, - {45 , 10240 , 42 , 256 , 1 }, - {45 , 20480 , 42 , 4096 , 1 }, - {45 , 30720 , 42 , 512 , 1 }, - {45 , 40960 , 42 , 4096 , 1 }, - {45 , 51200 , 42 , 1024 , 1 }, - {45 , 61440 , 42 , 4096 , 1 }, - {45 , 71680 , 42 , 1024 , 1 }, - {45 , 81920 , 42 , 8192 , 1 }, - {45 , 92160 , 42 , 512 , 1 }, - {45 , 102400 , 42 , 4096 , 1 }, - {45 , 204800 , 42 , 4096 , 0 }, - {45 , 307200 , 42 , 4096 , 0 }, - {45 , 409600 , 42 , 4096 , 0 }, - {45 , 512000 , 42 , 1024 , 0 }, - {45 , 614400 , 42 , 2048 , 0 }, - {45 , 716800 , 42 , 2048 , 0 }, - {45 , 819200 , 42 , 32768 , 0 }, - {45 , 921600 , 42 , 4096 , 0 }, - {45 , 1024000, 42 , 512 , 0 }, - {56 , 512 
, 43 , 512 , 0 }, - {56 , 1024 , 43 , 32 , 0 }, - {56 , 1536 , 43 , 64 , 0 }, - {56 , 2048 , 43 , 1024 , 0 }, - {56 , 2560 , 43 , 128 , 0 }, - {56 , 3072 , 43 , 128 , 1 }, - {56 , 3584 , 43 , 128 , 0 }, - {56 , 4096 , 43 , 128 , 0 }, - {56 , 4608 , 43 , 256 , 0 }, - {56 , 5120 , 43 , 256 , 0 }, - {56 , 5632 , 43 , 256 , 1 }, - {56 , 6144 , 43 , 1024 , 1 }, - {56 , 6656 , 43 , 128 , 1 }, - {56 , 7168 , 43 , 1024 , 1 }, - {56 , 7680 , 43 , 256 , 1 }, - {56 , 8192 , 43 , 1024 , 1 }, - {56 , 8704 , 43 , 256 , 1 }, - {56 , 9216 , 43 , 1024 , 1 }, - {56 , 9728 , 43 , 512 , 1 }, - {56 , 10240 , 43 , 1024 , 1 }, - {56 , 20480 , 43 , 4096 , 1 }, - {56 , 30720 , 43 , 2048 , 1 }, - {56 , 40960 , 43 , 8192 , 1 }, - {56 , 51200 , 43 , 512 , 1 }, - {56 , 61440 , 43 , 512 , 1 }, - {56 , 71680 , 43 , 512 , 1 }, - {56 , 81920 , 43 , 8192 , 1 }, - {56 , 92160 , 43 , 512 , 1 }, - {56 , 102400 , 43 , 512 , 1 }, - {56 , 204800 , 43 , 4096 , 0 }, - {56 , 307200 , 43 , 307200 , 0 }, - {56 , 409600 , 43 , 16384 , 0 }, - {56 , 512000 , 43 , 4096 , 0 }, - {56 , 614400 , 43 , 614400 , 0 }, - {56 , 716800 , 43 , 4096 , 0 }, - {56 , 819200 , 43 , 819200 , 0 }, - {56 , 921600 , 43 , 921600 , 0 }, - {56 , 1024000, 43 , 8192 , 0 }, - {84 , 512 , 126 , 32 , 0 }, - {84 , 1024 , 126 , 512 , 0 }, - {84 , 1536 , 126 , 256 , 0 }, - {84 , 2048 , 126 , 1024 , 0 }, - {84 , 2560 , 126 , 256 , 0 }, - {84 , 3072 , 126 , 64 , 1 }, - {84 , 3584 , 126 , 512 , 0 }, - {84 , 4096 , 126 , 4096 , 1 }, - {84 , 4608 , 126 , 64 , 1 }, - {84 , 5120 , 126 , 512 , 1 }, - {84 , 5632 , 126 , 64 , 1 }, - {84 , 6144 , 126 , 2048 , 1 }, - {84 , 6656 , 126 , 64 , 1 }, - {84 , 7168 , 126 , 512 , 1 }, - {84 , 7680 , 126 , 256 , 1 }, - {84 , 8192 , 126 , 64 , 1 }, - {84 , 8704 , 126 , 512 , 1 }, - {84 , 9216 , 126 , 32 , 1 }, - {84 , 9728 , 126 , 256 , 1 }, - {84 , 10240 , 126 , 256 , 1 }, - {84 , 20480 , 126 , 64 , 1 }, - {84 , 30720 , 126 , 512 , 0 }, - {84 , 40960 , 126 , 128 , 0 }, - {84 , 51200 , 126 , 2048 , 0 }, - {84 , 
61440 , 126 , 512 , 0 }, - {84 , 71680 , 126 , 128 , 0 }, - {84 , 81920 , 126 , 4096 , 0 }, - {84 , 92160 , 126 , 2048 , 0 }, - {84 , 102400 , 126 , 2048 , 0 }, - {84 , 204800 , 126 , 256 , 0 }, - {84 , 307200 , 126 , 512 , 0 }, - {84 , 409600 , 126 , 512 , 0 }, - {84 , 512000 , 126 , 512 , 0 }, - {84 , 614400 , 126 , 64 , 0 }, - {84 , 716800 , 126 , 64 , 0 }, - {84 , 819200 , 126 , 64 , 0 }, - {84 , 921600 , 126 , 128 , 0 }, - {84 , 1024000, 126 , 128 , 0 }, - {120 , 512 , 210 , 256 , 0 }, - {120 , 1024 , 210 , 1024 , 0 }, - {120 , 1536 , 210 , 512 , 0 }, - {120 , 2048 , 210 , 512 , 0 }, - {120 , 2560 , 210 , 256 , 0 }, - {120 , 3072 , 210 , 512 , 0 }, - {120 , 3584 , 210 , 512 , 0 }, - {120 , 4096 , 210 , 512 , 0 }, - {120 , 4608 , 210 , 512 , 1 }, - {120 , 5120 , 210 , 512 , 0 }, - {120 , 5632 , 210 , 512 , 1 }, - {120 , 6144 , 210 , 256 , 0 }, - {120 , 6656 , 210 , 64 , 0 }, - {120 , 7168 , 210 , 512 , 0 }, - {120 , 7680 , 210 , 512 , 0 }, - {120 , 8192 , 210 , 8192 , 0 }, - {120 , 8704 , 210 , 512 , 0 }, - {120 , 9216 , 210 , 1024 , 0 }, - {120 , 9728 , 210 , 512 , 0 }, - {120 , 10240 , 210 , 1024 , 0 }, - {120 , 20480 , 210 , 4096 , 0 }, - {120 , 30720 , 210 , 30720 , 0 }, - {120 , 40960 , 210 , 512 , 0 }, - {120 , 51200 , 210 , 51200 , 0 }, - {120 , 61440 , 210 , 61440 , 0 }, - {120 , 71680 , 210 , 71680 , 0 }, - {120 , 81920 , 210 , 81920 , 0 }, - {120 , 92160 , 210 , 92160 , 0 }, - {120 , 102400 , 210 , 102400 , 0 }, - {120 , 204800 , 210 , 204800 , 0 }, - {120 , 307200 , 210 , 307200 , 0 }, - {120 , 409600 , 210 , 409600 , 0 }, - {120 , 512000 , 210 , 512 , 0 }, - {120 , 614400 , 210 , 1024 , 0 }, - {120 , 716800 , 210 , 1024 , 0 }, - {120 , 819200 , 210 , 8192 , 0 }, - {120 , 921600 , 210 , 1024 , 0 }, - {120 , 1024000, 210 , 8192 , 0 }, - {165 , 512 , 330 , 256 , 0 }, - {165 , 1024 , 330 , 1024 , 0 }, - {165 , 1536 , 330 , 512 , 0 }, - {165 , 2048 , 330 , 2048 , 0 }, - {165 , 2560 , 330 , 512 , 0 }, - {165 , 3072 , 330 , 512 , 0 }, - {165 , 3584 , 330 , 
512 , 0 }, - {165 , 4096 , 330 , 4096 , 0 }, - {165 , 4608 , 330 , 256 , 0 }, - {165 , 5120 , 330 , 256 , 0 }, - {165 , 5632 , 330 , 512 , 0 }, - {165 , 6144 , 330 , 1024 , 0 }, - {165 , 6656 , 330 , 6656 , 0 }, - {165 , 7168 , 330 , 256 , 0 }, - {165 , 7680 , 330 , 128 , 0 }, - {165 , 8192 , 330 , 8192 , 0 }, - {165 , 8704 , 330 , 512 , 0 }, - {165 , 9216 , 330 , 1024 , 0 }, - {165 , 9728 , 330 , 512 , 0 }, - {165 , 10240 , 330 , 256 , 0 }, - {165 , 20480 , 330 , 20480 , 0 }, - {165 , 30720 , 330 , 30720 , 0 }, - {165 , 40960 , 330 , 40960 , 0 }, - {165 , 51200 , 330 , 51200 , 0 }, - {165 , 61440 , 330 , 61440 , 0 }, - {165 , 71680 , 330 , 71680 , 0 }, - {165 , 81920 , 330 , 81920 , 0 }, - {165 , 92160 , 330 , 92160 , 0 }, - {165 , 102400 , 330 , 102400 , 0 }, - {165 , 204800 , 330 , 204800 , 0 }, - {165 , 307200 , 330 , 307200 , 0 }, - {165 , 409600 , 330 , 409600 , 0 }, - {165 , 512000 , 330 , 512000 , 0 }, - {165 , 614400 , 330 , 614400 , 1 }, - {165 , 716800 , 330 , 716800 , 1 }, - {165 , 819200 , 330 , 819200 , 1 }, - {165 , 921600 , 330 , 921600 , 1 }, - {165 , 1024000, 330 , 1024000, 0 } +std::vector > dgemm_nn_a100 = { + {3, 512, 1, 64, 0}, + {3, 1024, 1, 512, 0}, + {3, 1536, 1, 512, 0}, + {3, 2048, 1, 2048, 0}, + {3, 2560, 1, 256, 0}, + {3, 3072, 1, 64, 0}, + {3, 3584, 1, 32, 0}, + {3, 4096, 1, 512, 0}, + {3, 4608, 1, 256, 0}, + {3, 5120, 1, 1024, 0}, + {3, 5632, 1, 256, 0}, + {3, 6144, 1, 512, 0}, + {3, 6656, 1, 256, 0}, + {3, 7168, 1, 512, 0}, + {3, 7680, 1, 256, 0}, + {3, 8192, 1, 128, 0}, + {3, 8704, 1, 128, 0}, + {3, 9216, 1, 1024, 0}, + {3, 9728, 1, 32, 0}, + {3, 10240, 1, 128, 0}, + {3, 20480, 1, 1024, 0}, + {3, 30720, 1, 64, 0}, + {3, 40960, 1, 128, 0}, + {3, 51200, 1, 2048, 0}, + {3, 61440, 1, 1024, 0}, + {3, 71680, 1, 128, 0}, + {3, 81920, 1, 64, 0}, + {3, 92160, 1, 512, 0}, + {3, 102400, 1, 512, 0}, + {3, 204800, 1, 128, 1}, + {3, 307200, 1, 4096, 1}, + {3, 409600, 1, 1024, 1}, + {3, 512000, 1, 4096, 1}, + {3, 614400, 1, 1024, 0}, + {3, 716800, 
1, 32, 0}, + {3, 819200, 1, 16384, 1}, + {3, 921600, 1, 2048, 1}, + {3, 1024000, 1, 8192, 1}, + {4, 512, 1, 32, 0}, + {4, 1024, 1, 1024, 0}, + {4, 1536, 1, 64, 0}, + {4, 2048, 1, 64, 0}, + {4, 2560, 1, 512, 0}, + {4, 3072, 1, 512, 0}, + {4, 3584, 1, 32, 0}, + {4, 4096, 1, 512, 0}, + {4, 4608, 1, 64, 0}, + {4, 5120, 1, 512, 0}, + {4, 5632, 1, 128, 0}, + {4, 6144, 1, 128, 0}, + {4, 6656, 1, 32, 0}, + {4, 7168, 1, 256, 0}, + {4, 7680, 1, 512, 0}, + {4, 8192, 1, 1024, 0}, + {4, 8704, 1, 128, 0}, + {4, 9216, 1, 512, 0}, + {4, 9728, 1, 256, 0}, + {4, 10240, 1, 1024, 0}, + {4, 20480, 1, 1024, 0}, + {4, 30720, 1, 512, 0}, + {4, 40960, 1, 128, 0}, + {4, 51200, 1, 32, 0}, + {4, 61440, 1, 64, 0}, + {4, 71680, 1, 64, 0}, + {4, 81920, 1, 64, 0}, + {4, 92160, 1, 64, 0}, + {4, 102400, 1, 4096, 0}, + {4, 204800, 1, 2048, 1}, + {4, 307200, 1, 4096, 1}, + {4, 409600, 1, 16384, 1}, + {4, 512000, 1, 4096, 1}, + {4, 614400, 1, 2048, 0}, + {4, 716800, 1, 1024, 1}, + {4, 819200, 1, 32768, 1}, + {4, 921600, 1, 4096, 1}, + {4, 1024000, 1, 1024, 0}, + {6, 512, 3, 512, 0}, + {6, 1024, 3, 1024, 0}, + {6, 1536, 3, 32, 1}, + {6, 2048, 3, 2048, 0}, + {6, 2560, 3, 512, 1}, + {6, 3072, 3, 64, 1}, + {6, 3584, 3, 32, 1}, + {6, 4096, 3, 4096, 0}, + {6, 4608, 3, 256, 1}, + {6, 5120, 3, 256, 1}, + {6, 5632, 3, 256, 1}, + {6, 6144, 3, 64, 1}, + {6, 6656, 3, 128, 1}, + {6, 7168, 3, 512, 1}, + {6, 7680, 3, 512, 1}, + {6, 8192, 3, 8192, 0}, + {6, 8704, 3, 128, 1}, + {6, 9216, 3, 256, 1}, + {6, 9728, 3, 512, 1}, + {6, 10240, 3, 1024, 1}, + {6, 20480, 3, 1024, 1}, + {6, 30720, 3, 128, 1}, + {6, 40960, 3, 1024, 1}, + {6, 51200, 3, 2048, 1}, + {6, 61440, 3, 2048, 1}, + {6, 71680, 3, 2048, 1}, + {6, 81920, 3, 512, 1}, + {6, 92160, 3, 256, 1}, + {6, 102400, 3, 4096, 1}, + {6, 204800, 3, 512, 1}, + {6, 307200, 3, 512, 1}, + {6, 409600, 3, 4096, 1}, + {6, 512000, 3, 4096, 1}, + {6, 614400, 3, 4096, 1}, + {6, 716800, 3, 1024, 1}, + {6, 819200, 3, 16384, 1}, + {6, 921600, 3, 4096, 1}, + {6, 1024000, 3, 8192, 1}, + 
{10, 512, 4, 512, 0}, + {10, 1024, 4, 1024, 0}, + {10, 1536, 4, 256, 1}, + {10, 2048, 4, 2048, 0}, + {10, 2560, 4, 64, 1}, + {10, 3072, 4, 64, 1}, + {10, 3584, 4, 32, 1}, + {10, 4096, 4, 4096, 0}, + {10, 4608, 4, 512, 1}, + {10, 5120, 4, 512, 1}, + {10, 5632, 4, 256, 1}, + {10, 6144, 4, 32, 1}, + {10, 6656, 4, 512, 1}, + {10, 7168, 4, 1024, 1}, + {10, 7680, 4, 512, 1}, + {10, 8192, 4, 8192, 0}, + {10, 8704, 4, 128, 1}, + {10, 9216, 4, 64, 1}, + {10, 9728, 4, 512, 1}, + {10, 10240, 4, 512, 1}, + {10, 20480, 4, 512, 1}, + {10, 30720, 4, 128, 1}, + {10, 40960, 4, 128, 1}, + {10, 51200, 4, 2048, 1}, + {10, 61440, 4, 4096, 1}, + {10, 71680, 4, 128, 1}, + {10, 81920, 4, 1024, 1}, + {10, 92160, 4, 1024, 1}, + {10, 102400, 4, 4096, 1}, + {10, 204800, 4, 4096, 1}, + {10, 307200, 4, 2048, 1}, + {10, 409600, 4, 16384, 1}, + {10, 512000, 4, 4096, 1}, + {10, 614400, 4, 8192, 1}, + {10, 716800, 4, 4096, 1}, + {10, 819200, 4, 32768, 1}, + {10, 921600, 4, 4096, 1}, + {10, 1024000, 4, 8192, 1}, + {10, 512, 6, 512, 0}, + {10, 1024, 6, 1024, 0}, + {10, 1536, 6, 512, 1}, + {10, 2048, 6, 2048, 0}, + {10, 2560, 6, 32, 1}, + {10, 3072, 6, 512, 1}, + {10, 3584, 6, 64, 1}, + {10, 4096, 6, 4096, 0}, + {10, 4608, 6, 512, 1}, + {10, 5120, 6, 128, 1}, + {10, 5632, 6, 64, 1}, + {10, 6144, 6, 256, 1}, + {10, 6656, 6, 512, 1}, + {10, 7168, 6, 128, 1}, + {10, 7680, 6, 512, 1}, + {10, 8192, 6, 8192, 0}, + {10, 8704, 6, 512, 1}, + {10, 9216, 6, 256, 1}, + {10, 9728, 6, 128, 1}, + {10, 10240, 6, 256, 1}, + {10, 20480, 6, 256, 1}, + {10, 30720, 6, 128, 1}, + {10, 40960, 6, 64, 1}, + {10, 51200, 6, 1024, 1}, + {10, 61440, 6, 512, 1}, + {10, 71680, 6, 512, 1}, + {10, 81920, 6, 16384, 1}, + {10, 92160, 6, 1024, 1}, + {10, 102400, 6, 1024, 1}, + {10, 204800, 6, 4096, 1}, + {10, 307200, 6, 2048, 1}, + {10, 409600, 6, 8192, 1}, + {10, 512000, 6, 1024, 1}, + {10, 614400, 6, 8192, 1}, + {10, 716800, 6, 4096, 1}, + {10, 819200, 6, 32768, 1}, + {10, 921600, 6, 4096, 1}, + {10, 1024000, 6, 8192, 1}, + {15, 512, 
12, 512, 0}, + {15, 1024, 12, 1024, 0}, + {15, 1536, 12, 64, 1}, + {15, 2048, 12, 2048, 0}, + {15, 2560, 12, 256, 1}, + {15, 3072, 12, 512, 1}, + {15, 3584, 12, 64, 1}, + {15, 4096, 12, 4096, 0}, + {15, 4608, 12, 256, 1}, + {15, 5120, 12, 512, 1}, + {15, 5632, 12, 512, 0}, + {15, 6144, 12, 256, 1}, + {15, 6656, 12, 32, 1}, + {15, 7168, 12, 256, 0}, + {15, 7680, 12, 512, 1}, + {15, 8192, 12, 8192, 0}, + {15, 8704, 12, 32, 1}, + {15, 9216, 12, 64, 1}, + {15, 9728, 12, 256, 1}, + {15, 10240, 12, 32, 1}, + {15, 20480, 12, 32, 1}, + {15, 30720, 12, 2048, 1}, + {15, 40960, 12, 2048, 1}, + {15, 51200, 12, 64, 1}, + {15, 61440, 12, 64, 1}, + {15, 71680, 12, 512, 1}, + {15, 81920, 12, 64, 1}, + {15, 92160, 12, 512, 1}, + {15, 102400, 12, 512, 1}, + {15, 204800, 12, 8192, 1}, + {15, 307200, 12, 128, 1}, + {15, 409600, 12, 4096, 1}, + {15, 512000, 12, 4096, 1}, + {15, 614400, 12, 4096, 1}, + {15, 716800, 12, 4096, 1}, + {15, 819200, 12, 32768, 1}, + {15, 921600, 12, 512, 1}, + {15, 1024000, 12, 8192, 1}, + {20, 512, 11, 512, 0}, + {20, 1024, 11, 1024, 0}, + {20, 1536, 11, 256, 0}, + {20, 2048, 11, 2048, 0}, + {20, 2560, 11, 128, 0}, + {20, 3072, 11, 64, 1}, + {20, 3584, 11, 32, 0}, + {20, 4096, 11, 4096, 0}, + {20, 4608, 11, 256, 1}, + {20, 5120, 11, 64, 1}, + {20, 5632, 11, 256, 1}, + {20, 6144, 11, 128, 1}, + {20, 6656, 11, 32, 1}, + {20, 7168, 11, 256, 1}, + {20, 7680, 11, 64, 1}, + {20, 8192, 11, 8192, 0}, + {20, 8704, 11, 256, 1}, + {20, 9216, 11, 256, 1}, + {20, 9728, 11, 256, 1}, + {20, 10240, 11, 64, 1}, + {20, 20480, 11, 2048, 1}, + {20, 30720, 11, 2048, 1}, + {20, 40960, 11, 512, 1}, + {20, 51200, 11, 128, 1}, + {20, 61440, 11, 2048, 1}, + {20, 71680, 11, 128, 1}, + {20, 81920, 11, 16384, 1}, + {20, 92160, 11, 64, 1}, + {20, 102400, 11, 64, 1}, + {20, 204800, 11, 2048, 1}, + {20, 307200, 11, 256, 1}, + {20, 409600, 11, 16384, 1}, + {20, 512000, 11, 256, 1}, + {20, 614400, 11, 8192, 1}, + {20, 716800, 11, 512, 1}, + {20, 819200, 11, 32768, 1}, + {20, 921600, 11, 512, 
0}, + {20, 1024000, 11, 8192, 0}, + {21, 512, 16, 512, 0}, + {21, 1024, 16, 1024, 0}, + {21, 1536, 16, 256, 0}, + {21, 2048, 16, 2048, 0}, + {21, 2560, 16, 128, 0}, + {21, 3072, 16, 64, 1}, + {21, 3584, 16, 64, 0}, + {21, 4096, 16, 4096, 0}, + {21, 4608, 16, 128, 0}, + {21, 5120, 16, 256, 0}, + {21, 5632, 16, 32, 0}, + {21, 6144, 16, 128, 0}, + {21, 6656, 16, 64, 1}, + {21, 7168, 16, 128, 1}, + {21, 7680, 16, 32, 0}, + {21, 8192, 16, 4096, 1}, + {21, 8704, 16, 64, 0}, + {21, 9216, 16, 64, 1}, + {21, 9728, 16, 512, 1}, + {21, 10240, 16, 256, 0}, + {21, 20480, 16, 512, 1}, + {21, 30720, 16, 32, 1}, + {21, 40960, 16, 64, 1}, + {21, 51200, 16, 64, 1}, + {21, 61440, 16, 32, 1}, + {21, 71680, 16, 1024, 1}, + {21, 81920, 16, 128, 1}, + {21, 92160, 16, 512, 1}, + {21, 102400, 16, 256, 1}, + {21, 204800, 16, 256, 1}, + {21, 307200, 16, 128, 1}, + {21, 409600, 16, 16384, 1}, + {21, 512000, 16, 256, 1}, + {21, 614400, 16, 8192, 1}, + {21, 716800, 16, 1024, 1}, + {21, 819200, 16, 32768, 1}, + {21, 921600, 16, 512, 1}, + {21, 1024000, 16, 1024, 0}, + {28, 512, 25, 64, 0}, + {28, 1024, 25, 256, 0}, + {28, 1536, 25, 64, 0}, + {28, 2048, 25, 2048, 0}, + {28, 2560, 25, 32, 0}, + {28, 3072, 25, 64, 0}, + {28, 3584, 25, 64, 0}, + {28, 4096, 25, 64, 0}, + {28, 4608, 25, 128, 0}, + {28, 5120, 25, 128, 0}, + {28, 5632, 25, 128, 1}, + {28, 6144, 25, 256, 0}, + {28, 6656, 25, 512, 0}, + {28, 7168, 25, 256, 0}, + {28, 7680, 25, 128, 0}, + {28, 8192, 25, 8192, 0}, + {28, 8704, 25, 64, 1}, + {28, 9216, 25, 1024, 0}, + {28, 9728, 25, 64, 0}, + {28, 10240, 25, 256, 1}, + {28, 20480, 25, 128, 1}, + {28, 30720, 25, 32, 1}, + {28, 40960, 25, 2048, 1}, + {28, 51200, 25, 2048, 1}, + {28, 61440, 25, 4096, 1}, + {28, 71680, 25, 512, 1}, + {28, 81920, 25, 8192, 1}, + {28, 92160, 25, 256, 1}, + {28, 102400, 25, 4096, 1}, + {28, 204800, 25, 8192, 1}, + {28, 307200, 25, 2048, 0}, + {28, 409600, 25, 4096, 0}, + {28, 512000, 25, 256, 0}, + {28, 614400, 25, 2048, 0}, + {28, 716800, 25, 512, 0}, + {28, 
819200, 25, 8192, 0}, + {28, 921600, 25, 1024, 0}, + {28, 1024000, 25, 1024, 0}, + {35, 512, 24, 512, 0}, + {35, 1024, 24, 1024, 0}, + {35, 1536, 24, 512, 0}, + {35, 2048, 24, 2048, 0}, + {35, 2560, 24, 128, 0}, + {35, 3072, 24, 128, 0}, + {35, 3584, 24, 64, 0}, + {35, 4096, 24, 256, 1}, + {35, 4608, 24, 64, 1}, + {35, 5120, 24, 512, 1}, + {35, 5632, 24, 32, 1}, + {35, 6144, 24, 32, 1}, + {35, 6656, 24, 32, 1}, + {35, 7168, 24, 256, 1}, + {35, 7680, 24, 128, 1}, + {35, 8192, 24, 256, 1}, + {35, 8704, 24, 64, 1}, + {35, 9216, 24, 256, 1}, + {35, 9728, 24, 64, 1}, + {35, 10240, 24, 128, 1}, + {35, 20480, 24, 1024, 1}, + {35, 30720, 24, 32, 1}, + {35, 40960, 24, 8192, 1}, + {35, 51200, 24, 1024, 1}, + {35, 61440, 24, 2048, 1}, + {35, 71680, 24, 2048, 1}, + {35, 81920, 24, 8192, 1}, + {35, 92160, 24, 256, 1}, + {35, 102400, 24, 4096, 1}, + {35, 204800, 24, 8192, 1}, + {35, 307200, 24, 1024, 1}, + {35, 409600, 24, 16384, 1}, + {35, 512000, 24, 1024, 1}, + {35, 614400, 24, 2048, 1}, + {35, 716800, 24, 716800, 0}, + {35, 819200, 24, 32768, 1}, + {35, 921600, 24, 921600, 0}, + {35, 1024000, 24, 1024000, 0}, + {36, 512, 33, 512, 0}, + {36, 1024, 33, 1024, 0}, + {36, 1536, 33, 128, 0}, + {36, 2048, 33, 128, 0}, + {36, 2560, 33, 64, 0}, + {36, 3072, 33, 64, 0}, + {36, 3584, 33, 128, 0}, + {36, 4096, 33, 2048, 1}, + {36, 4608, 33, 128, 0}, + {36, 5120, 33, 1024, 0}, + {36, 5632, 33, 512, 1}, + {36, 6144, 33, 2048, 0}, + {36, 6656, 33, 256, 1}, + {36, 7168, 33, 512, 1}, + {36, 7680, 33, 64, 1}, + {36, 8192, 33, 4096, 1}, + {36, 8704, 33, 512, 1}, + {36, 9216, 33, 1024, 1}, + {36, 9728, 33, 256, 1}, + {36, 10240, 33, 512, 0}, + {36, 20480, 33, 4096, 1}, + {36, 30720, 33, 128, 1}, + {36, 40960, 33, 8192, 1}, + {36, 51200, 33, 256, 1}, + {36, 61440, 33, 2048, 1}, + {36, 71680, 33, 256, 1}, + {36, 81920, 33, 16384, 1}, + {36, 92160, 33, 256, 1}, + {36, 102400, 33, 4096, 1}, + {36, 204800, 33, 2048, 0}, + {36, 307200, 33, 2048, 0}, + {36, 409600, 33, 128, 0}, + {36, 512000, 33, 
1024, 0}, + {36, 614400, 33, 614400, 0}, + {36, 716800, 33, 1024, 0}, + {36, 819200, 33, 4096, 0}, + {36, 921600, 33, 2048, 0}, + {36, 1024000, 33, 2048, 0}, + {45, 512, 42, 512, 0}, + {45, 1024, 42, 1024, 0}, + {45, 1536, 42, 32, 0}, + {45, 2048, 42, 64, 0}, + {45, 2560, 42, 128, 0}, + {45, 3072, 42, 32, 0}, + {45, 3584, 42, 512, 1}, + {45, 4096, 42, 128, 0}, + {45, 4608, 42, 64, 1}, + {45, 5120, 42, 128, 0}, + {45, 5632, 42, 512, 0}, + {45, 6144, 42, 1024, 1}, + {45, 6656, 42, 512, 1}, + {45, 7168, 42, 512, 1}, + {45, 7680, 42, 512, 1}, + {45, 8192, 42, 8192, 1}, + {45, 8704, 42, 64, 1}, + {45, 9216, 42, 1024, 1}, + {45, 9728, 42, 64, 1}, + {45, 10240, 42, 256, 1}, + {45, 20480, 42, 4096, 1}, + {45, 30720, 42, 512, 1}, + {45, 40960, 42, 4096, 1}, + {45, 51200, 42, 1024, 1}, + {45, 61440, 42, 4096, 1}, + {45, 71680, 42, 1024, 1}, + {45, 81920, 42, 8192, 1}, + {45, 92160, 42, 512, 1}, + {45, 102400, 42, 4096, 1}, + {45, 204800, 42, 4096, 0}, + {45, 307200, 42, 4096, 0}, + {45, 409600, 42, 4096, 0}, + {45, 512000, 42, 1024, 0}, + {45, 614400, 42, 2048, 0}, + {45, 716800, 42, 2048, 0}, + {45, 819200, 42, 32768, 0}, + {45, 921600, 42, 4096, 0}, + {45, 1024000, 42, 512, 0}, + {56, 512, 43, 512, 0}, + {56, 1024, 43, 32, 0}, + {56, 1536, 43, 64, 0}, + {56, 2048, 43, 1024, 0}, + {56, 2560, 43, 128, 0}, + {56, 3072, 43, 128, 1}, + {56, 3584, 43, 128, 0}, + {56, 4096, 43, 128, 0}, + {56, 4608, 43, 256, 0}, + {56, 5120, 43, 256, 0}, + {56, 5632, 43, 256, 1}, + {56, 6144, 43, 1024, 1}, + {56, 6656, 43, 128, 1}, + {56, 7168, 43, 1024, 1}, + {56, 7680, 43, 256, 1}, + {56, 8192, 43, 1024, 1}, + {56, 8704, 43, 256, 1}, + {56, 9216, 43, 1024, 1}, + {56, 9728, 43, 512, 1}, + {56, 10240, 43, 1024, 1}, + {56, 20480, 43, 4096, 1}, + {56, 30720, 43, 2048, 1}, + {56, 40960, 43, 8192, 1}, + {56, 51200, 43, 512, 1}, + {56, 61440, 43, 512, 1}, + {56, 71680, 43, 512, 1}, + {56, 81920, 43, 8192, 1}, + {56, 92160, 43, 512, 1}, + {56, 102400, 43, 512, 1}, + {56, 204800, 43, 4096, 0}, + {56, 
307200, 43, 307200, 0}, + {56, 409600, 43, 16384, 0}, + {56, 512000, 43, 4096, 0}, + {56, 614400, 43, 614400, 0}, + {56, 716800, 43, 4096, 0}, + {56, 819200, 43, 819200, 0}, + {56, 921600, 43, 921600, 0}, + {56, 1024000, 43, 8192, 0}, + {84, 512, 126, 32, 0}, + {84, 1024, 126, 512, 0}, + {84, 1536, 126, 256, 0}, + {84, 2048, 126, 1024, 0}, + {84, 2560, 126, 256, 0}, + {84, 3072, 126, 64, 1}, + {84, 3584, 126, 512, 0}, + {84, 4096, 126, 4096, 1}, + {84, 4608, 126, 64, 1}, + {84, 5120, 126, 512, 1}, + {84, 5632, 126, 64, 1}, + {84, 6144, 126, 2048, 1}, + {84, 6656, 126, 64, 1}, + {84, 7168, 126, 512, 1}, + {84, 7680, 126, 256, 1}, + {84, 8192, 126, 64, 1}, + {84, 8704, 126, 512, 1}, + {84, 9216, 126, 32, 1}, + {84, 9728, 126, 256, 1}, + {84, 10240, 126, 256, 1}, + {84, 20480, 126, 64, 1}, + {84, 30720, 126, 512, 0}, + {84, 40960, 126, 128, 0}, + {84, 51200, 126, 2048, 0}, + {84, 61440, 126, 512, 0}, + {84, 71680, 126, 128, 0}, + {84, 81920, 126, 4096, 0}, + {84, 92160, 126, 2048, 0}, + {84, 102400, 126, 2048, 0}, + {84, 204800, 126, 256, 0}, + {84, 307200, 126, 512, 0}, + {84, 409600, 126, 512, 0}, + {84, 512000, 126, 512, 0}, + {84, 614400, 126, 64, 0}, + {84, 716800, 126, 64, 0}, + {84, 819200, 126, 64, 0}, + {84, 921600, 126, 128, 0}, + {84, 1024000, 126, 128, 0}, + {120, 512, 210, 256, 0}, + {120, 1024, 210, 1024, 0}, + {120, 1536, 210, 512, 0}, + {120, 2048, 210, 512, 0}, + {120, 2560, 210, 256, 0}, + {120, 3072, 210, 512, 0}, + {120, 3584, 210, 512, 0}, + {120, 4096, 210, 512, 0}, + {120, 4608, 210, 512, 1}, + {120, 5120, 210, 512, 0}, + {120, 5632, 210, 512, 1}, + {120, 6144, 210, 256, 0}, + {120, 6656, 210, 64, 0}, + {120, 7168, 210, 512, 0}, + {120, 7680, 210, 512, 0}, + {120, 8192, 210, 8192, 0}, + {120, 8704, 210, 512, 0}, + {120, 9216, 210, 1024, 0}, + {120, 9728, 210, 512, 0}, + {120, 10240, 210, 1024, 0}, + {120, 20480, 210, 4096, 0}, + {120, 30720, 210, 30720, 0}, + {120, 40960, 210, 512, 0}, + {120, 51200, 210, 51200, 0}, + {120, 61440, 210, 61440, 
0}, + {120, 71680, 210, 71680, 0}, + {120, 81920, 210, 81920, 0}, + {120, 92160, 210, 92160, 0}, + {120, 102400, 210, 102400, 0}, + {120, 204800, 210, 204800, 0}, + {120, 307200, 210, 307200, 0}, + {120, 409600, 210, 409600, 0}, + {120, 512000, 210, 512, 0}, + {120, 614400, 210, 1024, 0}, + {120, 716800, 210, 1024, 0}, + {120, 819200, 210, 8192, 0}, + {120, 921600, 210, 1024, 0}, + {120, 1024000, 210, 8192, 0}, + {165, 512, 330, 256, 0}, + {165, 1024, 330, 1024, 0}, + {165, 1536, 330, 512, 0}, + {165, 2048, 330, 2048, 0}, + {165, 2560, 330, 512, 0}, + {165, 3072, 330, 512, 0}, + {165, 3584, 330, 512, 0}, + {165, 4096, 330, 4096, 0}, + {165, 4608, 330, 256, 0}, + {165, 5120, 330, 256, 0}, + {165, 5632, 330, 512, 0}, + {165, 6144, 330, 1024, 0}, + {165, 6656, 330, 6656, 0}, + {165, 7168, 330, 256, 0}, + {165, 7680, 330, 128, 0}, + {165, 8192, 330, 8192, 0}, + {165, 8704, 330, 512, 0}, + {165, 9216, 330, 1024, 0}, + {165, 9728, 330, 512, 0}, + {165, 10240, 330, 256, 0}, + {165, 20480, 330, 20480, 0}, + {165, 30720, 330, 30720, 0}, + {165, 40960, 330, 40960, 0}, + {165, 51200, 330, 51200, 0}, + {165, 61440, 330, 61440, 0}, + {165, 71680, 330, 71680, 0}, + {165, 81920, 330, 81920, 0}, + {165, 92160, 330, 92160, 0}, + {165, 102400, 330, 102400, 0}, + {165, 204800, 330, 204800, 0}, + {165, 307200, 330, 307200, 0}, + {165, 409600, 330, 409600, 0}, + {165, 512000, 330, 512000, 0}, + {165, 614400, 330, 614400, 1}, + {165, 716800, 330, 716800, 1}, + {165, 819200, 330, 819200, 1}, + {165, 921600, 330, 921600, 1}, + {165, 1024000, 330, 1024000, 0} }; //////////////////////////////////////////////////////////////////////////////// -std::vector< std::array > dgemm_tn_a100 = -{ - {1 , 512 , 3 , 512 , 0 }, - {1 , 1024 , 3 , 256 , 0 }, - {1 , 1536 , 3 , 64 , 0 }, - {1 , 2048 , 3 , 256 , 0 }, - {1 , 2560 , 3 , 256 , 0 }, - {1 , 3072 , 3 , 256 , 0 }, - {1 , 3584 , 3 , 32 , 0 }, - {1 , 4096 , 3 , 2048 , 0 }, - {1 , 4608 , 3 , 32 , 0 }, - {1 , 5120 , 3 , 32 , 0 }, - {1 , 5632 , 3 , 512 
, 0 }, - {1 , 6144 , 3 , 2048 , 0 }, - {1 , 6656 , 3 , 128 , 0 }, - {1 , 7168 , 3 , 128 , 0 }, - {1 , 7680 , 3 , 64 , 0 }, - {1 , 8192 , 3 , 8192 , 0 }, - {1 , 8704 , 3 , 64 , 0 }, - {1 , 9216 , 3 , 32 , 0 }, - {1 , 9728 , 3 , 512 , 0 }, - {1 , 10240 , 3 , 2048 , 0 }, - {1 , 20480 , 3 , 1024 , 0 }, - {1 , 30720 , 3 , 64 , 0 }, - {1 , 40960 , 3 , 4096 , 0 }, - {1 , 51200 , 3 , 256 , 0 }, - {1 , 61440 , 3 , 4096 , 1 }, - {1 , 71680 , 3 , 512 , 1 }, - {1 , 81920 , 3 , 8192 , 1 }, - {1 , 92160 , 3 , 2048 , 1 }, - {1 , 102400 , 3 , 512 , 1 }, - {1 , 204800 , 3 , 2048 , 1 }, - {1 , 307200 , 3 , 512 , 1 }, - {1 , 409600 , 3 , 16384 , 1 }, - {1 , 512000 , 3 , 512000 , 0 }, - {1 , 614400 , 3 , 614400 , 0 }, - {1 , 716800 , 3 , 716800 , 0 }, - {1 , 819200 , 3 , 819200 , 0 }, - {1 , 921600 , 3 , 921600 , 0 }, - {1 , 1024000, 3 , 1024000, 0 }, - {1 , 512 , 4 , 256 , 0 }, - {1 , 1024 , 4 , 256 , 0 }, - {1 , 1536 , 4 , 128 , 0 }, - {1 , 2048 , 4 , 2048 , 0 }, - {1 , 2560 , 4 , 256 , 0 }, - {1 , 3072 , 4 , 32 , 0 }, - {1 , 3584 , 4 , 32 , 0 }, - {1 , 4096 , 4 , 512 , 0 }, - {1 , 4608 , 4 , 64 , 0 }, - {1 , 5120 , 4 , 128 , 0 }, - {1 , 5632 , 4 , 64 , 0 }, - {1 , 6144 , 4 , 2048 , 0 }, - {1 , 6656 , 4 , 512 , 0 }, - {1 , 7168 , 4 , 128 , 0 }, - {1 , 7680 , 4 , 64 , 0 }, - {1 , 8192 , 4 , 8192 , 0 }, - {1 , 8704 , 4 , 32 , 0 }, - {1 , 9216 , 4 , 128 , 0 }, - {1 , 9728 , 4 , 32 , 0 }, - {1 , 10240 , 4 , 1024 , 0 }, - {1 , 20480 , 4 , 32 , 0 }, - {1 , 30720 , 4 , 256 , 0 }, - {1 , 40960 , 4 , 8192 , 0 }, - {1 , 51200 , 4 , 2048 , 1 }, - {1 , 61440 , 4 , 512 , 1 }, - {1 , 71680 , 4 , 256 , 1 }, - {1 , 81920 , 4 , 4096 , 1 }, - {1 , 92160 , 4 , 2048 , 1 }, - {1 , 102400 , 4 , 2048 , 1 }, - {1 , 204800 , 4 , 256 , 1 }, - {1 , 307200 , 4 , 2048 , 1 }, - {1 , 409600 , 4 , 8192 , 1 }, - {1 , 512000 , 4 , 512000 , 0 }, - {1 , 614400 , 4 , 614400 , 0 }, - {1 , 716800 , 4 , 716800 , 0 }, - {1 , 819200 , 4 , 819200 , 0 }, - {1 , 921600 , 4 , 921600 , 0 }, - {1 , 1024000, 4 , 1024000, 0 }, - {3 
, 512 , 6 , 32 , 1 }, - {3 , 1024 , 6 , 1024 , 0 }, - {3 , 1536 , 6 , 32 , 0 }, - {3 , 2048 , 6 , 2048 , 0 }, - {3 , 2560 , 6 , 256 , 1 }, - {3 , 3072 , 6 , 64 , 1 }, - {3 , 3584 , 6 , 32 , 1 }, - {3 , 4096 , 6 , 4096 , 0 }, - {3 , 4608 , 6 , 128 , 1 }, - {3 , 5120 , 6 , 1024 , 1 }, - {3 , 5632 , 6 , 128 , 1 }, - {3 , 6144 , 6 , 128 , 1 }, - {3 , 6656 , 6 , 64 , 1 }, - {3 , 7168 , 6 , 32 , 1 }, - {3 , 7680 , 6 , 32 , 1 }, - {3 , 8192 , 6 , 8192 , 0 }, - {3 , 8704 , 6 , 32 , 1 }, - {3 , 9216 , 6 , 128 , 1 }, - {3 , 9728 , 6 , 32 , 1 }, - {3 , 10240 , 6 , 2048 , 1 }, - {3 , 20480 , 6 , 1024 , 1 }, - {3 , 30720 , 6 , 128 , 1 }, - {3 , 40960 , 6 , 512 , 1 }, - {3 , 51200 , 6 , 512 , 1 }, - {3 , 61440 , 6 , 128 , 1 }, - {3 , 71680 , 6 , 1024 , 1 }, - {3 , 81920 , 6 , 128 , 1 }, - {3 , 92160 , 6 , 256 , 1 }, - {3 , 102400 , 6 , 256 , 1 }, - {3 , 204800 , 6 , 2048 , 1 }, - {3 , 307200 , 6 , 4096 , 1 }, - {3 , 409600 , 6 , 8192 , 1 }, - {3 , 512000 , 6 , 4096 , 1 }, - {3 , 614400 , 6 , 4096 , 1 }, - {3 , 716800 , 6 , 2048 , 1 }, - {3 , 819200 , 6 , 32768 , 1 }, - {3 , 921600 , 6 , 4096 , 1 }, - {3 , 1024000, 6 , 8192 , 1 }, - {4 , 512 , 10 , 512 , 1 }, - {4 , 1024 , 10 , 1024 , 0 }, - {4 , 1536 , 10 , 32 , 1 }, - {4 , 2048 , 10 , 2048 , 0 }, - {4 , 2560 , 10 , 256 , 1 }, - {4 , 3072 , 10 , 256 , 1 }, - {4 , 3584 , 10 , 256 , 1 }, - {4 , 4096 , 10 , 4096 , 0 }, - {4 , 4608 , 10 , 128 , 1 }, - {4 , 5120 , 10 , 256 , 0 }, - {4 , 5632 , 10 , 512 , 1 }, - {4 , 6144 , 10 , 128 , 1 }, - {4 , 6656 , 10 , 128 , 1 }, - {4 , 7168 , 10 , 1024 , 1 }, - {4 , 7680 , 10 , 256 , 1 }, - {4 , 8192 , 10 , 8192 , 0 }, - {4 , 8704 , 10 , 64 , 1 }, - {4 , 9216 , 10 , 1024 , 1 }, - {4 , 9728 , 10 , 32 , 1 }, - {4 , 10240 , 10 , 1024 , 1 }, - {4 , 20480 , 10 , 64 , 1 }, - {4 , 30720 , 10 , 256 , 1 }, - {4 , 40960 , 10 , 128 , 1 }, - {4 , 51200 , 10 , 64 , 1 }, - {4 , 61440 , 10 , 128 , 1 }, - {4 , 71680 , 10 , 256 , 1 }, - {4 , 81920 , 10 , 8192 , 1 }, - {4 , 92160 , 10 , 256 , 1 }, - {4 , 102400 
, 10 , 4096 , 1 }, - {4 , 204800 , 10 , 512 , 1 }, - {4 , 307200 , 10 , 4096 , 1 }, - {4 , 409600 , 10 , 8192 , 1 }, - {4 , 512000 , 10 , 2048 , 1 }, - {4 , 614400 , 10 , 4096 , 1 }, - {4 , 716800 , 10 , 4096 , 1 }, - {4 , 819200 , 10 , 32768 , 1 }, - {4 , 921600 , 10 , 4096 , 1 }, - {4 , 1024000, 10 , 8192 , 1 }, - {6 , 512 , 10 , 32 , 1 }, - {6 , 1024 , 10 , 1024 , 0 }, - {6 , 1536 , 10 , 32 , 0 }, - {6 , 2048 , 10 , 2048 , 0 }, - {6 , 2560 , 10 , 256 , 0 }, - {6 , 3072 , 10 , 64 , 1 }, - {6 , 3584 , 10 , 64 , 1 }, - {6 , 4096 , 10 , 4096 , 0 }, - {6 , 4608 , 10 , 256 , 1 }, - {6 , 5120 , 10 , 64 , 1 }, - {6 , 5632 , 10 , 512 , 1 }, - {6 , 6144 , 10 , 32 , 1 }, - {6 , 6656 , 10 , 64 , 1 }, - {6 , 7168 , 10 , 1024 , 1 }, - {6 , 7680 , 10 , 256 , 1 }, - {6 , 8192 , 10 , 8192 , 0 }, - {6 , 8704 , 10 , 256 , 1 }, - {6 , 9216 , 10 , 512 , 1 }, - {6 , 9728 , 10 , 256 , 1 }, - {6 , 10240 , 10 , 32 , 1 }, - {6 , 20480 , 10 , 64 , 1 }, - {6 , 30720 , 10 , 512 , 1 }, - {6 , 40960 , 10 , 8192 , 1 }, - {6 , 51200 , 10 , 2048 , 1 }, - {6 , 61440 , 10 , 1024 , 1 }, - {6 , 71680 , 10 , 512 , 1 }, - {6 , 81920 , 10 , 1024 , 1 }, - {6 , 92160 , 10 , 1024 , 1 }, - {6 , 102400 , 10 , 256 , 1 }, - {6 , 204800 , 10 , 8192 , 1 }, - {6 , 307200 , 10 , 4096 , 1 }, - {6 , 409600 , 10 , 16384 , 1 }, - {6 , 512000 , 10 , 2048 , 1 }, - {6 , 614400 , 10 , 8192 , 1 }, - {6 , 716800 , 10 , 4096 , 1 }, - {6 , 819200 , 10 , 32768 , 1 }, - {6 , 921600 , 10 , 4096 , 1 }, - {6 , 1024000, 10 , 8192 , 1 }, - {12 , 512 , 15 , 512 , 1 }, - {12 , 1024 , 15 , 1024 , 0 }, - {12 , 1536 , 15 , 64 , 0 }, - {12 , 2048 , 15 , 2048 , 0 }, - {12 , 2560 , 15 , 512 , 0 }, - {12 , 3072 , 15 , 512 , 1 }, - {12 , 3584 , 15 , 256 , 1 }, - {12 , 4096 , 15 , 4096 , 0 }, - {12 , 4608 , 15 , 256 , 1 }, - {12 , 5120 , 15 , 128 , 1 }, - {12 , 5632 , 15 , 64 , 0 }, - {12 , 6144 , 15 , 512 , 1 }, - {12 , 6656 , 15 , 256 , 1 }, - {12 , 7168 , 15 , 64 , 1 }, - {12 , 7680 , 15 , 32 , 0 }, - {12 , 8192 , 15 , 8192 , 0 }, - {12 , 
8704 , 15 , 64 , 1 }, - {12 , 9216 , 15 , 256 , 1 }, - {12 , 9728 , 15 , 32 , 1 }, - {12 , 10240 , 15 , 32 , 1 }, - {12 , 20480 , 15 , 2048 , 1 }, - {12 , 30720 , 15 , 2048 , 1 }, - {12 , 40960 , 15 , 512 , 1 }, - {12 , 51200 , 15 , 2048 , 1 }, - {12 , 61440 , 15 , 2048 , 1 }, - {12 , 71680 , 15 , 32 , 1 }, - {12 , 81920 , 15 , 16384 , 1 }, - {12 , 92160 , 15 , 256 , 1 }, - {12 , 102400 , 15 , 2048 , 1 }, - {12 , 204800 , 15 , 512 , 1 }, - {12 , 307200 , 15 , 2048 , 1 }, - {12 , 409600 , 15 , 16384 , 1 }, - {12 , 512000 , 15 , 4096 , 1 }, - {12 , 614400 , 15 , 4096 , 1 }, - {12 , 716800 , 15 , 512 , 1 }, - {12 , 819200 , 15 , 32768 , 1 }, - {12 , 921600 , 15 , 512 , 1 }, - {12 , 1024000, 15 , 8192 , 1 }, - {11 , 512 , 20 , 128 , 1 }, - {11 , 1024 , 20 , 1024 , 0 }, - {11 , 1536 , 20 , 512 , 1 }, - {11 , 2048 , 20 , 2048 , 0 }, - {11 , 2560 , 20 , 512 , 1 }, - {11 , 3072 , 20 , 32 , 0 }, - {11 , 3584 , 20 , 128 , 0 }, - {11 , 4096 , 20 , 4096 , 0 }, - {11 , 4608 , 20 , 256 , 0 }, - {11 , 5120 , 20 , 256 , 0 }, - {11 , 5632 , 20 , 128 , 0 }, - {11 , 6144 , 20 , 64 , 0 }, - {11 , 6656 , 20 , 512 , 0 }, - {11 , 7168 , 20 , 512 , 0 }, - {11 , 7680 , 20 , 512 , 1 }, - {11 , 8192 , 20 , 8192 , 0 }, - {11 , 8704 , 20 , 32 , 1 }, - {11 , 9216 , 20 , 256 , 1 }, - {11 , 9728 , 20 , 512 , 1 }, - {11 , 10240 , 20 , 256 , 0 }, - {11 , 20480 , 20 , 256 , 1 }, - {11 , 30720 , 20 , 2048 , 1 }, - {11 , 40960 , 20 , 512 , 1 }, - {11 , 51200 , 20 , 2048 , 1 }, - {11 , 61440 , 20 , 2048 , 1 }, - {11 , 71680 , 20 , 32 , 1 }, - {11 , 81920 , 20 , 2048 , 1 }, - {11 , 92160 , 20 , 64 , 1 }, - {11 , 102400 , 20 , 4096 , 1 }, - {11 , 204800 , 20 , 8192 , 1 }, - {11 , 307200 , 20 , 128 , 1 }, - {11 , 409600 , 20 , 8192 , 1 }, - {11 , 512000 , 20 , 4096 , 1 }, - {11 , 614400 , 20 , 256 , 1 }, - {11 , 716800 , 20 , 512 , 1 }, - {11 , 819200 , 20 , 16384 , 1 }, - {11 , 921600 , 20 , 512 , 1 }, - {11 , 1024000, 20 , 8192 , 1 }, - {16 , 512 , 21 , 32 , 1 }, - {16 , 1024 , 21 , 1024 , 0 }, - {16 , 
1536 , 21 , 128 , 0 }, - {16 , 2048 , 21 , 2048 , 0 }, - {16 , 2560 , 21 , 32 , 0 }, - {16 , 3072 , 21 , 64 , 1 }, - {16 , 3584 , 21 , 64 , 0 }, - {16 , 4096 , 21 , 256 , 0 }, - {16 , 4608 , 21 , 64 , 0 }, - {16 , 5120 , 21 , 512 , 0 }, - {16 , 5632 , 21 , 128 , 0 }, - {16 , 6144 , 21 , 64 , 0 }, - {16 , 6656 , 21 , 256 , 1 }, - {16 , 7168 , 21 , 512 , 0 }, - {16 , 7680 , 21 , 64 , 1 }, - {16 , 8192 , 21 , 8192 , 0 }, - {16 , 8704 , 21 , 256 , 0 }, - {16 , 9216 , 21 , 1024 , 1 }, - {16 , 9728 , 21 , 128 , 1 }, - {16 , 10240 , 21 , 256 , 1 }, - {16 , 20480 , 21 , 4096 , 1 }, - {16 , 30720 , 21 , 2048 , 1 }, - {16 , 40960 , 21 , 1024 , 1 }, - {16 , 51200 , 21 , 128 , 1 }, - {16 , 61440 , 21 , 512 , 1 }, - {16 , 71680 , 21 , 1024 , 1 }, - {16 , 81920 , 21 , 2048 , 1 }, - {16 , 92160 , 21 , 64 , 1 }, - {16 , 102400 , 21 , 4096 , 1 }, - {16 , 204800 , 21 , 8192 , 1 }, - {16 , 307200 , 21 , 128 , 1 }, - {16 , 409600 , 21 , 256 , 1 }, - {16 , 512000 , 21 , 128 , 1 }, - {16 , 614400 , 21 , 8192 , 1 }, - {16 , 716800 , 21 , 1024 , 1 }, - {16 , 819200 , 21 , 32768 , 1 }, - {16 , 921600 , 21 , 1024 , 1 }, - {16 , 1024000, 21 , 512 , 0 }, - {25 , 512 , 28 , 32 , 0 }, - {25 , 1024 , 28 , 1024 , 0 }, - {25 , 1536 , 28 , 64 , 0 }, - {25 , 2048 , 28 , 2048 , 0 }, - {25 , 2560 , 28 , 64 , 0 }, - {25 , 3072 , 28 , 256 , 0 }, - {25 , 3584 , 28 , 128 , 0 }, - {25 , 4096 , 28 , 64 , 0 }, - {25 , 4608 , 28 , 128 , 0 }, - {25 , 5120 , 28 , 128 , 0 }, - {25 , 5632 , 28 , 128 , 1 }, - {25 , 6144 , 28 , 64 , 0 }, - {25 , 6656 , 28 , 128 , 0 }, - {25 , 7168 , 28 , 256 , 0 }, - {25 , 7680 , 28 , 128 , 0 }, - {25 , 8192 , 28 , 512 , 0 }, - {25 , 8704 , 28 , 256 , 1 }, - {25 , 9216 , 28 , 64 , 0 }, - {25 , 9728 , 28 , 256 , 0 }, - {25 , 10240 , 28 , 512 , 1 }, - {25 , 20480 , 28 , 2048 , 1 }, - {25 , 30720 , 28 , 512 , 1 }, - {25 , 40960 , 28 , 1024 , 1 }, - {25 , 51200 , 28 , 128 , 1 }, - {25 , 61440 , 28 , 4096 , 1 }, - {25 , 71680 , 28 , 1024 , 1 }, - {25 , 81920 , 28 , 4096 , 1 }, - {25 , 
92160 , 28 , 256 , 1 }, - {25 , 102400 , 28 , 4096 , 1 }, - {25 , 204800 , 28 , 8192 , 1 }, - {25 , 307200 , 28 , 512 , 1 }, - {25 , 409600 , 28 , 1024 , 0 }, - {25 , 512000 , 28 , 128 , 0 }, - {25 , 614400 , 28 , 1024 , 0 }, - {25 , 716800 , 28 , 4096 , 0 }, - {25 , 819200 , 28 , 256 , 0 }, - {25 , 921600 , 28 , 4096 , 0 }, - {25 , 1024000, 28 , 4096 , 0 }, - {24 , 512 , 35 , 256 , 0 }, - {24 , 1024 , 35 , 1024 , 0 }, - {24 , 1536 , 35 , 64 , 0 }, - {24 , 2048 , 35 , 64 , 0 }, - {24 , 2560 , 35 , 128 , 0 }, - {24 , 3072 , 35 , 128 , 0 }, - {24 , 3584 , 35 , 128 , 0 }, - {24 , 4096 , 35 , 256 , 0 }, - {24 , 4608 , 35 , 64 , 0 }, - {24 , 5120 , 35 , 128 , 0 }, - {24 , 5632 , 35 , 128 , 0 }, - {24 , 6144 , 35 , 256 , 0 }, - {24 , 6656 , 35 , 128 , 0 }, - {24 , 7168 , 35 , 64 , 0 }, - {24 , 7680 , 35 , 128 , 0 }, - {24 , 8192 , 35 , 8192 , 0 }, - {24 , 8704 , 35 , 32 , 0 }, - {24 , 9216 , 35 , 256 , 0 }, - {24 , 9728 , 35 , 256 , 0 }, - {24 , 10240 , 35 , 2048 , 0 }, - {24 , 20480 , 35 , 2048 , 1 }, - {24 , 30720 , 35 , 256 , 1 }, - {24 , 40960 , 35 , 64 , 1 }, - {24 , 51200 , 35 , 128 , 1 }, - {24 , 61440 , 35 , 32 , 1 }, - {24 , 71680 , 35 , 64 , 1 }, - {24 , 81920 , 35 , 256 , 1 }, - {24 , 92160 , 35 , 256 , 1 }, - {24 , 102400 , 35 , 256 , 1 }, - {24 , 204800 , 35 , 128 , 1 }, - {24 , 307200 , 35 , 4096 , 0 }, - {24 , 409600 , 35 , 8192 , 0 }, - {24 , 512000 , 35 , 4096 , 0 }, - {24 , 614400 , 35 , 512 , 0 }, - {24 , 716800 , 35 , 4096 , 0 }, - {24 , 819200 , 35 , 16384 , 0 }, - {24 , 921600 , 35 , 2048 , 0 }, - {24 , 1024000, 35 , 4096 , 0 }, - {33 , 512 , 36 , 512 , 0 }, - {33 , 1024 , 36 , 1024 , 0 }, - {33 , 1536 , 36 , 32 , 0 }, - {33 , 2048 , 36 , 256 , 0 }, - {33 , 2560 , 36 , 128 , 0 }, - {33 , 3072 , 36 , 128 , 0 }, - {33 , 3584 , 36 , 64 , 0 }, - {33 , 4096 , 36 , 64 , 0 }, - {33 , 4608 , 36 , 32 , 1 }, - {33 , 5120 , 36 , 128 , 1 }, - {33 , 5632 , 36 , 512 , 1 }, - {33 , 6144 , 36 , 1024 , 1 }, - {33 , 6656 , 36 , 256 , 1 }, - {33 , 7168 , 36 , 64 , 1 
}, - {33 , 7680 , 36 , 32 , 1 }, - {33 , 8192 , 36 , 1024 , 1 }, - {33 , 8704 , 36 , 512 , 1 }, - {33 , 9216 , 36 , 1024 , 1 }, - {33 , 9728 , 36 , 32 , 1 }, - {33 , 10240 , 36 , 64 , 1 }, - {33 , 20480 , 36 , 4096 , 1 }, - {33 , 30720 , 36 , 2048 , 1 }, - {33 , 40960 , 36 , 8192 , 1 }, - {33 , 51200 , 36 , 128 , 1 }, - {33 , 61440 , 36 , 256 , 1 }, - {33 , 71680 , 36 , 128 , 1 }, - {33 , 81920 , 36 , 256 , 1 }, - {33 , 92160 , 36 , 256 , 1 }, - {33 , 102400 , 36 , 64 , 1 }, - {33 , 204800 , 36 , 8192 , 1 }, - {33 , 307200 , 36 , 1024 , 1 }, - {33 , 409600 , 36 , 16384 , 1 }, - {33 , 512000 , 36 , 2048 , 1 }, - {33 , 614400 , 36 , 8192 , 1 }, - {33 , 716800 , 36 , 1024 , 1 }, - {33 , 819200 , 36 , 32768 , 1 }, - {33 , 921600 , 36 , 4096 , 1 }, - {33 , 1024000, 36 , 1024000, 0 }, - {42 , 512 , 45 , 512 , 0 }, - {42 , 1024 , 45 , 1024 , 0 }, - {42 , 1536 , 45 , 32 , 0 }, - {42 , 2048 , 45 , 2048 , 0 }, - {42 , 2560 , 45 , 128 , 0 }, - {42 , 3072 , 45 , 512 , 1 }, - {42 , 3584 , 45 , 256 , 0 }, - {42 , 4096 , 45 , 128 , 0 }, - {42 , 4608 , 45 , 64 , 1 }, - {42 , 5120 , 45 , 64 , 1 }, - {42 , 5632 , 45 , 32 , 1 }, - {42 , 6144 , 45 , 2048 , 0 }, - {42 , 6656 , 45 , 128 , 1 }, - {42 , 7168 , 45 , 1024 , 1 }, - {42 , 7680 , 45 , 256 , 1 }, - {42 , 8192 , 45 , 2048 , 1 }, - {42 , 8704 , 45 , 512 , 1 }, - {42 , 9216 , 45 , 256 , 1 }, - {42 , 9728 , 45 , 512 , 1 }, - {42 , 10240 , 45 , 1024 , 1 }, - {42 , 20480 , 45 , 128 , 1 }, - {42 , 30720 , 45 , 512 , 1 }, - {42 , 40960 , 45 , 2048 , 1 }, - {42 , 51200 , 45 , 256 , 1 }, - {42 , 61440 , 45 , 4096 , 1 }, - {42 , 71680 , 45 , 128 , 1 }, - {42 , 81920 , 45 , 8192 , 1 }, - {42 , 92160 , 45 , 512 , 1 }, - {42 , 102400 , 45 , 256 , 1 }, - {42 , 204800 , 45 , 8192 , 1 }, - {42 , 307200 , 45 , 2048 , 1 }, - {42 , 409600 , 45 , 16384 , 1 }, - {42 , 512000 , 45 , 4096 , 1 }, - {42 , 614400 , 45 , 614400 , 0 }, - {42 , 716800 , 45 , 716800 , 0 }, - {42 , 819200 , 45 , 819200 , 0 }, - {42 , 921600 , 45 , 921600 , 0 }, - {42 , 
1024000, 45 , 1024000, 0 }, - {43 , 512 , 56 , 512 , 0 }, - {43 , 1024 , 56 , 1024 , 0 }, - {43 , 1536 , 56 , 256 , 0 }, - {43 , 2048 , 56 , 128 , 0 }, - {43 , 2560 , 56 , 32 , 0 }, - {43 , 3072 , 56 , 128 , 0 }, - {43 , 3584 , 56 , 128 , 0 }, - {43 , 4096 , 56 , 128 , 0 }, - {43 , 4608 , 56 , 256 , 0 }, - {43 , 5120 , 56 , 256 , 0 }, - {43 , 5632 , 56 , 512 , 1 }, - {43 , 6144 , 56 , 256 , 1 }, - {43 , 6656 , 56 , 256 , 1 }, - {43 , 7168 , 56 , 128 , 1 }, - {43 , 7680 , 56 , 256 , 1 }, - {43 , 8192 , 56 , 8192 , 1 }, - {43 , 8704 , 56 , 512 , 1 }, - {43 , 9216 , 56 , 512 , 1 }, - {43 , 9728 , 56 , 128 , 1 }, - {43 , 10240 , 56 , 1024 , 1 }, - {43 , 20480 , 56 , 128 , 1 }, - {43 , 30720 , 56 , 256 , 1 }, - {43 , 40960 , 56 , 256 , 1 }, - {43 , 51200 , 56 , 512 , 1 }, - {43 , 61440 , 56 , 4096 , 1 }, - {43 , 71680 , 56 , 512 , 1 }, - {43 , 81920 , 56 , 8192 , 1 }, - {43 , 92160 , 56 , 512 , 1 }, - {43 , 102400 , 56 , 128 , 1 }, - {43 , 204800 , 56 , 512 , 1 }, - {43 , 307200 , 56 , 1024 , 1 }, - {43 , 409600 , 56 , 4096 , 1 }, - {43 , 512000 , 56 , 512000 , 0 }, - {43 , 614400 , 56 , 614400 , 0 }, - {43 , 716800 , 56 , 716800 , 0 }, - {43 , 819200 , 56 , 819200 , 0 }, - {43 , 921600 , 56 , 921600 , 0 }, - {43 , 1024000, 56 , 1024000, 0 }, - {126 , 512 , 84 , 256 , 0 }, - {126 , 1024 , 84 , 1024 , 0 }, - {126 , 1536 , 84 , 64 , 0 }, - {126 , 2048 , 84 , 128 , 0 }, - {126 , 2560 , 84 , 512 , 0 }, - {126 , 3072 , 84 , 1024 , 0 }, - {126 , 3584 , 84 , 128 , 1 }, - {126 , 4096 , 84 , 4096 , 1 }, - {126 , 4608 , 84 , 256 , 1 }, - {126 , 5120 , 84 , 64 , 1 }, - {126 , 5632 , 84 , 128 , 1 }, - {126 , 6144 , 84 , 2048 , 1 }, - {126 , 6656 , 84 , 256 , 1 }, - {126 , 7168 , 84 , 128 , 1 }, - {126 , 7680 , 84 , 64 , 1 }, - {126 , 8192 , 84 , 128 , 1 }, - {126 , 8704 , 84 , 512 , 1 }, - {126 , 9216 , 84 , 128 , 1 }, - {126 , 9728 , 84 , 128 , 1 }, - {126 , 10240 , 84 , 2048 , 1 }, - {126 , 20480 , 84 , 256 , 1 }, - {126 , 30720 , 84 , 512 , 0 }, - {126 , 40960 , 84 , 8192 , 0 }, 
- {126 , 51200 , 84 , 128 , 0 }, - {126 , 61440 , 84 , 2048 , 0 }, - {126 , 71680 , 84 , 2048 , 0 }, - {126 , 81920 , 84 , 81920 , 0 }, - {126 , 92160 , 84 , 92160 , 0 }, - {126 , 102400 , 84 , 102400 , 0 }, - {126 , 204800 , 84 , 204800 , 0 }, - {126 , 307200 , 84 , 307200 , 0 }, - {126 , 409600 , 84 , 409600 , 0 }, - {126 , 512000 , 84 , 512000 , 0 }, - {126 , 614400 , 84 , 614400 , 0 }, - {126 , 716800 , 84 , 716800 , 0 }, - {126 , 819200 , 84 , 819200 , 0 }, - {126 , 921600 , 84 , 921600 , 0 }, - {126 , 1024000, 84 , 1024000, 0 }, - {210 , 512 , 120 , 512 , 0 }, - {210 , 1024 , 120 , 256 , 0 }, - {210 , 1536 , 120 , 256 , 0 }, - {210 , 2048 , 120 , 2048 , 1 }, - {210 , 2560 , 120 , 256 , 1 }, - {210 , 3072 , 120 , 128 , 1 }, - {210 , 3584 , 120 , 128 , 1 }, - {210 , 4096 , 120 , 256 , 1 }, - {210 , 4608 , 120 , 32 , 1 }, - {210 , 5120 , 120 , 128 , 1 }, - {210 , 5632 , 120 , 512 , 1 }, - {210 , 6144 , 120 , 512 , 1 }, - {210 , 6656 , 120 , 256 , 1 }, - {210 , 7168 , 120 , 256 , 1 }, - {210 , 7680 , 120 , 256 , 0 }, - {210 , 8192 , 120 , 1024 , 0 }, - {210 , 8704 , 120 , 512 , 1 }, - {210 , 9216 , 120 , 1024 , 0 }, - {210 , 9728 , 120 , 128 , 0 }, - {210 , 10240 , 120 , 2048 , 0 }, - {210 , 20480 , 120 , 2048 , 0 }, - {210 , 30720 , 120 , 256 , 0 }, - {210 , 40960 , 120 , 8192 , 0 }, - {210 , 51200 , 120 , 128 , 0 }, - {210 , 61440 , 120 , 4096 , 0 }, - {210 , 71680 , 120 , 71680 , 0 }, - {210 , 81920 , 120 , 81920 , 0 }, - {210 , 92160 , 120 , 92160 , 0 }, - {210 , 102400 , 120 , 102400 , 0 }, - {210 , 204800 , 120 , 204800 , 0 }, - {210 , 307200 , 120 , 307200 , 0 }, - {210 , 409600 , 120 , 409600 , 0 }, - {210 , 512000 , 120 , 256 , 0 }, - {210 , 614400 , 120 , 128 , 0 }, - {210 , 716800 , 120 , 1024 , 0 }, - {210 , 819200 , 120 , 2048 , 0 }, - {210 , 921600 , 120 , 4096 , 0 }, - {210 , 1024000, 120 , 8192 , 0 }, - {330 , 512 , 165 , 512 , 0 }, - {330 , 1024 , 165 , 1024 , 0 }, - {330 , 1536 , 165 , 128 , 1 }, - {330 , 2048 , 165 , 1024 , 1 }, - {330 , 2560 , 
165 , 512 , 1 }, - {330 , 3072 , 165 , 256 , 1 }, - {330 , 3584 , 165 , 512 , 1 }, - {330 , 4096 , 165 , 512 , 1 }, - {330 , 4608 , 165 , 256 , 1 }, - {330 , 5120 , 165 , 1024 , 1 }, - {330 , 5632 , 165 , 512 , 1 }, - {330 , 6144 , 165 , 1024 , 1 }, - {330 , 6656 , 165 , 512 , 0 }, - {330 , 7168 , 165 , 1024 , 0 }, - {330 , 7680 , 165 , 7680 , 0 }, - {330 , 8192 , 165 , 8192 , 0 }, - {330 , 8704 , 165 , 512 , 0 }, - {330 , 9216 , 165 , 256 , 0 }, - {330 , 9728 , 165 , 9728 , 0 }, - {330 , 10240 , 165 , 10240 , 0 }, - {330 , 20480 , 165 , 20480 , 0 }, - {330 , 30720 , 165 , 30720 , 0 }, - {330 , 40960 , 165 , 4096 , 0 }, - {330 , 51200 , 165 , 51200 , 0 }, - {330 , 61440 , 165 , 1024 , 0 }, - {330 , 71680 , 165 , 2048 , 0 }, - {330 , 81920 , 165 , 16384 , 0 }, - {330 , 92160 , 165 , 2048 , 0 }, - {330 , 102400 , 165 , 1024 , 0 }, - {330 , 204800 , 165 , 2048 , 0 }, - {330 , 307200 , 165 , 128 , 0 }, - {330 , 409600 , 165 , 128 , 0 }, - {330 , 512000 , 165 , 1024 , 0 }, - {330 , 614400 , 165 , 1024 , 0 }, - {330 , 716800 , 165 , 1024 , 0 }, - {330 , 819200 , 165 , 4096 , 0 }, - {330 , 921600 , 165 , 4096 , 0 }, - {330 , 1024000, 165 , 4096 , 0 } +std::vector > dgemm_tn_a100 = { + {1, 512, 3, 512, 0}, + {1, 1024, 3, 256, 0}, + {1, 1536, 3, 64, 0}, + {1, 2048, 3, 256, 0}, + {1, 2560, 3, 256, 0}, + {1, 3072, 3, 256, 0}, + {1, 3584, 3, 32, 0}, + {1, 4096, 3, 2048, 0}, + {1, 4608, 3, 32, 0}, + {1, 5120, 3, 32, 0}, + {1, 5632, 3, 512, 0}, + {1, 6144, 3, 2048, 0}, + {1, 6656, 3, 128, 0}, + {1, 7168, 3, 128, 0}, + {1, 7680, 3, 64, 0}, + {1, 8192, 3, 8192, 0}, + {1, 8704, 3, 64, 0}, + {1, 9216, 3, 32, 0}, + {1, 9728, 3, 512, 0}, + {1, 10240, 3, 2048, 0}, + {1, 20480, 3, 1024, 0}, + {1, 30720, 3, 64, 0}, + {1, 40960, 3, 4096, 0}, + {1, 51200, 3, 256, 0}, + {1, 61440, 3, 4096, 1}, + {1, 71680, 3, 512, 1}, + {1, 81920, 3, 8192, 1}, + {1, 92160, 3, 2048, 1}, + {1, 102400, 3, 512, 1}, + {1, 204800, 3, 2048, 1}, + {1, 307200, 3, 512, 1}, + {1, 409600, 3, 16384, 1}, + {1, 512000, 3, 
512000, 0}, + {1, 614400, 3, 614400, 0}, + {1, 716800, 3, 716800, 0}, + {1, 819200, 3, 819200, 0}, + {1, 921600, 3, 921600, 0}, + {1, 1024000, 3, 1024000, 0}, + {1, 512, 4, 256, 0}, + {1, 1024, 4, 256, 0}, + {1, 1536, 4, 128, 0}, + {1, 2048, 4, 2048, 0}, + {1, 2560, 4, 256, 0}, + {1, 3072, 4, 32, 0}, + {1, 3584, 4, 32, 0}, + {1, 4096, 4, 512, 0}, + {1, 4608, 4, 64, 0}, + {1, 5120, 4, 128, 0}, + {1, 5632, 4, 64, 0}, + {1, 6144, 4, 2048, 0}, + {1, 6656, 4, 512, 0}, + {1, 7168, 4, 128, 0}, + {1, 7680, 4, 64, 0}, + {1, 8192, 4, 8192, 0}, + {1, 8704, 4, 32, 0}, + {1, 9216, 4, 128, 0}, + {1, 9728, 4, 32, 0}, + {1, 10240, 4, 1024, 0}, + {1, 20480, 4, 32, 0}, + {1, 30720, 4, 256, 0}, + {1, 40960, 4, 8192, 0}, + {1, 51200, 4, 2048, 1}, + {1, 61440, 4, 512, 1}, + {1, 71680, 4, 256, 1}, + {1, 81920, 4, 4096, 1}, + {1, 92160, 4, 2048, 1}, + {1, 102400, 4, 2048, 1}, + {1, 204800, 4, 256, 1}, + {1, 307200, 4, 2048, 1}, + {1, 409600, 4, 8192, 1}, + {1, 512000, 4, 512000, 0}, + {1, 614400, 4, 614400, 0}, + {1, 716800, 4, 716800, 0}, + {1, 819200, 4, 819200, 0}, + {1, 921600, 4, 921600, 0}, + {1, 1024000, 4, 1024000, 0}, + {3, 512, 6, 32, 1}, + {3, 1024, 6, 1024, 0}, + {3, 1536, 6, 32, 0}, + {3, 2048, 6, 2048, 0}, + {3, 2560, 6, 256, 1}, + {3, 3072, 6, 64, 1}, + {3, 3584, 6, 32, 1}, + {3, 4096, 6, 4096, 0}, + {3, 4608, 6, 128, 1}, + {3, 5120, 6, 1024, 1}, + {3, 5632, 6, 128, 1}, + {3, 6144, 6, 128, 1}, + {3, 6656, 6, 64, 1}, + {3, 7168, 6, 32, 1}, + {3, 7680, 6, 32, 1}, + {3, 8192, 6, 8192, 0}, + {3, 8704, 6, 32, 1}, + {3, 9216, 6, 128, 1}, + {3, 9728, 6, 32, 1}, + {3, 10240, 6, 2048, 1}, + {3, 20480, 6, 1024, 1}, + {3, 30720, 6, 128, 1}, + {3, 40960, 6, 512, 1}, + {3, 51200, 6, 512, 1}, + {3, 61440, 6, 128, 1}, + {3, 71680, 6, 1024, 1}, + {3, 81920, 6, 128, 1}, + {3, 92160, 6, 256, 1}, + {3, 102400, 6, 256, 1}, + {3, 204800, 6, 2048, 1}, + {3, 307200, 6, 4096, 1}, + {3, 409600, 6, 8192, 1}, + {3, 512000, 6, 4096, 1}, + {3, 614400, 6, 4096, 1}, + {3, 716800, 6, 2048, 1}, + {3, 
819200, 6, 32768, 1}, + {3, 921600, 6, 4096, 1}, + {3, 1024000, 6, 8192, 1}, + {4, 512, 10, 512, 1}, + {4, 1024, 10, 1024, 0}, + {4, 1536, 10, 32, 1}, + {4, 2048, 10, 2048, 0}, + {4, 2560, 10, 256, 1}, + {4, 3072, 10, 256, 1}, + {4, 3584, 10, 256, 1}, + {4, 4096, 10, 4096, 0}, + {4, 4608, 10, 128, 1}, + {4, 5120, 10, 256, 0}, + {4, 5632, 10, 512, 1}, + {4, 6144, 10, 128, 1}, + {4, 6656, 10, 128, 1}, + {4, 7168, 10, 1024, 1}, + {4, 7680, 10, 256, 1}, + {4, 8192, 10, 8192, 0}, + {4, 8704, 10, 64, 1}, + {4, 9216, 10, 1024, 1}, + {4, 9728, 10, 32, 1}, + {4, 10240, 10, 1024, 1}, + {4, 20480, 10, 64, 1}, + {4, 30720, 10, 256, 1}, + {4, 40960, 10, 128, 1}, + {4, 51200, 10, 64, 1}, + {4, 61440, 10, 128, 1}, + {4, 71680, 10, 256, 1}, + {4, 81920, 10, 8192, 1}, + {4, 92160, 10, 256, 1}, + {4, 102400, 10, 4096, 1}, + {4, 204800, 10, 512, 1}, + {4, 307200, 10, 4096, 1}, + {4, 409600, 10, 8192, 1}, + {4, 512000, 10, 2048, 1}, + {4, 614400, 10, 4096, 1}, + {4, 716800, 10, 4096, 1}, + {4, 819200, 10, 32768, 1}, + {4, 921600, 10, 4096, 1}, + {4, 1024000, 10, 8192, 1}, + {6, 512, 10, 32, 1}, + {6, 1024, 10, 1024, 0}, + {6, 1536, 10, 32, 0}, + {6, 2048, 10, 2048, 0}, + {6, 2560, 10, 256, 0}, + {6, 3072, 10, 64, 1}, + {6, 3584, 10, 64, 1}, + {6, 4096, 10, 4096, 0}, + {6, 4608, 10, 256, 1}, + {6, 5120, 10, 64, 1}, + {6, 5632, 10, 512, 1}, + {6, 6144, 10, 32, 1}, + {6, 6656, 10, 64, 1}, + {6, 7168, 10, 1024, 1}, + {6, 7680, 10, 256, 1}, + {6, 8192, 10, 8192, 0}, + {6, 8704, 10, 256, 1}, + {6, 9216, 10, 512, 1}, + {6, 9728, 10, 256, 1}, + {6, 10240, 10, 32, 1}, + {6, 20480, 10, 64, 1}, + {6, 30720, 10, 512, 1}, + {6, 40960, 10, 8192, 1}, + {6, 51200, 10, 2048, 1}, + {6, 61440, 10, 1024, 1}, + {6, 71680, 10, 512, 1}, + {6, 81920, 10, 1024, 1}, + {6, 92160, 10, 1024, 1}, + {6, 102400, 10, 256, 1}, + {6, 204800, 10, 8192, 1}, + {6, 307200, 10, 4096, 1}, + {6, 409600, 10, 16384, 1}, + {6, 512000, 10, 2048, 1}, + {6, 614400, 10, 8192, 1}, + {6, 716800, 10, 4096, 1}, + {6, 819200, 10, 32768, 
1}, + {6, 921600, 10, 4096, 1}, + {6, 1024000, 10, 8192, 1}, + {12, 512, 15, 512, 1}, + {12, 1024, 15, 1024, 0}, + {12, 1536, 15, 64, 0}, + {12, 2048, 15, 2048, 0}, + {12, 2560, 15, 512, 0}, + {12, 3072, 15, 512, 1}, + {12, 3584, 15, 256, 1}, + {12, 4096, 15, 4096, 0}, + {12, 4608, 15, 256, 1}, + {12, 5120, 15, 128, 1}, + {12, 5632, 15, 64, 0}, + {12, 6144, 15, 512, 1}, + {12, 6656, 15, 256, 1}, + {12, 7168, 15, 64, 1}, + {12, 7680, 15, 32, 0}, + {12, 8192, 15, 8192, 0}, + {12, 8704, 15, 64, 1}, + {12, 9216, 15, 256, 1}, + {12, 9728, 15, 32, 1}, + {12, 10240, 15, 32, 1}, + {12, 20480, 15, 2048, 1}, + {12, 30720, 15, 2048, 1}, + {12, 40960, 15, 512, 1}, + {12, 51200, 15, 2048, 1}, + {12, 61440, 15, 2048, 1}, + {12, 71680, 15, 32, 1}, + {12, 81920, 15, 16384, 1}, + {12, 92160, 15, 256, 1}, + {12, 102400, 15, 2048, 1}, + {12, 204800, 15, 512, 1}, + {12, 307200, 15, 2048, 1}, + {12, 409600, 15, 16384, 1}, + {12, 512000, 15, 4096, 1}, + {12, 614400, 15, 4096, 1}, + {12, 716800, 15, 512, 1}, + {12, 819200, 15, 32768, 1}, + {12, 921600, 15, 512, 1}, + {12, 1024000, 15, 8192, 1}, + {11, 512, 20, 128, 1}, + {11, 1024, 20, 1024, 0}, + {11, 1536, 20, 512, 1}, + {11, 2048, 20, 2048, 0}, + {11, 2560, 20, 512, 1}, + {11, 3072, 20, 32, 0}, + {11, 3584, 20, 128, 0}, + {11, 4096, 20, 4096, 0}, + {11, 4608, 20, 256, 0}, + {11, 5120, 20, 256, 0}, + {11, 5632, 20, 128, 0}, + {11, 6144, 20, 64, 0}, + {11, 6656, 20, 512, 0}, + {11, 7168, 20, 512, 0}, + {11, 7680, 20, 512, 1}, + {11, 8192, 20, 8192, 0}, + {11, 8704, 20, 32, 1}, + {11, 9216, 20, 256, 1}, + {11, 9728, 20, 512, 1}, + {11, 10240, 20, 256, 0}, + {11, 20480, 20, 256, 1}, + {11, 30720, 20, 2048, 1}, + {11, 40960, 20, 512, 1}, + {11, 51200, 20, 2048, 1}, + {11, 61440, 20, 2048, 1}, + {11, 71680, 20, 32, 1}, + {11, 81920, 20, 2048, 1}, + {11, 92160, 20, 64, 1}, + {11, 102400, 20, 4096, 1}, + {11, 204800, 20, 8192, 1}, + {11, 307200, 20, 128, 1}, + {11, 409600, 20, 8192, 1}, + {11, 512000, 20, 4096, 1}, + {11, 614400, 20, 256, 1}, 
+ {11, 716800, 20, 512, 1}, + {11, 819200, 20, 16384, 1}, + {11, 921600, 20, 512, 1}, + {11, 1024000, 20, 8192, 1}, + {16, 512, 21, 32, 1}, + {16, 1024, 21, 1024, 0}, + {16, 1536, 21, 128, 0}, + {16, 2048, 21, 2048, 0}, + {16, 2560, 21, 32, 0}, + {16, 3072, 21, 64, 1}, + {16, 3584, 21, 64, 0}, + {16, 4096, 21, 256, 0}, + {16, 4608, 21, 64, 0}, + {16, 5120, 21, 512, 0}, + {16, 5632, 21, 128, 0}, + {16, 6144, 21, 64, 0}, + {16, 6656, 21, 256, 1}, + {16, 7168, 21, 512, 0}, + {16, 7680, 21, 64, 1}, + {16, 8192, 21, 8192, 0}, + {16, 8704, 21, 256, 0}, + {16, 9216, 21, 1024, 1}, + {16, 9728, 21, 128, 1}, + {16, 10240, 21, 256, 1}, + {16, 20480, 21, 4096, 1}, + {16, 30720, 21, 2048, 1}, + {16, 40960, 21, 1024, 1}, + {16, 51200, 21, 128, 1}, + {16, 61440, 21, 512, 1}, + {16, 71680, 21, 1024, 1}, + {16, 81920, 21, 2048, 1}, + {16, 92160, 21, 64, 1}, + {16, 102400, 21, 4096, 1}, + {16, 204800, 21, 8192, 1}, + {16, 307200, 21, 128, 1}, + {16, 409600, 21, 256, 1}, + {16, 512000, 21, 128, 1}, + {16, 614400, 21, 8192, 1}, + {16, 716800, 21, 1024, 1}, + {16, 819200, 21, 32768, 1}, + {16, 921600, 21, 1024, 1}, + {16, 1024000, 21, 512, 0}, + {25, 512, 28, 32, 0}, + {25, 1024, 28, 1024, 0}, + {25, 1536, 28, 64, 0}, + {25, 2048, 28, 2048, 0}, + {25, 2560, 28, 64, 0}, + {25, 3072, 28, 256, 0}, + {25, 3584, 28, 128, 0}, + {25, 4096, 28, 64, 0}, + {25, 4608, 28, 128, 0}, + {25, 5120, 28, 128, 0}, + {25, 5632, 28, 128, 1}, + {25, 6144, 28, 64, 0}, + {25, 6656, 28, 128, 0}, + {25, 7168, 28, 256, 0}, + {25, 7680, 28, 128, 0}, + {25, 8192, 28, 512, 0}, + {25, 8704, 28, 256, 1}, + {25, 9216, 28, 64, 0}, + {25, 9728, 28, 256, 0}, + {25, 10240, 28, 512, 1}, + {25, 20480, 28, 2048, 1}, + {25, 30720, 28, 512, 1}, + {25, 40960, 28, 1024, 1}, + {25, 51200, 28, 128, 1}, + {25, 61440, 28, 4096, 1}, + {25, 71680, 28, 1024, 1}, + {25, 81920, 28, 4096, 1}, + {25, 92160, 28, 256, 1}, + {25, 102400, 28, 4096, 1}, + {25, 204800, 28, 8192, 1}, + {25, 307200, 28, 512, 1}, + {25, 409600, 28, 1024, 0}, + {25, 
512000, 28, 128, 0}, + {25, 614400, 28, 1024, 0}, + {25, 716800, 28, 4096, 0}, + {25, 819200, 28, 256, 0}, + {25, 921600, 28, 4096, 0}, + {25, 1024000, 28, 4096, 0}, + {24, 512, 35, 256, 0}, + {24, 1024, 35, 1024, 0}, + {24, 1536, 35, 64, 0}, + {24, 2048, 35, 64, 0}, + {24, 2560, 35, 128, 0}, + {24, 3072, 35, 128, 0}, + {24, 3584, 35, 128, 0}, + {24, 4096, 35, 256, 0}, + {24, 4608, 35, 64, 0}, + {24, 5120, 35, 128, 0}, + {24, 5632, 35, 128, 0}, + {24, 6144, 35, 256, 0}, + {24, 6656, 35, 128, 0}, + {24, 7168, 35, 64, 0}, + {24, 7680, 35, 128, 0}, + {24, 8192, 35, 8192, 0}, + {24, 8704, 35, 32, 0}, + {24, 9216, 35, 256, 0}, + {24, 9728, 35, 256, 0}, + {24, 10240, 35, 2048, 0}, + {24, 20480, 35, 2048, 1}, + {24, 30720, 35, 256, 1}, + {24, 40960, 35, 64, 1}, + {24, 51200, 35, 128, 1}, + {24, 61440, 35, 32, 1}, + {24, 71680, 35, 64, 1}, + {24, 81920, 35, 256, 1}, + {24, 92160, 35, 256, 1}, + {24, 102400, 35, 256, 1}, + {24, 204800, 35, 128, 1}, + {24, 307200, 35, 4096, 0}, + {24, 409600, 35, 8192, 0}, + {24, 512000, 35, 4096, 0}, + {24, 614400, 35, 512, 0}, + {24, 716800, 35, 4096, 0}, + {24, 819200, 35, 16384, 0}, + {24, 921600, 35, 2048, 0}, + {24, 1024000, 35, 4096, 0}, + {33, 512, 36, 512, 0}, + {33, 1024, 36, 1024, 0}, + {33, 1536, 36, 32, 0}, + {33, 2048, 36, 256, 0}, + {33, 2560, 36, 128, 0}, + {33, 3072, 36, 128, 0}, + {33, 3584, 36, 64, 0}, + {33, 4096, 36, 64, 0}, + {33, 4608, 36, 32, 1}, + {33, 5120, 36, 128, 1}, + {33, 5632, 36, 512, 1}, + {33, 6144, 36, 1024, 1}, + {33, 6656, 36, 256, 1}, + {33, 7168, 36, 64, 1}, + {33, 7680, 36, 32, 1}, + {33, 8192, 36, 1024, 1}, + {33, 8704, 36, 512, 1}, + {33, 9216, 36, 1024, 1}, + {33, 9728, 36, 32, 1}, + {33, 10240, 36, 64, 1}, + {33, 20480, 36, 4096, 1}, + {33, 30720, 36, 2048, 1}, + {33, 40960, 36, 8192, 1}, + {33, 51200, 36, 128, 1}, + {33, 61440, 36, 256, 1}, + {33, 71680, 36, 128, 1}, + {33, 81920, 36, 256, 1}, + {33, 92160, 36, 256, 1}, + {33, 102400, 36, 64, 1}, + {33, 204800, 36, 8192, 1}, + {33, 307200, 36, 
1024, 1}, + {33, 409600, 36, 16384, 1}, + {33, 512000, 36, 2048, 1}, + {33, 614400, 36, 8192, 1}, + {33, 716800, 36, 1024, 1}, + {33, 819200, 36, 32768, 1}, + {33, 921600, 36, 4096, 1}, + {33, 1024000, 36, 1024000, 0}, + {42, 512, 45, 512, 0}, + {42, 1024, 45, 1024, 0}, + {42, 1536, 45, 32, 0}, + {42, 2048, 45, 2048, 0}, + {42, 2560, 45, 128, 0}, + {42, 3072, 45, 512, 1}, + {42, 3584, 45, 256, 0}, + {42, 4096, 45, 128, 0}, + {42, 4608, 45, 64, 1}, + {42, 5120, 45, 64, 1}, + {42, 5632, 45, 32, 1}, + {42, 6144, 45, 2048, 0}, + {42, 6656, 45, 128, 1}, + {42, 7168, 45, 1024, 1}, + {42, 7680, 45, 256, 1}, + {42, 8192, 45, 2048, 1}, + {42, 8704, 45, 512, 1}, + {42, 9216, 45, 256, 1}, + {42, 9728, 45, 512, 1}, + {42, 10240, 45, 1024, 1}, + {42, 20480, 45, 128, 1}, + {42, 30720, 45, 512, 1}, + {42, 40960, 45, 2048, 1}, + {42, 51200, 45, 256, 1}, + {42, 61440, 45, 4096, 1}, + {42, 71680, 45, 128, 1}, + {42, 81920, 45, 8192, 1}, + {42, 92160, 45, 512, 1}, + {42, 102400, 45, 256, 1}, + {42, 204800, 45, 8192, 1}, + {42, 307200, 45, 2048, 1}, + {42, 409600, 45, 16384, 1}, + {42, 512000, 45, 4096, 1}, + {42, 614400, 45, 614400, 0}, + {42, 716800, 45, 716800, 0}, + {42, 819200, 45, 819200, 0}, + {42, 921600, 45, 921600, 0}, + {42, 1024000, 45, 1024000, 0}, + {43, 512, 56, 512, 0}, + {43, 1024, 56, 1024, 0}, + {43, 1536, 56, 256, 0}, + {43, 2048, 56, 128, 0}, + {43, 2560, 56, 32, 0}, + {43, 3072, 56, 128, 0}, + {43, 3584, 56, 128, 0}, + {43, 4096, 56, 128, 0}, + {43, 4608, 56, 256, 0}, + {43, 5120, 56, 256, 0}, + {43, 5632, 56, 512, 1}, + {43, 6144, 56, 256, 1}, + {43, 6656, 56, 256, 1}, + {43, 7168, 56, 128, 1}, + {43, 7680, 56, 256, 1}, + {43, 8192, 56, 8192, 1}, + {43, 8704, 56, 512, 1}, + {43, 9216, 56, 512, 1}, + {43, 9728, 56, 128, 1}, + {43, 10240, 56, 1024, 1}, + {43, 20480, 56, 128, 1}, + {43, 30720, 56, 256, 1}, + {43, 40960, 56, 256, 1}, + {43, 51200, 56, 512, 1}, + {43, 61440, 56, 4096, 1}, + {43, 71680, 56, 512, 1}, + {43, 81920, 56, 8192, 1}, + {43, 92160, 56, 512, 
1}, + {43, 102400, 56, 128, 1}, + {43, 204800, 56, 512, 1}, + {43, 307200, 56, 1024, 1}, + {43, 409600, 56, 4096, 1}, + {43, 512000, 56, 512000, 0}, + {43, 614400, 56, 614400, 0}, + {43, 716800, 56, 716800, 0}, + {43, 819200, 56, 819200, 0}, + {43, 921600, 56, 921600, 0}, + {43, 1024000, 56, 1024000, 0}, + {126, 512, 84, 256, 0}, + {126, 1024, 84, 1024, 0}, + {126, 1536, 84, 64, 0}, + {126, 2048, 84, 128, 0}, + {126, 2560, 84, 512, 0}, + {126, 3072, 84, 1024, 0}, + {126, 3584, 84, 128, 1}, + {126, 4096, 84, 4096, 1}, + {126, 4608, 84, 256, 1}, + {126, 5120, 84, 64, 1}, + {126, 5632, 84, 128, 1}, + {126, 6144, 84, 2048, 1}, + {126, 6656, 84, 256, 1}, + {126, 7168, 84, 128, 1}, + {126, 7680, 84, 64, 1}, + {126, 8192, 84, 128, 1}, + {126, 8704, 84, 512, 1}, + {126, 9216, 84, 128, 1}, + {126, 9728, 84, 128, 1}, + {126, 10240, 84, 2048, 1}, + {126, 20480, 84, 256, 1}, + {126, 30720, 84, 512, 0}, + {126, 40960, 84, 8192, 0}, + {126, 51200, 84, 128, 0}, + {126, 61440, 84, 2048, 0}, + {126, 71680, 84, 2048, 0}, + {126, 81920, 84, 81920, 0}, + {126, 92160, 84, 92160, 0}, + {126, 102400, 84, 102400, 0}, + {126, 204800, 84, 204800, 0}, + {126, 307200, 84, 307200, 0}, + {126, 409600, 84, 409600, 0}, + {126, 512000, 84, 512000, 0}, + {126, 614400, 84, 614400, 0}, + {126, 716800, 84, 716800, 0}, + {126, 819200, 84, 819200, 0}, + {126, 921600, 84, 921600, 0}, + {126, 1024000, 84, 1024000, 0}, + {210, 512, 120, 512, 0}, + {210, 1024, 120, 256, 0}, + {210, 1536, 120, 256, 0}, + {210, 2048, 120, 2048, 1}, + {210, 2560, 120, 256, 1}, + {210, 3072, 120, 128, 1}, + {210, 3584, 120, 128, 1}, + {210, 4096, 120, 256, 1}, + {210, 4608, 120, 32, 1}, + {210, 5120, 120, 128, 1}, + {210, 5632, 120, 512, 1}, + {210, 6144, 120, 512, 1}, + {210, 6656, 120, 256, 1}, + {210, 7168, 120, 256, 1}, + {210, 7680, 120, 256, 0}, + {210, 8192, 120, 1024, 0}, + {210, 8704, 120, 512, 1}, + {210, 9216, 120, 1024, 0}, + {210, 9728, 120, 128, 0}, + {210, 10240, 120, 2048, 0}, + {210, 20480, 120, 2048, 0}, + 
{210, 30720, 120, 256, 0}, + {210, 40960, 120, 8192, 0}, + {210, 51200, 120, 128, 0}, + {210, 61440, 120, 4096, 0}, + {210, 71680, 120, 71680, 0}, + {210, 81920, 120, 81920, 0}, + {210, 92160, 120, 92160, 0}, + {210, 102400, 120, 102400, 0}, + {210, 204800, 120, 204800, 0}, + {210, 307200, 120, 307200, 0}, + {210, 409600, 120, 409600, 0}, + {210, 512000, 120, 256, 0}, + {210, 614400, 120, 128, 0}, + {210, 716800, 120, 1024, 0}, + {210, 819200, 120, 2048, 0}, + {210, 921600, 120, 4096, 0}, + {210, 1024000, 120, 8192, 0}, + {330, 512, 165, 512, 0}, + {330, 1024, 165, 1024, 0}, + {330, 1536, 165, 128, 1}, + {330, 2048, 165, 1024, 1}, + {330, 2560, 165, 512, 1}, + {330, 3072, 165, 256, 1}, + {330, 3584, 165, 512, 1}, + {330, 4096, 165, 512, 1}, + {330, 4608, 165, 256, 1}, + {330, 5120, 165, 1024, 1}, + {330, 5632, 165, 512, 1}, + {330, 6144, 165, 1024, 1}, + {330, 6656, 165, 512, 0}, + {330, 7168, 165, 1024, 0}, + {330, 7680, 165, 7680, 0}, + {330, 8192, 165, 8192, 0}, + {330, 8704, 165, 512, 0}, + {330, 9216, 165, 256, 0}, + {330, 9728, 165, 9728, 0}, + {330, 10240, 165, 10240, 0}, + {330, 20480, 165, 20480, 0}, + {330, 30720, 165, 30720, 0}, + {330, 40960, 165, 4096, 0}, + {330, 51200, 165, 51200, 0}, + {330, 61440, 165, 1024, 0}, + {330, 71680, 165, 2048, 0}, + {330, 81920, 165, 16384, 0}, + {330, 92160, 165, 2048, 0}, + {330, 102400, 165, 1024, 0}, + {330, 204800, 165, 2048, 0}, + {330, 307200, 165, 128, 0}, + {330, 409600, 165, 128, 0}, + {330, 512000, 165, 1024, 0}, + {330, 614400, 165, 1024, 0}, + {330, 716800, 165, 1024, 0}, + {330, 819200, 165, 4096, 0}, + {330, 921600, 165, 4096, 0}, + {330, 1024000, 165, 4096, 0} }; - diff --git a/backends/magma/gemm_tuning/indices.h b/backends/magma/gemm_tuning/indices.h index b30aeb32bc..f5f5103ee9 100644 --- a/backends/magma/gemm_tuning/indices.h +++ b/backends/magma/gemm_tuning/indices.h @@ -1,8 +1,6 @@ -#define RECORD_LENGTH 5 -#define M_INDEX 0 -#define N_INDEX 1 -#define K_INDEX 2 -#define N_BATCH_INDEX 3 -#define 
USE_MAGMA_INDEX 4 - - +#define RECORD_LENGTH 5 +#define M_INDEX 0 +#define N_INDEX 1 +#define K_INDEX 2 +#define N_BATCH_INDEX 3 +#define USE_MAGMA_INDEX 4 diff --git a/backends/magma/gemm_tuning/mi100.h b/backends/magma/gemm_tuning/mi100.h index 7968e8a854..9285998081 100644 --- a/backends/magma/gemm_tuning/mi100.h +++ b/backends/magma/gemm_tuning/mi100.h @@ -2,2454 +2,2449 @@ // auto-generated from data on mi100-rocm5.0.2 //////////////////////////////////////////////////////////////////////////////// -std::vector< std::array > sgemm_nn_mi100 = -{ - {3 , 512 , 1 , 32 , 0 }, - {3 , 1024 , 1 , 512 , 0 }, - {3 , 1536 , 1 , 256 , 0 }, - {3 , 2048 , 1 , 512 , 0 }, - {3 , 2560 , 1 , 512 , 0 }, - {3 , 3072 , 1 , 512 , 0 }, - {3 , 3584 , 1 , 32 , 1 }, - {3 , 4096 , 1 , 512 , 0 }, - {3 , 4608 , 1 , 512 , 0 }, - {3 , 5120 , 1 , 32 , 1 }, - {3 , 5632 , 1 , 32 , 1 }, - {3 , 6144 , 1 , 512 , 0 }, - {3 , 6656 , 1 , 32 , 1 }, - {3 , 7168 , 1 , 32 , 1 }, - {3 , 7680 , 1 , 32 , 1 }, - {3 , 8192 , 1 , 32 , 1 }, - {3 , 8704 , 1 , 32 , 1 }, - {3 , 9216 , 1 , 32 , 1 }, - {3 , 9728 , 1 , 512 , 0 }, - {3 , 10240 , 1 , 32 , 1 }, - {3 , 20480 , 1 , 64 , 1 }, - {3 , 30720 , 1 , 64 , 1 }, - {3 , 40960 , 1 , 256 , 0 }, - {3 , 51200 , 1 , 256 , 0 }, - {3 , 61440 , 1 , 256 , 0 }, - {3 , 71680 , 1 , 256 , 0 }, - {3 , 81920 , 1 , 256 , 0 }, - {3 , 92160 , 1 , 2048 , 1 }, - {3 , 102400 , 1 , 4096 , 1 }, - {3 , 204800 , 1 , 2048 , 1 }, - {3 , 307200 , 1 , 4096 , 1 }, - {3 , 409600 , 1 , 2048 , 1 }, - {3 , 512000 , 1 , 2048 , 1 }, - {3 , 614400 , 1 , 2048 , 1 }, - {3 , 716800 , 1 , 1024 , 1 }, - {3 , 819200 , 1 , 2048 , 1 }, - {3 , 921600 , 1 , 256 , 0 }, - {3 , 1024000, 1 , 256 , 0 }, - {4 , 512 , 1 , 512 , 0 }, - {4 , 1024 , 1 , 512 , 0 }, - {4 , 1536 , 1 , 32 , 0 }, - {4 , 2048 , 1 , 512 , 0 }, - {4 , 2560 , 1 , 512 , 0 }, - {4 , 3072 , 1 , 512 , 0 }, - {4 , 3584 , 1 , 32 , 1 }, - {4 , 4096 , 1 , 32 , 1 }, - {4 , 4608 , 1 , 32 , 1 }, - {4 , 5120 , 1 , 32 , 1 }, - {4 , 5632 , 1 , 32 , 1 }, - {4 
, 6144 , 1 , 32 , 1 }, - {4 , 6656 , 1 , 32 , 1 }, - {4 , 7168 , 1 , 32 , 1 }, - {4 , 7680 , 1 , 32 , 1 }, - {4 , 8192 , 1 , 32 , 1 }, - {4 , 8704 , 1 , 32 , 1 }, - {4 , 9216 , 1 , 32 , 1 }, - {4 , 9728 , 1 , 32 , 1 }, - {4 , 10240 , 1 , 32 , 1 }, - {4 , 20480 , 1 , 32 , 1 }, - {4 , 30720 , 1 , 64 , 1 }, - {4 , 40960 , 1 , 256 , 0 }, - {4 , 51200 , 1 , 256 , 0 }, - {4 , 61440 , 1 , 256 , 0 }, - {4 , 71680 , 1 , 2048 , 1 }, - {4 , 81920 , 1 , 1024 , 1 }, - {4 , 92160 , 1 , 2048 , 1 }, - {4 , 102400 , 1 , 256 , 1 }, - {4 , 204800 , 1 , 4096 , 1 }, - {4 , 307200 , 1 , 2048 , 1 }, - {4 , 409600 , 1 , 8192 , 1 }, - {4 , 512000 , 1 , 2048 , 1 }, - {4 , 614400 , 1 , 2048 , 1 }, - {4 , 716800 , 1 , 1024 , 1 }, - {4 , 819200 , 1 , 8192 , 1 }, - {4 , 921600 , 1 , 2048 , 1 }, - {4 , 1024000, 1 , 256 , 0 }, - {6 , 512 , 3 , 64 , 0 }, - {6 , 1024 , 3 , 1024 , 0 }, - {6 , 1536 , 3 , 32 , 1 }, - {6 , 2048 , 3 , 512 , 0 }, - {6 , 2560 , 3 , 512 , 0 }, - {6 , 3072 , 3 , 32 , 1 }, - {6 , 3584 , 3 , 32 , 1 }, - {6 , 4096 , 3 , 512 , 0 }, - {6 , 4608 , 3 , 32 , 1 }, - {6 , 5120 , 3 , 32 , 1 }, - {6 , 5632 , 3 , 32 , 1 }, - {6 , 6144 , 3 , 32 , 1 }, - {6 , 6656 , 3 , 32 , 1 }, - {6 , 7168 , 3 , 32 , 1 }, - {6 , 7680 , 3 , 32 , 1 }, - {6 , 8192 , 3 , 32 , 1 }, - {6 , 8704 , 3 , 32 , 1 }, - {6 , 9216 , 3 , 32 , 1 }, - {6 , 9728 , 3 , 32 , 1 }, - {6 , 10240 , 3 , 32 , 1 }, - {6 , 20480 , 3 , 32 , 1 }, - {6 , 30720 , 3 , 256 , 1 }, - {6 , 40960 , 3 , 256 , 0 }, - {6 , 51200 , 3 , 1024 , 1 }, - {6 , 61440 , 3 , 1024 , 1 }, - {6 , 71680 , 3 , 2048 , 1 }, - {6 , 81920 , 3 , 2048 , 1 }, - {6 , 92160 , 3 , 256 , 1 }, - {6 , 102400 , 3 , 2048 , 1 }, - {6 , 204800 , 3 , 1024 , 1 }, - {6 , 307200 , 3 , 2048 , 1 }, - {6 , 409600 , 3 , 1024 , 1 }, - {6 , 512000 , 3 , 512 , 1 }, - {6 , 614400 , 3 , 1024 , 1 }, - {6 , 716800 , 3 , 2048 , 1 }, - {6 , 819200 , 3 , 512 , 1 }, - {6 , 921600 , 3 , 1024 , 1 }, - {6 , 1024000, 3 , 1024 , 1 }, - {10 , 512 , 4 , 512 , 0 }, - {10 , 1024 , 4 , 64 , 0 }, - {10 , 
1536 , 4 , 512 , 0 }, - {10 , 2048 , 4 , 512 , 0 }, - {10 , 2560 , 4 , 128 , 0 }, - {10 , 3072 , 4 , 32 , 1 }, - {10 , 3584 , 4 , 32 , 1 }, - {10 , 4096 , 4 , 64 , 0 }, - {10 , 4608 , 4 , 32 , 1 }, - {10 , 5120 , 4 , 512 , 0 }, - {10 , 5632 , 4 , 512 , 0 }, - {10 , 6144 , 4 , 32 , 1 }, - {10 , 6656 , 4 , 32 , 1 }, - {10 , 7168 , 4 , 32 , 1 }, - {10 , 7680 , 4 , 32 , 1 }, - {10 , 8192 , 4 , 32 , 1 }, - {10 , 8704 , 4 , 32 , 1 }, - {10 , 9216 , 4 , 32 , 1 }, - {10 , 9728 , 4 , 32 , 1 }, - {10 , 10240 , 4 , 512 , 0 }, - {10 , 20480 , 4 , 64 , 1 }, - {10 , 30720 , 4 , 512 , 1 }, - {10 , 40960 , 4 , 512 , 1 }, - {10 , 51200 , 4 , 256 , 1 }, - {10 , 61440 , 4 , 512 , 1 }, - {10 , 71680 , 4 , 512 , 1 }, - {10 , 81920 , 4 , 1024 , 1 }, - {10 , 92160 , 4 , 1024 , 1 }, - {10 , 102400 , 4 , 512 , 1 }, - {10 , 204800 , 4 , 4096 , 1 }, - {10 , 307200 , 4 , 512 , 1 }, - {10 , 409600 , 4 , 2048 , 1 }, - {10 , 512000 , 4 , 4096 , 1 }, - {10 , 614400 , 4 , 8192 , 1 }, - {10 , 716800 , 4 , 1024 , 1 }, - {10 , 819200 , 4 , 32768 , 1 }, - {10 , 921600 , 4 , 4096 , 1 }, - {10 , 1024000, 4 , 8192 , 1 }, - {10 , 512 , 6 , 32 , 1 }, - {10 , 1024 , 6 , 32 , 1 }, - {10 , 1536 , 6 , 32 , 1 }, - {10 , 2048 , 6 , 32 , 1 }, - {10 , 2560 , 6 , 32 , 1 }, - {10 , 3072 , 6 , 32 , 1 }, - {10 , 3584 , 6 , 32 , 1 }, - {10 , 4096 , 6 , 32 , 1 }, - {10 , 4608 , 6 , 32 , 1 }, - {10 , 5120 , 6 , 32 , 1 }, - {10 , 5632 , 6 , 32 , 1 }, - {10 , 6144 , 6 , 32 , 1 }, - {10 , 6656 , 6 , 32 , 1 }, - {10 , 7168 , 6 , 32 , 1 }, - {10 , 7680 , 6 , 64 , 1 }, - {10 , 8192 , 6 , 32 , 1 }, - {10 , 8704 , 6 , 32 , 1 }, - {10 , 9216 , 6 , 32 , 1 }, - {10 , 9728 , 6 , 32 , 1 }, - {10 , 10240 , 6 , 32 , 1 }, - {10 , 20480 , 6 , 64 , 1 }, - {10 , 30720 , 6 , 128 , 1 }, - {10 , 40960 , 6 , 512 , 1 }, - {10 , 51200 , 6 , 512 , 1 }, - {10 , 61440 , 6 , 256 , 1 }, - {10 , 71680 , 6 , 512 , 1 }, - {10 , 81920 , 6 , 512 , 1 }, - {10 , 92160 , 6 , 256 , 1 }, - {10 , 102400 , 6 , 256 , 1 }, - {10 , 204800 , 6 , 512 , 1 }, - {10 , 
307200 , 6 , 512 , 1 }, - {10 , 409600 , 6 , 1024 , 1 }, - {10 , 512000 , 6 , 1024 , 1 }, - {10 , 614400 , 6 , 8192 , 1 }, - {10 , 716800 , 6 , 4096 , 1 }, - {10 , 819200 , 6 , 32768 , 1 }, - {10 , 921600 , 6 , 4096 , 1 }, - {10 , 1024000, 6 , 256 , 0 }, - {15 , 512 , 12 , 32 , 1 }, - {15 , 1024 , 12 , 32 , 1 }, - {15 , 1536 , 12 , 32 , 1 }, - {15 , 2048 , 12 , 32 , 1 }, - {15 , 2560 , 12 , 32 , 1 }, - {15 , 3072 , 12 , 32 , 1 }, - {15 , 3584 , 12 , 32 , 1 }, - {15 , 4096 , 12 , 256 , 1 }, - {15 , 4608 , 12 , 32 , 1 }, - {15 , 5120 , 12 , 32 , 1 }, - {15 , 5632 , 12 , 32 , 1 }, - {15 , 6144 , 12 , 32 , 1 }, - {15 , 6656 , 12 , 32 , 1 }, - {15 , 7168 , 12 , 32 , 1 }, - {15 , 7680 , 12 , 32 , 1 }, - {15 , 8192 , 12 , 256 , 1 }, - {15 , 8704 , 12 , 256 , 1 }, - {15 , 9216 , 12 , 512 , 1 }, - {15 , 9728 , 12 , 32 , 1 }, - {15 , 10240 , 12 , 32 , 1 }, - {15 , 20480 , 12 , 512 , 1 }, - {15 , 30720 , 12 , 512 , 1 }, - {15 , 40960 , 12 , 256 , 1 }, - {15 , 51200 , 12 , 512 , 1 }, - {15 , 61440 , 12 , 256 , 1 }, - {15 , 71680 , 12 , 512 , 1 }, - {15 , 81920 , 12 , 8192 , 1 }, - {15 , 92160 , 12 , 512 , 1 }, - {15 , 102400 , 12 , 512 , 1 }, - {15 , 204800 , 12 , 64 , 0 }, - {15 , 307200 , 12 , 128 , 0 }, - {15 , 409600 , 12 , 64 , 0 }, - {15 , 512000 , 12 , 64 , 0 }, - {15 , 614400 , 12 , 128 , 0 }, - {15 , 716800 , 12 , 128 , 0 }, - {15 , 819200 , 12 , 128 , 0 }, - {15 , 921600 , 12 , 64 , 0 }, - {15 , 1024000, 12 , 64 , 0 }, - {20 , 512 , 11 , 128 , 0 }, - {20 , 1024 , 11 , 128 , 0 }, - {20 , 1536 , 11 , 32 , 1 }, - {20 , 2048 , 11 , 32 , 1 }, - {20 , 2560 , 11 , 32 , 1 }, - {20 , 3072 , 11 , 32 , 1 }, - {20 , 3584 , 11 , 32 , 1 }, - {20 , 4096 , 11 , 32 , 1 }, - {20 , 4608 , 11 , 32 , 1 }, - {20 , 5120 , 11 , 32 , 1 }, - {20 , 5632 , 11 , 32 , 1 }, - {20 , 6144 , 11 , 32 , 1 }, - {20 , 6656 , 11 , 32 , 1 }, - {20 , 7168 , 11 , 32 , 1 }, - {20 , 7680 , 11 , 32 , 1 }, - {20 , 8192 , 11 , 32 , 1 }, - {20 , 8704 , 11 , 32 , 1 }, - {20 , 9216 , 11 , 32 , 1 }, - {20 , 9728 , 11 
, 32 , 1 }, - {20 , 10240 , 11 , 32 , 1 }, - {20 , 20480 , 11 , 256 , 1 }, - {20 , 30720 , 11 , 256 , 1 }, - {20 , 40960 , 11 , 256 , 1 }, - {20 , 51200 , 11 , 512 , 1 }, - {20 , 61440 , 11 , 512 , 1 }, - {20 , 71680 , 11 , 256 , 1 }, - {20 , 81920 , 11 , 512 , 1 }, - {20 , 92160 , 11 , 256 , 1 }, - {20 , 102400 , 11 , 512 , 1 }, - {20 , 204800 , 11 , 4096 , 1 }, - {20 , 307200 , 11 , 4096 , 1 }, - {20 , 409600 , 11 , 8192 , 1 }, - {20 , 512000 , 11 , 4096 , 1 }, - {20 , 614400 , 11 , 8192 , 1 }, - {20 , 716800 , 11 , 4096 , 1 }, - {20 , 819200 , 11 , 8192 , 1 }, - {20 , 921600 , 11 , 4096 , 1 }, - {20 , 1024000, 11 , 8192 , 1 }, - {21 , 512 , 16 , 128 , 0 }, - {21 , 1024 , 16 , 128 , 0 }, - {21 , 1536 , 16 , 64 , 0 }, - {21 , 2048 , 16 , 128 , 0 }, - {21 , 2560 , 16 , 32 , 1 }, - {21 , 3072 , 16 , 32 , 1 }, - {21 , 3584 , 16 , 32 , 1 }, - {21 , 4096 , 16 , 256 , 1 }, - {21 , 4608 , 16 , 64 , 0 }, - {21 , 5120 , 16 , 128 , 0 }, - {21 , 5632 , 16 , 32 , 1 }, - {21 , 6144 , 16 , 32 , 1 }, - {21 , 6656 , 16 , 32 , 1 }, - {21 , 7168 , 16 , 32 , 1 }, - {21 , 7680 , 16 , 32 , 1 }, - {21 , 8192 , 16 , 128 , 0 }, - {21 , 8704 , 16 , 64 , 0 }, - {21 , 9216 , 16 , 64 , 0 }, - {21 , 9728 , 16 , 128 , 0 }, - {21 , 10240 , 16 , 128 , 0 }, - {21 , 20480 , 16 , 512 , 1 }, - {21 , 30720 , 16 , 256 , 1 }, - {21 , 40960 , 16 , 512 , 1 }, - {21 , 51200 , 16 , 256 , 1 }, - {21 , 61440 , 16 , 512 , 1 }, - {21 , 71680 , 16 , 512 , 1 }, - {21 , 81920 , 16 , 512 , 1 }, - {21 , 92160 , 16 , 512 , 1 }, - {21 , 102400 , 16 , 512 , 1 }, - {21 , 204800 , 16 , 512 , 1 }, - {21 , 307200 , 16 , 128 , 0 }, - {21 , 409600 , 16 , 64 , 0 }, - {21 , 512000 , 16 , 128 , 0 }, - {21 , 614400 , 16 , 64 , 0 }, - {21 , 716800 , 16 , 64 , 0 }, - {21 , 819200 , 16 , 64 , 0 }, - {21 , 921600 , 16 , 128 , 0 }, - {21 , 1024000, 16 , 128 , 0 }, - {28 , 512 , 25 , 128 , 0 }, - {28 , 1024 , 25 , 128 , 0 }, - {28 , 1536 , 25 , 128 , 0 }, - {28 , 2048 , 25 , 128 , 0 }, - {28 , 2560 , 25 , 128 , 0 }, - {28 , 3072 , 25 
, 128 , 0 }, - {28 , 3584 , 25 , 128 , 0 }, - {28 , 4096 , 25 , 128 , 0 }, - {28 , 4608 , 25 , 128 , 0 }, - {28 , 5120 , 25 , 128 , 0 }, - {28 , 5632 , 25 , 128 , 0 }, - {28 , 6144 , 25 , 128 , 0 }, - {28 , 6656 , 25 , 32 , 0 }, - {28 , 7168 , 25 , 128 , 0 }, - {28 , 7680 , 25 , 64 , 0 }, - {28 , 8192 , 25 , 32 , 0 }, - {28 , 8704 , 25 , 256 , 0 }, - {28 , 9216 , 25 , 256 , 0 }, - {28 , 9728 , 25 , 32 , 1 }, - {28 , 10240 , 25 , 32 , 1 }, - {28 , 20480 , 25 , 256 , 1 }, - {28 , 30720 , 25 , 256 , 1 }, - {28 , 40960 , 25 , 4096 , 1 }, - {28 , 51200 , 25 , 256 , 1 }, - {28 , 61440 , 25 , 512 , 1 }, - {28 , 71680 , 25 , 512 , 1 }, - {28 , 81920 , 25 , 4096 , 1 }, - {28 , 92160 , 25 , 512 , 1 }, - {28 , 102400 , 25 , 4096 , 1 }, - {28 , 204800 , 25 , 64 , 0 }, - {28 , 307200 , 25 , 2048 , 0 }, - {28 , 409600 , 25 , 32 , 0 }, - {28 , 512000 , 25 , 2048 , 0 }, - {28 , 614400 , 25 , 2048 , 0 }, - {28 , 716800 , 25 , 2048 , 0 }, - {28 , 819200 , 25 , 2048 , 0 }, - {28 , 921600 , 25 , 2048 , 0 }, - {28 , 1024000, 25 , 2048 , 0 }, - {35 , 512 , 24 , 128 , 0 }, - {35 , 1024 , 24 , 128 , 0 }, - {35 , 1536 , 24 , 128 , 0 }, - {35 , 2048 , 24 , 128 , 0 }, - {35 , 2560 , 24 , 128 , 0 }, - {35 , 3072 , 24 , 128 , 0 }, - {35 , 3584 , 24 , 128 , 0 }, - {35 , 4096 , 24 , 128 , 0 }, - {35 , 4608 , 24 , 128 , 0 }, - {35 , 5120 , 24 , 64 , 0 }, - {35 , 5632 , 24 , 128 , 0 }, - {35 , 6144 , 24 , 128 , 0 }, - {35 , 6656 , 24 , 256 , 0 }, - {35 , 7168 , 24 , 64 , 0 }, - {35 , 7680 , 24 , 32 , 1 }, - {35 , 8192 , 24 , 32 , 0 }, - {35 , 8704 , 24 , 256 , 0 }, - {35 , 9216 , 24 , 64 , 0 }, - {35 , 9728 , 24 , 32 , 1 }, - {35 , 10240 , 24 , 32 , 1 }, - {35 , 20480 , 24 , 256 , 1 }, - {35 , 30720 , 24 , 256 , 1 }, - {35 , 40960 , 24 , 4096 , 1 }, - {35 , 51200 , 24 , 256 , 1 }, - {35 , 61440 , 24 , 256 , 1 }, - {35 , 71680 , 24 , 256 , 1 }, - {35 , 81920 , 24 , 256 , 1 }, - {35 , 92160 , 24 , 512 , 1 }, - {35 , 102400 , 24 , 4096 , 0 }, - {35 , 204800 , 24 , 8192 , 1 }, - {35 , 307200 , 24 , 
4096 , 0 }, - {35 , 409600 , 24 , 4096 , 0 }, - {35 , 512000 , 24 , 4096 , 0 }, - {35 , 614400 , 24 , 4096 , 0 }, - {35 , 716800 , 24 , 4096 , 0 }, - {35 , 819200 , 24 , 4096 , 0 }, - {35 , 921600 , 24 , 4096 , 0 }, - {35 , 1024000, 24 , 4096 , 0 }, - {36 , 512 , 33 , 128 , 0 }, - {36 , 1024 , 33 , 128 , 0 }, - {36 , 1536 , 33 , 128 , 0 }, - {36 , 2048 , 33 , 128 , 0 }, - {36 , 2560 , 33 , 128 , 0 }, - {36 , 3072 , 33 , 128 , 0 }, - {36 , 3584 , 33 , 128 , 0 }, - {36 , 4096 , 33 , 128 , 0 }, - {36 , 4608 , 33 , 128 , 0 }, - {36 , 5120 , 33 , 128 , 0 }, - {36 , 5632 , 33 , 128 , 0 }, - {36 , 6144 , 33 , 128 , 0 }, - {36 , 6656 , 33 , 64 , 0 }, - {36 , 7168 , 33 , 256 , 0 }, - {36 , 7680 , 33 , 64 , 0 }, - {36 , 8192 , 33 , 256 , 0 }, - {36 , 8704 , 33 , 64 , 0 }, - {36 , 9216 , 33 , 256 , 0 }, - {36 , 9728 , 33 , 128 , 0 }, - {36 , 10240 , 33 , 128 , 0 }, - {36 , 20480 , 33 , 256 , 1 }, - {36 , 30720 , 33 , 256 , 1 }, - {36 , 40960 , 33 , 4096 , 0 }, - {36 , 51200 , 33 , 512 , 1 }, - {36 , 61440 , 33 , 4096 , 0 }, - {36 , 71680 , 33 , 1024 , 1 }, - {36 , 81920 , 33 , 4096 , 0 }, - {36 , 92160 , 33 , 2048 , 1 }, - {36 , 102400 , 33 , 4096 , 0 }, - {36 , 204800 , 33 , 512 , 1 }, - {36 , 307200 , 33 , 4096 , 0 }, - {36 , 409600 , 33 , 8192 , 0 }, - {36 , 512000 , 33 , 4096 , 0 }, - {36 , 614400 , 33 , 8192 , 0 }, - {36 , 716800 , 33 , 4096 , 0 }, - {36 , 819200 , 33 , 8192 , 0 }, - {36 , 921600 , 33 , 921600 , 0 }, - {36 , 1024000, 33 , 8192 , 0 }, - {45 , 512 , 42 , 128 , 0 }, - {45 , 1024 , 42 , 128 , 0 }, - {45 , 1536 , 42 , 128 , 0 }, - {45 , 2048 , 42 , 128 , 0 }, - {45 , 2560 , 42 , 128 , 0 }, - {45 , 3072 , 42 , 1024 , 0 }, - {45 , 3584 , 42 , 128 , 0 }, - {45 , 4096 , 42 , 128 , 0 }, - {45 , 4608 , 42 , 128 , 0 }, - {45 , 5120 , 42 , 128 , 0 }, - {45 , 5632 , 42 , 32 , 0 }, - {45 , 6144 , 42 , 32 , 0 }, - {45 , 6656 , 42 , 64 , 0 }, - {45 , 7168 , 42 , 64 , 0 }, - {45 , 7680 , 42 , 32 , 0 }, - {45 , 8192 , 42 , 128 , 0 }, - {45 , 8704 , 42 , 64 , 0 }, - {45 , 
9216 , 42 , 64 , 0 }, - {45 , 9728 , 42 , 64 , 0 }, - {45 , 10240 , 42 , 64 , 0 }, - {45 , 20480 , 42 , 1024 , 1 }, - {45 , 30720 , 42 , 2048 , 1 }, - {45 , 40960 , 42 , 64 , 0 }, - {45 , 51200 , 42 , 64 , 0 }, - {45 , 61440 , 42 , 64 , 0 }, - {45 , 71680 , 42 , 64 , 0 }, - {45 , 81920 , 42 , 64 , 0 }, - {45 , 92160 , 42 , 64 , 0 }, - {45 , 102400 , 42 , 4096 , 0 }, - {45 , 204800 , 42 , 8192 , 0 }, - {45 , 307200 , 42 , 64 , 0 }, - {45 , 409600 , 42 , 8192 , 0 }, - {45 , 512000 , 42 , 64 , 0 }, - {45 , 614400 , 42 , 8192 , 0 }, - {45 , 716800 , 42 , 64 , 0 }, - {45 , 819200 , 42 , 8192 , 0 }, - {45 , 921600 , 42 , 64 , 0 }, - {45 , 1024000, 42 , 8192 , 0 }, - {56 , 512 , 43 , 128 , 0 }, - {56 , 1024 , 43 , 128 , 0 }, - {56 , 1536 , 43 , 128 , 0 }, - {56 , 2048 , 43 , 128 , 0 }, - {56 , 2560 , 43 , 128 , 0 }, - {56 , 3072 , 43 , 128 , 0 }, - {56 , 3584 , 43 , 128 , 0 }, - {56 , 4096 , 43 , 32 , 0 }, - {56 , 4608 , 43 , 32 , 0 }, - {56 , 5120 , 43 , 32 , 0 }, - {56 , 5632 , 43 , 32 , 0 }, - {56 , 6144 , 43 , 32 , 0 }, - {56 , 6656 , 43 , 32 , 0 }, - {56 , 7168 , 43 , 32 , 0 }, - {56 , 7680 , 43 , 32 , 0 }, - {56 , 8192 , 43 , 32 , 0 }, - {56 , 8704 , 43 , 32 , 0 }, - {56 , 9216 , 43 , 32 , 0 }, - {56 , 9728 , 43 , 32 , 0 }, - {56 , 10240 , 43 , 32 , 0 }, - {56 , 20480 , 43 , 64 , 0 }, - {56 , 30720 , 43 , 32 , 0 }, - {56 , 40960 , 43 , 32 , 0 }, - {56 , 51200 , 43 , 32 , 0 }, - {56 , 61440 , 43 , 32 , 0 }, - {56 , 71680 , 43 , 32 , 0 }, - {56 , 81920 , 43 , 32 , 0 }, - {56 , 92160 , 43 , 32 , 0 }, - {56 , 102400 , 43 , 32 , 0 }, - {56 , 204800 , 43 , 32 , 0 }, - {56 , 307200 , 43 , 32 , 0 }, - {56 , 409600 , 43 , 32 , 0 }, - {56 , 512000 , 43 , 32 , 0 }, - {56 , 614400 , 43 , 32 , 0 }, - {56 , 716800 , 43 , 32 , 0 }, - {56 , 819200 , 43 , 32 , 0 }, - {56 , 921600 , 43 , 32 , 0 }, - {56 , 1024000, 43 , 32 , 0 }, - {84 , 512 , 126 , 64 , 0 }, - {84 , 1024 , 126 , 64 , 0 }, - {84 , 1536 , 126 , 64 , 0 }, - {84 , 2048 , 126 , 64 , 0 }, - {84 , 2560 , 126 , 64 , 0 }, - 
{84 , 3072 , 126 , 64 , 0 }, - {84 , 3584 , 126 , 128 , 0 }, - {84 , 4096 , 126 , 4096 , 0 }, - {84 , 4608 , 126 , 128 , 0 }, - {84 , 5120 , 126 , 128 , 0 }, - {84 , 5632 , 126 , 64 , 0 }, - {84 , 6144 , 126 , 2048 , 0 }, - {84 , 6656 , 126 , 128 , 0 }, - {84 , 7168 , 126 , 128 , 0 }, - {84 , 7680 , 126 , 128 , 0 }, - {84 , 8192 , 126 , 2048 , 0 }, - {84 , 8704 , 126 , 128 , 0 }, - {84 , 9216 , 126 , 128 , 0 }, - {84 , 9728 , 126 , 128 , 0 }, - {84 , 10240 , 126 , 2048 , 0 }, - {84 , 20480 , 126 , 128 , 0 }, - {84 , 30720 , 126 , 2048 , 0 }, - {84 , 40960 , 126 , 8192 , 0 }, - {84 , 51200 , 126 , 2048 , 0 }, - {84 , 61440 , 126 , 2048 , 0 }, - {84 , 71680 , 126 , 2048 , 0 }, - {84 , 81920 , 126 , 8192 , 0 }, - {84 , 92160 , 126 , 2048 , 0 }, - {84 , 102400 , 126 , 2048 , 0 }, - {84 , 204800 , 126 , 8192 , 0 }, - {84 , 307200 , 126 , 2048 , 0 }, - {84 , 409600 , 126 , 8192 , 0 }, - {84 , 512000 , 126 , 2048 , 0 }, - {84 , 614400 , 126 , 8192 , 0 }, - {84 , 716800 , 126 , 2048 , 0 }, - {84 , 819200 , 126 , 8192 , 0 }, - {84 , 921600 , 126 , 2048 , 0 }, - {84 , 1024000, 126 , 8192 , 0 }, - {120 , 512 , 210 , 64 , 0 }, - {120 , 1024 , 210 , 64 , 0 }, - {120 , 1536 , 210 , 64 , 0 }, - {120 , 2048 , 210 , 64 , 0 }, - {120 , 2560 , 210 , 32 , 0 }, - {120 , 3072 , 210 , 1024 , 0 }, - {120 , 3584 , 210 , 64 , 0 }, - {120 , 4096 , 210 , 4096 , 0 }, - {120 , 4608 , 210 , 64 , 0 }, - {120 , 5120 , 210 , 64 , 0 }, - {120 , 5632 , 210 , 32 , 0 }, - {120 , 6144 , 210 , 2048 , 0 }, - {120 , 6656 , 210 , 64 , 0 }, - {120 , 7168 , 210 , 1024 , 0 }, - {120 , 7680 , 210 , 64 , 0 }, - {120 , 8192 , 210 , 4096 , 0 }, - {120 , 8704 , 210 , 64 , 0 }, - {120 , 9216 , 210 , 1024 , 0 }, - {120 , 9728 , 210 , 64 , 0 }, - {120 , 10240 , 210 , 2048 , 0 }, - {120 , 20480 , 210 , 4096 , 0 }, - {120 , 30720 , 210 , 2048 , 0 }, - {120 , 40960 , 210 , 8192 , 0 }, - {120 , 51200 , 210 , 2048 , 0 }, - {120 , 61440 , 210 , 4096 , 0 }, - {120 , 71680 , 210 , 2048 , 0 }, - {120 , 81920 , 210 , 16384 , 0 
}, - {120 , 92160 , 210 , 2048 , 0 }, - {120 , 102400 , 210 , 4096 , 0 }, - {120 , 204800 , 210 , 8192 , 0 }, - {120 , 307200 , 210 , 307200 , 0 }, - {120 , 409600 , 210 , 16384 , 0 }, - {120 , 512000 , 210 , 512000 , 0 }, - {120 , 614400 , 210 , 8192 , 0 }, - {120 , 716800 , 210 , 716800 , 0 }, - {120 , 819200 , 210 , 16384 , 0 }, - {120 , 921600 , 210 , 921600 , 0 }, - {120 , 1024000, 210 , 8192 , 0 }, - {165 , 512 , 330 , 256 , 0 }, - {165 , 1024 , 330 , 256 , 0 }, - {165 , 1536 , 330 , 64 , 0 }, - {165 , 2048 , 330 , 2048 , 0 }, - {165 , 2560 , 330 , 512 , 0 }, - {165 , 3072 , 330 , 1024 , 0 }, - {165 , 3584 , 330 , 64 , 0 }, - {165 , 4096 , 330 , 4096 , 0 }, - {165 , 4608 , 330 , 64 , 0 }, - {165 , 5120 , 330 , 1024 , 0 }, - {165 , 5632 , 330 , 512 , 0 }, - {165 , 6144 , 330 , 1024 , 0 }, - {165 , 6656 , 330 , 512 , 0 }, - {165 , 7168 , 330 , 1024 , 0 }, - {165 , 7680 , 330 , 512 , 0 }, - {165 , 8192 , 330 , 4096 , 0 }, - {165 , 8704 , 330 , 8704 , 0 }, - {165 , 9216 , 330 , 1024 , 0 }, - {165 , 9728 , 330 , 9728 , 0 }, - {165 , 10240 , 330 , 1024 , 0 }, - {165 , 20480 , 330 , 1024 , 0 }, - {165 , 30720 , 330 , 1024 , 0 }, - {165 , 40960 , 330 , 1024 , 0 }, - {165 , 51200 , 330 , 1024 , 0 }, - {165 , 61440 , 330 , 1024 , 0 }, - {165 , 71680 , 330 , 1024 , 0 }, - {165 , 81920 , 330 , 1024 , 0 }, - {165 , 92160 , 330 , 1024 , 0 }, - {165 , 102400 , 330 , 1024 , 0 }, - {165 , 204800 , 330 , 1024 , 0 }, - {165 , 307200 , 330 , 1024 , 0 }, - {165 , 409600 , 330 , 1024 , 0 }, - {165 , 512000 , 330 , 1024 , 0 }, - {165 , 614400 , 330 , 1024 , 0 }, - {165 , 716800 , 330 , 1024 , 0 }, - {165 , 819200 , 330 , 1024 , 0 }, - {165 , 921600 , 330 , 1024 , 0 }, - {165 , 1024000, 330 , 1024 , 0 } +std::vector > sgemm_nn_mi100 = { + {3, 512, 1, 32, 0}, + {3, 1024, 1, 512, 0}, + {3, 1536, 1, 256, 0}, + {3, 2048, 1, 512, 0}, + {3, 2560, 1, 512, 0}, + {3, 3072, 1, 512, 0}, + {3, 3584, 1, 32, 1}, + {3, 4096, 1, 512, 0}, + {3, 4608, 1, 512, 0}, + {3, 5120, 1, 32, 1}, + {3, 5632, 1, 
32, 1}, + {3, 6144, 1, 512, 0}, + {3, 6656, 1, 32, 1}, + {3, 7168, 1, 32, 1}, + {3, 7680, 1, 32, 1}, + {3, 8192, 1, 32, 1}, + {3, 8704, 1, 32, 1}, + {3, 9216, 1, 32, 1}, + {3, 9728, 1, 512, 0}, + {3, 10240, 1, 32, 1}, + {3, 20480, 1, 64, 1}, + {3, 30720, 1, 64, 1}, + {3, 40960, 1, 256, 0}, + {3, 51200, 1, 256, 0}, + {3, 61440, 1, 256, 0}, + {3, 71680, 1, 256, 0}, + {3, 81920, 1, 256, 0}, + {3, 92160, 1, 2048, 1}, + {3, 102400, 1, 4096, 1}, + {3, 204800, 1, 2048, 1}, + {3, 307200, 1, 4096, 1}, + {3, 409600, 1, 2048, 1}, + {3, 512000, 1, 2048, 1}, + {3, 614400, 1, 2048, 1}, + {3, 716800, 1, 1024, 1}, + {3, 819200, 1, 2048, 1}, + {3, 921600, 1, 256, 0}, + {3, 1024000, 1, 256, 0}, + {4, 512, 1, 512, 0}, + {4, 1024, 1, 512, 0}, + {4, 1536, 1, 32, 0}, + {4, 2048, 1, 512, 0}, + {4, 2560, 1, 512, 0}, + {4, 3072, 1, 512, 0}, + {4, 3584, 1, 32, 1}, + {4, 4096, 1, 32, 1}, + {4, 4608, 1, 32, 1}, + {4, 5120, 1, 32, 1}, + {4, 5632, 1, 32, 1}, + {4, 6144, 1, 32, 1}, + {4, 6656, 1, 32, 1}, + {4, 7168, 1, 32, 1}, + {4, 7680, 1, 32, 1}, + {4, 8192, 1, 32, 1}, + {4, 8704, 1, 32, 1}, + {4, 9216, 1, 32, 1}, + {4, 9728, 1, 32, 1}, + {4, 10240, 1, 32, 1}, + {4, 20480, 1, 32, 1}, + {4, 30720, 1, 64, 1}, + {4, 40960, 1, 256, 0}, + {4, 51200, 1, 256, 0}, + {4, 61440, 1, 256, 0}, + {4, 71680, 1, 2048, 1}, + {4, 81920, 1, 1024, 1}, + {4, 92160, 1, 2048, 1}, + {4, 102400, 1, 256, 1}, + {4, 204800, 1, 4096, 1}, + {4, 307200, 1, 2048, 1}, + {4, 409600, 1, 8192, 1}, + {4, 512000, 1, 2048, 1}, + {4, 614400, 1, 2048, 1}, + {4, 716800, 1, 1024, 1}, + {4, 819200, 1, 8192, 1}, + {4, 921600, 1, 2048, 1}, + {4, 1024000, 1, 256, 0}, + {6, 512, 3, 64, 0}, + {6, 1024, 3, 1024, 0}, + {6, 1536, 3, 32, 1}, + {6, 2048, 3, 512, 0}, + {6, 2560, 3, 512, 0}, + {6, 3072, 3, 32, 1}, + {6, 3584, 3, 32, 1}, + {6, 4096, 3, 512, 0}, + {6, 4608, 3, 32, 1}, + {6, 5120, 3, 32, 1}, + {6, 5632, 3, 32, 1}, + {6, 6144, 3, 32, 1}, + {6, 6656, 3, 32, 1}, + {6, 7168, 3, 32, 1}, + {6, 7680, 3, 32, 1}, + {6, 8192, 3, 32, 1}, + {6, 
8704, 3, 32, 1}, + {6, 9216, 3, 32, 1}, + {6, 9728, 3, 32, 1}, + {6, 10240, 3, 32, 1}, + {6, 20480, 3, 32, 1}, + {6, 30720, 3, 256, 1}, + {6, 40960, 3, 256, 0}, + {6, 51200, 3, 1024, 1}, + {6, 61440, 3, 1024, 1}, + {6, 71680, 3, 2048, 1}, + {6, 81920, 3, 2048, 1}, + {6, 92160, 3, 256, 1}, + {6, 102400, 3, 2048, 1}, + {6, 204800, 3, 1024, 1}, + {6, 307200, 3, 2048, 1}, + {6, 409600, 3, 1024, 1}, + {6, 512000, 3, 512, 1}, + {6, 614400, 3, 1024, 1}, + {6, 716800, 3, 2048, 1}, + {6, 819200, 3, 512, 1}, + {6, 921600, 3, 1024, 1}, + {6, 1024000, 3, 1024, 1}, + {10, 512, 4, 512, 0}, + {10, 1024, 4, 64, 0}, + {10, 1536, 4, 512, 0}, + {10, 2048, 4, 512, 0}, + {10, 2560, 4, 128, 0}, + {10, 3072, 4, 32, 1}, + {10, 3584, 4, 32, 1}, + {10, 4096, 4, 64, 0}, + {10, 4608, 4, 32, 1}, + {10, 5120, 4, 512, 0}, + {10, 5632, 4, 512, 0}, + {10, 6144, 4, 32, 1}, + {10, 6656, 4, 32, 1}, + {10, 7168, 4, 32, 1}, + {10, 7680, 4, 32, 1}, + {10, 8192, 4, 32, 1}, + {10, 8704, 4, 32, 1}, + {10, 9216, 4, 32, 1}, + {10, 9728, 4, 32, 1}, + {10, 10240, 4, 512, 0}, + {10, 20480, 4, 64, 1}, + {10, 30720, 4, 512, 1}, + {10, 40960, 4, 512, 1}, + {10, 51200, 4, 256, 1}, + {10, 61440, 4, 512, 1}, + {10, 71680, 4, 512, 1}, + {10, 81920, 4, 1024, 1}, + {10, 92160, 4, 1024, 1}, + {10, 102400, 4, 512, 1}, + {10, 204800, 4, 4096, 1}, + {10, 307200, 4, 512, 1}, + {10, 409600, 4, 2048, 1}, + {10, 512000, 4, 4096, 1}, + {10, 614400, 4, 8192, 1}, + {10, 716800, 4, 1024, 1}, + {10, 819200, 4, 32768, 1}, + {10, 921600, 4, 4096, 1}, + {10, 1024000, 4, 8192, 1}, + {10, 512, 6, 32, 1}, + {10, 1024, 6, 32, 1}, + {10, 1536, 6, 32, 1}, + {10, 2048, 6, 32, 1}, + {10, 2560, 6, 32, 1}, + {10, 3072, 6, 32, 1}, + {10, 3584, 6, 32, 1}, + {10, 4096, 6, 32, 1}, + {10, 4608, 6, 32, 1}, + {10, 5120, 6, 32, 1}, + {10, 5632, 6, 32, 1}, + {10, 6144, 6, 32, 1}, + {10, 6656, 6, 32, 1}, + {10, 7168, 6, 32, 1}, + {10, 7680, 6, 64, 1}, + {10, 8192, 6, 32, 1}, + {10, 8704, 6, 32, 1}, + {10, 9216, 6, 32, 1}, + {10, 9728, 6, 32, 1}, + {10, 
10240, 6, 32, 1}, + {10, 20480, 6, 64, 1}, + {10, 30720, 6, 128, 1}, + {10, 40960, 6, 512, 1}, + {10, 51200, 6, 512, 1}, + {10, 61440, 6, 256, 1}, + {10, 71680, 6, 512, 1}, + {10, 81920, 6, 512, 1}, + {10, 92160, 6, 256, 1}, + {10, 102400, 6, 256, 1}, + {10, 204800, 6, 512, 1}, + {10, 307200, 6, 512, 1}, + {10, 409600, 6, 1024, 1}, + {10, 512000, 6, 1024, 1}, + {10, 614400, 6, 8192, 1}, + {10, 716800, 6, 4096, 1}, + {10, 819200, 6, 32768, 1}, + {10, 921600, 6, 4096, 1}, + {10, 1024000, 6, 256, 0}, + {15, 512, 12, 32, 1}, + {15, 1024, 12, 32, 1}, + {15, 1536, 12, 32, 1}, + {15, 2048, 12, 32, 1}, + {15, 2560, 12, 32, 1}, + {15, 3072, 12, 32, 1}, + {15, 3584, 12, 32, 1}, + {15, 4096, 12, 256, 1}, + {15, 4608, 12, 32, 1}, + {15, 5120, 12, 32, 1}, + {15, 5632, 12, 32, 1}, + {15, 6144, 12, 32, 1}, + {15, 6656, 12, 32, 1}, + {15, 7168, 12, 32, 1}, + {15, 7680, 12, 32, 1}, + {15, 8192, 12, 256, 1}, + {15, 8704, 12, 256, 1}, + {15, 9216, 12, 512, 1}, + {15, 9728, 12, 32, 1}, + {15, 10240, 12, 32, 1}, + {15, 20480, 12, 512, 1}, + {15, 30720, 12, 512, 1}, + {15, 40960, 12, 256, 1}, + {15, 51200, 12, 512, 1}, + {15, 61440, 12, 256, 1}, + {15, 71680, 12, 512, 1}, + {15, 81920, 12, 8192, 1}, + {15, 92160, 12, 512, 1}, + {15, 102400, 12, 512, 1}, + {15, 204800, 12, 64, 0}, + {15, 307200, 12, 128, 0}, + {15, 409600, 12, 64, 0}, + {15, 512000, 12, 64, 0}, + {15, 614400, 12, 128, 0}, + {15, 716800, 12, 128, 0}, + {15, 819200, 12, 128, 0}, + {15, 921600, 12, 64, 0}, + {15, 1024000, 12, 64, 0}, + {20, 512, 11, 128, 0}, + {20, 1024, 11, 128, 0}, + {20, 1536, 11, 32, 1}, + {20, 2048, 11, 32, 1}, + {20, 2560, 11, 32, 1}, + {20, 3072, 11, 32, 1}, + {20, 3584, 11, 32, 1}, + {20, 4096, 11, 32, 1}, + {20, 4608, 11, 32, 1}, + {20, 5120, 11, 32, 1}, + {20, 5632, 11, 32, 1}, + {20, 6144, 11, 32, 1}, + {20, 6656, 11, 32, 1}, + {20, 7168, 11, 32, 1}, + {20, 7680, 11, 32, 1}, + {20, 8192, 11, 32, 1}, + {20, 8704, 11, 32, 1}, + {20, 9216, 11, 32, 1}, + {20, 9728, 11, 32, 1}, + {20, 10240, 11, 32, 
1}, + {20, 20480, 11, 256, 1}, + {20, 30720, 11, 256, 1}, + {20, 40960, 11, 256, 1}, + {20, 51200, 11, 512, 1}, + {20, 61440, 11, 512, 1}, + {20, 71680, 11, 256, 1}, + {20, 81920, 11, 512, 1}, + {20, 92160, 11, 256, 1}, + {20, 102400, 11, 512, 1}, + {20, 204800, 11, 4096, 1}, + {20, 307200, 11, 4096, 1}, + {20, 409600, 11, 8192, 1}, + {20, 512000, 11, 4096, 1}, + {20, 614400, 11, 8192, 1}, + {20, 716800, 11, 4096, 1}, + {20, 819200, 11, 8192, 1}, + {20, 921600, 11, 4096, 1}, + {20, 1024000, 11, 8192, 1}, + {21, 512, 16, 128, 0}, + {21, 1024, 16, 128, 0}, + {21, 1536, 16, 64, 0}, + {21, 2048, 16, 128, 0}, + {21, 2560, 16, 32, 1}, + {21, 3072, 16, 32, 1}, + {21, 3584, 16, 32, 1}, + {21, 4096, 16, 256, 1}, + {21, 4608, 16, 64, 0}, + {21, 5120, 16, 128, 0}, + {21, 5632, 16, 32, 1}, + {21, 6144, 16, 32, 1}, + {21, 6656, 16, 32, 1}, + {21, 7168, 16, 32, 1}, + {21, 7680, 16, 32, 1}, + {21, 8192, 16, 128, 0}, + {21, 8704, 16, 64, 0}, + {21, 9216, 16, 64, 0}, + {21, 9728, 16, 128, 0}, + {21, 10240, 16, 128, 0}, + {21, 20480, 16, 512, 1}, + {21, 30720, 16, 256, 1}, + {21, 40960, 16, 512, 1}, + {21, 51200, 16, 256, 1}, + {21, 61440, 16, 512, 1}, + {21, 71680, 16, 512, 1}, + {21, 81920, 16, 512, 1}, + {21, 92160, 16, 512, 1}, + {21, 102400, 16, 512, 1}, + {21, 204800, 16, 512, 1}, + {21, 307200, 16, 128, 0}, + {21, 409600, 16, 64, 0}, + {21, 512000, 16, 128, 0}, + {21, 614400, 16, 64, 0}, + {21, 716800, 16, 64, 0}, + {21, 819200, 16, 64, 0}, + {21, 921600, 16, 128, 0}, + {21, 1024000, 16, 128, 0}, + {28, 512, 25, 128, 0}, + {28, 1024, 25, 128, 0}, + {28, 1536, 25, 128, 0}, + {28, 2048, 25, 128, 0}, + {28, 2560, 25, 128, 0}, + {28, 3072, 25, 128, 0}, + {28, 3584, 25, 128, 0}, + {28, 4096, 25, 128, 0}, + {28, 4608, 25, 128, 0}, + {28, 5120, 25, 128, 0}, + {28, 5632, 25, 128, 0}, + {28, 6144, 25, 128, 0}, + {28, 6656, 25, 32, 0}, + {28, 7168, 25, 128, 0}, + {28, 7680, 25, 64, 0}, + {28, 8192, 25, 32, 0}, + {28, 8704, 25, 256, 0}, + {28, 9216, 25, 256, 0}, + {28, 9728, 25, 32, 1}, 
+ {28, 10240, 25, 32, 1}, + {28, 20480, 25, 256, 1}, + {28, 30720, 25, 256, 1}, + {28, 40960, 25, 4096, 1}, + {28, 51200, 25, 256, 1}, + {28, 61440, 25, 512, 1}, + {28, 71680, 25, 512, 1}, + {28, 81920, 25, 4096, 1}, + {28, 92160, 25, 512, 1}, + {28, 102400, 25, 4096, 1}, + {28, 204800, 25, 64, 0}, + {28, 307200, 25, 2048, 0}, + {28, 409600, 25, 32, 0}, + {28, 512000, 25, 2048, 0}, + {28, 614400, 25, 2048, 0}, + {28, 716800, 25, 2048, 0}, + {28, 819200, 25, 2048, 0}, + {28, 921600, 25, 2048, 0}, + {28, 1024000, 25, 2048, 0}, + {35, 512, 24, 128, 0}, + {35, 1024, 24, 128, 0}, + {35, 1536, 24, 128, 0}, + {35, 2048, 24, 128, 0}, + {35, 2560, 24, 128, 0}, + {35, 3072, 24, 128, 0}, + {35, 3584, 24, 128, 0}, + {35, 4096, 24, 128, 0}, + {35, 4608, 24, 128, 0}, + {35, 5120, 24, 64, 0}, + {35, 5632, 24, 128, 0}, + {35, 6144, 24, 128, 0}, + {35, 6656, 24, 256, 0}, + {35, 7168, 24, 64, 0}, + {35, 7680, 24, 32, 1}, + {35, 8192, 24, 32, 0}, + {35, 8704, 24, 256, 0}, + {35, 9216, 24, 64, 0}, + {35, 9728, 24, 32, 1}, + {35, 10240, 24, 32, 1}, + {35, 20480, 24, 256, 1}, + {35, 30720, 24, 256, 1}, + {35, 40960, 24, 4096, 1}, + {35, 51200, 24, 256, 1}, + {35, 61440, 24, 256, 1}, + {35, 71680, 24, 256, 1}, + {35, 81920, 24, 256, 1}, + {35, 92160, 24, 512, 1}, + {35, 102400, 24, 4096, 0}, + {35, 204800, 24, 8192, 1}, + {35, 307200, 24, 4096, 0}, + {35, 409600, 24, 4096, 0}, + {35, 512000, 24, 4096, 0}, + {35, 614400, 24, 4096, 0}, + {35, 716800, 24, 4096, 0}, + {35, 819200, 24, 4096, 0}, + {35, 921600, 24, 4096, 0}, + {35, 1024000, 24, 4096, 0}, + {36, 512, 33, 128, 0}, + {36, 1024, 33, 128, 0}, + {36, 1536, 33, 128, 0}, + {36, 2048, 33, 128, 0}, + {36, 2560, 33, 128, 0}, + {36, 3072, 33, 128, 0}, + {36, 3584, 33, 128, 0}, + {36, 4096, 33, 128, 0}, + {36, 4608, 33, 128, 0}, + {36, 5120, 33, 128, 0}, + {36, 5632, 33, 128, 0}, + {36, 6144, 33, 128, 0}, + {36, 6656, 33, 64, 0}, + {36, 7168, 33, 256, 0}, + {36, 7680, 33, 64, 0}, + {36, 8192, 33, 256, 0}, + {36, 8704, 33, 64, 0}, + {36, 
9216, 33, 256, 0}, + {36, 9728, 33, 128, 0}, + {36, 10240, 33, 128, 0}, + {36, 20480, 33, 256, 1}, + {36, 30720, 33, 256, 1}, + {36, 40960, 33, 4096, 0}, + {36, 51200, 33, 512, 1}, + {36, 61440, 33, 4096, 0}, + {36, 71680, 33, 1024, 1}, + {36, 81920, 33, 4096, 0}, + {36, 92160, 33, 2048, 1}, + {36, 102400, 33, 4096, 0}, + {36, 204800, 33, 512, 1}, + {36, 307200, 33, 4096, 0}, + {36, 409600, 33, 8192, 0}, + {36, 512000, 33, 4096, 0}, + {36, 614400, 33, 8192, 0}, + {36, 716800, 33, 4096, 0}, + {36, 819200, 33, 8192, 0}, + {36, 921600, 33, 921600, 0}, + {36, 1024000, 33, 8192, 0}, + {45, 512, 42, 128, 0}, + {45, 1024, 42, 128, 0}, + {45, 1536, 42, 128, 0}, + {45, 2048, 42, 128, 0}, + {45, 2560, 42, 128, 0}, + {45, 3072, 42, 1024, 0}, + {45, 3584, 42, 128, 0}, + {45, 4096, 42, 128, 0}, + {45, 4608, 42, 128, 0}, + {45, 5120, 42, 128, 0}, + {45, 5632, 42, 32, 0}, + {45, 6144, 42, 32, 0}, + {45, 6656, 42, 64, 0}, + {45, 7168, 42, 64, 0}, + {45, 7680, 42, 32, 0}, + {45, 8192, 42, 128, 0}, + {45, 8704, 42, 64, 0}, + {45, 9216, 42, 64, 0}, + {45, 9728, 42, 64, 0}, + {45, 10240, 42, 64, 0}, + {45, 20480, 42, 1024, 1}, + {45, 30720, 42, 2048, 1}, + {45, 40960, 42, 64, 0}, + {45, 51200, 42, 64, 0}, + {45, 61440, 42, 64, 0}, + {45, 71680, 42, 64, 0}, + {45, 81920, 42, 64, 0}, + {45, 92160, 42, 64, 0}, + {45, 102400, 42, 4096, 0}, + {45, 204800, 42, 8192, 0}, + {45, 307200, 42, 64, 0}, + {45, 409600, 42, 8192, 0}, + {45, 512000, 42, 64, 0}, + {45, 614400, 42, 8192, 0}, + {45, 716800, 42, 64, 0}, + {45, 819200, 42, 8192, 0}, + {45, 921600, 42, 64, 0}, + {45, 1024000, 42, 8192, 0}, + {56, 512, 43, 128, 0}, + {56, 1024, 43, 128, 0}, + {56, 1536, 43, 128, 0}, + {56, 2048, 43, 128, 0}, + {56, 2560, 43, 128, 0}, + {56, 3072, 43, 128, 0}, + {56, 3584, 43, 128, 0}, + {56, 4096, 43, 32, 0}, + {56, 4608, 43, 32, 0}, + {56, 5120, 43, 32, 0}, + {56, 5632, 43, 32, 0}, + {56, 6144, 43, 32, 0}, + {56, 6656, 43, 32, 0}, + {56, 7168, 43, 32, 0}, + {56, 7680, 43, 32, 0}, + {56, 8192, 43, 32, 0}, + 
{56, 8704, 43, 32, 0}, + {56, 9216, 43, 32, 0}, + {56, 9728, 43, 32, 0}, + {56, 10240, 43, 32, 0}, + {56, 20480, 43, 64, 0}, + {56, 30720, 43, 32, 0}, + {56, 40960, 43, 32, 0}, + {56, 51200, 43, 32, 0}, + {56, 61440, 43, 32, 0}, + {56, 71680, 43, 32, 0}, + {56, 81920, 43, 32, 0}, + {56, 92160, 43, 32, 0}, + {56, 102400, 43, 32, 0}, + {56, 204800, 43, 32, 0}, + {56, 307200, 43, 32, 0}, + {56, 409600, 43, 32, 0}, + {56, 512000, 43, 32, 0}, + {56, 614400, 43, 32, 0}, + {56, 716800, 43, 32, 0}, + {56, 819200, 43, 32, 0}, + {56, 921600, 43, 32, 0}, + {56, 1024000, 43, 32, 0}, + {84, 512, 126, 64, 0}, + {84, 1024, 126, 64, 0}, + {84, 1536, 126, 64, 0}, + {84, 2048, 126, 64, 0}, + {84, 2560, 126, 64, 0}, + {84, 3072, 126, 64, 0}, + {84, 3584, 126, 128, 0}, + {84, 4096, 126, 4096, 0}, + {84, 4608, 126, 128, 0}, + {84, 5120, 126, 128, 0}, + {84, 5632, 126, 64, 0}, + {84, 6144, 126, 2048, 0}, + {84, 6656, 126, 128, 0}, + {84, 7168, 126, 128, 0}, + {84, 7680, 126, 128, 0}, + {84, 8192, 126, 2048, 0}, + {84, 8704, 126, 128, 0}, + {84, 9216, 126, 128, 0}, + {84, 9728, 126, 128, 0}, + {84, 10240, 126, 2048, 0}, + {84, 20480, 126, 128, 0}, + {84, 30720, 126, 2048, 0}, + {84, 40960, 126, 8192, 0}, + {84, 51200, 126, 2048, 0}, + {84, 61440, 126, 2048, 0}, + {84, 71680, 126, 2048, 0}, + {84, 81920, 126, 8192, 0}, + {84, 92160, 126, 2048, 0}, + {84, 102400, 126, 2048, 0}, + {84, 204800, 126, 8192, 0}, + {84, 307200, 126, 2048, 0}, + {84, 409600, 126, 8192, 0}, + {84, 512000, 126, 2048, 0}, + {84, 614400, 126, 8192, 0}, + {84, 716800, 126, 2048, 0}, + {84, 819200, 126, 8192, 0}, + {84, 921600, 126, 2048, 0}, + {84, 1024000, 126, 8192, 0}, + {120, 512, 210, 64, 0}, + {120, 1024, 210, 64, 0}, + {120, 1536, 210, 64, 0}, + {120, 2048, 210, 64, 0}, + {120, 2560, 210, 32, 0}, + {120, 3072, 210, 1024, 0}, + {120, 3584, 210, 64, 0}, + {120, 4096, 210, 4096, 0}, + {120, 4608, 210, 64, 0}, + {120, 5120, 210, 64, 0}, + {120, 5632, 210, 32, 0}, + {120, 6144, 210, 2048, 0}, + {120, 6656, 210, 64, 
0}, + {120, 7168, 210, 1024, 0}, + {120, 7680, 210, 64, 0}, + {120, 8192, 210, 4096, 0}, + {120, 8704, 210, 64, 0}, + {120, 9216, 210, 1024, 0}, + {120, 9728, 210, 64, 0}, + {120, 10240, 210, 2048, 0}, + {120, 20480, 210, 4096, 0}, + {120, 30720, 210, 2048, 0}, + {120, 40960, 210, 8192, 0}, + {120, 51200, 210, 2048, 0}, + {120, 61440, 210, 4096, 0}, + {120, 71680, 210, 2048, 0}, + {120, 81920, 210, 16384, 0}, + {120, 92160, 210, 2048, 0}, + {120, 102400, 210, 4096, 0}, + {120, 204800, 210, 8192, 0}, + {120, 307200, 210, 307200, 0}, + {120, 409600, 210, 16384, 0}, + {120, 512000, 210, 512000, 0}, + {120, 614400, 210, 8192, 0}, + {120, 716800, 210, 716800, 0}, + {120, 819200, 210, 16384, 0}, + {120, 921600, 210, 921600, 0}, + {120, 1024000, 210, 8192, 0}, + {165, 512, 330, 256, 0}, + {165, 1024, 330, 256, 0}, + {165, 1536, 330, 64, 0}, + {165, 2048, 330, 2048, 0}, + {165, 2560, 330, 512, 0}, + {165, 3072, 330, 1024, 0}, + {165, 3584, 330, 64, 0}, + {165, 4096, 330, 4096, 0}, + {165, 4608, 330, 64, 0}, + {165, 5120, 330, 1024, 0}, + {165, 5632, 330, 512, 0}, + {165, 6144, 330, 1024, 0}, + {165, 6656, 330, 512, 0}, + {165, 7168, 330, 1024, 0}, + {165, 7680, 330, 512, 0}, + {165, 8192, 330, 4096, 0}, + {165, 8704, 330, 8704, 0}, + {165, 9216, 330, 1024, 0}, + {165, 9728, 330, 9728, 0}, + {165, 10240, 330, 1024, 0}, + {165, 20480, 330, 1024, 0}, + {165, 30720, 330, 1024, 0}, + {165, 40960, 330, 1024, 0}, + {165, 51200, 330, 1024, 0}, + {165, 61440, 330, 1024, 0}, + {165, 71680, 330, 1024, 0}, + {165, 81920, 330, 1024, 0}, + {165, 92160, 330, 1024, 0}, + {165, 102400, 330, 1024, 0}, + {165, 204800, 330, 1024, 0}, + {165, 307200, 330, 1024, 0}, + {165, 409600, 330, 1024, 0}, + {165, 512000, 330, 1024, 0}, + {165, 614400, 330, 1024, 0}, + {165, 716800, 330, 1024, 0}, + {165, 819200, 330, 1024, 0}, + {165, 921600, 330, 1024, 0}, + {165, 1024000, 330, 1024, 0} }; //////////////////////////////////////////////////////////////////////////////// -std::vector< std::array > 
sgemm_tn_mi100 = -{ - {1 , 512 , 3 , 64 , 0 }, - {1 , 1024 , 3 , 1024 , 0 }, - {1 , 1536 , 3 , 64 , 0 }, - {1 , 2048 , 3 , 1024 , 0 }, - {1 , 2560 , 3 , 64 , 0 }, - {1 , 3072 , 3 , 64 , 0 }, - {1 , 3584 , 3 , 128 , 0 }, - {1 , 4096 , 3 , 2048 , 0 }, - {1 , 4608 , 3 , 128 , 0 }, - {1 , 5120 , 3 , 32 , 0 }, - {1 , 5632 , 3 , 64 , 0 }, - {1 , 6144 , 3 , 2048 , 0 }, - {1 , 6656 , 3 , 128 , 0 }, - {1 , 7168 , 3 , 64 , 0 }, - {1 , 7680 , 3 , 32 , 0 }, - {1 , 8192 , 3 , 32 , 1 }, - {1 , 8704 , 3 , 32 , 1 }, - {1 , 9216 , 3 , 32 , 1 }, - {1 , 9728 , 3 , 32 , 1 }, - {1 , 10240 , 3 , 32 , 1 }, - {1 , 20480 , 3 , 32 , 1 }, - {1 , 30720 , 3 , 64 , 1 }, - {1 , 40960 , 3 , 256 , 1 }, - {1 , 51200 , 3 , 64 , 0 }, - {1 , 61440 , 3 , 256 , 1 }, - {1 , 71680 , 3 , 512 , 1 }, - {1 , 81920 , 3 , 8192 , 1 }, - {1 , 92160 , 3 , 512 , 1 }, - {1 , 102400 , 3 , 4096 , 1 }, - {1 , 204800 , 3 , 8192 , 1 }, - {1 , 307200 , 3 , 4096 , 1 }, - {1 , 409600 , 3 , 8192 , 1 }, - {1 , 512000 , 3 , 1024 , 1 }, - {1 , 614400 , 3 , 2048 , 1 }, - {1 , 716800 , 3 , 1024 , 1 }, - {1 , 819200 , 3 , 1024 , 1 }, - {1 , 921600 , 3 , 2048 , 1 }, - {1 , 1024000, 3 , 8192 , 1 }, - {1 , 512 , 4 , 64 , 0 }, - {1 , 1024 , 4 , 1024 , 0 }, - {1 , 1536 , 4 , 128 , 0 }, - {1 , 2048 , 4 , 1024 , 0 }, - {1 , 2560 , 4 , 64 , 0 }, - {1 , 3072 , 4 , 128 , 0 }, - {1 , 3584 , 4 , 128 , 0 }, - {1 , 4096 , 4 , 128 , 0 }, - {1 , 4608 , 4 , 64 , 0 }, - {1 , 5120 , 4 , 64 , 0 }, - {1 , 5632 , 4 , 32 , 0 }, - {1 , 6144 , 4 , 2048 , 0 }, - {1 , 6656 , 4 , 64 , 0 }, - {1 , 7168 , 4 , 128 , 0 }, - {1 , 7680 , 4 , 32 , 0 }, - {1 , 8192 , 4 , 32 , 0 }, - {1 , 8704 , 4 , 64 , 0 }, - {1 , 9216 , 4 , 64 , 0 }, - {1 , 9728 , 4 , 32 , 0 }, - {1 , 10240 , 4 , 128 , 0 }, - {1 , 20480 , 4 , 64 , 0 }, - {1 , 30720 , 4 , 32 , 0 }, - {1 , 40960 , 4 , 128 , 0 }, - {1 , 51200 , 4 , 128 , 0 }, - {1 , 61440 , 4 , 128 , 0 }, - {1 , 71680 , 4 , 64 , 0 }, - {1 , 81920 , 4 , 4096 , 1 }, - {1 , 92160 , 4 , 64 , 0 }, - {1 , 102400 , 4 , 32 , 0 }, - {1 , 
204800 , 4 , 8192 , 1 }, - {1 , 307200 , 4 , 512 , 1 }, - {1 , 409600 , 4 , 4096 , 1 }, - {1 , 512000 , 4 , 1024 , 1 }, - {1 , 614400 , 4 , 4096 , 1 }, - {1 , 716800 , 4 , 1024 , 1 }, - {1 , 819200 , 4 , 1024 , 1 }, - {1 , 921600 , 4 , 1024 , 1 }, - {1 , 1024000, 4 , 2048 , 1 }, - {3 , 512 , 6 , 64 , 0 }, - {3 , 1024 , 6 , 1024 , 0 }, - {3 , 1536 , 6 , 64 , 0 }, - {3 , 2048 , 6 , 1024 , 0 }, - {3 , 2560 , 6 , 128 , 0 }, - {3 , 3072 , 6 , 32 , 0 }, - {3 , 3584 , 6 , 32 , 0 }, - {3 , 4096 , 6 , 2048 , 0 }, - {3 , 4608 , 6 , 32 , 0 }, - {3 , 5120 , 6 , 32 , 0 }, - {3 , 5632 , 6 , 128 , 0 }, - {3 , 6144 , 6 , 64 , 0 }, - {3 , 6656 , 6 , 128 , 0 }, - {3 , 7168 , 6 , 64 , 0 }, - {3 , 7680 , 6 , 128 , 0 }, - {3 , 8192 , 6 , 32 , 0 }, - {3 , 8704 , 6 , 64 , 0 }, - {3 , 9216 , 6 , 64 , 0 }, - {3 , 9728 , 6 , 128 , 0 }, - {3 , 10240 , 6 , 64 , 0 }, - {3 , 20480 , 6 , 128 , 0 }, - {3 , 30720 , 6 , 128 , 0 }, - {3 , 40960 , 6 , 32 , 0 }, - {3 , 51200 , 6 , 32 , 0 }, - {3 , 61440 , 6 , 32 , 0 }, - {3 , 71680 , 6 , 32 , 0 }, - {3 , 81920 , 6 , 128 , 0 }, - {3 , 92160 , 6 , 1024 , 1 }, - {3 , 102400 , 6 , 1024 , 1 }, - {3 , 204800 , 6 , 512 , 1 }, - {3 , 307200 , 6 , 512 , 1 }, - {3 , 409600 , 6 , 512 , 1 }, - {3 , 512000 , 6 , 512 , 1 }, - {3 , 614400 , 6 , 1024 , 1 }, - {3 , 716800 , 6 , 512 , 1 }, - {3 , 819200 , 6 , 8192 , 1 }, - {3 , 921600 , 6 , 1024 , 1 }, - {3 , 1024000, 6 , 1024 , 1 }, - {4 , 512 , 10 , 64 , 0 }, - {4 , 1024 , 10 , 1024 , 0 }, - {4 , 1536 , 10 , 64 , 0 }, - {4 , 2048 , 10 , 1024 , 0 }, - {4 , 2560 , 10 , 32 , 0 }, - {4 , 3072 , 10 , 1024 , 0 }, - {4 , 3584 , 10 , 128 , 0 }, - {4 , 4096 , 10 , 2048 , 0 }, - {4 , 4608 , 10 , 64 , 0 }, - {4 , 5120 , 10 , 32 , 0 }, - {4 , 5632 , 10 , 128 , 0 }, - {4 , 6144 , 10 , 2048 , 0 }, - {4 , 6656 , 10 , 32 , 0 }, - {4 , 7168 , 10 , 128 , 0 }, - {4 , 7680 , 10 , 128 , 0 }, - {4 , 8192 , 10 , 2048 , 0 }, - {4 , 8704 , 10 , 64 , 0 }, - {4 , 9216 , 10 , 128 , 0 }, - {4 , 9728 , 10 , 32 , 0 }, - {4 , 10240 , 10 , 32 , 0 }, 
- {4 , 20480 , 10 , 64 , 0 }, - {4 , 30720 , 10 , 128 , 0 }, - {4 , 40960 , 10 , 32 , 0 }, - {4 , 51200 , 10 , 32 , 0 }, - {4 , 61440 , 10 , 32 , 0 }, - {4 , 71680 , 10 , 512 , 1 }, - {4 , 81920 , 10 , 1024 , 1 }, - {4 , 92160 , 10 , 1024 , 1 }, - {4 , 102400 , 10 , 32 , 0 }, - {4 , 204800 , 10 , 8192 , 1 }, - {4 , 307200 , 10 , 512 , 1 }, - {4 , 409600 , 10 , 128 , 0 }, - {4 , 512000 , 10 , 128 , 0 }, - {4 , 614400 , 10 , 128 , 0 }, - {4 , 716800 , 10 , 128 , 0 }, - {4 , 819200 , 10 , 64 , 0 }, - {4 , 921600 , 10 , 64 , 0 }, - {4 , 1024000, 10 , 64 , 0 }, - {6 , 512 , 10 , 64 , 0 }, - {6 , 1024 , 10 , 1024 , 0 }, - {6 , 1536 , 10 , 32 , 1 }, - {6 , 2048 , 10 , 32 , 1 }, - {6 , 2560 , 10 , 32 , 1 }, - {6 , 3072 , 10 , 32 , 1 }, - {6 , 3584 , 10 , 32 , 1 }, - {6 , 4096 , 10 , 32 , 1 }, - {6 , 4608 , 10 , 32 , 1 }, - {6 , 5120 , 10 , 32 , 1 }, - {6 , 5632 , 10 , 32 , 1 }, - {6 , 6144 , 10 , 32 , 1 }, - {6 , 6656 , 10 , 32 , 1 }, - {6 , 7168 , 10 , 32 , 1 }, - {6 , 7680 , 10 , 32 , 1 }, - {6 , 8192 , 10 , 32 , 1 }, - {6 , 8704 , 10 , 32 , 1 }, - {6 , 9216 , 10 , 32 , 1 }, - {6 , 9728 , 10 , 32 , 1 }, - {6 , 10240 , 10 , 32 , 1 }, - {6 , 20480 , 10 , 32 , 1 }, - {6 , 30720 , 10 , 256 , 1 }, - {6 , 40960 , 10 , 128 , 0 }, - {6 , 51200 , 10 , 1024 , 1 }, - {6 , 61440 , 10 , 512 , 1 }, - {6 , 71680 , 10 , 512 , 1 }, - {6 , 81920 , 10 , 512 , 1 }, - {6 , 92160 , 10 , 512 , 1 }, - {6 , 102400 , 10 , 512 , 1 }, - {6 , 204800 , 10 , 512 , 1 }, - {6 , 307200 , 10 , 1024 , 1 }, - {6 , 409600 , 10 , 512 , 1 }, - {6 , 512000 , 10 , 64 , 0 }, - {6 , 614400 , 10 , 128 , 0 }, - {6 , 716800 , 10 , 32 , 0 }, - {6 , 819200 , 10 , 32 , 0 }, - {6 , 921600 , 10 , 64 , 0 }, - {6 , 1024000, 10 , 64 , 0 }, - {12 , 512 , 15 , 64 , 0 }, - {12 , 1024 , 15 , 1024 , 0 }, - {12 , 1536 , 15 , 32 , 1 }, - {12 , 2048 , 15 , 32 , 1 }, - {12 , 2560 , 15 , 32 , 1 }, - {12 , 3072 , 15 , 32 , 1 }, - {12 , 3584 , 15 , 64 , 1 }, - {12 , 4096 , 15 , 32 , 1 }, - {12 , 4608 , 15 , 32 , 1 }, - {12 , 5120 , 15 , 
32 , 1 }, - {12 , 5632 , 15 , 32 , 1 }, - {12 , 6144 , 15 , 128 , 1 }, - {12 , 6656 , 15 , 32 , 1 }, - {12 , 7168 , 15 , 32 , 1 }, - {12 , 7680 , 15 , 32 , 1 }, - {12 , 8192 , 15 , 32 , 1 }, - {12 , 8704 , 15 , 32 , 1 }, - {12 , 9216 , 15 , 32 , 1 }, - {12 , 9728 , 15 , 64 , 1 }, - {12 , 10240 , 15 , 32 , 1 }, - {12 , 20480 , 15 , 32 , 1 }, - {12 , 30720 , 15 , 32 , 1 }, - {12 , 40960 , 15 , 512 , 1 }, - {12 , 51200 , 15 , 512 , 1 }, - {12 , 61440 , 15 , 512 , 1 }, - {12 , 71680 , 15 , 512 , 1 }, - {12 , 81920 , 15 , 512 , 1 }, - {12 , 92160 , 15 , 512 , 1 }, - {12 , 102400 , 15 , 512 , 1 }, - {12 , 204800 , 15 , 512 , 1 }, - {12 , 307200 , 15 , 32 , 0 }, - {12 , 409600 , 15 , 32 , 0 }, - {12 , 512000 , 15 , 32 , 0 }, - {12 , 614400 , 15 , 32 , 0 }, - {12 , 716800 , 15 , 32 , 0 }, - {12 , 819200 , 15 , 64 , 0 }, - {12 , 921600 , 15 , 32 , 0 }, - {12 , 1024000, 15 , 32 , 0 }, - {11 , 512 , 20 , 64 , 0 }, - {11 , 1024 , 20 , 1024 , 0 }, - {11 , 1536 , 20 , 32 , 0 }, - {11 , 2048 , 20 , 1024 , 0 }, - {11 , 2560 , 20 , 32 , 0 }, - {11 , 3072 , 20 , 1024 , 0 }, - {11 , 3584 , 20 , 64 , 0 }, - {11 , 4096 , 20 , 32 , 0 }, - {11 , 4608 , 20 , 32 , 0 }, - {11 , 5120 , 20 , 64 , 0 }, - {11 , 5632 , 20 , 256 , 0 }, - {11 , 6144 , 20 , 2048 , 0 }, - {11 , 6656 , 20 , 32 , 0 }, - {11 , 7168 , 20 , 32 , 0 }, - {11 , 7680 , 20 , 64 , 0 }, - {11 , 8192 , 20 , 32 , 0 }, - {11 , 8704 , 20 , 32 , 0 }, - {11 , 9216 , 20 , 64 , 0 }, - {11 , 9728 , 20 , 64 , 0 }, - {11 , 10240 , 20 , 64 , 0 }, - {11 , 20480 , 20 , 32 , 0 }, - {11 , 30720 , 20 , 128 , 0 }, - {11 , 40960 , 20 , 32 , 0 }, - {11 , 51200 , 20 , 128 , 0 }, - {11 , 61440 , 20 , 128 , 0 }, - {11 , 71680 , 20 , 64 , 0 }, - {11 , 81920 , 20 , 32 , 0 }, - {11 , 92160 , 20 , 128 , 0 }, - {11 , 102400 , 20 , 64 , 0 }, - {11 , 204800 , 20 , 64 , 0 }, - {11 , 307200 , 20 , 32 , 0 }, - {11 , 409600 , 20 , 2048 , 0 }, - {11 , 512000 , 20 , 32 , 0 }, - {11 , 614400 , 20 , 2048 , 0 }, - {11 , 716800 , 20 , 2048 , 0 }, - {11 , 819200 , 20 
, 2048 , 0 }, - {11 , 921600 , 20 , 2048 , 0 }, - {11 , 1024000, 20 , 2048 , 0 }, - {16 , 512 , 21 , 64 , 0 }, - {16 , 1024 , 21 , 1024 , 0 }, - {16 , 1536 , 21 , 64 , 0 }, - {16 , 2048 , 21 , 2048 , 0 }, - {16 , 2560 , 21 , 32 , 0 }, - {16 , 3072 , 21 , 64 , 0 }, - {16 , 3584 , 21 , 32 , 1 }, - {16 , 4096 , 21 , 32 , 0 }, - {16 , 4608 , 21 , 32 , 0 }, - {16 , 5120 , 21 , 64 , 0 }, - {16 , 5632 , 21 , 32 , 0 }, - {16 , 6144 , 21 , 64 , 0 }, - {16 , 6656 , 21 , 64 , 0 }, - {16 , 7168 , 21 , 64 , 0 }, - {16 , 7680 , 21 , 64 , 0 }, - {16 , 8192 , 21 , 32 , 0 }, - {16 , 8704 , 21 , 64 , 0 }, - {16 , 9216 , 21 , 32 , 0 }, - {16 , 9728 , 21 , 64 , 0 }, - {16 , 10240 , 21 , 64 , 0 }, - {16 , 20480 , 21 , 32 , 0 }, - {16 , 30720 , 21 , 128 , 0 }, - {16 , 40960 , 21 , 32 , 0 }, - {16 , 51200 , 21 , 32 , 0 }, - {16 , 61440 , 21 , 2048 , 0 }, - {16 , 71680 , 21 , 2048 , 0 }, - {16 , 81920 , 21 , 64 , 0 }, - {16 , 92160 , 21 , 32 , 0 }, - {16 , 102400 , 21 , 64 , 0 }, - {16 , 204800 , 21 , 2048 , 0 }, - {16 , 307200 , 21 , 2048 , 0 }, - {16 , 409600 , 21 , 2048 , 0 }, - {16 , 512000 , 21 , 2048 , 0 }, - {16 , 614400 , 21 , 2048 , 0 }, - {16 , 716800 , 21 , 2048 , 0 }, - {16 , 819200 , 21 , 2048 , 0 }, - {16 , 921600 , 21 , 2048 , 0 }, - {16 , 1024000, 21 , 2048 , 0 }, - {25 , 512 , 28 , 256 , 0 }, - {25 , 1024 , 28 , 256 , 0 }, - {25 , 1536 , 28 , 64 , 0 }, - {25 , 2048 , 28 , 1024 , 0 }, - {25 , 2560 , 28 , 128 , 0 }, - {25 , 3072 , 28 , 256 , 0 }, - {25 , 3584 , 28 , 256 , 0 }, - {25 , 4096 , 28 , 256 , 0 }, - {25 , 4608 , 28 , 256 , 0 }, - {25 , 5120 , 28 , 256 , 0 }, - {25 , 5632 , 28 , 256 , 0 }, - {25 , 6144 , 28 , 64 , 0 }, - {25 , 6656 , 28 , 256 , 0 }, - {25 , 7168 , 28 , 256 , 0 }, - {25 , 7680 , 28 , 256 , 0 }, - {25 , 8192 , 28 , 64 , 0 }, - {25 , 8704 , 28 , 64 , 0 }, - {25 , 9216 , 28 , 32 , 0 }, - {25 , 9728 , 28 , 64 , 0 }, - {25 , 10240 , 28 , 32 , 0 }, - {25 , 20480 , 28 , 128 , 0 }, - {25 , 30720 , 28 , 128 , 0 }, - {25 , 40960 , 28 , 128 , 0 }, - {25 , 
51200 , 28 , 128 , 0 }, - {25 , 61440 , 28 , 128 , 0 }, - {25 , 71680 , 28 , 128 , 0 }, - {25 , 81920 , 28 , 128 , 0 }, - {25 , 92160 , 28 , 128 , 0 }, - {25 , 102400 , 28 , 128 , 0 }, - {25 , 204800 , 28 , 128 , 0 }, - {25 , 307200 , 28 , 128 , 0 }, - {25 , 409600 , 28 , 128 , 0 }, - {25 , 512000 , 28 , 128 , 0 }, - {25 , 614400 , 28 , 128 , 0 }, - {25 , 716800 , 28 , 128 , 0 }, - {25 , 819200 , 28 , 128 , 0 }, - {25 , 921600 , 28 , 128 , 0 }, - {25 , 1024000, 28 , 128 , 0 }, - {24 , 512 , 35 , 64 , 0 }, - {24 , 1024 , 35 , 1024 , 0 }, - {24 , 1536 , 35 , 64 , 0 }, - {24 , 2048 , 35 , 1024 , 0 }, - {24 , 2560 , 35 , 256 , 0 }, - {24 , 3072 , 35 , 64 , 0 }, - {24 , 3584 , 35 , 256 , 0 }, - {24 , 4096 , 35 , 2048 , 0 }, - {24 , 4608 , 35 , 256 , 0 }, - {24 , 5120 , 35 , 256 , 0 }, - {24 , 5632 , 35 , 64 , 0 }, - {24 , 6144 , 35 , 256 , 0 }, - {24 , 6656 , 35 , 256 , 0 }, - {24 , 7168 , 35 , 256 , 0 }, - {24 , 7680 , 35 , 128 , 0 }, - {24 , 8192 , 35 , 128 , 0 }, - {24 , 8704 , 35 , 64 , 0 }, - {24 , 9216 , 35 , 128 , 0 }, - {24 , 9728 , 35 , 32 , 0 }, - {24 , 10240 , 35 , 32 , 0 }, - {24 , 20480 , 35 , 32 , 0 }, - {24 , 30720 , 35 , 64 , 0 }, - {24 , 40960 , 35 , 64 , 0 }, - {24 , 51200 , 35 , 32 , 0 }, - {24 , 61440 , 35 , 128 , 0 }, - {24 , 71680 , 35 , 128 , 0 }, - {24 , 81920 , 35 , 128 , 0 }, - {24 , 92160 , 35 , 128 , 0 }, - {24 , 102400 , 35 , 128 , 0 }, - {24 , 204800 , 35 , 128 , 0 }, - {24 , 307200 , 35 , 128 , 0 }, - {24 , 409600 , 35 , 128 , 0 }, - {24 , 512000 , 35 , 128 , 0 }, - {24 , 614400 , 35 , 128 , 0 }, - {24 , 716800 , 35 , 128 , 0 }, - {24 , 819200 , 35 , 128 , 0 }, - {24 , 921600 , 35 , 128 , 0 }, - {24 , 1024000, 35 , 128 , 0 }, - {33 , 512 , 36 , 32 , 0 }, - {33 , 1024 , 36 , 64 , 0 }, - {33 , 1536 , 36 , 32 , 0 }, - {33 , 2048 , 36 , 2048 , 0 }, - {33 , 2560 , 36 , 256 , 0 }, - {33 , 3072 , 36 , 256 , 0 }, - {33 , 3584 , 36 , 256 , 0 }, - {33 , 4096 , 36 , 256 , 0 }, - {33 , 4608 , 36 , 32 , 0 }, - {33 , 5120 , 36 , 256 , 0 }, - {33 , 5632 
, 36 , 256 , 0 }, - {33 , 6144 , 36 , 256 , 0 }, - {33 , 6656 , 36 , 256 , 0 }, - {33 , 7168 , 36 , 256 , 0 }, - {33 , 7680 , 36 , 256 , 0 }, - {33 , 8192 , 36 , 64 , 0 }, - {33 , 8704 , 36 , 32 , 0 }, - {33 , 9216 , 36 , 128 , 0 }, - {33 , 9728 , 36 , 128 , 0 }, - {33 , 10240 , 36 , 128 , 0 }, - {33 , 20480 , 36 , 32 , 0 }, - {33 , 30720 , 36 , 256 , 1 }, - {33 , 40960 , 36 , 128 , 0 }, - {33 , 51200 , 36 , 128 , 0 }, - {33 , 61440 , 36 , 128 , 0 }, - {33 , 71680 , 36 , 128 , 0 }, - {33 , 81920 , 36 , 128 , 0 }, - {33 , 92160 , 36 , 128 , 0 }, - {33 , 102400 , 36 , 128 , 0 }, - {33 , 204800 , 36 , 8192 , 0 }, - {33 , 307200 , 36 , 128 , 0 }, - {33 , 409600 , 36 , 8192 , 0 }, - {33 , 512000 , 36 , 128 , 0 }, - {33 , 614400 , 36 , 8192 , 0 }, - {33 , 716800 , 36 , 128 , 0 }, - {33 , 819200 , 36 , 8192 , 0 }, - {33 , 921600 , 36 , 128 , 0 }, - {33 , 1024000, 36 , 8192 , 0 }, - {42 , 512 , 45 , 32 , 0 }, - {42 , 1024 , 45 , 512 , 0 }, - {42 , 1536 , 45 , 128 , 0 }, - {42 , 2048 , 45 , 512 , 0 }, - {42 , 2560 , 45 , 128 , 0 }, - {42 , 3072 , 45 , 128 , 0 }, - {42 , 3584 , 45 , 512 , 0 }, - {42 , 4096 , 45 , 512 , 0 }, - {42 , 4608 , 45 , 128 , 0 }, - {42 , 5120 , 45 , 32 , 0 }, - {42 , 5632 , 45 , 128 , 0 }, - {42 , 6144 , 45 , 32 , 0 }, - {42 , 6656 , 45 , 128 , 0 }, - {42 , 7168 , 45 , 128 , 0 }, - {42 , 7680 , 45 , 128 , 0 }, - {42 , 8192 , 45 , 128 , 0 }, - {42 , 8704 , 45 , 32 , 0 }, - {42 , 9216 , 45 , 128 , 0 }, - {42 , 9728 , 45 , 128 , 0 }, - {42 , 10240 , 45 , 128 , 0 }, - {42 , 20480 , 45 , 64 , 0 }, - {42 , 30720 , 45 , 64 , 0 }, - {42 , 40960 , 45 , 64 , 0 }, - {42 , 51200 , 45 , 128 , 0 }, - {42 , 61440 , 45 , 64 , 0 }, - {42 , 71680 , 45 , 128 , 0 }, - {42 , 81920 , 45 , 64 , 0 }, - {42 , 92160 , 45 , 64 , 0 }, - {42 , 102400 , 45 , 64 , 0 }, - {42 , 204800 , 45 , 8192 , 0 }, - {42 , 307200 , 45 , 64 , 0 }, - {42 , 409600 , 45 , 8192 , 0 }, - {42 , 512000 , 45 , 64 , 0 }, - {42 , 614400 , 45 , 8192 , 0 }, - {42 , 716800 , 45 , 64 , 0 }, - {42 , 819200 , 
45 , 8192 , 0 }, - {42 , 921600 , 45 , 64 , 0 }, - {42 , 1024000, 45 , 8192 , 0 }, - {43 , 512 , 56 , 128 , 0 }, - {43 , 1024 , 56 , 1024 , 0 }, - {43 , 1536 , 56 , 512 , 0 }, - {43 , 2048 , 56 , 1024 , 0 }, - {43 , 2560 , 56 , 128 , 0 }, - {43 , 3072 , 56 , 128 , 0 }, - {43 , 3584 , 56 , 128 , 0 }, - {43 , 4096 , 56 , 128 , 0 }, - {43 , 4608 , 56 , 128 , 0 }, - {43 , 5120 , 56 , 128 , 0 }, - {43 , 5632 , 56 , 128 , 0 }, - {43 , 6144 , 56 , 128 , 0 }, - {43 , 6656 , 56 , 128 , 0 }, - {43 , 7168 , 56 , 128 , 0 }, - {43 , 7680 , 56 , 128 , 0 }, - {43 , 8192 , 56 , 128 , 0 }, - {43 , 8704 , 56 , 128 , 0 }, - {43 , 9216 , 56 , 128 , 0 }, - {43 , 9728 , 56 , 128 , 0 }, - {43 , 10240 , 56 , 128 , 0 }, - {43 , 20480 , 56 , 128 , 0 }, - {43 , 30720 , 56 , 128 , 0 }, - {43 , 40960 , 56 , 128 , 0 }, - {43 , 51200 , 56 , 64 , 0 }, - {43 , 61440 , 56 , 128 , 0 }, - {43 , 71680 , 56 , 64 , 0 }, - {43 , 81920 , 56 , 8192 , 0 }, - {43 , 92160 , 56 , 64 , 0 }, - {43 , 102400 , 56 , 128 , 0 }, - {43 , 204800 , 56 , 8192 , 0 }, - {43 , 307200 , 56 , 64 , 0 }, - {43 , 409600 , 56 , 8192 , 0 }, - {43 , 512000 , 56 , 64 , 0 }, - {43 , 614400 , 56 , 8192 , 0 }, - {43 , 716800 , 56 , 64 , 0 }, - {43 , 819200 , 56 , 8192 , 0 }, - {43 , 921600 , 56 , 64 , 0 }, - {43 , 1024000, 56 , 8192 , 0 }, - {126 , 512 , 84 , 64 , 0 }, - {126 , 1024 , 84 , 64 , 0 }, - {126 , 1536 , 84 , 64 , 0 }, - {126 , 2048 , 84 , 64 , 0 }, - {126 , 2560 , 84 , 32 , 0 }, - {126 , 3072 , 84 , 32 , 0 }, - {126 , 3584 , 84 , 32 , 0 }, - {126 , 4096 , 84 , 64 , 0 }, - {126 , 4608 , 84 , 128 , 0 }, - {126 , 5120 , 84 , 32 , 0 }, - {126 , 5632 , 84 , 32 , 0 }, - {126 , 6144 , 84 , 128 , 0 }, - {126 , 6656 , 84 , 128 , 0 }, - {126 , 7168 , 84 , 128 , 0 }, - {126 , 7680 , 84 , 128 , 0 }, - {126 , 8192 , 84 , 8192 , 0 }, - {126 , 8704 , 84 , 128 , 0 }, - {126 , 9216 , 84 , 32 , 0 }, - {126 , 9728 , 84 , 128 , 0 }, - {126 , 10240 , 84 , 128 , 0 }, - {126 , 20480 , 84 , 4096 , 0 }, - {126 , 30720 , 84 , 128 , 0 }, - {126 , 
40960 , 84 , 8192 , 0 }, - {126 , 51200 , 84 , 128 , 0 }, - {126 , 61440 , 84 , 4096 , 0 }, - {126 , 71680 , 84 , 128 , 0 }, - {126 , 81920 , 84 , 8192 , 0 }, - {126 , 92160 , 84 , 128 , 0 }, - {126 , 102400 , 84 , 4096 , 0 }, - {126 , 204800 , 84 , 8192 , 0 }, - {126 , 307200 , 84 , 307200 , 0 }, - {126 , 409600 , 84 , 8192 , 0 }, - {126 , 512000 , 84 , 512000 , 0 }, - {126 , 614400 , 84 , 8192 , 0 }, - {126 , 716800 , 84 , 716800 , 0 }, - {126 , 819200 , 84 , 8192 , 0 }, - {126 , 921600 , 84 , 921600 , 0 }, - {126 , 1024000, 84 , 8192 , 0 }, - {210 , 512 , 120 , 64 , 0 }, - {210 , 1024 , 120 , 32 , 0 }, - {210 , 1536 , 120 , 512 , 0 }, - {210 , 2048 , 120 , 32 , 0 }, - {210 , 2560 , 120 , 512 , 0 }, - {210 , 3072 , 120 , 32 , 0 }, - {210 , 3584 , 120 , 32 , 0 }, - {210 , 4096 , 120 , 4096 , 0 }, - {210 , 4608 , 120 , 512 , 0 }, - {210 , 5120 , 120 , 512 , 0 }, - {210 , 5632 , 120 , 512 , 0 }, - {210 , 6144 , 120 , 1024 , 0 }, - {210 , 6656 , 120 , 32 , 0 }, - {210 , 7168 , 120 , 512 , 0 }, - {210 , 7680 , 120 , 512 , 0 }, - {210 , 8192 , 120 , 8192 , 0 }, - {210 , 8704 , 120 , 512 , 0 }, - {210 , 9216 , 120 , 512 , 0 }, - {210 , 9728 , 120 , 512 , 0 }, - {210 , 10240 , 120 , 1024 , 0 }, - {210 , 20480 , 120 , 4096 , 0 }, - {210 , 30720 , 120 , 1024 , 0 }, - {210 , 40960 , 120 , 8192 , 0 }, - {210 , 51200 , 120 , 1024 , 0 }, - {210 , 61440 , 120 , 4096 , 0 }, - {210 , 71680 , 120 , 1024 , 0 }, - {210 , 81920 , 120 , 8192 , 0 }, - {210 , 92160 , 120 , 1024 , 0 }, - {210 , 102400 , 120 , 4096 , 0 }, - {210 , 204800 , 120 , 8192 , 0 }, - {210 , 307200 , 120 , 307200 , 0 }, - {210 , 409600 , 120 , 409600 , 0 }, - {210 , 512000 , 120 , 512000 , 0 }, - {210 , 614400 , 120 , 614400 , 0 }, - {210 , 716800 , 120 , 716800 , 0 }, - {210 , 819200 , 120 , 32768 , 0 }, - {210 , 921600 , 120 , 921600 , 0 }, - {210 , 1024000, 120 , 1024000, 0 }, - {330 , 512 , 165 , 512 , 0 }, - {330 , 1024 , 165 , 1024 , 0 }, - {330 , 1536 , 165 , 128 , 0 }, - {330 , 2048 , 165 , 128 , 0 }, - 
{330 , 2560 , 165 , 256 , 0 }, - {330 , 3072 , 165 , 256 , 0 }, - {330 , 3584 , 165 , 256 , 0 }, - {330 , 4096 , 165 , 128 , 0 }, - {330 , 4608 , 165 , 256 , 0 }, - {330 , 5120 , 165 , 128 , 0 }, - {330 , 5632 , 165 , 128 , 0 }, - {330 , 6144 , 165 , 256 , 0 }, - {330 , 6656 , 165 , 128 , 0 }, - {330 , 7168 , 165 , 128 , 0 }, - {330 , 7680 , 165 , 256 , 0 }, - {330 , 8192 , 165 , 8192 , 0 }, - {330 , 8704 , 165 , 128 , 0 }, - {330 , 9216 , 165 , 256 , 0 }, - {330 , 9728 , 165 , 128 , 0 }, - {330 , 10240 , 165 , 256 , 0 }, - {330 , 20480 , 165 , 256 , 0 }, - {330 , 30720 , 165 , 256 , 0 }, - {330 , 40960 , 165 , 8192 , 0 }, - {330 , 51200 , 165 , 256 , 0 }, - {330 , 61440 , 165 , 256 , 0 }, - {330 , 71680 , 165 , 71680 , 0 }, - {330 , 81920 , 165 , 81920 , 0 }, - {330 , 92160 , 165 , 92160 , 0 }, - {330 , 102400 , 165 , 102400 , 0 }, - {330 , 204800 , 165 , 204800 , 0 }, - {330 , 307200 , 165 , 307200 , 0 }, - {330 , 409600 , 165 , 409600 , 0 }, - {330 , 512000 , 165 , 512000 , 0 }, - {330 , 614400 , 165 , 614400 , 0 }, - {330 , 716800 , 165 , 716800 , 0 }, - {330 , 819200 , 165 , 819200 , 0 }, - {330 , 921600 , 165 , 921600 , 0 }, - {330 , 1024000, 165 , 1024000, 0 } +std::vector > sgemm_tn_mi100 = { + {1, 512, 3, 64, 0}, + {1, 1024, 3, 1024, 0}, + {1, 1536, 3, 64, 0}, + {1, 2048, 3, 1024, 0}, + {1, 2560, 3, 64, 0}, + {1, 3072, 3, 64, 0}, + {1, 3584, 3, 128, 0}, + {1, 4096, 3, 2048, 0}, + {1, 4608, 3, 128, 0}, + {1, 5120, 3, 32, 0}, + {1, 5632, 3, 64, 0}, + {1, 6144, 3, 2048, 0}, + {1, 6656, 3, 128, 0}, + {1, 7168, 3, 64, 0}, + {1, 7680, 3, 32, 0}, + {1, 8192, 3, 32, 1}, + {1, 8704, 3, 32, 1}, + {1, 9216, 3, 32, 1}, + {1, 9728, 3, 32, 1}, + {1, 10240, 3, 32, 1}, + {1, 20480, 3, 32, 1}, + {1, 30720, 3, 64, 1}, + {1, 40960, 3, 256, 1}, + {1, 51200, 3, 64, 0}, + {1, 61440, 3, 256, 1}, + {1, 71680, 3, 512, 1}, + {1, 81920, 3, 8192, 1}, + {1, 92160, 3, 512, 1}, + {1, 102400, 3, 4096, 1}, + {1, 204800, 3, 8192, 1}, + {1, 307200, 3, 4096, 1}, + {1, 409600, 3, 8192, 1}, + 
{1, 512000, 3, 1024, 1}, + {1, 614400, 3, 2048, 1}, + {1, 716800, 3, 1024, 1}, + {1, 819200, 3, 1024, 1}, + {1, 921600, 3, 2048, 1}, + {1, 1024000, 3, 8192, 1}, + {1, 512, 4, 64, 0}, + {1, 1024, 4, 1024, 0}, + {1, 1536, 4, 128, 0}, + {1, 2048, 4, 1024, 0}, + {1, 2560, 4, 64, 0}, + {1, 3072, 4, 128, 0}, + {1, 3584, 4, 128, 0}, + {1, 4096, 4, 128, 0}, + {1, 4608, 4, 64, 0}, + {1, 5120, 4, 64, 0}, + {1, 5632, 4, 32, 0}, + {1, 6144, 4, 2048, 0}, + {1, 6656, 4, 64, 0}, + {1, 7168, 4, 128, 0}, + {1, 7680, 4, 32, 0}, + {1, 8192, 4, 32, 0}, + {1, 8704, 4, 64, 0}, + {1, 9216, 4, 64, 0}, + {1, 9728, 4, 32, 0}, + {1, 10240, 4, 128, 0}, + {1, 20480, 4, 64, 0}, + {1, 30720, 4, 32, 0}, + {1, 40960, 4, 128, 0}, + {1, 51200, 4, 128, 0}, + {1, 61440, 4, 128, 0}, + {1, 71680, 4, 64, 0}, + {1, 81920, 4, 4096, 1}, + {1, 92160, 4, 64, 0}, + {1, 102400, 4, 32, 0}, + {1, 204800, 4, 8192, 1}, + {1, 307200, 4, 512, 1}, + {1, 409600, 4, 4096, 1}, + {1, 512000, 4, 1024, 1}, + {1, 614400, 4, 4096, 1}, + {1, 716800, 4, 1024, 1}, + {1, 819200, 4, 1024, 1}, + {1, 921600, 4, 1024, 1}, + {1, 1024000, 4, 2048, 1}, + {3, 512, 6, 64, 0}, + {3, 1024, 6, 1024, 0}, + {3, 1536, 6, 64, 0}, + {3, 2048, 6, 1024, 0}, + {3, 2560, 6, 128, 0}, + {3, 3072, 6, 32, 0}, + {3, 3584, 6, 32, 0}, + {3, 4096, 6, 2048, 0}, + {3, 4608, 6, 32, 0}, + {3, 5120, 6, 32, 0}, + {3, 5632, 6, 128, 0}, + {3, 6144, 6, 64, 0}, + {3, 6656, 6, 128, 0}, + {3, 7168, 6, 64, 0}, + {3, 7680, 6, 128, 0}, + {3, 8192, 6, 32, 0}, + {3, 8704, 6, 64, 0}, + {3, 9216, 6, 64, 0}, + {3, 9728, 6, 128, 0}, + {3, 10240, 6, 64, 0}, + {3, 20480, 6, 128, 0}, + {3, 30720, 6, 128, 0}, + {3, 40960, 6, 32, 0}, + {3, 51200, 6, 32, 0}, + {3, 61440, 6, 32, 0}, + {3, 71680, 6, 32, 0}, + {3, 81920, 6, 128, 0}, + {3, 92160, 6, 1024, 1}, + {3, 102400, 6, 1024, 1}, + {3, 204800, 6, 512, 1}, + {3, 307200, 6, 512, 1}, + {3, 409600, 6, 512, 1}, + {3, 512000, 6, 512, 1}, + {3, 614400, 6, 1024, 1}, + {3, 716800, 6, 512, 1}, + {3, 819200, 6, 8192, 1}, + {3, 921600, 6, 1024, 
1}, + {3, 1024000, 6, 1024, 1}, + {4, 512, 10, 64, 0}, + {4, 1024, 10, 1024, 0}, + {4, 1536, 10, 64, 0}, + {4, 2048, 10, 1024, 0}, + {4, 2560, 10, 32, 0}, + {4, 3072, 10, 1024, 0}, + {4, 3584, 10, 128, 0}, + {4, 4096, 10, 2048, 0}, + {4, 4608, 10, 64, 0}, + {4, 5120, 10, 32, 0}, + {4, 5632, 10, 128, 0}, + {4, 6144, 10, 2048, 0}, + {4, 6656, 10, 32, 0}, + {4, 7168, 10, 128, 0}, + {4, 7680, 10, 128, 0}, + {4, 8192, 10, 2048, 0}, + {4, 8704, 10, 64, 0}, + {4, 9216, 10, 128, 0}, + {4, 9728, 10, 32, 0}, + {4, 10240, 10, 32, 0}, + {4, 20480, 10, 64, 0}, + {4, 30720, 10, 128, 0}, + {4, 40960, 10, 32, 0}, + {4, 51200, 10, 32, 0}, + {4, 61440, 10, 32, 0}, + {4, 71680, 10, 512, 1}, + {4, 81920, 10, 1024, 1}, + {4, 92160, 10, 1024, 1}, + {4, 102400, 10, 32, 0}, + {4, 204800, 10, 8192, 1}, + {4, 307200, 10, 512, 1}, + {4, 409600, 10, 128, 0}, + {4, 512000, 10, 128, 0}, + {4, 614400, 10, 128, 0}, + {4, 716800, 10, 128, 0}, + {4, 819200, 10, 64, 0}, + {4, 921600, 10, 64, 0}, + {4, 1024000, 10, 64, 0}, + {6, 512, 10, 64, 0}, + {6, 1024, 10, 1024, 0}, + {6, 1536, 10, 32, 1}, + {6, 2048, 10, 32, 1}, + {6, 2560, 10, 32, 1}, + {6, 3072, 10, 32, 1}, + {6, 3584, 10, 32, 1}, + {6, 4096, 10, 32, 1}, + {6, 4608, 10, 32, 1}, + {6, 5120, 10, 32, 1}, + {6, 5632, 10, 32, 1}, + {6, 6144, 10, 32, 1}, + {6, 6656, 10, 32, 1}, + {6, 7168, 10, 32, 1}, + {6, 7680, 10, 32, 1}, + {6, 8192, 10, 32, 1}, + {6, 8704, 10, 32, 1}, + {6, 9216, 10, 32, 1}, + {6, 9728, 10, 32, 1}, + {6, 10240, 10, 32, 1}, + {6, 20480, 10, 32, 1}, + {6, 30720, 10, 256, 1}, + {6, 40960, 10, 128, 0}, + {6, 51200, 10, 1024, 1}, + {6, 61440, 10, 512, 1}, + {6, 71680, 10, 512, 1}, + {6, 81920, 10, 512, 1}, + {6, 92160, 10, 512, 1}, + {6, 102400, 10, 512, 1}, + {6, 204800, 10, 512, 1}, + {6, 307200, 10, 1024, 1}, + {6, 409600, 10, 512, 1}, + {6, 512000, 10, 64, 0}, + {6, 614400, 10, 128, 0}, + {6, 716800, 10, 32, 0}, + {6, 819200, 10, 32, 0}, + {6, 921600, 10, 64, 0}, + {6, 1024000, 10, 64, 0}, + {12, 512, 15, 64, 0}, + {12, 1024, 
15, 1024, 0}, + {12, 1536, 15, 32, 1}, + {12, 2048, 15, 32, 1}, + {12, 2560, 15, 32, 1}, + {12, 3072, 15, 32, 1}, + {12, 3584, 15, 64, 1}, + {12, 4096, 15, 32, 1}, + {12, 4608, 15, 32, 1}, + {12, 5120, 15, 32, 1}, + {12, 5632, 15, 32, 1}, + {12, 6144, 15, 128, 1}, + {12, 6656, 15, 32, 1}, + {12, 7168, 15, 32, 1}, + {12, 7680, 15, 32, 1}, + {12, 8192, 15, 32, 1}, + {12, 8704, 15, 32, 1}, + {12, 9216, 15, 32, 1}, + {12, 9728, 15, 64, 1}, + {12, 10240, 15, 32, 1}, + {12, 20480, 15, 32, 1}, + {12, 30720, 15, 32, 1}, + {12, 40960, 15, 512, 1}, + {12, 51200, 15, 512, 1}, + {12, 61440, 15, 512, 1}, + {12, 71680, 15, 512, 1}, + {12, 81920, 15, 512, 1}, + {12, 92160, 15, 512, 1}, + {12, 102400, 15, 512, 1}, + {12, 204800, 15, 512, 1}, + {12, 307200, 15, 32, 0}, + {12, 409600, 15, 32, 0}, + {12, 512000, 15, 32, 0}, + {12, 614400, 15, 32, 0}, + {12, 716800, 15, 32, 0}, + {12, 819200, 15, 64, 0}, + {12, 921600, 15, 32, 0}, + {12, 1024000, 15, 32, 0}, + {11, 512, 20, 64, 0}, + {11, 1024, 20, 1024, 0}, + {11, 1536, 20, 32, 0}, + {11, 2048, 20, 1024, 0}, + {11, 2560, 20, 32, 0}, + {11, 3072, 20, 1024, 0}, + {11, 3584, 20, 64, 0}, + {11, 4096, 20, 32, 0}, + {11, 4608, 20, 32, 0}, + {11, 5120, 20, 64, 0}, + {11, 5632, 20, 256, 0}, + {11, 6144, 20, 2048, 0}, + {11, 6656, 20, 32, 0}, + {11, 7168, 20, 32, 0}, + {11, 7680, 20, 64, 0}, + {11, 8192, 20, 32, 0}, + {11, 8704, 20, 32, 0}, + {11, 9216, 20, 64, 0}, + {11, 9728, 20, 64, 0}, + {11, 10240, 20, 64, 0}, + {11, 20480, 20, 32, 0}, + {11, 30720, 20, 128, 0}, + {11, 40960, 20, 32, 0}, + {11, 51200, 20, 128, 0}, + {11, 61440, 20, 128, 0}, + {11, 71680, 20, 64, 0}, + {11, 81920, 20, 32, 0}, + {11, 92160, 20, 128, 0}, + {11, 102400, 20, 64, 0}, + {11, 204800, 20, 64, 0}, + {11, 307200, 20, 32, 0}, + {11, 409600, 20, 2048, 0}, + {11, 512000, 20, 32, 0}, + {11, 614400, 20, 2048, 0}, + {11, 716800, 20, 2048, 0}, + {11, 819200, 20, 2048, 0}, + {11, 921600, 20, 2048, 0}, + {11, 1024000, 20, 2048, 0}, + {16, 512, 21, 64, 0}, + {16, 1024, 21, 
1024, 0}, + {16, 1536, 21, 64, 0}, + {16, 2048, 21, 2048, 0}, + {16, 2560, 21, 32, 0}, + {16, 3072, 21, 64, 0}, + {16, 3584, 21, 32, 1}, + {16, 4096, 21, 32, 0}, + {16, 4608, 21, 32, 0}, + {16, 5120, 21, 64, 0}, + {16, 5632, 21, 32, 0}, + {16, 6144, 21, 64, 0}, + {16, 6656, 21, 64, 0}, + {16, 7168, 21, 64, 0}, + {16, 7680, 21, 64, 0}, + {16, 8192, 21, 32, 0}, + {16, 8704, 21, 64, 0}, + {16, 9216, 21, 32, 0}, + {16, 9728, 21, 64, 0}, + {16, 10240, 21, 64, 0}, + {16, 20480, 21, 32, 0}, + {16, 30720, 21, 128, 0}, + {16, 40960, 21, 32, 0}, + {16, 51200, 21, 32, 0}, + {16, 61440, 21, 2048, 0}, + {16, 71680, 21, 2048, 0}, + {16, 81920, 21, 64, 0}, + {16, 92160, 21, 32, 0}, + {16, 102400, 21, 64, 0}, + {16, 204800, 21, 2048, 0}, + {16, 307200, 21, 2048, 0}, + {16, 409600, 21, 2048, 0}, + {16, 512000, 21, 2048, 0}, + {16, 614400, 21, 2048, 0}, + {16, 716800, 21, 2048, 0}, + {16, 819200, 21, 2048, 0}, + {16, 921600, 21, 2048, 0}, + {16, 1024000, 21, 2048, 0}, + {25, 512, 28, 256, 0}, + {25, 1024, 28, 256, 0}, + {25, 1536, 28, 64, 0}, + {25, 2048, 28, 1024, 0}, + {25, 2560, 28, 128, 0}, + {25, 3072, 28, 256, 0}, + {25, 3584, 28, 256, 0}, + {25, 4096, 28, 256, 0}, + {25, 4608, 28, 256, 0}, + {25, 5120, 28, 256, 0}, + {25, 5632, 28, 256, 0}, + {25, 6144, 28, 64, 0}, + {25, 6656, 28, 256, 0}, + {25, 7168, 28, 256, 0}, + {25, 7680, 28, 256, 0}, + {25, 8192, 28, 64, 0}, + {25, 8704, 28, 64, 0}, + {25, 9216, 28, 32, 0}, + {25, 9728, 28, 64, 0}, + {25, 10240, 28, 32, 0}, + {25, 20480, 28, 128, 0}, + {25, 30720, 28, 128, 0}, + {25, 40960, 28, 128, 0}, + {25, 51200, 28, 128, 0}, + {25, 61440, 28, 128, 0}, + {25, 71680, 28, 128, 0}, + {25, 81920, 28, 128, 0}, + {25, 92160, 28, 128, 0}, + {25, 102400, 28, 128, 0}, + {25, 204800, 28, 128, 0}, + {25, 307200, 28, 128, 0}, + {25, 409600, 28, 128, 0}, + {25, 512000, 28, 128, 0}, + {25, 614400, 28, 128, 0}, + {25, 716800, 28, 128, 0}, + {25, 819200, 28, 128, 0}, + {25, 921600, 28, 128, 0}, + {25, 1024000, 28, 128, 0}, + {24, 512, 35, 64, 0}, 
+ {24, 1024, 35, 1024, 0}, + {24, 1536, 35, 64, 0}, + {24, 2048, 35, 1024, 0}, + {24, 2560, 35, 256, 0}, + {24, 3072, 35, 64, 0}, + {24, 3584, 35, 256, 0}, + {24, 4096, 35, 2048, 0}, + {24, 4608, 35, 256, 0}, + {24, 5120, 35, 256, 0}, + {24, 5632, 35, 64, 0}, + {24, 6144, 35, 256, 0}, + {24, 6656, 35, 256, 0}, + {24, 7168, 35, 256, 0}, + {24, 7680, 35, 128, 0}, + {24, 8192, 35, 128, 0}, + {24, 8704, 35, 64, 0}, + {24, 9216, 35, 128, 0}, + {24, 9728, 35, 32, 0}, + {24, 10240, 35, 32, 0}, + {24, 20480, 35, 32, 0}, + {24, 30720, 35, 64, 0}, + {24, 40960, 35, 64, 0}, + {24, 51200, 35, 32, 0}, + {24, 61440, 35, 128, 0}, + {24, 71680, 35, 128, 0}, + {24, 81920, 35, 128, 0}, + {24, 92160, 35, 128, 0}, + {24, 102400, 35, 128, 0}, + {24, 204800, 35, 128, 0}, + {24, 307200, 35, 128, 0}, + {24, 409600, 35, 128, 0}, + {24, 512000, 35, 128, 0}, + {24, 614400, 35, 128, 0}, + {24, 716800, 35, 128, 0}, + {24, 819200, 35, 128, 0}, + {24, 921600, 35, 128, 0}, + {24, 1024000, 35, 128, 0}, + {33, 512, 36, 32, 0}, + {33, 1024, 36, 64, 0}, + {33, 1536, 36, 32, 0}, + {33, 2048, 36, 2048, 0}, + {33, 2560, 36, 256, 0}, + {33, 3072, 36, 256, 0}, + {33, 3584, 36, 256, 0}, + {33, 4096, 36, 256, 0}, + {33, 4608, 36, 32, 0}, + {33, 5120, 36, 256, 0}, + {33, 5632, 36, 256, 0}, + {33, 6144, 36, 256, 0}, + {33, 6656, 36, 256, 0}, + {33, 7168, 36, 256, 0}, + {33, 7680, 36, 256, 0}, + {33, 8192, 36, 64, 0}, + {33, 8704, 36, 32, 0}, + {33, 9216, 36, 128, 0}, + {33, 9728, 36, 128, 0}, + {33, 10240, 36, 128, 0}, + {33, 20480, 36, 32, 0}, + {33, 30720, 36, 256, 1}, + {33, 40960, 36, 128, 0}, + {33, 51200, 36, 128, 0}, + {33, 61440, 36, 128, 0}, + {33, 71680, 36, 128, 0}, + {33, 81920, 36, 128, 0}, + {33, 92160, 36, 128, 0}, + {33, 102400, 36, 128, 0}, + {33, 204800, 36, 8192, 0}, + {33, 307200, 36, 128, 0}, + {33, 409600, 36, 8192, 0}, + {33, 512000, 36, 128, 0}, + {33, 614400, 36, 8192, 0}, + {33, 716800, 36, 128, 0}, + {33, 819200, 36, 8192, 0}, + {33, 921600, 36, 128, 0}, + {33, 1024000, 36, 8192, 
0}, + {42, 512, 45, 32, 0}, + {42, 1024, 45, 512, 0}, + {42, 1536, 45, 128, 0}, + {42, 2048, 45, 512, 0}, + {42, 2560, 45, 128, 0}, + {42, 3072, 45, 128, 0}, + {42, 3584, 45, 512, 0}, + {42, 4096, 45, 512, 0}, + {42, 4608, 45, 128, 0}, + {42, 5120, 45, 32, 0}, + {42, 5632, 45, 128, 0}, + {42, 6144, 45, 32, 0}, + {42, 6656, 45, 128, 0}, + {42, 7168, 45, 128, 0}, + {42, 7680, 45, 128, 0}, + {42, 8192, 45, 128, 0}, + {42, 8704, 45, 32, 0}, + {42, 9216, 45, 128, 0}, + {42, 9728, 45, 128, 0}, + {42, 10240, 45, 128, 0}, + {42, 20480, 45, 64, 0}, + {42, 30720, 45, 64, 0}, + {42, 40960, 45, 64, 0}, + {42, 51200, 45, 128, 0}, + {42, 61440, 45, 64, 0}, + {42, 71680, 45, 128, 0}, + {42, 81920, 45, 64, 0}, + {42, 92160, 45, 64, 0}, + {42, 102400, 45, 64, 0}, + {42, 204800, 45, 8192, 0}, + {42, 307200, 45, 64, 0}, + {42, 409600, 45, 8192, 0}, + {42, 512000, 45, 64, 0}, + {42, 614400, 45, 8192, 0}, + {42, 716800, 45, 64, 0}, + {42, 819200, 45, 8192, 0}, + {42, 921600, 45, 64, 0}, + {42, 1024000, 45, 8192, 0}, + {43, 512, 56, 128, 0}, + {43, 1024, 56, 1024, 0}, + {43, 1536, 56, 512, 0}, + {43, 2048, 56, 1024, 0}, + {43, 2560, 56, 128, 0}, + {43, 3072, 56, 128, 0}, + {43, 3584, 56, 128, 0}, + {43, 4096, 56, 128, 0}, + {43, 4608, 56, 128, 0}, + {43, 5120, 56, 128, 0}, + {43, 5632, 56, 128, 0}, + {43, 6144, 56, 128, 0}, + {43, 6656, 56, 128, 0}, + {43, 7168, 56, 128, 0}, + {43, 7680, 56, 128, 0}, + {43, 8192, 56, 128, 0}, + {43, 8704, 56, 128, 0}, + {43, 9216, 56, 128, 0}, + {43, 9728, 56, 128, 0}, + {43, 10240, 56, 128, 0}, + {43, 20480, 56, 128, 0}, + {43, 30720, 56, 128, 0}, + {43, 40960, 56, 128, 0}, + {43, 51200, 56, 64, 0}, + {43, 61440, 56, 128, 0}, + {43, 71680, 56, 64, 0}, + {43, 81920, 56, 8192, 0}, + {43, 92160, 56, 64, 0}, + {43, 102400, 56, 128, 0}, + {43, 204800, 56, 8192, 0}, + {43, 307200, 56, 64, 0}, + {43, 409600, 56, 8192, 0}, + {43, 512000, 56, 64, 0}, + {43, 614400, 56, 8192, 0}, + {43, 716800, 56, 64, 0}, + {43, 819200, 56, 8192, 0}, + {43, 921600, 56, 64, 0}, 
+ {43, 1024000, 56, 8192, 0}, + {126, 512, 84, 64, 0}, + {126, 1024, 84, 64, 0}, + {126, 1536, 84, 64, 0}, + {126, 2048, 84, 64, 0}, + {126, 2560, 84, 32, 0}, + {126, 3072, 84, 32, 0}, + {126, 3584, 84, 32, 0}, + {126, 4096, 84, 64, 0}, + {126, 4608, 84, 128, 0}, + {126, 5120, 84, 32, 0}, + {126, 5632, 84, 32, 0}, + {126, 6144, 84, 128, 0}, + {126, 6656, 84, 128, 0}, + {126, 7168, 84, 128, 0}, + {126, 7680, 84, 128, 0}, + {126, 8192, 84, 8192, 0}, + {126, 8704, 84, 128, 0}, + {126, 9216, 84, 32, 0}, + {126, 9728, 84, 128, 0}, + {126, 10240, 84, 128, 0}, + {126, 20480, 84, 4096, 0}, + {126, 30720, 84, 128, 0}, + {126, 40960, 84, 8192, 0}, + {126, 51200, 84, 128, 0}, + {126, 61440, 84, 4096, 0}, + {126, 71680, 84, 128, 0}, + {126, 81920, 84, 8192, 0}, + {126, 92160, 84, 128, 0}, + {126, 102400, 84, 4096, 0}, + {126, 204800, 84, 8192, 0}, + {126, 307200, 84, 307200, 0}, + {126, 409600, 84, 8192, 0}, + {126, 512000, 84, 512000, 0}, + {126, 614400, 84, 8192, 0}, + {126, 716800, 84, 716800, 0}, + {126, 819200, 84, 8192, 0}, + {126, 921600, 84, 921600, 0}, + {126, 1024000, 84, 8192, 0}, + {210, 512, 120, 64, 0}, + {210, 1024, 120, 32, 0}, + {210, 1536, 120, 512, 0}, + {210, 2048, 120, 32, 0}, + {210, 2560, 120, 512, 0}, + {210, 3072, 120, 32, 0}, + {210, 3584, 120, 32, 0}, + {210, 4096, 120, 4096, 0}, + {210, 4608, 120, 512, 0}, + {210, 5120, 120, 512, 0}, + {210, 5632, 120, 512, 0}, + {210, 6144, 120, 1024, 0}, + {210, 6656, 120, 32, 0}, + {210, 7168, 120, 512, 0}, + {210, 7680, 120, 512, 0}, + {210, 8192, 120, 8192, 0}, + {210, 8704, 120, 512, 0}, + {210, 9216, 120, 512, 0}, + {210, 9728, 120, 512, 0}, + {210, 10240, 120, 1024, 0}, + {210, 20480, 120, 4096, 0}, + {210, 30720, 120, 1024, 0}, + {210, 40960, 120, 8192, 0}, + {210, 51200, 120, 1024, 0}, + {210, 61440, 120, 4096, 0}, + {210, 71680, 120, 1024, 0}, + {210, 81920, 120, 8192, 0}, + {210, 92160, 120, 1024, 0}, + {210, 102400, 120, 4096, 0}, + {210, 204800, 120, 8192, 0}, + {210, 307200, 120, 307200, 0}, + {210, 
409600, 120, 409600, 0}, + {210, 512000, 120, 512000, 0}, + {210, 614400, 120, 614400, 0}, + {210, 716800, 120, 716800, 0}, + {210, 819200, 120, 32768, 0}, + {210, 921600, 120, 921600, 0}, + {210, 1024000, 120, 1024000, 0}, + {330, 512, 165, 512, 0}, + {330, 1024, 165, 1024, 0}, + {330, 1536, 165, 128, 0}, + {330, 2048, 165, 128, 0}, + {330, 2560, 165, 256, 0}, + {330, 3072, 165, 256, 0}, + {330, 3584, 165, 256, 0}, + {330, 4096, 165, 128, 0}, + {330, 4608, 165, 256, 0}, + {330, 5120, 165, 128, 0}, + {330, 5632, 165, 128, 0}, + {330, 6144, 165, 256, 0}, + {330, 6656, 165, 128, 0}, + {330, 7168, 165, 128, 0}, + {330, 7680, 165, 256, 0}, + {330, 8192, 165, 8192, 0}, + {330, 8704, 165, 128, 0}, + {330, 9216, 165, 256, 0}, + {330, 9728, 165, 128, 0}, + {330, 10240, 165, 256, 0}, + {330, 20480, 165, 256, 0}, + {330, 30720, 165, 256, 0}, + {330, 40960, 165, 8192, 0}, + {330, 51200, 165, 256, 0}, + {330, 61440, 165, 256, 0}, + {330, 71680, 165, 71680, 0}, + {330, 81920, 165, 81920, 0}, + {330, 92160, 165, 92160, 0}, + {330, 102400, 165, 102400, 0}, + {330, 204800, 165, 204800, 0}, + {330, 307200, 165, 307200, 0}, + {330, 409600, 165, 409600, 0}, + {330, 512000, 165, 512000, 0}, + {330, 614400, 165, 614400, 0}, + {330, 716800, 165, 716800, 0}, + {330, 819200, 165, 819200, 0}, + {330, 921600, 165, 921600, 0}, + {330, 1024000, 165, 1024000, 0} }; //////////////////////////////////////////////////////////////////////////////// -std::vector< std::array > dgemm_nn_mi100 = -{ - {3 , 512 , 1 , 64 , 1 }, - {3 , 1024 , 1 , 64 , 1 }, - {3 , 1536 , 1 , 32 , 1 }, - {3 , 2048 , 1 , 32 , 1 }, - {3 , 2560 , 1 , 32 , 1 }, - {3 , 3072 , 1 , 32 , 1 }, - {3 , 3584 , 1 , 32 , 1 }, - {3 , 4096 , 1 , 32 , 1 }, - {3 , 4608 , 1 , 32 , 1 }, - {3 , 5120 , 1 , 32 , 1 }, - {3 , 5632 , 1 , 32 , 1 }, - {3 , 6144 , 1 , 32 , 1 }, - {3 , 6656 , 1 , 32 , 1 }, - {3 , 7168 , 1 , 32 , 1 }, - {3 , 7680 , 1 , 32 , 1 }, - {3 , 8192 , 1 , 32 , 1 }, - {3 , 8704 , 1 , 64 , 1 }, - {3 , 9216 , 1 , 64 , 1 }, - {3 , 
9728 , 1 , 64 , 1 }, - {3 , 10240 , 1 , 64 , 1 }, - {3 , 20480 , 1 , 64 , 1 }, - {3 , 30720 , 1 , 64 , 1 }, - {3 , 40960 , 1 , 256 , 0 }, - {3 , 51200 , 1 , 1024 , 1 }, - {3 , 61440 , 1 , 1024 , 1 }, - {3 , 71680 , 1 , 2048 , 1 }, - {3 , 81920 , 1 , 8192 , 1 }, - {3 , 92160 , 1 , 2048 , 1 }, - {3 , 102400 , 1 , 4096 , 1 }, - {3 , 204800 , 1 , 4096 , 1 }, - {3 , 307200 , 1 , 1024 , 1 }, - {3 , 409600 , 1 , 128 , 0 }, - {3 , 512000 , 1 , 256 , 0 }, - {3 , 614400 , 1 , 128 , 0 }, - {3 , 716800 , 1 , 256 , 0 }, - {3 , 819200 , 1 , 128 , 0 }, - {3 , 921600 , 1 , 256 , 0 }, - {3 , 1024000, 1 , 128 , 0 }, - {4 , 512 , 1 , 64 , 1 }, - {4 , 1024 , 1 , 64 , 1 }, - {4 , 1536 , 1 , 32 , 1 }, - {4 , 2048 , 1 , 32 , 1 }, - {4 , 2560 , 1 , 32 , 1 }, - {4 , 3072 , 1 , 64 , 1 }, - {4 , 3584 , 1 , 32 , 1 }, - {4 , 4096 , 1 , 32 , 1 }, - {4 , 4608 , 1 , 32 , 1 }, - {4 , 5120 , 1 , 64 , 1 }, - {4 , 5632 , 1 , 32 , 1 }, - {4 , 6144 , 1 , 32 , 1 }, - {4 , 6656 , 1 , 32 , 1 }, - {4 , 7168 , 1 , 32 , 1 }, - {4 , 7680 , 1 , 64 , 1 }, - {4 , 8192 , 1 , 32 , 1 }, - {4 , 8704 , 1 , 64 , 1 }, - {4 , 9216 , 1 , 32 , 1 }, - {4 , 9728 , 1 , 32 , 1 }, - {4 , 10240 , 1 , 32 , 1 }, - {4 , 20480 , 1 , 64 , 1 }, - {4 , 30720 , 1 , 64 , 1 }, - {4 , 40960 , 1 , 64 , 1 }, - {4 , 51200 , 1 , 128 , 1 }, - {4 , 61440 , 1 , 4096 , 1 }, - {4 , 71680 , 1 , 2048 , 1 }, - {4 , 81920 , 1 , 4096 , 1 }, - {4 , 92160 , 1 , 2048 , 1 }, - {4 , 102400 , 1 , 4096 , 1 }, - {4 , 204800 , 1 , 4096 , 1 }, - {4 , 307200 , 1 , 2048 , 1 }, - {4 , 409600 , 1 , 128 , 0 }, - {4 , 512000 , 1 , 128 , 0 }, - {4 , 614400 , 1 , 128 , 0 }, - {4 , 716800 , 1 , 256 , 0 }, - {4 , 819200 , 1 , 128 , 0 }, - {4 , 921600 , 1 , 128 , 0 }, - {4 , 1024000, 1 , 256 , 0 }, - {6 , 512 , 3 , 64 , 1 }, - {6 , 1024 , 3 , 64 , 1 }, - {6 , 1536 , 3 , 32 , 1 }, - {6 , 2048 , 3 , 32 , 1 }, - {6 , 2560 , 3 , 32 , 1 }, - {6 , 3072 , 3 , 64 , 1 }, - {6 , 3584 , 3 , 32 , 1 }, - {6 , 4096 , 3 , 256 , 1 }, - {6 , 4608 , 3 , 32 , 1 }, - {6 , 5120 , 3 , 32 , 1 }, 
- {6 , 5632 , 3 , 32 , 1 }, - {6 , 6144 , 3 , 32 , 1 }, - {6 , 6656 , 3 , 64 , 1 }, - {6 , 7168 , 3 , 32 , 1 }, - {6 , 7680 , 3 , 32 , 1 }, - {6 , 8192 , 3 , 32 , 1 }, - {6 , 8704 , 3 , 32 , 1 }, - {6 , 9216 , 3 , 64 , 1 }, - {6 , 9728 , 3 , 32 , 1 }, - {6 , 10240 , 3 , 32 , 1 }, - {6 , 20480 , 3 , 64 , 1 }, - {6 , 30720 , 3 , 64 , 1 }, - {6 , 40960 , 3 , 128 , 1 }, - {6 , 51200 , 3 , 128 , 1 }, - {6 , 61440 , 3 , 128 , 1 }, - {6 , 71680 , 3 , 512 , 1 }, - {6 , 81920 , 3 , 1024 , 1 }, - {6 , 92160 , 3 , 1024 , 1 }, - {6 , 102400 , 3 , 512 , 1 }, - {6 , 204800 , 3 , 2048 , 1 }, - {6 , 307200 , 3 , 2048 , 1 }, - {6 , 409600 , 3 , 128 , 0 }, - {6 , 512000 , 3 , 256 , 0 }, - {6 , 614400 , 3 , 1024 , 1 }, - {6 , 716800 , 3 , 256 , 0 }, - {6 , 819200 , 3 , 256 , 0 }, - {6 , 921600 , 3 , 128 , 0 }, - {6 , 1024000, 3 , 128 , 0 }, - {10 , 512 , 4 , 64 , 1 }, - {10 , 1024 , 4 , 64 , 1 }, - {10 , 1536 , 4 , 64 , 1 }, - {10 , 2048 , 4 , 32 , 1 }, - {10 , 2560 , 4 , 32 , 1 }, - {10 , 3072 , 4 , 64 , 1 }, - {10 , 3584 , 4 , 32 , 1 }, - {10 , 4096 , 4 , 32 , 1 }, - {10 , 4608 , 4 , 32 , 1 }, - {10 , 5120 , 4 , 32 , 1 }, - {10 , 5632 , 4 , 32 , 1 }, - {10 , 6144 , 4 , 32 , 1 }, - {10 , 6656 , 4 , 32 , 1 }, - {10 , 7168 , 4 , 64 , 1 }, - {10 , 7680 , 4 , 32 , 1 }, - {10 , 8192 , 4 , 32 , 1 }, - {10 , 8704 , 4 , 64 , 1 }, - {10 , 9216 , 4 , 32 , 1 }, - {10 , 9728 , 4 , 32 , 1 }, - {10 , 10240 , 4 , 32 , 1 }, - {10 , 20480 , 4 , 64 , 1 }, - {10 , 30720 , 4 , 512 , 1 }, - {10 , 40960 , 4 , 128 , 1 }, - {10 , 51200 , 4 , 2048 , 1 }, - {10 , 61440 , 4 , 64 , 1 }, - {10 , 71680 , 4 , 128 , 1 }, - {10 , 81920 , 4 , 1024 , 1 }, - {10 , 92160 , 4 , 2048 , 1 }, - {10 , 102400 , 4 , 1024 , 1 }, - {10 , 204800 , 4 , 512 , 1 }, - {10 , 307200 , 4 , 2048 , 1 }, - {10 , 409600 , 4 , 16384 , 1 }, - {10 , 512000 , 4 , 2048 , 0 }, - {10 , 614400 , 4 , 1024 , 0 }, - {10 , 716800 , 4 , 1024 , 0 }, - {10 , 819200 , 4 , 2048 , 0 }, - {10 , 921600 , 4 , 1024 , 0 }, - {10 , 1024000, 4 , 1024 , 0 }, - {10 
, 512 , 6 , 64 , 1 }, - {10 , 1024 , 6 , 32 , 1 }, - {10 , 1536 , 6 , 128 , 1 }, - {10 , 2048 , 6 , 32 , 1 }, - {10 , 2560 , 6 , 32 , 1 }, - {10 , 3072 , 6 , 64 , 1 }, - {10 , 3584 , 6 , 32 , 1 }, - {10 , 4096 , 6 , 32 , 1 }, - {10 , 4608 , 6 , 32 , 1 }, - {10 , 5120 , 6 , 32 , 1 }, - {10 , 5632 , 6 , 32 , 1 }, - {10 , 6144 , 6 , 64 , 1 }, - {10 , 6656 , 6 , 32 , 1 }, - {10 , 7168 , 6 , 64 , 1 }, - {10 , 7680 , 6 , 64 , 1 }, - {10 , 8192 , 6 , 32 , 1 }, - {10 , 8704 , 6 , 32 , 1 }, - {10 , 9216 , 6 , 32 , 1 }, - {10 , 9728 , 6 , 64 , 1 }, - {10 , 10240 , 6 , 64 , 1 }, - {10 , 20480 , 6 , 64 , 1 }, - {10 , 30720 , 6 , 128 , 1 }, - {10 , 40960 , 6 , 128 , 1 }, - {10 , 51200 , 6 , 64 , 1 }, - {10 , 61440 , 6 , 64 , 1 }, - {10 , 71680 , 6 , 1024 , 1 }, - {10 , 81920 , 6 , 1024 , 1 }, - {10 , 92160 , 6 , 64 , 1 }, - {10 , 102400 , 6 , 1024 , 1 }, - {10 , 204800 , 6 , 4096 , 1 }, - {10 , 307200 , 6 , 2048 , 1 }, - {10 , 409600 , 6 , 8192 , 1 }, - {10 , 512000 , 6 , 4096 , 1 }, - {10 , 614400 , 6 , 1024 , 0 }, - {10 , 716800 , 6 , 1024 , 0 }, - {10 , 819200 , 6 , 2048 , 0 }, - {10 , 921600 , 6 , 1024 , 0 }, - {10 , 1024000, 6 , 2048 , 0 }, - {15 , 512 , 12 , 64 , 1 }, - {15 , 1024 , 12 , 64 , 1 }, - {15 , 1536 , 12 , 32 , 1 }, - {15 , 2048 , 12 , 32 , 1 }, - {15 , 2560 , 12 , 128 , 1 }, - {15 , 3072 , 12 , 32 , 1 }, - {15 , 3584 , 12 , 32 , 1 }, - {15 , 4096 , 12 , 32 , 1 }, - {15 , 4608 , 12 , 32 , 1 }, - {15 , 5120 , 12 , 32 , 1 }, - {15 , 5632 , 12 , 32 , 1 }, - {15 , 6144 , 12 , 32 , 1 }, - {15 , 6656 , 12 , 32 , 1 }, - {15 , 7168 , 12 , 32 , 1 }, - {15 , 7680 , 12 , 32 , 1 }, - {15 , 8192 , 12 , 32 , 1 }, - {15 , 8704 , 12 , 32 , 1 }, - {15 , 9216 , 12 , 32 , 1 }, - {15 , 9728 , 12 , 32 , 1 }, - {15 , 10240 , 12 , 32 , 1 }, - {15 , 20480 , 12 , 32 , 1 }, - {15 , 30720 , 12 , 64 , 1 }, - {15 , 40960 , 12 , 64 , 1 }, - {15 , 51200 , 12 , 64 , 1 }, - {15 , 61440 , 12 , 64 , 1 }, - {15 , 71680 , 12 , 32 , 1 }, - {15 , 81920 , 12 , 512 , 1 }, - {15 , 92160 , 12 , 64 , 1 
}, - {15 , 102400 , 12 , 64 , 1 }, - {15 , 204800 , 12 , 512 , 1 }, - {15 , 307200 , 12 , 2048 , 1 }, - {15 , 409600 , 12 , 8192 , 1 }, - {15 , 512000 , 12 , 4096 , 1 }, - {15 , 614400 , 12 , 128 , 0 }, - {15 , 716800 , 12 , 128 , 0 }, - {15 , 819200 , 12 , 128 , 0 }, - {15 , 921600 , 12 , 128 , 0 }, - {15 , 1024000, 12 , 128 , 0 }, - {20 , 512 , 11 , 64 , 1 }, - {20 , 1024 , 11 , 32 , 1 }, - {20 , 1536 , 11 , 32 , 1 }, - {20 , 2048 , 11 , 32 , 1 }, - {20 , 2560 , 11 , 64 , 1 }, - {20 , 3072 , 11 , 32 , 1 }, - {20 , 3584 , 11 , 32 , 1 }, - {20 , 4096 , 11 , 64 , 1 }, - {20 , 4608 , 11 , 32 , 1 }, - {20 , 5120 , 11 , 32 , 1 }, - {20 , 5632 , 11 , 32 , 1 }, - {20 , 6144 , 11 , 32 , 1 }, - {20 , 6656 , 11 , 32 , 1 }, - {20 , 7168 , 11 , 32 , 1 }, - {20 , 7680 , 11 , 64 , 1 }, - {20 , 8192 , 11 , 32 , 1 }, - {20 , 8704 , 11 , 64 , 1 }, - {20 , 9216 , 11 , 64 , 1 }, - {20 , 9728 , 11 , 32 , 1 }, - {20 , 10240 , 11 , 128 , 1 }, - {20 , 20480 , 11 , 64 , 1 }, - {20 , 30720 , 11 , 512 , 1 }, - {20 , 40960 , 11 , 32 , 1 }, - {20 , 51200 , 11 , 32 , 1 }, - {20 , 61440 , 11 , 256 , 1 }, - {20 , 71680 , 11 , 64 , 1 }, - {20 , 81920 , 11 , 64 , 1 }, - {20 , 92160 , 11 , 128 , 1 }, - {20 , 102400 , 11 , 256 , 1 }, - {20 , 204800 , 11 , 2048 , 1 }, - {20 , 307200 , 11 , 4096 , 1 }, - {20 , 409600 , 11 , 256 , 0 }, - {20 , 512000 , 11 , 256 , 0 }, - {20 , 614400 , 11 , 512 , 0 }, - {20 , 716800 , 11 , 512 , 0 }, - {20 , 819200 , 11 , 8192 , 0 }, - {20 , 921600 , 11 , 512 , 0 }, - {20 , 1024000, 11 , 8192 , 0 }, - {21 , 512 , 16 , 64 , 1 }, - {21 , 1024 , 16 , 64 , 1 }, - {21 , 1536 , 16 , 32 , 1 }, - {21 , 2048 , 16 , 128 , 1 }, - {21 , 2560 , 16 , 64 , 1 }, - {21 , 3072 , 16 , 64 , 1 }, - {21 , 3584 , 16 , 32 , 1 }, - {21 , 4096 , 16 , 32 , 1 }, - {21 , 4608 , 16 , 32 , 1 }, - {21 , 5120 , 16 , 32 , 1 }, - {21 , 5632 , 16 , 32 , 1 }, - {21 , 6144 , 16 , 32 , 1 }, - {21 , 6656 , 16 , 64 , 1 }, - {21 , 7168 , 16 , 64 , 1 }, - {21 , 7680 , 16 , 64 , 1 }, - {21 , 8192 , 16 , 32 , 1 
}, - {21 , 8704 , 16 , 64 , 1 }, - {21 , 9216 , 16 , 128 , 1 }, - {21 , 9728 , 16 , 64 , 1 }, - {21 , 10240 , 16 , 64 , 1 }, - {21 , 20480 , 16 , 32 , 1 }, - {21 , 30720 , 16 , 64 , 1 }, - {21 , 40960 , 16 , 32 , 1 }, - {21 , 51200 , 16 , 64 , 1 }, - {21 , 61440 , 16 , 64 , 1 }, - {21 , 71680 , 16 , 64 , 1 }, - {21 , 81920 , 16 , 16384 , 1 }, - {21 , 92160 , 16 , 256 , 1 }, - {21 , 102400 , 16 , 256 , 1 }, - {21 , 204800 , 16 , 256 , 1 }, - {21 , 307200 , 16 , 2048 , 1 }, - {21 , 409600 , 16 , 8192 , 0 }, - {21 , 512000 , 16 , 256 , 0 }, - {21 , 614400 , 16 , 256 , 0 }, - {21 , 716800 , 16 , 256 , 0 }, - {21 , 819200 , 16 , 256 , 0 }, - {21 , 921600 , 16 , 512 , 0 }, - {21 , 1024000, 16 , 8192 , 0 }, - {28 , 512 , 25 , 64 , 1 }, - {28 , 1024 , 25 , 32 , 1 }, - {28 , 1536 , 25 , 64 , 1 }, - {28 , 2048 , 25 , 32 , 1 }, - {28 , 2560 , 25 , 32 , 1 }, - {28 , 3072 , 25 , 64 , 1 }, - {28 , 3584 , 25 , 64 , 1 }, - {28 , 4096 , 25 , 32 , 1 }, - {28 , 4608 , 25 , 32 , 1 }, - {28 , 5120 , 25 , 32 , 1 }, - {28 , 5632 , 25 , 32 , 1 }, - {28 , 6144 , 25 , 256 , 1 }, - {28 , 6656 , 25 , 32 , 1 }, - {28 , 7168 , 25 , 32 , 1 }, - {28 , 7680 , 25 , 64 , 1 }, - {28 , 8192 , 25 , 64 , 1 }, - {28 , 8704 , 25 , 64 , 1 }, - {28 , 9216 , 25 , 32 , 1 }, - {28 , 9728 , 25 , 64 , 1 }, - {28 , 10240 , 25 , 128 , 1 }, - {28 , 20480 , 25 , 64 , 1 }, - {28 , 30720 , 25 , 64 , 1 }, - {28 , 40960 , 25 , 256 , 1 }, - {28 , 51200 , 25 , 64 , 1 }, - {28 , 61440 , 25 , 32 , 1 }, - {28 , 71680 , 25 , 64 , 1 }, - {28 , 81920 , 25 , 2048 , 1 }, - {28 , 92160 , 25 , 256 , 1 }, - {28 , 102400 , 25 , 2048 , 1 }, - {28 , 204800 , 25 , 8192 , 1 }, - {28 , 307200 , 25 , 4096 , 1 }, - {28 , 409600 , 25 , 256 , 0 }, - {28 , 512000 , 25 , 1024 , 0 }, - {28 , 614400 , 25 , 256 , 0 }, - {28 , 716800 , 25 , 256 , 0 }, - {28 , 819200 , 25 , 1024 , 0 }, - {28 , 921600 , 25 , 256 , 0 }, - {28 , 1024000, 25 , 8192 , 0 }, - {35 , 512 , 24 , 256 , 0 }, - {35 , 1024 , 24 , 256 , 1 }, - {35 , 1536 , 24 , 64 , 1 }, - {35 , 
2048 , 24 , 128 , 1 }, - {35 , 2560 , 24 , 32 , 1 }, - {35 , 3072 , 24 , 32 , 1 }, - {35 , 3584 , 24 , 32 , 1 }, - {35 , 4096 , 24 , 256 , 1 }, - {35 , 4608 , 24 , 32 , 1 }, - {35 , 5120 , 24 , 32 , 1 }, - {35 , 5632 , 24 , 64 , 1 }, - {35 , 6144 , 24 , 32 , 1 }, - {35 , 6656 , 24 , 128 , 1 }, - {35 , 7168 , 24 , 128 , 1 }, - {35 , 7680 , 24 , 128 , 1 }, - {35 , 8192 , 24 , 64 , 1 }, - {35 , 8704 , 24 , 32 , 1 }, - {35 , 9216 , 24 , 32 , 1 }, - {35 , 9728 , 24 , 128 , 1 }, - {35 , 10240 , 24 , 256 , 1 }, - {35 , 20480 , 24 , 32 , 1 }, - {35 , 30720 , 24 , 32 , 1 }, - {35 , 40960 , 24 , 32 , 1 }, - {35 , 51200 , 24 , 256 , 1 }, - {35 , 61440 , 24 , 64 , 1 }, - {35 , 71680 , 24 , 128 , 1 }, - {35 , 81920 , 24 , 64 , 1 }, - {35 , 92160 , 24 , 64 , 1 }, - {35 , 102400 , 24 , 64 , 1 }, - {35 , 204800 , 24 , 128 , 1 }, - {35 , 307200 , 24 , 4096 , 1 }, - {35 , 409600 , 24 , 1024 , 1 }, - {35 , 512000 , 24 , 4096 , 1 }, - {35 , 614400 , 24 , 8192 , 1 }, - {35 , 716800 , 24 , 2048 , 1 }, - {35 , 819200 , 24 , 16384 , 1 }, - {35 , 921600 , 24 , 1024 , 1 }, - {35 , 1024000, 24 , 1024 , 1 }, - {36 , 512 , 33 , 256 , 0 }, - {36 , 1024 , 33 , 256 , 0 }, - {36 , 1536 , 33 , 128 , 0 }, - {36 , 2048 , 33 , 256 , 0 }, - {36 , 2560 , 33 , 128 , 1 }, - {36 , 3072 , 33 , 128 , 0 }, - {36 , 3584 , 33 , 256 , 0 }, - {36 , 4096 , 33 , 256 , 1 }, - {36 , 4608 , 33 , 64 , 1 }, - {36 , 5120 , 33 , 128 , 1 }, - {36 , 5632 , 33 , 256 , 1 }, - {36 , 6144 , 33 , 256 , 1 }, - {36 , 6656 , 33 , 256 , 1 }, - {36 , 7168 , 33 , 64 , 1 }, - {36 , 7680 , 33 , 256 , 1 }, - {36 , 8192 , 33 , 256 , 1 }, - {36 , 8704 , 33 , 128 , 1 }, - {36 , 9216 , 33 , 256 , 1 }, - {36 , 9728 , 33 , 256 , 1 }, - {36 , 10240 , 33 , 256 , 1 }, - {36 , 20480 , 33 , 256 , 1 }, - {36 , 30720 , 33 , 256 , 1 }, - {36 , 40960 , 33 , 256 , 1 }, - {36 , 51200 , 33 , 256 , 1 }, - {36 , 61440 , 33 , 2048 , 1 }, - {36 , 71680 , 33 , 1024 , 1 }, - {36 , 81920 , 33 , 4096 , 1 }, - {36 , 92160 , 33 , 256 , 0 }, - {36 , 102400 , 33 , 
1024 , 1 }, - {36 , 204800 , 33 , 4096 , 0 }, - {36 , 307200 , 33 , 4096 , 0 }, - {36 , 409600 , 33 , 4096 , 0 }, - {36 , 512000 , 33 , 2048 , 0 }, - {36 , 614400 , 33 , 4096 , 0 }, - {36 , 716800 , 33 , 2048 , 0 }, - {36 , 819200 , 33 , 32768 , 0 }, - {36 , 921600 , 33 , 2048 , 0 }, - {36 , 1024000, 33 , 2048 , 0 }, - {45 , 512 , 42 , 32 , 1 }, - {45 , 1024 , 42 , 64 , 1 }, - {45 , 1536 , 42 , 64 , 1 }, - {45 , 2048 , 42 , 256 , 1 }, - {45 , 2560 , 42 , 256 , 1 }, - {45 , 3072 , 42 , 64 , 1 }, - {45 , 3584 , 42 , 64 , 1 }, - {45 , 4096 , 42 , 64 , 1 }, - {45 , 4608 , 42 , 128 , 1 }, - {45 , 5120 , 42 , 64 , 1 }, - {45 , 5632 , 42 , 64 , 1 }, - {45 , 6144 , 42 , 64 , 1 }, - {45 , 6656 , 42 , 64 , 1 }, - {45 , 7168 , 42 , 128 , 1 }, - {45 , 7680 , 42 , 128 , 1 }, - {45 , 8192 , 42 , 4096 , 1 }, - {45 , 8704 , 42 , 128 , 1 }, - {45 , 9216 , 42 , 128 , 1 }, - {45 , 9728 , 42 , 128 , 1 }, - {45 , 10240 , 42 , 64 , 1 }, - {45 , 20480 , 42 , 64 , 1 }, - {45 , 30720 , 42 , 64 , 1 }, - {45 , 40960 , 42 , 1024 , 1 }, - {45 , 51200 , 42 , 2048 , 1 }, - {45 , 61440 , 42 , 2048 , 1 }, - {45 , 71680 , 42 , 2048 , 1 }, - {45 , 81920 , 42 , 8192 , 1 }, - {45 , 92160 , 42 , 2048 , 1 }, - {45 , 102400 , 42 , 4096 , 1 }, - {45 , 204800 , 42 , 2048 , 0 }, - {45 , 307200 , 42 , 2048 , 0 }, - {45 , 409600 , 42 , 4096 , 0 }, - {45 , 512000 , 42 , 2048 , 0 }, - {45 , 614400 , 42 , 2048 , 0 }, - {45 , 716800 , 42 , 2048 , 0 }, - {45 , 819200 , 42 , 32768 , 0 }, - {45 , 921600 , 42 , 2048 , 0 }, - {45 , 1024000, 42 , 2048 , 0 }, - {56 , 512 , 43 , 32 , 1 }, - {56 , 1024 , 43 , 128 , 1 }, - {56 , 1536 , 43 , 64 , 1 }, - {56 , 2048 , 43 , 128 , 1 }, - {56 , 2560 , 43 , 128 , 1 }, - {56 , 3072 , 43 , 64 , 1 }, - {56 , 3584 , 43 , 64 , 1 }, - {56 , 4096 , 43 , 256 , 1 }, - {56 , 4608 , 43 , 128 , 1 }, - {56 , 5120 , 43 , 128 , 1 }, - {56 , 5632 , 43 , 64 , 1 }, - {56 , 6144 , 43 , 64 , 1 }, - {56 , 6656 , 43 , 128 , 1 }, - {56 , 7168 , 43 , 128 , 1 }, - {56 , 7680 , 43 , 64 , 1 }, - {56 , 8192 
, 43 , 1024 , 1 }, - {56 , 8704 , 43 , 64 , 1 }, - {56 , 9216 , 43 , 128 , 1 }, - {56 , 9728 , 43 , 64 , 1 }, - {56 , 10240 , 43 , 128 , 1 }, - {56 , 20480 , 43 , 1024 , 1 }, - {56 , 30720 , 43 , 1024 , 1 }, - {56 , 40960 , 43 , 4096 , 1 }, - {56 , 51200 , 43 , 1024 , 1 }, - {56 , 61440 , 43 , 4096 , 1 }, - {56 , 71680 , 43 , 2048 , 1 }, - {56 , 81920 , 43 , 8192 , 1 }, - {56 , 92160 , 43 , 1024 , 1 }, - {56 , 102400 , 43 , 4096 , 1 }, - {56 , 204800 , 43 , 4096 , 0 }, - {56 , 307200 , 43 , 4096 , 0 }, - {56 , 409600 , 43 , 4096 , 0 }, - {56 , 512000 , 43 , 4096 , 0 }, - {56 , 614400 , 43 , 4096 , 0 }, - {56 , 716800 , 43 , 2048 , 0 }, - {56 , 819200 , 43 , 4096 , 0 }, - {56 , 921600 , 43 , 4096 , 0 }, - {56 , 1024000, 43 , 4096 , 0 }, - {84 , 512 , 126 , 32 , 1 }, - {84 , 1024 , 126 , 64 , 1 }, - {84 , 1536 , 126 , 64 , 1 }, - {84 , 2048 , 126 , 64 , 1 }, - {84 , 2560 , 126 , 64 , 1 }, - {84 , 3072 , 126 , 32 , 1 }, - {84 , 3584 , 126 , 32 , 1 }, - {84 , 4096 , 126 , 32 , 1 }, - {84 , 4608 , 126 , 32 , 1 }, - {84 , 5120 , 126 , 32 , 1 }, - {84 , 5632 , 126 , 32 , 1 }, - {84 , 6144 , 126 , 128 , 1 }, - {84 , 6656 , 126 , 64 , 1 }, - {84 , 7168 , 126 , 32 , 1 }, - {84 , 7680 , 126 , 256 , 1 }, - {84 , 8192 , 126 , 32 , 1 }, - {84 , 8704 , 126 , 32 , 1 }, - {84 , 9216 , 126 , 32 , 1 }, - {84 , 9728 , 126 , 32 , 1 }, - {84 , 10240 , 126 , 64 , 1 }, - {84 , 20480 , 126 , 64 , 1 }, - {84 , 30720 , 126 , 32 , 1 }, - {84 , 40960 , 126 , 512 , 0 }, - {84 , 51200 , 126 , 512 , 0 }, - {84 , 61440 , 126 , 512 , 0 }, - {84 , 71680 , 126 , 512 , 0 }, - {84 , 81920 , 126 , 512 , 0 }, - {84 , 92160 , 126 , 512 , 0 }, - {84 , 102400 , 126 , 512 , 0 }, - {84 , 204800 , 126 , 512 , 0 }, - {84 , 307200 , 126 , 512 , 0 }, - {84 , 409600 , 126 , 512 , 0 }, - {84 , 512000 , 126 , 512 , 0 }, - {84 , 614400 , 126 , 512 , 0 }, - {84 , 716800 , 126 , 512 , 0 }, - {84 , 819200 , 126 , 512 , 0 }, - {84 , 921600 , 126 , 512 , 0 }, - {84 , 1024000, 126 , 512 , 0 }, - {120 , 512 , 210 , 64 , 0 
}, - {120 , 1024 , 210 , 32 , 1 }, - {120 , 1536 , 210 , 32 , 1 }, - {120 , 2048 , 210 , 32 , 1 }, - {120 , 2560 , 210 , 32 , 1 }, - {120 , 3072 , 210 , 64 , 1 }, - {120 , 3584 , 210 , 32 , 1 }, - {120 , 4096 , 210 , 64 , 1 }, - {120 , 4608 , 210 , 32 , 1 }, - {120 , 5120 , 210 , 32 , 1 }, - {120 , 5632 , 210 , 32 , 1 }, - {120 , 6144 , 210 , 64 , 1 }, - {120 , 6656 , 210 , 32 , 1 }, - {120 , 7168 , 210 , 32 , 1 }, - {120 , 7680 , 210 , 32 , 1 }, - {120 , 8192 , 210 , 32 , 1 }, - {120 , 8704 , 210 , 32 , 1 }, - {120 , 9216 , 210 , 32 , 1 }, - {120 , 9728 , 210 , 32 , 1 }, - {120 , 10240 , 210 , 32 , 1 }, - {120 , 20480 , 210 , 2048 , 0 }, - {120 , 30720 , 210 , 2048 , 0 }, - {120 , 40960 , 210 , 2048 , 0 }, - {120 , 51200 , 210 , 2048 , 0 }, - {120 , 61440 , 210 , 2048 , 0 }, - {120 , 71680 , 210 , 2048 , 0 }, - {120 , 81920 , 210 , 2048 , 0 }, - {120 , 92160 , 210 , 2048 , 0 }, - {120 , 102400 , 210 , 2048 , 0 }, - {120 , 204800 , 210 , 2048 , 0 }, - {120 , 307200 , 210 , 2048 , 0 }, - {120 , 409600 , 210 , 2048 , 0 }, - {120 , 512000 , 210 , 2048 , 0 }, - {120 , 614400 , 210 , 2048 , 0 }, - {120 , 716800 , 210 , 716800 , 0 }, - {120 , 819200 , 210 , 819200 , 0 }, - {120 , 921600 , 210 , 921600 , 0 }, - {120 , 1024000, 210 , 1024000, 0 }, - {165 , 512 , 330 , 128 , 0 }, - {165 , 1024 , 330 , 1024 , 0 }, - {165 , 1536 , 330 , 32 , 1 }, - {165 , 2048 , 330 , 128 , 1 }, - {165 , 2560 , 330 , 128 , 1 }, - {165 , 3072 , 330 , 32 , 1 }, - {165 , 3584 , 330 , 32 , 1 }, - {165 , 4096 , 330 , 4096 , 0 }, - {165 , 4608 , 330 , 256 , 0 }, - {165 , 5120 , 330 , 256 , 0 }, - {165 , 5632 , 330 , 32 , 1 }, - {165 , 6144 , 330 , 512 , 0 }, - {165 , 6656 , 330 , 32 , 1 }, - {165 , 7168 , 330 , 32 , 1 }, - {165 , 7680 , 330 , 256 , 0 }, - {165 , 8192 , 330 , 4096 , 0 }, - {165 , 8704 , 330 , 512 , 0 }, - {165 , 9216 , 330 , 512 , 0 }, - {165 , 9728 , 330 , 512 , 0 }, - {165 , 10240 , 330 , 512 , 0 }, - {165 , 20480 , 330 , 4096 , 0 }, - {165 , 30720 , 330 , 30720 , 0 }, - {165 , 
40960 , 330 , 40960 , 0 }, - {165 , 51200 , 330 , 512 , 0 }, - {165 , 61440 , 330 , 4096 , 0 }, - {165 , 71680 , 330 , 512 , 0 }, - {165 , 81920 , 330 , 4096 , 0 }, - {165 , 92160 , 330 , 512 , 0 }, - {165 , 102400 , 330 , 4096 , 0 }, - {165 , 204800 , 330 , 4096 , 0 }, - {165 , 307200 , 330 , 4096 , 0 }, - {165 , 409600 , 330 , 4096 , 0 }, - {165 , 512000 , 330 , 4096 , 0 }, - {165 , 614400 , 330 , 4096 , 0 }, - {165 , 716800 , 330 , 4096 , 0 }, - {165 , 819200 , 330 , 32768 , 0 }, - {165 , 921600 , 330 , 4096 , 0 }, - {165 , 1024000, 330 , 4096 , 0 } +std::vector > dgemm_nn_mi100 = { + {3, 512, 1, 64, 1}, + {3, 1024, 1, 64, 1}, + {3, 1536, 1, 32, 1}, + {3, 2048, 1, 32, 1}, + {3, 2560, 1, 32, 1}, + {3, 3072, 1, 32, 1}, + {3, 3584, 1, 32, 1}, + {3, 4096, 1, 32, 1}, + {3, 4608, 1, 32, 1}, + {3, 5120, 1, 32, 1}, + {3, 5632, 1, 32, 1}, + {3, 6144, 1, 32, 1}, + {3, 6656, 1, 32, 1}, + {3, 7168, 1, 32, 1}, + {3, 7680, 1, 32, 1}, + {3, 8192, 1, 32, 1}, + {3, 8704, 1, 64, 1}, + {3, 9216, 1, 64, 1}, + {3, 9728, 1, 64, 1}, + {3, 10240, 1, 64, 1}, + {3, 20480, 1, 64, 1}, + {3, 30720, 1, 64, 1}, + {3, 40960, 1, 256, 0}, + {3, 51200, 1, 1024, 1}, + {3, 61440, 1, 1024, 1}, + {3, 71680, 1, 2048, 1}, + {3, 81920, 1, 8192, 1}, + {3, 92160, 1, 2048, 1}, + {3, 102400, 1, 4096, 1}, + {3, 204800, 1, 4096, 1}, + {3, 307200, 1, 1024, 1}, + {3, 409600, 1, 128, 0}, + {3, 512000, 1, 256, 0}, + {3, 614400, 1, 128, 0}, + {3, 716800, 1, 256, 0}, + {3, 819200, 1, 128, 0}, + {3, 921600, 1, 256, 0}, + {3, 1024000, 1, 128, 0}, + {4, 512, 1, 64, 1}, + {4, 1024, 1, 64, 1}, + {4, 1536, 1, 32, 1}, + {4, 2048, 1, 32, 1}, + {4, 2560, 1, 32, 1}, + {4, 3072, 1, 64, 1}, + {4, 3584, 1, 32, 1}, + {4, 4096, 1, 32, 1}, + {4, 4608, 1, 32, 1}, + {4, 5120, 1, 64, 1}, + {4, 5632, 1, 32, 1}, + {4, 6144, 1, 32, 1}, + {4, 6656, 1, 32, 1}, + {4, 7168, 1, 32, 1}, + {4, 7680, 1, 64, 1}, + {4, 8192, 1, 32, 1}, + {4, 8704, 1, 64, 1}, + {4, 9216, 1, 32, 1}, + {4, 9728, 1, 32, 1}, + {4, 10240, 1, 32, 1}, + {4, 20480, 1, 64, 
1}, + {4, 30720, 1, 64, 1}, + {4, 40960, 1, 64, 1}, + {4, 51200, 1, 128, 1}, + {4, 61440, 1, 4096, 1}, + {4, 71680, 1, 2048, 1}, + {4, 81920, 1, 4096, 1}, + {4, 92160, 1, 2048, 1}, + {4, 102400, 1, 4096, 1}, + {4, 204800, 1, 4096, 1}, + {4, 307200, 1, 2048, 1}, + {4, 409600, 1, 128, 0}, + {4, 512000, 1, 128, 0}, + {4, 614400, 1, 128, 0}, + {4, 716800, 1, 256, 0}, + {4, 819200, 1, 128, 0}, + {4, 921600, 1, 128, 0}, + {4, 1024000, 1, 256, 0}, + {6, 512, 3, 64, 1}, + {6, 1024, 3, 64, 1}, + {6, 1536, 3, 32, 1}, + {6, 2048, 3, 32, 1}, + {6, 2560, 3, 32, 1}, + {6, 3072, 3, 64, 1}, + {6, 3584, 3, 32, 1}, + {6, 4096, 3, 256, 1}, + {6, 4608, 3, 32, 1}, + {6, 5120, 3, 32, 1}, + {6, 5632, 3, 32, 1}, + {6, 6144, 3, 32, 1}, + {6, 6656, 3, 64, 1}, + {6, 7168, 3, 32, 1}, + {6, 7680, 3, 32, 1}, + {6, 8192, 3, 32, 1}, + {6, 8704, 3, 32, 1}, + {6, 9216, 3, 64, 1}, + {6, 9728, 3, 32, 1}, + {6, 10240, 3, 32, 1}, + {6, 20480, 3, 64, 1}, + {6, 30720, 3, 64, 1}, + {6, 40960, 3, 128, 1}, + {6, 51200, 3, 128, 1}, + {6, 61440, 3, 128, 1}, + {6, 71680, 3, 512, 1}, + {6, 81920, 3, 1024, 1}, + {6, 92160, 3, 1024, 1}, + {6, 102400, 3, 512, 1}, + {6, 204800, 3, 2048, 1}, + {6, 307200, 3, 2048, 1}, + {6, 409600, 3, 128, 0}, + {6, 512000, 3, 256, 0}, + {6, 614400, 3, 1024, 1}, + {6, 716800, 3, 256, 0}, + {6, 819200, 3, 256, 0}, + {6, 921600, 3, 128, 0}, + {6, 1024000, 3, 128, 0}, + {10, 512, 4, 64, 1}, + {10, 1024, 4, 64, 1}, + {10, 1536, 4, 64, 1}, + {10, 2048, 4, 32, 1}, + {10, 2560, 4, 32, 1}, + {10, 3072, 4, 64, 1}, + {10, 3584, 4, 32, 1}, + {10, 4096, 4, 32, 1}, + {10, 4608, 4, 32, 1}, + {10, 5120, 4, 32, 1}, + {10, 5632, 4, 32, 1}, + {10, 6144, 4, 32, 1}, + {10, 6656, 4, 32, 1}, + {10, 7168, 4, 64, 1}, + {10, 7680, 4, 32, 1}, + {10, 8192, 4, 32, 1}, + {10, 8704, 4, 64, 1}, + {10, 9216, 4, 32, 1}, + {10, 9728, 4, 32, 1}, + {10, 10240, 4, 32, 1}, + {10, 20480, 4, 64, 1}, + {10, 30720, 4, 512, 1}, + {10, 40960, 4, 128, 1}, + {10, 51200, 4, 2048, 1}, + {10, 61440, 4, 64, 1}, + {10, 71680, 4, 
128, 1}, + {10, 81920, 4, 1024, 1}, + {10, 92160, 4, 2048, 1}, + {10, 102400, 4, 1024, 1}, + {10, 204800, 4, 512, 1}, + {10, 307200, 4, 2048, 1}, + {10, 409600, 4, 16384, 1}, + {10, 512000, 4, 2048, 0}, + {10, 614400, 4, 1024, 0}, + {10, 716800, 4, 1024, 0}, + {10, 819200, 4, 2048, 0}, + {10, 921600, 4, 1024, 0}, + {10, 1024000, 4, 1024, 0}, + {10, 512, 6, 64, 1}, + {10, 1024, 6, 32, 1}, + {10, 1536, 6, 128, 1}, + {10, 2048, 6, 32, 1}, + {10, 2560, 6, 32, 1}, + {10, 3072, 6, 64, 1}, + {10, 3584, 6, 32, 1}, + {10, 4096, 6, 32, 1}, + {10, 4608, 6, 32, 1}, + {10, 5120, 6, 32, 1}, + {10, 5632, 6, 32, 1}, + {10, 6144, 6, 64, 1}, + {10, 6656, 6, 32, 1}, + {10, 7168, 6, 64, 1}, + {10, 7680, 6, 64, 1}, + {10, 8192, 6, 32, 1}, + {10, 8704, 6, 32, 1}, + {10, 9216, 6, 32, 1}, + {10, 9728, 6, 64, 1}, + {10, 10240, 6, 64, 1}, + {10, 20480, 6, 64, 1}, + {10, 30720, 6, 128, 1}, + {10, 40960, 6, 128, 1}, + {10, 51200, 6, 64, 1}, + {10, 61440, 6, 64, 1}, + {10, 71680, 6, 1024, 1}, + {10, 81920, 6, 1024, 1}, + {10, 92160, 6, 64, 1}, + {10, 102400, 6, 1024, 1}, + {10, 204800, 6, 4096, 1}, + {10, 307200, 6, 2048, 1}, + {10, 409600, 6, 8192, 1}, + {10, 512000, 6, 4096, 1}, + {10, 614400, 6, 1024, 0}, + {10, 716800, 6, 1024, 0}, + {10, 819200, 6, 2048, 0}, + {10, 921600, 6, 1024, 0}, + {10, 1024000, 6, 2048, 0}, + {15, 512, 12, 64, 1}, + {15, 1024, 12, 64, 1}, + {15, 1536, 12, 32, 1}, + {15, 2048, 12, 32, 1}, + {15, 2560, 12, 128, 1}, + {15, 3072, 12, 32, 1}, + {15, 3584, 12, 32, 1}, + {15, 4096, 12, 32, 1}, + {15, 4608, 12, 32, 1}, + {15, 5120, 12, 32, 1}, + {15, 5632, 12, 32, 1}, + {15, 6144, 12, 32, 1}, + {15, 6656, 12, 32, 1}, + {15, 7168, 12, 32, 1}, + {15, 7680, 12, 32, 1}, + {15, 8192, 12, 32, 1}, + {15, 8704, 12, 32, 1}, + {15, 9216, 12, 32, 1}, + {15, 9728, 12, 32, 1}, + {15, 10240, 12, 32, 1}, + {15, 20480, 12, 32, 1}, + {15, 30720, 12, 64, 1}, + {15, 40960, 12, 64, 1}, + {15, 51200, 12, 64, 1}, + {15, 61440, 12, 64, 1}, + {15, 71680, 12, 32, 1}, + {15, 81920, 12, 512, 1}, + 
{15, 92160, 12, 64, 1}, + {15, 102400, 12, 64, 1}, + {15, 204800, 12, 512, 1}, + {15, 307200, 12, 2048, 1}, + {15, 409600, 12, 8192, 1}, + {15, 512000, 12, 4096, 1}, + {15, 614400, 12, 128, 0}, + {15, 716800, 12, 128, 0}, + {15, 819200, 12, 128, 0}, + {15, 921600, 12, 128, 0}, + {15, 1024000, 12, 128, 0}, + {20, 512, 11, 64, 1}, + {20, 1024, 11, 32, 1}, + {20, 1536, 11, 32, 1}, + {20, 2048, 11, 32, 1}, + {20, 2560, 11, 64, 1}, + {20, 3072, 11, 32, 1}, + {20, 3584, 11, 32, 1}, + {20, 4096, 11, 64, 1}, + {20, 4608, 11, 32, 1}, + {20, 5120, 11, 32, 1}, + {20, 5632, 11, 32, 1}, + {20, 6144, 11, 32, 1}, + {20, 6656, 11, 32, 1}, + {20, 7168, 11, 32, 1}, + {20, 7680, 11, 64, 1}, + {20, 8192, 11, 32, 1}, + {20, 8704, 11, 64, 1}, + {20, 9216, 11, 64, 1}, + {20, 9728, 11, 32, 1}, + {20, 10240, 11, 128, 1}, + {20, 20480, 11, 64, 1}, + {20, 30720, 11, 512, 1}, + {20, 40960, 11, 32, 1}, + {20, 51200, 11, 32, 1}, + {20, 61440, 11, 256, 1}, + {20, 71680, 11, 64, 1}, + {20, 81920, 11, 64, 1}, + {20, 92160, 11, 128, 1}, + {20, 102400, 11, 256, 1}, + {20, 204800, 11, 2048, 1}, + {20, 307200, 11, 4096, 1}, + {20, 409600, 11, 256, 0}, + {20, 512000, 11, 256, 0}, + {20, 614400, 11, 512, 0}, + {20, 716800, 11, 512, 0}, + {20, 819200, 11, 8192, 0}, + {20, 921600, 11, 512, 0}, + {20, 1024000, 11, 8192, 0}, + {21, 512, 16, 64, 1}, + {21, 1024, 16, 64, 1}, + {21, 1536, 16, 32, 1}, + {21, 2048, 16, 128, 1}, + {21, 2560, 16, 64, 1}, + {21, 3072, 16, 64, 1}, + {21, 3584, 16, 32, 1}, + {21, 4096, 16, 32, 1}, + {21, 4608, 16, 32, 1}, + {21, 5120, 16, 32, 1}, + {21, 5632, 16, 32, 1}, + {21, 6144, 16, 32, 1}, + {21, 6656, 16, 64, 1}, + {21, 7168, 16, 64, 1}, + {21, 7680, 16, 64, 1}, + {21, 8192, 16, 32, 1}, + {21, 8704, 16, 64, 1}, + {21, 9216, 16, 128, 1}, + {21, 9728, 16, 64, 1}, + {21, 10240, 16, 64, 1}, + {21, 20480, 16, 32, 1}, + {21, 30720, 16, 64, 1}, + {21, 40960, 16, 32, 1}, + {21, 51200, 16, 64, 1}, + {21, 61440, 16, 64, 1}, + {21, 71680, 16, 64, 1}, + {21, 81920, 16, 16384, 1}, + {21, 
92160, 16, 256, 1}, + {21, 102400, 16, 256, 1}, + {21, 204800, 16, 256, 1}, + {21, 307200, 16, 2048, 1}, + {21, 409600, 16, 8192, 0}, + {21, 512000, 16, 256, 0}, + {21, 614400, 16, 256, 0}, + {21, 716800, 16, 256, 0}, + {21, 819200, 16, 256, 0}, + {21, 921600, 16, 512, 0}, + {21, 1024000, 16, 8192, 0}, + {28, 512, 25, 64, 1}, + {28, 1024, 25, 32, 1}, + {28, 1536, 25, 64, 1}, + {28, 2048, 25, 32, 1}, + {28, 2560, 25, 32, 1}, + {28, 3072, 25, 64, 1}, + {28, 3584, 25, 64, 1}, + {28, 4096, 25, 32, 1}, + {28, 4608, 25, 32, 1}, + {28, 5120, 25, 32, 1}, + {28, 5632, 25, 32, 1}, + {28, 6144, 25, 256, 1}, + {28, 6656, 25, 32, 1}, + {28, 7168, 25, 32, 1}, + {28, 7680, 25, 64, 1}, + {28, 8192, 25, 64, 1}, + {28, 8704, 25, 64, 1}, + {28, 9216, 25, 32, 1}, + {28, 9728, 25, 64, 1}, + {28, 10240, 25, 128, 1}, + {28, 20480, 25, 64, 1}, + {28, 30720, 25, 64, 1}, + {28, 40960, 25, 256, 1}, + {28, 51200, 25, 64, 1}, + {28, 61440, 25, 32, 1}, + {28, 71680, 25, 64, 1}, + {28, 81920, 25, 2048, 1}, + {28, 92160, 25, 256, 1}, + {28, 102400, 25, 2048, 1}, + {28, 204800, 25, 8192, 1}, + {28, 307200, 25, 4096, 1}, + {28, 409600, 25, 256, 0}, + {28, 512000, 25, 1024, 0}, + {28, 614400, 25, 256, 0}, + {28, 716800, 25, 256, 0}, + {28, 819200, 25, 1024, 0}, + {28, 921600, 25, 256, 0}, + {28, 1024000, 25, 8192, 0}, + {35, 512, 24, 256, 0}, + {35, 1024, 24, 256, 1}, + {35, 1536, 24, 64, 1}, + {35, 2048, 24, 128, 1}, + {35, 2560, 24, 32, 1}, + {35, 3072, 24, 32, 1}, + {35, 3584, 24, 32, 1}, + {35, 4096, 24, 256, 1}, + {35, 4608, 24, 32, 1}, + {35, 5120, 24, 32, 1}, + {35, 5632, 24, 64, 1}, + {35, 6144, 24, 32, 1}, + {35, 6656, 24, 128, 1}, + {35, 7168, 24, 128, 1}, + {35, 7680, 24, 128, 1}, + {35, 8192, 24, 64, 1}, + {35, 8704, 24, 32, 1}, + {35, 9216, 24, 32, 1}, + {35, 9728, 24, 128, 1}, + {35, 10240, 24, 256, 1}, + {35, 20480, 24, 32, 1}, + {35, 30720, 24, 32, 1}, + {35, 40960, 24, 32, 1}, + {35, 51200, 24, 256, 1}, + {35, 61440, 24, 64, 1}, + {35, 71680, 24, 128, 1}, + {35, 81920, 24, 64, 1}, + 
{35, 92160, 24, 64, 1}, + {35, 102400, 24, 64, 1}, + {35, 204800, 24, 128, 1}, + {35, 307200, 24, 4096, 1}, + {35, 409600, 24, 1024, 1}, + {35, 512000, 24, 4096, 1}, + {35, 614400, 24, 8192, 1}, + {35, 716800, 24, 2048, 1}, + {35, 819200, 24, 16384, 1}, + {35, 921600, 24, 1024, 1}, + {35, 1024000, 24, 1024, 1}, + {36, 512, 33, 256, 0}, + {36, 1024, 33, 256, 0}, + {36, 1536, 33, 128, 0}, + {36, 2048, 33, 256, 0}, + {36, 2560, 33, 128, 1}, + {36, 3072, 33, 128, 0}, + {36, 3584, 33, 256, 0}, + {36, 4096, 33, 256, 1}, + {36, 4608, 33, 64, 1}, + {36, 5120, 33, 128, 1}, + {36, 5632, 33, 256, 1}, + {36, 6144, 33, 256, 1}, + {36, 6656, 33, 256, 1}, + {36, 7168, 33, 64, 1}, + {36, 7680, 33, 256, 1}, + {36, 8192, 33, 256, 1}, + {36, 8704, 33, 128, 1}, + {36, 9216, 33, 256, 1}, + {36, 9728, 33, 256, 1}, + {36, 10240, 33, 256, 1}, + {36, 20480, 33, 256, 1}, + {36, 30720, 33, 256, 1}, + {36, 40960, 33, 256, 1}, + {36, 51200, 33, 256, 1}, + {36, 61440, 33, 2048, 1}, + {36, 71680, 33, 1024, 1}, + {36, 81920, 33, 4096, 1}, + {36, 92160, 33, 256, 0}, + {36, 102400, 33, 1024, 1}, + {36, 204800, 33, 4096, 0}, + {36, 307200, 33, 4096, 0}, + {36, 409600, 33, 4096, 0}, + {36, 512000, 33, 2048, 0}, + {36, 614400, 33, 4096, 0}, + {36, 716800, 33, 2048, 0}, + {36, 819200, 33, 32768, 0}, + {36, 921600, 33, 2048, 0}, + {36, 1024000, 33, 2048, 0}, + {45, 512, 42, 32, 1}, + {45, 1024, 42, 64, 1}, + {45, 1536, 42, 64, 1}, + {45, 2048, 42, 256, 1}, + {45, 2560, 42, 256, 1}, + {45, 3072, 42, 64, 1}, + {45, 3584, 42, 64, 1}, + {45, 4096, 42, 64, 1}, + {45, 4608, 42, 128, 1}, + {45, 5120, 42, 64, 1}, + {45, 5632, 42, 64, 1}, + {45, 6144, 42, 64, 1}, + {45, 6656, 42, 64, 1}, + {45, 7168, 42, 128, 1}, + {45, 7680, 42, 128, 1}, + {45, 8192, 42, 4096, 1}, + {45, 8704, 42, 128, 1}, + {45, 9216, 42, 128, 1}, + {45, 9728, 42, 128, 1}, + {45, 10240, 42, 64, 1}, + {45, 20480, 42, 64, 1}, + {45, 30720, 42, 64, 1}, + {45, 40960, 42, 1024, 1}, + {45, 51200, 42, 2048, 1}, + {45, 61440, 42, 2048, 1}, + {45, 
71680, 42, 2048, 1}, + {45, 81920, 42, 8192, 1}, + {45, 92160, 42, 2048, 1}, + {45, 102400, 42, 4096, 1}, + {45, 204800, 42, 2048, 0}, + {45, 307200, 42, 2048, 0}, + {45, 409600, 42, 4096, 0}, + {45, 512000, 42, 2048, 0}, + {45, 614400, 42, 2048, 0}, + {45, 716800, 42, 2048, 0}, + {45, 819200, 42, 32768, 0}, + {45, 921600, 42, 2048, 0}, + {45, 1024000, 42, 2048, 0}, + {56, 512, 43, 32, 1}, + {56, 1024, 43, 128, 1}, + {56, 1536, 43, 64, 1}, + {56, 2048, 43, 128, 1}, + {56, 2560, 43, 128, 1}, + {56, 3072, 43, 64, 1}, + {56, 3584, 43, 64, 1}, + {56, 4096, 43, 256, 1}, + {56, 4608, 43, 128, 1}, + {56, 5120, 43, 128, 1}, + {56, 5632, 43, 64, 1}, + {56, 6144, 43, 64, 1}, + {56, 6656, 43, 128, 1}, + {56, 7168, 43, 128, 1}, + {56, 7680, 43, 64, 1}, + {56, 8192, 43, 1024, 1}, + {56, 8704, 43, 64, 1}, + {56, 9216, 43, 128, 1}, + {56, 9728, 43, 64, 1}, + {56, 10240, 43, 128, 1}, + {56, 20480, 43, 1024, 1}, + {56, 30720, 43, 1024, 1}, + {56, 40960, 43, 4096, 1}, + {56, 51200, 43, 1024, 1}, + {56, 61440, 43, 4096, 1}, + {56, 71680, 43, 2048, 1}, + {56, 81920, 43, 8192, 1}, + {56, 92160, 43, 1024, 1}, + {56, 102400, 43, 4096, 1}, + {56, 204800, 43, 4096, 0}, + {56, 307200, 43, 4096, 0}, + {56, 409600, 43, 4096, 0}, + {56, 512000, 43, 4096, 0}, + {56, 614400, 43, 4096, 0}, + {56, 716800, 43, 2048, 0}, + {56, 819200, 43, 4096, 0}, + {56, 921600, 43, 4096, 0}, + {56, 1024000, 43, 4096, 0}, + {84, 512, 126, 32, 1}, + {84, 1024, 126, 64, 1}, + {84, 1536, 126, 64, 1}, + {84, 2048, 126, 64, 1}, + {84, 2560, 126, 64, 1}, + {84, 3072, 126, 32, 1}, + {84, 3584, 126, 32, 1}, + {84, 4096, 126, 32, 1}, + {84, 4608, 126, 32, 1}, + {84, 5120, 126, 32, 1}, + {84, 5632, 126, 32, 1}, + {84, 6144, 126, 128, 1}, + {84, 6656, 126, 64, 1}, + {84, 7168, 126, 32, 1}, + {84, 7680, 126, 256, 1}, + {84, 8192, 126, 32, 1}, + {84, 8704, 126, 32, 1}, + {84, 9216, 126, 32, 1}, + {84, 9728, 126, 32, 1}, + {84, 10240, 126, 64, 1}, + {84, 20480, 126, 64, 1}, + {84, 30720, 126, 32, 1}, + {84, 40960, 126, 512, 0}, 
+ {84, 51200, 126, 512, 0}, + {84, 61440, 126, 512, 0}, + {84, 71680, 126, 512, 0}, + {84, 81920, 126, 512, 0}, + {84, 92160, 126, 512, 0}, + {84, 102400, 126, 512, 0}, + {84, 204800, 126, 512, 0}, + {84, 307200, 126, 512, 0}, + {84, 409600, 126, 512, 0}, + {84, 512000, 126, 512, 0}, + {84, 614400, 126, 512, 0}, + {84, 716800, 126, 512, 0}, + {84, 819200, 126, 512, 0}, + {84, 921600, 126, 512, 0}, + {84, 1024000, 126, 512, 0}, + {120, 512, 210, 64, 0}, + {120, 1024, 210, 32, 1}, + {120, 1536, 210, 32, 1}, + {120, 2048, 210, 32, 1}, + {120, 2560, 210, 32, 1}, + {120, 3072, 210, 64, 1}, + {120, 3584, 210, 32, 1}, + {120, 4096, 210, 64, 1}, + {120, 4608, 210, 32, 1}, + {120, 5120, 210, 32, 1}, + {120, 5632, 210, 32, 1}, + {120, 6144, 210, 64, 1}, + {120, 6656, 210, 32, 1}, + {120, 7168, 210, 32, 1}, + {120, 7680, 210, 32, 1}, + {120, 8192, 210, 32, 1}, + {120, 8704, 210, 32, 1}, + {120, 9216, 210, 32, 1}, + {120, 9728, 210, 32, 1}, + {120, 10240, 210, 32, 1}, + {120, 20480, 210, 2048, 0}, + {120, 30720, 210, 2048, 0}, + {120, 40960, 210, 2048, 0}, + {120, 51200, 210, 2048, 0}, + {120, 61440, 210, 2048, 0}, + {120, 71680, 210, 2048, 0}, + {120, 81920, 210, 2048, 0}, + {120, 92160, 210, 2048, 0}, + {120, 102400, 210, 2048, 0}, + {120, 204800, 210, 2048, 0}, + {120, 307200, 210, 2048, 0}, + {120, 409600, 210, 2048, 0}, + {120, 512000, 210, 2048, 0}, + {120, 614400, 210, 2048, 0}, + {120, 716800, 210, 716800, 0}, + {120, 819200, 210, 819200, 0}, + {120, 921600, 210, 921600, 0}, + {120, 1024000, 210, 1024000, 0}, + {165, 512, 330, 128, 0}, + {165, 1024, 330, 1024, 0}, + {165, 1536, 330, 32, 1}, + {165, 2048, 330, 128, 1}, + {165, 2560, 330, 128, 1}, + {165, 3072, 330, 32, 1}, + {165, 3584, 330, 32, 1}, + {165, 4096, 330, 4096, 0}, + {165, 4608, 330, 256, 0}, + {165, 5120, 330, 256, 0}, + {165, 5632, 330, 32, 1}, + {165, 6144, 330, 512, 0}, + {165, 6656, 330, 32, 1}, + {165, 7168, 330, 32, 1}, + {165, 7680, 330, 256, 0}, + {165, 8192, 330, 4096, 0}, + {165, 8704, 330, 512, 
0}, + {165, 9216, 330, 512, 0}, + {165, 9728, 330, 512, 0}, + {165, 10240, 330, 512, 0}, + {165, 20480, 330, 4096, 0}, + {165, 30720, 330, 30720, 0}, + {165, 40960, 330, 40960, 0}, + {165, 51200, 330, 512, 0}, + {165, 61440, 330, 4096, 0}, + {165, 71680, 330, 512, 0}, + {165, 81920, 330, 4096, 0}, + {165, 92160, 330, 512, 0}, + {165, 102400, 330, 4096, 0}, + {165, 204800, 330, 4096, 0}, + {165, 307200, 330, 4096, 0}, + {165, 409600, 330, 4096, 0}, + {165, 512000, 330, 4096, 0}, + {165, 614400, 330, 4096, 0}, + {165, 716800, 330, 4096, 0}, + {165, 819200, 330, 32768, 0}, + {165, 921600, 330, 4096, 0}, + {165, 1024000, 330, 4096, 0} }; //////////////////////////////////////////////////////////////////////////////// -std::vector< std::array > dgemm_tn_mi100 = -{ - {1 , 512 , 3 , 64 , 1 }, - {1 , 1024 , 3 , 32 , 1 }, - {1 , 1536 , 3 , 32 , 1 }, - {1 , 2048 , 3 , 64 , 1 }, - {1 , 2560 , 3 , 32 , 1 }, - {1 , 3072 , 3 , 32 , 1 }, - {1 , 3584 , 3 , 32 , 1 }, - {1 , 4096 , 3 , 32 , 1 }, - {1 , 4608 , 3 , 32 , 1 }, - {1 , 5120 , 3 , 32 , 1 }, - {1 , 5632 , 3 , 32 , 1 }, - {1 , 6144 , 3 , 32 , 1 }, - {1 , 6656 , 3 , 32 , 1 }, - {1 , 7168 , 3 , 32 , 1 }, - {1 , 7680 , 3 , 32 , 1 }, - {1 , 8192 , 3 , 32 , 1 }, - {1 , 8704 , 3 , 32 , 1 }, - {1 , 9216 , 3 , 64 , 1 }, - {1 , 9728 , 3 , 64 , 1 }, - {1 , 10240 , 3 , 32 , 1 }, - {1 , 20480 , 3 , 64 , 1 }, - {1 , 30720 , 3 , 64 , 1 }, - {1 , 40960 , 3 , 64 , 1 }, - {1 , 51200 , 3 , 2048 , 1 }, - {1 , 61440 , 3 , 128 , 1 }, - {1 , 71680 , 3 , 1024 , 1 }, - {1 , 81920 , 3 , 2048 , 1 }, - {1 , 92160 , 3 , 2048 , 1 }, - {1 , 102400 , 3 , 2048 , 1 }, - {1 , 204800 , 3 , 1024 , 1 }, - {1 , 307200 , 3 , 1024 , 1 }, - {1 , 409600 , 3 , 4096 , 1 }, - {1 , 512000 , 3 , 2048 , 1 }, - {1 , 614400 , 3 , 2048 , 1 }, - {1 , 716800 , 3 , 4096 , 1 }, - {1 , 819200 , 3 , 1024 , 1 }, - {1 , 921600 , 3 , 4096 , 1 }, - {1 , 1024000, 3 , 2048 , 1 }, - {1 , 512 , 4 , 64 , 1 }, - {1 , 1024 , 4 , 64 , 1 }, - {1 , 1536 , 4 , 64 , 1 }, - {1 , 2048 , 4 , 128 , 1 
}, - {1 , 2560 , 4 , 32 , 1 }, - {1 , 3072 , 4 , 32 , 1 }, - {1 , 3584 , 4 , 32 , 1 }, - {1 , 4096 , 4 , 32 , 1 }, - {1 , 4608 , 4 , 32 , 1 }, - {1 , 5120 , 4 , 32 , 1 }, - {1 , 5632 , 4 , 32 , 1 }, - {1 , 6144 , 4 , 32 , 1 }, - {1 , 6656 , 4 , 32 , 1 }, - {1 , 7168 , 4 , 32 , 1 }, - {1 , 7680 , 4 , 32 , 1 }, - {1 , 8192 , 4 , 32 , 1 }, - {1 , 8704 , 4 , 32 , 1 }, - {1 , 9216 , 4 , 32 , 1 }, - {1 , 9728 , 4 , 32 , 1 }, - {1 , 10240 , 4 , 32 , 1 }, - {1 , 20480 , 4 , 64 , 1 }, - {1 , 30720 , 4 , 64 , 1 }, - {1 , 40960 , 4 , 64 , 1 }, - {1 , 51200 , 4 , 1024 , 1 }, - {1 , 61440 , 4 , 4096 , 1 }, - {1 , 71680 , 4 , 1024 , 1 }, - {1 , 81920 , 4 , 128 , 1 }, - {1 , 92160 , 4 , 2048 , 1 }, - {1 , 102400 , 4 , 1024 , 1 }, - {1 , 204800 , 4 , 4096 , 1 }, - {1 , 307200 , 4 , 2048 , 1 }, - {1 , 409600 , 4 , 4096 , 1 }, - {1 , 512000 , 4 , 4096 , 1 }, - {1 , 614400 , 4 , 2048 , 1 }, - {1 , 716800 , 4 , 4096 , 1 }, - {1 , 819200 , 4 , 8192 , 1 }, - {1 , 921600 , 4 , 2048 , 1 }, - {1 , 1024000, 4 , 2048 , 1 }, - {3 , 512 , 6 , 32 , 1 }, - {3 , 1024 , 6 , 32 , 1 }, - {3 , 1536 , 6 , 32 , 1 }, - {3 , 2048 , 6 , 32 , 1 }, - {3 , 2560 , 6 , 128 , 1 }, - {3 , 3072 , 6 , 32 , 1 }, - {3 , 3584 , 6 , 32 , 1 }, - {3 , 4096 , 6 , 32 , 1 }, - {3 , 4608 , 6 , 32 , 1 }, - {3 , 5120 , 6 , 32 , 1 }, - {3 , 5632 , 6 , 32 , 1 }, - {3 , 6144 , 6 , 32 , 1 }, - {3 , 6656 , 6 , 64 , 1 }, - {3 , 7168 , 6 , 32 , 1 }, - {3 , 7680 , 6 , 32 , 1 }, - {3 , 8192 , 6 , 32 , 1 }, - {3 , 8704 , 6 , 32 , 1 }, - {3 , 9216 , 6 , 32 , 1 }, - {3 , 9728 , 6 , 64 , 1 }, - {3 , 10240 , 6 , 32 , 1 }, - {3 , 20480 , 6 , 64 , 1 }, - {3 , 30720 , 6 , 64 , 1 }, - {3 , 40960 , 6 , 64 , 1 }, - {3 , 51200 , 6 , 128 , 1 }, - {3 , 61440 , 6 , 128 , 1 }, - {3 , 71680 , 6 , 1024 , 1 }, - {3 , 81920 , 6 , 2048 , 1 }, - {3 , 92160 , 6 , 2048 , 1 }, - {3 , 102400 , 6 , 256 , 1 }, - {3 , 204800 , 6 , 256 , 1 }, - {3 , 307200 , 6 , 1024 , 1 }, - {3 , 409600 , 6 , 16384 , 1 }, - {3 , 512000 , 6 , 512 , 1 }, - {3 , 614400 , 6 , 512 , 1 
}, - {3 , 716800 , 6 , 2048 , 1 }, - {3 , 819200 , 6 , 4096 , 1 }, - {3 , 921600 , 6 , 1024 , 1 }, - {3 , 1024000, 6 , 1024 , 1 }, - {4 , 512 , 10 , 32 , 1 }, - {4 , 1024 , 10 , 64 , 1 }, - {4 , 1536 , 10 , 32 , 1 }, - {4 , 2048 , 10 , 32 , 1 }, - {4 , 2560 , 10 , 32 , 1 }, - {4 , 3072 , 10 , 32 , 1 }, - {4 , 3584 , 10 , 32 , 1 }, - {4 , 4096 , 10 , 32 , 1 }, - {4 , 4608 , 10 , 32 , 1 }, - {4 , 5120 , 10 , 32 , 1 }, - {4 , 5632 , 10 , 32 , 1 }, - {4 , 6144 , 10 , 32 , 1 }, - {4 , 6656 , 10 , 32 , 1 }, - {4 , 7168 , 10 , 64 , 1 }, - {4 , 7680 , 10 , 32 , 1 }, - {4 , 8192 , 10 , 32 , 1 }, - {4 , 8704 , 10 , 32 , 1 }, - {4 , 9216 , 10 , 32 , 1 }, - {4 , 9728 , 10 , 32 , 1 }, - {4 , 10240 , 10 , 32 , 1 }, - {4 , 20480 , 10 , 32 , 1 }, - {4 , 30720 , 10 , 1024 , 1 }, - {4 , 40960 , 10 , 64 , 1 }, - {4 , 51200 , 10 , 128 , 1 }, - {4 , 61440 , 10 , 128 , 1 }, - {4 , 71680 , 10 , 64 , 1 }, - {4 , 81920 , 10 , 128 , 1 }, - {4 , 92160 , 10 , 256 , 1 }, - {4 , 102400 , 10 , 256 , 1 }, - {4 , 204800 , 10 , 512 , 1 }, - {4 , 307200 , 10 , 512 , 1 }, - {4 , 409600 , 10 , 16384 , 1 }, - {4 , 512000 , 10 , 1024 , 1 }, - {4 , 614400 , 10 , 2048 , 1 }, - {4 , 716800 , 10 , 2048 , 1 }, - {4 , 819200 , 10 , 1024 , 1 }, - {4 , 921600 , 10 , 4096 , 1 }, - {4 , 1024000, 10 , 4096 , 1 }, - {6 , 512 , 10 , 64 , 1 }, - {6 , 1024 , 10 , 128 , 1 }, - {6 , 1536 , 10 , 128 , 1 }, - {6 , 2048 , 10 , 32 , 1 }, - {6 , 2560 , 10 , 32 , 1 }, - {6 , 3072 , 10 , 32 , 1 }, - {6 , 3584 , 10 , 64 , 1 }, - {6 , 4096 , 10 , 32 , 1 }, - {6 , 4608 , 10 , 32 , 1 }, - {6 , 5120 , 10 , 32 , 1 }, - {6 , 5632 , 10 , 32 , 1 }, - {6 , 6144 , 10 , 32 , 1 }, - {6 , 6656 , 10 , 64 , 1 }, - {6 , 7168 , 10 , 32 , 1 }, - {6 , 7680 , 10 , 32 , 1 }, - {6 , 8192 , 10 , 32 , 1 }, - {6 , 8704 , 10 , 32 , 1 }, - {6 , 9216 , 10 , 32 , 1 }, - {6 , 9728 , 10 , 32 , 1 }, - {6 , 10240 , 10 , 32 , 1 }, - {6 , 20480 , 10 , 128 , 1 }, - {6 , 30720 , 10 , 64 , 1 }, - {6 , 40960 , 10 , 64 , 1 }, - {6 , 51200 , 10 , 64 , 1 }, - {6 , 
61440 , 10 , 128 , 1 }, - {6 , 71680 , 10 , 128 , 1 }, - {6 , 81920 , 10 , 64 , 1 }, - {6 , 92160 , 10 , 64 , 1 }, - {6 , 102400 , 10 , 128 , 1 }, - {6 , 204800 , 10 , 1024 , 1 }, - {6 , 307200 , 10 , 1024 , 1 }, - {6 , 409600 , 10 , 1024 , 1 }, - {6 , 512000 , 10 , 1024 , 1 }, - {6 , 614400 , 10 , 512 , 1 }, - {6 , 716800 , 10 , 1024 , 1 }, - {6 , 819200 , 10 , 32768 , 1 }, - {6 , 921600 , 10 , 2048 , 1 }, - {6 , 1024000, 10 , 8192 , 1 }, - {12 , 512 , 15 , 32 , 1 }, - {12 , 1024 , 15 , 32 , 1 }, - {12 , 1536 , 15 , 32 , 1 }, - {12 , 2048 , 15 , 32 , 1 }, - {12 , 2560 , 15 , 32 , 1 }, - {12 , 3072 , 15 , 64 , 1 }, - {12 , 3584 , 15 , 32 , 1 }, - {12 , 4096 , 15 , 32 , 1 }, - {12 , 4608 , 15 , 32 , 1 }, - {12 , 5120 , 15 , 64 , 1 }, - {12 , 5632 , 15 , 32 , 1 }, - {12 , 6144 , 15 , 32 , 1 }, - {12 , 6656 , 15 , 32 , 1 }, - {12 , 7168 , 15 , 32 , 1 }, - {12 , 7680 , 15 , 32 , 1 }, - {12 , 8192 , 15 , 32 , 1 }, - {12 , 8704 , 15 , 64 , 1 }, - {12 , 9216 , 15 , 64 , 1 }, - {12 , 9728 , 15 , 32 , 1 }, - {12 , 10240 , 15 , 64 , 1 }, - {12 , 20480 , 15 , 32 , 1 }, - {12 , 30720 , 15 , 64 , 1 }, - {12 , 40960 , 15 , 32 , 1 }, - {12 , 51200 , 15 , 64 , 1 }, - {12 , 61440 , 15 , 256 , 1 }, - {12 , 71680 , 15 , 32 , 1 }, - {12 , 81920 , 15 , 512 , 1 }, - {12 , 92160 , 15 , 256 , 1 }, - {12 , 102400 , 15 , 64 , 1 }, - {12 , 204800 , 15 , 512 , 1 }, - {12 , 307200 , 15 , 512 , 1 }, - {12 , 409600 , 15 , 256 , 1 }, - {12 , 512000 , 15 , 2048 , 1 }, - {12 , 614400 , 15 , 2048 , 1 }, - {12 , 716800 , 15 , 512 , 1 }, - {12 , 819200 , 15 , 8192 , 1 }, - {12 , 921600 , 15 , 1024 , 1 }, - {12 , 1024000, 15 , 1024 , 1 }, - {11 , 512 , 20 , 64 , 1 }, - {11 , 1024 , 20 , 64 , 1 }, - {11 , 1536 , 20 , 128 , 1 }, - {11 , 2048 , 20 , 256 , 1 }, - {11 , 2560 , 20 , 32 , 1 }, - {11 , 3072 , 20 , 128 , 1 }, - {11 , 3584 , 20 , 64 , 1 }, - {11 , 4096 , 20 , 32 , 1 }, - {11 , 4608 , 20 , 32 , 1 }, - {11 , 5120 , 20 , 32 , 1 }, - {11 , 5632 , 20 , 32 , 1 }, - {11 , 6144 , 20 , 32 , 1 }, - {11 , 
6656 , 20 , 32 , 1 }, - {11 , 7168 , 20 , 32 , 1 }, - {11 , 7680 , 20 , 32 , 1 }, - {11 , 8192 , 20 , 64 , 1 }, - {11 , 8704 , 20 , 64 , 1 }, - {11 , 9216 , 20 , 64 , 1 }, - {11 , 9728 , 20 , 32 , 1 }, - {11 , 10240 , 20 , 64 , 1 }, - {11 , 20480 , 20 , 64 , 1 }, - {11 , 30720 , 20 , 64 , 1 }, - {11 , 40960 , 20 , 64 , 1 }, - {11 , 51200 , 20 , 64 , 1 }, - {11 , 61440 , 20 , 64 , 1 }, - {11 , 71680 , 20 , 64 , 1 }, - {11 , 81920 , 20 , 64 , 1 }, - {11 , 92160 , 20 , 256 , 1 }, - {11 , 102400 , 20 , 64 , 1 }, - {11 , 204800 , 20 , 128 , 1 }, - {11 , 307200 , 20 , 256 , 1 }, - {11 , 409600 , 20 , 512 , 1 }, - {11 , 512000 , 20 , 256 , 1 }, - {11 , 614400 , 20 , 512 , 1 }, - {11 , 716800 , 20 , 4096 , 1 }, - {11 , 819200 , 20 , 8192 , 1 }, - {11 , 921600 , 20 , 4096 , 1 }, - {11 , 1024000, 20 , 4096 , 1 }, - {16 , 512 , 21 , 64 , 1 }, - {16 , 1024 , 21 , 256 , 1 }, - {16 , 1536 , 21 , 32 , 1 }, - {16 , 2048 , 21 , 32 , 1 }, - {16 , 2560 , 21 , 32 , 1 }, - {16 , 3072 , 21 , 32 , 1 }, - {16 , 3584 , 21 , 32 , 1 }, - {16 , 4096 , 21 , 32 , 1 }, - {16 , 4608 , 21 , 64 , 1 }, - {16 , 5120 , 21 , 64 , 1 }, - {16 , 5632 , 21 , 32 , 1 }, - {16 , 6144 , 21 , 32 , 1 }, - {16 , 6656 , 21 , 64 , 1 }, - {16 , 7168 , 21 , 32 , 1 }, - {16 , 7680 , 21 , 32 , 1 }, - {16 , 8192 , 21 , 32 , 1 }, - {16 , 8704 , 21 , 256 , 1 }, - {16 , 9216 , 21 , 128 , 1 }, - {16 , 9728 , 21 , 128 , 1 }, - {16 , 10240 , 21 , 32 , 1 }, - {16 , 20480 , 21 , 32 , 1 }, - {16 , 30720 , 21 , 64 , 1 }, - {16 , 40960 , 21 , 32 , 1 }, - {16 , 51200 , 21 , 32 , 1 }, - {16 , 61440 , 21 , 32 , 1 }, - {16 , 71680 , 21 , 32 , 1 }, - {16 , 81920 , 21 , 128 , 1 }, - {16 , 92160 , 21 , 512 , 1 }, - {16 , 102400 , 21 , 128 , 1 }, - {16 , 204800 , 21 , 128 , 1 }, - {16 , 307200 , 21 , 2048 , 1 }, - {16 , 409600 , 21 , 8192 , 1 }, - {16 , 512000 , 21 , 2048 , 1 }, - {16 , 614400 , 21 , 512 , 1 }, - {16 , 716800 , 21 , 4096 , 1 }, - {16 , 819200 , 21 , 32768 , 1 }, - {16 , 921600 , 21 , 4096 , 1 }, - {16 , 1024000, 21 , 2048 
, 1 }, - {25 , 512 , 28 , 64 , 1 }, - {25 , 1024 , 28 , 32 , 1 }, - {25 , 1536 , 28 , 128 , 1 }, - {25 , 2048 , 28 , 128 , 1 }, - {25 , 2560 , 28 , 32 , 1 }, - {25 , 3072 , 28 , 64 , 1 }, - {25 , 3584 , 28 , 32 , 1 }, - {25 , 4096 , 28 , 32 , 1 }, - {25 , 4608 , 28 , 32 , 1 }, - {25 , 5120 , 28 , 32 , 1 }, - {25 , 5632 , 28 , 32 , 1 }, - {25 , 6144 , 28 , 64 , 1 }, - {25 , 6656 , 28 , 32 , 1 }, - {25 , 7168 , 28 , 32 , 1 }, - {25 , 7680 , 28 , 32 , 1 }, - {25 , 8192 , 28 , 32 , 1 }, - {25 , 8704 , 28 , 64 , 1 }, - {25 , 9216 , 28 , 32 , 1 }, - {25 , 9728 , 28 , 32 , 1 }, - {25 , 10240 , 28 , 32 , 1 }, - {25 , 20480 , 28 , 32 , 1 }, - {25 , 30720 , 28 , 32 , 1 }, - {25 , 40960 , 28 , 128 , 1 }, - {25 , 51200 , 28 , 64 , 1 }, - {25 , 61440 , 28 , 256 , 1 }, - {25 , 71680 , 28 , 64 , 1 }, - {25 , 81920 , 28 , 256 , 1 }, - {25 , 92160 , 28 , 128 , 1 }, - {25 , 102400 , 28 , 1024 , 1 }, - {25 , 204800 , 28 , 8192 , 1 }, - {25 , 307200 , 28 , 4096 , 1 }, - {25 , 409600 , 28 , 4096 , 1 }, - {25 , 512000 , 28 , 4096 , 1 }, - {25 , 614400 , 28 , 8192 , 1 }, - {25 , 716800 , 28 , 2048 , 1 }, - {25 , 819200 , 28 , 32768 , 1 }, - {25 , 921600 , 28 , 4096 , 1 }, - {25 , 1024000, 28 , 4096 , 1 }, - {24 , 512 , 35 , 128 , 1 }, - {24 , 1024 , 35 , 32 , 1 }, - {24 , 1536 , 35 , 32 , 1 }, - {24 , 2048 , 35 , 128 , 1 }, - {24 , 2560 , 35 , 32 , 1 }, - {24 , 3072 , 35 , 32 , 1 }, - {24 , 3584 , 35 , 32 , 1 }, - {24 , 4096 , 35 , 64 , 1 }, - {24 , 4608 , 35 , 32 , 1 }, - {24 , 5120 , 35 , 32 , 1 }, - {24 , 5632 , 35 , 32 , 1 }, - {24 , 6144 , 35 , 256 , 1 }, - {24 , 6656 , 35 , 128 , 1 }, - {24 , 7168 , 35 , 256 , 1 }, - {24 , 7680 , 35 , 64 , 1 }, - {24 , 8192 , 35 , 64 , 1 }, - {24 , 8704 , 35 , 32 , 1 }, - {24 , 9216 , 35 , 32 , 1 }, - {24 , 9728 , 35 , 32 , 1 }, - {24 , 10240 , 35 , 32 , 1 }, - {24 , 20480 , 35 , 64 , 1 }, - {24 , 30720 , 35 , 64 , 1 }, - {24 , 40960 , 35 , 64 , 1 }, - {24 , 51200 , 35 , 32 , 1 }, - {24 , 61440 , 35 , 64 , 1 }, - {24 , 71680 , 35 , 64 , 1 }, - {24 
, 81920 , 35 , 64 , 1 }, - {24 , 92160 , 35 , 64 , 1 }, - {24 , 102400 , 35 , 128 , 1 }, - {24 , 204800 , 35 , 128 , 1 }, - {24 , 307200 , 35 , 1024 , 1 }, - {24 , 409600 , 35 , 256 , 1 }, - {24 , 512000 , 35 , 256 , 1 }, - {24 , 614400 , 35 , 4096 , 1 }, - {24 , 716800 , 35 , 4096 , 1 }, - {24 , 819200 , 35 , 512 , 1 }, - {24 , 921600 , 35 , 4096 , 1 }, - {24 , 1024000, 35 , 8192 , 1 }, - {33 , 512 , 36 , 32 , 1 }, - {33 , 1024 , 36 , 32 , 1 }, - {33 , 1536 , 36 , 64 , 1 }, - {33 , 2048 , 36 , 64 , 1 }, - {33 , 2560 , 36 , 32 , 1 }, - {33 , 3072 , 36 , 64 , 1 }, - {33 , 3584 , 36 , 32 , 1 }, - {33 , 4096 , 36 , 256 , 1 }, - {33 , 4608 , 36 , 128 , 1 }, - {33 , 5120 , 36 , 128 , 1 }, - {33 , 5632 , 36 , 128 , 1 }, - {33 , 6144 , 36 , 32 , 1 }, - {33 , 6656 , 36 , 64 , 1 }, - {33 , 7168 , 36 , 32 , 1 }, - {33 , 7680 , 36 , 32 , 1 }, - {33 , 8192 , 36 , 64 , 1 }, - {33 , 8704 , 36 , 128 , 1 }, - {33 , 9216 , 36 , 256 , 1 }, - {33 , 9728 , 36 , 32 , 1 }, - {33 , 10240 , 36 , 32 , 1 }, - {33 , 20480 , 36 , 32 , 1 }, - {33 , 30720 , 36 , 128 , 1 }, - {33 , 40960 , 36 , 32 , 1 }, - {33 , 51200 , 36 , 64 , 1 }, - {33 , 61440 , 36 , 32 , 1 }, - {33 , 71680 , 36 , 32 , 1 }, - {33 , 81920 , 36 , 64 , 1 }, - {33 , 92160 , 36 , 64 , 1 }, - {33 , 102400 , 36 , 64 , 1 }, - {33 , 204800 , 36 , 1024 , 1 }, - {33 , 307200 , 36 , 256 , 1 }, - {33 , 409600 , 36 , 1024 , 1 }, - {33 , 512000 , 36 , 256 , 1 }, - {33 , 614400 , 36 , 512 , 1 }, - {33 , 716800 , 36 , 4096 , 1 }, - {33 , 819200 , 36 , 1024 , 1 }, - {33 , 921600 , 36 , 4096 , 1 }, - {33 , 1024000, 36 , 4096 , 1 }, - {42 , 512 , 45 , 32 , 1 }, - {42 , 1024 , 45 , 64 , 1 }, - {42 , 1536 , 45 , 64 , 1 }, - {42 , 2048 , 45 , 64 , 1 }, - {42 , 2560 , 45 , 32 , 1 }, - {42 , 3072 , 45 , 32 , 1 }, - {42 , 3584 , 45 , 128 , 1 }, - {42 , 4096 , 45 , 64 , 1 }, - {42 , 4608 , 45 , 32 , 1 }, - {42 , 5120 , 45 , 32 , 1 }, - {42 , 5632 , 45 , 32 , 1 }, - {42 , 6144 , 45 , 32 , 1 }, - {42 , 6656 , 45 , 128 , 1 }, - {42 , 7168 , 45 , 32 , 1 
}, - {42 , 7680 , 45 , 64 , 1 }, - {42 , 8192 , 45 , 64 , 1 }, - {42 , 8704 , 45 , 32 , 1 }, - {42 , 9216 , 45 , 128 , 1 }, - {42 , 9728 , 45 , 64 , 1 }, - {42 , 10240 , 45 , 64 , 1 }, - {42 , 20480 , 45 , 32 , 1 }, - {42 , 30720 , 45 , 128 , 1 }, - {42 , 40960 , 45 , 32 , 1 }, - {42 , 51200 , 45 , 32 , 1 }, - {42 , 61440 , 45 , 64 , 1 }, - {42 , 71680 , 45 , 64 , 1 }, - {42 , 81920 , 45 , 64 , 1 }, - {42 , 92160 , 45 , 64 , 1 }, - {42 , 102400 , 45 , 64 , 1 }, - {42 , 204800 , 45 , 128 , 1 }, - {42 , 307200 , 45 , 2048 , 1 }, - {42 , 409600 , 45 , 256 , 1 }, - {42 , 512000 , 45 , 4096 , 1 }, - {42 , 614400 , 45 , 512 , 1 }, - {42 , 716800 , 45 , 512 , 1 }, - {42 , 819200 , 45 , 512 , 1 }, - {42 , 921600 , 45 , 4096 , 1 }, - {42 , 1024000, 45 , 4096 , 1 }, - {43 , 512 , 56 , 64 , 1 }, - {43 , 1024 , 56 , 128 , 1 }, - {43 , 1536 , 56 , 32 , 1 }, - {43 , 2048 , 56 , 64 , 1 }, - {43 , 2560 , 56 , 64 , 1 }, - {43 , 3072 , 56 , 64 , 1 }, - {43 , 3584 , 56 , 64 , 1 }, - {43 , 4096 , 56 , 32 , 1 }, - {43 , 4608 , 56 , 32 , 1 }, - {43 , 5120 , 56 , 32 , 1 }, - {43 , 5632 , 56 , 32 , 1 }, - {43 , 6144 , 56 , 64 , 1 }, - {43 , 6656 , 56 , 32 , 1 }, - {43 , 7168 , 56 , 64 , 1 }, - {43 , 7680 , 56 , 64 , 1 }, - {43 , 8192 , 56 , 32 , 1 }, - {43 , 8704 , 56 , 32 , 1 }, - {43 , 9216 , 56 , 32 , 1 }, - {43 , 9728 , 56 , 32 , 1 }, - {43 , 10240 , 56 , 128 , 1 }, - {43 , 20480 , 56 , 32 , 1 }, - {43 , 30720 , 56 , 128 , 1 }, - {43 , 40960 , 56 , 64 , 1 }, - {43 , 51200 , 56 , 128 , 1 }, - {43 , 61440 , 56 , 32 , 1 }, - {43 , 71680 , 56 , 64 , 1 }, - {43 , 81920 , 56 , 128 , 1 }, - {43 , 92160 , 56 , 64 , 1 }, - {43 , 102400 , 56 , 64 , 1 }, - {43 , 204800 , 56 , 128 , 1 }, - {43 , 307200 , 56 , 128 , 1 }, - {43 , 409600 , 56 , 16384 , 1 }, - {43 , 512000 , 56 , 1024 , 1 }, - {43 , 614400 , 56 , 512 , 1 }, - {43 , 716800 , 56 , 2048 , 1 }, - {43 , 819200 , 56 , 8192 , 1 }, - {43 , 921600 , 56 , 1024 , 1 }, - {43 , 1024000, 56 , 2048 , 1 }, - {126 , 512 , 84 , 32 , 1 }, - {126 , 1024 
, 84 , 32 , 1 }, - {126 , 1536 , 84 , 32 , 1 }, - {126 , 2048 , 84 , 32 , 1 }, - {126 , 2560 , 84 , 32 , 1 }, - {126 , 3072 , 84 , 32 , 1 }, - {126 , 3584 , 84 , 32 , 1 }, - {126 , 4096 , 84 , 2048 , 1 }, - {126 , 4608 , 84 , 32 , 1 }, - {126 , 5120 , 84 , 512 , 1 }, - {126 , 5632 , 84 , 32 , 1 }, - {126 , 6144 , 84 , 2048 , 1 }, - {126 , 6656 , 84 , 32 , 1 }, - {126 , 7168 , 84 , 32 , 1 }, - {126 , 7680 , 84 , 32 , 1 }, - {126 , 8192 , 84 , 128 , 1 }, - {126 , 8704 , 84 , 64 , 1 }, - {126 , 9216 , 84 , 32 , 1 }, - {126 , 9728 , 84 , 32 , 1 }, - {126 , 10240 , 84 , 32 , 1 }, - {126 , 20480 , 84 , 32 , 1 }, - {126 , 30720 , 84 , 32 , 1 }, - {126 , 40960 , 84 , 2048 , 1 }, - {126 , 51200 , 84 , 32 , 1 }, - {126 , 61440 , 84 , 32 , 1 }, - {126 , 71680 , 84 , 256 , 1 }, - {126 , 81920 , 84 , 64 , 1 }, - {126 , 92160 , 84 , 512 , 1 }, - {126 , 102400 , 84 , 128 , 1 }, - {126 , 204800 , 84 , 204800 , 1 }, - {126 , 307200 , 84 , 307200 , 1 }, - {126 , 409600 , 84 , 409600 , 1 }, - {126 , 512000 , 84 , 512000 , 1 }, - {126 , 614400 , 84 , 614400 , 1 }, - {126 , 716800 , 84 , 716800 , 1 }, - {126 , 819200 , 84 , 819200 , 1 }, - {126 , 921600 , 84 , 921600 , 1 }, - {126 , 1024000, 84 , 1024000, 1 }, - {210 , 512 , 120 , 32 , 1 }, - {210 , 1024 , 120 , 32 , 1 }, - {210 , 1536 , 120 , 32 , 1 }, - {210 , 2048 , 120 , 64 , 1 }, - {210 , 2560 , 120 , 64 , 1 }, - {210 , 3072 , 120 , 64 , 1 }, - {210 , 3584 , 120 , 64 , 1 }, - {210 , 4096 , 120 , 64 , 1 }, - {210 , 4608 , 120 , 32 , 1 }, - {210 , 5120 , 120 , 32 , 1 }, - {210 , 5632 , 120 , 256 , 1 }, - {210 , 6144 , 120 , 32 , 1 }, - {210 , 6656 , 120 , 32 , 1 }, - {210 , 7168 , 120 , 64 , 1 }, - {210 , 7680 , 120 , 32 , 1 }, - {210 , 8192 , 120 , 32 , 1 }, - {210 , 8704 , 120 , 32 , 1 }, - {210 , 9216 , 120 , 32 , 1 }, - {210 , 9728 , 120 , 32 , 1 }, - {210 , 10240 , 120 , 32 , 1 }, - {210 , 20480 , 120 , 512 , 1 }, - {210 , 30720 , 120 , 1024 , 1 }, - {210 , 40960 , 120 , 1024 , 1 }, - {210 , 51200 , 120 , 64 , 1 }, - {210 , 
61440 , 120 , 32 , 1 }, - {210 , 71680 , 120 , 32 , 1 }, - {210 , 81920 , 120 , 32 , 1 }, - {210 , 92160 , 120 , 64 , 1 }, - {210 , 102400 , 120 , 256 , 1 }, - {210 , 204800 , 120 , 204800 , 1 }, - {210 , 307200 , 120 , 307200 , 1 }, - {210 , 409600 , 120 , 409600 , 1 }, - {210 , 512000 , 120 , 512000 , 1 }, - {210 , 614400 , 120 , 614400 , 1 }, - {210 , 716800 , 120 , 716800 , 1 }, - {210 , 819200 , 120 , 819200 , 1 }, - {210 , 921600 , 120 , 921600 , 1 }, - {210 , 1024000, 120 , 1024000, 1 }, - {330 , 512 , 165 , 32 , 1 }, - {330 , 1024 , 165 , 32 , 1 }, - {330 , 1536 , 165 , 512 , 1 }, - {330 , 2048 , 165 , 2048 , 1 }, - {330 , 2560 , 165 , 512 , 1 }, - {330 , 3072 , 165 , 512 , 1 }, - {330 , 3584 , 165 , 512 , 1 }, - {330 , 4096 , 165 , 4096 , 1 }, - {330 , 4608 , 165 , 512 , 1 }, - {330 , 5120 , 165 , 512 , 1 }, - {330 , 5632 , 165 , 512 , 1 }, - {330 , 6144 , 165 , 1024 , 1 }, - {330 , 6656 , 165 , 512 , 1 }, - {330 , 7168 , 165 , 512 , 1 }, - {330 , 7680 , 165 , 512 , 1 }, - {330 , 8192 , 165 , 8192 , 1 }, - {330 , 8704 , 165 , 512 , 1 }, - {330 , 9216 , 165 , 512 , 1 }, - {330 , 9728 , 165 , 512 , 1 }, - {330 , 10240 , 165 , 2048 , 1 }, - {330 , 20480 , 165 , 2048 , 1 }, - {330 , 30720 , 165 , 2048 , 1 }, - {330 , 40960 , 165 , 8192 , 1 }, - {330 , 51200 , 165 , 2048 , 1 }, - {330 , 61440 , 165 , 2048 , 1 }, - {330 , 71680 , 165 , 2048 , 1 }, - {330 , 81920 , 165 , 8192 , 1 }, - {330 , 92160 , 165 , 2048 , 1 }, - {330 , 102400 , 165 , 2048 , 1 }, - {330 , 204800 , 165 , 8192 , 1 }, - {330 , 307200 , 165 , 2048 , 1 }, - {330 , 409600 , 165 , 8192 , 1 }, - {330 , 512000 , 165 , 2048 , 1 }, - {330 , 614400 , 165 , 8192 , 1 }, - {330 , 716800 , 165 , 4096 , 1 }, - {330 , 819200 , 165 , 32768 , 1 }, - {330 , 921600 , 165 , 4096 , 1 }, - {330 , 1024000, 165 , 8192 , 1 } +std::vector > dgemm_tn_mi100 = { + {1, 512, 3, 64, 1}, + {1, 1024, 3, 32, 1}, + {1, 1536, 3, 32, 1}, + {1, 2048, 3, 64, 1}, + {1, 2560, 3, 32, 1}, + {1, 3072, 3, 32, 1}, + {1, 3584, 3, 32, 1}, + 
{1, 4096, 3, 32, 1}, + {1, 4608, 3, 32, 1}, + {1, 5120, 3, 32, 1}, + {1, 5632, 3, 32, 1}, + {1, 6144, 3, 32, 1}, + {1, 6656, 3, 32, 1}, + {1, 7168, 3, 32, 1}, + {1, 7680, 3, 32, 1}, + {1, 8192, 3, 32, 1}, + {1, 8704, 3, 32, 1}, + {1, 9216, 3, 64, 1}, + {1, 9728, 3, 64, 1}, + {1, 10240, 3, 32, 1}, + {1, 20480, 3, 64, 1}, + {1, 30720, 3, 64, 1}, + {1, 40960, 3, 64, 1}, + {1, 51200, 3, 2048, 1}, + {1, 61440, 3, 128, 1}, + {1, 71680, 3, 1024, 1}, + {1, 81920, 3, 2048, 1}, + {1, 92160, 3, 2048, 1}, + {1, 102400, 3, 2048, 1}, + {1, 204800, 3, 1024, 1}, + {1, 307200, 3, 1024, 1}, + {1, 409600, 3, 4096, 1}, + {1, 512000, 3, 2048, 1}, + {1, 614400, 3, 2048, 1}, + {1, 716800, 3, 4096, 1}, + {1, 819200, 3, 1024, 1}, + {1, 921600, 3, 4096, 1}, + {1, 1024000, 3, 2048, 1}, + {1, 512, 4, 64, 1}, + {1, 1024, 4, 64, 1}, + {1, 1536, 4, 64, 1}, + {1, 2048, 4, 128, 1}, + {1, 2560, 4, 32, 1}, + {1, 3072, 4, 32, 1}, + {1, 3584, 4, 32, 1}, + {1, 4096, 4, 32, 1}, + {1, 4608, 4, 32, 1}, + {1, 5120, 4, 32, 1}, + {1, 5632, 4, 32, 1}, + {1, 6144, 4, 32, 1}, + {1, 6656, 4, 32, 1}, + {1, 7168, 4, 32, 1}, + {1, 7680, 4, 32, 1}, + {1, 8192, 4, 32, 1}, + {1, 8704, 4, 32, 1}, + {1, 9216, 4, 32, 1}, + {1, 9728, 4, 32, 1}, + {1, 10240, 4, 32, 1}, + {1, 20480, 4, 64, 1}, + {1, 30720, 4, 64, 1}, + {1, 40960, 4, 64, 1}, + {1, 51200, 4, 1024, 1}, + {1, 61440, 4, 4096, 1}, + {1, 71680, 4, 1024, 1}, + {1, 81920, 4, 128, 1}, + {1, 92160, 4, 2048, 1}, + {1, 102400, 4, 1024, 1}, + {1, 204800, 4, 4096, 1}, + {1, 307200, 4, 2048, 1}, + {1, 409600, 4, 4096, 1}, + {1, 512000, 4, 4096, 1}, + {1, 614400, 4, 2048, 1}, + {1, 716800, 4, 4096, 1}, + {1, 819200, 4, 8192, 1}, + {1, 921600, 4, 2048, 1}, + {1, 1024000, 4, 2048, 1}, + {3, 512, 6, 32, 1}, + {3, 1024, 6, 32, 1}, + {3, 1536, 6, 32, 1}, + {3, 2048, 6, 32, 1}, + {3, 2560, 6, 128, 1}, + {3, 3072, 6, 32, 1}, + {3, 3584, 6, 32, 1}, + {3, 4096, 6, 32, 1}, + {3, 4608, 6, 32, 1}, + {3, 5120, 6, 32, 1}, + {3, 5632, 6, 32, 1}, + {3, 6144, 6, 32, 1}, + {3, 6656, 6, 64, 
1}, + {3, 7168, 6, 32, 1}, + {3, 7680, 6, 32, 1}, + {3, 8192, 6, 32, 1}, + {3, 8704, 6, 32, 1}, + {3, 9216, 6, 32, 1}, + {3, 9728, 6, 64, 1}, + {3, 10240, 6, 32, 1}, + {3, 20480, 6, 64, 1}, + {3, 30720, 6, 64, 1}, + {3, 40960, 6, 64, 1}, + {3, 51200, 6, 128, 1}, + {3, 61440, 6, 128, 1}, + {3, 71680, 6, 1024, 1}, + {3, 81920, 6, 2048, 1}, + {3, 92160, 6, 2048, 1}, + {3, 102400, 6, 256, 1}, + {3, 204800, 6, 256, 1}, + {3, 307200, 6, 1024, 1}, + {3, 409600, 6, 16384, 1}, + {3, 512000, 6, 512, 1}, + {3, 614400, 6, 512, 1}, + {3, 716800, 6, 2048, 1}, + {3, 819200, 6, 4096, 1}, + {3, 921600, 6, 1024, 1}, + {3, 1024000, 6, 1024, 1}, + {4, 512, 10, 32, 1}, + {4, 1024, 10, 64, 1}, + {4, 1536, 10, 32, 1}, + {4, 2048, 10, 32, 1}, + {4, 2560, 10, 32, 1}, + {4, 3072, 10, 32, 1}, + {4, 3584, 10, 32, 1}, + {4, 4096, 10, 32, 1}, + {4, 4608, 10, 32, 1}, + {4, 5120, 10, 32, 1}, + {4, 5632, 10, 32, 1}, + {4, 6144, 10, 32, 1}, + {4, 6656, 10, 32, 1}, + {4, 7168, 10, 64, 1}, + {4, 7680, 10, 32, 1}, + {4, 8192, 10, 32, 1}, + {4, 8704, 10, 32, 1}, + {4, 9216, 10, 32, 1}, + {4, 9728, 10, 32, 1}, + {4, 10240, 10, 32, 1}, + {4, 20480, 10, 32, 1}, + {4, 30720, 10, 1024, 1}, + {4, 40960, 10, 64, 1}, + {4, 51200, 10, 128, 1}, + {4, 61440, 10, 128, 1}, + {4, 71680, 10, 64, 1}, + {4, 81920, 10, 128, 1}, + {4, 92160, 10, 256, 1}, + {4, 102400, 10, 256, 1}, + {4, 204800, 10, 512, 1}, + {4, 307200, 10, 512, 1}, + {4, 409600, 10, 16384, 1}, + {4, 512000, 10, 1024, 1}, + {4, 614400, 10, 2048, 1}, + {4, 716800, 10, 2048, 1}, + {4, 819200, 10, 1024, 1}, + {4, 921600, 10, 4096, 1}, + {4, 1024000, 10, 4096, 1}, + {6, 512, 10, 64, 1}, + {6, 1024, 10, 128, 1}, + {6, 1536, 10, 128, 1}, + {6, 2048, 10, 32, 1}, + {6, 2560, 10, 32, 1}, + {6, 3072, 10, 32, 1}, + {6, 3584, 10, 64, 1}, + {6, 4096, 10, 32, 1}, + {6, 4608, 10, 32, 1}, + {6, 5120, 10, 32, 1}, + {6, 5632, 10, 32, 1}, + {6, 6144, 10, 32, 1}, + {6, 6656, 10, 64, 1}, + {6, 7168, 10, 32, 1}, + {6, 7680, 10, 32, 1}, + {6, 8192, 10, 32, 1}, + {6, 8704, 10, 
32, 1}, + {6, 9216, 10, 32, 1}, + {6, 9728, 10, 32, 1}, + {6, 10240, 10, 32, 1}, + {6, 20480, 10, 128, 1}, + {6, 30720, 10, 64, 1}, + {6, 40960, 10, 64, 1}, + {6, 51200, 10, 64, 1}, + {6, 61440, 10, 128, 1}, + {6, 71680, 10, 128, 1}, + {6, 81920, 10, 64, 1}, + {6, 92160, 10, 64, 1}, + {6, 102400, 10, 128, 1}, + {6, 204800, 10, 1024, 1}, + {6, 307200, 10, 1024, 1}, + {6, 409600, 10, 1024, 1}, + {6, 512000, 10, 1024, 1}, + {6, 614400, 10, 512, 1}, + {6, 716800, 10, 1024, 1}, + {6, 819200, 10, 32768, 1}, + {6, 921600, 10, 2048, 1}, + {6, 1024000, 10, 8192, 1}, + {12, 512, 15, 32, 1}, + {12, 1024, 15, 32, 1}, + {12, 1536, 15, 32, 1}, + {12, 2048, 15, 32, 1}, + {12, 2560, 15, 32, 1}, + {12, 3072, 15, 64, 1}, + {12, 3584, 15, 32, 1}, + {12, 4096, 15, 32, 1}, + {12, 4608, 15, 32, 1}, + {12, 5120, 15, 64, 1}, + {12, 5632, 15, 32, 1}, + {12, 6144, 15, 32, 1}, + {12, 6656, 15, 32, 1}, + {12, 7168, 15, 32, 1}, + {12, 7680, 15, 32, 1}, + {12, 8192, 15, 32, 1}, + {12, 8704, 15, 64, 1}, + {12, 9216, 15, 64, 1}, + {12, 9728, 15, 32, 1}, + {12, 10240, 15, 64, 1}, + {12, 20480, 15, 32, 1}, + {12, 30720, 15, 64, 1}, + {12, 40960, 15, 32, 1}, + {12, 51200, 15, 64, 1}, + {12, 61440, 15, 256, 1}, + {12, 71680, 15, 32, 1}, + {12, 81920, 15, 512, 1}, + {12, 92160, 15, 256, 1}, + {12, 102400, 15, 64, 1}, + {12, 204800, 15, 512, 1}, + {12, 307200, 15, 512, 1}, + {12, 409600, 15, 256, 1}, + {12, 512000, 15, 2048, 1}, + {12, 614400, 15, 2048, 1}, + {12, 716800, 15, 512, 1}, + {12, 819200, 15, 8192, 1}, + {12, 921600, 15, 1024, 1}, + {12, 1024000, 15, 1024, 1}, + {11, 512, 20, 64, 1}, + {11, 1024, 20, 64, 1}, + {11, 1536, 20, 128, 1}, + {11, 2048, 20, 256, 1}, + {11, 2560, 20, 32, 1}, + {11, 3072, 20, 128, 1}, + {11, 3584, 20, 64, 1}, + {11, 4096, 20, 32, 1}, + {11, 4608, 20, 32, 1}, + {11, 5120, 20, 32, 1}, + {11, 5632, 20, 32, 1}, + {11, 6144, 20, 32, 1}, + {11, 6656, 20, 32, 1}, + {11, 7168, 20, 32, 1}, + {11, 7680, 20, 32, 1}, + {11, 8192, 20, 64, 1}, + {11, 8704, 20, 64, 1}, + {11, 9216, 
20, 64, 1}, + {11, 9728, 20, 32, 1}, + {11, 10240, 20, 64, 1}, + {11, 20480, 20, 64, 1}, + {11, 30720, 20, 64, 1}, + {11, 40960, 20, 64, 1}, + {11, 51200, 20, 64, 1}, + {11, 61440, 20, 64, 1}, + {11, 71680, 20, 64, 1}, + {11, 81920, 20, 64, 1}, + {11, 92160, 20, 256, 1}, + {11, 102400, 20, 64, 1}, + {11, 204800, 20, 128, 1}, + {11, 307200, 20, 256, 1}, + {11, 409600, 20, 512, 1}, + {11, 512000, 20, 256, 1}, + {11, 614400, 20, 512, 1}, + {11, 716800, 20, 4096, 1}, + {11, 819200, 20, 8192, 1}, + {11, 921600, 20, 4096, 1}, + {11, 1024000, 20, 4096, 1}, + {16, 512, 21, 64, 1}, + {16, 1024, 21, 256, 1}, + {16, 1536, 21, 32, 1}, + {16, 2048, 21, 32, 1}, + {16, 2560, 21, 32, 1}, + {16, 3072, 21, 32, 1}, + {16, 3584, 21, 32, 1}, + {16, 4096, 21, 32, 1}, + {16, 4608, 21, 64, 1}, + {16, 5120, 21, 64, 1}, + {16, 5632, 21, 32, 1}, + {16, 6144, 21, 32, 1}, + {16, 6656, 21, 64, 1}, + {16, 7168, 21, 32, 1}, + {16, 7680, 21, 32, 1}, + {16, 8192, 21, 32, 1}, + {16, 8704, 21, 256, 1}, + {16, 9216, 21, 128, 1}, + {16, 9728, 21, 128, 1}, + {16, 10240, 21, 32, 1}, + {16, 20480, 21, 32, 1}, + {16, 30720, 21, 64, 1}, + {16, 40960, 21, 32, 1}, + {16, 51200, 21, 32, 1}, + {16, 61440, 21, 32, 1}, + {16, 71680, 21, 32, 1}, + {16, 81920, 21, 128, 1}, + {16, 92160, 21, 512, 1}, + {16, 102400, 21, 128, 1}, + {16, 204800, 21, 128, 1}, + {16, 307200, 21, 2048, 1}, + {16, 409600, 21, 8192, 1}, + {16, 512000, 21, 2048, 1}, + {16, 614400, 21, 512, 1}, + {16, 716800, 21, 4096, 1}, + {16, 819200, 21, 32768, 1}, + {16, 921600, 21, 4096, 1}, + {16, 1024000, 21, 2048, 1}, + {25, 512, 28, 64, 1}, + {25, 1024, 28, 32, 1}, + {25, 1536, 28, 128, 1}, + {25, 2048, 28, 128, 1}, + {25, 2560, 28, 32, 1}, + {25, 3072, 28, 64, 1}, + {25, 3584, 28, 32, 1}, + {25, 4096, 28, 32, 1}, + {25, 4608, 28, 32, 1}, + {25, 5120, 28, 32, 1}, + {25, 5632, 28, 32, 1}, + {25, 6144, 28, 64, 1}, + {25, 6656, 28, 32, 1}, + {25, 7168, 28, 32, 1}, + {25, 7680, 28, 32, 1}, + {25, 8192, 28, 32, 1}, + {25, 8704, 28, 64, 1}, + {25, 9216, 
28, 32, 1}, + {25, 9728, 28, 32, 1}, + {25, 10240, 28, 32, 1}, + {25, 20480, 28, 32, 1}, + {25, 30720, 28, 32, 1}, + {25, 40960, 28, 128, 1}, + {25, 51200, 28, 64, 1}, + {25, 61440, 28, 256, 1}, + {25, 71680, 28, 64, 1}, + {25, 81920, 28, 256, 1}, + {25, 92160, 28, 128, 1}, + {25, 102400, 28, 1024, 1}, + {25, 204800, 28, 8192, 1}, + {25, 307200, 28, 4096, 1}, + {25, 409600, 28, 4096, 1}, + {25, 512000, 28, 4096, 1}, + {25, 614400, 28, 8192, 1}, + {25, 716800, 28, 2048, 1}, + {25, 819200, 28, 32768, 1}, + {25, 921600, 28, 4096, 1}, + {25, 1024000, 28, 4096, 1}, + {24, 512, 35, 128, 1}, + {24, 1024, 35, 32, 1}, + {24, 1536, 35, 32, 1}, + {24, 2048, 35, 128, 1}, + {24, 2560, 35, 32, 1}, + {24, 3072, 35, 32, 1}, + {24, 3584, 35, 32, 1}, + {24, 4096, 35, 64, 1}, + {24, 4608, 35, 32, 1}, + {24, 5120, 35, 32, 1}, + {24, 5632, 35, 32, 1}, + {24, 6144, 35, 256, 1}, + {24, 6656, 35, 128, 1}, + {24, 7168, 35, 256, 1}, + {24, 7680, 35, 64, 1}, + {24, 8192, 35, 64, 1}, + {24, 8704, 35, 32, 1}, + {24, 9216, 35, 32, 1}, + {24, 9728, 35, 32, 1}, + {24, 10240, 35, 32, 1}, + {24, 20480, 35, 64, 1}, + {24, 30720, 35, 64, 1}, + {24, 40960, 35, 64, 1}, + {24, 51200, 35, 32, 1}, + {24, 61440, 35, 64, 1}, + {24, 71680, 35, 64, 1}, + {24, 81920, 35, 64, 1}, + {24, 92160, 35, 64, 1}, + {24, 102400, 35, 128, 1}, + {24, 204800, 35, 128, 1}, + {24, 307200, 35, 1024, 1}, + {24, 409600, 35, 256, 1}, + {24, 512000, 35, 256, 1}, + {24, 614400, 35, 4096, 1}, + {24, 716800, 35, 4096, 1}, + {24, 819200, 35, 512, 1}, + {24, 921600, 35, 4096, 1}, + {24, 1024000, 35, 8192, 1}, + {33, 512, 36, 32, 1}, + {33, 1024, 36, 32, 1}, + {33, 1536, 36, 64, 1}, + {33, 2048, 36, 64, 1}, + {33, 2560, 36, 32, 1}, + {33, 3072, 36, 64, 1}, + {33, 3584, 36, 32, 1}, + {33, 4096, 36, 256, 1}, + {33, 4608, 36, 128, 1}, + {33, 5120, 36, 128, 1}, + {33, 5632, 36, 128, 1}, + {33, 6144, 36, 32, 1}, + {33, 6656, 36, 64, 1}, + {33, 7168, 36, 32, 1}, + {33, 7680, 36, 32, 1}, + {33, 8192, 36, 64, 1}, + {33, 8704, 36, 128, 1}, + 
{33, 9216, 36, 256, 1}, + {33, 9728, 36, 32, 1}, + {33, 10240, 36, 32, 1}, + {33, 20480, 36, 32, 1}, + {33, 30720, 36, 128, 1}, + {33, 40960, 36, 32, 1}, + {33, 51200, 36, 64, 1}, + {33, 61440, 36, 32, 1}, + {33, 71680, 36, 32, 1}, + {33, 81920, 36, 64, 1}, + {33, 92160, 36, 64, 1}, + {33, 102400, 36, 64, 1}, + {33, 204800, 36, 1024, 1}, + {33, 307200, 36, 256, 1}, + {33, 409600, 36, 1024, 1}, + {33, 512000, 36, 256, 1}, + {33, 614400, 36, 512, 1}, + {33, 716800, 36, 4096, 1}, + {33, 819200, 36, 1024, 1}, + {33, 921600, 36, 4096, 1}, + {33, 1024000, 36, 4096, 1}, + {42, 512, 45, 32, 1}, + {42, 1024, 45, 64, 1}, + {42, 1536, 45, 64, 1}, + {42, 2048, 45, 64, 1}, + {42, 2560, 45, 32, 1}, + {42, 3072, 45, 32, 1}, + {42, 3584, 45, 128, 1}, + {42, 4096, 45, 64, 1}, + {42, 4608, 45, 32, 1}, + {42, 5120, 45, 32, 1}, + {42, 5632, 45, 32, 1}, + {42, 6144, 45, 32, 1}, + {42, 6656, 45, 128, 1}, + {42, 7168, 45, 32, 1}, + {42, 7680, 45, 64, 1}, + {42, 8192, 45, 64, 1}, + {42, 8704, 45, 32, 1}, + {42, 9216, 45, 128, 1}, + {42, 9728, 45, 64, 1}, + {42, 10240, 45, 64, 1}, + {42, 20480, 45, 32, 1}, + {42, 30720, 45, 128, 1}, + {42, 40960, 45, 32, 1}, + {42, 51200, 45, 32, 1}, + {42, 61440, 45, 64, 1}, + {42, 71680, 45, 64, 1}, + {42, 81920, 45, 64, 1}, + {42, 92160, 45, 64, 1}, + {42, 102400, 45, 64, 1}, + {42, 204800, 45, 128, 1}, + {42, 307200, 45, 2048, 1}, + {42, 409600, 45, 256, 1}, + {42, 512000, 45, 4096, 1}, + {42, 614400, 45, 512, 1}, + {42, 716800, 45, 512, 1}, + {42, 819200, 45, 512, 1}, + {42, 921600, 45, 4096, 1}, + {42, 1024000, 45, 4096, 1}, + {43, 512, 56, 64, 1}, + {43, 1024, 56, 128, 1}, + {43, 1536, 56, 32, 1}, + {43, 2048, 56, 64, 1}, + {43, 2560, 56, 64, 1}, + {43, 3072, 56, 64, 1}, + {43, 3584, 56, 64, 1}, + {43, 4096, 56, 32, 1}, + {43, 4608, 56, 32, 1}, + {43, 5120, 56, 32, 1}, + {43, 5632, 56, 32, 1}, + {43, 6144, 56, 64, 1}, + {43, 6656, 56, 32, 1}, + {43, 7168, 56, 64, 1}, + {43, 7680, 56, 64, 1}, + {43, 8192, 56, 32, 1}, + {43, 8704, 56, 32, 1}, + {43, 
9216, 56, 32, 1}, + {43, 9728, 56, 32, 1}, + {43, 10240, 56, 128, 1}, + {43, 20480, 56, 32, 1}, + {43, 30720, 56, 128, 1}, + {43, 40960, 56, 64, 1}, + {43, 51200, 56, 128, 1}, + {43, 61440, 56, 32, 1}, + {43, 71680, 56, 64, 1}, + {43, 81920, 56, 128, 1}, + {43, 92160, 56, 64, 1}, + {43, 102400, 56, 64, 1}, + {43, 204800, 56, 128, 1}, + {43, 307200, 56, 128, 1}, + {43, 409600, 56, 16384, 1}, + {43, 512000, 56, 1024, 1}, + {43, 614400, 56, 512, 1}, + {43, 716800, 56, 2048, 1}, + {43, 819200, 56, 8192, 1}, + {43, 921600, 56, 1024, 1}, + {43, 1024000, 56, 2048, 1}, + {126, 512, 84, 32, 1}, + {126, 1024, 84, 32, 1}, + {126, 1536, 84, 32, 1}, + {126, 2048, 84, 32, 1}, + {126, 2560, 84, 32, 1}, + {126, 3072, 84, 32, 1}, + {126, 3584, 84, 32, 1}, + {126, 4096, 84, 2048, 1}, + {126, 4608, 84, 32, 1}, + {126, 5120, 84, 512, 1}, + {126, 5632, 84, 32, 1}, + {126, 6144, 84, 2048, 1}, + {126, 6656, 84, 32, 1}, + {126, 7168, 84, 32, 1}, + {126, 7680, 84, 32, 1}, + {126, 8192, 84, 128, 1}, + {126, 8704, 84, 64, 1}, + {126, 9216, 84, 32, 1}, + {126, 9728, 84, 32, 1}, + {126, 10240, 84, 32, 1}, + {126, 20480, 84, 32, 1}, + {126, 30720, 84, 32, 1}, + {126, 40960, 84, 2048, 1}, + {126, 51200, 84, 32, 1}, + {126, 61440, 84, 32, 1}, + {126, 71680, 84, 256, 1}, + {126, 81920, 84, 64, 1}, + {126, 92160, 84, 512, 1}, + {126, 102400, 84, 128, 1}, + {126, 204800, 84, 204800, 1}, + {126, 307200, 84, 307200, 1}, + {126, 409600, 84, 409600, 1}, + {126, 512000, 84, 512000, 1}, + {126, 614400, 84, 614400, 1}, + {126, 716800, 84, 716800, 1}, + {126, 819200, 84, 819200, 1}, + {126, 921600, 84, 921600, 1}, + {126, 1024000, 84, 1024000, 1}, + {210, 512, 120, 32, 1}, + {210, 1024, 120, 32, 1}, + {210, 1536, 120, 32, 1}, + {210, 2048, 120, 64, 1}, + {210, 2560, 120, 64, 1}, + {210, 3072, 120, 64, 1}, + {210, 3584, 120, 64, 1}, + {210, 4096, 120, 64, 1}, + {210, 4608, 120, 32, 1}, + {210, 5120, 120, 32, 1}, + {210, 5632, 120, 256, 1}, + {210, 6144, 120, 32, 1}, + {210, 6656, 120, 32, 1}, + {210, 7168, 
120, 64, 1}, + {210, 7680, 120, 32, 1}, + {210, 8192, 120, 32, 1}, + {210, 8704, 120, 32, 1}, + {210, 9216, 120, 32, 1}, + {210, 9728, 120, 32, 1}, + {210, 10240, 120, 32, 1}, + {210, 20480, 120, 512, 1}, + {210, 30720, 120, 1024, 1}, + {210, 40960, 120, 1024, 1}, + {210, 51200, 120, 64, 1}, + {210, 61440, 120, 32, 1}, + {210, 71680, 120, 32, 1}, + {210, 81920, 120, 32, 1}, + {210, 92160, 120, 64, 1}, + {210, 102400, 120, 256, 1}, + {210, 204800, 120, 204800, 1}, + {210, 307200, 120, 307200, 1}, + {210, 409600, 120, 409600, 1}, + {210, 512000, 120, 512000, 1}, + {210, 614400, 120, 614400, 1}, + {210, 716800, 120, 716800, 1}, + {210, 819200, 120, 819200, 1}, + {210, 921600, 120, 921600, 1}, + {210, 1024000, 120, 1024000, 1}, + {330, 512, 165, 32, 1}, + {330, 1024, 165, 32, 1}, + {330, 1536, 165, 512, 1}, + {330, 2048, 165, 2048, 1}, + {330, 2560, 165, 512, 1}, + {330, 3072, 165, 512, 1}, + {330, 3584, 165, 512, 1}, + {330, 4096, 165, 4096, 1}, + {330, 4608, 165, 512, 1}, + {330, 5120, 165, 512, 1}, + {330, 5632, 165, 512, 1}, + {330, 6144, 165, 1024, 1}, + {330, 6656, 165, 512, 1}, + {330, 7168, 165, 512, 1}, + {330, 7680, 165, 512, 1}, + {330, 8192, 165, 8192, 1}, + {330, 8704, 165, 512, 1}, + {330, 9216, 165, 512, 1}, + {330, 9728, 165, 512, 1}, + {330, 10240, 165, 2048, 1}, + {330, 20480, 165, 2048, 1}, + {330, 30720, 165, 2048, 1}, + {330, 40960, 165, 8192, 1}, + {330, 51200, 165, 2048, 1}, + {330, 61440, 165, 2048, 1}, + {330, 71680, 165, 2048, 1}, + {330, 81920, 165, 8192, 1}, + {330, 92160, 165, 2048, 1}, + {330, 102400, 165, 2048, 1}, + {330, 204800, 165, 8192, 1}, + {330, 307200, 165, 2048, 1}, + {330, 409600, 165, 8192, 1}, + {330, 512000, 165, 2048, 1}, + {330, 614400, 165, 8192, 1}, + {330, 716800, 165, 4096, 1}, + {330, 819200, 165, 32768, 1}, + {330, 921600, 165, 4096, 1}, + {330, 1024000, 165, 8192, 1} }; - diff --git a/backends/magma/gemm_tuning/mi250x.h b/backends/magma/gemm_tuning/mi250x.h index 93412037d5..9f51d6f25f 100644 --- 
a/backends/magma/gemm_tuning/mi250x.h +++ b/backends/magma/gemm_tuning/mi250x.h @@ -2,2454 +2,2449 @@ // auto-generated from data on mi250x-rocm5.1.0 //////////////////////////////////////////////////////////////////////////////// -std::vector< std::array > sgemm_nn_mi250x = -{ - {3 , 512 , 1 , 128 , 0 }, - {3 , 1024 , 1 , 64 , 1 }, - {3 , 1536 , 1 , 64 , 1 }, - {3 , 2048 , 1 , 1024 , 1 }, - {3 , 2560 , 1 , 256 , 1 }, - {3 , 3072 , 1 , 1024 , 1 }, - {3 , 3584 , 1 , 512 , 1 }, - {3 , 4096 , 1 , 2048 , 0 }, - {3 , 4608 , 1 , 512 , 1 }, - {3 , 5120 , 1 , 256 , 1 }, - {3 , 5632 , 1 , 256 , 1 }, - {3 , 6144 , 1 , 512 , 1 }, - {3 , 6656 , 1 , 512 , 1 }, - {3 , 7168 , 1 , 1024 , 1 }, - {3 , 7680 , 1 , 512 , 1 }, - {3 , 8192 , 1 , 512 , 1 }, - {3 , 8704 , 1 , 512 , 1 }, - {3 , 9216 , 1 , 1024 , 1 }, - {3 , 9728 , 1 , 512 , 1 }, - {3 , 10240 , 1 , 512 , 1 }, - {3 , 20480 , 1 , 2048 , 1 }, - {3 , 30720 , 1 , 2048 , 1 }, - {3 , 40960 , 1 , 2048 , 1 }, - {3 , 51200 , 1 , 2048 , 1 }, - {3 , 61440 , 1 , 4096 , 1 }, - {3 , 71680 , 1 , 2048 , 1 }, - {3 , 81920 , 1 , 8192 , 1 }, - {3 , 92160 , 1 , 2048 , 1 }, - {3 , 102400 , 1 , 4096 , 1 }, - {3 , 204800 , 1 , 8192 , 1 }, - {3 , 307200 , 1 , 2048 , 1 }, - {3 , 409600 , 1 , 16384 , 1 }, - {3 , 512000 , 1 , 2048 , 1 }, - {3 , 614400 , 1 , 8192 , 1 }, - {3 , 716800 , 1 , 1024 , 1 }, - {3 , 819200 , 1 , 32768 , 1 }, - {3 , 921600 , 1 , 2048 , 1 }, - {3 , 1024000, 1 , 2048 , 1 }, - {4 , 512 , 1 , 128 , 0 }, - {4 , 1024 , 1 , 128 , 1 }, - {4 , 1536 , 1 , 64 , 1 }, - {4 , 2048 , 1 , 2048 , 1 }, - {4 , 2560 , 1 , 512 , 1 }, - {4 , 3072 , 1 , 512 , 1 }, - {4 , 3584 , 1 , 512 , 1 }, - {4 , 4096 , 1 , 256 , 1 }, - {4 , 4608 , 1 , 512 , 1 }, - {4 , 5120 , 1 , 512 , 1 }, - {4 , 5632 , 1 , 256 , 1 }, - {4 , 6144 , 1 , 512 , 1 }, - {4 , 6656 , 1 , 256 , 1 }, - {4 , 7168 , 1 , 1024 , 1 }, - {4 , 7680 , 1 , 256 , 1 }, - {4 , 8192 , 1 , 512 , 1 }, - {4 , 8704 , 1 , 512 , 1 }, - {4 , 9216 , 1 , 1024 , 1 }, - {4 , 9728 , 1 , 512 , 1 }, - {4 , 10240 , 
1 , 2048 , 1 }, - {4 , 20480 , 1 , 2048 , 1 }, - {4 , 30720 , 1 , 2048 , 1 }, - {4 , 40960 , 1 , 2048 , 1 }, - {4 , 51200 , 1 , 2048 , 1 }, - {4 , 61440 , 1 , 4096 , 1 }, - {4 , 71680 , 1 , 2048 , 1 }, - {4 , 81920 , 1 , 8192 , 1 }, - {4 , 92160 , 1 , 2048 , 1 }, - {4 , 102400 , 1 , 4096 , 1 }, - {4 , 204800 , 1 , 8192 , 1 }, - {4 , 307200 , 1 , 4096 , 1 }, - {4 , 409600 , 1 , 16384 , 1 }, - {4 , 512000 , 1 , 2048 , 1 }, - {4 , 614400 , 1 , 8192 , 1 }, - {4 , 716800 , 1 , 4096 , 1 }, - {4 , 819200 , 1 , 32768 , 1 }, - {4 , 921600 , 1 , 4096 , 1 }, - {4 , 1024000, 1 , 2048 , 1 }, - {6 , 512 , 3 , 32 , 1 }, - {6 , 1024 , 3 , 128 , 1 }, - {6 , 1536 , 3 , 256 , 1 }, - {6 , 2048 , 3 , 128 , 1 }, - {6 , 2560 , 3 , 512 , 1 }, - {6 , 3072 , 3 , 1024 , 1 }, - {6 , 3584 , 3 , 512 , 1 }, - {6 , 4096 , 3 , 1024 , 1 }, - {6 , 4608 , 3 , 512 , 1 }, - {6 , 5120 , 3 , 512 , 1 }, - {6 , 5632 , 3 , 256 , 1 }, - {6 , 6144 , 3 , 256 , 1 }, - {6 , 6656 , 3 , 512 , 1 }, - {6 , 7168 , 3 , 512 , 1 }, - {6 , 7680 , 3 , 512 , 1 }, - {6 , 8192 , 3 , 512 , 1 }, - {6 , 8704 , 3 , 512 , 1 }, - {6 , 9216 , 3 , 1024 , 1 }, - {6 , 9728 , 3 , 512 , 1 }, - {6 , 10240 , 3 , 1024 , 1 }, - {6 , 20480 , 3 , 1024 , 1 }, - {6 , 30720 , 3 , 1024 , 1 }, - {6 , 40960 , 3 , 2048 , 1 }, - {6 , 51200 , 3 , 2048 , 1 }, - {6 , 61440 , 3 , 2048 , 1 }, - {6 , 71680 , 3 , 2048 , 1 }, - {6 , 81920 , 3 , 4096 , 1 }, - {6 , 92160 , 3 , 2048 , 1 }, - {6 , 102400 , 3 , 2048 , 1 }, - {6 , 204800 , 3 , 8192 , 1 }, - {6 , 307200 , 3 , 4096 , 1 }, - {6 , 409600 , 3 , 8192 , 1 }, - {6 , 512000 , 3 , 1024 , 1 }, - {6 , 614400 , 3 , 8192 , 1 }, - {6 , 716800 , 3 , 1024 , 1 }, - {6 , 819200 , 3 , 16384 , 1 }, - {6 , 921600 , 3 , 4096 , 1 }, - {6 , 1024000, 3 , 2048 , 1 }, - {10 , 512 , 4 , 128 , 1 }, - {10 , 1024 , 4 , 128 , 1 }, - {10 , 1536 , 4 , 256 , 1 }, - {10 , 2048 , 4 , 128 , 1 }, - {10 , 2560 , 4 , 512 , 1 }, - {10 , 3072 , 4 , 512 , 1 }, - {10 , 3584 , 4 , 512 , 1 }, - {10 , 4096 , 4 , 4096 , 0 }, - {10 , 4608 , 4 , 64 
, 0 }, - {10 , 5120 , 4 , 512 , 1 }, - {10 , 5632 , 4 , 512 , 1 }, - {10 , 6144 , 4 , 512 , 1 }, - {10 , 6656 , 4 , 32 , 0 }, - {10 , 7168 , 4 , 512 , 1 }, - {10 , 7680 , 4 , 512 , 1 }, - {10 , 8192 , 4 , 32 , 0 }, - {10 , 8704 , 4 , 512 , 1 }, - {10 , 9216 , 4 , 512 , 1 }, - {10 , 9728 , 4 , 32 , 0 }, - {10 , 10240 , 4 , 512 , 1 }, - {10 , 20480 , 4 , 2048 , 1 }, - {10 , 30720 , 4 , 2048 , 1 }, - {10 , 40960 , 4 , 2048 , 1 }, - {10 , 51200 , 4 , 2048 , 1 }, - {10 , 61440 , 4 , 2048 , 1 }, - {10 , 71680 , 4 , 2048 , 1 }, - {10 , 81920 , 4 , 8192 , 1 }, - {10 , 92160 , 4 , 2048 , 1 }, - {10 , 102400 , 4 , 2048 , 1 }, - {10 , 204800 , 4 , 4096 , 1 }, - {10 , 307200 , 4 , 4096 , 1 }, - {10 , 409600 , 4 , 16384 , 1 }, - {10 , 512000 , 4 , 4096 , 1 }, - {10 , 614400 , 4 , 8192 , 1 }, - {10 , 716800 , 4 , 2048 , 1 }, - {10 , 819200 , 4 , 16384 , 1 }, - {10 , 921600 , 4 , 4096 , 1 }, - {10 , 1024000, 4 , 8192 , 1 }, - {10 , 512 , 6 , 32 , 1 }, - {10 , 1024 , 6 , 128 , 1 }, - {10 , 1536 , 6 , 256 , 1 }, - {10 , 2048 , 6 , 512 , 1 }, - {10 , 2560 , 6 , 128 , 1 }, - {10 , 3072 , 6 , 512 , 1 }, - {10 , 3584 , 6 , 512 , 1 }, - {10 , 4096 , 6 , 256 , 1 }, - {10 , 4608 , 6 , 256 , 1 }, - {10 , 5120 , 6 , 256 , 1 }, - {10 , 5632 , 6 , 512 , 1 }, - {10 , 6144 , 6 , 512 , 1 }, - {10 , 6656 , 6 , 512 , 1 }, - {10 , 7168 , 6 , 512 , 1 }, - {10 , 7680 , 6 , 512 , 1 }, - {10 , 8192 , 6 , 32 , 0 }, - {10 , 8704 , 6 , 32 , 0 }, - {10 , 9216 , 6 , 1024 , 1 }, - {10 , 9728 , 6 , 512 , 1 }, - {10 , 10240 , 6 , 512 , 1 }, - {10 , 20480 , 6 , 1024 , 1 }, - {10 , 30720 , 6 , 1024 , 1 }, - {10 , 40960 , 6 , 2048 , 1 }, - {10 , 51200 , 6 , 1024 , 1 }, - {10 , 61440 , 6 , 4096 , 1 }, - {10 , 71680 , 6 , 2048 , 1 }, - {10 , 81920 , 6 , 4096 , 1 }, - {10 , 92160 , 6 , 2048 , 1 }, - {10 , 102400 , 6 , 4096 , 1 }, - {10 , 204800 , 6 , 8192 , 1 }, - {10 , 307200 , 6 , 4096 , 1 }, - {10 , 409600 , 6 , 16384 , 1 }, - {10 , 512000 , 6 , 512 , 1 }, - {10 , 614400 , 6 , 8192 , 1 }, - {10 , 716800 , 6 , 
4096 , 1 }, - {10 , 819200 , 6 , 32768 , 1 }, - {10 , 921600 , 6 , 2048 , 1 }, - {10 , 1024000, 6 , 8192 , 1 }, - {15 , 512 , 12 , 128 , 0 }, - {15 , 1024 , 12 , 128 , 1 }, - {15 , 1536 , 12 , 256 , 1 }, - {15 , 2048 , 12 , 128 , 1 }, - {15 , 2560 , 12 , 512 , 1 }, - {15 , 3072 , 12 , 1024 , 0 }, - {15 , 3584 , 12 , 512 , 0 }, - {15 , 4096 , 12 , 256 , 1 }, - {15 , 4608 , 12 , 512 , 0 }, - {15 , 5120 , 12 , 512 , 0 }, - {15 , 5632 , 12 , 512 , 0 }, - {15 , 6144 , 12 , 32 , 0 }, - {15 , 6656 , 12 , 256 , 0 }, - {15 , 7168 , 12 , 512 , 0 }, - {15 , 7680 , 12 , 32 , 0 }, - {15 , 8192 , 12 , 32 , 0 }, - {15 , 8704 , 12 , 32 , 0 }, - {15 , 9216 , 12 , 512 , 0 }, - {15 , 9728 , 12 , 32 , 0 }, - {15 , 10240 , 12 , 32 , 0 }, - {15 , 20480 , 12 , 512 , 1 }, - {15 , 30720 , 12 , 2048 , 1 }, - {15 , 40960 , 12 , 2048 , 1 }, - {15 , 51200 , 12 , 2048 , 1 }, - {15 , 61440 , 12 , 2048 , 1 }, - {15 , 71680 , 12 , 2048 , 1 }, - {15 , 81920 , 12 , 4096 , 1 }, - {15 , 92160 , 12 , 1024 , 0 }, - {15 , 102400 , 12 , 2048 , 0 }, - {15 , 204800 , 12 , 1024 , 0 }, - {15 , 307200 , 12 , 1024 , 0 }, - {15 , 409600 , 12 , 2048 , 0 }, - {15 , 512000 , 12 , 1024 , 0 }, - {15 , 614400 , 12 , 1024 , 0 }, - {15 , 716800 , 12 , 1024 , 0 }, - {15 , 819200 , 12 , 2048 , 0 }, - {15 , 921600 , 12 , 2048 , 0 }, - {15 , 1024000, 12 , 1024 , 0 }, - {20 , 512 , 11 , 32 , 0 }, - {20 , 1024 , 11 , 256 , 0 }, - {20 , 1536 , 11 , 256 , 0 }, - {20 , 2048 , 11 , 256 , 0 }, - {20 , 2560 , 11 , 256 , 0 }, - {20 , 3072 , 11 , 256 , 1 }, - {20 , 3584 , 11 , 32 , 0 }, - {20 , 4096 , 11 , 256 , 0 }, - {20 , 4608 , 11 , 32 , 0 }, - {20 , 5120 , 11 , 256 , 0 }, - {20 , 5632 , 11 , 32 , 0 }, - {20 , 6144 , 11 , 32 , 0 }, - {20 , 6656 , 11 , 32 , 0 }, - {20 , 7168 , 11 , 32 , 0 }, - {20 , 7680 , 11 , 32 , 0 }, - {20 , 8192 , 11 , 32 , 0 }, - {20 , 8704 , 11 , 32 , 0 }, - {20 , 9216 , 11 , 32 , 0 }, - {20 , 9728 , 11 , 32 , 0 }, - {20 , 10240 , 11 , 32 , 0 }, - {20 , 20480 , 11 , 2048 , 1 }, - {20 , 30720 , 11 , 2048 , 1 
}, - {20 , 40960 , 11 , 8192 , 1 }, - {20 , 51200 , 11 , 2048 , 1 }, - {20 , 61440 , 11 , 2048 , 1 }, - {20 , 71680 , 11 , 2048 , 1 }, - {20 , 81920 , 11 , 16384 , 1 }, - {20 , 92160 , 11 , 2048 , 1 }, - {20 , 102400 , 11 , 4096 , 1 }, - {20 , 204800 , 11 , 4096 , 1 }, - {20 , 307200 , 11 , 4096 , 1 }, - {20 , 409600 , 11 , 8192 , 1 }, - {20 , 512000 , 11 , 4096 , 1 }, - {20 , 614400 , 11 , 8192 , 1 }, - {20 , 716800 , 11 , 4096 , 1 }, - {20 , 819200 , 11 , 32768 , 1 }, - {20 , 921600 , 11 , 4096 , 1 }, - {20 , 1024000, 11 , 8192 , 1 }, - {21 , 512 , 16 , 32 , 0 }, - {21 , 1024 , 16 , 256 , 0 }, - {21 , 1536 , 16 , 256 , 0 }, - {21 , 2048 , 16 , 256 , 0 }, - {21 , 2560 , 16 , 256 , 1 }, - {21 , 3072 , 16 , 32 , 0 }, - {21 , 3584 , 16 , 32 , 0 }, - {21 , 4096 , 16 , 256 , 0 }, - {21 , 4608 , 16 , 32 , 0 }, - {21 , 5120 , 16 , 32 , 0 }, - {21 , 5632 , 16 , 32 , 0 }, - {21 , 6144 , 16 , 32 , 0 }, - {21 , 6656 , 16 , 32 , 0 }, - {21 , 7168 , 16 , 32 , 0 }, - {21 , 7680 , 16 , 32 , 0 }, - {21 , 8192 , 16 , 32 , 0 }, - {21 , 8704 , 16 , 32 , 0 }, - {21 , 9216 , 16 , 32 , 0 }, - {21 , 9728 , 16 , 32 , 0 }, - {21 , 10240 , 16 , 32 , 0 }, - {21 , 20480 , 16 , 512 , 1 }, - {21 , 30720 , 16 , 2048 , 1 }, - {21 , 40960 , 16 , 2048 , 1 }, - {21 , 51200 , 16 , 2048 , 1 }, - {21 , 61440 , 16 , 4096 , 1 }, - {21 , 71680 , 16 , 2048 , 0 }, - {21 , 81920 , 16 , 2048 , 0 }, - {21 , 92160 , 16 , 1024 , 0 }, - {21 , 102400 , 16 , 4096 , 1 }, - {21 , 204800 , 16 , 8192 , 0 }, - {21 , 307200 , 16 , 1024 , 0 }, - {21 , 409600 , 16 , 8192 , 0 }, - {21 , 512000 , 16 , 2048 , 0 }, - {21 , 614400 , 16 , 8192 , 0 }, - {21 , 716800 , 16 , 1024 , 0 }, - {21 , 819200 , 16 , 8192 , 0 }, - {21 , 921600 , 16 , 1024 , 0 }, - {21 , 1024000, 16 , 8192 , 0 }, - {28 , 512 , 25 , 256 , 0 }, - {28 , 1024 , 25 , 64 , 0 }, - {28 , 1536 , 25 , 128 , 0 }, - {28 , 2048 , 25 , 128 , 0 }, - {28 , 2560 , 25 , 256 , 0 }, - {28 , 3072 , 25 , 128 , 0 }, - {28 , 3584 , 25 , 32 , 0 }, - {28 , 4096 , 25 , 128 , 0 }, - 
{28 , 4608 , 25 , 128 , 0 }, - {28 , 5120 , 25 , 128 , 0 }, - {28 , 5632 , 25 , 128 , 0 }, - {28 , 6144 , 25 , 128 , 0 }, - {28 , 6656 , 25 , 128 , 0 }, - {28 , 7168 , 25 , 128 , 0 }, - {28 , 7680 , 25 , 128 , 0 }, - {28 , 8192 , 25 , 128 , 0 }, - {28 , 8704 , 25 , 128 , 0 }, - {28 , 9216 , 25 , 128 , 0 }, - {28 , 9728 , 25 , 128 , 0 }, - {28 , 10240 , 25 , 128 , 0 }, - {28 , 20480 , 25 , 128 , 0 }, - {28 , 30720 , 25 , 128 , 0 }, - {28 , 40960 , 25 , 128 , 0 }, - {28 , 51200 , 25 , 128 , 0 }, - {28 , 61440 , 25 , 128 , 0 }, - {28 , 71680 , 25 , 128 , 0 }, - {28 , 81920 , 25 , 64 , 0 }, - {28 , 92160 , 25 , 128 , 0 }, - {28 , 102400 , 25 , 128 , 0 }, - {28 , 204800 , 25 , 8192 , 0 }, - {28 , 307200 , 25 , 128 , 0 }, - {28 , 409600 , 25 , 8192 , 0 }, - {28 , 512000 , 25 , 128 , 0 }, - {28 , 614400 , 25 , 8192 , 0 }, - {28 , 716800 , 25 , 128 , 0 }, - {28 , 819200 , 25 , 8192 , 0 }, - {28 , 921600 , 25 , 512 , 0 }, - {28 , 1024000, 25 , 8192 , 0 }, - {35 , 512 , 24 , 32 , 0 }, - {35 , 1024 , 24 , 256 , 0 }, - {35 , 1536 , 24 , 256 , 0 }, - {35 , 2048 , 24 , 256 , 0 }, - {35 , 2560 , 24 , 64 , 0 }, - {35 , 3072 , 24 , 32 , 0 }, - {35 , 3584 , 24 , 64 , 0 }, - {35 , 4096 , 24 , 128 , 0 }, - {35 , 4608 , 24 , 128 , 0 }, - {35 , 5120 , 24 , 128 , 0 }, - {35 , 5632 , 24 , 128 , 0 }, - {35 , 6144 , 24 , 128 , 0 }, - {35 , 6656 , 24 , 128 , 0 }, - {35 , 7168 , 24 , 128 , 0 }, - {35 , 7680 , 24 , 256 , 1 }, - {35 , 8192 , 24 , 1024 , 1 }, - {35 , 8704 , 24 , 512 , 1 }, - {35 , 9216 , 24 , 1024 , 1 }, - {35 , 9728 , 24 , 256 , 1 }, - {35 , 10240 , 24 , 2048 , 1 }, - {35 , 20480 , 24 , 32 , 0 }, - {35 , 30720 , 24 , 32 , 0 }, - {35 , 40960 , 24 , 8192 , 0 }, - {35 , 51200 , 24 , 2048 , 1 }, - {35 , 61440 , 24 , 128 , 0 }, - {35 , 71680 , 24 , 64 , 0 }, - {35 , 81920 , 24 , 8192 , 0 }, - {35 , 92160 , 24 , 2048 , 1 }, - {35 , 102400 , 24 , 128 , 0 }, - {35 , 204800 , 24 , 8192 , 0 }, - {35 , 307200 , 24 , 4096 , 1 }, - {35 , 409600 , 24 , 8192 , 0 }, - {35 , 512000 , 24 , 4096 
, 1 }, - {35 , 614400 , 24 , 8192 , 0 }, - {35 , 716800 , 24 , 4096 , 1 }, - {35 , 819200 , 24 , 8192 , 0 }, - {35 , 921600 , 24 , 921600 , 1 }, - {35 , 1024000, 24 , 8192 , 0 }, - {36 , 512 , 33 , 32 , 0 }, - {36 , 1024 , 33 , 32 , 0 }, - {36 , 1536 , 33 , 128 , 0 }, - {36 , 2048 , 33 , 128 , 0 }, - {36 , 2560 , 33 , 256 , 0 }, - {36 , 3072 , 33 , 32 , 0 }, - {36 , 3584 , 33 , 128 , 0 }, - {36 , 4096 , 33 , 128 , 0 }, - {36 , 4608 , 33 , 32 , 0 }, - {36 , 5120 , 33 , 128 , 0 }, - {36 , 5632 , 33 , 128 , 0 }, - {36 , 6144 , 33 , 128 , 0 }, - {36 , 6656 , 33 , 64 , 0 }, - {36 , 7168 , 33 , 128 , 0 }, - {36 , 7680 , 33 , 64 , 0 }, - {36 , 8192 , 33 , 8192 , 1 }, - {36 , 8704 , 33 , 64 , 0 }, - {36 , 9216 , 33 , 64 , 0 }, - {36 , 9728 , 33 , 256 , 1 }, - {36 , 10240 , 33 , 128 , 0 }, - {36 , 20480 , 33 , 64 , 0 }, - {36 , 30720 , 33 , 64 , 0 }, - {36 , 40960 , 33 , 8192 , 0 }, - {36 , 51200 , 33 , 64 , 0 }, - {36 , 61440 , 33 , 32 , 0 }, - {36 , 71680 , 33 , 64 , 0 }, - {36 , 81920 , 33 , 8192 , 0 }, - {36 , 92160 , 33 , 64 , 0 }, - {36 , 102400 , 33 , 64 , 0 }, - {36 , 204800 , 33 , 8192 , 0 }, - {36 , 307200 , 33 , 64 , 0 }, - {36 , 409600 , 33 , 8192 , 0 }, - {36 , 512000 , 33 , 64 , 0 }, - {36 , 614400 , 33 , 8192 , 0 }, - {36 , 716800 , 33 , 512 , 0 }, - {36 , 819200 , 33 , 8192 , 0 }, - {36 , 921600 , 33 , 512 , 0 }, - {36 , 1024000, 33 , 8192 , 0 }, - {45 , 512 , 42 , 128 , 0 }, - {45 , 1024 , 42 , 128 , 0 }, - {45 , 1536 , 42 , 128 , 0 }, - {45 , 2048 , 42 , 128 , 0 }, - {45 , 2560 , 42 , 128 , 0 }, - {45 , 3072 , 42 , 32 , 0 }, - {45 , 3584 , 42 , 128 , 0 }, - {45 , 4096 , 42 , 32 , 0 }, - {45 , 4608 , 42 , 128 , 0 }, - {45 , 5120 , 42 , 128 , 0 }, - {45 , 5632 , 42 , 128 , 0 }, - {45 , 6144 , 42 , 128 , 0 }, - {45 , 6656 , 42 , 128 , 0 }, - {45 , 7168 , 42 , 128 , 0 }, - {45 , 7680 , 42 , 128 , 0 }, - {45 , 8192 , 42 , 32 , 0 }, - {45 , 8704 , 42 , 32 , 0 }, - {45 , 9216 , 42 , 64 , 0 }, - {45 , 9728 , 42 , 64 , 0 }, - {45 , 10240 , 42 , 128 , 0 }, - {45 , 
20480 , 42 , 64 , 0 }, - {45 , 30720 , 42 , 64 , 0 }, - {45 , 40960 , 42 , 8192 , 0 }, - {45 , 51200 , 42 , 64 , 0 }, - {45 , 61440 , 42 , 64 , 0 }, - {45 , 71680 , 42 , 64 , 0 }, - {45 , 81920 , 42 , 64 , 0 }, - {45 , 92160 , 42 , 64 , 0 }, - {45 , 102400 , 42 , 64 , 0 }, - {45 , 204800 , 42 , 8192 , 0 }, - {45 , 307200 , 42 , 64 , 0 }, - {45 , 409600 , 42 , 8192 , 0 }, - {45 , 512000 , 42 , 512 , 0 }, - {45 , 614400 , 42 , 8192 , 0 }, - {45 , 716800 , 42 , 512 , 0 }, - {45 , 819200 , 42 , 8192 , 0 }, - {45 , 921600 , 42 , 512 , 0 }, - {45 , 1024000, 42 , 8192 , 0 }, - {56 , 512 , 43 , 512 , 0 }, - {56 , 1024 , 43 , 128 , 0 }, - {56 , 1536 , 43 , 128 , 0 }, - {56 , 2048 , 43 , 128 , 0 }, - {56 , 2560 , 43 , 128 , 0 }, - {56 , 3072 , 43 , 128 , 0 }, - {56 , 3584 , 43 , 128 , 0 }, - {56 , 4096 , 43 , 128 , 0 }, - {56 , 4608 , 43 , 128 , 0 }, - {56 , 5120 , 43 , 128 , 0 }, - {56 , 5632 , 43 , 128 , 0 }, - {56 , 6144 , 43 , 64 , 0 }, - {56 , 6656 , 43 , 64 , 0 }, - {56 , 7168 , 43 , 128 , 0 }, - {56 , 7680 , 43 , 64 , 0 }, - {56 , 8192 , 43 , 64 , 0 }, - {56 , 8704 , 43 , 128 , 0 }, - {56 , 9216 , 43 , 64 , 0 }, - {56 , 9728 , 43 , 64 , 0 }, - {56 , 10240 , 43 , 64 , 0 }, - {56 , 20480 , 43 , 64 , 0 }, - {56 , 30720 , 43 , 64 , 0 }, - {56 , 40960 , 43 , 64 , 0 }, - {56 , 51200 , 43 , 64 , 0 }, - {56 , 61440 , 43 , 64 , 0 }, - {56 , 71680 , 43 , 64 , 0 }, - {56 , 81920 , 43 , 64 , 0 }, - {56 , 92160 , 43 , 64 , 0 }, - {56 , 102400 , 43 , 64 , 0 }, - {56 , 204800 , 43 , 8192 , 0 }, - {56 , 307200 , 43 , 64 , 0 }, - {56 , 409600 , 43 , 8192 , 0 }, - {56 , 512000 , 43 , 64 , 0 }, - {56 , 614400 , 43 , 8192 , 0 }, - {56 , 716800 , 43 , 716800 , 0 }, - {56 , 819200 , 43 , 8192 , 0 }, - {56 , 921600 , 43 , 921600 , 0 }, - {56 , 1024000, 43 , 8192 , 0 }, - {84 , 512 , 126 , 64 , 0 }, - {84 , 1024 , 126 , 64 , 0 }, - {84 , 1536 , 126 , 64 , 0 }, - {84 , 2048 , 126 , 1024 , 0 }, - {84 , 2560 , 126 , 64 , 0 }, - {84 , 3072 , 126 , 64 , 0 }, - {84 , 3584 , 126 , 128 , 0 }, - {84 
, 4096 , 126 , 64 , 0 }, - {84 , 4608 , 126 , 128 , 0 }, - {84 , 5120 , 126 , 128 , 0 }, - {84 , 5632 , 126 , 128 , 0 }, - {84 , 6144 , 126 , 128 , 0 }, - {84 , 6656 , 126 , 128 , 0 }, - {84 , 7168 , 126 , 128 , 0 }, - {84 , 7680 , 126 , 128 , 0 }, - {84 , 8192 , 126 , 8192 , 0 }, - {84 , 8704 , 126 , 128 , 0 }, - {84 , 9216 , 126 , 128 , 0 }, - {84 , 9728 , 126 , 128 , 0 }, - {84 , 10240 , 126 , 128 , 0 }, - {84 , 20480 , 126 , 4096 , 0 }, - {84 , 30720 , 126 , 128 , 0 }, - {84 , 40960 , 126 , 8192 , 0 }, - {84 , 51200 , 126 , 128 , 0 }, - {84 , 61440 , 126 , 4096 , 0 }, - {84 , 71680 , 126 , 128 , 0 }, - {84 , 81920 , 126 , 8192 , 0 }, - {84 , 92160 , 126 , 128 , 0 }, - {84 , 102400 , 126 , 4096 , 0 }, - {84 , 204800 , 126 , 8192 , 0 }, - {84 , 307200 , 126 , 4096 , 0 }, - {84 , 409600 , 126 , 16384 , 0 }, - {84 , 512000 , 126 , 4096 , 0 }, - {84 , 614400 , 126 , 8192 , 0 }, - {84 , 716800 , 126 , 4096 , 0 }, - {84 , 819200 , 126 , 16384 , 0 }, - {84 , 921600 , 126 , 4096 , 0 }, - {84 , 1024000, 126 , 8192 , 0 }, - {120 , 512 , 210 , 64 , 0 }, - {120 , 1024 , 210 , 512 , 0 }, - {120 , 1536 , 210 , 32 , 0 }, - {120 , 2048 , 210 , 2048 , 0 }, - {120 , 2560 , 210 , 64 , 0 }, - {120 , 3072 , 210 , 64 , 0 }, - {120 , 3584 , 210 , 512 , 0 }, - {120 , 4096 , 210 , 4096 , 0 }, - {120 , 4608 , 210 , 64 , 0 }, - {120 , 5120 , 210 , 64 , 0 }, - {120 , 5632 , 210 , 512 , 0 }, - {120 , 6144 , 210 , 2048 , 0 }, - {120 , 6656 , 210 , 128 , 0 }, - {120 , 7168 , 210 , 1024 , 0 }, - {120 , 7680 , 210 , 512 , 0 }, - {120 , 8192 , 210 , 8192 , 0 }, - {120 , 8704 , 210 , 256 , 0 }, - {120 , 9216 , 210 , 9216 , 0 }, - {120 , 9728 , 210 , 9728 , 0 }, - {120 , 10240 , 210 , 2048 , 0 }, - {120 , 20480 , 210 , 4096 , 0 }, - {120 , 30720 , 210 , 30720 , 0 }, - {120 , 40960 , 210 , 8192 , 0 }, - {120 , 51200 , 210 , 51200 , 0 }, - {120 , 61440 , 210 , 4096 , 0 }, - {120 , 71680 , 210 , 71680 , 0 }, - {120 , 81920 , 210 , 8192 , 0 }, - {120 , 92160 , 210 , 92160 , 0 }, - {120 , 102400 , 210 
, 102400 , 0 }, - {120 , 204800 , 210 , 8192 , 0 }, - {120 , 307200 , 210 , 4096 , 0 }, - {120 , 409600 , 210 , 8192 , 0 }, - {120 , 512000 , 210 , 4096 , 0 }, - {120 , 614400 , 210 , 8192 , 0 }, - {120 , 716800 , 210 , 4096 , 0 }, - {120 , 819200 , 210 , 8192 , 0 }, - {120 , 921600 , 210 , 4096 , 0 }, - {120 , 1024000, 210 , 8192 , 0 }, - {165 , 512 , 330 , 256 , 0 }, - {165 , 1024 , 330 , 1024 , 0 }, - {165 , 1536 , 330 , 128 , 0 }, - {165 , 2048 , 330 , 2048 , 0 }, - {165 , 2560 , 330 , 512 , 0 }, - {165 , 3072 , 330 , 1024 , 0 }, - {165 , 3584 , 330 , 256 , 0 }, - {165 , 4096 , 330 , 2048 , 0 }, - {165 , 4608 , 330 , 512 , 0 }, - {165 , 5120 , 330 , 1024 , 0 }, - {165 , 5632 , 330 , 5632 , 0 }, - {165 , 6144 , 330 , 2048 , 0 }, - {165 , 6656 , 330 , 6656 , 0 }, - {165 , 7168 , 330 , 1024 , 0 }, - {165 , 7680 , 330 , 7680 , 0 }, - {165 , 8192 , 330 , 8192 , 0 }, - {165 , 8704 , 330 , 8704 , 0 }, - {165 , 9216 , 330 , 1024 , 0 }, - {165 , 9728 , 330 , 9728 , 0 }, - {165 , 10240 , 330 , 1024 , 0 }, - {165 , 20480 , 330 , 4096 , 0 }, - {165 , 30720 , 330 , 1024 , 0 }, - {165 , 40960 , 330 , 8192 , 0 }, - {165 , 51200 , 330 , 1024 , 0 }, - {165 , 61440 , 330 , 1024 , 0 }, - {165 , 71680 , 330 , 1024 , 0 }, - {165 , 81920 , 330 , 8192 , 0 }, - {165 , 92160 , 330 , 1024 , 0 }, - {165 , 102400 , 330 , 1024 , 0 }, - {165 , 204800 , 330 , 8192 , 0 }, - {165 , 307200 , 330 , 1024 , 0 }, - {165 , 409600 , 330 , 8192 , 0 }, - {165 , 512000 , 330 , 1024 , 0 }, - {165 , 614400 , 330 , 8192 , 0 }, - {165 , 716800 , 330 , 1024 , 0 }, - {165 , 819200 , 330 , 8192 , 0 }, - {165 , 921600 , 330 , 1024 , 0 }, - {165 , 1024000, 330 , 8192 , 0 } +std::vector > sgemm_nn_mi250x = { + {3, 512, 1, 128, 0}, + {3, 1024, 1, 64, 1}, + {3, 1536, 1, 64, 1}, + {3, 2048, 1, 1024, 1}, + {3, 2560, 1, 256, 1}, + {3, 3072, 1, 1024, 1}, + {3, 3584, 1, 512, 1}, + {3, 4096, 1, 2048, 0}, + {3, 4608, 1, 512, 1}, + {3, 5120, 1, 256, 1}, + {3, 5632, 1, 256, 1}, + {3, 6144, 1, 512, 1}, + {3, 6656, 1, 512, 
1}, + {3, 7168, 1, 1024, 1}, + {3, 7680, 1, 512, 1}, + {3, 8192, 1, 512, 1}, + {3, 8704, 1, 512, 1}, + {3, 9216, 1, 1024, 1}, + {3, 9728, 1, 512, 1}, + {3, 10240, 1, 512, 1}, + {3, 20480, 1, 2048, 1}, + {3, 30720, 1, 2048, 1}, + {3, 40960, 1, 2048, 1}, + {3, 51200, 1, 2048, 1}, + {3, 61440, 1, 4096, 1}, + {3, 71680, 1, 2048, 1}, + {3, 81920, 1, 8192, 1}, + {3, 92160, 1, 2048, 1}, + {3, 102400, 1, 4096, 1}, + {3, 204800, 1, 8192, 1}, + {3, 307200, 1, 2048, 1}, + {3, 409600, 1, 16384, 1}, + {3, 512000, 1, 2048, 1}, + {3, 614400, 1, 8192, 1}, + {3, 716800, 1, 1024, 1}, + {3, 819200, 1, 32768, 1}, + {3, 921600, 1, 2048, 1}, + {3, 1024000, 1, 2048, 1}, + {4, 512, 1, 128, 0}, + {4, 1024, 1, 128, 1}, + {4, 1536, 1, 64, 1}, + {4, 2048, 1, 2048, 1}, + {4, 2560, 1, 512, 1}, + {4, 3072, 1, 512, 1}, + {4, 3584, 1, 512, 1}, + {4, 4096, 1, 256, 1}, + {4, 4608, 1, 512, 1}, + {4, 5120, 1, 512, 1}, + {4, 5632, 1, 256, 1}, + {4, 6144, 1, 512, 1}, + {4, 6656, 1, 256, 1}, + {4, 7168, 1, 1024, 1}, + {4, 7680, 1, 256, 1}, + {4, 8192, 1, 512, 1}, + {4, 8704, 1, 512, 1}, + {4, 9216, 1, 1024, 1}, + {4, 9728, 1, 512, 1}, + {4, 10240, 1, 2048, 1}, + {4, 20480, 1, 2048, 1}, + {4, 30720, 1, 2048, 1}, + {4, 40960, 1, 2048, 1}, + {4, 51200, 1, 2048, 1}, + {4, 61440, 1, 4096, 1}, + {4, 71680, 1, 2048, 1}, + {4, 81920, 1, 8192, 1}, + {4, 92160, 1, 2048, 1}, + {4, 102400, 1, 4096, 1}, + {4, 204800, 1, 8192, 1}, + {4, 307200, 1, 4096, 1}, + {4, 409600, 1, 16384, 1}, + {4, 512000, 1, 2048, 1}, + {4, 614400, 1, 8192, 1}, + {4, 716800, 1, 4096, 1}, + {4, 819200, 1, 32768, 1}, + {4, 921600, 1, 4096, 1}, + {4, 1024000, 1, 2048, 1}, + {6, 512, 3, 32, 1}, + {6, 1024, 3, 128, 1}, + {6, 1536, 3, 256, 1}, + {6, 2048, 3, 128, 1}, + {6, 2560, 3, 512, 1}, + {6, 3072, 3, 1024, 1}, + {6, 3584, 3, 512, 1}, + {6, 4096, 3, 1024, 1}, + {6, 4608, 3, 512, 1}, + {6, 5120, 3, 512, 1}, + {6, 5632, 3, 256, 1}, + {6, 6144, 3, 256, 1}, + {6, 6656, 3, 512, 1}, + {6, 7168, 3, 512, 1}, + {6, 7680, 3, 512, 1}, + {6, 8192, 3, 512, 
1}, + {6, 8704, 3, 512, 1}, + {6, 9216, 3, 1024, 1}, + {6, 9728, 3, 512, 1}, + {6, 10240, 3, 1024, 1}, + {6, 20480, 3, 1024, 1}, + {6, 30720, 3, 1024, 1}, + {6, 40960, 3, 2048, 1}, + {6, 51200, 3, 2048, 1}, + {6, 61440, 3, 2048, 1}, + {6, 71680, 3, 2048, 1}, + {6, 81920, 3, 4096, 1}, + {6, 92160, 3, 2048, 1}, + {6, 102400, 3, 2048, 1}, + {6, 204800, 3, 8192, 1}, + {6, 307200, 3, 4096, 1}, + {6, 409600, 3, 8192, 1}, + {6, 512000, 3, 1024, 1}, + {6, 614400, 3, 8192, 1}, + {6, 716800, 3, 1024, 1}, + {6, 819200, 3, 16384, 1}, + {6, 921600, 3, 4096, 1}, + {6, 1024000, 3, 2048, 1}, + {10, 512, 4, 128, 1}, + {10, 1024, 4, 128, 1}, + {10, 1536, 4, 256, 1}, + {10, 2048, 4, 128, 1}, + {10, 2560, 4, 512, 1}, + {10, 3072, 4, 512, 1}, + {10, 3584, 4, 512, 1}, + {10, 4096, 4, 4096, 0}, + {10, 4608, 4, 64, 0}, + {10, 5120, 4, 512, 1}, + {10, 5632, 4, 512, 1}, + {10, 6144, 4, 512, 1}, + {10, 6656, 4, 32, 0}, + {10, 7168, 4, 512, 1}, + {10, 7680, 4, 512, 1}, + {10, 8192, 4, 32, 0}, + {10, 8704, 4, 512, 1}, + {10, 9216, 4, 512, 1}, + {10, 9728, 4, 32, 0}, + {10, 10240, 4, 512, 1}, + {10, 20480, 4, 2048, 1}, + {10, 30720, 4, 2048, 1}, + {10, 40960, 4, 2048, 1}, + {10, 51200, 4, 2048, 1}, + {10, 61440, 4, 2048, 1}, + {10, 71680, 4, 2048, 1}, + {10, 81920, 4, 8192, 1}, + {10, 92160, 4, 2048, 1}, + {10, 102400, 4, 2048, 1}, + {10, 204800, 4, 4096, 1}, + {10, 307200, 4, 4096, 1}, + {10, 409600, 4, 16384, 1}, + {10, 512000, 4, 4096, 1}, + {10, 614400, 4, 8192, 1}, + {10, 716800, 4, 2048, 1}, + {10, 819200, 4, 16384, 1}, + {10, 921600, 4, 4096, 1}, + {10, 1024000, 4, 8192, 1}, + {10, 512, 6, 32, 1}, + {10, 1024, 6, 128, 1}, + {10, 1536, 6, 256, 1}, + {10, 2048, 6, 512, 1}, + {10, 2560, 6, 128, 1}, + {10, 3072, 6, 512, 1}, + {10, 3584, 6, 512, 1}, + {10, 4096, 6, 256, 1}, + {10, 4608, 6, 256, 1}, + {10, 5120, 6, 256, 1}, + {10, 5632, 6, 512, 1}, + {10, 6144, 6, 512, 1}, + {10, 6656, 6, 512, 1}, + {10, 7168, 6, 512, 1}, + {10, 7680, 6, 512, 1}, + {10, 8192, 6, 32, 0}, + {10, 8704, 6, 32, 0}, 
+ {10, 9216, 6, 1024, 1}, + {10, 9728, 6, 512, 1}, + {10, 10240, 6, 512, 1}, + {10, 20480, 6, 1024, 1}, + {10, 30720, 6, 1024, 1}, + {10, 40960, 6, 2048, 1}, + {10, 51200, 6, 1024, 1}, + {10, 61440, 6, 4096, 1}, + {10, 71680, 6, 2048, 1}, + {10, 81920, 6, 4096, 1}, + {10, 92160, 6, 2048, 1}, + {10, 102400, 6, 4096, 1}, + {10, 204800, 6, 8192, 1}, + {10, 307200, 6, 4096, 1}, + {10, 409600, 6, 16384, 1}, + {10, 512000, 6, 512, 1}, + {10, 614400, 6, 8192, 1}, + {10, 716800, 6, 4096, 1}, + {10, 819200, 6, 32768, 1}, + {10, 921600, 6, 2048, 1}, + {10, 1024000, 6, 8192, 1}, + {15, 512, 12, 128, 0}, + {15, 1024, 12, 128, 1}, + {15, 1536, 12, 256, 1}, + {15, 2048, 12, 128, 1}, + {15, 2560, 12, 512, 1}, + {15, 3072, 12, 1024, 0}, + {15, 3584, 12, 512, 0}, + {15, 4096, 12, 256, 1}, + {15, 4608, 12, 512, 0}, + {15, 5120, 12, 512, 0}, + {15, 5632, 12, 512, 0}, + {15, 6144, 12, 32, 0}, + {15, 6656, 12, 256, 0}, + {15, 7168, 12, 512, 0}, + {15, 7680, 12, 32, 0}, + {15, 8192, 12, 32, 0}, + {15, 8704, 12, 32, 0}, + {15, 9216, 12, 512, 0}, + {15, 9728, 12, 32, 0}, + {15, 10240, 12, 32, 0}, + {15, 20480, 12, 512, 1}, + {15, 30720, 12, 2048, 1}, + {15, 40960, 12, 2048, 1}, + {15, 51200, 12, 2048, 1}, + {15, 61440, 12, 2048, 1}, + {15, 71680, 12, 2048, 1}, + {15, 81920, 12, 4096, 1}, + {15, 92160, 12, 1024, 0}, + {15, 102400, 12, 2048, 0}, + {15, 204800, 12, 1024, 0}, + {15, 307200, 12, 1024, 0}, + {15, 409600, 12, 2048, 0}, + {15, 512000, 12, 1024, 0}, + {15, 614400, 12, 1024, 0}, + {15, 716800, 12, 1024, 0}, + {15, 819200, 12, 2048, 0}, + {15, 921600, 12, 2048, 0}, + {15, 1024000, 12, 1024, 0}, + {20, 512, 11, 32, 0}, + {20, 1024, 11, 256, 0}, + {20, 1536, 11, 256, 0}, + {20, 2048, 11, 256, 0}, + {20, 2560, 11, 256, 0}, + {20, 3072, 11, 256, 1}, + {20, 3584, 11, 32, 0}, + {20, 4096, 11, 256, 0}, + {20, 4608, 11, 32, 0}, + {20, 5120, 11, 256, 0}, + {20, 5632, 11, 32, 0}, + {20, 6144, 11, 32, 0}, + {20, 6656, 11, 32, 0}, + {20, 7168, 11, 32, 0}, + {20, 7680, 11, 32, 0}, + {20, 8192, 
11, 32, 0}, + {20, 8704, 11, 32, 0}, + {20, 9216, 11, 32, 0}, + {20, 9728, 11, 32, 0}, + {20, 10240, 11, 32, 0}, + {20, 20480, 11, 2048, 1}, + {20, 30720, 11, 2048, 1}, + {20, 40960, 11, 8192, 1}, + {20, 51200, 11, 2048, 1}, + {20, 61440, 11, 2048, 1}, + {20, 71680, 11, 2048, 1}, + {20, 81920, 11, 16384, 1}, + {20, 92160, 11, 2048, 1}, + {20, 102400, 11, 4096, 1}, + {20, 204800, 11, 4096, 1}, + {20, 307200, 11, 4096, 1}, + {20, 409600, 11, 8192, 1}, + {20, 512000, 11, 4096, 1}, + {20, 614400, 11, 8192, 1}, + {20, 716800, 11, 4096, 1}, + {20, 819200, 11, 32768, 1}, + {20, 921600, 11, 4096, 1}, + {20, 1024000, 11, 8192, 1}, + {21, 512, 16, 32, 0}, + {21, 1024, 16, 256, 0}, + {21, 1536, 16, 256, 0}, + {21, 2048, 16, 256, 0}, + {21, 2560, 16, 256, 1}, + {21, 3072, 16, 32, 0}, + {21, 3584, 16, 32, 0}, + {21, 4096, 16, 256, 0}, + {21, 4608, 16, 32, 0}, + {21, 5120, 16, 32, 0}, + {21, 5632, 16, 32, 0}, + {21, 6144, 16, 32, 0}, + {21, 6656, 16, 32, 0}, + {21, 7168, 16, 32, 0}, + {21, 7680, 16, 32, 0}, + {21, 8192, 16, 32, 0}, + {21, 8704, 16, 32, 0}, + {21, 9216, 16, 32, 0}, + {21, 9728, 16, 32, 0}, + {21, 10240, 16, 32, 0}, + {21, 20480, 16, 512, 1}, + {21, 30720, 16, 2048, 1}, + {21, 40960, 16, 2048, 1}, + {21, 51200, 16, 2048, 1}, + {21, 61440, 16, 4096, 1}, + {21, 71680, 16, 2048, 0}, + {21, 81920, 16, 2048, 0}, + {21, 92160, 16, 1024, 0}, + {21, 102400, 16, 4096, 1}, + {21, 204800, 16, 8192, 0}, + {21, 307200, 16, 1024, 0}, + {21, 409600, 16, 8192, 0}, + {21, 512000, 16, 2048, 0}, + {21, 614400, 16, 8192, 0}, + {21, 716800, 16, 1024, 0}, + {21, 819200, 16, 8192, 0}, + {21, 921600, 16, 1024, 0}, + {21, 1024000, 16, 8192, 0}, + {28, 512, 25, 256, 0}, + {28, 1024, 25, 64, 0}, + {28, 1536, 25, 128, 0}, + {28, 2048, 25, 128, 0}, + {28, 2560, 25, 256, 0}, + {28, 3072, 25, 128, 0}, + {28, 3584, 25, 32, 0}, + {28, 4096, 25, 128, 0}, + {28, 4608, 25, 128, 0}, + {28, 5120, 25, 128, 0}, + {28, 5632, 25, 128, 0}, + {28, 6144, 25, 128, 0}, + {28, 6656, 25, 128, 0}, + {28, 7168, 
25, 128, 0}, + {28, 7680, 25, 128, 0}, + {28, 8192, 25, 128, 0}, + {28, 8704, 25, 128, 0}, + {28, 9216, 25, 128, 0}, + {28, 9728, 25, 128, 0}, + {28, 10240, 25, 128, 0}, + {28, 20480, 25, 128, 0}, + {28, 30720, 25, 128, 0}, + {28, 40960, 25, 128, 0}, + {28, 51200, 25, 128, 0}, + {28, 61440, 25, 128, 0}, + {28, 71680, 25, 128, 0}, + {28, 81920, 25, 64, 0}, + {28, 92160, 25, 128, 0}, + {28, 102400, 25, 128, 0}, + {28, 204800, 25, 8192, 0}, + {28, 307200, 25, 128, 0}, + {28, 409600, 25, 8192, 0}, + {28, 512000, 25, 128, 0}, + {28, 614400, 25, 8192, 0}, + {28, 716800, 25, 128, 0}, + {28, 819200, 25, 8192, 0}, + {28, 921600, 25, 512, 0}, + {28, 1024000, 25, 8192, 0}, + {35, 512, 24, 32, 0}, + {35, 1024, 24, 256, 0}, + {35, 1536, 24, 256, 0}, + {35, 2048, 24, 256, 0}, + {35, 2560, 24, 64, 0}, + {35, 3072, 24, 32, 0}, + {35, 3584, 24, 64, 0}, + {35, 4096, 24, 128, 0}, + {35, 4608, 24, 128, 0}, + {35, 5120, 24, 128, 0}, + {35, 5632, 24, 128, 0}, + {35, 6144, 24, 128, 0}, + {35, 6656, 24, 128, 0}, + {35, 7168, 24, 128, 0}, + {35, 7680, 24, 256, 1}, + {35, 8192, 24, 1024, 1}, + {35, 8704, 24, 512, 1}, + {35, 9216, 24, 1024, 1}, + {35, 9728, 24, 256, 1}, + {35, 10240, 24, 2048, 1}, + {35, 20480, 24, 32, 0}, + {35, 30720, 24, 32, 0}, + {35, 40960, 24, 8192, 0}, + {35, 51200, 24, 2048, 1}, + {35, 61440, 24, 128, 0}, + {35, 71680, 24, 64, 0}, + {35, 81920, 24, 8192, 0}, + {35, 92160, 24, 2048, 1}, + {35, 102400, 24, 128, 0}, + {35, 204800, 24, 8192, 0}, + {35, 307200, 24, 4096, 1}, + {35, 409600, 24, 8192, 0}, + {35, 512000, 24, 4096, 1}, + {35, 614400, 24, 8192, 0}, + {35, 716800, 24, 4096, 1}, + {35, 819200, 24, 8192, 0}, + {35, 921600, 24, 921600, 1}, + {35, 1024000, 24, 8192, 0}, + {36, 512, 33, 32, 0}, + {36, 1024, 33, 32, 0}, + {36, 1536, 33, 128, 0}, + {36, 2048, 33, 128, 0}, + {36, 2560, 33, 256, 0}, + {36, 3072, 33, 32, 0}, + {36, 3584, 33, 128, 0}, + {36, 4096, 33, 128, 0}, + {36, 4608, 33, 32, 0}, + {36, 5120, 33, 128, 0}, + {36, 5632, 33, 128, 0}, + {36, 6144, 33, 
128, 0}, + {36, 6656, 33, 64, 0}, + {36, 7168, 33, 128, 0}, + {36, 7680, 33, 64, 0}, + {36, 8192, 33, 8192, 1}, + {36, 8704, 33, 64, 0}, + {36, 9216, 33, 64, 0}, + {36, 9728, 33, 256, 1}, + {36, 10240, 33, 128, 0}, + {36, 20480, 33, 64, 0}, + {36, 30720, 33, 64, 0}, + {36, 40960, 33, 8192, 0}, + {36, 51200, 33, 64, 0}, + {36, 61440, 33, 32, 0}, + {36, 71680, 33, 64, 0}, + {36, 81920, 33, 8192, 0}, + {36, 92160, 33, 64, 0}, + {36, 102400, 33, 64, 0}, + {36, 204800, 33, 8192, 0}, + {36, 307200, 33, 64, 0}, + {36, 409600, 33, 8192, 0}, + {36, 512000, 33, 64, 0}, + {36, 614400, 33, 8192, 0}, + {36, 716800, 33, 512, 0}, + {36, 819200, 33, 8192, 0}, + {36, 921600, 33, 512, 0}, + {36, 1024000, 33, 8192, 0}, + {45, 512, 42, 128, 0}, + {45, 1024, 42, 128, 0}, + {45, 1536, 42, 128, 0}, + {45, 2048, 42, 128, 0}, + {45, 2560, 42, 128, 0}, + {45, 3072, 42, 32, 0}, + {45, 3584, 42, 128, 0}, + {45, 4096, 42, 32, 0}, + {45, 4608, 42, 128, 0}, + {45, 5120, 42, 128, 0}, + {45, 5632, 42, 128, 0}, + {45, 6144, 42, 128, 0}, + {45, 6656, 42, 128, 0}, + {45, 7168, 42, 128, 0}, + {45, 7680, 42, 128, 0}, + {45, 8192, 42, 32, 0}, + {45, 8704, 42, 32, 0}, + {45, 9216, 42, 64, 0}, + {45, 9728, 42, 64, 0}, + {45, 10240, 42, 128, 0}, + {45, 20480, 42, 64, 0}, + {45, 30720, 42, 64, 0}, + {45, 40960, 42, 8192, 0}, + {45, 51200, 42, 64, 0}, + {45, 61440, 42, 64, 0}, + {45, 71680, 42, 64, 0}, + {45, 81920, 42, 64, 0}, + {45, 92160, 42, 64, 0}, + {45, 102400, 42, 64, 0}, + {45, 204800, 42, 8192, 0}, + {45, 307200, 42, 64, 0}, + {45, 409600, 42, 8192, 0}, + {45, 512000, 42, 512, 0}, + {45, 614400, 42, 8192, 0}, + {45, 716800, 42, 512, 0}, + {45, 819200, 42, 8192, 0}, + {45, 921600, 42, 512, 0}, + {45, 1024000, 42, 8192, 0}, + {56, 512, 43, 512, 0}, + {56, 1024, 43, 128, 0}, + {56, 1536, 43, 128, 0}, + {56, 2048, 43, 128, 0}, + {56, 2560, 43, 128, 0}, + {56, 3072, 43, 128, 0}, + {56, 3584, 43, 128, 0}, + {56, 4096, 43, 128, 0}, + {56, 4608, 43, 128, 0}, + {56, 5120, 43, 128, 0}, + {56, 5632, 43, 128, 
0}, + {56, 6144, 43, 64, 0}, + {56, 6656, 43, 64, 0}, + {56, 7168, 43, 128, 0}, + {56, 7680, 43, 64, 0}, + {56, 8192, 43, 64, 0}, + {56, 8704, 43, 128, 0}, + {56, 9216, 43, 64, 0}, + {56, 9728, 43, 64, 0}, + {56, 10240, 43, 64, 0}, + {56, 20480, 43, 64, 0}, + {56, 30720, 43, 64, 0}, + {56, 40960, 43, 64, 0}, + {56, 51200, 43, 64, 0}, + {56, 61440, 43, 64, 0}, + {56, 71680, 43, 64, 0}, + {56, 81920, 43, 64, 0}, + {56, 92160, 43, 64, 0}, + {56, 102400, 43, 64, 0}, + {56, 204800, 43, 8192, 0}, + {56, 307200, 43, 64, 0}, + {56, 409600, 43, 8192, 0}, + {56, 512000, 43, 64, 0}, + {56, 614400, 43, 8192, 0}, + {56, 716800, 43, 716800, 0}, + {56, 819200, 43, 8192, 0}, + {56, 921600, 43, 921600, 0}, + {56, 1024000, 43, 8192, 0}, + {84, 512, 126, 64, 0}, + {84, 1024, 126, 64, 0}, + {84, 1536, 126, 64, 0}, + {84, 2048, 126, 1024, 0}, + {84, 2560, 126, 64, 0}, + {84, 3072, 126, 64, 0}, + {84, 3584, 126, 128, 0}, + {84, 4096, 126, 64, 0}, + {84, 4608, 126, 128, 0}, + {84, 5120, 126, 128, 0}, + {84, 5632, 126, 128, 0}, + {84, 6144, 126, 128, 0}, + {84, 6656, 126, 128, 0}, + {84, 7168, 126, 128, 0}, + {84, 7680, 126, 128, 0}, + {84, 8192, 126, 8192, 0}, + {84, 8704, 126, 128, 0}, + {84, 9216, 126, 128, 0}, + {84, 9728, 126, 128, 0}, + {84, 10240, 126, 128, 0}, + {84, 20480, 126, 4096, 0}, + {84, 30720, 126, 128, 0}, + {84, 40960, 126, 8192, 0}, + {84, 51200, 126, 128, 0}, + {84, 61440, 126, 4096, 0}, + {84, 71680, 126, 128, 0}, + {84, 81920, 126, 8192, 0}, + {84, 92160, 126, 128, 0}, + {84, 102400, 126, 4096, 0}, + {84, 204800, 126, 8192, 0}, + {84, 307200, 126, 4096, 0}, + {84, 409600, 126, 16384, 0}, + {84, 512000, 126, 4096, 0}, + {84, 614400, 126, 8192, 0}, + {84, 716800, 126, 4096, 0}, + {84, 819200, 126, 16384, 0}, + {84, 921600, 126, 4096, 0}, + {84, 1024000, 126, 8192, 0}, + {120, 512, 210, 64, 0}, + {120, 1024, 210, 512, 0}, + {120, 1536, 210, 32, 0}, + {120, 2048, 210, 2048, 0}, + {120, 2560, 210, 64, 0}, + {120, 3072, 210, 64, 0}, + {120, 3584, 210, 512, 0}, + {120, 
4096, 210, 4096, 0}, + {120, 4608, 210, 64, 0}, + {120, 5120, 210, 64, 0}, + {120, 5632, 210, 512, 0}, + {120, 6144, 210, 2048, 0}, + {120, 6656, 210, 128, 0}, + {120, 7168, 210, 1024, 0}, + {120, 7680, 210, 512, 0}, + {120, 8192, 210, 8192, 0}, + {120, 8704, 210, 256, 0}, + {120, 9216, 210, 9216, 0}, + {120, 9728, 210, 9728, 0}, + {120, 10240, 210, 2048, 0}, + {120, 20480, 210, 4096, 0}, + {120, 30720, 210, 30720, 0}, + {120, 40960, 210, 8192, 0}, + {120, 51200, 210, 51200, 0}, + {120, 61440, 210, 4096, 0}, + {120, 71680, 210, 71680, 0}, + {120, 81920, 210, 8192, 0}, + {120, 92160, 210, 92160, 0}, + {120, 102400, 210, 102400, 0}, + {120, 204800, 210, 8192, 0}, + {120, 307200, 210, 4096, 0}, + {120, 409600, 210, 8192, 0}, + {120, 512000, 210, 4096, 0}, + {120, 614400, 210, 8192, 0}, + {120, 716800, 210, 4096, 0}, + {120, 819200, 210, 8192, 0}, + {120, 921600, 210, 4096, 0}, + {120, 1024000, 210, 8192, 0}, + {165, 512, 330, 256, 0}, + {165, 1024, 330, 1024, 0}, + {165, 1536, 330, 128, 0}, + {165, 2048, 330, 2048, 0}, + {165, 2560, 330, 512, 0}, + {165, 3072, 330, 1024, 0}, + {165, 3584, 330, 256, 0}, + {165, 4096, 330, 2048, 0}, + {165, 4608, 330, 512, 0}, + {165, 5120, 330, 1024, 0}, + {165, 5632, 330, 5632, 0}, + {165, 6144, 330, 2048, 0}, + {165, 6656, 330, 6656, 0}, + {165, 7168, 330, 1024, 0}, + {165, 7680, 330, 7680, 0}, + {165, 8192, 330, 8192, 0}, + {165, 8704, 330, 8704, 0}, + {165, 9216, 330, 1024, 0}, + {165, 9728, 330, 9728, 0}, + {165, 10240, 330, 1024, 0}, + {165, 20480, 330, 4096, 0}, + {165, 30720, 330, 1024, 0}, + {165, 40960, 330, 8192, 0}, + {165, 51200, 330, 1024, 0}, + {165, 61440, 330, 1024, 0}, + {165, 71680, 330, 1024, 0}, + {165, 81920, 330, 8192, 0}, + {165, 92160, 330, 1024, 0}, + {165, 102400, 330, 1024, 0}, + {165, 204800, 330, 8192, 0}, + {165, 307200, 330, 1024, 0}, + {165, 409600, 330, 8192, 0}, + {165, 512000, 330, 1024, 0}, + {165, 614400, 330, 8192, 0}, + {165, 716800, 330, 1024, 0}, + {165, 819200, 330, 8192, 0}, + {165, 921600, 
330, 1024, 0}, + {165, 1024000, 330, 8192, 0} }; //////////////////////////////////////////////////////////////////////////////// -std::vector< std::array > sgemm_tn_mi250x = -{ - {1 , 512 , 3 , 32 , 0 }, - {1 , 1024 , 3 , 32 , 0 }, - {1 , 1536 , 3 , 256 , 1 }, - {1 , 2048 , 3 , 1024 , 1 }, - {1 , 2560 , 3 , 256 , 1 }, - {1 , 3072 , 3 , 256 , 1 }, - {1 , 3584 , 3 , 512 , 1 }, - {1 , 4096 , 3 , 256 , 1 }, - {1 , 4608 , 3 , 32 , 0 }, - {1 , 5120 , 3 , 256 , 1 }, - {1 , 5632 , 3 , 512 , 1 }, - {1 , 6144 , 3 , 2048 , 1 }, - {1 , 6656 , 3 , 512 , 1 }, - {1 , 7168 , 3 , 1024 , 1 }, - {1 , 7680 , 3 , 512 , 1 }, - {1 , 8192 , 3 , 2048 , 1 }, - {1 , 8704 , 3 , 512 , 1 }, - {1 , 9216 , 3 , 512 , 1 }, - {1 , 9728 , 3 , 512 , 1 }, - {1 , 10240 , 3 , 2048 , 1 }, - {1 , 20480 , 3 , 2048 , 1 }, - {1 , 30720 , 3 , 2048 , 1 }, - {1 , 40960 , 3 , 8192 , 1 }, - {1 , 51200 , 3 , 2048 , 1 }, - {1 , 61440 , 3 , 4096 , 1 }, - {1 , 71680 , 3 , 2048 , 1 }, - {1 , 81920 , 3 , 8192 , 1 }, - {1 , 92160 , 3 , 2048 , 1 }, - {1 , 102400 , 3 , 4096 , 1 }, - {1 , 204800 , 3 , 8192 , 1 }, - {1 , 307200 , 3 , 4096 , 1 }, - {1 , 409600 , 3 , 16384 , 1 }, - {1 , 512000 , 3 , 2048 , 1 }, - {1 , 614400 , 3 , 8192 , 1 }, - {1 , 716800 , 3 , 2048 , 1 }, - {1 , 819200 , 3 , 32768 , 1 }, - {1 , 921600 , 3 , 2048 , 1 }, - {1 , 1024000, 3 , 2048 , 1 }, - {1 , 512 , 4 , 128 , 0 }, - {1 , 1024 , 4 , 128 , 1 }, - {1 , 1536 , 4 , 256 , 1 }, - {1 , 2048 , 4 , 128 , 1 }, - {1 , 2560 , 4 , 512 , 1 }, - {1 , 3072 , 4 , 1024 , 1 }, - {1 , 3584 , 4 , 512 , 1 }, - {1 , 4096 , 4 , 1024 , 1 }, - {1 , 4608 , 4 , 512 , 1 }, - {1 , 5120 , 4 , 256 , 1 }, - {1 , 5632 , 4 , 512 , 1 }, - {1 , 6144 , 4 , 1024 , 1 }, - {1 , 6656 , 4 , 512 , 1 }, - {1 , 7168 , 4 , 1024 , 1 }, - {1 , 7680 , 4 , 128 , 0 }, - {1 , 8192 , 4 , 4096 , 1 }, - {1 , 8704 , 4 , 512 , 1 }, - {1 , 9216 , 4 , 1024 , 1 }, - {1 , 9728 , 4 , 512 , 1 }, - {1 , 10240 , 4 , 1024 , 1 }, - {1 , 20480 , 4 , 2048 , 1 }, - {1 , 30720 , 4 , 2048 , 1 }, - {1 , 40960 , 4 , 
2048 , 1 }, - {1 , 51200 , 4 , 2048 , 1 }, - {1 , 61440 , 4 , 4096 , 1 }, - {1 , 71680 , 4 , 2048 , 1 }, - {1 , 81920 , 4 , 16384 , 1 }, - {1 , 92160 , 4 , 2048 , 1 }, - {1 , 102400 , 4 , 4096 , 1 }, - {1 , 204800 , 4 , 8192 , 1 }, - {1 , 307200 , 4 , 4096 , 1 }, - {1 , 409600 , 4 , 8192 , 1 }, - {1 , 512000 , 4 , 1024 , 1 }, - {1 , 614400 , 4 , 8192 , 1 }, - {1 , 716800 , 4 , 1024 , 1 }, - {1 , 819200 , 4 , 16384 , 1 }, - {1 , 921600 , 4 , 4096 , 1 }, - {1 , 1024000, 4 , 2048 , 1 }, - {3 , 512 , 6 , 128 , 0 }, - {3 , 1024 , 6 , 128 , 1 }, - {3 , 1536 , 6 , 256 , 1 }, - {3 , 2048 , 6 , 1024 , 1 }, - {3 , 2560 , 6 , 256 , 1 }, - {3 , 3072 , 6 , 1024 , 1 }, - {3 , 3584 , 6 , 512 , 1 }, - {3 , 4096 , 6 , 512 , 1 }, - {3 , 4608 , 6 , 512 , 1 }, - {3 , 5120 , 6 , 256 , 1 }, - {3 , 5632 , 6 , 512 , 1 }, - {3 , 6144 , 6 , 512 , 1 }, - {3 , 6656 , 6 , 32 , 0 }, - {3 , 7168 , 6 , 1024 , 1 }, - {3 , 7680 , 6 , 512 , 1 }, - {3 , 8192 , 6 , 1024 , 1 }, - {3 , 8704 , 6 , 32 , 0 }, - {3 , 9216 , 6 , 512 , 1 }, - {3 , 9728 , 6 , 32 , 0 }, - {3 , 10240 , 6 , 512 , 1 }, - {3 , 20480 , 6 , 32 , 0 }, - {3 , 30720 , 6 , 64 , 0 }, - {3 , 40960 , 6 , 2048 , 1 }, - {3 , 51200 , 6 , 2048 , 1 }, - {3 , 61440 , 6 , 64 , 0 }, - {3 , 71680 , 6 , 32 , 0 }, - {3 , 81920 , 6 , 2048 , 1 }, - {3 , 92160 , 6 , 2048 , 1 }, - {3 , 102400 , 6 , 4096 , 1 }, - {3 , 204800 , 6 , 8192 , 1 }, - {3 , 307200 , 6 , 4096 , 1 }, - {3 , 409600 , 6 , 16384 , 1 }, - {3 , 512000 , 6 , 1024 , 1 }, - {3 , 614400 , 6 , 8192 , 1 }, - {3 , 716800 , 6 , 1024 , 1 }, - {3 , 819200 , 6 , 32768 , 1 }, - {3 , 921600 , 6 , 2048 , 1 }, - {3 , 1024000, 6 , 512 , 1 }, - {4 , 512 , 10 , 128 , 0 }, - {4 , 1024 , 10 , 128 , 1 }, - {4 , 1536 , 10 , 256 , 1 }, - {4 , 2048 , 10 , 1024 , 1 }, - {4 , 2560 , 10 , 512 , 1 }, - {4 , 3072 , 10 , 512 , 1 }, - {4 , 3584 , 10 , 256 , 1 }, - {4 , 4096 , 10 , 2048 , 1 }, - {4 , 4608 , 10 , 512 , 1 }, - {4 , 5120 , 10 , 32 , 0 }, - {4 , 5632 , 10 , 256 , 1 }, - {4 , 6144 , 10 , 512 , 1 }, - {4 , 
6656 , 10 , 512 , 1 }, - {4 , 7168 , 10 , 512 , 1 }, - {4 , 7680 , 10 , 64 , 0 }, - {4 , 8192 , 10 , 64 , 0 }, - {4 , 8704 , 10 , 64 , 0 }, - {4 , 9216 , 10 , 64 , 0 }, - {4 , 9728 , 10 , 64 , 0 }, - {4 , 10240 , 10 , 1024 , 1 }, - {4 , 20480 , 10 , 64 , 0 }, - {4 , 30720 , 10 , 64 , 0 }, - {4 , 40960 , 10 , 64 , 0 }, - {4 , 51200 , 10 , 32 , 0 }, - {4 , 61440 , 10 , 32 , 0 }, - {4 , 71680 , 10 , 2048 , 1 }, - {4 , 81920 , 10 , 2048 , 1 }, - {4 , 92160 , 10 , 2048 , 1 }, - {4 , 102400 , 10 , 64 , 0 }, - {4 , 204800 , 10 , 32 , 0 }, - {4 , 307200 , 10 , 32 , 0 }, - {4 , 409600 , 10 , 64 , 0 }, - {4 , 512000 , 10 , 64 , 0 }, - {4 , 614400 , 10 , 64 , 0 }, - {4 , 716800 , 10 , 32 , 0 }, - {4 , 819200 , 10 , 32 , 0 }, - {4 , 921600 , 10 , 32 , 0 }, - {4 , 1024000, 10 , 32 , 0 }, - {6 , 512 , 10 , 32 , 0 }, - {6 , 1024 , 10 , 128 , 1 }, - {6 , 1536 , 10 , 256 , 1 }, - {6 , 2048 , 10 , 1024 , 1 }, - {6 , 2560 , 10 , 512 , 1 }, - {6 , 3072 , 10 , 512 , 1 }, - {6 , 3584 , 10 , 512 , 1 }, - {6 , 4096 , 10 , 2048 , 1 }, - {6 , 4608 , 10 , 256 , 1 }, - {6 , 5120 , 10 , 64 , 0 }, - {6 , 5632 , 10 , 512 , 1 }, - {6 , 6144 , 10 , 1024 , 1 }, - {6 , 6656 , 10 , 512 , 1 }, - {6 , 7168 , 10 , 64 , 0 }, - {6 , 7680 , 10 , 64 , 0 }, - {6 , 8192 , 10 , 64 , 0 }, - {6 , 8704 , 10 , 32 , 0 }, - {6 , 9216 , 10 , 64 , 0 }, - {6 , 9728 , 10 , 128 , 0 }, - {6 , 10240 , 10 , 128 , 0 }, - {6 , 20480 , 10 , 32 , 0 }, - {6 , 30720 , 10 , 64 , 0 }, - {6 , 40960 , 10 , 32 , 0 }, - {6 , 51200 , 10 , 1024 , 1 }, - {6 , 61440 , 10 , 4096 , 1 }, - {6 , 71680 , 10 , 64 , 0 }, - {6 , 81920 , 10 , 8192 , 1 }, - {6 , 92160 , 10 , 32 , 0 }, - {6 , 102400 , 10 , 32 , 0 }, - {6 , 204800 , 10 , 64 , 0 }, - {6 , 307200 , 10 , 32 , 0 }, - {6 , 409600 , 10 , 32 , 0 }, - {6 , 512000 , 10 , 64 , 0 }, - {6 , 614400 , 10 , 64 , 0 }, - {6 , 716800 , 10 , 64 , 0 }, - {6 , 819200 , 10 , 64 , 0 }, - {6 , 921600 , 10 , 32 , 0 }, - {6 , 1024000, 10 , 64 , 0 }, - {12 , 512 , 15 , 32 , 0 }, - {12 , 1024 , 15 , 128 , 1 }, - 
{12 , 1536 , 15 , 64 , 1 }, - {12 , 2048 , 15 , 128 , 1 }, - {12 , 2560 , 15 , 256 , 1 }, - {12 , 3072 , 15 , 512 , 1 }, - {12 , 3584 , 15 , 512 , 1 }, - {12 , 4096 , 15 , 64 , 0 }, - {12 , 4608 , 15 , 512 , 1 }, - {12 , 5120 , 15 , 512 , 1 }, - {12 , 5632 , 15 , 64 , 0 }, - {12 , 6144 , 15 , 32 , 0 }, - {12 , 6656 , 15 , 256 , 0 }, - {12 , 7168 , 15 , 64 , 0 }, - {12 , 7680 , 15 , 32 , 0 }, - {12 , 8192 , 15 , 32 , 0 }, - {12 , 8704 , 15 , 32 , 0 }, - {12 , 9216 , 15 , 512 , 0 }, - {12 , 9728 , 15 , 64 , 0 }, - {12 , 10240 , 15 , 64 , 0 }, - {12 , 20480 , 15 , 64 , 0 }, - {12 , 30720 , 15 , 64 , 0 }, - {12 , 40960 , 15 , 64 , 0 }, - {12 , 51200 , 15 , 64 , 0 }, - {12 , 61440 , 15 , 64 , 0 }, - {12 , 71680 , 15 , 64 , 0 }, - {12 , 81920 , 15 , 32 , 0 }, - {12 , 92160 , 15 , 64 , 0 }, - {12 , 102400 , 15 , 32 , 0 }, - {12 , 204800 , 15 , 32 , 0 }, - {12 , 307200 , 15 , 32 , 0 }, - {12 , 409600 , 15 , 64 , 0 }, - {12 , 512000 , 15 , 64 , 0 }, - {12 , 614400 , 15 , 32 , 0 }, - {12 , 716800 , 15 , 64 , 0 }, - {12 , 819200 , 15 , 64 , 0 }, - {12 , 921600 , 15 , 64 , 0 }, - {12 , 1024000, 15 , 64 , 0 }, - {11 , 512 , 20 , 32 , 0 }, - {11 , 1024 , 20 , 128 , 0 }, - {11 , 1536 , 20 , 256 , 0 }, - {11 , 2048 , 20 , 512 , 0 }, - {11 , 2560 , 20 , 32 , 0 }, - {11 , 3072 , 20 , 256 , 0 }, - {11 , 3584 , 20 , 32 , 0 }, - {11 , 4096 , 20 , 512 , 0 }, - {11 , 4608 , 20 , 32 , 0 }, - {11 , 5120 , 20 , 32 , 0 }, - {11 , 5632 , 20 , 128 , 0 }, - {11 , 6144 , 20 , 512 , 0 }, - {11 , 6656 , 20 , 32 , 0 }, - {11 , 7168 , 20 , 32 , 0 }, - {11 , 7680 , 20 , 32 , 0 }, - {11 , 8192 , 20 , 32 , 0 }, - {11 , 8704 , 20 , 32 , 0 }, - {11 , 9216 , 20 , 32 , 0 }, - {11 , 9728 , 20 , 32 , 0 }, - {11 , 10240 , 20 , 32 , 0 }, - {11 , 20480 , 20 , 32 , 0 }, - {11 , 30720 , 20 , 32 , 0 }, - {11 , 40960 , 20 , 32 , 0 }, - {11 , 51200 , 20 , 32 , 0 }, - {11 , 61440 , 20 , 32 , 0 }, - {11 , 71680 , 20 , 32 , 0 }, - {11 , 81920 , 20 , 128 , 0 }, - {11 , 92160 , 20 , 128 , 0 }, - {11 , 102400 , 20 , 128 , 
0 }, - {11 , 204800 , 20 , 32 , 0 }, - {11 , 307200 , 20 , 128 , 0 }, - {11 , 409600 , 20 , 128 , 0 }, - {11 , 512000 , 20 , 128 , 0 }, - {11 , 614400 , 20 , 128 , 0 }, - {11 , 716800 , 20 , 128 , 0 }, - {11 , 819200 , 20 , 128 , 0 }, - {11 , 921600 , 20 , 128 , 0 }, - {11 , 1024000, 20 , 128 , 0 }, - {16 , 512 , 21 , 32 , 0 }, - {16 , 1024 , 21 , 32 , 0 }, - {16 , 1536 , 21 , 256 , 0 }, - {16 , 2048 , 21 , 512 , 0 }, - {16 , 2560 , 21 , 32 , 0 }, - {16 , 3072 , 21 , 32 , 0 }, - {16 , 3584 , 21 , 32 , 0 }, - {16 , 4096 , 21 , 32 , 0 }, - {16 , 4608 , 21 , 32 , 0 }, - {16 , 5120 , 21 , 32 , 0 }, - {16 , 5632 , 21 , 512 , 0 }, - {16 , 6144 , 21 , 32 , 0 }, - {16 , 6656 , 21 , 32 , 0 }, - {16 , 7168 , 21 , 32 , 0 }, - {16 , 7680 , 21 , 32 , 0 }, - {16 , 8192 , 21 , 256 , 0 }, - {16 , 8704 , 21 , 32 , 0 }, - {16 , 9216 , 21 , 32 , 0 }, - {16 , 9728 , 21 , 32 , 0 }, - {16 , 10240 , 21 , 32 , 0 }, - {16 , 20480 , 21 , 32 , 0 }, - {16 , 30720 , 21 , 32 , 0 }, - {16 , 40960 , 21 , 32 , 0 }, - {16 , 51200 , 21 , 128 , 0 }, - {16 , 61440 , 21 , 32 , 0 }, - {16 , 71680 , 21 , 128 , 0 }, - {16 , 81920 , 21 , 128 , 0 }, - {16 , 92160 , 21 , 128 , 0 }, - {16 , 102400 , 21 , 128 , 0 }, - {16 , 204800 , 21 , 128 , 0 }, - {16 , 307200 , 21 , 128 , 0 }, - {16 , 409600 , 21 , 128 , 0 }, - {16 , 512000 , 21 , 128 , 0 }, - {16 , 614400 , 21 , 128 , 0 }, - {16 , 716800 , 21 , 128 , 0 }, - {16 , 819200 , 21 , 128 , 0 }, - {16 , 921600 , 21 , 128 , 0 }, - {16 , 1024000, 21 , 128 , 0 }, - {25 , 512 , 28 , 256 , 0 }, - {25 , 1024 , 28 , 256 , 0 }, - {25 , 1536 , 28 , 128 , 0 }, - {25 , 2048 , 28 , 64 , 0 }, - {25 , 2560 , 28 , 256 , 0 }, - {25 , 3072 , 28 , 256 , 0 }, - {25 , 3584 , 28 , 128 , 0 }, - {25 , 4096 , 28 , 256 , 0 }, - {25 , 4608 , 28 , 64 , 0 }, - {25 , 5120 , 28 , 64 , 0 }, - {25 , 5632 , 28 , 32 , 0 }, - {25 , 6144 , 28 , 128 , 0 }, - {25 , 6656 , 28 , 128 , 0 }, - {25 , 7168 , 28 , 64 , 0 }, - {25 , 7680 , 28 , 64 , 0 }, - {25 , 8192 , 28 , 128 , 0 }, - {25 , 8704 , 28 , 32 
, 0 }, - {25 , 9216 , 28 , 64 , 0 }, - {25 , 9728 , 28 , 64 , 0 }, - {25 , 10240 , 28 , 64 , 0 }, - {25 , 20480 , 28 , 64 , 0 }, - {25 , 30720 , 28 , 64 , 0 }, - {25 , 40960 , 28 , 64 , 0 }, - {25 , 51200 , 28 , 64 , 0 }, - {25 , 61440 , 28 , 64 , 0 }, - {25 , 71680 , 28 , 64 , 0 }, - {25 , 81920 , 28 , 64 , 0 }, - {25 , 92160 , 28 , 64 , 0 }, - {25 , 102400 , 28 , 64 , 0 }, - {25 , 204800 , 28 , 64 , 0 }, - {25 , 307200 , 28 , 64 , 0 }, - {25 , 409600 , 28 , 2048 , 0 }, - {25 , 512000 , 28 , 2048 , 0 }, - {25 , 614400 , 28 , 2048 , 0 }, - {25 , 716800 , 28 , 2048 , 0 }, - {25 , 819200 , 28 , 2048 , 0 }, - {25 , 921600 , 28 , 2048 , 0 }, - {25 , 1024000, 28 , 2048 , 0 }, - {24 , 512 , 35 , 256 , 0 }, - {24 , 1024 , 35 , 128 , 0 }, - {24 , 1536 , 35 , 256 , 0 }, - {24 , 2048 , 35 , 256 , 0 }, - {24 , 2560 , 35 , 128 , 0 }, - {24 , 3072 , 35 , 128 , 0 }, - {24 , 3584 , 35 , 256 , 0 }, - {24 , 4096 , 35 , 128 , 0 }, - {24 , 4608 , 35 , 128 , 0 }, - {24 , 5120 , 35 , 128 , 0 }, - {24 , 5632 , 35 , 256 , 0 }, - {24 , 6144 , 35 , 128 , 0 }, - {24 , 6656 , 35 , 128 , 0 }, - {24 , 7168 , 35 , 64 , 0 }, - {24 , 7680 , 35 , 128 , 0 }, - {24 , 8192 , 35 , 64 , 0 }, - {24 , 8704 , 35 , 64 , 0 }, - {24 , 9216 , 35 , 64 , 0 }, - {24 , 9728 , 35 , 64 , 0 }, - {24 , 10240 , 35 , 64 , 0 }, - {24 , 20480 , 35 , 64 , 0 }, - {24 , 30720 , 35 , 64 , 0 }, - {24 , 40960 , 35 , 64 , 0 }, - {24 , 51200 , 35 , 64 , 0 }, - {24 , 61440 , 35 , 2048 , 0 }, - {24 , 71680 , 35 , 64 , 0 }, - {24 , 81920 , 35 , 64 , 0 }, - {24 , 92160 , 35 , 64 , 0 }, - {24 , 102400 , 35 , 64 , 0 }, - {24 , 204800 , 35 , 64 , 0 }, - {24 , 307200 , 35 , 2048 , 0 }, - {24 , 409600 , 35 , 2048 , 0 }, - {24 , 512000 , 35 , 2048 , 0 }, - {24 , 614400 , 35 , 2048 , 0 }, - {24 , 716800 , 35 , 2048 , 0 }, - {24 , 819200 , 35 , 2048 , 0 }, - {24 , 921600 , 35 , 2048 , 0 }, - {24 , 1024000, 35 , 2048 , 0 }, - {33 , 512 , 36 , 128 , 0 }, - {33 , 1024 , 36 , 256 , 0 }, - {33 , 1536 , 36 , 32 , 0 }, - {33 , 2048 , 36 , 256 , 0 
}, - {33 , 2560 , 36 , 256 , 0 }, - {33 , 3072 , 36 , 128 , 0 }, - {33 , 3584 , 36 , 128 , 0 }, - {33 , 4096 , 36 , 64 , 0 }, - {33 , 4608 , 36 , 64 , 0 }, - {33 , 5120 , 36 , 128 , 0 }, - {33 , 5632 , 36 , 64 , 0 }, - {33 , 6144 , 36 , 64 , 0 }, - {33 , 6656 , 36 , 64 , 0 }, - {33 , 7168 , 36 , 64 , 0 }, - {33 , 7680 , 36 , 64 , 0 }, - {33 , 8192 , 36 , 64 , 0 }, - {33 , 8704 , 36 , 64 , 0 }, - {33 , 9216 , 36 , 64 , 0 }, - {33 , 9728 , 36 , 64 , 0 }, - {33 , 10240 , 36 , 64 , 0 }, - {33 , 20480 , 36 , 64 , 0 }, - {33 , 30720 , 36 , 64 , 0 }, - {33 , 40960 , 36 , 64 , 0 }, - {33 , 51200 , 36 , 64 , 0 }, - {33 , 61440 , 36 , 64 , 0 }, - {33 , 71680 , 36 , 64 , 0 }, - {33 , 81920 , 36 , 64 , 0 }, - {33 , 92160 , 36 , 64 , 0 }, - {33 , 102400 , 36 , 64 , 0 }, - {33 , 204800 , 36 , 64 , 0 }, - {33 , 307200 , 36 , 64 , 0 }, - {33 , 409600 , 36 , 8192 , 0 }, - {33 , 512000 , 36 , 64 , 0 }, - {33 , 614400 , 36 , 8192 , 0 }, - {33 , 716800 , 36 , 64 , 0 }, - {33 , 819200 , 36 , 8192 , 0 }, - {33 , 921600 , 36 , 921600 , 0 }, - {33 , 1024000, 36 , 8192 , 0 }, - {42 , 512 , 45 , 512 , 0 }, - {42 , 1024 , 45 , 64 , 0 }, - {42 , 1536 , 45 , 64 , 0 }, - {42 , 2048 , 45 , 64 , 0 }, - {42 , 2560 , 45 , 64 , 0 }, - {42 , 3072 , 45 , 64 , 0 }, - {42 , 3584 , 45 , 64 , 0 }, - {42 , 4096 , 45 , 64 , 0 }, - {42 , 4608 , 45 , 64 , 0 }, - {42 , 5120 , 45 , 64 , 0 }, - {42 , 5632 , 45 , 128 , 0 }, - {42 , 6144 , 45 , 64 , 0 }, - {42 , 6656 , 45 , 32 , 0 }, - {42 , 7168 , 45 , 64 , 0 }, - {42 , 7680 , 45 , 64 , 0 }, - {42 , 8192 , 45 , 64 , 0 }, - {42 , 8704 , 45 , 64 , 0 }, - {42 , 9216 , 45 , 64 , 0 }, - {42 , 9728 , 45 , 64 , 0 }, - {42 , 10240 , 45 , 64 , 0 }, - {42 , 20480 , 45 , 64 , 0 }, - {42 , 30720 , 45 , 64 , 0 }, - {42 , 40960 , 45 , 64 , 0 }, - {42 , 51200 , 45 , 1024 , 0 }, - {42 , 61440 , 45 , 64 , 0 }, - {42 , 71680 , 45 , 64 , 0 }, - {42 , 81920 , 45 , 64 , 0 }, - {42 , 92160 , 45 , 64 , 0 }, - {42 , 102400 , 45 , 64 , 0 }, - {42 , 204800 , 45 , 8192 , 0 }, - {42 , 
307200 , 45 , 1024 , 0 }, - {42 , 409600 , 45 , 8192 , 0 }, - {42 , 512000 , 45 , 1024 , 0 }, - {42 , 614400 , 45 , 8192 , 0 }, - {42 , 716800 , 45 , 1024 , 0 }, - {42 , 819200 , 45 , 8192 , 0 }, - {42 , 921600 , 45 , 921600 , 0 }, - {42 , 1024000, 45 , 8192 , 0 }, - {43 , 512 , 56 , 128 , 0 }, - {43 , 1024 , 56 , 512 , 0 }, - {43 , 1536 , 56 , 128 , 0 }, - {43 , 2048 , 56 , 64 , 0 }, - {43 , 2560 , 56 , 64 , 0 }, - {43 , 3072 , 56 , 64 , 0 }, - {43 , 3584 , 56 , 128 , 0 }, - {43 , 4096 , 56 , 64 , 0 }, - {43 , 4608 , 56 , 64 , 0 }, - {43 , 5120 , 56 , 64 , 0 }, - {43 , 5632 , 56 , 64 , 0 }, - {43 , 6144 , 56 , 64 , 0 }, - {43 , 6656 , 56 , 128 , 0 }, - {43 , 7168 , 56 , 64 , 0 }, - {43 , 7680 , 56 , 64 , 0 }, - {43 , 8192 , 56 , 64 , 0 }, - {43 , 8704 , 56 , 64 , 0 }, - {43 , 9216 , 56 , 64 , 0 }, - {43 , 9728 , 56 , 64 , 0 }, - {43 , 10240 , 56 , 64 , 0 }, - {43 , 20480 , 56 , 64 , 0 }, - {43 , 30720 , 56 , 64 , 0 }, - {43 , 40960 , 56 , 8192 , 0 }, - {43 , 51200 , 56 , 64 , 0 }, - {43 , 61440 , 56 , 64 , 0 }, - {43 , 71680 , 56 , 64 , 0 }, - {43 , 81920 , 56 , 64 , 0 }, - {43 , 92160 , 56 , 1024 , 0 }, - {43 , 102400 , 56 , 64 , 0 }, - {43 , 204800 , 56 , 8192 , 0 }, - {43 , 307200 , 56 , 1024 , 0 }, - {43 , 409600 , 56 , 8192 , 0 }, - {43 , 512000 , 56 , 1024 , 0 }, - {43 , 614400 , 56 , 8192 , 0 }, - {43 , 716800 , 56 , 1024 , 0 }, - {43 , 819200 , 56 , 8192 , 0 }, - {43 , 921600 , 56 , 1024 , 0 }, - {43 , 1024000, 56 , 8192 , 0 }, - {126 , 512 , 84 , 32 , 0 }, - {126 , 1024 , 84 , 256 , 0 }, - {126 , 1536 , 84 , 64 , 0 }, - {126 , 2048 , 84 , 512 , 0 }, - {126 , 2560 , 84 , 256 , 0 }, - {126 , 3072 , 84 , 256 , 0 }, - {126 , 3584 , 84 , 256 , 0 }, - {126 , 4096 , 84 , 256 , 0 }, - {126 , 4608 , 84 , 512 , 0 }, - {126 , 5120 , 84 , 256 , 0 }, - {126 , 5632 , 84 , 512 , 0 }, - {126 , 6144 , 84 , 2048 , 0 }, - {126 , 6656 , 84 , 512 , 0 }, - {126 , 7168 , 84 , 256 , 0 }, - {126 , 7680 , 84 , 128 , 0 }, - {126 , 8192 , 84 , 8192 , 0 }, - {126 , 8704 , 84 , 512 , 
0 }, - {126 , 9216 , 84 , 256 , 0 }, - {126 , 9728 , 84 , 256 , 0 }, - {126 , 10240 , 84 , 2048 , 0 }, - {126 , 20480 , 84 , 4096 , 0 }, - {126 , 30720 , 84 , 512 , 0 }, - {126 , 40960 , 84 , 4096 , 0 }, - {126 , 51200 , 84 , 512 , 0 }, - {126 , 61440 , 84 , 4096 , 0 }, - {126 , 71680 , 84 , 512 , 0 }, - {126 , 81920 , 84 , 16384 , 0 }, - {126 , 92160 , 84 , 2048 , 0 }, - {126 , 102400 , 84 , 4096 , 0 }, - {126 , 204800 , 84 , 4096 , 0 }, - {126 , 307200 , 84 , 4096 , 0 }, - {126 , 409600 , 84 , 16384 , 0 }, - {126 , 512000 , 84 , 4096 , 0 }, - {126 , 614400 , 84 , 4096 , 0 }, - {126 , 716800 , 84 , 4096 , 0 }, - {126 , 819200 , 84 , 32768 , 0 }, - {126 , 921600 , 84 , 4096 , 0 }, - {126 , 1024000, 84 , 4096 , 0 }, - {210 , 512 , 120 , 256 , 0 }, - {210 , 1024 , 120 , 512 , 0 }, - {210 , 1536 , 120 , 512 , 0 }, - {210 , 2048 , 120 , 2048 , 0 }, - {210 , 2560 , 120 , 128 , 0 }, - {210 , 3072 , 120 , 512 , 0 }, - {210 , 3584 , 120 , 512 , 0 }, - {210 , 4096 , 120 , 128 , 0 }, - {210 , 4608 , 120 , 128 , 0 }, - {210 , 5120 , 120 , 128 , 0 }, - {210 , 5632 , 120 , 128 , 0 }, - {210 , 6144 , 120 , 128 , 0 }, - {210 , 6656 , 120 , 128 , 0 }, - {210 , 7168 , 120 , 128 , 0 }, - {210 , 7680 , 120 , 512 , 0 }, - {210 , 8192 , 120 , 8192 , 0 }, - {210 , 8704 , 120 , 512 , 0 }, - {210 , 9216 , 120 , 128 , 0 }, - {210 , 9728 , 120 , 128 , 0 }, - {210 , 10240 , 120 , 2048 , 0 }, - {210 , 20480 , 120 , 4096 , 0 }, - {210 , 30720 , 120 , 2048 , 0 }, - {210 , 40960 , 120 , 8192 , 0 }, - {210 , 51200 , 120 , 2048 , 0 }, - {210 , 61440 , 120 , 2048 , 0 }, - {210 , 71680 , 120 , 71680 , 0 }, - {210 , 81920 , 120 , 8192 , 0 }, - {210 , 92160 , 120 , 92160 , 0 }, - {210 , 102400 , 120 , 102400 , 0 }, - {210 , 204800 , 120 , 204800 , 0 }, - {210 , 307200 , 120 , 307200 , 0 }, - {210 , 409600 , 120 , 409600 , 0 }, - {210 , 512000 , 120 , 512000 , 0 }, - {210 , 614400 , 120 , 614400 , 0 }, - {210 , 716800 , 120 , 716800 , 0 }, - {210 , 819200 , 120 , 819200 , 0 }, - {210 , 921600 , 120 , 
921600 , 0 }, - {210 , 1024000, 120 , 1024000, 0 }, - {330 , 512 , 165 , 128 , 0 }, - {330 , 1024 , 165 , 512 , 0 }, - {330 , 1536 , 165 , 512 , 0 }, - {330 , 2048 , 165 , 128 , 0 }, - {330 , 2560 , 165 , 128 , 0 }, - {330 , 3072 , 165 , 256 , 0 }, - {330 , 3584 , 165 , 256 , 0 }, - {330 , 4096 , 165 , 2048 , 0 }, - {330 , 4608 , 165 , 256 , 0 }, - {330 , 5120 , 165 , 256 , 0 }, - {330 , 5632 , 165 , 256 , 0 }, - {330 , 6144 , 165 , 2048 , 0 }, - {330 , 6656 , 165 , 256 , 0 }, - {330 , 7168 , 165 , 256 , 0 }, - {330 , 7680 , 165 , 256 , 0 }, - {330 , 8192 , 165 , 8192 , 0 }, - {330 , 8704 , 165 , 256 , 0 }, - {330 , 9216 , 165 , 256 , 0 }, - {330 , 9728 , 165 , 256 , 0 }, - {330 , 10240 , 165 , 256 , 0 }, - {330 , 20480 , 165 , 256 , 0 }, - {330 , 30720 , 165 , 256 , 0 }, - {330 , 40960 , 165 , 8192 , 0 }, - {330 , 51200 , 165 , 256 , 0 }, - {330 , 61440 , 165 , 61440 , 0 }, - {330 , 71680 , 165 , 71680 , 0 }, - {330 , 81920 , 165 , 81920 , 0 }, - {330 , 92160 , 165 , 92160 , 0 }, - {330 , 102400 , 165 , 102400 , 0 }, - {330 , 204800 , 165 , 204800 , 0 }, - {330 , 307200 , 165 , 307200 , 0 }, - {330 , 409600 , 165 , 409600 , 0 }, - {330 , 512000 , 165 , 512000 , 0 }, - {330 , 614400 , 165 , 614400 , 0 }, - {330 , 716800 , 165 , 716800 , 0 }, - {330 , 819200 , 165 , 819200 , 0 }, - {330 , 921600 , 165 , 921600 , 0 }, - {330 , 1024000, 165 , 1024000, 0 } +std::vector > sgemm_tn_mi250x = { + {1, 512, 3, 32, 0}, + {1, 1024, 3, 32, 0}, + {1, 1536, 3, 256, 1}, + {1, 2048, 3, 1024, 1}, + {1, 2560, 3, 256, 1}, + {1, 3072, 3, 256, 1}, + {1, 3584, 3, 512, 1}, + {1, 4096, 3, 256, 1}, + {1, 4608, 3, 32, 0}, + {1, 5120, 3, 256, 1}, + {1, 5632, 3, 512, 1}, + {1, 6144, 3, 2048, 1}, + {1, 6656, 3, 512, 1}, + {1, 7168, 3, 1024, 1}, + {1, 7680, 3, 512, 1}, + {1, 8192, 3, 2048, 1}, + {1, 8704, 3, 512, 1}, + {1, 9216, 3, 512, 1}, + {1, 9728, 3, 512, 1}, + {1, 10240, 3, 2048, 1}, + {1, 20480, 3, 2048, 1}, + {1, 30720, 3, 2048, 1}, + {1, 40960, 3, 8192, 1}, + {1, 51200, 3, 2048, 1}, + 
{1, 61440, 3, 4096, 1}, + {1, 71680, 3, 2048, 1}, + {1, 81920, 3, 8192, 1}, + {1, 92160, 3, 2048, 1}, + {1, 102400, 3, 4096, 1}, + {1, 204800, 3, 8192, 1}, + {1, 307200, 3, 4096, 1}, + {1, 409600, 3, 16384, 1}, + {1, 512000, 3, 2048, 1}, + {1, 614400, 3, 8192, 1}, + {1, 716800, 3, 2048, 1}, + {1, 819200, 3, 32768, 1}, + {1, 921600, 3, 2048, 1}, + {1, 1024000, 3, 2048, 1}, + {1, 512, 4, 128, 0}, + {1, 1024, 4, 128, 1}, + {1, 1536, 4, 256, 1}, + {1, 2048, 4, 128, 1}, + {1, 2560, 4, 512, 1}, + {1, 3072, 4, 1024, 1}, + {1, 3584, 4, 512, 1}, + {1, 4096, 4, 1024, 1}, + {1, 4608, 4, 512, 1}, + {1, 5120, 4, 256, 1}, + {1, 5632, 4, 512, 1}, + {1, 6144, 4, 1024, 1}, + {1, 6656, 4, 512, 1}, + {1, 7168, 4, 1024, 1}, + {1, 7680, 4, 128, 0}, + {1, 8192, 4, 4096, 1}, + {1, 8704, 4, 512, 1}, + {1, 9216, 4, 1024, 1}, + {1, 9728, 4, 512, 1}, + {1, 10240, 4, 1024, 1}, + {1, 20480, 4, 2048, 1}, + {1, 30720, 4, 2048, 1}, + {1, 40960, 4, 2048, 1}, + {1, 51200, 4, 2048, 1}, + {1, 61440, 4, 4096, 1}, + {1, 71680, 4, 2048, 1}, + {1, 81920, 4, 16384, 1}, + {1, 92160, 4, 2048, 1}, + {1, 102400, 4, 4096, 1}, + {1, 204800, 4, 8192, 1}, + {1, 307200, 4, 4096, 1}, + {1, 409600, 4, 8192, 1}, + {1, 512000, 4, 1024, 1}, + {1, 614400, 4, 8192, 1}, + {1, 716800, 4, 1024, 1}, + {1, 819200, 4, 16384, 1}, + {1, 921600, 4, 4096, 1}, + {1, 1024000, 4, 2048, 1}, + {3, 512, 6, 128, 0}, + {3, 1024, 6, 128, 1}, + {3, 1536, 6, 256, 1}, + {3, 2048, 6, 1024, 1}, + {3, 2560, 6, 256, 1}, + {3, 3072, 6, 1024, 1}, + {3, 3584, 6, 512, 1}, + {3, 4096, 6, 512, 1}, + {3, 4608, 6, 512, 1}, + {3, 5120, 6, 256, 1}, + {3, 5632, 6, 512, 1}, + {3, 6144, 6, 512, 1}, + {3, 6656, 6, 32, 0}, + {3, 7168, 6, 1024, 1}, + {3, 7680, 6, 512, 1}, + {3, 8192, 6, 1024, 1}, + {3, 8704, 6, 32, 0}, + {3, 9216, 6, 512, 1}, + {3, 9728, 6, 32, 0}, + {3, 10240, 6, 512, 1}, + {3, 20480, 6, 32, 0}, + {3, 30720, 6, 64, 0}, + {3, 40960, 6, 2048, 1}, + {3, 51200, 6, 2048, 1}, + {3, 61440, 6, 64, 0}, + {3, 71680, 6, 32, 0}, + {3, 81920, 6, 2048, 1}, + 
{3, 92160, 6, 2048, 1}, + {3, 102400, 6, 4096, 1}, + {3, 204800, 6, 8192, 1}, + {3, 307200, 6, 4096, 1}, + {3, 409600, 6, 16384, 1}, + {3, 512000, 6, 1024, 1}, + {3, 614400, 6, 8192, 1}, + {3, 716800, 6, 1024, 1}, + {3, 819200, 6, 32768, 1}, + {3, 921600, 6, 2048, 1}, + {3, 1024000, 6, 512, 1}, + {4, 512, 10, 128, 0}, + {4, 1024, 10, 128, 1}, + {4, 1536, 10, 256, 1}, + {4, 2048, 10, 1024, 1}, + {4, 2560, 10, 512, 1}, + {4, 3072, 10, 512, 1}, + {4, 3584, 10, 256, 1}, + {4, 4096, 10, 2048, 1}, + {4, 4608, 10, 512, 1}, + {4, 5120, 10, 32, 0}, + {4, 5632, 10, 256, 1}, + {4, 6144, 10, 512, 1}, + {4, 6656, 10, 512, 1}, + {4, 7168, 10, 512, 1}, + {4, 7680, 10, 64, 0}, + {4, 8192, 10, 64, 0}, + {4, 8704, 10, 64, 0}, + {4, 9216, 10, 64, 0}, + {4, 9728, 10, 64, 0}, + {4, 10240, 10, 1024, 1}, + {4, 20480, 10, 64, 0}, + {4, 30720, 10, 64, 0}, + {4, 40960, 10, 64, 0}, + {4, 51200, 10, 32, 0}, + {4, 61440, 10, 32, 0}, + {4, 71680, 10, 2048, 1}, + {4, 81920, 10, 2048, 1}, + {4, 92160, 10, 2048, 1}, + {4, 102400, 10, 64, 0}, + {4, 204800, 10, 32, 0}, + {4, 307200, 10, 32, 0}, + {4, 409600, 10, 64, 0}, + {4, 512000, 10, 64, 0}, + {4, 614400, 10, 64, 0}, + {4, 716800, 10, 32, 0}, + {4, 819200, 10, 32, 0}, + {4, 921600, 10, 32, 0}, + {4, 1024000, 10, 32, 0}, + {6, 512, 10, 32, 0}, + {6, 1024, 10, 128, 1}, + {6, 1536, 10, 256, 1}, + {6, 2048, 10, 1024, 1}, + {6, 2560, 10, 512, 1}, + {6, 3072, 10, 512, 1}, + {6, 3584, 10, 512, 1}, + {6, 4096, 10, 2048, 1}, + {6, 4608, 10, 256, 1}, + {6, 5120, 10, 64, 0}, + {6, 5632, 10, 512, 1}, + {6, 6144, 10, 1024, 1}, + {6, 6656, 10, 512, 1}, + {6, 7168, 10, 64, 0}, + {6, 7680, 10, 64, 0}, + {6, 8192, 10, 64, 0}, + {6, 8704, 10, 32, 0}, + {6, 9216, 10, 64, 0}, + {6, 9728, 10, 128, 0}, + {6, 10240, 10, 128, 0}, + {6, 20480, 10, 32, 0}, + {6, 30720, 10, 64, 0}, + {6, 40960, 10, 32, 0}, + {6, 51200, 10, 1024, 1}, + {6, 61440, 10, 4096, 1}, + {6, 71680, 10, 64, 0}, + {6, 81920, 10, 8192, 1}, + {6, 92160, 10, 32, 0}, + {6, 102400, 10, 32, 0}, + {6, 
204800, 10, 64, 0}, + {6, 307200, 10, 32, 0}, + {6, 409600, 10, 32, 0}, + {6, 512000, 10, 64, 0}, + {6, 614400, 10, 64, 0}, + {6, 716800, 10, 64, 0}, + {6, 819200, 10, 64, 0}, + {6, 921600, 10, 32, 0}, + {6, 1024000, 10, 64, 0}, + {12, 512, 15, 32, 0}, + {12, 1024, 15, 128, 1}, + {12, 1536, 15, 64, 1}, + {12, 2048, 15, 128, 1}, + {12, 2560, 15, 256, 1}, + {12, 3072, 15, 512, 1}, + {12, 3584, 15, 512, 1}, + {12, 4096, 15, 64, 0}, + {12, 4608, 15, 512, 1}, + {12, 5120, 15, 512, 1}, + {12, 5632, 15, 64, 0}, + {12, 6144, 15, 32, 0}, + {12, 6656, 15, 256, 0}, + {12, 7168, 15, 64, 0}, + {12, 7680, 15, 32, 0}, + {12, 8192, 15, 32, 0}, + {12, 8704, 15, 32, 0}, + {12, 9216, 15, 512, 0}, + {12, 9728, 15, 64, 0}, + {12, 10240, 15, 64, 0}, + {12, 20480, 15, 64, 0}, + {12, 30720, 15, 64, 0}, + {12, 40960, 15, 64, 0}, + {12, 51200, 15, 64, 0}, + {12, 61440, 15, 64, 0}, + {12, 71680, 15, 64, 0}, + {12, 81920, 15, 32, 0}, + {12, 92160, 15, 64, 0}, + {12, 102400, 15, 32, 0}, + {12, 204800, 15, 32, 0}, + {12, 307200, 15, 32, 0}, + {12, 409600, 15, 64, 0}, + {12, 512000, 15, 64, 0}, + {12, 614400, 15, 32, 0}, + {12, 716800, 15, 64, 0}, + {12, 819200, 15, 64, 0}, + {12, 921600, 15, 64, 0}, + {12, 1024000, 15, 64, 0}, + {11, 512, 20, 32, 0}, + {11, 1024, 20, 128, 0}, + {11, 1536, 20, 256, 0}, + {11, 2048, 20, 512, 0}, + {11, 2560, 20, 32, 0}, + {11, 3072, 20, 256, 0}, + {11, 3584, 20, 32, 0}, + {11, 4096, 20, 512, 0}, + {11, 4608, 20, 32, 0}, + {11, 5120, 20, 32, 0}, + {11, 5632, 20, 128, 0}, + {11, 6144, 20, 512, 0}, + {11, 6656, 20, 32, 0}, + {11, 7168, 20, 32, 0}, + {11, 7680, 20, 32, 0}, + {11, 8192, 20, 32, 0}, + {11, 8704, 20, 32, 0}, + {11, 9216, 20, 32, 0}, + {11, 9728, 20, 32, 0}, + {11, 10240, 20, 32, 0}, + {11, 20480, 20, 32, 0}, + {11, 30720, 20, 32, 0}, + {11, 40960, 20, 32, 0}, + {11, 51200, 20, 32, 0}, + {11, 61440, 20, 32, 0}, + {11, 71680, 20, 32, 0}, + {11, 81920, 20, 128, 0}, + {11, 92160, 20, 128, 0}, + {11, 102400, 20, 128, 0}, + {11, 204800, 20, 32, 0}, + {11, 
307200, 20, 128, 0}, + {11, 409600, 20, 128, 0}, + {11, 512000, 20, 128, 0}, + {11, 614400, 20, 128, 0}, + {11, 716800, 20, 128, 0}, + {11, 819200, 20, 128, 0}, + {11, 921600, 20, 128, 0}, + {11, 1024000, 20, 128, 0}, + {16, 512, 21, 32, 0}, + {16, 1024, 21, 32, 0}, + {16, 1536, 21, 256, 0}, + {16, 2048, 21, 512, 0}, + {16, 2560, 21, 32, 0}, + {16, 3072, 21, 32, 0}, + {16, 3584, 21, 32, 0}, + {16, 4096, 21, 32, 0}, + {16, 4608, 21, 32, 0}, + {16, 5120, 21, 32, 0}, + {16, 5632, 21, 512, 0}, + {16, 6144, 21, 32, 0}, + {16, 6656, 21, 32, 0}, + {16, 7168, 21, 32, 0}, + {16, 7680, 21, 32, 0}, + {16, 8192, 21, 256, 0}, + {16, 8704, 21, 32, 0}, + {16, 9216, 21, 32, 0}, + {16, 9728, 21, 32, 0}, + {16, 10240, 21, 32, 0}, + {16, 20480, 21, 32, 0}, + {16, 30720, 21, 32, 0}, + {16, 40960, 21, 32, 0}, + {16, 51200, 21, 128, 0}, + {16, 61440, 21, 32, 0}, + {16, 71680, 21, 128, 0}, + {16, 81920, 21, 128, 0}, + {16, 92160, 21, 128, 0}, + {16, 102400, 21, 128, 0}, + {16, 204800, 21, 128, 0}, + {16, 307200, 21, 128, 0}, + {16, 409600, 21, 128, 0}, + {16, 512000, 21, 128, 0}, + {16, 614400, 21, 128, 0}, + {16, 716800, 21, 128, 0}, + {16, 819200, 21, 128, 0}, + {16, 921600, 21, 128, 0}, + {16, 1024000, 21, 128, 0}, + {25, 512, 28, 256, 0}, + {25, 1024, 28, 256, 0}, + {25, 1536, 28, 128, 0}, + {25, 2048, 28, 64, 0}, + {25, 2560, 28, 256, 0}, + {25, 3072, 28, 256, 0}, + {25, 3584, 28, 128, 0}, + {25, 4096, 28, 256, 0}, + {25, 4608, 28, 64, 0}, + {25, 5120, 28, 64, 0}, + {25, 5632, 28, 32, 0}, + {25, 6144, 28, 128, 0}, + {25, 6656, 28, 128, 0}, + {25, 7168, 28, 64, 0}, + {25, 7680, 28, 64, 0}, + {25, 8192, 28, 128, 0}, + {25, 8704, 28, 32, 0}, + {25, 9216, 28, 64, 0}, + {25, 9728, 28, 64, 0}, + {25, 10240, 28, 64, 0}, + {25, 20480, 28, 64, 0}, + {25, 30720, 28, 64, 0}, + {25, 40960, 28, 64, 0}, + {25, 51200, 28, 64, 0}, + {25, 61440, 28, 64, 0}, + {25, 71680, 28, 64, 0}, + {25, 81920, 28, 64, 0}, + {25, 92160, 28, 64, 0}, + {25, 102400, 28, 64, 0}, + {25, 204800, 28, 64, 0}, + {25, 
307200, 28, 64, 0}, + {25, 409600, 28, 2048, 0}, + {25, 512000, 28, 2048, 0}, + {25, 614400, 28, 2048, 0}, + {25, 716800, 28, 2048, 0}, + {25, 819200, 28, 2048, 0}, + {25, 921600, 28, 2048, 0}, + {25, 1024000, 28, 2048, 0}, + {24, 512, 35, 256, 0}, + {24, 1024, 35, 128, 0}, + {24, 1536, 35, 256, 0}, + {24, 2048, 35, 256, 0}, + {24, 2560, 35, 128, 0}, + {24, 3072, 35, 128, 0}, + {24, 3584, 35, 256, 0}, + {24, 4096, 35, 128, 0}, + {24, 4608, 35, 128, 0}, + {24, 5120, 35, 128, 0}, + {24, 5632, 35, 256, 0}, + {24, 6144, 35, 128, 0}, + {24, 6656, 35, 128, 0}, + {24, 7168, 35, 64, 0}, + {24, 7680, 35, 128, 0}, + {24, 8192, 35, 64, 0}, + {24, 8704, 35, 64, 0}, + {24, 9216, 35, 64, 0}, + {24, 9728, 35, 64, 0}, + {24, 10240, 35, 64, 0}, + {24, 20480, 35, 64, 0}, + {24, 30720, 35, 64, 0}, + {24, 40960, 35, 64, 0}, + {24, 51200, 35, 64, 0}, + {24, 61440, 35, 2048, 0}, + {24, 71680, 35, 64, 0}, + {24, 81920, 35, 64, 0}, + {24, 92160, 35, 64, 0}, + {24, 102400, 35, 64, 0}, + {24, 204800, 35, 64, 0}, + {24, 307200, 35, 2048, 0}, + {24, 409600, 35, 2048, 0}, + {24, 512000, 35, 2048, 0}, + {24, 614400, 35, 2048, 0}, + {24, 716800, 35, 2048, 0}, + {24, 819200, 35, 2048, 0}, + {24, 921600, 35, 2048, 0}, + {24, 1024000, 35, 2048, 0}, + {33, 512, 36, 128, 0}, + {33, 1024, 36, 256, 0}, + {33, 1536, 36, 32, 0}, + {33, 2048, 36, 256, 0}, + {33, 2560, 36, 256, 0}, + {33, 3072, 36, 128, 0}, + {33, 3584, 36, 128, 0}, + {33, 4096, 36, 64, 0}, + {33, 4608, 36, 64, 0}, + {33, 5120, 36, 128, 0}, + {33, 5632, 36, 64, 0}, + {33, 6144, 36, 64, 0}, + {33, 6656, 36, 64, 0}, + {33, 7168, 36, 64, 0}, + {33, 7680, 36, 64, 0}, + {33, 8192, 36, 64, 0}, + {33, 8704, 36, 64, 0}, + {33, 9216, 36, 64, 0}, + {33, 9728, 36, 64, 0}, + {33, 10240, 36, 64, 0}, + {33, 20480, 36, 64, 0}, + {33, 30720, 36, 64, 0}, + {33, 40960, 36, 64, 0}, + {33, 51200, 36, 64, 0}, + {33, 61440, 36, 64, 0}, + {33, 71680, 36, 64, 0}, + {33, 81920, 36, 64, 0}, + {33, 92160, 36, 64, 0}, + {33, 102400, 36, 64, 0}, + {33, 204800, 36, 64, 
0}, + {33, 307200, 36, 64, 0}, + {33, 409600, 36, 8192, 0}, + {33, 512000, 36, 64, 0}, + {33, 614400, 36, 8192, 0}, + {33, 716800, 36, 64, 0}, + {33, 819200, 36, 8192, 0}, + {33, 921600, 36, 921600, 0}, + {33, 1024000, 36, 8192, 0}, + {42, 512, 45, 512, 0}, + {42, 1024, 45, 64, 0}, + {42, 1536, 45, 64, 0}, + {42, 2048, 45, 64, 0}, + {42, 2560, 45, 64, 0}, + {42, 3072, 45, 64, 0}, + {42, 3584, 45, 64, 0}, + {42, 4096, 45, 64, 0}, + {42, 4608, 45, 64, 0}, + {42, 5120, 45, 64, 0}, + {42, 5632, 45, 128, 0}, + {42, 6144, 45, 64, 0}, + {42, 6656, 45, 32, 0}, + {42, 7168, 45, 64, 0}, + {42, 7680, 45, 64, 0}, + {42, 8192, 45, 64, 0}, + {42, 8704, 45, 64, 0}, + {42, 9216, 45, 64, 0}, + {42, 9728, 45, 64, 0}, + {42, 10240, 45, 64, 0}, + {42, 20480, 45, 64, 0}, + {42, 30720, 45, 64, 0}, + {42, 40960, 45, 64, 0}, + {42, 51200, 45, 1024, 0}, + {42, 61440, 45, 64, 0}, + {42, 71680, 45, 64, 0}, + {42, 81920, 45, 64, 0}, + {42, 92160, 45, 64, 0}, + {42, 102400, 45, 64, 0}, + {42, 204800, 45, 8192, 0}, + {42, 307200, 45, 1024, 0}, + {42, 409600, 45, 8192, 0}, + {42, 512000, 45, 1024, 0}, + {42, 614400, 45, 8192, 0}, + {42, 716800, 45, 1024, 0}, + {42, 819200, 45, 8192, 0}, + {42, 921600, 45, 921600, 0}, + {42, 1024000, 45, 8192, 0}, + {43, 512, 56, 128, 0}, + {43, 1024, 56, 512, 0}, + {43, 1536, 56, 128, 0}, + {43, 2048, 56, 64, 0}, + {43, 2560, 56, 64, 0}, + {43, 3072, 56, 64, 0}, + {43, 3584, 56, 128, 0}, + {43, 4096, 56, 64, 0}, + {43, 4608, 56, 64, 0}, + {43, 5120, 56, 64, 0}, + {43, 5632, 56, 64, 0}, + {43, 6144, 56, 64, 0}, + {43, 6656, 56, 128, 0}, + {43, 7168, 56, 64, 0}, + {43, 7680, 56, 64, 0}, + {43, 8192, 56, 64, 0}, + {43, 8704, 56, 64, 0}, + {43, 9216, 56, 64, 0}, + {43, 9728, 56, 64, 0}, + {43, 10240, 56, 64, 0}, + {43, 20480, 56, 64, 0}, + {43, 30720, 56, 64, 0}, + {43, 40960, 56, 8192, 0}, + {43, 51200, 56, 64, 0}, + {43, 61440, 56, 64, 0}, + {43, 71680, 56, 64, 0}, + {43, 81920, 56, 64, 0}, + {43, 92160, 56, 1024, 0}, + {43, 102400, 56, 64, 0}, + {43, 204800, 56, 
8192, 0}, + {43, 307200, 56, 1024, 0}, + {43, 409600, 56, 8192, 0}, + {43, 512000, 56, 1024, 0}, + {43, 614400, 56, 8192, 0}, + {43, 716800, 56, 1024, 0}, + {43, 819200, 56, 8192, 0}, + {43, 921600, 56, 1024, 0}, + {43, 1024000, 56, 8192, 0}, + {126, 512, 84, 32, 0}, + {126, 1024, 84, 256, 0}, + {126, 1536, 84, 64, 0}, + {126, 2048, 84, 512, 0}, + {126, 2560, 84, 256, 0}, + {126, 3072, 84, 256, 0}, + {126, 3584, 84, 256, 0}, + {126, 4096, 84, 256, 0}, + {126, 4608, 84, 512, 0}, + {126, 5120, 84, 256, 0}, + {126, 5632, 84, 512, 0}, + {126, 6144, 84, 2048, 0}, + {126, 6656, 84, 512, 0}, + {126, 7168, 84, 256, 0}, + {126, 7680, 84, 128, 0}, + {126, 8192, 84, 8192, 0}, + {126, 8704, 84, 512, 0}, + {126, 9216, 84, 256, 0}, + {126, 9728, 84, 256, 0}, + {126, 10240, 84, 2048, 0}, + {126, 20480, 84, 4096, 0}, + {126, 30720, 84, 512, 0}, + {126, 40960, 84, 4096, 0}, + {126, 51200, 84, 512, 0}, + {126, 61440, 84, 4096, 0}, + {126, 71680, 84, 512, 0}, + {126, 81920, 84, 16384, 0}, + {126, 92160, 84, 2048, 0}, + {126, 102400, 84, 4096, 0}, + {126, 204800, 84, 4096, 0}, + {126, 307200, 84, 4096, 0}, + {126, 409600, 84, 16384, 0}, + {126, 512000, 84, 4096, 0}, + {126, 614400, 84, 4096, 0}, + {126, 716800, 84, 4096, 0}, + {126, 819200, 84, 32768, 0}, + {126, 921600, 84, 4096, 0}, + {126, 1024000, 84, 4096, 0}, + {210, 512, 120, 256, 0}, + {210, 1024, 120, 512, 0}, + {210, 1536, 120, 512, 0}, + {210, 2048, 120, 2048, 0}, + {210, 2560, 120, 128, 0}, + {210, 3072, 120, 512, 0}, + {210, 3584, 120, 512, 0}, + {210, 4096, 120, 128, 0}, + {210, 4608, 120, 128, 0}, + {210, 5120, 120, 128, 0}, + {210, 5632, 120, 128, 0}, + {210, 6144, 120, 128, 0}, + {210, 6656, 120, 128, 0}, + {210, 7168, 120, 128, 0}, + {210, 7680, 120, 512, 0}, + {210, 8192, 120, 8192, 0}, + {210, 8704, 120, 512, 0}, + {210, 9216, 120, 128, 0}, + {210, 9728, 120, 128, 0}, + {210, 10240, 120, 2048, 0}, + {210, 20480, 120, 4096, 0}, + {210, 30720, 120, 2048, 0}, + {210, 40960, 120, 8192, 0}, + {210, 51200, 120, 2048, 0}, 
+ {210, 61440, 120, 2048, 0}, + {210, 71680, 120, 71680, 0}, + {210, 81920, 120, 8192, 0}, + {210, 92160, 120, 92160, 0}, + {210, 102400, 120, 102400, 0}, + {210, 204800, 120, 204800, 0}, + {210, 307200, 120, 307200, 0}, + {210, 409600, 120, 409600, 0}, + {210, 512000, 120, 512000, 0}, + {210, 614400, 120, 614400, 0}, + {210, 716800, 120, 716800, 0}, + {210, 819200, 120, 819200, 0}, + {210, 921600, 120, 921600, 0}, + {210, 1024000, 120, 1024000, 0}, + {330, 512, 165, 128, 0}, + {330, 1024, 165, 512, 0}, + {330, 1536, 165, 512, 0}, + {330, 2048, 165, 128, 0}, + {330, 2560, 165, 128, 0}, + {330, 3072, 165, 256, 0}, + {330, 3584, 165, 256, 0}, + {330, 4096, 165, 2048, 0}, + {330, 4608, 165, 256, 0}, + {330, 5120, 165, 256, 0}, + {330, 5632, 165, 256, 0}, + {330, 6144, 165, 2048, 0}, + {330, 6656, 165, 256, 0}, + {330, 7168, 165, 256, 0}, + {330, 7680, 165, 256, 0}, + {330, 8192, 165, 8192, 0}, + {330, 8704, 165, 256, 0}, + {330, 9216, 165, 256, 0}, + {330, 9728, 165, 256, 0}, + {330, 10240, 165, 256, 0}, + {330, 20480, 165, 256, 0}, + {330, 30720, 165, 256, 0}, + {330, 40960, 165, 8192, 0}, + {330, 51200, 165, 256, 0}, + {330, 61440, 165, 61440, 0}, + {330, 71680, 165, 71680, 0}, + {330, 81920, 165, 81920, 0}, + {330, 92160, 165, 92160, 0}, + {330, 102400, 165, 102400, 0}, + {330, 204800, 165, 204800, 0}, + {330, 307200, 165, 307200, 0}, + {330, 409600, 165, 409600, 0}, + {330, 512000, 165, 512000, 0}, + {330, 614400, 165, 614400, 0}, + {330, 716800, 165, 716800, 0}, + {330, 819200, 165, 819200, 0}, + {330, 921600, 165, 921600, 0}, + {330, 1024000, 165, 1024000, 0} }; //////////////////////////////////////////////////////////////////////////////// -std::vector< std::array > dgemm_nn_mi250x = -{ - {3 , 512 , 1 , 32 , 1 }, - {3 , 1024 , 1 , 512 , 1 }, - {3 , 1536 , 1 , 64 , 1 }, - {3 , 2048 , 1 , 512 , 1 }, - {3 , 2560 , 1 , 256 , 1 }, - {3 , 3072 , 1 , 512 , 1 }, - {3 , 3584 , 1 , 128 , 1 }, - {3 , 4096 , 1 , 2048 , 1 }, - {3 , 4608 , 1 , 512 , 1 }, - {3 , 5120 , 1 , 
256 , 1 }, - {3 , 5632 , 1 , 256 , 1 }, - {3 , 6144 , 1 , 2048 , 1 }, - {3 , 6656 , 1 , 256 , 1 }, - {3 , 7168 , 1 , 256 , 1 }, - {3 , 7680 , 1 , 512 , 1 }, - {3 , 8192 , 1 , 512 , 1 }, - {3 , 8704 , 1 , 512 , 1 }, - {3 , 9216 , 1 , 512 , 1 }, - {3 , 9728 , 1 , 512 , 1 }, - {3 , 10240 , 1 , 1024 , 1 }, - {3 , 20480 , 1 , 1024 , 1 }, - {3 , 30720 , 1 , 2048 , 1 }, - {3 , 40960 , 1 , 2048 , 1 }, - {3 , 51200 , 1 , 2048 , 1 }, - {3 , 61440 , 1 , 2048 , 1 }, - {3 , 71680 , 1 , 2048 , 1 }, - {3 , 81920 , 1 , 8192 , 1 }, - {3 , 92160 , 1 , 2048 , 1 }, - {3 , 102400 , 1 , 4096 , 1 }, - {3 , 204800 , 1 , 8192 , 1 }, - {3 , 307200 , 1 , 512 , 1 }, - {3 , 409600 , 1 , 16384 , 1 }, - {3 , 512000 , 1 , 4096 , 1 }, - {3 , 614400 , 1 , 8192 , 1 }, - {3 , 716800 , 1 , 2048 , 1 }, - {3 , 819200 , 1 , 32768 , 1 }, - {3 , 921600 , 1 , 2048 , 1 }, - {3 , 1024000, 1 , 8192 , 1 }, - {4 , 512 , 1 , 64 , 1 }, - {4 , 1024 , 1 , 256 , 1 }, - {4 , 1536 , 1 , 128 , 1 }, - {4 , 2048 , 1 , 256 , 1 }, - {4 , 2560 , 1 , 512 , 1 }, - {4 , 3072 , 1 , 256 , 1 }, - {4 , 3584 , 1 , 512 , 1 }, - {4 , 4096 , 1 , 256 , 1 }, - {4 , 4608 , 1 , 256 , 1 }, - {4 , 5120 , 1 , 512 , 1 }, - {4 , 5632 , 1 , 512 , 1 }, - {4 , 6144 , 1 , 1024 , 1 }, - {4 , 6656 , 1 , 256 , 1 }, - {4 , 7168 , 1 , 512 , 1 }, - {4 , 7680 , 1 , 512 , 1 }, - {4 , 8192 , 1 , 512 , 1 }, - {4 , 8704 , 1 , 512 , 1 }, - {4 , 9216 , 1 , 1024 , 1 }, - {4 , 9728 , 1 , 512 , 1 }, - {4 , 10240 , 1 , 1024 , 1 }, - {4 , 20480 , 1 , 2048 , 1 }, - {4 , 30720 , 1 , 2048 , 1 }, - {4 , 40960 , 1 , 2048 , 1 }, - {4 , 51200 , 1 , 2048 , 1 }, - {4 , 61440 , 1 , 2048 , 1 }, - {4 , 71680 , 1 , 2048 , 1 }, - {4 , 81920 , 1 , 4096 , 1 }, - {4 , 92160 , 1 , 2048 , 1 }, - {4 , 102400 , 1 , 4096 , 1 }, - {4 , 204800 , 1 , 8192 , 1 }, - {4 , 307200 , 1 , 4096 , 1 }, - {4 , 409600 , 1 , 8192 , 1 }, - {4 , 512000 , 1 , 512 , 1 }, - {4 , 614400 , 1 , 8192 , 1 }, - {4 , 716800 , 1 , 2048 , 1 }, - {4 , 819200 , 1 , 32768 , 1 }, - {4 , 921600 , 1 , 921600 , 1 }, - {4 , 
1024000, 1 , 1024000, 1 }, - {6 , 512 , 3 , 32 , 1 }, - {6 , 1024 , 3 , 128 , 1 }, - {6 , 1536 , 3 , 128 , 1 }, - {6 , 2048 , 3 , 2048 , 1 }, - {6 , 2560 , 3 , 128 , 1 }, - {6 , 3072 , 3 , 1024 , 1 }, - {6 , 3584 , 3 , 128 , 1 }, - {6 , 4096 , 3 , 512 , 1 }, - {6 , 4608 , 3 , 256 , 1 }, - {6 , 5120 , 3 , 512 , 1 }, - {6 , 5632 , 3 , 256 , 1 }, - {6 , 6144 , 3 , 1024 , 1 }, - {6 , 6656 , 3 , 512 , 1 }, - {6 , 7168 , 3 , 1024 , 1 }, - {6 , 7680 , 3 , 512 , 1 }, - {6 , 8192 , 3 , 512 , 1 }, - {6 , 8704 , 3 , 512 , 1 }, - {6 , 9216 , 3 , 1024 , 1 }, - {6 , 9728 , 3 , 256 , 1 }, - {6 , 10240 , 3 , 1024 , 1 }, - {6 , 20480 , 3 , 4096 , 1 }, - {6 , 30720 , 3 , 1024 , 1 }, - {6 , 40960 , 3 , 2048 , 1 }, - {6 , 51200 , 3 , 2048 , 1 }, - {6 , 61440 , 3 , 2048 , 1 }, - {6 , 71680 , 3 , 2048 , 1 }, - {6 , 81920 , 3 , 2048 , 1 }, - {6 , 92160 , 3 , 2048 , 1 }, - {6 , 102400 , 3 , 2048 , 1 }, - {6 , 204800 , 3 , 8192 , 1 }, - {6 , 307200 , 3 , 4096 , 1 }, - {6 , 409600 , 3 , 16384 , 1 }, - {6 , 512000 , 3 , 256 , 1 }, - {6 , 614400 , 3 , 8192 , 1 }, - {6 , 716800 , 3 , 512 , 1 }, - {6 , 819200 , 3 , 32768 , 1 }, - {6 , 921600 , 3 , 4096 , 1 }, - {6 , 1024000, 3 , 1024000, 1 }, - {10 , 512 , 4 , 64 , 1 }, - {10 , 1024 , 4 , 64 , 1 }, - {10 , 1536 , 4 , 256 , 1 }, - {10 , 2048 , 4 , 2048 , 1 }, - {10 , 2560 , 4 , 256 , 1 }, - {10 , 3072 , 4 , 1024 , 1 }, - {10 , 3584 , 4 , 512 , 1 }, - {10 , 4096 , 4 , 512 , 1 }, - {10 , 4608 , 4 , 256 , 1 }, - {10 , 5120 , 4 , 256 , 1 }, - {10 , 5632 , 4 , 512 , 1 }, - {10 , 6144 , 4 , 512 , 1 }, - {10 , 6656 , 4 , 512 , 1 }, - {10 , 7168 , 4 , 512 , 1 }, - {10 , 7680 , 4 , 512 , 1 }, - {10 , 8192 , 4 , 512 , 1 }, - {10 , 8704 , 4 , 512 , 1 }, - {10 , 9216 , 4 , 512 , 1 }, - {10 , 9728 , 4 , 512 , 1 }, - {10 , 10240 , 4 , 1024 , 1 }, - {10 , 20480 , 4 , 1024 , 1 }, - {10 , 30720 , 4 , 2048 , 1 }, - {10 , 40960 , 4 , 2048 , 1 }, - {10 , 51200 , 4 , 2048 , 1 }, - {10 , 61440 , 4 , 1024 , 1 }, - {10 , 71680 , 4 , 2048 , 1 }, - {10 , 81920 , 4 , 2048 
, 1 }, - {10 , 92160 , 4 , 2048 , 1 }, - {10 , 102400 , 4 , 2048 , 1 }, - {10 , 204800 , 4 , 8192 , 1 }, - {10 , 307200 , 4 , 4096 , 1 }, - {10 , 409600 , 4 , 16384 , 1 }, - {10 , 512000 , 4 , 2048 , 1 }, - {10 , 614400 , 4 , 8192 , 1 }, - {10 , 716800 , 4 , 4096 , 1 }, - {10 , 819200 , 4 , 16384 , 1 }, - {10 , 921600 , 4 , 921600 , 1 }, - {10 , 1024000, 4 , 1024000, 1 }, - {10 , 512 , 6 , 32 , 1 }, - {10 , 1024 , 6 , 512 , 1 }, - {10 , 1536 , 6 , 256 , 1 }, - {10 , 2048 , 6 , 512 , 1 }, - {10 , 2560 , 6 , 128 , 1 }, - {10 , 3072 , 6 , 1024 , 1 }, - {10 , 3584 , 6 , 256 , 1 }, - {10 , 4096 , 6 , 256 , 1 }, - {10 , 4608 , 6 , 512 , 1 }, - {10 , 5120 , 6 , 256 , 1 }, - {10 , 5632 , 6 , 256 , 1 }, - {10 , 6144 , 6 , 512 , 1 }, - {10 , 6656 , 6 , 512 , 1 }, - {10 , 7168 , 6 , 512 , 1 }, - {10 , 7680 , 6 , 512 , 1 }, - {10 , 8192 , 6 , 512 , 1 }, - {10 , 8704 , 6 , 512 , 1 }, - {10 , 9216 , 6 , 1024 , 1 }, - {10 , 9728 , 6 , 256 , 0 }, - {10 , 10240 , 6 , 1024 , 1 }, - {10 , 20480 , 6 , 1024 , 1 }, - {10 , 30720 , 6 , 1024 , 1 }, - {10 , 40960 , 6 , 2048 , 1 }, - {10 , 51200 , 6 , 1024 , 1 }, - {10 , 61440 , 6 , 2048 , 1 }, - {10 , 71680 , 6 , 2048 , 1 }, - {10 , 81920 , 6 , 8192 , 1 }, - {10 , 92160 , 6 , 2048 , 1 }, - {10 , 102400 , 6 , 4096 , 1 }, - {10 , 204800 , 6 , 8192 , 1 }, - {10 , 307200 , 6 , 4096 , 1 }, - {10 , 409600 , 6 , 8192 , 1 }, - {10 , 512000 , 6 , 512 , 1 }, - {10 , 614400 , 6 , 8192 , 1 }, - {10 , 716800 , 6 , 1024 , 1 }, - {10 , 819200 , 6 , 32768 , 1 }, - {10 , 921600 , 6 , 2048 , 1 }, - {10 , 1024000, 6 , 8192 , 1 }, - {15 , 512 , 12 , 32 , 1 }, - {15 , 1024 , 12 , 64 , 1 }, - {15 , 1536 , 12 , 128 , 1 }, - {15 , 2048 , 12 , 512 , 1 }, - {15 , 2560 , 12 , 512 , 1 }, - {15 , 3072 , 12 , 512 , 1 }, - {15 , 3584 , 12 , 256 , 1 }, - {15 , 4096 , 12 , 512 , 1 }, - {15 , 4608 , 12 , 512 , 1 }, - {15 , 5120 , 12 , 128 , 0 }, - {15 , 5632 , 12 , 64 , 0 }, - {15 , 6144 , 12 , 512 , 0 }, - {15 , 6656 , 12 , 512 , 0 }, - {15 , 7168 , 12 , 128 , 0 }, - {15 
, 7680 , 12 , 512 , 0 }, - {15 , 8192 , 12 , 4096 , 1 }, - {15 , 8704 , 12 , 256 , 0 }, - {15 , 9216 , 12 , 512 , 1 }, - {15 , 9728 , 12 , 512 , 1 }, - {15 , 10240 , 12 , 512 , 1 }, - {15 , 20480 , 12 , 2048 , 1 }, - {15 , 30720 , 12 , 512 , 1 }, - {15 , 40960 , 12 , 8192 , 1 }, - {15 , 51200 , 12 , 2048 , 1 }, - {15 , 61440 , 12 , 4096 , 1 }, - {15 , 71680 , 12 , 2048 , 1 }, - {15 , 81920 , 12 , 16384 , 1 }, - {15 , 92160 , 12 , 2048 , 1 }, - {15 , 102400 , 12 , 2048 , 1 }, - {15 , 204800 , 12 , 8192 , 1 }, - {15 , 307200 , 12 , 4096 , 1 }, - {15 , 409600 , 12 , 16384 , 1 }, - {15 , 512000 , 12 , 512 , 1 }, - {15 , 614400 , 12 , 8192 , 1 }, - {15 , 716800 , 12 , 4096 , 1 }, - {15 , 819200 , 12 , 16384 , 1 }, - {15 , 921600 , 12 , 4096 , 1 }, - {15 , 1024000, 12 , 8192 , 1 }, - {20 , 512 , 11 , 64 , 1 }, - {20 , 1024 , 11 , 128 , 1 }, - {20 , 1536 , 11 , 256 , 1 }, - {20 , 2048 , 11 , 128 , 1 }, - {20 , 2560 , 11 , 256 , 1 }, - {20 , 3072 , 11 , 256 , 1 }, - {20 , 3584 , 11 , 256 , 1 }, - {20 , 4096 , 11 , 1024 , 1 }, - {20 , 4608 , 11 , 32 , 0 }, - {20 , 5120 , 11 , 32 , 0 }, - {20 , 5632 , 11 , 64 , 0 }, - {20 , 6144 , 11 , 128 , 0 }, - {20 , 6656 , 11 , 512 , 1 }, - {20 , 7168 , 11 , 512 , 1 }, - {20 , 7680 , 11 , 256 , 1 }, - {20 , 8192 , 11 , 4096 , 1 }, - {20 , 8704 , 11 , 512 , 1 }, - {20 , 9216 , 11 , 512 , 1 }, - {20 , 9728 , 11 , 256 , 1 }, - {20 , 10240 , 11 , 512 , 1 }, - {20 , 20480 , 11 , 2048 , 1 }, - {20 , 30720 , 11 , 2048 , 1 }, - {20 , 40960 , 11 , 2048 , 1 }, - {20 , 51200 , 11 , 2048 , 1 }, - {20 , 61440 , 11 , 4096 , 1 }, - {20 , 71680 , 11 , 2048 , 1 }, - {20 , 81920 , 11 , 16384 , 1 }, - {20 , 92160 , 11 , 2048 , 1 }, - {20 , 102400 , 11 , 4096 , 1 }, - {20 , 204800 , 11 , 4096 , 1 }, - {20 , 307200 , 11 , 4096 , 1 }, - {20 , 409600 , 11 , 8192 , 1 }, - {20 , 512000 , 11 , 4096 , 1 }, - {20 , 614400 , 11 , 8192 , 1 }, - {20 , 716800 , 11 , 4096 , 1 }, - {20 , 819200 , 11 , 32768 , 1 }, - {20 , 921600 , 11 , 4096 , 1 }, - {20 , 1024000, 11 , 
8192 , 0 }, - {21 , 512 , 16 , 32 , 1 }, - {21 , 1024 , 16 , 64 , 1 }, - {21 , 1536 , 16 , 256 , 1 }, - {21 , 2048 , 16 , 128 , 1 }, - {21 , 2560 , 16 , 256 , 1 }, - {21 , 3072 , 16 , 256 , 1 }, - {21 , 3584 , 16 , 128 , 0 }, - {21 , 4096 , 16 , 128 , 0 }, - {21 , 4608 , 16 , 128 , 0 }, - {21 , 5120 , 16 , 128 , 0 }, - {21 , 5632 , 16 , 256 , 0 }, - {21 , 6144 , 16 , 128 , 0 }, - {21 , 6656 , 16 , 512 , 1 }, - {21 , 7168 , 16 , 512 , 1 }, - {21 , 7680 , 16 , 256 , 1 }, - {21 , 8192 , 16 , 512 , 1 }, - {21 , 8704 , 16 , 256 , 1 }, - {21 , 9216 , 16 , 512 , 1 }, - {21 , 9728 , 16 , 512 , 1 }, - {21 , 10240 , 16 , 512 , 1 }, - {21 , 20480 , 16 , 4096 , 1 }, - {21 , 30720 , 16 , 2048 , 1 }, - {21 , 40960 , 16 , 8192 , 1 }, - {21 , 51200 , 16 , 2048 , 1 }, - {21 , 61440 , 16 , 2048 , 1 }, - {21 , 71680 , 16 , 2048 , 1 }, - {21 , 81920 , 16 , 8192 , 1 }, - {21 , 92160 , 16 , 2048 , 1 }, - {21 , 102400 , 16 , 4096 , 1 }, - {21 , 204800 , 16 , 4096 , 1 }, - {21 , 307200 , 16 , 4096 , 1 }, - {21 , 409600 , 16 , 16384 , 1 }, - {21 , 512000 , 16 , 2048 , 1 }, - {21 , 614400 , 16 , 8192 , 1 }, - {21 , 716800 , 16 , 4096 , 1 }, - {21 , 819200 , 16 , 32768 , 1 }, - {21 , 921600 , 16 , 2048 , 1 }, - {21 , 1024000, 16 , 2048 , 1 }, - {28 , 512 , 25 , 64 , 1 }, - {28 , 1024 , 25 , 128 , 1 }, - {28 , 1536 , 25 , 256 , 1 }, - {28 , 2048 , 25 , 256 , 1 }, - {28 , 2560 , 25 , 64 , 0 }, - {28 , 3072 , 25 , 32 , 0 }, - {28 , 3584 , 25 , 64 , 0 }, - {28 , 4096 , 25 , 256 , 0 }, - {28 , 4608 , 25 , 128 , 0 }, - {28 , 5120 , 25 , 1024 , 1 }, - {28 , 5632 , 25 , 256 , 1 }, - {28 , 6144 , 25 , 256 , 1 }, - {28 , 6656 , 25 , 256 , 1 }, - {28 , 7168 , 25 , 1024 , 1 }, - {28 , 7680 , 25 , 256 , 1 }, - {28 , 8192 , 25 , 256 , 1 }, - {28 , 8704 , 25 , 256 , 1 }, - {28 , 9216 , 25 , 256 , 1 }, - {28 , 9728 , 25 , 256 , 1 }, - {28 , 10240 , 25 , 256 , 1 }, - {28 , 20480 , 25 , 2048 , 1 }, - {28 , 30720 , 25 , 2048 , 1 }, - {28 , 40960 , 25 , 8192 , 1 }, - {28 , 51200 , 25 , 1024 , 1 }, - {28 , 61440 
, 25 , 2048 , 1 }, - {28 , 71680 , 25 , 2048 , 1 }, - {28 , 81920 , 25 , 4096 , 1 }, - {28 , 92160 , 25 , 2048 , 1 }, - {28 , 102400 , 25 , 4096 , 1 }, - {28 , 204800 , 25 , 4096 , 1 }, - {28 , 307200 , 25 , 4096 , 1 }, - {28 , 409600 , 25 , 8192 , 1 }, - {28 , 512000 , 25 , 4096 , 1 }, - {28 , 614400 , 25 , 8192 , 1 }, - {28 , 716800 , 25 , 4096 , 1 }, - {28 , 819200 , 25 , 32768 , 1 }, - {28 , 921600 , 25 , 4096 , 1 }, - {28 , 1024000, 25 , 8192 , 1 }, - {35 , 512 , 24 , 64 , 1 }, - {35 , 1024 , 24 , 256 , 1 }, - {35 , 1536 , 24 , 32 , 0 }, - {35 , 2048 , 24 , 256 , 1 }, - {35 , 2560 , 24 , 64 , 0 }, - {35 , 3072 , 24 , 256 , 0 }, - {35 , 3584 , 24 , 256 , 0 }, - {35 , 4096 , 24 , 256 , 1 }, - {35 , 4608 , 24 , 256 , 1 }, - {35 , 5120 , 24 , 256 , 1 }, - {35 , 5632 , 24 , 256 , 1 }, - {35 , 6144 , 24 , 256 , 1 }, - {35 , 6656 , 24 , 256 , 1 }, - {35 , 7168 , 24 , 256 , 1 }, - {35 , 7680 , 24 , 512 , 1 }, - {35 , 8192 , 24 , 2048 , 1 }, - {35 , 8704 , 24 , 512 , 1 }, - {35 , 9216 , 24 , 1024 , 1 }, - {35 , 9728 , 24 , 256 , 1 }, - {35 , 10240 , 24 , 512 , 1 }, - {35 , 20480 , 24 , 1024 , 1 }, - {35 , 30720 , 24 , 1024 , 1 }, - {35 , 40960 , 24 , 4096 , 1 }, - {35 , 51200 , 24 , 2048 , 1 }, - {35 , 61440 , 24 , 2048 , 1 }, - {35 , 71680 , 24 , 2048 , 1 }, - {35 , 81920 , 24 , 4096 , 1 }, - {35 , 92160 , 24 , 2048 , 1 }, - {35 , 102400 , 24 , 4096 , 1 }, - {35 , 204800 , 24 , 8192 , 1 }, - {35 , 307200 , 24 , 4096 , 1 }, - {35 , 409600 , 24 , 8192 , 1 }, - {35 , 512000 , 24 , 2048 , 1 }, - {35 , 614400 , 24 , 8192 , 0 }, - {35 , 716800 , 24 , 2048 , 1 }, - {35 , 819200 , 24 , 16384 , 0 }, - {35 , 921600 , 24 , 4096 , 1 }, - {35 , 1024000, 24 , 8192 , 0 }, - {36 , 512 , 33 , 64 , 1 }, - {36 , 1024 , 33 , 256 , 0 }, - {36 , 1536 , 33 , 64 , 0 }, - {36 , 2048 , 33 , 128 , 0 }, - {36 , 2560 , 33 , 64 , 0 }, - {36 , 3072 , 33 , 256 , 0 }, - {36 , 3584 , 33 , 256 , 0 }, - {36 , 4096 , 33 , 256 , 1 }, - {36 , 4608 , 33 , 256 , 1 }, - {36 , 5120 , 33 , 256 , 1 }, - {36 , 
5632 , 33 , 256 , 1 }, - {36 , 6144 , 33 , 256 , 1 }, - {36 , 6656 , 33 , 256 , 1 }, - {36 , 7168 , 33 , 512 , 1 }, - {36 , 7680 , 33 , 256 , 1 }, - {36 , 8192 , 33 , 8192 , 1 }, - {36 , 8704 , 33 , 256 , 1 }, - {36 , 9216 , 33 , 512 , 1 }, - {36 , 9728 , 33 , 256 , 1 }, - {36 , 10240 , 33 , 256 , 1 }, - {36 , 20480 , 33 , 2048 , 1 }, - {36 , 30720 , 33 , 1024 , 1 }, - {36 , 40960 , 33 , 8192 , 1 }, - {36 , 51200 , 33 , 2048 , 1 }, - {36 , 61440 , 33 , 4096 , 1 }, - {36 , 71680 , 33 , 2048 , 1 }, - {36 , 81920 , 33 , 4096 , 1 }, - {36 , 92160 , 33 , 2048 , 1 }, - {36 , 102400 , 33 , 4096 , 1 }, - {36 , 204800 , 33 , 8192 , 0 }, - {36 , 307200 , 33 , 307200 , 1 }, - {36 , 409600 , 33 , 8192 , 0 }, - {36 , 512000 , 33 , 512000 , 1 }, - {36 , 614400 , 33 , 8192 , 0 }, - {36 , 716800 , 33 , 716800 , 1 }, - {36 , 819200 , 33 , 8192 , 0 }, - {36 , 921600 , 33 , 921600 , 1 }, - {36 , 1024000, 33 , 8192 , 0 }, - {45 , 512 , 42 , 64 , 0 }, - {45 , 1024 , 42 , 64 , 0 }, - {45 , 1536 , 42 , 64 , 0 }, - {45 , 2048 , 42 , 64 , 0 }, - {45 , 2560 , 42 , 64 , 0 }, - {45 , 3072 , 42 , 128 , 1 }, - {45 , 3584 , 42 , 128 , 1 }, - {45 , 4096 , 42 , 256 , 1 }, - {45 , 4608 , 42 , 512 , 1 }, - {45 , 5120 , 42 , 1024 , 1 }, - {45 , 5632 , 42 , 512 , 1 }, - {45 , 6144 , 42 , 2048 , 1 }, - {45 , 6656 , 42 , 256 , 1 }, - {45 , 7168 , 42 , 512 , 1 }, - {45 , 7680 , 42 , 512 , 1 }, - {45 , 8192 , 42 , 4096 , 1 }, - {45 , 8704 , 42 , 512 , 1 }, - {45 , 9216 , 42 , 256 , 1 }, - {45 , 9728 , 42 , 256 , 1 }, - {45 , 10240 , 42 , 256 , 1 }, - {45 , 20480 , 42 , 1024 , 1 }, - {45 , 30720 , 42 , 2048 , 1 }, - {45 , 40960 , 42 , 1024 , 1 }, - {45 , 51200 , 42 , 1024 , 1 }, - {45 , 61440 , 42 , 2048 , 1 }, - {45 , 71680 , 42 , 2048 , 1 }, - {45 , 81920 , 42 , 16384 , 1 }, - {45 , 92160 , 42 , 2048 , 1 }, - {45 , 102400 , 42 , 4096 , 1 }, - {45 , 204800 , 42 , 4096 , 1 }, - {45 , 307200 , 42 , 4096 , 1 }, - {45 , 409600 , 42 , 16384 , 0 }, - {45 , 512000 , 42 , 2048 , 0 }, - {45 , 614400 , 42 , 8192 , 
0 }, - {45 , 716800 , 42 , 1024 , 0 }, - {45 , 819200 , 42 , 32768 , 0 }, - {45 , 921600 , 42 , 2048 , 0 }, - {45 , 1024000, 42 , 8192 , 0 }, - {56 , 512 , 43 , 32 , 0 }, - {56 , 1024 , 43 , 256 , 1 }, - {56 , 1536 , 43 , 32 , 0 }, - {56 , 2048 , 43 , 64 , 0 }, - {56 , 2560 , 43 , 128 , 1 }, - {56 , 3072 , 43 , 128 , 1 }, - {56 , 3584 , 43 , 512 , 1 }, - {56 , 4096 , 43 , 512 , 1 }, - {56 , 4608 , 43 , 256 , 1 }, - {56 , 5120 , 43 , 512 , 1 }, - {56 , 5632 , 43 , 256 , 1 }, - {56 , 6144 , 43 , 2048 , 1 }, - {56 , 6656 , 43 , 512 , 1 }, - {56 , 7168 , 43 , 512 , 1 }, - {56 , 7680 , 43 , 256 , 1 }, - {56 , 8192 , 43 , 1024 , 1 }, - {56 , 8704 , 43 , 512 , 1 }, - {56 , 9216 , 43 , 512 , 1 }, - {56 , 9728 , 43 , 512 , 1 }, - {56 , 10240 , 43 , 1024 , 1 }, - {56 , 20480 , 43 , 1024 , 1 }, - {56 , 30720 , 43 , 1024 , 1 }, - {56 , 40960 , 43 , 8192 , 1 }, - {56 , 51200 , 43 , 1024 , 1 }, - {56 , 61440 , 43 , 2048 , 1 }, - {56 , 71680 , 43 , 2048 , 1 }, - {56 , 81920 , 43 , 16384 , 1 }, - {56 , 92160 , 43 , 2048 , 1 }, - {56 , 102400 , 43 , 4096 , 1 }, - {56 , 204800 , 43 , 8192 , 1 }, - {56 , 307200 , 43 , 4096 , 1 }, - {56 , 409600 , 43 , 8192 , 0 }, - {56 , 512000 , 43 , 4096 , 1 }, - {56 , 614400 , 43 , 8192 , 0 }, - {56 , 716800 , 43 , 4096 , 1 }, - {56 , 819200 , 43 , 32768 , 0 }, - {56 , 921600 , 43 , 2048 , 0 }, - {56 , 1024000, 43 , 8192 , 0 }, - {84 , 512 , 126 , 64 , 0 }, - {84 , 1024 , 126 , 64 , 0 }, - {84 , 1536 , 126 , 64 , 0 }, - {84 , 2048 , 126 , 1024 , 1 }, - {84 , 2560 , 126 , 64 , 1 }, - {84 , 3072 , 126 , 256 , 1 }, - {84 , 3584 , 126 , 256 , 1 }, - {84 , 4096 , 126 , 4096 , 1 }, - {84 , 4608 , 126 , 128 , 1 }, - {84 , 5120 , 126 , 256 , 1 }, - {84 , 5632 , 126 , 256 , 1 }, - {84 , 6144 , 126 , 1024 , 0 }, - {84 , 6656 , 126 , 256 , 1 }, - {84 , 7168 , 126 , 1024 , 1 }, - {84 , 7680 , 126 , 512 , 1 }, - {84 , 8192 , 126 , 4096 , 1 }, - {84 , 8704 , 126 , 512 , 1 }, - {84 , 9216 , 126 , 512 , 1 }, - {84 , 9728 , 126 , 256 , 1 }, - {84 , 10240 , 126 , 
1024 , 1 }, - {84 , 20480 , 126 , 2048 , 1 }, - {84 , 30720 , 126 , 1024 , 0 }, - {84 , 40960 , 126 , 40960 , 0 }, - {84 , 51200 , 126 , 51200 , 0 }, - {84 , 61440 , 126 , 61440 , 0 }, - {84 , 71680 , 126 , 71680 , 0 }, - {84 , 81920 , 126 , 81920 , 0 }, - {84 , 92160 , 126 , 92160 , 0 }, - {84 , 102400 , 126 , 102400 , 0 }, - {84 , 204800 , 126 , 204800 , 0 }, - {84 , 307200 , 126 , 307200 , 0 }, - {84 , 409600 , 126 , 409600 , 0 }, - {84 , 512000 , 126 , 512000 , 0 }, - {84 , 614400 , 126 , 614400 , 0 }, - {84 , 716800 , 126 , 716800 , 0 }, - {84 , 819200 , 126 , 819200 , 0 }, - {84 , 921600 , 126 , 921600 , 0 }, - {84 , 1024000, 126 , 1024000, 0 }, - {120 , 512 , 210 , 128 , 0 }, - {120 , 1024 , 210 , 512 , 0 }, - {120 , 1536 , 210 , 512 , 0 }, - {120 , 2048 , 210 , 2048 , 0 }, - {120 , 2560 , 210 , 512 , 0 }, - {120 , 3072 , 210 , 1024 , 0 }, - {120 , 3584 , 210 , 512 , 0 }, - {120 , 4096 , 210 , 2048 , 0 }, - {120 , 4608 , 210 , 512 , 0 }, - {120 , 5120 , 210 , 1024 , 0 }, - {120 , 5632 , 210 , 512 , 0 }, - {120 , 6144 , 210 , 2048 , 0 }, - {120 , 6656 , 210 , 512 , 0 }, - {120 , 7168 , 210 , 1024 , 0 }, - {120 , 7680 , 210 , 512 , 0 }, - {120 , 8192 , 210 , 2048 , 0 }, - {120 , 8704 , 210 , 512 , 0 }, - {120 , 9216 , 210 , 1024 , 0 }, - {120 , 9728 , 210 , 512 , 0 }, - {120 , 10240 , 210 , 1024 , 0 }, - {120 , 20480 , 210 , 1024 , 0 }, - {120 , 30720 , 210 , 30720 , 0 }, - {120 , 40960 , 210 , 40960 , 0 }, - {120 , 51200 , 210 , 51200 , 0 }, - {120 , 61440 , 210 , 61440 , 0 }, - {120 , 71680 , 210 , 71680 , 0 }, - {120 , 81920 , 210 , 81920 , 0 }, - {120 , 92160 , 210 , 92160 , 0 }, - {120 , 102400 , 210 , 102400 , 0 }, - {120 , 204800 , 210 , 204800 , 0 }, - {120 , 307200 , 210 , 307200 , 0 }, - {120 , 409600 , 210 , 409600 , 0 }, - {120 , 512000 , 210 , 512000 , 0 }, - {120 , 614400 , 210 , 614400 , 0 }, - {120 , 716800 , 210 , 716800 , 0 }, - {120 , 819200 , 210 , 819200 , 0 }, - {120 , 921600 , 210 , 921600 , 0 }, - {120 , 1024000, 210 , 1024000, 0 }, - 
{165 , 512 , 330 , 128 , 0 }, - {165 , 1024 , 330 , 1024 , 0 }, - {165 , 1536 , 330 , 512 , 0 }, - {165 , 2048 , 330 , 2048 , 0 }, - {165 , 2560 , 330 , 512 , 0 }, - {165 , 3072 , 330 , 512 , 0 }, - {165 , 3584 , 330 , 512 , 0 }, - {165 , 4096 , 330 , 2048 , 0 }, - {165 , 4608 , 330 , 512 , 0 }, - {165 , 5120 , 330 , 1024 , 0 }, - {165 , 5632 , 330 , 512 , 0 }, - {165 , 6144 , 330 , 2048 , 0 }, - {165 , 6656 , 330 , 512 , 0 }, - {165 , 7168 , 330 , 1024 , 0 }, - {165 , 7680 , 330 , 512 , 0 }, - {165 , 8192 , 330 , 2048 , 0 }, - {165 , 8704 , 330 , 512 , 0 }, - {165 , 9216 , 330 , 512 , 0 }, - {165 , 9728 , 330 , 512 , 0 }, - {165 , 10240 , 330 , 2048 , 0 }, - {165 , 20480 , 330 , 2048 , 0 }, - {165 , 30720 , 330 , 2048 , 0 }, - {165 , 40960 , 330 , 40960 , 0 }, - {165 , 51200 , 330 , 51200 , 0 }, - {165 , 61440 , 330 , 61440 , 0 }, - {165 , 71680 , 330 , 71680 , 0 }, - {165 , 81920 , 330 , 81920 , 0 }, - {165 , 92160 , 330 , 92160 , 0 }, - {165 , 102400 , 330 , 102400 , 0 }, - {165 , 204800 , 330 , 204800 , 0 }, - {165 , 307200 , 330 , 307200 , 0 }, - {165 , 409600 , 330 , 409600 , 0 }, - {165 , 512000 , 330 , 512000 , 0 }, - {165 , 614400 , 330 , 614400 , 0 }, - {165 , 716800 , 330 , 716800 , 0 }, - {165 , 819200 , 330 , 819200 , 0 }, - {165 , 921600 , 330 , 921600 , 0 }, - {165 , 1024000, 330 , 1024000, 0 } +std::vector > dgemm_nn_mi250x = { + {3, 512, 1, 32, 1}, + {3, 1024, 1, 512, 1}, + {3, 1536, 1, 64, 1}, + {3, 2048, 1, 512, 1}, + {3, 2560, 1, 256, 1}, + {3, 3072, 1, 512, 1}, + {3, 3584, 1, 128, 1}, + {3, 4096, 1, 2048, 1}, + {3, 4608, 1, 512, 1}, + {3, 5120, 1, 256, 1}, + {3, 5632, 1, 256, 1}, + {3, 6144, 1, 2048, 1}, + {3, 6656, 1, 256, 1}, + {3, 7168, 1, 256, 1}, + {3, 7680, 1, 512, 1}, + {3, 8192, 1, 512, 1}, + {3, 8704, 1, 512, 1}, + {3, 9216, 1, 512, 1}, + {3, 9728, 1, 512, 1}, + {3, 10240, 1, 1024, 1}, + {3, 20480, 1, 1024, 1}, + {3, 30720, 1, 2048, 1}, + {3, 40960, 1, 2048, 1}, + {3, 51200, 1, 2048, 1}, + {3, 61440, 1, 2048, 1}, + {3, 71680, 1, 2048, 
1}, + {3, 81920, 1, 8192, 1}, + {3, 92160, 1, 2048, 1}, + {3, 102400, 1, 4096, 1}, + {3, 204800, 1, 8192, 1}, + {3, 307200, 1, 512, 1}, + {3, 409600, 1, 16384, 1}, + {3, 512000, 1, 4096, 1}, + {3, 614400, 1, 8192, 1}, + {3, 716800, 1, 2048, 1}, + {3, 819200, 1, 32768, 1}, + {3, 921600, 1, 2048, 1}, + {3, 1024000, 1, 8192, 1}, + {4, 512, 1, 64, 1}, + {4, 1024, 1, 256, 1}, + {4, 1536, 1, 128, 1}, + {4, 2048, 1, 256, 1}, + {4, 2560, 1, 512, 1}, + {4, 3072, 1, 256, 1}, + {4, 3584, 1, 512, 1}, + {4, 4096, 1, 256, 1}, + {4, 4608, 1, 256, 1}, + {4, 5120, 1, 512, 1}, + {4, 5632, 1, 512, 1}, + {4, 6144, 1, 1024, 1}, + {4, 6656, 1, 256, 1}, + {4, 7168, 1, 512, 1}, + {4, 7680, 1, 512, 1}, + {4, 8192, 1, 512, 1}, + {4, 8704, 1, 512, 1}, + {4, 9216, 1, 1024, 1}, + {4, 9728, 1, 512, 1}, + {4, 10240, 1, 1024, 1}, + {4, 20480, 1, 2048, 1}, + {4, 30720, 1, 2048, 1}, + {4, 40960, 1, 2048, 1}, + {4, 51200, 1, 2048, 1}, + {4, 61440, 1, 2048, 1}, + {4, 71680, 1, 2048, 1}, + {4, 81920, 1, 4096, 1}, + {4, 92160, 1, 2048, 1}, + {4, 102400, 1, 4096, 1}, + {4, 204800, 1, 8192, 1}, + {4, 307200, 1, 4096, 1}, + {4, 409600, 1, 8192, 1}, + {4, 512000, 1, 512, 1}, + {4, 614400, 1, 8192, 1}, + {4, 716800, 1, 2048, 1}, + {4, 819200, 1, 32768, 1}, + {4, 921600, 1, 921600, 1}, + {4, 1024000, 1, 1024000, 1}, + {6, 512, 3, 32, 1}, + {6, 1024, 3, 128, 1}, + {6, 1536, 3, 128, 1}, + {6, 2048, 3, 2048, 1}, + {6, 2560, 3, 128, 1}, + {6, 3072, 3, 1024, 1}, + {6, 3584, 3, 128, 1}, + {6, 4096, 3, 512, 1}, + {6, 4608, 3, 256, 1}, + {6, 5120, 3, 512, 1}, + {6, 5632, 3, 256, 1}, + {6, 6144, 3, 1024, 1}, + {6, 6656, 3, 512, 1}, + {6, 7168, 3, 1024, 1}, + {6, 7680, 3, 512, 1}, + {6, 8192, 3, 512, 1}, + {6, 8704, 3, 512, 1}, + {6, 9216, 3, 1024, 1}, + {6, 9728, 3, 256, 1}, + {6, 10240, 3, 1024, 1}, + {6, 20480, 3, 4096, 1}, + {6, 30720, 3, 1024, 1}, + {6, 40960, 3, 2048, 1}, + {6, 51200, 3, 2048, 1}, + {6, 61440, 3, 2048, 1}, + {6, 71680, 3, 2048, 1}, + {6, 81920, 3, 2048, 1}, + {6, 92160, 3, 2048, 1}, + {6, 
102400, 3, 2048, 1}, + {6, 204800, 3, 8192, 1}, + {6, 307200, 3, 4096, 1}, + {6, 409600, 3, 16384, 1}, + {6, 512000, 3, 256, 1}, + {6, 614400, 3, 8192, 1}, + {6, 716800, 3, 512, 1}, + {6, 819200, 3, 32768, 1}, + {6, 921600, 3, 4096, 1}, + {6, 1024000, 3, 1024000, 1}, + {10, 512, 4, 64, 1}, + {10, 1024, 4, 64, 1}, + {10, 1536, 4, 256, 1}, + {10, 2048, 4, 2048, 1}, + {10, 2560, 4, 256, 1}, + {10, 3072, 4, 1024, 1}, + {10, 3584, 4, 512, 1}, + {10, 4096, 4, 512, 1}, + {10, 4608, 4, 256, 1}, + {10, 5120, 4, 256, 1}, + {10, 5632, 4, 512, 1}, + {10, 6144, 4, 512, 1}, + {10, 6656, 4, 512, 1}, + {10, 7168, 4, 512, 1}, + {10, 7680, 4, 512, 1}, + {10, 8192, 4, 512, 1}, + {10, 8704, 4, 512, 1}, + {10, 9216, 4, 512, 1}, + {10, 9728, 4, 512, 1}, + {10, 10240, 4, 1024, 1}, + {10, 20480, 4, 1024, 1}, + {10, 30720, 4, 2048, 1}, + {10, 40960, 4, 2048, 1}, + {10, 51200, 4, 2048, 1}, + {10, 61440, 4, 1024, 1}, + {10, 71680, 4, 2048, 1}, + {10, 81920, 4, 2048, 1}, + {10, 92160, 4, 2048, 1}, + {10, 102400, 4, 2048, 1}, + {10, 204800, 4, 8192, 1}, + {10, 307200, 4, 4096, 1}, + {10, 409600, 4, 16384, 1}, + {10, 512000, 4, 2048, 1}, + {10, 614400, 4, 8192, 1}, + {10, 716800, 4, 4096, 1}, + {10, 819200, 4, 16384, 1}, + {10, 921600, 4, 921600, 1}, + {10, 1024000, 4, 1024000, 1}, + {10, 512, 6, 32, 1}, + {10, 1024, 6, 512, 1}, + {10, 1536, 6, 256, 1}, + {10, 2048, 6, 512, 1}, + {10, 2560, 6, 128, 1}, + {10, 3072, 6, 1024, 1}, + {10, 3584, 6, 256, 1}, + {10, 4096, 6, 256, 1}, + {10, 4608, 6, 512, 1}, + {10, 5120, 6, 256, 1}, + {10, 5632, 6, 256, 1}, + {10, 6144, 6, 512, 1}, + {10, 6656, 6, 512, 1}, + {10, 7168, 6, 512, 1}, + {10, 7680, 6, 512, 1}, + {10, 8192, 6, 512, 1}, + {10, 8704, 6, 512, 1}, + {10, 9216, 6, 1024, 1}, + {10, 9728, 6, 256, 0}, + {10, 10240, 6, 1024, 1}, + {10, 20480, 6, 1024, 1}, + {10, 30720, 6, 1024, 1}, + {10, 40960, 6, 2048, 1}, + {10, 51200, 6, 1024, 1}, + {10, 61440, 6, 2048, 1}, + {10, 71680, 6, 2048, 1}, + {10, 81920, 6, 8192, 1}, + {10, 92160, 6, 2048, 1}, + {10, 
102400, 6, 4096, 1}, + {10, 204800, 6, 8192, 1}, + {10, 307200, 6, 4096, 1}, + {10, 409600, 6, 8192, 1}, + {10, 512000, 6, 512, 1}, + {10, 614400, 6, 8192, 1}, + {10, 716800, 6, 1024, 1}, + {10, 819200, 6, 32768, 1}, + {10, 921600, 6, 2048, 1}, + {10, 1024000, 6, 8192, 1}, + {15, 512, 12, 32, 1}, + {15, 1024, 12, 64, 1}, + {15, 1536, 12, 128, 1}, + {15, 2048, 12, 512, 1}, + {15, 2560, 12, 512, 1}, + {15, 3072, 12, 512, 1}, + {15, 3584, 12, 256, 1}, + {15, 4096, 12, 512, 1}, + {15, 4608, 12, 512, 1}, + {15, 5120, 12, 128, 0}, + {15, 5632, 12, 64, 0}, + {15, 6144, 12, 512, 0}, + {15, 6656, 12, 512, 0}, + {15, 7168, 12, 128, 0}, + {15, 7680, 12, 512, 0}, + {15, 8192, 12, 4096, 1}, + {15, 8704, 12, 256, 0}, + {15, 9216, 12, 512, 1}, + {15, 9728, 12, 512, 1}, + {15, 10240, 12, 512, 1}, + {15, 20480, 12, 2048, 1}, + {15, 30720, 12, 512, 1}, + {15, 40960, 12, 8192, 1}, + {15, 51200, 12, 2048, 1}, + {15, 61440, 12, 4096, 1}, + {15, 71680, 12, 2048, 1}, + {15, 81920, 12, 16384, 1}, + {15, 92160, 12, 2048, 1}, + {15, 102400, 12, 2048, 1}, + {15, 204800, 12, 8192, 1}, + {15, 307200, 12, 4096, 1}, + {15, 409600, 12, 16384, 1}, + {15, 512000, 12, 512, 1}, + {15, 614400, 12, 8192, 1}, + {15, 716800, 12, 4096, 1}, + {15, 819200, 12, 16384, 1}, + {15, 921600, 12, 4096, 1}, + {15, 1024000, 12, 8192, 1}, + {20, 512, 11, 64, 1}, + {20, 1024, 11, 128, 1}, + {20, 1536, 11, 256, 1}, + {20, 2048, 11, 128, 1}, + {20, 2560, 11, 256, 1}, + {20, 3072, 11, 256, 1}, + {20, 3584, 11, 256, 1}, + {20, 4096, 11, 1024, 1}, + {20, 4608, 11, 32, 0}, + {20, 5120, 11, 32, 0}, + {20, 5632, 11, 64, 0}, + {20, 6144, 11, 128, 0}, + {20, 6656, 11, 512, 1}, + {20, 7168, 11, 512, 1}, + {20, 7680, 11, 256, 1}, + {20, 8192, 11, 4096, 1}, + {20, 8704, 11, 512, 1}, + {20, 9216, 11, 512, 1}, + {20, 9728, 11, 256, 1}, + {20, 10240, 11, 512, 1}, + {20, 20480, 11, 2048, 1}, + {20, 30720, 11, 2048, 1}, + {20, 40960, 11, 2048, 1}, + {20, 51200, 11, 2048, 1}, + {20, 61440, 11, 4096, 1}, + {20, 71680, 11, 2048, 1}, + 
{20, 81920, 11, 16384, 1}, + {20, 92160, 11, 2048, 1}, + {20, 102400, 11, 4096, 1}, + {20, 204800, 11, 4096, 1}, + {20, 307200, 11, 4096, 1}, + {20, 409600, 11, 8192, 1}, + {20, 512000, 11, 4096, 1}, + {20, 614400, 11, 8192, 1}, + {20, 716800, 11, 4096, 1}, + {20, 819200, 11, 32768, 1}, + {20, 921600, 11, 4096, 1}, + {20, 1024000, 11, 8192, 0}, + {21, 512, 16, 32, 1}, + {21, 1024, 16, 64, 1}, + {21, 1536, 16, 256, 1}, + {21, 2048, 16, 128, 1}, + {21, 2560, 16, 256, 1}, + {21, 3072, 16, 256, 1}, + {21, 3584, 16, 128, 0}, + {21, 4096, 16, 128, 0}, + {21, 4608, 16, 128, 0}, + {21, 5120, 16, 128, 0}, + {21, 5632, 16, 256, 0}, + {21, 6144, 16, 128, 0}, + {21, 6656, 16, 512, 1}, + {21, 7168, 16, 512, 1}, + {21, 7680, 16, 256, 1}, + {21, 8192, 16, 512, 1}, + {21, 8704, 16, 256, 1}, + {21, 9216, 16, 512, 1}, + {21, 9728, 16, 512, 1}, + {21, 10240, 16, 512, 1}, + {21, 20480, 16, 4096, 1}, + {21, 30720, 16, 2048, 1}, + {21, 40960, 16, 8192, 1}, + {21, 51200, 16, 2048, 1}, + {21, 61440, 16, 2048, 1}, + {21, 71680, 16, 2048, 1}, + {21, 81920, 16, 8192, 1}, + {21, 92160, 16, 2048, 1}, + {21, 102400, 16, 4096, 1}, + {21, 204800, 16, 4096, 1}, + {21, 307200, 16, 4096, 1}, + {21, 409600, 16, 16384, 1}, + {21, 512000, 16, 2048, 1}, + {21, 614400, 16, 8192, 1}, + {21, 716800, 16, 4096, 1}, + {21, 819200, 16, 32768, 1}, + {21, 921600, 16, 2048, 1}, + {21, 1024000, 16, 2048, 1}, + {28, 512, 25, 64, 1}, + {28, 1024, 25, 128, 1}, + {28, 1536, 25, 256, 1}, + {28, 2048, 25, 256, 1}, + {28, 2560, 25, 64, 0}, + {28, 3072, 25, 32, 0}, + {28, 3584, 25, 64, 0}, + {28, 4096, 25, 256, 0}, + {28, 4608, 25, 128, 0}, + {28, 5120, 25, 1024, 1}, + {28, 5632, 25, 256, 1}, + {28, 6144, 25, 256, 1}, + {28, 6656, 25, 256, 1}, + {28, 7168, 25, 1024, 1}, + {28, 7680, 25, 256, 1}, + {28, 8192, 25, 256, 1}, + {28, 8704, 25, 256, 1}, + {28, 9216, 25, 256, 1}, + {28, 9728, 25, 256, 1}, + {28, 10240, 25, 256, 1}, + {28, 20480, 25, 2048, 1}, + {28, 30720, 25, 2048, 1}, + {28, 40960, 25, 8192, 1}, + {28, 51200, 
25, 1024, 1}, + {28, 61440, 25, 2048, 1}, + {28, 71680, 25, 2048, 1}, + {28, 81920, 25, 4096, 1}, + {28, 92160, 25, 2048, 1}, + {28, 102400, 25, 4096, 1}, + {28, 204800, 25, 4096, 1}, + {28, 307200, 25, 4096, 1}, + {28, 409600, 25, 8192, 1}, + {28, 512000, 25, 4096, 1}, + {28, 614400, 25, 8192, 1}, + {28, 716800, 25, 4096, 1}, + {28, 819200, 25, 32768, 1}, + {28, 921600, 25, 4096, 1}, + {28, 1024000, 25, 8192, 1}, + {35, 512, 24, 64, 1}, + {35, 1024, 24, 256, 1}, + {35, 1536, 24, 32, 0}, + {35, 2048, 24, 256, 1}, + {35, 2560, 24, 64, 0}, + {35, 3072, 24, 256, 0}, + {35, 3584, 24, 256, 0}, + {35, 4096, 24, 256, 1}, + {35, 4608, 24, 256, 1}, + {35, 5120, 24, 256, 1}, + {35, 5632, 24, 256, 1}, + {35, 6144, 24, 256, 1}, + {35, 6656, 24, 256, 1}, + {35, 7168, 24, 256, 1}, + {35, 7680, 24, 512, 1}, + {35, 8192, 24, 2048, 1}, + {35, 8704, 24, 512, 1}, + {35, 9216, 24, 1024, 1}, + {35, 9728, 24, 256, 1}, + {35, 10240, 24, 512, 1}, + {35, 20480, 24, 1024, 1}, + {35, 30720, 24, 1024, 1}, + {35, 40960, 24, 4096, 1}, + {35, 51200, 24, 2048, 1}, + {35, 61440, 24, 2048, 1}, + {35, 71680, 24, 2048, 1}, + {35, 81920, 24, 4096, 1}, + {35, 92160, 24, 2048, 1}, + {35, 102400, 24, 4096, 1}, + {35, 204800, 24, 8192, 1}, + {35, 307200, 24, 4096, 1}, + {35, 409600, 24, 8192, 1}, + {35, 512000, 24, 2048, 1}, + {35, 614400, 24, 8192, 0}, + {35, 716800, 24, 2048, 1}, + {35, 819200, 24, 16384, 0}, + {35, 921600, 24, 4096, 1}, + {35, 1024000, 24, 8192, 0}, + {36, 512, 33, 64, 1}, + {36, 1024, 33, 256, 0}, + {36, 1536, 33, 64, 0}, + {36, 2048, 33, 128, 0}, + {36, 2560, 33, 64, 0}, + {36, 3072, 33, 256, 0}, + {36, 3584, 33, 256, 0}, + {36, 4096, 33, 256, 1}, + {36, 4608, 33, 256, 1}, + {36, 5120, 33, 256, 1}, + {36, 5632, 33, 256, 1}, + {36, 6144, 33, 256, 1}, + {36, 6656, 33, 256, 1}, + {36, 7168, 33, 512, 1}, + {36, 7680, 33, 256, 1}, + {36, 8192, 33, 8192, 1}, + {36, 8704, 33, 256, 1}, + {36, 9216, 33, 512, 1}, + {36, 9728, 33, 256, 1}, + {36, 10240, 33, 256, 1}, + {36, 20480, 33, 2048, 1}, 
+ {36, 30720, 33, 1024, 1}, + {36, 40960, 33, 8192, 1}, + {36, 51200, 33, 2048, 1}, + {36, 61440, 33, 4096, 1}, + {36, 71680, 33, 2048, 1}, + {36, 81920, 33, 4096, 1}, + {36, 92160, 33, 2048, 1}, + {36, 102400, 33, 4096, 1}, + {36, 204800, 33, 8192, 0}, + {36, 307200, 33, 307200, 1}, + {36, 409600, 33, 8192, 0}, + {36, 512000, 33, 512000, 1}, + {36, 614400, 33, 8192, 0}, + {36, 716800, 33, 716800, 1}, + {36, 819200, 33, 8192, 0}, + {36, 921600, 33, 921600, 1}, + {36, 1024000, 33, 8192, 0}, + {45, 512, 42, 64, 0}, + {45, 1024, 42, 64, 0}, + {45, 1536, 42, 64, 0}, + {45, 2048, 42, 64, 0}, + {45, 2560, 42, 64, 0}, + {45, 3072, 42, 128, 1}, + {45, 3584, 42, 128, 1}, + {45, 4096, 42, 256, 1}, + {45, 4608, 42, 512, 1}, + {45, 5120, 42, 1024, 1}, + {45, 5632, 42, 512, 1}, + {45, 6144, 42, 2048, 1}, + {45, 6656, 42, 256, 1}, + {45, 7168, 42, 512, 1}, + {45, 7680, 42, 512, 1}, + {45, 8192, 42, 4096, 1}, + {45, 8704, 42, 512, 1}, + {45, 9216, 42, 256, 1}, + {45, 9728, 42, 256, 1}, + {45, 10240, 42, 256, 1}, + {45, 20480, 42, 1024, 1}, + {45, 30720, 42, 2048, 1}, + {45, 40960, 42, 1024, 1}, + {45, 51200, 42, 1024, 1}, + {45, 61440, 42, 2048, 1}, + {45, 71680, 42, 2048, 1}, + {45, 81920, 42, 16384, 1}, + {45, 92160, 42, 2048, 1}, + {45, 102400, 42, 4096, 1}, + {45, 204800, 42, 4096, 1}, + {45, 307200, 42, 4096, 1}, + {45, 409600, 42, 16384, 0}, + {45, 512000, 42, 2048, 0}, + {45, 614400, 42, 8192, 0}, + {45, 716800, 42, 1024, 0}, + {45, 819200, 42, 32768, 0}, + {45, 921600, 42, 2048, 0}, + {45, 1024000, 42, 8192, 0}, + {56, 512, 43, 32, 0}, + {56, 1024, 43, 256, 1}, + {56, 1536, 43, 32, 0}, + {56, 2048, 43, 64, 0}, + {56, 2560, 43, 128, 1}, + {56, 3072, 43, 128, 1}, + {56, 3584, 43, 512, 1}, + {56, 4096, 43, 512, 1}, + {56, 4608, 43, 256, 1}, + {56, 5120, 43, 512, 1}, + {56, 5632, 43, 256, 1}, + {56, 6144, 43, 2048, 1}, + {56, 6656, 43, 512, 1}, + {56, 7168, 43, 512, 1}, + {56, 7680, 43, 256, 1}, + {56, 8192, 43, 1024, 1}, + {56, 8704, 43, 512, 1}, + {56, 9216, 43, 512, 1}, + 
{56, 9728, 43, 512, 1}, + {56, 10240, 43, 1024, 1}, + {56, 20480, 43, 1024, 1}, + {56, 30720, 43, 1024, 1}, + {56, 40960, 43, 8192, 1}, + {56, 51200, 43, 1024, 1}, + {56, 61440, 43, 2048, 1}, + {56, 71680, 43, 2048, 1}, + {56, 81920, 43, 16384, 1}, + {56, 92160, 43, 2048, 1}, + {56, 102400, 43, 4096, 1}, + {56, 204800, 43, 8192, 1}, + {56, 307200, 43, 4096, 1}, + {56, 409600, 43, 8192, 0}, + {56, 512000, 43, 4096, 1}, + {56, 614400, 43, 8192, 0}, + {56, 716800, 43, 4096, 1}, + {56, 819200, 43, 32768, 0}, + {56, 921600, 43, 2048, 0}, + {56, 1024000, 43, 8192, 0}, + {84, 512, 126, 64, 0}, + {84, 1024, 126, 64, 0}, + {84, 1536, 126, 64, 0}, + {84, 2048, 126, 1024, 1}, + {84, 2560, 126, 64, 1}, + {84, 3072, 126, 256, 1}, + {84, 3584, 126, 256, 1}, + {84, 4096, 126, 4096, 1}, + {84, 4608, 126, 128, 1}, + {84, 5120, 126, 256, 1}, + {84, 5632, 126, 256, 1}, + {84, 6144, 126, 1024, 0}, + {84, 6656, 126, 256, 1}, + {84, 7168, 126, 1024, 1}, + {84, 7680, 126, 512, 1}, + {84, 8192, 126, 4096, 1}, + {84, 8704, 126, 512, 1}, + {84, 9216, 126, 512, 1}, + {84, 9728, 126, 256, 1}, + {84, 10240, 126, 1024, 1}, + {84, 20480, 126, 2048, 1}, + {84, 30720, 126, 1024, 0}, + {84, 40960, 126, 40960, 0}, + {84, 51200, 126, 51200, 0}, + {84, 61440, 126, 61440, 0}, + {84, 71680, 126, 71680, 0}, + {84, 81920, 126, 81920, 0}, + {84, 92160, 126, 92160, 0}, + {84, 102400, 126, 102400, 0}, + {84, 204800, 126, 204800, 0}, + {84, 307200, 126, 307200, 0}, + {84, 409600, 126, 409600, 0}, + {84, 512000, 126, 512000, 0}, + {84, 614400, 126, 614400, 0}, + {84, 716800, 126, 716800, 0}, + {84, 819200, 126, 819200, 0}, + {84, 921600, 126, 921600, 0}, + {84, 1024000, 126, 1024000, 0}, + {120, 512, 210, 128, 0}, + {120, 1024, 210, 512, 0}, + {120, 1536, 210, 512, 0}, + {120, 2048, 210, 2048, 0}, + {120, 2560, 210, 512, 0}, + {120, 3072, 210, 1024, 0}, + {120, 3584, 210, 512, 0}, + {120, 4096, 210, 2048, 0}, + {120, 4608, 210, 512, 0}, + {120, 5120, 210, 1024, 0}, + {120, 5632, 210, 512, 0}, + {120, 6144, 
210, 2048, 0}, + {120, 6656, 210, 512, 0}, + {120, 7168, 210, 1024, 0}, + {120, 7680, 210, 512, 0}, + {120, 8192, 210, 2048, 0}, + {120, 8704, 210, 512, 0}, + {120, 9216, 210, 1024, 0}, + {120, 9728, 210, 512, 0}, + {120, 10240, 210, 1024, 0}, + {120, 20480, 210, 1024, 0}, + {120, 30720, 210, 30720, 0}, + {120, 40960, 210, 40960, 0}, + {120, 51200, 210, 51200, 0}, + {120, 61440, 210, 61440, 0}, + {120, 71680, 210, 71680, 0}, + {120, 81920, 210, 81920, 0}, + {120, 92160, 210, 92160, 0}, + {120, 102400, 210, 102400, 0}, + {120, 204800, 210, 204800, 0}, + {120, 307200, 210, 307200, 0}, + {120, 409600, 210, 409600, 0}, + {120, 512000, 210, 512000, 0}, + {120, 614400, 210, 614400, 0}, + {120, 716800, 210, 716800, 0}, + {120, 819200, 210, 819200, 0}, + {120, 921600, 210, 921600, 0}, + {120, 1024000, 210, 1024000, 0}, + {165, 512, 330, 128, 0}, + {165, 1024, 330, 1024, 0}, + {165, 1536, 330, 512, 0}, + {165, 2048, 330, 2048, 0}, + {165, 2560, 330, 512, 0}, + {165, 3072, 330, 512, 0}, + {165, 3584, 330, 512, 0}, + {165, 4096, 330, 2048, 0}, + {165, 4608, 330, 512, 0}, + {165, 5120, 330, 1024, 0}, + {165, 5632, 330, 512, 0}, + {165, 6144, 330, 2048, 0}, + {165, 6656, 330, 512, 0}, + {165, 7168, 330, 1024, 0}, + {165, 7680, 330, 512, 0}, + {165, 8192, 330, 2048, 0}, + {165, 8704, 330, 512, 0}, + {165, 9216, 330, 512, 0}, + {165, 9728, 330, 512, 0}, + {165, 10240, 330, 2048, 0}, + {165, 20480, 330, 2048, 0}, + {165, 30720, 330, 2048, 0}, + {165, 40960, 330, 40960, 0}, + {165, 51200, 330, 51200, 0}, + {165, 61440, 330, 61440, 0}, + {165, 71680, 330, 71680, 0}, + {165, 81920, 330, 81920, 0}, + {165, 92160, 330, 92160, 0}, + {165, 102400, 330, 102400, 0}, + {165, 204800, 330, 204800, 0}, + {165, 307200, 330, 307200, 0}, + {165, 409600, 330, 409600, 0}, + {165, 512000, 330, 512000, 0}, + {165, 614400, 330, 614400, 0}, + {165, 716800, 330, 716800, 0}, + {165, 819200, 330, 819200, 0}, + {165, 921600, 330, 921600, 0}, + {165, 1024000, 330, 1024000, 0} }; 
//////////////////////////////////////////////////////////////////////////////// -std::vector< std::array > dgemm_tn_mi250x = -{ - {1 , 512 , 3 , 32 , 1 }, - {1 , 1024 , 3 , 128 , 1 }, - {1 , 1536 , 3 , 512 , 1 }, - {1 , 2048 , 3 , 1024 , 1 }, - {1 , 2560 , 3 , 512 , 1 }, - {1 , 3072 , 3 , 1024 , 1 }, - {1 , 3584 , 3 , 128 , 1 }, - {1 , 4096 , 3 , 4096 , 1 }, - {1 , 4608 , 3 , 512 , 1 }, - {1 , 5120 , 3 , 512 , 1 }, - {1 , 5632 , 3 , 256 , 1 }, - {1 , 6144 , 3 , 2048 , 1 }, - {1 , 6656 , 3 , 256 , 1 }, - {1 , 7168 , 3 , 512 , 1 }, - {1 , 7680 , 3 , 512 , 1 }, - {1 , 8192 , 3 , 2048 , 1 }, - {1 , 8704 , 3 , 512 , 1 }, - {1 , 9216 , 3 , 512 , 1 }, - {1 , 9728 , 3 , 512 , 1 }, - {1 , 10240 , 3 , 1024 , 1 }, - {1 , 20480 , 3 , 2048 , 1 }, - {1 , 30720 , 3 , 2048 , 1 }, - {1 , 40960 , 3 , 4096 , 1 }, - {1 , 51200 , 3 , 2048 , 1 }, - {1 , 61440 , 3 , 4096 , 1 }, - {1 , 71680 , 3 , 2048 , 1 }, - {1 , 81920 , 3 , 8192 , 1 }, - {1 , 92160 , 3 , 2048 , 1 }, - {1 , 102400 , 3 , 2048 , 1 }, - {1 , 204800 , 3 , 8192 , 1 }, - {1 , 307200 , 3 , 4096 , 1 }, - {1 , 409600 , 3 , 8192 , 1 }, - {1 , 512000 , 3 , 2048 , 1 }, - {1 , 614400 , 3 , 8192 , 1 }, - {1 , 716800 , 3 , 2048 , 1 }, - {1 , 819200 , 3 , 32768 , 1 }, - {1 , 921600 , 3 , 2048 , 1 }, - {1 , 1024000, 3 , 2048 , 1 }, - {1 , 512 , 4 , 64 , 1 }, - {1 , 1024 , 4 , 128 , 1 }, - {1 , 1536 , 4 , 64 , 1 }, - {1 , 2048 , 4 , 256 , 1 }, - {1 , 2560 , 4 , 128 , 1 }, - {1 , 3072 , 4 , 128 , 1 }, - {1 , 3584 , 4 , 512 , 1 }, - {1 , 4096 , 4 , 256 , 1 }, - {1 , 4608 , 4 , 512 , 1 }, - {1 , 5120 , 4 , 1024 , 1 }, - {1 , 5632 , 4 , 512 , 1 }, - {1 , 6144 , 4 , 2048 , 1 }, - {1 , 6656 , 4 , 512 , 1 }, - {1 , 7168 , 4 , 1024 , 1 }, - {1 , 7680 , 4 , 512 , 1 }, - {1 , 8192 , 4 , 1024 , 1 }, - {1 , 8704 , 4 , 512 , 1 }, - {1 , 9216 , 4 , 512 , 1 }, - {1 , 9728 , 4 , 512 , 1 }, - {1 , 10240 , 4 , 512 , 1 }, - {1 , 20480 , 4 , 2048 , 1 }, - {1 , 30720 , 4 , 2048 , 1 }, - {1 , 40960 , 4 , 4096 , 1 }, - {1 , 51200 , 4 , 2048 , 1 }, - {1 , 
61440 , 4 , 2048 , 1 }, - {1 , 71680 , 4 , 2048 , 1 }, - {1 , 81920 , 4 , 4096 , 1 }, - {1 , 92160 , 4 , 2048 , 1 }, - {1 , 102400 , 4 , 2048 , 1 }, - {1 , 204800 , 4 , 8192 , 1 }, - {1 , 307200 , 4 , 4096 , 1 }, - {1 , 409600 , 4 , 16384 , 1 }, - {1 , 512000 , 4 , 1024 , 1 }, - {1 , 614400 , 4 , 8192 , 1 }, - {1 , 716800 , 4 , 1024 , 1 }, - {1 , 819200 , 4 , 32768 , 1 }, - {1 , 921600 , 4 , 2048 , 1 }, - {1 , 1024000, 4 , 2048 , 1 }, - {3 , 512 , 6 , 128 , 1 }, - {3 , 1024 , 6 , 64 , 1 }, - {3 , 1536 , 6 , 512 , 1 }, - {3 , 2048 , 6 , 2048 , 1 }, - {3 , 2560 , 6 , 128 , 1 }, - {3 , 3072 , 6 , 128 , 1 }, - {3 , 3584 , 6 , 512 , 1 }, - {3 , 4096 , 6 , 1024 , 1 }, - {3 , 4608 , 6 , 512 , 1 }, - {3 , 5120 , 6 , 512 , 1 }, - {3 , 5632 , 6 , 512 , 1 }, - {3 , 6144 , 6 , 256 , 1 }, - {3 , 6656 , 6 , 512 , 1 }, - {3 , 7168 , 6 , 1024 , 1 }, - {3 , 7680 , 6 , 512 , 1 }, - {3 , 8192 , 6 , 1024 , 1 }, - {3 , 8704 , 6 , 512 , 1 }, - {3 , 9216 , 6 , 512 , 1 }, - {3 , 9728 , 6 , 512 , 1 }, - {3 , 10240 , 6 , 512 , 1 }, - {3 , 20480 , 6 , 1024 , 1 }, - {3 , 30720 , 6 , 1024 , 1 }, - {3 , 40960 , 6 , 2048 , 1 }, - {3 , 51200 , 6 , 2048 , 1 }, - {3 , 61440 , 6 , 4096 , 1 }, - {3 , 71680 , 6 , 2048 , 1 }, - {3 , 81920 , 6 , 2048 , 1 }, - {3 , 92160 , 6 , 2048 , 1 }, - {3 , 102400 , 6 , 4096 , 1 }, - {3 , 204800 , 6 , 8192 , 1 }, - {3 , 307200 , 6 , 4096 , 1 }, - {3 , 409600 , 6 , 16384 , 1 }, - {3 , 512000 , 6 , 512 , 1 }, - {3 , 614400 , 6 , 8192 , 1 }, - {3 , 716800 , 6 , 1024 , 1 }, - {3 , 819200 , 6 , 32768 , 1 }, - {3 , 921600 , 6 , 512 , 1 }, - {3 , 1024000, 6 , 1024 , 1 }, - {4 , 512 , 10 , 32 , 1 }, - {4 , 1024 , 10 , 128 , 1 }, - {4 , 1536 , 10 , 512 , 1 }, - {4 , 2048 , 10 , 1024 , 1 }, - {4 , 2560 , 10 , 128 , 1 }, - {4 , 3072 , 10 , 128 , 1 }, - {4 , 3584 , 10 , 256 , 1 }, - {4 , 4096 , 10 , 1024 , 1 }, - {4 , 4608 , 10 , 512 , 1 }, - {4 , 5120 , 10 , 512 , 1 }, - {4 , 5632 , 10 , 512 , 1 }, - {4 , 6144 , 10 , 512 , 1 }, - {4 , 6656 , 10 , 512 , 1 }, - {4 , 7168 , 10 , 
512 , 1 }, - {4 , 7680 , 10 , 512 , 1 }, - {4 , 8192 , 10 , 2048 , 1 }, - {4 , 8704 , 10 , 512 , 1 }, - {4 , 9216 , 10 , 512 , 1 }, - {4 , 9728 , 10 , 512 , 1 }, - {4 , 10240 , 10 , 1024 , 1 }, - {4 , 20480 , 10 , 4096 , 1 }, - {4 , 30720 , 10 , 2048 , 1 }, - {4 , 40960 , 10 , 2048 , 1 }, - {4 , 51200 , 10 , 2048 , 1 }, - {4 , 61440 , 10 , 2048 , 1 }, - {4 , 71680 , 10 , 2048 , 1 }, - {4 , 81920 , 10 , 16384 , 1 }, - {4 , 92160 , 10 , 2048 , 1 }, - {4 , 102400 , 10 , 2048 , 1 }, - {4 , 204800 , 10 , 4096 , 1 }, - {4 , 307200 , 10 , 4096 , 1 }, - {4 , 409600 , 10 , 16384 , 1 }, - {4 , 512000 , 10 , 512 , 1 }, - {4 , 614400 , 10 , 8192 , 1 }, - {4 , 716800 , 10 , 4096 , 1 }, - {4 , 819200 , 10 , 32768 , 1 }, - {4 , 921600 , 10 , 512 , 1 }, - {4 , 1024000, 10 , 8192 , 1 }, - {6 , 512 , 10 , 64 , 1 }, - {6 , 1024 , 10 , 64 , 1 }, - {6 , 1536 , 10 , 256 , 1 }, - {6 , 2048 , 10 , 128 , 1 }, - {6 , 2560 , 10 , 256 , 1 }, - {6 , 3072 , 10 , 128 , 1 }, - {6 , 3584 , 10 , 256 , 1 }, - {6 , 4096 , 10 , 1024 , 1 }, - {6 , 4608 , 10 , 256 , 1 }, - {6 , 5120 , 10 , 512 , 1 }, - {6 , 5632 , 10 , 512 , 1 }, - {6 , 6144 , 10 , 1024 , 1 }, - {6 , 6656 , 10 , 512 , 1 }, - {6 , 7168 , 10 , 512 , 1 }, - {6 , 7680 , 10 , 512 , 1 }, - {6 , 8192 , 10 , 512 , 1 }, - {6 , 8704 , 10 , 512 , 1 }, - {6 , 9216 , 10 , 512 , 1 }, - {6 , 9728 , 10 , 256 , 1 }, - {6 , 10240 , 10 , 1024 , 1 }, - {6 , 20480 , 10 , 1024 , 1 }, - {6 , 30720 , 10 , 1024 , 1 }, - {6 , 40960 , 10 , 2048 , 1 }, - {6 , 51200 , 10 , 1024 , 1 }, - {6 , 61440 , 10 , 2048 , 1 }, - {6 , 71680 , 10 , 2048 , 1 }, - {6 , 81920 , 10 , 16384 , 1 }, - {6 , 92160 , 10 , 2048 , 1 }, - {6 , 102400 , 10 , 4096 , 1 }, - {6 , 204800 , 10 , 4096 , 1 }, - {6 , 307200 , 10 , 4096 , 1 }, - {6 , 409600 , 10 , 16384 , 1 }, - {6 , 512000 , 10 , 4096 , 1 }, - {6 , 614400 , 10 , 8192 , 1 }, - {6 , 716800 , 10 , 4096 , 1 }, - {6 , 819200 , 10 , 16384 , 1 }, - {6 , 921600 , 10 , 1024 , 1 }, - {6 , 1024000, 10 , 8192 , 1 }, - {12 , 512 , 15 , 64 , 1 }, 
- {12 , 1024 , 15 , 64 , 1 }, - {12 , 1536 , 15 , 256 , 1 }, - {12 , 2048 , 15 , 128 , 1 }, - {12 , 2560 , 15 , 128 , 1 }, - {12 , 3072 , 15 , 512 , 1 }, - {12 , 3584 , 15 , 256 , 1 }, - {12 , 4096 , 15 , 512 , 1 }, - {12 , 4608 , 15 , 512 , 1 }, - {12 , 5120 , 15 , 1024 , 1 }, - {12 , 5632 , 15 , 256 , 1 }, - {12 , 6144 , 15 , 256 , 1 }, - {12 , 6656 , 15 , 512 , 1 }, - {12 , 7168 , 15 , 512 , 1 }, - {12 , 7680 , 15 , 512 , 1 }, - {12 , 8192 , 15 , 2048 , 1 }, - {12 , 8704 , 15 , 512 , 1 }, - {12 , 9216 , 15 , 512 , 1 }, - {12 , 9728 , 15 , 512 , 1 }, - {12 , 10240 , 15 , 512 , 1 }, - {12 , 20480 , 15 , 2048 , 1 }, - {12 , 30720 , 15 , 1024 , 1 }, - {12 , 40960 , 15 , 8192 , 1 }, - {12 , 51200 , 15 , 2048 , 1 }, - {12 , 61440 , 15 , 2048 , 1 }, - {12 , 71680 , 15 , 1024 , 1 }, - {12 , 81920 , 15 , 4096 , 1 }, - {12 , 92160 , 15 , 2048 , 1 }, - {12 , 102400 , 15 , 2048 , 1 }, - {12 , 204800 , 15 , 4096 , 1 }, - {12 , 307200 , 15 , 4096 , 1 }, - {12 , 409600 , 15 , 8192 , 1 }, - {12 , 512000 , 15 , 512 , 1 }, - {12 , 614400 , 15 , 4096 , 1 }, - {12 , 716800 , 15 , 4096 , 1 }, - {12 , 819200 , 15 , 32768 , 1 }, - {12 , 921600 , 15 , 2048 , 1 }, - {12 , 1024000, 15 , 4096 , 1 }, - {11 , 512 , 20 , 64 , 1 }, - {11 , 1024 , 20 , 128 , 1 }, - {11 , 1536 , 20 , 512 , 1 }, - {11 , 2048 , 20 , 256 , 1 }, - {11 , 2560 , 20 , 128 , 1 }, - {11 , 3072 , 20 , 256 , 1 }, - {11 , 3584 , 20 , 256 , 1 }, - {11 , 4096 , 20 , 256 , 1 }, - {11 , 4608 , 20 , 128 , 1 }, - {11 , 5120 , 20 , 256 , 1 }, - {11 , 5632 , 20 , 512 , 1 }, - {11 , 6144 , 20 , 2048 , 1 }, - {11 , 6656 , 20 , 256 , 1 }, - {11 , 7168 , 20 , 256 , 1 }, - {11 , 7680 , 20 , 256 , 1 }, - {11 , 8192 , 20 , 8192 , 1 }, - {11 , 8704 , 20 , 512 , 1 }, - {11 , 9216 , 20 , 512 , 1 }, - {11 , 9728 , 20 , 256 , 1 }, - {11 , 10240 , 20 , 2048 , 1 }, - {11 , 20480 , 20 , 2048 , 1 }, - {11 , 30720 , 20 , 2048 , 1 }, - {11 , 40960 , 20 , 8192 , 1 }, - {11 , 51200 , 20 , 2048 , 1 }, - {11 , 61440 , 20 , 4096 , 1 }, - {11 , 71680 , 
20 , 2048 , 1 }, - {11 , 81920 , 20 , 16384 , 1 }, - {11 , 92160 , 20 , 2048 , 1 }, - {11 , 102400 , 20 , 4096 , 1 }, - {11 , 204800 , 20 , 8192 , 1 }, - {11 , 307200 , 20 , 4096 , 1 }, - {11 , 409600 , 20 , 16384 , 1 }, - {11 , 512000 , 20 , 512 , 1 }, - {11 , 614400 , 20 , 8192 , 1 }, - {11 , 716800 , 20 , 4096 , 1 }, - {11 , 819200 , 20 , 16384 , 1 }, - {11 , 921600 , 20 , 2048 , 1 }, - {11 , 1024000, 20 , 2048 , 1 }, - {16 , 512 , 21 , 128 , 1 }, - {16 , 1024 , 21 , 128 , 1 }, - {16 , 1536 , 21 , 512 , 1 }, - {16 , 2048 , 21 , 2048 , 1 }, - {16 , 2560 , 21 , 256 , 1 }, - {16 , 3072 , 21 , 256 , 1 }, - {16 , 3584 , 21 , 128 , 1 }, - {16 , 4096 , 21 , 256 , 1 }, - {16 , 4608 , 21 , 64 , 0 }, - {16 , 5120 , 21 , 32 , 0 }, - {16 , 5632 , 21 , 128 , 0 }, - {16 , 6144 , 21 , 512 , 1 }, - {16 , 6656 , 21 , 256 , 1 }, - {16 , 7168 , 21 , 1024 , 1 }, - {16 , 7680 , 21 , 512 , 1 }, - {16 , 8192 , 21 , 512 , 1 }, - {16 , 8704 , 21 , 256 , 1 }, - {16 , 9216 , 21 , 512 , 1 }, - {16 , 9728 , 21 , 256 , 1 }, - {16 , 10240 , 21 , 256 , 1 }, - {16 , 20480 , 21 , 1024 , 1 }, - {16 , 30720 , 21 , 2048 , 1 }, - {16 , 40960 , 21 , 4096 , 1 }, - {16 , 51200 , 21 , 2048 , 1 }, - {16 , 61440 , 21 , 2048 , 1 }, - {16 , 71680 , 21 , 2048 , 1 }, - {16 , 81920 , 21 , 4096 , 1 }, - {16 , 92160 , 21 , 256 , 1 }, - {16 , 102400 , 21 , 4096 , 1 }, - {16 , 204800 , 21 , 8192 , 1 }, - {16 , 307200 , 21 , 4096 , 1 }, - {16 , 409600 , 21 , 16384 , 1 }, - {16 , 512000 , 21 , 512 , 1 }, - {16 , 614400 , 21 , 8192 , 1 }, - {16 , 716800 , 21 , 4096 , 1 }, - {16 , 819200 , 21 , 32768 , 1 }, - {16 , 921600 , 21 , 1024 , 1 }, - {16 , 1024000, 21 , 8192 , 1 }, - {25 , 512 , 28 , 128 , 1 }, - {25 , 1024 , 28 , 128 , 1 }, - {25 , 1536 , 28 , 256 , 1 }, - {25 , 2048 , 28 , 128 , 1 }, - {25 , 2560 , 28 , 256 , 1 }, - {25 , 3072 , 28 , 256 , 0 }, - {25 , 3584 , 28 , 256 , 0 }, - {25 , 4096 , 28 , 4096 , 1 }, - {25 , 4608 , 28 , 32 , 0 }, - {25 , 5120 , 28 , 256 , 1 }, - {25 , 5632 , 28 , 256 , 1 }, - {25 , 
6144 , 28 , 256 , 1 }, - {25 , 6656 , 28 , 512 , 1 }, - {25 , 7168 , 28 , 1024 , 1 }, - {25 , 7680 , 28 , 256 , 1 }, - {25 , 8192 , 28 , 4096 , 1 }, - {25 , 8704 , 28 , 256 , 1 }, - {25 , 9216 , 28 , 256 , 1 }, - {25 , 9728 , 28 , 512 , 1 }, - {25 , 10240 , 28 , 256 , 1 }, - {25 , 20480 , 28 , 2048 , 1 }, - {25 , 30720 , 28 , 1024 , 1 }, - {25 , 40960 , 28 , 2048 , 1 }, - {25 , 51200 , 28 , 1024 , 1 }, - {25 , 61440 , 28 , 4096 , 1 }, - {25 , 71680 , 28 , 2048 , 1 }, - {25 , 81920 , 28 , 4096 , 1 }, - {25 , 92160 , 28 , 2048 , 1 }, - {25 , 102400 , 28 , 4096 , 1 }, - {25 , 204800 , 28 , 4096 , 1 }, - {25 , 307200 , 28 , 4096 , 1 }, - {25 , 409600 , 28 , 8192 , 1 }, - {25 , 512000 , 28 , 2048 , 1 }, - {25 , 614400 , 28 , 8192 , 1 }, - {25 , 716800 , 28 , 4096 , 1 }, - {25 , 819200 , 28 , 32768 , 1 }, - {25 , 921600 , 28 , 2048 , 1 }, - {25 , 1024000, 28 , 4096 , 1 }, - {24 , 512 , 35 , 32 , 1 }, - {24 , 1024 , 35 , 256 , 1 }, - {24 , 1536 , 35 , 64 , 1 }, - {24 , 2048 , 35 , 256 , 1 }, - {24 , 2560 , 35 , 128 , 0 }, - {24 , 3072 , 35 , 128 , 0 }, - {24 , 3584 , 35 , 64 , 0 }, - {24 , 4096 , 35 , 1024 , 1 }, - {24 , 4608 , 35 , 256 , 1 }, - {24 , 5120 , 35 , 256 , 1 }, - {24 , 5632 , 35 , 256 , 1 }, - {24 , 6144 , 35 , 256 , 1 }, - {24 , 6656 , 35 , 256 , 1 }, - {24 , 7168 , 35 , 256 , 1 }, - {24 , 7680 , 35 , 256 , 1 }, - {24 , 8192 , 35 , 2048 , 1 }, - {24 , 8704 , 35 , 512 , 1 }, - {24 , 9216 , 35 , 256 , 1 }, - {24 , 9728 , 35 , 512 , 1 }, - {24 , 10240 , 35 , 256 , 1 }, - {24 , 20480 , 35 , 1024 , 1 }, - {24 , 30720 , 35 , 1024 , 1 }, - {24 , 40960 , 35 , 4096 , 1 }, - {24 , 51200 , 35 , 2048 , 1 }, - {24 , 61440 , 35 , 2048 , 1 }, - {24 , 71680 , 35 , 2048 , 1 }, - {24 , 81920 , 35 , 8192 , 1 }, - {24 , 92160 , 35 , 2048 , 1 }, - {24 , 102400 , 35 , 2048 , 1 }, - {24 , 204800 , 35 , 8192 , 1 }, - {24 , 307200 , 35 , 4096 , 1 }, - {24 , 409600 , 35 , 16384 , 1 }, - {24 , 512000 , 35 , 2048 , 1 }, - {24 , 614400 , 35 , 8192 , 1 }, - {24 , 716800 , 35 , 4096 , 1 
}, - {24 , 819200 , 35 , 32768 , 1 }, - {24 , 921600 , 35 , 4096 , 1 }, - {24 , 1024000, 35 , 8192 , 1 }, - {33 , 512 , 36 , 128 , 0 }, - {33 , 1024 , 36 , 64 , 1 }, - {33 , 1536 , 36 , 32 , 0 }, - {33 , 2048 , 36 , 64 , 0 }, - {33 , 2560 , 36 , 64 , 0 }, - {33 , 3072 , 36 , 128 , 0 }, - {33 , 3584 , 36 , 128 , 0 }, - {33 , 4096 , 36 , 256 , 1 }, - {33 , 4608 , 36 , 256 , 1 }, - {33 , 5120 , 36 , 256 , 1 }, - {33 , 5632 , 36 , 256 , 1 }, - {33 , 6144 , 36 , 1024 , 1 }, - {33 , 6656 , 36 , 256 , 1 }, - {33 , 7168 , 36 , 256 , 1 }, - {33 , 7680 , 36 , 256 , 1 }, - {33 , 8192 , 36 , 8192 , 1 }, - {33 , 8704 , 36 , 512 , 1 }, - {33 , 9216 , 36 , 1024 , 1 }, - {33 , 9728 , 36 , 256 , 1 }, - {33 , 10240 , 36 , 256 , 1 }, - {33 , 20480 , 36 , 2048 , 1 }, - {33 , 30720 , 36 , 1024 , 1 }, - {33 , 40960 , 36 , 8192 , 1 }, - {33 , 51200 , 36 , 1024 , 1 }, - {33 , 61440 , 36 , 2048 , 1 }, - {33 , 71680 , 36 , 2048 , 1 }, - {33 , 81920 , 36 , 16384 , 1 }, - {33 , 92160 , 36 , 2048 , 1 }, - {33 , 102400 , 36 , 2048 , 1 }, - {33 , 204800 , 36 , 8192 , 1 }, - {33 , 307200 , 36 , 4096 , 1 }, - {33 , 409600 , 36 , 16384 , 1 }, - {33 , 512000 , 36 , 4096 , 1 }, - {33 , 614400 , 36 , 8192 , 1 }, - {33 , 716800 , 36 , 4096 , 1 }, - {33 , 819200 , 36 , 32768 , 1 }, - {33 , 921600 , 36 , 4096 , 1 }, - {33 , 1024000, 36 , 8192 , 1 }, - {42 , 512 , 45 , 128 , 0 }, - {42 , 1024 , 45 , 32 , 0 }, - {42 , 1536 , 45 , 32 , 0 }, - {42 , 2048 , 45 , 64 , 0 }, - {42 , 2560 , 45 , 64 , 0 }, - {42 , 3072 , 45 , 1024 , 1 }, - {42 , 3584 , 45 , 256 , 1 }, - {42 , 4096 , 45 , 1024 , 1 }, - {42 , 4608 , 45 , 128 , 1 }, - {42 , 5120 , 45 , 1024 , 1 }, - {42 , 5632 , 45 , 256 , 1 }, - {42 , 6144 , 45 , 256 , 1 }, - {42 , 6656 , 45 , 256 , 1 }, - {42 , 7168 , 45 , 1024 , 1 }, - {42 , 7680 , 45 , 512 , 1 }, - {42 , 8192 , 45 , 4096 , 1 }, - {42 , 8704 , 45 , 512 , 1 }, - {42 , 9216 , 45 , 512 , 1 }, - {42 , 9728 , 45 , 256 , 1 }, - {42 , 10240 , 45 , 2048 , 1 }, - {42 , 20480 , 45 , 512 , 1 }, - {42 , 30720 
, 45 , 1024 , 1 }, - {42 , 40960 , 45 , 8192 , 1 }, - {42 , 51200 , 45 , 2048 , 1 }, - {42 , 61440 , 45 , 2048 , 1 }, - {42 , 71680 , 45 , 2048 , 1 }, - {42 , 81920 , 45 , 4096 , 1 }, - {42 , 92160 , 45 , 2048 , 1 }, - {42 , 102400 , 45 , 4096 , 1 }, - {42 , 204800 , 45 , 8192 , 1 }, - {42 , 307200 , 45 , 4096 , 1 }, - {42 , 409600 , 45 , 16384 , 1 }, - {42 , 512000 , 45 , 4096 , 1 }, - {42 , 614400 , 45 , 8192 , 1 }, - {42 , 716800 , 45 , 4096 , 1 }, - {42 , 819200 , 45 , 32768 , 1 }, - {42 , 921600 , 45 , 2048 , 1 }, - {42 , 1024000, 45 , 8192 , 1 }, - {43 , 512 , 56 , 128 , 0 }, - {43 , 1024 , 56 , 128 , 0 }, - {43 , 1536 , 56 , 64 , 0 }, - {43 , 2048 , 56 , 128 , 0 }, - {43 , 2560 , 56 , 128 , 1 }, - {43 , 3072 , 56 , 128 , 1 }, - {43 , 3584 , 56 , 128 , 1 }, - {43 , 4096 , 56 , 4096 , 1 }, - {43 , 4608 , 56 , 128 , 1 }, - {43 , 5120 , 56 , 128 , 1 }, - {43 , 5632 , 56 , 256 , 1 }, - {43 , 6144 , 56 , 1024 , 1 }, - {43 , 6656 , 56 , 256 , 1 }, - {43 , 7168 , 56 , 1024 , 1 }, - {43 , 7680 , 56 , 128 , 1 }, - {43 , 8192 , 56 , 8192 , 1 }, - {43 , 8704 , 56 , 512 , 1 }, - {43 , 9216 , 56 , 1024 , 1 }, - {43 , 9728 , 56 , 512 , 1 }, - {43 , 10240 , 56 , 256 , 1 }, - {43 , 20480 , 56 , 1024 , 1 }, - {43 , 30720 , 56 , 1024 , 1 }, - {43 , 40960 , 56 , 1024 , 1 }, - {43 , 51200 , 56 , 2048 , 1 }, - {43 , 61440 , 56 , 1024 , 1 }, - {43 , 71680 , 56 , 2048 , 1 }, - {43 , 81920 , 56 , 8192 , 1 }, - {43 , 92160 , 56 , 2048 , 1 }, - {43 , 102400 , 56 , 4096 , 1 }, - {43 , 204800 , 56 , 8192 , 1 }, - {43 , 307200 , 56 , 4096 , 1 }, - {43 , 409600 , 56 , 16384 , 1 }, - {43 , 512000 , 56 , 4096 , 1 }, - {43 , 614400 , 56 , 8192 , 1 }, - {43 , 716800 , 56 , 4096 , 1 }, - {43 , 819200 , 56 , 16384 , 1 }, - {43 , 921600 , 56 , 4096 , 1 }, - {43 , 1024000, 56 , 8192 , 1 }, - {126 , 512 , 84 , 64 , 0 }, - {126 , 1024 , 84 , 64 , 1 }, - {126 , 1536 , 84 , 512 , 1 }, - {126 , 2048 , 84 , 64 , 1 }, - {126 , 2560 , 84 , 128 , 1 }, - {126 , 3072 , 84 , 1024 , 1 }, - {126 , 3584 , 84 , 
256 , 1 }, - {126 , 4096 , 84 , 256 , 1 }, - {126 , 4608 , 84 , 256 , 1 }, - {126 , 5120 , 84 , 1024 , 1 }, - {126 , 5632 , 84 , 512 , 1 }, - {126 , 6144 , 84 , 2048 , 1 }, - {126 , 6656 , 84 , 256 , 1 }, - {126 , 7168 , 84 , 1024 , 1 }, - {126 , 7680 , 84 , 512 , 1 }, - {126 , 8192 , 84 , 8192 , 1 }, - {126 , 8704 , 84 , 128 , 1 }, - {126 , 9216 , 84 , 1024 , 1 }, - {126 , 9728 , 84 , 512 , 1 }, - {126 , 10240 , 84 , 2048 , 1 }, - {126 , 20480 , 84 , 1024 , 1 }, - {126 , 30720 , 84 , 1024 , 1 }, - {126 , 40960 , 84 , 2048 , 1 }, - {126 , 51200 , 84 , 1024 , 1 }, - {126 , 61440 , 84 , 2048 , 1 }, - {126 , 71680 , 84 , 71680 , 1 }, - {126 , 81920 , 84 , 81920 , 1 }, - {126 , 92160 , 84 , 92160 , 1 }, - {126 , 102400 , 84 , 102400 , 1 }, - {126 , 204800 , 84 , 204800 , 1 }, - {126 , 307200 , 84 , 307200 , 1 }, - {126 , 409600 , 84 , 409600 , 1 }, - {126 , 512000 , 84 , 512000 , 1 }, - {126 , 614400 , 84 , 614400 , 1 }, - {126 , 716800 , 84 , 716800 , 1 }, - {126 , 819200 , 84 , 819200 , 1 }, - {126 , 921600 , 84 , 921600 , 1 }, - {126 , 1024000, 84 , 1024000, 1 }, - {210 , 512 , 120 , 32 , 0 }, - {210 , 1024 , 120 , 1024 , 1 }, - {210 , 1536 , 120 , 512 , 1 }, - {210 , 2048 , 120 , 64 , 1 }, - {210 , 2560 , 120 , 128 , 1 }, - {210 , 3072 , 120 , 512 , 1 }, - {210 , 3584 , 120 , 512 , 1 }, - {210 , 4096 , 120 , 256 , 1 }, - {210 , 4608 , 120 , 256 , 1 }, - {210 , 5120 , 120 , 512 , 1 }, - {210 , 5632 , 120 , 256 , 1 }, - {210 , 6144 , 120 , 2048 , 1 }, - {210 , 6656 , 120 , 128 , 1 }, - {210 , 7168 , 120 , 256 , 1 }, - {210 , 7680 , 120 , 512 , 1 }, - {210 , 8192 , 120 , 8192 , 1 }, - {210 , 8704 , 120 , 256 , 1 }, - {210 , 9216 , 120 , 512 , 1 }, - {210 , 9728 , 120 , 256 , 1 }, - {210 , 10240 , 120 , 1024 , 1 }, - {210 , 20480 , 120 , 20480 , 1 }, - {210 , 30720 , 120 , 30720 , 1 }, - {210 , 40960 , 120 , 40960 , 1 }, - {210 , 51200 , 120 , 51200 , 1 }, - {210 , 61440 , 120 , 61440 , 1 }, - {210 , 71680 , 120 , 71680 , 1 }, - {210 , 81920 , 120 , 81920 , 1 }, - {210 
, 92160 , 120 , 92160 , 1 }, - {210 , 102400 , 120 , 102400 , 1 }, - {210 , 204800 , 120 , 204800 , 1 }, - {210 , 307200 , 120 , 307200 , 1 }, - {210 , 409600 , 120 , 409600 , 1 }, - {210 , 512000 , 120 , 512000 , 1 }, - {210 , 614400 , 120 , 614400 , 1 }, - {210 , 716800 , 120 , 716800 , 1 }, - {210 , 819200 , 120 , 819200 , 1 }, - {210 , 921600 , 120 , 921600 , 1 }, - {210 , 1024000, 120 , 1024000, 1 }, - {330 , 512 , 165 , 128 , 0 }, - {330 , 1024 , 165 , 256 , 1 }, - {330 , 1536 , 165 , 512 , 1 }, - {330 , 2048 , 165 , 2048 , 1 }, - {330 , 2560 , 165 , 512 , 1 }, - {330 , 3072 , 165 , 1024 , 1 }, - {330 , 3584 , 165 , 512 , 1 }, - {330 , 4096 , 165 , 2048 , 1 }, - {330 , 4608 , 165 , 128 , 1 }, - {330 , 5120 , 165 , 512 , 1 }, - {330 , 5632 , 165 , 512 , 1 }, - {330 , 6144 , 165 , 2048 , 1 }, - {330 , 6656 , 165 , 512 , 1 }, - {330 , 7168 , 165 , 512 , 1 }, - {330 , 7680 , 165 , 512 , 1 }, - {330 , 8192 , 165 , 8192 , 1 }, - {330 , 8704 , 165 , 512 , 1 }, - {330 , 9216 , 165 , 1024 , 1 }, - {330 , 9728 , 165 , 512 , 1 }, - {330 , 10240 , 165 , 2048 , 1 }, - {330 , 20480 , 165 , 2048 , 1 }, - {330 , 30720 , 165 , 2048 , 1 }, - {330 , 40960 , 165 , 8192 , 1 }, - {330 , 51200 , 165 , 2048 , 1 }, - {330 , 61440 , 165 , 2048 , 1 }, - {330 , 71680 , 165 , 2048 , 1 }, - {330 , 81920 , 165 , 16384 , 1 }, - {330 , 92160 , 165 , 92160 , 1 }, - {330 , 102400 , 165 , 4096 , 1 }, - {330 , 204800 , 165 , 8192 , 1 }, - {330 , 307200 , 165 , 2048 , 1 }, - {330 , 409600 , 165 , 8192 , 1 }, - {330 , 512000 , 165 , 4096 , 1 }, - {330 , 614400 , 165 , 8192 , 1 }, - {330 , 716800 , 165 , 4096 , 1 }, - {330 , 819200 , 165 , 16384 , 1 }, - {330 , 921600 , 165 , 4096 , 1 }, - {330 , 1024000, 165 , 8192 , 1 } +std::vector > dgemm_tn_mi250x = { + {1, 512, 3, 32, 1}, + {1, 1024, 3, 128, 1}, + {1, 1536, 3, 512, 1}, + {1, 2048, 3, 1024, 1}, + {1, 2560, 3, 512, 1}, + {1, 3072, 3, 1024, 1}, + {1, 3584, 3, 128, 1}, + {1, 4096, 3, 4096, 1}, + {1, 4608, 3, 512, 1}, + {1, 5120, 3, 512, 1}, + {1, 
5632, 3, 256, 1}, + {1, 6144, 3, 2048, 1}, + {1, 6656, 3, 256, 1}, + {1, 7168, 3, 512, 1}, + {1, 7680, 3, 512, 1}, + {1, 8192, 3, 2048, 1}, + {1, 8704, 3, 512, 1}, + {1, 9216, 3, 512, 1}, + {1, 9728, 3, 512, 1}, + {1, 10240, 3, 1024, 1}, + {1, 20480, 3, 2048, 1}, + {1, 30720, 3, 2048, 1}, + {1, 40960, 3, 4096, 1}, + {1, 51200, 3, 2048, 1}, + {1, 61440, 3, 4096, 1}, + {1, 71680, 3, 2048, 1}, + {1, 81920, 3, 8192, 1}, + {1, 92160, 3, 2048, 1}, + {1, 102400, 3, 2048, 1}, + {1, 204800, 3, 8192, 1}, + {1, 307200, 3, 4096, 1}, + {1, 409600, 3, 8192, 1}, + {1, 512000, 3, 2048, 1}, + {1, 614400, 3, 8192, 1}, + {1, 716800, 3, 2048, 1}, + {1, 819200, 3, 32768, 1}, + {1, 921600, 3, 2048, 1}, + {1, 1024000, 3, 2048, 1}, + {1, 512, 4, 64, 1}, + {1, 1024, 4, 128, 1}, + {1, 1536, 4, 64, 1}, + {1, 2048, 4, 256, 1}, + {1, 2560, 4, 128, 1}, + {1, 3072, 4, 128, 1}, + {1, 3584, 4, 512, 1}, + {1, 4096, 4, 256, 1}, + {1, 4608, 4, 512, 1}, + {1, 5120, 4, 1024, 1}, + {1, 5632, 4, 512, 1}, + {1, 6144, 4, 2048, 1}, + {1, 6656, 4, 512, 1}, + {1, 7168, 4, 1024, 1}, + {1, 7680, 4, 512, 1}, + {1, 8192, 4, 1024, 1}, + {1, 8704, 4, 512, 1}, + {1, 9216, 4, 512, 1}, + {1, 9728, 4, 512, 1}, + {1, 10240, 4, 512, 1}, + {1, 20480, 4, 2048, 1}, + {1, 30720, 4, 2048, 1}, + {1, 40960, 4, 4096, 1}, + {1, 51200, 4, 2048, 1}, + {1, 61440, 4, 2048, 1}, + {1, 71680, 4, 2048, 1}, + {1, 81920, 4, 4096, 1}, + {1, 92160, 4, 2048, 1}, + {1, 102400, 4, 2048, 1}, + {1, 204800, 4, 8192, 1}, + {1, 307200, 4, 4096, 1}, + {1, 409600, 4, 16384, 1}, + {1, 512000, 4, 1024, 1}, + {1, 614400, 4, 8192, 1}, + {1, 716800, 4, 1024, 1}, + {1, 819200, 4, 32768, 1}, + {1, 921600, 4, 2048, 1}, + {1, 1024000, 4, 2048, 1}, + {3, 512, 6, 128, 1}, + {3, 1024, 6, 64, 1}, + {3, 1536, 6, 512, 1}, + {3, 2048, 6, 2048, 1}, + {3, 2560, 6, 128, 1}, + {3, 3072, 6, 128, 1}, + {3, 3584, 6, 512, 1}, + {3, 4096, 6, 1024, 1}, + {3, 4608, 6, 512, 1}, + {3, 5120, 6, 512, 1}, + {3, 5632, 6, 512, 1}, + {3, 6144, 6, 256, 1}, + {3, 6656, 6, 512, 1}, + {3, 
7168, 6, 1024, 1}, + {3, 7680, 6, 512, 1}, + {3, 8192, 6, 1024, 1}, + {3, 8704, 6, 512, 1}, + {3, 9216, 6, 512, 1}, + {3, 9728, 6, 512, 1}, + {3, 10240, 6, 512, 1}, + {3, 20480, 6, 1024, 1}, + {3, 30720, 6, 1024, 1}, + {3, 40960, 6, 2048, 1}, + {3, 51200, 6, 2048, 1}, + {3, 61440, 6, 4096, 1}, + {3, 71680, 6, 2048, 1}, + {3, 81920, 6, 2048, 1}, + {3, 92160, 6, 2048, 1}, + {3, 102400, 6, 4096, 1}, + {3, 204800, 6, 8192, 1}, + {3, 307200, 6, 4096, 1}, + {3, 409600, 6, 16384, 1}, + {3, 512000, 6, 512, 1}, + {3, 614400, 6, 8192, 1}, + {3, 716800, 6, 1024, 1}, + {3, 819200, 6, 32768, 1}, + {3, 921600, 6, 512, 1}, + {3, 1024000, 6, 1024, 1}, + {4, 512, 10, 32, 1}, + {4, 1024, 10, 128, 1}, + {4, 1536, 10, 512, 1}, + {4, 2048, 10, 1024, 1}, + {4, 2560, 10, 128, 1}, + {4, 3072, 10, 128, 1}, + {4, 3584, 10, 256, 1}, + {4, 4096, 10, 1024, 1}, + {4, 4608, 10, 512, 1}, + {4, 5120, 10, 512, 1}, + {4, 5632, 10, 512, 1}, + {4, 6144, 10, 512, 1}, + {4, 6656, 10, 512, 1}, + {4, 7168, 10, 512, 1}, + {4, 7680, 10, 512, 1}, + {4, 8192, 10, 2048, 1}, + {4, 8704, 10, 512, 1}, + {4, 9216, 10, 512, 1}, + {4, 9728, 10, 512, 1}, + {4, 10240, 10, 1024, 1}, + {4, 20480, 10, 4096, 1}, + {4, 30720, 10, 2048, 1}, + {4, 40960, 10, 2048, 1}, + {4, 51200, 10, 2048, 1}, + {4, 61440, 10, 2048, 1}, + {4, 71680, 10, 2048, 1}, + {4, 81920, 10, 16384, 1}, + {4, 92160, 10, 2048, 1}, + {4, 102400, 10, 2048, 1}, + {4, 204800, 10, 4096, 1}, + {4, 307200, 10, 4096, 1}, + {4, 409600, 10, 16384, 1}, + {4, 512000, 10, 512, 1}, + {4, 614400, 10, 8192, 1}, + {4, 716800, 10, 4096, 1}, + {4, 819200, 10, 32768, 1}, + {4, 921600, 10, 512, 1}, + {4, 1024000, 10, 8192, 1}, + {6, 512, 10, 64, 1}, + {6, 1024, 10, 64, 1}, + {6, 1536, 10, 256, 1}, + {6, 2048, 10, 128, 1}, + {6, 2560, 10, 256, 1}, + {6, 3072, 10, 128, 1}, + {6, 3584, 10, 256, 1}, + {6, 4096, 10, 1024, 1}, + {6, 4608, 10, 256, 1}, + {6, 5120, 10, 512, 1}, + {6, 5632, 10, 512, 1}, + {6, 6144, 10, 1024, 1}, + {6, 6656, 10, 512, 1}, + {6, 7168, 10, 512, 1}, + {6, 
7680, 10, 512, 1}, + {6, 8192, 10, 512, 1}, + {6, 8704, 10, 512, 1}, + {6, 9216, 10, 512, 1}, + {6, 9728, 10, 256, 1}, + {6, 10240, 10, 1024, 1}, + {6, 20480, 10, 1024, 1}, + {6, 30720, 10, 1024, 1}, + {6, 40960, 10, 2048, 1}, + {6, 51200, 10, 1024, 1}, + {6, 61440, 10, 2048, 1}, + {6, 71680, 10, 2048, 1}, + {6, 81920, 10, 16384, 1}, + {6, 92160, 10, 2048, 1}, + {6, 102400, 10, 4096, 1}, + {6, 204800, 10, 4096, 1}, + {6, 307200, 10, 4096, 1}, + {6, 409600, 10, 16384, 1}, + {6, 512000, 10, 4096, 1}, + {6, 614400, 10, 8192, 1}, + {6, 716800, 10, 4096, 1}, + {6, 819200, 10, 16384, 1}, + {6, 921600, 10, 1024, 1}, + {6, 1024000, 10, 8192, 1}, + {12, 512, 15, 64, 1}, + {12, 1024, 15, 64, 1}, + {12, 1536, 15, 256, 1}, + {12, 2048, 15, 128, 1}, + {12, 2560, 15, 128, 1}, + {12, 3072, 15, 512, 1}, + {12, 3584, 15, 256, 1}, + {12, 4096, 15, 512, 1}, + {12, 4608, 15, 512, 1}, + {12, 5120, 15, 1024, 1}, + {12, 5632, 15, 256, 1}, + {12, 6144, 15, 256, 1}, + {12, 6656, 15, 512, 1}, + {12, 7168, 15, 512, 1}, + {12, 7680, 15, 512, 1}, + {12, 8192, 15, 2048, 1}, + {12, 8704, 15, 512, 1}, + {12, 9216, 15, 512, 1}, + {12, 9728, 15, 512, 1}, + {12, 10240, 15, 512, 1}, + {12, 20480, 15, 2048, 1}, + {12, 30720, 15, 1024, 1}, + {12, 40960, 15, 8192, 1}, + {12, 51200, 15, 2048, 1}, + {12, 61440, 15, 2048, 1}, + {12, 71680, 15, 1024, 1}, + {12, 81920, 15, 4096, 1}, + {12, 92160, 15, 2048, 1}, + {12, 102400, 15, 2048, 1}, + {12, 204800, 15, 4096, 1}, + {12, 307200, 15, 4096, 1}, + {12, 409600, 15, 8192, 1}, + {12, 512000, 15, 512, 1}, + {12, 614400, 15, 4096, 1}, + {12, 716800, 15, 4096, 1}, + {12, 819200, 15, 32768, 1}, + {12, 921600, 15, 2048, 1}, + {12, 1024000, 15, 4096, 1}, + {11, 512, 20, 64, 1}, + {11, 1024, 20, 128, 1}, + {11, 1536, 20, 512, 1}, + {11, 2048, 20, 256, 1}, + {11, 2560, 20, 128, 1}, + {11, 3072, 20, 256, 1}, + {11, 3584, 20, 256, 1}, + {11, 4096, 20, 256, 1}, + {11, 4608, 20, 128, 1}, + {11, 5120, 20, 256, 1}, + {11, 5632, 20, 512, 1}, + {11, 6144, 20, 2048, 1}, + {11, 
6656, 20, 256, 1}, + {11, 7168, 20, 256, 1}, + {11, 7680, 20, 256, 1}, + {11, 8192, 20, 8192, 1}, + {11, 8704, 20, 512, 1}, + {11, 9216, 20, 512, 1}, + {11, 9728, 20, 256, 1}, + {11, 10240, 20, 2048, 1}, + {11, 20480, 20, 2048, 1}, + {11, 30720, 20, 2048, 1}, + {11, 40960, 20, 8192, 1}, + {11, 51200, 20, 2048, 1}, + {11, 61440, 20, 4096, 1}, + {11, 71680, 20, 2048, 1}, + {11, 81920, 20, 16384, 1}, + {11, 92160, 20, 2048, 1}, + {11, 102400, 20, 4096, 1}, + {11, 204800, 20, 8192, 1}, + {11, 307200, 20, 4096, 1}, + {11, 409600, 20, 16384, 1}, + {11, 512000, 20, 512, 1}, + {11, 614400, 20, 8192, 1}, + {11, 716800, 20, 4096, 1}, + {11, 819200, 20, 16384, 1}, + {11, 921600, 20, 2048, 1}, + {11, 1024000, 20, 2048, 1}, + {16, 512, 21, 128, 1}, + {16, 1024, 21, 128, 1}, + {16, 1536, 21, 512, 1}, + {16, 2048, 21, 2048, 1}, + {16, 2560, 21, 256, 1}, + {16, 3072, 21, 256, 1}, + {16, 3584, 21, 128, 1}, + {16, 4096, 21, 256, 1}, + {16, 4608, 21, 64, 0}, + {16, 5120, 21, 32, 0}, + {16, 5632, 21, 128, 0}, + {16, 6144, 21, 512, 1}, + {16, 6656, 21, 256, 1}, + {16, 7168, 21, 1024, 1}, + {16, 7680, 21, 512, 1}, + {16, 8192, 21, 512, 1}, + {16, 8704, 21, 256, 1}, + {16, 9216, 21, 512, 1}, + {16, 9728, 21, 256, 1}, + {16, 10240, 21, 256, 1}, + {16, 20480, 21, 1024, 1}, + {16, 30720, 21, 2048, 1}, + {16, 40960, 21, 4096, 1}, + {16, 51200, 21, 2048, 1}, + {16, 61440, 21, 2048, 1}, + {16, 71680, 21, 2048, 1}, + {16, 81920, 21, 4096, 1}, + {16, 92160, 21, 256, 1}, + {16, 102400, 21, 4096, 1}, + {16, 204800, 21, 8192, 1}, + {16, 307200, 21, 4096, 1}, + {16, 409600, 21, 16384, 1}, + {16, 512000, 21, 512, 1}, + {16, 614400, 21, 8192, 1}, + {16, 716800, 21, 4096, 1}, + {16, 819200, 21, 32768, 1}, + {16, 921600, 21, 1024, 1}, + {16, 1024000, 21, 8192, 1}, + {25, 512, 28, 128, 1}, + {25, 1024, 28, 128, 1}, + {25, 1536, 28, 256, 1}, + {25, 2048, 28, 128, 1}, + {25, 2560, 28, 256, 1}, + {25, 3072, 28, 256, 0}, + {25, 3584, 28, 256, 0}, + {25, 4096, 28, 4096, 1}, + {25, 4608, 28, 32, 0}, + {25, 
5120, 28, 256, 1}, + {25, 5632, 28, 256, 1}, + {25, 6144, 28, 256, 1}, + {25, 6656, 28, 512, 1}, + {25, 7168, 28, 1024, 1}, + {25, 7680, 28, 256, 1}, + {25, 8192, 28, 4096, 1}, + {25, 8704, 28, 256, 1}, + {25, 9216, 28, 256, 1}, + {25, 9728, 28, 512, 1}, + {25, 10240, 28, 256, 1}, + {25, 20480, 28, 2048, 1}, + {25, 30720, 28, 1024, 1}, + {25, 40960, 28, 2048, 1}, + {25, 51200, 28, 1024, 1}, + {25, 61440, 28, 4096, 1}, + {25, 71680, 28, 2048, 1}, + {25, 81920, 28, 4096, 1}, + {25, 92160, 28, 2048, 1}, + {25, 102400, 28, 4096, 1}, + {25, 204800, 28, 4096, 1}, + {25, 307200, 28, 4096, 1}, + {25, 409600, 28, 8192, 1}, + {25, 512000, 28, 2048, 1}, + {25, 614400, 28, 8192, 1}, + {25, 716800, 28, 4096, 1}, + {25, 819200, 28, 32768, 1}, + {25, 921600, 28, 2048, 1}, + {25, 1024000, 28, 4096, 1}, + {24, 512, 35, 32, 1}, + {24, 1024, 35, 256, 1}, + {24, 1536, 35, 64, 1}, + {24, 2048, 35, 256, 1}, + {24, 2560, 35, 128, 0}, + {24, 3072, 35, 128, 0}, + {24, 3584, 35, 64, 0}, + {24, 4096, 35, 1024, 1}, + {24, 4608, 35, 256, 1}, + {24, 5120, 35, 256, 1}, + {24, 5632, 35, 256, 1}, + {24, 6144, 35, 256, 1}, + {24, 6656, 35, 256, 1}, + {24, 7168, 35, 256, 1}, + {24, 7680, 35, 256, 1}, + {24, 8192, 35, 2048, 1}, + {24, 8704, 35, 512, 1}, + {24, 9216, 35, 256, 1}, + {24, 9728, 35, 512, 1}, + {24, 10240, 35, 256, 1}, + {24, 20480, 35, 1024, 1}, + {24, 30720, 35, 1024, 1}, + {24, 40960, 35, 4096, 1}, + {24, 51200, 35, 2048, 1}, + {24, 61440, 35, 2048, 1}, + {24, 71680, 35, 2048, 1}, + {24, 81920, 35, 8192, 1}, + {24, 92160, 35, 2048, 1}, + {24, 102400, 35, 2048, 1}, + {24, 204800, 35, 8192, 1}, + {24, 307200, 35, 4096, 1}, + {24, 409600, 35, 16384, 1}, + {24, 512000, 35, 2048, 1}, + {24, 614400, 35, 8192, 1}, + {24, 716800, 35, 4096, 1}, + {24, 819200, 35, 32768, 1}, + {24, 921600, 35, 4096, 1}, + {24, 1024000, 35, 8192, 1}, + {33, 512, 36, 128, 0}, + {33, 1024, 36, 64, 1}, + {33, 1536, 36, 32, 0}, + {33, 2048, 36, 64, 0}, + {33, 2560, 36, 64, 0}, + {33, 3072, 36, 128, 0}, + {33, 3584, 
36, 128, 0}, + {33, 4096, 36, 256, 1}, + {33, 4608, 36, 256, 1}, + {33, 5120, 36, 256, 1}, + {33, 5632, 36, 256, 1}, + {33, 6144, 36, 1024, 1}, + {33, 6656, 36, 256, 1}, + {33, 7168, 36, 256, 1}, + {33, 7680, 36, 256, 1}, + {33, 8192, 36, 8192, 1}, + {33, 8704, 36, 512, 1}, + {33, 9216, 36, 1024, 1}, + {33, 9728, 36, 256, 1}, + {33, 10240, 36, 256, 1}, + {33, 20480, 36, 2048, 1}, + {33, 30720, 36, 1024, 1}, + {33, 40960, 36, 8192, 1}, + {33, 51200, 36, 1024, 1}, + {33, 61440, 36, 2048, 1}, + {33, 71680, 36, 2048, 1}, + {33, 81920, 36, 16384, 1}, + {33, 92160, 36, 2048, 1}, + {33, 102400, 36, 2048, 1}, + {33, 204800, 36, 8192, 1}, + {33, 307200, 36, 4096, 1}, + {33, 409600, 36, 16384, 1}, + {33, 512000, 36, 4096, 1}, + {33, 614400, 36, 8192, 1}, + {33, 716800, 36, 4096, 1}, + {33, 819200, 36, 32768, 1}, + {33, 921600, 36, 4096, 1}, + {33, 1024000, 36, 8192, 1}, + {42, 512, 45, 128, 0}, + {42, 1024, 45, 32, 0}, + {42, 1536, 45, 32, 0}, + {42, 2048, 45, 64, 0}, + {42, 2560, 45, 64, 0}, + {42, 3072, 45, 1024, 1}, + {42, 3584, 45, 256, 1}, + {42, 4096, 45, 1024, 1}, + {42, 4608, 45, 128, 1}, + {42, 5120, 45, 1024, 1}, + {42, 5632, 45, 256, 1}, + {42, 6144, 45, 256, 1}, + {42, 6656, 45, 256, 1}, + {42, 7168, 45, 1024, 1}, + {42, 7680, 45, 512, 1}, + {42, 8192, 45, 4096, 1}, + {42, 8704, 45, 512, 1}, + {42, 9216, 45, 512, 1}, + {42, 9728, 45, 256, 1}, + {42, 10240, 45, 2048, 1}, + {42, 20480, 45, 512, 1}, + {42, 30720, 45, 1024, 1}, + {42, 40960, 45, 8192, 1}, + {42, 51200, 45, 2048, 1}, + {42, 61440, 45, 2048, 1}, + {42, 71680, 45, 2048, 1}, + {42, 81920, 45, 4096, 1}, + {42, 92160, 45, 2048, 1}, + {42, 102400, 45, 4096, 1}, + {42, 204800, 45, 8192, 1}, + {42, 307200, 45, 4096, 1}, + {42, 409600, 45, 16384, 1}, + {42, 512000, 45, 4096, 1}, + {42, 614400, 45, 8192, 1}, + {42, 716800, 45, 4096, 1}, + {42, 819200, 45, 32768, 1}, + {42, 921600, 45, 2048, 1}, + {42, 1024000, 45, 8192, 1}, + {43, 512, 56, 128, 0}, + {43, 1024, 56, 128, 0}, + {43, 1536, 56, 64, 0}, + {43, 2048, 
56, 128, 0}, + {43, 2560, 56, 128, 1}, + {43, 3072, 56, 128, 1}, + {43, 3584, 56, 128, 1}, + {43, 4096, 56, 4096, 1}, + {43, 4608, 56, 128, 1}, + {43, 5120, 56, 128, 1}, + {43, 5632, 56, 256, 1}, + {43, 6144, 56, 1024, 1}, + {43, 6656, 56, 256, 1}, + {43, 7168, 56, 1024, 1}, + {43, 7680, 56, 128, 1}, + {43, 8192, 56, 8192, 1}, + {43, 8704, 56, 512, 1}, + {43, 9216, 56, 1024, 1}, + {43, 9728, 56, 512, 1}, + {43, 10240, 56, 256, 1}, + {43, 20480, 56, 1024, 1}, + {43, 30720, 56, 1024, 1}, + {43, 40960, 56, 1024, 1}, + {43, 51200, 56, 2048, 1}, + {43, 61440, 56, 1024, 1}, + {43, 71680, 56, 2048, 1}, + {43, 81920, 56, 8192, 1}, + {43, 92160, 56, 2048, 1}, + {43, 102400, 56, 4096, 1}, + {43, 204800, 56, 8192, 1}, + {43, 307200, 56, 4096, 1}, + {43, 409600, 56, 16384, 1}, + {43, 512000, 56, 4096, 1}, + {43, 614400, 56, 8192, 1}, + {43, 716800, 56, 4096, 1}, + {43, 819200, 56, 16384, 1}, + {43, 921600, 56, 4096, 1}, + {43, 1024000, 56, 8192, 1}, + {126, 512, 84, 64, 0}, + {126, 1024, 84, 64, 1}, + {126, 1536, 84, 512, 1}, + {126, 2048, 84, 64, 1}, + {126, 2560, 84, 128, 1}, + {126, 3072, 84, 1024, 1}, + {126, 3584, 84, 256, 1}, + {126, 4096, 84, 256, 1}, + {126, 4608, 84, 256, 1}, + {126, 5120, 84, 1024, 1}, + {126, 5632, 84, 512, 1}, + {126, 6144, 84, 2048, 1}, + {126, 6656, 84, 256, 1}, + {126, 7168, 84, 1024, 1}, + {126, 7680, 84, 512, 1}, + {126, 8192, 84, 8192, 1}, + {126, 8704, 84, 128, 1}, + {126, 9216, 84, 1024, 1}, + {126, 9728, 84, 512, 1}, + {126, 10240, 84, 2048, 1}, + {126, 20480, 84, 1024, 1}, + {126, 30720, 84, 1024, 1}, + {126, 40960, 84, 2048, 1}, + {126, 51200, 84, 1024, 1}, + {126, 61440, 84, 2048, 1}, + {126, 71680, 84, 71680, 1}, + {126, 81920, 84, 81920, 1}, + {126, 92160, 84, 92160, 1}, + {126, 102400, 84, 102400, 1}, + {126, 204800, 84, 204800, 1}, + {126, 307200, 84, 307200, 1}, + {126, 409600, 84, 409600, 1}, + {126, 512000, 84, 512000, 1}, + {126, 614400, 84, 614400, 1}, + {126, 716800, 84, 716800, 1}, + {126, 819200, 84, 819200, 1}, + {126, 
921600, 84, 921600, 1}, + {126, 1024000, 84, 1024000, 1}, + {210, 512, 120, 32, 0}, + {210, 1024, 120, 1024, 1}, + {210, 1536, 120, 512, 1}, + {210, 2048, 120, 64, 1}, + {210, 2560, 120, 128, 1}, + {210, 3072, 120, 512, 1}, + {210, 3584, 120, 512, 1}, + {210, 4096, 120, 256, 1}, + {210, 4608, 120, 256, 1}, + {210, 5120, 120, 512, 1}, + {210, 5632, 120, 256, 1}, + {210, 6144, 120, 2048, 1}, + {210, 6656, 120, 128, 1}, + {210, 7168, 120, 256, 1}, + {210, 7680, 120, 512, 1}, + {210, 8192, 120, 8192, 1}, + {210, 8704, 120, 256, 1}, + {210, 9216, 120, 512, 1}, + {210, 9728, 120, 256, 1}, + {210, 10240, 120, 1024, 1}, + {210, 20480, 120, 20480, 1}, + {210, 30720, 120, 30720, 1}, + {210, 40960, 120, 40960, 1}, + {210, 51200, 120, 51200, 1}, + {210, 61440, 120, 61440, 1}, + {210, 71680, 120, 71680, 1}, + {210, 81920, 120, 81920, 1}, + {210, 92160, 120, 92160, 1}, + {210, 102400, 120, 102400, 1}, + {210, 204800, 120, 204800, 1}, + {210, 307200, 120, 307200, 1}, + {210, 409600, 120, 409600, 1}, + {210, 512000, 120, 512000, 1}, + {210, 614400, 120, 614400, 1}, + {210, 716800, 120, 716800, 1}, + {210, 819200, 120, 819200, 1}, + {210, 921600, 120, 921600, 1}, + {210, 1024000, 120, 1024000, 1}, + {330, 512, 165, 128, 0}, + {330, 1024, 165, 256, 1}, + {330, 1536, 165, 512, 1}, + {330, 2048, 165, 2048, 1}, + {330, 2560, 165, 512, 1}, + {330, 3072, 165, 1024, 1}, + {330, 3584, 165, 512, 1}, + {330, 4096, 165, 2048, 1}, + {330, 4608, 165, 128, 1}, + {330, 5120, 165, 512, 1}, + {330, 5632, 165, 512, 1}, + {330, 6144, 165, 2048, 1}, + {330, 6656, 165, 512, 1}, + {330, 7168, 165, 512, 1}, + {330, 7680, 165, 512, 1}, + {330, 8192, 165, 8192, 1}, + {330, 8704, 165, 512, 1}, + {330, 9216, 165, 1024, 1}, + {330, 9728, 165, 512, 1}, + {330, 10240, 165, 2048, 1}, + {330, 20480, 165, 2048, 1}, + {330, 30720, 165, 2048, 1}, + {330, 40960, 165, 8192, 1}, + {330, 51200, 165, 2048, 1}, + {330, 61440, 165, 2048, 1}, + {330, 71680, 165, 2048, 1}, + {330, 81920, 165, 16384, 1}, + {330, 92160, 165, 
92160, 1}, + {330, 102400, 165, 4096, 1}, + {330, 204800, 165, 8192, 1}, + {330, 307200, 165, 2048, 1}, + {330, 409600, 165, 8192, 1}, + {330, 512000, 165, 4096, 1}, + {330, 614400, 165, 8192, 1}, + {330, 716800, 165, 4096, 1}, + {330, 819200, 165, 16384, 1}, + {330, 921600, 165, 4096, 1}, + {330, 1024000, 165, 8192, 1} }; - diff --git a/backends/magma/gemm_tuning/v100.h b/backends/magma/gemm_tuning/v100.h index dd679f73d0..c0185625c3 100644 --- a/backends/magma/gemm_tuning/v100.h +++ b/backends/magma/gemm_tuning/v100.h @@ -2,2454 +2,2449 @@ // auto-generated from data on v100-cuda11.2 //////////////////////////////////////////////////////////////////////////////// -std::vector< std::array > sgemm_nn_v100 = -{ - {3 , 512 , 1 , 64 , 0 }, - {3 , 1024 , 1 , 32 , 0 }, - {3 , 1536 , 1 , 32 , 0 }, - {3 , 2048 , 1 , 64 , 0 }, - {3 , 2560 , 1 , 128 , 0 }, - {3 , 3072 , 1 , 64 , 0 }, - {3 , 3584 , 1 , 64 , 0 }, - {3 , 4096 , 1 , 64 , 0 }, - {3 , 4608 , 1 , 32 , 0 }, - {3 , 5120 , 1 , 32 , 0 }, - {3 , 5632 , 1 , 32 , 0 }, - {3 , 6144 , 1 , 32 , 0 }, - {3 , 6656 , 1 , 64 , 0 }, - {3 , 7168 , 1 , 32 , 0 }, - {3 , 7680 , 1 , 32 , 0 }, - {3 , 8192 , 1 , 1024 , 0 }, - {3 , 8704 , 1 , 512 , 0 }, - {3 , 9216 , 1 , 1024 , 0 }, - {3 , 9728 , 1 , 64 , 0 }, - {3 , 10240 , 1 , 32 , 0 }, - {3 , 20480 , 1 , 32 , 0 }, - {3 , 30720 , 1 , 64 , 0 }, - {3 , 40960 , 1 , 32 , 0 }, - {3 , 51200 , 1 , 64 , 0 }, - {3 , 61440 , 1 , 64 , 0 }, - {3 , 71680 , 1 , 32 , 0 }, - {3 , 81920 , 1 , 64 , 0 }, - {3 , 92160 , 1 , 64 , 0 }, - {3 , 102400 , 1 , 32 , 0 }, - {3 , 204800 , 1 , 64 , 0 }, - {3 , 307200 , 1 , 64 , 0 }, - {3 , 409600 , 1 , 32 , 0 }, - {3 , 512000 , 1 , 32 , 0 }, - {3 , 614400 , 1 , 64 , 0 }, - {3 , 716800 , 1 , 32 , 0 }, - {3 , 819200 , 1 , 64 , 0 }, - {3 , 921600 , 1 , 64 , 0 }, - {3 , 1024000, 1 , 64 , 0 }, - {4 , 512 , 1 , 32 , 0 }, - {4 , 1024 , 1 , 32 , 0 }, - {4 , 1536 , 1 , 32 , 0 }, - {4 , 2048 , 1 , 64 , 0 }, - {4 , 2560 , 1 , 32 , 0 }, - {4 , 3072 , 1 , 64 , 0 }, - {4 , 3584 , 
1 , 64 , 0 }, - {4 , 4096 , 1 , 64 , 0 }, - {4 , 4608 , 1 , 128 , 0 }, - {4 , 5120 , 1 , 32 , 0 }, - {4 , 5632 , 1 , 32 , 0 }, - {4 , 6144 , 1 , 32 , 0 }, - {4 , 6656 , 1 , 32 , 0 }, - {4 , 7168 , 1 , 64 , 0 }, - {4 , 7680 , 1 , 256 , 0 }, - {4 , 8192 , 1 , 256 , 0 }, - {4 , 8704 , 1 , 64 , 0 }, - {4 , 9216 , 1 , 64 , 0 }, - {4 , 9728 , 1 , 512 , 0 }, - {4 , 10240 , 1 , 512 , 0 }, - {4 , 20480 , 1 , 64 , 0 }, - {4 , 30720 , 1 , 64 , 0 }, - {4 , 40960 , 1 , 32 , 0 }, - {4 , 51200 , 1 , 64 , 0 }, - {4 , 61440 , 1 , 64 , 0 }, - {4 , 71680 , 1 , 64 , 0 }, - {4 , 81920 , 1 , 32 , 0 }, - {4 , 92160 , 1 , 64 , 0 }, - {4 , 102400 , 1 , 32 , 0 }, - {4 , 204800 , 1 , 32 , 0 }, - {4 , 307200 , 1 , 32 , 0 }, - {4 , 409600 , 1 , 64 , 0 }, - {4 , 512000 , 1 , 32 , 0 }, - {4 , 614400 , 1 , 64 , 0 }, - {4 , 716800 , 1 , 32 , 0 }, - {4 , 819200 , 1 , 64 , 0 }, - {4 , 921600 , 1 , 64 , 0 }, - {4 , 1024000, 1 , 32 , 0 }, - {6 , 512 , 3 , 64 , 1 }, - {6 , 1024 , 3 , 32 , 1 }, - {6 , 1536 , 3 , 64 , 1 }, - {6 , 2048 , 3 , 32 , 1 }, - {6 , 2560 , 3 , 32 , 1 }, - {6 , 3072 , 3 , 512 , 1 }, - {6 , 3584 , 3 , 64 , 1 }, - {6 , 4096 , 3 , 32 , 1 }, - {6 , 4608 , 3 , 32 , 1 }, - {6 , 5120 , 3 , 64 , 1 }, - {6 , 5632 , 3 , 32 , 1 }, - {6 , 6144 , 3 , 64 , 1 }, - {6 , 6656 , 3 , 64 , 1 }, - {6 , 7168 , 3 , 128 , 1 }, - {6 , 7680 , 3 , 64 , 1 }, - {6 , 8192 , 3 , 32 , 1 }, - {6 , 8704 , 3 , 32 , 1 }, - {6 , 9216 , 3 , 32 , 1 }, - {6 , 9728 , 3 , 64 , 1 }, - {6 , 10240 , 3 , 64 , 1 }, - {6 , 20480 , 3 , 32 , 1 }, - {6 , 30720 , 3 , 64 , 1 }, - {6 , 40960 , 3 , 64 , 1 }, - {6 , 51200 , 3 , 64 , 1 }, - {6 , 61440 , 3 , 64 , 1 }, - {6 , 71680 , 3 , 1024 , 1 }, - {6 , 81920 , 3 , 64 , 1 }, - {6 , 92160 , 3 , 64 , 1 }, - {6 , 102400 , 3 , 64 , 1 }, - {6 , 204800 , 3 , 512 , 1 }, - {6 , 307200 , 3 , 2048 , 1 }, - {6 , 409600 , 3 , 2048 , 1 }, - {6 , 512000 , 3 , 2048 , 1 }, - {6 , 614400 , 3 , 8192 , 1 }, - {6 , 716800 , 3 , 2048 , 1 }, - {6 , 819200 , 3 , 32768 , 1 }, - {6 , 921600 , 3 , 4096 , 1 }, - 
{6 , 1024000, 3 , 8192 , 1 }, - {10 , 512 , 4 , 64 , 1 }, - {10 , 1024 , 4 , 32 , 1 }, - {10 , 1536 , 4 , 32 , 1 }, - {10 , 2048 , 4 , 64 , 1 }, - {10 , 2560 , 4 , 64 , 1 }, - {10 , 3072 , 4 , 32 , 1 }, - {10 , 3584 , 4 , 64 , 1 }, - {10 , 4096 , 4 , 64 , 1 }, - {10 , 4608 , 4 , 64 , 1 }, - {10 , 5120 , 4 , 32 , 1 }, - {10 , 5632 , 4 , 64 , 1 }, - {10 , 6144 , 4 , 32 , 1 }, - {10 , 6656 , 4 , 64 , 1 }, - {10 , 7168 , 4 , 32 , 1 }, - {10 , 7680 , 4 , 64 , 1 }, - {10 , 8192 , 4 , 64 , 1 }, - {10 , 8704 , 4 , 64 , 1 }, - {10 , 9216 , 4 , 32 , 1 }, - {10 , 9728 , 4 , 32 , 1 }, - {10 , 10240 , 4 , 32 , 1 }, - {10 , 20480 , 4 , 64 , 1 }, - {10 , 30720 , 4 , 64 , 1 }, - {10 , 40960 , 4 , 512 , 1 }, - {10 , 51200 , 4 , 64 , 1 }, - {10 , 61440 , 4 , 64 , 1 }, - {10 , 71680 , 4 , 64 , 1 }, - {10 , 81920 , 4 , 16384 , 1 }, - {10 , 92160 , 4 , 64 , 1 }, - {10 , 102400 , 4 , 64 , 1 }, - {10 , 204800 , 4 , 8192 , 1 }, - {10 , 307200 , 4 , 4096 , 1 }, - {10 , 409600 , 4 , 8192 , 1 }, - {10 , 512000 , 4 , 4096 , 1 }, - {10 , 614400 , 4 , 8192 , 1 }, - {10 , 716800 , 4 , 1024 , 1 }, - {10 , 819200 , 4 , 32768 , 1 }, - {10 , 921600 , 4 , 4096 , 1 }, - {10 , 1024000, 4 , 8192 , 1 }, - {10 , 512 , 6 , 64 , 1 }, - {10 , 1024 , 6 , 64 , 1 }, - {10 , 1536 , 6 , 64 , 1 }, - {10 , 2048 , 6 , 64 , 1 }, - {10 , 2560 , 6 , 32 , 1 }, - {10 , 3072 , 6 , 32 , 1 }, - {10 , 3584 , 6 , 128 , 1 }, - {10 , 4096 , 6 , 128 , 1 }, - {10 , 4608 , 6 , 128 , 1 }, - {10 , 5120 , 6 , 32 , 1 }, - {10 , 5632 , 6 , 32 , 1 }, - {10 , 6144 , 6 , 32 , 1 }, - {10 , 6656 , 6 , 32 , 1 }, - {10 , 7168 , 6 , 32 , 1 }, - {10 , 7680 , 6 , 256 , 1 }, - {10 , 8192 , 6 , 64 , 1 }, - {10 , 8704 , 6 , 32 , 1 }, - {10 , 9216 , 6 , 64 , 1 }, - {10 , 9728 , 6 , 32 , 1 }, - {10 , 10240 , 6 , 512 , 1 }, - {10 , 20480 , 6 , 64 , 1 }, - {10 , 30720 , 6 , 64 , 1 }, - {10 , 40960 , 6 , 64 , 1 }, - {10 , 51200 , 6 , 64 , 1 }, - {10 , 61440 , 6 , 64 , 1 }, - {10 , 71680 , 6 , 64 , 1 }, - {10 , 81920 , 6 , 64 , 1 }, - {10 , 92160 , 6 , 
512 , 1 }, - {10 , 102400 , 6 , 64 , 1 }, - {10 , 204800 , 6 , 512 , 1 }, - {10 , 307200 , 6 , 4096 , 1 }, - {10 , 409600 , 6 , 8192 , 1 }, - {10 , 512000 , 6 , 4096 , 1 }, - {10 , 614400 , 6 , 8192 , 1 }, - {10 , 716800 , 6 , 4096 , 1 }, - {10 , 819200 , 6 , 16384 , 1 }, - {10 , 921600 , 6 , 4096 , 1 }, - {10 , 1024000, 6 , 8192 , 1 }, - {15 , 512 , 12 , 32 , 1 }, - {15 , 1024 , 12 , 64 , 1 }, - {15 , 1536 , 12 , 32 , 1 }, - {15 , 2048 , 12 , 32 , 1 }, - {15 , 2560 , 12 , 256 , 1 }, - {15 , 3072 , 12 , 32 , 1 }, - {15 , 3584 , 12 , 32 , 1 }, - {15 , 4096 , 12 , 32 , 1 }, - {15 , 4608 , 12 , 64 , 1 }, - {15 , 5120 , 12 , 32 , 1 }, - {15 , 5632 , 12 , 32 , 1 }, - {15 , 6144 , 12 , 64 , 1 }, - {15 , 6656 , 12 , 32 , 1 }, - {15 , 7168 , 12 , 64 , 1 }, - {15 , 7680 , 12 , 64 , 1 }, - {15 , 8192 , 12 , 64 , 1 }, - {15 , 8704 , 12 , 32 , 1 }, - {15 , 9216 , 12 , 64 , 1 }, - {15 , 9728 , 12 , 32 , 1 }, - {15 , 10240 , 12 , 32 , 1 }, - {15 , 20480 , 12 , 64 , 1 }, - {15 , 30720 , 12 , 32 , 1 }, - {15 , 40960 , 12 , 64 , 1 }, - {15 , 51200 , 12 , 64 , 1 }, - {15 , 61440 , 12 , 64 , 1 }, - {15 , 71680 , 12 , 64 , 1 }, - {15 , 81920 , 12 , 64 , 1 }, - {15 , 92160 , 12 , 64 , 1 }, - {15 , 102400 , 12 , 64 , 1 }, - {15 , 204800 , 12 , 8192 , 1 }, - {15 , 307200 , 12 , 4096 , 1 }, - {15 , 409600 , 12 , 16384 , 1 }, - {15 , 512000 , 12 , 4096 , 1 }, - {15 , 614400 , 12 , 8192 , 1 }, - {15 , 716800 , 12 , 512 , 1 }, - {15 , 819200 , 12 , 32768 , 1 }, - {15 , 921600 , 12 , 4096 , 1 }, - {15 , 1024000, 12 , 8192 , 1 }, - {20 , 512 , 11 , 64 , 1 }, - {20 , 1024 , 11 , 32 , 1 }, - {20 , 1536 , 11 , 64 , 1 }, - {20 , 2048 , 11 , 64 , 1 }, - {20 , 2560 , 11 , 128 , 1 }, - {20 , 3072 , 11 , 64 , 1 }, - {20 , 3584 , 11 , 32 , 1 }, - {20 , 4096 , 11 , 32 , 1 }, - {20 , 4608 , 11 , 64 , 1 }, - {20 , 5120 , 11 , 32 , 1 }, - {20 , 5632 , 11 , 64 , 1 }, - {20 , 6144 , 11 , 64 , 1 }, - {20 , 6656 , 11 , 64 , 1 }, - {20 , 7168 , 11 , 32 , 1 }, - {20 , 7680 , 11 , 32 , 1 }, - {20 , 8192 , 11 , 32 
, 1 }, - {20 , 8704 , 11 , 64 , 1 }, - {20 , 9216 , 11 , 32 , 1 }, - {20 , 9728 , 11 , 32 , 1 }, - {20 , 10240 , 11 , 32 , 1 }, - {20 , 20480 , 11 , 64 , 1 }, - {20 , 30720 , 11 , 64 , 1 }, - {20 , 40960 , 11 , 64 , 1 }, - {20 , 51200 , 11 , 64 , 1 }, - {20 , 61440 , 11 , 64 , 1 }, - {20 , 71680 , 11 , 64 , 1 }, - {20 , 81920 , 11 , 64 , 1 }, - {20 , 92160 , 11 , 32 , 1 }, - {20 , 102400 , 11 , 64 , 1 }, - {20 , 204800 , 11 , 4096 , 1 }, - {20 , 307200 , 11 , 512 , 1 }, - {20 , 409600 , 11 , 16384 , 1 }, - {20 , 512000 , 11 , 4096 , 1 }, - {20 , 614400 , 11 , 8192 , 1 }, - {20 , 716800 , 11 , 4096 , 1 }, - {20 , 819200 , 11 , 32768 , 1 }, - {20 , 921600 , 11 , 2048 , 1 }, - {20 , 1024000, 11 , 8192 , 1 }, - {21 , 512 , 16 , 32 , 1 }, - {21 , 1024 , 16 , 128 , 1 }, - {21 , 1536 , 16 , 32 , 1 }, - {21 , 2048 , 16 , 64 , 1 }, - {21 , 2560 , 16 , 32 , 1 }, - {21 , 3072 , 16 , 64 , 1 }, - {21 , 3584 , 16 , 64 , 1 }, - {21 , 4096 , 16 , 32 , 1 }, - {21 , 4608 , 16 , 32 , 1 }, - {21 , 5120 , 16 , 32 , 1 }, - {21 , 5632 , 16 , 64 , 1 }, - {21 , 6144 , 16 , 32 , 1 }, - {21 , 6656 , 16 , 64 , 1 }, - {21 , 7168 , 16 , 32 , 1 }, - {21 , 7680 , 16 , 64 , 1 }, - {21 , 8192 , 16 , 32 , 1 }, - {21 , 8704 , 16 , 32 , 1 }, - {21 , 9216 , 16 , 32 , 1 }, - {21 , 9728 , 16 , 32 , 1 }, - {21 , 10240 , 16 , 32 , 1 }, - {21 , 20480 , 16 , 64 , 1 }, - {21 , 30720 , 16 , 32 , 1 }, - {21 , 40960 , 16 , 64 , 1 }, - {21 , 51200 , 16 , 64 , 1 }, - {21 , 61440 , 16 , 64 , 1 }, - {21 , 71680 , 16 , 64 , 1 }, - {21 , 81920 , 16 , 64 , 1 }, - {21 , 92160 , 16 , 64 , 1 }, - {21 , 102400 , 16 , 64 , 1 }, - {21 , 204800 , 16 , 8192 , 1 }, - {21 , 307200 , 16 , 4096 , 1 }, - {21 , 409600 , 16 , 16384 , 1 }, - {21 , 512000 , 16 , 4096 , 1 }, - {21 , 614400 , 16 , 8192 , 1 }, - {21 , 716800 , 16 , 4096 , 1 }, - {21 , 819200 , 16 , 32768 , 1 }, - {21 , 921600 , 16 , 4096 , 1 }, - {21 , 1024000, 16 , 8192 , 1 }, - {28 , 512 , 25 , 64 , 1 }, - {28 , 1024 , 25 , 32 , 1 }, - {28 , 1536 , 25 , 64 , 0 }, - {28 
, 2048 , 25 , 64 , 1 }, - {28 , 2560 , 25 , 32 , 1 }, - {28 , 3072 , 25 , 64 , 1 }, - {28 , 3584 , 25 , 32 , 1 }, - {28 , 4096 , 25 , 32 , 1 }, - {28 , 4608 , 25 , 32 , 1 }, - {28 , 5120 , 25 , 32 , 1 }, - {28 , 5632 , 25 , 32 , 1 }, - {28 , 6144 , 25 , 32 , 1 }, - {28 , 6656 , 25 , 64 , 1 }, - {28 , 7168 , 25 , 64 , 1 }, - {28 , 7680 , 25 , 64 , 1 }, - {28 , 8192 , 25 , 64 , 1 }, - {28 , 8704 , 25 , 32 , 1 }, - {28 , 9216 , 25 , 32 , 1 }, - {28 , 9728 , 25 , 32 , 1 }, - {28 , 10240 , 25 , 32 , 1 }, - {28 , 20480 , 25 , 32 , 1 }, - {28 , 30720 , 25 , 32 , 1 }, - {28 , 40960 , 25 , 64 , 1 }, - {28 , 51200 , 25 , 64 , 1 }, - {28 , 61440 , 25 , 64 , 1 }, - {28 , 71680 , 25 , 64 , 1 }, - {28 , 81920 , 25 , 64 , 1 }, - {28 , 92160 , 25 , 64 , 1 }, - {28 , 102400 , 25 , 4096 , 1 }, - {28 , 204800 , 25 , 8192 , 1 }, - {28 , 307200 , 25 , 4096 , 1 }, - {28 , 409600 , 25 , 16384 , 1 }, - {28 , 512000 , 25 , 4096 , 1 }, - {28 , 614400 , 25 , 4096 , 1 }, - {28 , 716800 , 25 , 2048 , 1 }, - {28 , 819200 , 25 , 32768 , 1 }, - {28 , 921600 , 25 , 4096 , 1 }, - {28 , 1024000, 25 , 4096 , 1 }, - {35 , 512 , 24 , 32 , 1 }, - {35 , 1024 , 24 , 64 , 0 }, - {35 , 1536 , 24 , 64 , 1 }, - {35 , 2048 , 24 , 32 , 0 }, - {35 , 2560 , 24 , 32 , 1 }, - {35 , 3072 , 24 , 32 , 1 }, - {35 , 3584 , 24 , 64 , 1 }, - {35 , 4096 , 24 , 32 , 1 }, - {35 , 4608 , 24 , 64 , 1 }, - {35 , 5120 , 24 , 32 , 1 }, - {35 , 5632 , 24 , 64 , 1 }, - {35 , 6144 , 24 , 32 , 1 }, - {35 , 6656 , 24 , 32 , 1 }, - {35 , 7168 , 24 , 32 , 1 }, - {35 , 7680 , 24 , 32 , 1 }, - {35 , 8192 , 24 , 64 , 1 }, - {35 , 8704 , 24 , 32 , 1 }, - {35 , 9216 , 24 , 32 , 1 }, - {35 , 9728 , 24 , 32 , 1 }, - {35 , 10240 , 24 , 32 , 1 }, - {35 , 20480 , 24 , 32 , 1 }, - {35 , 30720 , 24 , 32 , 1 }, - {35 , 40960 , 24 , 32 , 1 }, - {35 , 51200 , 24 , 64 , 1 }, - {35 , 61440 , 24 , 64 , 1 }, - {35 , 71680 , 24 , 64 , 1 }, - {35 , 81920 , 24 , 16384 , 1 }, - {35 , 92160 , 24 , 64 , 1 }, - {35 , 102400 , 24 , 4096 , 1 }, - {35 , 204800 , 24 
, 8192 , 1 }, - {35 , 307200 , 24 , 2048 , 1 }, - {35 , 409600 , 24 , 8192 , 1 }, - {35 , 512000 , 24 , 4096 , 1 }, - {35 , 614400 , 24 , 8192 , 1 }, - {35 , 716800 , 24 , 2048 , 1 }, - {35 , 819200 , 24 , 16384 , 1 }, - {35 , 921600 , 24 , 4096 , 1 }, - {35 , 1024000, 24 , 8192 , 1 }, - {36 , 512 , 33 , 32 , 1 }, - {36 , 1024 , 33 , 64 , 1 }, - {36 , 1536 , 33 , 64 , 0 }, - {36 , 2048 , 33 , 32 , 1 }, - {36 , 2560 , 33 , 64 , 1 }, - {36 , 3072 , 33 , 64 , 1 }, - {36 , 3584 , 33 , 64 , 1 }, - {36 , 4096 , 33 , 32 , 1 }, - {36 , 4608 , 33 , 32 , 1 }, - {36 , 5120 , 33 , 32 , 1 }, - {36 , 5632 , 33 , 32 , 1 }, - {36 , 6144 , 33 , 64 , 1 }, - {36 , 6656 , 33 , 32 , 1 }, - {36 , 7168 , 33 , 32 , 1 }, - {36 , 7680 , 33 , 64 , 1 }, - {36 , 8192 , 33 , 64 , 1 }, - {36 , 8704 , 33 , 64 , 1 }, - {36 , 9216 , 33 , 64 , 1 }, - {36 , 9728 , 33 , 32 , 1 }, - {36 , 10240 , 33 , 64 , 1 }, - {36 , 20480 , 33 , 64 , 1 }, - {36 , 30720 , 33 , 32 , 1 }, - {36 , 40960 , 33 , 32 , 1 }, - {36 , 51200 , 33 , 32 , 1 }, - {36 , 61440 , 33 , 32 , 1 }, - {36 , 71680 , 33 , 64 , 1 }, - {36 , 81920 , 33 , 8192 , 1 }, - {36 , 92160 , 33 , 2048 , 1 }, - {36 , 102400 , 33 , 2048 , 1 }, - {36 , 204800 , 33 , 8192 , 1 }, - {36 , 307200 , 33 , 2048 , 1 }, - {36 , 409600 , 33 , 8192 , 1 }, - {36 , 512000 , 33 , 2048 , 1 }, - {36 , 614400 , 33 , 8192 , 1 }, - {36 , 716800 , 33 , 2048 , 1 }, - {36 , 819200 , 33 , 4096 , 1 }, - {36 , 921600 , 33 , 4096 , 1 }, - {36 , 1024000, 33 , 4096 , 1 }, - {45 , 512 , 42 , 32 , 1 }, - {45 , 1024 , 42 , 64 , 0 }, - {45 , 1536 , 42 , 64 , 0 }, - {45 , 2048 , 42 , 32 , 1 }, - {45 , 2560 , 42 , 64 , 0 }, - {45 , 3072 , 42 , 32 , 1 }, - {45 , 3584 , 42 , 32 , 1 }, - {45 , 4096 , 42 , 32 , 1 }, - {45 , 4608 , 42 , 64 , 1 }, - {45 , 5120 , 42 , 32 , 1 }, - {45 , 5632 , 42 , 32 , 1 }, - {45 , 6144 , 42 , 32 , 1 }, - {45 , 6656 , 42 , 64 , 1 }, - {45 , 7168 , 42 , 32 , 1 }, - {45 , 7680 , 42 , 64 , 1 }, - {45 , 8192 , 42 , 64 , 1 }, - {45 , 8704 , 42 , 32 , 1 }, - {45 , 
9216 , 42 , 32 , 1 }, - {45 , 9728 , 42 , 64 , 1 }, - {45 , 10240 , 42 , 32 , 1 }, - {45 , 20480 , 42 , 64 , 1 }, - {45 , 30720 , 42 , 32 , 1 }, - {45 , 40960 , 42 , 64 , 1 }, - {45 , 51200 , 42 , 64 , 1 }, - {45 , 61440 , 42 , 2048 , 1 }, - {45 , 71680 , 42 , 1024 , 1 }, - {45 , 81920 , 42 , 16384 , 1 }, - {45 , 92160 , 42 , 2048 , 1 }, - {45 , 102400 , 42 , 2048 , 1 }, - {45 , 204800 , 42 , 4096 , 1 }, - {45 , 307200 , 42 , 4096 , 1 }, - {45 , 409600 , 42 , 16384 , 1 }, - {45 , 512000 , 42 , 4096 , 1 }, - {45 , 614400 , 42 , 8192 , 1 }, - {45 , 716800 , 42 , 4096 , 1 }, - {45 , 819200 , 42 , 32768 , 1 }, - {45 , 921600 , 42 , 4096 , 1 }, - {45 , 1024000, 42 , 4096 , 1 }, - {56 , 512 , 43 , 32 , 0 }, - {56 , 1024 , 43 , 32 , 1 }, - {56 , 1536 , 43 , 64 , 0 }, - {56 , 2048 , 43 , 64 , 0 }, - {56 , 2560 , 43 , 64 , 0 }, - {56 , 3072 , 43 , 64 , 1 }, - {56 , 3584 , 43 , 32 , 1 }, - {56 , 4096 , 43 , 32 , 1 }, - {56 , 4608 , 43 , 64 , 1 }, - {56 , 5120 , 43 , 64 , 1 }, - {56 , 5632 , 43 , 32 , 1 }, - {56 , 6144 , 43 , 64 , 1 }, - {56 , 6656 , 43 , 32 , 1 }, - {56 , 7168 , 43 , 64 , 1 }, - {56 , 7680 , 43 , 64 , 1 }, - {56 , 8192 , 43 , 32 , 1 }, - {56 , 8704 , 43 , 64 , 1 }, - {56 , 9216 , 43 , 32 , 1 }, - {56 , 9728 , 43 , 64 , 0 }, - {56 , 10240 , 43 , 32 , 1 }, - {56 , 20480 , 43 , 64 , 1 }, - {56 , 30720 , 43 , 32 , 1 }, - {56 , 40960 , 43 , 32 , 1 }, - {56 , 51200 , 43 , 2048 , 1 }, - {56 , 61440 , 43 , 4096 , 1 }, - {56 , 71680 , 43 , 2048 , 1 }, - {56 , 81920 , 43 , 8192 , 1 }, - {56 , 92160 , 43 , 2048 , 1 }, - {56 , 102400 , 43 , 4096 , 0 }, - {56 , 204800 , 43 , 2048 , 0 }, - {56 , 307200 , 43 , 4096 , 0 }, - {56 , 409600 , 43 , 16384 , 0 }, - {56 , 512000 , 43 , 4096 , 0 }, - {56 , 614400 , 43 , 4096 , 0 }, - {56 , 716800 , 43 , 4096 , 0 }, - {56 , 819200 , 43 , 8192 , 0 }, - {56 , 921600 , 43 , 1024 , 0 }, - {56 , 1024000, 43 , 128 , 0 }, - {84 , 512 , 126 , 64 , 0 }, - {84 , 1024 , 126 , 32 , 0 }, - {84 , 1536 , 126 , 64 , 0 }, - {84 , 2048 , 126 , 1024 , 
0 }, - {84 , 2560 , 126 , 64 , 0 }, - {84 , 3072 , 126 , 64 , 0 }, - {84 , 3584 , 126 , 32 , 0 }, - {84 , 4096 , 126 , 32 , 0 }, - {84 , 4608 , 126 , 32 , 0 }, - {84 , 5120 , 126 , 32 , 1 }, - {84 , 5632 , 126 , 64 , 0 }, - {84 , 6144 , 126 , 2048 , 1 }, - {84 , 6656 , 126 , 64 , 0 }, - {84 , 7168 , 126 , 64 , 0 }, - {84 , 7680 , 126 , 64 , 0 }, - {84 , 8192 , 126 , 2048 , 0 }, - {84 , 8704 , 126 , 64 , 0 }, - {84 , 9216 , 126 , 64 , 0 }, - {84 , 9728 , 126 , 64 , 0 }, - {84 , 10240 , 126 , 2048 , 0 }, - {84 , 20480 , 126 , 128 , 0 }, - {84 , 30720 , 126 , 2048 , 0 }, - {84 , 40960 , 126 , 8192 , 0 }, - {84 , 51200 , 126 , 2048 , 0 }, - {84 , 61440 , 126 , 128 , 0 }, - {84 , 71680 , 126 , 512 , 0 }, - {84 , 81920 , 126 , 256 , 0 }, - {84 , 92160 , 126 , 2048 , 0 }, - {84 , 102400 , 126 , 2048 , 0 }, - {84 , 204800 , 126 , 2048 , 0 }, - {84 , 307200 , 126 , 1024 , 0 }, - {84 , 409600 , 126 , 8192 , 0 }, - {84 , 512000 , 126 , 4096 , 0 }, - {84 , 614400 , 126 , 4096 , 0 }, - {84 , 716800 , 126 , 2048 , 0 }, - {84 , 819200 , 126 , 32768 , 0 }, - {84 , 921600 , 126 , 4096 , 0 }, - {84 , 1024000, 126 , 2048 , 0 }, - {120 , 512 , 210 , 64 , 0 }, - {120 , 1024 , 210 , 32 , 0 }, - {120 , 1536 , 210 , 512 , 0 }, - {120 , 2048 , 210 , 32 , 0 }, - {120 , 2560 , 210 , 64 , 0 }, - {120 , 3072 , 210 , 1024 , 0 }, - {120 , 3584 , 210 , 32 , 0 }, - {120 , 4096 , 210 , 32 , 0 }, - {120 , 4608 , 210 , 32 , 0 }, - {120 , 5120 , 210 , 1024 , 1 }, - {120 , 5632 , 210 , 512 , 1 }, - {120 , 6144 , 210 , 2048 , 1 }, - {120 , 6656 , 210 , 64 , 0 }, - {120 , 7168 , 210 , 512 , 0 }, - {120 , 7680 , 210 , 512 , 1 }, - {120 , 8192 , 210 , 1024 , 1 }, - {120 , 8704 , 210 , 64 , 0 }, - {120 , 9216 , 210 , 1024 , 0 }, - {120 , 9728 , 210 , 64 , 0 }, - {120 , 10240 , 210 , 2048 , 0 }, - {120 , 20480 , 210 , 1024 , 0 }, - {120 , 30720 , 210 , 2048 , 0 }, - {120 , 40960 , 210 , 64 , 0 }, - {120 , 51200 , 210 , 1024 , 0 }, - {120 , 61440 , 210 , 61440 , 0 }, - {120 , 71680 , 210 , 71680 , 0 }, - {120 
, 81920 , 210 , 81920 , 0 }, - {120 , 92160 , 210 , 92160 , 0 }, - {120 , 102400 , 210 , 102400 , 0 }, - {120 , 204800 , 210 , 204800 , 0 }, - {120 , 307200 , 210 , 307200 , 0 }, - {120 , 409600 , 210 , 409600 , 0 }, - {120 , 512000 , 210 , 512000 , 0 }, - {120 , 614400 , 210 , 614400 , 0 }, - {120 , 716800 , 210 , 716800 , 0 }, - {120 , 819200 , 210 , 819200 , 1 }, - {120 , 921600 , 210 , 921600 , 0 }, - {120 , 1024000, 210 , 1024000, 0 }, - {165 , 512 , 330 , 512 , 0 }, - {165 , 1024 , 330 , 256 , 0 }, - {165 , 1536 , 330 , 512 , 0 }, - {165 , 2048 , 330 , 2048 , 0 }, - {165 , 2560 , 330 , 512 , 1 }, - {165 , 3072 , 330 , 1024 , 1 }, - {165 , 3584 , 330 , 512 , 1 }, - {165 , 4096 , 330 , 1024 , 1 }, - {165 , 4608 , 330 , 512 , 0 }, - {165 , 5120 , 330 , 64 , 0 }, - {165 , 5632 , 330 , 64 , 0 }, - {165 , 6144 , 330 , 512 , 0 }, - {165 , 6656 , 330 , 256 , 0 }, - {165 , 7168 , 330 , 1024 , 1 }, - {165 , 7680 , 330 , 64 , 0 }, - {165 , 8192 , 330 , 8192 , 1 }, - {165 , 8704 , 330 , 256 , 1 }, - {165 , 9216 , 330 , 1024 , 1 }, - {165 , 9728 , 330 , 512 , 1 }, - {165 , 10240 , 330 , 1024 , 1 }, - {165 , 20480 , 330 , 2048 , 0 }, - {165 , 30720 , 330 , 512 , 0 }, - {165 , 40960 , 330 , 2048 , 0 }, - {165 , 51200 , 330 , 256 , 0 }, - {165 , 61440 , 330 , 4096 , 0 }, - {165 , 71680 , 330 , 512 , 0 }, - {165 , 81920 , 330 , 8192 , 0 }, - {165 , 92160 , 330 , 512 , 0 }, - {165 , 102400 , 330 , 2048 , 0 }, - {165 , 204800 , 330 , 4096 , 0 }, - {165 , 307200 , 330 , 1024 , 0 }, - {165 , 409600 , 330 , 1024 , 0 }, - {165 , 512000 , 330 , 4096 , 0 }, - {165 , 614400 , 330 , 2048 , 0 }, - {165 , 716800 , 330 , 2048 , 0 }, - {165 , 819200 , 330 , 8192 , 0 }, - {165 , 921600 , 330 , 4096 , 0 }, - {165 , 1024000, 330 , 4096 , 0 } +std::vector > sgemm_nn_v100 = { + {3, 512, 1, 64, 0}, + {3, 1024, 1, 32, 0}, + {3, 1536, 1, 32, 0}, + {3, 2048, 1, 64, 0}, + {3, 2560, 1, 128, 0}, + {3, 3072, 1, 64, 0}, + {3, 3584, 1, 64, 0}, + {3, 4096, 1, 64, 0}, + {3, 4608, 1, 32, 0}, + {3, 5120, 1, 
32, 0}, + {3, 5632, 1, 32, 0}, + {3, 6144, 1, 32, 0}, + {3, 6656, 1, 64, 0}, + {3, 7168, 1, 32, 0}, + {3, 7680, 1, 32, 0}, + {3, 8192, 1, 1024, 0}, + {3, 8704, 1, 512, 0}, + {3, 9216, 1, 1024, 0}, + {3, 9728, 1, 64, 0}, + {3, 10240, 1, 32, 0}, + {3, 20480, 1, 32, 0}, + {3, 30720, 1, 64, 0}, + {3, 40960, 1, 32, 0}, + {3, 51200, 1, 64, 0}, + {3, 61440, 1, 64, 0}, + {3, 71680, 1, 32, 0}, + {3, 81920, 1, 64, 0}, + {3, 92160, 1, 64, 0}, + {3, 102400, 1, 32, 0}, + {3, 204800, 1, 64, 0}, + {3, 307200, 1, 64, 0}, + {3, 409600, 1, 32, 0}, + {3, 512000, 1, 32, 0}, + {3, 614400, 1, 64, 0}, + {3, 716800, 1, 32, 0}, + {3, 819200, 1, 64, 0}, + {3, 921600, 1, 64, 0}, + {3, 1024000, 1, 64, 0}, + {4, 512, 1, 32, 0}, + {4, 1024, 1, 32, 0}, + {4, 1536, 1, 32, 0}, + {4, 2048, 1, 64, 0}, + {4, 2560, 1, 32, 0}, + {4, 3072, 1, 64, 0}, + {4, 3584, 1, 64, 0}, + {4, 4096, 1, 64, 0}, + {4, 4608, 1, 128, 0}, + {4, 5120, 1, 32, 0}, + {4, 5632, 1, 32, 0}, + {4, 6144, 1, 32, 0}, + {4, 6656, 1, 32, 0}, + {4, 7168, 1, 64, 0}, + {4, 7680, 1, 256, 0}, + {4, 8192, 1, 256, 0}, + {4, 8704, 1, 64, 0}, + {4, 9216, 1, 64, 0}, + {4, 9728, 1, 512, 0}, + {4, 10240, 1, 512, 0}, + {4, 20480, 1, 64, 0}, + {4, 30720, 1, 64, 0}, + {4, 40960, 1, 32, 0}, + {4, 51200, 1, 64, 0}, + {4, 61440, 1, 64, 0}, + {4, 71680, 1, 64, 0}, + {4, 81920, 1, 32, 0}, + {4, 92160, 1, 64, 0}, + {4, 102400, 1, 32, 0}, + {4, 204800, 1, 32, 0}, + {4, 307200, 1, 32, 0}, + {4, 409600, 1, 64, 0}, + {4, 512000, 1, 32, 0}, + {4, 614400, 1, 64, 0}, + {4, 716800, 1, 32, 0}, + {4, 819200, 1, 64, 0}, + {4, 921600, 1, 64, 0}, + {4, 1024000, 1, 32, 0}, + {6, 512, 3, 64, 1}, + {6, 1024, 3, 32, 1}, + {6, 1536, 3, 64, 1}, + {6, 2048, 3, 32, 1}, + {6, 2560, 3, 32, 1}, + {6, 3072, 3, 512, 1}, + {6, 3584, 3, 64, 1}, + {6, 4096, 3, 32, 1}, + {6, 4608, 3, 32, 1}, + {6, 5120, 3, 64, 1}, + {6, 5632, 3, 32, 1}, + {6, 6144, 3, 64, 1}, + {6, 6656, 3, 64, 1}, + {6, 7168, 3, 128, 1}, + {6, 7680, 3, 64, 1}, + {6, 8192, 3, 32, 1}, + {6, 8704, 3, 32, 1}, + {6, 9216, 
3, 32, 1}, + {6, 9728, 3, 64, 1}, + {6, 10240, 3, 64, 1}, + {6, 20480, 3, 32, 1}, + {6, 30720, 3, 64, 1}, + {6, 40960, 3, 64, 1}, + {6, 51200, 3, 64, 1}, + {6, 61440, 3, 64, 1}, + {6, 71680, 3, 1024, 1}, + {6, 81920, 3, 64, 1}, + {6, 92160, 3, 64, 1}, + {6, 102400, 3, 64, 1}, + {6, 204800, 3, 512, 1}, + {6, 307200, 3, 2048, 1}, + {6, 409600, 3, 2048, 1}, + {6, 512000, 3, 2048, 1}, + {6, 614400, 3, 8192, 1}, + {6, 716800, 3, 2048, 1}, + {6, 819200, 3, 32768, 1}, + {6, 921600, 3, 4096, 1}, + {6, 1024000, 3, 8192, 1}, + {10, 512, 4, 64, 1}, + {10, 1024, 4, 32, 1}, + {10, 1536, 4, 32, 1}, + {10, 2048, 4, 64, 1}, + {10, 2560, 4, 64, 1}, + {10, 3072, 4, 32, 1}, + {10, 3584, 4, 64, 1}, + {10, 4096, 4, 64, 1}, + {10, 4608, 4, 64, 1}, + {10, 5120, 4, 32, 1}, + {10, 5632, 4, 64, 1}, + {10, 6144, 4, 32, 1}, + {10, 6656, 4, 64, 1}, + {10, 7168, 4, 32, 1}, + {10, 7680, 4, 64, 1}, + {10, 8192, 4, 64, 1}, + {10, 8704, 4, 64, 1}, + {10, 9216, 4, 32, 1}, + {10, 9728, 4, 32, 1}, + {10, 10240, 4, 32, 1}, + {10, 20480, 4, 64, 1}, + {10, 30720, 4, 64, 1}, + {10, 40960, 4, 512, 1}, + {10, 51200, 4, 64, 1}, + {10, 61440, 4, 64, 1}, + {10, 71680, 4, 64, 1}, + {10, 81920, 4, 16384, 1}, + {10, 92160, 4, 64, 1}, + {10, 102400, 4, 64, 1}, + {10, 204800, 4, 8192, 1}, + {10, 307200, 4, 4096, 1}, + {10, 409600, 4, 8192, 1}, + {10, 512000, 4, 4096, 1}, + {10, 614400, 4, 8192, 1}, + {10, 716800, 4, 1024, 1}, + {10, 819200, 4, 32768, 1}, + {10, 921600, 4, 4096, 1}, + {10, 1024000, 4, 8192, 1}, + {10, 512, 6, 64, 1}, + {10, 1024, 6, 64, 1}, + {10, 1536, 6, 64, 1}, + {10, 2048, 6, 64, 1}, + {10, 2560, 6, 32, 1}, + {10, 3072, 6, 32, 1}, + {10, 3584, 6, 128, 1}, + {10, 4096, 6, 128, 1}, + {10, 4608, 6, 128, 1}, + {10, 5120, 6, 32, 1}, + {10, 5632, 6, 32, 1}, + {10, 6144, 6, 32, 1}, + {10, 6656, 6, 32, 1}, + {10, 7168, 6, 32, 1}, + {10, 7680, 6, 256, 1}, + {10, 8192, 6, 64, 1}, + {10, 8704, 6, 32, 1}, + {10, 9216, 6, 64, 1}, + {10, 9728, 6, 32, 1}, + {10, 10240, 6, 512, 1}, + {10, 20480, 6, 64, 1}, + 
{10, 30720, 6, 64, 1}, + {10, 40960, 6, 64, 1}, + {10, 51200, 6, 64, 1}, + {10, 61440, 6, 64, 1}, + {10, 71680, 6, 64, 1}, + {10, 81920, 6, 64, 1}, + {10, 92160, 6, 512, 1}, + {10, 102400, 6, 64, 1}, + {10, 204800, 6, 512, 1}, + {10, 307200, 6, 4096, 1}, + {10, 409600, 6, 8192, 1}, + {10, 512000, 6, 4096, 1}, + {10, 614400, 6, 8192, 1}, + {10, 716800, 6, 4096, 1}, + {10, 819200, 6, 16384, 1}, + {10, 921600, 6, 4096, 1}, + {10, 1024000, 6, 8192, 1}, + {15, 512, 12, 32, 1}, + {15, 1024, 12, 64, 1}, + {15, 1536, 12, 32, 1}, + {15, 2048, 12, 32, 1}, + {15, 2560, 12, 256, 1}, + {15, 3072, 12, 32, 1}, + {15, 3584, 12, 32, 1}, + {15, 4096, 12, 32, 1}, + {15, 4608, 12, 64, 1}, + {15, 5120, 12, 32, 1}, + {15, 5632, 12, 32, 1}, + {15, 6144, 12, 64, 1}, + {15, 6656, 12, 32, 1}, + {15, 7168, 12, 64, 1}, + {15, 7680, 12, 64, 1}, + {15, 8192, 12, 64, 1}, + {15, 8704, 12, 32, 1}, + {15, 9216, 12, 64, 1}, + {15, 9728, 12, 32, 1}, + {15, 10240, 12, 32, 1}, + {15, 20480, 12, 64, 1}, + {15, 30720, 12, 32, 1}, + {15, 40960, 12, 64, 1}, + {15, 51200, 12, 64, 1}, + {15, 61440, 12, 64, 1}, + {15, 71680, 12, 64, 1}, + {15, 81920, 12, 64, 1}, + {15, 92160, 12, 64, 1}, + {15, 102400, 12, 64, 1}, + {15, 204800, 12, 8192, 1}, + {15, 307200, 12, 4096, 1}, + {15, 409600, 12, 16384, 1}, + {15, 512000, 12, 4096, 1}, + {15, 614400, 12, 8192, 1}, + {15, 716800, 12, 512, 1}, + {15, 819200, 12, 32768, 1}, + {15, 921600, 12, 4096, 1}, + {15, 1024000, 12, 8192, 1}, + {20, 512, 11, 64, 1}, + {20, 1024, 11, 32, 1}, + {20, 1536, 11, 64, 1}, + {20, 2048, 11, 64, 1}, + {20, 2560, 11, 128, 1}, + {20, 3072, 11, 64, 1}, + {20, 3584, 11, 32, 1}, + {20, 4096, 11, 32, 1}, + {20, 4608, 11, 64, 1}, + {20, 5120, 11, 32, 1}, + {20, 5632, 11, 64, 1}, + {20, 6144, 11, 64, 1}, + {20, 6656, 11, 64, 1}, + {20, 7168, 11, 32, 1}, + {20, 7680, 11, 32, 1}, + {20, 8192, 11, 32, 1}, + {20, 8704, 11, 64, 1}, + {20, 9216, 11, 32, 1}, + {20, 9728, 11, 32, 1}, + {20, 10240, 11, 32, 1}, + {20, 20480, 11, 64, 1}, + {20, 30720, 11, 
64, 1}, + {20, 40960, 11, 64, 1}, + {20, 51200, 11, 64, 1}, + {20, 61440, 11, 64, 1}, + {20, 71680, 11, 64, 1}, + {20, 81920, 11, 64, 1}, + {20, 92160, 11, 32, 1}, + {20, 102400, 11, 64, 1}, + {20, 204800, 11, 4096, 1}, + {20, 307200, 11, 512, 1}, + {20, 409600, 11, 16384, 1}, + {20, 512000, 11, 4096, 1}, + {20, 614400, 11, 8192, 1}, + {20, 716800, 11, 4096, 1}, + {20, 819200, 11, 32768, 1}, + {20, 921600, 11, 2048, 1}, + {20, 1024000, 11, 8192, 1}, + {21, 512, 16, 32, 1}, + {21, 1024, 16, 128, 1}, + {21, 1536, 16, 32, 1}, + {21, 2048, 16, 64, 1}, + {21, 2560, 16, 32, 1}, + {21, 3072, 16, 64, 1}, + {21, 3584, 16, 64, 1}, + {21, 4096, 16, 32, 1}, + {21, 4608, 16, 32, 1}, + {21, 5120, 16, 32, 1}, + {21, 5632, 16, 64, 1}, + {21, 6144, 16, 32, 1}, + {21, 6656, 16, 64, 1}, + {21, 7168, 16, 32, 1}, + {21, 7680, 16, 64, 1}, + {21, 8192, 16, 32, 1}, + {21, 8704, 16, 32, 1}, + {21, 9216, 16, 32, 1}, + {21, 9728, 16, 32, 1}, + {21, 10240, 16, 32, 1}, + {21, 20480, 16, 64, 1}, + {21, 30720, 16, 32, 1}, + {21, 40960, 16, 64, 1}, + {21, 51200, 16, 64, 1}, + {21, 61440, 16, 64, 1}, + {21, 71680, 16, 64, 1}, + {21, 81920, 16, 64, 1}, + {21, 92160, 16, 64, 1}, + {21, 102400, 16, 64, 1}, + {21, 204800, 16, 8192, 1}, + {21, 307200, 16, 4096, 1}, + {21, 409600, 16, 16384, 1}, + {21, 512000, 16, 4096, 1}, + {21, 614400, 16, 8192, 1}, + {21, 716800, 16, 4096, 1}, + {21, 819200, 16, 32768, 1}, + {21, 921600, 16, 4096, 1}, + {21, 1024000, 16, 8192, 1}, + {28, 512, 25, 64, 1}, + {28, 1024, 25, 32, 1}, + {28, 1536, 25, 64, 0}, + {28, 2048, 25, 64, 1}, + {28, 2560, 25, 32, 1}, + {28, 3072, 25, 64, 1}, + {28, 3584, 25, 32, 1}, + {28, 4096, 25, 32, 1}, + {28, 4608, 25, 32, 1}, + {28, 5120, 25, 32, 1}, + {28, 5632, 25, 32, 1}, + {28, 6144, 25, 32, 1}, + {28, 6656, 25, 64, 1}, + {28, 7168, 25, 64, 1}, + {28, 7680, 25, 64, 1}, + {28, 8192, 25, 64, 1}, + {28, 8704, 25, 32, 1}, + {28, 9216, 25, 32, 1}, + {28, 9728, 25, 32, 1}, + {28, 10240, 25, 32, 1}, + {28, 20480, 25, 32, 1}, + {28, 30720, 25, 
32, 1}, + {28, 40960, 25, 64, 1}, + {28, 51200, 25, 64, 1}, + {28, 61440, 25, 64, 1}, + {28, 71680, 25, 64, 1}, + {28, 81920, 25, 64, 1}, + {28, 92160, 25, 64, 1}, + {28, 102400, 25, 4096, 1}, + {28, 204800, 25, 8192, 1}, + {28, 307200, 25, 4096, 1}, + {28, 409600, 25, 16384, 1}, + {28, 512000, 25, 4096, 1}, + {28, 614400, 25, 4096, 1}, + {28, 716800, 25, 2048, 1}, + {28, 819200, 25, 32768, 1}, + {28, 921600, 25, 4096, 1}, + {28, 1024000, 25, 4096, 1}, + {35, 512, 24, 32, 1}, + {35, 1024, 24, 64, 0}, + {35, 1536, 24, 64, 1}, + {35, 2048, 24, 32, 0}, + {35, 2560, 24, 32, 1}, + {35, 3072, 24, 32, 1}, + {35, 3584, 24, 64, 1}, + {35, 4096, 24, 32, 1}, + {35, 4608, 24, 64, 1}, + {35, 5120, 24, 32, 1}, + {35, 5632, 24, 64, 1}, + {35, 6144, 24, 32, 1}, + {35, 6656, 24, 32, 1}, + {35, 7168, 24, 32, 1}, + {35, 7680, 24, 32, 1}, + {35, 8192, 24, 64, 1}, + {35, 8704, 24, 32, 1}, + {35, 9216, 24, 32, 1}, + {35, 9728, 24, 32, 1}, + {35, 10240, 24, 32, 1}, + {35, 20480, 24, 32, 1}, + {35, 30720, 24, 32, 1}, + {35, 40960, 24, 32, 1}, + {35, 51200, 24, 64, 1}, + {35, 61440, 24, 64, 1}, + {35, 71680, 24, 64, 1}, + {35, 81920, 24, 16384, 1}, + {35, 92160, 24, 64, 1}, + {35, 102400, 24, 4096, 1}, + {35, 204800, 24, 8192, 1}, + {35, 307200, 24, 2048, 1}, + {35, 409600, 24, 8192, 1}, + {35, 512000, 24, 4096, 1}, + {35, 614400, 24, 8192, 1}, + {35, 716800, 24, 2048, 1}, + {35, 819200, 24, 16384, 1}, + {35, 921600, 24, 4096, 1}, + {35, 1024000, 24, 8192, 1}, + {36, 512, 33, 32, 1}, + {36, 1024, 33, 64, 1}, + {36, 1536, 33, 64, 0}, + {36, 2048, 33, 32, 1}, + {36, 2560, 33, 64, 1}, + {36, 3072, 33, 64, 1}, + {36, 3584, 33, 64, 1}, + {36, 4096, 33, 32, 1}, + {36, 4608, 33, 32, 1}, + {36, 5120, 33, 32, 1}, + {36, 5632, 33, 32, 1}, + {36, 6144, 33, 64, 1}, + {36, 6656, 33, 32, 1}, + {36, 7168, 33, 32, 1}, + {36, 7680, 33, 64, 1}, + {36, 8192, 33, 64, 1}, + {36, 8704, 33, 64, 1}, + {36, 9216, 33, 64, 1}, + {36, 9728, 33, 32, 1}, + {36, 10240, 33, 64, 1}, + {36, 20480, 33, 64, 1}, + {36, 30720, 
33, 32, 1}, + {36, 40960, 33, 32, 1}, + {36, 51200, 33, 32, 1}, + {36, 61440, 33, 32, 1}, + {36, 71680, 33, 64, 1}, + {36, 81920, 33, 8192, 1}, + {36, 92160, 33, 2048, 1}, + {36, 102400, 33, 2048, 1}, + {36, 204800, 33, 8192, 1}, + {36, 307200, 33, 2048, 1}, + {36, 409600, 33, 8192, 1}, + {36, 512000, 33, 2048, 1}, + {36, 614400, 33, 8192, 1}, + {36, 716800, 33, 2048, 1}, + {36, 819200, 33, 4096, 1}, + {36, 921600, 33, 4096, 1}, + {36, 1024000, 33, 4096, 1}, + {45, 512, 42, 32, 1}, + {45, 1024, 42, 64, 0}, + {45, 1536, 42, 64, 0}, + {45, 2048, 42, 32, 1}, + {45, 2560, 42, 64, 0}, + {45, 3072, 42, 32, 1}, + {45, 3584, 42, 32, 1}, + {45, 4096, 42, 32, 1}, + {45, 4608, 42, 64, 1}, + {45, 5120, 42, 32, 1}, + {45, 5632, 42, 32, 1}, + {45, 6144, 42, 32, 1}, + {45, 6656, 42, 64, 1}, + {45, 7168, 42, 32, 1}, + {45, 7680, 42, 64, 1}, + {45, 8192, 42, 64, 1}, + {45, 8704, 42, 32, 1}, + {45, 9216, 42, 32, 1}, + {45, 9728, 42, 64, 1}, + {45, 10240, 42, 32, 1}, + {45, 20480, 42, 64, 1}, + {45, 30720, 42, 32, 1}, + {45, 40960, 42, 64, 1}, + {45, 51200, 42, 64, 1}, + {45, 61440, 42, 2048, 1}, + {45, 71680, 42, 1024, 1}, + {45, 81920, 42, 16384, 1}, + {45, 92160, 42, 2048, 1}, + {45, 102400, 42, 2048, 1}, + {45, 204800, 42, 4096, 1}, + {45, 307200, 42, 4096, 1}, + {45, 409600, 42, 16384, 1}, + {45, 512000, 42, 4096, 1}, + {45, 614400, 42, 8192, 1}, + {45, 716800, 42, 4096, 1}, + {45, 819200, 42, 32768, 1}, + {45, 921600, 42, 4096, 1}, + {45, 1024000, 42, 4096, 1}, + {56, 512, 43, 32, 0}, + {56, 1024, 43, 32, 1}, + {56, 1536, 43, 64, 0}, + {56, 2048, 43, 64, 0}, + {56, 2560, 43, 64, 0}, + {56, 3072, 43, 64, 1}, + {56, 3584, 43, 32, 1}, + {56, 4096, 43, 32, 1}, + {56, 4608, 43, 64, 1}, + {56, 5120, 43, 64, 1}, + {56, 5632, 43, 32, 1}, + {56, 6144, 43, 64, 1}, + {56, 6656, 43, 32, 1}, + {56, 7168, 43, 64, 1}, + {56, 7680, 43, 64, 1}, + {56, 8192, 43, 32, 1}, + {56, 8704, 43, 64, 1}, + {56, 9216, 43, 32, 1}, + {56, 9728, 43, 64, 0}, + {56, 10240, 43, 32, 1}, + {56, 20480, 43, 64, 1}, 
+ {56, 30720, 43, 32, 1}, + {56, 40960, 43, 32, 1}, + {56, 51200, 43, 2048, 1}, + {56, 61440, 43, 4096, 1}, + {56, 71680, 43, 2048, 1}, + {56, 81920, 43, 8192, 1}, + {56, 92160, 43, 2048, 1}, + {56, 102400, 43, 4096, 0}, + {56, 204800, 43, 2048, 0}, + {56, 307200, 43, 4096, 0}, + {56, 409600, 43, 16384, 0}, + {56, 512000, 43, 4096, 0}, + {56, 614400, 43, 4096, 0}, + {56, 716800, 43, 4096, 0}, + {56, 819200, 43, 8192, 0}, + {56, 921600, 43, 1024, 0}, + {56, 1024000, 43, 128, 0}, + {84, 512, 126, 64, 0}, + {84, 1024, 126, 32, 0}, + {84, 1536, 126, 64, 0}, + {84, 2048, 126, 1024, 0}, + {84, 2560, 126, 64, 0}, + {84, 3072, 126, 64, 0}, + {84, 3584, 126, 32, 0}, + {84, 4096, 126, 32, 0}, + {84, 4608, 126, 32, 0}, + {84, 5120, 126, 32, 1}, + {84, 5632, 126, 64, 0}, + {84, 6144, 126, 2048, 1}, + {84, 6656, 126, 64, 0}, + {84, 7168, 126, 64, 0}, + {84, 7680, 126, 64, 0}, + {84, 8192, 126, 2048, 0}, + {84, 8704, 126, 64, 0}, + {84, 9216, 126, 64, 0}, + {84, 9728, 126, 64, 0}, + {84, 10240, 126, 2048, 0}, + {84, 20480, 126, 128, 0}, + {84, 30720, 126, 2048, 0}, + {84, 40960, 126, 8192, 0}, + {84, 51200, 126, 2048, 0}, + {84, 61440, 126, 128, 0}, + {84, 71680, 126, 512, 0}, + {84, 81920, 126, 256, 0}, + {84, 92160, 126, 2048, 0}, + {84, 102400, 126, 2048, 0}, + {84, 204800, 126, 2048, 0}, + {84, 307200, 126, 1024, 0}, + {84, 409600, 126, 8192, 0}, + {84, 512000, 126, 4096, 0}, + {84, 614400, 126, 4096, 0}, + {84, 716800, 126, 2048, 0}, + {84, 819200, 126, 32768, 0}, + {84, 921600, 126, 4096, 0}, + {84, 1024000, 126, 2048, 0}, + {120, 512, 210, 64, 0}, + {120, 1024, 210, 32, 0}, + {120, 1536, 210, 512, 0}, + {120, 2048, 210, 32, 0}, + {120, 2560, 210, 64, 0}, + {120, 3072, 210, 1024, 0}, + {120, 3584, 210, 32, 0}, + {120, 4096, 210, 32, 0}, + {120, 4608, 210, 32, 0}, + {120, 5120, 210, 1024, 1}, + {120, 5632, 210, 512, 1}, + {120, 6144, 210, 2048, 1}, + {120, 6656, 210, 64, 0}, + {120, 7168, 210, 512, 0}, + {120, 7680, 210, 512, 1}, + {120, 8192, 210, 1024, 1}, + {120, 8704, 
210, 64, 0}, + {120, 9216, 210, 1024, 0}, + {120, 9728, 210, 64, 0}, + {120, 10240, 210, 2048, 0}, + {120, 20480, 210, 1024, 0}, + {120, 30720, 210, 2048, 0}, + {120, 40960, 210, 64, 0}, + {120, 51200, 210, 1024, 0}, + {120, 61440, 210, 61440, 0}, + {120, 71680, 210, 71680, 0}, + {120, 81920, 210, 81920, 0}, + {120, 92160, 210, 92160, 0}, + {120, 102400, 210, 102400, 0}, + {120, 204800, 210, 204800, 0}, + {120, 307200, 210, 307200, 0}, + {120, 409600, 210, 409600, 0}, + {120, 512000, 210, 512000, 0}, + {120, 614400, 210, 614400, 0}, + {120, 716800, 210, 716800, 0}, + {120, 819200, 210, 819200, 1}, + {120, 921600, 210, 921600, 0}, + {120, 1024000, 210, 1024000, 0}, + {165, 512, 330, 512, 0}, + {165, 1024, 330, 256, 0}, + {165, 1536, 330, 512, 0}, + {165, 2048, 330, 2048, 0}, + {165, 2560, 330, 512, 1}, + {165, 3072, 330, 1024, 1}, + {165, 3584, 330, 512, 1}, + {165, 4096, 330, 1024, 1}, + {165, 4608, 330, 512, 0}, + {165, 5120, 330, 64, 0}, + {165, 5632, 330, 64, 0}, + {165, 6144, 330, 512, 0}, + {165, 6656, 330, 256, 0}, + {165, 7168, 330, 1024, 1}, + {165, 7680, 330, 64, 0}, + {165, 8192, 330, 8192, 1}, + {165, 8704, 330, 256, 1}, + {165, 9216, 330, 1024, 1}, + {165, 9728, 330, 512, 1}, + {165, 10240, 330, 1024, 1}, + {165, 20480, 330, 2048, 0}, + {165, 30720, 330, 512, 0}, + {165, 40960, 330, 2048, 0}, + {165, 51200, 330, 256, 0}, + {165, 61440, 330, 4096, 0}, + {165, 71680, 330, 512, 0}, + {165, 81920, 330, 8192, 0}, + {165, 92160, 330, 512, 0}, + {165, 102400, 330, 2048, 0}, + {165, 204800, 330, 4096, 0}, + {165, 307200, 330, 1024, 0}, + {165, 409600, 330, 1024, 0}, + {165, 512000, 330, 4096, 0}, + {165, 614400, 330, 2048, 0}, + {165, 716800, 330, 2048, 0}, + {165, 819200, 330, 8192, 0}, + {165, 921600, 330, 4096, 0}, + {165, 1024000, 330, 4096, 0} }; //////////////////////////////////////////////////////////////////////////////// -std::vector< std::array > sgemm_tn_v100 = -{ - {1 , 512 , 3 , 64 , 0 }, - {1 , 1024 , 3 , 64 , 0 }, - {1 , 1536 , 3 , 64 , 0 }, - 
{1 , 2048 , 3 , 64 , 0 }, - {1 , 2560 , 3 , 32 , 0 }, - {1 , 3072 , 3 , 32 , 0 }, - {1 , 3584 , 3 , 32 , 0 }, - {1 , 4096 , 3 , 32 , 0 }, - {1 , 4608 , 3 , 64 , 0 }, - {1 , 5120 , 3 , 64 , 0 }, - {1 , 5632 , 3 , 64 , 0 }, - {1 , 6144 , 3 , 64 , 0 }, - {1 , 6656 , 3 , 32 , 0 }, - {1 , 7168 , 3 , 64 , 0 }, - {1 , 7680 , 3 , 64 , 0 }, - {1 , 8192 , 3 , 32 , 0 }, - {1 , 8704 , 3 , 64 , 0 }, - {1 , 9216 , 3 , 64 , 0 }, - {1 , 9728 , 3 , 64 , 0 }, - {1 , 10240 , 3 , 32 , 0 }, - {1 , 20480 , 3 , 64 , 0 }, - {1 , 30720 , 3 , 64 , 1 }, - {1 , 40960 , 3 , 32 , 0 }, - {1 , 51200 , 3 , 64 , 1 }, - {1 , 61440 , 3 , 64 , 1 }, - {1 , 71680 , 3 , 1024 , 1 }, - {1 , 81920 , 3 , 32 , 0 }, - {1 , 92160 , 3 , 2048 , 1 }, - {1 , 102400 , 3 , 4096 , 1 }, - {1 , 204800 , 3 , 2048 , 1 }, - {1 , 307200 , 3 , 4096 , 1 }, - {1 , 409600 , 3 , 4096 , 1 }, - {1 , 512000 , 3 , 4096 , 1 }, - {1 , 614400 , 3 , 4096 , 1 }, - {1 , 716800 , 3 , 4096 , 1 }, - {1 , 819200 , 3 , 16384 , 1 }, - {1 , 921600 , 3 , 4096 , 1 }, - {1 , 1024000, 3 , 8192 , 1 }, - {1 , 512 , 4 , 64 , 0 }, - {1 , 1024 , 4 , 64 , 0 }, - {1 , 1536 , 4 , 32 , 0 }, - {1 , 2048 , 4 , 1024 , 0 }, - {1 , 2560 , 4 , 64 , 0 }, - {1 , 3072 , 4 , 32 , 0 }, - {1 , 3584 , 4 , 32 , 0 }, - {1 , 4096 , 4 , 32 , 0 }, - {1 , 4608 , 4 , 64 , 0 }, - {1 , 5120 , 4 , 64 , 0 }, - {1 , 5632 , 4 , 32 , 0 }, - {1 , 6144 , 4 , 32 , 0 }, - {1 , 6656 , 4 , 32 , 0 }, - {1 , 7168 , 4 , 64 , 0 }, - {1 , 7680 , 4 , 32 , 0 }, - {1 , 8192 , 4 , 32 , 1 }, - {1 , 8704 , 4 , 32 , 1 }, - {1 , 9216 , 4 , 64 , 0 }, - {1 , 9728 , 4 , 32 , 0 }, - {1 , 10240 , 4 , 64 , 0 }, - {1 , 20480 , 4 , 32 , 0 }, - {1 , 30720 , 4 , 64 , 1 }, - {1 , 40960 , 4 , 32 , 0 }, - {1 , 51200 , 4 , 2048 , 1 }, - {1 , 61440 , 4 , 64 , 1 }, - {1 , 71680 , 4 , 64 , 1 }, - {1 , 81920 , 4 , 64 , 1 }, - {1 , 92160 , 4 , 64 , 1 }, - {1 , 102400 , 4 , 4096 , 1 }, - {1 , 204800 , 4 , 4096 , 1 }, - {1 , 307200 , 4 , 4096 , 1 }, - {1 , 409600 , 4 , 4096 , 1 }, - {1 , 512000 , 4 , 4096 , 1 }, - {1 , 
614400 , 4 , 8192 , 1 }, - {1 , 716800 , 4 , 4096 , 1 }, - {1 , 819200 , 4 , 32768 , 1 }, - {1 , 921600 , 4 , 4096 , 1 }, - {1 , 1024000, 4 , 8192 , 1 }, - {3 , 512 , 6 , 64 , 1 }, - {3 , 1024 , 6 , 64 , 0 }, - {3 , 1536 , 6 , 32 , 1 }, - {3 , 2048 , 6 , 32 , 1 }, - {3 , 2560 , 6 , 32 , 1 }, - {3 , 3072 , 6 , 64 , 1 }, - {3 , 3584 , 6 , 32 , 1 }, - {3 , 4096 , 6 , 32 , 1 }, - {3 , 4608 , 6 , 64 , 1 }, - {3 , 5120 , 6 , 64 , 1 }, - {3 , 5632 , 6 , 64 , 1 }, - {3 , 6144 , 6 , 1024 , 1 }, - {3 , 6656 , 6 , 64 , 1 }, - {3 , 7168 , 6 , 512 , 1 }, - {3 , 7680 , 6 , 32 , 1 }, - {3 , 8192 , 6 , 32 , 1 }, - {3 , 8704 , 6 , 512 , 1 }, - {3 , 9216 , 6 , 32 , 1 }, - {3 , 9728 , 6 , 64 , 1 }, - {3 , 10240 , 6 , 1024 , 1 }, - {3 , 20480 , 6 , 64 , 1 }, - {3 , 30720 , 6 , 64 , 1 }, - {3 , 40960 , 6 , 64 , 1 }, - {3 , 51200 , 6 , 64 , 1 }, - {3 , 61440 , 6 , 64 , 1 }, - {3 , 71680 , 6 , 64 , 1 }, - {3 , 81920 , 6 , 64 , 1 }, - {3 , 92160 , 6 , 64 , 1 }, - {3 , 102400 , 6 , 512 , 1 }, - {3 , 204800 , 6 , 2048 , 1 }, - {3 , 307200 , 6 , 2048 , 1 }, - {3 , 409600 , 6 , 16384 , 1 }, - {3 , 512000 , 6 , 4096 , 1 }, - {3 , 614400 , 6 , 4096 , 1 }, - {3 , 716800 , 6 , 2048 , 0 }, - {3 , 819200 , 6 , 32768 , 0 }, - {3 , 921600 , 6 , 4096 , 0 }, - {3 , 1024000, 6 , 8192 , 0 }, - {4 , 512 , 10 , 64 , 1 }, - {4 , 1024 , 10 , 32 , 1 }, - {4 , 1536 , 10 , 64 , 1 }, - {4 , 2048 , 10 , 32 , 1 }, - {4 , 2560 , 10 , 32 , 1 }, - {4 , 3072 , 10 , 32 , 1 }, - {4 , 3584 , 10 , 64 , 1 }, - {4 , 4096 , 10 , 64 , 1 }, - {4 , 4608 , 10 , 64 , 1 }, - {4 , 5120 , 10 , 64 , 1 }, - {4 , 5632 , 10 , 64 , 1 }, - {4 , 6144 , 10 , 32 , 1 }, - {4 , 6656 , 10 , 64 , 1 }, - {4 , 7168 , 10 , 64 , 1 }, - {4 , 7680 , 10 , 64 , 1 }, - {4 , 8192 , 10 , 32 , 1 }, - {4 , 8704 , 10 , 512 , 1 }, - {4 , 9216 , 10 , 64 , 1 }, - {4 , 9728 , 10 , 32 , 1 }, - {4 , 10240 , 10 , 64 , 1 }, - {4 , 20480 , 10 , 64 , 1 }, - {4 , 30720 , 10 , 64 , 1 }, - {4 , 40960 , 10 , 64 , 1 }, - {4 , 51200 , 10 , 64 , 1 }, - {4 , 61440 , 10 , 64 , 
1 }, - {4 , 71680 , 10 , 64 , 1 }, - {4 , 81920 , 10 , 16384 , 1 }, - {4 , 92160 , 10 , 64 , 1 }, - {4 , 102400 , 10 , 64 , 1 }, - {4 , 204800 , 10 , 1024 , 1 }, - {4 , 307200 , 10 , 4096 , 1 }, - {4 , 409600 , 10 , 16384 , 1 }, - {4 , 512000 , 10 , 4096 , 1 }, - {4 , 614400 , 10 , 8192 , 1 }, - {4 , 716800 , 10 , 4096 , 1 }, - {4 , 819200 , 10 , 32768 , 1 }, - {4 , 921600 , 10 , 4096 , 1 }, - {4 , 1024000, 10 , 8192 , 1 }, - {6 , 512 , 10 , 64 , 1 }, - {6 , 1024 , 10 , 64 , 1 }, - {6 , 1536 , 10 , 32 , 1 }, - {6 , 2048 , 10 , 64 , 1 }, - {6 , 2560 , 10 , 64 , 1 }, - {6 , 3072 , 10 , 32 , 1 }, - {6 , 3584 , 10 , 64 , 1 }, - {6 , 4096 , 10 , 256 , 1 }, - {6 , 4608 , 10 , 64 , 1 }, - {6 , 5120 , 10 , 64 , 1 }, - {6 , 5632 , 10 , 512 , 1 }, - {6 , 6144 , 10 , 64 , 1 }, - {6 , 6656 , 10 , 64 , 1 }, - {6 , 7168 , 10 , 64 , 1 }, - {6 , 7680 , 10 , 64 , 1 }, - {6 , 8192 , 10 , 32 , 1 }, - {6 , 8704 , 10 , 128 , 1 }, - {6 , 9216 , 10 , 32 , 1 }, - {6 , 9728 , 10 , 32 , 1 }, - {6 , 10240 , 10 , 32 , 1 }, - {6 , 20480 , 10 , 64 , 1 }, - {6 , 30720 , 10 , 64 , 1 }, - {6 , 40960 , 10 , 64 , 1 }, - {6 , 51200 , 10 , 64 , 1 }, - {6 , 61440 , 10 , 64 , 1 }, - {6 , 71680 , 10 , 64 , 1 }, - {6 , 81920 , 10 , 64 , 1 }, - {6 , 92160 , 10 , 1024 , 1 }, - {6 , 102400 , 10 , 64 , 1 }, - {6 , 204800 , 10 , 8192 , 1 }, - {6 , 307200 , 10 , 4096 , 1 }, - {6 , 409600 , 10 , 16384 , 1 }, - {6 , 512000 , 10 , 4096 , 1 }, - {6 , 614400 , 10 , 8192 , 1 }, - {6 , 716800 , 10 , 4096 , 1 }, - {6 , 819200 , 10 , 32768 , 1 }, - {6 , 921600 , 10 , 4096 , 1 }, - {6 , 1024000, 10 , 8192 , 1 }, - {12 , 512 , 15 , 64 , 1 }, - {12 , 1024 , 15 , 32 , 1 }, - {12 , 1536 , 15 , 64 , 1 }, - {12 , 2048 , 15 , 32 , 1 }, - {12 , 2560 , 15 , 32 , 1 }, - {12 , 3072 , 15 , 32 , 1 }, - {12 , 3584 , 15 , 64 , 1 }, - {12 , 4096 , 15 , 64 , 1 }, - {12 , 4608 , 15 , 32 , 1 }, - {12 , 5120 , 15 , 32 , 1 }, - {12 , 5632 , 15 , 64 , 1 }, - {12 , 6144 , 15 , 64 , 1 }, - {12 , 6656 , 15 , 64 , 1 }, - {12 , 7168 , 15 , 32 , 1 
}, - {12 , 7680 , 15 , 64 , 1 }, - {12 , 8192 , 15 , 32 , 1 }, - {12 , 8704 , 15 , 64 , 1 }, - {12 , 9216 , 15 , 32 , 1 }, - {12 , 9728 , 15 , 64 , 1 }, - {12 , 10240 , 15 , 256 , 1 }, - {12 , 20480 , 15 , 32 , 1 }, - {12 , 30720 , 15 , 64 , 1 }, - {12 , 40960 , 15 , 64 , 1 }, - {12 , 51200 , 15 , 64 , 1 }, - {12 , 61440 , 15 , 64 , 1 }, - {12 , 71680 , 15 , 64 , 1 }, - {12 , 81920 , 15 , 64 , 1 }, - {12 , 92160 , 15 , 64 , 1 }, - {12 , 102400 , 15 , 512 , 1 }, - {12 , 204800 , 15 , 8192 , 1 }, - {12 , 307200 , 15 , 4096 , 1 }, - {12 , 409600 , 15 , 16384 , 1 }, - {12 , 512000 , 15 , 4096 , 1 }, - {12 , 614400 , 15 , 8192 , 1 }, - {12 , 716800 , 15 , 4096 , 1 }, - {12 , 819200 , 15 , 32768 , 1 }, - {12 , 921600 , 15 , 4096 , 1 }, - {12 , 1024000, 15 , 8192 , 1 }, - {11 , 512 , 20 , 32 , 1 }, - {11 , 1024 , 20 , 64 , 1 }, - {11 , 1536 , 20 , 32 , 1 }, - {11 , 2048 , 20 , 32 , 1 }, - {11 , 2560 , 20 , 64 , 1 }, - {11 , 3072 , 20 , 32 , 1 }, - {11 , 3584 , 20 , 32 , 1 }, - {11 , 4096 , 20 , 64 , 1 }, - {11 , 4608 , 20 , 32 , 1 }, - {11 , 5120 , 20 , 64 , 1 }, - {11 , 5632 , 20 , 64 , 1 }, - {11 , 6144 , 20 , 32 , 1 }, - {11 , 6656 , 20 , 64 , 1 }, - {11 , 7168 , 20 , 32 , 1 }, - {11 , 7680 , 20 , 32 , 1 }, - {11 , 8192 , 20 , 64 , 1 }, - {11 , 8704 , 20 , 32 , 1 }, - {11 , 9216 , 20 , 32 , 1 }, - {11 , 9728 , 20 , 32 , 1 }, - {11 , 10240 , 20 , 32 , 1 }, - {11 , 20480 , 20 , 32 , 1 }, - {11 , 30720 , 20 , 32 , 1 }, - {11 , 40960 , 20 , 64 , 1 }, - {11 , 51200 , 20 , 64 , 1 }, - {11 , 61440 , 20 , 64 , 1 }, - {11 , 71680 , 20 , 64 , 1 }, - {11 , 81920 , 20 , 512 , 1 }, - {11 , 92160 , 20 , 64 , 1 }, - {11 , 102400 , 20 , 64 , 1 }, - {11 , 204800 , 20 , 8192 , 1 }, - {11 , 307200 , 20 , 2048 , 1 }, - {11 , 409600 , 20 , 8192 , 1 }, - {11 , 512000 , 20 , 4096 , 1 }, - {11 , 614400 , 20 , 8192 , 1 }, - {11 , 716800 , 20 , 4096 , 1 }, - {11 , 819200 , 20 , 16384 , 1 }, - {11 , 921600 , 20 , 4096 , 1 }, - {11 , 1024000, 20 , 8192 , 1 }, - {16 , 512 , 21 , 64 , 1 }, - {16 , 
1024 , 21 , 64 , 1 }, - {16 , 1536 , 21 , 64 , 1 }, - {16 , 2048 , 21 , 32 , 1 }, - {16 , 2560 , 21 , 64 , 1 }, - {16 , 3072 , 21 , 64 , 1 }, - {16 , 3584 , 21 , 64 , 1 }, - {16 , 4096 , 21 , 64 , 1 }, - {16 , 4608 , 21 , 32 , 1 }, - {16 , 5120 , 21 , 32 , 1 }, - {16 , 5632 , 21 , 32 , 1 }, - {16 , 6144 , 21 , 64 , 1 }, - {16 , 6656 , 21 , 64 , 1 }, - {16 , 7168 , 21 , 32 , 1 }, - {16 , 7680 , 21 , 32 , 1 }, - {16 , 8192 , 21 , 32 , 1 }, - {16 , 8704 , 21 , 64 , 1 }, - {16 , 9216 , 21 , 32 , 1 }, - {16 , 9728 , 21 , 64 , 1 }, - {16 , 10240 , 21 , 64 , 1 }, - {16 , 20480 , 21 , 32 , 1 }, - {16 , 30720 , 21 , 64 , 1 }, - {16 , 40960 , 21 , 32 , 1 }, - {16 , 51200 , 21 , 32 , 1 }, - {16 , 61440 , 21 , 64 , 1 }, - {16 , 71680 , 21 , 64 , 1 }, - {16 , 81920 , 21 , 64 , 1 }, - {16 , 92160 , 21 , 64 , 1 }, - {16 , 102400 , 21 , 4096 , 1 }, - {16 , 204800 , 21 , 2048 , 1 }, - {16 , 307200 , 21 , 1024 , 1 }, - {16 , 409600 , 21 , 16384 , 1 }, - {16 , 512000 , 21 , 4096 , 1 }, - {16 , 614400 , 21 , 8192 , 1 }, - {16 , 716800 , 21 , 4096 , 1 }, - {16 , 819200 , 21 , 32768 , 1 }, - {16 , 921600 , 21 , 4096 , 1 }, - {16 , 1024000, 21 , 8192 , 1 }, - {25 , 512 , 28 , 32 , 1 }, - {25 , 1024 , 28 , 64 , 1 }, - {25 , 1536 , 28 , 32 , 1 }, - {25 , 2048 , 28 , 64 , 1 }, - {25 , 2560 , 28 , 64 , 1 }, - {25 , 3072 , 28 , 32 , 1 }, - {25 , 3584 , 28 , 256 , 1 }, - {25 , 4096 , 28 , 32 , 1 }, - {25 , 4608 , 28 , 32 , 1 }, - {25 , 5120 , 28 , 64 , 1 }, - {25 , 5632 , 28 , 64 , 1 }, - {25 , 6144 , 28 , 32 , 1 }, - {25 , 6656 , 28 , 32 , 1 }, - {25 , 7168 , 28 , 64 , 1 }, - {25 , 7680 , 28 , 64 , 1 }, - {25 , 8192 , 28 , 32 , 1 }, - {25 , 8704 , 28 , 32 , 1 }, - {25 , 9216 , 28 , 64 , 1 }, - {25 , 9728 , 28 , 32 , 1 }, - {25 , 10240 , 28 , 32 , 1 }, - {25 , 20480 , 28 , 64 , 1 }, - {25 , 30720 , 28 , 32 , 1 }, - {25 , 40960 , 28 , 32 , 1 }, - {25 , 51200 , 28 , 64 , 1 }, - {25 , 61440 , 28 , 32 , 1 }, - {25 , 71680 , 28 , 64 , 1 }, - {25 , 81920 , 28 , 64 , 1 }, - {25 , 92160 , 28 , 64 , 1 
}, - {25 , 102400 , 28 , 64 , 1 }, - {25 , 204800 , 28 , 8192 , 1 }, - {25 , 307200 , 28 , 2048 , 1 }, - {25 , 409600 , 28 , 16384 , 1 }, - {25 , 512000 , 28 , 4096 , 1 }, - {25 , 614400 , 28 , 8192 , 1 }, - {25 , 716800 , 28 , 4096 , 1 }, - {25 , 819200 , 28 , 16384 , 1 }, - {25 , 921600 , 28 , 4096 , 1 }, - {25 , 1024000, 28 , 8192 , 1 }, - {24 , 512 , 35 , 32 , 1 }, - {24 , 1024 , 35 , 64 , 1 }, - {24 , 1536 , 35 , 64 , 1 }, - {24 , 2048 , 35 , 64 , 1 }, - {24 , 2560 , 35 , 64 , 1 }, - {24 , 3072 , 35 , 32 , 1 }, - {24 , 3584 , 35 , 32 , 1 }, - {24 , 4096 , 35 , 64 , 1 }, - {24 , 4608 , 35 , 64 , 1 }, - {24 , 5120 , 35 , 32 , 1 }, - {24 , 5632 , 35 , 64 , 1 }, - {24 , 6144 , 35 , 32 , 1 }, - {24 , 6656 , 35 , 32 , 1 }, - {24 , 7168 , 35 , 32 , 1 }, - {24 , 7680 , 35 , 64 , 1 }, - {24 , 8192 , 35 , 32 , 1 }, - {24 , 8704 , 35 , 32 , 1 }, - {24 , 9216 , 35 , 32 , 1 }, - {24 , 9728 , 35 , 32 , 1 }, - {24 , 10240 , 35 , 32 , 1 }, - {24 , 20480 , 35 , 64 , 1 }, - {24 , 30720 , 35 , 32 , 1 }, - {24 , 40960 , 35 , 32 , 1 }, - {24 , 51200 , 35 , 64 , 1 }, - {24 , 61440 , 35 , 64 , 1 }, - {24 , 71680 , 35 , 64 , 1 }, - {24 , 81920 , 35 , 64 , 1 }, - {24 , 92160 , 35 , 2048 , 1 }, - {24 , 102400 , 35 , 4096 , 1 }, - {24 , 204800 , 35 , 4096 , 1 }, - {24 , 307200 , 35 , 4096 , 1 }, - {24 , 409600 , 35 , 16384 , 1 }, - {24 , 512000 , 35 , 2048 , 1 }, - {24 , 614400 , 35 , 8192 , 1 }, - {24 , 716800 , 35 , 4096 , 1 }, - {24 , 819200 , 35 , 8192 , 1 }, - {24 , 921600 , 35 , 4096 , 1 }, - {24 , 1024000, 35 , 8192 , 1 }, - {33 , 512 , 36 , 32 , 1 }, - {33 , 1024 , 36 , 32 , 1 }, - {33 , 1536 , 36 , 32 , 0 }, - {33 , 2048 , 36 , 32 , 0 }, - {33 , 2560 , 36 , 32 , 1 }, - {33 , 3072 , 36 , 32 , 1 }, - {33 , 3584 , 36 , 32 , 1 }, - {33 , 4096 , 36 , 64 , 1 }, - {33 , 4608 , 36 , 32 , 1 }, - {33 , 5120 , 36 , 32 , 1 }, - {33 , 5632 , 36 , 32 , 1 }, - {33 , 6144 , 36 , 64 , 1 }, - {33 , 6656 , 36 , 32 , 1 }, - {33 , 7168 , 36 , 32 , 1 }, - {33 , 7680 , 36 , 32 , 1 }, - {33 , 8192 , 
36 , 32 , 1 }, - {33 , 8704 , 36 , 32 , 1 }, - {33 , 9216 , 36 , 64 , 1 }, - {33 , 9728 , 36 , 64 , 1 }, - {33 , 10240 , 36 , 32 , 1 }, - {33 , 20480 , 36 , 32 , 1 }, - {33 , 30720 , 36 , 32 , 1 }, - {33 , 40960 , 36 , 32 , 1 }, - {33 , 51200 , 36 , 32 , 1 }, - {33 , 61440 , 36 , 64 , 1 }, - {33 , 71680 , 36 , 64 , 1 }, - {33 , 81920 , 36 , 16384 , 1 }, - {33 , 92160 , 36 , 2048 , 1 }, - {33 , 102400 , 36 , 4096 , 1 }, - {33 , 204800 , 36 , 4096 , 1 }, - {33 , 307200 , 36 , 4096 , 1 }, - {33 , 409600 , 36 , 8192 , 1 }, - {33 , 512000 , 36 , 4096 , 1 }, - {33 , 614400 , 36 , 8192 , 1 }, - {33 , 716800 , 36 , 4096 , 1 }, - {33 , 819200 , 36 , 16384 , 1 }, - {33 , 921600 , 36 , 4096 , 1 }, - {33 , 1024000, 36 , 8192 , 1 }, - {42 , 512 , 45 , 64 , 1 }, - {42 , 1024 , 45 , 32 , 0 }, - {42 , 1536 , 45 , 32 , 0 }, - {42 , 2048 , 45 , 64 , 0 }, - {42 , 2560 , 45 , 64 , 0 }, - {42 , 3072 , 45 , 32 , 1 }, - {42 , 3584 , 45 , 64 , 1 }, - {42 , 4096 , 45 , 32 , 1 }, - {42 , 4608 , 45 , 128 , 1 }, - {42 , 5120 , 45 , 64 , 1 }, - {42 , 5632 , 45 , 64 , 1 }, - {42 , 6144 , 45 , 32 , 1 }, - {42 , 6656 , 45 , 32 , 1 }, - {42 , 7168 , 45 , 64 , 1 }, - {42 , 7680 , 45 , 32 , 1 }, - {42 , 8192 , 45 , 64 , 1 }, - {42 , 8704 , 45 , 64 , 1 }, - {42 , 9216 , 45 , 64 , 1 }, - {42 , 9728 , 45 , 32 , 1 }, - {42 , 10240 , 45 , 32 , 1 }, - {42 , 20480 , 45 , 32 , 1 }, - {42 , 30720 , 45 , 64 , 1 }, - {42 , 40960 , 45 , 64 , 1 }, - {42 , 51200 , 45 , 32 , 1 }, - {42 , 61440 , 45 , 64 , 1 }, - {42 , 71680 , 45 , 64 , 1 }, - {42 , 81920 , 45 , 16384 , 1 }, - {42 , 92160 , 45 , 2048 , 1 }, - {42 , 102400 , 45 , 2048 , 1 }, - {42 , 204800 , 45 , 2048 , 1 }, - {42 , 307200 , 45 , 4096 , 1 }, - {42 , 409600 , 45 , 16384 , 1 }, - {42 , 512000 , 45 , 4096 , 1 }, - {42 , 614400 , 45 , 4096 , 1 }, - {42 , 716800 , 45 , 4096 , 1 }, - {42 , 819200 , 45 , 32768 , 1 }, - {42 , 921600 , 45 , 4096 , 1 }, - {42 , 1024000, 45 , 8192 , 1 }, - {43 , 512 , 56 , 64 , 1 }, - {43 , 1024 , 56 , 64 , 0 }, - {43 , 1536 , 
56 , 32 , 1 }, - {43 , 2048 , 56 , 64 , 1 }, - {43 , 2560 , 56 , 64 , 0 }, - {43 , 3072 , 56 , 64 , 1 }, - {43 , 3584 , 56 , 64 , 1 }, - {43 , 4096 , 56 , 32 , 1 }, - {43 , 4608 , 56 , 32 , 1 }, - {43 , 5120 , 56 , 64 , 1 }, - {43 , 5632 , 56 , 64 , 1 }, - {43 , 6144 , 56 , 64 , 1 }, - {43 , 6656 , 56 , 32 , 1 }, - {43 , 7168 , 56 , 32 , 1 }, - {43 , 7680 , 56 , 32 , 1 }, - {43 , 8192 , 56 , 4096 , 1 }, - {43 , 8704 , 56 , 64 , 1 }, - {43 , 9216 , 56 , 32 , 1 }, - {43 , 9728 , 56 , 32 , 1 }, - {43 , 10240 , 56 , 64 , 1 }, - {43 , 20480 , 56 , 32 , 1 }, - {43 , 30720 , 56 , 32 , 1 }, - {43 , 40960 , 56 , 32 , 1 }, - {43 , 51200 , 56 , 32 , 1 }, - {43 , 61440 , 56 , 2048 , 1 }, - {43 , 71680 , 56 , 2048 , 1 }, - {43 , 81920 , 56 , 4096 , 1 }, - {43 , 92160 , 56 , 512 , 1 }, - {43 , 102400 , 56 , 4096 , 1 }, - {43 , 204800 , 56 , 4096 , 1 }, - {43 , 307200 , 56 , 512 , 1 }, - {43 , 409600 , 56 , 16384 , 1 }, - {43 , 512000 , 56 , 4096 , 1 }, - {43 , 614400 , 56 , 8192 , 1 }, - {43 , 716800 , 56 , 4096 , 1 }, - {43 , 819200 , 56 , 16384 , 1 }, - {43 , 921600 , 56 , 4096 , 1 }, - {43 , 1024000, 56 , 8192 , 1 }, - {126 , 512 , 84 , 32 , 0 }, - {126 , 1024 , 84 , 32 , 0 }, - {126 , 1536 , 84 , 64 , 0 }, - {126 , 2048 , 84 , 64 , 1 }, - {126 , 2560 , 84 , 64 , 1 }, - {126 , 3072 , 84 , 32 , 1 }, - {126 , 3584 , 84 , 32 , 0 }, - {126 , 4096 , 84 , 32 , 1 }, - {126 , 4608 , 84 , 32 , 0 }, - {126 , 5120 , 84 , 32 , 1 }, - {126 , 5632 , 84 , 32 , 1 }, - {126 , 6144 , 84 , 64 , 1 }, - {126 , 6656 , 84 , 64 , 1 }, - {126 , 7168 , 84 , 1024 , 0 }, - {126 , 7680 , 84 , 32 , 1 }, - {126 , 8192 , 84 , 64 , 1 }, - {126 , 8704 , 84 , 64 , 0 }, - {126 , 9216 , 84 , 32 , 1 }, - {126 , 9728 , 84 , 32 , 1 }, - {126 , 10240 , 84 , 32 , 1 }, - {126 , 20480 , 84 , 32 , 1 }, - {126 , 30720 , 84 , 64 , 1 }, - {126 , 40960 , 84 , 64 , 0 }, - {126 , 51200 , 84 , 2048 , 1 }, - {126 , 61440 , 84 , 64 , 0 }, - {126 , 71680 , 84 , 64 , 0 }, - {126 , 81920 , 84 , 81920 , 0 }, - {126 , 92160 , 84 , 
1024 , 0 }, - {126 , 102400 , 84 , 102400 , 0 }, - {126 , 204800 , 84 , 204800 , 0 }, - {126 , 307200 , 84 , 307200 , 0 }, - {126 , 409600 , 84 , 409600 , 0 }, - {126 , 512000 , 84 , 512000 , 0 }, - {126 , 614400 , 84 , 614400 , 0 }, - {126 , 716800 , 84 , 716800 , 0 }, - {126 , 819200 , 84 , 819200 , 0 }, - {126 , 921600 , 84 , 921600 , 0 }, - {126 , 1024000, 84 , 1024000, 0 }, - {210 , 512 , 120 , 32 , 0 }, - {210 , 1024 , 120 , 32 , 0 }, - {210 , 1536 , 120 , 512 , 1 }, - {210 , 2048 , 120 , 1024 , 1 }, - {210 , 2560 , 120 , 32 , 1 }, - {210 , 3072 , 120 , 1024 , 1 }, - {210 , 3584 , 120 , 512 , 1 }, - {210 , 4096 , 120 , 1024 , 0 }, - {210 , 4608 , 120 , 512 , 1 }, - {210 , 5120 , 120 , 64 , 0 }, - {210 , 5632 , 120 , 64 , 0 }, - {210 , 6144 , 120 , 1024 , 1 }, - {210 , 6656 , 120 , 256 , 1 }, - {210 , 7168 , 120 , 512 , 1 }, - {210 , 7680 , 120 , 512 , 1 }, - {210 , 8192 , 120 , 4096 , 1 }, - {210 , 8704 , 120 , 512 , 1 }, - {210 , 9216 , 120 , 1024 , 1 }, - {210 , 9728 , 120 , 64 , 0 }, - {210 , 10240 , 120 , 1024 , 0 }, - {210 , 20480 , 120 , 4096 , 0 }, - {210 , 30720 , 120 , 2048 , 0 }, - {210 , 40960 , 120 , 2048 , 0 }, - {210 , 51200 , 120 , 2048 , 0 }, - {210 , 61440 , 120 , 2048 , 0 }, - {210 , 71680 , 120 , 2048 , 0 }, - {210 , 81920 , 120 , 4096 , 0 }, - {210 , 92160 , 120 , 2048 , 0 }, - {210 , 102400 , 120 , 4096 , 0 }, - {210 , 204800 , 120 , 204800 , 0 }, - {210 , 307200 , 120 , 307200 , 0 }, - {210 , 409600 , 120 , 409600 , 0 }, - {210 , 512000 , 120 , 512000 , 0 }, - {210 , 614400 , 120 , 614400 , 0 }, - {210 , 716800 , 120 , 716800 , 0 }, - {210 , 819200 , 120 , 819200 , 0 }, - {210 , 921600 , 120 , 921600 , 0 }, - {210 , 1024000, 120 , 1024000, 0 }, - {330 , 512 , 165 , 512 , 1 }, - {330 , 1024 , 165 , 1024 , 1 }, - {330 , 1536 , 165 , 512 , 1 }, - {330 , 2048 , 165 , 2048 , 1 }, - {330 , 2560 , 165 , 512 , 0 }, - {330 , 3072 , 165 , 512 , 0 }, - {330 , 3584 , 165 , 256 , 0 }, - {330 , 4096 , 165 , 1024 , 0 }, - {330 , 4608 , 165 , 512 , 1 }, 
- {330 , 5120 , 165 , 1024 , 1 }, - {330 , 5632 , 165 , 64 , 0 }, - {330 , 6144 , 165 , 64 , 0 }, - {330 , 6656 , 165 , 64 , 0 }, - {330 , 7168 , 165 , 1024 , 0 }, - {330 , 7680 , 165 , 512 , 1 }, - {330 , 8192 , 165 , 1024 , 0 }, - {330 , 8704 , 165 , 512 , 0 }, - {330 , 9216 , 165 , 512 , 0 }, - {330 , 9728 , 165 , 128 , 0 }, - {330 , 10240 , 165 , 1024 , 0 }, - {330 , 20480 , 165 , 1024 , 0 }, - {330 , 30720 , 165 , 30720 , 0 }, - {330 , 40960 , 165 , 40960 , 0 }, - {330 , 51200 , 165 , 51200 , 0 }, - {330 , 61440 , 165 , 61440 , 0 }, - {330 , 71680 , 165 , 71680 , 0 }, - {330 , 81920 , 165 , 81920 , 0 }, - {330 , 92160 , 165 , 92160 , 0 }, - {330 , 102400 , 165 , 102400 , 0 }, - {330 , 204800 , 165 , 204800 , 0 }, - {330 , 307200 , 165 , 307200 , 0 }, - {330 , 409600 , 165 , 409600 , 0 }, - {330 , 512000 , 165 , 512000 , 0 }, - {330 , 614400 , 165 , 614400 , 0 }, - {330 , 716800 , 165 , 716800 , 0 }, - {330 , 819200 , 165 , 819200 , 0 }, - {330 , 921600 , 165 , 921600 , 0 }, - {330 , 1024000, 165 , 1024000, 0 } +std::vector > sgemm_tn_v100 = { + {1, 512, 3, 64, 0}, + {1, 1024, 3, 64, 0}, + {1, 1536, 3, 64, 0}, + {1, 2048, 3, 64, 0}, + {1, 2560, 3, 32, 0}, + {1, 3072, 3, 32, 0}, + {1, 3584, 3, 32, 0}, + {1, 4096, 3, 32, 0}, + {1, 4608, 3, 64, 0}, + {1, 5120, 3, 64, 0}, + {1, 5632, 3, 64, 0}, + {1, 6144, 3, 64, 0}, + {1, 6656, 3, 32, 0}, + {1, 7168, 3, 64, 0}, + {1, 7680, 3, 64, 0}, + {1, 8192, 3, 32, 0}, + {1, 8704, 3, 64, 0}, + {1, 9216, 3, 64, 0}, + {1, 9728, 3, 64, 0}, + {1, 10240, 3, 32, 0}, + {1, 20480, 3, 64, 0}, + {1, 30720, 3, 64, 1}, + {1, 40960, 3, 32, 0}, + {1, 51200, 3, 64, 1}, + {1, 61440, 3, 64, 1}, + {1, 71680, 3, 1024, 1}, + {1, 81920, 3, 32, 0}, + {1, 92160, 3, 2048, 1}, + {1, 102400, 3, 4096, 1}, + {1, 204800, 3, 2048, 1}, + {1, 307200, 3, 4096, 1}, + {1, 409600, 3, 4096, 1}, + {1, 512000, 3, 4096, 1}, + {1, 614400, 3, 4096, 1}, + {1, 716800, 3, 4096, 1}, + {1, 819200, 3, 16384, 1}, + {1, 921600, 3, 4096, 1}, + {1, 1024000, 3, 8192, 1}, + {1, 
512, 4, 64, 0}, + {1, 1024, 4, 64, 0}, + {1, 1536, 4, 32, 0}, + {1, 2048, 4, 1024, 0}, + {1, 2560, 4, 64, 0}, + {1, 3072, 4, 32, 0}, + {1, 3584, 4, 32, 0}, + {1, 4096, 4, 32, 0}, + {1, 4608, 4, 64, 0}, + {1, 5120, 4, 64, 0}, + {1, 5632, 4, 32, 0}, + {1, 6144, 4, 32, 0}, + {1, 6656, 4, 32, 0}, + {1, 7168, 4, 64, 0}, + {1, 7680, 4, 32, 0}, + {1, 8192, 4, 32, 1}, + {1, 8704, 4, 32, 1}, + {1, 9216, 4, 64, 0}, + {1, 9728, 4, 32, 0}, + {1, 10240, 4, 64, 0}, + {1, 20480, 4, 32, 0}, + {1, 30720, 4, 64, 1}, + {1, 40960, 4, 32, 0}, + {1, 51200, 4, 2048, 1}, + {1, 61440, 4, 64, 1}, + {1, 71680, 4, 64, 1}, + {1, 81920, 4, 64, 1}, + {1, 92160, 4, 64, 1}, + {1, 102400, 4, 4096, 1}, + {1, 204800, 4, 4096, 1}, + {1, 307200, 4, 4096, 1}, + {1, 409600, 4, 4096, 1}, + {1, 512000, 4, 4096, 1}, + {1, 614400, 4, 8192, 1}, + {1, 716800, 4, 4096, 1}, + {1, 819200, 4, 32768, 1}, + {1, 921600, 4, 4096, 1}, + {1, 1024000, 4, 8192, 1}, + {3, 512, 6, 64, 1}, + {3, 1024, 6, 64, 0}, + {3, 1536, 6, 32, 1}, + {3, 2048, 6, 32, 1}, + {3, 2560, 6, 32, 1}, + {3, 3072, 6, 64, 1}, + {3, 3584, 6, 32, 1}, + {3, 4096, 6, 32, 1}, + {3, 4608, 6, 64, 1}, + {3, 5120, 6, 64, 1}, + {3, 5632, 6, 64, 1}, + {3, 6144, 6, 1024, 1}, + {3, 6656, 6, 64, 1}, + {3, 7168, 6, 512, 1}, + {3, 7680, 6, 32, 1}, + {3, 8192, 6, 32, 1}, + {3, 8704, 6, 512, 1}, + {3, 9216, 6, 32, 1}, + {3, 9728, 6, 64, 1}, + {3, 10240, 6, 1024, 1}, + {3, 20480, 6, 64, 1}, + {3, 30720, 6, 64, 1}, + {3, 40960, 6, 64, 1}, + {3, 51200, 6, 64, 1}, + {3, 61440, 6, 64, 1}, + {3, 71680, 6, 64, 1}, + {3, 81920, 6, 64, 1}, + {3, 92160, 6, 64, 1}, + {3, 102400, 6, 512, 1}, + {3, 204800, 6, 2048, 1}, + {3, 307200, 6, 2048, 1}, + {3, 409600, 6, 16384, 1}, + {3, 512000, 6, 4096, 1}, + {3, 614400, 6, 4096, 1}, + {3, 716800, 6, 2048, 0}, + {3, 819200, 6, 32768, 0}, + {3, 921600, 6, 4096, 0}, + {3, 1024000, 6, 8192, 0}, + {4, 512, 10, 64, 1}, + {4, 1024, 10, 32, 1}, + {4, 1536, 10, 64, 1}, + {4, 2048, 10, 32, 1}, + {4, 2560, 10, 32, 1}, + {4, 3072, 10, 32, 1}, + 
{4, 3584, 10, 64, 1}, + {4, 4096, 10, 64, 1}, + {4, 4608, 10, 64, 1}, + {4, 5120, 10, 64, 1}, + {4, 5632, 10, 64, 1}, + {4, 6144, 10, 32, 1}, + {4, 6656, 10, 64, 1}, + {4, 7168, 10, 64, 1}, + {4, 7680, 10, 64, 1}, + {4, 8192, 10, 32, 1}, + {4, 8704, 10, 512, 1}, + {4, 9216, 10, 64, 1}, + {4, 9728, 10, 32, 1}, + {4, 10240, 10, 64, 1}, + {4, 20480, 10, 64, 1}, + {4, 30720, 10, 64, 1}, + {4, 40960, 10, 64, 1}, + {4, 51200, 10, 64, 1}, + {4, 61440, 10, 64, 1}, + {4, 71680, 10, 64, 1}, + {4, 81920, 10, 16384, 1}, + {4, 92160, 10, 64, 1}, + {4, 102400, 10, 64, 1}, + {4, 204800, 10, 1024, 1}, + {4, 307200, 10, 4096, 1}, + {4, 409600, 10, 16384, 1}, + {4, 512000, 10, 4096, 1}, + {4, 614400, 10, 8192, 1}, + {4, 716800, 10, 4096, 1}, + {4, 819200, 10, 32768, 1}, + {4, 921600, 10, 4096, 1}, + {4, 1024000, 10, 8192, 1}, + {6, 512, 10, 64, 1}, + {6, 1024, 10, 64, 1}, + {6, 1536, 10, 32, 1}, + {6, 2048, 10, 64, 1}, + {6, 2560, 10, 64, 1}, + {6, 3072, 10, 32, 1}, + {6, 3584, 10, 64, 1}, + {6, 4096, 10, 256, 1}, + {6, 4608, 10, 64, 1}, + {6, 5120, 10, 64, 1}, + {6, 5632, 10, 512, 1}, + {6, 6144, 10, 64, 1}, + {6, 6656, 10, 64, 1}, + {6, 7168, 10, 64, 1}, + {6, 7680, 10, 64, 1}, + {6, 8192, 10, 32, 1}, + {6, 8704, 10, 128, 1}, + {6, 9216, 10, 32, 1}, + {6, 9728, 10, 32, 1}, + {6, 10240, 10, 32, 1}, + {6, 20480, 10, 64, 1}, + {6, 30720, 10, 64, 1}, + {6, 40960, 10, 64, 1}, + {6, 51200, 10, 64, 1}, + {6, 61440, 10, 64, 1}, + {6, 71680, 10, 64, 1}, + {6, 81920, 10, 64, 1}, + {6, 92160, 10, 1024, 1}, + {6, 102400, 10, 64, 1}, + {6, 204800, 10, 8192, 1}, + {6, 307200, 10, 4096, 1}, + {6, 409600, 10, 16384, 1}, + {6, 512000, 10, 4096, 1}, + {6, 614400, 10, 8192, 1}, + {6, 716800, 10, 4096, 1}, + {6, 819200, 10, 32768, 1}, + {6, 921600, 10, 4096, 1}, + {6, 1024000, 10, 8192, 1}, + {12, 512, 15, 64, 1}, + {12, 1024, 15, 32, 1}, + {12, 1536, 15, 64, 1}, + {12, 2048, 15, 32, 1}, + {12, 2560, 15, 32, 1}, + {12, 3072, 15, 32, 1}, + {12, 3584, 15, 64, 1}, + {12, 4096, 15, 64, 1}, + {12, 4608, 
15, 32, 1}, + {12, 5120, 15, 32, 1}, + {12, 5632, 15, 64, 1}, + {12, 6144, 15, 64, 1}, + {12, 6656, 15, 64, 1}, + {12, 7168, 15, 32, 1}, + {12, 7680, 15, 64, 1}, + {12, 8192, 15, 32, 1}, + {12, 8704, 15, 64, 1}, + {12, 9216, 15, 32, 1}, + {12, 9728, 15, 64, 1}, + {12, 10240, 15, 256, 1}, + {12, 20480, 15, 32, 1}, + {12, 30720, 15, 64, 1}, + {12, 40960, 15, 64, 1}, + {12, 51200, 15, 64, 1}, + {12, 61440, 15, 64, 1}, + {12, 71680, 15, 64, 1}, + {12, 81920, 15, 64, 1}, + {12, 92160, 15, 64, 1}, + {12, 102400, 15, 512, 1}, + {12, 204800, 15, 8192, 1}, + {12, 307200, 15, 4096, 1}, + {12, 409600, 15, 16384, 1}, + {12, 512000, 15, 4096, 1}, + {12, 614400, 15, 8192, 1}, + {12, 716800, 15, 4096, 1}, + {12, 819200, 15, 32768, 1}, + {12, 921600, 15, 4096, 1}, + {12, 1024000, 15, 8192, 1}, + {11, 512, 20, 32, 1}, + {11, 1024, 20, 64, 1}, + {11, 1536, 20, 32, 1}, + {11, 2048, 20, 32, 1}, + {11, 2560, 20, 64, 1}, + {11, 3072, 20, 32, 1}, + {11, 3584, 20, 32, 1}, + {11, 4096, 20, 64, 1}, + {11, 4608, 20, 32, 1}, + {11, 5120, 20, 64, 1}, + {11, 5632, 20, 64, 1}, + {11, 6144, 20, 32, 1}, + {11, 6656, 20, 64, 1}, + {11, 7168, 20, 32, 1}, + {11, 7680, 20, 32, 1}, + {11, 8192, 20, 64, 1}, + {11, 8704, 20, 32, 1}, + {11, 9216, 20, 32, 1}, + {11, 9728, 20, 32, 1}, + {11, 10240, 20, 32, 1}, + {11, 20480, 20, 32, 1}, + {11, 30720, 20, 32, 1}, + {11, 40960, 20, 64, 1}, + {11, 51200, 20, 64, 1}, + {11, 61440, 20, 64, 1}, + {11, 71680, 20, 64, 1}, + {11, 81920, 20, 512, 1}, + {11, 92160, 20, 64, 1}, + {11, 102400, 20, 64, 1}, + {11, 204800, 20, 8192, 1}, + {11, 307200, 20, 2048, 1}, + {11, 409600, 20, 8192, 1}, + {11, 512000, 20, 4096, 1}, + {11, 614400, 20, 8192, 1}, + {11, 716800, 20, 4096, 1}, + {11, 819200, 20, 16384, 1}, + {11, 921600, 20, 4096, 1}, + {11, 1024000, 20, 8192, 1}, + {16, 512, 21, 64, 1}, + {16, 1024, 21, 64, 1}, + {16, 1536, 21, 64, 1}, + {16, 2048, 21, 32, 1}, + {16, 2560, 21, 64, 1}, + {16, 3072, 21, 64, 1}, + {16, 3584, 21, 64, 1}, + {16, 4096, 21, 64, 1}, + {16, 4608, 
21, 32, 1}, + {16, 5120, 21, 32, 1}, + {16, 5632, 21, 32, 1}, + {16, 6144, 21, 64, 1}, + {16, 6656, 21, 64, 1}, + {16, 7168, 21, 32, 1}, + {16, 7680, 21, 32, 1}, + {16, 8192, 21, 32, 1}, + {16, 8704, 21, 64, 1}, + {16, 9216, 21, 32, 1}, + {16, 9728, 21, 64, 1}, + {16, 10240, 21, 64, 1}, + {16, 20480, 21, 32, 1}, + {16, 30720, 21, 64, 1}, + {16, 40960, 21, 32, 1}, + {16, 51200, 21, 32, 1}, + {16, 61440, 21, 64, 1}, + {16, 71680, 21, 64, 1}, + {16, 81920, 21, 64, 1}, + {16, 92160, 21, 64, 1}, + {16, 102400, 21, 4096, 1}, + {16, 204800, 21, 2048, 1}, + {16, 307200, 21, 1024, 1}, + {16, 409600, 21, 16384, 1}, + {16, 512000, 21, 4096, 1}, + {16, 614400, 21, 8192, 1}, + {16, 716800, 21, 4096, 1}, + {16, 819200, 21, 32768, 1}, + {16, 921600, 21, 4096, 1}, + {16, 1024000, 21, 8192, 1}, + {25, 512, 28, 32, 1}, + {25, 1024, 28, 64, 1}, + {25, 1536, 28, 32, 1}, + {25, 2048, 28, 64, 1}, + {25, 2560, 28, 64, 1}, + {25, 3072, 28, 32, 1}, + {25, 3584, 28, 256, 1}, + {25, 4096, 28, 32, 1}, + {25, 4608, 28, 32, 1}, + {25, 5120, 28, 64, 1}, + {25, 5632, 28, 64, 1}, + {25, 6144, 28, 32, 1}, + {25, 6656, 28, 32, 1}, + {25, 7168, 28, 64, 1}, + {25, 7680, 28, 64, 1}, + {25, 8192, 28, 32, 1}, + {25, 8704, 28, 32, 1}, + {25, 9216, 28, 64, 1}, + {25, 9728, 28, 32, 1}, + {25, 10240, 28, 32, 1}, + {25, 20480, 28, 64, 1}, + {25, 30720, 28, 32, 1}, + {25, 40960, 28, 32, 1}, + {25, 51200, 28, 64, 1}, + {25, 61440, 28, 32, 1}, + {25, 71680, 28, 64, 1}, + {25, 81920, 28, 64, 1}, + {25, 92160, 28, 64, 1}, + {25, 102400, 28, 64, 1}, + {25, 204800, 28, 8192, 1}, + {25, 307200, 28, 2048, 1}, + {25, 409600, 28, 16384, 1}, + {25, 512000, 28, 4096, 1}, + {25, 614400, 28, 8192, 1}, + {25, 716800, 28, 4096, 1}, + {25, 819200, 28, 16384, 1}, + {25, 921600, 28, 4096, 1}, + {25, 1024000, 28, 8192, 1}, + {24, 512, 35, 32, 1}, + {24, 1024, 35, 64, 1}, + {24, 1536, 35, 64, 1}, + {24, 2048, 35, 64, 1}, + {24, 2560, 35, 64, 1}, + {24, 3072, 35, 32, 1}, + {24, 3584, 35, 32, 1}, + {24, 4096, 35, 64, 1}, + {24, 
4608, 35, 64, 1}, + {24, 5120, 35, 32, 1}, + {24, 5632, 35, 64, 1}, + {24, 6144, 35, 32, 1}, + {24, 6656, 35, 32, 1}, + {24, 7168, 35, 32, 1}, + {24, 7680, 35, 64, 1}, + {24, 8192, 35, 32, 1}, + {24, 8704, 35, 32, 1}, + {24, 9216, 35, 32, 1}, + {24, 9728, 35, 32, 1}, + {24, 10240, 35, 32, 1}, + {24, 20480, 35, 64, 1}, + {24, 30720, 35, 32, 1}, + {24, 40960, 35, 32, 1}, + {24, 51200, 35, 64, 1}, + {24, 61440, 35, 64, 1}, + {24, 71680, 35, 64, 1}, + {24, 81920, 35, 64, 1}, + {24, 92160, 35, 2048, 1}, + {24, 102400, 35, 4096, 1}, + {24, 204800, 35, 4096, 1}, + {24, 307200, 35, 4096, 1}, + {24, 409600, 35, 16384, 1}, + {24, 512000, 35, 2048, 1}, + {24, 614400, 35, 8192, 1}, + {24, 716800, 35, 4096, 1}, + {24, 819200, 35, 8192, 1}, + {24, 921600, 35, 4096, 1}, + {24, 1024000, 35, 8192, 1}, + {33, 512, 36, 32, 1}, + {33, 1024, 36, 32, 1}, + {33, 1536, 36, 32, 0}, + {33, 2048, 36, 32, 0}, + {33, 2560, 36, 32, 1}, + {33, 3072, 36, 32, 1}, + {33, 3584, 36, 32, 1}, + {33, 4096, 36, 64, 1}, + {33, 4608, 36, 32, 1}, + {33, 5120, 36, 32, 1}, + {33, 5632, 36, 32, 1}, + {33, 6144, 36, 64, 1}, + {33, 6656, 36, 32, 1}, + {33, 7168, 36, 32, 1}, + {33, 7680, 36, 32, 1}, + {33, 8192, 36, 32, 1}, + {33, 8704, 36, 32, 1}, + {33, 9216, 36, 64, 1}, + {33, 9728, 36, 64, 1}, + {33, 10240, 36, 32, 1}, + {33, 20480, 36, 32, 1}, + {33, 30720, 36, 32, 1}, + {33, 40960, 36, 32, 1}, + {33, 51200, 36, 32, 1}, + {33, 61440, 36, 64, 1}, + {33, 71680, 36, 64, 1}, + {33, 81920, 36, 16384, 1}, + {33, 92160, 36, 2048, 1}, + {33, 102400, 36, 4096, 1}, + {33, 204800, 36, 4096, 1}, + {33, 307200, 36, 4096, 1}, + {33, 409600, 36, 8192, 1}, + {33, 512000, 36, 4096, 1}, + {33, 614400, 36, 8192, 1}, + {33, 716800, 36, 4096, 1}, + {33, 819200, 36, 16384, 1}, + {33, 921600, 36, 4096, 1}, + {33, 1024000, 36, 8192, 1}, + {42, 512, 45, 64, 1}, + {42, 1024, 45, 32, 0}, + {42, 1536, 45, 32, 0}, + {42, 2048, 45, 64, 0}, + {42, 2560, 45, 64, 0}, + {42, 3072, 45, 32, 1}, + {42, 3584, 45, 64, 1}, + {42, 4096, 45, 32, 1}, 
+ {42, 4608, 45, 128, 1}, + {42, 5120, 45, 64, 1}, + {42, 5632, 45, 64, 1}, + {42, 6144, 45, 32, 1}, + {42, 6656, 45, 32, 1}, + {42, 7168, 45, 64, 1}, + {42, 7680, 45, 32, 1}, + {42, 8192, 45, 64, 1}, + {42, 8704, 45, 64, 1}, + {42, 9216, 45, 64, 1}, + {42, 9728, 45, 32, 1}, + {42, 10240, 45, 32, 1}, + {42, 20480, 45, 32, 1}, + {42, 30720, 45, 64, 1}, + {42, 40960, 45, 64, 1}, + {42, 51200, 45, 32, 1}, + {42, 61440, 45, 64, 1}, + {42, 71680, 45, 64, 1}, + {42, 81920, 45, 16384, 1}, + {42, 92160, 45, 2048, 1}, + {42, 102400, 45, 2048, 1}, + {42, 204800, 45, 2048, 1}, + {42, 307200, 45, 4096, 1}, + {42, 409600, 45, 16384, 1}, + {42, 512000, 45, 4096, 1}, + {42, 614400, 45, 4096, 1}, + {42, 716800, 45, 4096, 1}, + {42, 819200, 45, 32768, 1}, + {42, 921600, 45, 4096, 1}, + {42, 1024000, 45, 8192, 1}, + {43, 512, 56, 64, 1}, + {43, 1024, 56, 64, 0}, + {43, 1536, 56, 32, 1}, + {43, 2048, 56, 64, 1}, + {43, 2560, 56, 64, 0}, + {43, 3072, 56, 64, 1}, + {43, 3584, 56, 64, 1}, + {43, 4096, 56, 32, 1}, + {43, 4608, 56, 32, 1}, + {43, 5120, 56, 64, 1}, + {43, 5632, 56, 64, 1}, + {43, 6144, 56, 64, 1}, + {43, 6656, 56, 32, 1}, + {43, 7168, 56, 32, 1}, + {43, 7680, 56, 32, 1}, + {43, 8192, 56, 4096, 1}, + {43, 8704, 56, 64, 1}, + {43, 9216, 56, 32, 1}, + {43, 9728, 56, 32, 1}, + {43, 10240, 56, 64, 1}, + {43, 20480, 56, 32, 1}, + {43, 30720, 56, 32, 1}, + {43, 40960, 56, 32, 1}, + {43, 51200, 56, 32, 1}, + {43, 61440, 56, 2048, 1}, + {43, 71680, 56, 2048, 1}, + {43, 81920, 56, 4096, 1}, + {43, 92160, 56, 512, 1}, + {43, 102400, 56, 4096, 1}, + {43, 204800, 56, 4096, 1}, + {43, 307200, 56, 512, 1}, + {43, 409600, 56, 16384, 1}, + {43, 512000, 56, 4096, 1}, + {43, 614400, 56, 8192, 1}, + {43, 716800, 56, 4096, 1}, + {43, 819200, 56, 16384, 1}, + {43, 921600, 56, 4096, 1}, + {43, 1024000, 56, 8192, 1}, + {126, 512, 84, 32, 0}, + {126, 1024, 84, 32, 0}, + {126, 1536, 84, 64, 0}, + {126, 2048, 84, 64, 1}, + {126, 2560, 84, 64, 1}, + {126, 3072, 84, 32, 1}, + {126, 3584, 84, 32, 0}, + 
{126, 4096, 84, 32, 1}, + {126, 4608, 84, 32, 0}, + {126, 5120, 84, 32, 1}, + {126, 5632, 84, 32, 1}, + {126, 6144, 84, 64, 1}, + {126, 6656, 84, 64, 1}, + {126, 7168, 84, 1024, 0}, + {126, 7680, 84, 32, 1}, + {126, 8192, 84, 64, 1}, + {126, 8704, 84, 64, 0}, + {126, 9216, 84, 32, 1}, + {126, 9728, 84, 32, 1}, + {126, 10240, 84, 32, 1}, + {126, 20480, 84, 32, 1}, + {126, 30720, 84, 64, 1}, + {126, 40960, 84, 64, 0}, + {126, 51200, 84, 2048, 1}, + {126, 61440, 84, 64, 0}, + {126, 71680, 84, 64, 0}, + {126, 81920, 84, 81920, 0}, + {126, 92160, 84, 1024, 0}, + {126, 102400, 84, 102400, 0}, + {126, 204800, 84, 204800, 0}, + {126, 307200, 84, 307200, 0}, + {126, 409600, 84, 409600, 0}, + {126, 512000, 84, 512000, 0}, + {126, 614400, 84, 614400, 0}, + {126, 716800, 84, 716800, 0}, + {126, 819200, 84, 819200, 0}, + {126, 921600, 84, 921600, 0}, + {126, 1024000, 84, 1024000, 0}, + {210, 512, 120, 32, 0}, + {210, 1024, 120, 32, 0}, + {210, 1536, 120, 512, 1}, + {210, 2048, 120, 1024, 1}, + {210, 2560, 120, 32, 1}, + {210, 3072, 120, 1024, 1}, + {210, 3584, 120, 512, 1}, + {210, 4096, 120, 1024, 0}, + {210, 4608, 120, 512, 1}, + {210, 5120, 120, 64, 0}, + {210, 5632, 120, 64, 0}, + {210, 6144, 120, 1024, 1}, + {210, 6656, 120, 256, 1}, + {210, 7168, 120, 512, 1}, + {210, 7680, 120, 512, 1}, + {210, 8192, 120, 4096, 1}, + {210, 8704, 120, 512, 1}, + {210, 9216, 120, 1024, 1}, + {210, 9728, 120, 64, 0}, + {210, 10240, 120, 1024, 0}, + {210, 20480, 120, 4096, 0}, + {210, 30720, 120, 2048, 0}, + {210, 40960, 120, 2048, 0}, + {210, 51200, 120, 2048, 0}, + {210, 61440, 120, 2048, 0}, + {210, 71680, 120, 2048, 0}, + {210, 81920, 120, 4096, 0}, + {210, 92160, 120, 2048, 0}, + {210, 102400, 120, 4096, 0}, + {210, 204800, 120, 204800, 0}, + {210, 307200, 120, 307200, 0}, + {210, 409600, 120, 409600, 0}, + {210, 512000, 120, 512000, 0}, + {210, 614400, 120, 614400, 0}, + {210, 716800, 120, 716800, 0}, + {210, 819200, 120, 819200, 0}, + {210, 921600, 120, 921600, 0}, + {210, 1024000, 
120, 1024000, 0}, + {330, 512, 165, 512, 1}, + {330, 1024, 165, 1024, 1}, + {330, 1536, 165, 512, 1}, + {330, 2048, 165, 2048, 1}, + {330, 2560, 165, 512, 0}, + {330, 3072, 165, 512, 0}, + {330, 3584, 165, 256, 0}, + {330, 4096, 165, 1024, 0}, + {330, 4608, 165, 512, 1}, + {330, 5120, 165, 1024, 1}, + {330, 5632, 165, 64, 0}, + {330, 6144, 165, 64, 0}, + {330, 6656, 165, 64, 0}, + {330, 7168, 165, 1024, 0}, + {330, 7680, 165, 512, 1}, + {330, 8192, 165, 1024, 0}, + {330, 8704, 165, 512, 0}, + {330, 9216, 165, 512, 0}, + {330, 9728, 165, 128, 0}, + {330, 10240, 165, 1024, 0}, + {330, 20480, 165, 1024, 0}, + {330, 30720, 165, 30720, 0}, + {330, 40960, 165, 40960, 0}, + {330, 51200, 165, 51200, 0}, + {330, 61440, 165, 61440, 0}, + {330, 71680, 165, 71680, 0}, + {330, 81920, 165, 81920, 0}, + {330, 92160, 165, 92160, 0}, + {330, 102400, 165, 102400, 0}, + {330, 204800, 165, 204800, 0}, + {330, 307200, 165, 307200, 0}, + {330, 409600, 165, 409600, 0}, + {330, 512000, 165, 512000, 0}, + {330, 614400, 165, 614400, 0}, + {330, 716800, 165, 716800, 0}, + {330, 819200, 165, 819200, 0}, + {330, 921600, 165, 921600, 0}, + {330, 1024000, 165, 1024000, 0} }; //////////////////////////////////////////////////////////////////////////////// -std::vector< std::array > dgemm_nn_v100 = -{ - {3 , 512 , 1 , 32 , 0 }, - {3 , 1024 , 1 , 32 , 0 }, - {3 , 1536 , 1 , 32 , 0 }, - {3 , 2048 , 1 , 32 , 0 }, - {3 , 2560 , 1 , 64 , 0 }, - {3 , 3072 , 1 , 32 , 0 }, - {3 , 3584 , 1 , 64 , 0 }, - {3 , 4096 , 1 , 32 , 0 }, - {3 , 4608 , 1 , 32 , 0 }, - {3 , 5120 , 1 , 1024 , 0 }, - {3 , 5632 , 1 , 64 , 0 }, - {3 , 6144 , 1 , 64 , 0 }, - {3 , 6656 , 1 , 128 , 0 }, - {3 , 7168 , 1 , 64 , 0 }, - {3 , 7680 , 1 , 64 , 0 }, - {3 , 8192 , 1 , 64 , 0 }, - {3 , 8704 , 1 , 64 , 0 }, - {3 , 9216 , 1 , 32 , 0 }, - {3 , 9728 , 1 , 256 , 0 }, - {3 , 10240 , 1 , 32 , 0 }, - {3 , 20480 , 1 , 64 , 0 }, - {3 , 30720 , 1 , 32 , 0 }, - {3 , 40960 , 1 , 64 , 0 }, - {3 , 51200 , 1 , 32 , 0 }, - {3 , 61440 , 1 , 32 , 0 }, 
- {3 , 71680 , 1 , 32 , 0 }, - {3 , 81920 , 1 , 64 , 0 }, - {3 , 92160 , 1 , 32 , 0 }, - {3 , 102400 , 1 , 64 , 0 }, - {3 , 204800 , 1 , 64 , 0 }, - {3 , 307200 , 1 , 64 , 0 }, - {3 , 409600 , 1 , 32 , 0 }, - {3 , 512000 , 1 , 64 , 0 }, - {3 , 614400 , 1 , 32 , 0 }, - {3 , 716800 , 1 , 32 , 0 }, - {3 , 819200 , 1 , 32 , 0 }, - {3 , 921600 , 1 , 32 , 0 }, - {3 , 1024000, 1 , 64 , 0 }, - {4 , 512 , 1 , 64 , 0 }, - {4 , 1024 , 1 , 64 , 0 }, - {4 , 1536 , 1 , 512 , 0 }, - {4 , 2048 , 1 , 64 , 0 }, - {4 , 2560 , 1 , 32 , 0 }, - {4 , 3072 , 1 , 32 , 0 }, - {4 , 3584 , 1 , 32 , 0 }, - {4 , 4096 , 1 , 64 , 1 }, - {4 , 4608 , 1 , 64 , 0 }, - {4 , 5120 , 1 , 64 , 0 }, - {4 , 5632 , 1 , 64 , 0 }, - {4 , 6144 , 1 , 32 , 0 }, - {4 , 6656 , 1 , 32 , 0 }, - {4 , 7168 , 1 , 32 , 0 }, - {4 , 7680 , 1 , 32 , 0 }, - {4 , 8192 , 1 , 64 , 0 }, - {4 , 8704 , 1 , 32 , 0 }, - {4 , 9216 , 1 , 32 , 0 }, - {4 , 9728 , 1 , 32 , 0 }, - {4 , 10240 , 1 , 32 , 0 }, - {4 , 20480 , 1 , 64 , 1 }, - {4 , 30720 , 1 , 32 , 0 }, - {4 , 40960 , 1 , 64 , 0 }, - {4 , 51200 , 1 , 64 , 0 }, - {4 , 61440 , 1 , 32 , 0 }, - {4 , 71680 , 1 , 32 , 0 }, - {4 , 81920 , 1 , 32 , 0 }, - {4 , 92160 , 1 , 64 , 0 }, - {4 , 102400 , 1 , 64 , 0 }, - {4 , 204800 , 1 , 64 , 0 }, - {4 , 307200 , 1 , 32 , 0 }, - {4 , 409600 , 1 , 64 , 0 }, - {4 , 512000 , 1 , 32 , 0 }, - {4 , 614400 , 1 , 32 , 0 }, - {4 , 716800 , 1 , 32 , 0 }, - {4 , 819200 , 1 , 32 , 0 }, - {4 , 921600 , 1 , 64 , 0 }, - {4 , 1024000, 1 , 32 , 0 }, - {6 , 512 , 3 , 64 , 1 }, - {6 , 1024 , 3 , 32 , 1 }, - {6 , 1536 , 3 , 32 , 1 }, - {6 , 2048 , 3 , 64 , 1 }, - {6 , 2560 , 3 , 32 , 1 }, - {6 , 3072 , 3 , 32 , 1 }, - {6 , 3584 , 3 , 64 , 1 }, - {6 , 4096 , 3 , 32 , 1 }, - {6 , 4608 , 3 , 64 , 1 }, - {6 , 5120 , 3 , 512 , 1 }, - {6 , 5632 , 3 , 32 , 1 }, - {6 , 6144 , 3 , 64 , 1 }, - {6 , 6656 , 3 , 64 , 1 }, - {6 , 7168 , 3 , 64 , 1 }, - {6 , 7680 , 3 , 64 , 1 }, - {6 , 8192 , 3 , 32 , 1 }, - {6 , 8704 , 3 , 64 , 1 }, - {6 , 9216 , 3 , 64 , 1 }, - {6 , 9728 , 3 
, 32 , 1 }, - {6 , 10240 , 3 , 64 , 1 }, - {6 , 20480 , 3 , 64 , 1 }, - {6 , 30720 , 3 , 64 , 1 }, - {6 , 40960 , 3 , 64 , 1 }, - {6 , 51200 , 3 , 64 , 1 }, - {6 , 61440 , 3 , 64 , 1 }, - {6 , 71680 , 3 , 64 , 1 }, - {6 , 81920 , 3 , 512 , 1 }, - {6 , 92160 , 3 , 64 , 1 }, - {6 , 102400 , 3 , 64 , 1 }, - {6 , 204800 , 3 , 4096 , 1 }, - {6 , 307200 , 3 , 1024 , 1 }, - {6 , 409600 , 3 , 16384 , 1 }, - {6 , 512000 , 3 , 1024 , 1 }, - {6 , 614400 , 3 , 8192 , 1 }, - {6 , 716800 , 3 , 4096 , 1 }, - {6 , 819200 , 3 , 32768 , 1 }, - {6 , 921600 , 3 , 4096 , 1 }, - {6 , 1024000, 3 , 8192 , 1 }, - {10 , 512 , 4 , 32 , 1 }, - {10 , 1024 , 4 , 32 , 1 }, - {10 , 1536 , 4 , 64 , 1 }, - {10 , 2048 , 4 , 32 , 1 }, - {10 , 2560 , 4 , 64 , 1 }, - {10 , 3072 , 4 , 64 , 1 }, - {10 , 3584 , 4 , 32 , 1 }, - {10 , 4096 , 4 , 32 , 1 }, - {10 , 4608 , 4 , 32 , 1 }, - {10 , 5120 , 4 , 64 , 1 }, - {10 , 5632 , 4 , 32 , 1 }, - {10 , 6144 , 4 , 64 , 1 }, - {10 , 6656 , 4 , 32 , 1 }, - {10 , 7168 , 4 , 32 , 1 }, - {10 , 7680 , 4 , 32 , 1 }, - {10 , 8192 , 4 , 32 , 1 }, - {10 , 8704 , 4 , 64 , 1 }, - {10 , 9216 , 4 , 32 , 1 }, - {10 , 9728 , 4 , 32 , 1 }, - {10 , 10240 , 4 , 32 , 1 }, - {10 , 20480 , 4 , 64 , 1 }, - {10 , 30720 , 4 , 64 , 1 }, - {10 , 40960 , 4 , 64 , 1 }, - {10 , 51200 , 4 , 64 , 1 }, - {10 , 61440 , 4 , 64 , 1 }, - {10 , 71680 , 4 , 64 , 1 }, - {10 , 81920 , 4 , 64 , 1 }, - {10 , 92160 , 4 , 64 , 1 }, - {10 , 102400 , 4 , 64 , 1 }, - {10 , 204800 , 4 , 512 , 1 }, - {10 , 307200 , 4 , 512 , 1 }, - {10 , 409600 , 4 , 16384 , 1 }, - {10 , 512000 , 4 , 4096 , 1 }, - {10 , 614400 , 4 , 8192 , 1 }, - {10 , 716800 , 4 , 4096 , 1 }, - {10 , 819200 , 4 , 32768 , 1 }, - {10 , 921600 , 4 , 4096 , 1 }, - {10 , 1024000, 4 , 8192 , 1 }, - {10 , 512 , 6 , 256 , 1 }, - {10 , 1024 , 6 , 32 , 1 }, - {10 , 1536 , 6 , 32 , 1 }, - {10 , 2048 , 6 , 32 , 1 }, - {10 , 2560 , 6 , 32 , 1 }, - {10 , 3072 , 6 , 32 , 1 }, - {10 , 3584 , 6 , 256 , 1 }, - {10 , 4096 , 6 , 32 , 1 }, - {10 , 4608 , 6 , 64 , 
1 }, - {10 , 5120 , 6 , 64 , 1 }, - {10 , 5632 , 6 , 64 , 1 }, - {10 , 6144 , 6 , 32 , 1 }, - {10 , 6656 , 6 , 64 , 1 }, - {10 , 7168 , 6 , 32 , 1 }, - {10 , 7680 , 6 , 64 , 1 }, - {10 , 8192 , 6 , 64 , 1 }, - {10 , 8704 , 6 , 32 , 1 }, - {10 , 9216 , 6 , 32 , 1 }, - {10 , 9728 , 6 , 64 , 1 }, - {10 , 10240 , 6 , 64 , 1 }, - {10 , 20480 , 6 , 32 , 1 }, - {10 , 30720 , 6 , 64 , 1 }, - {10 , 40960 , 6 , 64 , 1 }, - {10 , 51200 , 6 , 64 , 1 }, - {10 , 61440 , 6 , 64 , 1 }, - {10 , 71680 , 6 , 64 , 1 }, - {10 , 81920 , 6 , 64 , 1 }, - {10 , 92160 , 6 , 64 , 1 }, - {10 , 102400 , 6 , 64 , 1 }, - {10 , 204800 , 6 , 8192 , 1 }, - {10 , 307200 , 6 , 512 , 1 }, - {10 , 409600 , 6 , 16384 , 1 }, - {10 , 512000 , 6 , 4096 , 1 }, - {10 , 614400 , 6 , 8192 , 1 }, - {10 , 716800 , 6 , 4096 , 1 }, - {10 , 819200 , 6 , 32768 , 1 }, - {10 , 921600 , 6 , 4096 , 1 }, - {10 , 1024000, 6 , 8192 , 1 }, - {15 , 512 , 12 , 64 , 1 }, - {15 , 1024 , 12 , 64 , 1 }, - {15 , 1536 , 12 , 32 , 1 }, - {15 , 2048 , 12 , 32 , 1 }, - {15 , 2560 , 12 , 64 , 1 }, - {15 , 3072 , 12 , 128 , 1 }, - {15 , 3584 , 12 , 32 , 1 }, - {15 , 4096 , 12 , 32 , 1 }, - {15 , 4608 , 12 , 32 , 1 }, - {15 , 5120 , 12 , 64 , 1 }, - {15 , 5632 , 12 , 64 , 1 }, - {15 , 6144 , 12 , 128 , 1 }, - {15 , 6656 , 12 , 32 , 1 }, - {15 , 7168 , 12 , 64 , 1 }, - {15 , 7680 , 12 , 64 , 1 }, - {15 , 8192 , 12 , 32 , 1 }, - {15 , 8704 , 12 , 64 , 1 }, - {15 , 9216 , 12 , 64 , 1 }, - {15 , 9728 , 12 , 32 , 1 }, - {15 , 10240 , 12 , 64 , 1 }, - {15 , 20480 , 12 , 64 , 1 }, - {15 , 30720 , 12 , 64 , 1 }, - {15 , 40960 , 12 , 64 , 1 }, - {15 , 51200 , 12 , 64 , 1 }, - {15 , 61440 , 12 , 64 , 1 }, - {15 , 71680 , 12 , 64 , 1 }, - {15 , 81920 , 12 , 64 , 1 }, - {15 , 92160 , 12 , 64 , 1 }, - {15 , 102400 , 12 , 64 , 1 }, - {15 , 204800 , 12 , 2048 , 1 }, - {15 , 307200 , 12 , 512 , 1 }, - {15 , 409600 , 12 , 16384 , 1 }, - {15 , 512000 , 12 , 4096 , 1 }, - {15 , 614400 , 12 , 8192 , 1 }, - {15 , 716800 , 12 , 4096 , 1 }, - {15 , 819200 , 12 
, 32768 , 1 }, - {15 , 921600 , 12 , 4096 , 1 }, - {15 , 1024000, 12 , 8192 , 1 }, - {20 , 512 , 11 , 64 , 1 }, - {20 , 1024 , 11 , 64 , 1 }, - {20 , 1536 , 11 , 64 , 1 }, - {20 , 2048 , 11 , 64 , 1 }, - {20 , 2560 , 11 , 32 , 1 }, - {20 , 3072 , 11 , 32 , 1 }, - {20 , 3584 , 11 , 64 , 1 }, - {20 , 4096 , 11 , 32 , 1 }, - {20 , 4608 , 11 , 32 , 1 }, - {20 , 5120 , 11 , 32 , 1 }, - {20 , 5632 , 11 , 128 , 1 }, - {20 , 6144 , 11 , 64 , 1 }, - {20 , 6656 , 11 , 32 , 1 }, - {20 , 7168 , 11 , 32 , 1 }, - {20 , 7680 , 11 , 32 , 1 }, - {20 , 8192 , 11 , 8192 , 1 }, - {20 , 8704 , 11 , 32 , 1 }, - {20 , 9216 , 11 , 64 , 1 }, - {20 , 9728 , 11 , 32 , 1 }, - {20 , 10240 , 11 , 32 , 1 }, - {20 , 20480 , 11 , 64 , 1 }, - {20 , 30720 , 11 , 64 , 1 }, - {20 , 40960 , 11 , 64 , 1 }, - {20 , 51200 , 11 , 64 , 1 }, - {20 , 61440 , 11 , 64 , 1 }, - {20 , 71680 , 11 , 64 , 1 }, - {20 , 81920 , 11 , 64 , 1 }, - {20 , 92160 , 11 , 64 , 1 }, - {20 , 102400 , 11 , 64 , 1 }, - {20 , 204800 , 11 , 8192 , 1 }, - {20 , 307200 , 11 , 256 , 1 }, - {20 , 409600 , 11 , 16384 , 1 }, - {20 , 512000 , 11 , 512 , 1 }, - {20 , 614400 , 11 , 8192 , 1 }, - {20 , 716800 , 11 , 4096 , 1 }, - {20 , 819200 , 11 , 32768 , 1 }, - {20 , 921600 , 11 , 921600 , 1 }, - {20 , 1024000, 11 , 1024000, 1 }, - {21 , 512 , 16 , 64 , 1 }, - {21 , 1024 , 16 , 32 , 1 }, - {21 , 1536 , 16 , 32 , 1 }, - {21 , 2048 , 16 , 64 , 1 }, - {21 , 2560 , 16 , 64 , 1 }, - {21 , 3072 , 16 , 32 , 1 }, - {21 , 3584 , 16 , 64 , 1 }, - {21 , 4096 , 16 , 32 , 1 }, - {21 , 4608 , 16 , 64 , 1 }, - {21 , 5120 , 16 , 32 , 1 }, - {21 , 5632 , 16 , 32 , 1 }, - {21 , 6144 , 16 , 32 , 1 }, - {21 , 6656 , 16 , 64 , 1 }, - {21 , 7168 , 16 , 64 , 1 }, - {21 , 7680 , 16 , 128 , 1 }, - {21 , 8192 , 16 , 32 , 1 }, - {21 , 8704 , 16 , 32 , 1 }, - {21 , 9216 , 16 , 64 , 1 }, - {21 , 9728 , 16 , 64 , 1 }, - {21 , 10240 , 16 , 64 , 1 }, - {21 , 20480 , 16 , 64 , 1 }, - {21 , 30720 , 16 , 32 , 1 }, - {21 , 40960 , 16 , 32 , 1 }, - {21 , 51200 , 16 , 64 , 1 
}, - {21 , 61440 , 16 , 64 , 1 }, - {21 , 71680 , 16 , 64 , 1 }, - {21 , 81920 , 16 , 64 , 1 }, - {21 , 92160 , 16 , 64 , 1 }, - {21 , 102400 , 16 , 64 , 1 }, - {21 , 204800 , 16 , 8192 , 1 }, - {21 , 307200 , 16 , 4096 , 1 }, - {21 , 409600 , 16 , 16384 , 1 }, - {21 , 512000 , 16 , 4096 , 1 }, - {21 , 614400 , 16 , 8192 , 1 }, - {21 , 716800 , 16 , 716800 , 1 }, - {21 , 819200 , 16 , 32768 , 1 }, - {21 , 921600 , 16 , 921600 , 1 }, - {21 , 1024000, 16 , 1024000, 1 }, - {28 , 512 , 25 , 64 , 1 }, - {28 , 1024 , 25 , 64 , 1 }, - {28 , 1536 , 25 , 64 , 1 }, - {28 , 2048 , 25 , 32 , 1 }, - {28 , 2560 , 25 , 32 , 1 }, - {28 , 3072 , 25 , 64 , 1 }, - {28 , 3584 , 25 , 64 , 1 }, - {28 , 4096 , 25 , 64 , 1 }, - {28 , 4608 , 25 , 64 , 1 }, - {28 , 5120 , 25 , 64 , 1 }, - {28 , 5632 , 25 , 32 , 1 }, - {28 , 6144 , 25 , 32 , 1 }, - {28 , 6656 , 25 , 32 , 1 }, - {28 , 7168 , 25 , 64 , 1 }, - {28 , 7680 , 25 , 32 , 1 }, - {28 , 8192 , 25 , 64 , 1 }, - {28 , 8704 , 25 , 32 , 1 }, - {28 , 9216 , 25 , 64 , 1 }, - {28 , 9728 , 25 , 32 , 1 }, - {28 , 10240 , 25 , 64 , 1 }, - {28 , 20480 , 25 , 32 , 1 }, - {28 , 30720 , 25 , 32 , 1 }, - {28 , 40960 , 25 , 64 , 1 }, - {28 , 51200 , 25 , 64 , 1 }, - {28 , 61440 , 25 , 64 , 1 }, - {28 , 71680 , 25 , 32 , 1 }, - {28 , 81920 , 25 , 64 , 1 }, - {28 , 92160 , 25 , 64 , 1 }, - {28 , 102400 , 25 , 64 , 1 }, - {28 , 204800 , 25 , 4096 , 1 }, - {28 , 307200 , 25 , 4096 , 1 }, - {28 , 409600 , 25 , 8192 , 1 }, - {28 , 512000 , 25 , 512000 , 1 }, - {28 , 614400 , 25 , 8192 , 1 }, - {28 , 716800 , 25 , 716800 , 1 }, - {28 , 819200 , 25 , 819200 , 1 }, - {28 , 921600 , 25 , 921600 , 1 }, - {28 , 1024000, 25 , 1024000, 1 }, - {35 , 512 , 24 , 64 , 1 }, - {35 , 1024 , 24 , 32 , 1 }, - {35 , 1536 , 24 , 32 , 1 }, - {35 , 2048 , 24 , 32 , 1 }, - {35 , 2560 , 24 , 64 , 1 }, - {35 , 3072 , 24 , 64 , 1 }, - {35 , 3584 , 24 , 64 , 1 }, - {35 , 4096 , 24 , 64 , 1 }, - {35 , 4608 , 24 , 32 , 1 }, - {35 , 5120 , 24 , 64 , 1 }, - {35 , 5632 , 24 , 32 , 1 }, - 
{35 , 6144 , 24 , 64 , 1 }, - {35 , 6656 , 24 , 64 , 1 }, - {35 , 7168 , 24 , 32 , 1 }, - {35 , 7680 , 24 , 32 , 1 }, - {35 , 8192 , 24 , 64 , 1 }, - {35 , 8704 , 24 , 32 , 1 }, - {35 , 9216 , 24 , 64 , 1 }, - {35 , 9728 , 24 , 64 , 1 }, - {35 , 10240 , 24 , 64 , 1 }, - {35 , 20480 , 24 , 64 , 1 }, - {35 , 30720 , 24 , 32 , 1 }, - {35 , 40960 , 24 , 8192 , 1 }, - {35 , 51200 , 24 , 32 , 1 }, - {35 , 61440 , 24 , 64 , 1 }, - {35 , 71680 , 24 , 128 , 1 }, - {35 , 81920 , 24 , 64 , 1 }, - {35 , 92160 , 24 , 64 , 1 }, - {35 , 102400 , 24 , 4096 , 1 }, - {35 , 204800 , 24 , 8192 , 1 }, - {35 , 307200 , 24 , 4096 , 1 }, - {35 , 409600 , 24 , 16384 , 1 }, - {35 , 512000 , 24 , 4096 , 1 }, - {35 , 614400 , 24 , 614400 , 1 }, - {35 , 716800 , 24 , 716800 , 1 }, - {35 , 819200 , 24 , 819200 , 1 }, - {35 , 921600 , 24 , 921600 , 1 }, - {35 , 1024000, 24 , 8192 , 1 }, - {36 , 512 , 33 , 32 , 1 }, - {36 , 1024 , 33 , 64 , 1 }, - {36 , 1536 , 33 , 64 , 1 }, - {36 , 2048 , 33 , 64 , 1 }, - {36 , 2560 , 33 , 64 , 1 }, - {36 , 3072 , 33 , 32 , 1 }, - {36 , 3584 , 33 , 64 , 1 }, - {36 , 4096 , 33 , 64 , 1 }, - {36 , 4608 , 33 , 64 , 1 }, - {36 , 5120 , 33 , 64 , 1 }, - {36 , 5632 , 33 , 64 , 1 }, - {36 , 6144 , 33 , 64 , 1 }, - {36 , 6656 , 33 , 64 , 1 }, - {36 , 7168 , 33 , 64 , 1 }, - {36 , 7680 , 33 , 64 , 1 }, - {36 , 8192 , 33 , 64 , 1 }, - {36 , 8704 , 33 , 64 , 1 }, - {36 , 9216 , 33 , 64 , 1 }, - {36 , 9728 , 33 , 64 , 1 }, - {36 , 10240 , 33 , 64 , 1 }, - {36 , 20480 , 33 , 64 , 1 }, - {36 , 30720 , 33 , 64 , 1 }, - {36 , 40960 , 33 , 64 , 1 }, - {36 , 51200 , 33 , 64 , 1 }, - {36 , 61440 , 33 , 64 , 1 }, - {36 , 71680 , 33 , 2048 , 1 }, - {36 , 81920 , 33 , 16384 , 1 }, - {36 , 92160 , 33 , 2048 , 1 }, - {36 , 102400 , 33 , 1024 , 1 }, - {36 , 204800 , 33 , 4096 , 1 }, - {36 , 307200 , 33 , 307200 , 1 }, - {36 , 409600 , 33 , 409600 , 1 }, - {36 , 512000 , 33 , 512000 , 1 }, - {36 , 614400 , 33 , 614400 , 1 }, - {36 , 716800 , 33 , 716800 , 1 }, - {36 , 819200 , 33 , 
819200 , 1 }, - {36 , 921600 , 33 , 921600 , 1 }, - {36 , 1024000, 33 , 1024000, 1 }, - {45 , 512 , 42 , 64 , 1 }, - {45 , 1024 , 42 , 64 , 1 }, - {45 , 1536 , 42 , 64 , 1 }, - {45 , 2048 , 42 , 32 , 1 }, - {45 , 2560 , 42 , 64 , 1 }, - {45 , 3072 , 42 , 64 , 1 }, - {45 , 3584 , 42 , 64 , 1 }, - {45 , 4096 , 42 , 64 , 1 }, - {45 , 4608 , 42 , 64 , 1 }, - {45 , 5120 , 42 , 64 , 1 }, - {45 , 5632 , 42 , 64 , 1 }, - {45 , 6144 , 42 , 64 , 1 }, - {45 , 6656 , 42 , 64 , 1 }, - {45 , 7168 , 42 , 32 , 1 }, - {45 , 7680 , 42 , 64 , 1 }, - {45 , 8192 , 42 , 64 , 1 }, - {45 , 8704 , 42 , 64 , 1 }, - {45 , 9216 , 42 , 64 , 1 }, - {45 , 9728 , 42 , 64 , 1 }, - {45 , 10240 , 42 , 64 , 1 }, - {45 , 20480 , 42 , 4096 , 1 }, - {45 , 30720 , 42 , 64 , 1 }, - {45 , 40960 , 42 , 8192 , 1 }, - {45 , 51200 , 42 , 256 , 1 }, - {45 , 61440 , 42 , 4096 , 1 }, - {45 , 71680 , 42 , 2048 , 1 }, - {45 , 81920 , 42 , 16384 , 1 }, - {45 , 92160 , 42 , 2048 , 1 }, - {45 , 102400 , 42 , 4096 , 1 }, - {45 , 204800 , 42 , 8192 , 1 }, - {45 , 307200 , 42 , 4096 , 1 }, - {45 , 409600 , 42 , 8192 , 1 }, - {45 , 512000 , 42 , 4096 , 1 }, - {45 , 614400 , 42 , 8192 , 1 }, - {45 , 716800 , 42 , 4096 , 1 }, - {45 , 819200 , 42 , 16384 , 1 }, - {45 , 921600 , 42 , 2048 , 1 }, - {45 , 1024000, 42 , 4096 , 1 }, - {56 , 512 , 43 , 64 , 1 }, - {56 , 1024 , 43 , 32 , 1 }, - {56 , 1536 , 43 , 64 , 1 }, - {56 , 2048 , 43 , 32 , 1 }, - {56 , 2560 , 43 , 64 , 1 }, - {56 , 3072 , 43 , 64 , 1 }, - {56 , 3584 , 43 , 32 , 1 }, - {56 , 4096 , 43 , 64 , 1 }, - {56 , 4608 , 43 , 64 , 1 }, - {56 , 5120 , 43 , 64 , 1 }, - {56 , 5632 , 43 , 64 , 1 }, - {56 , 6144 , 43 , 64 , 1 }, - {56 , 6656 , 43 , 64 , 1 }, - {56 , 7168 , 43 , 64 , 1 }, - {56 , 7680 , 43 , 64 , 1 }, - {56 , 8192 , 43 , 32 , 1 }, - {56 , 8704 , 43 , 64 , 1 }, - {56 , 9216 , 43 , 32 , 1 }, - {56 , 9728 , 43 , 64 , 1 }, - {56 , 10240 , 43 , 64 , 1 }, - {56 , 20480 , 43 , 64 , 1 }, - {56 , 30720 , 43 , 64 , 1 }, - {56 , 40960 , 43 , 8192 , 1 }, - {56 , 51200 , 
43 , 512 , 1 }, - {56 , 61440 , 43 , 4096 , 1 }, - {56 , 71680 , 43 , 2048 , 1 }, - {56 , 81920 , 43 , 8192 , 1 }, - {56 , 92160 , 43 , 2048 , 1 }, - {56 , 102400 , 43 , 4096 , 1 }, - {56 , 204800 , 43 , 4096 , 1 }, - {56 , 307200 , 43 , 2048 , 1 }, - {56 , 409600 , 43 , 8192 , 1 }, - {56 , 512000 , 43 , 4096 , 1 }, - {56 , 614400 , 43 , 4096 , 1 }, - {56 , 716800 , 43 , 4096 , 1 }, - {56 , 819200 , 43 , 16384 , 1 }, - {56 , 921600 , 43 , 2048 , 1 }, - {56 , 1024000, 43 , 8192 , 1 }, - {84 , 512 , 126 , 32 , 1 }, - {84 , 1024 , 126 , 64 , 1 }, - {84 , 1536 , 126 , 64 , 1 }, - {84 , 2048 , 126 , 64 , 1 }, - {84 , 2560 , 126 , 64 , 1 }, - {84 , 3072 , 126 , 64 , 1 }, - {84 , 3584 , 126 , 32 , 1 }, - {84 , 4096 , 126 , 32 , 1 }, - {84 , 4608 , 126 , 32 , 1 }, - {84 , 5120 , 126 , 64 , 1 }, - {84 , 5632 , 126 , 32 , 1 }, - {84 , 6144 , 126 , 64 , 1 }, - {84 , 6656 , 126 , 32 , 1 }, - {84 , 7168 , 126 , 64 , 1 }, - {84 , 7680 , 126 , 64 , 1 }, - {84 , 8192 , 126 , 64 , 1 }, - {84 , 8704 , 126 , 64 , 1 }, - {84 , 9216 , 126 , 32 , 1 }, - {84 , 9728 , 126 , 64 , 1 }, - {84 , 10240 , 126 , 64 , 1 }, - {84 , 20480 , 126 , 32 , 1 }, - {84 , 30720 , 126 , 32 , 1 }, - {84 , 40960 , 126 , 64 , 1 }, - {84 , 51200 , 126 , 64 , 1 }, - {84 , 61440 , 126 , 32 , 1 }, - {84 , 71680 , 126 , 32 , 1 }, - {84 , 81920 , 126 , 8192 , 1 }, - {84 , 92160 , 126 , 2048 , 1 }, - {84 , 102400 , 126 , 4096 , 1 }, - {84 , 204800 , 126 , 8192 , 1 }, - {84 , 307200 , 126 , 64 , 0 }, - {84 , 409600 , 126 , 64 , 0 }, - {84 , 512000 , 126 , 4096 , 0 }, - {84 , 614400 , 126 , 614400 , 0 }, - {84 , 716800 , 126 , 716800 , 0 }, - {84 , 819200 , 126 , 819200 , 0 }, - {84 , 921600 , 126 , 921600 , 0 }, - {84 , 1024000, 126 , 1024000, 0 }, - {120 , 512 , 210 , 64 , 1 }, - {120 , 1024 , 210 , 32 , 1 }, - {120 , 1536 , 210 , 32 , 1 }, - {120 , 2048 , 210 , 32 , 1 }, - {120 , 2560 , 210 , 32 , 1 }, - {120 , 3072 , 210 , 1024 , 1 }, - {120 , 3584 , 210 , 64 , 1 }, - {120 , 4096 , 210 , 1024 , 1 }, - {120 , 4608 , 
210 , 32 , 1 }, - {120 , 5120 , 210 , 1024 , 1 }, - {120 , 5632 , 210 , 64 , 1 }, - {120 , 6144 , 210 , 1024 , 1 }, - {120 , 6656 , 210 , 64 , 1 }, - {120 , 7168 , 210 , 64 , 0 }, - {120 , 7680 , 210 , 64 , 0 }, - {120 , 8192 , 210 , 2048 , 0 }, - {120 , 8704 , 210 , 64 , 0 }, - {120 , 9216 , 210 , 1024 , 0 }, - {120 , 9728 , 210 , 64 , 0 }, - {120 , 10240 , 210 , 2048 , 0 }, - {120 , 20480 , 210 , 4096 , 0 }, - {120 , 30720 , 210 , 64 , 0 }, - {120 , 40960 , 210 , 64 , 0 }, - {120 , 51200 , 210 , 64 , 0 }, - {120 , 61440 , 210 , 2048 , 0 }, - {120 , 71680 , 210 , 64 , 0 }, - {120 , 81920 , 210 , 2048 , 0 }, - {120 , 92160 , 210 , 2048 , 0 }, - {120 , 102400 , 210 , 4096 , 0 }, - {120 , 204800 , 210 , 8192 , 0 }, - {120 , 307200 , 210 , 307200 , 0 }, - {120 , 409600 , 210 , 409600 , 0 }, - {120 , 512000 , 210 , 512000 , 0 }, - {120 , 614400 , 210 , 614400 , 0 }, - {120 , 716800 , 210 , 716800 , 0 }, - {120 , 819200 , 210 , 819200 , 0 }, - {120 , 921600 , 210 , 921600 , 0 }, - {120 , 1024000, 210 , 1024000, 0 }, - {165 , 512 , 330 , 32 , 1 }, - {165 , 1024 , 330 , 1024 , 1 }, - {165 , 1536 , 330 , 32 , 1 }, - {165 , 2048 , 330 , 1024 , 1 }, - {165 , 2560 , 330 , 64 , 1 }, - {165 , 3072 , 330 , 64 , 1 }, - {165 , 3584 , 330 , 512 , 1 }, - {165 , 4096 , 330 , 2048 , 0 }, - {165 , 4608 , 330 , 64 , 0 }, - {165 , 5120 , 330 , 1024 , 0 }, - {165 , 5632 , 330 , 512 , 0 }, - {165 , 6144 , 330 , 2048 , 0 }, - {165 , 6656 , 330 , 64 , 0 }, - {165 , 7168 , 330 , 1024 , 0 }, - {165 , 7680 , 330 , 512 , 0 }, - {165 , 8192 , 330 , 2048 , 0 }, - {165 , 8704 , 330 , 64 , 0 }, - {165 , 9216 , 330 , 9216 , 1 }, - {165 , 9728 , 330 , 9728 , 1 }, - {165 , 10240 , 330 , 512 , 0 }, - {165 , 20480 , 330 , 64 , 0 }, - {165 , 30720 , 330 , 64 , 0 }, - {165 , 40960 , 330 , 4096 , 0 }, - {165 , 51200 , 330 , 2048 , 0 }, - {165 , 61440 , 330 , 64 , 0 }, - {165 , 71680 , 330 , 1024 , 0 }, - {165 , 81920 , 330 , 4096 , 0 }, - {165 , 92160 , 330 , 512 , 0 }, - {165 , 102400 , 330 , 4096 , 0 }, - 
{165 , 204800 , 330 , 8192 , 0 }, - {165 , 307200 , 330 , 4096 , 0 }, - {165 , 409600 , 330 , 409600 , 0 }, - {165 , 512000 , 330 , 512000 , 0 }, - {165 , 614400 , 330 , 614400 , 0 }, - {165 , 716800 , 330 , 716800 , 0 }, - {165 , 819200 , 330 , 819200 , 0 }, - {165 , 921600 , 330 , 921600 , 0 }, - {165 , 1024000, 330 , 1024000, 0 } +std::vector > dgemm_nn_v100 = { + {3, 512, 1, 32, 0}, + {3, 1024, 1, 32, 0}, + {3, 1536, 1, 32, 0}, + {3, 2048, 1, 32, 0}, + {3, 2560, 1, 64, 0}, + {3, 3072, 1, 32, 0}, + {3, 3584, 1, 64, 0}, + {3, 4096, 1, 32, 0}, + {3, 4608, 1, 32, 0}, + {3, 5120, 1, 1024, 0}, + {3, 5632, 1, 64, 0}, + {3, 6144, 1, 64, 0}, + {3, 6656, 1, 128, 0}, + {3, 7168, 1, 64, 0}, + {3, 7680, 1, 64, 0}, + {3, 8192, 1, 64, 0}, + {3, 8704, 1, 64, 0}, + {3, 9216, 1, 32, 0}, + {3, 9728, 1, 256, 0}, + {3, 10240, 1, 32, 0}, + {3, 20480, 1, 64, 0}, + {3, 30720, 1, 32, 0}, + {3, 40960, 1, 64, 0}, + {3, 51200, 1, 32, 0}, + {3, 61440, 1, 32, 0}, + {3, 71680, 1, 32, 0}, + {3, 81920, 1, 64, 0}, + {3, 92160, 1, 32, 0}, + {3, 102400, 1, 64, 0}, + {3, 204800, 1, 64, 0}, + {3, 307200, 1, 64, 0}, + {3, 409600, 1, 32, 0}, + {3, 512000, 1, 64, 0}, + {3, 614400, 1, 32, 0}, + {3, 716800, 1, 32, 0}, + {3, 819200, 1, 32, 0}, + {3, 921600, 1, 32, 0}, + {3, 1024000, 1, 64, 0}, + {4, 512, 1, 64, 0}, + {4, 1024, 1, 64, 0}, + {4, 1536, 1, 512, 0}, + {4, 2048, 1, 64, 0}, + {4, 2560, 1, 32, 0}, + {4, 3072, 1, 32, 0}, + {4, 3584, 1, 32, 0}, + {4, 4096, 1, 64, 1}, + {4, 4608, 1, 64, 0}, + {4, 5120, 1, 64, 0}, + {4, 5632, 1, 64, 0}, + {4, 6144, 1, 32, 0}, + {4, 6656, 1, 32, 0}, + {4, 7168, 1, 32, 0}, + {4, 7680, 1, 32, 0}, + {4, 8192, 1, 64, 0}, + {4, 8704, 1, 32, 0}, + {4, 9216, 1, 32, 0}, + {4, 9728, 1, 32, 0}, + {4, 10240, 1, 32, 0}, + {4, 20480, 1, 64, 1}, + {4, 30720, 1, 32, 0}, + {4, 40960, 1, 64, 0}, + {4, 51200, 1, 64, 0}, + {4, 61440, 1, 32, 0}, + {4, 71680, 1, 32, 0}, + {4, 81920, 1, 32, 0}, + {4, 92160, 1, 64, 0}, + {4, 102400, 1, 64, 0}, + {4, 204800, 1, 64, 0}, + {4, 307200, 1, 32, 
0}, + {4, 409600, 1, 64, 0}, + {4, 512000, 1, 32, 0}, + {4, 614400, 1, 32, 0}, + {4, 716800, 1, 32, 0}, + {4, 819200, 1, 32, 0}, + {4, 921600, 1, 64, 0}, + {4, 1024000, 1, 32, 0}, + {6, 512, 3, 64, 1}, + {6, 1024, 3, 32, 1}, + {6, 1536, 3, 32, 1}, + {6, 2048, 3, 64, 1}, + {6, 2560, 3, 32, 1}, + {6, 3072, 3, 32, 1}, + {6, 3584, 3, 64, 1}, + {6, 4096, 3, 32, 1}, + {6, 4608, 3, 64, 1}, + {6, 5120, 3, 512, 1}, + {6, 5632, 3, 32, 1}, + {6, 6144, 3, 64, 1}, + {6, 6656, 3, 64, 1}, + {6, 7168, 3, 64, 1}, + {6, 7680, 3, 64, 1}, + {6, 8192, 3, 32, 1}, + {6, 8704, 3, 64, 1}, + {6, 9216, 3, 64, 1}, + {6, 9728, 3, 32, 1}, + {6, 10240, 3, 64, 1}, + {6, 20480, 3, 64, 1}, + {6, 30720, 3, 64, 1}, + {6, 40960, 3, 64, 1}, + {6, 51200, 3, 64, 1}, + {6, 61440, 3, 64, 1}, + {6, 71680, 3, 64, 1}, + {6, 81920, 3, 512, 1}, + {6, 92160, 3, 64, 1}, + {6, 102400, 3, 64, 1}, + {6, 204800, 3, 4096, 1}, + {6, 307200, 3, 1024, 1}, + {6, 409600, 3, 16384, 1}, + {6, 512000, 3, 1024, 1}, + {6, 614400, 3, 8192, 1}, + {6, 716800, 3, 4096, 1}, + {6, 819200, 3, 32768, 1}, + {6, 921600, 3, 4096, 1}, + {6, 1024000, 3, 8192, 1}, + {10, 512, 4, 32, 1}, + {10, 1024, 4, 32, 1}, + {10, 1536, 4, 64, 1}, + {10, 2048, 4, 32, 1}, + {10, 2560, 4, 64, 1}, + {10, 3072, 4, 64, 1}, + {10, 3584, 4, 32, 1}, + {10, 4096, 4, 32, 1}, + {10, 4608, 4, 32, 1}, + {10, 5120, 4, 64, 1}, + {10, 5632, 4, 32, 1}, + {10, 6144, 4, 64, 1}, + {10, 6656, 4, 32, 1}, + {10, 7168, 4, 32, 1}, + {10, 7680, 4, 32, 1}, + {10, 8192, 4, 32, 1}, + {10, 8704, 4, 64, 1}, + {10, 9216, 4, 32, 1}, + {10, 9728, 4, 32, 1}, + {10, 10240, 4, 32, 1}, + {10, 20480, 4, 64, 1}, + {10, 30720, 4, 64, 1}, + {10, 40960, 4, 64, 1}, + {10, 51200, 4, 64, 1}, + {10, 61440, 4, 64, 1}, + {10, 71680, 4, 64, 1}, + {10, 81920, 4, 64, 1}, + {10, 92160, 4, 64, 1}, + {10, 102400, 4, 64, 1}, + {10, 204800, 4, 512, 1}, + {10, 307200, 4, 512, 1}, + {10, 409600, 4, 16384, 1}, + {10, 512000, 4, 4096, 1}, + {10, 614400, 4, 8192, 1}, + {10, 716800, 4, 4096, 1}, + {10, 819200, 4, 
32768, 1}, + {10, 921600, 4, 4096, 1}, + {10, 1024000, 4, 8192, 1}, + {10, 512, 6, 256, 1}, + {10, 1024, 6, 32, 1}, + {10, 1536, 6, 32, 1}, + {10, 2048, 6, 32, 1}, + {10, 2560, 6, 32, 1}, + {10, 3072, 6, 32, 1}, + {10, 3584, 6, 256, 1}, + {10, 4096, 6, 32, 1}, + {10, 4608, 6, 64, 1}, + {10, 5120, 6, 64, 1}, + {10, 5632, 6, 64, 1}, + {10, 6144, 6, 32, 1}, + {10, 6656, 6, 64, 1}, + {10, 7168, 6, 32, 1}, + {10, 7680, 6, 64, 1}, + {10, 8192, 6, 64, 1}, + {10, 8704, 6, 32, 1}, + {10, 9216, 6, 32, 1}, + {10, 9728, 6, 64, 1}, + {10, 10240, 6, 64, 1}, + {10, 20480, 6, 32, 1}, + {10, 30720, 6, 64, 1}, + {10, 40960, 6, 64, 1}, + {10, 51200, 6, 64, 1}, + {10, 61440, 6, 64, 1}, + {10, 71680, 6, 64, 1}, + {10, 81920, 6, 64, 1}, + {10, 92160, 6, 64, 1}, + {10, 102400, 6, 64, 1}, + {10, 204800, 6, 8192, 1}, + {10, 307200, 6, 512, 1}, + {10, 409600, 6, 16384, 1}, + {10, 512000, 6, 4096, 1}, + {10, 614400, 6, 8192, 1}, + {10, 716800, 6, 4096, 1}, + {10, 819200, 6, 32768, 1}, + {10, 921600, 6, 4096, 1}, + {10, 1024000, 6, 8192, 1}, + {15, 512, 12, 64, 1}, + {15, 1024, 12, 64, 1}, + {15, 1536, 12, 32, 1}, + {15, 2048, 12, 32, 1}, + {15, 2560, 12, 64, 1}, + {15, 3072, 12, 128, 1}, + {15, 3584, 12, 32, 1}, + {15, 4096, 12, 32, 1}, + {15, 4608, 12, 32, 1}, + {15, 5120, 12, 64, 1}, + {15, 5632, 12, 64, 1}, + {15, 6144, 12, 128, 1}, + {15, 6656, 12, 32, 1}, + {15, 7168, 12, 64, 1}, + {15, 7680, 12, 64, 1}, + {15, 8192, 12, 32, 1}, + {15, 8704, 12, 64, 1}, + {15, 9216, 12, 64, 1}, + {15, 9728, 12, 32, 1}, + {15, 10240, 12, 64, 1}, + {15, 20480, 12, 64, 1}, + {15, 30720, 12, 64, 1}, + {15, 40960, 12, 64, 1}, + {15, 51200, 12, 64, 1}, + {15, 61440, 12, 64, 1}, + {15, 71680, 12, 64, 1}, + {15, 81920, 12, 64, 1}, + {15, 92160, 12, 64, 1}, + {15, 102400, 12, 64, 1}, + {15, 204800, 12, 2048, 1}, + {15, 307200, 12, 512, 1}, + {15, 409600, 12, 16384, 1}, + {15, 512000, 12, 4096, 1}, + {15, 614400, 12, 8192, 1}, + {15, 716800, 12, 4096, 1}, + {15, 819200, 12, 32768, 1}, + {15, 921600, 12, 4096, 1}, 
+ {15, 1024000, 12, 8192, 1}, + {20, 512, 11, 64, 1}, + {20, 1024, 11, 64, 1}, + {20, 1536, 11, 64, 1}, + {20, 2048, 11, 64, 1}, + {20, 2560, 11, 32, 1}, + {20, 3072, 11, 32, 1}, + {20, 3584, 11, 64, 1}, + {20, 4096, 11, 32, 1}, + {20, 4608, 11, 32, 1}, + {20, 5120, 11, 32, 1}, + {20, 5632, 11, 128, 1}, + {20, 6144, 11, 64, 1}, + {20, 6656, 11, 32, 1}, + {20, 7168, 11, 32, 1}, + {20, 7680, 11, 32, 1}, + {20, 8192, 11, 8192, 1}, + {20, 8704, 11, 32, 1}, + {20, 9216, 11, 64, 1}, + {20, 9728, 11, 32, 1}, + {20, 10240, 11, 32, 1}, + {20, 20480, 11, 64, 1}, + {20, 30720, 11, 64, 1}, + {20, 40960, 11, 64, 1}, + {20, 51200, 11, 64, 1}, + {20, 61440, 11, 64, 1}, + {20, 71680, 11, 64, 1}, + {20, 81920, 11, 64, 1}, + {20, 92160, 11, 64, 1}, + {20, 102400, 11, 64, 1}, + {20, 204800, 11, 8192, 1}, + {20, 307200, 11, 256, 1}, + {20, 409600, 11, 16384, 1}, + {20, 512000, 11, 512, 1}, + {20, 614400, 11, 8192, 1}, + {20, 716800, 11, 4096, 1}, + {20, 819200, 11, 32768, 1}, + {20, 921600, 11, 921600, 1}, + {20, 1024000, 11, 1024000, 1}, + {21, 512, 16, 64, 1}, + {21, 1024, 16, 32, 1}, + {21, 1536, 16, 32, 1}, + {21, 2048, 16, 64, 1}, + {21, 2560, 16, 64, 1}, + {21, 3072, 16, 32, 1}, + {21, 3584, 16, 64, 1}, + {21, 4096, 16, 32, 1}, + {21, 4608, 16, 64, 1}, + {21, 5120, 16, 32, 1}, + {21, 5632, 16, 32, 1}, + {21, 6144, 16, 32, 1}, + {21, 6656, 16, 64, 1}, + {21, 7168, 16, 64, 1}, + {21, 7680, 16, 128, 1}, + {21, 8192, 16, 32, 1}, + {21, 8704, 16, 32, 1}, + {21, 9216, 16, 64, 1}, + {21, 9728, 16, 64, 1}, + {21, 10240, 16, 64, 1}, + {21, 20480, 16, 64, 1}, + {21, 30720, 16, 32, 1}, + {21, 40960, 16, 32, 1}, + {21, 51200, 16, 64, 1}, + {21, 61440, 16, 64, 1}, + {21, 71680, 16, 64, 1}, + {21, 81920, 16, 64, 1}, + {21, 92160, 16, 64, 1}, + {21, 102400, 16, 64, 1}, + {21, 204800, 16, 8192, 1}, + {21, 307200, 16, 4096, 1}, + {21, 409600, 16, 16384, 1}, + {21, 512000, 16, 4096, 1}, + {21, 614400, 16, 8192, 1}, + {21, 716800, 16, 716800, 1}, + {21, 819200, 16, 32768, 1}, + {21, 921600, 16, 
921600, 1}, + {21, 1024000, 16, 1024000, 1}, + {28, 512, 25, 64, 1}, + {28, 1024, 25, 64, 1}, + {28, 1536, 25, 64, 1}, + {28, 2048, 25, 32, 1}, + {28, 2560, 25, 32, 1}, + {28, 3072, 25, 64, 1}, + {28, 3584, 25, 64, 1}, + {28, 4096, 25, 64, 1}, + {28, 4608, 25, 64, 1}, + {28, 5120, 25, 64, 1}, + {28, 5632, 25, 32, 1}, + {28, 6144, 25, 32, 1}, + {28, 6656, 25, 32, 1}, + {28, 7168, 25, 64, 1}, + {28, 7680, 25, 32, 1}, + {28, 8192, 25, 64, 1}, + {28, 8704, 25, 32, 1}, + {28, 9216, 25, 64, 1}, + {28, 9728, 25, 32, 1}, + {28, 10240, 25, 64, 1}, + {28, 20480, 25, 32, 1}, + {28, 30720, 25, 32, 1}, + {28, 40960, 25, 64, 1}, + {28, 51200, 25, 64, 1}, + {28, 61440, 25, 64, 1}, + {28, 71680, 25, 32, 1}, + {28, 81920, 25, 64, 1}, + {28, 92160, 25, 64, 1}, + {28, 102400, 25, 64, 1}, + {28, 204800, 25, 4096, 1}, + {28, 307200, 25, 4096, 1}, + {28, 409600, 25, 8192, 1}, + {28, 512000, 25, 512000, 1}, + {28, 614400, 25, 8192, 1}, + {28, 716800, 25, 716800, 1}, + {28, 819200, 25, 819200, 1}, + {28, 921600, 25, 921600, 1}, + {28, 1024000, 25, 1024000, 1}, + {35, 512, 24, 64, 1}, + {35, 1024, 24, 32, 1}, + {35, 1536, 24, 32, 1}, + {35, 2048, 24, 32, 1}, + {35, 2560, 24, 64, 1}, + {35, 3072, 24, 64, 1}, + {35, 3584, 24, 64, 1}, + {35, 4096, 24, 64, 1}, + {35, 4608, 24, 32, 1}, + {35, 5120, 24, 64, 1}, + {35, 5632, 24, 32, 1}, + {35, 6144, 24, 64, 1}, + {35, 6656, 24, 64, 1}, + {35, 7168, 24, 32, 1}, + {35, 7680, 24, 32, 1}, + {35, 8192, 24, 64, 1}, + {35, 8704, 24, 32, 1}, + {35, 9216, 24, 64, 1}, + {35, 9728, 24, 64, 1}, + {35, 10240, 24, 64, 1}, + {35, 20480, 24, 64, 1}, + {35, 30720, 24, 32, 1}, + {35, 40960, 24, 8192, 1}, + {35, 51200, 24, 32, 1}, + {35, 61440, 24, 64, 1}, + {35, 71680, 24, 128, 1}, + {35, 81920, 24, 64, 1}, + {35, 92160, 24, 64, 1}, + {35, 102400, 24, 4096, 1}, + {35, 204800, 24, 8192, 1}, + {35, 307200, 24, 4096, 1}, + {35, 409600, 24, 16384, 1}, + {35, 512000, 24, 4096, 1}, + {35, 614400, 24, 614400, 1}, + {35, 716800, 24, 716800, 1}, + {35, 819200, 24, 819200, 
1}, + {35, 921600, 24, 921600, 1}, + {35, 1024000, 24, 8192, 1}, + {36, 512, 33, 32, 1}, + {36, 1024, 33, 64, 1}, + {36, 1536, 33, 64, 1}, + {36, 2048, 33, 64, 1}, + {36, 2560, 33, 64, 1}, + {36, 3072, 33, 32, 1}, + {36, 3584, 33, 64, 1}, + {36, 4096, 33, 64, 1}, + {36, 4608, 33, 64, 1}, + {36, 5120, 33, 64, 1}, + {36, 5632, 33, 64, 1}, + {36, 6144, 33, 64, 1}, + {36, 6656, 33, 64, 1}, + {36, 7168, 33, 64, 1}, + {36, 7680, 33, 64, 1}, + {36, 8192, 33, 64, 1}, + {36, 8704, 33, 64, 1}, + {36, 9216, 33, 64, 1}, + {36, 9728, 33, 64, 1}, + {36, 10240, 33, 64, 1}, + {36, 20480, 33, 64, 1}, + {36, 30720, 33, 64, 1}, + {36, 40960, 33, 64, 1}, + {36, 51200, 33, 64, 1}, + {36, 61440, 33, 64, 1}, + {36, 71680, 33, 2048, 1}, + {36, 81920, 33, 16384, 1}, + {36, 92160, 33, 2048, 1}, + {36, 102400, 33, 1024, 1}, + {36, 204800, 33, 4096, 1}, + {36, 307200, 33, 307200, 1}, + {36, 409600, 33, 409600, 1}, + {36, 512000, 33, 512000, 1}, + {36, 614400, 33, 614400, 1}, + {36, 716800, 33, 716800, 1}, + {36, 819200, 33, 819200, 1}, + {36, 921600, 33, 921600, 1}, + {36, 1024000, 33, 1024000, 1}, + {45, 512, 42, 64, 1}, + {45, 1024, 42, 64, 1}, + {45, 1536, 42, 64, 1}, + {45, 2048, 42, 32, 1}, + {45, 2560, 42, 64, 1}, + {45, 3072, 42, 64, 1}, + {45, 3584, 42, 64, 1}, + {45, 4096, 42, 64, 1}, + {45, 4608, 42, 64, 1}, + {45, 5120, 42, 64, 1}, + {45, 5632, 42, 64, 1}, + {45, 6144, 42, 64, 1}, + {45, 6656, 42, 64, 1}, + {45, 7168, 42, 32, 1}, + {45, 7680, 42, 64, 1}, + {45, 8192, 42, 64, 1}, + {45, 8704, 42, 64, 1}, + {45, 9216, 42, 64, 1}, + {45, 9728, 42, 64, 1}, + {45, 10240, 42, 64, 1}, + {45, 20480, 42, 4096, 1}, + {45, 30720, 42, 64, 1}, + {45, 40960, 42, 8192, 1}, + {45, 51200, 42, 256, 1}, + {45, 61440, 42, 4096, 1}, + {45, 71680, 42, 2048, 1}, + {45, 81920, 42, 16384, 1}, + {45, 92160, 42, 2048, 1}, + {45, 102400, 42, 4096, 1}, + {45, 204800, 42, 8192, 1}, + {45, 307200, 42, 4096, 1}, + {45, 409600, 42, 8192, 1}, + {45, 512000, 42, 4096, 1}, + {45, 614400, 42, 8192, 1}, + {45, 716800, 
42, 4096, 1}, + {45, 819200, 42, 16384, 1}, + {45, 921600, 42, 2048, 1}, + {45, 1024000, 42, 4096, 1}, + {56, 512, 43, 64, 1}, + {56, 1024, 43, 32, 1}, + {56, 1536, 43, 64, 1}, + {56, 2048, 43, 32, 1}, + {56, 2560, 43, 64, 1}, + {56, 3072, 43, 64, 1}, + {56, 3584, 43, 32, 1}, + {56, 4096, 43, 64, 1}, + {56, 4608, 43, 64, 1}, + {56, 5120, 43, 64, 1}, + {56, 5632, 43, 64, 1}, + {56, 6144, 43, 64, 1}, + {56, 6656, 43, 64, 1}, + {56, 7168, 43, 64, 1}, + {56, 7680, 43, 64, 1}, + {56, 8192, 43, 32, 1}, + {56, 8704, 43, 64, 1}, + {56, 9216, 43, 32, 1}, + {56, 9728, 43, 64, 1}, + {56, 10240, 43, 64, 1}, + {56, 20480, 43, 64, 1}, + {56, 30720, 43, 64, 1}, + {56, 40960, 43, 8192, 1}, + {56, 51200, 43, 512, 1}, + {56, 61440, 43, 4096, 1}, + {56, 71680, 43, 2048, 1}, + {56, 81920, 43, 8192, 1}, + {56, 92160, 43, 2048, 1}, + {56, 102400, 43, 4096, 1}, + {56, 204800, 43, 4096, 1}, + {56, 307200, 43, 2048, 1}, + {56, 409600, 43, 8192, 1}, + {56, 512000, 43, 4096, 1}, + {56, 614400, 43, 4096, 1}, + {56, 716800, 43, 4096, 1}, + {56, 819200, 43, 16384, 1}, + {56, 921600, 43, 2048, 1}, + {56, 1024000, 43, 8192, 1}, + {84, 512, 126, 32, 1}, + {84, 1024, 126, 64, 1}, + {84, 1536, 126, 64, 1}, + {84, 2048, 126, 64, 1}, + {84, 2560, 126, 64, 1}, + {84, 3072, 126, 64, 1}, + {84, 3584, 126, 32, 1}, + {84, 4096, 126, 32, 1}, + {84, 4608, 126, 32, 1}, + {84, 5120, 126, 64, 1}, + {84, 5632, 126, 32, 1}, + {84, 6144, 126, 64, 1}, + {84, 6656, 126, 32, 1}, + {84, 7168, 126, 64, 1}, + {84, 7680, 126, 64, 1}, + {84, 8192, 126, 64, 1}, + {84, 8704, 126, 64, 1}, + {84, 9216, 126, 32, 1}, + {84, 9728, 126, 64, 1}, + {84, 10240, 126, 64, 1}, + {84, 20480, 126, 32, 1}, + {84, 30720, 126, 32, 1}, + {84, 40960, 126, 64, 1}, + {84, 51200, 126, 64, 1}, + {84, 61440, 126, 32, 1}, + {84, 71680, 126, 32, 1}, + {84, 81920, 126, 8192, 1}, + {84, 92160, 126, 2048, 1}, + {84, 102400, 126, 4096, 1}, + {84, 204800, 126, 8192, 1}, + {84, 307200, 126, 64, 0}, + {84, 409600, 126, 64, 0}, + {84, 512000, 126, 4096, 0}, 
+ {84, 614400, 126, 614400, 0}, + {84, 716800, 126, 716800, 0}, + {84, 819200, 126, 819200, 0}, + {84, 921600, 126, 921600, 0}, + {84, 1024000, 126, 1024000, 0}, + {120, 512, 210, 64, 1}, + {120, 1024, 210, 32, 1}, + {120, 1536, 210, 32, 1}, + {120, 2048, 210, 32, 1}, + {120, 2560, 210, 32, 1}, + {120, 3072, 210, 1024, 1}, + {120, 3584, 210, 64, 1}, + {120, 4096, 210, 1024, 1}, + {120, 4608, 210, 32, 1}, + {120, 5120, 210, 1024, 1}, + {120, 5632, 210, 64, 1}, + {120, 6144, 210, 1024, 1}, + {120, 6656, 210, 64, 1}, + {120, 7168, 210, 64, 0}, + {120, 7680, 210, 64, 0}, + {120, 8192, 210, 2048, 0}, + {120, 8704, 210, 64, 0}, + {120, 9216, 210, 1024, 0}, + {120, 9728, 210, 64, 0}, + {120, 10240, 210, 2048, 0}, + {120, 20480, 210, 4096, 0}, + {120, 30720, 210, 64, 0}, + {120, 40960, 210, 64, 0}, + {120, 51200, 210, 64, 0}, + {120, 61440, 210, 2048, 0}, + {120, 71680, 210, 64, 0}, + {120, 81920, 210, 2048, 0}, + {120, 92160, 210, 2048, 0}, + {120, 102400, 210, 4096, 0}, + {120, 204800, 210, 8192, 0}, + {120, 307200, 210, 307200, 0}, + {120, 409600, 210, 409600, 0}, + {120, 512000, 210, 512000, 0}, + {120, 614400, 210, 614400, 0}, + {120, 716800, 210, 716800, 0}, + {120, 819200, 210, 819200, 0}, + {120, 921600, 210, 921600, 0}, + {120, 1024000, 210, 1024000, 0}, + {165, 512, 330, 32, 1}, + {165, 1024, 330, 1024, 1}, + {165, 1536, 330, 32, 1}, + {165, 2048, 330, 1024, 1}, + {165, 2560, 330, 64, 1}, + {165, 3072, 330, 64, 1}, + {165, 3584, 330, 512, 1}, + {165, 4096, 330, 2048, 0}, + {165, 4608, 330, 64, 0}, + {165, 5120, 330, 1024, 0}, + {165, 5632, 330, 512, 0}, + {165, 6144, 330, 2048, 0}, + {165, 6656, 330, 64, 0}, + {165, 7168, 330, 1024, 0}, + {165, 7680, 330, 512, 0}, + {165, 8192, 330, 2048, 0}, + {165, 8704, 330, 64, 0}, + {165, 9216, 330, 9216, 1}, + {165, 9728, 330, 9728, 1}, + {165, 10240, 330, 512, 0}, + {165, 20480, 330, 64, 0}, + {165, 30720, 330, 64, 0}, + {165, 40960, 330, 4096, 0}, + {165, 51200, 330, 2048, 0}, + {165, 61440, 330, 64, 0}, + {165, 71680, 
330, 1024, 0}, + {165, 81920, 330, 4096, 0}, + {165, 92160, 330, 512, 0}, + {165, 102400, 330, 4096, 0}, + {165, 204800, 330, 8192, 0}, + {165, 307200, 330, 4096, 0}, + {165, 409600, 330, 409600, 0}, + {165, 512000, 330, 512000, 0}, + {165, 614400, 330, 614400, 0}, + {165, 716800, 330, 716800, 0}, + {165, 819200, 330, 819200, 0}, + {165, 921600, 330, 921600, 0}, + {165, 1024000, 330, 1024000, 0} }; //////////////////////////////////////////////////////////////////////////////// -std::vector< std::array > dgemm_tn_v100 = -{ - {1 , 512 , 3 , 32 , 0 }, - {1 , 1024 , 3 , 256 , 0 }, - {1 , 1536 , 3 , 64 , 0 }, - {1 , 2048 , 3 , 1024 , 0 }, - {1 , 2560 , 3 , 64 , 0 }, - {1 , 3072 , 3 , 64 , 0 }, - {1 , 3584 , 3 , 32 , 0 }, - {1 , 4096 , 3 , 64 , 0 }, - {1 , 4608 , 3 , 64 , 0 }, - {1 , 5120 , 3 , 64 , 0 }, - {1 , 5632 , 3 , 32 , 0 }, - {1 , 6144 , 3 , 64 , 0 }, - {1 , 6656 , 3 , 32 , 0 }, - {1 , 7168 , 3 , 1024 , 0 }, - {1 , 7680 , 3 , 32 , 1 }, - {1 , 8192 , 3 , 32 , 0 }, - {1 , 8704 , 3 , 32 , 0 }, - {1 , 9216 , 3 , 32 , 0 }, - {1 , 9728 , 3 , 32 , 0 }, - {1 , 10240 , 3 , 64 , 0 }, - {1 , 20480 , 3 , 32 , 0 }, - {1 , 30720 , 3 , 32 , 0 }, - {1 , 40960 , 3 , 32 , 0 }, - {1 , 51200 , 3 , 32 , 0 }, - {1 , 61440 , 3 , 64 , 1 }, - {1 , 71680 , 3 , 64 , 1 }, - {1 , 81920 , 3 , 2048 , 1 }, - {1 , 92160 , 3 , 64 , 1 }, - {1 , 102400 , 3 , 64 , 1 }, - {1 , 204800 , 3 , 2048 , 1 }, - {1 , 307200 , 3 , 2048 , 1 }, - {1 , 409600 , 3 , 2048 , 1 }, - {1 , 512000 , 3 , 2048 , 1 }, - {1 , 614400 , 3 , 4096 , 1 }, - {1 , 716800 , 3 , 4096 , 1 }, - {1 , 819200 , 3 , 32768 , 1 }, - {1 , 921600 , 3 , 4096 , 1 }, - {1 , 1024000, 3 , 8192 , 1 }, - {1 , 512 , 4 , 32 , 0 }, - {1 , 1024 , 4 , 64 , 0 }, - {1 , 1536 , 4 , 64 , 0 }, - {1 , 2048 , 4 , 32 , 0 }, - {1 , 2560 , 4 , 64 , 0 }, - {1 , 3072 , 4 , 64 , 0 }, - {1 , 3584 , 4 , 64 , 0 }, - {1 , 4096 , 4 , 32 , 0 }, - {1 , 4608 , 4 , 32 , 0 }, - {1 , 5120 , 4 , 32 , 0 }, - {1 , 5632 , 4 , 64 , 0 }, - {1 , 6144 , 4 , 64 , 0 }, - {1 , 6656 , 4 , 
32 , 0 }, - {1 , 7168 , 4 , 512 , 0 }, - {1 , 7680 , 4 , 64 , 0 }, - {1 , 8192 , 4 , 32 , 0 }, - {1 , 8704 , 4 , 64 , 1 }, - {1 , 9216 , 4 , 64 , 0 }, - {1 , 9728 , 4 , 64 , 0 }, - {1 , 10240 , 4 , 64 , 0 }, - {1 , 20480 , 4 , 32 , 0 }, - {1 , 30720 , 4 , 64 , 0 }, - {1 , 40960 , 4 , 64 , 0 }, - {1 , 51200 , 4 , 32 , 0 }, - {1 , 61440 , 4 , 64 , 1 }, - {1 , 71680 , 4 , 32 , 0 }, - {1 , 81920 , 4 , 64 , 1 }, - {1 , 92160 , 4 , 512 , 1 }, - {1 , 102400 , 4 , 32 , 0 }, - {1 , 204800 , 4 , 64 , 0 }, - {1 , 307200 , 4 , 2048 , 1 }, - {1 , 409600 , 4 , 16384 , 1 }, - {1 , 512000 , 4 , 2048 , 1 }, - {1 , 614400 , 4 , 8192 , 1 }, - {1 , 716800 , 4 , 4096 , 1 }, - {1 , 819200 , 4 , 32768 , 1 }, - {1 , 921600 , 4 , 4096 , 1 }, - {1 , 1024000, 4 , 4096 , 1 }, - {3 , 512 , 6 , 256 , 1 }, - {3 , 1024 , 6 , 64 , 1 }, - {3 , 1536 , 6 , 32 , 1 }, - {3 , 2048 , 6 , 32 , 1 }, - {3 , 2560 , 6 , 32 , 1 }, - {3 , 3072 , 6 , 64 , 1 }, - {3 , 3584 , 6 , 128 , 1 }, - {3 , 4096 , 6 , 64 , 1 }, - {3 , 4608 , 6 , 32 , 1 }, - {3 , 5120 , 6 , 64 , 1 }, - {3 , 5632 , 6 , 32 , 1 }, - {3 , 6144 , 6 , 32 , 1 }, - {3 , 6656 , 6 , 256 , 1 }, - {3 , 7168 , 6 , 64 , 1 }, - {3 , 7680 , 6 , 256 , 1 }, - {3 , 8192 , 6 , 32 , 1 }, - {3 , 8704 , 6 , 256 , 1 }, - {3 , 9216 , 6 , 64 , 1 }, - {3 , 9728 , 6 , 32 , 1 }, - {3 , 10240 , 6 , 32 , 1 }, - {3 , 20480 , 6 , 64 , 1 }, - {3 , 30720 , 6 , 64 , 1 }, - {3 , 40960 , 6 , 64 , 1 }, - {3 , 51200 , 6 , 64 , 1 }, - {3 , 61440 , 6 , 64 , 1 }, - {3 , 71680 , 6 , 1024 , 1 }, - {3 , 81920 , 6 , 64 , 1 }, - {3 , 92160 , 6 , 64 , 1 }, - {3 , 102400 , 6 , 64 , 1 }, - {3 , 204800 , 6 , 128 , 1 }, - {3 , 307200 , 6 , 4096 , 1 }, - {3 , 409600 , 6 , 16384 , 1 }, - {3 , 512000 , 6 , 4096 , 1 }, - {3 , 614400 , 6 , 8192 , 1 }, - {3 , 716800 , 6 , 4096 , 1 }, - {3 , 819200 , 6 , 32768 , 1 }, - {3 , 921600 , 6 , 4096 , 1 }, - {3 , 1024000, 6 , 8192 , 1 }, - {4 , 512 , 10 , 32 , 1 }, - {4 , 1024 , 10 , 32 , 1 }, - {4 , 1536 , 10 , 64 , 1 }, - {4 , 2048 , 10 , 64 , 1 }, - {4 , 
2560 , 10 , 64 , 1 }, - {4 , 3072 , 10 , 64 , 1 }, - {4 , 3584 , 10 , 32 , 1 }, - {4 , 4096 , 10 , 32 , 1 }, - {4 , 4608 , 10 , 32 , 1 }, - {4 , 5120 , 10 , 32 , 1 }, - {4 , 5632 , 10 , 64 , 1 }, - {4 , 6144 , 10 , 32 , 1 }, - {4 , 6656 , 10 , 32 , 1 }, - {4 , 7168 , 10 , 64 , 1 }, - {4 , 7680 , 10 , 64 , 1 }, - {4 , 8192 , 10 , 32 , 1 }, - {4 , 8704 , 10 , 32 , 1 }, - {4 , 9216 , 10 , 32 , 1 }, - {4 , 9728 , 10 , 32 , 1 }, - {4 , 10240 , 10 , 64 , 1 }, - {4 , 20480 , 10 , 64 , 1 }, - {4 , 30720 , 10 , 32 , 1 }, - {4 , 40960 , 10 , 256 , 1 }, - {4 , 51200 , 10 , 64 , 1 }, - {4 , 61440 , 10 , 64 , 1 }, - {4 , 71680 , 10 , 64 , 1 }, - {4 , 81920 , 10 , 64 , 1 }, - {4 , 92160 , 10 , 64 , 1 }, - {4 , 102400 , 10 , 64 , 1 }, - {4 , 204800 , 10 , 8192 , 1 }, - {4 , 307200 , 10 , 4096 , 1 }, - {4 , 409600 , 10 , 16384 , 1 }, - {4 , 512000 , 10 , 4096 , 1 }, - {4 , 614400 , 10 , 4096 , 1 }, - {4 , 716800 , 10 , 4096 , 1 }, - {4 , 819200 , 10 , 16384 , 1 }, - {4 , 921600 , 10 , 4096 , 1 }, - {4 , 1024000, 10 , 8192 , 1 }, - {6 , 512 , 10 , 32 , 1 }, - {6 , 1024 , 10 , 32 , 1 }, - {6 , 1536 , 10 , 64 , 1 }, - {6 , 2048 , 10 , 64 , 1 }, - {6 , 2560 , 10 , 32 , 1 }, - {6 , 3072 , 10 , 32 , 1 }, - {6 , 3584 , 10 , 32 , 1 }, - {6 , 4096 , 10 , 32 , 1 }, - {6 , 4608 , 10 , 256 , 1 }, - {6 , 5120 , 10 , 32 , 1 }, - {6 , 5632 , 10 , 32 , 1 }, - {6 , 6144 , 10 , 32 , 1 }, - {6 , 6656 , 10 , 32 , 1 }, - {6 , 7168 , 10 , 64 , 1 }, - {6 , 7680 , 10 , 64 , 1 }, - {6 , 8192 , 10 , 32 , 1 }, - {6 , 8704 , 10 , 32 , 1 }, - {6 , 9216 , 10 , 64 , 1 }, - {6 , 9728 , 10 , 64 , 1 }, - {6 , 10240 , 10 , 64 , 1 }, - {6 , 20480 , 10 , 32 , 1 }, - {6 , 30720 , 10 , 64 , 1 }, - {6 , 40960 , 10 , 64 , 1 }, - {6 , 51200 , 10 , 64 , 1 }, - {6 , 61440 , 10 , 64 , 1 }, - {6 , 71680 , 10 , 64 , 1 }, - {6 , 81920 , 10 , 64 , 1 }, - {6 , 92160 , 10 , 64 , 1 }, - {6 , 102400 , 10 , 64 , 1 }, - {6 , 204800 , 10 , 8192 , 1 }, - {6 , 307200 , 10 , 2048 , 1 }, - {6 , 409600 , 10 , 8192 , 1 }, - {6 , 512000 , 10 
, 1024 , 1 }, - {6 , 614400 , 10 , 8192 , 1 }, - {6 , 716800 , 10 , 4096 , 1 }, - {6 , 819200 , 10 , 32768 , 1 }, - {6 , 921600 , 10 , 4096 , 1 }, - {6 , 1024000, 10 , 8192 , 1 }, - {12 , 512 , 15 , 32 , 1 }, - {12 , 1024 , 15 , 256 , 1 }, - {12 , 1536 , 15 , 256 , 1 }, - {12 , 2048 , 15 , 128 , 1 }, - {12 , 2560 , 15 , 64 , 1 }, - {12 , 3072 , 15 , 64 , 1 }, - {12 , 3584 , 15 , 32 , 1 }, - {12 , 4096 , 15 , 32 , 1 }, - {12 , 4608 , 15 , 128 , 1 }, - {12 , 5120 , 15 , 256 , 1 }, - {12 , 5632 , 15 , 64 , 1 }, - {12 , 6144 , 15 , 32 , 1 }, - {12 , 6656 , 15 , 32 , 1 }, - {12 , 7168 , 15 , 64 , 1 }, - {12 , 7680 , 15 , 64 , 1 }, - {12 , 8192 , 15 , 32 , 1 }, - {12 , 8704 , 15 , 64 , 1 }, - {12 , 9216 , 15 , 32 , 1 }, - {12 , 9728 , 15 , 32 , 1 }, - {12 , 10240 , 15 , 32 , 1 }, - {12 , 20480 , 15 , 64 , 1 }, - {12 , 30720 , 15 , 64 , 1 }, - {12 , 40960 , 15 , 64 , 1 }, - {12 , 51200 , 15 , 64 , 1 }, - {12 , 61440 , 15 , 64 , 1 }, - {12 , 71680 , 15 , 64 , 1 }, - {12 , 81920 , 15 , 8192 , 1 }, - {12 , 92160 , 15 , 64 , 1 }, - {12 , 102400 , 15 , 64 , 1 }, - {12 , 204800 , 15 , 4096 , 1 }, - {12 , 307200 , 15 , 4096 , 1 }, - {12 , 409600 , 15 , 16384 , 1 }, - {12 , 512000 , 15 , 4096 , 1 }, - {12 , 614400 , 15 , 8192 , 1 }, - {12 , 716800 , 15 , 4096 , 1 }, - {12 , 819200 , 15 , 32768 , 1 }, - {12 , 921600 , 15 , 4096 , 1 }, - {12 , 1024000, 15 , 8192 , 1 }, - {11 , 512 , 20 , 64 , 1 }, - {11 , 1024 , 20 , 64 , 1 }, - {11 , 1536 , 20 , 32 , 1 }, - {11 , 2048 , 20 , 32 , 1 }, - {11 , 2560 , 20 , 32 , 1 }, - {11 , 3072 , 20 , 64 , 1 }, - {11 , 3584 , 20 , 64 , 1 }, - {11 , 4096 , 20 , 64 , 1 }, - {11 , 4608 , 20 , 32 , 1 }, - {11 , 5120 , 20 , 128 , 1 }, - {11 , 5632 , 20 , 32 , 1 }, - {11 , 6144 , 20 , 32 , 1 }, - {11 , 6656 , 20 , 64 , 1 }, - {11 , 7168 , 20 , 64 , 1 }, - {11 , 7680 , 20 , 64 , 1 }, - {11 , 8192 , 20 , 64 , 1 }, - {11 , 8704 , 20 , 32 , 1 }, - {11 , 9216 , 20 , 32 , 1 }, - {11 , 9728 , 20 , 64 , 1 }, - {11 , 10240 , 20 , 32 , 1 }, - {11 , 20480 , 20 , 64 
, 1 }, - {11 , 30720 , 20 , 64 , 1 }, - {11 , 40960 , 20 , 64 , 1 }, - {11 , 51200 , 20 , 64 , 1 }, - {11 , 61440 , 20 , 64 , 1 }, - {11 , 71680 , 20 , 64 , 1 }, - {11 , 81920 , 20 , 16384 , 1 }, - {11 , 92160 , 20 , 256 , 1 }, - {11 , 102400 , 20 , 64 , 1 }, - {11 , 204800 , 20 , 8192 , 1 }, - {11 , 307200 , 20 , 4096 , 1 }, - {11 , 409600 , 20 , 16384 , 1 }, - {11 , 512000 , 20 , 1024 , 1 }, - {11 , 614400 , 20 , 8192 , 1 }, - {11 , 716800 , 20 , 4096 , 1 }, - {11 , 819200 , 20 , 32768 , 1 }, - {11 , 921600 , 20 , 4096 , 1 }, - {11 , 1024000, 20 , 4096 , 1 }, - {16 , 512 , 21 , 64 , 1 }, - {16 , 1024 , 21 , 32 , 1 }, - {16 , 1536 , 21 , 64 , 1 }, - {16 , 2048 , 21 , 64 , 1 }, - {16 , 2560 , 21 , 32 , 1 }, - {16 , 3072 , 21 , 64 , 1 }, - {16 , 3584 , 21 , 64 , 1 }, - {16 , 4096 , 21 , 32 , 1 }, - {16 , 4608 , 21 , 64 , 1 }, - {16 , 5120 , 21 , 64 , 1 }, - {16 , 5632 , 21 , 64 , 1 }, - {16 , 6144 , 21 , 32 , 1 }, - {16 , 6656 , 21 , 32 , 1 }, - {16 , 7168 , 21 , 64 , 1 }, - {16 , 7680 , 21 , 32 , 1 }, - {16 , 8192 , 21 , 64 , 1 }, - {16 , 8704 , 21 , 32 , 1 }, - {16 , 9216 , 21 , 64 , 1 }, - {16 , 9728 , 21 , 64 , 1 }, - {16 , 10240 , 21 , 32 , 1 }, - {16 , 20480 , 21 , 32 , 1 }, - {16 , 30720 , 21 , 64 , 1 }, - {16 , 40960 , 21 , 32 , 1 }, - {16 , 51200 , 21 , 64 , 1 }, - {16 , 61440 , 21 , 64 , 1 }, - {16 , 71680 , 21 , 64 , 1 }, - {16 , 81920 , 21 , 16384 , 1 }, - {16 , 92160 , 21 , 64 , 1 }, - {16 , 102400 , 21 , 64 , 1 }, - {16 , 204800 , 21 , 4096 , 1 }, - {16 , 307200 , 21 , 256 , 1 }, - {16 , 409600 , 21 , 16384 , 1 }, - {16 , 512000 , 21 , 4096 , 1 }, - {16 , 614400 , 21 , 8192 , 1 }, - {16 , 716800 , 21 , 4096 , 1 }, - {16 , 819200 , 21 , 8192 , 1 }, - {16 , 921600 , 21 , 4096 , 1 }, - {16 , 1024000, 21 , 8192 , 1 }, - {25 , 512 , 28 , 32 , 1 }, - {25 , 1024 , 28 , 64 , 1 }, - {25 , 1536 , 28 , 128 , 1 }, - {25 , 2048 , 28 , 64 , 1 }, - {25 , 2560 , 28 , 32 , 1 }, - {25 , 3072 , 28 , 64 , 1 }, - {25 , 3584 , 28 , 64 , 1 }, - {25 , 4096 , 28 , 64 , 1 }, - 
{25 , 4608 , 28 , 32 , 1 }, - {25 , 5120 , 28 , 32 , 1 }, - {25 , 5632 , 28 , 64 , 1 }, - {25 , 6144 , 28 , 64 , 1 }, - {25 , 6656 , 28 , 32 , 1 }, - {25 , 7168 , 28 , 64 , 1 }, - {25 , 7680 , 28 , 32 , 1 }, - {25 , 8192 , 28 , 32 , 1 }, - {25 , 8704 , 28 , 64 , 1 }, - {25 , 9216 , 28 , 32 , 1 }, - {25 , 9728 , 28 , 32 , 1 }, - {25 , 10240 , 28 , 32 , 1 }, - {25 , 20480 , 28 , 32 , 1 }, - {25 , 30720 , 28 , 64 , 1 }, - {25 , 40960 , 28 , 64 , 1 }, - {25 , 51200 , 28 , 32 , 1 }, - {25 , 61440 , 28 , 64 , 1 }, - {25 , 71680 , 28 , 64 , 1 }, - {25 , 81920 , 28 , 256 , 1 }, - {25 , 92160 , 28 , 64 , 1 }, - {25 , 102400 , 28 , 4096 , 1 }, - {25 , 204800 , 28 , 8192 , 1 }, - {25 , 307200 , 28 , 4096 , 1 }, - {25 , 409600 , 28 , 16384 , 1 }, - {25 , 512000 , 28 , 4096 , 1 }, - {25 , 614400 , 28 , 8192 , 1 }, - {25 , 716800 , 28 , 4096 , 1 }, - {25 , 819200 , 28 , 16384 , 1 }, - {25 , 921600 , 28 , 921600 , 1 }, - {25 , 1024000, 28 , 8192 , 1 }, - {24 , 512 , 35 , 32 , 1 }, - {24 , 1024 , 35 , 64 , 1 }, - {24 , 1536 , 35 , 32 , 1 }, - {24 , 2048 , 35 , 64 , 1 }, - {24 , 2560 , 35 , 64 , 1 }, - {24 , 3072 , 35 , 32 , 1 }, - {24 , 3584 , 35 , 32 , 1 }, - {24 , 4096 , 35 , 32 , 1 }, - {24 , 4608 , 35 , 32 , 1 }, - {24 , 5120 , 35 , 32 , 1 }, - {24 , 5632 , 35 , 64 , 1 }, - {24 , 6144 , 35 , 32 , 1 }, - {24 , 6656 , 35 , 32 , 1 }, - {24 , 7168 , 35 , 32 , 1 }, - {24 , 7680 , 35 , 64 , 1 }, - {24 , 8192 , 35 , 64 , 1 }, - {24 , 8704 , 35 , 32 , 1 }, - {24 , 9216 , 35 , 64 , 1 }, - {24 , 9728 , 35 , 32 , 1 }, - {24 , 10240 , 35 , 64 , 1 }, - {24 , 20480 , 35 , 32 , 1 }, - {24 , 30720 , 35 , 64 , 1 }, - {24 , 40960 , 35 , 64 , 1 }, - {24 , 51200 , 35 , 64 , 1 }, - {24 , 61440 , 35 , 64 , 1 }, - {24 , 71680 , 35 , 32 , 1 }, - {24 , 81920 , 35 , 16384 , 1 }, - {24 , 92160 , 35 , 64 , 1 }, - {24 , 102400 , 35 , 64 , 1 }, - {24 , 204800 , 35 , 8192 , 1 }, - {24 , 307200 , 35 , 4096 , 1 }, - {24 , 409600 , 35 , 8192 , 1 }, - {24 , 512000 , 35 , 4096 , 1 }, - {24 , 614400 , 35 , 8192 , 
1 }, - {24 , 716800 , 35 , 4096 , 1 }, - {24 , 819200 , 35 , 2048 , 1 }, - {24 , 921600 , 35 , 2048 , 1 }, - {24 , 1024000, 35 , 8192 , 1 }, - {33 , 512 , 36 , 32 , 1 }, - {33 , 1024 , 36 , 32 , 1 }, - {33 , 1536 , 36 , 64 , 1 }, - {33 , 2048 , 36 , 32 , 1 }, - {33 , 2560 , 36 , 32 , 1 }, - {33 , 3072 , 36 , 32 , 1 }, - {33 , 3584 , 36 , 64 , 1 }, - {33 , 4096 , 36 , 32 , 1 }, - {33 , 4608 , 36 , 64 , 1 }, - {33 , 5120 , 36 , 32 , 1 }, - {33 , 5632 , 36 , 32 , 1 }, - {33 , 6144 , 36 , 32 , 1 }, - {33 , 6656 , 36 , 32 , 1 }, - {33 , 7168 , 36 , 64 , 1 }, - {33 , 7680 , 36 , 64 , 1 }, - {33 , 8192 , 36 , 32 , 1 }, - {33 , 8704 , 36 , 32 , 1 }, - {33 , 9216 , 36 , 64 , 1 }, - {33 , 9728 , 36 , 32 , 1 }, - {33 , 10240 , 36 , 32 , 1 }, - {33 , 20480 , 36 , 32 , 1 }, - {33 , 30720 , 36 , 64 , 1 }, - {33 , 40960 , 36 , 32 , 1 }, - {33 , 51200 , 36 , 32 , 1 }, - {33 , 61440 , 36 , 64 , 1 }, - {33 , 71680 , 36 , 64 , 1 }, - {33 , 81920 , 36 , 64 , 1 }, - {33 , 92160 , 36 , 64 , 1 }, - {33 , 102400 , 36 , 64 , 1 }, - {33 , 204800 , 36 , 8192 , 1 }, - {33 , 307200 , 36 , 4096 , 1 }, - {33 , 409600 , 36 , 8192 , 1 }, - {33 , 512000 , 36 , 4096 , 1 }, - {33 , 614400 , 36 , 4096 , 1 }, - {33 , 716800 , 36 , 4096 , 1 }, - {33 , 819200 , 36 , 16384 , 1 }, - {33 , 921600 , 36 , 921600 , 1 }, - {33 , 1024000, 36 , 8192 , 1 }, - {42 , 512 , 45 , 32 , 1 }, - {42 , 1024 , 45 , 64 , 1 }, - {42 , 1536 , 45 , 32 , 1 }, - {42 , 2048 , 45 , 64 , 1 }, - {42 , 2560 , 45 , 32 , 1 }, - {42 , 3072 , 45 , 64 , 1 }, - {42 , 3584 , 45 , 32 , 1 }, - {42 , 4096 , 45 , 32 , 1 }, - {42 , 4608 , 45 , 32 , 1 }, - {42 , 5120 , 45 , 32 , 1 }, - {42 , 5632 , 45 , 64 , 1 }, - {42 , 6144 , 45 , 32 , 1 }, - {42 , 6656 , 45 , 32 , 1 }, - {42 , 7168 , 45 , 64 , 1 }, - {42 , 7680 , 45 , 32 , 1 }, - {42 , 8192 , 45 , 64 , 1 }, - {42 , 8704 , 45 , 64 , 1 }, - {42 , 9216 , 45 , 32 , 1 }, - {42 , 9728 , 45 , 64 , 1 }, - {42 , 10240 , 45 , 32 , 1 }, - {42 , 20480 , 45 , 2048 , 1 }, - {42 , 30720 , 45 , 32 , 1 }, - {42 
, 40960 , 45 , 64 , 1 }, - {42 , 51200 , 45 , 64 , 1 }, - {42 , 61440 , 45 , 32 , 1 }, - {42 , 71680 , 45 , 32 , 1 }, - {42 , 81920 , 45 , 64 , 1 }, - {42 , 92160 , 45 , 64 , 1 }, - {42 , 102400 , 45 , 64 , 1 }, - {42 , 204800 , 45 , 4096 , 1 }, - {42 , 307200 , 45 , 4096 , 1 }, - {42 , 409600 , 45 , 16384 , 1 }, - {42 , 512000 , 45 , 4096 , 1 }, - {42 , 614400 , 45 , 4096 , 1 }, - {42 , 716800 , 45 , 4096 , 1 }, - {42 , 819200 , 45 , 8192 , 1 }, - {42 , 921600 , 45 , 2048 , 1 }, - {42 , 1024000, 45 , 8192 , 1 }, - {43 , 512 , 56 , 32 , 1 }, - {43 , 1024 , 56 , 32 , 1 }, - {43 , 1536 , 56 , 32 , 1 }, - {43 , 2048 , 56 , 32 , 1 }, - {43 , 2560 , 56 , 32 , 1 }, - {43 , 3072 , 56 , 32 , 1 }, - {43 , 3584 , 56 , 32 , 1 }, - {43 , 4096 , 56 , 64 , 1 }, - {43 , 4608 , 56 , 64 , 1 }, - {43 , 5120 , 56 , 32 , 1 }, - {43 , 5632 , 56 , 64 , 1 }, - {43 , 6144 , 56 , 64 , 1 }, - {43 , 6656 , 56 , 32 , 1 }, - {43 , 7168 , 56 , 32 , 1 }, - {43 , 7680 , 56 , 32 , 1 }, - {43 , 8192 , 56 , 32 , 1 }, - {43 , 8704 , 56 , 32 , 1 }, - {43 , 9216 , 56 , 64 , 1 }, - {43 , 9728 , 56 , 64 , 1 }, - {43 , 10240 , 56 , 32 , 1 }, - {43 , 20480 , 56 , 64 , 1 }, - {43 , 30720 , 56 , 64 , 1 }, - {43 , 40960 , 56 , 64 , 1 }, - {43 , 51200 , 56 , 64 , 1 }, - {43 , 61440 , 56 , 64 , 1 }, - {43 , 71680 , 56 , 64 , 1 }, - {43 , 81920 , 56 , 64 , 1 }, - {43 , 92160 , 56 , 64 , 1 }, - {43 , 102400 , 56 , 64 , 1 }, - {43 , 204800 , 56 , 8192 , 1 }, - {43 , 307200 , 56 , 2048 , 1 }, - {43 , 409600 , 56 , 8192 , 1 }, - {43 , 512000 , 56 , 2048 , 1 }, - {43 , 614400 , 56 , 8192 , 1 }, - {43 , 716800 , 56 , 4096 , 1 }, - {43 , 819200 , 56 , 4096 , 1 }, - {43 , 921600 , 56 , 4096 , 1 }, - {43 , 1024000, 56 , 4096 , 1 }, - {126 , 512 , 84 , 32 , 1 }, - {126 , 1024 , 84 , 512 , 1 }, - {126 , 1536 , 84 , 32 , 1 }, - {126 , 2048 , 84 , 64 , 1 }, - {126 , 2560 , 84 , 64 , 1 }, - {126 , 3072 , 84 , 64 , 1 }, - {126 , 3584 , 84 , 64 , 1 }, - {126 , 4096 , 84 , 32 , 1 }, - {126 , 4608 , 84 , 32 , 1 }, - {126 , 5120 , 
84 , 32 , 1 }, - {126 , 5632 , 84 , 64 , 1 }, - {126 , 6144 , 84 , 32 , 1 }, - {126 , 6656 , 84 , 64 , 1 }, - {126 , 7168 , 84 , 64 , 1 }, - {126 , 7680 , 84 , 64 , 1 }, - {126 , 8192 , 84 , 64 , 1 }, - {126 , 8704 , 84 , 64 , 1 }, - {126 , 9216 , 84 , 32 , 1 }, - {126 , 9728 , 84 , 32 , 1 }, - {126 , 10240 , 84 , 32 , 1 }, - {126 , 20480 , 84 , 32 , 1 }, - {126 , 30720 , 84 , 2048 , 0 }, - {126 , 40960 , 84 , 2048 , 0 }, - {126 , 51200 , 84 , 2048 , 0 }, - {126 , 61440 , 84 , 4096 , 0 }, - {126 , 71680 , 84 , 64 , 0 }, - {126 , 81920 , 84 , 8192 , 0 }, - {126 , 92160 , 84 , 64 , 0 }, - {126 , 102400 , 84 , 64 , 0 }, - {126 , 204800 , 84 , 204800 , 0 }, - {126 , 307200 , 84 , 307200 , 0 }, - {126 , 409600 , 84 , 409600 , 0 }, - {126 , 512000 , 84 , 512000 , 0 }, - {126 , 614400 , 84 , 614400 , 0 }, - {126 , 716800 , 84 , 716800 , 0 }, - {126 , 819200 , 84 , 819200 , 0 }, - {126 , 921600 , 84 , 921600 , 0 }, - {126 , 1024000, 84 , 1024000, 0 }, - {210 , 512 , 120 , 64 , 1 }, - {210 , 1024 , 120 , 64 , 1 }, - {210 , 1536 , 120 , 32 , 1 }, - {210 , 2048 , 120 , 2048 , 1 }, - {210 , 2560 , 120 , 64 , 1 }, - {210 , 3072 , 120 , 1024 , 1 }, - {210 , 3584 , 120 , 64 , 1 }, - {210 , 4096 , 120 , 2048 , 1 }, - {210 , 4608 , 120 , 64 , 0 }, - {210 , 5120 , 120 , 1024 , 1 }, - {210 , 5632 , 120 , 64 , 1 }, - {210 , 6144 , 120 , 1024 , 1 }, - {210 , 6656 , 120 , 64 , 1 }, - {210 , 7168 , 120 , 1024 , 1 }, - {210 , 7680 , 120 , 64 , 1 }, - {210 , 8192 , 120 , 8192 , 1 }, - {210 , 8704 , 120 , 32 , 1 }, - {210 , 9216 , 120 , 1024 , 1 }, - {210 , 9728 , 120 , 64 , 1 }, - {210 , 10240 , 120 , 1024 , 0 }, - {210 , 20480 , 120 , 4096 , 0 }, - {210 , 30720 , 120 , 64 , 0 }, - {210 , 40960 , 120 , 40960 , 0 }, - {210 , 51200 , 120 , 51200 , 0 }, - {210 , 61440 , 120 , 61440 , 0 }, - {210 , 71680 , 120 , 71680 , 0 }, - {210 , 81920 , 120 , 81920 , 0 }, - {210 , 92160 , 120 , 92160 , 0 }, - {210 , 102400 , 120 , 102400 , 0 }, - {210 , 204800 , 120 , 2048 , 0 }, - {210 , 307200 , 120 , 
307200 , 0 }, - {210 , 409600 , 120 , 409600 , 0 }, - {210 , 512000 , 120 , 512000 , 0 }, - {210 , 614400 , 120 , 614400 , 0 }, - {210 , 716800 , 120 , 716800 , 0 }, - {210 , 819200 , 120 , 819200 , 0 }, - {210 , 921600 , 120 , 921600 , 0 }, - {210 , 1024000, 120 , 1024000, 0 }, - {330 , 512 , 165 , 512 , 1 }, - {330 , 1024 , 165 , 1024 , 1 }, - {330 , 1536 , 165 , 512 , 1 }, - {330 , 2048 , 165 , 2048 , 1 }, - {330 , 2560 , 165 , 512 , 1 }, - {330 , 3072 , 165 , 512 , 0 }, - {330 , 3584 , 165 , 512 , 1 }, - {330 , 4096 , 165 , 4096 , 1 }, - {330 , 4608 , 165 , 512 , 1 }, - {330 , 5120 , 165 , 1024 , 1 }, - {330 , 5632 , 165 , 512 , 1 }, - {330 , 6144 , 165 , 2048 , 1 }, - {330 , 6656 , 165 , 512 , 1 }, - {330 , 7168 , 165 , 512 , 1 }, - {330 , 7680 , 165 , 512 , 1 }, - {330 , 8192 , 165 , 2048 , 1 }, - {330 , 8704 , 165 , 512 , 1 }, - {330 , 9216 , 165 , 1024 , 1 }, - {330 , 9728 , 165 , 512 , 1 }, - {330 , 10240 , 165 , 2048 , 1 }, - {330 , 20480 , 165 , 2048 , 1 }, - {330 , 30720 , 165 , 30720 , 0 }, - {330 , 40960 , 165 , 40960 , 0 }, - {330 , 51200 , 165 , 2048 , 0 }, - {330 , 61440 , 165 , 61440 , 0 }, - {330 , 71680 , 165 , 71680 , 0 }, - {330 , 81920 , 165 , 81920 , 0 }, - {330 , 92160 , 165 , 92160 , 0 }, - {330 , 102400 , 165 , 2048 , 0 }, - {330 , 204800 , 165 , 204800 , 0 }, - {330 , 307200 , 165 , 307200 , 0 }, - {330 , 409600 , 165 , 409600 , 0 }, - {330 , 512000 , 165 , 512000 , 0 }, - {330 , 614400 , 165 , 614400 , 0 }, - {330 , 716800 , 165 , 716800 , 0 }, - {330 , 819200 , 165 , 819200 , 0 }, - {330 , 921600 , 165 , 921600 , 0 }, - {330 , 1024000, 165 , 1024000, 0 } +std::vector > dgemm_tn_v100 = { + {1, 512, 3, 32, 0}, + {1, 1024, 3, 256, 0}, + {1, 1536, 3, 64, 0}, + {1, 2048, 3, 1024, 0}, + {1, 2560, 3, 64, 0}, + {1, 3072, 3, 64, 0}, + {1, 3584, 3, 32, 0}, + {1, 4096, 3, 64, 0}, + {1, 4608, 3, 64, 0}, + {1, 5120, 3, 64, 0}, + {1, 5632, 3, 32, 0}, + {1, 6144, 3, 64, 0}, + {1, 6656, 3, 32, 0}, + {1, 7168, 3, 1024, 0}, + {1, 7680, 3, 32, 1}, + {1, 
8192, 3, 32, 0}, + {1, 8704, 3, 32, 0}, + {1, 9216, 3, 32, 0}, + {1, 9728, 3, 32, 0}, + {1, 10240, 3, 64, 0}, + {1, 20480, 3, 32, 0}, + {1, 30720, 3, 32, 0}, + {1, 40960, 3, 32, 0}, + {1, 51200, 3, 32, 0}, + {1, 61440, 3, 64, 1}, + {1, 71680, 3, 64, 1}, + {1, 81920, 3, 2048, 1}, + {1, 92160, 3, 64, 1}, + {1, 102400, 3, 64, 1}, + {1, 204800, 3, 2048, 1}, + {1, 307200, 3, 2048, 1}, + {1, 409600, 3, 2048, 1}, + {1, 512000, 3, 2048, 1}, + {1, 614400, 3, 4096, 1}, + {1, 716800, 3, 4096, 1}, + {1, 819200, 3, 32768, 1}, + {1, 921600, 3, 4096, 1}, + {1, 1024000, 3, 8192, 1}, + {1, 512, 4, 32, 0}, + {1, 1024, 4, 64, 0}, + {1, 1536, 4, 64, 0}, + {1, 2048, 4, 32, 0}, + {1, 2560, 4, 64, 0}, + {1, 3072, 4, 64, 0}, + {1, 3584, 4, 64, 0}, + {1, 4096, 4, 32, 0}, + {1, 4608, 4, 32, 0}, + {1, 5120, 4, 32, 0}, + {1, 5632, 4, 64, 0}, + {1, 6144, 4, 64, 0}, + {1, 6656, 4, 32, 0}, + {1, 7168, 4, 512, 0}, + {1, 7680, 4, 64, 0}, + {1, 8192, 4, 32, 0}, + {1, 8704, 4, 64, 1}, + {1, 9216, 4, 64, 0}, + {1, 9728, 4, 64, 0}, + {1, 10240, 4, 64, 0}, + {1, 20480, 4, 32, 0}, + {1, 30720, 4, 64, 0}, + {1, 40960, 4, 64, 0}, + {1, 51200, 4, 32, 0}, + {1, 61440, 4, 64, 1}, + {1, 71680, 4, 32, 0}, + {1, 81920, 4, 64, 1}, + {1, 92160, 4, 512, 1}, + {1, 102400, 4, 32, 0}, + {1, 204800, 4, 64, 0}, + {1, 307200, 4, 2048, 1}, + {1, 409600, 4, 16384, 1}, + {1, 512000, 4, 2048, 1}, + {1, 614400, 4, 8192, 1}, + {1, 716800, 4, 4096, 1}, + {1, 819200, 4, 32768, 1}, + {1, 921600, 4, 4096, 1}, + {1, 1024000, 4, 4096, 1}, + {3, 512, 6, 256, 1}, + {3, 1024, 6, 64, 1}, + {3, 1536, 6, 32, 1}, + {3, 2048, 6, 32, 1}, + {3, 2560, 6, 32, 1}, + {3, 3072, 6, 64, 1}, + {3, 3584, 6, 128, 1}, + {3, 4096, 6, 64, 1}, + {3, 4608, 6, 32, 1}, + {3, 5120, 6, 64, 1}, + {3, 5632, 6, 32, 1}, + {3, 6144, 6, 32, 1}, + {3, 6656, 6, 256, 1}, + {3, 7168, 6, 64, 1}, + {3, 7680, 6, 256, 1}, + {3, 8192, 6, 32, 1}, + {3, 8704, 6, 256, 1}, + {3, 9216, 6, 64, 1}, + {3, 9728, 6, 32, 1}, + {3, 10240, 6, 32, 1}, + {3, 20480, 6, 64, 1}, + {3, 30720, 
6, 64, 1}, + {3, 40960, 6, 64, 1}, + {3, 51200, 6, 64, 1}, + {3, 61440, 6, 64, 1}, + {3, 71680, 6, 1024, 1}, + {3, 81920, 6, 64, 1}, + {3, 92160, 6, 64, 1}, + {3, 102400, 6, 64, 1}, + {3, 204800, 6, 128, 1}, + {3, 307200, 6, 4096, 1}, + {3, 409600, 6, 16384, 1}, + {3, 512000, 6, 4096, 1}, + {3, 614400, 6, 8192, 1}, + {3, 716800, 6, 4096, 1}, + {3, 819200, 6, 32768, 1}, + {3, 921600, 6, 4096, 1}, + {3, 1024000, 6, 8192, 1}, + {4, 512, 10, 32, 1}, + {4, 1024, 10, 32, 1}, + {4, 1536, 10, 64, 1}, + {4, 2048, 10, 64, 1}, + {4, 2560, 10, 64, 1}, + {4, 3072, 10, 64, 1}, + {4, 3584, 10, 32, 1}, + {4, 4096, 10, 32, 1}, + {4, 4608, 10, 32, 1}, + {4, 5120, 10, 32, 1}, + {4, 5632, 10, 64, 1}, + {4, 6144, 10, 32, 1}, + {4, 6656, 10, 32, 1}, + {4, 7168, 10, 64, 1}, + {4, 7680, 10, 64, 1}, + {4, 8192, 10, 32, 1}, + {4, 8704, 10, 32, 1}, + {4, 9216, 10, 32, 1}, + {4, 9728, 10, 32, 1}, + {4, 10240, 10, 64, 1}, + {4, 20480, 10, 64, 1}, + {4, 30720, 10, 32, 1}, + {4, 40960, 10, 256, 1}, + {4, 51200, 10, 64, 1}, + {4, 61440, 10, 64, 1}, + {4, 71680, 10, 64, 1}, + {4, 81920, 10, 64, 1}, + {4, 92160, 10, 64, 1}, + {4, 102400, 10, 64, 1}, + {4, 204800, 10, 8192, 1}, + {4, 307200, 10, 4096, 1}, + {4, 409600, 10, 16384, 1}, + {4, 512000, 10, 4096, 1}, + {4, 614400, 10, 4096, 1}, + {4, 716800, 10, 4096, 1}, + {4, 819200, 10, 16384, 1}, + {4, 921600, 10, 4096, 1}, + {4, 1024000, 10, 8192, 1}, + {6, 512, 10, 32, 1}, + {6, 1024, 10, 32, 1}, + {6, 1536, 10, 64, 1}, + {6, 2048, 10, 64, 1}, + {6, 2560, 10, 32, 1}, + {6, 3072, 10, 32, 1}, + {6, 3584, 10, 32, 1}, + {6, 4096, 10, 32, 1}, + {6, 4608, 10, 256, 1}, + {6, 5120, 10, 32, 1}, + {6, 5632, 10, 32, 1}, + {6, 6144, 10, 32, 1}, + {6, 6656, 10, 32, 1}, + {6, 7168, 10, 64, 1}, + {6, 7680, 10, 64, 1}, + {6, 8192, 10, 32, 1}, + {6, 8704, 10, 32, 1}, + {6, 9216, 10, 64, 1}, + {6, 9728, 10, 64, 1}, + {6, 10240, 10, 64, 1}, + {6, 20480, 10, 32, 1}, + {6, 30720, 10, 64, 1}, + {6, 40960, 10, 64, 1}, + {6, 51200, 10, 64, 1}, + {6, 61440, 10, 64, 1}, + 
{6, 71680, 10, 64, 1}, + {6, 81920, 10, 64, 1}, + {6, 92160, 10, 64, 1}, + {6, 102400, 10, 64, 1}, + {6, 204800, 10, 8192, 1}, + {6, 307200, 10, 2048, 1}, + {6, 409600, 10, 8192, 1}, + {6, 512000, 10, 1024, 1}, + {6, 614400, 10, 8192, 1}, + {6, 716800, 10, 4096, 1}, + {6, 819200, 10, 32768, 1}, + {6, 921600, 10, 4096, 1}, + {6, 1024000, 10, 8192, 1}, + {12, 512, 15, 32, 1}, + {12, 1024, 15, 256, 1}, + {12, 1536, 15, 256, 1}, + {12, 2048, 15, 128, 1}, + {12, 2560, 15, 64, 1}, + {12, 3072, 15, 64, 1}, + {12, 3584, 15, 32, 1}, + {12, 4096, 15, 32, 1}, + {12, 4608, 15, 128, 1}, + {12, 5120, 15, 256, 1}, + {12, 5632, 15, 64, 1}, + {12, 6144, 15, 32, 1}, + {12, 6656, 15, 32, 1}, + {12, 7168, 15, 64, 1}, + {12, 7680, 15, 64, 1}, + {12, 8192, 15, 32, 1}, + {12, 8704, 15, 64, 1}, + {12, 9216, 15, 32, 1}, + {12, 9728, 15, 32, 1}, + {12, 10240, 15, 32, 1}, + {12, 20480, 15, 64, 1}, + {12, 30720, 15, 64, 1}, + {12, 40960, 15, 64, 1}, + {12, 51200, 15, 64, 1}, + {12, 61440, 15, 64, 1}, + {12, 71680, 15, 64, 1}, + {12, 81920, 15, 8192, 1}, + {12, 92160, 15, 64, 1}, + {12, 102400, 15, 64, 1}, + {12, 204800, 15, 4096, 1}, + {12, 307200, 15, 4096, 1}, + {12, 409600, 15, 16384, 1}, + {12, 512000, 15, 4096, 1}, + {12, 614400, 15, 8192, 1}, + {12, 716800, 15, 4096, 1}, + {12, 819200, 15, 32768, 1}, + {12, 921600, 15, 4096, 1}, + {12, 1024000, 15, 8192, 1}, + {11, 512, 20, 64, 1}, + {11, 1024, 20, 64, 1}, + {11, 1536, 20, 32, 1}, + {11, 2048, 20, 32, 1}, + {11, 2560, 20, 32, 1}, + {11, 3072, 20, 64, 1}, + {11, 3584, 20, 64, 1}, + {11, 4096, 20, 64, 1}, + {11, 4608, 20, 32, 1}, + {11, 5120, 20, 128, 1}, + {11, 5632, 20, 32, 1}, + {11, 6144, 20, 32, 1}, + {11, 6656, 20, 64, 1}, + {11, 7168, 20, 64, 1}, + {11, 7680, 20, 64, 1}, + {11, 8192, 20, 64, 1}, + {11, 8704, 20, 32, 1}, + {11, 9216, 20, 32, 1}, + {11, 9728, 20, 64, 1}, + {11, 10240, 20, 32, 1}, + {11, 20480, 20, 64, 1}, + {11, 30720, 20, 64, 1}, + {11, 40960, 20, 64, 1}, + {11, 51200, 20, 64, 1}, + {11, 61440, 20, 64, 1}, + {11, 
71680, 20, 64, 1}, + {11, 81920, 20, 16384, 1}, + {11, 92160, 20, 256, 1}, + {11, 102400, 20, 64, 1}, + {11, 204800, 20, 8192, 1}, + {11, 307200, 20, 4096, 1}, + {11, 409600, 20, 16384, 1}, + {11, 512000, 20, 1024, 1}, + {11, 614400, 20, 8192, 1}, + {11, 716800, 20, 4096, 1}, + {11, 819200, 20, 32768, 1}, + {11, 921600, 20, 4096, 1}, + {11, 1024000, 20, 4096, 1}, + {16, 512, 21, 64, 1}, + {16, 1024, 21, 32, 1}, + {16, 1536, 21, 64, 1}, + {16, 2048, 21, 64, 1}, + {16, 2560, 21, 32, 1}, + {16, 3072, 21, 64, 1}, + {16, 3584, 21, 64, 1}, + {16, 4096, 21, 32, 1}, + {16, 4608, 21, 64, 1}, + {16, 5120, 21, 64, 1}, + {16, 5632, 21, 64, 1}, + {16, 6144, 21, 32, 1}, + {16, 6656, 21, 32, 1}, + {16, 7168, 21, 64, 1}, + {16, 7680, 21, 32, 1}, + {16, 8192, 21, 64, 1}, + {16, 8704, 21, 32, 1}, + {16, 9216, 21, 64, 1}, + {16, 9728, 21, 64, 1}, + {16, 10240, 21, 32, 1}, + {16, 20480, 21, 32, 1}, + {16, 30720, 21, 64, 1}, + {16, 40960, 21, 32, 1}, + {16, 51200, 21, 64, 1}, + {16, 61440, 21, 64, 1}, + {16, 71680, 21, 64, 1}, + {16, 81920, 21, 16384, 1}, + {16, 92160, 21, 64, 1}, + {16, 102400, 21, 64, 1}, + {16, 204800, 21, 4096, 1}, + {16, 307200, 21, 256, 1}, + {16, 409600, 21, 16384, 1}, + {16, 512000, 21, 4096, 1}, + {16, 614400, 21, 8192, 1}, + {16, 716800, 21, 4096, 1}, + {16, 819200, 21, 8192, 1}, + {16, 921600, 21, 4096, 1}, + {16, 1024000, 21, 8192, 1}, + {25, 512, 28, 32, 1}, + {25, 1024, 28, 64, 1}, + {25, 1536, 28, 128, 1}, + {25, 2048, 28, 64, 1}, + {25, 2560, 28, 32, 1}, + {25, 3072, 28, 64, 1}, + {25, 3584, 28, 64, 1}, + {25, 4096, 28, 64, 1}, + {25, 4608, 28, 32, 1}, + {25, 5120, 28, 32, 1}, + {25, 5632, 28, 64, 1}, + {25, 6144, 28, 64, 1}, + {25, 6656, 28, 32, 1}, + {25, 7168, 28, 64, 1}, + {25, 7680, 28, 32, 1}, + {25, 8192, 28, 32, 1}, + {25, 8704, 28, 64, 1}, + {25, 9216, 28, 32, 1}, + {25, 9728, 28, 32, 1}, + {25, 10240, 28, 32, 1}, + {25, 20480, 28, 32, 1}, + {25, 30720, 28, 64, 1}, + {25, 40960, 28, 64, 1}, + {25, 51200, 28, 32, 1}, + {25, 61440, 28, 64, 1}, + 
{25, 71680, 28, 64, 1}, + {25, 81920, 28, 256, 1}, + {25, 92160, 28, 64, 1}, + {25, 102400, 28, 4096, 1}, + {25, 204800, 28, 8192, 1}, + {25, 307200, 28, 4096, 1}, + {25, 409600, 28, 16384, 1}, + {25, 512000, 28, 4096, 1}, + {25, 614400, 28, 8192, 1}, + {25, 716800, 28, 4096, 1}, + {25, 819200, 28, 16384, 1}, + {25, 921600, 28, 921600, 1}, + {25, 1024000, 28, 8192, 1}, + {24, 512, 35, 32, 1}, + {24, 1024, 35, 64, 1}, + {24, 1536, 35, 32, 1}, + {24, 2048, 35, 64, 1}, + {24, 2560, 35, 64, 1}, + {24, 3072, 35, 32, 1}, + {24, 3584, 35, 32, 1}, + {24, 4096, 35, 32, 1}, + {24, 4608, 35, 32, 1}, + {24, 5120, 35, 32, 1}, + {24, 5632, 35, 64, 1}, + {24, 6144, 35, 32, 1}, + {24, 6656, 35, 32, 1}, + {24, 7168, 35, 32, 1}, + {24, 7680, 35, 64, 1}, + {24, 8192, 35, 64, 1}, + {24, 8704, 35, 32, 1}, + {24, 9216, 35, 64, 1}, + {24, 9728, 35, 32, 1}, + {24, 10240, 35, 64, 1}, + {24, 20480, 35, 32, 1}, + {24, 30720, 35, 64, 1}, + {24, 40960, 35, 64, 1}, + {24, 51200, 35, 64, 1}, + {24, 61440, 35, 64, 1}, + {24, 71680, 35, 32, 1}, + {24, 81920, 35, 16384, 1}, + {24, 92160, 35, 64, 1}, + {24, 102400, 35, 64, 1}, + {24, 204800, 35, 8192, 1}, + {24, 307200, 35, 4096, 1}, + {24, 409600, 35, 8192, 1}, + {24, 512000, 35, 4096, 1}, + {24, 614400, 35, 8192, 1}, + {24, 716800, 35, 4096, 1}, + {24, 819200, 35, 2048, 1}, + {24, 921600, 35, 2048, 1}, + {24, 1024000, 35, 8192, 1}, + {33, 512, 36, 32, 1}, + {33, 1024, 36, 32, 1}, + {33, 1536, 36, 64, 1}, + {33, 2048, 36, 32, 1}, + {33, 2560, 36, 32, 1}, + {33, 3072, 36, 32, 1}, + {33, 3584, 36, 64, 1}, + {33, 4096, 36, 32, 1}, + {33, 4608, 36, 64, 1}, + {33, 5120, 36, 32, 1}, + {33, 5632, 36, 32, 1}, + {33, 6144, 36, 32, 1}, + {33, 6656, 36, 32, 1}, + {33, 7168, 36, 64, 1}, + {33, 7680, 36, 64, 1}, + {33, 8192, 36, 32, 1}, + {33, 8704, 36, 32, 1}, + {33, 9216, 36, 64, 1}, + {33, 9728, 36, 32, 1}, + {33, 10240, 36, 32, 1}, + {33, 20480, 36, 32, 1}, + {33, 30720, 36, 64, 1}, + {33, 40960, 36, 32, 1}, + {33, 51200, 36, 32, 1}, + {33, 61440, 36, 64, 
1}, + {33, 71680, 36, 64, 1}, + {33, 81920, 36, 64, 1}, + {33, 92160, 36, 64, 1}, + {33, 102400, 36, 64, 1}, + {33, 204800, 36, 8192, 1}, + {33, 307200, 36, 4096, 1}, + {33, 409600, 36, 8192, 1}, + {33, 512000, 36, 4096, 1}, + {33, 614400, 36, 4096, 1}, + {33, 716800, 36, 4096, 1}, + {33, 819200, 36, 16384, 1}, + {33, 921600, 36, 921600, 1}, + {33, 1024000, 36, 8192, 1}, + {42, 512, 45, 32, 1}, + {42, 1024, 45, 64, 1}, + {42, 1536, 45, 32, 1}, + {42, 2048, 45, 64, 1}, + {42, 2560, 45, 32, 1}, + {42, 3072, 45, 64, 1}, + {42, 3584, 45, 32, 1}, + {42, 4096, 45, 32, 1}, + {42, 4608, 45, 32, 1}, + {42, 5120, 45, 32, 1}, + {42, 5632, 45, 64, 1}, + {42, 6144, 45, 32, 1}, + {42, 6656, 45, 32, 1}, + {42, 7168, 45, 64, 1}, + {42, 7680, 45, 32, 1}, + {42, 8192, 45, 64, 1}, + {42, 8704, 45, 64, 1}, + {42, 9216, 45, 32, 1}, + {42, 9728, 45, 64, 1}, + {42, 10240, 45, 32, 1}, + {42, 20480, 45, 2048, 1}, + {42, 30720, 45, 32, 1}, + {42, 40960, 45, 64, 1}, + {42, 51200, 45, 64, 1}, + {42, 61440, 45, 32, 1}, + {42, 71680, 45, 32, 1}, + {42, 81920, 45, 64, 1}, + {42, 92160, 45, 64, 1}, + {42, 102400, 45, 64, 1}, + {42, 204800, 45, 4096, 1}, + {42, 307200, 45, 4096, 1}, + {42, 409600, 45, 16384, 1}, + {42, 512000, 45, 4096, 1}, + {42, 614400, 45, 4096, 1}, + {42, 716800, 45, 4096, 1}, + {42, 819200, 45, 8192, 1}, + {42, 921600, 45, 2048, 1}, + {42, 1024000, 45, 8192, 1}, + {43, 512, 56, 32, 1}, + {43, 1024, 56, 32, 1}, + {43, 1536, 56, 32, 1}, + {43, 2048, 56, 32, 1}, + {43, 2560, 56, 32, 1}, + {43, 3072, 56, 32, 1}, + {43, 3584, 56, 32, 1}, + {43, 4096, 56, 64, 1}, + {43, 4608, 56, 64, 1}, + {43, 5120, 56, 32, 1}, + {43, 5632, 56, 64, 1}, + {43, 6144, 56, 64, 1}, + {43, 6656, 56, 32, 1}, + {43, 7168, 56, 32, 1}, + {43, 7680, 56, 32, 1}, + {43, 8192, 56, 32, 1}, + {43, 8704, 56, 32, 1}, + {43, 9216, 56, 64, 1}, + {43, 9728, 56, 64, 1}, + {43, 10240, 56, 32, 1}, + {43, 20480, 56, 64, 1}, + {43, 30720, 56, 64, 1}, + {43, 40960, 56, 64, 1}, + {43, 51200, 56, 64, 1}, + {43, 61440, 56, 64, 
1}, + {43, 71680, 56, 64, 1}, + {43, 81920, 56, 64, 1}, + {43, 92160, 56, 64, 1}, + {43, 102400, 56, 64, 1}, + {43, 204800, 56, 8192, 1}, + {43, 307200, 56, 2048, 1}, + {43, 409600, 56, 8192, 1}, + {43, 512000, 56, 2048, 1}, + {43, 614400, 56, 8192, 1}, + {43, 716800, 56, 4096, 1}, + {43, 819200, 56, 4096, 1}, + {43, 921600, 56, 4096, 1}, + {43, 1024000, 56, 4096, 1}, + {126, 512, 84, 32, 1}, + {126, 1024, 84, 512, 1}, + {126, 1536, 84, 32, 1}, + {126, 2048, 84, 64, 1}, + {126, 2560, 84, 64, 1}, + {126, 3072, 84, 64, 1}, + {126, 3584, 84, 64, 1}, + {126, 4096, 84, 32, 1}, + {126, 4608, 84, 32, 1}, + {126, 5120, 84, 32, 1}, + {126, 5632, 84, 64, 1}, + {126, 6144, 84, 32, 1}, + {126, 6656, 84, 64, 1}, + {126, 7168, 84, 64, 1}, + {126, 7680, 84, 64, 1}, + {126, 8192, 84, 64, 1}, + {126, 8704, 84, 64, 1}, + {126, 9216, 84, 32, 1}, + {126, 9728, 84, 32, 1}, + {126, 10240, 84, 32, 1}, + {126, 20480, 84, 32, 1}, + {126, 30720, 84, 2048, 0}, + {126, 40960, 84, 2048, 0}, + {126, 51200, 84, 2048, 0}, + {126, 61440, 84, 4096, 0}, + {126, 71680, 84, 64, 0}, + {126, 81920, 84, 8192, 0}, + {126, 92160, 84, 64, 0}, + {126, 102400, 84, 64, 0}, + {126, 204800, 84, 204800, 0}, + {126, 307200, 84, 307200, 0}, + {126, 409600, 84, 409600, 0}, + {126, 512000, 84, 512000, 0}, + {126, 614400, 84, 614400, 0}, + {126, 716800, 84, 716800, 0}, + {126, 819200, 84, 819200, 0}, + {126, 921600, 84, 921600, 0}, + {126, 1024000, 84, 1024000, 0}, + {210, 512, 120, 64, 1}, + {210, 1024, 120, 64, 1}, + {210, 1536, 120, 32, 1}, + {210, 2048, 120, 2048, 1}, + {210, 2560, 120, 64, 1}, + {210, 3072, 120, 1024, 1}, + {210, 3584, 120, 64, 1}, + {210, 4096, 120, 2048, 1}, + {210, 4608, 120, 64, 0}, + {210, 5120, 120, 1024, 1}, + {210, 5632, 120, 64, 1}, + {210, 6144, 120, 1024, 1}, + {210, 6656, 120, 64, 1}, + {210, 7168, 120, 1024, 1}, + {210, 7680, 120, 64, 1}, + {210, 8192, 120, 8192, 1}, + {210, 8704, 120, 32, 1}, + {210, 9216, 120, 1024, 1}, + {210, 9728, 120, 64, 1}, + {210, 10240, 120, 1024, 0}, + 
{210, 20480, 120, 4096, 0}, + {210, 30720, 120, 64, 0}, + {210, 40960, 120, 40960, 0}, + {210, 51200, 120, 51200, 0}, + {210, 61440, 120, 61440, 0}, + {210, 71680, 120, 71680, 0}, + {210, 81920, 120, 81920, 0}, + {210, 92160, 120, 92160, 0}, + {210, 102400, 120, 102400, 0}, + {210, 204800, 120, 2048, 0}, + {210, 307200, 120, 307200, 0}, + {210, 409600, 120, 409600, 0}, + {210, 512000, 120, 512000, 0}, + {210, 614400, 120, 614400, 0}, + {210, 716800, 120, 716800, 0}, + {210, 819200, 120, 819200, 0}, + {210, 921600, 120, 921600, 0}, + {210, 1024000, 120, 1024000, 0}, + {330, 512, 165, 512, 1}, + {330, 1024, 165, 1024, 1}, + {330, 1536, 165, 512, 1}, + {330, 2048, 165, 2048, 1}, + {330, 2560, 165, 512, 1}, + {330, 3072, 165, 512, 0}, + {330, 3584, 165, 512, 1}, + {330, 4096, 165, 4096, 1}, + {330, 4608, 165, 512, 1}, + {330, 5120, 165, 1024, 1}, + {330, 5632, 165, 512, 1}, + {330, 6144, 165, 2048, 1}, + {330, 6656, 165, 512, 1}, + {330, 7168, 165, 512, 1}, + {330, 7680, 165, 512, 1}, + {330, 8192, 165, 2048, 1}, + {330, 8704, 165, 512, 1}, + {330, 9216, 165, 1024, 1}, + {330, 9728, 165, 512, 1}, + {330, 10240, 165, 2048, 1}, + {330, 20480, 165, 2048, 1}, + {330, 30720, 165, 30720, 0}, + {330, 40960, 165, 40960, 0}, + {330, 51200, 165, 2048, 0}, + {330, 61440, 165, 61440, 0}, + {330, 71680, 165, 71680, 0}, + {330, 81920, 165, 81920, 0}, + {330, 92160, 165, 92160, 0}, + {330, 102400, 165, 2048, 0}, + {330, 204800, 165, 204800, 0}, + {330, 307200, 165, 307200, 0}, + {330, 409600, 165, 409600, 0}, + {330, 512000, 165, 512000, 0}, + {330, 614400, 165, 614400, 0}, + {330, 716800, 165, 716800, 0}, + {330, 819200, 165, 819200, 0}, + {330, 921600, 165, 921600, 0}, + {330, 1024000, 165, 1024000, 0} }; - diff --git a/backends/magma/kernels/common/weight.h b/backends/magma/kernels/common/weight.h index 78aa65c699..4c12262c32 100644 --- a/backends/magma/kernels/common/weight.h +++ b/backends/magma/kernels/common/weight.h @@ -9,19 +9,17 @@ #define CEED_MAGMA_WEIGHT_H #include + 
#include "magma_v2.h" ////////////////////////////////////////////////////////////////////////////////////////// -static __global__ void -magma_weight_nontensor_kernel(const CeedInt nelem, const CeedInt Q, - const CeedScalar *__restrict__ qweight, - CeedScalar *__restrict__ d_V) { +static __global__ void magma_weight_nontensor_kernel(const CeedInt nelem, const CeedInt Q, const CeedScalar *__restrict__ qweight, + CeedScalar *__restrict__ d_V) { const int tid = threadIdx.x; - //TODO load qweight in shared memory if blockDim.z > 1? - for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; - elem += gridDim.x*blockDim.z) { - d_V[elem*Q + tid] = qweight[tid]; + // TODO load qweight in shared memory if blockDim.z > 1? + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x * blockDim.z) { + d_V[elem * Q + tid] = qweight[tid]; } } -#endif // CEED_MAGMA_WEIGHT_H +#endif // CEED_MAGMA_WEIGHT_H diff --git a/backends/magma/kernels/hip/weight_generic.hip.cpp b/backends/magma/kernels/hip/weight_generic.hip.cpp index b963b3dbf0..24fd090239 100644 --- a/backends/magma/kernels/hip/weight_generic.hip.cpp +++ b/backends/magma/kernels/hip/weight_generic.hip.cpp @@ -5,15 +5,12 @@ // // This file is part of CEED: http://github.com/ceed -#include "hip/hip_runtime.h" #include "../common/weight.h" - +#include "hip/hip_runtime.h" ////////////////////////////////////////////////////////////////////////////////////////// // NonTensor weight function -extern "C" void -magma_weight_nontensor(magma_int_t grid, magma_int_t threads, magma_int_t nelem, magma_int_t Q, - CeedScalar *dqweight, CeedScalar *dv, magma_queue_t queue) -{ - hipLaunchKernelGGL(magma_weight_nontensor_kernel, dim3(grid), dim3(threads), 0, magma_queue_get_hip_stream(queue), nelem, Q, dqweight, dv); +extern "C" void magma_weight_nontensor(magma_int_t grid, magma_int_t threads, magma_int_t nelem, magma_int_t Q, CeedScalar *dqweight, CeedScalar *dv, + magma_queue_t queue) { + 
hipLaunchKernelGGL(magma_weight_nontensor_kernel, dim3(grid), dim3(threads), 0, magma_queue_get_hip_stream(queue), nelem, Q, dqweight, dv); } diff --git a/backends/magma/magma_dgemm_nontensor.c b/backends/magma/magma_dgemm_nontensor.c index 1bdda0274e..f4de746ded 100644 --- a/backends/magma/magma_dgemm_nontensor.c +++ b/backends/magma/magma_dgemm_nontensor.c @@ -8,113 +8,79 @@ #include "ceed-magma.h" #ifdef CEED_MAGMA_USE_HIP -#define devblasDgemmStridedBatched hipblasDgemmStridedBatched +#define devblasDgemmStridedBatched hipblasDgemmStridedBatched #define magma_queue_get_devblas_handle magma_queue_get_hipblas_handle -#define devblas_trans_const hipblas_trans_const +#define devblas_trans_const hipblas_trans_const #else -#define devblasDgemmStridedBatched cublasDgemmStridedBatched +#define devblasDgemmStridedBatched cublasDgemmStridedBatched #define magma_queue_get_devblas_handle magma_queue_get_cublas_handle -#define devblas_trans_const cublas_trans_const +#define devblas_trans_const cublas_trans_const #endif -int -magma_dgemm_nontensor( - magma_trans_t transA, magma_trans_t transB, - magma_int_t m, magma_int_t n, magma_int_t k, - double alpha, const double *dA, magma_int_t ldda, - const double *dB, magma_int_t lddb, - double beta, double *dC, magma_int_t lddc, - magma_queue_t queue ) { +int magma_dgemm_nontensor(magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, double alpha, const double *dA, + magma_int_t ldda, const double *dB, magma_int_t lddb, double beta, double *dC, magma_int_t lddc, magma_queue_t queue) { magma_int_t nbatch, use_magmablas; magma_int_t arch = magma_getdevice_arch(); // check for specific transpositions (NN and TN only) bool NN = transA == MagmaNoTrans && transB == MagmaNoTrans; - bool TN = transA == MagmaTrans && transB == MagmaNoTrans; - if ( !(NN || TN) ) { + bool TN = transA == MagmaTrans && transB == MagmaNoTrans; + if (!(NN || TN)) { // default case -- no specific tuning - magma_dgemm( - transA, 
transB, m, n, k, - alpha, dA, ldda, - dB, lddb, - beta, dC, lddc, queue); + magma_dgemm(transA, transB, m, n, k, alpha, dA, ldda, dB, lddb, beta, dC, lddc, queue); return 0; } // get tuning decision - char trans = (transA == MagmaNoTrans) ? 'n' : 't'; - char precision = 'd'; - gemm_selector( arch, precision, trans, m, n, k, &nbatch, &use_magmablas ); + char trans = (transA == MagmaNoTrans) ? 'n' : 't'; + char precision = 'd'; + gemm_selector(arch, precision, trans, m, n, k, &nbatch, &use_magmablas); - #if 0 +#if 0 printf("%c %c -- (%3d, %3d, %3d) -- nbatch = %3d, use_magma = %d\n", trans, precision, m, n, k, nbatch, use_magmablas); - #endif +#endif // perform the dgemm operation - if ( nbatch == n) { + if (nbatch == n) { // no batching - if( use_magmablas ) { - magmablas_dgemm( - transA, transB, m, n, k, - alpha, dA, ldda, - dB, lddb, - beta, dC, lddc, queue); + if (use_magmablas) { + magmablas_dgemm(transA, transB, m, n, k, alpha, dA, ldda, dB, lddb, beta, dC, lddc, queue); } else { - magma_dgemm( - transA, transB, m, n, k, - alpha, dA, ldda, - dB, lddb, - beta, dC, lddc, queue); + magma_dgemm(transA, transB, m, n, k, alpha, dA, ldda, dB, lddb, beta, dC, lddc, queue); } } else { // use batch kernels - magma_int_t batchCount = n/nbatch; - magma_int_t n2 = n - (batchCount * nbatch); - magma_int_t strideA = 0; - magma_int_t strideB = lddb*nbatch; - magma_int_t strideC = lddc*nbatch; + magma_int_t batchCount = n / nbatch; + magma_int_t n2 = n - (batchCount * nbatch); + magma_int_t strideA = 0; + magma_int_t strideB = lddb * nbatch; + magma_int_t strideC = lddc * nbatch; - if ( use_magmablas ) { - magmablas_dgemm_batched_strided( - transA, transB, m, nbatch, k, - alpha, dA, ldda, strideA, - dB, lddb, strideB, - beta, dC, lddc, strideC, - batchCount, queue); + if (use_magmablas) { + magmablas_dgemm_batched_strided(transA, transB, m, nbatch, k, alpha, dA, ldda, strideA, dB, lddb, strideB, beta, dC, lddc, strideC, batchCount, + queue); // cleanup if (n2 > 0) { - 
magma_dgemm( - transA, transB, m, n2, k, - alpha, dA, ldda, - dB + batchCount * strideB, lddb, - beta, dC + batchCount * strideC, lddc, queue); + magma_dgemm(transA, transB, m, n2, k, alpha, dA, ldda, dB + batchCount * strideB, lddb, beta, dC + batchCount * strideC, lddc, queue); } } else { - devblasDgemmStridedBatched( - magma_queue_get_devblas_handle( queue ), - devblas_trans_const(transA), devblas_trans_const(transB), - (int)m, (int)nbatch, (int)k, - &alpha, (const double *) dA, (int)ldda, strideA, - (const double *) dB, (int)lddb, strideB, - &beta, dC, (int)lddc, strideC, (int)batchCount ); + devblasDgemmStridedBatched(magma_queue_get_devblas_handle(queue), devblas_trans_const(transA), devblas_trans_const(transB), (int)m, (int)nbatch, + (int)k, &alpha, (const double *)dA, (int)ldda, strideA, (const double *)dB, (int)lddb, strideB, &beta, dC, (int)lddc, + strideC, (int)batchCount); // cleanup if (n2 > 0) { - devblasDgemmStridedBatched( - magma_queue_get_devblas_handle( queue ), - devblas_trans_const(transA), devblas_trans_const(transB), - (int)m, (int)n2, (int)k, - &alpha, (const double *) dA, (int)ldda, strideA, - (const double *) dB + batchCount * strideB, (int)lddb, strideB, - &beta, dC + batchCount * strideC, (int)lddc, strideC, 1 ); + devblasDgemmStridedBatched(magma_queue_get_devblas_handle(queue), devblas_trans_const(transA), devblas_trans_const(transB), (int)m, (int)n2, + (int)k, &alpha, (const double *)dA, (int)ldda, strideA, (const double *)dB + batchCount * strideB, (int)lddb, + strideB, &beta, dC + batchCount * strideC, (int)lddc, strideC, 1); } } } // wait for the operation to complete - ceed_magma_queue_sync( queue ); + ceed_magma_queue_sync(queue); return 0; } diff --git a/backends/magma/magma_sgemm_nontensor.c b/backends/magma/magma_sgemm_nontensor.c index b136d132f6..560625b291 100644 --- a/backends/magma/magma_sgemm_nontensor.c +++ b/backends/magma/magma_sgemm_nontensor.c @@ -8,113 +8,79 @@ #include "ceed-magma.h" #ifdef CEED_MAGMA_USE_HIP 
-#define devblasSgemmStridedBatched hipblasSgemmStridedBatched +#define devblasSgemmStridedBatched hipblasSgemmStridedBatched #define magma_queue_get_devblas_handle magma_queue_get_hipblas_handle -#define devblas_trans_const hipblas_trans_const +#define devblas_trans_const hipblas_trans_const #else -#define devblasSgemmStridedBatched cublasSgemmStridedBatched +#define devblasSgemmStridedBatched cublasSgemmStridedBatched #define magma_queue_get_devblas_handle magma_queue_get_cublas_handle -#define devblas_trans_const cublas_trans_const +#define devblas_trans_const cublas_trans_const #endif -int -magma_sgemm_nontensor( - magma_trans_t transA, magma_trans_t transB, - magma_int_t m, magma_int_t n, magma_int_t k, - float alpha, const float *dA, magma_int_t ldda, - const float *dB, magma_int_t lddb, - float beta, float *dC, magma_int_t lddc, - magma_queue_t queue ) { +int magma_sgemm_nontensor(magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, float alpha, const float *dA, + magma_int_t ldda, const float *dB, magma_int_t lddb, float beta, float *dC, magma_int_t lddc, magma_queue_t queue) { magma_int_t nbatch, use_magmablas; magma_int_t arch = magma_getdevice_arch(); // check for specific transpositions (NN and TN only) bool NN = transA == MagmaNoTrans && transB == MagmaNoTrans; - bool TN = transA == MagmaTrans && transB == MagmaNoTrans; - if ( !(NN || TN) ) { + bool TN = transA == MagmaTrans && transB == MagmaNoTrans; + if (!(NN || TN)) { // default case -- no specific tuning - magma_sgemm( - transA, transB, m, n, k, - alpha, dA, ldda, - dB, lddb, - beta, dC, lddc, queue); + magma_sgemm(transA, transB, m, n, k, alpha, dA, ldda, dB, lddb, beta, dC, lddc, queue); return 0; } // get tuning decision - char trans = (transA == MagmaNoTrans) ? 'n' : 't'; - char precision = 'd'; - gemm_selector( arch, precision, trans, m, n, k, &nbatch, &use_magmablas ); + char trans = (transA == MagmaNoTrans) ? 
'n' : 't'; + char precision = 'd'; + gemm_selector(arch, precision, trans, m, n, k, &nbatch, &use_magmablas); - #if 0 +#if 0 printf("%c %c -- (%3d, %3d, %3d) -- nbatch = %3d, use_magma = %d\n", trans, precision, m, n, k, nbatch, use_magmablas); - #endif +#endif // perform the sgemm operation - if ( nbatch == n) { + if (nbatch == n) { // no batching - if( use_magmablas ) { - magmablas_sgemm( - transA, transB, m, n, k, - alpha, dA, ldda, - dB, lddb, - beta, dC, lddc, queue); + if (use_magmablas) { + magmablas_sgemm(transA, transB, m, n, k, alpha, dA, ldda, dB, lddb, beta, dC, lddc, queue); } else { - magma_sgemm( - transA, transB, m, n, k, - alpha, dA, ldda, - dB, lddb, - beta, dC, lddc, queue); + magma_sgemm(transA, transB, m, n, k, alpha, dA, ldda, dB, lddb, beta, dC, lddc, queue); } } else { // use batch kernels - magma_int_t batchCount = n/nbatch; - magma_int_t n2 = n - (batchCount * nbatch); - magma_int_t strideA = 0; - magma_int_t strideB = lddb*nbatch; - magma_int_t strideC = lddc*nbatch; + magma_int_t batchCount = n / nbatch; + magma_int_t n2 = n - (batchCount * nbatch); + magma_int_t strideA = 0; + magma_int_t strideB = lddb * nbatch; + magma_int_t strideC = lddc * nbatch; - if ( use_magmablas ) { - magmablas_sgemm_batched_strided( - transA, transB, m, nbatch, k, - alpha, dA, ldda, strideA, - dB, lddb, strideB, - beta, dC, lddc, strideC, - batchCount, queue); + if (use_magmablas) { + magmablas_sgemm_batched_strided(transA, transB, m, nbatch, k, alpha, dA, ldda, strideA, dB, lddb, strideB, beta, dC, lddc, strideC, batchCount, + queue); // cleanup if (n2 > 0) { - magma_sgemm( - transA, transB, m, n2, k, - alpha, dA, ldda, - dB + batchCount * strideB, lddb, - beta, dC + batchCount * strideC, lddc, queue); + magma_sgemm(transA, transB, m, n2, k, alpha, dA, ldda, dB + batchCount * strideB, lddb, beta, dC + batchCount * strideC, lddc, queue); } } else { - devblasSgemmStridedBatched( - magma_queue_get_devblas_handle( queue ), - devblas_trans_const(transA), 
devblas_trans_const(transB), - (int)m, (int)nbatch, (int)k, - &alpha, (const float *) dA, (int)ldda, strideA, - (const float *) dB, (int)lddb, strideB, - &beta, dC, (int)lddc, strideC, (int)batchCount ); + devblasSgemmStridedBatched(magma_queue_get_devblas_handle(queue), devblas_trans_const(transA), devblas_trans_const(transB), (int)m, (int)nbatch, + (int)k, &alpha, (const float *)dA, (int)ldda, strideA, (const float *)dB, (int)lddb, strideB, &beta, dC, (int)lddc, + strideC, (int)batchCount); // cleanup if (n2 > 0) { - devblasSgemmStridedBatched( - magma_queue_get_devblas_handle( queue ), - devblas_trans_const(transA), devblas_trans_const(transB), - (int)m, (int)n2, (int)k, - &alpha, (const float *) dA, (int)ldda, strideA, - (const float *) dB + batchCount * strideB, (int)lddb, strideB, - &beta, dC + batchCount * strideC, (int)lddc, strideC, 1 ); + devblasSgemmStridedBatched(magma_queue_get_devblas_handle(queue), devblas_trans_const(transA), devblas_trans_const(transB), (int)m, (int)n2, + (int)k, &alpha, (const float *)dA, (int)ldda, strideA, (const float *)dB + batchCount * strideB, (int)lddb, + strideB, &beta, dC + batchCount * strideC, (int)lddc, strideC, 1); } } } // wait for the operation to complete - ceed_magma_queue_sync( queue ); + ceed_magma_queue_sync(queue); return 0; } diff --git a/backends/memcheck/ceed-memcheck-blocked.c b/backends/memcheck/ceed-memcheck-blocked.c index f1b3efef80..653a769c9b 100644 --- a/backends/memcheck/ceed-memcheck-blocked.c +++ b/backends/memcheck/ceed-memcheck-blocked.c @@ -5,33 +5,30 @@ // // This file is part of CEED: http://github.com/ceed -#include #include +#include #include + #include "ceed-memcheck.h" //------------------------------------------------------------------------------ // Backend Init //------------------------------------------------------------------------------ static int CeedInit_Memcheck(const char *resource, Ceed ceed) { - int ierr; - if (strcmp(resource, "/cpu/self/memcheck/blocked")) + if 
(strcmp(resource, "/cpu/self/memcheck/blocked")) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Valgrind Memcheck backend cannot use resource: %s", - resource); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_BACKEND, "Valgrind Memcheck backend cannot use resource: %s", resource); + // LCOV_EXCL_STOP + } // Create reference CEED that implementation will be dispatched // through unless overridden Ceed ceed_ref; - CeedInit("/cpu/self/ref/blocked", &ceed_ref); - ierr = CeedSetDelegate(ceed, ceed_ref); CeedChkBackend(ierr); + CeedCallBackend(CeedInit("/cpu/self/ref/blocked", &ceed_ref)); + CeedCallBackend(CeedSetDelegate(ceed, ceed_ref)); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", - CeedQFunctionCreate_Memcheck); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionContextCreate", - CeedQFunctionContextCreate_Memcheck); CeedChkBackend(ierr); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Memcheck)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionContextCreate", CeedQFunctionContextCreate_Memcheck)); return CEED_ERROR_SUCCESS; } @@ -39,7 +36,5 @@ static int CeedInit_Memcheck(const char *resource, Ceed ceed) { //------------------------------------------------------------------------------ // Backend Register //------------------------------------------------------------------------------ -CEED_INTERN int CeedRegister_Memcheck_Blocked(void) { - return CeedRegister("/cpu/self/memcheck/blocked", CeedInit_Memcheck, 110); -} +CEED_INTERN int CeedRegister_Memcheck_Blocked(void) { return CeedRegister("/cpu/self/memcheck/blocked", CeedInit_Memcheck, 110); } //------------------------------------------------------------------------------ diff --git a/backends/memcheck/ceed-memcheck-qfunction.c b/backends/memcheck/ceed-memcheck-qfunction.c index 94330dcea1..485a8f4861 100644 --- 
a/backends/memcheck/ceed-memcheck-qfunction.c +++ b/backends/memcheck/ceed-memcheck-qfunction.c @@ -5,60 +5,56 @@ // // This file is part of CEED: http://github.com/ceed -#include #include +#include #include #include + #include "ceed-memcheck.h" //------------------------------------------------------------------------------ // QFunction Apply //------------------------------------------------------------------------------ -static int CeedQFunctionApply_Memcheck(CeedQFunction qf, CeedInt Q, - CeedVector *U, CeedVector *V) { - int ierr; +static int CeedQFunctionApply_Memcheck(CeedQFunction qf, CeedInt Q, CeedVector *U, CeedVector *V) { CeedQFunction_Memcheck *impl; - ierr = CeedQFunctionGetData(qf, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetData(qf, &impl)); void *ctx_data = NULL; - ierr = CeedQFunctionGetContextData(qf, CEED_MEM_HOST, &ctx_data); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetContextData(qf, CEED_MEM_HOST, &ctx_data)); CeedQFunctionUser f = NULL; - ierr = CeedQFunctionGetUserFunction(qf, &f); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetUserFunction(qf, &f)); CeedInt num_in, num_out; - ierr = CeedQFunctionGetNumArgs(qf, &num_in, &num_out); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetNumArgs(qf, &num_in, &num_out)); - for (CeedInt i = 0; iinputs[i]); - CeedChkBackend(ierr); + for (CeedInt i = 0; i < num_in; i++) { + CeedCallBackend(CeedVectorGetArrayRead(U[i], CEED_MEM_HOST, &impl->inputs[i])); } int mem_block_ids[num_out]; - for (CeedInt i = 0; ioutputs[i]); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayWrite(V[i], CEED_MEM_HOST, &impl->outputs[i])); - ierr = CeedVectorGetLength(V[i], &len); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetLength(V[i], &len)); VALGRIND_MAKE_MEM_UNDEFINED(impl->outputs[i], len); snprintf(name, 30, "'QFunction output %" CeedInt_FMT "'", i); mem_block_ids[i] = VALGRIND_CREATE_BLOCK(impl->outputs[i], len, name); } - ierr = f(ctx_data, Q, 
impl->inputs, impl->outputs); CeedChkBackend(ierr); + CeedCallBackend(f(ctx_data, Q, impl->inputs, impl->outputs)); - for (CeedInt i = 0; iinputs[i]); CeedChkBackend(ierr); + for (CeedInt i = 0; i < num_in; i++) { + CeedCallBackend(CeedVectorRestoreArrayRead(U[i], &impl->inputs[i])); } - for (CeedInt i = 0; ioutputs[i]); CeedChkBackend(ierr); + for (CeedInt i = 0; i < num_out; i++) { + CeedCallBackend(CeedVectorRestoreArray(V[i], &impl->outputs[i])); VALGRIND_DISCARD(mem_block_ids[i]); } - ierr = CeedQFunctionRestoreContextData(qf, &ctx_data); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionRestoreContextData(qf, &ctx_data)); return CEED_ERROR_SUCCESS; } @@ -67,13 +63,12 @@ static int CeedQFunctionApply_Memcheck(CeedQFunction qf, CeedInt Q, // QFunction Destroy //------------------------------------------------------------------------------ static int CeedQFunctionDestroy_Memcheck(CeedQFunction qf) { - int ierr; CeedQFunction_Memcheck *impl; - ierr = CeedQFunctionGetData(qf, (void *)&impl); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetData(qf, (void *)&impl)); - ierr = CeedFree(&impl->inputs); CeedChkBackend(ierr); - ierr = CeedFree(&impl->outputs); CeedChkBackend(ierr); - ierr = CeedFree(&impl); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl->inputs)); + CeedCallBackend(CeedFree(&impl->outputs)); + CeedCallBackend(CeedFree(&impl)); return CEED_ERROR_SUCCESS; } @@ -82,20 +77,17 @@ static int CeedQFunctionDestroy_Memcheck(CeedQFunction qf) { // QFunction Create //------------------------------------------------------------------------------ int CeedQFunctionCreate_Memcheck(CeedQFunction qf) { - int ierr; Ceed ceed; - ierr = CeedQFunctionGetCeed(qf, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); CeedQFunction_Memcheck *impl; - ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr); - ierr = CeedCalloc(CEED_FIELD_MAX, &impl->inputs); CeedChkBackend(ierr); - ierr = CeedCalloc(CEED_FIELD_MAX, &impl->outputs); 
CeedChkBackend(ierr); - ierr = CeedQFunctionSetData(qf, impl); CeedChkBackend(ierr); - - ierr = CeedSetBackendFunction(ceed, "QFunction", qf, "Apply", - CeedQFunctionApply_Memcheck); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "QFunction", qf, "Destroy", - CeedQFunctionDestroy_Memcheck); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(1, &impl)); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->inputs)); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->outputs)); + CeedCallBackend(CeedQFunctionSetData(qf, impl)); + + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Memcheck)); + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Memcheck)); return CEED_ERROR_SUCCESS; } diff --git a/backends/memcheck/ceed-memcheck-qfunctioncontext.c b/backends/memcheck/ceed-memcheck-qfunctioncontext.c index 54ac17cb4e..1be64e3c5b 100644 --- a/backends/memcheck/ceed-memcheck-qfunctioncontext.c +++ b/backends/memcheck/ceed-memcheck-qfunctioncontext.c @@ -5,21 +5,19 @@ // // This file is part of CEED: http://github.com/ceed -#include #include +#include #include #include + #include "ceed-memcheck.h" //------------------------------------------------------------------------------ // QFunctionContext has valid data //------------------------------------------------------------------------------ -static int CeedQFunctionContextHasValidData_Memcheck(CeedQFunctionContext ctx, - bool *has_valid_data) { - int ierr; +static int CeedQFunctionContextHasValidData_Memcheck(CeedQFunctionContext ctx, bool *has_valid_data) { CeedQFunctionContext_Memcheck *impl; - ierr = CeedQFunctionContextGetBackendData(ctx, (void *)&impl); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, (void *)&impl)); *has_valid_data = !!impl->data; @@ -29,26 +27,21 @@ static int CeedQFunctionContextHasValidData_Memcheck(CeedQFunctionContext ctx, 
//------------------------------------------------------------------------------ // QFunctionContext has borrowed data //------------------------------------------------------------------------------ -static int CeedQFunctionContextHasBorrowedDataOfType_Memcheck( - CeedQFunctionContext ctx, CeedMemType mem_type, - bool *has_borrowed_data_of_type) { - int ierr; +static int CeedQFunctionContextHasBorrowedDataOfType_Memcheck(CeedQFunctionContext ctx, CeedMemType mem_type, bool *has_borrowed_data_of_type) { CeedQFunctionContext_Memcheck *impl; - ierr = CeedQFunctionContextGetBackendData(ctx, (void *)&impl); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, (void *)&impl)); Ceed ceed; - ierr = CeedQFunctionContextGetCeed(ctx, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); switch (mem_type) { - case CEED_MEM_HOST: - *has_borrowed_data_of_type = !!impl->data_borrowed; - break; - default: - // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Can only set HOST memory for this backend"); - // LCOV_EXCL_STOP - break; + case CEED_MEM_HOST: + *has_borrowed_data_of_type = !!impl->data_borrowed; + break; + default: + // LCOV_EXCL_START + return CeedError(ceed, CEED_ERROR_BACKEND, "Can only set HOST memory for this backend"); + // LCOV_EXCL_STOP + break; } return CEED_ERROR_SUCCESS; @@ -57,49 +50,44 @@ static int CeedQFunctionContextHasBorrowedDataOfType_Memcheck( //------------------------------------------------------------------------------ // QFunctionContext Set Data //------------------------------------------------------------------------------ -static int CeedQFunctionContextSetData_Memcheck(CeedQFunctionContext ctx, - CeedMemType mem_type, CeedCopyMode copy_mode, void *data) { - int ierr; +static int CeedQFunctionContextSetData_Memcheck(CeedQFunctionContext ctx, CeedMemType mem_type, CeedCopyMode copy_mode, void *data) { CeedQFunctionContext_Memcheck *impl; - ierr = 
CeedQFunctionContextGetBackendData(ctx, (void *)&impl); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, (void *)&impl)); size_t ctx_size; - ierr = CeedQFunctionContextGetContextSize(ctx, &ctx_size); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size)); Ceed ceed; - ierr = CeedQFunctionContextGetCeed(ctx, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); - if (mem_type != CEED_MEM_HOST) + if (mem_type != CEED_MEM_HOST) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Can only set HOST memory for this backend"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_BACKEND, "Can only set HOST memory for this backend"); + // LCOV_EXCL_STOP + } - ierr = CeedFree(&impl->data_allocated); CeedChkBackend(ierr); - ierr = CeedFree(&impl->data_owned); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl->data_allocated)); + CeedCallBackend(CeedFree(&impl->data_owned)); switch (copy_mode) { - case CEED_COPY_VALUES: - ierr = CeedMallocArray(1, ctx_size, &impl->data_owned); CeedChkBackend(ierr); - impl->data_borrowed = NULL; - impl->data = impl->data_owned; - memcpy(impl->data, data, ctx_size); - break; - case CEED_OWN_POINTER: - impl->data_owned = data; - impl->data_borrowed = NULL; - impl->data = data; - break; - case CEED_USE_POINTER: - impl->data_borrowed = data; - impl->data = data; + case CEED_COPY_VALUES: + CeedCallBackend(CeedMallocArray(1, ctx_size, &impl->data_owned)); + impl->data_borrowed = NULL; + impl->data = impl->data_owned; + memcpy(impl->data, data, ctx_size); + break; + case CEED_OWN_POINTER: + impl->data_owned = data; + impl->data_borrowed = NULL; + impl->data = data; + break; + case CEED_USE_POINTER: + impl->data_borrowed = data; + impl->data = data; } // Copy data to check ctx_size bounds - ierr = CeedMallocArray(1, ctx_size, &impl->data_allocated); - CeedChkBackend(ierr); + CeedCallBackend(CeedMallocArray(1, ctx_size, 
&impl->data_allocated)); memcpy(impl->data_allocated, impl->data, ctx_size); impl->data = impl->data_allocated; VALGRIND_DISCARD(impl->mem_block_id); - impl->mem_block_id = VALGRIND_CREATE_BLOCK(impl->data, ctx_size, - "'QFunction backend context data copy'"); + impl->mem_block_id = VALGRIND_CREATE_BLOCK(impl->data, ctx_size, "'QFunction backend context data copy'"); return CEED_ERROR_SUCCESS; } @@ -107,26 +95,23 @@ static int CeedQFunctionContextSetData_Memcheck(CeedQFunctionContext ctx, //------------------------------------------------------------------------------ // QFunctionContext Take Data //------------------------------------------------------------------------------ -static int CeedQFunctionContextTakeData_Memcheck(CeedQFunctionContext ctx, - CeedMemType mem_type, void *data) { - int ierr; +static int CeedQFunctionContextTakeData_Memcheck(CeedQFunctionContext ctx, CeedMemType mem_type, void *data) { CeedQFunctionContext_Memcheck *impl; - ierr = CeedQFunctionContextGetBackendData(ctx, (void *)&impl); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, (void *)&impl)); Ceed ceed; - ierr = CeedQFunctionContextGetCeed(ctx, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); - if (mem_type != CEED_MEM_HOST) + if (mem_type != CEED_MEM_HOST) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Can only provide HOST memory for this backend"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend"); + // LCOV_EXCL_STOP + } - *(void **)data = impl->data_borrowed; + *(void **)data = impl->data_borrowed; impl->data_borrowed = NULL; - impl->data = NULL; + impl->data = NULL; VALGRIND_DISCARD(impl->mem_block_id); - ierr = CeedFree(&impl->data_allocated); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl->data_allocated)); return CEED_ERROR_SUCCESS; } @@ -134,20 +119,17 @@ static int 
CeedQFunctionContextTakeData_Memcheck(CeedQFunctionContext ctx, //------------------------------------------------------------------------------ // QFunctionContext Get Data //------------------------------------------------------------------------------ -static int CeedQFunctionContextGetData_Memcheck(CeedQFunctionContext ctx, - CeedMemType mem_type, void *data) { - int ierr; +static int CeedQFunctionContextGetData_Memcheck(CeedQFunctionContext ctx, CeedMemType mem_type, void *data) { CeedQFunctionContext_Memcheck *impl; - ierr = CeedQFunctionContextGetBackendData(ctx, (void *)&impl); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, (void *)&impl)); Ceed ceed; - ierr = CeedQFunctionContextGetCeed(ctx, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); - if (mem_type != CEED_MEM_HOST) + if (mem_type != CEED_MEM_HOST) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Can only provide HOST memory for this backend"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend"); + // LCOV_EXCL_STOP + } *(void **)data = impl->data; @@ -157,23 +139,18 @@ static int CeedQFunctionContextGetData_Memcheck(CeedQFunctionContext ctx, //------------------------------------------------------------------------------ // QFunctionContext Get Data Read-Only //------------------------------------------------------------------------------ -static int CeedQFunctionContextGetDataRead_Memcheck(CeedQFunctionContext ctx, - CeedMemType mem_type, void *data) { - int ierr; +static int CeedQFunctionContextGetDataRead_Memcheck(CeedQFunctionContext ctx, CeedMemType mem_type, void *data) { CeedQFunctionContext_Memcheck *impl; - ierr = CeedQFunctionContextGetBackendData(ctx, (void *)&impl); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, (void *)&impl)); size_t ctx_size; - ierr = CeedQFunctionContextGetContextSize(ctx, 
&ctx_size); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size)); Ceed ceed; - ierr = CeedQFunctionContextGetCeed(ctx, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); - ierr = CeedQFunctionContextGetData_Memcheck(ctx, mem_type, data); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetData_Memcheck(ctx, mem_type, data)); // Make copy to verify no write occured - ierr = CeedMallocArray(1, ctx_size, &impl->data_read_only_copy); - CeedChkBackend(ierr); + CeedCallBackend(CeedMallocArray(1, ctx_size, &impl->data_read_only_copy)); memcpy(impl->data_read_only_copy, *(void **)data, ctx_size); return CEED_ERROR_SUCCESS; @@ -183,12 +160,10 @@ static int CeedQFunctionContextGetDataRead_Memcheck(CeedQFunctionContext ctx, // QFunctionContext Restore Data //------------------------------------------------------------------------------ static int CeedQFunctionContextRestoreData_Memcheck(CeedQFunctionContext ctx) { - int ierr; size_t ctx_size; - ierr = CeedQFunctionContextGetContextSize(ctx, &ctx_size); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size)); CeedQFunctionContext_Memcheck *impl; - ierr = CeedQFunctionContextGetBackendData(ctx, (void *)&impl); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, (void *)&impl)); if (impl->data_borrowed) { memcpy(impl->data_borrowed, impl->data, ctx_size); @@ -203,24 +178,21 @@ static int CeedQFunctionContextRestoreData_Memcheck(CeedQFunctionContext ctx) { //------------------------------------------------------------------------------ // QFunctionContext Restore Data Read-Only //------------------------------------------------------------------------------ -static int CeedQFunctionContextRestoreDataRead_Memcheck( - CeedQFunctionContext ctx) { - int ierr; +static int CeedQFunctionContextRestoreDataRead_Memcheck(CeedQFunctionContext ctx) { size_t ctx_size; - ierr = 
CeedQFunctionContextGetContextSize(ctx, &ctx_size); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size)); CeedQFunctionContext_Memcheck *impl; - ierr = CeedQFunctionContextGetBackendData(ctx, (void *)&impl); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, (void *)&impl)); Ceed ceed; - ierr = CeedQFunctionContextGetCeed(ctx, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); - if (memcmp(impl->data, impl->data_read_only_copy, ctx_size)) + if (memcmp(impl->data, impl->data_read_only_copy, ctx_size)) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Context data changed while accessed in read-only mode"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_BACKEND, "Context data changed while accessed in read-only mode"); + // LCOV_EXCL_STOP + } - ierr = CeedFree(&impl->data_read_only_copy); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl->data_read_only_copy)); return CEED_ERROR_SUCCESS; } @@ -229,27 +201,24 @@ static int CeedQFunctionContextRestoreDataRead_Memcheck( // QFunctionContext destroy user data //------------------------------------------------------------------------------ static int CeedQFunctionContextDataDestroy_Memcheck(CeedQFunctionContext ctx) { - int ierr; CeedQFunctionContext_Memcheck *impl; - ierr = CeedQFunctionContextGetBackendData(ctx, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); CeedQFunctionContextDataDestroyUser data_destroy_function; - CeedMemType data_destroy_mem_type; - ierr = CeedQFunctionContextGetDataDestroy(ctx, &data_destroy_mem_type, - &data_destroy_function); CeedChk(ierr); + CeedMemType data_destroy_mem_type; + CeedCallBackend(CeedQFunctionContextGetDataDestroy(ctx, &data_destroy_mem_type, &data_destroy_function)); Ceed ceed; - ierr = CeedQFunctionContextGetCeed(ctx, &ceed); CeedChkBackend(ierr); + 
CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); - if (data_destroy_mem_type != CEED_MEM_HOST) + if (data_destroy_mem_type != CEED_MEM_HOST) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Can only destroy HOST memory for this backend"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_BACKEND, "Can only destroy HOST memory for this backend"); + // LCOV_EXCL_STOP + } if (data_destroy_function) { - ierr = data_destroy_function(impl->data_borrowed ? impl->data_borrowed : - impl->data_owned); CeedChk(ierr); + CeedCallBackend(data_destroy_function(impl->data_borrowed ? impl->data_borrowed : impl->data_owned)); } - ierr = CeedFree(&impl->data_allocated); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl->data_allocated)); return CEED_ERROR_SUCCESS; } @@ -258,13 +227,12 @@ static int CeedQFunctionContextDataDestroy_Memcheck(CeedQFunctionContext ctx) { // QFunctionContext Destroy //------------------------------------------------------------------------------ static int CeedQFunctionContextDestroy_Memcheck(CeedQFunctionContext ctx) { - int ierr; CeedQFunctionContext_Memcheck *impl; - ierr = CeedQFunctionContextGetBackendData(ctx, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); - ierr = CeedFree(&impl->data_allocated); CeedChkBackend(ierr); - ierr = CeedFree(&impl->data_owned); CeedChkBackend(ierr); - ierr = CeedFree(&impl); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl->data_allocated)); + CeedCallBackend(CeedFree(&impl->data_owned)); + CeedCallBackend(CeedFree(&impl)); return CEED_ERROR_SUCCESS; } @@ -272,37 +240,23 @@ static int CeedQFunctionContextDestroy_Memcheck(CeedQFunctionContext ctx) { // QFunctionContext Create //------------------------------------------------------------------------------ int CeedQFunctionContextCreate_Memcheck(CeedQFunctionContext ctx) { - int ierr; CeedQFunctionContext_Memcheck *impl; - Ceed ceed; - ierr = CeedQFunctionContextGetCeed(ctx, 
&ceed); CeedChkBackend(ierr); - - ierr = CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "HasValidData", - CeedQFunctionContextHasValidData_Memcheck); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "QFunctionContext", ctx, - "HasBorrowedDataOfType", - CeedQFunctionContextHasBorrowedDataOfType_Memcheck); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "SetData", - CeedQFunctionContextSetData_Memcheck); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "TakeData", - CeedQFunctionContextTakeData_Memcheck); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "GetData", - CeedQFunctionContextGetData_Memcheck); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "GetDataRead", - CeedQFunctionContextGetDataRead_Memcheck); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "RestoreData", - CeedQFunctionContextRestoreData_Memcheck); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "RestoreDataRead", - CeedQFunctionContextRestoreDataRead_Memcheck); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "DataDestroy", - CeedQFunctionContextDataDestroy_Memcheck); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "Destroy", - CeedQFunctionContextDestroy_Memcheck); CeedChkBackend(ierr); - - ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr); - ierr = CeedQFunctionContextSetBackendData(ctx, impl); CeedChkBackend(ierr); + Ceed ceed; + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); + + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "HasValidData", CeedQFunctionContextHasValidData_Memcheck)); + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "HasBorrowedDataOfType", CeedQFunctionContextHasBorrowedDataOfType_Memcheck)); + 
CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "SetData", CeedQFunctionContextSetData_Memcheck)); + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "TakeData", CeedQFunctionContextTakeData_Memcheck)); + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "GetData", CeedQFunctionContextGetData_Memcheck)); + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "GetDataRead", CeedQFunctionContextGetDataRead_Memcheck)); + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "RestoreData", CeedQFunctionContextRestoreData_Memcheck)); + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "RestoreDataRead", CeedQFunctionContextRestoreDataRead_Memcheck)); + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "DataDestroy", CeedQFunctionContextDataDestroy_Memcheck)); + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "Destroy", CeedQFunctionContextDestroy_Memcheck)); + + CeedCallBackend(CeedCalloc(1, &impl)); + CeedCallBackend(CeedQFunctionContextSetBackendData(ctx, impl)); return CEED_ERROR_SUCCESS; } diff --git a/backends/memcheck/ceed-memcheck-serial.c b/backends/memcheck/ceed-memcheck-serial.c index 07c9a464ef..6a3c4a5d3f 100644 --- a/backends/memcheck/ceed-memcheck-serial.c +++ b/backends/memcheck/ceed-memcheck-serial.c @@ -5,34 +5,30 @@ // // This file is part of CEED: http://github.com/ceed -#include #include +#include #include + #include "ceed-memcheck.h" //------------------------------------------------------------------------------ // Backend Init //------------------------------------------------------------------------------ static int CeedInit_Memcheck(const char *resource, Ceed ceed) { - int ierr; - if (strcmp(resource, "/cpu/self/memcheck") - && strcmp(resource, "/cpu/self/memcheck/serial")) + if (strcmp(resource, "/cpu/self/memcheck") && strcmp(resource, "/cpu/self/memcheck/serial")) { // LCOV_EXCL_START - 
return CeedError(ceed, CEED_ERROR_BACKEND, - "Valgrind Memcheck backend cannot use resource: %s", - resource); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_BACKEND, "Valgrind Memcheck backend cannot use resource: %s", resource); + // LCOV_EXCL_STOP + } // Create reference CEED that implementation will be dispatched // through unless overridden Ceed ceed_ref; - CeedInit("/cpu/self/ref/serial", &ceed_ref); - ierr = CeedSetDelegate(ceed, ceed_ref); CeedChkBackend(ierr); + CeedCallBackend(CeedInit("/cpu/self/ref/serial", &ceed_ref)); + CeedCallBackend(CeedSetDelegate(ceed, ceed_ref)); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", - CeedQFunctionCreate_Memcheck); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionContextCreate", - CeedQFunctionContextCreate_Memcheck); CeedChkBackend(ierr); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Memcheck)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionContextCreate", CeedQFunctionContextCreate_Memcheck)); return CEED_ERROR_SUCCESS; } @@ -40,7 +36,5 @@ static int CeedInit_Memcheck(const char *resource, Ceed ceed) { //------------------------------------------------------------------------------ // Backend Register //------------------------------------------------------------------------------ -CEED_INTERN int CeedRegister_Memcheck_Serial(void) { - return CeedRegister("/cpu/self/memcheck/serial", CeedInit_Memcheck, 100); -} +CEED_INTERN int CeedRegister_Memcheck_Serial(void) { return CeedRegister("/cpu/self/memcheck/serial", CeedInit_Memcheck, 100); } //------------------------------------------------------------------------------ diff --git a/backends/memcheck/ceed-memcheck.h b/backends/memcheck/ceed-memcheck.h index 5dc710ca66..03ec36dfd6 100644 --- a/backends/memcheck/ceed-memcheck.h +++ b/backends/memcheck/ceed-memcheck.h @@ -8,17 +8,17 @@ #ifndef _ceed_memcheck_h #define 
_ceed_memcheck_h -#include #include +#include typedef struct { const CeedScalar **inputs; - CeedScalar **outputs; - bool setup_done; + CeedScalar **outputs; + bool setup_done; } CeedQFunction_Memcheck; typedef struct { - int mem_block_id; + int mem_block_id; void *data; void *data_allocated; void *data_owned; @@ -30,4 +30,4 @@ CEED_INTERN int CeedQFunctionCreate_Memcheck(CeedQFunction qf); CEED_INTERN int CeedQFunctionContextCreate_Memcheck(CeedQFunctionContext ctx); -#endif // _ceed_memcheck_h +#endif // _ceed_memcheck_h diff --git a/backends/occa/ceed-occa-basis.cpp b/backends/occa/ceed-occa-basis.cpp index 5590298cf1..64a6a25b6b 100644 --- a/backends/occa/ceed-occa-basis.cpp +++ b/backends/occa/ceed-occa-basis.cpp @@ -6,90 +6,75 @@ // This file is part of CEED: http://github.com/ceed #include "ceed-occa-basis.hpp" + #include "ceed-occa-tensor-basis.hpp" namespace ceed { - namespace occa { - Basis::Basis() : - ceedComponentCount(0), - dim(0), - P(0), - Q(0) {} - - Basis::~Basis() {} - - Basis* Basis::getBasis(CeedBasis basis, - const bool assertValid) { - if (!basis) { - return NULL; - } - - int ierr; - Basis *basis_ = NULL; - - ierr = CeedBasisGetData(basis, &basis_); - if (assertValid) { - CeedOccaFromChk(ierr); - } - - return basis_; - } - - Basis* Basis::from(CeedBasis basis) { - Basis *basis_ = getBasis(basis); - if (!basis_) { - return NULL; - } - - int ierr; - ierr = basis_->setCeedFields(basis); CeedOccaFromChk(ierr); - - return basis_; - } - - Basis* Basis::from(CeedOperatorField operatorField) { - int ierr; - CeedBasis basis; - ierr = CeedOperatorFieldGetBasis(operatorField, &basis); CeedOccaFromChk(ierr); - return from(basis); - } - - int Basis::setCeedFields(CeedBasis basis) { - int ierr; - - ierr = CeedBasisGetCeed(basis, &ceed); CeedChk(ierr); - ierr = CeedBasisGetNumComponents(basis, &ceedComponentCount); CeedChk(ierr); - - return CEED_ERROR_SUCCESS; - } - - //---[ Ceed Callbacks ]----------- - int Basis::registerCeedFunction(Ceed ceed, CeedBasis 
basis, - const char *fname, ceed::occa::ceedFunction f) { - return CeedSetBackendFunction(ceed, "Basis", basis, fname, f); - } - - int Basis::ceedApply(CeedBasis basis, const CeedInt nelem, - CeedTransposeMode tmode, - CeedEvalMode emode, CeedVector u, CeedVector v) { - Basis *basis_ = Basis::from(basis); - Vector *U = Vector::from(u); - Vector *V = Vector::from(v); - - if (!basis_) { - return staticCeedError("Incorrect CeedBasis argument: op"); - } - - return basis_->apply( - nelem, - tmode, emode, - U, V - ); - } - - int Basis::ceedDestroy(CeedBasis basis) { - delete getBasis(basis, false); - return CEED_ERROR_SUCCESS; - } +namespace occa { +Basis::Basis() : ceedComponentCount(0), dim(0), P(0), Q(0) {} + +Basis::~Basis() {} + +Basis *Basis::getBasis(CeedBasis basis, const bool assertValid) { + if (!basis) { + return NULL; + } + + int ierr; + Basis *basis_ = NULL; + + ierr = CeedBasisGetData(basis, &basis_); + if (assertValid) { + CeedOccaFromChk(ierr); + } + + return basis_; +} + +Basis *Basis::from(CeedBasis basis) { + Basis *basis_ = getBasis(basis); + if (!basis_) { + return NULL; + } + + CeedCallOcca(basis_->setCeedFields(basis)); + + return basis_; +} + +Basis *Basis::from(CeedOperatorField operatorField) { + CeedBasis basis; + CeedCallOcca(CeedOperatorFieldGetBasis(operatorField, &basis)); + return from(basis); +} + +int Basis::setCeedFields(CeedBasis basis) { + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); + CeedCallBackend(CeedBasisGetNumComponents(basis, &ceedComponentCount)); + + return CEED_ERROR_SUCCESS; +} + +//---[ Ceed Callbacks ]----------- +int Basis::registerCeedFunction(Ceed ceed, CeedBasis basis, const char *fname, ceed::occa::ceedFunction f) { + return CeedSetBackendFunction(ceed, "Basis", basis, fname, f); +} + +int Basis::ceedApply(CeedBasis basis, const CeedInt nelem, CeedTransposeMode tmode, CeedEvalMode emode, CeedVector u, CeedVector v) { + Basis *basis_ = Basis::from(basis); + Vector *U = Vector::from(u); + Vector *V = 
Vector::from(v); + + if (!basis_) { + return staticCeedError("Incorrect CeedBasis argument: op"); } + + return basis_->apply(nelem, tmode, emode, U, V); +} + +int Basis::ceedDestroy(CeedBasis basis) { + delete getBasis(basis, false); + return CEED_ERROR_SUCCESS; } +} // namespace occa +} // namespace ceed diff --git a/backends/occa/ceed-occa-basis.hpp b/backends/occa/ceed-occa-basis.hpp index 53c8e78f6b..d820b38913 100644 --- a/backends/occa/ceed-occa-basis.hpp +++ b/backends/occa/ceed-occa-basis.hpp @@ -12,50 +12,42 @@ #include "ceed-occa-vector.hpp" namespace ceed { - namespace occa { - class Basis : public CeedObject { - public: - // Ceed object information - CeedInt ceedComponentCount; +namespace occa { +class Basis : public CeedObject { + public: + // Ceed object information + CeedInt ceedComponentCount; - // Owned information - CeedInt dim; - CeedInt P; - CeedInt Q; + // Owned information + CeedInt dim; + CeedInt P; + CeedInt Q; - Basis(); + Basis(); - virtual ~Basis(); + virtual ~Basis(); - static Basis* getBasis(CeedBasis basis, - const bool assertValid = true); + static Basis* getBasis(CeedBasis basis, const bool assertValid = true); - static Basis* from(CeedBasis basis); - static Basis* from(CeedOperatorField operatorField); + static Basis* from(CeedBasis basis); + static Basis* from(CeedOperatorField operatorField); - int setCeedFields(CeedBasis basis); + int setCeedFields(CeedBasis basis); - virtual bool isTensorBasis() const = 0; + virtual bool isTensorBasis() const = 0; - virtual const char* getFunctionSource() const = 0; + virtual const char* getFunctionSource() const = 0; - virtual int apply(const CeedInt elementCount, - CeedTransposeMode tmode, - CeedEvalMode emode, - Vector *u, - Vector *v) = 0; + virtual int apply(const CeedInt elementCount, CeedTransposeMode tmode, CeedEvalMode emode, Vector* u, Vector* v) = 0; - //---[ Ceed Callbacks ]----------- - static int registerCeedFunction(Ceed ceed, CeedBasis basis, - const char *fname, 
ceed::occa::ceedFunction f); + //---[ Ceed Callbacks ]----------- + static int registerCeedFunction(Ceed ceed, CeedBasis basis, const char* fname, ceed::occa::ceedFunction f); - static int ceedApply(CeedBasis basis, const CeedInt nelem, - CeedTransposeMode tmode, - CeedEvalMode emode, CeedVector u, CeedVector v); + static int ceedApply(CeedBasis basis, const CeedInt nelem, CeedTransposeMode tmode, CeedEvalMode emode, CeedVector u, CeedVector v); - static int ceedDestroy(CeedBasis basis); - }; - } -} + static int ceedDestroy(CeedBasis basis); +}; +} // namespace occa +} // namespace ceed #endif diff --git a/backends/occa/ceed-occa-ceed-object.cpp b/backends/occa/ceed-occa-ceed-object.cpp index 05ea6814e7..6d1a1507ab 100644 --- a/backends/occa/ceed-occa-ceed-object.cpp +++ b/backends/occa/ceed-occa-ceed-object.cpp @@ -6,34 +6,26 @@ // This file is part of CEED: http://github.com/ceed #include "ceed-occa-ceed-object.hpp" + #include "ceed-occa-context.hpp" namespace ceed { - namespace occa { - CeedObject::CeedObject(Ceed ceed_) : - ceed(ceed_) {} - - ::occa::device CeedObject::getDevice() { - if (!_device.isInitialized()) { - _device = Context::from(ceed)->device; - } - return _device; - } - - bool CeedObject::usingCpuDevice() const { - return Context::from(ceed)->usingCpuDevice(); - } - - bool CeedObject::usingGpuDevice() const { - return Context::from(ceed)->usingGpuDevice(); - } - - int CeedObject::ceedError(const std::string &message) const { - return CeedError(ceed, CEED_ERROR_BACKEND, message.c_str()); - } - - int CeedObject::staticCeedError(const std::string &message) { - return CeedError(NULL, CEED_ERROR_BACKEND, message.c_str()); - } +namespace occa { +CeedObject::CeedObject(Ceed ceed_) : ceed(ceed_) {} + +::occa::device CeedObject::getDevice() { + if (!_device.isInitialized()) { + _device = Context::from(ceed)->device; } + return _device; } + +bool CeedObject::usingCpuDevice() const { return Context::from(ceed)->usingCpuDevice(); } + +bool 
CeedObject::usingGpuDevice() const { return Context::from(ceed)->usingGpuDevice(); } + +int CeedObject::ceedError(const std::string &message) const { return CeedError(ceed, CEED_ERROR_BACKEND, message.c_str()); } + +int CeedObject::staticCeedError(const std::string &message) { return CeedError(NULL, CEED_ERROR_BACKEND, message.c_str()); } +} // namespace occa +} // namespace ceed diff --git a/backends/occa/ceed-occa-ceed-object.hpp b/backends/occa/ceed-occa-ceed-object.hpp index 184097d83f..0f7caa384f 100644 --- a/backends/occa/ceed-occa-ceed-object.hpp +++ b/backends/occa/ceed-occa-ceed-object.hpp @@ -11,32 +11,32 @@ #include "ceed-occa-context.hpp" namespace ceed { - namespace occa { - class CeedObject { - private: - ::occa::device _device; +namespace occa { +class CeedObject { + private: + ::occa::device _device; - public: - Ceed ceed; + public: + Ceed ceed; - CeedObject(Ceed ceed_ = NULL); + CeedObject(Ceed ceed_ = NULL); - ::occa::device getDevice(); + ::occa::device getDevice(); - bool usingCpuDevice() const; - bool usingGpuDevice() const; + bool usingCpuDevice() const; + bool usingGpuDevice() const; - int ceedError(const std::string &message) const; - static int staticCeedError(const std::string &message); - }; + int ceedError(const std::string &message) const; + static int staticCeedError(const std::string &message); +}; - namespace SyncState { - static const int none = 0; - static const int host = (1 << 0); - static const int device = (1 << 1); - static const int all = host | device; - } - } -} +namespace SyncState { +static const int none = 0; +static const int host = (1 << 0); +static const int device = (1 << 1); +static const int all = host | device; +} // namespace SyncState +} // namespace occa +} // namespace ceed #endif diff --git a/backends/occa/ceed-occa-context.cpp b/backends/occa/ceed-occa-context.cpp index c60919c888..f8533563cc 100644 --- a/backends/occa/ceed-occa-context.cpp +++ b/backends/occa/ceed-occa-context.cpp @@ -8,30 +8,25 @@ #include 
"ceed-occa-context.hpp" namespace ceed { - namespace occa { - Context::Context(::occa::device device_) : - device(device_) { - const std::string mode = device.mode(); - _usingCpuDevice = (mode == "Serial" || mode == "OpenMP"); - _usingGpuDevice = (mode == "CUDA" || mode == "HIP" || mode == "OpenCL"); - } +namespace occa { +Context::Context(::occa::device device_) : device(device_) { + const std::string mode = device.mode(); + _usingCpuDevice = (mode == "Serial" || mode == "OpenMP"); + _usingGpuDevice = (mode == "CUDA" || mode == "HIP" || mode == "OpenCL"); +} - Context* Context::from(Ceed ceed) { - if (!ceed) { - return NULL; - } +Context* Context::from(Ceed ceed) { + if (!ceed) { + return NULL; + } - Context *context; - CeedGetData(ceed, (void**) &context); - return context; - } + Context* context; + CeedGetData(ceed, (void**)&context); + return context; +} - bool Context::usingCpuDevice() const { - return _usingCpuDevice; - } +bool Context::usingCpuDevice() const { return _usingCpuDevice; } - bool Context::usingGpuDevice() const { - return _usingGpuDevice; - } - } -} +bool Context::usingGpuDevice() const { return _usingGpuDevice; } +} // namespace occa +} // namespace ceed diff --git a/backends/occa/ceed-occa-context.hpp b/backends/occa/ceed-occa-context.hpp index 501304bcaf..4e580cba80 100644 --- a/backends/occa/ceed-occa-context.hpp +++ b/backends/occa/ceed-occa-context.hpp @@ -11,23 +11,23 @@ #include "ceed-occa-types.hpp" namespace ceed { - namespace occa { - class Context { - private: - bool _usingCpuDevice; - bool _usingGpuDevice; +namespace occa { +class Context { + private: + bool _usingCpuDevice; + bool _usingGpuDevice; - public: - ::occa::device device; + public: + ::occa::device device; - Context(::occa::device device_); + Context(::occa::device device_); - static Context* from(Ceed ceed); + static Context* from(Ceed ceed); - bool usingCpuDevice() const; - bool usingGpuDevice() const; - }; - } -} + bool usingCpuDevice() const; + bool usingGpuDevice() 
const; +}; +} // namespace occa +} // namespace ceed #endif diff --git a/backends/occa/ceed-occa-cpu-operator.cpp b/backends/occa/ceed-occa-cpu-operator.cpp index 92ae0a6e1d..46c872381f 100644 --- a/backends/occa/ceed-occa-cpu-operator.cpp +++ b/backends/occa/ceed-occa-cpu-operator.cpp @@ -6,6 +6,7 @@ // This file is part of CEED: http://github.com/ceed #include "ceed-occa-cpu-operator.hpp" + #include "ceed-occa-elem-restriction.hpp" #include "ceed-occa-qfunction.hpp" #include "ceed-occa-qfunctioncontext.hpp" @@ -15,824 +16,736 @@ #define CEED_OCCA_PRINT_KERNEL_HASHES 0 namespace ceed { - namespace occa { - CpuOperator::CpuOperator() {} +namespace occa { +CpuOperator::CpuOperator() {} - CpuOperator::~CpuOperator() {} +CpuOperator::~CpuOperator() {} - void CpuOperator::setupVectors() { - setupVectors(args.inputCount(), args.opInputs, args.qfInputs, dofInputs); - setupVectors(args.outputCount(), args.opOutputs, args.qfOutputs, dofOutputs); - } +void CpuOperator::setupVectors() { + setupVectors(args.inputCount(), args.opInputs, args.qfInputs, dofInputs); + setupVectors(args.outputCount(), args.opOutputs, args.qfOutputs, dofOutputs); +} - void CpuOperator::setupVectors(const int fieldCount, - OperatorFieldVector &opFields, - QFunctionFieldVector &qfFields, - VectorVector &vectors) { - for (int i = 0; i < fieldCount; ++i) { - const QFunctionField &qfField = qfFields[i]; - const OperatorField &opField = opFields[i]; - - if (qfField.evalMode == CEED_EVAL_WEIGHT) { - // Weight kernel doesn't use the input - vectors.push_back(NULL); - continue; - } - - int entries; - if (qfField.evalMode == CEED_EVAL_NONE) { - // The output vector stores values at quadrature points - entries = ( - ceedElementCount - * ceedQ - * qfField.size - ); - } else { - // The output vector stores the element dof values - entries = ( - ceedElementCount - * opField.getElementSize() - * opField.getComponentCount() - ); - } - - Vector *dofVector = new Vector(); - dofVector->ceed = ceed; - 
dofVector->resize(entries); - - vectors.push_back(dofVector); - } +void CpuOperator::setupVectors(const int fieldCount, OperatorFieldVector &opFields, QFunctionFieldVector &qfFields, VectorVector &vectors) { + for (int i = 0; i < fieldCount; ++i) { + const QFunctionField &qfField = qfFields[i]; + const OperatorField &opField = opFields[i]; + + if (qfField.evalMode == CEED_EVAL_WEIGHT) { + // Weight kernel doesn't use the input + vectors.push_back(NULL); + continue; } - void CpuOperator::freeVectors() { - for (int i = 0; i < args.inputCount(); ++i) { - delete dofInputs[i]; - } - for (int i = 0; i < args.outputCount(); ++i) { - delete dofOutputs[i]; - } - dofInputs.clear(); - dofOutputs.clear(); + int entries; + if (qfField.evalMode == CEED_EVAL_NONE) { + // The output vector stores values at quadrature points + entries = (ceedElementCount * ceedQ * qfField.size); + } else { + // The output vector stores the element dof values + entries = (ceedElementCount * opField.getElementSize() * opField.getComponentCount()); } - void CpuOperator::setupInputs(Vector *in) { - for (int i = 0; i < args.inputCount(); ++i) { - // Weight kernel doesn't use the input vector - if (args.getInputEvalMode(i) == CEED_EVAL_WEIGHT) { - continue; - } + Vector *dofVector = new Vector(); + dofVector->ceed = ceed; + dofVector->resize(entries); - const OperatorField &opField = args.getOpInput(i); + vectors.push_back(dofVector); + } +} - Vector *input = opField.usesActiveVector() ? 
in : opField.vec; - Vector *output = dofInputs[i]; +void CpuOperator::freeVectors() { + for (int i = 0; i < args.inputCount(); ++i) { + delete dofInputs[i]; + } + for (int i = 0; i < args.outputCount(); ++i) { + delete dofOutputs[i]; + } + dofInputs.clear(); + dofOutputs.clear(); +} - opField.elemRestriction->apply(CEED_NOTRANSPOSE, *input, *output); - } +void CpuOperator::setupInputs(Vector *in) { + for (int i = 0; i < args.inputCount(); ++i) { + // Weight kernel doesn't use the input vector + if (args.getInputEvalMode(i) == CEED_EVAL_WEIGHT) { + continue; } - void CpuOperator::setupOutputs(Vector *out) { - for (int i = 0; i < args.outputCount(); ++i) { - // Weight is not supported for output vectors - if (args.getOutputEvalMode(i) == CEED_EVAL_WEIGHT) { - continue; - } + const OperatorField &opField = args.getOpInput(i); - const OperatorField &opField = args.getOpOutput(i); + Vector *input = opField.usesActiveVector() ? in : opField.vec; + Vector *output = dofInputs[i]; - Vector *input = dofOutputs[i]; - Vector *output = opField.usesActiveVector() ? 
out : opField.vec; + opField.elemRestriction->apply(CEED_NOTRANSPOSE, *input, *output); + } +} - opField.elemRestriction->apply(CEED_TRANSPOSE, *input, *output); - } +void CpuOperator::setupOutputs(Vector *out) { + for (int i = 0; i < args.outputCount(); ++i) { + // Weight is not supported for output vectors + if (args.getOutputEvalMode(i) == CEED_EVAL_WEIGHT) { + continue; } - void CpuOperator::applyQFunction() { - if (qfunction->qFunctionContext) { - QFunctionContext *ctx = QFunctionContext::from(qfunction->qFunctionContext); - applyAddKernel.pushArg(ctx->getKernelArg()); - } else { - applyAddKernel.pushArg(::occa::null); - } - applyAddKernel.pushArg(ceedElementCount); + const OperatorField &opField = args.getOpOutput(i); - for (int i = 0; i < args.inputCount(); ++i) { - const bool isInput = true; - pushKernelArgs(dofInputs[i], isInput, i); - } + Vector *input = dofOutputs[i]; + Vector *output = opField.usesActiveVector() ? out : opField.vec; - for (int i = 0; i < args.outputCount(); ++i) { - const bool isInput = false; - pushKernelArgs(dofOutputs[i], isInput, i); - } + opField.elemRestriction->apply(CEED_TRANSPOSE, *input, *output); + } +} - applyAddKernel.run(); - } +void CpuOperator::applyQFunction() { + if (qfunction->qFunctionContext) { + QFunctionContext *ctx = QFunctionContext::from(qfunction->qFunctionContext); + applyAddKernel.pushArg(ctx->getKernelArg()); + } else { + applyAddKernel.pushArg(::occa::null); + } + applyAddKernel.pushArg(ceedElementCount); - void CpuOperator::pushKernelArgs(Vector *vec, - const bool isInput, - const int index) { - const OperatorField &opField = args.getOpField(isInput, index); - const QFunctionField &qfField = args.getQfField(isInput, index); - - if (opField.hasBasis()) { - if (opField.usingTensorBasis()) { - pushTensorBasisKernelArgs(qfField, - *((TensorBasis*) opField.basis)); - } else { - pushSimplexBasisKernelArgs(qfField, - *((SimplexBasis*) opField.basis)); - } - } + for (int i = 0; i < args.inputCount(); ++i) { + 
const bool isInput = true; + pushKernelArgs(dofInputs[i], isInput, i); + } - if (vec) { - if (isInput) { - applyAddKernel.pushArg(vec->getConstKernelArg()); - } else { - applyAddKernel.pushArg(vec->getKernelArg()); - } - } else { - applyAddKernel.pushArg(::occa::null); - } + for (int i = 0; i < args.outputCount(); ++i) { + const bool isInput = false; + pushKernelArgs(dofOutputs[i], isInput, i); + } + + applyAddKernel.run(); +} + +void CpuOperator::pushKernelArgs(Vector *vec, const bool isInput, const int index) { + const OperatorField &opField = args.getOpField(isInput, index); + const QFunctionField &qfField = args.getQfField(isInput, index); + + if (opField.hasBasis()) { + if (opField.usingTensorBasis()) { + pushTensorBasisKernelArgs(qfField, *((TensorBasis *)opField.basis)); + } else { + pushSimplexBasisKernelArgs(qfField, *((SimplexBasis *)opField.basis)); } + } - void CpuOperator::pushTensorBasisKernelArgs(const QFunctionField &qfField, - TensorBasis &basis) { - switch (qfField.evalMode) { - case CEED_EVAL_INTERP: { - applyAddKernel.pushArg(basis.interp1D); - break; - } - case CEED_EVAL_GRAD: { - applyAddKernel.pushArg(basis.interp1D); - applyAddKernel.pushArg(basis.grad1D); - break; - } - case CEED_EVAL_WEIGHT: { - applyAddKernel.pushArg(basis.qWeight1D); - break; - } - default: {} - } + if (vec) { + if (isInput) { + applyAddKernel.pushArg(vec->getConstKernelArg()); + } else { + applyAddKernel.pushArg(vec->getKernelArg()); } + } else { + applyAddKernel.pushArg(::occa::null); + } +} - void CpuOperator::pushSimplexBasisKernelArgs(const QFunctionField &qfField, - SimplexBasis &basis) { - switch (qfField.evalMode) { - case CEED_EVAL_INTERP: { - applyAddKernel.pushArg(basis.interp); - break; - } - case CEED_EVAL_GRAD: { - applyAddKernel.pushArg(basis.grad); - break; - } - case CEED_EVAL_WEIGHT: { - applyAddKernel.pushArg(basis.qWeight); - break; - } - default: {} - } +void CpuOperator::pushTensorBasisKernelArgs(const QFunctionField &qfField, TensorBasis &basis) { 
+ switch (qfField.evalMode) { + case CEED_EVAL_INTERP: { + applyAddKernel.pushArg(basis.interp1D); + break; + } + case CEED_EVAL_GRAD: { + applyAddKernel.pushArg(basis.interp1D); + applyAddKernel.pushArg(basis.grad1D); + break; + } + case CEED_EVAL_WEIGHT: { + applyAddKernel.pushArg(basis.qWeight1D); + break; } + default: { + } + } +} - ::occa::properties CpuOperator::getKernelProps() { - ::occa::properties props = qfunction->getKernelProps(ceedQ); +void CpuOperator::pushSimplexBasisKernelArgs(const QFunctionField &qfField, SimplexBasis &basis) { + switch (qfField.evalMode) { + case CEED_EVAL_INTERP: { + applyAddKernel.pushArg(basis.interp); + break; + } + case CEED_EVAL_GRAD: { + applyAddKernel.pushArg(basis.grad); + break; + } + case CEED_EVAL_WEIGHT: { + applyAddKernel.pushArg(basis.qWeight); + break; + } + default: { + } + } +} - props["defines/OCCA_Q"] = ceedQ; +::occa::properties CpuOperator::getKernelProps() { + ::occa::properties props = qfunction->getKernelProps(ceedQ); - return props; - } + props["defines/OCCA_Q"] = ceedQ; - void CpuOperator::applyAdd(Vector *in, Vector *out) { - // Setup helper vectors - setupVectors(); + return props; +} - // Dof nodes -> local dofs - setupInputs(in); +void CpuOperator::applyAdd(Vector *in, Vector *out) { + // Setup helper vectors + setupVectors(); - // Apply qFunction - applyQFunction(); + // Dof nodes -> local dofs + setupInputs(in); - // Local dofs -> dof nodes - setupOutputs(out); + // Apply qFunction + applyQFunction(); - // Cleanup helper vectors - freeVectors(); - } + // Local dofs -> dof nodes + setupOutputs(out); - ::occa::kernel CpuOperator::buildApplyAddKernel() { - std::stringstream ss; + // Cleanup helper vectors + freeVectors(); +} - addBasisFunctionSource(ss); +::occa::kernel CpuOperator::buildApplyAddKernel() { + std::stringstream ss; - addKernelSource(ss); + addBasisFunctionSource(ss); - const std::string kernelSource = ss.str(); + addKernelSource(ss); - CeedDebug(ceed, kernelSource.c_str()); + const 
std::string kernelSource = ss.str(); - // TODO: Store a kernel per Q - return getDevice().buildKernelFromString(kernelSource, - "applyAdd", - getKernelProps()); - } + CeedDebug(ceed, kernelSource.c_str()); - //---[ Kernel Generation ]-------------------- - void CpuOperator::addBasisFunctionSource(std::stringstream &ss) { - BasisVector sourceBasis; - for (int i = 0; i < args.inputCount(); ++i) { - addBasisIfMissingSource(sourceBasis, args.getOpInput(i).basis); - } - for (int i = 0; i < args.outputCount(); ++i) { - addBasisIfMissingSource(sourceBasis, args.getOpOutput(i).basis); - } + // TODO: Store a kernel per Q + return getDevice().buildKernelFromString(kernelSource, "applyAdd", getKernelProps()); +} - // Make sure there's a break between past code - ss << std::endl; - - // Add source code for each unique basis function - const int basisCount = (int) sourceBasis.size(); - for (int i = 0; i < basisCount; ++i) { - Basis &basis = *(sourceBasis[i]); - - ss << "// Code generation for basis " << i + 1 << std::endl - << "//---[ START ]-------------------------------" << std::endl; - - // Undefine and redefine required variables - if (basis.isTensorBasis()) { - TensorBasis &basisTensor = (TensorBasis&) basis; - ss << "#undef TENSOR_FUNCTION" << std::endl - << "#undef P1D" << std::endl - << "#undef Q1D" << std::endl - << "#define P1D " << basisTensor.P1D << std::endl - << "#define Q1D " << basisTensor.Q1D << std::endl; - } else { - SimplexBasis &basisSimplex = (SimplexBasis&) basis; - ss << "#undef SIMPLEX_FUNCTION" << std::endl - << "#undef DIM" << std::endl - << "#undef P" << std::endl - << "#undef Q" << std::endl - << "#define DIM " << basisSimplex.dim << std::endl - << "#define P " << basisSimplex.P << std::endl - << "#define Q " << basisSimplex.Q << std::endl; - } - - ss << std::endl - << basis.getFunctionSource() - << std::endl - << "//---[ END ]---------------------------------" << std::endl; - } - } +//---[ Kernel Generation ]-------------------- +void 
CpuOperator::addBasisFunctionSource(std::stringstream &ss) { + BasisVector sourceBasis; + for (int i = 0; i < args.inputCount(); ++i) { + addBasisIfMissingSource(sourceBasis, args.getOpInput(i).basis); + } + for (int i = 0; i < args.outputCount(); ++i) { + addBasisIfMissingSource(sourceBasis, args.getOpOutput(i).basis); + } - void CpuOperator::addBasisIfMissingSource(BasisVector &sourceBasis, Basis *basis) { - // Avoid adding duplicate sources which will result in colliding symbol names + // Make sure there's a break between past code + ss << std::endl; + + // Add source code for each unique basis function + const int basisCount = (int)sourceBasis.size(); + for (int i = 0; i < basisCount; ++i) { + Basis &basis = *(sourceBasis[i]); + + ss << "// Code generation for basis " << i + 1 << std::endl << "//---[ START ]-------------------------------" << std::endl; + + // Undefine and redefine required variables + if (basis.isTensorBasis()) { + TensorBasis &basisTensor = (TensorBasis &)basis; + ss << "#undef TENSOR_FUNCTION" << std::endl + << "#undef P1D" << std::endl + << "#undef Q1D" << std::endl + << "#define P1D " << basisTensor.P1D << std::endl + << "#define Q1D " << basisTensor.Q1D << std::endl; + } else { + SimplexBasis &basisSimplex = (SimplexBasis &)basis; + ss << "#undef SIMPLEX_FUNCTION" << std::endl + << "#undef DIM" << std::endl + << "#undef P" << std::endl + << "#undef Q" << std::endl + << "#define DIM " << basisSimplex.dim << std::endl + << "#define P " << basisSimplex.P << std::endl + << "#define Q " << basisSimplex.Q << std::endl; + } + + ss << std::endl << basis.getFunctionSource() << std::endl << "//---[ END ]---------------------------------" << std::endl; + } +} - // No basis - if (!basis) { - return; - } +void CpuOperator::addBasisIfMissingSource(BasisVector &sourceBasis, Basis *basis) { + // Avoid adding duplicate sources which will result in colliding symbol names - // Fast enough since we expect a small number of inputs/outputs - const int 
existingBasisCount = (int) sourceBasis.size(); - for (int i = 0; i < existingBasisCount; ++i) { - Basis *other = sourceBasis[i]; - // They are different basis types so other != basis - if (basis->isTensorBasis() != other->isTensorBasis()) { - continue; - } - - if (basis->dim == other->dim && - basis->P == other->P && - basis->Q == other->Q) { - // `other` wil generate the same code - return; - } - } + // No basis + if (!basis) { + return; + } + + // Fast enough since we expect a small number of inputs/outputs + const int existingBasisCount = (int)sourceBasis.size(); + for (int i = 0; i < existingBasisCount; ++i) { + Basis *other = sourceBasis[i]; + // They are different basis types so other != basis + if (basis->isTensorBasis() != other->isTensorBasis()) { + continue; + } - // Basis didn't match any other existing basis - sourceBasis.push_back(basis); + if (basis->dim == other->dim && basis->P == other->P && basis->Q == other->Q) { + // `other` wil generate the same code + return; } + } + + // Basis didn't match any other existing basis + sourceBasis.push_back(basis); +} - void CpuOperator::addKernelSource(std::stringstream &ss) { - // Make sure there's a break between past code - ss << std::endl; +void CpuOperator::addKernelSource(std::stringstream &ss) { + // Make sure there's a break between past code + ss << std::endl; - ss << "@kernel void applyAdd(" << std::endl; + ss << "@kernel void applyAdd(" << std::endl; - addKernelArgsSource(ss); + addKernelArgsSource(ss); - ss << std::endl - << ") {" << std::endl - << " @tile(128, @outer, @inner)" << std::endl - << " for (int element = 0; element < elementCount; ++element) {" << std::endl; + ss << std::endl + << ") {" << std::endl + << " @tile(128, @outer, @inner)" << std::endl + << " for (int element = 0; element < elementCount; ++element) {" << std::endl; #if CEED_OCCA_PRINT_KERNEL_HASHES - // Print to see which kernel is being run - ss << " if (element == 0) {" << std::endl - << " printf(\"\\n\\nOperator Kernel: \" 
OKL_KERNEL_HASH \"\\n\\n\");" << std::endl - << " }" << std::endl; + // Print to see which kernel is being run + ss << " if (element == 0) {" << std::endl + << " printf(\"\\n\\nOperator Kernel: \" OKL_KERNEL_HASH \"\\n\\n\");" << std::endl + << " }" << std::endl; #endif - addQuadArraySource(ss); + addQuadArraySource(ss); - ss << std::endl - << " // [Start] Transforming inputs to quadrature points" << std::endl; - addInputSetupSource(ss); - ss << " // [End] Transforming inputs to quadrature points" << std::endl - << std::endl; + ss << std::endl << " // [Start] Transforming inputs to quadrature points" << std::endl; + addInputSetupSource(ss); + ss << " // [End] Transforming inputs to quadrature points" << std::endl << std::endl; - addQFunctionApplicationSource(ss); + addQFunctionApplicationSource(ss); - ss << std::endl - << " // [Start] Transforming outputs to quadrature points" << std::endl; - addOutputSetupSource(ss); - ss << " // [End] Transforming outputs to quadrature points" << std::endl; + ss << std::endl << " // [Start] Transforming outputs to quadrature points" << std::endl; + addOutputSetupSource(ss); + ss << " // [End] Transforming outputs to quadrature points" << std::endl; - ss << " }" << std::endl - << "}" << std::endl; - } + ss << " }" << std::endl << "}" << std::endl; +} - void CpuOperator::addKernelArgsSource(std::stringstream &ss) { - ss << " void *ctx," << std::endl - << " const CeedInt elementCount"; +void CpuOperator::addKernelArgsSource(std::stringstream &ss) { + ss << " void *ctx," << std::endl << " const CeedInt elementCount"; - for (int i = 0; i < args.inputCount(); ++i) { - const bool isInput = true; - addKernelArgSource(ss, isInput, i); - } - for (int i = 0; i < args.outputCount(); ++i) { - const bool isInput = false; - addKernelArgSource(ss, isInput, i); - } + for (int i = 0; i < args.inputCount(); ++i) { + const bool isInput = true; + addKernelArgSource(ss, isInput, i); + } + for (int i = 0; i < args.outputCount(); ++i) { + const bool 
isInput = false; + addKernelArgSource(ss, isInput, i); + } +} + +void CpuOperator::addKernelArgSource(std::stringstream &ss, const bool isInput, const int index) { + const OperatorField &opField = args.getOpField(isInput, index); + const QFunctionField &qfField = args.getQfField(isInput, index); + + std::stringstream dimAttribute; + if (opField.hasBasis()) { + ss << ',' << std::endl; + if (opField.usingTensorBasis()) { + addTensorKernelArgSource(ss, isInput, index, opField, qfField, dimAttribute); + } else { + addSimplexKernelArgSource(ss, isInput, index, opField, qfField, dimAttribute); } + } - void CpuOperator::addKernelArgSource(std::stringstream &ss, - const bool isInput, - const int index) { - const OperatorField &opField = args.getOpField(isInput, index); - const QFunctionField &qfField = args.getQfField(isInput, index); - - std::stringstream dimAttribute; - if (opField.hasBasis()) { - ss << ',' << std::endl; - if (opField.usingTensorBasis()) { - addTensorKernelArgSource(ss, isInput, index, opField, qfField, dimAttribute); - } else { - addSimplexKernelArgSource(ss, isInput, index, opField, qfField, dimAttribute); - } - } + ss << ',' << std::endl; + if (isInput) { + ss << " const CeedScalar *" << dofInputVar(index) << dimAttribute.str(); + } else { + ss << " CeedScalar *" << dofOutputVar(index) << dimAttribute.str(); + } +} - ss << ',' << std::endl; - if (isInput) { - ss << " const CeedScalar *" << dofInputVar(index) << dimAttribute.str(); - } else { - ss << " CeedScalar *" << dofOutputVar(index) << dimAttribute.str(); - } +void CpuOperator::addTensorKernelArgSource(std::stringstream &ss, const bool isInput, const int index, const OperatorField &opField, + const QFunctionField &qfField, std::stringstream &dimAttribute) { + TensorBasis &basis = *((TensorBasis *)opField.basis); + + dimAttribute << " @dim("; + + if (qfField.evalMode == CEED_EVAL_INTERP) { + ss << " const CeedScalar *" << interpVar(isInput, index); + + // @dim(P1D, P1D, BASIS_COMPONENT_COUNT, 
elementCount) + for (int i = 0; i < basis.dim; ++i) { + dimAttribute << basis.P1D << ", "; } + dimAttribute << basis.ceedComponentCount << ", elementCount"; + } else if (qfField.evalMode == CEED_EVAL_GRAD) { + ss << " const CeedScalar *" << interpVar(isInput, index) << ',' << std::endl << " const CeedScalar *" << gradVar(isInput, index); - void CpuOperator::addTensorKernelArgSource(std::stringstream &ss, - const bool isInput, - const int index, - const OperatorField &opField, - const QFunctionField &qfField, - std::stringstream &dimAttribute) { - TensorBasis &basis = *((TensorBasis*) opField.basis); - - dimAttribute << " @dim("; - - if (qfField.evalMode == CEED_EVAL_INTERP) { - ss << " const CeedScalar *" << interpVar(isInput, index); - - // @dim(P1D, P1D, BASIS_COMPONENT_COUNT, elementCount) - for (int i = 0; i < basis.dim; ++i) { - dimAttribute << basis.P1D << ", "; - } - dimAttribute << basis.ceedComponentCount - << ", elementCount"; - } - else if (qfField.evalMode == CEED_EVAL_GRAD) { - ss << " const CeedScalar *" << interpVar(isInput, index) << ',' << std::endl - << " const CeedScalar *" << gradVar(isInput, index); - - // @dim(P1D, P1D, BASIS_COMPONENT_COUNT, elementCount) - for (int i = 0; i < basis.dim; ++i) { - dimAttribute << basis.P1D << ", "; - } - dimAttribute << basis.ceedComponentCount - << ", elementCount"; - } - else if (qfField.evalMode == CEED_EVAL_WEIGHT) { - ss << " const CeedScalar *" << qWeightVar(isInput, index); - - // @dim(Q1D, Q1D, elementCount) - for (int i = 0; i < basis.dim; ++i) { - dimAttribute << basis.Q1D << ", "; - } - dimAttribute << "elementCount"; - } - else { - // Clear @dim - dimAttribute.str(""); - return; - } + // @dim(P1D, P1D, BASIS_COMPONENT_COUNT, elementCount) + for (int i = 0; i < basis.dim; ++i) { + dimAttribute << basis.P1D << ", "; + } + dimAttribute << basis.ceedComponentCount << ", elementCount"; + } else if (qfField.evalMode == CEED_EVAL_WEIGHT) { + ss << " const CeedScalar *" << qWeightVar(isInput, index); - 
dimAttribute << ")"; + // @dim(Q1D, Q1D, elementCount) + for (int i = 0; i < basis.dim; ++i) { + dimAttribute << basis.Q1D << ", "; } + dimAttribute << "elementCount"; + } else { + // Clear @dim + dimAttribute.str(""); + return; + } - void CpuOperator::addSimplexKernelArgSource(std::stringstream &ss, - const bool isInput, - const int index, - const OperatorField &opField, - const QFunctionField &qfField, - std::stringstream &dimAttribute) { - SimplexBasis &basis = *((SimplexBasis*) opField.basis); + dimAttribute << ")"; +} - dimAttribute << " @dim("; +void CpuOperator::addSimplexKernelArgSource(std::stringstream &ss, const bool isInput, const int index, const OperatorField &opField, + const QFunctionField &qfField, std::stringstream &dimAttribute) { + SimplexBasis &basis = *((SimplexBasis *)opField.basis); - if (qfField.evalMode == CEED_EVAL_INTERP) { - ss << " const CeedScalar *" << interpVar(isInput, index); + dimAttribute << " @dim("; - // @dim(P, BASIS_COMPONENT_COUNT, elementCount) - dimAttribute << basis.P - << ", " << basis.ceedComponentCount - << ", elementCount"; - } - else if (qfField.evalMode == CEED_EVAL_GRAD) { - ss << " const CeedScalar *" << gradVar(isInput, index); + if (qfField.evalMode == CEED_EVAL_INTERP) { + ss << " const CeedScalar *" << interpVar(isInput, index); - // @dim(P, BASIS_COMPONENT_COUNT, elementCount) - dimAttribute << basis.P - << ", " << basis.ceedComponentCount - << ", elementCount"; - } - else if (qfField.evalMode == CEED_EVAL_WEIGHT) { - ss << " const CeedScalar *" << qWeightVar(isInput, index); + // @dim(P, BASIS_COMPONENT_COUNT, elementCount) + dimAttribute << basis.P << ", " << basis.ceedComponentCount << ", elementCount"; + } else if (qfField.evalMode == CEED_EVAL_GRAD) { + ss << " const CeedScalar *" << gradVar(isInput, index); - // @dim(Q, elementCount) - dimAttribute << basis.Q << ", " << "elementCount"; - } - else { - // Clear @dim - dimAttribute.str(""); - return; - } + // @dim(P, BASIS_COMPONENT_COUNT, elementCount) + 
dimAttribute << basis.P << ", " << basis.ceedComponentCount << ", elementCount"; + } else if (qfField.evalMode == CEED_EVAL_WEIGHT) { + ss << " const CeedScalar *" << qWeightVar(isInput, index); - dimAttribute << ")"; - } + // @dim(Q, elementCount) + dimAttribute << basis.Q << ", " + << "elementCount"; + } else { + // Clear @dim + dimAttribute.str(""); + return; + } - void CpuOperator::addQuadArraySource(std::stringstream &ss) { - const int inputs = args.inputCount(); - const int outputs = args.outputCount(); + dimAttribute << ")"; +} - const std::string quadInput = "quadInput"; - const std::string quadOutput = "quadOutput"; +void CpuOperator::addQuadArraySource(std::stringstream &ss) { + const int inputs = args.inputCount(); + const int outputs = args.outputCount(); - ss << " // Store the transformed input quad values" << std::endl; - for (int i = 0; i < inputs; ++i) { - const bool isInput = true; - addSingleQfunctionQuadArraySource(ss, isInput, i, quadInput); - } + const std::string quadInput = "quadInput"; + const std::string quadOutput = "quadOutput"; - ss << std::endl - << " // Store the transformed output quad values" << std::endl; - for (int i = 0; i < outputs; ++i) { - const bool isInput = false; - addSingleQfunctionQuadArraySource(ss, isInput, i, quadOutput); - } - ss << std::endl; + ss << " // Store the transformed input quad values" << std::endl; + for (int i = 0; i < inputs; ++i) { + const bool isInput = true; + addSingleQfunctionQuadArraySource(ss, isInput, i, quadInput); + } - ss << std::endl - << " // Store all input pointers in a single array" << std::endl; - addQfunctionQuadArraySource(ss, true, inputs, quadInput); + ss << std::endl << " // Store the transformed output quad values" << std::endl; + for (int i = 0; i < outputs; ++i) { + const bool isInput = false; + addSingleQfunctionQuadArraySource(ss, isInput, i, quadOutput); + } + ss << std::endl; - ss << std::endl - << " // Store all output pointers in a single array" << std::endl; - 
addQfunctionQuadArraySource(ss, false, outputs, quadOutput); + ss << std::endl << " // Store all input pointers in a single array" << std::endl; + addQfunctionQuadArraySource(ss, true, inputs, quadInput); - ss << std::endl; - } + ss << std::endl << " // Store all output pointers in a single array" << std::endl; + addQfunctionQuadArraySource(ss, false, outputs, quadOutput); - void CpuOperator::addSingleQfunctionQuadArraySource(std::stringstream &ss, - const bool isInput, - const int index, - const std::string &name) { - // Output: - // CeedScalar quadInput0[DIM][COMPONENTS][OCCA_Q]; - // CeedScalar quadInput0[OCCA_Q * SIZE]; - - const OperatorField &opField = args.getOpField(isInput, index); - CeedEvalMode evalMode = args.getEvalMode(isInput, index); - - if (evalMode == CEED_EVAL_GRAD) { - ss << " CeedScalar " << indexedVar(name, index) - << "[" << opField.getDim() << "]" - << "[" << opField.getComponentCount() << "]" - << "[OCCA_Q];" << std::endl; - } else if (evalMode == CEED_EVAL_INTERP) { - ss << " CeedScalar " << indexedVar(name, index) - << "[" << opField.getComponentCount() << "]" - << "[OCCA_Q];" << std::endl; - } else { - const QFunctionField &qfField = args.getQfField(isInput, index); + ss << std::endl; +} - ss << " CeedScalar " << indexedVar(name, index) - << "[OCCA_Q * " << qfField.size << "];" << std::endl; - } +void CpuOperator::addSingleQfunctionQuadArraySource(std::stringstream &ss, const bool isInput, const int index, const std::string &name) { + // Output: + // CeedScalar quadInput0[DIM][COMPONENTS][OCCA_Q]; + // CeedScalar quadInput0[OCCA_Q * SIZE]; + + const OperatorField &opField = args.getOpField(isInput, index); + CeedEvalMode evalMode = args.getEvalMode(isInput, index); + + if (evalMode == CEED_EVAL_GRAD) { + ss << " CeedScalar " << indexedVar(name, index) << "[" << opField.getDim() << "]" + << "[" << opField.getComponentCount() << "]" + << "[OCCA_Q];" << std::endl; + } else if (evalMode == CEED_EVAL_INTERP) { + ss << " CeedScalar " << 
indexedVar(name, index) << "[" << opField.getComponentCount() << "]" + << "[OCCA_Q];" << std::endl; + } else { + const QFunctionField &qfField = args.getQfField(isInput, index); + + ss << " CeedScalar " << indexedVar(name, index) << "[OCCA_Q * " << qfField.size << "];" << std::endl; + } +} + +void CpuOperator::addQfunctionQuadArraySource(std::stringstream &ss, const bool isInput, const int count, const std::string &name) { + // Output: + // CeedScalar *quadInputs[2] = { + // (CeedScalar*) quadInput0, + // (CeedScalar*) quadInput1 + // }; + + // Add an 's': quadInput -> quadInputs + const std::string arrayName = name + "s"; + + ss << " CeedScalar *" << arrayName << "[" << count << "] = {" << std::endl; + for (int i = 0; i < count; ++i) { + if (i) { + ss << ',' << std::endl; } + ss << " (CeedScalar*) " << indexedVar(name, i); + } + ss << std::endl << " };" << std::endl; +} + +void CpuOperator::addInputSetupSource(std::stringstream &ss) { + const bool isInput = true; + addBasisApplySource(ss, isInput, args.inputCount()); +} - void CpuOperator::addQfunctionQuadArraySource(std::stringstream &ss, - const bool isInput, - const int count, - const std::string &name) { - // Output: - // CeedScalar *quadInputs[2] = { - // (CeedScalar*) quadInput0, - // (CeedScalar*) quadInput1 - // }; - - // Add an 's': quadInput -> quadInputs - const std::string arrayName = name + "s"; - - ss << " CeedScalar *" << arrayName << "[" << count << "] = {" << std::endl; - for (int i = 0; i < count; ++i) { - if (i) { - ss << ',' << std::endl; - } - ss << " (CeedScalar*) " << indexedVar(name, i); +void CpuOperator::addOutputSetupSource(std::stringstream &ss) { + const bool isInput = false; + addBasisApplySource(ss, isInput, args.outputCount()); +} + +void CpuOperator::addBasisApplySource(std::stringstream &ss, const bool isInput, const int count) { + for (int i = 0; i < count; ++i) { + CeedEvalMode evalMode = args.getEvalMode(isInput, i); + + if (evalMode == CEED_EVAL_INTERP) { + addInterpSource(ss, 
isInput, i); + } else if (evalMode == CEED_EVAL_GRAD) { + const bool hasTensorBasis = args.getOpField(isInput, i).usingTensorBasis(); + if (hasTensorBasis) { + addGradTensorSource(ss, isInput, i); + } else { + addGradSimplexSource(ss, isInput, i); } - ss << std::endl - << " };" << std::endl; + } else if (evalMode == CEED_EVAL_WEIGHT) { + addWeightSource(ss, isInput, i); + } else if (evalMode == CEED_EVAL_NONE) { + addCopySource(ss, isInput, i); } + } +} - void CpuOperator::addInputSetupSource(std::stringstream &ss) { - const bool isInput = true; - addBasisApplySource(ss, isInput, args.inputCount()); - } +void CpuOperator::addInterpSource(std::stringstream &ss, const bool isInput, const int index) { + const OperatorField &opField = args.getOpField(isInput, index); + const bool usingTensorBasis = opField.usingTensorBasis(); + const int components = opField.getComponentCount(); + const int dim = opField.getDim(); - void CpuOperator::addOutputSetupSource(std::stringstream &ss) { - const bool isInput = false; - addBasisApplySource(ss, isInput, args.outputCount()); - } + const std::string weights = interpVar(isInput, index); - void CpuOperator::addBasisApplySource(std::stringstream &ss, - const bool isInput, - const int count) { - for (int i = 0; i < count; ++i) { - CeedEvalMode evalMode = args.getEvalMode(isInput, i); - - if (evalMode == CEED_EVAL_INTERP) { - addInterpSource(ss, isInput, i); - } - else if (evalMode == CEED_EVAL_GRAD) { - const bool hasTensorBasis = args.getOpField(isInput, i).usingTensorBasis(); - if (hasTensorBasis) { - addGradTensorSource(ss, isInput, i); - } else { - addGradSimplexSource(ss, isInput, i); - } - } - else if (evalMode == CEED_EVAL_WEIGHT) { - addWeightSource(ss, isInput, i); - } - else if (evalMode == CEED_EVAL_NONE) { - addCopySource(ss, isInput, i); - } + std::string dimArgs; + if (usingTensorBasis) { + for (int i = 0; i < dim; ++i) { + if (i) { + dimArgs += ", "; } + dimArgs += '0'; } + } else { + dimArgs = "0"; + } - void 
CpuOperator::addInterpSource(std::stringstream &ss, - const bool isInput, - const int index) { - const OperatorField &opField = args.getOpField(isInput, index); - const bool usingTensorBasis = opField.usingTensorBasis(); - const int components = opField.getComponentCount(); - const int dim = opField.getDim(); - - const std::string weights = interpVar(isInput, index); - - std::string dimArgs; - if (usingTensorBasis) { - for (int i = 0; i < dim; ++i) { - if (i) { - dimArgs += ", "; - } - dimArgs += '0'; - } - } else { - dimArgs = "0"; - } + std::string input, output; + if (isInput) { + input = "&" + dofInputVar(index) + "(" + dimArgs + ", component, element)"; + output = "(CeedScalar*) " + indexedVar("quadInput", index) + "[component]"; + } else { + input = "(CeedScalar*) " + indexedVar("quadOutput", index) + "[component]"; + output = "&" + dofOutputVar(index) + "(" + dimArgs + ", component, element)"; + } - std::string input, output; - if (isInput) { - input = "&" + dofInputVar(index) + "(" + dimArgs + ", component, element)"; - output = "(CeedScalar*) " + indexedVar("quadInput", index) + "[component]"; - } else { - input = "(CeedScalar*) " + indexedVar("quadOutput", index) + "[component]"; - output = "&" + dofOutputVar(index) + "(" + dimArgs + ", component, element)"; - } + ss << " // Applying interp (" << xputName(isInput) << ": " << index << ")" << std::endl + << " for (int component = 0; component < " << components << "; ++component) {" << std::endl + << " " << elementFunction(isInput, index) << "(" << std::endl + << " " << weights << ',' << std::endl + << " " << input << ',' << std::endl + << " " << output << std::endl + << " );" << std::endl + << " }" << std::endl + << std::endl; +} + +void CpuOperator::addGradTensorSource(std::stringstream &ss, const bool isInput, const int index) { + const OperatorField &opField = args.getOpField(isInput, index); + const int components = opField.getComponentCount(); + const int dim = opField.getDim(); + + const std::string B 
= interpVar(isInput, index); + const std::string Bx = gradVar(isInput, index); - ss << " // Applying interp (" << xputName(isInput) << ": " << index << ")" << std::endl - << " for (int component = 0; component < " << components << "; ++component) {" << std::endl - << " " << elementFunction(isInput, index) << "(" << std::endl - << " " << weights << ',' << std::endl - << " " << input << ',' << std::endl - << " " << output << std::endl - << " );" << std::endl - << " }" << std::endl - << std::endl; + std::string dimArgs; + for (int i = 0; i < dim; ++i) { + if (i) { + dimArgs += ", "; } + dimArgs += '0'; + } - void CpuOperator::addGradTensorSource(std::stringstream &ss, - const bool isInput, - const int index) { - const OperatorField &opField = args.getOpField(isInput, index); - const int components = opField.getComponentCount(); - const int dim = opField.getDim(); - - const std::string B = interpVar(isInput, index); - const std::string Bx = gradVar(isInput, index); - - std::string dimArgs; - for (int i = 0; i < dim; ++i) { - if (i) { - dimArgs += ", "; - } - dimArgs += '0'; - } + std::string inputs, outputs; + if (isInput) { + inputs = "&" + dofInputVar(index) + "(" + dimArgs + ", component, element)"; - std::string inputs, outputs; - if (isInput) { - inputs = "&" + dofInputVar(index) + "(" + dimArgs + ", component, element)"; - - for (int i = 0; i < dim; ++i) { - if (i) { - outputs += ",\n "; - } - const std::string iStr = std::to_string(i); - outputs += "(CeedScalar*) " + indexedVar("quadInput", index) + "[" + iStr + "][component]"; - } - } else { - for (int i = 0; i < dim; ++i) { - if (i) { - inputs += ",\n "; - } - const std::string iStr = std::to_string(i); - inputs += "(CeedScalar*) " + indexedVar("quadOutput", index) + "[" + iStr + "][component]"; - } - - outputs = "&" + dofOutputVar(index) + "(" + dimArgs + ", component, element)"; + for (int i = 0; i < dim; ++i) { + if (i) { + outputs += ",\n "; } - - ss << " // Applying grad-tensor (" << xputName(isInput) << 
": " << index << ")" << std::endl - << " for (int component = 0; component < " << components << "; ++component) {" << std::endl - << " " << elementFunction(isInput, index) << "(" << std::endl - << " " << B << ',' << std::endl - << " " << Bx << ',' << std::endl - << " " << inputs << ',' << std::endl - << " " << outputs << std::endl - << " );" << std::endl - << " }" << std::endl - << std::endl; + const std::string iStr = std::to_string(i); + outputs += "(CeedScalar*) " + indexedVar("quadInput", index) + "[" + iStr + "][component]"; } - - void CpuOperator::addGradSimplexSource(std::stringstream &ss, - const bool isInput, - const int index) { - const int components = ( - args - .getOpField(isInput, index) - .getComponentCount() - ); - - const std::string weights = gradVar(isInput, index); - - std::string input, output; - if (isInput) { - input = "&" + dofInputVar(index) + "(0, component, element)"; - output = "(CeedScalar*) " + indexedVar("quadInput", index) + "[component]"; - } else { - input = "(CeedScalar*) " + indexedVar("quadOutput", index) + "[component]"; - output = "&" + dofOutputVar(index) + "(0, component, element)"; + } else { + for (int i = 0; i < dim; ++i) { + if (i) { + inputs += ",\n "; } - - ss << " // Applying grad-simplex (" << xputName(isInput) << ": " << index << ")" << std::endl - << " for (int component = 0; component < " << components << "; ++component) {" << std::endl - << " " << elementFunction(isInput, index) << "(" << std::endl - << " " << weights << ',' << std::endl - << " " << input << ',' << std::endl - << " " << output << std::endl - << " );" << std::endl - << " }" << std::endl - << std::endl; + const std::string iStr = std::to_string(i); + inputs += "(CeedScalar*) " + indexedVar("quadOutput", index) + "[" + iStr + "][component]"; } - void CpuOperator::addWeightSource(std::stringstream &ss, - const bool isInput, - const int index) { + outputs = "&" + dofOutputVar(index) + "(" + dimArgs + ", component, element)"; + } - const std::string 
weights = qWeightVar(isInput, index); + ss << " // Applying grad-tensor (" << xputName(isInput) << ": " << index << ")" << std::endl + << " for (int component = 0; component < " << components << "; ++component) {" << std::endl + << " " << elementFunction(isInput, index) << "(" << std::endl + << " " << B << ',' << std::endl + << " " << Bx << ',' << std::endl + << " " << inputs << ',' << std::endl + << " " << outputs << std::endl + << " );" << std::endl + << " }" << std::endl + << std::endl; +} - std::string output; - if (isInput) { - // TODO: Can the weight operator handle multiple components? - output = "(CeedScalar*) " + indexedVar("quadInput", index); - } else { - output = "&" + dofOutputVar(index) + "(0, element)"; - } +void CpuOperator::addGradSimplexSource(std::stringstream &ss, const bool isInput, const int index) { + const int components = (args.getOpField(isInput, index).getComponentCount()); - ss << " // Applying weight (" << xputName(isInput) << ": " << index << ")" << std::endl - << " " << elementFunction(isInput, index) << "(" << std::endl - << " " << weights << ',' << std::endl - << " " << output << std::endl - << " );" << std::endl - << std::endl; - } + const std::string weights = gradVar(isInput, index); - void CpuOperator::addCopySource(std::stringstream &ss, - const bool isInput, - const int index) { - const QFunctionField &qfField = args.getQfField(isInput, index); - const std::string size = std::to_string(qfField.size); + std::string input, output; + if (isInput) { + input = "&" + dofInputVar(index) + "(0, component, element)"; + output = "(CeedScalar*) " + indexedVar("quadInput", index) + "[component]"; + } else { + input = "(CeedScalar*) " + indexedVar("quadOutput", index) + "[component]"; + output = "&" + dofOutputVar(index) + "(0, component, element)"; + } - std::string input, output; - if (isInput) { - input += dofInputVar(index) + "[q + (OCCA_Q * (field + element * " + size + "))]"; - output += indexedVar("quadInput", index) + "[q + field * 
OCCA_Q]"; - } else { - input = indexedVar("quadOutput", index) + "[q + field * OCCA_Q]"; - output = dofOutputVar(index) + "[q + (OCCA_Q * (field + element * " + size + "))]"; - } + ss << " // Applying grad-simplex (" << xputName(isInput) << ": " << index << ")" << std::endl + << " for (int component = 0; component < " << components << "; ++component) {" << std::endl + << " " << elementFunction(isInput, index) << "(" << std::endl + << " " << weights << ',' << std::endl + << " " << input << ',' << std::endl + << " " << output << std::endl + << " );" << std::endl + << " }" << std::endl + << std::endl; +} - ss << " // Copying source directly (" << xputName(isInput) << ": " << index << ")" << std::endl - << " for (int field = 0; field < " << size << "; ++field) {" << std::endl - << " for (int q = 0; q < OCCA_Q; ++q) {" << std::endl - << " " << output << " = " << input << ";" << std::endl - << " }" << std::endl - << " }" << std::endl - << std::endl; - } +void CpuOperator::addWeightSource(std::stringstream &ss, const bool isInput, const int index) { + const std::string weights = qWeightVar(isInput, index); - void CpuOperator::addQFunctionApplicationSource(std::stringstream &ss) { - ss << " // Apply qFunction" << std::endl - << " " << qfunction->qFunctionName << "(ctx, OCCA_Q, quadInputs, quadOutputs);" << std::endl - << std::endl; - } + std::string output; + if (isInput) { + // TODO: Can the weight operator handle multiple components? 
+ output = "(CeedScalar*) " + indexedVar("quadInput", index); + } else { + output = "&" + dofOutputVar(index) + "(0, element)"; + } - // ---[ Variables ]----------------- - std::string CpuOperator::elementFunction(const bool isInput, - const int index) { - return fullFieldFunctionName(isInput, - args.getOpField(isInput, index), - args.getQfField(isInput, index)); - } + ss << " // Applying weight (" << xputName(isInput) << ": " << index << ")" << std::endl + << " " << elementFunction(isInput, index) << "(" << std::endl + << " " << weights << ',' << std::endl + << " " << output << std::endl + << " );" << std::endl + << std::endl; +} - std::string CpuOperator::fieldFunctionName(const QFunctionField &qfField) { - switch (qfField.evalMode) { - case CEED_EVAL_INTERP: - return "interp"; - case CEED_EVAL_GRAD: - return "grad"; - case CEED_EVAL_WEIGHT: - return "weight"; - default: - return "none"; - } - } +void CpuOperator::addCopySource(std::stringstream &ss, const bool isInput, const int index) { + const QFunctionField &qfField = args.getQfField(isInput, index); + const std::string size = std::to_string(qfField.size); + + std::string input, output; + if (isInput) { + input += dofInputVar(index) + "[q + (OCCA_Q * (field + element * " + size + "))]"; + output += indexedVar("quadInput", index) + "[q + field * OCCA_Q]"; + } else { + input = indexedVar("quadOutput", index) + "[q + field * OCCA_Q]"; + output = dofOutputVar(index) + "[q + (OCCA_Q * (field + element * " + size + "))]"; + } - std::string CpuOperator::fullFieldFunctionName(const bool isInput, - const OperatorField &opField, - const QFunctionField &qfField) { - // Output: - // - tensor_1d_interpElement_Q2_P2 - // - simplex_1d_interpElementTranspose_Q2_P2 - - const bool usingTensorBasis = opField.usingTensorBasis(); - std::stringstream ss; - int dim, Q, P; - - if (usingTensorBasis) { - TensorBasis &basis = *((TensorBasis*) opField.basis); - dim = basis.dim; - Q = basis.Q1D; - P = basis.P1D; - ss << "tensor_"; - } 
else { - SimplexBasis &basis = *((SimplexBasis*) opField.basis); - dim = basis.dim; - Q = basis.Q; - P = basis.P; - ss << "simplex_"; - } + ss << " // Copying source directly (" << xputName(isInput) << ": " << index << ")" << std::endl + << " for (int field = 0; field < " << size << "; ++field) {" << std::endl + << " for (int q = 0; q < OCCA_Q; ++q) {" << std::endl + << " " << output << " = " << input << ";" << std::endl + << " }" << std::endl + << " }" << std::endl + << std::endl; +} - ss << dim << "d_" << fieldFunctionName(qfField) << "Element"; +void CpuOperator::addQFunctionApplicationSource(std::stringstream &ss) { + ss << " // Apply qFunction" << std::endl + << " " << qfunction->qFunctionName << "(ctx, OCCA_Q, quadInputs, quadOutputs);" << std::endl + << std::endl; +} - if (!isInput) { - ss << "Transpose"; - } +// ---[ Variables ]----------------- +std::string CpuOperator::elementFunction(const bool isInput, const int index) { + return fullFieldFunctionName(isInput, args.getOpField(isInput, index), args.getQfField(isInput, index)); +} - ss << "_Q" << Q << "_P" << P; +std::string CpuOperator::fieldFunctionName(const QFunctionField &qfField) { + switch (qfField.evalMode) { + case CEED_EVAL_INTERP: + return "interp"; + case CEED_EVAL_GRAD: + return "grad"; + case CEED_EVAL_WEIGHT: + return "weight"; + default: + return "none"; + } +} - return ss.str(); - } +std::string CpuOperator::fullFieldFunctionName(const bool isInput, const OperatorField &opField, const QFunctionField &qfField) { + // Output: + // - tensor_1d_interpElement_Q2_P2 + // - simplex_1d_interpElementTranspose_Q2_P2 + + const bool usingTensorBasis = opField.usingTensorBasis(); + std::stringstream ss; + int dim, Q, P; + + if (usingTensorBasis) { + TensorBasis &basis = *((TensorBasis *)opField.basis); + dim = basis.dim; + Q = basis.Q1D; + P = basis.P1D; + ss << "tensor_"; + } else { + SimplexBasis &basis = *((SimplexBasis *)opField.basis); + dim = basis.dim; + Q = basis.Q; + P = basis.P; + ss << 
"simplex_"; } + + ss << dim << "d_" << fieldFunctionName(qfField) << "Element"; + + if (!isInput) { + ss << "Transpose"; + } + + ss << "_Q" << Q << "_P" << P; + + return ss.str(); } +} // namespace occa +} // namespace ceed diff --git a/backends/occa/ceed-occa-cpu-operator.hpp b/backends/occa/ceed-occa-cpu-operator.hpp index 1da34be0a5..3924cccfcd 100644 --- a/backends/occa/ceed-occa-cpu-operator.hpp +++ b/backends/occa/ceed-occa-cpu-operator.hpp @@ -15,176 +15,118 @@ #include "ceed-occa-vector.hpp" namespace ceed { - namespace occa { - class Basis; - class SimplexBasis; - class TensorBasis; +namespace occa { +class Basis; +class SimplexBasis; +class TensorBasis; - class CpuOperator : public Operator { - private: - typedef std::vector VectorVector; - typedef std::vector BasisVector; +class CpuOperator : public Operator { + private: + typedef std::vector VectorVector; + typedef std::vector BasisVector; - VectorVector dofInputs, dofOutputs; + VectorVector dofInputs, dofOutputs; - public: - CpuOperator(); + public: + CpuOperator(); - ~CpuOperator(); + ~CpuOperator(); - // Setup helper vectors - void setupVectors(); + // Setup helper vectors + void setupVectors(); - void setupVectors(const int fieldCount, - OperatorFieldVector &opFields, - QFunctionFieldVector &qfFields, - VectorVector &vectors); + void setupVectors(const int fieldCount, OperatorFieldVector &opFields, QFunctionFieldVector &qfFields, VectorVector &vectors); - void freeVectors(); + void freeVectors(); - // Restriction operators - void setupInputs(Vector *in); + // Restriction operators + void setupInputs(Vector *in); - void setupOutputs(Vector *out); + void setupOutputs(Vector *out); - void applyQFunction(); + void applyQFunction(); - // Push arguments for a given field - void pushKernelArgs(Vector *vec, - const bool isInput, - const int index); + // Push arguments for a given field + void pushKernelArgs(Vector *vec, const bool isInput, const int index); - void pushTensorBasisKernelArgs(const 
QFunctionField &qfField, - TensorBasis &basis); + void pushTensorBasisKernelArgs(const QFunctionField &qfField, TensorBasis &basis); - void pushSimplexBasisKernelArgs(const QFunctionField &qfField, - SimplexBasis &basis); + void pushSimplexBasisKernelArgs(const QFunctionField &qfField, SimplexBasis &basis); - // Set props for a given field - ::occa::properties getKernelProps(); + // Set props for a given field + ::occa::properties getKernelProps(); - void applyAdd(Vector *in, Vector *out); + void applyAdd(Vector *in, Vector *out); - ::occa::kernel buildApplyAddKernel(); + ::occa::kernel buildApplyAddKernel(); - //---[ Kernel Generation ]------------------ - void addBasisFunctionSource(std::stringstream &ss); + //---[ Kernel Generation ]------------------ + void addBasisFunctionSource(std::stringstream &ss); - void addBasisIfMissingSource(BasisVector &sourceBasis, Basis *basis); + void addBasisIfMissingSource(BasisVector &sourceBasis, Basis *basis); - void addKernelSource(std::stringstream &ss); + void addKernelSource(std::stringstream &ss); - void addKernelArgsSource(std::stringstream &ss); + void addKernelArgsSource(std::stringstream &ss); - void addKernelArgSource(std::stringstream &ss, - const bool isInput, - const int index); + void addKernelArgSource(std::stringstream &ss, const bool isInput, const int index); - void addTensorKernelArgSource(std::stringstream &ss, - const bool isInput, - const int index, - const OperatorField &opField, - const QFunctionField &qfField, - std::stringstream &dimAttribute); + void addTensorKernelArgSource(std::stringstream &ss, const bool isInput, const int index, const OperatorField &opField, + const QFunctionField &qfField, std::stringstream &dimAttribute); - void addSimplexKernelArgSource(std::stringstream &ss, - const bool isInput, - const int index, - const OperatorField &opField, - const QFunctionField &qfField, - std::stringstream &dimAttribute); + void addSimplexKernelArgSource(std::stringstream &ss, const bool isInput, 
const int index, const OperatorField &opField, + const QFunctionField &qfField, std::stringstream &dimAttribute); - void addQuadArraySource(std::stringstream &ss); + void addQuadArraySource(std::stringstream &ss); - void addSingleQfunctionQuadArraySource(std::stringstream &ss, - const bool isInput, - const int index, - const std::string &name); + void addSingleQfunctionQuadArraySource(std::stringstream &ss, const bool isInput, const int index, const std::string &name); - void addQfunctionQuadArraySource(std::stringstream &ss, - const bool isInput, - const int count, - const std::string &name); + void addQfunctionQuadArraySource(std::stringstream &ss, const bool isInput, const int count, const std::string &name); - void addInputSetupSource(std::stringstream &ss); + void addInputSetupSource(std::stringstream &ss); - void addOutputSetupSource(std::stringstream &ss); + void addOutputSetupSource(std::stringstream &ss); - void addBasisApplySource(std::stringstream &ss, - const bool isInput, - const int count); + void addBasisApplySource(std::stringstream &ss, const bool isInput, const int count); - void addInterpSource(std::stringstream &ss, - const bool isInput, - const int index); + void addInterpSource(std::stringstream &ss, const bool isInput, const int index); - void addGradTensorSource(std::stringstream &ss, - const bool isInput, - const int index); + void addGradTensorSource(std::stringstream &ss, const bool isInput, const int index); - void addGradSimplexSource(std::stringstream &ss, - const bool isInput, - const int index); + void addGradSimplexSource(std::stringstream &ss, const bool isInput, const int index); - void addWeightSource(std::stringstream &ss, - const bool isInput, - const int index); + void addWeightSource(std::stringstream &ss, const bool isInput, const int index); - void addCopySource(std::stringstream &ss, - const bool isInput, - const int index); + void addCopySource(std::stringstream &ss, const bool isInput, const int index); - void 
addQFunctionApplicationSource(std::stringstream &ss); + void addQFunctionApplicationSource(std::stringstream &ss); - // ---[ Variables ]--------------- - inline std::string xputName(const bool isInput) { - return isInput ? "input" : "output"; - } + // ---[ Variables ]--------------- + inline std::string xputName(const bool isInput) { return isInput ? "input" : "output"; } - inline std::string indexedVar(const std::string &name, - const int index) { - return name + std::to_string(index); - } + inline std::string indexedVar(const std::string &name, const int index) { return name + std::to_string(index); } - inline std::string indexedVar(const std::string &name, - const bool isInput, - const int index) { - return (isInput ? "input" : "output") + std::to_string(index) + "_" + name; - } + inline std::string indexedVar(const std::string &name, const bool isInput, const int index) { + return (isInput ? "input" : "output") + std::to_string(index) + "_" + name; + } - inline std::string dofInputVar(const int index) { - return indexedVar("dofInput", index); - } + inline std::string dofInputVar(const int index) { return indexedVar("dofInput", index); } - inline std::string dofOutputVar(const int index) { - return indexedVar("dofOutput", index); - } + inline std::string dofOutputVar(const int index) { return indexedVar("dofOutput", index); } - inline std::string interpVar(const bool isInput, - const int index) { - return indexedVar("B", isInput, index); - } + inline std::string interpVar(const bool isInput, const int index) { return indexedVar("B", isInput, index); } - inline std::string gradVar(const bool isInput, - const int index) { - return indexedVar("Bx", isInput, index); - } + inline std::string gradVar(const bool isInput, const int index) { return indexedVar("Bx", isInput, index); } - inline std::string qWeightVar(const bool isInput, - const int index) { - return indexedVar("qWeights", isInput, index); - } - - std::string elementFunction(const bool isInput, - const int 
index); - - std::string fieldFunctionName(const QFunctionField &qfField); - - std::string fullFieldFunctionName(const bool isInput, - const OperatorField &opField, - const QFunctionField &qfField); - }; - } -} + inline std::string qWeightVar(const bool isInput, const int index) { return indexedVar("qWeights", isInput, index); } + + std::string elementFunction(const bool isInput, const int index); + + std::string fieldFunctionName(const QFunctionField &qfField); + + std::string fullFieldFunctionName(const bool isInput, const OperatorField &opField, const QFunctionField &qfField); +}; +} // namespace occa +} // namespace ceed #endif diff --git a/backends/occa/ceed-occa-elem-restriction.cpp b/backends/occa/ceed-occa-elem-restriction.cpp index a9d844091c..11983a8a72 100644 --- a/backends/occa/ceed-occa-elem-restriction.cpp +++ b/backends/occa/ceed-occa-elem-restriction.cpp @@ -5,426 +5,363 @@ // // This file is part of CEED: http://github.com/ceed -#include +#include "./ceed-occa-elem-restriction.hpp" + #include +#include -#include "./ceed-occa-elem-restriction.hpp" #include "./ceed-occa-kernels.hpp" #include "./ceed-occa-vector.hpp" namespace ceed { - namespace occa { - ElemRestriction::ElemRestriction() : - ceedElementCount(0), - ceedElementSize(0), - ceedComponentCount(0), - ceedLVectorSize(0), - ceedNodeStride(0), - ceedComponentStride(0), - ceedElementStride(0), - ceedUnstridedComponentStride(0), - freeHostIndices(true), - hostIndices(NULL), - freeIndices(true) {} - - ElemRestriction::~ElemRestriction() { - if (freeHostIndices) { - CeedFree(&hostIndices); - } - if (freeIndices) { - indices.free(); - } - } +namespace occa { +ElemRestriction::ElemRestriction() + : ceedElementCount(0), + ceedElementSize(0), + ceedComponentCount(0), + ceedLVectorSize(0), + ceedNodeStride(0), + ceedComponentStride(0), + ceedElementStride(0), + ceedUnstridedComponentStride(0), + freeHostIndices(true), + hostIndices(NULL), + freeIndices(true) {} + +ElemRestriction::~ElemRestriction() { + 
if (freeHostIndices) { + CeedFree(&hostIndices); + } + if (freeIndices) { + indices.free(); + } +} - void ElemRestriction::setup(CeedMemType memType, - CeedCopyMode copyMode, - const CeedInt *indicesInput) { - if (memType == CEED_MEM_HOST) { - setupFromHostMemory(copyMode, indicesInput); - } else { - setupFromDeviceMemory(copyMode, indicesInput); - } +void ElemRestriction::setup(CeedMemType memType, CeedCopyMode copyMode, const CeedInt *indicesInput) { + if (memType == CEED_MEM_HOST) { + setupFromHostMemory(copyMode, indicesInput); + } else { + setupFromDeviceMemory(copyMode, indicesInput); + } - setupTransposeIndices(); - } + setupTransposeIndices(); +} - void ElemRestriction::setupFromHostMemory(CeedCopyMode copyMode, - const CeedInt *indices_h) { - const CeedInt entries = ceedElementCount * ceedElementSize; +void ElemRestriction::setupFromHostMemory(CeedCopyMode copyMode, const CeedInt *indices_h) { + const CeedInt entries = ceedElementCount * ceedElementSize; - freeHostIndices = (copyMode == CEED_OWN_POINTER || copyMode == CEED_COPY_VALUES); + freeHostIndices = (copyMode == CEED_OWN_POINTER || copyMode == CEED_COPY_VALUES); - if (copyMode != CEED_COPY_VALUES) { - hostIndices = const_cast(indices_h); - } else { - const size_t bytes = entries * sizeof(CeedInt); - hostIndices = (CeedInt*) ::malloc(bytes); - std::memcpy(hostIndices, indices_h, bytes); - } + if (copyMode != CEED_COPY_VALUES) { + hostIndices = const_cast(indices_h); + } else { + const size_t bytes = entries * sizeof(CeedInt); + hostIndices = (CeedInt *)::malloc(bytes); + std::memcpy(hostIndices, indices_h, bytes); + } - if (hostIndices) { - indices = getDevice().malloc(entries, hostIndices); - } - } + if (hostIndices) { + indices = getDevice().malloc(entries, hostIndices); + } +} - void ElemRestriction::setupFromDeviceMemory(CeedCopyMode copyMode, - const CeedInt *indices_d) { - ::occa::memory deviceIndices = arrayToMemory(indices_d); +void ElemRestriction::setupFromDeviceMemory(CeedCopyMode 
copyMode, const CeedInt *indices_d) { + ::occa::memory deviceIndices = arrayToMemory(indices_d); - freeIndices = (copyMode == CEED_OWN_POINTER); + freeIndices = (copyMode == CEED_OWN_POINTER); - if (copyMode == CEED_COPY_VALUES) { - indices = deviceIndices.clone(); - } else { - indices = deviceIndices; - } - } + if (copyMode == CEED_COPY_VALUES) { + indices = deviceIndices.clone(); + } else { + indices = deviceIndices; + } +} - bool ElemRestriction::usesIndices() { - return indices.isInitialized(); - } +bool ElemRestriction::usesIndices() { return indices.isInitialized(); } - void ElemRestriction::setupTransposeIndices() { - if (!usesIndices() || transposeQuadIndices.isInitialized()) { - return; - } - - const CeedInt elementEntryCount = ceedElementCount * ceedElementSize; - - bool *indexIsUsed = new bool[ceedLVectorSize]; - std::memset(indexIsUsed, 0, ceedLVectorSize * sizeof(bool)); - - for (CeedInt i = 0; i < elementEntryCount; ++i) { - indexIsUsed[hostIndices[i]] = true; - } - - CeedInt nodeCount = 0; - for (CeedInt i = 0; i < ceedLVectorSize; ++i) { - nodeCount += indexIsUsed[i]; - } - - const CeedInt dofOffsetCount = nodeCount + 1; - CeedInt *quadIndexToDofOffset = new CeedInt[ceedLVectorSize]; - CeedInt *transposeQuadIndices_h = new CeedInt[nodeCount]; - CeedInt *transposeDofOffsets_h = new CeedInt[dofOffsetCount]; - CeedInt *transposeDofIndices_h = new CeedInt[elementEntryCount]; - - std::memset(transposeDofOffsets_h, 0, dofOffsetCount * sizeof(CeedInt)); - - // Compute ids - CeedInt offsetId = 0; - for (CeedInt i = 0; i < ceedLVectorSize; ++i) { - if (indexIsUsed[i]) { - transposeQuadIndices_h[offsetId] = i; - quadIndexToDofOffset[i] = offsetId++; - } - } - - // Count how many times a specific quad node is used - for (CeedInt i = 0; i < elementEntryCount; ++i) { - ++transposeDofOffsets_h[ - quadIndexToDofOffset[hostIndices[i]] + 1 - ]; - } - - // Aggregate to find true offsets - for (CeedInt i = 1; i < dofOffsetCount; ++i) { - transposeDofOffsets_h[i] += 
transposeDofOffsets_h[i - 1]; - } - - // Compute dof indices - for (CeedInt i = 0; i < elementEntryCount; ++i) { - const CeedInt quadIndex = hostIndices[i]; - const CeedInt dofIndex = transposeDofOffsets_h[ - quadIndexToDofOffset[quadIndex] - ]++; - transposeDofIndices_h[dofIndex] = i; - } - - // Reset offsets - for (int i = dofOffsetCount - 1; i > 0; --i) { - transposeDofOffsets_h[i] = transposeDofOffsets_h[i - 1]; - } - transposeDofOffsets_h[0] = 0; - - // Copy to device - ::occa::device device = getDevice(); - - transposeQuadIndices = device.malloc(nodeCount, - transposeQuadIndices_h); - transposeDofOffsets = device.malloc(dofOffsetCount, - transposeDofOffsets_h); - transposeDofIndices = device.malloc(elementEntryCount, - transposeDofIndices_h); - - // Clean up temporary arrays - delete [] indexIsUsed; - delete [] quadIndexToDofOffset; - delete [] transposeQuadIndices_h; - delete [] transposeDofOffsets_h; - delete [] transposeDofIndices_h; - } +void ElemRestriction::setupTransposeIndices() { + if (!usesIndices() || transposeQuadIndices.isInitialized()) { + return; + } - void ElemRestriction::setKernelProperties() { - kernelProperties["defines/CeedInt"] = ::occa::dtype::get().name(); - kernelProperties["defines/CeedScalar"] = ::occa::dtype::get().name(); - kernelProperties["defines/COMPONENT_COUNT"] = ceedComponentCount; - kernelProperties["defines/ELEMENT_SIZE"] = ceedElementSize; - kernelProperties["defines/TILE_SIZE"] = 64; - kernelProperties["defines/USES_INDICES"] = usesIndices(); - kernelProperties["defines/USER_STRIDES"] = StrideType::USER_STRIDES; - kernelProperties["defines/NOT_STRIDED"] = StrideType::NOT_STRIDED; - kernelProperties["defines/BACKEND_STRIDES"] = StrideType::BACKEND_STRIDES; - kernelProperties["defines/STRIDE_TYPE"] = ceedStrideType; - kernelProperties["defines/NODE_COUNT"] = transposeQuadIndices.length(); - kernelProperties["defines/NODE_STRIDE"] = ceedNodeStride; - kernelProperties["defines/COMPONENT_STRIDE"] = ceedComponentStride; - 
kernelProperties["defines/ELEMENT_STRIDE"] = ceedElementStride; - kernelProperties["defines/UNSTRIDED_COMPONENT_STRIDE"] = ceedUnstridedComponentStride; - } + const CeedInt elementEntryCount = ceedElementCount * ceedElementSize; - ElemRestriction* ElemRestriction::getElemRestriction(CeedElemRestriction r, - const bool assertValid) { - if (!r || r == CEED_ELEMRESTRICTION_NONE) { - return NULL; - } + bool *indexIsUsed = new bool[ceedLVectorSize]; + std::memset(indexIsUsed, 0, ceedLVectorSize * sizeof(bool)); - int ierr; - ElemRestriction *elemRestriction = NULL; + for (CeedInt i = 0; i < elementEntryCount; ++i) { + indexIsUsed[hostIndices[i]] = true; + } + + CeedInt nodeCount = 0; + for (CeedInt i = 0; i < ceedLVectorSize; ++i) { + nodeCount += indexIsUsed[i]; + } - ierr = CeedElemRestrictionGetData(r, (void**) &elemRestriction); - if (assertValid) { - CeedOccaFromChk(ierr); - } + const CeedInt dofOffsetCount = nodeCount + 1; + CeedInt *quadIndexToDofOffset = new CeedInt[ceedLVectorSize]; + CeedInt *transposeQuadIndices_h = new CeedInt[nodeCount]; + CeedInt *transposeDofOffsets_h = new CeedInt[dofOffsetCount]; + CeedInt *transposeDofIndices_h = new CeedInt[elementEntryCount]; - return elemRestriction; + std::memset(transposeDofOffsets_h, 0, dofOffsetCount * sizeof(CeedInt)); + + // Compute ids + CeedInt offsetId = 0; + for (CeedInt i = 0; i < ceedLVectorSize; ++i) { + if (indexIsUsed[i]) { + transposeQuadIndices_h[offsetId] = i; + quadIndexToDofOffset[i] = offsetId++; } + } - ElemRestriction* ElemRestriction::from(CeedElemRestriction r) { - ElemRestriction *elemRestriction = getElemRestriction(r); - if (!elemRestriction) { - return NULL; - } + // Count how many times a specific quad node is used + for (CeedInt i = 0; i < elementEntryCount; ++i) { + ++transposeDofOffsets_h[quadIndexToDofOffset[hostIndices[i]] + 1]; + } - int ierr; - ierr = CeedElemRestrictionGetCeed(r, &elemRestriction->ceed); - CeedOccaFromChk(ierr); + // Aggregate to find true offsets + for (CeedInt 
i = 1; i < dofOffsetCount; ++i) { + transposeDofOffsets_h[i] += transposeDofOffsets_h[i - 1]; + } - return elemRestriction->setupFrom(r); - } + // Compute dof indices + for (CeedInt i = 0; i < elementEntryCount; ++i) { + const CeedInt quadIndex = hostIndices[i]; + const CeedInt dofIndex = transposeDofOffsets_h[quadIndexToDofOffset[quadIndex]]++; + transposeDofIndices_h[dofIndex] = i; + } - ElemRestriction* ElemRestriction::from(CeedOperatorField operatorField) { - int ierr; - CeedElemRestriction ceedElemRestriction; + // Reset offsets + for (int i = dofOffsetCount - 1; i > 0; --i) { + transposeDofOffsets_h[i] = transposeDofOffsets_h[i - 1]; + } + transposeDofOffsets_h[0] = 0; - ierr = CeedOperatorFieldGetElemRestriction(operatorField, &ceedElemRestriction); - CeedOccaFromChk(ierr); + // Copy to device + ::occa::device device = getDevice(); - return from(ceedElemRestriction); - } + transposeQuadIndices = device.malloc(nodeCount, transposeQuadIndices_h); + transposeDofOffsets = device.malloc(dofOffsetCount, transposeDofOffsets_h); + transposeDofIndices = device.malloc(elementEntryCount, transposeDofIndices_h); - ElemRestriction* ElemRestriction::setupFrom(CeedElemRestriction r) { - int ierr; + // Clean up temporary arrays + delete[] indexIsUsed; + delete[] quadIndexToDofOffset; + delete[] transposeQuadIndices_h; + delete[] transposeDofOffsets_h; + delete[] transposeDofIndices_h; +} - ierr = CeedElemRestrictionGetNumElements(r, &ceedElementCount); - CeedOccaFromChk(ierr); +void ElemRestriction::setKernelProperties() { + kernelProperties["defines/CeedInt"] = ::occa::dtype::get().name(); + kernelProperties["defines/CeedScalar"] = ::occa::dtype::get().name(); + kernelProperties["defines/COMPONENT_COUNT"] = ceedComponentCount; + kernelProperties["defines/ELEMENT_SIZE"] = ceedElementSize; + kernelProperties["defines/TILE_SIZE"] = 64; + kernelProperties["defines/USES_INDICES"] = usesIndices(); + kernelProperties["defines/USER_STRIDES"] = StrideType::USER_STRIDES; + 
kernelProperties["defines/NOT_STRIDED"] = StrideType::NOT_STRIDED; + kernelProperties["defines/BACKEND_STRIDES"] = StrideType::BACKEND_STRIDES; + kernelProperties["defines/STRIDE_TYPE"] = ceedStrideType; + kernelProperties["defines/NODE_COUNT"] = transposeQuadIndices.length(); + kernelProperties["defines/NODE_STRIDE"] = ceedNodeStride; + kernelProperties["defines/COMPONENT_STRIDE"] = ceedComponentStride; + kernelProperties["defines/ELEMENT_STRIDE"] = ceedElementStride; + kernelProperties["defines/UNSTRIDED_COMPONENT_STRIDE"] = ceedUnstridedComponentStride; +} - ierr = CeedElemRestrictionGetElementSize(r, &ceedElementSize); - CeedOccaFromChk(ierr); +ElemRestriction *ElemRestriction::getElemRestriction(CeedElemRestriction r, const bool assertValid) { + if (!r || r == CEED_ELEMRESTRICTION_NONE) { + return NULL; + } - ierr = CeedElemRestrictionGetNumComponents(r, &ceedComponentCount); - CeedOccaFromChk(ierr); + int ierr; + ElemRestriction *elemRestriction = NULL; - ierr = CeedElemRestrictionGetLVectorSize(r, &ceedLVectorSize); - CeedOccaFromChk(ierr); + ierr = CeedElemRestrictionGetData(r, (void **)&elemRestriction); + if (assertValid) { + CeedOccaFromChk(ierr); + } - // Find what type of striding the restriction uses - bool isStrided = false; - bool hasBackendStrides = false; + return elemRestriction; +} - ierr = CeedElemRestrictionIsStrided(r, &isStrided); - CeedOccaFromChk(ierr); +ElemRestriction *ElemRestriction::from(CeedElemRestriction r) { + ElemRestriction *elemRestriction = getElemRestriction(r); + if (!elemRestriction) { + return NULL; + } - if (isStrided) { - ierr = CeedElemRestrictionHasBackendStrides(r, &hasBackendStrides); - CeedOccaFromChk(ierr); - } + CeedCallOcca(CeedElemRestrictionGetCeed(r, &elemRestriction->ceed)); - if (isStrided) { - if (hasBackendStrides) { - ceedStrideType = BACKEND_STRIDES; - } else { - ceedStrideType = USER_STRIDES; - } - } else { - ceedStrideType = NOT_STRIDED; - } + return elemRestriction->setupFrom(r); +} - // Default 
strides - ceedNodeStride = 1; - ceedComponentStride = ceedElementSize; - ceedElementStride = ceedElementSize * ceedComponentCount; - ceedUnstridedComponentStride = 1; +ElemRestriction *ElemRestriction::from(CeedOperatorField operatorField) { + CeedElemRestriction ceedElemRestriction; - if (ceedStrideType == USER_STRIDES) { - CeedInt strides[3]; + CeedCallOcca(CeedOperatorFieldGetElemRestriction(operatorField, &ceedElemRestriction)); - ierr = CeedElemRestrictionGetStrides(r, &strides); - CeedOccaFromChk(ierr); + return from(ceedElemRestriction); +} - ceedNodeStride = strides[0]; - ceedComponentStride = strides[1]; - ceedElementStride = strides[2]; +ElemRestriction *ElemRestriction::setupFrom(CeedElemRestriction r) { + CeedCallOcca(CeedElemRestrictionGetNumElements(r, &ceedElementCount)); - } else if (ceedStrideType == NOT_STRIDED) { - ierr = CeedElemRestrictionGetCompStride(r, &ceedUnstridedComponentStride); - CeedOccaFromChk(ierr); - } + CeedCallOcca(CeedElemRestrictionGetElementSize(r, &ceedElementSize)); - return this; - } + CeedCallOcca(CeedElemRestrictionGetNumComponents(r, &ceedComponentCount)); - int ElemRestriction::apply(CeedTransposeMode rTransposeMode, - Vector &u, - Vector &v) { - const bool rIsTransposed = (rTransposeMode != CEED_NOTRANSPOSE); - - // Todo: refactor - if (rIsTransposed) { - if(!restrictionTransposeKernel.isInitialized()) { - setKernelProperties(); - restrictionTransposeKernel = getDevice().buildKernelFromString( - occa_elem_restriction_source, - "applyRestrictionTranspose", - kernelProperties); - } - restrictionTransposeKernel(ceedElementCount, - transposeQuadIndices, - transposeDofOffsets, - transposeDofIndices, - u.getConstKernelArg(), - v.getKernelArg()); - } else { - if(!restrictionKernel.isInitialized()) { - setKernelProperties(); - restrictionKernel = getDevice().buildKernelFromString( - occa_elem_restriction_source, - "applyRestriction", - kernelProperties); - } - restrictionKernel(ceedElementCount, - indices, - 
u.getConstKernelArg(), - v.getKernelArg()); - } - return CEED_ERROR_SUCCESS; - } + CeedCallOcca(CeedElemRestrictionGetLVectorSize(r, &ceedLVectorSize)); - int ElemRestriction::getOffsets(CeedMemType memType, - const CeedInt **offsets) { - switch (memType) { - case CEED_MEM_HOST: { - *offsets = hostIndices; - return CEED_ERROR_SUCCESS; - } - case CEED_MEM_DEVICE: { - *offsets = memoryToArray(indices); - return CEED_ERROR_SUCCESS; - } - } - return ceedError("Unsupported CeedMemType passed to ElemRestriction::getOffsets"); - } + // Find what type of striding the restriction uses + bool isStrided = false; + bool hasBackendStrides = false; - //---[ Ceed Callbacks ]----------- - int ElemRestriction::registerCeedFunction(Ceed ceed, CeedElemRestriction r, - const char *fname, ceed::occa::ceedFunction f) { - return CeedSetBackendFunction(ceed, "ElemRestriction", r, fname, f); - } + CeedCallOcca(CeedElemRestrictionIsStrided(r, &isStrided)); - int ElemRestriction::ceedCreate(CeedMemType memType, - CeedCopyMode copyMode, - const CeedInt *indicesInput, - CeedElemRestriction r) { - int ierr; - Ceed ceed; - ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChk(ierr); - - if ((memType != CEED_MEM_DEVICE) && (memType != CEED_MEM_HOST)) { - return staticCeedError("Only HOST and DEVICE CeedMemType supported"); - } - - ElemRestriction *elemRestriction = new ElemRestriction(); - ierr = CeedElemRestrictionSetData(r, elemRestriction); CeedChk(ierr); - - // Setup Ceed objects before setting up memory - elemRestriction = ElemRestriction::from(r); - elemRestriction->setup(memType, copyMode, indicesInput); - - CeedInt defaultLayout[3] = { - 1, - elemRestriction->ceedElementSize, - elemRestriction->ceedElementSize * elemRestriction->ceedComponentCount - }; - ierr = CeedElemRestrictionSetELayout(r, defaultLayout); CeedChk(ierr); - - CeedOccaRegisterFunction(r, "Apply", ElemRestriction::ceedApply); - CeedOccaRegisterFunction(r, "ApplyBlock", ElemRestriction::ceedApplyBlock); - 
CeedOccaRegisterFunction(r, "GetOffsets", ElemRestriction::ceedGetOffsets); - CeedOccaRegisterFunction(r, "Destroy", ElemRestriction::ceedDestroy); + if (isStrided) { + CeedCallOcca(CeedElemRestrictionHasBackendStrides(r, &hasBackendStrides)); + } - return CEED_ERROR_SUCCESS; + if (isStrided) { + if (hasBackendStrides) { + ceedStrideType = BACKEND_STRIDES; + } else { + ceedStrideType = USER_STRIDES; } + } else { + ceedStrideType = NOT_STRIDED; + } - int ElemRestriction::ceedCreateBlocked(CeedMemType memType, - CeedCopyMode copyMode, - const CeedInt *indicesInput, - CeedElemRestriction r) { - return staticCeedError("(OCCA) Backend does not implement CeedElemRestrictionCreateBlocked"); - } + // Default strides + ceedNodeStride = 1; + ceedComponentStride = ceedElementSize; + ceedElementStride = ceedElementSize * ceedComponentCount; + ceedUnstridedComponentStride = 1; - int ElemRestriction::ceedApply(CeedElemRestriction r, - CeedTransposeMode tmode, - CeedVector u, CeedVector v, - CeedRequest *request) { - ElemRestriction *elemRestriction = ElemRestriction::from(r); - Vector *uVector = Vector::from(u); - Vector *vVector = Vector::from(v); - - if (!elemRestriction) { - return staticCeedError("Incorrect CeedElemRestriction argument: r"); - } - if (!uVector) { - return elemRestriction->ceedError("Incorrect CeedVector argument: u"); - } - if (!vVector) { - return elemRestriction->ceedError("Incorrect CeedVector argument: v"); - } - - return elemRestriction->apply(tmode, *uVector, *vVector); - } + if (ceedStrideType == USER_STRIDES) { + CeedInt strides[3]; - int ElemRestriction::ceedApplyBlock(CeedElemRestriction r, - CeedInt block, CeedTransposeMode tmode, - CeedVector u, CeedVector v, - CeedRequest *request) { - return staticCeedError("(OCCA) Backend does not implement CeedElemRestrictionApplyBlock"); - } + CeedCallOcca(CeedElemRestrictionGetStrides(r, &strides)); + + ceedNodeStride = strides[0]; + ceedComponentStride = strides[1]; + ceedElementStride = strides[2]; + + } 
else if (ceedStrideType == NOT_STRIDED) { + CeedCallOcca(CeedElemRestrictionGetCompStride(r, &ceedUnstridedComponentStride)); + } - int ElemRestriction::ceedGetOffsets(CeedElemRestriction r, - CeedMemType memType, - const CeedInt **offsets) { - ElemRestriction *elemRestriction = ElemRestriction::from(r); + return this; +} - if (!elemRestriction) { - return staticCeedError("Incorrect CeedElemRestriction argument: r"); - } +int ElemRestriction::apply(CeedTransposeMode rTransposeMode, Vector &u, Vector &v) { + const bool rIsTransposed = (rTransposeMode != CEED_NOTRANSPOSE); - return elemRestriction->getOffsets(memType, offsets); + // Todo: refactor + if (rIsTransposed) { + if (!restrictionTransposeKernel.isInitialized()) { + setKernelProperties(); + restrictionTransposeKernel = getDevice().buildKernelFromString(occa_elem_restriction_source, "applyRestrictionTranspose", kernelProperties); + } + restrictionTransposeKernel(ceedElementCount, transposeQuadIndices, transposeDofOffsets, transposeDofIndices, u.getConstKernelArg(), + v.getKernelArg()); + } else { + if (!restrictionKernel.isInitialized()) { + setKernelProperties(); + restrictionKernel = getDevice().buildKernelFromString(occa_elem_restriction_source, "applyRestriction", kernelProperties); } + restrictionKernel(ceedElementCount, indices, u.getConstKernelArg(), v.getKernelArg()); + } + return CEED_ERROR_SUCCESS; +} - int ElemRestriction::ceedDestroy(CeedElemRestriction r) { - delete getElemRestriction(r, false); +int ElemRestriction::getOffsets(CeedMemType memType, const CeedInt **offsets) { + switch (memType) { + case CEED_MEM_HOST: { + *offsets = hostIndices; + return CEED_ERROR_SUCCESS; + } + case CEED_MEM_DEVICE: { + *offsets = memoryToArray(indices); return CEED_ERROR_SUCCESS; } } + return ceedError("Unsupported CeedMemType passed to ElemRestriction::getOffsets"); +} + +//---[ Ceed Callbacks ]----------- +int ElemRestriction::registerCeedFunction(Ceed ceed, CeedElemRestriction r, const char *fname, 
ceed::occa::ceedFunction f) { + return CeedSetBackendFunction(ceed, "ElemRestriction", r, fname, f); +} + +int ElemRestriction::ceedCreate(CeedMemType memType, CeedCopyMode copyMode, const CeedInt *indicesInput, CeedElemRestriction r) { + Ceed ceed; + CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); + + if ((memType != CEED_MEM_DEVICE) && (memType != CEED_MEM_HOST)) { + return staticCeedError("Only HOST and DEVICE CeedMemType supported"); + } + + ElemRestriction *elemRestriction = new ElemRestriction(); + CeedCallBackend(CeedElemRestrictionSetData(r, elemRestriction)); + + // Setup Ceed objects before setting up memory + elemRestriction = ElemRestriction::from(r); + elemRestriction->setup(memType, copyMode, indicesInput); + + CeedInt defaultLayout[3] = {1, elemRestriction->ceedElementSize, elemRestriction->ceedElementSize * elemRestriction->ceedComponentCount}; + CeedChkBackend(CeedElemRestrictionSetELayout(r, defaultLayout)); + + CeedOccaRegisterFunction(r, "Apply", ElemRestriction::ceedApply); + CeedOccaRegisterFunction(r, "ApplyBlock", ElemRestriction::ceedApplyBlock); + CeedOccaRegisterFunction(r, "GetOffsets", ElemRestriction::ceedGetOffsets); + CeedOccaRegisterFunction(r, "Destroy", ElemRestriction::ceedDestroy); + + return CEED_ERROR_SUCCESS; +} + +int ElemRestriction::ceedCreateBlocked(CeedMemType memType, CeedCopyMode copyMode, const CeedInt *indicesInput, CeedElemRestriction r) { + return staticCeedError("(OCCA) Backend does not implement CeedElemRestrictionCreateBlocked"); +} + +int ElemRestriction::ceedApply(CeedElemRestriction r, CeedTransposeMode tmode, CeedVector u, CeedVector v, CeedRequest *request) { + ElemRestriction *elemRestriction = ElemRestriction::from(r); + Vector *uVector = Vector::from(u); + Vector *vVector = Vector::from(v); + + if (!elemRestriction) { + return staticCeedError("Incorrect CeedElemRestriction argument: r"); + } + if (!uVector) { + return elemRestriction->ceedError("Incorrect CeedVector argument: u"); + } + if 
(!vVector) { + return elemRestriction->ceedError("Incorrect CeedVector argument: v"); + } + + return elemRestriction->apply(tmode, *uVector, *vVector); +} + +int ElemRestriction::ceedApplyBlock(CeedElemRestriction r, CeedInt block, CeedTransposeMode tmode, CeedVector u, CeedVector v, CeedRequest *request) { + return staticCeedError("(OCCA) Backend does not implement CeedElemRestrictionApplyBlock"); +} + +int ElemRestriction::ceedGetOffsets(CeedElemRestriction r, CeedMemType memType, const CeedInt **offsets) { + ElemRestriction *elemRestriction = ElemRestriction::from(r); + + if (!elemRestriction) { + return staticCeedError("Incorrect CeedElemRestriction argument: r"); + } + + return elemRestriction->getOffsets(memType, offsets); +} + +int ElemRestriction::ceedDestroy(CeedElemRestriction r) { + delete getElemRestriction(r, false); + return CEED_ERROR_SUCCESS; } +} // namespace occa +} // namespace ceed diff --git a/backends/occa/ceed-occa-elem-restriction.hpp b/backends/occa/ceed-occa-elem-restriction.hpp index f107cd3fa9..958f554d7e 100644 --- a/backends/occa/ceed-occa-elem-restriction.hpp +++ b/backends/occa/ceed-occa-elem-restriction.hpp @@ -12,106 +12,84 @@ #include "ceed-occa-vector.hpp" namespace ceed { - namespace occa { - enum StrideType { - BACKEND_STRIDES = 0, - USER_STRIDES = 1, - NOT_STRIDED = 2, - }; - - class ElemRestriction : public CeedObject { - public: - // Ceed object information - CeedInt ceedElementCount; - CeedInt ceedElementSize; - CeedInt ceedComponentCount; - CeedSize ceedLVectorSize; - StrideType ceedStrideType; - CeedInt ceedNodeStride; - CeedInt ceedComponentStride; - CeedInt ceedElementStride; - CeedInt ceedUnstridedComponentStride; - - // Passed resources - bool freeHostIndices; - CeedInt *hostIndices; - - // Owned resources - bool freeIndices; - ::occa::memory indices; - - ::occa::memory transposeQuadIndices; - ::occa::memory transposeDofOffsets; - ::occa::memory transposeDofIndices; - - ::occa::json kernelProperties; - ::occa::kernel 
restrictionKernel; - ::occa::kernel restrictionTransposeKernel; - - ElemRestriction(); - - ~ElemRestriction(); - - void setup(CeedMemType memType, - CeedCopyMode copyMode, - const CeedInt *indicesInput); - - void setupFromHostMemory(CeedCopyMode copyMode, - const CeedInt *indices_h); - - void setupFromDeviceMemory(CeedCopyMode copyMode, - const CeedInt *indices_d); - - bool usesIndices(); - - void setupTransposeIndices(); - - void setKernelProperties(); - - static ElemRestriction* getElemRestriction(CeedElemRestriction r, - const bool assertValid = true); - - static ElemRestriction* from(CeedElemRestriction r); - static ElemRestriction* from(CeedOperatorField operatorField); - ElemRestriction* setupFrom(CeedElemRestriction r); - - int apply(CeedTransposeMode rTransposeMode, - Vector &u, - Vector &v); - - int getOffsets(CeedMemType memType, - const CeedInt **offsets); - - //---[ Ceed Callbacks ]----------- - static int registerCeedFunction(Ceed ceed, CeedElemRestriction r, - const char *fname, ceed::occa::ceedFunction f); - - static int ceedCreate(CeedMemType memType, - CeedCopyMode copyMode, - const CeedInt *indicesInput, - CeedElemRestriction r); - - static int ceedCreateBlocked(CeedMemType memType, - CeedCopyMode copyMode, - const CeedInt *indicesInput, - CeedElemRestriction r); - - static int ceedApply(CeedElemRestriction r, - CeedTransposeMode tmode, - CeedVector u, CeedVector v, CeedRequest *request); - - static int ceedGetOffsets(CeedElemRestriction r, - CeedMemType memType, - const CeedInt **offsets); - - static int ceedApplyBlock(CeedElemRestriction r, - CeedInt block, CeedTransposeMode tmode, - CeedVector u, CeedVector v, - CeedRequest *request); - - static int ceedDestroy(CeedElemRestriction r); - }; - } -} +namespace occa { +enum StrideType { + BACKEND_STRIDES = 0, + USER_STRIDES = 1, + NOT_STRIDED = 2, +}; + +class ElemRestriction : public CeedObject { + public: + // Ceed object information + CeedInt ceedElementCount; + CeedInt ceedElementSize; + 
CeedInt ceedComponentCount; + CeedSize ceedLVectorSize; + StrideType ceedStrideType; + CeedInt ceedNodeStride; + CeedInt ceedComponentStride; + CeedInt ceedElementStride; + CeedInt ceedUnstridedComponentStride; + + // Passed resources + bool freeHostIndices; + CeedInt *hostIndices; + + // Owned resources + bool freeIndices; + ::occa::memory indices; + + ::occa::memory transposeQuadIndices; + ::occa::memory transposeDofOffsets; + ::occa::memory transposeDofIndices; + + ::occa::json kernelProperties; + ::occa::kernel restrictionKernel; + ::occa::kernel restrictionTransposeKernel; + + ElemRestriction(); + + ~ElemRestriction(); + + void setup(CeedMemType memType, CeedCopyMode copyMode, const CeedInt *indicesInput); + + void setupFromHostMemory(CeedCopyMode copyMode, const CeedInt *indices_h); + + void setupFromDeviceMemory(CeedCopyMode copyMode, const CeedInt *indices_d); + + bool usesIndices(); + + void setupTransposeIndices(); + + void setKernelProperties(); + + static ElemRestriction *getElemRestriction(CeedElemRestriction r, const bool assertValid = true); + + static ElemRestriction *from(CeedElemRestriction r); + static ElemRestriction *from(CeedOperatorField operatorField); + ElemRestriction *setupFrom(CeedElemRestriction r); + + int apply(CeedTransposeMode rTransposeMode, Vector &u, Vector &v); + + int getOffsets(CeedMemType memType, const CeedInt **offsets); + + //---[ Ceed Callbacks ]----------- + static int registerCeedFunction(Ceed ceed, CeedElemRestriction r, const char *fname, ceed::occa::ceedFunction f); + + static int ceedCreate(CeedMemType memType, CeedCopyMode copyMode, const CeedInt *indicesInput, CeedElemRestriction r); + + static int ceedCreateBlocked(CeedMemType memType, CeedCopyMode copyMode, const CeedInt *indicesInput, CeedElemRestriction r); + + static int ceedApply(CeedElemRestriction r, CeedTransposeMode tmode, CeedVector u, CeedVector v, CeedRequest *request); + + static int ceedGetOffsets(CeedElemRestriction r, CeedMemType memType, const 
CeedInt **offsets); + + static int ceedApplyBlock(CeedElemRestriction r, CeedInt block, CeedTransposeMode tmode, CeedVector u, CeedVector v, CeedRequest *request); + + static int ceedDestroy(CeedElemRestriction r); +}; +} // namespace occa +} // namespace ceed #endif diff --git a/backends/occa/ceed-occa-gpu-operator.cpp b/backends/occa/ceed-occa-gpu-operator.cpp index 4a2c96b282..f502ed0415 100644 --- a/backends/occa/ceed-occa-gpu-operator.cpp +++ b/backends/occa/ceed-occa-gpu-operator.cpp @@ -6,20 +6,19 @@ // This file is part of CEED: http://github.com/ceed #include "ceed-occa-gpu-operator.hpp" + #include "ceed-occa-qfunction.hpp" namespace ceed { - namespace occa { - GpuOperator::GpuOperator() {} +namespace occa { +GpuOperator::GpuOperator() {} - GpuOperator::~GpuOperator() {} +GpuOperator::~GpuOperator() {} - ::occa::kernel GpuOperator::buildApplyAddKernel() { - return ::occa::kernel(); - } +::occa::kernel GpuOperator::buildApplyAddKernel() { return ::occa::kernel(); } - void GpuOperator::applyAdd(Vector *in, Vector *out) { - // TODO: Implement - } - } +void GpuOperator::applyAdd(Vector *in, Vector *out) { + // TODO: Implement } +} // namespace occa +} // namespace ceed diff --git a/backends/occa/ceed-occa-gpu-operator.hpp b/backends/occa/ceed-occa-gpu-operator.hpp index bcb95f231e..a6f3952b3d 100644 --- a/backends/occa/ceed-occa-gpu-operator.hpp +++ b/backends/occa/ceed-occa-gpu-operator.hpp @@ -13,18 +13,18 @@ #include "ceed-occa-operator.hpp" namespace ceed { - namespace occa { - class GpuOperator : public Operator { - public: - GpuOperator(); +namespace occa { +class GpuOperator : public Operator { + public: + GpuOperator(); - ~GpuOperator(); + ~GpuOperator(); - ::occa::kernel buildApplyAddKernel(); + ::occa::kernel buildApplyAddKernel(); - void applyAdd(Vector *in, Vector *out); - }; - } -} + void applyAdd(Vector *in, Vector *out); +}; +} // namespace occa +} // namespace ceed #endif diff --git a/backends/occa/ceed-occa-operator-args.cpp 
b/backends/occa/ceed-occa-operator-args.cpp index 4a01853209..b36e4cf770 100644 --- a/backends/occa/ceed-occa-operator-args.cpp +++ b/backends/occa/ceed-occa-operator-args.cpp @@ -8,56 +8,41 @@ #include "ceed-occa-operator-args.hpp" namespace ceed { - namespace occa { - OperatorArgs::OperatorArgs() : - QFunctionArgs() {} - - OperatorArgs::OperatorArgs(CeedOperator op) : - QFunctionArgs() { - - setupArgs(op); - } - - void OperatorArgs::setupArgs(CeedOperator op) { - CeedQFunction qf; - CeedOperatorField *ceedInputFields, *ceedOutputFields; - int ierr = 0; - - ierr = CeedOperatorGetQFunction(op, &qf); - CeedOccaValidChk(_isValid, ierr); - setupQFunctionArgs(qf); - - if (!_isValid) { - return; - } - - ierr = CeedOperatorGetFields(op, NULL, &ceedInputFields, NULL, &ceedOutputFields); - CeedOccaValidChk(_isValid, ierr); - - for (int i = 0; i < _inputCount; ++i) { - OperatorField field = OperatorField(ceedInputFields[i]); - opInputs.push_back(field); - _isValid &= field.isValid(); - } - - for (int i = 0; i < _outputCount; ++i) { - OperatorField field = OperatorField(ceedOutputFields[i]); - opOutputs.push_back(field); - _isValid &= field.isValid(); - } - } - - const OperatorField& OperatorArgs::getOpField(const bool isInput, - const int index) const { - return isInput ? 
opInputs[index] : opOutputs[index]; - } - - const OperatorField& OperatorArgs::getOpInput(const int index) const { - return opInputs[index]; - } - - const OperatorField& OperatorArgs::getOpOutput(const int index) const { - return opOutputs[index]; - } +namespace occa { +OperatorArgs::OperatorArgs() : QFunctionArgs() {} + +OperatorArgs::OperatorArgs(CeedOperator op) : QFunctionArgs() { setupArgs(op); } + +void OperatorArgs::setupArgs(CeedOperator op) { + CeedQFunction qf; + CeedOperatorField *ceedInputFields, *ceedOutputFields; + + CeedCallOccaValid(_isValid, CeedOperatorGetQFunction(op, &qf)); + setupQFunctionArgs(qf); + + if (!_isValid) { + return; + } + + CeedCallOccaValid(_isValid, CeedOperatorGetFields(op, NULL, &ceedInputFields, NULL, &ceedOutputFields)); + + for (int i = 0; i < _inputCount; ++i) { + OperatorField field = OperatorField(ceedInputFields[i]); + opInputs.push_back(field); + _isValid &= field.isValid(); + } + + for (int i = 0; i < _outputCount; ++i) { + OperatorField field = OperatorField(ceedOutputFields[i]); + opOutputs.push_back(field); + _isValid &= field.isValid(); } } + +const OperatorField& OperatorArgs::getOpField(const bool isInput, const int index) const { return isInput ? 
opInputs[index] : opOutputs[index]; } + +const OperatorField& OperatorArgs::getOpInput(const int index) const { return opInputs[index]; } + +const OperatorField& OperatorArgs::getOpOutput(const int index) const { return opOutputs[index]; } +} // namespace occa +} // namespace ceed diff --git a/backends/occa/ceed-occa-operator-args.hpp b/backends/occa/ceed-occa-operator-args.hpp index 0190a7266f..78a33bbc04 100644 --- a/backends/occa/ceed-occa-operator-args.hpp +++ b/backends/occa/ceed-occa-operator-args.hpp @@ -11,31 +11,30 @@ #include #include "ceed-occa-ceed-object.hpp" -#include "ceed-occa-qfunction-args.hpp" #include "ceed-occa-operator-field.hpp" +#include "ceed-occa-qfunction-args.hpp" namespace ceed { - namespace occa { - typedef std::vector OperatorFieldVector; +namespace occa { +typedef std::vector OperatorFieldVector; - class OperatorArgs : public QFunctionArgs { - public: - OperatorFieldVector opInputs; - OperatorFieldVector opOutputs; +class OperatorArgs : public QFunctionArgs { + public: + OperatorFieldVector opInputs; + OperatorFieldVector opOutputs; - OperatorArgs(); - OperatorArgs(CeedOperator op); + OperatorArgs(); + OperatorArgs(CeedOperator op); - void setupArgs(CeedOperator op); + void setupArgs(CeedOperator op); - const OperatorField& getOpField(const bool isInput, - const int index) const; + const OperatorField& getOpField(const bool isInput, const int index) const; - const OperatorField& getOpInput(const int index) const; + const OperatorField& getOpInput(const int index) const; - const OperatorField& getOpOutput(const int index) const; - }; - } -} + const OperatorField& getOpOutput(const int index) const; +}; +} // namespace occa +} // namespace ceed #endif diff --git a/backends/occa/ceed-occa-operator-field.cpp b/backends/occa/ceed-occa-operator-field.cpp index 95a12edc20..f6ee104bda 100644 --- a/backends/occa/ceed-occa-operator-field.cpp +++ b/backends/occa/ceed-occa-operator-field.cpp @@ -6,109 +6,56 @@ // This file is part of CEED: 
http://github.com/ceed #include "ceed-occa-operator-field.hpp" + #include "ceed-occa-basis.hpp" #include "ceed-occa-elem-restriction.hpp" #include "ceed-occa-vector.hpp" namespace ceed { - namespace occa { - OperatorField::OperatorField(CeedOperatorField opField) : - _isValid(false), - _usesActiveVector(false), - vec(NULL), - basis(NULL), - elemRestriction(NULL) { - - CeedBasis ceedBasis; - CeedVector ceedVector; - CeedElemRestriction ceedElemRestriction; - int ierr = 0; - - ierr = CeedOperatorFieldGetBasis(opField, &ceedBasis); - CeedOccaValidChk(_isValid, ierr); - - ierr = CeedOperatorFieldGetVector(opField, &ceedVector); - CeedOccaValidChk(_isValid, ierr); - - ierr = CeedOperatorFieldGetElemRestriction(opField, &ceedElemRestriction); - CeedOccaValidChk(_isValid, ierr); - - _isValid = true; - _usesActiveVector = ceedVector == CEED_VECTOR_ACTIVE; - - vec = Vector::from(ceedVector); - basis = Basis::from(ceedBasis); - elemRestriction = ElemRestriction::from(ceedElemRestriction); - } - - bool OperatorField::isValid() const { - return _isValid; - } - - //---[ Vector Info ]---------------- - bool OperatorField::usesActiveVector() const { - return _usesActiveVector; - } - //================================== - - //---[ Basis Info ]----------------- - bool OperatorField::hasBasis() const { - return basis; - } - - int OperatorField::usingTensorBasis() const { - return basis->isTensorBasis(); - } - - int OperatorField::getComponentCount() const { - return ( - basis - ? basis->ceedComponentCount - : 1 - ); - } - - int OperatorField::getP() const { - return ( - basis - ? basis->P - : 0 - ); - } - - int OperatorField::getQ() const { - return ( - basis - ? basis->Q - : 0 - ); - } - - int OperatorField::getDim() const { - return ( - basis - ? basis->dim - : 1 - ); - } - //================================== - - //---[ ElemRestriction Info ]------- - int OperatorField::getElementCount() const { - return ( - elemRestriction - ? 
elemRestriction->ceedElementCount - : 1 - ); - } - - int OperatorField::getElementSize() const { - return ( - elemRestriction - ? elemRestriction->ceedElementSize - : 1 - ); - } - //================================== - } +namespace occa { +OperatorField::OperatorField(CeedOperatorField opField) : _isValid(false), _usesActiveVector(false), vec(NULL), basis(NULL), elemRestriction(NULL) { + CeedBasis ceedBasis; + CeedVector ceedVector; + CeedElemRestriction ceedElemRestriction; + + CeedCallOccaValid(_isValid, CeedOperatorFieldGetBasis(opField, &ceedBasis)); + + CeedCallOccaValid(_isValid, CeedOperatorFieldGetVector(opField, &ceedVector)); + + CeedCallOccaValid(_isValid, CeedOperatorFieldGetElemRestriction(opField, &ceedElemRestriction)); + + _isValid = true; + _usesActiveVector = ceedVector == CEED_VECTOR_ACTIVE; + + vec = Vector::from(ceedVector); + basis = Basis::from(ceedBasis); + elemRestriction = ElemRestriction::from(ceedElemRestriction); } + +bool OperatorField::isValid() const { return _isValid; } + +//---[ Vector Info ]---------------- +bool OperatorField::usesActiveVector() const { return _usesActiveVector; } +//================================== + +//---[ Basis Info ]----------------- +bool OperatorField::hasBasis() const { return basis; } + +int OperatorField::usingTensorBasis() const { return basis->isTensorBasis(); } + +int OperatorField::getComponentCount() const { return (basis ? basis->ceedComponentCount : 1); } + +int OperatorField::getP() const { return (basis ? basis->P : 0); } + +int OperatorField::getQ() const { return (basis ? basis->Q : 0); } + +int OperatorField::getDim() const { return (basis ? basis->dim : 1); } +//================================== + +//---[ ElemRestriction Info ]------- +int OperatorField::getElementCount() const { return (elemRestriction ? elemRestriction->ceedElementCount : 1); } + +int OperatorField::getElementSize() const { return (elemRestriction ? 
elemRestriction->ceedElementSize : 1); } +//================================== +} // namespace occa +} // namespace ceed diff --git a/backends/occa/ceed-occa-operator-field.hpp b/backends/occa/ceed-occa-operator-field.hpp index 478e59f020..22b9029a5a 100644 --- a/backends/occa/ceed-occa-operator-field.hpp +++ b/backends/occa/ceed-occa-operator-field.hpp @@ -11,45 +11,45 @@ #include "ceed-occa-context.hpp" namespace ceed { - namespace occa { - class Basis; - class ElemRestriction; - class Vector; - - class OperatorField { - private: - bool _isValid; - bool _usesActiveVector; - - public: - Vector *vec; - Basis *basis; - ElemRestriction *elemRestriction; - - OperatorField(CeedOperatorField opField); - - bool isValid() const; - - //---[ Vector Info ]-------------- - bool usesActiveVector() const; - //================================ - - //---[ Basis Info ]--------------- - bool hasBasis() const; - int usingTensorBasis() const; - - int getComponentCount() const; - int getP() const; - int getQ() const; - int getDim() const; - //================================ - - //---[ ElemRestriction Info ]----- - int getElementCount() const; - int getElementSize() const; - //================================ - }; - } -} +namespace occa { +class Basis; +class ElemRestriction; +class Vector; + +class OperatorField { + private: + bool _isValid; + bool _usesActiveVector; + + public: + Vector *vec; + Basis *basis; + ElemRestriction *elemRestriction; + + OperatorField(CeedOperatorField opField); + + bool isValid() const; + + //---[ Vector Info ]-------------- + bool usesActiveVector() const; + //================================ + + //---[ Basis Info ]--------------- + bool hasBasis() const; + int usingTensorBasis() const; + + int getComponentCount() const; + int getP() const; + int getQ() const; + int getDim() const; + //================================ + + //---[ ElemRestriction Info ]----- + int getElementCount() const; + int getElementSize() const; + //================================ 
+}; +} // namespace occa +} // namespace ceed #endif diff --git a/backends/occa/ceed-occa-operator.cpp b/backends/occa/ceed-occa-operator.cpp index c14b2a1198..c157debb34 100644 --- a/backends/occa/ceed-occa-operator.cpp +++ b/backends/occa/ceed-occa-operator.cpp @@ -5,169 +5,147 @@ // // This file is part of CEED: http://github.com/ceed -#include "ceed-occa-basis.hpp" -#include "ceed-occa-elem-restriction.hpp" #include "ceed-occa-operator.hpp" + +#include "ceed-occa-basis.hpp" #include "ceed-occa-cpu-operator.hpp" +#include "ceed-occa-elem-restriction.hpp" #include "ceed-occa-gpu-operator.hpp" #include "ceed-occa-qfunction.hpp" namespace ceed { - namespace occa { - Operator::Operator() : - ceedQ(0), - ceedElementCount(0), - qfunction(NULL), - needsInitialSetup(true) {} +namespace occa { +Operator::Operator() : ceedQ(0), ceedElementCount(0), qfunction(NULL), needsInitialSetup(true) {} - Operator::~Operator() {} +Operator::~Operator() {} - Operator* Operator::getOperator(CeedOperator op, - const bool assertValid) { - if (!op) { - return NULL; - } +Operator *Operator::getOperator(CeedOperator op, const bool assertValid) { + if (!op) { + return NULL; + } - int ierr; - Operator *operator_ = NULL; + int ierr; + Operator *operator_ = NULL; - ierr = CeedOperatorGetData(op, (void**) &operator_); - if (assertValid) { - CeedOccaFromChk(ierr); - } + ierr = CeedOperatorGetData(op, (void **)&operator_); + if (assertValid) { + CeedOccaFromChk(ierr); + } - return operator_; - } + return operator_; +} - Operator* Operator::from(CeedOperator op) { - Operator *operator_ = getOperator(op); - if (!operator_) { - return NULL; - } +Operator *Operator::from(CeedOperator op) { + Operator *operator_ = getOperator(op); + if (!operator_) { + return NULL; + } - int ierr; - ierr = CeedOperatorGetCeed(op, &operator_->ceed); CeedOccaFromChk(ierr); + CeedCallOcca(CeedOperatorGetCeed(op, &operator_->ceed)); - operator_->qfunction = QFunction::from(op); - if (!operator_->qfunction) { - return NULL; 
- } + operator_->qfunction = QFunction::from(op); + if (!operator_->qfunction) { + return NULL; + } - ierr = CeedOperatorGetNumQuadraturePoints(op, &operator_->ceedQ); CeedOccaFromChk(ierr); - ierr = CeedOperatorGetNumElements(op, &operator_->ceedElementCount); CeedOccaFromChk(ierr); + CeedCallOcca(CeedOperatorGetNumQuadraturePoints(op, &operator_->ceedQ)); + CeedCallOcca(CeedOperatorGetNumElements(op, &operator_->ceedElementCount)); - operator_->args.setupArgs(op); - if (!operator_->args.isValid()) { - return NULL; - } + operator_->args.setupArgs(op); + if (!operator_->args.isValid()) { + return NULL; + } - return operator_; - } + return operator_; +} - bool Operator::isApplyingIdentityFunction() { - return qfunction->ceedIsIdentity; - } +bool Operator::isApplyingIdentityFunction() { return qfunction->ceedIsIdentity; } - int Operator::applyAdd(Vector *in, Vector *out, CeedRequest *request) { - // TODO: Cache kernel objects rather than relying on OCCA kernel caching - applyAddKernel = buildApplyAddKernel(); +int Operator::applyAdd(Vector *in, Vector *out, CeedRequest *request) { + // TODO: Cache kernel objects rather than relying on OCCA kernel caching + applyAddKernel = buildApplyAddKernel(); - if (needsInitialSetup) { - initialSetup(); - needsInitialSetup = false; - } + if (needsInitialSetup) { + initialSetup(); + needsInitialSetup = false; + } - applyAdd(in, out); + applyAdd(in, out); - return CEED_ERROR_SUCCESS; - } + return CEED_ERROR_SUCCESS; +} - //---[ Virtual Methods ]------------ - void Operator::initialSetup() {} +//---[ Virtual Methods ]------------ +void Operator::initialSetup() {} - //---[ Ceed Callbacks ]------------- - int Operator::registerCeedFunction(Ceed ceed, CeedOperator op, - const char *fname, ceed::occa::ceedFunction f) { - return CeedSetBackendFunction(ceed, "Operator", op, fname, f); - } +//---[ Ceed Callbacks ]------------- +int Operator::registerCeedFunction(Ceed ceed, CeedOperator op, const char *fname, ceed::occa::ceedFunction f) { + 
return CeedSetBackendFunction(ceed, "Operator", op, fname, f); +} - int Operator::ceedCreate(CeedOperator op) { - int ierr; - Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); +int Operator::ceedCreate(CeedOperator op) { + Ceed ceed; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); #if 1 - Operator *operator_ = new CpuOperator(); + Operator *operator_ = new CpuOperator(); #else - // TODO: Add GPU specific operator - Operator *operator_ = ( - Context::from(ceed)->usingCpuDevice() - ? ((Operator*) new CpuOperator()) - : ((Operator*) new GpuOperator()) - ); + // TODO: Add GPU specific operator + Operator *operator_ = (Context::from(ceed)->usingCpuDevice() ? ((Operator *)new CpuOperator()) : ((Operator *)new GpuOperator())); #endif - ierr = CeedOperatorSetData(op, operator_); CeedChk(ierr); + CeedCallBackend(CeedOperatorSetData(op, operator_)); - CeedOccaRegisterFunction(op, "LinearAssembleQFunction", Operator::ceedLinearAssembleQFunction); - CeedOccaRegisterFunction(op, "LinearAssembleQFunctionUpdate", Operator::ceedLinearAssembleQFunction); - CeedOccaRegisterFunction(op, "LinearAssembleAddDiagonal", Operator::ceedLinearAssembleAddDiagonal); - CeedOccaRegisterFunction(op, "LinearAssembleAddPointBlockDiagonal", Operator::ceedLinearAssembleAddPointBlockDiagonal); - CeedOccaRegisterFunction(op, "CreateFDMElementInverse", Operator::ceedCreateFDMElementInverse); - CeedOccaRegisterFunction(op, "ApplyAdd", Operator::ceedApplyAdd); - CeedOccaRegisterFunction(op, "Destroy", Operator::ceedDestroy); + CeedOccaRegisterFunction(op, "LinearAssembleQFunction", Operator::ceedLinearAssembleQFunction); + CeedOccaRegisterFunction(op, "LinearAssembleQFunctionUpdate", Operator::ceedLinearAssembleQFunction); + CeedOccaRegisterFunction(op, "LinearAssembleAddDiagonal", Operator::ceedLinearAssembleAddDiagonal); + CeedOccaRegisterFunction(op, "LinearAssembleAddPointBlockDiagonal", Operator::ceedLinearAssembleAddPointBlockDiagonal); + CeedOccaRegisterFunction(op, 
"CreateFDMElementInverse", Operator::ceedCreateFDMElementInverse); + CeedOccaRegisterFunction(op, "ApplyAdd", Operator::ceedApplyAdd); + CeedOccaRegisterFunction(op, "Destroy", Operator::ceedDestroy); - return CEED_ERROR_SUCCESS; - } - - int Operator::ceedCreateComposite(CeedOperator op) { - int ierr; - Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); + return CEED_ERROR_SUCCESS; +} - CeedOccaRegisterFunction(op, "LinearAssembleAddDiagonal", Operator::ceedLinearAssembleAddDiagonal); - CeedOccaRegisterFunction(op, "LinearAssembleAddPointBlockDiagonal", Operator::ceedLinearAssembleAddPointBlockDiagonal); +int Operator::ceedCreateComposite(CeedOperator op) { + Ceed ceed; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - return CEED_ERROR_SUCCESS; - } + CeedOccaRegisterFunction(op, "LinearAssembleAddDiagonal", Operator::ceedLinearAssembleAddDiagonal); + CeedOccaRegisterFunction(op, "LinearAssembleAddPointBlockDiagonal", Operator::ceedLinearAssembleAddPointBlockDiagonal); - int Operator::ceedLinearAssembleQFunction(CeedOperator op) { - return staticCeedError("(OCCA) Backend does not implement LinearAssembleQFunction"); - } + return CEED_ERROR_SUCCESS; +} - int Operator::ceedLinearAssembleQFunctionUpdate(CeedOperator op) { - return staticCeedError("(OCCA) Backend does not implement LinearAssembleQFunctionUpdate"); - } +int Operator::ceedLinearAssembleQFunction(CeedOperator op) { return staticCeedError("(OCCA) Backend does not implement LinearAssembleQFunction"); } - int Operator::ceedLinearAssembleAddDiagonal(CeedOperator op) { - return staticCeedError("(OCCA) Backend does not implement LinearAssembleDiagonal"); - } +int Operator::ceedLinearAssembleQFunctionUpdate(CeedOperator op) { + return staticCeedError("(OCCA) Backend does not implement LinearAssembleQFunctionUpdate"); +} +int Operator::ceedLinearAssembleAddDiagonal(CeedOperator op) { return staticCeedError("(OCCA) Backend does not implement LinearAssembleDiagonal"); } - int 
Operator::ceedLinearAssembleAddPointBlockDiagonal(CeedOperator op) { - return staticCeedError("(OCCA) Backend does not implement LinearAssemblePointBlockDiagonal"); - } +int Operator::ceedLinearAssembleAddPointBlockDiagonal(CeedOperator op) { + return staticCeedError("(OCCA) Backend does not implement LinearAssemblePointBlockDiagonal"); +} - int Operator::ceedCreateFDMElementInverse(CeedOperator op) { - return staticCeedError("(OCCA) Backend does not implement CreateFDMElementInverse"); - } +int Operator::ceedCreateFDMElementInverse(CeedOperator op) { return staticCeedError("(OCCA) Backend does not implement CreateFDMElementInverse"); } - int Operator::ceedApplyAdd(CeedOperator op, - CeedVector invec, CeedVector outvec, CeedRequest *request) { - Operator *operator_ = Operator::from(op); - Vector *in = Vector::from(invec); - Vector *out = Vector::from(outvec); +int Operator::ceedApplyAdd(CeedOperator op, CeedVector invec, CeedVector outvec, CeedRequest *request) { + Operator *operator_ = Operator::from(op); + Vector *in = Vector::from(invec); + Vector *out = Vector::from(outvec); - if (!operator_) { - return staticCeedError("Incorrect CeedOperator argument: op"); - } + if (!operator_) { + return staticCeedError("Incorrect CeedOperator argument: op"); + } - return operator_->applyAdd(in, out, request); - } + return operator_->applyAdd(in, out, request); +} - int Operator::ceedDestroy(CeedOperator op) { - delete getOperator(op, false); - return CEED_ERROR_SUCCESS; - } - } +int Operator::ceedDestroy(CeedOperator op) { + delete getOperator(op, false); + return CEED_ERROR_SUCCESS; } +} // namespace occa +} // namespace ceed diff --git a/backends/occa/ceed-occa-operator.hpp b/backends/occa/ceed-occa-operator.hpp index 9f3ee3d132..2db09a1bd4 100644 --- a/backends/occa/ceed-occa-operator.hpp +++ b/backends/occa/ceed-occa-operator.hpp @@ -14,64 +14,61 @@ #include "ceed-occa-operator-args.hpp" namespace ceed { - namespace occa { - typedef std::vector VectorVector_t; 
+namespace occa { +typedef std::vector VectorVector_t; - class QFunction; +class QFunction; - class Operator : public CeedObject { - public: - // Ceed object information - CeedInt ceedQ; - CeedInt ceedElementCount; +class Operator : public CeedObject { + public: + // Ceed object information + CeedInt ceedQ; + CeedInt ceedElementCount; - // Owned resources - QFunction *qfunction; - OperatorArgs args; - ::occa::kernel applyAddKernel; - bool needsInitialSetup; + // Owned resources + QFunction *qfunction; + OperatorArgs args; + ::occa::kernel applyAddKernel; + bool needsInitialSetup; - // Reference to other memory - ::occa::memory qFunctionContextData; + // Reference to other memory + ::occa::memory qFunctionContextData; - Operator(); - virtual ~Operator(); + Operator(); + virtual ~Operator(); - static Operator* getOperator(CeedOperator op, - const bool assertValid = true); + static Operator *getOperator(CeedOperator op, const bool assertValid = true); - static Operator* from(CeedOperator op); + static Operator *from(CeedOperator op); - bool isApplyingIdentityFunction(); + bool isApplyingIdentityFunction(); - int applyAdd(Vector *in, Vector *out, CeedRequest *request); + int applyAdd(Vector *in, Vector *out, CeedRequest *request); - //---[ Virtual Methods ]---------- - virtual ::occa::kernel buildApplyAddKernel() = 0; + //---[ Virtual Methods ]---------- + virtual ::occa::kernel buildApplyAddKernel() = 0; - virtual void initialSetup(); + virtual void initialSetup(); - virtual void applyAdd(Vector *in, Vector *out) = 0; + virtual void applyAdd(Vector *in, Vector *out) = 0; - //---[ Ceed Callbacks ]----------- - static int registerCeedFunction(Ceed ceed, CeedOperator op, - const char *fname, ceed::occa::ceedFunction f); + //---[ Ceed Callbacks ]----------- + static int registerCeedFunction(Ceed ceed, CeedOperator op, const char *fname, ceed::occa::ceedFunction f); - static int ceedCreate(CeedOperator op); - static int ceedCreateComposite(CeedOperator op); + static int 
ceedCreate(CeedOperator op); + static int ceedCreateComposite(CeedOperator op); - static int ceedLinearAssembleQFunction(CeedOperator op); - static int ceedLinearAssembleQFunctionUpdate(CeedOperator op); - static int ceedLinearAssembleAddDiagonal(CeedOperator op); - static int ceedLinearAssembleAddPointBlockDiagonal(CeedOperator op); - static int ceedCreateFDMElementInverse(CeedOperator op); + static int ceedLinearAssembleQFunction(CeedOperator op); + static int ceedLinearAssembleQFunctionUpdate(CeedOperator op); + static int ceedLinearAssembleAddDiagonal(CeedOperator op); + static int ceedLinearAssembleAddPointBlockDiagonal(CeedOperator op); + static int ceedCreateFDMElementInverse(CeedOperator op); - static int ceedApplyAdd(CeedOperator op, - CeedVector invec, CeedVector outvec, CeedRequest *request); + static int ceedApplyAdd(CeedOperator op, CeedVector invec, CeedVector outvec, CeedRequest *request); - static int ceedDestroy(CeedOperator op); - }; - } -} + static int ceedDestroy(CeedOperator op); +}; +} // namespace occa +} // namespace ceed #endif diff --git a/backends/occa/ceed-occa-qfunction-args.cpp b/backends/occa/ceed-occa-qfunction-args.cpp index cd403a98ec..5f4acbbc07 100644 --- a/backends/occa/ceed-occa-qfunction-args.cpp +++ b/backends/occa/ceed-occa-qfunction-args.cpp @@ -8,85 +8,53 @@ #include "ceed-occa-qfunction-args.hpp" namespace ceed { - namespace occa { - QFunctionArgs::QFunctionArgs() : - _isValid(false), - _inputCount(0), - _outputCount(0) {} - - QFunctionArgs::QFunctionArgs(CeedQFunction qf) : - _isValid(false), - _inputCount(0), - _outputCount(0) { - - setupQFunctionArgs(qf); - } - - void QFunctionArgs::setupQFunctionArgs(CeedQFunction qf) { - int ierr = 0; - CeedQFunctionField *ceedInputFields, *ceedOutputFields; - - ierr = CeedQFunctionGetCeed(qf, &ceed); - CeedOccaValidChk(_isValid, ierr); - - ierr = CeedQFunctionGetNumArgs(qf, &_inputCount, &_outputCount); - CeedOccaValidChk(_isValid, ierr); - - ierr = CeedQFunctionGetFields(qf, NULL, 
&ceedInputFields, NULL, &ceedOutputFields); - CeedOccaValidChk(_isValid, ierr); - - _isValid = true; - - for (int i = 0; i < _inputCount; ++i) { - QFunctionField field = QFunctionField(ceedInputFields[i]); - qfInputs.push_back(field); - _isValid &= field.isValid(); - } - - for (int i = 0; i < _outputCount; ++i) { - QFunctionField field = QFunctionField(ceedOutputFields[i]); - qfOutputs.push_back(field); - _isValid &= field.isValid(); - } - } - - - bool QFunctionArgs::isValid() const { - return _isValid; - } - - int QFunctionArgs::inputCount() const { - return _inputCount; - } - - int QFunctionArgs::outputCount() const { - return _outputCount; - } - - const QFunctionField& QFunctionArgs::getQfField(const bool isInput, - const int index) const { - return isInput ? qfInputs[index] : qfOutputs[index]; - } - - const QFunctionField& QFunctionArgs::getQfInput(const int index) const { - return qfInputs[index]; - } - - const QFunctionField& QFunctionArgs::getQfOutput(const int index) const { - return qfOutputs[index]; - } - - CeedEvalMode QFunctionArgs::getEvalMode(const bool isInput, - const int index) const { - return isInput ? 
qfInputs[index].evalMode : qfOutputs[index].evalMode; - } - - CeedEvalMode QFunctionArgs::getInputEvalMode(const int index) const { - return qfInputs[index].evalMode; - } - - CeedEvalMode QFunctionArgs::getOutputEvalMode(const int index) const { - return qfOutputs[index].evalMode; - } +namespace occa { +QFunctionArgs::QFunctionArgs() : _isValid(false), _inputCount(0), _outputCount(0) {} + +QFunctionArgs::QFunctionArgs(CeedQFunction qf) : _isValid(false), _inputCount(0), _outputCount(0) { setupQFunctionArgs(qf); } + +void QFunctionArgs::setupQFunctionArgs(CeedQFunction qf) { + CeedQFunctionField *ceedInputFields, *ceedOutputFields; + + CeedCallOccaValid(_isValid, CeedQFunctionGetCeed(qf, &ceed)); + + CeedCallOccaValid(_isValid, CeedQFunctionGetNumArgs(qf, &_inputCount, &_outputCount)); + + CeedCallOccaValid(_isValid, CeedQFunctionGetFields(qf, NULL, &ceedInputFields, NULL, &ceedOutputFields)); + + _isValid = true; + + for (int i = 0; i < _inputCount; ++i) { + QFunctionField field = QFunctionField(ceedInputFields[i]); + qfInputs.push_back(field); + _isValid &= field.isValid(); + } + + for (int i = 0; i < _outputCount; ++i) { + QFunctionField field = QFunctionField(ceedOutputFields[i]); + qfOutputs.push_back(field); + _isValid &= field.isValid(); } } + +bool QFunctionArgs::isValid() const { return _isValid; } + +int QFunctionArgs::inputCount() const { return _inputCount; } + +int QFunctionArgs::outputCount() const { return _outputCount; } + +const QFunctionField& QFunctionArgs::getQfField(const bool isInput, const int index) const { return isInput ? qfInputs[index] : qfOutputs[index]; } + +const QFunctionField& QFunctionArgs::getQfInput(const int index) const { return qfInputs[index]; } + +const QFunctionField& QFunctionArgs::getQfOutput(const int index) const { return qfOutputs[index]; } + +CeedEvalMode QFunctionArgs::getEvalMode(const bool isInput, const int index) const { + return isInput ? 
qfInputs[index].evalMode : qfOutputs[index].evalMode; +} + +CeedEvalMode QFunctionArgs::getInputEvalMode(const int index) const { return qfInputs[index].evalMode; } + +CeedEvalMode QFunctionArgs::getOutputEvalMode(const int index) const { return qfOutputs[index].evalMode; } +} // namespace occa +} // namespace ceed diff --git a/backends/occa/ceed-occa-qfunction-args.hpp b/backends/occa/ceed-occa-qfunction-args.hpp index 0a4a02b018..95131c98bd 100644 --- a/backends/occa/ceed-occa-qfunction-args.hpp +++ b/backends/occa/ceed-occa-qfunction-args.hpp @@ -14,44 +14,42 @@ #include "ceed-occa-qfunction-field.hpp" namespace ceed { - namespace occa { - typedef std::vector QFunctionFieldVector; +namespace occa { +typedef std::vector QFunctionFieldVector; - class QFunctionArgs : public CeedObject { - protected: - bool _isValid; - CeedInt _inputCount; - CeedInt _outputCount; +class QFunctionArgs : public CeedObject { + protected: + bool _isValid; + CeedInt _inputCount; + CeedInt _outputCount; - public: - QFunctionFieldVector qfInputs; - QFunctionFieldVector qfOutputs; + public: + QFunctionFieldVector qfInputs; + QFunctionFieldVector qfOutputs; - QFunctionArgs(); - QFunctionArgs(CeedQFunction qf); + QFunctionArgs(); + QFunctionArgs(CeedQFunction qf); - void setupQFunctionArgs(CeedQFunction qf); + void setupQFunctionArgs(CeedQFunction qf); - bool isValid() const; + bool isValid() const; - int inputCount() const; - int outputCount() const; + int inputCount() const; + int outputCount() const; - const QFunctionField& getQfField(const bool isInput, - const int index) const; + const QFunctionField& getQfField(const bool isInput, const int index) const; - const QFunctionField& getQfInput(const int index) const; + const QFunctionField& getQfInput(const int index) const; - const QFunctionField& getQfOutput(const int index) const; + const QFunctionField& getQfOutput(const int index) const; - CeedEvalMode getEvalMode(const bool isInput, - const int index) const; + CeedEvalMode 
getEvalMode(const bool isInput, const int index) const; - CeedEvalMode getInputEvalMode(const int index) const; + CeedEvalMode getInputEvalMode(const int index) const; - CeedEvalMode getOutputEvalMode(const int index) const; - }; - } -} + CeedEvalMode getOutputEvalMode(const int index) const; +}; +} // namespace occa +} // namespace ceed #endif diff --git a/backends/occa/ceed-occa-qfunction-field.cpp b/backends/occa/ceed-occa-qfunction-field.cpp index 8a4c520349..f208cd03d5 100644 --- a/backends/occa/ceed-occa-qfunction-field.cpp +++ b/backends/occa/ceed-occa-qfunction-field.cpp @@ -8,24 +8,15 @@ #include "ceed-occa-qfunction-field.hpp" namespace ceed { - namespace occa { - QFunctionField::QFunctionField(CeedQFunctionField qfField) : - _isValid(false), - size(0) { +namespace occa { +QFunctionField::QFunctionField(CeedQFunctionField qfField) : _isValid(false), size(0) { + CeedCallOccaValid(_isValid, CeedQFunctionFieldGetEvalMode(qfField, &evalMode)); - int ierr = 0; + CeedCallOccaValid(_isValid, CeedQFunctionFieldGetSize(qfField, &size)); - ierr = CeedQFunctionFieldGetEvalMode(qfField, &evalMode); - CeedOccaValidChk(_isValid, ierr); - - ierr = CeedQFunctionFieldGetSize(qfField, &size); - CeedOccaValidChk(_isValid, ierr); - - _isValid = true; - } - - bool QFunctionField::isValid() const { - return _isValid; - } - } + _isValid = true; } + +bool QFunctionField::isValid() const { return _isValid; } +} // namespace occa +} // namespace ceed diff --git a/backends/occa/ceed-occa-qfunction-field.hpp b/backends/occa/ceed-occa-qfunction-field.hpp index 99a777b53a..6fb13178f3 100644 --- a/backends/occa/ceed-occa-qfunction-field.hpp +++ b/backends/occa/ceed-occa-qfunction-field.hpp @@ -11,20 +11,20 @@ #include "ceed-occa-context.hpp" namespace ceed { - namespace occa { - class QFunctionField { - protected: - bool _isValid; +namespace occa { +class QFunctionField { + protected: + bool _isValid; - public: - CeedEvalMode evalMode; - CeedInt size; + public: + CeedEvalMode evalMode; 
+ CeedInt size; - QFunctionField(CeedQFunctionField qfField); + QFunctionField(CeedQFunctionField qfField); - bool isValid() const; - }; - } -} + bool isValid() const; +}; +} // namespace occa +} // namespace ceed #endif diff --git a/backends/occa/ceed-occa-qfunction.cpp b/backends/occa/ceed-occa-qfunction.cpp index e6210d187d..6eaeb7f30b 100644 --- a/backends/occa/ceed-occa-qfunction.cpp +++ b/backends/occa/ceed-occa-qfunction.cpp @@ -5,259 +5,238 @@ // // This file is part of CEED: http://github.com/ceed +#include "ceed-occa-qfunction.hpp" + #include #include -#include "ceed-occa-qfunction.hpp" #include "ceed-occa-qfunctioncontext.hpp" #include "ceed-occa-vector.hpp" namespace ceed { - namespace occa { - QFunction::QFunction(const std::string &source, - const std::string& function_name) : - ceedIsIdentity(false) { - filename = source; - qFunctionName = function_name; - } +namespace occa { +QFunction::QFunction(const std::string &source, const std::string &function_name) : ceedIsIdentity(false) { + filename = source; + qFunctionName = function_name; +} - QFunction* QFunction::getQFunction(CeedQFunction qf, - const bool assertValid) { - if (!qf) { - return NULL; - } +QFunction *QFunction::getQFunction(CeedQFunction qf, const bool assertValid) { + if (!qf) { + return NULL; + } - int ierr; - QFunction *qFunction = NULL; + QFunction *qFunction = NULL; - ierr = CeedQFunctionGetData(qf, &qFunction); - CeedOccaFromChk(ierr); + CeedCallOcca(CeedQFunctionGetData(qf, &qFunction)); - return qFunction; - } + return qFunction; +} - QFunction* QFunction::from(CeedQFunction qf) { - QFunction *qFunction = getQFunction(qf); - if (!qFunction) { - return NULL; - } +QFunction *QFunction::from(CeedQFunction qf) { + QFunction *qFunction = getQFunction(qf); + if (!qFunction) { + return NULL; + } - int ierr; - ierr = CeedQFunctionGetCeed(qf, &qFunction->ceed); - CeedOccaFromChk(ierr); + CeedCallOcca(CeedQFunctionGetCeed(qf, &qFunction->ceed)); - ierr = CeedQFunctionGetInnerContext(qf, 
&qFunction->qFunctionContext); - CeedOccaFromChk(ierr); + CeedCallOcca(CeedQFunctionGetInnerContext(qf, &qFunction->qFunctionContext)); - ierr = CeedQFunctionIsIdentity(qf, &qFunction->ceedIsIdentity); - CeedOccaFromChk(ierr); + CeedCallOcca(CeedQFunctionIsIdentity(qf, &qFunction->ceedIsIdentity)); - qFunction->args.setupQFunctionArgs(qf); - if (!qFunction->args.isValid()) { - return NULL; - } + qFunction->args.setupQFunctionArgs(qf); + if (!qFunction->args.isValid()) { + return NULL; + } - return qFunction; - } + return qFunction; +} - QFunction* QFunction::from(CeedOperator op) { - if (!op) { - return NULL; - } +QFunction *QFunction::from(CeedOperator op) { + if (!op) { + return NULL; + } - CeedQFunction qf; - int ierr = 0; + CeedQFunction qf; - ierr = CeedOperatorGetQFunction(op, &qf); - CeedOccaFromChk(ierr); + CeedCallOcca(CeedOperatorGetQFunction(op, &qf)); - return QFunction::from(qf); - } + return QFunction::from(qf); +} - ::occa::properties QFunction::getKernelProps(const CeedInt Q) { - ::occa::properties props; +::occa::properties QFunction::getKernelProps(const CeedInt Q) { + ::occa::properties props; - // Types - props["defines/CeedInt"] = ::occa::dtype::get().name(); - props["defines/CeedScalar"] = ::occa::dtype::get().name(); + // Types + props["defines/CeedInt"] = ::occa::dtype::get().name(); + props["defines/CeedScalar"] = ::occa::dtype::get().name(); - // CEED defines - props["defines/CeedPragmaSIMD"] = ""; - props["defines/CEED_Q_VLA"] = "OCCA_Q"; - props["defines/CEED_ERROR_SUCCESS"] = 0; + // CEED defines + props["defines/CeedPragmaSIMD"] = ""; + props["defines/CEED_Q_VLA"] = "OCCA_Q"; + props["defines/CEED_ERROR_SUCCESS"] = 0; - std::stringstream ss; - ss << "#define CEED_QFUNCTION(FUNC_NAME) \\" << std::endl - << " inline int FUNC_NAME" << std::endl - << "#define CEED_QFUNCTION_HELPER \\" << std::endl - << " inline" << std::endl - << std::endl - << "#include \"" << filename << "\"" << std::endl; + std::stringstream ss; + ss << "#define 
CEED_QFUNCTION(FUNC_NAME) \\" << std::endl + << " inline int FUNC_NAME" << std::endl + << "#define CEED_QFUNCTION_HELPER \\" << std::endl + << " inline" << std::endl + << std::endl + << "#include \"" << filename << "\"" << std::endl; - props["headers"].asArray() += ss.str(); + props["headers"].asArray() += ss.str(); - return props; - } + return props; +} - int QFunction::buildKernel(const CeedInt Q) { - // TODO: Store a kernel per Q - if (!qFunctionKernel.isInitialized()) { - ::occa::properties props = getKernelProps(Q); +int QFunction::buildKernel(const CeedInt Q) { + // TODO: Store a kernel per Q + if (!qFunctionKernel.isInitialized()) { + ::occa::properties props = getKernelProps(Q); - // Properties only used in the QFunction kernel source - props["defines/OCCA_Q"] = Q; + // Properties only used in the QFunction kernel source + props["defines/OCCA_Q"] = Q; - const std::string kernelName = "qf_" + qFunctionName; + const std::string kernelName = "qf_" + qFunctionName; - qFunctionKernel = ( - getDevice().buildKernelFromString(getKernelSource(kernelName, Q), - kernelName, - props) - ); - } + qFunctionKernel = (getDevice().buildKernelFromString(getKernelSource(kernelName, Q), kernelName, props)); + } - return CEED_ERROR_SUCCESS; - } + return CEED_ERROR_SUCCESS; +} - std::string QFunction::getKernelSource(const std::string &kernelName, - const CeedInt Q) { - std::stringstream ss; - - ss << "@kernel" << std::endl - << "void " << kernelName << "(" << std::endl; - - // qfunction arguments - for (int i = 0; i < args.inputCount(); ++i) { - ss << " const CeedScalar *in" << i << ',' << std::endl; - } - for (int i = 0; i < args.outputCount(); ++i) { - ss << " CeedScalar *out" << i << ',' << std::endl; - } - ss << " void *ctx" << std::endl; - ss << ") {" << std::endl; - - // Iterate over Q and call qfunction - ss << " @tile(128, @outer, @inner)" << std::endl - << " for (int q = 0; q < OCCA_Q; ++q) {" << std::endl - << " const CeedScalar* in[" << std::max(1, args.inputCount()) 
<< "];" << std::endl - << " CeedScalar* out[" << std::max(1, args.outputCount()) << "];" << std::endl; - - // Set and define in for the q point - for (int i = 0; i < args.inputCount(); ++i) { - const CeedInt fieldSize = args.getQfInput(i).size; - const std::string qIn_i = "qIn" + std::to_string(i); - const std::string in_i = "in" + std::to_string(i); - - ss << " CeedScalar " << qIn_i << "[" << fieldSize << "];" << std::endl - << " in[" << i << "] = " << qIn_i << ";" << std::endl - // Copy q data - << " for (int qi = 0; qi < " << fieldSize << "; ++qi) {" << std::endl - << " " << qIn_i << "[qi] = " << in_i << "[q + (OCCA_Q * qi)];" << std::endl - << " }" << std::endl; - } - - // Set out for the q point - for (int i = 0; i < args.outputCount(); ++i) { - const CeedInt fieldSize = args.getQfOutput(i).size; - const std::string qOut_i = "qOut" + std::to_string(i); - - ss << " CeedScalar " << qOut_i << "[" << fieldSize << "];" << std::endl - << " out[" << i << "] = " << qOut_i << ";" << std::endl; - } - - ss << " " << qFunctionName << "(ctx, 1, in, out);" << std::endl; - - // Copy out for the q point - for (int i = 0; i < args.outputCount(); ++i) { - const CeedInt fieldSize = args.getQfOutput(i).size; - const std::string qOut_i = "qOut" + std::to_string(i); - const std::string out_i = "out" + std::to_string(i); - - ss << " for (int qi = 0; qi < " << fieldSize << "; ++qi) {" << std::endl - << " " << out_i << "[q + (OCCA_Q * qi)] = " << qOut_i << "[qi];" << std::endl - << " }" << std::endl; - } - - ss << " }" << std::endl - << "}"; - - return ss.str(); - } +std::string QFunction::getKernelSource(const std::string &kernelName, const CeedInt Q) { + std::stringstream ss; - int QFunction::apply(CeedInt Q, CeedVector *U, CeedVector *V) { - int ierr; - ierr = buildKernel(Q); CeedChk(ierr); - - std::vector outputArgs; - - qFunctionKernel.clearArgs(); - - for (CeedInt i = 0; i < args.inputCount(); i++) { - Vector *u = Vector::from(U[i]); - if (!u) { - return ceedError("Incorrect 
qFunction input field: U[" + std::to_string(i) + "]"); - } - qFunctionKernel.pushArg(u->getConstKernelArg()); - } - - for (CeedInt i = 0; i < args.outputCount(); i++) { - Vector *v = Vector::from(V[i]); - if (!v) { - return ceedError("Incorrect qFunction output field: V[" + std::to_string(i) + "]"); - } - qFunctionKernel.pushArg(v->getKernelArg()); - } - if (qFunctionContext) { - QFunctionContext *ctx = QFunctionContext::from(qFunctionContext); - qFunctionKernel.pushArg(ctx->getKernelArg()); - } else { - qFunctionKernel.pushArg(::occa::null); - } - - qFunctionKernel.run(); - - return CEED_ERROR_SUCCESS; - } + ss << "@kernel" << std::endl << "void " << kernelName << "(" << std::endl; - //---[ Ceed Callbacks ]----------- - int QFunction::registerCeedFunction(Ceed ceed, CeedQFunction qf, - const char *fname, ceed::occa::ceedFunction f) { - return CeedSetBackendFunction(ceed, "QFunction", qf, fname, f); - } + // qfunction arguments + for (int i = 0; i < args.inputCount(); ++i) { + ss << " const CeedScalar *in" << i << ',' << std::endl; + } + for (int i = 0; i < args.outputCount(); ++i) { + ss << " CeedScalar *out" << i << ',' << std::endl; + } + ss << " void *ctx" << std::endl; + ss << ") {" << std::endl; + + // Iterate over Q and call qfunction + ss << " @tile(128, @outer, @inner)" << std::endl + << " for (int q = 0; q < OCCA_Q; ++q) {" << std::endl + << " const CeedScalar* in[" << std::max(1, args.inputCount()) << "];" << std::endl + << " CeedScalar* out[" << std::max(1, args.outputCount()) << "];" << std::endl; + + // Set and define in for the q point + for (int i = 0; i < args.inputCount(); ++i) { + const CeedInt fieldSize = args.getQfInput(i).size; + const std::string qIn_i = "qIn" + std::to_string(i); + const std::string in_i = "in" + std::to_string(i); + + ss << " CeedScalar " << qIn_i << "[" << fieldSize << "];" << std::endl + << " in[" << i << "] = " << qIn_i << ";" + << std::endl + // Copy q data + << " for (int qi = 0; qi < " << fieldSize << "; ++qi) {" << 
std::endl + << " " << qIn_i << "[qi] = " << in_i << "[q + (OCCA_Q * qi)];" << std::endl + << " }" << std::endl; + } - int QFunction::ceedCreate(CeedQFunction qf) { - int ierr; - Ceed ceed; - ierr = CeedQFunctionGetCeed(qf, &ceed); CeedChk(ierr); - Context *context; - ierr = CeedGetData(ceed, &context); CeedChk(ierr); - char *source; - ierr = CeedQFunctionGetSourcePath(qf, &source); CeedChk(ierr); - char *function_name; - ierr = CeedQFunctionGetKernelName(qf,&function_name); CeedChk(ierr); + // Set out for the q point + for (int i = 0; i < args.outputCount(); ++i) { + const CeedInt fieldSize = args.getQfOutput(i).size; + const std::string qOut_i = "qOut" + std::to_string(i); - QFunction *qFunction = new QFunction(source,function_name); - ierr = CeedQFunctionSetData(qf, qFunction); CeedChk(ierr); + ss << " CeedScalar " << qOut_i << "[" << fieldSize << "];" << std::endl << " out[" << i << "] = " << qOut_i << ";" << std::endl; + } - CeedOccaRegisterFunction(qf, "Apply", QFunction::ceedApply); - CeedOccaRegisterFunction(qf, "Destroy", QFunction::ceedDestroy); + ss << " " << qFunctionName << "(ctx, 1, in, out);" << std::endl; - return CEED_ERROR_SUCCESS; - } + // Copy out for the q point + for (int i = 0; i < args.outputCount(); ++i) { + const CeedInt fieldSize = args.getQfOutput(i).size; + const std::string qOut_i = "qOut" + std::to_string(i); + const std::string out_i = "out" + std::to_string(i); + + ss << " for (int qi = 0; qi < " << fieldSize << "; ++qi) {" << std::endl + << " " << out_i << "[q + (OCCA_Q * qi)] = " << qOut_i << "[qi];" << std::endl + << " }" << std::endl; + } + + ss << " }" << std::endl << "}"; + + return ss.str(); +} - int QFunction::ceedApply(CeedQFunction qf, CeedInt Q, - CeedVector *U, CeedVector *V) { - QFunction *qFunction = QFunction::from(qf); - if (qFunction) { - return qFunction->apply(Q, U, V); - } +int QFunction::apply(CeedInt Q, CeedVector *U, CeedVector *V) { + CeedCallBackend(buildKernel(Q)); - return 1; + std::vector outputArgs; + + 
qFunctionKernel.clearArgs(); + + for (CeedInt i = 0; i < args.inputCount(); i++) { + Vector *u = Vector::from(U[i]); + if (!u) { + return ceedError("Incorrect qFunction input field: U[" + std::to_string(i) + "]"); } + qFunctionKernel.pushArg(u->getConstKernelArg()); + } - int QFunction::ceedDestroy(CeedQFunction qf) { - delete getQFunction(qf, false); - return CEED_ERROR_SUCCESS; + for (CeedInt i = 0; i < args.outputCount(); i++) { + Vector *v = Vector::from(V[i]); + if (!v) { + return ceedError("Incorrect qFunction output field: V[" + std::to_string(i) + "]"); } + qFunctionKernel.pushArg(v->getKernelArg()); + } + if (qFunctionContext) { + QFunctionContext *ctx = QFunctionContext::from(qFunctionContext); + qFunctionKernel.pushArg(ctx->getKernelArg()); + } else { + qFunctionKernel.pushArg(::occa::null); + } + + qFunctionKernel.run(); + + return CEED_ERROR_SUCCESS; +} + +//---[ Ceed Callbacks ]----------- +int QFunction::registerCeedFunction(Ceed ceed, CeedQFunction qf, const char *fname, ceed::occa::ceedFunction f) { + return CeedSetBackendFunction(ceed, "QFunction", qf, fname, f); +} + +int QFunction::ceedCreate(CeedQFunction qf) { + Ceed ceed; + CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); + Context *context; + CeedCallBackend(CeedGetData(ceed, &context)); + char *source; + CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source)); + char *function_name; + CeedCallBackend(CeedQFunctionGetKernelName(qf, &function_name)); + + QFunction *qFunction = new QFunction(source, function_name); + CeedCallBackend(CeedQFunctionSetData(qf, qFunction)); + + CeedOccaRegisterFunction(qf, "Apply", QFunction::ceedApply); + CeedOccaRegisterFunction(qf, "Destroy", QFunction::ceedDestroy); + + return CEED_ERROR_SUCCESS; +} + +int QFunction::ceedApply(CeedQFunction qf, CeedInt Q, CeedVector *U, CeedVector *V) { + QFunction *qFunction = QFunction::from(qf); + if (qFunction) { + return qFunction->apply(Q, U, V); } + + return 1; +} + +int QFunction::ceedDestroy(CeedQFunction qf) { + 
delete getQFunction(qf, false); + return CEED_ERROR_SUCCESS; } +} // namespace occa +} // namespace ceed diff --git a/backends/occa/ceed-occa-qfunction.hpp b/backends/occa/ceed-occa-qfunction.hpp index 0a45a7bd9d..f0de1e6228 100644 --- a/backends/occa/ceed-occa-qfunction.hpp +++ b/backends/occa/ceed-occa-qfunction.hpp @@ -12,48 +12,43 @@ #include "ceed-occa-qfunction-args.hpp" namespace ceed { - namespace occa { - class QFunction : public CeedObject { - public: - // Ceed object information - bool ceedIsIdentity; +namespace occa { +class QFunction : public CeedObject { + public: + // Ceed object information + bool ceedIsIdentity; - // Owned resources - std::string filename; - std::string qFunctionName; - ::occa::kernel qFunctionKernel; - CeedQFunctionContext qFunctionContext; - QFunctionArgs args; + // Owned resources + std::string filename; + std::string qFunctionName; + ::occa::kernel qFunctionKernel; + CeedQFunctionContext qFunctionContext; + QFunctionArgs args; - QFunction(const std::string &source, - const std::string &function_name); + QFunction(const std::string &source, const std::string &function_name); - static QFunction* getQFunction(CeedQFunction qf, - const bool assertValid = true); + static QFunction *getQFunction(CeedQFunction qf, const bool assertValid = true); - static QFunction* from(CeedQFunction qf); - static QFunction* from(CeedOperator op); + static QFunction *from(CeedQFunction qf); + static QFunction *from(CeedOperator op); - ::occa::properties getKernelProps(const CeedInt Q); + ::occa::properties getKernelProps(const CeedInt Q); - int buildKernel(const CeedInt Q); - std::string getKernelSource(const std::string &kernelName, - const CeedInt Q); + int buildKernel(const CeedInt Q); + std::string getKernelSource(const std::string &kernelName, const CeedInt Q); - int apply(CeedInt Q, CeedVector *U, CeedVector *V); + int apply(CeedInt Q, CeedVector *U, CeedVector *V); - //---[ Ceed Callbacks ]----------- - static int registerCeedFunction(Ceed 
ceed, CeedQFunction qf, - const char *fname, ceed::occa::ceedFunction f); + //---[ Ceed Callbacks ]----------- + static int registerCeedFunction(Ceed ceed, CeedQFunction qf, const char *fname, ceed::occa::ceedFunction f); - static int ceedCreate(CeedQFunction qf); + static int ceedCreate(CeedQFunction qf); - static int ceedApply(CeedQFunction qf, - CeedInt Q, CeedVector *U, CeedVector *V); + static int ceedApply(CeedQFunction qf, CeedInt Q, CeedVector *U, CeedVector *V); - static int ceedDestroy(CeedQFunction qf); - }; - } -} + static int ceedDestroy(CeedQFunction qf); +}; +} // namespace occa +} // namespace ceed #endif diff --git a/backends/occa/ceed-occa-qfunctioncontext.cpp b/backends/occa/ceed-occa-qfunctioncontext.cpp index be72d10ec4..24c74b09f0 100644 --- a/backends/occa/ceed-occa-qfunctioncontext.cpp +++ b/backends/occa/ceed-occa-qfunctioncontext.cpp @@ -5,349 +5,314 @@ // // This file is part of CEED: http://github.com/ceed -#include - #include "ceed-occa-qfunctioncontext.hpp" +#include + namespace ceed { - namespace occa { - QFunctionContext::QFunctionContext() : - ctxSize(0), - hostBuffer(NULL), - currentHostBuffer(NULL), - syncState(SyncState::none) {} - - QFunctionContext::~QFunctionContext() { - memory.free(); - freeHostCtxBuffer(); - } +namespace occa { +QFunctionContext::QFunctionContext() : ctxSize(0), hostBuffer(NULL), currentHostBuffer(NULL), syncState(SyncState::none) {} - QFunctionContext* QFunctionContext::getQFunctionContext(CeedQFunctionContext ctx, - const bool assertValid) { - if (!ctx) { - return NULL; - } +QFunctionContext::~QFunctionContext() { + memory.free(); + freeHostCtxBuffer(); +} - int ierr; - QFunctionContext *ctx_ = NULL; +QFunctionContext *QFunctionContext::getQFunctionContext(CeedQFunctionContext ctx, const bool assertValid) { + if (!ctx) { + return NULL; + } - ierr = CeedQFunctionContextGetBackendData(ctx, &ctx_); - if (assertValid) { - CeedOccaFromChk(ierr); - } + int ierr; + QFunctionContext *ctx_ = NULL; - return ctx_; - 
} + ierr = CeedQFunctionContextGetBackendData(ctx, &ctx_); + if (assertValid) { + CeedOccaFromChk(ierr); + } - QFunctionContext* QFunctionContext::from(CeedQFunctionContext ctx) { - QFunctionContext *ctx_ = getQFunctionContext(ctx); - if (!ctx_) { - return NULL; - } + return ctx_; +} - int ierr; - ierr = CeedQFunctionContextGetContextSize(ctx, &ctx_->ctxSize); - CeedOccaFromChk(ierr); +QFunctionContext *QFunctionContext::from(CeedQFunctionContext ctx) { + QFunctionContext *ctx_ = getQFunctionContext(ctx); + if (!ctx_) { + return NULL; + } - if (ctx_ != NULL) { - ierr = CeedQFunctionContextGetCeed(ctx, &ctx_->ceed); CeedOccaFromChk(ierr); - } + CeedCallOcca(CeedQFunctionContextGetContextSize(ctx, &ctx_->ctxSize)); - return ctx_; - } + if (ctx_ != NULL) { + CeedCallOcca(CeedQFunctionContextGetCeed(ctx, &ctx_->ceed)); + } - void QFunctionContext::resizeCtx(const size_t ctxSize_) { - ctxSize = ctxSize_; - } + return ctx_; +} - void QFunctionContext::resizeCtxMemory(const size_t ctxSize_) { - resizeCtxMemory(getDevice(), ctxSize_); - } +void QFunctionContext::resizeCtx(const size_t ctxSize_) { ctxSize = ctxSize_; } - void QFunctionContext::resizeCtxMemory(::occa::device device, const size_t ctxSize_) { - if (ctxSize_ != memory.size()) { - memory.free(); - memory = device.malloc(ctxSize_); - } - } +void QFunctionContext::resizeCtxMemory(const size_t ctxSize_) { resizeCtxMemory(getDevice(), ctxSize_); } - void QFunctionContext::resizeHostCtxBuffer(const size_t ctxSize_) { - CeedFree(&hostBuffer); - CeedMallocArray(1, ctxSize, &hostBuffer); - } +void QFunctionContext::resizeCtxMemory(::occa::device device, const size_t ctxSize_) { + if (ctxSize_ != memory.size()) { + memory.free(); + memory = device.malloc(ctxSize_); + } +} - void QFunctionContext::setCurrentCtxMemoryIfNeeded() { - if (!currentMemory.isInitialized()) { - resizeCtxMemory(ctxSize); - currentMemory = memory; - } - } +void QFunctionContext::resizeHostCtxBuffer(const size_t ctxSize_) { + CeedFree(&hostBuffer); 
+ CeedMallocArray(1, ctxSize, &hostBuffer); +} - void QFunctionContext::setCurrentHostCtxBufferIfNeeded() { - if (!currentHostBuffer) { - resizeHostCtxBuffer(ctxSize); - currentHostBuffer = hostBuffer; - } - } +void QFunctionContext::setCurrentCtxMemoryIfNeeded() { + if (!currentMemory.isInitialized()) { + resizeCtxMemory(ctxSize); + currentMemory = memory; + } +} - void QFunctionContext::freeHostCtxBuffer() { - if (hostBuffer) { - CeedFree(&hostBuffer); - } - } - - - int QFunctionContext::hasValidData(bool* has_valid_data) const { - (*has_valid_data) = (!!hostBuffer) - || (!!currentHostBuffer ) - || (memory.isInitialized()) - || (currentMemory.isInitialized()); +void QFunctionContext::setCurrentHostCtxBufferIfNeeded() { + if (!currentHostBuffer) { + resizeHostCtxBuffer(ctxSize); + currentHostBuffer = hostBuffer; + } +} + +void QFunctionContext::freeHostCtxBuffer() { + if (hostBuffer) { + CeedFree(&hostBuffer); + } +} + +int QFunctionContext::hasValidData(bool *has_valid_data) const { + (*has_valid_data) = (!!hostBuffer) || (!!currentHostBuffer) || (memory.isInitialized()) || (currentMemory.isInitialized()); + return CEED_ERROR_SUCCESS; +} + +int QFunctionContext::hasBorrowedDataOfType(CeedMemType mem_type, bool *has_borrowed_data_of_type) const { + switch (mem_type) { + case CEED_MEM_HOST: + (*has_borrowed_data_of_type) = !!currentHostBuffer; + break; + case CEED_MEM_DEVICE: + (*has_borrowed_data_of_type) = currentMemory.isInitialized(); + break; + } + return CEED_ERROR_SUCCESS; +} + +int QFunctionContext::setData(CeedMemType mtype, CeedCopyMode cmode, void *data) { + switch (cmode) { + case CEED_COPY_VALUES: + return copyDataValues(mtype, data); + case CEED_OWN_POINTER: + return ownDataPointer(mtype, data); + case CEED_USE_POINTER: + return useDataPointer(mtype, data); + } + return ceedError("Invalid CeedCopyMode passed"); +} + +int QFunctionContext::copyDataValues(CeedMemType mtype, void *data) { + switch (mtype) { + case CEED_MEM_HOST: + 
setCurrentHostCtxBufferIfNeeded(); + std::memcpy(currentHostBuffer, data, ctxSize); + syncState = SyncState::host; return CEED_ERROR_SUCCESS; - } - - int QFunctionContext::hasBorrowedDataOfType(CeedMemType mem_type, - bool *has_borrowed_data_of_type) const { - switch (mem_type) { - case CEED_MEM_HOST: - (*has_borrowed_data_of_type) = !!currentHostBuffer; - break; - case CEED_MEM_DEVICE: - (*has_borrowed_data_of_type) = currentMemory.isInitialized(); - break; - } + case CEED_MEM_DEVICE: + setCurrentCtxMemoryIfNeeded(); + currentMemory.copyFrom(dataToMemory(data)); + syncState = SyncState::device; return CEED_ERROR_SUCCESS; - } - - int QFunctionContext::setData(CeedMemType mtype, - CeedCopyMode cmode, void *data) { - switch (cmode) { - case CEED_COPY_VALUES: - return copyDataValues(mtype, data); - case CEED_OWN_POINTER: - return ownDataPointer(mtype, data); - case CEED_USE_POINTER: - return useDataPointer(mtype, data); - } - return ceedError("Invalid CeedCopyMode passed"); - } - - int QFunctionContext::copyDataValues(CeedMemType mtype, void *data) { - switch (mtype) { - case CEED_MEM_HOST: - setCurrentHostCtxBufferIfNeeded(); - std::memcpy(currentHostBuffer, data, ctxSize); - syncState = SyncState::host; - return CEED_ERROR_SUCCESS; - case CEED_MEM_DEVICE: - setCurrentCtxMemoryIfNeeded(); - currentMemory.copyFrom(dataToMemory(data)); - syncState = SyncState::device; - return CEED_ERROR_SUCCESS; - } - return ceedError("Invalid CeedMemType passed"); - } - - int QFunctionContext::ownDataPointer(CeedMemType mtype, void *data) { - switch (mtype) { - case CEED_MEM_HOST: - freeHostCtxBuffer(); - hostBuffer = currentHostBuffer = data; - syncState = SyncState::host; - return CEED_ERROR_SUCCESS; - case CEED_MEM_DEVICE: - memory.free(); - memory = currentMemory = dataToMemory(data); - syncState = SyncState::device; - return CEED_ERROR_SUCCESS; - } - return ceedError("Invalid CeedMemType passed"); - } - - int QFunctionContext::useDataPointer(CeedMemType mtype, void *data) { - 
switch (mtype) { - case CEED_MEM_HOST: - freeHostCtxBuffer(); - currentHostBuffer = data; - syncState = SyncState::host; - return CEED_ERROR_SUCCESS; - case CEED_MEM_DEVICE: - memory.free(); - currentMemory = dataToMemory(data); - syncState = SyncState::device; - return CEED_ERROR_SUCCESS; - } - return ceedError("Invalid CeedMemType passed"); - } - - int QFunctionContext::takeData(CeedMemType mtype, - void *data) { - if (currentHostBuffer == NULL && currentMemory == ::occa::null) - return ceedError("No context data set"); - switch (mtype) { - case CEED_MEM_HOST: - setCurrentHostCtxBufferIfNeeded(); - if (syncState == SyncState::device) { - setCurrentCtxMemoryIfNeeded(); - currentMemory.copyTo(currentHostBuffer); - } - syncState = SyncState::host; - *(void **)data = currentHostBuffer; - hostBuffer = NULL; - currentHostBuffer = NULL; - return CEED_ERROR_SUCCESS; - case CEED_MEM_DEVICE: - setCurrentCtxMemoryIfNeeded(); - if (syncState == SyncState::host) { - setCurrentHostCtxBufferIfNeeded(); - currentMemory.copyFrom(currentHostBuffer); - } - syncState = SyncState::device; - *(void **)data = memoryToData(currentMemory); - memory = ::occa::null; - currentMemory = ::occa::null; - return CEED_ERROR_SUCCESS; + } + return ceedError("Invalid CeedMemType passed"); +} + +int QFunctionContext::ownDataPointer(CeedMemType mtype, void *data) { + switch (mtype) { + case CEED_MEM_HOST: + freeHostCtxBuffer(); + hostBuffer = currentHostBuffer = data; + syncState = SyncState::host; + return CEED_ERROR_SUCCESS; + case CEED_MEM_DEVICE: + memory.free(); + memory = currentMemory = dataToMemory(data); + syncState = SyncState::device; + return CEED_ERROR_SUCCESS; + } + return ceedError("Invalid CeedMemType passed"); +} + +int QFunctionContext::useDataPointer(CeedMemType mtype, void *data) { + switch (mtype) { + case CEED_MEM_HOST: + freeHostCtxBuffer(); + currentHostBuffer = data; + syncState = SyncState::host; + return CEED_ERROR_SUCCESS; + case CEED_MEM_DEVICE: + memory.free(); + 
currentMemory = dataToMemory(data); + syncState = SyncState::device; + return CEED_ERROR_SUCCESS; + } + return ceedError("Invalid CeedMemType passed"); +} + +int QFunctionContext::takeData(CeedMemType mtype, void *data) { + if (currentHostBuffer == NULL && currentMemory == ::occa::null) return ceedError("No context data set"); + switch (mtype) { + case CEED_MEM_HOST: + setCurrentHostCtxBufferIfNeeded(); + if (syncState == SyncState::device) { + setCurrentCtxMemoryIfNeeded(); + currentMemory.copyTo(currentHostBuffer); } - return ceedError("Invalid CeedMemType passed"); - } - - int QFunctionContext::getData(CeedMemType mtype, - void *data) { - // The passed `data` might be modified before restoring - if (currentHostBuffer == NULL && currentMemory == ::occa::null) - return ceedError("No context data set"); - switch (mtype) { - case CEED_MEM_HOST: - setCurrentHostCtxBufferIfNeeded(); - if (syncState == SyncState::device) { - setCurrentCtxMemoryIfNeeded(); - currentMemory.copyTo(currentHostBuffer); - } - syncState = SyncState::host; - *(void **)data = currentHostBuffer; - return CEED_ERROR_SUCCESS; - case CEED_MEM_DEVICE: - setCurrentCtxMemoryIfNeeded(); - if (syncState == SyncState::host) { - setCurrentHostCtxBufferIfNeeded(); - currentMemory.copyFrom(currentHostBuffer); - } - syncState = SyncState::device; - *(void **)data = memoryToData(currentMemory); - return CEED_ERROR_SUCCESS; + syncState = SyncState::host; + *(void **)data = currentHostBuffer; + hostBuffer = NULL; + currentHostBuffer = NULL; + return CEED_ERROR_SUCCESS; + case CEED_MEM_DEVICE: + setCurrentCtxMemoryIfNeeded(); + if (syncState == SyncState::host) { + setCurrentHostCtxBufferIfNeeded(); + currentMemory.copyFrom(currentHostBuffer); } - return ceedError("Invalid CeedMemType passed"); - } - - int QFunctionContext::restoreData() { + syncState = SyncState::device; + *(void **)data = memoryToData(currentMemory); + memory = ::occa::null; + currentMemory = ::occa::null; return CEED_ERROR_SUCCESS; - } + } + 
return ceedError("Invalid CeedMemType passed"); +} - ::occa::memory QFunctionContext::getKernelArg() { +int QFunctionContext::getData(CeedMemType mtype, void *data) { + // The passed `data` might be modified before restoring + if (currentHostBuffer == NULL && currentMemory == ::occa::null) return ceedError("No context data set"); + switch (mtype) { + case CEED_MEM_HOST: + setCurrentHostCtxBufferIfNeeded(); + if (syncState == SyncState::device) { + setCurrentCtxMemoryIfNeeded(); + currentMemory.copyTo(currentHostBuffer); + } + syncState = SyncState::host; + *(void **)data = currentHostBuffer; + return CEED_ERROR_SUCCESS; + case CEED_MEM_DEVICE: setCurrentCtxMemoryIfNeeded(); if (syncState == SyncState::host) { setCurrentHostCtxBufferIfNeeded(); currentMemory.copyFrom(currentHostBuffer); } - syncState = SyncState::device; - return currentMemory; - } + syncState = SyncState::device; + *(void **)data = memoryToData(currentMemory); + return CEED_ERROR_SUCCESS; + } + return ceedError("Invalid CeedMemType passed"); +} - //---[ Ceed Callbacks ]----------- - int QFunctionContext::registerCeedFunction(Ceed ceed, CeedQFunctionContext ctx, - const char *fname, ceed::occa::ceedFunction f) { - return CeedSetBackendFunction(ceed, "QFunctionContext", ctx, fname, f); - } +int QFunctionContext::restoreData() { return CEED_ERROR_SUCCESS; } - int QFunctionContext::ceedCreate(CeedQFunctionContext ctx) { - int ierr; +::occa::memory QFunctionContext::getKernelArg() { + setCurrentCtxMemoryIfNeeded(); + if (syncState == SyncState::host) { + setCurrentHostCtxBufferIfNeeded(); + currentMemory.copyFrom(currentHostBuffer); + } + syncState = SyncState::device; + return currentMemory; +} - Ceed ceed; - ierr = CeedQFunctionContextGetCeed(ctx, &ceed); CeedChk(ierr); +//---[ Ceed Callbacks ]----------- +int QFunctionContext::registerCeedFunction(Ceed ceed, CeedQFunctionContext ctx, const char *fname, ceed::occa::ceedFunction f) { + return CeedSetBackendFunction(ceed, "QFunctionContext", ctx, fname, 
f); +} - CeedOccaRegisterFunction(ctx, "HasValidData", QFunctionContext::ceedHasValidData); - CeedOccaRegisterFunction(ctx, "HasBorrowedDataOfType", QFunctionContext::ceedHasBorrowedDataOfType); - CeedOccaRegisterFunction(ctx, "SetData", QFunctionContext::ceedSetData); - CeedOccaRegisterFunction(ctx, "TakeData", QFunctionContext::ceedTakeData); - CeedOccaRegisterFunction(ctx, "GetData", QFunctionContext::ceedGetData); - CeedOccaRegisterFunction(ctx, "GetDataRead", QFunctionContext::ceedGetDataRead); - CeedOccaRegisterFunction(ctx, "RestoreData", QFunctionContext::ceedRestoreData); - CeedOccaRegisterFunction(ctx, "Destroy", QFunctionContext::ceedDestroy); +int QFunctionContext::ceedCreate(CeedQFunctionContext ctx) { + Ceed ceed; + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); - QFunctionContext *ctx_ = new QFunctionContext(); - ierr = CeedQFunctionContextSetBackendData(ctx, ctx_); CeedChk(ierr); + CeedOccaRegisterFunction(ctx, "HasValidData", QFunctionContext::ceedHasValidData); + CeedOccaRegisterFunction(ctx, "HasBorrowedDataOfType", QFunctionContext::ceedHasBorrowedDataOfType); + CeedOccaRegisterFunction(ctx, "SetData", QFunctionContext::ceedSetData); + CeedOccaRegisterFunction(ctx, "TakeData", QFunctionContext::ceedTakeData); + CeedOccaRegisterFunction(ctx, "GetData", QFunctionContext::ceedGetData); + CeedOccaRegisterFunction(ctx, "GetDataRead", QFunctionContext::ceedGetDataRead); + CeedOccaRegisterFunction(ctx, "RestoreData", QFunctionContext::ceedRestoreData); + CeedOccaRegisterFunction(ctx, "Destroy", QFunctionContext::ceedDestroy); - return CEED_ERROR_SUCCESS; - } + QFunctionContext *ctx_ = new QFunctionContext(); + CeedCallBackend(CeedQFunctionContextSetBackendData(ctx, ctx_)); - int QFunctionContext::ceedHasValidData(const CeedQFunctionContext ctx, - bool *has_valid_data) { - QFunctionContext *ctx_ = QFunctionContext::from(ctx); - if (!ctx_) { - return staticCeedError("Invalid CeedQFunctionContext passed"); - } - return 
ctx_->hasValidData(has_valid_data); - } - - int QFunctionContext::ceedHasBorrowedDataOfType(const CeedQFunctionContext ctx, - CeedMemType mem_type, - bool *has_borrowed_data_of_type) { - QFunctionContext *ctx_ = QFunctionContext::from(ctx); - if (!ctx_) { - return staticCeedError("Invalid CeedQFunctionContext passed"); - } - return ctx_->hasBorrowedDataOfType(mem_type, - has_borrowed_data_of_type); - } - - int QFunctionContext::ceedSetData(CeedQFunctionContext ctx, CeedMemType mtype, - CeedCopyMode cmode, void *data) { - QFunctionContext *ctx_ = QFunctionContext::from(ctx); - if (!ctx_) { - return staticCeedError("Invalid CeedQFunctionContext passed"); - } - return ctx_->setData(mtype, cmode, data); - } - - int QFunctionContext::ceedTakeData(CeedQFunctionContext ctx, CeedMemType mtype, - void *data) { - QFunctionContext *ctx_ = QFunctionContext::from(ctx); - if (!ctx_) { - return staticCeedError("Invalid CeedQFunctionContext passed"); - } - return ctx_->takeData(mtype, data); - } - - int QFunctionContext::ceedGetData(CeedQFunctionContext ctx, CeedMemType mtype, - void *data) { - QFunctionContext *ctx_ = QFunctionContext::from(ctx); - if (!ctx_) { - return staticCeedError("Invalid CeedQFunctionContext passed"); - } - return ctx_->getData(mtype, data); - } - - int QFunctionContext::ceedGetDataRead(CeedQFunctionContext ctx, - CeedMemType mtype, - void *data) { - QFunctionContext *ctx_ = QFunctionContext::from(ctx); - if (!ctx_) { - return staticCeedError("Invalid CeedQFunctionContext passed"); - } - // Todo: Determine if calling getData is sufficient - return ctx_->getData(mtype, data); - } - - int QFunctionContext::ceedRestoreData(CeedQFunctionContext ctx) { - QFunctionContext *ctx_ = QFunctionContext::from(ctx); - if (!ctx_) { - return staticCeedError("Invalid CeedQFunctionContext passed"); - } - return ctx_->restoreData(); - } + return CEED_ERROR_SUCCESS; +} - int QFunctionContext::ceedDestroy(CeedQFunctionContext ctx) { - delete getQFunctionContext(ctx, false); - 
return CEED_ERROR_SUCCESS; - } +int QFunctionContext::ceedHasValidData(const CeedQFunctionContext ctx, bool *has_valid_data) { + QFunctionContext *ctx_ = QFunctionContext::from(ctx); + if (!ctx_) { + return staticCeedError("Invalid CeedQFunctionContext passed"); } + return ctx_->hasValidData(has_valid_data); +} + +int QFunctionContext::ceedHasBorrowedDataOfType(const CeedQFunctionContext ctx, CeedMemType mem_type, bool *has_borrowed_data_of_type) { + QFunctionContext *ctx_ = QFunctionContext::from(ctx); + if (!ctx_) { + return staticCeedError("Invalid CeedQFunctionContext passed"); + } + return ctx_->hasBorrowedDataOfType(mem_type, has_borrowed_data_of_type); +} + +int QFunctionContext::ceedSetData(CeedQFunctionContext ctx, CeedMemType mtype, CeedCopyMode cmode, void *data) { + QFunctionContext *ctx_ = QFunctionContext::from(ctx); + if (!ctx_) { + return staticCeedError("Invalid CeedQFunctionContext passed"); + } + return ctx_->setData(mtype, cmode, data); +} + +int QFunctionContext::ceedTakeData(CeedQFunctionContext ctx, CeedMemType mtype, void *data) { + QFunctionContext *ctx_ = QFunctionContext::from(ctx); + if (!ctx_) { + return staticCeedError("Invalid CeedQFunctionContext passed"); + } + return ctx_->takeData(mtype, data); +} + +int QFunctionContext::ceedGetData(CeedQFunctionContext ctx, CeedMemType mtype, void *data) { + QFunctionContext *ctx_ = QFunctionContext::from(ctx); + if (!ctx_) { + return staticCeedError("Invalid CeedQFunctionContext passed"); + } + return ctx_->getData(mtype, data); +} + +int QFunctionContext::ceedGetDataRead(CeedQFunctionContext ctx, CeedMemType mtype, void *data) { + QFunctionContext *ctx_ = QFunctionContext::from(ctx); + if (!ctx_) { + return staticCeedError("Invalid CeedQFunctionContext passed"); + } + // Todo: Determine if calling getData is sufficient + return ctx_->getData(mtype, data); +} + +int QFunctionContext::ceedRestoreData(CeedQFunctionContext ctx) { + QFunctionContext *ctx_ = QFunctionContext::from(ctx); + if (!ctx_) 
{ + return staticCeedError("Invalid CeedQFunctionContext passed"); + } + return ctx_->restoreData(); +} + +int QFunctionContext::ceedDestroy(CeedQFunctionContext ctx) { + delete getQFunctionContext(ctx, false); + return CEED_ERROR_SUCCESS; } +} // namespace occa +} // namespace ceed diff --git a/backends/occa/ceed-occa-qfunctioncontext.hpp b/backends/occa/ceed-occa-qfunctioncontext.hpp index dbaeb5e0f4..679f8a3ac3 100644 --- a/backends/occa/ceed-occa-qfunctioncontext.hpp +++ b/backends/occa/ceed-occa-qfunctioncontext.hpp @@ -11,104 +11,92 @@ #include "ceed-occa-ceed-object.hpp" namespace ceed { - namespace occa { - class QFunctionContext : public CeedObject { - public: - // Owned resources - size_t ctxSize; - ::occa::memory memory; - void *hostBuffer; +namespace occa { +class QFunctionContext : public CeedObject { + public: + // Owned resources + size_t ctxSize; + ::occa::memory memory; + void *hostBuffer; - // Current resources - ::occa::memory currentMemory; - void *currentHostBuffer; + // Current resources + ::occa::memory currentMemory; + void *currentHostBuffer; - // State information - int syncState; + // State information + int syncState; - QFunctionContext(); + QFunctionContext(); - ~QFunctionContext(); + ~QFunctionContext(); - static QFunctionContext* getQFunctionContext(CeedQFunctionContext ctx, - const bool assertValid = true); + static QFunctionContext *getQFunctionContext(CeedQFunctionContext ctx, const bool assertValid = true); - static QFunctionContext* from(CeedQFunctionContext ctx); + static QFunctionContext *from(CeedQFunctionContext ctx); - ::occa::memory dataToMemory(const void *data) { - ::occa::memory mem((::occa::modeMemory_t*) data); - return mem; - } + ::occa::memory dataToMemory(const void *data) { + ::occa::memory mem((::occa::modeMemory_t *)data); + return mem; + } - void* memoryToData(::occa::memory &memory) { - return memory.getModeMemory(); - } + void *memoryToData(::occa::memory &memory) { return memory.getModeMemory(); } - void 
resizeCtx(const size_t ctxSize_); + void resizeCtx(const size_t ctxSize_); - void resizeCtxMemory(const size_t ctxSize_); + void resizeCtxMemory(const size_t ctxSize_); - void resizeCtxMemory(::occa::device device, const size_t ctxSize_); + void resizeCtxMemory(::occa::device device, const size_t ctxSize_); - void resizeHostCtxBuffer(const size_t ctxSize_); + void resizeHostCtxBuffer(const size_t ctxSize_); - void setCurrentCtxMemoryIfNeeded(); + void setCurrentCtxMemoryIfNeeded(); - void setCurrentHostCtxBufferIfNeeded(); + void setCurrentHostCtxBufferIfNeeded(); - void freeHostCtxBuffer(); + void freeHostCtxBuffer(); - int hasValidData(bool* has_valid_data) const; + int hasValidData(bool *has_valid_data) const; - int hasBorrowedDataOfType(CeedMemType mem_type, - bool *has_borrowed_data_of_type) const; + int hasBorrowedDataOfType(CeedMemType mem_type, bool *has_borrowed_data_of_type) const; - int setData(CeedMemType mtype, CeedCopyMode cmode, void *data); + int setData(CeedMemType mtype, CeedCopyMode cmode, void *data); - int copyDataValues(CeedMemType mtype, void *data); + int copyDataValues(CeedMemType mtype, void *data); - int ownDataPointer(CeedMemType mtype, void *data); + int ownDataPointer(CeedMemType mtype, void *data); - int useDataPointer(CeedMemType mtype, void *data); + int useDataPointer(CeedMemType mtype, void *data); - int takeData(CeedMemType mtype, void *data); + int takeData(CeedMemType mtype, void *data); - int getData(CeedMemType mtype, void *data); + int getData(CeedMemType mtype, void *data); - int restoreData(); + int restoreData(); - ::occa::memory getKernelArg(); + ::occa::memory getKernelArg(); - //---[ Ceed Callbacks ]----------- - static int registerCeedFunction(Ceed ceed, CeedQFunctionContext ctx, - const char *fname, ceed::occa::ceedFunction f); + //---[ Ceed Callbacks ]----------- + static int registerCeedFunction(Ceed ceed, CeedQFunctionContext ctx, const char *fname, ceed::occa::ceedFunction f); - static int 
ceedCreate(CeedQFunctionContext ctx); + static int ceedCreate(CeedQFunctionContext ctx); - static int ceedHasValidData(const CeedQFunctionContext ctx, - bool *has_valid_data); + static int ceedHasValidData(const CeedQFunctionContext ctx, bool *has_valid_data); - static int ceedHasBorrowedDataOfType(const CeedQFunctionContext ctx, - CeedMemType mem_type, - bool *has_borrowed_data_of_type); + static int ceedHasBorrowedDataOfType(const CeedQFunctionContext ctx, CeedMemType mem_type, bool *has_borrowed_data_of_type); - static int ceedSetData(CeedQFunctionContext ctx, CeedMemType mtype, - CeedCopyMode cmode, void *data); + static int ceedSetData(CeedQFunctionContext ctx, CeedMemType mtype, CeedCopyMode cmode, void *data); - static int ceedTakeData(CeedQFunctionContext ctx, CeedMemType mtype, - void *data); + static int ceedTakeData(CeedQFunctionContext ctx, CeedMemType mtype, void *data); - static int ceedGetData(CeedQFunctionContext ctx, CeedMemType mtype, - void *data); + static int ceedGetData(CeedQFunctionContext ctx, CeedMemType mtype, void *data); - static int ceedGetDataRead(CeedQFunctionContext ctx, CeedMemType mtype, - void *data); + static int ceedGetDataRead(CeedQFunctionContext ctx, CeedMemType mtype, void *data); - static int ceedRestoreData(CeedQFunctionContext ctx); + static int ceedRestoreData(CeedQFunctionContext ctx); - static int ceedDestroy(CeedQFunctionContext ctx); - }; - } -} + static int ceedDestroy(CeedQFunctionContext ctx); +}; +} // namespace occa +} // namespace ceed #endif diff --git a/backends/occa/ceed-occa-simplex-basis.cpp b/backends/occa/ceed-occa-simplex-basis.cpp index 40b1d5cd31..de815026f6 100644 --- a/backends/occa/ceed-occa-simplex-basis.cpp +++ b/backends/occa/ceed-occa-simplex-basis.cpp @@ -5,203 +5,161 @@ // // This file is part of CEED: http://github.com/ceed -#include "ceed-occa-kernels.hpp" #include "ceed-occa-simplex-basis.hpp" +#include "ceed-occa-kernels.hpp" + namespace ceed { - namespace occa { - 
SimplexBasis::SimplexBasis(CeedBasis basis, - CeedInt dim_, - CeedInt P_, - CeedInt Q_, - const CeedScalar *interp_, - const CeedScalar *grad_, - const CeedScalar *qWeight_) { - setCeedFields(basis); - - dim = dim_; - P = P_; - Q = Q_; - - ::occa::device device = getDevice(); - - interp = device.malloc(P * Q, interp_); - grad = device.malloc(P * Q * dim, grad_); - qWeight = device.malloc(Q, qWeight_); - - setKernelProperties(); - } +namespace occa { +SimplexBasis::SimplexBasis(CeedBasis basis, CeedInt dim_, CeedInt P_, CeedInt Q_, const CeedScalar *interp_, const CeedScalar *grad_, + const CeedScalar *qWeight_) { + setCeedFields(basis); - SimplexBasis::~SimplexBasis() {} + dim = dim_; + P = P_; + Q = Q_; - bool SimplexBasis::isTensorBasis() const { - return false; - } + ::occa::device device = getDevice(); - const char* SimplexBasis::getFunctionSource() const { - // TODO: Add gpu function sources when split - return occa_simplex_basis_cpu_function_source; - } + interp = device.malloc(P * Q, interp_); + grad = device.malloc(P * Q * dim, grad_); + qWeight = device.malloc(Q, qWeight_); + + setKernelProperties(); +} + +SimplexBasis::~SimplexBasis() {} + +bool SimplexBasis::isTensorBasis() const { return false; } - void SimplexBasis::setKernelProperties() { - kernelProperties["defines/CeedInt"] = ::occa::dtype::get().name(); - kernelProperties["defines/CeedScalar"] = ::occa::dtype::get().name(); - kernelProperties["defines/DIM"] = dim; - kernelProperties["defines/Q"] = Q; - kernelProperties["defines/P"] = P; - kernelProperties["defines/MAX_PQ"] = P > Q ? P : Q; - kernelProperties["defines/BASIS_COMPONENT_COUNT"] = ceedComponentCount; - if(usingGpuDevice()) { - kernelProperties["defines/ELEMENTS_PER_BLOCK"] = (Q <= 1024) ? 
(1024 / Q) : 1; - } +const char *SimplexBasis::getFunctionSource() const { + // TODO: Add gpu function sources when split + return occa_simplex_basis_cpu_function_source; +} + +void SimplexBasis::setKernelProperties() { + kernelProperties["defines/CeedInt"] = ::occa::dtype::get().name(); + kernelProperties["defines/CeedScalar"] = ::occa::dtype::get().name(); + kernelProperties["defines/DIM"] = dim; + kernelProperties["defines/Q"] = Q; + kernelProperties["defines/P"] = P; + kernelProperties["defines/MAX_PQ"] = P > Q ? P : Q; + kernelProperties["defines/BASIS_COMPONENT_COUNT"] = ceedComponentCount; + if (usingGpuDevice()) { + kernelProperties["defines/ELEMENTS_PER_BLOCK"] = (Q <= 1024) ? (1024 / Q) : 1; + } +} + +::occa::kernel SimplexBasis::buildKernel(const std::string &kernelName) { + std::string kernelSource; + if (usingGpuDevice()) { + kernelSource = occa_simplex_basis_gpu_source; + } else { + kernelSource = occa_simplex_basis_cpu_function_source; + kernelSource += '\n'; + kernelSource += occa_simplex_basis_cpu_kernel_source; + } + + return getDevice().buildKernelFromString(kernelSource, kernelName, kernelProperties); +} + +int SimplexBasis::applyInterp(const CeedInt elementCount, const bool transpose, Vector &U, Vector &V) { + if (transpose) { + if (!interpTKernel.isInitialized()) { + kernelProperties["defines/TRANSPOSE"] = transpose; + interpTKernel = buildKernel("interp"); } - ::occa::kernel SimplexBasis::buildKernel(const std::string& kernelName) { - std::string kernelSource; - if (usingGpuDevice()) { - kernelSource = occa_simplex_basis_gpu_source; - } else { - kernelSource = occa_simplex_basis_cpu_function_source; - kernelSource += '\n'; - kernelSource += occa_simplex_basis_cpu_kernel_source; - } - - return getDevice().buildKernelFromString(kernelSource, - kernelName, - kernelProperties); + interpTKernel(elementCount, interp, U.getConstKernelArg(), V.getKernelArg()); + } else { + if (!interpKernel.isInitialized()) { + kernelProperties["defines/TRANSPOSE"] = 
transpose; + interpKernel = buildKernel("interp"); } - int SimplexBasis::applyInterp(const CeedInt elementCount, - const bool transpose, - Vector &U, - Vector &V) { - if(transpose) { - if(!interpTKernel.isInitialized()) { - kernelProperties["defines/TRANSPOSE"] = transpose; - interpTKernel = buildKernel("interp"); - } - - interpTKernel(elementCount, - interp, - U.getConstKernelArg(), - V.getKernelArg()); - } else { - if(!interpKernel.isInitialized()) { - kernelProperties["defines/TRANSPOSE"] = transpose; - interpKernel = buildKernel("interp"); - } - - interpKernel(elementCount, - interp, - U.getConstKernelArg(), - V.getKernelArg()); - } - return CEED_ERROR_SUCCESS; + interpKernel(elementCount, interp, U.getConstKernelArg(), V.getKernelArg()); + } + return CEED_ERROR_SUCCESS; +} + +int SimplexBasis::applyGrad(const CeedInt elementCount, const bool transpose, Vector &U, Vector &V) { + if (transpose) { + if (!gradTKernel.isInitialized()) { + kernelProperties["defines/TRANSPOSE"] = transpose; + gradTKernel = buildKernel("grad"); } - int SimplexBasis::applyGrad(const CeedInt elementCount, - const bool transpose, - Vector &U, - Vector &V) { - if(transpose) { - if(!gradTKernel.isInitialized()) { - kernelProperties["defines/TRANSPOSE"] = transpose; - gradTKernel = buildKernel("grad"); - } - - gradTKernel(elementCount, - grad, - U.getConstKernelArg(), - V.getKernelArg()); - } else { - if(!gradKernel.isInitialized()) { - kernelProperties["defines/TRANSPOSE"] = transpose; - gradKernel = buildKernel("grad"); - } - - gradKernel(elementCount, - grad, - U.getConstKernelArg(), - V.getKernelArg()); - } - return CEED_ERROR_SUCCESS; + gradTKernel(elementCount, grad, U.getConstKernelArg(), V.getKernelArg()); + } else { + if (!gradKernel.isInitialized()) { + kernelProperties["defines/TRANSPOSE"] = transpose; + gradKernel = buildKernel("grad"); } - int SimplexBasis::applyWeight(const CeedInt elementCount, - Vector &W) { - if(!weightKernel.isInitialized()) { - weightKernel = 
buildKernel("weight"); - } - weightKernel(elementCount, qWeight, W.getKernelArg()); + gradKernel(elementCount, grad, U.getConstKernelArg(), V.getKernelArg()); + } + return CEED_ERROR_SUCCESS; +} - return CEED_ERROR_SUCCESS; - } +int SimplexBasis::applyWeight(const CeedInt elementCount, Vector &W) { + if (!weightKernel.isInitialized()) { + weightKernel = buildKernel("weight"); + } + weightKernel(elementCount, qWeight, W.getKernelArg()); + + return CEED_ERROR_SUCCESS; +} - int SimplexBasis::apply(const CeedInt elementCount, - CeedTransposeMode tmode, - CeedEvalMode emode, - Vector *U, - Vector *V) { - const bool transpose = tmode == CEED_TRANSPOSE; - - if ((dim < 1) || (3 < dim)) { - return ceedError("Backend only supports dimensions: 1, 2, and 3"); - } - - // Check arguments - if (emode != CEED_EVAL_WEIGHT) { - if (!U) { - return ceedError("Incorrect CeedVector input: U"); - } - } - if (!V) { - return ceedError("Incorrect CeedVector input: V"); - } - - try { - // Apply kernel - switch (emode) { - case CEED_EVAL_INTERP: - return applyInterp(elementCount, transpose, *U, *V); - case CEED_EVAL_GRAD: - return applyGrad(elementCount, transpose, *U, *V); - case CEED_EVAL_WEIGHT: - return applyWeight(elementCount, *V); - default: - return ceedError("Backend does not support given simplex eval mode"); - } - } catch (::occa::exception &exc) { - // Handle kernel build errors the CEED way - CeedHandleOccaException(exc); - } - - return CEED_ERROR_SUCCESS; +int SimplexBasis::apply(const CeedInt elementCount, CeedTransposeMode tmode, CeedEvalMode emode, Vector *U, Vector *V) { + const bool transpose = tmode == CEED_TRANSPOSE; + + if ((dim < 1) || (3 < dim)) { + return ceedError("Backend only supports dimensions: 1, 2, and 3"); + } + + // Check arguments + if (emode != CEED_EVAL_WEIGHT) { + if (!U) { + return ceedError("Incorrect CeedVector input: U"); } + } + if (!V) { + return ceedError("Incorrect CeedVector input: V"); + } - //---[ Ceed Callbacks ]------------- - int 
SimplexBasis::ceedCreate(CeedElemTopology topology, - CeedInt dim, - CeedInt ndof, - CeedInt nquad, - const CeedScalar *interp, - const CeedScalar *grad, - const CeedScalar *qref, - const CeedScalar *qWeight, - CeedBasis basis) { - int ierr; - Ceed ceed; - ierr = CeedBasisGetCeed(basis, &ceed); CeedChk(ierr); - - SimplexBasis *basis_ = new SimplexBasis(basis, - dim, - ndof, nquad, - interp, grad, qWeight); - ierr = CeedBasisSetData(basis, basis_); CeedChk(ierr); - - CeedOccaRegisterFunction(basis, "Apply", Basis::ceedApply); - CeedOccaRegisterFunction(basis, "Destroy", Basis::ceedDestroy); - - return CEED_ERROR_SUCCESS; + try { + // Apply kernel + switch (emode) { + case CEED_EVAL_INTERP: + return applyInterp(elementCount, transpose, *U, *V); + case CEED_EVAL_GRAD: + return applyGrad(elementCount, transpose, *U, *V); + case CEED_EVAL_WEIGHT: + return applyWeight(elementCount, *V); + default: + return ceedError("Backend does not support given simplex eval mode"); } + } catch (::occa::exception &exc) { + // Handle kernel build errors the CEED way + CeedHandleOccaException(exc); } + + return CEED_ERROR_SUCCESS; +} + +//---[ Ceed Callbacks ]------------- +int SimplexBasis::ceedCreate(CeedElemTopology topology, CeedInt dim, CeedInt ndof, CeedInt nquad, const CeedScalar *interp, const CeedScalar *grad, + const CeedScalar *qref, const CeedScalar *qWeight, CeedBasis basis) { + Ceed ceed; + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); + + SimplexBasis *basis_ = new SimplexBasis(basis, dim, ndof, nquad, interp, grad, qWeight); + CeedCallBackend(CeedBasisSetData(basis, basis_)); + + CeedOccaRegisterFunction(basis, "Apply", Basis::ceedApply); + CeedOccaRegisterFunction(basis, "Destroy", Basis::ceedDestroy); + + return CEED_ERROR_SUCCESS; } +} // namespace occa +} // namespace ceed diff --git a/backends/occa/ceed-occa-simplex-basis.hpp b/backends/occa/ceed-occa-simplex-basis.hpp index 1f8d141bde..bfd96d5929 100644 --- a/backends/occa/ceed-occa-simplex-basis.hpp +++ 
b/backends/occa/ceed-occa-simplex-basis.hpp @@ -11,71 +11,47 @@ #include "ceed-occa-basis.hpp" namespace ceed { - namespace occa { - class SimplexBasis : public Basis { - public: - ::occa::memory interp; - ::occa::memory grad; - ::occa::memory qWeight; - - ::occa::json kernelProperties; - ::occa::kernel interpKernel; - ::occa::kernel interpTKernel; - ::occa::kernel gradKernel; - ::occa::kernel gradTKernel; - ::occa::kernel weightKernel; - - SimplexBasis(CeedBasis basis, - CeedInt dim, - CeedInt P_, - CeedInt Q_, - const CeedScalar *interp_, - const CeedScalar *grad_, - const CeedScalar *qWeight_); - - ~SimplexBasis(); - - bool isTensorBasis() const; - - const char* getFunctionSource() const; - - void setKernelProperties(); - - std::string getKernelSource() const; - - ::occa::kernel buildKernel(const std::string& kernelName); - - int applyInterp(const CeedInt elementCount, - const bool transpose, - Vector &U, - Vector &V); - - int applyGrad(const CeedInt elementCount, - const bool transpose, - Vector &U, - Vector &V); - - int applyWeight(const CeedInt elementCount, - Vector &W); - - int apply(const CeedInt elementCount, - CeedTransposeMode tmode, - CeedEvalMode emode, - Vector *u, - Vector *v); - - //---[ Ceed Callbacks ]----------- - static int ceedCreate(CeedElemTopology topology, - CeedInt dim, - CeedInt ndof, - CeedInt nquad, - const CeedScalar *interp, - const CeedScalar *grad, - const CeedScalar *qref, - const CeedScalar *qWeight, - CeedBasis basis); - }; - } -} +namespace occa { +class SimplexBasis : public Basis { + public: + ::occa::memory interp; + ::occa::memory grad; + ::occa::memory qWeight; + + ::occa::json kernelProperties; + ::occa::kernel interpKernel; + ::occa::kernel interpTKernel; + ::occa::kernel gradKernel; + ::occa::kernel gradTKernel; + ::occa::kernel weightKernel; + + SimplexBasis(CeedBasis basis, CeedInt dim, CeedInt P_, CeedInt Q_, const CeedScalar *interp_, const CeedScalar *grad_, const CeedScalar *qWeight_); + + ~SimplexBasis(); + + 
bool isTensorBasis() const; + + const char *getFunctionSource() const; + + void setKernelProperties(); + + std::string getKernelSource() const; + + ::occa::kernel buildKernel(const std::string &kernelName); + + int applyInterp(const CeedInt elementCount, const bool transpose, Vector &U, Vector &V); + + int applyGrad(const CeedInt elementCount, const bool transpose, Vector &U, Vector &V); + + int applyWeight(const CeedInt elementCount, Vector &W); + + int apply(const CeedInt elementCount, CeedTransposeMode tmode, CeedEvalMode emode, Vector *u, Vector *v); + + //---[ Ceed Callbacks ]----------- + static int ceedCreate(CeedElemTopology topology, CeedInt dim, CeedInt ndof, CeedInt nquad, const CeedScalar *interp, const CeedScalar *grad, + const CeedScalar *qref, const CeedScalar *qWeight, CeedBasis basis); +}; +} // namespace occa +} // namespace ceed #endif diff --git a/backends/occa/ceed-occa-tensor-basis.cpp b/backends/occa/ceed-occa-tensor-basis.cpp index 22963bf812..4f4ce01502 100644 --- a/backends/occa/ceed-occa-tensor-basis.cpp +++ b/backends/occa/ceed-occa-tensor-basis.cpp @@ -5,289 +5,232 @@ // // This file is part of CEED: http://github.com/ceed -#include "ceed-occa-kernels.hpp" #include "ceed-occa-tensor-basis.hpp" +#include "ceed-occa-kernels.hpp" + namespace ceed { - namespace occa { - TensorBasis::TensorBasis(CeedBasis basis, - CeedInt dim_, - CeedInt P1D_, - CeedInt Q1D_, - const CeedScalar *interp1D_, - const CeedScalar *grad1D_, - const CeedScalar *qWeight1D_) : - P1D(P1D_), - Q1D(Q1D_) { - setCeedFields(basis); - - dim = dim_; - - P = P1D; - Q = Q1D; - for (int i = 1; i < dim; ++i) { - P *= P1D; - Q *= Q1D; - } - - ::occa::device device = getDevice(); - - interp1D = device.malloc(P1D * Q1D, interp1D_); - grad1D = device.malloc(P1D * Q1D, grad1D_); - qWeight1D = device.malloc(Q1D, qWeight1D_); - - setKernelProperties(); - } +namespace occa { +TensorBasis::TensorBasis(CeedBasis basis, CeedInt dim_, CeedInt P1D_, CeedInt Q1D_, const CeedScalar 
*interp1D_, const CeedScalar *grad1D_, + const CeedScalar *qWeight1D_) + : P1D(P1D_), Q1D(Q1D_) { + setCeedFields(basis); + + dim = dim_; + + P = P1D; + Q = Q1D; + for (int i = 1; i < dim; ++i) { + P *= P1D; + Q *= Q1D; + } - TensorBasis::~TensorBasis() {} + ::occa::device device = getDevice(); - bool TensorBasis::isTensorBasis() const { - return true; - } + interp1D = device.malloc(P1D * Q1D, interp1D_); + grad1D = device.malloc(P1D * Q1D, grad1D_); + qWeight1D = device.malloc(Q1D, qWeight1D_); - void TensorBasis::setKernelProperties() { - kernelProperties["defines/CeedInt"] = ::occa::dtype::get().name(); - kernelProperties["defines/CeedScalar"] = ::occa::dtype::get().name(); - kernelProperties["defines/Q1D"] = Q1D; - kernelProperties["defines/P1D"] = P1D; - kernelProperties["defines/BASIS_COMPONENT_COUNT"] = ceedComponentCount; - if(usingGpuDevice()) { - kernelProperties["defines/MAX_PQ"] = (Q1D > P1D) ? Q1D : P1D; - } - } + setKernelProperties(); +} - const char* TensorBasis::getFunctionSource() const { - // TODO: Add gpu function sources when split - const char *cpuFunctionSources[3] = { - occa_tensor_basis_1d_cpu_function_source, - occa_tensor_basis_2d_cpu_function_source, - occa_tensor_basis_3d_cpu_function_source - }; - return cpuFunctionSources[dim - 1]; - } +TensorBasis::~TensorBasis() {} - std::string TensorBasis::getKernelSource() const { - const char *cpuFunctionSources[3] = { - occa_tensor_basis_1d_cpu_function_source, - occa_tensor_basis_2d_cpu_function_source, - occa_tensor_basis_3d_cpu_function_source - }; - const char *cpuKernelSources[3] = { - occa_tensor_basis_1d_cpu_kernel_source, - occa_tensor_basis_2d_cpu_kernel_source, - occa_tensor_basis_3d_cpu_kernel_source - }; - const char *gpuKernelSources[3] = { - occa_tensor_basis_1d_gpu_source, - occa_tensor_basis_2d_gpu_source, - occa_tensor_basis_3d_gpu_source - }; - - std::string kernelSource; - if (usingGpuDevice()) { - kernelSource = gpuKernelSources[dim - 1]; - } else { - kernelSource = 
cpuFunctionSources[dim - 1]; - kernelSource += '\n'; - kernelSource += cpuKernelSources[dim - 1]; - } - return kernelSource; - } +bool TensorBasis::isTensorBasis() const { return true; } - ::occa::kernel TensorBasis::buildKernel(const std::string& kernelName) { - std::string kernelSource = getKernelSource(); - return getDevice().buildKernelFromString(kernelSource, - kernelName, - kernelProperties); - } +void TensorBasis::setKernelProperties() { + kernelProperties["defines/CeedInt"] = ::occa::dtype::get().name(); + kernelProperties["defines/CeedScalar"] = ::occa::dtype::get().name(); + kernelProperties["defines/Q1D"] = Q1D; + kernelProperties["defines/P1D"] = P1D; + kernelProperties["defines/BASIS_COMPONENT_COUNT"] = ceedComponentCount; + if (usingGpuDevice()) { + kernelProperties["defines/MAX_PQ"] = (Q1D > P1D) ? Q1D : P1D; + } +} + +const char *TensorBasis::getFunctionSource() const { + // TODO: Add gpu function sources when split + const char *cpuFunctionSources[3] = {occa_tensor_basis_1d_cpu_function_source, occa_tensor_basis_2d_cpu_function_source, + occa_tensor_basis_3d_cpu_function_source}; + return cpuFunctionSources[dim - 1]; +} + +std::string TensorBasis::getKernelSource() const { + const char *cpuFunctionSources[3] = {occa_tensor_basis_1d_cpu_function_source, occa_tensor_basis_2d_cpu_function_source, + occa_tensor_basis_3d_cpu_function_source}; + const char *cpuKernelSources[3] = {occa_tensor_basis_1d_cpu_kernel_source, occa_tensor_basis_2d_cpu_kernel_source, + occa_tensor_basis_3d_cpu_kernel_source}; + const char *gpuKernelSources[3] = {occa_tensor_basis_1d_gpu_source, occa_tensor_basis_2d_gpu_source, occa_tensor_basis_3d_gpu_source}; + + std::string kernelSource; + if (usingGpuDevice()) { + kernelSource = gpuKernelSources[dim - 1]; + } else { + kernelSource = cpuFunctionSources[dim - 1]; + kernelSource += '\n'; + kernelSource += cpuKernelSources[dim - 1]; + } + return kernelSource; +} - int TensorBasis::applyInterp(const CeedInt elementCount, - const 
bool transpose, - Vector &U, - Vector &V) { - if(transpose) { - if(!interpTKernel.isInitialized()) { - kernelProperties["defines/TRANSPOSE"] = transpose; - kernelProperties["defines/ELEMENTS_PER_BLOCK"] = elementsPerBlockInterp(); - interpTKernel = buildKernel("interp"); - } - interpTKernel(elementCount, - interp1D, - U.getConstKernelArg(), - V.getKernelArg()); - } else { - if(!interpKernel.isInitialized()) { - kernelProperties["defines/TRANSPOSE"] = transpose; - kernelProperties["defines/ELEMENTS_PER_BLOCK"] = elementsPerBlockInterp(); - interpKernel = buildKernel("interp"); - } - interpKernel(elementCount, - interp1D, - U.getConstKernelArg(), - V.getKernelArg()); - } - return CEED_ERROR_SUCCESS; +::occa::kernel TensorBasis::buildKernel(const std::string &kernelName) { + std::string kernelSource = getKernelSource(); + return getDevice().buildKernelFromString(kernelSource, kernelName, kernelProperties); +} + +int TensorBasis::applyInterp(const CeedInt elementCount, const bool transpose, Vector &U, Vector &V) { + if (transpose) { + if (!interpTKernel.isInitialized()) { + kernelProperties["defines/TRANSPOSE"] = transpose; + kernelProperties["defines/ELEMENTS_PER_BLOCK"] = elementsPerBlockInterp(); + interpTKernel = buildKernel("interp"); + } + interpTKernel(elementCount, interp1D, U.getConstKernelArg(), V.getKernelArg()); + } else { + if (!interpKernel.isInitialized()) { + kernelProperties["defines/TRANSPOSE"] = transpose; + kernelProperties["defines/ELEMENTS_PER_BLOCK"] = elementsPerBlockInterp(); + interpKernel = buildKernel("interp"); } + interpKernel(elementCount, interp1D, U.getConstKernelArg(), V.getKernelArg()); + } + return CEED_ERROR_SUCCESS; +} - int TensorBasis::elementsPerBlockInterp() const { - int elementsPerBlock; - if (dim == 1) { - elementsPerBlock = 32; - } else if (dim == 2) { - const CeedInt blocksByQ[7] = {0, 32, 8, 6, 4, 2, 8}; - if (Q1D < 7) { - elementsPerBlock = blocksByQ[Q1D]; - } else { - elementsPerBlock = 1; - } - } else { - 
elementsPerBlock = 1; - } - return elementsPerBlock; +int TensorBasis::elementsPerBlockInterp() const { + int elementsPerBlock; + if (dim == 1) { + elementsPerBlock = 32; + } else if (dim == 2) { + const CeedInt blocksByQ[7] = {0, 32, 8, 6, 4, 2, 8}; + if (Q1D < 7) { + elementsPerBlock = blocksByQ[Q1D]; + } else { + elementsPerBlock = 1; } + } else { + elementsPerBlock = 1; + } + return elementsPerBlock; +} - int TensorBasis::applyGrad(const CeedInt elementCount, - const bool transpose, - Vector &U, - Vector &V) { - - if(transpose) { - if(!gradTKernel.isInitialized()) { - kernelProperties["defines/TRANSPOSE"] = transpose; - kernelProperties["defines/ELEMENTS_PER_BLOCK"] = elementsPerBlockGrad(); - gradTKernel = buildKernel("grad"); - } - gradTKernel(elementCount, - interp1D, - grad1D, - U.getConstKernelArg(), - V.getKernelArg()); - } else { - if(!gradKernel.isInitialized()) { - kernelProperties["defines/TRANSPOSE"] = transpose; - kernelProperties["defines/ELEMENTS_PER_BLOCK"] = elementsPerBlockGrad(); - gradKernel = buildKernel("grad"); - } - gradKernel(elementCount, - interp1D, - grad1D, - U.getConstKernelArg(), - V.getKernelArg()); - } - return CEED_ERROR_SUCCESS; +int TensorBasis::applyGrad(const CeedInt elementCount, const bool transpose, Vector &U, Vector &V) { + if (transpose) { + if (!gradTKernel.isInitialized()) { + kernelProperties["defines/TRANSPOSE"] = transpose; + kernelProperties["defines/ELEMENTS_PER_BLOCK"] = elementsPerBlockGrad(); + gradTKernel = buildKernel("grad"); + } + gradTKernel(elementCount, interp1D, grad1D, U.getConstKernelArg(), V.getKernelArg()); + } else { + if (!gradKernel.isInitialized()) { + kernelProperties["defines/TRANSPOSE"] = transpose; + kernelProperties["defines/ELEMENTS_PER_BLOCK"] = elementsPerBlockGrad(); + gradKernel = buildKernel("grad"); } + gradKernel(elementCount, interp1D, grad1D, U.getConstKernelArg(), V.getKernelArg()); + } + return CEED_ERROR_SUCCESS; +} - int TensorBasis::elementsPerBlockGrad() const { - int 
elementsPerBlock; - if (dim == 1) { - elementsPerBlock = 32; - } else if (dim == 2) { - const CeedInt blocksByQ[7] = {0, 32, 8, 6, 4, 2, 8}; - if (Q1D < 7) { - elementsPerBlock = blocksByQ[Q1D]; - } else { - elementsPerBlock = 1; - } - } else { - elementsPerBlock = 1; - } - return elementsPerBlock; +int TensorBasis::elementsPerBlockGrad() const { + int elementsPerBlock; + if (dim == 1) { + elementsPerBlock = 32; + } else if (dim == 2) { + const CeedInt blocksByQ[7] = {0, 32, 8, 6, 4, 2, 8}; + if (Q1D < 7) { + elementsPerBlock = blocksByQ[Q1D]; + } else { + elementsPerBlock = 1; } + } else { + elementsPerBlock = 1; + } + return elementsPerBlock; +} - int TensorBasis::applyWeight(const CeedInt elementCount, - Vector &W) { - if(!weightKernel.isInitialized()) { - kernelProperties["defines/ELEMENTS_PER_BLOCK"] = elementsPerBlockWeight(); - weightKernel = buildKernel("weight"); - } - weightKernel(elementCount, qWeight1D, W.getKernelArg()); +int TensorBasis::applyWeight(const CeedInt elementCount, Vector &W) { + if (!weightKernel.isInitialized()) { + kernelProperties["defines/ELEMENTS_PER_BLOCK"] = elementsPerBlockWeight(); + weightKernel = buildKernel("weight"); + } + weightKernel(elementCount, qWeight1D, W.getKernelArg()); - return CEED_ERROR_SUCCESS; - } + return CEED_ERROR_SUCCESS; +} - int TensorBasis::elementsPerBlockWeight() const { - int elementsPerBlock; - if (dim == 1) { - elementsPerBlock = 32 / Q1D; - } else if (dim == 2) { - if ((Q1D * Q1D) > 32) { - elementsPerBlock = 1; - } else { - elementsPerBlock = 32 / (Q1D * Q1D); - } - } else { - elementsPerBlock = Q1D; - } - return elementsPerBlock; +int TensorBasis::elementsPerBlockWeight() const { + int elementsPerBlock; + if (dim == 1) { + elementsPerBlock = 32 / Q1D; + } else if (dim == 2) { + if ((Q1D * Q1D) > 32) { + elementsPerBlock = 1; + } else { + elementsPerBlock = 32 / (Q1D * Q1D); } + } else { + elementsPerBlock = Q1D; + } + return elementsPerBlock; +} + +int TensorBasis::apply(const CeedInt 
elementCount, CeedTransposeMode tmode, CeedEvalMode emode, Vector *U, Vector *V) { + const bool transpose = tmode == CEED_TRANSPOSE; - int TensorBasis::apply(const CeedInt elementCount, - CeedTransposeMode tmode, - CeedEvalMode emode, - Vector *U, - Vector *V) { - const bool transpose = tmode == CEED_TRANSPOSE; - - if ((dim < 1) || (3 < dim)) { - return ceedError("Backend only supports dimensions: 1, 2, and 3"); - } - - // Check arguments - if (emode != CEED_EVAL_WEIGHT) { - if (!U) { - return ceedError("Incorrect CeedVector input: U"); - } - } - if (!V) { - return ceedError("Incorrect CeedVector input: V"); - } - - try { - // Apply kernel - switch (emode) { - case CEED_EVAL_INTERP: - return applyInterp(elementCount, transpose, *U, *V); - case CEED_EVAL_GRAD: - return applyGrad(elementCount, transpose, *U, *V); - case CEED_EVAL_WEIGHT: - return applyWeight(elementCount, *V); - default: - return ceedError("Backend does not support given tensor eval mode"); - } - } catch (::occa::exception &exc) { - // Handle kernel build errors the CEED way - CeedHandleOccaException(exc); - } - - return CEED_ERROR_SUCCESS; + if ((dim < 1) || (3 < dim)) { + return ceedError("Backend only supports dimensions: 1, 2, and 3"); + } + + // Check arguments + if (emode != CEED_EVAL_WEIGHT) { + if (!U) { + return ceedError("Incorrect CeedVector input: U"); } + } + if (!V) { + return ceedError("Incorrect CeedVector input: V"); + } - //---[ Ceed Callbacks ]------------- - int TensorBasis::ceedCreate(CeedInt dim, - CeedInt P1D, CeedInt Q1D, - const CeedScalar *interp1D, - const CeedScalar *grad1D, - const CeedScalar *qref1D, - const CeedScalar *qWeight1D, - CeedBasis basis) { - int ierr; - Ceed ceed; - ierr = CeedBasisGetCeed(basis, &ceed); CeedChk(ierr); - - if (Q1D < P1D && Context::from(ceed)->usingGpuDevice()) { - return staticCeedError("(OCCA) Backend does not implement underintegrated basis"); - } - - TensorBasis *basis_ = new TensorBasis(basis, - dim, - P1D, Q1D, - interp1D, grad1D, 
qWeight1D); - ierr = CeedBasisSetData(basis, basis_); CeedChk(ierr); - - CeedOccaRegisterFunction(basis, "Apply", Basis::ceedApply); - CeedOccaRegisterFunction(basis, "Destroy", Basis::ceedDestroy); - - return CEED_ERROR_SUCCESS; + try { + // Apply kernel + switch (emode) { + case CEED_EVAL_INTERP: + return applyInterp(elementCount, transpose, *U, *V); + case CEED_EVAL_GRAD: + return applyGrad(elementCount, transpose, *U, *V); + case CEED_EVAL_WEIGHT: + return applyWeight(elementCount, *V); + default: + return ceedError("Backend does not support given tensor eval mode"); } + } catch (::occa::exception &exc) { + // Handle kernel build errors the CEED way + CeedHandleOccaException(exc); + } + + return CEED_ERROR_SUCCESS; +} + +//---[ Ceed Callbacks ]------------- +int TensorBasis::ceedCreate(CeedInt dim, CeedInt P1D, CeedInt Q1D, const CeedScalar *interp1D, const CeedScalar *grad1D, const CeedScalar *qref1D, + const CeedScalar *qWeight1D, CeedBasis basis) { + Ceed ceed; + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); + + if (Q1D < P1D && Context::from(ceed)->usingGpuDevice()) { + return staticCeedError("(OCCA) Backend does not implement underintegrated basis"); } + + TensorBasis *basis_ = new TensorBasis(basis, dim, P1D, Q1D, interp1D, grad1D, qWeight1D); + CeedCallBackend(CeedBasisSetData(basis, basis_)); + + CeedOccaRegisterFunction(basis, "Apply", Basis::ceedApply); + CeedOccaRegisterFunction(basis, "Destroy", Basis::ceedDestroy); + + return CEED_ERROR_SUCCESS; } +} // namespace occa +} // namespace ceed diff --git a/backends/occa/ceed-occa-tensor-basis.hpp b/backends/occa/ceed-occa-tensor-basis.hpp index 8b024deb9f..c1fee6529f 100644 --- a/backends/occa/ceed-occa-tensor-basis.hpp +++ b/backends/occa/ceed-occa-tensor-basis.hpp @@ -11,76 +11,54 @@ #include "ceed-occa-basis.hpp" namespace ceed { - namespace occa { - class TensorBasis : public Basis { - public: - CeedInt P1D; - CeedInt Q1D; - ::occa::memory interp1D; - ::occa::memory grad1D; - ::occa::memory 
qWeight1D; - - ::occa::json kernelProperties; - ::occa::kernel interpKernel; - ::occa::kernel interpTKernel; - ::occa::kernel gradKernel; - ::occa::kernel gradTKernel; - ::occa::kernel weightKernel; - - TensorBasis(CeedBasis basis, - CeedInt dim_, - CeedInt P1D_, - CeedInt Q1D_, - const CeedScalar *interp1D_, - const CeedScalar *grad1D_, - const CeedScalar *qWeight1D_); - - ~TensorBasis(); - - bool isTensorBasis() const; - - const char* getFunctionSource() const; - - std::string getKernelSource() const; - - void setKernelProperties(); - - int elementsPerBlockInterp() const; - int elementsPerBlockGrad() const; - int elementsPerBlockWeight() const; - - ::occa::kernel buildKernel(const std::string& kernelName); - - int applyInterp(const CeedInt elementCount, - const bool transpose, - Vector &U, - Vector &V); - - int applyGrad(const CeedInt elementCount, - const bool transpose, - Vector &U, - Vector &V); - - int applyWeight(const CeedInt elementCount, - Vector &W); - - int apply(const CeedInt elementCount, - CeedTransposeMode tmode, - CeedEvalMode emode, - Vector *U, - Vector *V); - - //---[ Ceed Callbacks ]----------- - static int ceedCreate(CeedInt dim, - CeedInt P1D, - CeedInt Q1D, - const CeedScalar *interp1D, - const CeedScalar *grad1D, - const CeedScalar *qref1D, - const CeedScalar *qWeight1D, - CeedBasis basis); - }; - } -} +namespace occa { +class TensorBasis : public Basis { + public: + CeedInt P1D; + CeedInt Q1D; + ::occa::memory interp1D; + ::occa::memory grad1D; + ::occa::memory qWeight1D; + + ::occa::json kernelProperties; + ::occa::kernel interpKernel; + ::occa::kernel interpTKernel; + ::occa::kernel gradKernel; + ::occa::kernel gradTKernel; + ::occa::kernel weightKernel; + + TensorBasis(CeedBasis basis, CeedInt dim_, CeedInt P1D_, CeedInt Q1D_, const CeedScalar *interp1D_, const CeedScalar *grad1D_, + const CeedScalar *qWeight1D_); + + ~TensorBasis(); + + bool isTensorBasis() const; + + const char *getFunctionSource() const; + + std::string 
getKernelSource() const; + + void setKernelProperties(); + + int elementsPerBlockInterp() const; + int elementsPerBlockGrad() const; + int elementsPerBlockWeight() const; + + ::occa::kernel buildKernel(const std::string &kernelName); + + int applyInterp(const CeedInt elementCount, const bool transpose, Vector &U, Vector &V); + + int applyGrad(const CeedInt elementCount, const bool transpose, Vector &U, Vector &V); + + int applyWeight(const CeedInt elementCount, Vector &W); + + int apply(const CeedInt elementCount, CeedTransposeMode tmode, CeedEvalMode emode, Vector *U, Vector *V); + + //---[ Ceed Callbacks ]----------- + static int ceedCreate(CeedInt dim, CeedInt P1D, CeedInt Q1D, const CeedScalar *interp1D, const CeedScalar *grad1D, const CeedScalar *qref1D, + const CeedScalar *qWeight1D, CeedBasis basis); +}; +} // namespace occa +} // namespace ceed #endif diff --git a/backends/occa/ceed-occa-types.hpp b/backends/occa/ceed-occa-types.hpp index f2661e1909..121a8b122f 100644 --- a/backends/occa/ceed-occa-types.hpp +++ b/backends/occa/ceed-occa-types.hpp @@ -9,48 +9,52 @@ #define CEED_OCCA_TYPES_HEADER #include + #include -#define CeedOccaFromChk(ierr) \ - do { \ - if (ierr) { \ - return NULL; \ - } \ +#define CeedOccaFromChk(ierr) \ + do { \ + if (ierr) { \ + return NULL; \ + } \ } while (0) -#define CeedOccaValidChk(isValidVar, ierr) \ - do { \ - if (ierr) { \ - isValidVar = false; \ - return; \ - } \ +#define CeedCallOcca(...) \ + do { \ + int ierr_q_ = __VA_ARGS__; \ + CeedOccaFromChk(ierr_q_); \ + } while (0); + +#define CeedOccaValidChk(isValidVar, ierr) \ + do { \ + if (ierr) { \ + isValidVar = false; \ + return; \ + } \ } while (0) -#define CeedHandleOccaException(exc) \ - do { \ - std::string error = exc.toString(); \ - return CeedError(ceed, CEED_ERROR_BACKEND, error.c_str()); \ +#define CeedCallOccaValid(isValidVar, ...) 
\ + do { \ + int ierr_q_ = __VA_ARGS__; \ + CeedOccaValidChk(isValidVar, ierr_q_); \ + } while (0); + +#define CeedHandleOccaException(exc) \ + do { \ + std::string error = exc.toString(); \ + return CeedError(ceed, CEED_ERROR_BACKEND, error.c_str()); \ } while (0) -#define CeedOccaCastRegisterFunction(func) \ - (ceed::occa::ceedFunction) (void*) func +#define CeedOccaCastRegisterFunction(func) (ceed::occa::ceedFunction)(void*) func -#define CeedOccaRegisterBaseFunction(name, func) \ - ierr = registerCeedFunction( \ - ceed, name, \ - CeedOccaCastRegisterFunction(func) \ - ); CeedChk(ierr) +#define CeedOccaRegisterBaseFunction(name, func) CeedCallBackend(registerCeedFunction(ceed, name, CeedOccaCastRegisterFunction(func))); -#define CeedOccaRegisterFunction(object, name, func) \ - ierr = registerCeedFunction( \ - ceed, object, name, \ - CeedOccaCastRegisterFunction(func) \ - ); CeedChk(ierr) +#define CeedOccaRegisterFunction(object, name, func) CeedCallBackend(registerCeedFunction(ceed, object, name, CeedOccaCastRegisterFunction(func))); namespace ceed { - namespace occa { - typedef int (*ceedFunction)(); - } +namespace occa { +typedef int (*ceedFunction)(); } +} // namespace ceed #endif diff --git a/backends/occa/ceed-occa-vector.cpp b/backends/occa/ceed-occa-vector.cpp index 6f2982edaf..44f3a23b54 100644 --- a/backends/occa/ceed-occa-vector.cpp +++ b/backends/occa/ceed-occa-vector.cpp @@ -5,496 +5,456 @@ // // This file is part of CEED: http://github.com/ceed +#include "ceed-occa-vector.hpp" + #include #include "ceed-occa-kernels.hpp" -#include "ceed-occa-vector.hpp" namespace ceed { - namespace occa { - Vector::Vector() : - length(0), - hostBufferLength(0), - hostBuffer(NULL), - currentHostBuffer(NULL), - syncState(SyncState::none) {} - - Vector::~Vector() { - memory.free(); - freeHostBuffer(); - } +namespace occa { +Vector::Vector() : length(0), hostBufferLength(0), hostBuffer(NULL), currentHostBuffer(NULL), syncState(SyncState::none) {} - int 
Vector::hasValidArray(bool* has_valid_array) { - (*has_valid_array) = (!!hostBuffer) - || (!!currentHostBuffer) - || (memory.isInitialized()) - || (currentMemory.isInitialized()); - return CEED_ERROR_SUCCESS; - } +Vector::~Vector() { + memory.free(); + freeHostBuffer(); +} - int Vector::hasBorrowedArrayOfType(CeedMemType mem_type, - bool *has_borrowed_array_of_type) { - switch (mem_type) { - case CEED_MEM_HOST: - (*has_borrowed_array_of_type) = !!currentHostBuffer; - break; - case CEED_MEM_DEVICE: - (*has_borrowed_array_of_type) = currentMemory.isInitialized(); - break; - } - return CEED_ERROR_SUCCESS; - } +int Vector::hasValidArray(bool *has_valid_array) { + (*has_valid_array) = (!!hostBuffer) || (!!currentHostBuffer) || (memory.isInitialized()) || (currentMemory.isInitialized()); + return CEED_ERROR_SUCCESS; +} - Vector* Vector::getVector(CeedVector vec, - const bool assertValid) { - if (!vec || vec == CEED_VECTOR_NONE) { - return NULL; - } +int Vector::hasBorrowedArrayOfType(CeedMemType mem_type, bool *has_borrowed_array_of_type) { + switch (mem_type) { + case CEED_MEM_HOST: + (*has_borrowed_array_of_type) = !!currentHostBuffer; + break; + case CEED_MEM_DEVICE: + (*has_borrowed_array_of_type) = currentMemory.isInitialized(); + break; + } + return CEED_ERROR_SUCCESS; +} - int ierr; - Vector *vector = NULL; +Vector *Vector::getVector(CeedVector vec, const bool assertValid) { + if (!vec || vec == CEED_VECTOR_NONE) { + return NULL; + } - ierr = CeedVectorGetData(vec, &vector); - if (assertValid) { - CeedOccaFromChk(ierr); - } + int ierr; + Vector *vector = NULL; - return vector; - } + ierr = CeedVectorGetData(vec, &vector); + if (assertValid) { + CeedOccaFromChk(ierr); + } - Vector* Vector::from(CeedVector vec) { - Vector *vector = getVector(vec); - if (!vector) { - return NULL; - } + return vector; +} - int ierr; - ierr = CeedVectorGetCeed(vec, &vector->ceed); CeedOccaFromChk(ierr); - ierr = CeedVectorGetLength(vec, &vector->length); CeedOccaFromChk(ierr); +Vector 
*Vector::from(CeedVector vec) { + Vector *vector = getVector(vec); + if (!vector) { + return NULL; + } - return vector; - } + CeedCallOcca(CeedVectorGetCeed(vec, &vector->ceed)); + CeedCallOcca(CeedVectorGetLength(vec, &vector->length)); - void Vector::resize(const CeedSize length_) { - length = length_; - } + return vector; +} - void Vector::resizeMemory(const CeedSize length_) { - resizeMemory(getDevice(), length_); - } +void Vector::resize(const CeedSize length_) { length = length_; } - void Vector::resizeMemory(::occa::device device, const CeedSize length_) { - if (length_ != (CeedSize) memory.length()) { - memory.free(); - memory = device.malloc(length_); - } - } +void Vector::resizeMemory(const CeedSize length_) { resizeMemory(getDevice(), length_); } - void Vector::resizeHostBuffer(const CeedSize length_) { - if (length_ != hostBufferLength) { - delete hostBuffer; - hostBuffer = new CeedScalar[length_]; - } - } +void Vector::resizeMemory(::occa::device device, const CeedSize length_) { + if (length_ != (CeedSize)memory.length()) { + memory.free(); + memory = device.malloc(length_); + } +} - void Vector::setCurrentMemoryIfNeeded() { - if (!currentMemory.isInitialized()) { - resizeMemory(length); - currentMemory = memory; - } - } +void Vector::resizeHostBuffer(const CeedSize length_) { + if (length_ != hostBufferLength) { + delete hostBuffer; + hostBuffer = new CeedScalar[length_]; + } +} - void Vector::setCurrentHostBufferIfNeeded() { - if (!currentHostBuffer) { - resizeHostBuffer(length); - currentHostBuffer = hostBuffer; - } - } +void Vector::setCurrentMemoryIfNeeded() { + if (!currentMemory.isInitialized()) { + resizeMemory(length); + currentMemory = memory; + } +} - void Vector::freeHostBuffer() { - if (hostBuffer) { - delete [] hostBuffer; - hostBuffer = NULL; - } - } +void Vector::setCurrentHostBufferIfNeeded() { + if (!currentHostBuffer) { + resizeHostBuffer(length); + currentHostBuffer = hostBuffer; + } +} - int Vector::setValue(CeedScalar value) { - 
// Prioritize keeping data in the device - if (syncState & SyncState::device) { - setCurrentMemoryIfNeeded(); - if(!setValueKernel.isInitialized()) { - ::occa::json kernelProperties; - CeedInt constexpr block_size{256}; - kernelProperties["defines/CeedInt"] = ::occa::dtype::get().name(); - kernelProperties["defines/CeedScalar"] = ::occa::dtype::get().name(); - kernelProperties["defines/BLOCK_SIZE"] = block_size; - - std::string kernelSource = occa_set_value_source; - setValueKernel = getDevice().buildKernelFromString( - kernelSource,"setValue",kernelProperties - ); - setValueKernel(currentMemory,value,length); - } - syncState = SyncState::device; - } else { - setCurrentHostBufferIfNeeded(); - for (CeedInt i = 0; i < length; ++i) { - currentHostBuffer[i] = value; - } - syncState = SyncState::host; - } - return CEED_ERROR_SUCCESS; - } +void Vector::freeHostBuffer() { + if (hostBuffer) { + delete[] hostBuffer; + hostBuffer = NULL; + } +} - int Vector::setArray(CeedMemType mtype, - CeedCopyMode cmode, CeedScalar *array) { - switch (cmode) { - case CEED_COPY_VALUES: - return copyArrayValues(mtype, array); - case CEED_OWN_POINTER: - return ownArrayPointer(mtype, array); - case CEED_USE_POINTER: - return useArrayPointer(mtype, array); - } - return ceedError("Invalid CeedCopyMode passed"); - } +int Vector::setValue(CeedScalar value) { + // Prioritize keeping data in the device + if (syncState & SyncState::device) { + setCurrentMemoryIfNeeded(); + if (!setValueKernel.isInitialized()) { + ::occa::json kernelProperties; + CeedInt constexpr block_size{256}; + kernelProperties["defines/CeedInt"] = ::occa::dtype::get().name(); + kernelProperties["defines/CeedScalar"] = ::occa::dtype::get().name(); + kernelProperties["defines/BLOCK_SIZE"] = block_size; + + std::string kernelSource = occa_set_value_source; + setValueKernel = getDevice().buildKernelFromString(kernelSource, "setValue", kernelProperties); + setValueKernel(currentMemory, value, length); + } + syncState = 
SyncState::device; + } else { + setCurrentHostBufferIfNeeded(); + for (CeedInt i = 0; i < length; ++i) { + currentHostBuffer[i] = value; + } + syncState = SyncState::host; + } + return CEED_ERROR_SUCCESS; +} - int Vector::takeArray(CeedMemType mtype, CeedScalar **array) { - switch (mtype) { - case CEED_MEM_HOST: - setCurrentHostBufferIfNeeded(); - if (syncState == SyncState::device) { - setCurrentMemoryIfNeeded(); - currentMemory.copyTo(currentHostBuffer); - } - *array = currentHostBuffer; - hostBuffer = NULL; - currentHostBuffer = NULL; - - syncState = SyncState::host; - return CEED_ERROR_SUCCESS; - case CEED_MEM_DEVICE: - setCurrentMemoryIfNeeded(); - if (syncState == SyncState::host) { - setCurrentHostBufferIfNeeded(); - currentMemory.copyFrom(currentHostBuffer); - } - *array = memoryToArray(currentMemory); - memory = ::occa::null; - currentMemory = ::occa::null; - - syncState = SyncState::device; - return CEED_ERROR_SUCCESS; - } - return ceedError("Invalid CeedMemType passed"); - } +int Vector::setArray(CeedMemType mtype, CeedCopyMode cmode, CeedScalar *array) { + switch (cmode) { + case CEED_COPY_VALUES: + return copyArrayValues(mtype, array); + case CEED_OWN_POINTER: + return ownArrayPointer(mtype, array); + case CEED_USE_POINTER: + return useArrayPointer(mtype, array); + } + return ceedError("Invalid CeedCopyMode passed"); +} - int Vector::copyArrayValues(CeedMemType mtype, CeedScalar *array) { - switch (mtype) { - case CEED_MEM_HOST: - setCurrentHostBufferIfNeeded(); - if (array) { - std::memcpy(currentHostBuffer, array, length * sizeof(CeedScalar)); - } - syncState = SyncState::host; - return CEED_ERROR_SUCCESS; - case CEED_MEM_DEVICE: - setCurrentMemoryIfNeeded(); - if (array) { - currentMemory.copyFrom(arrayToMemory(array)); - } - syncState = SyncState::device; - return CEED_ERROR_SUCCESS; +int Vector::takeArray(CeedMemType mtype, CeedScalar **array) { + switch (mtype) { + case CEED_MEM_HOST: + setCurrentHostBufferIfNeeded(); + if (syncState == 
SyncState::device) { + setCurrentMemoryIfNeeded(); + currentMemory.copyTo(currentHostBuffer); } - return ceedError("Invalid CeedMemType passed"); - } + *array = currentHostBuffer; + hostBuffer = NULL; + currentHostBuffer = NULL; - int Vector::ownArrayPointer(CeedMemType mtype, CeedScalar *array) { - switch (mtype) { - case CEED_MEM_HOST: - freeHostBuffer(); - hostBuffer = currentHostBuffer = array; - syncState = SyncState::host; - return CEED_ERROR_SUCCESS; - case CEED_MEM_DEVICE: - memory.free(); - memory = currentMemory = arrayToMemory(array); - syncState = SyncState::device; - return CEED_ERROR_SUCCESS; + syncState = SyncState::host; + return CEED_ERROR_SUCCESS; + case CEED_MEM_DEVICE: + setCurrentMemoryIfNeeded(); + if (syncState == SyncState::host) { + setCurrentHostBufferIfNeeded(); + currentMemory.copyFrom(currentHostBuffer); } - return ceedError("Invalid CeedMemType passed"); - } + *array = memoryToArray(currentMemory); + memory = ::occa::null; + currentMemory = ::occa::null; - int Vector::useArrayPointer(CeedMemType mtype, CeedScalar *array) { - switch (mtype) { - case CEED_MEM_HOST: - freeHostBuffer(); - currentHostBuffer = array; - syncState = SyncState::host; - return CEED_ERROR_SUCCESS; - case CEED_MEM_DEVICE: - memory.free(); - currentMemory = arrayToMemory(array); - syncState = SyncState::device; - return CEED_ERROR_SUCCESS; - } - return ceedError("Invalid CeedMemType passed"); - } + syncState = SyncState::device; + return CEED_ERROR_SUCCESS; + } + return ceedError("Invalid CeedMemType passed"); +} - int Vector::getArray(CeedMemType mtype, - CeedScalar **array) { - // The passed `array` might be modified before restoring - // so we can't set sync state to SyncState::all - switch (mtype) { - case CEED_MEM_HOST: - setCurrentHostBufferIfNeeded(); - if (syncState == SyncState::device) { - setCurrentMemoryIfNeeded(); - currentMemory.copyTo(currentHostBuffer); - } - syncState = SyncState::host; - *array = currentHostBuffer; - return CEED_ERROR_SUCCESS; - 
case CEED_MEM_DEVICE: - setCurrentMemoryIfNeeded(); - if (syncState == SyncState::host) { - setCurrentHostBufferIfNeeded(); - currentMemory.copyFrom(currentHostBuffer); - } - syncState = SyncState::device; - *array = memoryToArray(currentMemory); - return CEED_ERROR_SUCCESS; +int Vector::copyArrayValues(CeedMemType mtype, CeedScalar *array) { + switch (mtype) { + case CEED_MEM_HOST: + setCurrentHostBufferIfNeeded(); + if (array) { + std::memcpy(currentHostBuffer, array, length * sizeof(CeedScalar)); } - return ceedError("Invalid CeedMemType passed"); - } - - int Vector::getReadOnlyArray(CeedMemType mtype, - CeedScalar **array) { - const bool willBeFullySynced = ( - (syncState == SyncState::device && mtype == CEED_MEM_HOST) || - (syncState == SyncState::host && mtype == CEED_MEM_DEVICE) - ); - - const int error = getArray(mtype, const_cast(array)); - // Take advantage the vector will be fully synced - if (!error && willBeFullySynced) { - syncState = SyncState::all; + syncState = SyncState::host; + return CEED_ERROR_SUCCESS; + case CEED_MEM_DEVICE: + setCurrentMemoryIfNeeded(); + if (array) { + currentMemory.copyFrom(arrayToMemory(array)); } + syncState = SyncState::device; + return CEED_ERROR_SUCCESS; + } + return ceedError("Invalid CeedMemType passed"); +} - return error; - } - - int Vector::getWriteOnlyArray(CeedMemType mtype, - CeedScalar **array) { - // const bool willBeFullySynced = ( - // (syncState == SyncState::device && mtype == CEED_MEM_HOST) || - // (syncState == SyncState::host && mtype == CEED_MEM_DEVICE) - // ); - - const int error = getArray(mtype, const_cast(array)); - // // Take advantage the vector will be fully synced - // if (!error && willBeFullySynced) { - // syncState = SyncState::all; - // } - - return error; - } - - int Vector::restoreArray(CeedScalar **array) { +int Vector::ownArrayPointer(CeedMemType mtype, CeedScalar *array) { + switch (mtype) { + case CEED_MEM_HOST: + freeHostBuffer(); + hostBuffer = currentHostBuffer = array; + 
syncState = SyncState::host; return CEED_ERROR_SUCCESS; - } + case CEED_MEM_DEVICE: + memory.free(); + memory = currentMemory = arrayToMemory(array); + syncState = SyncState::device; + return CEED_ERROR_SUCCESS; + } + return ceedError("Invalid CeedMemType passed"); +} - int Vector::restoreReadOnlyArray(CeedScalar **array) { +int Vector::useArrayPointer(CeedMemType mtype, CeedScalar *array) { + switch (mtype) { + case CEED_MEM_HOST: + freeHostBuffer(); + currentHostBuffer = array; + syncState = SyncState::host; return CEED_ERROR_SUCCESS; - } + case CEED_MEM_DEVICE: + memory.free(); + currentMemory = arrayToMemory(array); + syncState = SyncState::device; + return CEED_ERROR_SUCCESS; + } + return ceedError("Invalid CeedMemType passed"); +} - ::occa::memory Vector::getKernelArg() { +int Vector::getArray(CeedMemType mtype, CeedScalar **array) { + // The passed `array` might be modified before restoring + // so we can't set sync state to SyncState::all + switch (mtype) { + case CEED_MEM_HOST: + setCurrentHostBufferIfNeeded(); + if (syncState == SyncState::device) { + setCurrentMemoryIfNeeded(); + currentMemory.copyTo(currentHostBuffer); + } + syncState = SyncState::host; + *array = currentHostBuffer; + return CEED_ERROR_SUCCESS; + case CEED_MEM_DEVICE: setCurrentMemoryIfNeeded(); if (syncState == SyncState::host) { setCurrentHostBufferIfNeeded(); currentMemory.copyFrom(currentHostBuffer); } syncState = SyncState::device; - return currentMemory; - } + *array = memoryToArray(currentMemory); + return CEED_ERROR_SUCCESS; + } + return ceedError("Invalid CeedMemType passed"); +} - ::occa::memory Vector::getConstKernelArg() { - setCurrentMemoryIfNeeded(); - if (syncState == SyncState::host) { - setCurrentHostBufferIfNeeded(); - currentMemory.copyFrom(currentHostBuffer); - syncState = SyncState::all; - } - return currentMemory; - } +int Vector::getReadOnlyArray(CeedMemType mtype, CeedScalar **array) { + const bool willBeFullySynced = + ((syncState == SyncState::device && mtype 
== CEED_MEM_HOST) || (syncState == SyncState::host && mtype == CEED_MEM_DEVICE)); + + const int error = getArray(mtype, const_cast(array)); + // Take advantage the vector will be fully synced + if (!error && willBeFullySynced) { + syncState = SyncState::all; + } - void Vector::printValues(const std::string &name) { - CeedScalar *values; - getReadOnlyArray(CEED_MEM_HOST, &values); + return error; +} - std::cout << std::setprecision(8) - << "Vector: " << name << std::endl - << " - Values: " << std::endl; +int Vector::getWriteOnlyArray(CeedMemType mtype, CeedScalar **array) { + // const bool willBeFullySynced = ( + // (syncState == SyncState::device && mtype == CEED_MEM_HOST) || + // (syncState == SyncState::host && mtype == CEED_MEM_DEVICE) + // ); - for (int i = 0; i < length; ++i) { - printf(" %12.8f\n", values[i]); - } - } + const int error = getArray(mtype, const_cast(array)); + // // Take advantage the vector will be fully synced + // if (!error && willBeFullySynced) { + // syncState = SyncState::all; + // } - void Vector::printNonZeroValues(const std::string &name) { - CeedScalar *values; - getReadOnlyArray(CEED_MEM_HOST, &values); + return error; +} - std::cout << std::setprecision(8) - << "Vector: " << name << std::endl - << " - Non-zero values: " << std::endl; +int Vector::restoreArray(CeedScalar **array) { return CEED_ERROR_SUCCESS; } - for (int i = 0; i < length; ++i) { - if (fabs(values[i]) > 1e-8) { - printf(" %d: %12.8f\n", i, values[i]); - } - } - } +int Vector::restoreReadOnlyArray(CeedScalar **array) { return CEED_ERROR_SUCCESS; } - void Vector::printSummary(const std::string &name) { - CeedScalar *values; - getReadOnlyArray(CEED_MEM_HOST, &values); +::occa::memory Vector::getKernelArg() { + setCurrentMemoryIfNeeded(); + if (syncState == SyncState::host) { + setCurrentHostBufferIfNeeded(); + currentMemory.copyFrom(currentHostBuffer); + } + syncState = SyncState::device; + return currentMemory; +} - CeedScalar minValue = values[0]; - CeedScalar 
maxValue = values[0]; +::occa::memory Vector::getConstKernelArg() { + setCurrentMemoryIfNeeded(); + if (syncState == SyncState::host) { + setCurrentHostBufferIfNeeded(); + currentMemory.copyFrom(currentHostBuffer); + syncState = SyncState::all; + } + return currentMemory; +} - for (int i = 0; i < length; ++i) { - const CeedScalar value = values[i]; - minValue = minValue < value ? minValue : value; - maxValue = maxValue > value ? maxValue : value; - } +void Vector::printValues(const std::string &name) { + CeedScalar *values; + getReadOnlyArray(CEED_MEM_HOST, &values); - std::cout << std::setprecision(8) - << "Vector: " << name << std::endl - << " - Length: " << length << std::endl - << " - Min : " << minValue << std::endl - << " - Max : " << maxValue << std::endl; - } + std::cout << std::setprecision(8) << "Vector: " << name << std::endl << " - Values: " << std::endl; + + for (int i = 0; i < length; ++i) { + printf(" %12.8f\n", values[i]); + } +} + +void Vector::printNonZeroValues(const std::string &name) { + CeedScalar *values; + getReadOnlyArray(CEED_MEM_HOST, &values); + + std::cout << std::setprecision(8) << "Vector: " << name << std::endl << " - Non-zero values: " << std::endl; - //---[ Ceed Callbacks ]----------- - int Vector::registerCeedFunction(Ceed ceed, CeedVector vec, - const char *fname, ceed::occa::ceedFunction f) { - return CeedSetBackendFunction(ceed, "Vector", vec, fname, f); + for (int i = 0; i < length; ++i) { + if (fabs(values[i]) > 1e-8) { + printf(" %d: %12.8f\n", i, values[i]); } + } +} - int Vector::ceedCreate(CeedSize length, CeedVector vec) { - int ierr; +void Vector::printSummary(const std::string &name) { + CeedScalar *values; + getReadOnlyArray(CEED_MEM_HOST, &values); - Ceed ceed; - ierr = CeedVectorGetCeed(vec, &ceed); CeedChk(ierr); + CeedScalar minValue = values[0]; + CeedScalar maxValue = values[0]; - CeedOccaRegisterFunction(vec, "HasValidArray", Vector::ceedHasValidArray); - CeedOccaRegisterFunction(vec, 
"HasBorrowedArrayOfType",Vector::ceedHasBorrowedArrayOfType); - CeedOccaRegisterFunction(vec, "SetValue", Vector::ceedSetValue); - CeedOccaRegisterFunction(vec, "SetArray", Vector::ceedSetArray); - CeedOccaRegisterFunction(vec, "TakeArray", Vector::ceedTakeArray); - CeedOccaRegisterFunction(vec, "GetArray", Vector::ceedGetArray); - CeedOccaRegisterFunction(vec, "GetArrayRead", Vector::ceedGetArrayRead); - CeedOccaRegisterFunction(vec, "GetArrayWrite", Vector::ceedGetArrayWrite); - CeedOccaRegisterFunction(vec, "RestoreArray", Vector::ceedRestoreArray); - CeedOccaRegisterFunction(vec, "RestoreArrayRead", Vector::ceedRestoreArrayRead); - CeedOccaRegisterFunction(vec, "Destroy", Vector::ceedDestroy); + for (int i = 0; i < length; ++i) { + const CeedScalar value = values[i]; + minValue = minValue < value ? minValue : value; + maxValue = maxValue > value ? maxValue : value; + } - Vector *vector = new Vector(); - ierr = CeedVectorSetData(vec, vector); CeedChk(ierr); + std::cout << std::setprecision(8) << "Vector: " << name << std::endl + << " - Length: " << length << std::endl + << " - Min : " << minValue << std::endl + << " - Max : " << maxValue << std::endl; +} - return CEED_ERROR_SUCCESS; - } +//---[ Ceed Callbacks ]----------- +int Vector::registerCeedFunction(Ceed ceed, CeedVector vec, const char *fname, ceed::occa::ceedFunction f) { + return CeedSetBackendFunction(ceed, "Vector", vec, fname, f); +} - int Vector::ceedHasValidArray(CeedVector vec, bool* has_valid_array) { - Vector *vector = Vector::from(vec); - if (!vector) { - return staticCeedError("Invalid CeedVector passed"); - } - return vector->hasValidArray(has_valid_array); - } +int Vector::ceedCreate(CeedSize length, CeedVector vec) { + Ceed ceed; + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + + CeedOccaRegisterFunction(vec, "HasValidArray", Vector::ceedHasValidArray); + CeedOccaRegisterFunction(vec, "HasBorrowedArrayOfType", Vector::ceedHasBorrowedArrayOfType); + CeedOccaRegisterFunction(vec, 
"SetValue", Vector::ceedSetValue); + CeedOccaRegisterFunction(vec, "SetArray", Vector::ceedSetArray); + CeedOccaRegisterFunction(vec, "TakeArray", Vector::ceedTakeArray); + CeedOccaRegisterFunction(vec, "GetArray", Vector::ceedGetArray); + CeedOccaRegisterFunction(vec, "GetArrayRead", Vector::ceedGetArrayRead); + CeedOccaRegisterFunction(vec, "GetArrayWrite", Vector::ceedGetArrayWrite); + CeedOccaRegisterFunction(vec, "RestoreArray", Vector::ceedRestoreArray); + CeedOccaRegisterFunction(vec, "RestoreArrayRead", Vector::ceedRestoreArrayRead); + CeedOccaRegisterFunction(vec, "Destroy", Vector::ceedDestroy); + + Vector *vector = new Vector(); + CeedCallBackend(CeedVectorSetData(vec, vector)); + + return CEED_ERROR_SUCCESS; +} - int Vector::ceedHasBorrowedArrayOfType(CeedVector vec, - CeedMemType mem_type, - bool *has_borrowed_array_of_type) { - Vector *vector = Vector::from(vec); - if (!vector) { - return staticCeedError("Invalid CeedVector passed"); - } - return vector->hasBorrowedArrayOfType(mem_type,has_borrowed_array_of_type); - } +int Vector::ceedHasValidArray(CeedVector vec, bool *has_valid_array) { + Vector *vector = Vector::from(vec); + if (!vector) { + return staticCeedError("Invalid CeedVector passed"); + } + return vector->hasValidArray(has_valid_array); +} - int Vector::ceedSetValue(CeedVector vec, CeedScalar value) { - Vector *vector = Vector::from(vec); - if (!vector) { - return staticCeedError("Invalid CeedVector passed"); - } - return vector->setValue(value); - } +int Vector::ceedHasBorrowedArrayOfType(CeedVector vec, CeedMemType mem_type, bool *has_borrowed_array_of_type) { + Vector *vector = Vector::from(vec); + if (!vector) { + return staticCeedError("Invalid CeedVector passed"); + } + return vector->hasBorrowedArrayOfType(mem_type, has_borrowed_array_of_type); +} - int Vector::ceedSetArray(CeedVector vec, CeedMemType mtype, - CeedCopyMode cmode, CeedScalar *array) { - Vector *vector = Vector::from(vec); - if (!vector) { - return 
staticCeedError("Invalid CeedVector passed"); - } - return vector->setArray(mtype, cmode, array); - } +int Vector::ceedSetValue(CeedVector vec, CeedScalar value) { + Vector *vector = Vector::from(vec); + if (!vector) { + return staticCeedError("Invalid CeedVector passed"); + } + return vector->setValue(value); +} - int Vector::ceedTakeArray(CeedVector vec, CeedMemType mtype, CeedScalar **array) { - Vector *vector = Vector::from(vec); - if (!vector) { - return staticCeedError("Invalid CeedVector passed"); - } - return vector->takeArray(mtype, array); - } +int Vector::ceedSetArray(CeedVector vec, CeedMemType mtype, CeedCopyMode cmode, CeedScalar *array) { + Vector *vector = Vector::from(vec); + if (!vector) { + return staticCeedError("Invalid CeedVector passed"); + } + return vector->setArray(mtype, cmode, array); +} - int Vector::ceedGetArray(CeedVector vec, CeedMemType mtype, - CeedScalar **array) { - Vector *vector = Vector::from(vec); - if (!vector) { - return staticCeedError("Invalid CeedVector passed"); - } - return vector->getArray(mtype, array); - } +int Vector::ceedTakeArray(CeedVector vec, CeedMemType mtype, CeedScalar **array) { + Vector *vector = Vector::from(vec); + if (!vector) { + return staticCeedError("Invalid CeedVector passed"); + } + return vector->takeArray(mtype, array); +} - int Vector::ceedGetArrayRead(CeedVector vec, CeedMemType mtype, - CeedScalar **array) { - Vector *vector = Vector::from(vec); - if (!vector) { - return staticCeedError("Invalid CeedVector passed"); - } - return vector->getReadOnlyArray(mtype, array); - } +int Vector::ceedGetArray(CeedVector vec, CeedMemType mtype, CeedScalar **array) { + Vector *vector = Vector::from(vec); + if (!vector) { + return staticCeedError("Invalid CeedVector passed"); + } + return vector->getArray(mtype, array); +} - int Vector::ceedGetArrayWrite(CeedVector vec, CeedMemType mtype, - CeedScalar **array) { - Vector *vector = Vector::from(vec); - if (!vector) { - return staticCeedError("Invalid 
CeedVector passed"); - } - return vector->getWriteOnlyArray(mtype, array); - } +int Vector::ceedGetArrayRead(CeedVector vec, CeedMemType mtype, CeedScalar **array) { + Vector *vector = Vector::from(vec); + if (!vector) { + return staticCeedError("Invalid CeedVector passed"); + } + return vector->getReadOnlyArray(mtype, array); +} - int Vector::ceedRestoreArray(CeedVector vec, CeedScalar **array) { - Vector *vector = Vector::from(vec); - if (!vector) { - return staticCeedError("Invalid CeedVector passed"); - } - return vector->restoreArray(array); - } +int Vector::ceedGetArrayWrite(CeedVector vec, CeedMemType mtype, CeedScalar **array) { + Vector *vector = Vector::from(vec); + if (!vector) { + return staticCeedError("Invalid CeedVector passed"); + } + return vector->getWriteOnlyArray(mtype, array); +} - int Vector::ceedRestoreArrayRead(CeedVector vec, CeedScalar **array) { - Vector *vector = Vector::from(vec); - if (!vector) { - return staticCeedError("Invalid CeedVector passed"); - } - return vector->restoreReadOnlyArray(array); - } +int Vector::ceedRestoreArray(CeedVector vec, CeedScalar **array) { + Vector *vector = Vector::from(vec); + if (!vector) { + return staticCeedError("Invalid CeedVector passed"); + } + return vector->restoreArray(array); +} - int Vector::ceedDestroy(CeedVector vec) { - delete getVector(vec, false); - return CEED_ERROR_SUCCESS; - } +int Vector::ceedRestoreArrayRead(CeedVector vec, CeedScalar **array) { + Vector *vector = Vector::from(vec); + if (!vector) { + return staticCeedError("Invalid CeedVector passed"); } + return vector->restoreReadOnlyArray(array); +} + +int Vector::ceedDestroy(CeedVector vec) { + delete getVector(vec, false); + return CEED_ERROR_SUCCESS; } +} // namespace occa +} // namespace ceed diff --git a/backends/occa/ceed-occa-vector.hpp b/backends/occa/ceed-occa-vector.hpp index eb90c7d34f..2269c5a9d4 100644 --- a/backends/occa/ceed-occa-vector.hpp +++ b/backends/occa/ceed-occa-vector.hpp @@ -11,135 +11,123 @@ #include 
"ceed-occa-ceed-object.hpp" namespace ceed { - namespace occa { - template - ::occa::memory arrayToMemory(const TM *array) { - if (array) { - ::occa::memory mem((::occa::modeMemory_t*) array); - mem.setDtype(::occa::dtype::get()); - return mem; - } - return ::occa::null; - } +namespace occa { +template +::occa::memory arrayToMemory(const TM *array) { + if (array) { + ::occa::memory mem((::occa::modeMemory_t *)array); + mem.setDtype(::occa::dtype::get()); + return mem; + } + return ::occa::null; +} + +template +TM *memoryToArray(::occa::memory &memory) { + return (TM *)memory.getModeMemory(); +} - template - TM* memoryToArray(::occa::memory &memory) { - return (TM*) memory.getModeMemory(); - } +class Vector : public CeedObject { + public: + // Owned resources + CeedSize length; + ::occa::memory memory; + CeedSize hostBufferLength; + CeedScalar *hostBuffer; - class Vector : public CeedObject { - public: - // Owned resources - CeedSize length; - ::occa::memory memory; - CeedSize hostBufferLength; - CeedScalar *hostBuffer; + ::occa::kernel setValueKernel; - ::occa::kernel setValueKernel; + // Current resources + ::occa::memory currentMemory; + CeedScalar *currentHostBuffer; - // Current resources - ::occa::memory currentMemory; - CeedScalar *currentHostBuffer; + // State information + int syncState; - // State information - int syncState; + Vector(); - Vector(); + ~Vector(); - ~Vector(); + int hasValidArray(bool *has_valid_array); - int hasValidArray(bool* has_valid_array); + int hasBorrowedArrayOfType(CeedMemType mem_type, bool *has_borrowed_array_of_type); - int hasBorrowedArrayOfType(CeedMemType mem_type,bool *has_borrowed_array_of_type); + static Vector *getVector(CeedVector vec, const bool assertValid = true); - static Vector* getVector(CeedVector vec, - const bool assertValid = true); + static Vector *from(CeedVector vec); - static Vector* from(CeedVector vec); + void resize(const CeedSize length_); - void resize(const CeedSize length_); + void resizeMemory(const 
CeedSize length_); - void resizeMemory(const CeedSize length_); + void resizeMemory(::occa::device device, const CeedSize length_); - void resizeMemory(::occa::device device, const CeedSize length_); + void resizeHostBuffer(const CeedSize length_); - void resizeHostBuffer(const CeedSize length_); + void setCurrentMemoryIfNeeded(); - void setCurrentMemoryIfNeeded(); + void setCurrentHostBufferIfNeeded(); - void setCurrentHostBufferIfNeeded(); + void freeHostBuffer(); - void freeHostBuffer(); + int setValue(CeedScalar value); - int setValue(CeedScalar value); + int setArray(CeedMemType mtype, CeedCopyMode cmode, CeedScalar *array); - int setArray(CeedMemType mtype, - CeedCopyMode cmode, CeedScalar *array); + int takeArray(CeedMemType mtype, CeedScalar **array); - int takeArray(CeedMemType mtype, CeedScalar **array); + int copyArrayValues(CeedMemType mtype, CeedScalar *array); - int copyArrayValues(CeedMemType mtype, CeedScalar *array); + int ownArrayPointer(CeedMemType mtype, CeedScalar *array); - int ownArrayPointer(CeedMemType mtype, CeedScalar *array); + int useArrayPointer(CeedMemType mtype, CeedScalar *array); - int useArrayPointer(CeedMemType mtype, CeedScalar *array); + int getArray(CeedMemType mtype, CeedScalar **array); - int getArray(CeedMemType mtype, - CeedScalar **array); + int getReadOnlyArray(CeedMemType mtype, CeedScalar **array); - int getReadOnlyArray(CeedMemType mtype, - CeedScalar **array); - - int getWriteOnlyArray(CeedMemType mtype, - CeedScalar **array); + int getWriteOnlyArray(CeedMemType mtype, CeedScalar **array); - int restoreArray(CeedScalar **array); + int restoreArray(CeedScalar **array); - int restoreReadOnlyArray(CeedScalar **array); + int restoreReadOnlyArray(CeedScalar **array); - ::occa::memory getKernelArg(); + ::occa::memory getKernelArg(); - ::occa::memory getConstKernelArg(); + ::occa::memory getConstKernelArg(); - void printValues(const std::string &name); - void printNonZeroValues(const std::string &name); - void 
printSummary(const std::string &name); + void printValues(const std::string &name); + void printNonZeroValues(const std::string &name); + void printSummary(const std::string &name); - //---[ Ceed Callbacks ]----------- - static int registerCeedFunction(Ceed ceed, CeedVector vec, - const char *fname, ceed::occa::ceedFunction f); + //---[ Ceed Callbacks ]----------- + static int registerCeedFunction(Ceed ceed, CeedVector vec, const char *fname, ceed::occa::ceedFunction f); - static int ceedHasValidArray(CeedVector vec, bool* has_valid_array); + static int ceedHasValidArray(CeedVector vec, bool *has_valid_array); - static int ceedHasBorrowedArrayOfType(CeedVector vec, - CeedMemType mem_type, - bool *has_borrowed_array_of_type); + static int ceedHasBorrowedArrayOfType(CeedVector vec, CeedMemType mem_type, bool *has_borrowed_array_of_type); - static int ceedCreate(CeedSize length, CeedVector vec); + static int ceedCreate(CeedSize length, CeedVector vec); - static int ceedSetValue(CeedVector vec, CeedScalar value); + static int ceedSetValue(CeedVector vec, CeedScalar value); - static int ceedSetArray(CeedVector vec, CeedMemType mtype, - CeedCopyMode cmode, CeedScalar *array); + static int ceedSetArray(CeedVector vec, CeedMemType mtype, CeedCopyMode cmode, CeedScalar *array); - static int ceedTakeArray(CeedVector vec, CeedMemType mtype, CeedScalar **array); + static int ceedTakeArray(CeedVector vec, CeedMemType mtype, CeedScalar **array); - static int ceedGetArray(CeedVector vec, CeedMemType mtype, - CeedScalar **array); + static int ceedGetArray(CeedVector vec, CeedMemType mtype, CeedScalar **array); - static int ceedGetArrayRead(CeedVector vec, CeedMemType mtype, - CeedScalar **array); + static int ceedGetArrayRead(CeedVector vec, CeedMemType mtype, CeedScalar **array); - static int ceedGetArrayWrite(CeedVector vec, CeedMemType mtype, - CeedScalar **array); + static int ceedGetArrayWrite(CeedVector vec, CeedMemType mtype, CeedScalar **array); - static int 
ceedRestoreArray(CeedVector vec, CeedScalar **array); + static int ceedRestoreArray(CeedVector vec, CeedScalar **array); - static int ceedRestoreArrayRead(CeedVector vec, CeedScalar **array); + static int ceedRestoreArrayRead(CeedVector vec, CeedScalar **array); - static int ceedDestroy(CeedVector vec); - }; - } -} + static int ceedDestroy(CeedVector vec); +}; +} // namespace occa +} // namespace ceed #endif diff --git a/backends/occa/ceed-occa.cpp b/backends/occa/ceed-occa.cpp index c9161c679b..b46cab9d6b 100644 --- a/backends/occa/ceed-occa.cpp +++ b/backends/occa/ceed-occa.cpp @@ -5,11 +5,11 @@ // // This file is part of CEED: http://github.com/ceed - #warning "libCEED OCCA backend is experimental; for best performance, use device native backends" +#warning "libCEED OCCA backend is experimental; for best performance, use device native backends" #include -#include #include +#include #include "ceed-occa-context.hpp" #include "ceed-occa-elem-restriction.hpp" @@ -22,331 +22,309 @@ #include "ceed-occa-vector.hpp" namespace ceed { - namespace occa { - typedef std::map StringMap; - typedef std::vector StringVector; - - enum ResourceParserStep { - RESOURCE, - QUERY_KEY, - QUERY_VALUE - }; - - static const char RESOURCE_DELIMITER = '/'; - static const char QUERY_DELIMITER = ':'; - static const char QUERY_KEY_VALUE_DELIMITER = '='; - static const char QUERY_ARG_DELIMITER = ','; - - static std::string getDefaultDeviceMode(const bool cpuMode, - const bool gpuMode) { - // In case both cpuMode and gpuMode are set, prioritize the GPU if available - // For example, if the resource is "/*/occa" - if (gpuMode) { - if (::occa::modeIsEnabled("CUDA")) { - return "CUDA"; - } - if (::occa::modeIsEnabled("HIP")) { - return "HIP"; - } - if (::occa::modeIsEnabled("dpcpp")) { - return "dpcpp"; - } - if (::occa::modeIsEnabled("OpenCL")) { - return "OpenCL"; - } - // Metal doesn't support doubles - } - - if (cpuMode) { - if (::occa::modeIsEnabled("OpenMP")) { - return "OpenMP"; - } - return 
"Serial"; - } +namespace occa { +typedef std::map StringMap; +typedef std::vector StringVector; + +enum ResourceParserStep { RESOURCE, QUERY_KEY, QUERY_VALUE }; + +static const char RESOURCE_DELIMITER = '/'; +static const char QUERY_DELIMITER = ':'; +static const char QUERY_KEY_VALUE_DELIMITER = '='; +static const char QUERY_ARG_DELIMITER = ','; + +static std::string getDefaultDeviceMode(const bool cpuMode, const bool gpuMode) { + // In case both cpuMode and gpuMode are set, prioritize the GPU if available + // For example, if the resource is "/*/occa" + if (gpuMode) { + if (::occa::modeIsEnabled("CUDA")) { + return "CUDA"; + } + if (::occa::modeIsEnabled("HIP")) { + return "HIP"; + } + if (::occa::modeIsEnabled("dpcpp")) { + return "dpcpp"; + } + if (::occa::modeIsEnabled("OpenCL")) { + return "OpenCL"; + } + // Metal doesn't support doubles + } - return ""; + if (cpuMode) { + if (::occa::modeIsEnabled("OpenMP")) { + return "OpenMP"; } + return "Serial"; + } - static int getDeviceMode(const std::string &match, - std::string &mode) { - if (match == "cuda") { - mode = "CUDA"; - return CEED_ERROR_SUCCESS; - } - if (match == "hip") { - mode = "HIP"; - return CEED_ERROR_SUCCESS; - } - if (match == "dpcpp") { - mode = "dpcpp"; - return CEED_ERROR_SUCCESS; - } - if (match == "opencl") { - mode = "OpenCL"; - return CEED_ERROR_SUCCESS; - } - if (match == "openmp") { - mode = "OpenMP"; - return CEED_ERROR_SUCCESS; - } - if (match == "serial") { - mode = "Serial"; - return CEED_ERROR_SUCCESS; - } + return ""; +} - const bool autoMode = match == "*"; - const bool cpuMode = match == "cpu"; - const bool gpuMode = match == "gpu"; +static int getDeviceMode(const std::string &match, std::string &mode) { + if (match == "cuda") { + mode = "CUDA"; + return CEED_ERROR_SUCCESS; + } + if (match == "hip") { + mode = "HIP"; + return CEED_ERROR_SUCCESS; + } + if (match == "dpcpp") { + mode = "dpcpp"; + return CEED_ERROR_SUCCESS; + } + if (match == "opencl") { + mode = "OpenCL"; + return 
CEED_ERROR_SUCCESS; + } + if (match == "openmp") { + mode = "OpenMP"; + return CEED_ERROR_SUCCESS; + } + if (match == "serial") { + mode = "Serial"; + return CEED_ERROR_SUCCESS; + } - mode = getDefaultDeviceMode(cpuMode || autoMode, - gpuMode || autoMode); - return !mode.size(); - } + const bool autoMode = match == "*"; + const bool cpuMode = match == "cpu"; + const bool gpuMode = match == "gpu"; - static int splitCeedResource(const std::string &resource, - std::string &match, - StringMap &query) { - /* - * resource: - * - * "/gpu/occa?mode='CUDA':device_id=0" - * - * resourceVector: - * - * ["gpu", "occa"] - * - * match: - * - * "gpu" - * - * query: - * - * { - * "mode": "'CUDA'", - * "device_id": "0", - * } - */ - const int charCount = (int) resource.size(); - const char *c_resource = resource.c_str(); - - StringVector resourceVector; - - ResourceParserStep parsingStep = RESOURCE; - int wordStart = 1; - std::string queryKey; - - // Check for /gpu/cuda/occa, /gpu/hip/occa, /cpu/self/occa, /cpu/openmp/occa - // Note: added for matching style with other backends - if (resource == "/gpu/cuda/occa"){ - match = "cuda"; - return CEED_ERROR_SUCCESS; - } - if (resource == "/gpu/hip/occa"){ - match = "hip"; - return CEED_ERROR_SUCCESS; - } - if (resource == "/gpu/dpcpp/occa"){ - match = "dpcpp"; - return CEED_ERROR_SUCCESS; - } - if (resource == "/gpu/opencl/occa"){ - match = "opencl"; - return CEED_ERROR_SUCCESS; - } - if (resource == "/cpu/openmp/occa"){ - match = "openmp"; - return CEED_ERROR_SUCCESS; - } - if (resource == "/cpu/self/occa"){ - match = "serial"; - return CEED_ERROR_SUCCESS; - } + mode = getDefaultDeviceMode(cpuMode || autoMode, gpuMode || autoMode); + return !mode.size(); +} - // Skip initial slash - for (int i = 1; i <= charCount; ++i) { - const char c = c_resource[i]; - - if (parsingStep == RESOURCE) { - if (c == RESOURCE_DELIMITER || c == QUERY_DELIMITER || c == '\0') { - resourceVector.push_back( - resource.substr(wordStart, i - wordStart) - ); - 
wordStart = i + 1; - - // Check if we are done parsing the resource - if (c == QUERY_DELIMITER) { - parsingStep = QUERY_KEY; - } - } - } - else if (parsingStep == QUERY_KEY) { - if (c == QUERY_KEY_VALUE_DELIMITER) { - queryKey = resource.substr(wordStart, i - wordStart); - wordStart = i + 1; - - // Looking to parse the query value now - parsingStep = QUERY_VALUE; - } - } else if (parsingStep == QUERY_VALUE) { - if (c == QUERY_ARG_DELIMITER || c == '\0') { - query[queryKey] = resource.substr(wordStart, i - wordStart); - wordStart = i + 1; - - // Back to parsing the next query argument - parsingStep = QUERY_KEY; - queryKey = ""; - } +static int splitCeedResource(const std::string &resource, std::string &match, StringMap &query) { + /* + * resource: + * + * "/gpu/occa?mode='CUDA':device_id=0" + * + * resourceVector: + * + * ["gpu", "occa"] + * + * match: + * + * "gpu" + * + * query: + * + * { + * "mode": "'CUDA'", + * "device_id": "0", + * } + */ + const int charCount = (int)resource.size(); + const char *c_resource = resource.c_str(); + + StringVector resourceVector; + + ResourceParserStep parsingStep = RESOURCE; + int wordStart = 1; + std::string queryKey; + + // Check for /gpu/cuda/occa, /gpu/hip/occa, /cpu/self/occa, /cpu/openmp/occa + // Note: added for matching style with other backends + if (resource == "/gpu/cuda/occa") { + match = "cuda"; + return CEED_ERROR_SUCCESS; + } + if (resource == "/gpu/hip/occa") { + match = "hip"; + return CEED_ERROR_SUCCESS; + } + if (resource == "/gpu/dpcpp/occa") { + match = "dpcpp"; + return CEED_ERROR_SUCCESS; + } + if (resource == "/gpu/opencl/occa") { + match = "opencl"; + return CEED_ERROR_SUCCESS; + } + if (resource == "/cpu/openmp/occa") { + match = "openmp"; + return CEED_ERROR_SUCCESS; + } + if (resource == "/cpu/self/occa") { + match = "serial"; + return CEED_ERROR_SUCCESS; + } + + // Skip initial slash + for (int i = 1; i <= charCount; ++i) { + const char c = c_resource[i]; + + if (parsingStep == RESOURCE) { + if (c == 
RESOURCE_DELIMITER || c == QUERY_DELIMITER || c == '\0') { + resourceVector.push_back(resource.substr(wordStart, i - wordStart)); + wordStart = i + 1; + + // Check if we are done parsing the resource + if (c == QUERY_DELIMITER) { + parsingStep = QUERY_KEY; } } + } else if (parsingStep == QUERY_KEY) { + if (c == QUERY_KEY_VALUE_DELIMITER) { + queryKey = resource.substr(wordStart, i - wordStart); + wordStart = i + 1; - // Looking for [match, "occa"] - if (resourceVector.size() != 2 || resourceVector[1] != "occa") { - return 1; + // Looking to parse the query value now + parsingStep = QUERY_VALUE; + } + } else if (parsingStep == QUERY_VALUE) { + if (c == QUERY_ARG_DELIMITER || c == '\0') { + query[queryKey] = resource.substr(wordStart, i - wordStart); + wordStart = i + 1; + + // Back to parsing the next query argument + parsingStep = QUERY_KEY; + queryKey = ""; } - - match = resourceVector[0]; - return CEED_ERROR_SUCCESS; } + } - void setDefaultProps(::occa::properties &deviceProps, - const std::string &defaultMode) { - std::string mode; - if (deviceProps.has("mode")) { - // Don't override mode if passed - mode = (std::string) deviceProps["mode"]; - } else { - mode = defaultMode; - deviceProps.set("mode",mode); - } + // Looking for [match, "occa"] + if (resourceVector.size() != 2 || resourceVector[1] != "occa") { + return 1; + } - // Set default device id - if ((mode == "CUDA") - || (mode == "HIP") - || (mode == "dpcpp") - || (mode == "OpenCL")) { - if (!deviceProps.has("device_id")) { - deviceProps["device_id"] = 0; - } - } + match = resourceVector[0]; + return CEED_ERROR_SUCCESS; +} - // Set default platform id - if ((mode=="dpcpp") || (mode == "OpenCL")){ - if (!deviceProps.has("platform_id")) { - deviceProps["platform_id"] = 0; - } - } +void setDefaultProps(::occa::properties &deviceProps, const std::string &defaultMode) { + std::string mode; + if (deviceProps.has("mode")) { + // Don't override mode if passed + mode = (std::string)deviceProps["mode"]; + } else { + 
mode = defaultMode; + deviceProps.set("mode", mode); + } + + // Set default device id + if ((mode == "CUDA") || (mode == "HIP") || (mode == "dpcpp") || (mode == "OpenCL")) { + if (!deviceProps.has("device_id")) { + deviceProps["device_id"] = 0; } + } - static int initCeed(const char *c_resource, Ceed ceed) { - int ierr; - std::string match; - StringMap query; + // Set default platform id + if ((mode == "dpcpp") || (mode == "OpenCL")) { + if (!deviceProps.has("platform_id")) { + deviceProps["platform_id"] = 0; + } + } +} - ierr = splitCeedResource(c_resource, match, query); - if (ierr) { - return CeedError(ceed, CEED_ERROR_BACKEND, "(OCCA) Backend cannot use resource: %s", c_resource); - } +static int initCeed(const char *c_resource, Ceed ceed) { + int ierr; + std::string match; + StringMap query; - std::string mode; - ierr = getDeviceMode(match, mode); - if (ierr) { - return CeedError(ceed, CEED_ERROR_BACKEND, "(OCCA) Backend cannot use resource: %s", c_resource); - } + ierr = splitCeedResource(c_resource, match, query); + if (ierr) { + return CeedError(ceed, CEED_ERROR_BACKEND, "(OCCA) Backend cannot use resource: %s", c_resource); + } - std::string devicePropsStr = "{\n"; - StringMap::const_iterator it; - for (it = query.begin(); it != query.end(); ++it) { - devicePropsStr += " \""; - devicePropsStr += it->first; - devicePropsStr += "\": "; - devicePropsStr += it->second; - devicePropsStr += ",\n"; - } - devicePropsStr += '}'; + std::string mode; + ierr = getDeviceMode(match, mode); + if (ierr) { + return CeedError(ceed, CEED_ERROR_BACKEND, "(OCCA) Backend cannot use resource: %s", c_resource); + } - ::occa::properties deviceProps(devicePropsStr); - setDefaultProps(deviceProps, mode); + std::string devicePropsStr = "{\n"; + StringMap::const_iterator it; + for (it = query.begin(); it != query.end(); ++it) { + devicePropsStr += " \""; + devicePropsStr += it->first; + devicePropsStr += "\": "; + devicePropsStr += it->second; + devicePropsStr += ",\n"; + } + 
devicePropsStr += '}'; - ceed::occa::Context *context = new Context(::occa::device(deviceProps)); - ierr = CeedSetData(ceed, context); CeedChkBackend(ierr); + ::occa::properties deviceProps(devicePropsStr); + setDefaultProps(deviceProps, mode); - return CEED_ERROR_SUCCESS; - } + ceed::occa::Context *context = new Context(::occa::device(deviceProps)); + CeedCallBackend(CeedSetData(ceed, context)); - static int destroyCeed(Ceed ceed) { - delete Context::from(ceed); - return CEED_ERROR_SUCCESS; - } + return CEED_ERROR_SUCCESS; +} - static int registerCeedFunction(Ceed ceed, const char *fname, ceed::occa::ceedFunction f) { - return CeedSetBackendFunction(ceed, "Ceed", ceed, fname, f); - } +static int destroyCeed(Ceed ceed) { + delete Context::from(ceed); + return CEED_ERROR_SUCCESS; +} - static int preferHostMemType(CeedMemType *type) { - *type = CEED_MEM_HOST; - return CEED_ERROR_SUCCESS; - } +static int registerCeedFunction(Ceed ceed, const char *fname, ceed::occa::ceedFunction f) { + return CeedSetBackendFunction(ceed, "Ceed", ceed, fname, f); +} - static int preferDeviceMemType(CeedMemType *type) { - *type = CEED_MEM_DEVICE; - return CEED_ERROR_SUCCESS; - } +static int preferHostMemType(CeedMemType *type) { + *type = CEED_MEM_HOST; + return CEED_ERROR_SUCCESS; +} - static ceed::occa::ceedFunction getPreferredMemType(Ceed ceed) { - if (Context::from(ceed)->device.hasSeparateMemorySpace()) { - return (ceed::occa::ceedFunction) (void*) preferDeviceMemType; - } - return (ceed::occa::ceedFunction) (void*) preferHostMemType; - } +static int preferDeviceMemType(CeedMemType *type) { + *type = CEED_MEM_DEVICE; + return CEED_ERROR_SUCCESS; +} - static int registerMethods(Ceed ceed) { - int ierr; - - CeedOccaRegisterBaseFunction("Destroy", ceed::occa::destroyCeed); - CeedOccaRegisterBaseFunction("GetPreferredMemType", getPreferredMemType(ceed)); - CeedOccaRegisterBaseFunction("VectorCreate", ceed::occa::Vector::ceedCreate); - 
CeedOccaRegisterBaseFunction("BasisCreateTensorH1", ceed::occa::TensorBasis::ceedCreate); - CeedOccaRegisterBaseFunction("BasisCreateH1", ceed::occa::SimplexBasis::ceedCreate); - CeedOccaRegisterBaseFunction("ElemRestrictionCreate", ceed::occa::ElemRestriction::ceedCreate); - CeedOccaRegisterBaseFunction("ElemRestrictionCreateBlocked", ceed::occa::ElemRestriction::ceedCreateBlocked); - CeedOccaRegisterBaseFunction("QFunctionCreate", ceed::occa::QFunction::ceedCreate); - CeedOccaRegisterBaseFunction("QFunctionContextCreate", ceed::occa::QFunctionContext::ceedCreate); - CeedOccaRegisterBaseFunction("OperatorCreate", ceed::occa::Operator::ceedCreate); - CeedOccaRegisterBaseFunction("CompositeOperatorCreate", ceed::occa::Operator::ceedCreateComposite); - - return CEED_ERROR_SUCCESS; - } +static ceed::occa::ceedFunction getPreferredMemType(Ceed ceed) { + if (Context::from(ceed)->device.hasSeparateMemorySpace()) { + return (ceed::occa::ceedFunction)(void *)preferDeviceMemType; + } + return (ceed::occa::ceedFunction)(void *)preferHostMemType; +} - static int registerBackend(const char *resource, Ceed ceed) { - int ierr; +static int registerMethods(Ceed ceed) { + CeedOccaRegisterBaseFunction("Destroy", ceed::occa::destroyCeed); + CeedOccaRegisterBaseFunction("GetPreferredMemType", getPreferredMemType(ceed)); + CeedOccaRegisterBaseFunction("VectorCreate", ceed::occa::Vector::ceedCreate); + CeedOccaRegisterBaseFunction("BasisCreateTensorH1", ceed::occa::TensorBasis::ceedCreate); + CeedOccaRegisterBaseFunction("BasisCreateH1", ceed::occa::SimplexBasis::ceedCreate); + CeedOccaRegisterBaseFunction("ElemRestrictionCreate", ceed::occa::ElemRestriction::ceedCreate); + CeedOccaRegisterBaseFunction("ElemRestrictionCreateBlocked", ceed::occa::ElemRestriction::ceedCreateBlocked); + CeedOccaRegisterBaseFunction("QFunctionCreate", ceed::occa::QFunction::ceedCreate); + CeedOccaRegisterBaseFunction("QFunctionContextCreate", ceed::occa::QFunctionContext::ceedCreate); + 
CeedOccaRegisterBaseFunction("OperatorCreate", ceed::occa::Operator::ceedCreate); + CeedOccaRegisterBaseFunction("CompositeOperatorCreate", ceed::occa::Operator::ceedCreateComposite); - try { - ierr = ceed::occa::initCeed(resource, ceed); CeedChkBackend(ierr); - } catch (const ::occa::exception &e) { - CeedHandleOccaException(e); - } - try { - ierr = ceed::occa::registerMethods(ceed); CeedChkBackend(ierr); - } - catch (const ::occa::exception &e) { - CeedHandleOccaException(e); - } - return CEED_ERROR_SUCCESS; - } + return CEED_ERROR_SUCCESS; +} + +static int registerBackend(const char *resource, Ceed ceed) { + try { + CeedCallBackend(ceed::occa::initCeed(resource, ceed)); + } catch (const ::occa::exception &e) { + CeedHandleOccaException(e); + } + try { + CeedCallBackend(ceed::occa::registerMethods(ceed)); + } catch (const ::occa::exception &e) { + CeedHandleOccaException(e); } + return CEED_ERROR_SUCCESS; } +} // namespace occa +} // namespace ceed CEED_INTERN int CeedRegister_Occa(void) { - int ierr; // General mode - ierr = CeedRegister("/*/occa", ceed::occa::registerBackend, 270); CeedChkBackend(ierr); + CeedCallBackend(CeedRegister("/*/occa", ceed::occa::registerBackend, 270)); // CPU Modes - ierr = CeedRegister("/cpu/self/occa",ceed::occa::registerBackend, 260); CeedChkBackend(ierr); - ierr = CeedRegister("/cpu/openmp/occa",ceed::occa::registerBackend, 250); CeedChkBackend(ierr); + CeedCallBackend(CeedRegister("/cpu/self/occa", ceed::occa::registerBackend, 260)); + CeedCallBackend(CeedRegister("/cpu/openmp/occa", ceed::occa::registerBackend, 250)); // GPU Modes - ierr = CeedRegister("/gpu/dpcpp/occa",ceed::occa::registerBackend, 240); CeedChkBackend(ierr); - ierr = CeedRegister("/gpu/opencl/occa",ceed::occa::registerBackend, 230); CeedChkBackend(ierr); - ierr = CeedRegister("/gpu/hip/occa",ceed::occa::registerBackend, 220); CeedChkBackend(ierr); - ierr = CeedRegister("/gpu/cuda/occa",ceed::occa::registerBackend, 210); CeedChkBackend(ierr); + 
CeedCallBackend(CeedRegister("/gpu/dpcpp/occa", ceed::occa::registerBackend, 240)); + CeedCallBackend(CeedRegister("/gpu/opencl/occa", ceed::occa::registerBackend, 230)); + CeedCallBackend(CeedRegister("/gpu/hip/occa", ceed::occa::registerBackend, 220)); + CeedCallBackend(CeedRegister("/gpu/cuda/occa", ceed::occa::registerBackend, 210)); return CEED_ERROR_SUCCESS; } diff --git a/backends/occa/ceed-occa.h b/backends/occa/ceed-occa.h index 574357de05..0e8b1b4747 100644 --- a/backends/occa/ceed-occa.h +++ b/backends/occa/ceed-occa.h @@ -5,9 +5,9 @@ // // This file is part of CEED: http://github.com/ceed -#include -#include #include +#include +#include #include #include #include @@ -36,7 +36,7 @@ typedef struct { CeedScalar *h_array; CeedScalar *h_array_allocated; - occaMemory d_array; + occaMemory d_array; } CeedVector_Occa; // ***************************************************************************** @@ -44,7 +44,7 @@ typedef struct { // ***************************************************************************** #define CEED_OCCA_NUM_RESTRICTION_KERNELS 8 typedef struct { - bool strided; + bool strided; occaMemory d_indices; occaMemory d_toffsets; occaMemory d_tindices; @@ -55,28 +55,28 @@ typedef struct { // * CeedBasis Occa struct // ***************************************************************************** typedef struct { - bool ready; + bool ready; CeedElemRestriction er; - occaMemory qref1d; - occaMemory qweight1d; - occaMemory interp1d; - occaMemory grad1d; - occaMemory tmp0,tmp1; - occaKernel kZero,kInterp,kGrad,kWeight; + occaMemory qref1d; + occaMemory qweight1d; + occaMemory interp1d; + occaMemory grad1d; + occaMemory tmp0, tmp1; + occaKernel kZero, kInterp, kGrad, kWeight; } CeedBasis_Occa; // ***************************************************************************** // * CeedOperator Occa struct // ***************************************************************************** typedef struct { - CeedVector *Evecs; /// E-vectors needed to apply 
operator (in followed by out) + CeedVector *Evecs; /// E-vectors needed to apply operator (in followed by out) CeedScalar **Edata; - CeedVector *evecsin; /// Input E-vectors needed to apply operator - CeedVector *evecsout; /// Output E-vectors needed to apply operator - CeedVector *qvecsin; /// Input Q-vectors needed to apply operator - CeedVector *qvecsout; /// Output Q-vectors needed to apply operator - CeedInt numein; - CeedInt numeout; + CeedVector *evecsin; /// Input E-vectors needed to apply operator + CeedVector *evecsout; /// Output E-vectors needed to apply operator + CeedVector *qvecsin; /// Input Q-vectors needed to apply operator + CeedVector *qvecsout; /// Output Q-vectors needed to apply operator + CeedInt numein; + CeedInt numeout; } CeedOperator_Occa; // ***************************************************************************** @@ -84,16 +84,16 @@ typedef struct { // ***************************************************************************** #define N_MAX_IDX 16 typedef struct { - bool ready; - CeedInt idx,odx; - CeedInt iOf7[N_MAX_IDX]; - CeedInt oOf7[N_MAX_IDX]; - int nc, dim, nelem, elemsize, e; - occaMemory o_indata, o_outdata; - occaMemory d_ctx, d_idx, d_odx; - char *oklPath; - const char *qFunctionName; - occaKernel kQFunctionApply; + bool ready; + CeedInt idx, odx; + CeedInt iOf7[N_MAX_IDX]; + CeedInt oOf7[N_MAX_IDX]; + int nc, dim, nelem, elemsize, e; + occaMemory o_indata, o_outdata; + occaMemory d_ctx, d_idx, d_odx; + char *oklPath; + const char *qFunctionName; + occaKernel kQFunctionApply; CeedOperator op; } CeedQFunction_Occa; @@ -110,35 +110,27 @@ typedef struct { // ***************************************************************************** typedef struct { occaDevice device; - bool ocl; - char *libceed_dir; - char *occa_cache_dir; + bool ocl; + char *libceed_dir; + char *occa_cache_dir; } Ceed_Occa; // ***************************************************************************** -CEED_INTERN int CeedOklPath_Occa(const Ceed, 
const char *, const char *, - char **); +CEED_INTERN int CeedOklPath_Occa(const Ceed, const char *, const char *, char **); // ***************************************************************************** CEED_INTERN int CeedOklDladdr_Occa(Ceed); // ***************************************************************************** -CEED_INTERN int CeedBasisCreateTensorH1_Occa(CeedInt dim, - CeedInt P1d, CeedInt Q1d, const CeedScalar *interp1d, const CeedScalar *grad1d, - const CeedScalar *qref1d, const CeedScalar *qweight1d, CeedBasis basis); +CEED_INTERN int CeedBasisCreateTensorH1_Occa(CeedInt dim, CeedInt P1d, CeedInt Q1d, const CeedScalar *interp1d, const CeedScalar *grad1d, + const CeedScalar *qref1d, const CeedScalar *qweight1d, CeedBasis basis); // ***************************************************************************** -CEED_INTERN int CeedBasisCreateH1_Occa(CeedElemTopology topo, - CeedInt dim, CeedInt ndof, CeedInt nqpts, - const CeedScalar *interp1d, - const CeedScalar *grad1d, - const CeedScalar *qref1d, - const CeedScalar *qweight1d, - CeedBasis basis); +CEED_INTERN int CeedBasisCreateH1_Occa(CeedElemTopology topo, CeedInt dim, CeedInt ndof, CeedInt nqpts, const CeedScalar *interp1d, + const CeedScalar *grad1d, const CeedScalar *qref1d, const CeedScalar *qweight1d, CeedBasis basis); // ***************************************************************************** -CEED_INTERN int CeedBasisApplyElems_Occa(CeedBasis basis, CeedInt Q, - CeedTransposeMode tmode, CeedEvalMode emode, const CeedVector u, CeedVector v); +CEED_INTERN int CeedBasisApplyElems_Occa(CeedBasis basis, CeedInt Q, CeedTransposeMode tmode, CeedEvalMode emode, const CeedVector u, CeedVector v); // ***************************************************************************** CEED_INTERN int CeedOperatorCreate_Occa(CeedOperator op); @@ -150,14 +142,12 @@ CEED_INTERN int CeedQFunctionCreate_Occa(CeedQFunction qf); CEED_INTERN int CeedQFunctionContextCreate_Occa(CeedQFunctionContext ctx); // 
***************************************************************************** -CEED_INTERN int CeedElemRestrictionCreate_Occa(const CeedMemType mtype, - const CeedCopyMode cmode, const CeedInt *indices, - const CeedElemRestriction res); +CEED_INTERN int CeedElemRestrictionCreate_Occa(const CeedMemType mtype, const CeedCopyMode cmode, const CeedInt *indices, + const CeedElemRestriction res); // ***************************************************************************** -CEED_INTERN int CeedElemRestrictionCreateBlocked_Occa(const CeedMemType mtype, - const CeedCopyMode cmode, const CeedInt *indices, - const CeedElemRestriction res); +CEED_INTERN int CeedElemRestrictionCreateBlocked_Occa(const CeedMemType mtype, const CeedCopyMode cmode, const CeedInt *indices, + const CeedElemRestriction res); // ***************************************************************************** CEED_INTERN int CeedVectorCreate_Occa(CeedInt n, CeedVector vec); diff --git a/backends/occa/kernels/elem-restriction.cpp b/backends/occa/kernels/elem-restriction.cpp index 05e89b8280..1ac6ba7a1a 100644 --- a/backends/occa/kernels/elem-restriction.cpp +++ b/backends/occa/kernels/elem-restriction.cpp @@ -27,132 +27,98 @@ const char *occa_elem_restriction_source = STRINGIFY_SOURCE( -@directive("#define PRINT_KERNEL_HASHES 0") - -typedef CeedScalar *QuadVector @dim(ELEMENT_SIZE, COMPONENT_COUNT, elementCount); - -@kernel -void applyRestriction(const CeedInt elementCount, - const CeedInt *indices, - CeedScalar *u, - QuadVector v) { - - @tile(TILE_SIZE, @outer, @inner) - for (int element = 0; element < elementCount; ++element) { - -@directive("#if PRINT_KERNEL_HASHES") - // Print to see which kernel is being run - if (element == 0) { - printf("\n\napplyRestriction Kernel: " OKL_KERNEL_HASH "\n\n"); - } -@directive("#endif") - -@directive("#if USES_INDICES") - for (int node = 0; node < ELEMENT_SIZE; ++node) { - const CeedInt index = indices[node + (element * ELEMENT_SIZE)]; - - for (int c = 0; c < 
COMPONENT_COUNT; ++c) { - v(node, c, element) = u[ - index + (c * UNSTRIDED_COMPONENT_STRIDE) - ]; - } - } -@directive("#else") - for (int node = 0; node < ELEMENT_SIZE; ++node) { - for (int c = 0; c < COMPONENT_COUNT; ++c) { - v(node, c, element) = u[ - (node * NODE_STRIDE) - + (c * COMPONENT_STRIDE) - + (element * ELEMENT_STRIDE) - ]; + @directive("#define PRINT_KERNEL_HASHES 0") + + typedef CeedScalar * + QuadVector @dim(ELEMENT_SIZE, COMPONENT_COUNT, elementCount); + + @kernel void applyRestriction(const CeedInt elementCount, const CeedInt *indices, CeedScalar *u, QuadVector v) { + @tile(TILE_SIZE, @outer, @inner) for (int element = 0; element < elementCount; ++element) { + @directive("#if PRINT_KERNEL_HASHES") + // Print to see which kernel is being run + if (element == 0) { + printf("\n\napplyRestriction Kernel: " OKL_KERNEL_HASH "\n\n"); + } + @directive("#endif") + + @directive("#if USES_INDICES") for (int node = 0; node < ELEMENT_SIZE; ++node) { + const CeedInt index = indices[node + (element * ELEMENT_SIZE)]; + + for (int c = 0; c < COMPONENT_COUNT; ++c) { + v(node, c, element) = u[index + (c * UNSTRIDED_COMPONENT_STRIDE)]; + } + } + @directive("#else") for (int node = 0; node < ELEMENT_SIZE; ++node) { + for (int c = 0; c < COMPONENT_COUNT; ++c) { + v(node, c, element) = u[(node * NODE_STRIDE) + (c * COMPONENT_STRIDE) + (element * ELEMENT_STRIDE)]; + } + } + @directive("#endif") } } -@directive("#endif") - } -} - -@directive("#if USES_INDICES") - -@kernel -void applyRestrictionTranspose(const CeedInt elementCount, - const CeedInt *quadIndices, - const CeedInt *dofOffsets, - const CeedInt *dofIndices, - const QuadVector u, - CeedScalar *v) { - @tile(TILE_SIZE, @outer, @inner) - for (int n = 0; n < NODE_COUNT; ++n) { - -@directive("#if PRINT_KERNEL_HASHES") - // Print to see which kernel is being run - if (n == 0) { - printf("\n\napplyRestrictionTranspose Kernel: " OKL_KERNEL_HASH "\n\n"); - } -@directive("#endif") - - CeedScalar vComp[COMPONENT_COUNT]; - 
// Prefetch index information - const CeedInt vIndex = quadIndices[n]; - const CeedInt offsetStart = dofOffsets[n]; - const CeedInt offsetEnd = dofOffsets[n + 1]; - - for (int c = 0; c < COMPONENT_COUNT; ++c) { - vComp[c] = 0; - } - - // Aggregate by component - for (CeedInt i = offsetStart; i < offsetEnd; ++i) { - const CeedInt index = dofIndices[i]; - - const int node = (index % ELEMENT_SIZE); - const int element = (index / ELEMENT_SIZE); - - for (int c = 0; c < COMPONENT_COUNT; ++c) { - vComp[c] += u(node, c, element); - } - } - - // Update dofs by component - for (int c = 0; c < COMPONENT_COUNT; ++c) { - v[ - vIndex + (c * UNSTRIDED_COMPONENT_STRIDE) - ] += vComp[c]; - } - } -} - -@directive("#else") // USES_INDICES = false - -@kernel -void applyRestrictionTranspose(const CeedInt elementCount, - const CeedInt *quadIndices, - const CeedInt *dofOffsets, - const CeedInt *dofIndices, - const QuadVector u, - CeedScalar *v) { - @tile(TILE_SIZE, @outer, @inner) - for (int element = 0; element < elementCount; ++element) { - -@directive("#if PRINT_KERNEL_HASHES") - // Print to see which kernel is being run - if (element == 0) { - printf("\n\napplyRestrictionTranspose Kernel: " OKL_KERNEL_HASH "\n\n"); - } -@directive("#endif") - - for (int node = 0; node < ELEMENT_SIZE; ++node) { - for (int c = 0; c < COMPONENT_COUNT; ++c) { - v[ - (node * NODE_STRIDE) - + (c * COMPONENT_STRIDE) - + (element * ELEMENT_STRIDE) - ] += u(node, c, element); + @directive("#if USES_INDICES") + + @kernel void applyRestrictionTranspose(const CeedInt elementCount, const CeedInt *quadIndices, const CeedInt *dofOffsets, + const CeedInt *dofIndices, const QuadVector u, CeedScalar *v) { + @tile(TILE_SIZE, @outer, @inner) for (int n = 0; n < NODE_COUNT; ++n) { + @directive("#if PRINT_KERNEL_HASHES") + // Print to see which kernel is being run + if (n == 0) { + printf("\n\napplyRestrictionTranspose Kernel: " OKL_KERNEL_HASH "\n\n"); + } + @directive("#endif") + + CeedScalar vComp[COMPONENT_COUNT]; + + 
// Prefetch index information + const CeedInt vIndex = quadIndices[n]; + const CeedInt offsetStart = dofOffsets[n]; + const CeedInt offsetEnd = dofOffsets[n + 1]; + + for (int c = 0; c < COMPONENT_COUNT; ++c) { + vComp[c] = 0; + } + + // Aggregate by component + for (CeedInt i = offsetStart; i < offsetEnd; ++i) { + const CeedInt index = dofIndices[i]; + + const int node = (index % ELEMENT_SIZE); + const int element = (index / ELEMENT_SIZE); + + for (int c = 0; c < COMPONENT_COUNT; ++c) { + vComp[c] += u(node, c, element); + } + } + + // Update dofs by component + for (int c = 0; c < COMPONENT_COUNT; ++c) { + v[vIndex + (c * UNSTRIDED_COMPONENT_STRIDE)] += vComp[c]; + } + } + } + + @directive("#else") // USES_INDICES = false + + @kernel void applyRestrictionTranspose(const CeedInt elementCount, const CeedInt *quadIndices, const CeedInt *dofOffsets, + const CeedInt *dofIndices, const QuadVector u, CeedScalar *v) { + @tile(TILE_SIZE, @outer, @inner) for (int element = 0; element < elementCount; ++element) { + @directive("#if PRINT_KERNEL_HASHES") + // Print to see which kernel is being run + if (element == 0) { + printf("\n\napplyRestrictionTranspose Kernel: " OKL_KERNEL_HASH "\n\n"); + } + @directive("#endif") + + for (int node = 0; node < ELEMENT_SIZE; ++node) { + for (int c = 0; c < COMPONENT_COUNT; ++c) { + v[(node * NODE_STRIDE) + (c * COMPONENT_STRIDE) + (element * ELEMENT_STRIDE)] += u(node, c, element); + } + } } } - } -} -@directive("#endif") // USES_INDICES + @directive("#endif") // USES_INDICES ); diff --git a/backends/occa/kernels/set-value.cpp b/backends/occa/kernels/set-value.cpp index 113d190ccd..dbac90e80a 100644 --- a/backends/occa/kernels/set-value.cpp +++ b/backends/occa/kernels/set-value.cpp @@ -14,13 +14,10 @@ // Expects the following constants to be defined: // - BLOCK_SIZE : CeedInt -const char *occa_set_value_source = STRINGIFY_SOURCE( +const char* occa_set_value_source = STRINGIFY_SOURCE( - @kernel - void setValue(CeedScalar* ptr,const 
CeedScalar value,const CeedInt count) { - @tile(BLOCK_SIZE,@outer,@inner) - for(CeedInt i=0; i < count; ++i) { - ptr[i] = value; - } - } -); + @kernel void setValue(CeedScalar* ptr, const CeedScalar value, const CeedInt count) { + @tile(BLOCK_SIZE, @outer, @inner) for (CeedInt i = 0; i < count; ++i) { + ptr[i] = value; + } + }); diff --git a/backends/occa/kernels/simplex-basis/cpu-simplex-basis.cpp b/backends/occa/kernels/simplex-basis/cpu-simplex-basis.cpp index d541580f8a..f9daa85093 100644 --- a/backends/occa/kernels/simplex-basis/cpu-simplex-basis.cpp +++ b/backends/occa/kernels/simplex-basis/cpu-simplex-basis.cpp @@ -9,177 +9,130 @@ const char *occa_simplex_basis_cpu_function_source = STRINGIFY_SOURCE( -@directive("#define SIMPLEX_FUNCTION(FUNCTION_NAME) simplex_ ## DIM ## d_ ## FUNCTION_NAME ## _Q ## Q ## _P ## P") - -inline void SIMPLEX_FUNCTION(interpElement)( - const CeedScalar *B @dim(P, Q), - const CeedScalar *Ue, - CeedScalar *Ve -) { - for (int q = 0; q < Q; ++q) { - CeedScalar v = 0; - for (int p = 0; p < P; ++p) { - v += B(p, q) * Ue[p]; - } - Ve[q] = v; - } -} - -inline void SIMPLEX_FUNCTION(interpElementTranspose)( - const CeedScalar *B @dim(P, Q), - const CeedScalar *Ue, - CeedScalar *Ve -) { - for (int p = 0; p < P; ++p) { - CeedScalar v = 0; - for (int q = 0; q < Q; ++q) { - v += B(p, q) * Ue[q]; - } - Ve[p] = v; - } -} - -inline void SIMPLEX_FUNCTION(gradElement)( - const CeedScalar *Bx @dim(P, Q, DIM), - const CeedScalar *Ue, - CeedScalar *Ve, -) { - for (int q = 0; q < Q; ++q) { - CeedScalar v[DIM]; - for (int dim = 0; dim < DIM; ++dim) { - v[dim] = 0; + @directive("#define SIMPLEX_FUNCTION(FUNCTION_NAME) simplex_ ## DIM ## d_ ## FUNCTION_NAME ## _Q ## Q ## _P ## P") + + inline void SIMPLEX_FUNCTION(interpElement)(const CeedScalar *B @dim(P, Q), const CeedScalar *Ue, CeedScalar *Ve) { + for (int q = 0; q < Q; ++q) { + CeedScalar v = 0; + for (int p = 0; p < P; ++p) { + v += B(p, q) * Ue[p]; + } + Ve[q] = v; + } + } + + inline void 
SIMPLEX_FUNCTION(interpElementTranspose)(const CeedScalar *B @dim(P, Q), const CeedScalar *Ue, CeedScalar *Ve) { + for (int p = 0; p < P; ++p) { + CeedScalar v = 0; + for (int q = 0; q < Q; ++q) { + v += B(p, q) * Ue[q]; + } + Ve[p] = v; + } } - for (int p = 0; p < P; ++p) { - const CeedScalar u = Ue[p]; - for (int dim = 0; dim < DIM; ++dim) { - v[dim] += Bx(p, q, dim) * u; + inline void SIMPLEX_FUNCTION(gradElement)(const CeedScalar *Bx @dim(P, Q, DIM), const CeedScalar *Ue, CeedScalar *Ve, ) { + for (int q = 0; q < Q; ++q) { + CeedScalar v[DIM]; + for (int dim = 0; dim < DIM; ++dim) { + v[dim] = 0; + } + + for (int p = 0; p < P; ++p) { + const CeedScalar u = Ue[p]; + for (int dim = 0; dim < DIM; ++dim) { + v[dim] += Bx(p, q, dim) * u; + } + } + + for (int dim = 0; dim < DIM; ++dim) { + Ve[dim * Q + q] = v[dim]; + } } } - for (int dim = 0; dim < DIM; ++dim) { - Ve[dim*Q + q] = v[dim]; + inline void SIMPLEX_FUNCTION(gradElementTranspose)(const CeedScalar *Bx @dim(P, Q, DIM), const CeedScalar *Ue, CeedScalar *Ve) { + for (int p = 0; p < P; ++p) { + CeedScalar v = 0; + for (int dim = 0; dim < DIM; ++dim) { + for (int q = 0; q < Q; ++q) { + v += Bx(p, q, dim) * Ue[dim * Q + q]; + } + } + Ve[p] = v; + } } - } -} - -inline void SIMPLEX_FUNCTION(gradElementTranspose)( - const CeedScalar *Bx @dim(P, Q, DIM), - const CeedScalar *Ue, - CeedScalar *Ve -) { - for (int p = 0; p < P; ++p) { - CeedScalar v = 0; - for (int dim = 0; dim < DIM; ++dim) { + + inline void SIMPLEX_FUNCTION(weightElement)(const CeedScalar *qWeights, CeedScalar *We) { for (int q = 0; q < Q; ++q) { - v += Bx(p, q, dim) * Ue[dim*Q + q]; + We[q] = qWeights[q]; } } - Ve[p] = v; - } -} - -inline void SIMPLEX_FUNCTION(weightElement)( - const CeedScalar *qWeights, - CeedScalar *We -) { - for (int q = 0; q < Q; ++q) { - We[q] = qWeights[q]; - } -} ); const char *occa_simplex_basis_cpu_kernel_source = STRINGIFY_SOURCE( -@kernel void interp(const CeedInt elementCount, - const CeedScalar *B, - const CeedScalar *U, 
- CeedScalar *V) { - for (int element = 0; element < elementCount; ++element; @outer) { - for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) { - if (!TRANSPOSE) { - const CeedScalar *Ue @dim(P, BASIS_COMPONENT_COUNT, elementCount) = U; - CeedScalar *Ve @dim(Q, elementCount, BASIS_COMPONENT_COUNT) = V; - - SIMPLEX_FUNCTION(interpElement)( - B, - &Ue(0, component, element), - &Ve(0, element, component) - ); - } else { - const CeedScalar *Ue @dim(Q, elementCount, BASIS_COMPONENT_COUNT) = U; - CeedScalar *Ve @dim(P, BASIS_COMPONENT_COUNT, elementCount) = V; - - SIMPLEX_FUNCTION(interpElementTranspose)( - B, - &Ue(0, element, component), - &Ve(0, component, element) - ); - } - } - } -} - -@kernel void grad(const CeedInt elementCount, - const CeedScalar *Bx, - const CeedScalar *U, - CeedScalar *V) { - for (int element = 0; element < elementCount; ++element; @outer) { - for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) { - if (!TRANSPOSE) { - const CeedScalar *Ue @dim(P, BASIS_COMPONENT_COUNT, elementCount) = U; - CeedScalar *_Ve @dim(Q, elementCount, BASIS_COMPONENT_COUNT, DIM) = V; - - CeedScalar Ve[DIM][Q]; - for (int dim = 0; dim < DIM; ++dim) { - for (int q = 0; q < Q; ++q) { - Ve[dim][q] = _Ve(q, element, component, dim); - } - } + @kernel void interp(const CeedInt elementCount, const CeedScalar *B, const CeedScalar *U, CeedScalar *V) { + for (int element = 0; element < elementCount; ++element; @outer) { + for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) { + if (!TRANSPOSE) { + const CeedScalar *Ue @dim(P, BASIS_COMPONENT_COUNT, elementCount) = U; + CeedScalar *Ve @dim(Q, elementCount, BASIS_COMPONENT_COUNT) = V; - SIMPLEX_FUNCTION(gradElement)( - Bx, - &Ue(0, component, element), - (CeedScalar*) Ve - ); + SIMPLEX_FUNCTION(interpElement)(B, &Ue(0, component, element), &Ve(0, element, component)); + } else { + const CeedScalar *Ue @dim(Q, elementCount, BASIS_COMPONENT_COUNT) = 
U; + CeedScalar *Ve @dim(P, BASIS_COMPONENT_COUNT, elementCount) = V; - for (int dim = 0; dim < DIM; ++dim) { - for (int q = 0; q < Q; ++q) { - _Ve(q, element, component, dim) = Ve[dim][q]; + SIMPLEX_FUNCTION(interpElementTranspose)(B, &Ue(0, element, component), &Ve(0, component, element)); } } - } else { - const CeedScalar *_Ue @dim(Q, elementCount, BASIS_COMPONENT_COUNT, DIM) = U; - CeedScalar *Ve @dim(P, BASIS_COMPONENT_COUNT, elementCount) = V; + } + } - CeedScalar Ue[DIM][Q]; - for (int dim = 0; dim < DIM; ++dim) { - for (int q = 0; q < Q; ++q) { - Ue[dim][q] = _Ue(q, element, component, dim); + @kernel void grad(const CeedInt elementCount, const CeedScalar *Bx, const CeedScalar *U, CeedScalar *V) { + for (int element = 0; element < elementCount; ++element; @outer) { + for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) { + if (!TRANSPOSE) { + const CeedScalar *Ue @dim(P, BASIS_COMPONENT_COUNT, elementCount) = U; + CeedScalar *_Ve @dim(Q, elementCount, BASIS_COMPONENT_COUNT, DIM) = V; + + CeedScalar Ve[DIM][Q]; + for (int dim = 0; dim < DIM; ++dim) { + for (int q = 0; q < Q; ++q) { + Ve[dim][q] = _Ve(q, element, component, dim); + } + } + + SIMPLEX_FUNCTION(gradElement)(Bx, &Ue(0, component, element), (CeedScalar *)Ve); + + for (int dim = 0; dim < DIM; ++dim) { + for (int q = 0; q < Q; ++q) { + _Ve(q, element, component, dim) = Ve[dim][q]; + } + } + } else { + const CeedScalar *_Ue @dim(Q, elementCount, BASIS_COMPONENT_COUNT, DIM) = U; + CeedScalar *Ve @dim(P, BASIS_COMPONENT_COUNT, elementCount) = V; + + CeedScalar Ue[DIM][Q]; + for (int dim = 0; dim < DIM; ++dim) { + for (int q = 0; q < Q; ++q) { + Ue[dim][q] = _Ue(q, element, component, dim); + } + } + + SIMPLEX_FUNCTION(gradElementTranspose)(Bx, (CeedScalar *)Ue, &Ve(0, component, element)); } } + } + } - SIMPLEX_FUNCTION(gradElementTranspose)( - Bx, - (CeedScalar*) Ue, - &Ve(0, component, element) - ); + @kernel void weight(const CeedInt elementCount, const CeedScalar 
*qWeights, CeedScalar *W @dim(Q, elementCount)) { + @tile(32, @outer, @inner) for (int element = 0; element < elementCount; ++element) { + SIMPLEX_FUNCTION(weightElement)(qWeights, &W(0, element)); } } - } -} - -@kernel void weight(const CeedInt elementCount, - const CeedScalar *qWeights, - CeedScalar *W @dim(Q, elementCount)) { - @tile(32, @outer, @inner) - for (int element = 0; element < elementCount; ++element) { - SIMPLEX_FUNCTION(weightElement)( - qWeights, - &W(0, element) - ); - } -} ); diff --git a/backends/occa/kernels/simplex-basis/gpu-simplex-basis.cpp b/backends/occa/kernels/simplex-basis/gpu-simplex-basis.cpp index 95ac8ce445..80a04fb443 100644 --- a/backends/occa/kernels/simplex-basis/gpu-simplex-basis.cpp +++ b/backends/occa/kernels/simplex-basis/gpu-simplex-basis.cpp @@ -9,145 +9,130 @@ const char *occa_simplex_basis_gpu_source = STRINGIFY_SOURCE( -@directive("#if TRANSPOSE") - typedef CeedScalar* dofArray @dim(Q, elementCount, BASIS_COMPONENT_COUNT, DIM); - typedef CeedScalar* quadArray @dim(P, BASIS_COMPONENT_COUNT, elementCount, DIM); -@directive("#else") - typedef CeedScalar* dofArray @dim(P, BASIS_COMPONENT_COUNT, elementCount, DIM); - typedef CeedScalar* quadArray @dim(Q, elementCount, BASIS_COMPONENT_COUNT, DIM); -@directive("#endif") - -typedef CeedScalar* quadToDof @dim(P, Q); -typedef CeedScalar* dQuadToDof @dim(P, Q, DIM); -typedef CeedScalar* elementWeightArray @dim(Q, elementCount); - -@kernel void interp(const CeedInt elementCount, - const quadToDof B, - const dofArray U, - quadArray V) { - for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) { - @shared CeedScalar s_B[P * Q] @dim(P, Q); - - // Store weights in shared memory - for (int i = 0; i < MAX_PQ; ++i; @inner) { - for (int j = i; j < (P * Q); j+= MAX_PQ) { - s_B[j] = B[j]; - } - } + @directive("#if TRANSPOSE") typedef CeedScalar * dofArray @dim(Q, elementCount, BASIS_COMPONENT_COUNT, DIM); + typedef CeedScalar * quadArray @dim(P, 
BASIS_COMPONENT_COUNT, elementCount, DIM); + @directive("#else") typedef CeedScalar * dofArray @dim(P, BASIS_COMPONENT_COUNT, elementCount, DIM); + typedef CeedScalar * quadArray @dim(Q, elementCount, BASIS_COMPONENT_COUNT, DIM); @directive("#endif") + + typedef CeedScalar * + quadToDof @dim(P, Q); + typedef CeedScalar * dQuadToDof @dim(P, Q, DIM); typedef CeedScalar * elementWeightArray @dim(Q, elementCount); + + @kernel void interp(const CeedInt elementCount, const quadToDof B, const dofArray U, quadArray V) { + for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) { + @shared CeedScalar s_B[P * Q] @dim(P, Q); + + // Store weights in shared memory + for (int i = 0; i < MAX_PQ; ++i; @inner) { + for (int j = i; j < (P * Q); j += MAX_PQ) { + s_B[j] = B[j]; + } + } - for (int localElement = 0; localElement < ELEMENTS_PER_BLOCK; ++localElement) { - for (int i = 0; i < MAX_PQ; ++i; @inner) { - const int element = elementOffset + localElement; - if (element < elementCount) { - - // Element operation - for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component) { - if (!TRANSPOSE) { - const int q = i; - if (q < Q) { - CeedScalar v = 0; - for (int p = 0; p < P; ++p) { - v += s_B(p, q) * U(p, component, element, 0); - } - V(q, element, component, 0) = v; - } - } else { - const int p = i; - if (p < P) { - CeedScalar v = 0; - for (int q = 0; q < Q; ++q) { - v += s_B(p, q) * U(q, element, component, 0); + for (int localElement = 0; localElement < ELEMENTS_PER_BLOCK; ++localElement) { + for (int i = 0; i < MAX_PQ; ++i; @inner) { + const int element = elementOffset + localElement; + if (element < elementCount) { + // Element operation + for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component) { + if (!TRANSPOSE) { + const int q = i; + if (q < Q) { + CeedScalar v = 0; + for (int p = 0; p < P; ++p) { + v += s_B(p, q) * U(p, component, element, 0); + } + V(q, element, component, 0) = v; + } + } else { + 
const int p = i; + if (p < P) { + CeedScalar v = 0; + for (int q = 0; q < Q; ++q) { + v += s_B(p, q) * U(q, element, component, 0); + } + V(p, component, element, 0) = v; + } } - V(p, component, element, 0) = v; } } } - } } } - } -} - -@kernel void grad(const CeedInt elementCount, - const dQuadToDof Bx, - const dofArray U, - quadArray V) { - for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) { - @shared CeedScalar s_Bx[Q * P * DIM] @dim(P, Q, DIM); - - // Store weights in shared memory - for (int i = 0; i < MAX_PQ; ++i; @inner) { - for (int j = i; j < (P * Q * DIM); j+= MAX_PQ) { - s_Bx[j] = Bx[j]; - } - } - for (int localElement = 0; localElement < ELEMENTS_PER_BLOCK; ++localElement) { - for (int i = 0; i < MAX_PQ; ++i; @inner) { - const int element = elementOffset + localElement; - if (element < elementCount) { - - // Element operation - for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component) { - if (!TRANSPOSE) { - const int q = i; - if (q < Q) { - CeedScalar v[DIM]; - for (int dim = 0; dim < DIM; ++dim) { - v[dim] = 0; - } + @kernel void grad(const CeedInt elementCount, const dQuadToDof Bx, const dofArray U, quadArray V) { + for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) { + @shared CeedScalar s_Bx[Q * P * DIM] @dim(P, Q, DIM); - for (int p = 0; p < P; ++p) { - const CeedScalar u = U(p, component, element, 0); - for (int dim = 0; dim < DIM; ++dim) { - v[dim] += s_Bx(p, q, dim) * u; - } - } + // Store weights in shared memory + for (int i = 0; i < MAX_PQ; ++i; @inner) { + for (int j = i; j < (P * Q * DIM); j += MAX_PQ) { + s_Bx[j] = Bx[j]; + } + } - for (int dim = 0; dim < DIM; ++dim) { - V(q, element, component, dim) = v[dim]; - } - } - } else { - const int p = i; - if (p < P) { - CeedScalar v = 0; - for (int dim = 0; dim < DIM; ++dim) { - for (int q = 0; q < Q; ++q) { - v += s_Bx(p, q, dim) * U(q, element, component, dim); + for (int 
localElement = 0; localElement < ELEMENTS_PER_BLOCK; ++localElement) { + for (int i = 0; i < MAX_PQ; ++i; @inner) { + const int element = elementOffset + localElement; + if (element < elementCount) { + // Element operation + for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component) { + if (!TRANSPOSE) { + const int q = i; + if (q < Q) { + CeedScalar v[DIM]; + for (int dim = 0; dim < DIM; ++dim) { + v[dim] = 0; + } + + for (int p = 0; p < P; ++p) { + const CeedScalar u = U(p, component, element, 0); + for (int dim = 0; dim < DIM; ++dim) { + v[dim] += s_Bx(p, q, dim) * u; + } + } + + for (int dim = 0; dim < DIM; ++dim) { + V(q, element, component, dim) = v[dim]; + } + } + } else { + const int p = i; + if (p < P) { + CeedScalar v = 0; + for (int dim = 0; dim < DIM; ++dim) { + for (int q = 0; q < Q; ++q) { + v += s_Bx(p, q, dim) * U(q, element, component, dim); + } + } + V(p, component, element, 0) = v; } } - V(p, component, element, 0) = v; } } } - } } } - } -} -@kernel void weight(const CeedInt elementCount, - const CeedScalar *qWeights, - elementWeightArray W) { - for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) { - @shared CeedScalar s_qWeights[Q]; + @kernel void weight(const CeedInt elementCount, const CeedScalar *qWeights, elementWeightArray W) { + for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) { + @shared CeedScalar s_qWeights[Q]; - for (int q = 0; q < Q; ++q; @inner) { - s_qWeights[q] = qWeights[q]; - } - - for (int localElement = 0; localElement < ELEMENTS_PER_BLOCK; ++localElement) { - const int element = elementOffset + localElement; - if (element < elementCount) { for (int q = 0; q < Q; ++q; @inner) { - W(q, element) = s_qWeights[q]; + s_qWeights[q] = qWeights[q]; + } + + for (int localElement = 0; localElement < ELEMENTS_PER_BLOCK; ++localElement) { + const int element = elementOffset + localElement; + if (element < elementCount) { + 
for (int q = 0; q < Q; ++q; @inner) { + W(q, element) = s_qWeights[q]; + } + } } } } - } -} ); diff --git a/backends/occa/kernels/tensor-basis/cpu/tensor-basis-1d.cpp b/backends/occa/kernels/tensor-basis/cpu/tensor-basis-1d.cpp index 1a543926e9..d3982f4d17 100644 --- a/backends/occa/kernels/tensor-basis/cpu/tensor-basis-1d.cpp +++ b/backends/occa/kernels/tensor-basis/cpu/tensor-basis-1d.cpp @@ -9,150 +9,100 @@ const char *occa_tensor_basis_1d_cpu_function_source = STRINGIFY_SOURCE( -@directive("#define TENSOR_FUNCTION(FUNCTION_NAME) tensor_1d_ ## FUNCTION_NAME ## _Q ## Q1D ## _P ## P1D") - -inline void TENSOR_FUNCTION(interpElement)( - const CeedScalar *B @dim(P1D, Q1D), - const CeedScalar *Ue, - CeedScalar *Ve -) { - for (int q = 0; q < Q1D; ++q) { - CeedScalar Vq = 0; - for (int p = 0; p < P1D; ++p) { - Vq += B(p, q) * Ue[p]; + @directive("#define TENSOR_FUNCTION(FUNCTION_NAME) tensor_1d_ ## FUNCTION_NAME ## _Q ## Q1D ## _P ## P1D") + + inline void TENSOR_FUNCTION(interpElement)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Ue, CeedScalar *Ve) { + for (int q = 0; q < Q1D; ++q) { + CeedScalar Vq = 0; + for (int p = 0; p < P1D; ++p) { + Vq += B(p, q) * Ue[p]; + } + Ve[q] = Vq; + } + } + + inline void TENSOR_FUNCTION(interpElementTranspose)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Ue, CeedScalar *Ve) { + for (int p = 0; p < P1D; ++p) { + CeedScalar Vp = 0; + for (int q = 0; q < Q1D; ++q) { + Vp += B(p, q) * Ue[q]; + } + Ve[p] = Vp; + } } - Ve[q] = Vq; - } -} - -inline void TENSOR_FUNCTION(interpElementTranspose)( - const CeedScalar *B @dim(P1D, Q1D), - const CeedScalar *Ue, - CeedScalar *Ve -) { - for (int p = 0; p < P1D; ++p) { - CeedScalar Vp = 0; - for (int q = 0; q < Q1D; ++q) { - Vp += B(p, q) * Ue[q]; + + inline void TENSOR_FUNCTION(gradElement)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Bx @dim(P1D, Q1D), const CeedScalar *Ue, + CeedScalar *Ve) { + for (int q = 0; q < Q1D; ++q) { + CeedScalar Vq = 0; + for (int p = 0; p < P1D; 
++p) { + Vq += Bx(p, q) * Ue[p]; + } + Ve[q] = Vq; + } } - Ve[p] = Vp; - } -} - -inline void TENSOR_FUNCTION(gradElement)( - const CeedScalar *B @dim(P1D, Q1D), - const CeedScalar *Bx @dim(P1D, Q1D), - const CeedScalar *Ue, - CeedScalar *Ve -) { - for (int q = 0; q < Q1D; ++q) { - CeedScalar Vq = 0; - for (int p = 0; p < P1D; ++p) { - Vq += Bx(p, q) * Ue[p]; + + inline void TENSOR_FUNCTION(gradElementTranspose)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Bx @dim(P1D, Q1D), const CeedScalar *Ue, + CeedScalar *Ve) { + for (int p = 0; p < P1D; ++p) { + CeedScalar Vp = 0; + for (int q = 0; q < Q1D; ++q) { + Vp += Bx(p, q) * Ue[q]; + } + Ve[p] = Vp; + } } - Ve[q] = Vq; - } -} - -inline void TENSOR_FUNCTION(gradElementTranspose)( - const CeedScalar *B @dim(P1D, Q1D), - const CeedScalar *Bx @dim(P1D, Q1D), - const CeedScalar *Ue, - CeedScalar *Ve -) { - for (int p = 0; p < P1D; ++p) { - CeedScalar Vp = 0; - for (int q = 0; q < Q1D; ++q) { - Vp += Bx(p, q) * Ue[q]; + + inline void TENSOR_FUNCTION(weightElement)(const CeedScalar *qWeights1D, CeedScalar *We) { + for (int q = 0; q < Q1D; ++q) { + We[q] = qWeights1D[q]; + } } - Ve[p] = Vp; - } -} - -inline void TENSOR_FUNCTION(weightElement)( - const CeedScalar *qWeights1D, - CeedScalar *We -) { - for (int q = 0; q < Q1D; ++q) { - We[q] = qWeights1D[q]; - } -} ); const char *occa_tensor_basis_1d_cpu_kernel_source = STRINGIFY_SOURCE( -@kernel void interp(const CeedInt elementCount, - const CeedScalar *B, - const CeedScalar *U, - CeedScalar *V) { - for (int element = 0; element < elementCount; ++element; @outer) { - for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) { - if (!TRANSPOSE) { - const CeedScalar *Ue @dim(P1D, BASIS_COMPONENT_COUNT, elementCount) = U; - CeedScalar *Ve @dim(Q1D, elementCount, BASIS_COMPONENT_COUNT) = V; - - TENSOR_FUNCTION(interpElement)( - B, - &Ue(0, component, element), - &Ve(0, element, component) - ); - } else { - const CeedScalar *Ue @dim(Q1D, elementCount, 
BASIS_COMPONENT_COUNT) = U; - CeedScalar *Ve @dim(P1D, BASIS_COMPONENT_COUNT, elementCount) = V; - - TENSOR_FUNCTION(interpElementTranspose)( - B, - &Ue(0, element, component), - &Ve(0, component, element) - ); + @kernel void interp(const CeedInt elementCount, const CeedScalar *B, const CeedScalar *U, CeedScalar *V) { + for (int element = 0; element < elementCount; ++element; @outer) { + for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) { + if (!TRANSPOSE) { + const CeedScalar *Ue @dim(P1D, BASIS_COMPONENT_COUNT, elementCount) = U; + CeedScalar *Ve @dim(Q1D, elementCount, BASIS_COMPONENT_COUNT) = V; + + TENSOR_FUNCTION(interpElement)(B, &Ue(0, component, element), &Ve(0, element, component)); + } else { + const CeedScalar *Ue @dim(Q1D, elementCount, BASIS_COMPONENT_COUNT) = U; + CeedScalar *Ve @dim(P1D, BASIS_COMPONENT_COUNT, elementCount) = V; + + TENSOR_FUNCTION(interpElementTranspose)(B, &Ue(0, element, component), &Ve(0, component, element)); + } + } } } - } -} - -@kernel void grad(const CeedInt elementCount, - const CeedScalar *B, - const CeedScalar *Bx, - const CeedScalar *U, - CeedScalar *V) { - for (int element = 0; element < elementCount; ++element; @outer) { - for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) { - if (!TRANSPOSE) { - const CeedScalar *Ue @dim(P1D, BASIS_COMPONENT_COUNT, elementCount) = U; - CeedScalar *Ve @dim(Q1D, elementCount, BASIS_COMPONENT_COUNT) = V; - - TENSOR_FUNCTION(gradElement)( - B, - Bx, - &Ue(0, component, element), - &Ve(0, element, component) - ); - } else { - const CeedScalar *Ue @dim(Q1D, elementCount, BASIS_COMPONENT_COUNT) = U; - CeedScalar *Ve @dim(P1D, BASIS_COMPONENT_COUNT, elementCount) = V; - - TENSOR_FUNCTION(gradElementTranspose)( - B, - Bx, - &Ue(0, element, component), - &Ve(0, component, element) - ); + + @kernel void grad(const CeedInt elementCount, const CeedScalar *B, const CeedScalar *Bx, const CeedScalar *U, CeedScalar *V) { + for (int element = 
0; element < elementCount; ++element; @outer) { + for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) { + if (!TRANSPOSE) { + const CeedScalar *Ue @dim(P1D, BASIS_COMPONENT_COUNT, elementCount) = U; + CeedScalar *Ve @dim(Q1D, elementCount, BASIS_COMPONENT_COUNT) = V; + + TENSOR_FUNCTION(gradElement)(B, Bx, &Ue(0, component, element), &Ve(0, element, component)); + } else { + const CeedScalar *Ue @dim(Q1D, elementCount, BASIS_COMPONENT_COUNT) = U; + CeedScalar *Ve @dim(P1D, BASIS_COMPONENT_COUNT, elementCount) = V; + + TENSOR_FUNCTION(gradElementTranspose)(B, Bx, &Ue(0, element, component), &Ve(0, component, element)); + } + } + } + } + + @kernel void weight(const CeedInt elementCount, const CeedScalar *qWeights1D, CeedScalar *W @dim(Q1D, elementCount)) { + @tile(32, @outer, @inner) for (int element = 0; element < elementCount; ++element) { + TENSOR_FUNCTION(weightElement)(qWeights1D, &W(0, element)); } } - } -} - -@kernel void weight(const CeedInt elementCount, - const CeedScalar *qWeights1D, - CeedScalar *W @dim(Q1D, elementCount)) { - @tile(32, @outer, @inner) - for (int element = 0; element < elementCount; ++element) { - TENSOR_FUNCTION(weightElement)( - qWeights1D, - &W(0, element) - ); - } -} ); diff --git a/backends/occa/kernels/tensor-basis/cpu/tensor-basis-2d.cpp b/backends/occa/kernels/tensor-basis/cpu/tensor-basis-2d.cpp index 1a237d7258..5d975e55f0 100644 --- a/backends/occa/kernels/tensor-basis/cpu/tensor-basis-2d.cpp +++ b/backends/occa/kernels/tensor-basis/cpu/tensor-basis-2d.cpp @@ -9,254 +9,203 @@ const char *occa_tensor_basis_2d_cpu_function_source = STRINGIFY_SOURCE( -@directive("#define TENSOR_FUNCTION(FUNCTION_NAME) tensor_2d_ ## FUNCTION_NAME ## _Q ## Q1D ## _P ## P1D") - -inline void TENSOR_FUNCTION(interpElement)( - const CeedScalar *B @dim(P1D, Q1D), - const CeedScalar *Ue @dim(P1D, P1D), - CeedScalar *Ve @dim(Q1D, Q1D) -) { - for (int qy = 0; qy < Q1D; ++qy) { - for (int qx = 0; qx < Q1D; ++qx) { - Ve(qx, qy) = 
0; - } - } - - for (int py = 0; py < P1D; ++py) { - CeedScalar V_x[Q1D]; - for (int qx = 0; qx < Q1D; ++qx) { - V_x[qx] = 0; - } - - for (int px = 0; px < P1D; ++px) { - const CeedScalar Up = Ue(px, py); - for (int qx = 0; qx < Q1D; ++qx) { - V_x[qx] += B(px, qx) * Up; + @directive("#define TENSOR_FUNCTION(FUNCTION_NAME) tensor_2d_ ## FUNCTION_NAME ## _Q ## Q1D ## _P ## P1D") + + inline void TENSOR_FUNCTION(interpElement)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Ue @dim(P1D, P1D), + CeedScalar *Ve @dim(Q1D, Q1D)) { + for (int qy = 0; qy < Q1D; ++qy) { + for (int qx = 0; qx < Q1D; ++qx) { + Ve(qx, qy) = 0; + } + } + + for (int py = 0; py < P1D; ++py) { + CeedScalar V_x[Q1D]; + for (int qx = 0; qx < Q1D; ++qx) { + V_x[qx] = 0; + } + + for (int px = 0; px < P1D; ++px) { + const CeedScalar Up = Ue(px, py); + for (int qx = 0; qx < Q1D; ++qx) { + V_x[qx] += B(px, qx) * Up; + } + } + + for (int qy = 0; qy < Q1D; ++qy) { + const CeedScalar w = B(py, qy); + for (int qx = 0; qx < Q1D; ++qx) { + Ve(qx, qy) += w * V_x[qx]; + } + } + } + } + + inline void TENSOR_FUNCTION(interpElementTranspose)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Ue @dim(Q1D, Q1D), + CeedScalar *Ve @dim(P1D, P1D)) { + for (int py = 0; py < P1D; ++py) { + for (int px = 0; px < P1D; ++px) { + Ve(px, py) = 0; + } } - } - for (int qy = 0; qy < Q1D; ++qy) { - const CeedScalar w = B(py, qy); - for (int qx = 0; qx < Q1D; ++qx) { - Ve(qx, qy) += w * V_x[qx]; + for (int qy = 0; qy < Q1D; ++qy) { + CeedScalar V_x[P1D]; + for (int py = 0; py < P1D; ++py) { + V_x[py] = 0; + } + + for (int qx = 0; qx < Q1D; ++qx) { + const CeedScalar Up = Ue(qx, qy); + for (int px = 0; px < P1D; ++px) { + V_x[px] += B(px, qx) * Up; + } + } + + for (int py = 0; py < P1D; ++py) { + const CeedScalar w = B(py, qy); + for (int px = 0; px < P1D; ++px) { + Ve(px, py) += w * V_x[px]; + } + } } } - } -} - -inline void TENSOR_FUNCTION(interpElementTranspose)( - const CeedScalar *B @dim(P1D, Q1D), - const CeedScalar *Ue 
@dim(Q1D, Q1D), - CeedScalar *Ve @dim(P1D, P1D) -) { - for (int py = 0; py < P1D; ++py) { - for (int px = 0; px < P1D; ++px) { - Ve(px, py) = 0; - } - } - for (int qy = 0; qy < Q1D; ++qy) { - CeedScalar V_x[P1D]; - for (int py = 0; py < P1D; ++py) { - V_x[py] = 0; - } - - for (int qx = 0; qx < Q1D; ++qx) { - const CeedScalar Up = Ue(qx, qy); - for (int px = 0; px < P1D; ++px) { - V_x[px] += B(px, qx) * Up; + inline void TENSOR_FUNCTION(gradElement)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Bx @dim(P1D, Q1D), + const CeedScalar *Ue @dim(P1D, P1D), CeedScalar *Ve_x @dim(Q1D, Q1D), CeedScalar *Ve_y @dim(Q1D, Q1D)) { + CeedScalar grad[Q1D][Q1D][2]; + for (int qy = 0; qy < Q1D; ++qy) { + for (int qx = 0; qx < Q1D; ++qx) { + grad[qy][qx][0] = 0; + grad[qy][qx][1] = 0; + } } - } - for (int py = 0; py < P1D; ++py) { - const CeedScalar w = B(py, qy); - for (int px = 0; px < P1D; ++px) { - Ve(px, py) += w * V_x[px]; + for (int py = 0; py < P1D; ++py) { + CeedScalar gradX[Q1D][2]; + for (int qx = 0; qx < Q1D; ++qx) { + gradX[qx][0] = 0; + gradX[qx][1] = 0; + } + + for (int px = 0; px < P1D; ++px) { + const CeedScalar Up = Ue(px, py); + for (int qx = 0; qx < Q1D; ++qx) { + gradX[qx][0] += Up * B(px, qx); + gradX[qx][1] += Up * Bx(px, qx); + } + } + + for (int qy = 0; qy < Q1D; ++qy) { + const CeedScalar wy = B(py, qy); + const CeedScalar wDy = Bx(py, qy); + for (int qx = 0; qx < Q1D; ++qx) { + const CeedScalar wx = gradX[qx][0]; + const CeedScalar wDx = gradX[qx][1]; + grad[qy][qx][0] += wDx * wy; + grad[qy][qx][1] += wx * wDy; + } + } } - } - } -} - -inline void TENSOR_FUNCTION(gradElement)( - const CeedScalar *B @dim(P1D, Q1D), - const CeedScalar *Bx @dim(P1D, Q1D), - const CeedScalar *Ue @dim(P1D, P1D), - CeedScalar *Ve_x @dim(Q1D, Q1D), - CeedScalar *Ve_y @dim(Q1D, Q1D) -) { - CeedScalar grad[Q1D][Q1D][2]; - for (int qy = 0; qy < Q1D; ++qy) { - for (int qx = 0; qx < Q1D; ++qx) { - grad[qy][qx][0] = 0; - grad[qy][qx][1] = 0; - } - } - - for (int py = 0; py < 
P1D; ++py) { - CeedScalar gradX[Q1D][2]; - for (int qx = 0; qx < Q1D; ++qx) { - gradX[qx][0] = 0; - gradX[qx][1] = 0; - } - - for (int px = 0; px < P1D; ++px) { - const CeedScalar Up = Ue(px, py); - for (int qx = 0; qx < Q1D; ++qx) { - gradX[qx][0] += Up * B(px, qx); - gradX[qx][1] += Up * Bx(px, qx); + for (int qy = 0; qy < Q1D; ++qy) { + for (int qx = 0; qx < Q1D; ++qx) { + Ve_x(qx, qy) = grad[qy][qx][0]; + Ve_y(qx, qy) = grad[qy][qx][1]; + } } } - for (int qy = 0; qy < Q1D; ++qy) { - const CeedScalar wy = B(py, qy); - const CeedScalar wDy = Bx(py, qy); - for (int qx = 0; qx < Q1D; ++qx) { - const CeedScalar wx = gradX[qx][0]; - const CeedScalar wDx = gradX[qx][1]; - grad[qy][qx][0] += wDx * wy; - grad[qy][qx][1] += wx * wDy; + inline void TENSOR_FUNCTION(gradElementTranspose)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Bx @dim(P1D, Q1D), + const CeedScalar *Ue_x @dim(Q1D, Q1D), const CeedScalar *Ue_y @dim(Q1D, Q1D), + CeedScalar *Ve @dim(P1D, P1D)) { + for (int py = 0; py < P1D; ++py) { + for (int px = 0; px < P1D; ++px) { + Ve(px, py) = 0.0; + } } - } - } - for (int qy = 0; qy < Q1D; ++qy) { - for (int qx = 0; qx < Q1D; ++qx) { - Ve_x(qx, qy) = grad[qy][qx][0]; - Ve_y(qx, qy) = grad[qy][qx][1]; - } - } -} - -inline void TENSOR_FUNCTION(gradElementTranspose)( - const CeedScalar *B @dim(P1D, Q1D), - const CeedScalar *Bx @dim(P1D, Q1D), - const CeedScalar *Ue_x @dim(Q1D, Q1D), - const CeedScalar *Ue_y @dim(Q1D, Q1D), - CeedScalar *Ve @dim(P1D, P1D) -) { - for (int py = 0; py < P1D; ++py) { - for (int px = 0; px < P1D; ++px) { - Ve(px, py) = 0.0; - } - } - - for (int qy = 0; qy < Q1D; ++qy) { - CeedScalar gradX[P1D][2]; - for (int px = 0; px < P1D; ++px) { - gradX[px][0] = 0; - gradX[px][1] = 0; - } - for (int qx = 0; qx < Q1D; ++qx) { - const CeedScalar Ux = Ue_x(qx, qy); - const CeedScalar Uy = Ue_y(qx, qy); - for (int px = 0; px < P1D; ++px) { - const CeedScalar wx = B(px, qx); - const CeedScalar wDx = Bx(px, qx); - gradX[px][0] += Ux * wDx; - 
gradX[px][1] += Uy * wx; + for (int qy = 0; qy < Q1D; ++qy) { + CeedScalar gradX[P1D][2]; + for (int px = 0; px < P1D; ++px) { + gradX[px][0] = 0; + gradX[px][1] = 0; + } + + for (int qx = 0; qx < Q1D; ++qx) { + const CeedScalar Ux = Ue_x(qx, qy); + const CeedScalar Uy = Ue_y(qx, qy); + for (int px = 0; px < P1D; ++px) { + const CeedScalar wx = B(px, qx); + const CeedScalar wDx = Bx(px, qx); + gradX[px][0] += Ux * wDx; + gradX[px][1] += Uy * wx; + } + } + + for (int py = 0; py < P1D; ++py) { + const CeedScalar wy = B(py, qy); + const CeedScalar wDy = Bx(py, qy); + for (int px = 0; px < P1D; ++px) { + Ve(px, py) += ((gradX[px][0] * wy) + (gradX[px][1] * wDy)); + } + } } } - for (int py = 0; py < P1D; ++py) { - const CeedScalar wy = B(py, qy); - const CeedScalar wDy = Bx(py, qy); - for (int px = 0; px < P1D; ++px) { - Ve(px, py) += ((gradX[px][0] * wy) + - (gradX[px][1] * wDy)); + inline void TENSOR_FUNCTION(weightElement)(const CeedScalar *qWeights1D, CeedScalar *We @dim(Q1D, Q1D)) { + for (int qy = 0; qy < Q1D; ++qy) { + const CeedScalar wy = qWeights1D[qy]; + for (int qx = 0; qx < Q1D; ++qx) { + We(qx, qy) = qWeights1D[qx] * wy; + } } } - } -} - -inline void TENSOR_FUNCTION(weightElement)( - const CeedScalar *qWeights1D, - CeedScalar *We @dim(Q1D, Q1D) -) { - for (int qy = 0; qy < Q1D; ++qy) { - const CeedScalar wy = qWeights1D[qy]; - for (int qx = 0; qx < Q1D; ++qx) { - We(qx, qy) = qWeights1D[qx] * wy; - } - } -} ); const char *occa_tensor_basis_2d_cpu_kernel_source = STRINGIFY_SOURCE( -@kernel void interp(const CeedInt elementCount, - const CeedScalar *B, - const CeedScalar *U, - CeedScalar *V) { - for (int element = 0; element < elementCount; ++element; @outer) { - for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) { - if (!TRANSPOSE) { - const CeedScalar *Ue @dim(P1D, P1D, BASIS_COMPONENT_COUNT, elementCount) = U; - CeedScalar *Ve @dim(Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT) = V; - - TENSOR_FUNCTION(interpElement)( - B, - 
&Ue(0, 0, component, element), - &Ve(0, 0, element, component) - ); - } else { - const CeedScalar *Ue @dim(Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT) = U; - CeedScalar *Ve @dim(P1D, P1D, BASIS_COMPONENT_COUNT, elementCount) = V; - - TENSOR_FUNCTION(interpElementTranspose)( - B, - &Ue(0, 0, element, component), - &Ve(0, 0, component, element) - ); + @kernel void interp(const CeedInt elementCount, const CeedScalar *B, const CeedScalar *U, CeedScalar *V) { + for (int element = 0; element < elementCount; ++element; @outer) { + for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) { + if (!TRANSPOSE) { + const CeedScalar *Ue @dim(P1D, P1D, BASIS_COMPONENT_COUNT, elementCount) = U; + CeedScalar *Ve @dim(Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT) = V; + + TENSOR_FUNCTION(interpElement)(B, &Ue(0, 0, component, element), &Ve(0, 0, element, component)); + } else { + const CeedScalar *Ue @dim(Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT) = U; + CeedScalar *Ve @dim(P1D, P1D, BASIS_COMPONENT_COUNT, elementCount) = V; + + TENSOR_FUNCTION(interpElementTranspose)(B, &Ue(0, 0, element, component), &Ve(0, 0, component, element)); + } + } } } - } -} -@kernel void grad(const CeedInt elementCount, - const CeedScalar *B, - const CeedScalar *Bx, - const CeedScalar *U, - CeedScalar *V) { - for (int element = 0; element < elementCount; ++element; @outer) { - for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) { - if (!TRANSPOSE) { - const CeedScalar *Ue @dim(P1D, P1D, BASIS_COMPONENT_COUNT, elementCount) = U; - CeedScalar *Ve @dim(Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 2) = V; + @kernel void grad(const CeedInt elementCount, const CeedScalar *B, const CeedScalar *Bx, const CeedScalar *U, CeedScalar *V) { + for (int element = 0; element < elementCount; ++element; @outer) { + for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) { + if (!TRANSPOSE) { + const CeedScalar *Ue @dim(P1D, P1D, 
BASIS_COMPONENT_COUNT, elementCount) = U; + CeedScalar *Ve @dim(Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 2) = V; - TENSOR_FUNCTION(gradElement)( - B, - Bx, - &Ue(0, 0, component, element), - &Ve(0, 0, element, component, 0), - &Ve(0, 0, element, component, 1) - ); - } else { - const CeedScalar *Ue @dim(Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 2) = U; - CeedScalar *Ve @dim(P1D, P1D, BASIS_COMPONENT_COUNT, elementCount) = V; + TENSOR_FUNCTION(gradElement)(B, Bx, &Ue(0, 0, component, element), &Ve(0, 0, element, component, 0), &Ve(0, 0, element, component, 1)); + } else { + const CeedScalar *Ue @dim(Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 2) = U; + CeedScalar *Ve @dim(P1D, P1D, BASIS_COMPONENT_COUNT, elementCount) = V; - TENSOR_FUNCTION(gradElementTranspose)( - B, - Bx, - &Ue(0, 0, element, component, 0), - &Ue(0, 0, element, component, 1), - &Ve(0, 0, component, element) - ); + TENSOR_FUNCTION(gradElementTranspose) + (B, Bx, &Ue(0, 0, element, component, 0), &Ue(0, 0, element, component, 1), &Ve(0, 0, component, element)); + } + } } } - } -} -@kernel void weight(const CeedInt elementCount, - const CeedScalar *qWeights1D, - CeedScalar *W @dim(Q1D, Q1D, elementCount)) { - @tile(32, @outer, @inner) - for (int element = 0; element < elementCount; ++element) { - TENSOR_FUNCTION(weightElement)( - qWeights1D, - &W(0, 0, element) - ); - } -} + @kernel void weight(const CeedInt elementCount, const CeedScalar *qWeights1D, CeedScalar *W @dim(Q1D, Q1D, elementCount)) { + @tile(32, @outer, @inner) for (int element = 0; element < elementCount; ++element) { + TENSOR_FUNCTION(weightElement)(qWeights1D, &W(0, 0, element)); + } + } ); diff --git a/backends/occa/kernels/tensor-basis/cpu/tensor-basis-3d.cpp b/backends/occa/kernels/tensor-basis/cpu/tensor-basis-3d.cpp index 1d3a94ebbd..24a1e767fe 100644 --- a/backends/occa/kernels/tensor-basis/cpu/tensor-basis-3d.cpp +++ b/backends/occa/kernels/tensor-basis/cpu/tensor-basis-3d.cpp @@ -9,350 +9,298 @@ const char 
*occa_tensor_basis_3d_cpu_function_source = STRINGIFY_SOURCE( -@directive("#define TENSOR_FUNCTION(FUNCTION_NAME) tensor_3d_ ## FUNCTION_NAME ## _Q ## Q1D ## _P ## P1D") - -inline void TENSOR_FUNCTION(interpElement)( - const CeedScalar *B @dim(P1D, Q1D), - const CeedScalar *Ue @dim(P1D, P1D, P1D), - CeedScalar *Ve @dim(Q1D, Q1D, Q1D) -) { - for (int qz = 0; qz < Q1D; ++qz) { - for (int qy = 0; qy < Q1D; ++qy) { - for (int qx = 0; qx < Q1D; ++qx) { - Ve(qx, qy, qz) = 0; - } - } - } - - for (int pz = 0; pz < P1D; ++pz) { - CeedScalar V_xy[Q1D][Q1D]; - for (int qy = 0; qy < Q1D; ++qy) { - for (int qx = 0; qx < Q1D; ++qx) { - V_xy[qy][qx] = 0; - } - } + @directive("#define TENSOR_FUNCTION(FUNCTION_NAME) tensor_3d_ ## FUNCTION_NAME ## _Q ## Q1D ## _P ## P1D") + + inline void TENSOR_FUNCTION(interpElement)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Ue @dim(P1D, P1D, P1D), + CeedScalar *Ve @dim(Q1D, Q1D, Q1D)) { + for (int qz = 0; qz < Q1D; ++qz) { + for (int qy = 0; qy < Q1D; ++qy) { + for (int qx = 0; qx < Q1D; ++qx) { + Ve(qx, qy, qz) = 0; + } + } + } + + for (int pz = 0; pz < P1D; ++pz) { + CeedScalar V_xy[Q1D][Q1D]; + for (int qy = 0; qy < Q1D; ++qy) { + for (int qx = 0; qx < Q1D; ++qx) { + V_xy[qy][qx] = 0; + } + } + + for (int py = 0; py < P1D; ++py) { + CeedScalar V_x[Q1D]; + for (int qx = 0; qx < Q1D; ++qx) { + V_x[qx] = 0; + } + + for (int px = 0; px < P1D; ++px) { + const CeedScalar Up = Ue(px, py, pz); + for (int qx = 0; qx < Q1D; ++qx) { + V_x[qx] += B(px, qx) * Up; + } + } + + for (int qy = 0; qy < Q1D; ++qy) { + const CeedScalar wy = B(py, qy); + for (int qx = 0; qx < Q1D; ++qx) { + V_xy[qy][qx] += wy * V_x[qx]; + } + } + } + + for (int qz = 0; qz < Q1D; ++qz) { + const CeedScalar wz = B(pz, qz); + for (int qy = 0; qy < Q1D; ++qy) { + for (int qx = 0; qx < Q1D; ++qx) { + Ve(qx, qy, qz) += wz * V_xy[qy][qx]; + } + } + } + } + } - for (int py = 0; py < P1D; ++py) { - CeedScalar V_x[Q1D]; - for (int qx = 0; qx < Q1D; ++qx) { - V_x[qx] = 0; + inline 
void TENSOR_FUNCTION(interpElementTranspose)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Ue @dim(Q1D, Q1D, Q1D), + CeedScalar *Ve @dim(P1D, P1D, P1D)) { + for (int pz = 0; pz < P1D; ++pz) { + for (int py = 0; py < P1D; ++py) { + for (int px = 0; px < P1D; ++px) { + Ve(px, py, pz) = 0; + } + } } - for (int px = 0; px < P1D; ++px) { - const CeedScalar Up = Ue(px, py, pz); - for (int qx = 0; qx < Q1D; ++qx) { - V_x[qx] += B(px, qx) * Up; + for (int qz = 0; qz < Q1D; ++qz) { + CeedScalar V_xy[P1D][P1D]; + for (int py = 0; py < P1D; ++py) { + for (int px = 0; px < P1D; ++px) { + V_xy[py][px] = 0; + } } - } - for (int qy = 0; qy < Q1D; ++qy) { - const CeedScalar wy = B(py, qy); - for (int qx = 0; qx < Q1D; ++qx) { - V_xy[qy][qx] += wy * V_x[qx]; + for (int qy = 0; qy < Q1D; ++qy) { + CeedScalar V_x[P1D]; + for (int px = 0; px < P1D; ++px) { + V_x[px] = 0; + } + + for (int qx = 0; qx < Q1D; ++qx) { + const CeedScalar Uq = Ue(qx, qy, qz); + for (int px = 0; px < P1D; ++px) { + V_x[px] += B(px, qx) * Uq; + } + } + + for (int py = 0; py < P1D; ++py) { + const CeedScalar wy = B(py, qy); + for (int px = 0; px < P1D; ++px) { + V_xy[py][px] += wy * V_x[px]; + } + } } - } - } - for (int qz = 0; qz < Q1D; ++qz) { - const CeedScalar wz = B(pz, qz); - for (int qy = 0; qy < Q1D; ++qy) { - for (int qx = 0; qx < Q1D; ++qx) { - Ve(qx, qy, qz) += wz * V_xy[qy][qx]; + for (int pz = 0; pz < P1D; ++pz) { + const CeedScalar wz = B(pz, qz); + for (int py = 0; py < P1D; ++py) { + for (int px = 0; px < P1D; ++px) { + Ve(px, py, pz) += wz * V_xy[py][px]; + } + } } } } - } -} - -inline void TENSOR_FUNCTION(interpElementTranspose)( - const CeedScalar *B @dim(P1D, Q1D), - const CeedScalar *Ue @dim(Q1D, Q1D, Q1D), - CeedScalar *Ve @dim(P1D, P1D, P1D) -) { - for (int pz = 0; pz < P1D; ++pz) { - for (int py = 0; py < P1D; ++py) { - for (int px = 0; px < P1D; ++px) { - Ve(px, py, pz) = 0; - } - } - } - for (int qz = 0; qz < Q1D; ++qz) { - CeedScalar V_xy[P1D][P1D]; - for (int py = 0; py < 
P1D; ++py) { - for (int px = 0; px < P1D; ++px) { - V_xy[py][px] = 0; + inline void TENSOR_FUNCTION(gradElement)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Bx @dim(P1D, Q1D), + const CeedScalar *Ue @dim(P1D, P1D, P1D), CeedScalar *Ve_x @dim(Q1D, Q1D, Q1D), + CeedScalar *Ve_y @dim(Q1D, Q1D, Q1D), CeedScalar *Ve_z @dim(Q1D, Q1D, Q1D)) { + for (int qz = 0; qz < Q1D; ++qz) { + for (int qy = 0; qy < Q1D; ++qy) { + for (int qx = 0; qx < Q1D; ++qx) { + Ve_x(qx, qy, qz) = 0; + Ve_y(qx, qy, qz) = 0; + Ve_z(qx, qy, qz) = 0; + } + } } - } - for (int qy = 0; qy < Q1D; ++qy) { - CeedScalar V_x[P1D]; - for (int px = 0; px < P1D; ++px) { - V_x[px] = 0; - } + for (int pz = 0; pz < P1D; ++pz) { + CeedScalar gradXY[Q1D][Q1D][3]; + for (int qy = 0; qy < Q1D; ++qy) { + for (int qx = 0; qx < Q1D; ++qx) { + gradXY[qy][qx][0] = 0; + gradXY[qy][qx][1] = 0; + gradXY[qy][qx][2] = 0; + } + } - for (int qx = 0; qx < Q1D; ++qx) { - const CeedScalar Uq = Ue(qx, qy, qz); - for (int px = 0; px < P1D; ++px) { - V_x[px] += B(px, qx) * Uq; + for (int py = 0; py < P1D; ++py) { + CeedScalar gradX[Q1D][2]; + for (int qx = 0; qx < Q1D; ++qx) { + gradX[qx][0] = 0; + gradX[qx][1] = 0; + } + + for (int px = 0; px < P1D; ++px) { + const CeedScalar Up = Ue(px, py, pz); + for (int qx = 0; qx < Q1D; ++qx) { + gradX[qx][0] += Up * B(px, qx); + gradX[qx][1] += Up * Bx(px, qx); + } + } + + for (int qy = 0; qy < Q1D; ++qy) { + const CeedScalar wy = B(py, qy); + const CeedScalar wDy = Bx(py, qy); + for (int qx = 0; qx < Q1D; ++qx) { + const CeedScalar wx = gradX[qx][0]; + const CeedScalar wDx = gradX[qx][1]; + gradXY[qy][qx][0] += wDx * wy; + gradXY[qy][qx][1] += wx * wDy; + gradXY[qy][qx][2] += wx * wy; + } + } } - } - for (int py = 0; py < P1D; ++py) { - const CeedScalar wy = B(py, qy); - for (int px = 0; px < P1D; ++px) { - V_xy[py][px] += wy * V_x[px]; + for (int qz = 0; qz < Q1D; ++qz) { + const CeedScalar wz = B(pz, qz); + const CeedScalar wDz = Bx(pz, qz); + for (int qy = 0; qy < Q1D; ++qy) { + 
for (int qx = 0; qx < Q1D; ++qx) { + Ve_x(qx, qy, qz) += gradXY[qy][qx][0] * wz; + Ve_y(qx, qy, qz) += gradXY[qy][qx][1] * wz; + Ve_z(qx, qy, qz) += gradXY[qy][qx][2] * wDz; + } + } } } } - for (int pz = 0; pz < P1D; ++pz) { - const CeedScalar wz = B(pz, qz); - for (int py = 0; py < P1D; ++py) { - for (int px = 0; px < P1D; ++px) { - Ve(px, py, pz) += wz * V_xy[py][px]; + inline void TENSOR_FUNCTION(gradElementTranspose)(const CeedScalar *B @dim(P1D, Q1D), const CeedScalar *Bx @dim(P1D, Q1D), + const CeedScalar *Ue_x @dim(Q1D, Q1D, Q1D), const CeedScalar *Ue_y @dim(Q1D, Q1D, Q1D), + const CeedScalar *Ue_z @dim(Q1D, Q1D, Q1D), CeedScalar *Ve @dim(P1D, P1D, P1D)) { + for (int pz = 0; pz < P1D; ++pz) { + for (int py = 0; py < P1D; ++py) { + for (int px = 0; px < P1D; ++px) { + Ve(px, py, pz) = 0; + } } } - } - } -} - -inline void TENSOR_FUNCTION(gradElement)( - const CeedScalar *B @dim(P1D, Q1D), - const CeedScalar *Bx @dim(P1D, Q1D), - const CeedScalar *Ue @dim(P1D, P1D, P1D), - CeedScalar *Ve_x @dim(Q1D, Q1D, Q1D), - CeedScalar *Ve_y @dim(Q1D, Q1D, Q1D), - CeedScalar *Ve_z @dim(Q1D, Q1D, Q1D) -) { - for (int qz = 0; qz < Q1D; ++qz) { - for (int qy = 0; qy < Q1D; ++qy) { - for (int qx = 0; qx < Q1D; ++qx) { - Ve_x(qx, qy, qz) = 0; - Ve_y(qx, qy, qz) = 0; - Ve_z(qx, qy, qz) = 0; - } - } - } - - for (int pz = 0; pz < P1D; ++pz) { - CeedScalar gradXY[Q1D][Q1D][3]; - for (int qy = 0; qy < Q1D; ++qy) { - for (int qx = 0; qx < Q1D; ++qx) { - gradXY[qy][qx][0] = 0; - gradXY[qy][qx][1] = 0; - gradXY[qy][qx][2] = 0; - } - } - for (int py = 0; py < P1D; ++py) { - CeedScalar gradX[Q1D][2]; - for (int qx = 0; qx < Q1D; ++qx) { - gradX[qx][0] = 0; - gradX[qx][1] = 0; - } + for (int qz = 0; qz < Q1D; ++qz) { + CeedScalar gradXY[P1D][P1D][3]; + for (int py = 0; py < P1D; ++py) { + for (int px = 0; px < P1D; ++px) { + gradXY[py][px][0] = 0; + gradXY[py][px][1] = 0; + gradXY[py][px][2] = 0; + } + } - for (int px = 0; px < P1D; ++px) { - const CeedScalar Up = Ue(px, py, pz); - for 
(int qx = 0; qx < Q1D; ++qx) { - gradX[qx][0] += Up * B(px, qx); - gradX[qx][1] += Up * Bx(px, qx); + for (int qy = 0; qy < Q1D; ++qy) { + CeedScalar gradX[P1D][3]; + for (int px = 0; px < P1D; ++px) { + gradX[px][0] = 0; + gradX[px][1] = 0; + gradX[px][2] = 0; + } + + for (int qx = 0; qx < Q1D; ++qx) { + const CeedScalar Ux = Ue_x(qx, qy, qz); + const CeedScalar Uy = Ue_y(qx, qy, qz); + const CeedScalar Uz = Ue_z(qx, qy, qz); + for (int px = 0; px < P1D; ++px) { + const CeedScalar wx = B(px, qx); + const CeedScalar wDx = Bx(px, qx); + gradX[px][0] += Ux * wDx; + gradX[px][1] += Uy * wx; + gradX[px][2] += Uz * wx; + } + } + + for (int py = 0; py < P1D; ++py) { + const CeedScalar wy = B(py, qy); + const CeedScalar wDy = Bx(py, qy); + for (int px = 0; px < P1D; ++px) { + gradXY[py][px][0] += gradX[px][0] * wy; + gradXY[py][px][1] += gradX[px][1] * wDy; + gradXY[py][px][2] += gradX[px][2] * wy; + } + } } - } - for (int qy = 0; qy < Q1D; ++qy) { - const CeedScalar wy = B(py, qy); - const CeedScalar wDy = Bx(py, qy); - for (int qx = 0; qx < Q1D; ++qx) { - const CeedScalar wx = gradX[qx][0]; - const CeedScalar wDx = gradX[qx][1]; - gradXY[qy][qx][0] += wDx * wy; - gradXY[qy][qx][1] += wx * wDy; - gradXY[qy][qx][2] += wx * wy; + for (int pz = 0; pz < P1D; ++pz) { + const CeedScalar wz = B(pz, qz); + const CeedScalar wDz = Bx(pz, qz); + for (int py = 0; py < P1D; ++py) { + for (int px = 0; px < P1D; ++px) { + Ve(px, py, pz) += ((gradXY[py][px][0] * wz) + (gradXY[py][px][1] * wz) + (gradXY[py][px][2] * wDz)); + } + } } } } - for (int qz = 0; qz < Q1D; ++qz) { - const CeedScalar wz = B(pz, qz); - const CeedScalar wDz = Bx(pz, qz); - for (int qy = 0; qy < Q1D; ++qy) { - for (int qx = 0; qx < Q1D; ++qx) { - Ve_x(qx, qy, qz) += gradXY[qy][qx][0] * wz; - Ve_y(qx, qy, qz) += gradXY[qy][qx][1] * wz; - Ve_z(qx, qy, qz) += gradXY[qy][qx][2] * wDz; + inline void TENSOR_FUNCTION(weightElement)(const CeedScalar *qWeights1D, CeedScalar *We @dim(Q1D, Q1D, Q1D)) { + for (int qz = 0; qz < 
Q1D; ++qz) { + const CeedScalar wz = qWeights1D[qz]; + for (int qy = 0; qy < Q1D; ++qy) { + const CeedScalar wy = qWeights1D[qy]; + for (int qx = 0; qx < Q1D; ++qx) { + We(qx, qy, qz) = qWeights1D[qx] * wy * wz; + } } } } - } -} - -inline void TENSOR_FUNCTION(gradElementTranspose)( - const CeedScalar *B @dim(P1D, Q1D), - const CeedScalar *Bx @dim(P1D, Q1D), - const CeedScalar *Ue_x @dim(Q1D, Q1D, Q1D), - const CeedScalar *Ue_y @dim(Q1D, Q1D, Q1D), - const CeedScalar *Ue_z @dim(Q1D, Q1D, Q1D), - CeedScalar *Ve @dim(P1D, P1D, P1D) -) { - for (int pz = 0; pz < P1D; ++pz) { - for (int py = 0; py < P1D; ++py) { - for (int px = 0; px < P1D; ++px) { - Ve(px, py, pz) = 0; - } - } - } - - for (int qz = 0; qz < Q1D; ++qz) { - CeedScalar gradXY[P1D][P1D][3]; - for (int py = 0; py < P1D; ++py) { - for (int px = 0; px < P1D; ++px) { - gradXY[py][px][0] = 0; - gradXY[py][px][1] = 0; - gradXY[py][px][2] = 0; - } - } - for (int qy = 0; qy < Q1D; ++qy) { - CeedScalar gradX[P1D][3]; - for (int px = 0; px < P1D; ++px) { - gradX[px][0] = 0; - gradX[px][1] = 0; - gradX[px][2] = 0; - } +); - for (int qx = 0; qx < Q1D; ++qx) { - const CeedScalar Ux = Ue_x(qx, qy, qz); - const CeedScalar Uy = Ue_y(qx, qy, qz); - const CeedScalar Uz = Ue_z(qx, qy, qz); - for (int px = 0; px < P1D; ++px) { - const CeedScalar wx = B(px, qx); - const CeedScalar wDx = Bx(px, qx); - gradX[px][0] += Ux * wDx; - gradX[px][1] += Uy * wx; - gradX[px][2] += Uz * wx; - } - } +const char *occa_tensor_basis_3d_cpu_kernel_source = STRINGIFY_SOURCE( + + @kernel void interp(const CeedInt elementCount, const CeedScalar *B, const CeedScalar *U, CeedScalar *V) { + for (int element = 0; element < elementCount; ++element; @outer) { + for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) { + if (!TRANSPOSE) { + const CeedScalar *Ue @dim(P1D, P1D, P1D, BASIS_COMPONENT_COUNT, elementCount) = U; + CeedScalar *Ve @dim(Q1D, Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT) = V; + + 
TENSOR_FUNCTION(interpElement)(B, &Ue(0, 0, 0, component, element), &Ve(0, 0, 0, element, component)); + } else { + const CeedScalar *Ue @dim(Q1D, Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT) = U; + CeedScalar *Ve @dim(P1D, P1D, P1D, BASIS_COMPONENT_COUNT, elementCount) = V; - for (int py = 0; py < P1D; ++py) { - const CeedScalar wy = B(py, qy); - const CeedScalar wDy = Bx(py, qy); - for (int px = 0; px < P1D; ++px) { - gradXY[py][px][0] += gradX[px][0] * wy; - gradXY[py][px][1] += gradX[px][1] * wDy; - gradXY[py][px][2] += gradX[px][2] * wy; + TENSOR_FUNCTION(interpElementTranspose)(B, &Ue(0, 0, 0, element, component), &Ve(0, 0, 0, component, element)); + } } } } - for (int pz = 0; pz < P1D; ++pz) { - const CeedScalar wz = B(pz, qz); - const CeedScalar wDz = Bx(pz, qz); - for (int py = 0; py < P1D; ++py) { - for (int px = 0; px < P1D; ++px) { - Ve(px, py, pz) += ((gradXY[py][px][0] * wz) + - (gradXY[py][px][1] * wz) + - (gradXY[py][px][2] * wDz)); + @kernel void grad(const CeedInt elementCount, const CeedScalar *B, const CeedScalar *Bx, const CeedScalar *U, CeedScalar *V) { + for (int element = 0; element < elementCount; ++element; @outer) { + for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) { + if (!TRANSPOSE) { + const CeedScalar *Ue @dim(P1D, P1D, P1D, BASIS_COMPONENT_COUNT, elementCount) = U; + CeedScalar *Ve @dim(Q1D, Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 3) = V; + + TENSOR_FUNCTION(gradElement) + (B, Bx, &Ue(0, 0, 0, component, element), &Ve(0, 0, 0, element, component, 0), &Ve(0, 0, 0, element, component, 1), + &Ve(0, 0, 0, element, component, 2)); + } else { + const CeedScalar *Ue @dim(Q1D, Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 3) = U; + CeedScalar *Ve @dim(P1D, P1D, P1D, BASIS_COMPONENT_COUNT, elementCount) = V; + + TENSOR_FUNCTION(gradElementTranspose) + (B, Bx, &Ue(0, 0, 0, element, component, 0), &Ue(0, 0, 0, element, component, 1), &Ue(0, 0, 0, element, component, 2), + &Ve(0, 0, 0, component, 
element)); + } } } } - } -} - -inline void TENSOR_FUNCTION(weightElement)( - const CeedScalar *qWeights1D, - CeedScalar *We @dim(Q1D, Q1D, Q1D) -) { - for (int qz = 0; qz < Q1D; ++qz) { - const CeedScalar wz = qWeights1D[qz]; - for (int qy = 0; qy < Q1D; ++qy) { - const CeedScalar wy = qWeights1D[qy]; - for (int qx = 0; qx < Q1D; ++qx) { - We(qx, qy, qz) = qWeights1D[qx] * wy * wz; - } - } - } -} -); - -const char *occa_tensor_basis_3d_cpu_kernel_source = STRINGIFY_SOURCE( - -@kernel void interp(const CeedInt elementCount, - const CeedScalar *B, - const CeedScalar *U, - CeedScalar *V) { - for (int element = 0; element < elementCount; ++element; @outer) { - for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) { - if (!TRANSPOSE) { - const CeedScalar *Ue @dim(P1D, P1D, P1D, BASIS_COMPONENT_COUNT, elementCount) = U; - CeedScalar *Ve @dim(Q1D, Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT) = V; - - TENSOR_FUNCTION(interpElement)( - B, - &Ue(0, 0, 0, component, element), - &Ve(0, 0, 0, element, component) - ); - } else { - const CeedScalar *Ue @dim(Q1D, Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT) = U; - CeedScalar *Ve @dim(P1D, P1D, P1D, BASIS_COMPONENT_COUNT, elementCount) = V; - - TENSOR_FUNCTION(interpElementTranspose)( - B, - &Ue(0, 0, 0, element, component), - &Ve(0, 0, 0, component, element) - ); - } - } - } -} - -@kernel void grad(const CeedInt elementCount, - const CeedScalar *B, - const CeedScalar *Bx, - const CeedScalar *U, - CeedScalar *V) { - for (int element = 0; element < elementCount; ++element; @outer) { - for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) { - if (!TRANSPOSE) { - const CeedScalar *Ue @dim(P1D, P1D, P1D, BASIS_COMPONENT_COUNT, elementCount) = U; - CeedScalar *Ve @dim(Q1D, Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 3) = V; - - TENSOR_FUNCTION(gradElement)( - B, - Bx, - &Ue(0, 0, 0, component, element), - &Ve(0, 0, 0, element, component, 0), - &Ve(0, 0, 0, element, component, 
1), - &Ve(0, 0, 0, element, component, 2) - ); - } else { - const CeedScalar *Ue @dim(Q1D, Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 3) = U; - CeedScalar *Ve @dim(P1D, P1D, P1D, BASIS_COMPONENT_COUNT, elementCount) = V; - - TENSOR_FUNCTION(gradElementTranspose)( - B, - Bx, - &Ue(0, 0, 0, element, component, 0), - &Ue(0, 0, 0, element, component, 1), - &Ue(0, 0, 0, element, component, 2), - &Ve(0, 0, 0, component, element) - ); + @kernel void weight(const CeedInt elementCount, const CeedScalar *qWeights1D, CeedScalar *W @dim(Q1D, Q1D, Q1D, elementCount)) { + @tile(32, @outer, @inner) for (int element = 0; element < elementCount; ++element) { + TENSOR_FUNCTION(weightElement)(qWeights1D, &W(0, 0, 0, element)); } } - } -} - -@kernel void weight(const CeedInt elementCount, - const CeedScalar *qWeights1D, - CeedScalar *W @dim(Q1D, Q1D, Q1D, elementCount)) { - @tile(32, @outer, @inner) - for (int element = 0; element < elementCount; ++element) { - TENSOR_FUNCTION(weightElement)( - qWeights1D, - &W(0, 0, 0, element) - ); - } -} ); diff --git a/backends/occa/kernels/tensor-basis/gpu/tensor-basis-1d.cpp b/backends/occa/kernels/tensor-basis/gpu/tensor-basis-1d.cpp index 25e9d086b7..de4ba7972b 100644 --- a/backends/occa/kernels/tensor-basis/gpu/tensor-basis-1d.cpp +++ b/backends/occa/kernels/tensor-basis/gpu/tensor-basis-1d.cpp @@ -9,158 +9,110 @@ const char *occa_tensor_basis_1d_gpu_source = STRINGIFY_SOURCE( -typedef CeedScalar* dofArray @dim(P1D, BASIS_COMPONENT_COUNT, elementCount); -typedef const CeedScalar* const_dofArray @dim(P1D, BASIS_COMPONENT_COUNT, elementCount); - -typedef CeedScalar* quadArray @dim(Q1D, elementCount, BASIS_COMPONENT_COUNT); -typedef const CeedScalar* const_quadArray @dim(Q1D, elementCount, BASIS_COMPONENT_COUNT); - -typedef CeedScalar* sharedBufferArray @dim(MAX_PQ, ELEMENTS_PER_BLOCK); -typedef const CeedScalar* quadToDof @dim(P1D, Q1D); -typedef CeedScalar* elementWeightArray @dim(Q1D, elementCount); - -//---[ Utility Methods 
]---------------- -inline void readDofs(const int element, - const int localElement, - const int component, - const int p, - const_dofArray U, - sharedBufferArray sharedBuffer) { - // Zero out extra entries - sharedBuffer(p, localElement) = ( - (p < P1D) - ? U(p, component, element) - : 0.0 - ); -} - -inline void writeDofs(const int element, - const int component, - const int p, - const CeedScalar Vp, - dofArray V) { - if (p < P1D) { - V(p, component, element) = Vp; - } -} - -inline void readQuads(const int elementCount, - const int element, - const int localElement, - const int component, - const int q, - const_quadArray U, - sharedBufferArray sharedBuffer) { - sharedBuffer(q, localElement) = U(q, element, component); -} - -inline void writeQuads(const int elementCount, - const int element, - const int component, - const int q, - const CeedScalar Vq, - quadArray V) { - V(q, element, component) = Vq; -} - -inline void contractX(const int q, - const int localElement, - sharedBufferArray sharedBuffer, - quadToDof B, - CeedScalar &V) { - V = 0.0; - for (int p = 0; p < P1D; ++p) { - V += B(p, q) * sharedBuffer(p, localElement); - } -} - -inline void contractTransposeX(const int p, - const int localElement, - sharedBufferArray sharedBuffer, - quadToDof B, - CeedScalar &V) { - V = 0.0; - for (int q = 0; q < Q1D; ++q) { - V += B(p, q) * sharedBuffer(q, localElement); - } -} - -//---[ Kernels ]------------------------ -@kernel void interp(const CeedInt elementCount, - quadToDof B, - const CeedScalar *U, - CeedScalar *V) { - - for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) { - @shared CeedScalar sharedBuffer[MAX_PQ * ELEMENTS_PER_BLOCK]; - - for (int localElement = 0; localElement < ELEMENTS_PER_BLOCK; ++localElement; @inner) { - for (int q = 0; q < Q1D; ++q; @inner) { - - const int element = elementOffset + localElement; - if (element < elementCount) { - for (int component = 0; component < BASIS_COMPONENT_COUNT; 
++component) { - CeedScalar r; - if (!TRANSPOSE) { - readDofs(element, localElement, component, q, U, sharedBuffer); - contractX(q, localElement, sharedBuffer, B, r); - writeQuads(elementCount, element, component, q, r, V); - } else { - readQuads(elementCount, element, localElement, component, q, U, sharedBuffer); - contractTransposeX(q, localElement, sharedBuffer, B, r); - writeDofs(element, component, q, r, V); + typedef CeedScalar * dofArray @dim(P1D, BASIS_COMPONENT_COUNT, elementCount); + typedef const CeedScalar *const_dofArray @dim(P1D, BASIS_COMPONENT_COUNT, elementCount); + + typedef CeedScalar * quadArray @dim(Q1D, elementCount, BASIS_COMPONENT_COUNT); + typedef const CeedScalar *const_quadArray @dim(Q1D, elementCount, BASIS_COMPONENT_COUNT); + + typedef CeedScalar * sharedBufferArray @dim(MAX_PQ, ELEMENTS_PER_BLOCK); typedef const CeedScalar *quadToDof @dim(P1D, Q1D); + typedef CeedScalar * elementWeightArray @dim(Q1D, elementCount); + + //---[ Utility Methods ]---------------- + inline void readDofs(const int element, const int localElement, const int component, const int p, const_dofArray U, + sharedBufferArray sharedBuffer) { + // Zero out extra entries + sharedBuffer(p, localElement) = ((p < P1D) ? 
U(p, component, element) : 0.0); + } + + inline void writeDofs(const int element, const int component, const int p, const CeedScalar Vp, dofArray V) { + if (p < P1D) { + V(p, component, element) = Vp; + } + } + + inline void readQuads(const int elementCount, const int element, const int localElement, const int component, const int q, const_quadArray U, + sharedBufferArray sharedBuffer) { sharedBuffer(q, localElement) = U(q, element, component); } + + inline void writeQuads(const int elementCount, const int element, const int component, const int q, const CeedScalar Vq, quadArray V) { + V(q, element, component) = Vq; + } + + inline void contractX(const int q, const int localElement, sharedBufferArray sharedBuffer, quadToDof B, CeedScalar &V) { + V = 0.0; + for (int p = 0; p < P1D; ++p) { + V += B(p, q) * sharedBuffer(p, localElement); + } + } + + inline void contractTransposeX(const int p, const int localElement, sharedBufferArray sharedBuffer, quadToDof B, CeedScalar &V) { + V = 0.0; + for (int q = 0; q < Q1D; ++q) { + V += B(p, q) * sharedBuffer(q, localElement); + } + } + + //---[ Kernels ]------------------------ + @kernel void interp(const CeedInt elementCount, quadToDof B, const CeedScalar *U, CeedScalar *V) { + for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) { + @shared CeedScalar sharedBuffer[MAX_PQ * ELEMENTS_PER_BLOCK]; + + for (int localElement = 0; localElement < ELEMENTS_PER_BLOCK; ++localElement; @inner) { + for (int q = 0; q < Q1D; ++q; @inner) { + const int element = elementOffset + localElement; + if (element < elementCount) { + for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component) { + CeedScalar r; + if (!TRANSPOSE) { + readDofs(element, localElement, component, q, U, sharedBuffer); + contractX(q, localElement, sharedBuffer, B, r); + writeQuads(elementCount, element, component, q, r, V); + } else { + readQuads(elementCount, element, localElement, component, q, U, 
sharedBuffer); + contractTransposeX(q, localElement, sharedBuffer, B, r); + writeDofs(element, component, q, r, V); + } + } } } } - } } - } -} - -@kernel void grad(const CeedInt elementCount, - quadToDof B, - quadToDof Bx, - const CeedScalar *U, - CeedScalar *V) { - - for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) { - @shared CeedScalar sharedBuffer[MAX_PQ * ELEMENTS_PER_BLOCK]; - - for (int localElement = 0; localElement < ELEMENTS_PER_BLOCK; ++localElement; @inner) { - for (int q = 0; q < Q1D; ++q; @inner) { - - const int element = elementOffset + localElement; - if (element < elementCount) { - for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component) { - CeedScalar r; - if (!TRANSPOSE) { - readDofs(element, localElement, component, q, U, sharedBuffer); - contractX(q, localElement, sharedBuffer, Bx, r); - writeQuads(elementCount, element, component, q, r, V); - } else { - readQuads(elementCount, element, localElement, component, q, U, sharedBuffer); - contractTransposeX(q, localElement, sharedBuffer, Bx, r); - writeDofs(element, component, q, r, V); + + @kernel void grad(const CeedInt elementCount, quadToDof B, quadToDof Bx, const CeedScalar *U, CeedScalar *V) { + for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) { + @shared CeedScalar sharedBuffer[MAX_PQ * ELEMENTS_PER_BLOCK]; + + for (int localElement = 0; localElement < ELEMENTS_PER_BLOCK; ++localElement; @inner) { + for (int q = 0; q < Q1D; ++q; @inner) { + const int element = elementOffset + localElement; + if (element < elementCount) { + for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component) { + CeedScalar r; + if (!TRANSPOSE) { + readDofs(element, localElement, component, q, U, sharedBuffer); + contractX(q, localElement, sharedBuffer, Bx, r); + writeQuads(elementCount, element, component, q, r, V); + } else { + readQuads(elementCount, element, localElement, component, 
q, U, sharedBuffer); + contractTransposeX(q, localElement, sharedBuffer, Bx, r); + writeDofs(element, component, q, r, V); + } + } } } } - } } - } -} - -@kernel void weight(const CeedInt elementCount, - const CeedScalar *qWeights1D, - elementWeightArray W) { - for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) { - for (int element = elementOffset; element < (elementOffset + ELEMENTS_PER_BLOCK); ++element; @outer) { - for (int q = 0; q < Q1D; ++q; @inner) { - W(q, element) = qWeights1D[q]; + + @kernel void weight(const CeedInt elementCount, const CeedScalar *qWeights1D, elementWeightArray W) { + for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) { + for (int element = elementOffset; element < (elementOffset + ELEMENTS_PER_BLOCK); ++element; @outer) { + for (int q = 0; q < Q1D; ++q; @inner) { + W(q, element) = qWeights1D[q]; + } + } } } - } -} ); diff --git a/backends/occa/kernels/tensor-basis/gpu/tensor-basis-2d.cpp b/backends/occa/kernels/tensor-basis/gpu/tensor-basis-2d.cpp index a987144665..4964818026 100644 --- a/backends/occa/kernels/tensor-basis/gpu/tensor-basis-2d.cpp +++ b/backends/occa/kernels/tensor-basis/gpu/tensor-basis-2d.cpp @@ -9,220 +9,154 @@ const char *occa_tensor_basis_2d_gpu_source = STRINGIFY_SOURCE( -typedef CeedScalar* dofArray @dim(P1D, P1D, BASIS_COMPONENT_COUNT, elementCount); -typedef const CeedScalar* const_dofArray @dim(P1D, P1D, BASIS_COMPONENT_COUNT, elementCount); - -typedef CeedScalar* quadArray @dim(Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 2); -typedef const CeedScalar* const_quadArray @dim(Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 2); - -typedef CeedScalar* sharedBufferArray @dim(MAX_PQ, MAX_PQ, ELEMENTS_PER_BLOCK); -typedef const CeedScalar* quadToDof @dim(P1D, Q1D); -typedef CeedScalar* elementWeightArray @dim(Q1D, Q1D, elementCount); - -//---[ Utility Methods ]---------------- -inline void readDofs(const int 
element, - const int component, - const int px, - const int py, - const_dofArray U, - CeedScalar *Up) { - // Zero out extra entries - *Up = ( - (px < P1D) && (py < P1D) - ? U(px, py, component, element) - : 0.0 - ); -} - -inline void writeDofs(const int element, - const int component, - const int px, - const int py, - const CeedScalar Vp, - dofArray V) { - if ((px < P1D) && (py < P1D)) { - V(px, py, component, element) = Vp; - } -} - -inline void readQuads(const int elementCount, - const int element, - const int component, - const int qx, - const int qy, - const int dim, - const_quadArray U, - CeedScalar *Uq) { - *Uq = U(qx, qy, element, component, dim); -} - -inline void writeQuads(const int elementCount, - const int element, - const int component, - const int qx, - const int qy, - const int dim, - const CeedScalar Vq, - quadArray V) { - V(qx, qy, element, component, dim) = Vq; -} - -inline void contractX(const int qx, - const int qy, - const int localElement, - sharedBufferArray sharedBuffer, - quadToDof B, - const CeedScalar U, - CeedScalar *V) { - sharedBuffer(qx, qy, localElement) = U; - *V = 0.0; - @barrier(); - for (int p = 0; p < P1D; ++p) { - *V += B(p, qx) * sharedBuffer(p, qy, localElement); - } - @barrier(); -} - -inline void contractY(const int qx, - const int qy, - const int localElement, - sharedBufferArray sharedBuffer, - quadToDof B, - const CeedScalar U, - CeedScalar *V) { - sharedBuffer(qx, qy, localElement) = U; - *V = 0.0; - @barrier(); - for (int p = 0; p < P1D; ++p) { - *V += B(p, qy) * sharedBuffer(qx, p, localElement); - } - @barrier(); -} - -inline void contractTransposeX(const int px, - const int py, - const int localElement, - sharedBufferArray sharedBuffer, - quadToDof B, - const CeedScalar U, - CeedScalar *V) { - sharedBuffer(px, py, localElement) = U; - *V = 0.0; - @barrier(); - for (int q = 0; q < Q1D; ++q) { - *V += B(px, q) * sharedBuffer(q, py, localElement); - } - @barrier(); -} - -inline void contractTransposeY(const int px, - 
const int py, - const int localElement, - sharedBufferArray sharedBuffer, - quadToDof B, - const CeedScalar U, - CeedScalar *V) { - sharedBuffer(px, py, localElement) = U; - *V = 0.0; - @barrier(); - for (int q = 0; q < Q1D; ++q) { - *V += B(py, q) * sharedBuffer(px, q, localElement); - } - @barrier(); -} - -//---[ Kernels ]------------------------ -@kernel void interp(const CeedInt elementCount, - quadToDof B, - const CeedScalar *U, - CeedScalar *V) { - - for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) { - @shared CeedScalar sharedBuffer[MAX_PQ * MAX_PQ * ELEMENTS_PER_BLOCK]; - - for (int localElement = 0; localElement < ELEMENTS_PER_BLOCK; ++localElement; @inner) { - const int element = elementOffset + localElement; - for (int qy = 0; qy < Q1D; ++qy; @inner) { - for (int qx = 0; qx < Q1D; ++qx; @inner) { - - if (element < elementCount) { - for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component) { - CeedScalar r1, r2; - if (!TRANSPOSE) { - readDofs(element, component, qx, qy, U, &r1); - contractX(qx, qy, localElement, sharedBuffer, B, r1, &r2); - contractY(qx, qy, localElement, sharedBuffer, B, r2, &r1); - writeQuads(elementCount, element, component, qx, qy, 0, r1, V); - } else { - readQuads(elementCount, element, component, qx, qy, 0, U, &r1); - contractTransposeY(qx, qy, localElement, sharedBuffer, B, r1, &r2); - contractTransposeX(qx, qy, localElement, sharedBuffer, B, r2, &r1); - writeDofs(element, component, qx, qy, r1, V); + typedef CeedScalar * dofArray @dim(P1D, P1D, BASIS_COMPONENT_COUNT, elementCount); + typedef const CeedScalar *const_dofArray @dim(P1D, P1D, BASIS_COMPONENT_COUNT, elementCount); + + typedef CeedScalar * quadArray @dim(Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 2); + typedef const CeedScalar *const_quadArray @dim(Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 2); + + typedef CeedScalar * sharedBufferArray @dim(MAX_PQ, MAX_PQ, ELEMENTS_PER_BLOCK); typedef const 
CeedScalar *quadToDof @dim(P1D, Q1D); + typedef CeedScalar * elementWeightArray @dim(Q1D, Q1D, elementCount); + + //---[ Utility Methods ]---------------- + inline void readDofs(const int element, const int component, const int px, const int py, const_dofArray U, CeedScalar *Up) { + // Zero out extra entries + *Up = ((px < P1D) && (py < P1D) ? U(px, py, component, element) : 0.0); + } + + inline void writeDofs(const int element, const int component, const int px, const int py, const CeedScalar Vp, dofArray V) { + if ((px < P1D) && (py < P1D)) { + V(px, py, component, element) = Vp; + } + } + + inline void readQuads(const int elementCount, const int element, const int component, const int qx, const int qy, const int dim, + const_quadArray U, CeedScalar *Uq) { *Uq = U(qx, qy, element, component, dim); } + + inline void writeQuads(const int elementCount, const int element, const int component, const int qx, const int qy, const int dim, + const CeedScalar Vq, quadArray V) { V(qx, qy, element, component, dim) = Vq; } + + inline void contractX(const int qx, const int qy, const int localElement, sharedBufferArray sharedBuffer, quadToDof B, const CeedScalar U, + CeedScalar *V) { + sharedBuffer(qx, qy, localElement) = U; + *V = 0.0; + @barrier(); + for (int p = 0; p < P1D; ++p) { + *V += B(p, qx) * sharedBuffer(p, qy, localElement); + } + @barrier(); + } + + inline void contractY(const int qx, const int qy, const int localElement, sharedBufferArray sharedBuffer, quadToDof B, const CeedScalar U, + CeedScalar *V) { + sharedBuffer(qx, qy, localElement) = U; + *V = 0.0; + @barrier(); + for (int p = 0; p < P1D; ++p) { + *V += B(p, qy) * sharedBuffer(qx, p, localElement); + } + @barrier(); + } + + inline void contractTransposeX(const int px, const int py, const int localElement, sharedBufferArray sharedBuffer, quadToDof B, + const CeedScalar U, CeedScalar *V) { + sharedBuffer(px, py, localElement) = U; + *V = 0.0; + @barrier(); + for (int q = 0; q < Q1D; ++q) { + *V += B(px, q) * 
sharedBuffer(q, py, localElement); + } + @barrier(); + } + + inline void contractTransposeY(const int px, const int py, const int localElement, sharedBufferArray sharedBuffer, quadToDof B, + const CeedScalar U, CeedScalar *V) { + sharedBuffer(px, py, localElement) = U; + *V = 0.0; + @barrier(); + for (int q = 0; q < Q1D; ++q) { + *V += B(py, q) * sharedBuffer(px, q, localElement); + } + @barrier(); + } + + //---[ Kernels ]------------------------ + @kernel void interp(const CeedInt elementCount, quadToDof B, const CeedScalar *U, CeedScalar *V) { + for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) { + @shared CeedScalar sharedBuffer[MAX_PQ * MAX_PQ * ELEMENTS_PER_BLOCK]; + + for (int localElement = 0; localElement < ELEMENTS_PER_BLOCK; ++localElement; @inner) { + const int element = elementOffset + localElement; + for (int qy = 0; qy < Q1D; ++qy; @inner) { + for (int qx = 0; qx < Q1D; ++qx; @inner) { + if (element < elementCount) { + for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component) { + CeedScalar r1, r2; + if (!TRANSPOSE) { + readDofs(element, component, qx, qy, U, &r1); + contractX(qx, qy, localElement, sharedBuffer, B, r1, &r2); + contractY(qx, qy, localElement, sharedBuffer, B, r2, &r1); + writeQuads(elementCount, element, component, qx, qy, 0, r1, V); + } else { + readQuads(elementCount, element, component, qx, qy, 0, U, &r1); + contractTransposeY(qx, qy, localElement, sharedBuffer, B, r1, &r2); + contractTransposeX(qx, qy, localElement, sharedBuffer, B, r2, &r1); + writeDofs(element, component, qx, qy, r1, V); + } + } } } } - } } } - } -} - -@kernel void grad(const CeedInt elementCount, - quadToDof B, - quadToDof Bx, - const CeedScalar *U, - CeedScalar *V) { - - for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) { - @shared CeedScalar sharedBuffer[MAX_PQ * MAX_PQ * ELEMENTS_PER_BLOCK]; - - for (int localElement = 0; localElement < 
ELEMENTS_PER_BLOCK; ++localElement; @inner) { - const int element = elementOffset + localElement; - for (int qy = 0; qy < Q1D; ++qy; @inner) { - for (int qx = 0; qx < Q1D; ++qx; @inner) { - - if (element < elementCount) { - for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component) { - CeedScalar r1, r2, r3; - if (!TRANSPOSE) { - readDofs(element, component, qx, qy, U, &r1); - contractX(qx, qy, localElement, sharedBuffer, Bx, r1, &r2); - contractY(qx, qy, localElement, sharedBuffer, B , r2, &r3); - writeQuads(elementCount, element, component, qx, qy, 0, r3, V); - contractX(qx, qy, localElement, sharedBuffer, B , r1, &r2); - contractY(qx, qy, localElement, sharedBuffer, Bx, r2, &r3); - writeQuads(elementCount, element, component, qx, qy, 1, r3, V); - } else { - readQuads(elementCount, element, component, qx, qy, 0, U, &r1); - contractTransposeY(qx, qy, localElement, sharedBuffer, B , r1, &r2); - contractTransposeX(qx, qy, localElement, sharedBuffer, Bx, r2, &r3); - readQuads(elementCount, element, component, qx, qy, 1, U, &r1); - contractTransposeY(qx, qy, localElement, sharedBuffer, Bx, r1, &r2); - contractTransposeX(qx, qy, localElement, sharedBuffer, B , r2, &r1); - writeDofs(element, component, qx, qy, r1 + r3, V); + + @kernel void grad(const CeedInt elementCount, quadToDof B, quadToDof Bx, const CeedScalar *U, CeedScalar *V) { + for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) { + @shared CeedScalar sharedBuffer[MAX_PQ * MAX_PQ * ELEMENTS_PER_BLOCK]; + + for (int localElement = 0; localElement < ELEMENTS_PER_BLOCK; ++localElement; @inner) { + const int element = elementOffset + localElement; + for (int qy = 0; qy < Q1D; ++qy; @inner) { + for (int qx = 0; qx < Q1D; ++qx; @inner) { + if (element < elementCount) { + for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component) { + CeedScalar r1, r2, r3; + if (!TRANSPOSE) { + readDofs(element, component, qx, qy, U, &r1); + contractX(qx, 
qy, localElement, sharedBuffer, Bx, r1, &r2); + contractY(qx, qy, localElement, sharedBuffer, B, r2, &r3); + writeQuads(elementCount, element, component, qx, qy, 0, r3, V); + contractX(qx, qy, localElement, sharedBuffer, B, r1, &r2); + contractY(qx, qy, localElement, sharedBuffer, Bx, r2, &r3); + writeQuads(elementCount, element, component, qx, qy, 1, r3, V); + } else { + readQuads(elementCount, element, component, qx, qy, 0, U, &r1); + contractTransposeY(qx, qy, localElement, sharedBuffer, B, r1, &r2); + contractTransposeX(qx, qy, localElement, sharedBuffer, Bx, r2, &r3); + readQuads(elementCount, element, component, qx, qy, 1, U, &r1); + contractTransposeY(qx, qy, localElement, sharedBuffer, Bx, r1, &r2); + contractTransposeX(qx, qy, localElement, sharedBuffer, B, r2, &r1); + writeDofs(element, component, qx, qy, r1 + r3, V); + } + } } } } - } } } - } -} - -@kernel void weight(const CeedInt elementCount, - const CeedScalar *qWeights1D, - elementWeightArray W) { - for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) { - for (int element = elementOffset; element < (elementOffset + ELEMENTS_PER_BLOCK); ++element; @outer) { - for (int qy = 0; qy < Q1D; ++qy; @inner) { - for (int qx = 0; qx < Q1D; ++qx; @inner) { - W(qx, qy, element) = qWeights1D[qx] * qWeights1D[qy]; + + @kernel void weight(const CeedInt elementCount, const CeedScalar *qWeights1D, elementWeightArray W) { + for (int elementOffset = 0; elementOffset < elementCount; elementOffset += ELEMENTS_PER_BLOCK; @outer) { + for (int element = elementOffset; element < (elementOffset + ELEMENTS_PER_BLOCK); ++element; @outer) { + for (int qy = 0; qy < Q1D; ++qy; @inner) { + for (int qx = 0; qx < Q1D; ++qx; @inner) { + W(qx, qy, element) = qWeights1D[qx] * qWeights1D[qy]; + } + } } } } - } -} ); diff --git a/backends/occa/kernels/tensor-basis/gpu/tensor-basis-3d.cpp b/backends/occa/kernels/tensor-basis/gpu/tensor-basis-3d.cpp index 96d70f69c3..37cd839914 100644 --- 
a/backends/occa/kernels/tensor-basis/gpu/tensor-basis-3d.cpp +++ b/backends/occa/kernels/tensor-basis/gpu/tensor-basis-3d.cpp @@ -9,299 +9,229 @@ const char *occa_tensor_basis_3d_gpu_source = STRINGIFY_SOURCE( -typedef CeedScalar* dofArray @dim(P1D, P1D, P1D, BASIS_COMPONENT_COUNT, elementCount); -typedef const CeedScalar* const_dofArray @dim(P1D, P1D, P1D, BASIS_COMPONENT_COUNT, elementCount); + typedef CeedScalar * dofArray @dim(P1D, P1D, P1D, BASIS_COMPONENT_COUNT, elementCount); + typedef const CeedScalar *const_dofArray @dim(P1D, P1D, P1D, BASIS_COMPONENT_COUNT, elementCount); -typedef CeedScalar* quadArray @dim(Q1D, Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 3); -typedef const CeedScalar* const_quadArray @dim(Q1D, Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 3); + typedef CeedScalar * quadArray @dim(Q1D, Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 3); + typedef const CeedScalar *const_quadArray @dim(Q1D, Q1D, Q1D, elementCount, BASIS_COMPONENT_COUNT, 3); -typedef CeedScalar* sharedBufferArray @dim(MAX_PQ, MAX_PQ, BASIS_COMPONENT_COUNT); -typedef const CeedScalar* quadToDof @dim(P1D, Q1D); -typedef CeedScalar* elementWeightArray @dim(Q1D, Q1D, Q1D, elementCount); + typedef CeedScalar * sharedBufferArray @dim(MAX_PQ, MAX_PQ, BASIS_COMPONENT_COUNT); typedef const CeedScalar *quadToDof @dim(P1D, Q1D); + typedef CeedScalar * elementWeightArray @dim(Q1D, Q1D, Q1D, elementCount); -//---[ Utility Methods ]---------------- -inline void add(const CeedScalar *U, CeedScalar *V) { - for (int q = 0; q < Q1D; q++) { - V[q] += U[q]; - } -} - -inline void readDofs(const int element, - const int component, - const int px, - const int py, - const_dofArray U, - CeedScalar *Up) { - // Zero out extra entries - for (int pz = 0; pz < P1D; ++pz) { - Up[pz] = ( - (px < P1D) && (py < P1D) - ? 
U(px, py, pz, component, element) - : 0.0 - ); - } - for (int q = P1D; q < Q1D; ++q) { - Up[q] = 0.0; - } -} - -inline void writeDofs(const int element, - const int component, - const int px, - const int py, - const CeedScalar *Vp, - dofArray V) { - if ((px < P1D) && (py < P1D)) { - for (int pz = 0; pz < P1D; ++pz) { - V(px, py, pz, component, element) = Vp[pz]; + //---[ Utility Methods ]---------------- + inline void add(const CeedScalar *U, CeedScalar *V) { + for (int q = 0; q < Q1D; q++) { + V[q] += U[q]; + } } - } -} - -inline void readQuads(const int elementCount, - const int element, - const int component, - const int qx, - const int qy, - const int dim, - const_quadArray U, - CeedScalar *Uq) { - for (int qz = 0; qz < Q1D; ++qz) { - Uq[qz] = U(qx, qy, qz, element, component, dim); - } -} - -inline void writeQuads(const int elementCount, - const int element, - const int component, - const int qx, - const int qy, - const int dim, - const CeedScalar *Vq, - quadArray V) { - for (int qz = 0; qz < Q1D; ++qz) { - V(qx, qy, qz, element, component, dim) = Vq[qz]; - } -} -inline void contractX(const int qx, - const int qy, - const int component, - sharedBufferArray sharedBuffer, - quadToDof B, - const CeedScalar *Uq, - CeedScalar *Vp) { - for (int pz = 0; pz < P1D; ++pz) { - sharedBuffer(qx, qy, component) = Uq[pz]; - Vp[pz] = 0.0; - @barrier(); - for (int p = 0; p < P1D; ++p) { - Vp[pz] += B(p, qx) * sharedBuffer(p, qy, component); + inline void readDofs(const int element, const int component, const int px, const int py, const_dofArray U, CeedScalar *Up) { + // Zero out extra entries + for (int pz = 0; pz < P1D; ++pz) { + Up[pz] = ((px < P1D) && (py < P1D) ? 
U(px, py, pz, component, element) : 0.0); + } + for (int q = P1D; q < Q1D; ++q) { + Up[q] = 0.0; + } } - @barrier(); - } -} -inline void contractY(const int qx, - const int qy, - const int component, - sharedBufferArray sharedBuffer, - quadToDof B, - const CeedScalar *Uq, - CeedScalar *Vp) { - for (int pz = 0; pz < P1D; ++pz) { - sharedBuffer(qx, qy, component) = Uq[pz]; - Vp[pz] = 0.0; - @barrier(); - for (int p = 0; p < P1D; ++p) { - Vp[pz] += B(p, qy) * sharedBuffer(qx, p, component); + inline void writeDofs(const int element, const int component, const int px, const int py, const CeedScalar *Vp, dofArray V) { + if ((px < P1D) && (py < P1D)) { + for (int pz = 0; pz < P1D; ++pz) { + V(px, py, pz, component, element) = Vp[pz]; + } + } } - @barrier(); - } -} -inline void contractZ(const int qx, - const int qy, - quadToDof B, - const CeedScalar *Up, - CeedScalar *Vq) { - for (int qz = 0; qz < Q1D; ++qz) { - Vq[qz] = 0.0; - for (int p = 0; p < P1D; ++p) { - Vq[qz] += B(p, qz) * Up[p]; + inline void readQuads(const int elementCount, const int element, const int component, const int qx, const int qy, const int dim, + const_quadArray U, CeedScalar *Uq) { + for (int qz = 0; qz < Q1D; ++qz) { + Uq[qz] = U(qx, qy, qz, element, component, dim); + } } - } -} -inline void contractTransposeX(const int px, - const int py, - const int component, - sharedBufferArray sharedBuffer, - quadToDof B, - const CeedScalar *Up, - CeedScalar *Vp) { - for (int pz = 0; pz < P1D; ++pz) { - sharedBuffer(px, py, component) = Up[pz]; - Vp[pz] = 0.0; - @barrier(); - if (px < P1D) { - for (int qx = 0; qx < Q1D; ++qx) { - Vp[pz] += B(px, qx) * sharedBuffer(qx, py, component); + inline void writeQuads(const int elementCount, const int element, const int component, const int qx, const int qy, const int dim, + const CeedScalar *Vq, quadArray V) { + for (int qz = 0; qz < Q1D; ++qz) { + V(qx, qy, qz, element, component, dim) = Vq[qz]; } } - @barrier(); - } -} -inline void contractTransposeY(const int px, 
- const int py, - const int component, - sharedBufferArray sharedBuffer, - quadToDof B, - const CeedScalar *Up, - CeedScalar *Vp) { - for (int pz = 0; pz < P1D; ++pz) { - sharedBuffer(px, py, component) = Up[pz]; - Vp[pz] = 0.0; - @barrier(); - if (py < P1D) { - for (int qy = 0; qy < Q1D; ++qy) { - Vp[pz] += B(py, qy) * sharedBuffer(px, qy, component); + inline void contractX(const int qx, const int qy, const int component, sharedBufferArray sharedBuffer, quadToDof B, const CeedScalar *Uq, + CeedScalar *Vp) { + for (int pz = 0; pz < P1D; ++pz) { + sharedBuffer(qx, qy, component) = Uq[pz]; + Vp[pz] = 0.0; + @barrier(); + for (int p = 0; p < P1D; ++p) { + Vp[pz] += B(p, qx) * sharedBuffer(p, qy, component); + } + @barrier(); } } - @barrier(); - } -} -inline void contractTransposeZ(const int px, - const int py, - quadToDof B, - const CeedScalar *Uq, - CeedScalar *Vq) { - for (int pz = 0; pz < P1D; ++pz) { - Vq[pz] = 0.0; - for (int qz = 0; qz < Q1D; ++qz) { - Vq[pz] += B(pz, qz) * Uq[qz]; + inline void contractY(const int qx, const int qy, const int component, sharedBufferArray sharedBuffer, quadToDof B, const CeedScalar *Uq, + CeedScalar *Vp) { + for (int pz = 0; pz < P1D; ++pz) { + sharedBuffer(qx, qy, component) = Uq[pz]; + Vp[pz] = 0.0; + @barrier(); + for (int p = 0; p < P1D; ++p) { + Vp[pz] += B(p, qy) * sharedBuffer(qx, p, component); + } + @barrier(); + } } - } -} -//---[ Kernels ]------------------------ -@kernel void interp(const CeedInt elementCount, - quadToDof B, - const CeedScalar *U, - CeedScalar *V) { + inline void contractZ(const int qx, const int qy, quadToDof B, const CeedScalar *Up, CeedScalar *Vq) { + for (int qz = 0; qz < Q1D; ++qz) { + Vq[qz] = 0.0; + for (int p = 0; p < P1D; ++p) { + Vq[qz] += B(p, qz) * Up[p]; + } + } + } - for (int element = 0; element < elementCount; ++element; @outer) { - @shared CeedScalar sharedBuffer[MAX_PQ * MAX_PQ * BASIS_COMPONENT_COUNT]; + inline void contractTransposeX(const int px, const int py, const int 
component, sharedBufferArray sharedBuffer, quadToDof B, const CeedScalar *Up, + CeedScalar *Vp) { + for (int pz = 0; pz < P1D; ++pz) { + sharedBuffer(px, py, component) = Up[pz]; + Vp[pz] = 0.0; + @barrier(); + if (px < P1D) { + for (int qx = 0; qx < Q1D; ++qx) { + Vp[pz] += B(px, qx) * sharedBuffer(qx, py, component); + } + } + @barrier(); + } + } - for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) { - for (int qy = 0; qy < Q1D; ++qy; @inner) { - for (int qx = 0; qx < Q1D; ++qx; @inner) { + inline void contractTransposeY(const int px, const int py, const int component, sharedBufferArray sharedBuffer, quadToDof B, const CeedScalar *Up, + CeedScalar *Vp) { + for (int pz = 0; pz < P1D; ++pz) { + sharedBuffer(px, py, component) = Up[pz]; + Vp[pz] = 0.0; + @barrier(); + if (py < P1D) { + for (int qy = 0; qy < Q1D; ++qy) { + Vp[pz] += B(py, qy) * sharedBuffer(px, qy, component); + } + } + @barrier(); + } + } - if (element < elementCount) { - CeedScalar r1[MAX_PQ], r2[MAX_PQ]; - for (int q = 0; q < Q1D; ++q) { - r1[q] = 0.0; - r2[q] = 0.0; - } + inline void contractTransposeZ(const int px, const int py, quadToDof B, const CeedScalar *Uq, CeedScalar *Vq) { + for (int pz = 0; pz < P1D; ++pz) { + Vq[pz] = 0.0; + for (int qz = 0; qz < Q1D; ++qz) { + Vq[pz] += B(pz, qz) * Uq[qz]; + } + } + } - if (!TRANSPOSE) { - readDofs(element, component, qx, qy, U, r1); - contractX(qx, qy, component, sharedBuffer, B, r1, r2); - contractY(qx, qy, component, sharedBuffer, B, r2, r1); - contractZ(qx, qy, B, r1, r2); - writeQuads(elementCount, element, component, qx, qy, 0, r2, V); - } else { - readQuads(elementCount, element, component, qx, qy, 0, U, r1); - contractTransposeZ(qx, qy, B, r1, r2); - contractTransposeY(qx, qy, component, sharedBuffer, B, r2, r1); - contractTransposeX(qx, qy, component, sharedBuffer, B, r1, r2); - writeDofs(element, component, qx, qy, r2, V); + //---[ Kernels ]------------------------ + @kernel void interp(const CeedInt 
elementCount, quadToDof B, const CeedScalar *U, CeedScalar *V) { + for (int element = 0; element < elementCount; ++element; @outer) { + @shared CeedScalar sharedBuffer[MAX_PQ * MAX_PQ * BASIS_COMPONENT_COUNT]; + + for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) { + for (int qy = 0; qy < Q1D; ++qy; @inner) { + for (int qx = 0; qx < Q1D; ++qx; @inner) { + if (element < elementCount) { + CeedScalar r1[MAX_PQ], r2[MAX_PQ]; + for (int q = 0; q < Q1D; ++q) { + r1[q] = 0.0; + r2[q] = 0.0; + } + + if (!TRANSPOSE) { + readDofs(element, component, qx, qy, U, r1); + contractX(qx, qy, component, sharedBuffer, B, r1, r2); + contractY(qx, qy, component, sharedBuffer, B, r2, r1); + contractZ(qx, qy, B, r1, r2); + writeQuads(elementCount, element, component, qx, qy, 0, r2, V); + } else { + readQuads(elementCount, element, component, qx, qy, 0, U, r1); + contractTransposeZ(qx, qy, B, r1, r2); + contractTransposeY(qx, qy, component, sharedBuffer, B, r2, r1); + contractTransposeX(qx, qy, component, sharedBuffer, B, r1, r2); + writeDofs(element, component, qx, qy, r2, V); + } + } } } - } } } - } -} - -@kernel void grad(const CeedInt elementCount, - quadToDof B, - quadToDof Bx, - const CeedScalar *U, - CeedScalar *V) { - - for (int element = 0; element < elementCount; ++element; @outer) { - @shared CeedScalar sharedBuffer[MAX_PQ * MAX_PQ * BASIS_COMPONENT_COUNT]; - for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) { - for (int qy = 0; qy < Q1D; ++qy; @inner) { - for (int qx = 0; qx < Q1D; ++qx; @inner) { - - if (element < elementCount) { - CeedScalar r1[MAX_PQ], r2[MAX_PQ], r3[MAX_PQ]; - - if (!TRANSPOSE) { - readDofs(element, component, qx, qy, U, r1); - // Dx - contractX(qx, qy, component, sharedBuffer, Bx, r1, r2); - contractY(qx, qy, component, sharedBuffer, B , r2, r3); - contractZ(qx, qy, B, r3, r2); - writeQuads(elementCount, element, component, qx, qy, 0, r2, V); - // Dy - contractX(qx, qy, component, sharedBuffer, B 
, r1, r2); - contractY(qx, qy, component, sharedBuffer, Bx, r2, r3); - contractZ(qx, qy, B , r3, r2); - writeQuads(elementCount, element, component, qx, qy, 1, r2, V); - // Dz - contractX(qx, qy, component, sharedBuffer, B , r1, r2); - contractY(qx, qy, component, sharedBuffer, B , r2, r3); - contractZ(qx, qy, Bx, r3, r2); - writeQuads(elementCount, element, component, qx, qy, 2, r2, V); - } else { - // Dx - readQuads(elementCount, element, component, qx, qy, 0, U, r1); - contractTransposeZ(qx, qy, B , r1, r3); - contractTransposeY(qx, qy, component, sharedBuffer, B , r3, r1); - contractTransposeX(qx, qy, component, sharedBuffer, Bx, r1, r2); - // Dy - readQuads(elementCount, element, component, qx, qy, 1, U, r1); - contractTransposeZ(qx, qy, B , r1, r3); - contractTransposeY(qx, qy, component, sharedBuffer, Bx, r3, r1); - contractTransposeX(qx, qy, component, sharedBuffer, B , r1, r3); - add(r3, r2); - // Dz - readQuads(elementCount, element, component, qx, qy, 2, U, r1); - contractTransposeZ(qx, qy, Bx, r1, r3); - contractTransposeY(qx, qy, component, sharedBuffer, B , r3, r1); - contractTransposeX(qx, qy, component, sharedBuffer, B , r1, r3); - add(r3, r2); - writeDofs(element, component, qx, qy, r2, V); + @kernel void grad(const CeedInt elementCount, quadToDof B, quadToDof Bx, const CeedScalar *U, CeedScalar *V) { + for (int element = 0; element < elementCount; ++element; @outer) { + @shared CeedScalar sharedBuffer[MAX_PQ * MAX_PQ * BASIS_COMPONENT_COUNT]; + + for (int component = 0; component < BASIS_COMPONENT_COUNT; ++component; @inner) { + for (int qy = 0; qy < Q1D; ++qy; @inner) { + for (int qx = 0; qx < Q1D; ++qx; @inner) { + if (element < elementCount) { + CeedScalar r1[MAX_PQ], r2[MAX_PQ], r3[MAX_PQ]; + + if (!TRANSPOSE) { + readDofs(element, component, qx, qy, U, r1); + // Dx + contractX(qx, qy, component, sharedBuffer, Bx, r1, r2); + contractY(qx, qy, component, sharedBuffer, B, r2, r3); + contractZ(qx, qy, B, r3, r2); + writeQuads(elementCount, 
element, component, qx, qy, 0, r2, V); + // Dy + contractX(qx, qy, component, sharedBuffer, B, r1, r2); + contractY(qx, qy, component, sharedBuffer, Bx, r2, r3); + contractZ(qx, qy, B, r3, r2); + writeQuads(elementCount, element, component, qx, qy, 1, r2, V); + // Dz + contractX(qx, qy, component, sharedBuffer, B, r1, r2); + contractY(qx, qy, component, sharedBuffer, B, r2, r3); + contractZ(qx, qy, Bx, r3, r2); + writeQuads(elementCount, element, component, qx, qy, 2, r2, V); + } else { + // Dx + readQuads(elementCount, element, component, qx, qy, 0, U, r1); + contractTransposeZ(qx, qy, B, r1, r3); + contractTransposeY(qx, qy, component, sharedBuffer, B, r3, r1); + contractTransposeX(qx, qy, component, sharedBuffer, Bx, r1, r2); + // Dy + readQuads(elementCount, element, component, qx, qy, 1, U, r1); + contractTransposeZ(qx, qy, B, r1, r3); + contractTransposeY(qx, qy, component, sharedBuffer, Bx, r3, r1); + contractTransposeX(qx, qy, component, sharedBuffer, B, r1, r3); + add(r3, r2); + // Dz + readQuads(elementCount, element, component, qx, qy, 2, U, r1); + contractTransposeZ(qx, qy, Bx, r1, r3); + contractTransposeY(qx, qy, component, sharedBuffer, B, r3, r1); + contractTransposeX(qx, qy, component, sharedBuffer, B, r1, r3); + add(r3, r2); + writeDofs(element, component, qx, qy, r2, V); + } + } } } - } } } - } -} -@kernel void weight(const CeedInt elementCount, - const CeedScalar *qWeights1D, - elementWeightArray W) { - for (int element = 0; element < elementCount; ++element; @outer) { - for (int qz = 0; qz < Q1D; ++qz; @inner) { - for (int qy = 0; qy < Q1D; ++qy; @inner) { - for (int qx = 0; qx < Q1D; ++qx) { - if (element < elementCount) { - W(qx, qy, qz, element) = qWeights1D[qx] * qWeights1D[qy] * qWeights1D[qz]; + @kernel void weight(const CeedInt elementCount, const CeedScalar *qWeights1D, elementWeightArray W) { + for (int element = 0; element < elementCount; ++element; @outer) { + for (int qz = 0; qz < Q1D; ++qz; @inner) { + for (int qy = 0; qy < Q1D; 
++qy; @inner) { + for (int qx = 0; qx < Q1D; ++qx) { + if (element < elementCount) { + W(qx, qy, qz, element) = qWeights1D[qx] * qWeights1D[qy] * qWeights1D[qz]; + } + } } } } } - } -} ); diff --git a/backends/opt/ceed-opt-blocked.c b/backends/opt/ceed-opt-blocked.c index 3a5185c763..49ea2e2429 100644 --- a/backends/opt/ceed-opt-blocked.c +++ b/backends/opt/ceed-opt-blocked.c @@ -5,20 +5,20 @@ // // This file is part of CEED: http://github.com/ceed -#include #include +#include #include #include + #include "ceed-opt.h" //------------------------------------------------------------------------------ // Backend Destroy //------------------------------------------------------------------------------ static int CeedDestroy_Opt(Ceed ceed) { - int ierr; Ceed_Opt *data; - ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr); - ierr = CeedFree(&data); CeedChkBackend(ierr); + CeedCallBackend(CeedGetData(ceed, &data)); + CeedCallBackend(CeedFree(&data)); return CEED_ERROR_SUCCESS; } @@ -27,36 +27,31 @@ static int CeedDestroy_Opt(Ceed ceed) { // Backend Init //------------------------------------------------------------------------------ static int CeedInit_Opt_Blocked(const char *resource, Ceed ceed) { - int ierr; - if (strcmp(resource, "/cpu/self") && strcmp(resource, "/cpu/self/opt") && - strcmp(resource, "/cpu/self/opt/blocked")) + if (strcmp(resource, "/cpu/self") && strcmp(resource, "/cpu/self/opt") && strcmp(resource, "/cpu/self/opt/blocked")) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Opt backend cannot use resource: %s", resource); - // LCOV_EXCL_STOP - ierr = CeedSetDeterministic(ceed, true); CeedChkBackend(ierr); + return CeedError(ceed, CEED_ERROR_BACKEND, "Opt backend cannot use resource: %s", resource); + // LCOV_EXCL_STOP + } + CeedCallBackend(CeedSetDeterministic(ceed, true)); // Create reference CEED that implementation will be dispatched // through unless overridden Ceed ceed_ref; - CeedInit("/cpu/self/ref/serial", &ceed_ref); - 
ierr = CeedSetDelegate(ceed, ceed_ref); CeedChkBackend(ierr); + CeedCallBackend(CeedInit("/cpu/self/ref/serial", &ceed_ref)); + CeedCallBackend(CeedSetDelegate(ceed, ceed_ref)); // Set fallback CEED resource for advanced operator functionality const char fallbackresource[] = "/cpu/self/ref/serial"; - ierr = CeedSetOperatorFallbackResource(ceed, fallbackresource); - CeedChkBackend(ierr); + CeedCallBackend(CeedSetOperatorFallbackResource(ceed, fallbackresource)); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", - CeedDestroy_Opt); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", - CeedOperatorCreate_Opt); CeedChkBackend(ierr); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Opt)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Opt)); // Set blocksize Ceed_Opt *data; - ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(1, &data)); data->blk_size = 8; - ierr = CeedSetData(ceed, data); CeedChkBackend(ierr); + CeedCallBackend(CeedSetData(ceed, data)); return CEED_ERROR_SUCCESS; } @@ -64,7 +59,5 @@ static int CeedInit_Opt_Blocked(const char *resource, Ceed ceed) { //------------------------------------------------------------------------------ // Backend Register //------------------------------------------------------------------------------ -CEED_INTERN int CeedRegister_Opt_Blocked(void) { - return CeedRegister("/cpu/self/opt/blocked", CeedInit_Opt_Blocked, 40); -} +CEED_INTERN int CeedRegister_Opt_Blocked(void) { return CeedRegister("/cpu/self/opt/blocked", CeedInit_Opt_Blocked, 40); } //------------------------------------------------------------------------------ diff --git a/backends/opt/ceed-opt-operator.c b/backends/opt/ceed-opt-operator.c index 0995394956..2a23458ea5 100644 --- a/backends/opt/ceed-opt-operator.c +++ b/backends/opt/ceed-opt-operator.c @@ -5,122 +5,103 @@ // // This file is part 
of CEED: http://github.com/ceed -#include #include +#include #include #include #include + #include "ceed-opt.h" //------------------------------------------------------------------------------ // Setup Input/Output Fields //------------------------------------------------------------------------------ -static int CeedOperatorSetupFields_Opt(CeedQFunction qf, CeedOperator op, - bool is_input, const CeedInt blk_size, - CeedElemRestriction *blk_restr, - CeedVector *e_vecs_full, CeedVector *e_vecs, - CeedVector *q_vecs, CeedInt start_e, - CeedInt num_fields, CeedInt Q) { - CeedInt ierr, num_comp, size, P; +static int CeedOperatorSetupFields_Opt(CeedQFunction qf, CeedOperator op, bool is_input, const CeedInt blk_size, CeedElemRestriction *blk_restr, + CeedVector *e_vecs_full, CeedVector *e_vecs, CeedVector *q_vecs, CeedInt start_e, CeedInt num_fields, + CeedInt Q) { + CeedInt num_comp, size, P; CeedSize e_size, q_size; - Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); - CeedBasis basis; + Ceed ceed; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedBasis basis; CeedElemRestriction r; - CeedOperatorField *op_fields; + CeedOperatorField *op_fields; CeedQFunctionField *qf_fields; if (is_input) { - ierr = CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL); - CeedChkBackend(ierr); - ierr = CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL)); } else { - ierr = CeedOperatorGetFields(op, NULL, NULL, NULL,&op_fields); - CeedChkBackend(ierr); - ierr = CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields)); } // Loop over fields - for (CeedInt i=0; iblk_size; + 
CeedCallBackend(CeedGetData(ceed, &ceed_impl)); + const CeedInt blk_size = ceed_impl->blk_size; CeedOperator_Opt *impl; - ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetData(op, &impl)); CeedQFunction qf; - ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); CeedInt Q, num_input_fields, num_output_fields; - ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr); - ierr = CeedQFunctionIsIdentity(qf, &impl->is_identity_qf); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); + CeedCallBackend(CeedQFunctionIsIdentity(qf, &impl->is_identity_qf)); CeedOperatorField *op_input_fields, *op_output_fields; - ierr = CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, - &num_output_fields, &op_output_fields); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); CeedQFunctionField *qf_input_fields, *qf_output_fields; - ierr = CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, - &qf_output_fields); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); // Allocate - ierr = CeedCalloc(num_input_fields + num_output_fields, &impl->blk_restr); - CeedChkBackend(ierr); - ierr = CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs_full); - CeedChkBackend(ierr); - - ierr = CeedCalloc(CEED_FIELD_MAX, &impl->input_states); CeedChkBackend(ierr); - ierr = CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_in); CeedChkBackend(ierr); - ierr = CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_out); CeedChkBackend(ierr); - ierr = CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in); CeedChkBackend(ierr); - ierr = CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out); CeedChkBackend(ierr); - - impl->num_inputs = num_input_fields; + CeedCallBackend(CeedCalloc(num_input_fields + 
num_output_fields, &impl->blk_restr)); + CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs_full)); + + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states)); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_in)); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_out)); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in)); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out)); + + impl->num_inputs = num_input_fields; impl->num_outputs = num_output_fields; // Set up infield and outfield pointer arrays // Infields - ierr = CeedOperatorSetupFields_Opt(qf, op, true, blk_size, impl->blk_restr, - impl->e_vecs_full, impl->e_vecs_in, - impl->q_vecs_in, 0, num_input_fields, Q); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorSetupFields_Opt(qf, op, true, blk_size, impl->blk_restr, impl->e_vecs_full, impl->e_vecs_in, impl->q_vecs_in, 0, + num_input_fields, Q)); // Outfields - ierr = CeedOperatorSetupFields_Opt(qf, op, false, blk_size, impl->blk_restr, - impl->e_vecs_full, impl->e_vecs_out, - impl->q_vecs_out, num_input_fields, - num_output_fields, Q); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorSetupFields_Opt(qf, op, false, blk_size, impl->blk_restr, impl->e_vecs_full, impl->e_vecs_out, impl->q_vecs_out, + num_input_fields, num_output_fields, Q)); // Identity QFunctions if (impl->is_identity_qf) { - CeedEvalMode in_mode, out_mode; + CeedEvalMode in_mode, out_mode; CeedQFunctionField *in_fields, *out_fields; - ierr = CeedQFunctionGetFields(qf, NULL, &in_fields, NULL, &out_fields); - CeedChkBackend(ierr); - ierr = CeedQFunctionFieldGetEvalMode(in_fields[0], &in_mode); - CeedChkBackend(ierr); - ierr = CeedQFunctionFieldGetEvalMode(out_fields[0], &out_mode); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &in_fields, NULL, &out_fields)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(in_fields[0], &in_mode)); + 
CeedCallBackend(CeedQFunctionFieldGetEvalMode(out_fields[0], &out_mode)); if (in_mode == CEED_EVAL_NONE && out_mode == CEED_EVAL_NONE) { impl->is_identity_restr_op = true; } else { - ierr = CeedVectorDestroy(&impl->q_vecs_out[0]); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_out[0])); impl->q_vecs_out[0] = impl->q_vecs_in[0]; - ierr = CeedVectorAddReference(impl->q_vecs_in[0]); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorAddReference(impl->q_vecs_in[0])); } } - ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorSetSetupDone(op)); return CEED_ERROR_SUCCESS; } @@ -211,47 +177,34 @@ static int CeedOperatorSetup_Opt(CeedOperator op) { //------------------------------------------------------------------------------ // Setup Input Fields //------------------------------------------------------------------------------ -static inline int CeedOperatorSetupInputs_Opt(CeedInt num_input_fields, - CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, - CeedVector in_vec, CeedScalar *e_data[2*CEED_FIELD_MAX], CeedOperator_Opt *impl, - CeedRequest *request) { - CeedInt ierr; +static inline int CeedOperatorSetupInputs_Opt(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, + CeedVector in_vec, CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Opt *impl, + CeedRequest *request) { CeedEvalMode eval_mode; - CeedVector vec; - uint64_t state; + CeedVector vec; + uint64_t state; - for (CeedInt i=0; iinput_states[i]) { - ierr = CeedElemRestrictionApply(impl->blk_restr[i], CEED_NOTRANSPOSE, - vec, impl->e_vecs_full[i], request); - CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionApply(impl->blk_restr[i], CEED_NOTRANSPOSE, vec, impl->e_vecs_full[i], request)); impl->input_states[i] = state; } // Get evec - ierr = CeedVectorGetArrayRead(impl->e_vecs_full[i], CEED_MEM_HOST, - (const CeedScalar **) &e_data[i]); - CeedChkBackend(ierr); + 
CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs_full[i], CEED_MEM_HOST, (const CeedScalar **)&e_data[i])); } else { // Set Qvec for CEED_EVAL_NONE if (eval_mode == CEED_EVAL_NONE) { - ierr = CeedVectorGetArrayRead(impl->e_vecs_in[i], CEED_MEM_HOST, - (const CeedScalar **)&e_data[i]); - CeedChkBackend(ierr); - ierr = CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_HOST, - CEED_USE_POINTER, e_data[i]); CeedChkBackend(ierr); - ierr = CeedVectorRestoreArrayRead(impl->e_vecs_in[i], - (const CeedScalar **)&e_data[i]); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs_in[i], CEED_MEM_HOST, (const CeedScalar **)&e_data[i])); + CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_HOST, CEED_USE_POINTER, e_data[i])); + CeedCallBackend(CeedVectorRestoreArrayRead(impl->e_vecs_in[i], (const CeedScalar **)&e_data[i])); } } } @@ -262,93 +215,66 @@ static inline int CeedOperatorSetupInputs_Opt(CeedInt num_input_fields, //------------------------------------------------------------------------------ // Input Basis Action //------------------------------------------------------------------------------ -static inline int CeedOperatorInputBasis_Opt(CeedInt e, CeedInt Q, - CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, - CeedInt num_input_fields, CeedInt blk_size, CeedVector in_vec, bool skip_active, - CeedScalar *e_data[2*CEED_FIELD_MAX], CeedOperator_Opt *impl, - CeedRequest *request) { - CeedInt ierr; - CeedInt dim, elem_size, size; +static inline int CeedOperatorInputBasis_Opt(CeedInt e, CeedInt Q, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, + CeedInt num_input_fields, CeedInt blk_size, CeedVector in_vec, bool skip_active, + CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Opt *impl, CeedRequest *request) { + CeedInt dim, elem_size, size; CeedElemRestriction elem_restr; - CeedEvalMode eval_mode; - CeedBasis basis; - CeedVector vec; + CeedEvalMode eval_mode; + CeedBasis basis; + 
CeedVector vec; - for (CeedInt i=0; iblk_restr[i], e/blk_size, - CEED_NOTRANSPOSE, in_vec, - impl->e_vecs_in[i], request); - CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionApplyBlock(impl->blk_restr[i], e / blk_size, CEED_NOTRANSPOSE, in_vec, impl->e_vecs_in[i], request)); active_in = 1; } // Basis action - switch(eval_mode) { - case CEED_EVAL_NONE: - if (!active_in) { - ierr = CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_HOST, - CEED_USE_POINTER, &e_data[i][e*Q*size]); - CeedChkBackend(ierr); - } - break; - case CEED_EVAL_INTERP: - ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); - CeedChkBackend(ierr); - if (!active_in) { - ierr = CeedVectorSetArray(impl->e_vecs_in[i], CEED_MEM_HOST, - CEED_USE_POINTER, &e_data[i][e*elem_size*size]); - CeedChkBackend(ierr); - } - ierr = CeedBasisApply(basis, blk_size, CEED_NOTRANSPOSE, - CEED_EVAL_INTERP, impl->e_vecs_in[i], - impl->q_vecs_in[i]); CeedChkBackend(ierr); - break; - case CEED_EVAL_GRAD: - ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); - CeedChkBackend(ierr); - if (!active_in) { - ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); - ierr = CeedVectorSetArray(impl->e_vecs_in[i], CEED_MEM_HOST, - CEED_USE_POINTER, - &e_data[i][e*elem_size*size/dim]); - CeedChkBackend(ierr); + switch (eval_mode) { + case CEED_EVAL_NONE: + if (!active_in) { + CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data[i][e * Q * size])); + } + break; + case CEED_EVAL_INTERP: + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); + if (!active_in) { + CeedCallBackend(CeedVectorSetArray(impl->e_vecs_in[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data[i][e * elem_size * size])); + } + CeedCallBackend(CeedBasisApply(basis, blk_size, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, impl->e_vecs_in[i], impl->q_vecs_in[i])); + break; + case CEED_EVAL_GRAD: + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); + if (!active_in) { + 
CeedCallBackend(CeedBasisGetDimension(basis, &dim)); + CeedCallBackend(CeedVectorSetArray(impl->e_vecs_in[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data[i][e * elem_size * size / dim])); + } + CeedCallBackend(CeedBasisApply(basis, blk_size, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, impl->e_vecs_in[i], impl->q_vecs_in[i])); + break; + case CEED_EVAL_WEIGHT: + break; // No action + // LCOV_EXCL_START + case CEED_EVAL_DIV: + case CEED_EVAL_CURL: { + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); + Ceed ceed; + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); + return CeedError(ceed, CEED_ERROR_BACKEND, "Ceed evaluation mode not implemented"); + // LCOV_EXCL_STOP } - ierr = CeedBasisApply(basis, blk_size, CEED_NOTRANSPOSE, - CEED_EVAL_GRAD, impl->e_vecs_in[i], - impl->q_vecs_in[i]); CeedChkBackend(ierr); - break; - case CEED_EVAL_WEIGHT: - break; // No action - // LCOV_EXCL_START - case CEED_EVAL_DIV: - case CEED_EVAL_CURL: { - ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); - CeedChkBackend(ierr); - Ceed ceed; - ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); - return CeedError(ceed, CEED_ERROR_BACKEND, - "Ceed evaluation mode not implemented"); - // LCOV_EXCL_STOP - } } } return CEED_ERROR_SUCCESS; @@ -357,69 +283,53 @@ static inline int CeedOperatorInputBasis_Opt(CeedInt e, CeedInt Q, //------------------------------------------------------------------------------ // Output Basis Action //------------------------------------------------------------------------------ -static inline int CeedOperatorOutputBasis_Opt(CeedInt e, CeedInt Q, - CeedQFunctionField *qf_output_fields, CeedOperatorField *op_output_fields, - CeedInt blk_size, CeedInt num_input_fields, CeedInt num_output_fields, - CeedOperator op, CeedVector out_vec, CeedOperator_Opt *impl, - CeedRequest *request) { - CeedInt ierr; +static inline int CeedOperatorOutputBasis_Opt(CeedInt e, CeedInt Q, CeedQFunctionField *qf_output_fields, CeedOperatorField *op_output_fields, 
+ CeedInt blk_size, CeedInt num_input_fields, CeedInt num_output_fields, CeedOperator op, + CeedVector out_vec, CeedOperator_Opt *impl, CeedRequest *request) { CeedElemRestriction elem_restr; - CeedEvalMode eval_mode; - CeedBasis basis; - CeedVector vec; + CeedEvalMode eval_mode; + CeedBasis basis; + CeedVector vec; - for (CeedInt i=0; iq_vecs_out[i], - impl->e_vecs_out[i]); CeedChkBackend(ierr); - break; - case CEED_EVAL_GRAD: - ierr = CeedOperatorFieldGetBasis(op_output_fields[i], &basis); - CeedChkBackend(ierr); - ierr = CeedBasisApply(basis, blk_size, CEED_TRANSPOSE, - CEED_EVAL_GRAD, impl->q_vecs_out[i], - impl->e_vecs_out[i]); CeedChkBackend(ierr); - break; - // LCOV_EXCL_START - case CEED_EVAL_WEIGHT: { - Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); - return CeedError(ceed, CEED_ERROR_BACKEND, - "CEED_EVAL_WEIGHT cannot be an output " - "evaluation mode"); - } - case CEED_EVAL_DIV: - case CEED_EVAL_CURL: { - Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); - return CeedError(ceed, CEED_ERROR_BACKEND, - "Ceed evaluation mode not implemented"); - // LCOV_EXCL_STOP - } + switch (eval_mode) { + case CEED_EVAL_NONE: + break; // No action + case CEED_EVAL_INTERP: + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); + CeedCallBackend(CeedBasisApply(basis, blk_size, CEED_TRANSPOSE, CEED_EVAL_INTERP, impl->q_vecs_out[i], impl->e_vecs_out[i])); + break; + case CEED_EVAL_GRAD: + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); + CeedCallBackend(CeedBasisApply(basis, blk_size, CEED_TRANSPOSE, CEED_EVAL_GRAD, impl->q_vecs_out[i], impl->e_vecs_out[i])); + break; + // LCOV_EXCL_START + case CEED_EVAL_WEIGHT: { + Ceed ceed; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + return CeedError(ceed, CEED_ERROR_BACKEND, + "CEED_EVAL_WEIGHT cannot be an output " + "evaluation mode"); + } + case CEED_EVAL_DIV: + case CEED_EVAL_CURL: { + Ceed ceed; + 
CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + return CeedError(ceed, CEED_ERROR_BACKEND, "Ceed evaluation mode not implemented"); + // LCOV_EXCL_STOP + } } // Restrict output block // Get output vector - ierr = CeedOperatorFieldGetVector(op_output_fields[i], &vec); - CeedChkBackend(ierr); - if (vec == CEED_VECTOR_ACTIVE) - vec = out_vec; + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); + if (vec == CEED_VECTOR_ACTIVE) vec = out_vec; // Restrict - ierr = CeedElemRestrictionApplyBlock(impl->blk_restr[i+impl->num_inputs], - e/blk_size, CEED_TRANSPOSE, - impl->e_vecs_out[i], vec, request); - CeedChkBackend(ierr); + CeedCallBackend( + CeedElemRestrictionApplyBlock(impl->blk_restr[i + impl->num_inputs], e / blk_size, CEED_TRANSPOSE, impl->e_vecs_out[i], vec, request)); } return CEED_ERROR_SUCCESS; } @@ -427,22 +337,15 @@ static inline int CeedOperatorOutputBasis_Opt(CeedInt e, CeedInt Q, //------------------------------------------------------------------------------ // Restore Input Vectors //------------------------------------------------------------------------------ -static inline int CeedOperatorRestoreInputs_Opt(CeedInt num_input_fields, - CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, - CeedScalar *e_data[2*CEED_FIELD_MAX], CeedOperator_Opt *impl) { - CeedInt ierr; - - for (CeedInt i=0; ie_vecs_full[i], - (const CeedScalar **) &e_data[i]); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArrayRead(impl->e_vecs_full[i], (const CeedScalar **)&e_data[i])); } } return CEED_ERROR_SUCCESS; @@ -451,95 +354,72 @@ static inline int CeedOperatorRestoreInputs_Opt(CeedInt num_input_fields, //------------------------------------------------------------------------------ // Operator Apply //------------------------------------------------------------------------------ -static int CeedOperatorApplyAdd_Opt(CeedOperator op, CeedVector in_vec, - CeedVector out_vec, CeedRequest *request) { - int ierr; +static int 
CeedOperatorApplyAdd_Opt(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) { Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); Ceed_Opt *ceed_impl; - ierr = CeedGetData(ceed, &ceed_impl); CeedChkBackend(ierr); - CeedInt blk_size = ceed_impl->blk_size; + CeedCallBackend(CeedGetData(ceed, &ceed_impl)); + CeedInt blk_size = ceed_impl->blk_size; CeedOperator_Opt *impl; - ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetData(op, &impl)); CeedInt Q, num_input_fields, num_output_fields, num_elem; - ierr = CeedOperatorGetNumElements(op, &num_elem); CeedChkBackend(ierr); - ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr); - CeedInt num_blks = (num_elem/blk_size) + !!(num_elem%blk_size); + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); + CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); + CeedInt num_blks = (num_elem / blk_size) + !!(num_elem % blk_size); CeedQFunction qf; - ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); CeedOperatorField *op_input_fields, *op_output_fields; - ierr = CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, - &num_output_fields, &op_output_fields); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); CeedQFunctionField *qf_input_fields, *qf_output_fields; - ierr = CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, - &qf_output_fields); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); CeedEvalMode eval_mode; - CeedScalar *e_data[2*CEED_FIELD_MAX] = {0}; + CeedScalar *e_data[2 * CEED_FIELD_MAX] = {0}; // Setup - ierr = CeedOperatorSetup_Opt(op); CeedChkBackend(ierr); + 
CeedCallBackend(CeedOperatorSetup_Opt(op)); // Restriction only operator if (impl->is_identity_restr_op) { - for (CeedInt b=0; bblk_restr[0], b, CEED_NOTRANSPOSE, - in_vec, impl->e_vecs_in[0], request); CeedChkBackend(ierr); - ierr = CeedElemRestrictionApplyBlock(impl->blk_restr[1], b, CEED_TRANSPOSE, - impl->e_vecs_in[0], out_vec, request); CeedChkBackend(ierr); + for (CeedInt b = 0; b < num_blks; b++) { + CeedCallBackend(CeedElemRestrictionApplyBlock(impl->blk_restr[0], b, CEED_NOTRANSPOSE, in_vec, impl->e_vecs_in[0], request)); + CeedCallBackend(CeedElemRestrictionApplyBlock(impl->blk_restr[1], b, CEED_TRANSPOSE, impl->e_vecs_in[0], out_vec, request)); } return CEED_ERROR_SUCCESS; } // Input Evecs and Restriction - ierr = CeedOperatorSetupInputs_Opt(num_input_fields, qf_input_fields, - op_input_fields, in_vec, e_data, - impl, request); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorSetupInputs_Opt(num_input_fields, qf_input_fields, op_input_fields, in_vec, e_data, impl, request)); // Output Lvecs, Evecs, and Qvecs - for (CeedInt i=0; ie_vecs_out[i], CEED_MEM_HOST, - &e_data[i + num_input_fields]); - CeedChkBackend(ierr); - ierr = CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_HOST, - CEED_USE_POINTER, e_data[i + num_input_fields]); - CeedChkBackend(ierr); - ierr = CeedVectorRestoreArray(impl->e_vecs_out[i], - &e_data[i + num_input_fields]); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_out[i], CEED_MEM_HOST, &e_data[i + num_input_fields])); + CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_HOST, CEED_USE_POINTER, e_data[i + num_input_fields])); + CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_out[i], &e_data[i + num_input_fields])); } } // Loop through elements - for (CeedInt e=0; eis_identity_qf) { - ierr = CeedQFunctionApply(qf, Q*blk_size, impl->q_vecs_in, impl->q_vecs_out); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionApply(qf, Q * blk_size, impl->q_vecs_in, impl->q_vecs_out)); } // 
Output basis apply and restrict - ierr = CeedOperatorOutputBasis_Opt(e, Q, qf_output_fields, op_output_fields, - blk_size, num_input_fields, num_output_fields, - op, out_vec, impl, request); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorOutputBasis_Opt(e, Q, qf_output_fields, op_output_fields, blk_size, num_input_fields, num_output_fields, op, out_vec, + impl, request)); } // Restore input arrays - ierr = CeedOperatorRestoreInputs_Opt(num_input_fields, qf_input_fields, - op_input_fields, e_data, impl); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorRestoreInputs_Opt(num_input_fields, qf_input_fields, op_input_fields, e_data, impl)); return CEED_ERROR_SUCCESS; } @@ -547,94 +427,77 @@ static int CeedOperatorApplyAdd_Opt(CeedOperator op, CeedVector in_vec, //------------------------------------------------------------------------------ // Core code for linear QFunction assembly //------------------------------------------------------------------------------ -static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, - bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr, - CeedRequest *request) { - int ierr; +static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr, + CeedRequest *request) { Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); Ceed_Opt *ceed_impl; - ierr = CeedGetData(ceed, &ceed_impl); CeedChkBackend(ierr); - const CeedInt blk_size = ceed_impl->blk_size; - CeedSize q_size; + CeedCallBackend(CeedGetData(ceed, &ceed_impl)); + const CeedInt blk_size = ceed_impl->blk_size; + CeedSize q_size; CeedOperator_Opt *impl; - ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetData(op, &impl)); CeedInt Q, num_input_fields, num_output_fields, num_elem, size; - ierr = CeedOperatorGetNumElements(op, &num_elem); 
CeedChkBackend(ierr); - ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr); - CeedInt num_blks = (num_elem/blk_size) + !!(num_elem%blk_size); + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); + CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); + CeedInt num_blks = (num_elem / blk_size) + !!(num_elem % blk_size); CeedQFunction qf; - ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); CeedOperatorField *op_input_fields, *op_output_fields; - ierr = CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, - &num_output_fields, &op_output_fields); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); CeedQFunctionField *qf_input_fields, *qf_output_fields; - ierr = CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, - &qf_output_fields); - CeedChkBackend(ierr); - CeedVector vec, l_vec = impl->qf_l_vec; - CeedInt num_active_in = impl->num_active_in, - num_active_out = impl->num_active_out; + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); + CeedVector vec, l_vec = impl->qf_l_vec; + CeedInt num_active_in = impl->num_active_in, num_active_out = impl->num_active_out; CeedVector *active_in = impl->qf_active_in; CeedScalar *a, *tmp; - CeedScalar *e_data[2*CEED_FIELD_MAX] = {0}; + CeedScalar *e_data[2 * CEED_FIELD_MAX] = {0}; // Setup - ierr = CeedOperatorSetup_Opt(op); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorSetup_Opt(op)); // Check for identity - if (impl->is_identity_qf) + if (impl->is_identity_qf) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Assembling identity qfunctions not supported"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_BACKEND, "Assembling identity qfunctions not supported"); + // LCOV_EXCL_STOP + } // Input Evecs and Restriction - ierr = 
CeedOperatorSetupInputs_Opt(num_input_fields, qf_input_fields, - op_input_fields, NULL, e_data, - impl, request); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorSetupInputs_Opt(num_input_fields, qf_input_fields, op_input_fields, NULL, e_data, impl, request)); // Count number of active input fields if (!num_active_in) { - for (CeedInt i=0; iq_vecs_in[i], 0.0); CeedChkBackend(ierr); - ierr = CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &tmp); - CeedChkBackend(ierr); - ierr = CeedRealloc(num_active_in + size, &active_in); CeedChkBackend(ierr); - for (CeedInt field=0; fieldq_vecs_in[i], 0.0)); + CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &tmp)); + CeedCallBackend(CeedRealloc(num_active_in + size, &active_in)); + for (CeedInt field = 0; field < size; field++) { + q_size = (CeedSize)Q * blk_size; + CeedCallBackend(CeedVectorCreate(ceed, q_size, &active_in[num_active_in + field])); + CeedCallBackend(CeedVectorSetArray(active_in[num_active_in + field], CEED_MEM_HOST, CEED_USE_POINTER, &tmp[field * Q * blk_size])); } num_active_in += size; - ierr = CeedVectorRestoreArray(impl->q_vecs_in[i], &tmp); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &tmp)); } } impl->num_active_in = num_active_in; - impl->qf_active_in = active_in; + impl->qf_active_in = active_in; } // Count number of active output fields if (!num_active_out) { - for (CeedInt i=0; iqf_l_vec = l_vec; } // Build objects if needed - CeedInt strides[3] = {1, Q, num_active_in *num_active_out*Q}; + CeedInt strides[3] = {1, Q, num_active_in * num_active_out * Q}; if (build_objects) { // Create output restriction - ierr = CeedElemRestrictionCreateStrided(ceed, num_elem, Q, - num_active_in*num_active_out, - num_active_in*num_active_out*num_elem*Q, - strides, rstr); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionCreateStrided(ceed, num_elem, Q, num_active_in * num_active_out, num_active_in * num_active_out * num_elem * Q, + strides, 
rstr)); // Create assembled vector - CeedSize l_size = (CeedSize)num_elem*Q*num_active_in*num_active_out; - ierr = CeedVectorCreate(ceed, l_size, assembled); CeedChkBackend(ierr); + CeedSize l_size = (CeedSize)num_elem * Q * num_active_in * num_active_out; + CeedCallBackend(CeedVectorCreate(ceed, l_size, assembled)); } // Output blocked restriction CeedElemRestriction blk_rstr = impl->qf_blk_rstr; if (!blk_rstr) { - ierr = CeedElemRestrictionCreateBlockedStrided(ceed, num_elem, Q, blk_size, - num_active_in*num_active_out, num_active_in*num_active_out*num_elem*Q, - strides, &blk_rstr); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionCreateBlockedStrided(ceed, num_elem, Q, blk_size, num_active_in * num_active_out, + num_active_in * num_active_out * num_elem * Q, strides, &blk_rstr)); impl->qf_blk_rstr = blk_rstr; } // Loop through elements - ierr = CeedVectorSetValue(*assembled, 0.0); CeedChkBackend(ierr); - for (CeedInt e=0; e 1) { - ierr = CeedVectorSetValue(active_in[(in+num_active_in-1)%num_active_in], - 0.0); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorSetValue(active_in[(in + num_active_in - 1) % num_active_in], 0.0)); } // Set Outputs - for (CeedInt out=0; outq_vecs_out[out], CEED_MEM_HOST, - CEED_USE_POINTER, a); CeedChkBackend(ierr); - ierr = CeedQFunctionFieldGetSize(qf_output_fields[out], &size); - CeedChkBackend(ierr); - a += size*Q*blk_size; // Advance the pointer by the size of the output + CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST, CEED_USE_POINTER, a)); + CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &size)); + a += size * Q * blk_size; // Advance the pointer by the size of the output } } // Apply QFunction - ierr = CeedQFunctionApply(qf, Q*blk_size, impl->q_vecs_in, impl->q_vecs_out); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionApply(qf, Q * blk_size, impl->q_vecs_in, impl->q_vecs_out)); } // Assemble into assembled vector - ierr = CeedVectorRestoreArray(l_vec, &a); 
CeedChkBackend(ierr); - ierr = CeedElemRestrictionApplyBlock(blk_rstr, e/blk_size, CEED_TRANSPOSE, - l_vec, *assembled, request); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArray(l_vec, &a)); + CeedCallBackend(CeedElemRestrictionApplyBlock(blk_rstr, e / blk_size, CEED_TRANSPOSE, l_vec, *assembled, request)); } // Un-set output Qvecs to prevent accidental overwrite of Assembled - for (CeedInt out=0; outq_vecs_out[out], CEED_MEM_HOST, CEED_COPY_VALUES, - NULL); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST, CEED_COPY_VALUES, NULL)); } } // Restore input arrays - ierr = CeedOperatorRestoreInputs_Opt(num_input_fields, qf_input_fields, - op_input_fields, e_data, impl); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorRestoreInputs_Opt(num_input_fields, qf_input_fields, op_input_fields, e_data, impl)); return CEED_ERROR_SUCCESS; } @@ -745,60 +593,55 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, //------------------------------------------------------------------------------ // Assemble Linear QFunction //------------------------------------------------------------------------------ -static int CeedOperatorLinearAssembleQFunction_Opt(CeedOperator op, - CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) { - return CeedOperatorLinearAssembleQFunctionCore_Opt(op, true, assembled, rstr, - request); +static int CeedOperatorLinearAssembleQFunction_Opt(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) { + return CeedOperatorLinearAssembleQFunctionCore_Opt(op, true, assembled, rstr, request); } //------------------------------------------------------------------------------ // Update Assembled Linear QFunction //------------------------------------------------------------------------------ -static int CeedOperatorLinearAssembleQFunctionUpdate_Opt(CeedOperator op, - CeedVector assembled, CeedElemRestriction rstr, 
CeedRequest *request) { - return CeedOperatorLinearAssembleQFunctionCore_Opt(op, false, &assembled, - &rstr, request); +static int CeedOperatorLinearAssembleQFunctionUpdate_Opt(CeedOperator op, CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) { + return CeedOperatorLinearAssembleQFunctionCore_Opt(op, false, &assembled, &rstr, request); } //------------------------------------------------------------------------------ // Operator Destroy //------------------------------------------------------------------------------ static int CeedOperatorDestroy_Opt(CeedOperator op) { - int ierr; CeedOperator_Opt *impl; - ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetData(op, &impl)); - for (CeedInt i=0; inum_inputs+impl->num_outputs; i++) { - ierr = CeedElemRestrictionDestroy(&impl->blk_restr[i]); CeedChkBackend(ierr); - ierr = CeedVectorDestroy(&impl->e_vecs_full[i]); CeedChkBackend(ierr); + for (CeedInt i = 0; i < impl->num_inputs + impl->num_outputs; i++) { + CeedCallBackend(CeedElemRestrictionDestroy(&impl->blk_restr[i])); + CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_full[i])); } - ierr = CeedFree(&impl->blk_restr); CeedChkBackend(ierr); - ierr = CeedFree(&impl->e_vecs_full); CeedChkBackend(ierr); - ierr = CeedFree(&impl->input_states); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl->blk_restr)); + CeedCallBackend(CeedFree(&impl->e_vecs_full)); + CeedCallBackend(CeedFree(&impl->input_states)); - for (CeedInt i=0; inum_inputs; i++) { - ierr = CeedVectorDestroy(&impl->e_vecs_in[i]); CeedChkBackend(ierr); - ierr = CeedVectorDestroy(&impl->q_vecs_in[i]); CeedChkBackend(ierr); + for (CeedInt i = 0; i < impl->num_inputs; i++) { + CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_in[i])); + CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_in[i])); } - ierr = CeedFree(&impl->e_vecs_in); CeedChkBackend(ierr); - ierr = CeedFree(&impl->q_vecs_in); CeedChkBackend(ierr); + 
CeedCallBackend(CeedFree(&impl->e_vecs_in)); + CeedCallBackend(CeedFree(&impl->q_vecs_in)); - for (CeedInt i=0; inum_outputs; i++) { - ierr = CeedVectorDestroy(&impl->e_vecs_out[i]); CeedChkBackend(ierr); - ierr = CeedVectorDestroy(&impl->q_vecs_out[i]); CeedChkBackend(ierr); + for (CeedInt i = 0; i < impl->num_outputs; i++) { + CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_out[i])); + CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_out[i])); } - ierr = CeedFree(&impl->e_vecs_out); CeedChkBackend(ierr); - ierr = CeedFree(&impl->q_vecs_out); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl->e_vecs_out)); + CeedCallBackend(CeedFree(&impl->q_vecs_out)); // QFunction assembly data - for (CeedInt i=0; inum_active_in; i++) { - ierr = CeedVectorDestroy(&impl->qf_active_in[i]); CeedChkBackend(ierr); + for (CeedInt i = 0; i < impl->num_active_in; i++) { + CeedCallBackend(CeedVectorDestroy(&impl->qf_active_in[i])); } - ierr = CeedFree(&impl->qf_active_in); CeedChkBackend(ierr); - ierr = CeedVectorDestroy(&impl->qf_l_vec); CeedChkBackend(ierr); - ierr = CeedElemRestrictionDestroy(&impl->qf_blk_rstr); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl->qf_active_in)); + CeedCallBackend(CeedVectorDestroy(&impl->qf_l_vec)); + CeedCallBackend(CeedElemRestrictionDestroy(&impl->qf_blk_rstr)); - ierr = CeedFree(&impl); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl)); return CEED_ERROR_SUCCESS; } @@ -806,34 +649,26 @@ static int CeedOperatorDestroy_Opt(CeedOperator op) { // Operator Create //------------------------------------------------------------------------------ int CeedOperatorCreate_Opt(CeedOperator op) { - int ierr; Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); Ceed_Opt *ceed_impl; - ierr = CeedGetData(ceed, &ceed_impl); CeedChkBackend(ierr); - CeedInt blk_size = ceed_impl->blk_size; + CeedCallBackend(CeedGetData(ceed, &ceed_impl)); + CeedInt blk_size = ceed_impl->blk_size; 
CeedOperator_Opt *impl; - ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr); - ierr = CeedOperatorSetData(op, impl); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(1, &impl)); + CeedCallBackend(CeedOperatorSetData(op, impl)); - if (blk_size != 1 && blk_size != 8) + if (blk_size != 1 && blk_size != 8) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Opt backend cannot use blocksize: %" CeedInt_FMT, blk_size); - // LCOV_EXCL_STOP - - ierr = CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", - CeedOperatorLinearAssembleQFunction_Opt); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Operator", op, - "LinearAssembleQFunctionUpdate", - CeedOperatorLinearAssembleQFunctionUpdate_Opt); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", - CeedOperatorApplyAdd_Opt); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Operator", op, "Destroy", - CeedOperatorDestroy_Opt); CeedChkBackend(ierr); + return CeedError(ceed, CEED_ERROR_BACKEND, "Opt backend cannot use blocksize: %" CeedInt_FMT, blk_size); + // LCOV_EXCL_STOP + } + + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Opt)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Opt)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Opt)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Opt)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ diff --git a/backends/opt/ceed-opt-serial.c b/backends/opt/ceed-opt-serial.c index 84fcb3a768..29c504c2fa 100644 --- a/backends/opt/ceed-opt-serial.c +++ b/backends/opt/ceed-opt-serial.c @@ -5,20 +5,20 @@ // // This file is part of CEED: http://github.com/ceed -#include 
#include +#include #include #include + #include "ceed-opt.h" //------------------------------------------------------------------------------ // Backend Destroy //------------------------------------------------------------------------------ static int CeedDestroy_Opt(Ceed ceed) { - int ierr; Ceed_Opt *data; - ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr); - ierr = CeedFree(&data); CeedChkBackend(ierr); + CeedCallBackend(CeedGetData(ceed, &data)); + CeedCallBackend(CeedFree(&data)); return CEED_ERROR_SUCCESS; } @@ -27,38 +27,32 @@ static int CeedDestroy_Opt(Ceed ceed) { // Backend Init //------------------------------------------------------------------------------ static int CeedInit_Opt_Serial(const char *resource, Ceed ceed) { - int ierr; - if (strcmp(resource, "/cpu/self") - && strcmp(resource, "/cpu/self/opt/serial")) + if (strcmp(resource, "/cpu/self") && strcmp(resource, "/cpu/self/opt/serial")) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Opt backend cannot use resource: %s", resource); - // LCOV_EXCL_STOP - ierr = CeedSetDeterministic(ceed, true); CeedChkBackend(ierr); + return CeedError(ceed, CEED_ERROR_BACKEND, "Opt backend cannot use resource: %s", resource); + // LCOV_EXCL_STOP + } + CeedCallBackend(CeedSetDeterministic(ceed, true)); // Create reference CEED that implementation will be dispatched // through unless overridden Ceed ceed_ref; - CeedInit("/cpu/self/ref/serial", &ceed_ref); - ierr = CeedSetDelegate(ceed, ceed_ref); CeedChkBackend(ierr); + CeedCallBackend(CeedInit("/cpu/self/ref/serial", &ceed_ref)); + CeedCallBackend(CeedSetDelegate(ceed, ceed_ref)); // Set fallback CEED resource for advanced operator functionality const char fallbackresource[] = "/cpu/self/ref/serial"; - ierr = CeedSetOperatorFallbackResource(ceed, fallbackresource); - CeedChkBackend(ierr); + CeedCallBackend(CeedSetOperatorFallbackResource(ceed, fallbackresource)); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", - 
CeedDestroy_Opt); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", - CeedTensorContractCreate_Opt); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", - CeedOperatorCreate_Opt); CeedChkBackend(ierr); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Opt)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_Opt)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Opt)); // Set blocksize Ceed_Opt *data; - ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(1, &data)); data->blk_size = 1; - ierr = CeedSetData(ceed, data); CeedChkBackend(ierr); + CeedCallBackend(CeedSetData(ceed, data)); return CEED_ERROR_SUCCESS; } @@ -66,8 +60,6 @@ static int CeedInit_Opt_Serial(const char *resource, Ceed ceed) { //------------------------------------------------------------------------------ // Backend Register //------------------------------------------------------------------------------ -CEED_INTERN int CeedRegister_Opt_Serial(void) { - return CeedRegister("/cpu/self/opt/serial", CeedInit_Opt_Serial, 45); -} +CEED_INTERN int CeedRegister_Opt_Serial(void) { return CeedRegister("/cpu/self/opt/serial", CeedInit_Opt_Serial, 45); } //------------------------------------------------------------------------------ diff --git a/backends/opt/ceed-opt-tensor.c b/backends/opt/ceed-opt-tensor.c index ca0787a514..ae21c81667 100644 --- a/backends/opt/ceed-opt-tensor.c +++ b/backends/opt/ceed-opt-tensor.c @@ -5,29 +5,31 @@ // // This file is part of CEED: http://github.com/ceed -#include #include +#include + #include "ceed-opt.h" //------------------------------------------------------------------------------ // Tensor Contract Core loop //------------------------------------------------------------------------------ -static inline int 
CeedTensorContractApply_Core_Opt(CeedTensorContract contract, - CeedInt A, CeedInt B, CeedInt C, CeedInt J, const CeedScalar *restrict t, - CeedTransposeMode t_mode, const CeedInt add, const CeedScalar *restrict u, - CeedScalar *restrict v) { +static inline int CeedTensorContractApply_Core_Opt(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, + const CeedScalar *restrict t, CeedTransposeMode t_mode, const CeedInt add, + const CeedScalar *restrict u, CeedScalar *restrict v) { CeedInt t_stride_0 = B, t_stride_1 = 1; if (t_mode == CEED_TRANSPOSE) { - t_stride_0 = 1; t_stride_1 = J; + t_stride_0 = 1; + t_stride_1 = J; } - for (CeedInt a=0; a #include +#include #include #include @@ -22,23 +22,22 @@ typedef struct { } CeedBasis_Opt; typedef struct { - bool is_identity_qf, is_identity_restr_op; - CeedElemRestriction *blk_restr; /* Blocked versions of restrictions */ - CeedVector *e_vecs_full; /* Full E-vectors, inputs followed by outputs */ - uint64_t *input_states; /* State counter of inputs */ - CeedVector *e_vecs_in; /* Element block input E-vectors */ - CeedVector *e_vecs_out; /* Element block output E-vectors */ - CeedVector *q_vecs_in; /* Element block input Q-vectors */ - CeedVector *q_vecs_out; /* Element block output Q-vectors */ - CeedInt num_inputs,num_outputs; - CeedInt num_active_in, num_active_out; - CeedVector *qf_active_in; - CeedVector qf_l_vec; - CeedElemRestriction qf_blk_rstr; + bool is_identity_qf, is_identity_restr_op; + CeedElemRestriction *blk_restr; /* Blocked versions of restrictions */ + CeedVector *e_vecs_full; /* Full E-vectors, inputs followed by outputs */ + uint64_t *input_states; /* State counter of inputs */ + CeedVector *e_vecs_in; /* Element block input E-vectors */ + CeedVector *e_vecs_out; /* Element block output E-vectors */ + CeedVector *q_vecs_in; /* Element block input Q-vectors */ + CeedVector *q_vecs_out; /* Element block output Q-vectors */ + CeedInt num_inputs, num_outputs; + CeedInt num_active_in, 
num_active_out; + CeedVector *qf_active_in; + CeedVector qf_l_vec; + CeedElemRestriction qf_blk_rstr; } CeedOperator_Opt; -CEED_INTERN int CeedTensorContractCreate_Opt(CeedBasis basis, - CeedTensorContract contract); +CEED_INTERN int CeedTensorContractCreate_Opt(CeedBasis basis, CeedTensorContract contract); CEED_INTERN int CeedOperatorCreate_Opt(CeedOperator op); -#endif // _ceed_opt_h +#endif // _ceed_opt_h diff --git a/backends/ref/ceed-ref-basis.c b/backends/ref/ceed-ref-basis.c index 8232c4d899..0ab2199c3b 100644 --- a/backends/ref/ceed-ref-basis.c +++ b/backends/ref/ceed-ref-basis.c @@ -5,330 +5,279 @@ // // This file is part of CEED: http://github.com/ceed -#include #include +#include #include #include #include + #include "ceed-ref.h" //------------------------------------------------------------------------------ // Basis Apply //------------------------------------------------------------------------------ -static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, - CeedTransposeMode t_mode, CeedEvalMode eval_mode, - CeedVector U, CeedVector V) { - int ierr; +static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector U, CeedVector V) { Ceed ceed; - ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); CeedInt dim, num_comp, num_nodes, num_qpts, Q_comp; - ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); - ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr); - ierr = CeedBasisGetNumNodes(basis, &num_nodes); CeedChkBackend(ierr); - ierr = CeedBasisGetNumQuadraturePoints(basis, &num_qpts); CeedChkBackend(ierr); - ierr = CeedBasisGetNumQuadratureComponents(basis, &Q_comp); - CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetDimension(basis, &dim)); + CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); + CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); + 
CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); + CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, &Q_comp)); CeedTensorContract contract; - ierr = CeedBasisGetTensorContract(basis, &contract); CeedChkBackend(ierr); - const CeedInt add = (t_mode == CEED_TRANSPOSE); + CeedCallBackend(CeedBasisGetTensorContract(basis, &contract)); + const CeedInt add = (t_mode == CEED_TRANSPOSE); const CeedScalar *u; - CeedScalar *v; + CeedScalar *v; if (U != CEED_VECTOR_NONE) { - ierr = CeedVectorGetArrayRead(U, CEED_MEM_HOST, &u); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayRead(U, CEED_MEM_HOST, &u)); } else if (eval_mode != CEED_EVAL_WEIGHT) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "An input vector is required for this CeedEvalMode"); + return CeedError(ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); // LCOV_EXCL_STOP } - ierr = CeedVectorGetArrayWrite(V, CEED_MEM_HOST, &v); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayWrite(V, CEED_MEM_HOST, &v)); // Clear v if operating in transpose if (t_mode == CEED_TRANSPOSE) { - const CeedInt v_size = num_elem*num_comp*num_nodes; - for (CeedInt i = 0; i < v_size; i++) - v[i] = (CeedScalar) 0.0; + const CeedInt v_size = num_elem * num_comp * num_nodes; + for (CeedInt i = 0; i < v_size; i++) v[i] = (CeedScalar)0.0; } bool tensor_basis; - ierr = CeedBasisIsTensor(basis, &tensor_basis); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisIsTensor(basis, &tensor_basis)); // Tensor basis if (tensor_basis) { CeedInt P_1d, Q_1d; - ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); - ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); switch (eval_mode) { - // Interpolate to/from quadrature points - case CEED_EVAL_INTERP: { - CeedBasis_Ref *impl; - ierr = 
CeedBasisGetData(basis, &impl); CeedChkBackend(ierr); - if (impl->has_collo_interp) { - memcpy(v, u, num_elem*num_comp*num_nodes*sizeof(u[0])); - } else { + // Interpolate to/from quadrature points + case CEED_EVAL_INTERP: { + CeedBasis_Ref *impl; + CeedCallBackend(CeedBasisGetData(basis, &impl)); + if (impl->has_collo_interp) { + memcpy(v, u, num_elem * num_comp * num_nodes * sizeof(u[0])); + } else { + CeedInt P = P_1d, Q = Q_1d; + if (t_mode == CEED_TRANSPOSE) { + P = Q_1d; + Q = P_1d; + } + CeedInt pre = num_comp * CeedIntPow(P, dim - 1), post = num_elem; + CeedScalar tmp[2][num_elem * num_comp * Q * CeedIntPow(P > Q ? P : Q, dim - 1)]; + const CeedScalar *interp_1d; + CeedCallBackend(CeedBasisGetInterp1D(basis, &interp_1d)); + for (CeedInt d = 0; d < dim; d++) { + CeedCallBackend(CeedTensorContractApply(contract, pre, P, post, Q, interp_1d, t_mode, add && (d == dim - 1), d == 0 ? u : tmp[d % 2], + d == dim - 1 ? v : tmp[(d + 1) % 2])); + pre /= P; + post *= Q; + } + } + } break; + // Evaluate the gradient to/from quadrature points + case CEED_EVAL_GRAD: { + // In CEED_NOTRANSPOSE mode: + // u has shape [dim, num_comp, P^dim, num_elem], row-major layout + // v has shape [dim, num_comp, Q^dim, num_elem], row-major layout + // In CEED_TRANSPOSE mode, the sizes of u and v are switched. 
CeedInt P = P_1d, Q = Q_1d; if (t_mode == CEED_TRANSPOSE) { - P = Q_1d; Q = P_1d; + P = Q_1d, Q = Q_1d; } - CeedInt pre = num_comp*CeedIntPow(P, dim-1), post = num_elem; - CeedScalar tmp[2][num_elem*num_comp*Q*CeedIntPow(P>Q?P:Q, dim-1)]; + CeedBasis_Ref *impl; + CeedCallBackend(CeedBasisGetData(basis, &impl)); + CeedInt pre = num_comp * CeedIntPow(P, dim - 1), post = num_elem; const CeedScalar *interp_1d; - ierr = CeedBasisGetInterp1D(basis, &interp_1d); CeedChkBackend(ierr); - for (CeedInt d=0; dcollo_grad_1d) { - CeedScalar tmp[2][num_elem*num_comp*Q*CeedIntPow(P>Q?P:Q, dim-1)]; - CeedScalar interp[num_elem*num_comp*Q*CeedIntPow(P>Q?P:Q, dim-1)]; - // Interpolate to quadrature points (NoTranspose) - // or Grad to quadrature points (Transpose) - for (CeedInt d=0; dcollo_grad_1d), - t_mode, add&&(d>0), - (t_mode == CEED_NOTRANSPOSE - ? (d==0?u:tmp[d%2]) - : u + d*num_qpts*num_comp*num_elem), - (t_mode == CEED_NOTRANSPOSE - ? (d==dim-1?interp:tmp[(d+1)%2]) - : interp)); - CeedChkBackend(ierr); - pre /= P; - post *= Q; - } - // Grad to quadrature points (NoTranspose) - // or Interpolate to nodes (Transpose) - P = Q_1d, Q = Q_1d; - if (t_mode == CEED_TRANSPOSE) { - P = Q_1d, Q = P_1d; - } - pre = num_comp*CeedIntPow(P, dim-1), post = num_elem; - for (CeedInt d=0; dcollo_grad_1d - : interp_1d), - t_mode, add&&(d==dim-1), - (t_mode == CEED_NOTRANSPOSE - ? interp - : (d==0?interp:tmp[d%2])), - (t_mode == CEED_NOTRANSPOSE - ? v + d*num_qpts*num_comp*num_elem - : (d==dim-1?v:tmp[(d+1)%2]))); - CeedChkBackend(ierr); - pre /= P; - post *= Q; - } - } else if (impl->has_collo_interp) { // Qpts collocated with nodes - const CeedScalar *grad_1d; - ierr = CeedBasisGetGrad1D(basis, &grad_1d); CeedChkBackend(ierr); - - // Dim contractions, identity in other directions - CeedInt pre = num_comp*CeedIntPow(P, dim-1), post = num_elem; - for (CeedInt d=0; d0), - t_mode == CEED_NOTRANSPOSE - ? u : u+d*num_comp*num_qpts*num_elem, - t_mode == CEED_TRANSPOSE - ? 
v : v+d*num_comp*num_qpts*num_elem); - CeedChkBackend(ierr); - pre /= P; - post *= Q; - } - } else { // Underintegration, P > Q - const CeedScalar *grad_1d; - ierr = CeedBasisGetGrad1D(basis, &grad_1d); CeedChkBackend(ierr); - - if (t_mode == CEED_TRANSPOSE) { - P = Q_1d, Q = P_1d; - } - CeedScalar tmp[2][num_elem*num_comp*Q*CeedIntPow(P>Q?P:Q, dim-1)]; + CeedCallBackend(CeedBasisGetInterp1D(basis, &interp_1d)); + if (impl->collo_grad_1d) { + CeedScalar tmp[2][num_elem * num_comp * Q * CeedIntPow(P > Q ? P : Q, dim - 1)]; + CeedScalar interp[num_elem * num_comp * Q * CeedIntPow(P > Q ? P : Q, dim - 1)]; + // Interpolate to quadrature points (NoTranspose) + // or Grad to quadrature points (Transpose) + for (CeedInt d = 0; d < dim; d++) { + CeedCallBackend(CeedTensorContractApply(contract, pre, P, post, Q, (t_mode == CEED_NOTRANSPOSE ? interp_1d : impl->collo_grad_1d), t_mode, + add && (d > 0), + (t_mode == CEED_NOTRANSPOSE ? (d == 0 ? u : tmp[d % 2]) : u + d * num_qpts * num_comp * num_elem), + (t_mode == CEED_NOTRANSPOSE ? (d == dim - 1 ? interp : tmp[(d + 1) % 2]) : interp))); + pre /= P; + post *= Q; + } + // Grad to quadrature points (NoTranspose) + // or Interpolate to nodes (Transpose) + P = Q_1d, Q = Q_1d; + if (t_mode == CEED_TRANSPOSE) { + P = Q_1d, Q = P_1d; + } + pre = num_comp * CeedIntPow(P, dim - 1), post = num_elem; + for (CeedInt d = 0; d < dim; d++) { + CeedCallBackend(CeedTensorContractApply( + contract, pre, P, post, Q, (t_mode == CEED_NOTRANSPOSE ? impl->collo_grad_1d : interp_1d), t_mode, add && (d == dim - 1), + (t_mode == CEED_NOTRANSPOSE ? interp : (d == 0 ? interp : tmp[d % 2])), + (t_mode == CEED_NOTRANSPOSE ? v + d * num_qpts * num_comp * num_elem : (d == dim - 1 ? 
v : tmp[(d + 1) % 2])))); + pre /= P; + post *= Q; + } + } else if (impl->has_collo_interp) { // Qpts collocated with nodes + const CeedScalar *grad_1d; + CeedCallBackend(CeedBasisGetGrad1D(basis, &grad_1d)); - // Dim**2 contractions, apply grad when pass == dim - for (CeedInt p=0; p 0), + t_mode == CEED_NOTRANSPOSE ? u : u + d * num_comp * num_qpts * num_elem, + t_mode == CEED_TRANSPOSE ? v : v + d * num_comp * num_qpts * num_elem)); pre /= P; post *= Q; } + } else { // Underintegration, P > Q + const CeedScalar *grad_1d; + CeedCallBackend(CeedBasisGetGrad1D(basis, &grad_1d)); + + if (t_mode == CEED_TRANSPOSE) { + P = Q_1d, Q = P_1d; + } + CeedScalar tmp[2][num_elem * num_comp * Q * CeedIntPow(P > Q ? P : Q, dim - 1)]; + + // Dim**2 contractions, apply grad when pass == dim + for (CeedInt p = 0; p < dim; p++) { + CeedInt pre = num_comp * CeedIntPow(P, dim - 1), post = num_elem; + for (CeedInt d = 0; d < dim; d++) { + CeedCallBackend(CeedTensorContractApply( + contract, pre, P, post, Q, (p == d) ? grad_1d : interp_1d, t_mode, add && (d == dim - 1), + (d == 0 ? (t_mode == CEED_NOTRANSPOSE ? u : u + p * num_comp * num_qpts * num_elem) : tmp[d % 2]), + (d == dim - 1 ? (t_mode == CEED_TRANSPOSE ? 
v : v + p * num_comp * num_qpts * num_elem) : tmp[(d + 1) % 2]))); + pre /= P; + post *= Q; + } + } } - } - } break; - // Retrieve interpolation weights - case CEED_EVAL_WEIGHT: { - if (t_mode == CEED_TRANSPOSE) - // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); - // LCOV_EXCL_STOP - CeedInt Q = Q_1d; - const CeedScalar *q_weight_1d; - ierr = CeedBasisGetQWeights(basis, &q_weight_1d); CeedChkBackend(ierr); - for (CeedInt d=0; dcollo_grad_1d); CeedChkBackend(ierr); - ierr = CeedFree(&impl); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetData(basis, &impl)); + CeedCallBackend(CeedFree(&impl->collo_grad_1d)); + CeedCallBackend(CeedFree(&impl)); return CEED_ERROR_SUCCESS; } @@ -376,46 +316,38 @@ static int CeedBasisDestroyTensor_Ref(CeedBasis basis) { //------------------------------------------------------------------------------ // Basis Create Tensor //------------------------------------------------------------------------------ -int CeedBasisCreateTensorH1_Ref(CeedInt dim, CeedInt P_1d, - CeedInt Q_1d, const CeedScalar *interp_1d, - const CeedScalar *grad_1d, - const CeedScalar *q_ref_1d, - const CeedScalar *q_weight_1d, - CeedBasis basis) { - int ierr; +int CeedBasisCreateTensorH1_Ref(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, + const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { Ceed ceed; - ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); CeedBasis_Ref *impl; - ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(1, &impl)); // Check for collocated interp if (Q_1d == P_1d) { bool collocated = 1; - for (CeedInt i=0; ihas_collo_interp = collocated; } // Calculate collocated grad if (Q_1d >= P_1d && !impl->has_collo_interp) { - ierr = CeedMalloc(Q_1d*Q_1d, &impl->collo_grad_1d); CeedChkBackend(ierr); - ierr = 
CeedBasisGetCollocatedGrad(basis, impl->collo_grad_1d); - CeedChkBackend(ierr); + CeedCallBackend(CeedMalloc(Q_1d * Q_1d, &impl->collo_grad_1d)); + CeedCallBackend(CeedBasisGetCollocatedGrad(basis, impl->collo_grad_1d)); } - ierr = CeedBasisSetData(basis, impl); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisSetData(basis, impl)); Ceed parent; - ierr = CeedGetParent(ceed, &parent); CeedChkBackend(ierr); + CeedCallBackend(CeedGetParent(ceed, &parent)); CeedTensorContract contract; - ierr = CeedTensorContractCreate(parent, basis, &contract); CeedChkBackend(ierr); - ierr = CeedBasisSetTensorContract(basis, contract); CeedChkBackend(ierr); + CeedCallBackend(CeedTensorContractCreate(parent, basis, &contract)); + CeedCallBackend(CeedBasisSetTensorContract(basis, contract)); - ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", - CeedBasisApply_Ref); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", - CeedBasisDestroyTensor_Ref); CeedChkBackend(ierr); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyTensor_Ref)); return CEED_ERROR_SUCCESS; } diff --git a/backends/ref/ceed-ref-operator.c b/backends/ref/ceed-ref-operator.c index 3525ce7d75..cd00cd2262 100644 --- a/backends/ref/ceed-ref-operator.c +++ b/backends/ref/ceed-ref-operator.c @@ -5,84 +5,73 @@ // // This file is part of CEED: http://github.com/ceed -#include #include +#include #include #include #include #include + #include "ceed-ref.h" //------------------------------------------------------------------------------ // Setup Input/Output Fields //------------------------------------------------------------------------------ -static int CeedOperatorSetupFields_Ref(CeedQFunction qf, CeedOperator op, - bool is_input, CeedVector *e_vecs_full, - CeedVector *e_vecs, CeedVector *q_vecs, - CeedInt start_e, CeedInt num_fields, - CeedInt Q) { - 
CeedInt ierr, num_comp, size, P; +static int CeedOperatorSetupFields_Ref(CeedQFunction qf, CeedOperator op, bool is_input, CeedVector *e_vecs_full, CeedVector *e_vecs, + CeedVector *q_vecs, CeedInt start_e, CeedInt num_fields, CeedInt Q) { + CeedInt num_comp, size, P; CeedSize e_size, q_size; - Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); - CeedBasis basis; + Ceed ceed; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedBasis basis; CeedElemRestriction elem_restr; - CeedOperatorField *op_fields; + CeedOperatorField *op_fields; CeedQFunctionField *qf_fields; if (is_input) { - ierr = CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL); - CeedChkBackend(ierr); - ierr = CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL)); } else { - ierr = CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields); - CeedChkBackend(ierr); - ierr = CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields)); } // Loop over fields - for (CeedInt i=0; iis_identity_qf); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); + CeedCallBackend(CeedQFunctionIsIdentity(qf, &impl->is_identity_qf)); CeedOperatorField *op_input_fields, *op_output_fields; - ierr = CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, - &num_output_fields, &op_output_fields); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); CeedQFunctionField *qf_input_fields, *qf_output_fields; - ierr = CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, - &qf_output_fields); - CeedChkBackend(ierr); + 
CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); // Allocate - ierr = CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs_full); - CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs_full)); - ierr = CeedCalloc(CEED_FIELD_MAX, &impl->input_states); CeedChkBackend(ierr); - ierr = CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_in); CeedChkBackend(ierr); - ierr = CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_out); CeedChkBackend(ierr); - ierr = CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in); CeedChkBackend(ierr); - ierr = CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states)); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_in)); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_out)); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in)); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out)); - impl->num_inputs = num_input_fields; + impl->num_inputs = num_input_fields; impl->num_outputs = num_output_fields; // Set up infield and outfield e_vecs and q_vecs // Infields - ierr = CeedOperatorSetupFields_Ref(qf, op, true, impl->e_vecs_full, - impl->e_vecs_in, impl->q_vecs_in, 0, - num_input_fields, Q); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorSetupFields_Ref(qf, op, true, impl->e_vecs_full, impl->e_vecs_in, impl->q_vecs_in, 0, num_input_fields, Q)); // Outfields - ierr = CeedOperatorSetupFields_Ref(qf, op, false, impl->e_vecs_full, - impl->e_vecs_out, impl->q_vecs_out, - num_input_fields, num_output_fields, Q); - CeedChkBackend(ierr); + CeedCallBackend( + CeedOperatorSetupFields_Ref(qf, op, false, impl->e_vecs_full, impl->e_vecs_out, impl->q_vecs_out, num_input_fields, num_output_fields, Q)); // Identity QFunctions if (impl->is_identity_qf) { - CeedEvalMode in_mode, out_mode; + CeedEvalMode in_mode, out_mode; CeedQFunctionField *in_fields, 
*out_fields; - ierr = CeedQFunctionGetFields(qf, NULL, &in_fields, NULL, &out_fields); - CeedChkBackend(ierr); - ierr = CeedQFunctionFieldGetEvalMode(in_fields[0], &in_mode); - CeedChkBackend(ierr); - ierr = CeedQFunctionFieldGetEvalMode(out_fields[0], &out_mode); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &in_fields, NULL, &out_fields)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(in_fields[0], &in_mode)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(out_fields[0], &out_mode)); if (in_mode == CEED_EVAL_NONE && out_mode == CEED_EVAL_NONE) { impl->is_identity_restr_op = true; } else { - ierr = CeedVectorDestroy(&impl->q_vecs_out[0]); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_out[0])); impl->q_vecs_out[0] = impl->q_vecs_in[0]; - ierr = CeedVectorAddReference(impl->q_vecs_in[0]); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorAddReference(impl->q_vecs_in[0])); } } - ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorSetSetupDone(op)); return CEED_ERROR_SUCCESS; } @@ -167,48 +142,36 @@ static int CeedOperatorSetup_Ref(CeedOperator op) { //------------------------------------------------------------------------------ // Setup Operator Inputs //------------------------------------------------------------------------------ -static inline int CeedOperatorSetupInputs_Ref(CeedInt num_input_fields, - CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, - CeedVector in_vec, const bool skip_active, - CeedScalar *e_data_full[2*CEED_FIELD_MAX], - CeedOperator_Ref *impl, CeedRequest *request) { - CeedInt ierr; - CeedEvalMode eval_mode; - CeedVector vec; +static inline int CeedOperatorSetupInputs_Ref(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, + CeedVector in_vec, const bool skip_active, CeedScalar *e_data_full[2 * CEED_FIELD_MAX], + CeedOperator_Ref *impl, CeedRequest *request) { + CeedEvalMode 
eval_mode; + CeedVector vec; CeedElemRestriction elem_restr; - uint64_t state; + uint64_t state; - for (CeedInt i=0; iinput_states[i] || vec == in_vec) { - ierr = CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_restr); - CeedChkBackend(ierr); - ierr = CeedElemRestrictionApply(elem_restr, CEED_NOTRANSPOSE, vec, - impl->e_vecs_full[i], request); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_restr)); + CeedCallBackend(CeedElemRestrictionApply(elem_restr, CEED_NOTRANSPOSE, vec, impl->e_vecs_full[i], request)); impl->input_states[i] = state; } // Get evec - ierr = CeedVectorGetArrayRead(impl->e_vecs_full[i], CEED_MEM_HOST, - (const CeedScalar **) &e_data_full[i]); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs_full[i], CEED_MEM_HOST, (const CeedScalar **)&e_data_full[i])); } } return CEED_ERROR_SUCCESS; @@ -217,74 +180,53 @@ static inline int CeedOperatorSetupInputs_Ref(CeedInt num_input_fields, //------------------------------------------------------------------------------ // Input Basis Action //------------------------------------------------------------------------------ -static inline int CeedOperatorInputBasis_Ref(CeedInt e, CeedInt Q, - CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, - CeedInt num_input_fields, const bool skip_active, - CeedScalar *e_data_full[2*CEED_FIELD_MAX], CeedOperator_Ref *impl) { - CeedInt ierr; - CeedInt dim, elem_size, size; +static inline int CeedOperatorInputBasis_Ref(CeedInt e, CeedInt Q, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, + CeedInt num_input_fields, const bool skip_active, CeedScalar *e_data_full[2 * CEED_FIELD_MAX], + CeedOperator_Ref *impl) { + CeedInt dim, elem_size, size; CeedElemRestriction elem_restr; - CeedEvalMode eval_mode; - CeedBasis basis; + CeedEvalMode eval_mode; + CeedBasis basis; - for (CeedInt i=0; iq_vecs_in[i], CEED_MEM_HOST, - CEED_USE_POINTER, 
&e_data_full[i][e*Q*size]); - CeedChkBackend(ierr); - break; - case CEED_EVAL_INTERP: - ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); - CeedChkBackend(ierr); - ierr = CeedVectorSetArray(impl->e_vecs_in[i], CEED_MEM_HOST, - CEED_USE_POINTER, &e_data_full[i][e*elem_size*size]); - CeedChkBackend(ierr); - ierr = CeedBasisApply(basis, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, - impl->e_vecs_in[i], impl->q_vecs_in[i]); CeedChkBackend(ierr); - break; - case CEED_EVAL_GRAD: - ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); - CeedChkBackend(ierr); - ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); - ierr = CeedVectorSetArray(impl->e_vecs_in[i], CEED_MEM_HOST, - CEED_USE_POINTER, &e_data_full[i][e*elem_size*size/dim]); - CeedChkBackend(ierr); - ierr = CeedBasisApply(basis, 1, CEED_NOTRANSPOSE, - CEED_EVAL_GRAD, impl->e_vecs_in[i], - impl->q_vecs_in[i]); CeedChkBackend(ierr); - break; - case CEED_EVAL_WEIGHT: - break; // No action - // LCOV_EXCL_START - case CEED_EVAL_DIV: - case CEED_EVAL_CURL: { - ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); - CeedChkBackend(ierr); - Ceed ceed; - ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); - return CeedError(ceed, CEED_ERROR_BACKEND, - "Ceed evaluation mode not implemented"); - // LCOV_EXCL_STOP - } + switch (eval_mode) { + case CEED_EVAL_NONE: + CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data_full[i][e * Q * size])); + break; + case CEED_EVAL_INTERP: + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); + CeedCallBackend(CeedVectorSetArray(impl->e_vecs_in[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data_full[i][e * elem_size * size])); + CeedCallBackend(CeedBasisApply(basis, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, impl->e_vecs_in[i], impl->q_vecs_in[i])); + break; + case CEED_EVAL_GRAD: + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); + 
CeedCallBackend(CeedBasisGetDimension(basis, &dim)); + CeedCallBackend(CeedVectorSetArray(impl->e_vecs_in[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data_full[i][e * elem_size * size / dim])); + CeedCallBackend(CeedBasisApply(basis, 1, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, impl->e_vecs_in[i], impl->q_vecs_in[i])); + break; + case CEED_EVAL_WEIGHT: + break; // No action + // LCOV_EXCL_START + case CEED_EVAL_DIV: + case CEED_EVAL_CURL: { + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); + Ceed ceed; + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); + return CeedError(ceed, CEED_ERROR_BACKEND, "Ceed evaluation mode not implemented"); + // LCOV_EXCL_STOP + } } } return CEED_ERROR_SUCCESS; @@ -293,69 +235,52 @@ static inline int CeedOperatorInputBasis_Ref(CeedInt e, CeedInt Q, //------------------------------------------------------------------------------ // Output Basis Action //------------------------------------------------------------------------------ -static inline int CeedOperatorOutputBasis_Ref(CeedInt e, CeedInt Q, - CeedQFunctionField *qf_output_fields, CeedOperatorField *op_output_fields, - CeedInt num_input_fields, CeedInt num_output_fields, CeedOperator op, - CeedScalar *e_data_full[2*CEED_FIELD_MAX], CeedOperator_Ref *impl) { - CeedInt ierr; - CeedInt dim, elem_size, size; +static inline int CeedOperatorOutputBasis_Ref(CeedInt e, CeedInt Q, CeedQFunctionField *qf_output_fields, CeedOperatorField *op_output_fields, + CeedInt num_input_fields, CeedInt num_output_fields, CeedOperator op, + CeedScalar *e_data_full[2 * CEED_FIELD_MAX], CeedOperator_Ref *impl) { + CeedInt dim, elem_size, size; CeedElemRestriction elem_restr; - CeedEvalMode eval_mode; - CeedBasis basis; + CeedEvalMode eval_mode; + CeedBasis basis; - for (CeedInt i=0; ie_vecs_out[i], CEED_MEM_HOST, - CEED_USE_POINTER, - &e_data_full[i + num_input_fields][e*elem_size*size]); - CeedChkBackend(ierr); - ierr = CeedBasisApply(basis, 1, CEED_TRANSPOSE, - CEED_EVAL_INTERP, 
impl->q_vecs_out[i], - impl->e_vecs_out[i]); CeedChkBackend(ierr); - break; - case CEED_EVAL_GRAD: - ierr = CeedOperatorFieldGetBasis(op_output_fields[i], &basis); - CeedChkBackend(ierr); - ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); - ierr = CeedVectorSetArray(impl->e_vecs_out[i], CEED_MEM_HOST, - CEED_USE_POINTER, - &e_data_full[i + num_input_fields][e*elem_size*size/dim]); - CeedChkBackend(ierr); - ierr = CeedBasisApply(basis, 1, CEED_TRANSPOSE, - CEED_EVAL_GRAD, impl->q_vecs_out[i], - impl->e_vecs_out[i]); CeedChkBackend(ierr); - break; - // LCOV_EXCL_START - case CEED_EVAL_WEIGHT: { - Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); - return CeedError(ceed, CEED_ERROR_BACKEND, - "CEED_EVAL_WEIGHT cannot be an output " - "evaluation mode"); - } - case CEED_EVAL_DIV: - case CEED_EVAL_CURL: { - Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); - return CeedError(ceed, CEED_ERROR_BACKEND, - "Ceed evaluation mode not implemented"); - // LCOV_EXCL_STOP - } + switch (eval_mode) { + case CEED_EVAL_NONE: + break; // No action + case CEED_EVAL_INTERP: + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); + CeedCallBackend( + CeedVectorSetArray(impl->e_vecs_out[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data_full[i + num_input_fields][e * elem_size * size])); + CeedCallBackend(CeedBasisApply(basis, 1, CEED_TRANSPOSE, CEED_EVAL_INTERP, impl->q_vecs_out[i], impl->e_vecs_out[i])); + break; + case CEED_EVAL_GRAD: + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); + CeedCallBackend(CeedBasisGetDimension(basis, &dim)); + CeedCallBackend( + CeedVectorSetArray(impl->e_vecs_out[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data_full[i + num_input_fields][e * elem_size * size / dim])); + CeedCallBackend(CeedBasisApply(basis, 1, CEED_TRANSPOSE, CEED_EVAL_GRAD, impl->q_vecs_out[i], impl->e_vecs_out[i])); + break; + // LCOV_EXCL_START + case CEED_EVAL_WEIGHT: { + Ceed ceed; + 
CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + return CeedError(ceed, CEED_ERROR_BACKEND, + "CEED_EVAL_WEIGHT cannot be an output " + "evaluation mode"); + } + case CEED_EVAL_DIV: + case CEED_EVAL_CURL: { + Ceed ceed; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + return CeedError(ceed, CEED_ERROR_BACKEND, "Ceed evaluation mode not implemented"); + // LCOV_EXCL_STOP + } } } return CEED_ERROR_SUCCESS; @@ -364,30 +289,22 @@ static inline int CeedOperatorOutputBasis_Ref(CeedInt e, CeedInt Q, //------------------------------------------------------------------------------ // Restore Input Vectors //------------------------------------------------------------------------------ -static inline int CeedOperatorRestoreInputs_Ref(CeedInt num_input_fields, - CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, - const bool skip_active, CeedScalar *e_data_full[2*CEED_FIELD_MAX], - CeedOperator_Ref *impl) { - CeedInt ierr; +static inline int CeedOperatorRestoreInputs_Ref(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, + const bool skip_active, CeedScalar *e_data_full[2 * CEED_FIELD_MAX], CeedOperator_Ref *impl) { CeedEvalMode eval_mode; - for (CeedInt i=0; ie_vecs_full[i], - (const CeedScalar **) &e_data_full[i]); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArrayRead(impl->e_vecs_full[i], (const CeedScalar **)&e_data_full[i])); } } return CEED_ERROR_SUCCESS; @@ -396,116 +313,82 @@ static inline int CeedOperatorRestoreInputs_Ref(CeedInt num_input_fields, //------------------------------------------------------------------------------ // Operator Apply //------------------------------------------------------------------------------ -static int CeedOperatorApplyAdd_Ref(CeedOperator op, CeedVector in_vec, - CeedVector out_vec, CeedRequest *request) { - int ierr; +static int CeedOperatorApplyAdd_Ref(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) { 
CeedOperator_Ref *impl; - ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetData(op, &impl)); CeedQFunction qf; - ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); CeedInt Q, num_elem, num_input_fields, num_output_fields, size; - ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr); - ierr = CeedOperatorGetNumElements(op, &num_elem); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); CeedOperatorField *op_input_fields, *op_output_fields; - ierr = CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, - &num_output_fields, &op_output_fields); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); CeedQFunctionField *qf_input_fields, *qf_output_fields; - ierr = CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, - &qf_output_fields); - CeedChkBackend(ierr); - CeedEvalMode eval_mode; - CeedVector vec; + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); + CeedEvalMode eval_mode; + CeedVector vec; CeedElemRestriction elem_restr; - CeedScalar *e_data_full[2*CEED_FIELD_MAX] = {0}; + CeedScalar *e_data_full[2 * CEED_FIELD_MAX] = {0}; // Setup - ierr = CeedOperatorSetup_Ref(op); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorSetup_Ref(op)); // Restriction only operator if (impl->is_identity_restr_op) { - ierr = CeedOperatorFieldGetElemRestriction(op_input_fields[0], &elem_restr); - CeedChkBackend(ierr); - ierr = CeedElemRestrictionApply(elem_restr, CEED_NOTRANSPOSE, in_vec, - impl->e_vecs_full[0], request); - CeedChkBackend(ierr); - ierr = CeedOperatorFieldGetElemRestriction(op_output_fields[0], &elem_restr); - CeedChkBackend(ierr); - ierr = CeedElemRestrictionApply(elem_restr, CEED_TRANSPOSE, - 
impl->e_vecs_full[0], - out_vec, request); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[0], &elem_restr)); + CeedCallBackend(CeedElemRestrictionApply(elem_restr, CEED_NOTRANSPOSE, in_vec, impl->e_vecs_full[0], request)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[0], &elem_restr)); + CeedCallBackend(CeedElemRestrictionApply(elem_restr, CEED_TRANSPOSE, impl->e_vecs_full[0], out_vec, request)); return CEED_ERROR_SUCCESS; } // Input Evecs and Restriction - ierr = CeedOperatorSetupInputs_Ref(num_input_fields, qf_input_fields, - op_input_fields, in_vec, false, e_data_full, impl, - request); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorSetupInputs_Ref(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data_full, impl, request)); // Output Evecs - for (CeedInt i=0; ie_vecs_full[i+impl->num_inputs], - CEED_MEM_HOST, &e_data_full[i + num_input_fields]); - CeedChkBackend(ierr); + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs_full[i + impl->num_inputs], CEED_MEM_HOST, &e_data_full[i + num_input_fields])); } // Loop through elements - for (CeedInt e=0; eq_vecs_out[i], CEED_MEM_HOST, - CEED_USE_POINTER, - &e_data_full[i + num_input_fields][e*Q*size]); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size)); + CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data_full[i + num_input_fields][e * Q * size])); } } // Input basis apply - ierr = CeedOperatorInputBasis_Ref(e, Q, qf_input_fields, op_input_fields, - num_input_fields, false, e_data_full, impl); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorInputBasis_Ref(e, Q, qf_input_fields, op_input_fields, num_input_fields, false, e_data_full, impl)); // Q function if (!impl->is_identity_qf) { - ierr = CeedQFunctionApply(qf, Q, impl->q_vecs_in, impl->q_vecs_out); - CeedChkBackend(ierr); + 
CeedCallBackend(CeedQFunctionApply(qf, Q, impl->q_vecs_in, impl->q_vecs_out)); } // Output basis apply - ierr = CeedOperatorOutputBasis_Ref(e, Q, qf_output_fields, op_output_fields, - num_input_fields, num_output_fields, op, - e_data_full, impl); CeedChkBackend(ierr); + CeedCallBackend( + CeedOperatorOutputBasis_Ref(e, Q, qf_output_fields, op_output_fields, num_input_fields, num_output_fields, op, e_data_full, impl)); } // Output restriction - for (CeedInt i=0; ie_vecs_full[i+impl->num_inputs], - &e_data_full[i + num_input_fields]); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_full[i + impl->num_inputs], &e_data_full[i + num_input_fields])); // Get output vector - ierr = CeedOperatorFieldGetVector(op_output_fields[i], &vec); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); // Active - if (vec == CEED_VECTOR_ACTIVE) - vec = out_vec; + if (vec == CEED_VECTOR_ACTIVE) vec = out_vec; // Restrict - ierr = CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_restr); - CeedChkBackend(ierr); - ierr = CeedElemRestrictionApply(elem_restr, CEED_TRANSPOSE, - impl->e_vecs_full[i+impl->num_inputs], - vec, request); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_restr)); + CeedCallBackend(CeedElemRestrictionApply(elem_restr, CEED_TRANSPOSE, impl->e_vecs_full[i + impl->num_inputs], vec, request)); } // Restore input arrays - ierr = CeedOperatorRestoreInputs_Ref(num_input_fields, qf_input_fields, - op_input_fields, false, e_data_full, impl); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorRestoreInputs_Ref(num_input_fields, qf_input_fields, op_input_fields, false, e_data_full, impl)); return CEED_ERROR_SUCCESS; } @@ -513,93 +396,75 @@ static int CeedOperatorApplyAdd_Ref(CeedOperator op, CeedVector in_vec, //------------------------------------------------------------------------------ // Core code for assembling linear QFunction 
//------------------------------------------------------------------------------ -static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, - bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr, - CeedRequest *request) { - int ierr; +static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr, + CeedRequest *request) { CeedOperator_Ref *impl; - ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetData(op, &impl)); CeedQFunction qf; - ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); - CeedInt Q, num_elem, num_input_fields, num_output_fields, size; + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedInt Q, num_elem, num_input_fields, num_output_fields, size; CeedSize q_size; - ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr); - ierr = CeedOperatorGetNumElements(op, &num_elem); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); CeedOperatorField *op_input_fields, *op_output_fields; - ierr = CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, - &num_output_fields, &op_output_fields); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); CeedQFunctionField *qf_input_fields, *qf_output_fields; - ierr = CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, - &qf_output_fields); - CeedChkBackend(ierr); - CeedVector vec; - CeedInt num_active_in = impl->num_active_in, - num_active_out = impl->num_active_out; + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); + CeedVector vec; + CeedInt num_active_in = impl->num_active_in, num_active_out = impl->num_active_out; CeedVector *active_in = impl->qf_active_in; CeedScalar *a, *tmp; 
- Ceed ceed, ceed_parent; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); - ierr = CeedGetOperatorFallbackParentCeed(ceed, &ceed_parent); - CeedChkBackend(ierr); - ceed_parent = ceed_parent ? ceed_parent : ceed; - CeedScalar *e_data_full[2*CEED_FIELD_MAX] = {0}; + Ceed ceed, ceed_parent; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedCallBackend(CeedGetOperatorFallbackParentCeed(ceed, &ceed_parent)); + ceed_parent = ceed_parent ? ceed_parent : ceed; + CeedScalar *e_data_full[2 * CEED_FIELD_MAX] = {0}; // Setup - ierr = CeedOperatorSetup_Ref(op); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorSetup_Ref(op)); // Check for identity - if (impl->is_identity_qf) + if (impl->is_identity_qf) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Assembling identity QFunctions not supported"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_BACKEND, "Assembling identity QFunctions not supported"); + // LCOV_EXCL_STOP + } // Input Evecs and Restriction - ierr = CeedOperatorSetupInputs_Ref(num_input_fields, qf_input_fields, - op_input_fields, NULL, true, e_data_full, - impl, request); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorSetupInputs_Ref(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data_full, impl, request)); // Count number of active input fields if (!num_active_in) { - for (CeedInt i=0; iq_vecs_in[i], 0.0); CeedChkBackend(ierr); - ierr = CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &tmp); - CeedChkBackend(ierr); - ierr = CeedRealloc(num_active_in + size, &active_in); CeedChkBackend(ierr); - for (CeedInt field=0; fieldq_vecs_in[i], 0.0)); + CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &tmp)); + CeedCallBackend(CeedRealloc(num_active_in + size, &active_in)); + for (CeedInt field = 0; field < size; field++) { q_size = (CeedSize)Q; - ierr = CeedVectorCreate(ceed, q_size, &active_in[num_active_in+field]); - CeedChkBackend(ierr); - ierr = 
CeedVectorSetArray(active_in[num_active_in+field], CEED_MEM_HOST, - CEED_USE_POINTER, &tmp[field*Q]); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorCreate(ceed, q_size, &active_in[num_active_in + field])); + CeedCallBackend(CeedVectorSetArray(active_in[num_active_in + field], CEED_MEM_HOST, CEED_USE_POINTER, &tmp[field * Q])); } num_active_in += size; - ierr = CeedVectorRestoreArray(impl->q_vecs_in[i], &tmp); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &tmp)); } } impl->num_active_in = num_active_in; - impl->qf_active_in = active_in; + impl->qf_active_in = active_in; } // Count number of active output fields if (!num_active_out) { - for (CeedInt i=0; i 1) { - ierr = CeedVectorSetValue(active_in[(in+num_active_in-1)%num_active_in], - 0.0); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorSetValue(active_in[(in + num_active_in - 1) % num_active_in], 0.0)); } // Set Outputs - for (CeedInt out=0; outq_vecs_out[out], CEED_MEM_HOST, - CEED_USE_POINTER, a); CeedChkBackend(ierr); - ierr = CeedQFunctionFieldGetSize(qf_output_fields[out], &size); - CeedChkBackend(ierr); - a += size*Q; // Advance the pointer by the size of the output + CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST, CEED_USE_POINTER, a)); + CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &size)); + a += size * Q; // Advance the pointer by the size of the output } } // Apply QFunction - ierr = CeedQFunctionApply(qf, Q, impl->q_vecs_in, impl->q_vecs_out); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionApply(qf, Q, impl->q_vecs_in, impl->q_vecs_out)); } } // Un-set output Qvecs to prevent accidental overwrite of Assembled - for (CeedInt out=0; out 0) { - CeedVectorTakeArray(impl->q_vecs_out[out], CEED_MEM_HOST, NULL); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[out], CEED_MEM_HOST, NULL)); } } // Restore input arrays - ierr = CeedOperatorRestoreInputs_Ref(num_input_fields, 
qf_input_fields, - op_input_fields, true, e_data_full, impl); - CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorRestoreInputs_Ref(num_input_fields, qf_input_fields, op_input_fields, true, e_data_full, impl)); // Restore output - ierr = CeedVectorRestoreArray(*assembled, &a); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorRestoreArray(*assembled, &a)); return CEED_ERROR_SUCCESS; } @@ -691,56 +542,51 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, //------------------------------------------------------------------------------ // Assemble Linear QFunction //------------------------------------------------------------------------------ -static int CeedOperatorLinearAssembleQFunction_Ref(CeedOperator op, - CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) { - return CeedOperatorLinearAssembleQFunctionCore_Ref(op, true, assembled, rstr, - request); +static int CeedOperatorLinearAssembleQFunction_Ref(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) { + return CeedOperatorLinearAssembleQFunctionCore_Ref(op, true, assembled, rstr, request); } //------------------------------------------------------------------------------ // Update Assembled Linear QFunction //------------------------------------------------------------------------------ -static int CeedOperatorLinearAssembleQFunctionUpdate_Ref(CeedOperator op, - CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) { - return CeedOperatorLinearAssembleQFunctionCore_Ref(op, false, &assembled, - &rstr, request); +static int CeedOperatorLinearAssembleQFunctionUpdate_Ref(CeedOperator op, CeedVector assembled, CeedElemRestriction rstr, CeedRequest *request) { + return CeedOperatorLinearAssembleQFunctionCore_Ref(op, false, &assembled, &rstr, request); } //------------------------------------------------------------------------------ // Operator Destroy 
//------------------------------------------------------------------------------ static int CeedOperatorDestroy_Ref(CeedOperator op) { - int ierr; CeedOperator_Ref *impl; - ierr = CeedOperatorGetData(op, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetData(op, &impl)); - for (CeedInt i=0; inum_inputs+impl->num_outputs; i++) { - ierr = CeedVectorDestroy(&impl->e_vecs_full[i]); CeedChkBackend(ierr); + for (CeedInt i = 0; i < impl->num_inputs + impl->num_outputs; i++) { + CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_full[i])); } - ierr = CeedFree(&impl->e_vecs_full); CeedChkBackend(ierr); - ierr = CeedFree(&impl->input_states); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl->e_vecs_full)); + CeedCallBackend(CeedFree(&impl->input_states)); - for (CeedInt i=0; inum_inputs; i++) { - ierr = CeedVectorDestroy(&impl->e_vecs_in[i]); CeedChkBackend(ierr); - ierr = CeedVectorDestroy(&impl->q_vecs_in[i]); CeedChkBackend(ierr); + for (CeedInt i = 0; i < impl->num_inputs; i++) { + CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_in[i])); + CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_in[i])); } - ierr = CeedFree(&impl->e_vecs_in); CeedChkBackend(ierr); - ierr = CeedFree(&impl->q_vecs_in); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl->e_vecs_in)); + CeedCallBackend(CeedFree(&impl->q_vecs_in)); - for (CeedInt i=0; inum_outputs; i++) { - ierr = CeedVectorDestroy(&impl->e_vecs_out[i]); CeedChkBackend(ierr); - ierr = CeedVectorDestroy(&impl->q_vecs_out[i]); CeedChkBackend(ierr); + for (CeedInt i = 0; i < impl->num_outputs; i++) { + CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_out[i])); + CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_out[i])); } - ierr = CeedFree(&impl->e_vecs_out); CeedChkBackend(ierr); - ierr = CeedFree(&impl->q_vecs_out); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl->e_vecs_out)); + CeedCallBackend(CeedFree(&impl->q_vecs_out)); // QFunction assembly - for (CeedInt i=0; inum_active_in; i++) { - ierr = 
CeedVectorDestroy(&impl->qf_active_in[i]); CeedChkBackend(ierr); + for (CeedInt i = 0; i < impl->num_active_in; i++) { + CeedCallBackend(CeedVectorDestroy(&impl->qf_active_in[i])); } - ierr = CeedFree(&impl->qf_active_in); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl->qf_active_in)); - ierr = CeedFree(&impl); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl)); return CEED_ERROR_SUCCESS; } @@ -748,24 +594,16 @@ static int CeedOperatorDestroy_Ref(CeedOperator op) { // Operator Create //------------------------------------------------------------------------------ int CeedOperatorCreate_Ref(CeedOperator op) { - int ierr; Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedOperator_Ref *impl; - ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr); - ierr = CeedOperatorSetData(op, impl); CeedChkBackend(ierr); - - ierr = CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", - CeedOperatorLinearAssembleQFunction_Ref); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Operator", op, - "LinearAssembleQFunctionUpdate", - CeedOperatorLinearAssembleQFunctionUpdate_Ref); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", - CeedOperatorApplyAdd_Ref); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Operator", op, "Destroy", - CeedOperatorDestroy_Ref); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(1, &impl)); + CeedCallBackend(CeedOperatorSetData(op, impl)); + + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", 
CeedOperatorDestroy_Ref)); return CEED_ERROR_SUCCESS; } diff --git a/backends/ref/ceed-ref-qfunction.c b/backends/ref/ceed-ref-qfunction.c index 28cdbaca95..f2336aecde 100644 --- a/backends/ref/ceed-ref-qfunction.c +++ b/backends/ref/ceed-ref-qfunction.c @@ -5,48 +5,44 @@ // // This file is part of CEED: http://github.com/ceed -#include #include +#include #include + #include "ceed-ref.h" //------------------------------------------------------------------------------ // QFunction Apply //------------------------------------------------------------------------------ -static int CeedQFunctionApply_Ref(CeedQFunction qf, CeedInt Q, - CeedVector *U, CeedVector *V) { - int ierr; +static int CeedQFunctionApply_Ref(CeedQFunction qf, CeedInt Q, CeedVector *U, CeedVector *V) { CeedQFunction_Ref *impl; - ierr = CeedQFunctionGetData(qf, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetData(qf, &impl)); void *ctx_data = NULL; - ierr = CeedQFunctionGetContextData(qf, CEED_MEM_HOST, &ctx_data); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetContextData(qf, CEED_MEM_HOST, &ctx_data)); CeedQFunctionUser f = NULL; - ierr = CeedQFunctionGetUserFunction(qf, &f); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetUserFunction(qf, &f)); CeedInt num_in, num_out; - ierr = CeedQFunctionGetNumArgs(qf, &num_in, &num_out); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetNumArgs(qf, &num_in, &num_out)); - for (int i = 0; iinputs[i]); - CeedChkBackend(ierr); + for (CeedInt i = 0; i < num_in; i++) { + CeedCallBackend(CeedVectorGetArrayRead(U[i], CEED_MEM_HOST, &impl->inputs[i])); } - for (int i = 0; ioutputs[i]); - CeedChkBackend(ierr); + for (CeedInt i = 0; i < num_out; i++) { + CeedCallBackend(CeedVectorGetArrayWrite(V[i], CEED_MEM_HOST, &impl->outputs[i])); } - ierr = f(ctx_data, Q, impl->inputs, impl->outputs); CeedChkBackend(ierr); + CeedCallBackend(f(ctx_data, Q, impl->inputs, impl->outputs)); - for (int i = 0; iinputs[i]); CeedChkBackend(ierr); + for 
(CeedInt i = 0; i < num_in; i++) { + CeedCallBackend(CeedVectorRestoreArrayRead(U[i], &impl->inputs[i])); } - for (int i = 0; ioutputs[i]); CeedChkBackend(ierr); + for (CeedInt i = 0; i < num_out; i++) { + CeedCallBackend(CeedVectorRestoreArray(V[i], &impl->outputs[i])); } - ierr = CeedQFunctionRestoreContextData(qf, &ctx_data); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionRestoreContextData(qf, &ctx_data)); return CEED_ERROR_SUCCESS; } @@ -55,13 +51,12 @@ static int CeedQFunctionApply_Ref(CeedQFunction qf, CeedInt Q, // QFunction Destroy //------------------------------------------------------------------------------ static int CeedQFunctionDestroy_Ref(CeedQFunction qf) { - int ierr; CeedQFunction_Ref *impl; - ierr = CeedQFunctionGetData(qf, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetData(qf, &impl)); - ierr = CeedFree(&impl->inputs); CeedChkBackend(ierr); - ierr = CeedFree(&impl->outputs); CeedChkBackend(ierr); - ierr = CeedFree(&impl); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl->inputs)); + CeedCallBackend(CeedFree(&impl->outputs)); + CeedCallBackend(CeedFree(&impl)); return CEED_ERROR_SUCCESS; } @@ -70,20 +65,17 @@ static int CeedQFunctionDestroy_Ref(CeedQFunction qf) { // QFunction Create //------------------------------------------------------------------------------ int CeedQFunctionCreate_Ref(CeedQFunction qf) { - int ierr; Ceed ceed; - ierr = CeedQFunctionGetCeed(qf, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); CeedQFunction_Ref *impl; - ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr); - ierr = CeedCalloc(CEED_FIELD_MAX, &impl->inputs); CeedChkBackend(ierr); - ierr = CeedCalloc(CEED_FIELD_MAX, &impl->outputs); CeedChkBackend(ierr); - ierr = CeedQFunctionSetData(qf, impl); CeedChkBackend(ierr); - - ierr = CeedSetBackendFunction(ceed, "QFunction", qf, "Apply", - CeedQFunctionApply_Ref); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "QFunction", qf, "Destroy", - 
CeedQFunctionDestroy_Ref); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(1, &impl)); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->inputs)); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->outputs)); + CeedCallBackend(CeedQFunctionSetData(qf, impl)); + + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Ref)); return CEED_ERROR_SUCCESS; } diff --git a/backends/ref/ceed-ref-qfunctioncontext.c b/backends/ref/ceed-ref-qfunctioncontext.c index 23ffa0d960..5e4a957fab 100644 --- a/backends/ref/ceed-ref-qfunctioncontext.c +++ b/backends/ref/ceed-ref-qfunctioncontext.c @@ -5,20 +5,18 @@ // // This file is part of CEED: http://github.com/ceed -#include #include +#include #include + #include "ceed-ref.h" //------------------------------------------------------------------------------ // QFunctionContext has valid data //------------------------------------------------------------------------------ -static int CeedQFunctionContextHasValidData_Ref(CeedQFunctionContext ctx, - bool *has_valid_data) { - int ierr; +static int CeedQFunctionContextHasValidData_Ref(CeedQFunctionContext ctx, bool *has_valid_data) { CeedQFunctionContext_Ref *impl; - ierr = CeedQFunctionContextGetBackendData(ctx, (void *)&impl); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, (void *)&impl)); *has_valid_data = !!impl->data; @@ -28,26 +26,21 @@ static int CeedQFunctionContextHasValidData_Ref(CeedQFunctionContext ctx, //------------------------------------------------------------------------------ // QFunctionContext has borrowed data //------------------------------------------------------------------------------ -static int CeedQFunctionContextHasBorrowedDataOfType_Ref( - CeedQFunctionContext ctx, CeedMemType mem_type, - bool *has_borrowed_data_of_type) { - int ierr; +static int 
CeedQFunctionContextHasBorrowedDataOfType_Ref(CeedQFunctionContext ctx, CeedMemType mem_type, bool *has_borrowed_data_of_type) { CeedQFunctionContext_Ref *impl; - ierr = CeedQFunctionContextGetBackendData(ctx, (void *)&impl); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, (void *)&impl)); Ceed ceed; - ierr = CeedQFunctionContextGetCeed(ctx, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); switch (mem_type) { - case CEED_MEM_HOST: - *has_borrowed_data_of_type = !!impl->data_borrowed; - break; - default: - // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Can only set HOST memory for this backend"); - // LCOV_EXCL_STOP - break; + case CEED_MEM_HOST: + *has_borrowed_data_of_type = !!impl->data_borrowed; + break; + default: + // LCOV_EXCL_START + return CeedError(ceed, CEED_ERROR_BACKEND, "Can only set HOST memory for this backend"); + // LCOV_EXCL_STOP + break; } return CEED_ERROR_SUCCESS; @@ -56,39 +49,36 @@ static int CeedQFunctionContextHasBorrowedDataOfType_Ref( //------------------------------------------------------------------------------ // QFunctionContext Set Data //------------------------------------------------------------------------------ -static int CeedQFunctionContextSetData_Ref(CeedQFunctionContext ctx, - CeedMemType mem_type, CeedCopyMode copy_mode, void *data) { - int ierr; +static int CeedQFunctionContextSetData_Ref(CeedQFunctionContext ctx, CeedMemType mem_type, CeedCopyMode copy_mode, void *data) { CeedQFunctionContext_Ref *impl; - ierr = CeedQFunctionContextGetBackendData(ctx, (void *)&impl); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, (void *)&impl)); size_t ctx_size; - ierr = CeedQFunctionContextGetContextSize(ctx, &ctx_size); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size)); Ceed ceed; - ierr = CeedQFunctionContextGetCeed(ctx, &ceed); CeedChkBackend(ierr); + 
CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); - if (mem_type != CEED_MEM_HOST) + if (mem_type != CEED_MEM_HOST) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Can only set HOST memory for this backend"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_BACKEND, "Can only set HOST memory for this backend"); + // LCOV_EXCL_STOP + } - ierr = CeedFree(&impl->data_owned); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl->data_owned)); switch (copy_mode) { - case CEED_COPY_VALUES: - ierr = CeedMallocArray(1, ctx_size, &impl->data_owned); CeedChkBackend(ierr); - impl->data_borrowed = NULL; - impl->data = impl->data_owned; - memcpy(impl->data, data, ctx_size); - break; - case CEED_OWN_POINTER: - impl->data_owned = data; - impl->data_borrowed = NULL; - impl->data = data; - break; - case CEED_USE_POINTER: - impl->data_borrowed = data; - impl->data = data; + case CEED_COPY_VALUES: + CeedCallBackend(CeedMallocArray(1, ctx_size, &impl->data_owned)); + impl->data_borrowed = NULL; + impl->data = impl->data_owned; + memcpy(impl->data, data, ctx_size); + break; + case CEED_OWN_POINTER: + impl->data_owned = data; + impl->data_borrowed = NULL; + impl->data = data; + break; + case CEED_USE_POINTER: + impl->data_borrowed = data; + impl->data = data; } return CEED_ERROR_SUCCESS; } @@ -96,24 +86,21 @@ static int CeedQFunctionContextSetData_Ref(CeedQFunctionContext ctx, //------------------------------------------------------------------------------ // QFunctionContext Take Data //------------------------------------------------------------------------------ -static int CeedQFunctionContextTakeData_Ref(CeedQFunctionContext ctx, - CeedMemType mem_type, void *data) { - int ierr; +static int CeedQFunctionContextTakeData_Ref(CeedQFunctionContext ctx, CeedMemType mem_type, void *data) { CeedQFunctionContext_Ref *impl; - ierr = CeedQFunctionContextGetBackendData(ctx, (void *)&impl); - CeedChkBackend(ierr); + 
CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, (void *)&impl)); Ceed ceed; - ierr = CeedQFunctionContextGetCeed(ctx, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); - if (mem_type != CEED_MEM_HOST) + if (mem_type != CEED_MEM_HOST) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Can only provide HOST memory for this backend"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend"); + // LCOV_EXCL_STOP + } - *(void **)data = impl->data; + *(void **)data = impl->data; impl->data_borrowed = NULL; - impl->data = NULL; + impl->data = NULL; return CEED_ERROR_SUCCESS; } @@ -121,20 +108,17 @@ static int CeedQFunctionContextTakeData_Ref(CeedQFunctionContext ctx, //------------------------------------------------------------------------------ // QFunctionContext Get Data //------------------------------------------------------------------------------ -static int CeedQFunctionContextGetData_Ref(CeedQFunctionContext ctx, - CeedMemType mem_type, void *data) { - int ierr; +static int CeedQFunctionContextGetData_Ref(CeedQFunctionContext ctx, CeedMemType mem_type, void *data) { CeedQFunctionContext_Ref *impl; - ierr = CeedQFunctionContextGetBackendData(ctx, (void *)&impl); - CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, (void *)&impl)); Ceed ceed; - ierr = CeedQFunctionContextGetCeed(ctx, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); - if (mem_type != CEED_MEM_HOST) + if (mem_type != CEED_MEM_HOST) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Can only provide HOST memory for this backend"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend"); + // LCOV_EXCL_STOP + } *(void **)data = impl->data; @@ -144,20 +128,17 @@ static int CeedQFunctionContextGetData_Ref(CeedQFunctionContext ctx, 
//------------------------------------------------------------------------------ // QFunctionContext Restore Data //------------------------------------------------------------------------------ -static int CeedQFunctionContextRestoreData_Ref(CeedQFunctionContext ctx) { - return CEED_ERROR_SUCCESS; -} +static int CeedQFunctionContextRestoreData_Ref(CeedQFunctionContext ctx) { return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // QFunctionContext Destroy //------------------------------------------------------------------------------ static int CeedQFunctionContextDestroy_Ref(CeedQFunctionContext ctx) { - int ierr; CeedQFunctionContext_Ref *impl; - ierr = CeedQFunctionContextGetBackendData(ctx, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); - ierr = CeedFree(&impl->data_owned); CeedChkBackend(ierr); - ierr = CeedFree(&impl); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl->data_owned)); + CeedCallBackend(CeedFree(&impl)); return CEED_ERROR_SUCCESS; } @@ -165,35 +146,22 @@ static int CeedQFunctionContextDestroy_Ref(CeedQFunctionContext ctx) { // QFunctionContext Create //------------------------------------------------------------------------------ int CeedQFunctionContextCreate_Ref(CeedQFunctionContext ctx) { - int ierr; CeedQFunctionContext_Ref *impl; - Ceed ceed; - ierr = CeedQFunctionContextGetCeed(ctx, &ceed); CeedChkBackend(ierr); - - ierr = CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "HasValidData", - CeedQFunctionContextHasValidData_Ref); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "QFunctionContext", ctx, - "HasBorrowedDataOfType", - CeedQFunctionContextHasBorrowedDataOfType_Ref); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "SetData", - CeedQFunctionContextSetData_Ref); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "TakeData", - 
CeedQFunctionContextTakeData_Ref); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "GetData", - CeedQFunctionContextGetData_Ref); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "GetDataRead", - CeedQFunctionContextGetData_Ref); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "RestoreData", - CeedQFunctionContextRestoreData_Ref); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "RestoreDataRead", - CeedQFunctionContextRestoreData_Ref); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "Destroy", - CeedQFunctionContextDestroy_Ref); CeedChkBackend(ierr); - - ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr); - ierr = CeedQFunctionContextSetBackendData(ctx, impl); CeedChkBackend(ierr); + Ceed ceed; + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); + + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "HasValidData", CeedQFunctionContextHasValidData_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "HasBorrowedDataOfType", CeedQFunctionContextHasBorrowedDataOfType_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "SetData", CeedQFunctionContextSetData_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "TakeData", CeedQFunctionContextTakeData_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "GetData", CeedQFunctionContextGetData_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "GetDataRead", CeedQFunctionContextGetData_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "RestoreData", CeedQFunctionContextRestoreData_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "RestoreDataRead", CeedQFunctionContextRestoreData_Ref)); + 
CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "Destroy", CeedQFunctionContextDestroy_Ref)); + + CeedCallBackend(CeedCalloc(1, &impl)); + CeedCallBackend(CeedQFunctionContextSetBackendData(ctx, impl)); return CEED_ERROR_SUCCESS; } diff --git a/backends/ref/ceed-ref-restriction.c b/backends/ref/ceed-ref-restriction.c index 4f564a5b15..21f0201bf2 100644 --- a/backends/ref/ceed-ref-restriction.c +++ b/backends/ref/ceed-ref-restriction.c @@ -5,38 +5,37 @@ // // This file is part of CEED: http://github.com/ceed -#include #include +#include #include #include + #include "ceed-ref.h" //------------------------------------------------------------------------------ // Core ElemRestriction Apply Code //------------------------------------------------------------------------------ -static inline int CeedElemRestrictionApply_Ref_Core(CeedElemRestriction r, - const CeedInt num_comp, const CeedInt blk_size, const CeedInt comp_stride, - CeedInt start, CeedInt stop, CeedTransposeMode t_mode, CeedVector u, - CeedVector v, CeedRequest *request) { - int ierr; +static inline int CeedElemRestrictionApply_Ref_Core(CeedElemRestriction r, const CeedInt num_comp, const CeedInt blk_size, const CeedInt comp_stride, + CeedInt start, CeedInt stop, CeedTransposeMode t_mode, CeedVector u, CeedVector v, + CeedRequest *request) { CeedElemRestriction_Ref *impl; - ierr = CeedElemRestrictionGetData(r, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); const CeedScalar *uu; - CeedScalar *vv; - CeedInt num_elem, elem_size, v_offset; - ierr = CeedElemRestrictionGetNumElements(r, &num_elem); CeedChkBackend(ierr); - ierr = CeedElemRestrictionGetElementSize(r, &elem_size); CeedChkBackend(ierr); - v_offset = start*blk_size*elem_size*num_comp; + CeedScalar *vv; + CeedInt num_elem, elem_size, v_offset; + CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem)); + CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); + v_offset = 
start * blk_size * elem_size * num_comp; bool is_oriented; - ierr = CeedElemRestrictionIsOriented(r, &is_oriented); CeedChkBackend(ierr); - ierr = CeedVectorGetArrayRead(u, CEED_MEM_HOST, &uu); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionIsOriented(r, &is_oriented)); + CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_HOST, &uu)); if (t_mode == CEED_TRANSPOSE) { // Sum into for transpose mode, e-vec to l-vec - ierr = CeedVectorGetArray(v, CEED_MEM_HOST, &vv); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_HOST, &vv)); } else { // Overwrite for notranspose mode, l-vec to e-vec - ierr = CeedVectorGetArrayWrite(v, CEED_MEM_HOST, &vv); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_HOST, &vv)); } // Restriction from L-vector to E-vector // Perform: v = r * u @@ -44,48 +43,47 @@ static inline int CeedElemRestrictionApply_Ref_Core(CeedElemRestriction r, // No offsets provided, Identity Restriction if (!impl->offsets) { bool has_backend_strides; - ierr = CeedElemRestrictionHasBackendStrides(r, &has_backend_strides); - CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionHasBackendStrides(r, &has_backend_strides)); if (has_backend_strides) { // CPU backend strides are {1, elem_size, elem_size*num_comp} // This if branch is left separate to allow better inlining - for (CeedInt e = start*blk_size; e < stop*blk_size; e+=blk_size) - CeedPragmaSIMD - for (CeedInt k = 0; k < num_comp; k++) - CeedPragmaSIMD - for (CeedInt n = 0; n < elem_size; n++) - CeedPragmaSIMD - for (CeedInt j = 0; j < blk_size; j++) - vv[e*elem_size*num_comp + (k*elem_size+n)*blk_size + j - v_offset] - = uu[n + k*elem_size + - CeedIntMin(e+j, num_elem-1)*elem_size*num_comp]; + for (CeedInt e = start * blk_size; e < stop * blk_size; e += blk_size) { + CeedPragmaSIMD for (CeedInt k = 0; k < num_comp; k++) { + CeedPragmaSIMD for (CeedInt n = 0; n < elem_size; n++) { + CeedPragmaSIMD for (CeedInt j = 0; j < blk_size; j++) { + vv[e * 
elem_size * num_comp + (k * elem_size + n) * blk_size + j - v_offset] = + uu[n + k * elem_size + CeedIntMin(e + j, num_elem - 1) * elem_size * num_comp]; + } + } + } + } } else { // User provided strides CeedInt strides[3]; - ierr = CeedElemRestrictionGetStrides(r, &strides); CeedChkBackend(ierr); - for (CeedInt e = start*blk_size; e < stop*blk_size; e+=blk_size) - CeedPragmaSIMD - for (CeedInt k = 0; k < num_comp; k++) - CeedPragmaSIMD - for (CeedInt n = 0; n < elem_size; n++) - CeedPragmaSIMD - for (CeedInt j = 0; j < blk_size; j++) - vv[e*elem_size*num_comp + (k*elem_size+n)*blk_size + j - v_offset] - = uu[n*strides[0] + k*strides[1] + - CeedIntMin(e+j, num_elem-1)*strides[2]]; + CeedCallBackend(CeedElemRestrictionGetStrides(r, &strides)); + for (CeedInt e = start * blk_size; e < stop * blk_size; e += blk_size) { + CeedPragmaSIMD for (CeedInt k = 0; k < num_comp; k++) { + CeedPragmaSIMD for (CeedInt n = 0; n < elem_size; n++) { + CeedPragmaSIMD for (CeedInt j = 0; j < blk_size; j++) { + vv[e * elem_size * num_comp + (k * elem_size + n) * blk_size + j - v_offset] = + uu[n * strides[0] + k * strides[1] + CeedIntMin(e + j, num_elem - 1) * strides[2]]; + } + } + } + } } } else { // Offsets provided, standard or blocked restriction // vv has shape [elem_size, num_comp, num_elem], row-major // uu has shape [nnodes, num_comp] - for (CeedInt e = start*blk_size; e < stop*blk_size; e+=blk_size) - CeedPragmaSIMD - for (CeedInt k = 0; k < num_comp; k++) - CeedPragmaSIMD - for (CeedInt i = 0; i < elem_size*blk_size; i++) - vv[elem_size*(k*blk_size+num_comp*e) + i - v_offset] - = uu[impl->offsets[i+elem_size*e] + k*comp_stride] * - (is_oriented && impl->orient[i+elem_size*e] ? -1. 
: 1.); + for (CeedInt e = start * blk_size; e < stop * blk_size; e += blk_size) { + CeedPragmaSIMD for (CeedInt k = 0; k < num_comp; k++) { + CeedPragmaSIMD for (CeedInt i = 0; i < elem_size * blk_size; i++) { + vv[elem_size * (k * blk_size + num_comp * e) + i - v_offset] = + uu[impl->offsets[i + elem_size * e] + k * comp_stride] * (is_oriented && impl->orient[i + elem_size * e] ? -1. : 1.); + } + } + } } } else { // Restriction from E-vector to L-vector @@ -93,210 +91,169 @@ static inline int CeedElemRestrictionApply_Ref_Core(CeedElemRestriction r, // No offsets provided, Identity Restriction if (!impl->offsets) { bool has_backend_strides; - ierr = CeedElemRestrictionHasBackendStrides(r, &has_backend_strides); - CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionHasBackendStrides(r, &has_backend_strides)); if (has_backend_strides) { // CPU backend strides are {1, elem_size, elem_size*num_comp} // This if brach is left separate to allow better inlining - for (CeedInt e = start*blk_size; e < stop*blk_size; e+=blk_size) - CeedPragmaSIMD - for (CeedInt k = 0; k < num_comp; k++) - CeedPragmaSIMD - for (CeedInt n = 0; n < elem_size; n++) - CeedPragmaSIMD - for (CeedInt j = 0; j < CeedIntMin(blk_size, num_elem-e); j++) - vv[n + k*elem_size + (e+j)*elem_size*num_comp] - += uu[e*elem_size*num_comp + (k*elem_size+n)*blk_size + j - v_offset]; + for (CeedInt e = start * blk_size; e < stop * blk_size; e += blk_size) { + CeedPragmaSIMD for (CeedInt k = 0; k < num_comp; k++) { + CeedPragmaSIMD for (CeedInt n = 0; n < elem_size; n++) { + CeedPragmaSIMD for (CeedInt j = 0; j < CeedIntMin(blk_size, num_elem - e); j++) { + vv[n + k * elem_size + (e + j) * elem_size * num_comp] += + uu[e * elem_size * num_comp + (k * elem_size + n) * blk_size + j - v_offset]; + } + } + } + } } else { // User provided strides CeedInt strides[3]; - ierr = CeedElemRestrictionGetStrides(r, &strides); CeedChkBackend(ierr); - for (CeedInt e = start*blk_size; e < stop*blk_size; e+=blk_size) - 
CeedPragmaSIMD - for (CeedInt k = 0; k < num_comp; k++) - CeedPragmaSIMD - for (CeedInt n = 0; n < elem_size; n++) - CeedPragmaSIMD - for (CeedInt j = 0; j < CeedIntMin(blk_size, num_elem-e); j++) - vv[n*strides[0] + k*strides[1] + (e+j)*strides[2]] - += uu[e*elem_size*num_comp + (k*elem_size+n)*blk_size + j - v_offset]; + CeedCallBackend(CeedElemRestrictionGetStrides(r, &strides)); + for (CeedInt e = start * blk_size; e < stop * blk_size; e += blk_size) { + CeedPragmaSIMD for (CeedInt k = 0; k < num_comp; k++) { + CeedPragmaSIMD for (CeedInt n = 0; n < elem_size; n++) { + CeedPragmaSIMD for (CeedInt j = 0; j < CeedIntMin(blk_size, num_elem - e); j++) { + vv[n * strides[0] + k * strides[1] + (e + j) * strides[2]] += + uu[e * elem_size * num_comp + (k * elem_size + n) * blk_size + j - v_offset]; + } + } + } + } } } else { // Offsets provided, standard or blocked restriction // uu has shape [elem_size, num_comp, num_elem] // vv has shape [nnodes, num_comp] - for (CeedInt e = start*blk_size; e < stop*blk_size; e+=blk_size) - for (CeedInt k = 0; k < num_comp; k++) - for (CeedInt i = 0; i < elem_size*blk_size; i+=blk_size) + for (CeedInt e = start * blk_size; e < stop * blk_size; e += blk_size) { + for (CeedInt k = 0; k < num_comp; k++) { + for (CeedInt i = 0; i < elem_size * blk_size; i += blk_size) { // Iteration bound set to discard padding elements - for (CeedInt j = i; j < i+CeedIntMin(blk_size, num_elem-e); j++) - vv[impl->offsets[j+e*elem_size] + k*comp_stride] - += uu[elem_size*(k*blk_size+num_comp*e) + j - v_offset] * - (is_oriented && impl->orient[j+e*elem_size] ? -1. : 1.); + for (CeedInt j = i; j < i + CeedIntMin(blk_size, num_elem - e); j++) { + vv[impl->offsets[j + e * elem_size] + k * comp_stride] += + uu[elem_size * (k * blk_size + num_comp * e) + j - v_offset] * (is_oriented && impl->orient[j + e * elem_size] ? -1. 
: 1.); + } + } + } + } } } - ierr = CeedVectorRestoreArrayRead(u, &uu); CeedChkBackend(ierr); - ierr = CeedVectorRestoreArray(v, &vv); CeedChkBackend(ierr); - if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED) - *request = NULL; + CeedCallBackend(CeedVectorRestoreArrayRead(u, &uu)); + CeedCallBackend(CeedVectorRestoreArray(v, &vv)); + if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED) *request = NULL; return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // ElemRestriction Apply - Common Sizes //------------------------------------------------------------------------------ -static int CeedElemRestrictionApply_Ref_110(CeedElemRestriction r, - const CeedInt num_comp, const CeedInt blk_size, const CeedInt comp_stride, - CeedInt start, CeedInt stop, CeedTransposeMode t_mode, CeedVector u, - CeedVector v, CeedRequest *request) { - return CeedElemRestrictionApply_Ref_Core(r, 1, 1, comp_stride, start, stop, - t_mode, u, v, request); +static int CeedElemRestrictionApply_Ref_110(CeedElemRestriction r, const CeedInt num_comp, const CeedInt blk_size, const CeedInt comp_stride, + CeedInt start, CeedInt stop, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { + return CeedElemRestrictionApply_Ref_Core(r, 1, 1, comp_stride, start, stop, t_mode, u, v, request); } -static int CeedElemRestrictionApply_Ref_111(CeedElemRestriction r, - const CeedInt num_comp, const CeedInt blk_size, const CeedInt comp_stride, - CeedInt start, CeedInt stop, CeedTransposeMode t_mode, CeedVector u, - CeedVector v, CeedRequest *request) { - return CeedElemRestrictionApply_Ref_Core(r, 1, 1, 1, start, stop, t_mode, - u, v, request); +static int CeedElemRestrictionApply_Ref_111(CeedElemRestriction r, const CeedInt num_comp, const CeedInt blk_size, const CeedInt comp_stride, + CeedInt start, CeedInt stop, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { 
+ return CeedElemRestrictionApply_Ref_Core(r, 1, 1, 1, start, stop, t_mode, u, v, request); } -static int CeedElemRestrictionApply_Ref_180(CeedElemRestriction r, - const CeedInt num_comp, const CeedInt blk_size, const CeedInt comp_stride, - CeedInt start, CeedInt stop, CeedTransposeMode t_mode, CeedVector u, - CeedVector v, CeedRequest *request) { - return CeedElemRestrictionApply_Ref_Core(r, 1, 8, comp_stride, start, stop, - t_mode, u, v, request); +static int CeedElemRestrictionApply_Ref_180(CeedElemRestriction r, const CeedInt num_comp, const CeedInt blk_size, const CeedInt comp_stride, + CeedInt start, CeedInt stop, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { + return CeedElemRestrictionApply_Ref_Core(r, 1, 8, comp_stride, start, stop, t_mode, u, v, request); } -static int CeedElemRestrictionApply_Ref_181(CeedElemRestriction r, - const CeedInt num_comp, const CeedInt blk_size, const CeedInt comp_stride, - CeedInt start, CeedInt stop, CeedTransposeMode t_mode, CeedVector u, - CeedVector v, CeedRequest *request) { - return CeedElemRestrictionApply_Ref_Core(r, 1, 8, 1, start, stop, t_mode, - u, v, request); +static int CeedElemRestrictionApply_Ref_181(CeedElemRestriction r, const CeedInt num_comp, const CeedInt blk_size, const CeedInt comp_stride, + CeedInt start, CeedInt stop, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { + return CeedElemRestrictionApply_Ref_Core(r, 1, 8, 1, start, stop, t_mode, u, v, request); } -static int CeedElemRestrictionApply_Ref_310(CeedElemRestriction r, - const CeedInt num_comp, const CeedInt blk_size, const CeedInt comp_stride, - CeedInt start, CeedInt stop, CeedTransposeMode t_mode, CeedVector u, - CeedVector v, CeedRequest *request) { - return CeedElemRestrictionApply_Ref_Core(r, 3, 1, comp_stride, start, stop, - t_mode, u, v, request); +static int CeedElemRestrictionApply_Ref_310(CeedElemRestriction r, const CeedInt num_comp, const CeedInt blk_size, const CeedInt 
comp_stride, + CeedInt start, CeedInt stop, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { + return CeedElemRestrictionApply_Ref_Core(r, 3, 1, comp_stride, start, stop, t_mode, u, v, request); } -static int CeedElemRestrictionApply_Ref_311(CeedElemRestriction r, - const CeedInt num_comp, const CeedInt blk_size, const CeedInt comp_stride, - CeedInt start, CeedInt stop, CeedTransposeMode t_mode, CeedVector u, - CeedVector v, CeedRequest *request) { - return CeedElemRestrictionApply_Ref_Core(r, 3, 1, 1, start, stop, t_mode, - u, v, request); +static int CeedElemRestrictionApply_Ref_311(CeedElemRestriction r, const CeedInt num_comp, const CeedInt blk_size, const CeedInt comp_stride, + CeedInt start, CeedInt stop, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { + return CeedElemRestrictionApply_Ref_Core(r, 3, 1, 1, start, stop, t_mode, u, v, request); } -static int CeedElemRestrictionApply_Ref_380(CeedElemRestriction r, - const CeedInt num_comp, const CeedInt blk_size, const CeedInt comp_stride, - CeedInt start, CeedInt stop, CeedTransposeMode t_mode, CeedVector u, - CeedVector v, CeedRequest *request) { - return CeedElemRestrictionApply_Ref_Core(r, 3, 8, comp_stride, start, stop, - t_mode, u, v, request); +static int CeedElemRestrictionApply_Ref_380(CeedElemRestriction r, const CeedInt num_comp, const CeedInt blk_size, const CeedInt comp_stride, + CeedInt start, CeedInt stop, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { + return CeedElemRestrictionApply_Ref_Core(r, 3, 8, comp_stride, start, stop, t_mode, u, v, request); } -static int CeedElemRestrictionApply_Ref_381(CeedElemRestriction r, - const CeedInt num_comp, const CeedInt blk_size, const CeedInt comp_stride, - CeedInt start, CeedInt stop, CeedTransposeMode t_mode, CeedVector u, - CeedVector v, CeedRequest *request) { - return CeedElemRestrictionApply_Ref_Core(r, 3, 8, 1, start, stop, t_mode, - u, v, request); +static int 
CeedElemRestrictionApply_Ref_381(CeedElemRestriction r, const CeedInt num_comp, const CeedInt blk_size, const CeedInt comp_stride, + CeedInt start, CeedInt stop, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { + return CeedElemRestrictionApply_Ref_Core(r, 3, 8, 1, start, stop, t_mode, u, v, request); } // LCOV_EXCL_START -static int CeedElemRestrictionApply_Ref_510(CeedElemRestriction r, - const CeedInt num_comp, const CeedInt blk_size, const CeedInt comp_stride, - CeedInt start, CeedInt stop, CeedTransposeMode t_mode, CeedVector u, - CeedVector v, CeedRequest *request) { - return CeedElemRestrictionApply_Ref_Core(r, 5, 1, comp_stride, start, stop, - t_mode, u, v, request); +static int CeedElemRestrictionApply_Ref_510(CeedElemRestriction r, const CeedInt num_comp, const CeedInt blk_size, const CeedInt comp_stride, + CeedInt start, CeedInt stop, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { + return CeedElemRestrictionApply_Ref_Core(r, 5, 1, comp_stride, start, stop, t_mode, u, v, request); } // LCOV_EXCL_STOP -static int CeedElemRestrictionApply_Ref_511(CeedElemRestriction r, - const CeedInt num_comp, const CeedInt blk_size, const CeedInt comp_stride, - CeedInt start, CeedInt stop, CeedTransposeMode t_mode, CeedVector u, - CeedVector v, CeedRequest *request) { - return CeedElemRestrictionApply_Ref_Core(r, 5, 1, 1, start, stop, t_mode, - u, v, request); +static int CeedElemRestrictionApply_Ref_511(CeedElemRestriction r, const CeedInt num_comp, const CeedInt blk_size, const CeedInt comp_stride, + CeedInt start, CeedInt stop, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { + return CeedElemRestrictionApply_Ref_Core(r, 5, 1, 1, start, stop, t_mode, u, v, request); } // LCOV_EXCL_START -static int CeedElemRestrictionApply_Ref_580(CeedElemRestriction r, - const CeedInt num_comp, const CeedInt blk_size, const CeedInt comp_stride, - CeedInt start, CeedInt stop, CeedTransposeMode 
t_mode, CeedVector u, - CeedVector v, CeedRequest *request) { - return CeedElemRestrictionApply_Ref_Core(r, 5, 8, comp_stride, start, stop, - t_mode, u, v, request); +static int CeedElemRestrictionApply_Ref_580(CeedElemRestriction r, const CeedInt num_comp, const CeedInt blk_size, const CeedInt comp_stride, + CeedInt start, CeedInt stop, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { + return CeedElemRestrictionApply_Ref_Core(r, 5, 8, comp_stride, start, stop, t_mode, u, v, request); } // LCOV_EXCL_STOP -static int CeedElemRestrictionApply_Ref_581(CeedElemRestriction r, - const CeedInt num_comp, const CeedInt blk_size, const CeedInt comp_stride, - CeedInt start, CeedInt stop, CeedTransposeMode t_mode, CeedVector u, - CeedVector v, CeedRequest *request) { - return CeedElemRestrictionApply_Ref_Core(r, 5, 8, 1, start, stop, t_mode, - u, v, request); +static int CeedElemRestrictionApply_Ref_581(CeedElemRestriction r, const CeedInt num_comp, const CeedInt blk_size, const CeedInt comp_stride, + CeedInt start, CeedInt stop, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { + return CeedElemRestrictionApply_Ref_Core(r, 5, 8, 1, start, stop, t_mode, u, v, request); } //------------------------------------------------------------------------------ // ElemRestriction Apply //------------------------------------------------------------------------------ -static int CeedElemRestrictionApply_Ref(CeedElemRestriction r, - CeedTransposeMode t_mode, CeedVector u, - CeedVector v, CeedRequest *request) { - int ierr; +static int CeedElemRestrictionApply_Ref(CeedElemRestriction r, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { CeedInt num_blk, blk_size, num_comp, comp_stride; - ierr = CeedElemRestrictionGetNumBlocks(r, &num_blk); CeedChkBackend(ierr); - ierr = CeedElemRestrictionGetBlockSize(r, &blk_size); CeedChkBackend(ierr); - ierr = CeedElemRestrictionGetNumComponents(r, &num_comp); 
CeedChkBackend(ierr); - ierr = CeedElemRestrictionGetCompStride(r, &comp_stride); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetNumBlocks(r, &num_blk)); + CeedCallBackend(CeedElemRestrictionGetBlockSize(r, &blk_size)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); + CeedCallBackend(CeedElemRestrictionGetCompStride(r, &comp_stride)); CeedElemRestriction_Ref *impl; - ierr = CeedElemRestrictionGetData(r, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); - return impl->Apply(r, num_comp, blk_size, comp_stride, 0, num_blk, t_mode, u, v, - request); + return impl->Apply(r, num_comp, blk_size, comp_stride, 0, num_blk, t_mode, u, v, request); } //------------------------------------------------------------------------------ // ElemRestriction Apply Block //------------------------------------------------------------------------------ -static int CeedElemRestrictionApplyBlock_Ref(CeedElemRestriction r, - CeedInt block, CeedTransposeMode t_mode, CeedVector u, CeedVector v, - CeedRequest *request) { - int ierr; +static int CeedElemRestrictionApplyBlock_Ref(CeedElemRestriction r, CeedInt block, CeedTransposeMode t_mode, CeedVector u, CeedVector v, + CeedRequest *request) { CeedInt blk_size, num_comp, comp_stride; - ierr = CeedElemRestrictionGetBlockSize(r, &blk_size); CeedChkBackend(ierr); - ierr = CeedElemRestrictionGetNumComponents(r, &num_comp); CeedChkBackend(ierr); - ierr = CeedElemRestrictionGetCompStride(r, &comp_stride); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetBlockSize(r, &blk_size)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); + CeedCallBackend(CeedElemRestrictionGetCompStride(r, &comp_stride)); CeedElemRestriction_Ref *impl; - ierr = CeedElemRestrictionGetData(r, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); - return impl->Apply(r, num_comp, blk_size, comp_stride, block, block+1, t_mode, - u, v, 
request); + return impl->Apply(r, num_comp, blk_size, comp_stride, block, block + 1, t_mode, u, v, request); } //------------------------------------------------------------------------------ // ElemRestriction Get Offsets //------------------------------------------------------------------------------ -static int CeedElemRestrictionGetOffsets_Ref(CeedElemRestriction rstr, - CeedMemType mem_type, const CeedInt **offsets) { - int ierr; +static int CeedElemRestrictionGetOffsets_Ref(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt **offsets) { CeedElemRestriction_Ref *impl; - ierr = CeedElemRestrictionGetData(rstr, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); Ceed ceed; - ierr = CeedElemRestrictionGetCeed(rstr, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); - if (mem_type != CEED_MEM_HOST) + if (mem_type != CEED_MEM_HOST) { // LCOV_EXCL_START return CeedError(ceed, CEED_ERROR_BACKEND, "Can only provide to HOST memory"); - // LCOV_EXCL_STOP + // LCOV_EXCL_STOP + } *offsets = impl->offsets; return CEED_ERROR_SUCCESS; @@ -306,148 +263,135 @@ static int CeedElemRestrictionGetOffsets_Ref(CeedElemRestriction rstr, // ElemRestriction Destroy //------------------------------------------------------------------------------ static int CeedElemRestrictionDestroy_Ref(CeedElemRestriction r) { - int ierr; CeedElemRestriction_Ref *impl; - ierr = CeedElemRestrictionGetData(r, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); - ierr = CeedFree(&impl->offsets_allocated); CeedChkBackend(ierr); - ierr = CeedFree(&impl->orient_allocated); CeedChkBackend(ierr); - ierr = CeedFree(&impl); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl->offsets_allocated)); + CeedCallBackend(CeedFree(&impl->orient_allocated)); + CeedCallBackend(CeedFree(&impl)); return CEED_ERROR_SUCCESS; } 
//------------------------------------------------------------------------------ // ElemRestriction Create //------------------------------------------------------------------------------ -int CeedElemRestrictionCreate_Ref(CeedMemType mem_type, CeedCopyMode copy_mode, - const CeedInt *offsets, - CeedElemRestriction r) { - int ierr; +int CeedElemRestrictionCreate_Ref(CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *offsets, CeedElemRestriction r) { CeedElemRestriction_Ref *impl; - CeedInt num_elem, elem_size, num_blk, blk_size, num_comp, comp_stride; - ierr = CeedElemRestrictionGetNumElements(r, &num_elem); CeedChkBackend(ierr); - ierr = CeedElemRestrictionGetElementSize(r, &elem_size); CeedChkBackend(ierr); - ierr = CeedElemRestrictionGetNumBlocks(r, &num_blk); CeedChkBackend(ierr); - ierr = CeedElemRestrictionGetBlockSize(r, &blk_size); CeedChkBackend(ierr); - ierr = CeedElemRestrictionGetNumComponents(r, &num_comp); CeedChkBackend(ierr); - ierr = CeedElemRestrictionGetCompStride(r, &comp_stride); CeedChkBackend(ierr); + CeedInt num_elem, elem_size, num_blk, blk_size, num_comp, comp_stride; + CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem)); + CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); + CeedCallBackend(CeedElemRestrictionGetNumBlocks(r, &num_blk)); + CeedCallBackend(CeedElemRestrictionGetBlockSize(r, &blk_size)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); + CeedCallBackend(CeedElemRestrictionGetCompStride(r, &comp_stride)); Ceed ceed; - ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); - if (mem_type != CEED_MEM_HOST) + if (mem_type != CEED_MEM_HOST) { // LCOV_EXCL_START return CeedError(ceed, CEED_ERROR_BACKEND, "Only MemType = HOST supported"); - // LCOV_EXCL_STOP - ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr); + // LCOV_EXCL_STOP + } + CeedCallBackend(CeedCalloc(1, &impl)); // Offsets data bool 
is_strided; - ierr = CeedElemRestrictionIsStrided(r, &is_strided); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionIsStrided(r, &is_strided)); if (!is_strided) { // Check indices for ref or memcheck backends Ceed parent_ceed = ceed, curr_ceed = NULL; while (parent_ceed != curr_ceed) { curr_ceed = parent_ceed; - ierr = CeedGetParent(curr_ceed, &parent_ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedGetParent(curr_ceed, &parent_ceed)); } const char *resource; - ierr = CeedGetResource(parent_ceed, &resource); CeedChkBackend(ierr); - if (!strcmp(resource, "/cpu/self/ref/serial") || - !strcmp(resource, "/cpu/self/ref/blocked") || - !strcmp(resource, "/cpu/self/memcheck/serial") || + CeedCallBackend(CeedGetResource(parent_ceed, &resource)); + if (!strcmp(resource, "/cpu/self/ref/serial") || !strcmp(resource, "/cpu/self/ref/blocked") || !strcmp(resource, "/cpu/self/memcheck/serial") || !strcmp(resource, "/cpu/self/memcheck/blocked")) { CeedSize l_size; - ierr = CeedElemRestrictionGetLVectorSize(r, &l_size); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetLVectorSize(r, &l_size)); - for (CeedInt i = 0; i < num_elem*elem_size; i++) - if (offsets[i] < 0 || l_size <= offsets[i] + (num_comp - 1) * comp_stride) + for (CeedInt i = 0; i < num_elem * elem_size; i++) { + if (offsets[i] < 0 || l_size <= offsets[i] + (num_comp - 1) * comp_stride) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Restriction offset %" CeedInt_FMT " (%" CeedInt_FMT ") out of range " - "[0, %" CeedInt_FMT "]", i, offsets[i], l_size); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_BACKEND, "Restriction offset %" CeedInt_FMT " (%" CeedInt_FMT ") out of range [0, %" CeedInt_FMT "]", i, + offsets[i], l_size); + // LCOV_EXCL_STOP + } + } } // Copy data switch (copy_mode) { - case CEED_COPY_VALUES: - ierr = CeedMalloc(num_elem*elem_size, &impl->offsets_allocated); - CeedChkBackend(ierr); - memcpy(impl->offsets_allocated, offsets, - num_elem * elem_size * 
sizeof(offsets[0])); - impl->offsets = impl->offsets_allocated; - break; - case CEED_OWN_POINTER: - impl->offsets_allocated = (CeedInt *)offsets; - impl->offsets = impl->offsets_allocated; - break; - case CEED_USE_POINTER: - impl->offsets = offsets; + case CEED_COPY_VALUES: + CeedCallBackend(CeedMalloc(num_elem * elem_size, &impl->offsets_allocated)); + memcpy(impl->offsets_allocated, offsets, num_elem * elem_size * sizeof(offsets[0])); + impl->offsets = impl->offsets_allocated; + break; + case CEED_OWN_POINTER: + impl->offsets_allocated = (CeedInt *)offsets; + impl->offsets = impl->offsets_allocated; + break; + case CEED_USE_POINTER: + impl->offsets = offsets; } } - ierr = CeedElemRestrictionSetData(r, impl); CeedChkBackend(ierr); - CeedInt layout[3] = {1, elem_size, elem_size*num_comp}; - ierr = CeedElemRestrictionSetELayout(r, layout); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "Apply", - CeedElemRestrictionApply_Ref); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "ApplyBlock", - CeedElemRestrictionApplyBlock_Ref); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "GetOffsets", - CeedElemRestrictionGetOffsets_Ref); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "Destroy", - CeedElemRestrictionDestroy_Ref); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionSetData(r, impl)); + CeedInt layout[3] = {1, elem_size, elem_size * num_comp}; + CeedCallBackend(CeedElemRestrictionSetELayout(r, layout)); + CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "Apply", CeedElemRestrictionApply_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "ApplyBlock", CeedElemRestrictionApplyBlock_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "GetOffsets", CeedElemRestrictionGetOffsets_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, 
"Destroy", CeedElemRestrictionDestroy_Ref)); // Set apply function based upon num_comp, blk_size, and comp_stride CeedInt idx = -1; - if (blk_size < 10) - idx = 100*num_comp + 10*blk_size + (comp_stride == 1); + if (blk_size < 10) idx = 100 * num_comp + 10 * blk_size + (comp_stride == 1); switch (idx) { - case 110: - impl->Apply = CeedElemRestrictionApply_Ref_110; - break; - case 111: - impl->Apply = CeedElemRestrictionApply_Ref_111; - break; - case 180: - impl->Apply = CeedElemRestrictionApply_Ref_180; - break; - case 181: - impl->Apply = CeedElemRestrictionApply_Ref_181; - break; - case 310: - impl->Apply = CeedElemRestrictionApply_Ref_310; - break; - case 311: - impl->Apply = CeedElemRestrictionApply_Ref_311; - break; - case 380: - impl->Apply = CeedElemRestrictionApply_Ref_380; - break; - case 381: - impl->Apply = CeedElemRestrictionApply_Ref_381; - break; - // LCOV_EXCL_START - case 510: - impl->Apply = CeedElemRestrictionApply_Ref_510; - break; - // LCOV_EXCL_STOP - case 511: - impl->Apply = CeedElemRestrictionApply_Ref_511; - break; - // LCOV_EXCL_START - case 580: - impl->Apply = CeedElemRestrictionApply_Ref_580; - break; - // LCOV_EXCL_STOP - case 581: - impl->Apply = CeedElemRestrictionApply_Ref_581; - break; - default: - impl->Apply = CeedElemRestrictionApply_Ref_Core; - break; + case 110: + impl->Apply = CeedElemRestrictionApply_Ref_110; + break; + case 111: + impl->Apply = CeedElemRestrictionApply_Ref_111; + break; + case 180: + impl->Apply = CeedElemRestrictionApply_Ref_180; + break; + case 181: + impl->Apply = CeedElemRestrictionApply_Ref_181; + break; + case 310: + impl->Apply = CeedElemRestrictionApply_Ref_310; + break; + case 311: + impl->Apply = CeedElemRestrictionApply_Ref_311; + break; + case 380: + impl->Apply = CeedElemRestrictionApply_Ref_380; + break; + case 381: + impl->Apply = CeedElemRestrictionApply_Ref_381; + break; + // LCOV_EXCL_START + case 510: + impl->Apply = CeedElemRestrictionApply_Ref_510; + break; + // LCOV_EXCL_STOP + case 
511: + impl->Apply = CeedElemRestrictionApply_Ref_511; + break; + // LCOV_EXCL_START + case 580: + impl->Apply = CeedElemRestrictionApply_Ref_580; + break; + // LCOV_EXCL_STOP + case 581: + impl->Apply = CeedElemRestrictionApply_Ref_581; + break; + default: + impl->Apply = CeedElemRestrictionApply_Ref_Core; + break; } return CEED_ERROR_SUCCESS; @@ -456,35 +400,29 @@ int CeedElemRestrictionCreate_Ref(CeedMemType mem_type, CeedCopyMode copy_mode, //------------------------------------------------------------------------------ // ElemRestriction Create Oriented //------------------------------------------------------------------------------ -int CeedElemRestrictionCreateOriented_Ref(CeedMemType mem_type, - CeedCopyMode copy_mode, - const CeedInt *offsets, const bool *orient, - CeedElemRestriction r) { - int ierr; +int CeedElemRestrictionCreateOriented_Ref(CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *offsets, const bool *orient, + CeedElemRestriction r) { CeedElemRestriction_Ref *impl; - CeedInt num_elem, elem_size; + CeedInt num_elem, elem_size; // Set up for normal restriction with explicit offsets. This sets up dispatch to // CeedElemRestrictionApply_Ref_* and manages the impl->offsets array copy/allocation. 
- ierr = CeedElemRestrictionCreate_Ref(mem_type, copy_mode, offsets, r); - CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionCreate_Ref(mem_type, copy_mode, offsets, r)); - ierr = CeedElemRestrictionGetData(r, &impl); CeedChkBackend(ierr); - ierr = CeedElemRestrictionGetNumElements(r, &num_elem); CeedChkBackend(ierr); - ierr = CeedElemRestrictionGetElementSize(r, &elem_size); CeedChkBackend(ierr); + CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); + CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem)); + CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); switch (copy_mode) { - case CEED_COPY_VALUES: - ierr = CeedMalloc(num_elem * elem_size, &impl->orient_allocated); - CeedChkBackend(ierr); - memcpy(impl->orient_allocated, orient, - num_elem * elem_size * sizeof(orient[0])); - impl->orient = impl->orient_allocated; - break; - case CEED_OWN_POINTER: - impl->orient_allocated = (bool *)orient; - impl->orient = impl->orient_allocated; - break; - case CEED_USE_POINTER: - impl->orient = orient; + case CEED_COPY_VALUES: + CeedCallBackend(CeedMalloc(num_elem * elem_size, &impl->orient_allocated)); + memcpy(impl->orient_allocated, orient, num_elem * elem_size * sizeof(orient[0])); + impl->orient = impl->orient_allocated; + break; + case CEED_OWN_POINTER: + impl->orient_allocated = (bool *)orient; + impl->orient = impl->orient_allocated; + break; + case CEED_USE_POINTER: + impl->orient = orient; } return CEED_ERROR_SUCCESS; } diff --git a/backends/ref/ceed-ref-tensor.c b/backends/ref/ceed-ref-tensor.c index f60b7cba4c..bc879da82d 100644 --- a/backends/ref/ceed-ref-tensor.c +++ b/backends/ref/ceed-ref-tensor.c @@ -5,57 +5,51 @@ // // This file is part of CEED: http://github.com/ceed -#include #include +#include + #include "ceed-ref.h" //------------------------------------------------------------------------------ // Tensor Contract Apply //------------------------------------------------------------------------------ -static int 
CeedTensorContractApply_Ref(CeedTensorContract contract, CeedInt A, - CeedInt B, CeedInt C, CeedInt J, - const CeedScalar *restrict t, - CeedTransposeMode t_mode, const CeedInt add, - const CeedScalar *restrict u, - CeedScalar *restrict v) { +static int CeedTensorContractApply_Ref(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const CeedScalar *restrict t, + CeedTransposeMode t_mode, const CeedInt add, const CeedScalar *restrict u, CeedScalar *restrict v) { CeedInt t_stride_0 = B, t_stride_1 = 1; if (t_mode == CEED_TRANSPOSE) { - t_stride_0 = 1; t_stride_1 = J; + t_stride_0 = 1; + t_stride_1 = J; } - if (!add) - for (CeedInt q=0; q #include +#include #include + #include "ceed-ref.h" //------------------------------------------------------------------------------ // Has Valid Array //------------------------------------------------------------------------------ static int CeedVectorHasValidArray_Ref(CeedVector vec, bool *has_valid_array) { - int ierr; CeedVector_Ref *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(vec, &impl)); *has_valid_array = !!impl->array; @@ -26,24 +26,21 @@ static int CeedVectorHasValidArray_Ref(CeedVector vec, bool *has_valid_array) { //------------------------------------------------------------------------------ // Check if has borrowed array of given type //------------------------------------------------------------------------------ -static inline int CeedVectorHasBorrowedArrayOfType_Ref(const CeedVector vec, - CeedMemType mem_type, bool *has_borrowed_array_of_type) { - int ierr; +static inline int CeedVectorHasBorrowedArrayOfType_Ref(const CeedVector vec, CeedMemType mem_type, bool *has_borrowed_array_of_type) { CeedVector_Ref *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(vec, &impl)); Ceed ceed; - ierr = CeedVectorGetCeed(vec, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetCeed(vec, 
&ceed)); switch (mem_type) { - case CEED_MEM_HOST: - *has_borrowed_array_of_type = !!impl->array_borrowed; - break; - default: - // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Can only set HOST memory for this backend"); - // LCOV_EXCL_STOP - break; + case CEED_MEM_HOST: + *has_borrowed_array_of_type = !!impl->array_borrowed; + break; + default: + // LCOV_EXCL_START + return CeedError(ceed, CEED_ERROR_BACKEND, "Can only set HOST memory for this backend"); + // LCOV_EXCL_STOP + break; } return CEED_ERROR_SUCCESS; @@ -52,42 +49,39 @@ static inline int CeedVectorHasBorrowedArrayOfType_Ref(const CeedVector vec, //------------------------------------------------------------------------------ // Vector Set Array //------------------------------------------------------------------------------ -static int CeedVectorSetArray_Ref(CeedVector vec, CeedMemType mem_type, - CeedCopyMode copy_mode, CeedScalar *array) { - int ierr; +static int CeedVectorSetArray_Ref(CeedVector vec, CeedMemType mem_type, CeedCopyMode copy_mode, CeedScalar *array) { CeedVector_Ref *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedSize length; - ierr = CeedVectorGetLength(vec, &length); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetLength(vec, &length)); Ceed ceed; - ierr = CeedVectorGetCeed(vec, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); - if (mem_type != CEED_MEM_HOST) + if (mem_type != CEED_MEM_HOST) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Can only set HOST memory for this backend"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_BACKEND, "Can only set HOST memory for this backend"); + // LCOV_EXCL_STOP + } switch (copy_mode) { - case CEED_COPY_VALUES: - if (!impl->array_owned) { - ierr = CeedCalloc(length, &impl->array_owned); CeedChkBackend(ierr); - } - impl->array_borrowed = NULL; - impl->array = impl->array_owned; - if 
(array) - memcpy(impl->array, array, length * sizeof(array[0])); - break; - case CEED_OWN_POINTER: - ierr = CeedFree(&impl->array_owned); CeedChkBackend(ierr); - impl->array_owned = array; - impl->array_borrowed = NULL; - impl->array = array; - break; - case CEED_USE_POINTER: - ierr = CeedFree(&impl->array_owned); CeedChkBackend(ierr); - impl->array_borrowed = array; - impl->array = array; + case CEED_COPY_VALUES: + if (!impl->array_owned) { + CeedCallBackend(CeedCalloc(length, &impl->array_owned)); + } + impl->array_borrowed = NULL; + impl->array = impl->array_owned; + if (array) memcpy(impl->array, array, length * sizeof(array[0])); + break; + case CEED_OWN_POINTER: + CeedCallBackend(CeedFree(&impl->array_owned)); + impl->array_owned = array; + impl->array_borrowed = NULL; + impl->array = array; + break; + case CEED_USE_POINTER: + CeedCallBackend(CeedFree(&impl->array_owned)); + impl->array_borrowed = array; + impl->array = array; } return CEED_ERROR_SUCCESS; } @@ -95,17 +89,15 @@ static int CeedVectorSetArray_Ref(CeedVector vec, CeedMemType mem_type, //------------------------------------------------------------------------------ // Vector Take Array //------------------------------------------------------------------------------ -static int CeedVectorTakeArray_Ref(CeedVector vec, CeedMemType mem_type, - CeedScalar **array) { - int ierr; +static int CeedVectorTakeArray_Ref(CeedVector vec, CeedMemType mem_type, CeedScalar **array) { CeedVector_Ref *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(vec, &impl)); Ceed ceed; - ierr = CeedVectorGetCeed(vec, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); - (*array) = impl->array_borrowed; + (*array) = impl->array_borrowed; impl->array_borrowed = NULL; - impl->array = NULL; + impl->array = NULL; return CEED_ERROR_SUCCESS; } @@ -113,19 +105,17 @@ static int CeedVectorTakeArray_Ref(CeedVector vec, CeedMemType mem_type, 
//------------------------------------------------------------------------------ // Vector Get Array //------------------------------------------------------------------------------ -static int CeedVectorGetArrayCore_Ref(CeedVector vec, CeedMemType mem_type, - CeedScalar **array) { - int ierr; +static int CeedVectorGetArrayCore_Ref(CeedVector vec, CeedMemType mem_type, CeedScalar **array) { CeedVector_Ref *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(vec, &impl)); Ceed ceed; - ierr = CeedVectorGetCeed(vec, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); - if (mem_type != CEED_MEM_HOST) + if (mem_type != CEED_MEM_HOST) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Can only provide HOST memory for this backend"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend"); + // LCOV_EXCL_STOP + } *array = impl->array; @@ -135,39 +125,32 @@ static int CeedVectorGetArrayCore_Ref(CeedVector vec, CeedMemType mem_type, //------------------------------------------------------------------------------ // Vector Get Array Read //------------------------------------------------------------------------------ -static int CeedVectorGetArrayRead_Ref(CeedVector vec, CeedMemType mem_type, - const CeedScalar **array) { +static int CeedVectorGetArrayRead_Ref(CeedVector vec, CeedMemType mem_type, const CeedScalar **array) { return CeedVectorGetArrayCore_Ref(vec, mem_type, (CeedScalar **)array); } //------------------------------------------------------------------------------ // Vector Get Array //------------------------------------------------------------------------------ -static int CeedVectorGetArray_Ref(CeedVector vec, CeedMemType mem_type, - CeedScalar **array) { +static int CeedVectorGetArray_Ref(CeedVector vec, CeedMemType mem_type, CeedScalar **array) { return CeedVectorGetArrayCore_Ref(vec, mem_type, array); } 
//------------------------------------------------------------------------------ // Vector Get Array Write //------------------------------------------------------------------------------ -static int CeedVectorGetArrayWrite_Ref(CeedVector vec, CeedMemType mem_type, - const CeedScalar **array) { - int ierr; +static int CeedVectorGetArrayWrite_Ref(CeedVector vec, CeedMemType mem_type, const CeedScalar **array) { CeedVector_Ref *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(vec, &impl)); if (!impl->array) { if (!impl->array_owned && !impl->array_borrowed) { // Allocate if array is not yet allocated - ierr = CeedVectorSetArray(vec, CEED_MEM_HOST, CEED_COPY_VALUES, NULL); - CeedChkBackend(ierr); + CeedCallBackend(CeedVectorSetArray(vec, CEED_MEM_HOST, CEED_COPY_VALUES, NULL)); } else { // Select dirty array for GetArrayWrite - if (impl->array_borrowed) - impl->array = impl->array_borrowed; - else - impl->array = impl->array_owned; + if (impl->array_borrowed) impl->array = impl->array_borrowed; + else impl->array = impl->array_owned; } } @@ -177,24 +160,19 @@ static int CeedVectorGetArrayWrite_Ref(CeedVector vec, CeedMemType mem_type, //------------------------------------------------------------------------------ // Vector Restore Array //------------------------------------------------------------------------------ -static int CeedVectorRestoreArray_Ref(CeedVector vec) { - return CEED_ERROR_SUCCESS; -} +static int CeedVectorRestoreArray_Ref(CeedVector vec) { return CEED_ERROR_SUCCESS; } -static int CeedVectorRestoreArrayRead_Ref(CeedVector vec) { - return CEED_ERROR_SUCCESS; -} +static int CeedVectorRestoreArrayRead_Ref(CeedVector vec) { return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Vector Destroy //------------------------------------------------------------------------------ static int CeedVectorDestroy_Ref(CeedVector vec) { - int ierr; 
CeedVector_Ref *impl; - ierr = CeedVectorGetData(vec, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedVectorGetData(vec, &impl)); - ierr = CeedFree(&impl->array_owned); CeedChkBackend(ierr); - ierr = CeedFree(&impl); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl->array_owned)); + CeedCallBackend(CeedFree(&impl)); return CEED_ERROR_SUCCESS; } @@ -202,35 +180,23 @@ static int CeedVectorDestroy_Ref(CeedVector vec) { // Vector Create //------------------------------------------------------------------------------ int CeedVectorCreate_Ref(CeedSize n, CeedVector vec) { - int ierr; CeedVector_Ref *impl; - Ceed ceed; - ierr = CeedVectorGetCeed(vec, &ceed); CeedChkBackend(ierr); - - ierr = CeedSetBackendFunction(ceed, "Vector", vec, "HasValidArray", - CeedVectorHasValidArray_Ref); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Vector", vec, "HasBorrowedArrayOfType", - CeedVectorHasBorrowedArrayOfType_Ref); - CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Vector", vec, "SetArray", - CeedVectorSetArray_Ref); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Vector", vec, "TakeArray", - CeedVectorTakeArray_Ref); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Vector", vec, "GetArray", - CeedVectorGetArray_Ref); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayRead", - CeedVectorGetArrayRead_Ref); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayWrite", - CeedVectorGetArrayWrite_Ref); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Vector", vec, "RestoreArray", - CeedVectorRestoreArray_Ref); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Vector", vec, "RestoreArrayRead", - CeedVectorRestoreArrayRead_Ref); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Vector", vec, "Destroy", - CeedVectorDestroy_Ref); CeedChkBackend(ierr); - - ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr); - ierr = 
CeedVectorSetData(vec, impl); CeedChkBackend(ierr); + Ceed ceed; + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasValidArray", CeedVectorHasValidArray_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasBorrowedArrayOfType", CeedVectorHasBorrowedArrayOfType_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetArray", CeedVectorSetArray_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "TakeArray", CeedVectorTakeArray_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArray", CeedVectorGetArray_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayRead", CeedVectorGetArrayRead_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayWrite", CeedVectorGetArrayWrite_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "RestoreArray", CeedVectorRestoreArray_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "RestoreArrayRead", CeedVectorRestoreArrayRead_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Destroy", CeedVectorDestroy_Ref)); + + CeedCallBackend(CeedCalloc(1, &impl)); + CeedCallBackend(CeedVectorSetData(vec, impl)); return CEED_ERROR_SUCCESS; } diff --git a/backends/ref/ceed-ref.c b/backends/ref/ceed-ref.c index aec9714b05..fec790ea9a 100644 --- a/backends/ref/ceed-ref.c +++ b/backends/ref/ceed-ref.c @@ -5,48 +5,34 @@ // // This file is part of CEED: http://github.com/ceed -#include +#include "ceed-ref.h" + #include +#include #include -#include "ceed-ref.h" //------------------------------------------------------------------------------ // Backend Init //------------------------------------------------------------------------------ static int CeedInit_Ref(const char *resource, Ceed ceed) { - int ierr; - if (strcmp(resource, "/cpu/self") && strcmp(resource, "/cpu/self/ref") - && strcmp(resource, 
"/cpu/self/ref/serial")) + if (strcmp(resource, "/cpu/self") && strcmp(resource, "/cpu/self/ref") && strcmp(resource, "/cpu/self/ref/serial")) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "Ref backend cannot use resource: %s", resource); - // LCOV_EXCL_STOP - ierr = CeedSetDeterministic(ceed, true); CeedChkBackend(ierr); + return CeedError(ceed, CEED_ERROR_BACKEND, "Ref backend cannot use resource: %s", resource); + // LCOV_EXCL_STOP + } + CeedCallBackend(CeedSetDeterministic(ceed, true)); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "VectorCreate", - CeedVectorCreate_Ref); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateTensorH1", - CeedBasisCreateTensorH1_Ref); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateH1", - CeedBasisCreateH1_Ref); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateHdiv", - CeedBasisCreateHdiv_Ref); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", - CeedTensorContractCreate_Ref); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "ElemRestrictionCreate", - CeedElemRestrictionCreate_Ref); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, - "ElemRestrictionCreateOriented", - CeedElemRestrictionCreateOriented_Ref); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, - "ElemRestrictionCreateBlocked", - CeedElemRestrictionCreate_Ref); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", - CeedQFunctionCreate_Ref); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionContextCreate", - CeedQFunctionContextCreate_Ref); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", - CeedOperatorCreate_Ref); CeedChkBackend(ierr); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, 
"VectorCreate", CeedVectorCreate_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateTensorH1", CeedBasisCreateTensorH1_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateH1", CeedBasisCreateH1_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateHdiv", CeedBasisCreateHdiv_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "ElemRestrictionCreate", CeedElemRestrictionCreate_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "ElemRestrictionCreateOriented", CeedElemRestrictionCreateOriented_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "ElemRestrictionCreateBlocked", CeedElemRestrictionCreate_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionContextCreate", CeedQFunctionContextCreate_Ref)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Ref)); return CEED_ERROR_SUCCESS; } @@ -55,8 +41,8 @@ static int CeedInit_Ref(const char *resource, Ceed ceed) { //------------------------------------------------------------------------------ CEED_INTERN int CeedRegister_Ref(void) { return -//! [Register] - CeedRegister("/cpu/self/ref/serial", CeedInit_Ref, 50); -//! [Register] + //! [Register] + CeedRegister("/cpu/self/ref/serial", CeedInit_Ref, 50); + //! 
[Register] } //------------------------------------------------------------------------------ diff --git a/backends/ref/ceed-ref.h b/backends/ref/ceed-ref.h index 07685c8de4..652cd6c273 100644 --- a/backends/ref/ceed-ref.h +++ b/backends/ref/ceed-ref.h @@ -8,14 +8,14 @@ #ifndef _ceed_ref_h #define _ceed_ref_h -#include #include +#include #include #include typedef struct { CeedScalar *collo_grad_1d; - bool has_collo_interp; + bool has_collo_interp; } CeedBasis_Ref; typedef struct { @@ -26,18 +26,17 @@ typedef struct { typedef struct { const CeedInt *offsets; - CeedInt *offsets_allocated; + CeedInt *offsets_allocated; // Orientation, if it exists, is true when the face must be flipped (multiplies by -1.). const bool *orient; - bool *orient_allocated; - int (*Apply)(CeedElemRestriction, const CeedInt, const CeedInt, - const CeedInt, CeedInt, CeedInt, CeedTransposeMode, CeedVector, - CeedVector, CeedRequest *); + bool *orient_allocated; + int (*Apply)(CeedElemRestriction, const CeedInt, const CeedInt, const CeedInt, CeedInt, CeedInt, CeedTransposeMode, CeedVector, CeedVector, + CeedRequest *); } CeedElemRestriction_Ref; typedef struct { const CeedScalar **inputs; - CeedScalar **outputs; + CeedScalar **outputs; } CeedQFunction_Ref; typedef struct { @@ -47,49 +46,32 @@ typedef struct { } CeedQFunctionContext_Ref; typedef struct { - bool is_identity_qf, is_identity_restr_op; - CeedVector *e_vecs_full; /* Full E-vectors, inputs followed by outputs */ - uint64_t *input_states; /* State counter of inputs */ - CeedVector *e_vecs_in; /* Single element input E-vectors */ - CeedVector *e_vecs_out; /* Single element output E-vectors */ - CeedVector *q_vecs_in; /* Single element input Q-vectors */ - CeedVector *q_vecs_out; /* Single element output Q-vectors */ - CeedInt num_inputs, num_outputs; - CeedInt num_active_in, num_active_out; + bool is_identity_qf, is_identity_restr_op; + CeedVector *e_vecs_full; /* Full E-vectors, inputs followed by outputs */ + uint64_t *input_states; 
/* State counter of inputs */ + CeedVector *e_vecs_in; /* Single element input E-vectors */ + CeedVector *e_vecs_out; /* Single element output E-vectors */ + CeedVector *q_vecs_in; /* Single element input Q-vectors */ + CeedVector *q_vecs_out; /* Single element output Q-vectors */ + CeedInt num_inputs, num_outputs; + CeedInt num_active_in, num_active_out; CeedVector *qf_active_in; } CeedOperator_Ref; CEED_INTERN int CeedVectorCreate_Ref(CeedSize n, CeedVector vec); -CEED_INTERN int CeedElemRestrictionCreate_Ref(CeedMemType mem_type, - CeedCopyMode copy_mode, const CeedInt *indices, CeedElemRestriction r); - -CEED_INTERN int CeedElemRestrictionCreateOriented_Ref(CeedMemType mem_type, - CeedCopyMode copy_mode, const CeedInt *indices, - const bool *orient, CeedElemRestriction r); - -CEED_INTERN int CeedBasisCreateTensorH1_Ref(CeedInt dim, CeedInt P_1d, - CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, - const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis); - -CEED_INTERN int CeedBasisCreateH1_Ref(CeedElemTopology topo, - CeedInt dim, CeedInt num_dof, CeedInt num_qpts, - const CeedScalar *interp, - const CeedScalar *grad, - const CeedScalar *q_ref, - const CeedScalar *q_weight, - CeedBasis basis); +CEED_INTERN int CeedElemRestrictionCreate_Ref(CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *offsets, CeedElemRestriction r); +CEED_INTERN int CeedElemRestrictionCreateOriented_Ref(CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *offsets, const bool *orient, + CeedElemRestriction r); -CEED_INTERN int CeedBasisCreateHdiv_Ref(CeedElemTopology topo, - CeedInt dim, CeedInt num_dof, CeedInt num_qpts, - const CeedScalar *interp, - const CeedScalar *div, - const CeedScalar *q_ref, - const CeedScalar *q_weight, - CeedBasis basis); +CEED_INTERN int CeedBasisCreateTensorH1_Ref(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, + const CeedScalar *q_ref_1d, const 
CeedScalar *q_weight_1d, CeedBasis basis); +CEED_INTERN int CeedBasisCreateH1_Ref(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, + const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis); +CEED_INTERN int CeedBasisCreateHdiv_Ref(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, + const CeedScalar *div, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis); -CEED_INTERN int CeedTensorContractCreate_Ref(CeedBasis basis, - CeedTensorContract contract); +CEED_INTERN int CeedTensorContractCreate_Ref(CeedBasis basis, CeedTensorContract contract); CEED_INTERN int CeedQFunctionCreate_Ref(CeedQFunction qf); @@ -97,4 +79,4 @@ CEED_INTERN int CeedQFunctionContextCreate_Ref(CeedQFunctionContext ctx); CEED_INTERN int CeedOperatorCreate_Ref(CeedOperator op); -#endif // _ceed_ref_h +#endif // _ceed_ref_h diff --git a/backends/xsmm/ceed-xsmm-blocked.c b/backends/xsmm/ceed-xsmm-blocked.c index 2ee878e361..f611ec22e9 100644 --- a/backends/xsmm/ceed-xsmm-blocked.c +++ b/backends/xsmm/ceed-xsmm-blocked.c @@ -5,40 +5,34 @@ // // This file is part of CEED: http://github.com/ceed -#include #include +#include #include #include + #include "ceed-xsmm.h" //------------------------------------------------------------------------------ // Backend Init //------------------------------------------------------------------------------ static int CeedInit_Xsmm_Blocked(const char *resource, Ceed ceed) { - int ierr; - if (strcmp(resource, "/cpu/self") && strcmp(resource, "/cpu/self/xsmm") && - strcmp(resource, "/cpu/self/xsmm/blocked")) + if (strcmp(resource, "/cpu/self") && strcmp(resource, "/cpu/self/xsmm") && strcmp(resource, "/cpu/self/xsmm/blocked")) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "blocked libXSMM backend cannot use resource: %s", - resource); - // LCOV_EXCL_STOP - ierr = CeedSetDeterministic(ceed, true); 
CeedChkBackend(ierr); + return CeedError(ceed, CEED_ERROR_BACKEND, "blocked libXSMM backend cannot use resource: %s", resource); + // LCOV_EXCL_STOP + } + CeedCallBackend(CeedSetDeterministic(ceed, true)); // Create reference CEED that implementation will be dispatched // through unless overridden Ceed ceed_ref; - CeedInit("/cpu/self/opt/blocked", &ceed_ref); - ierr = CeedSetDelegate(ceed, ceed_ref); CeedChkBackend(ierr); + CeedCallBackend(CeedInit("/cpu/self/opt/blocked", &ceed_ref)); + CeedCallBackend(CeedSetDelegate(ceed, ceed_ref)); if (CEED_SCALAR_TYPE == CEED_SCALAR_FP64) { - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", - CeedTensorContractCreate_f64_Xsmm); - CeedChkBackend(ierr); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_f64_Xsmm)); } else { - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", - CeedTensorContractCreate_f32_Xsmm); - CeedChkBackend(ierr); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_f32_Xsmm)); } return CEED_ERROR_SUCCESS; @@ -47,7 +41,5 @@ static int CeedInit_Xsmm_Blocked(const char *resource, Ceed ceed) { //------------------------------------------------------------------------------ // Backend Register //------------------------------------------------------------------------------ -CEED_INTERN int CeedRegister_Xsmm_Blocked(void) { - return CeedRegister("/cpu/self/xsmm/blocked", CeedInit_Xsmm_Blocked, 20); -} +CEED_INTERN int CeedRegister_Xsmm_Blocked(void) { return CeedRegister("/cpu/self/xsmm/blocked", CeedInit_Xsmm_Blocked, 20); } //------------------------------------------------------------------------------ diff --git a/backends/xsmm/ceed-xsmm-serial.c b/backends/xsmm/ceed-xsmm-serial.c index 7f58a29c04..a0fc394420 100644 --- a/backends/xsmm/ceed-xsmm-serial.c +++ b/backends/xsmm/ceed-xsmm-serial.c @@ -5,40 +5,34 @@ // // This file is part of CEED: 
http://github.com/ceed -#include #include +#include #include #include + #include "ceed-xsmm.h" //------------------------------------------------------------------------------ // Backend Init //------------------------------------------------------------------------------ static int CeedInit_Xsmm_Serial(const char *resource, Ceed ceed) { - int ierr; - if (strcmp(resource, "/cpu/self") - && strcmp(resource, "/cpu/self/xsmm/serial")) + if (strcmp(resource, "/cpu/self") && strcmp(resource, "/cpu/self/xsmm/serial")) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_BACKEND, - "serial libXSMM backend cannot use resource: %s", - resource); - // LCOV_EXCL_STOP - ierr = CeedSetDeterministic(ceed, true); CeedChkBackend(ierr); + return CeedError(ceed, CEED_ERROR_BACKEND, "serial libXSMM backend cannot use resource: %s", resource); + // LCOV_EXCL_STOP + } + CeedCallBackend(CeedSetDeterministic(ceed, true)); // Create reference CEED that implementation will be dispatched // through unless overridden Ceed ceed_ref; - CeedInit("/cpu/self/opt/serial", &ceed_ref); - ierr = CeedSetDelegate(ceed, ceed_ref); CeedChkBackend(ierr); + CeedCallBackend(CeedInit("/cpu/self/opt/serial", &ceed_ref)); + CeedCallBackend(CeedSetDelegate(ceed, ceed_ref)); if (CEED_SCALAR_TYPE == CEED_SCALAR_FP64) { - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", - CeedTensorContractCreate_f64_Xsmm); - CeedChkBackend(ierr); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_f64_Xsmm)); } else { - ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", - CeedTensorContractCreate_f32_Xsmm); - CeedChkBackend(ierr); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_f32_Xsmm)); } return CEED_ERROR_SUCCESS; @@ -47,7 +41,5 @@ static int CeedInit_Xsmm_Serial(const char *resource, Ceed ceed) { 
//------------------------------------------------------------------------------ // Backend Register //------------------------------------------------------------------------------ -CEED_INTERN int CeedRegister_Xsmm_Serial(void) { - return CeedRegister("/cpu/self/xsmm/serial", CeedInit_Xsmm_Serial, 25); -} +CEED_INTERN int CeedRegister_Xsmm_Serial(void) { return CeedRegister("/cpu/self/xsmm/serial", CeedInit_Xsmm_Serial, 25); } //------------------------------------------------------------------------------ diff --git a/backends/xsmm/ceed-xsmm-tensor-f32.c b/backends/xsmm/ceed-xsmm-tensor-f32.c index f00666f527..fa5b7328e2 100644 --- a/backends/xsmm/ceed-xsmm-tensor-f32.c +++ b/backends/xsmm/ceed-xsmm-tensor-f32.c @@ -5,37 +5,28 @@ // // This file is part of CEED: http://github.com/ceed -#include #include +#include #include #include #include #include + #include "ceed-xsmm.h" //------------------------------------------------------------------------------ // Tensor Contract C=1 //------------------------------------------------------------------------------ -static int CeedTensorContract_Xsmm_C1(CeedTensorContract contract, - CeedInt A, CeedInt B, CeedInt C, - CeedInt J, const float *restrict t, - CeedTransposeMode t_mode, - const CeedInt add, - const float *restrict u, - float *restrict v) { +static int CeedTensorContract_Xsmm_C1(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t, + CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v) { float alpha = 1.0, beta = 1.0; - char trans_u = 'N', trans_t = 'N'; - if ((t_mode == CEED_TRANSPOSE && C != 1) || - (t_mode == CEED_NOTRANSPOSE && C == 1)) - trans_t = 'T'; + char trans_u = 'N', trans_t = 'N'; + if ((t_mode == CEED_TRANSPOSE && C != 1) || (t_mode == CEED_NOTRANSPOSE && C == 1)) trans_t = 'T'; - if (!add) - beta = 0.0; + if (!add) beta = 0.0; // libXSMM GEMM - libxsmm_sgemm(&trans_t, &trans_u, &J, &A, &B, - &alpha, &t[0], NULL, &u[0], 
NULL, - &beta, &v[0], NULL); + libxsmm_sgemm(&trans_t, &trans_u, &J, &A, &B, &alpha, &t[0], NULL, &u[0], NULL, &beta, &v[0], NULL); return CEED_ERROR_SUCCESS; } @@ -43,29 +34,23 @@ static int CeedTensorContract_Xsmm_C1(CeedTensorContract contract, //------------------------------------------------------------------------------ // Tensor Contract Apply //------------------------------------------------------------------------------ -static int CeedTensorContractApply_Xsmm(CeedTensorContract contract, CeedInt A, - CeedInt B, CeedInt C, CeedInt J, - const float *restrict t, - CeedTransposeMode t_mode, - const CeedInt add, - const float *restrict u, - float *restrict v) { - int ierr; +static int CeedTensorContractApply_Xsmm(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t, + CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v) { CeedTensorContract_Xsmm *impl; - ierr = CeedTensorContractGetData(contract, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedTensorContractGetData(contract, &impl)); // Get kernel libxsmm_smmfunction kernel; - CeedHashIJKLMKey key = {B, C, J, t_mode, add}; - khint_t k = kh_get(f32, impl->lookup_f32, key); + CeedHashIJKLMKey key = {B, C, J, t_mode, add}; + khint_t k = kh_get(f32, impl->lookup_f32, key); CeedHashGetValue(impl->lookup_f32, k, kernel); // Run kernel or fallback to default implementation - if (C != 1) - for (CeedInt a=0; alookup_f32, kernel, libxsmm_release_kernel(&kernel)); kh_destroy(f32, impl->lookup_f32); - ierr = CeedFree(&impl); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Tensor Contract Create //------------------------------------------------------------------------------ -int CeedTensorContractCreate_f32_Xsmm(CeedBasis basis, - CeedTensorContract contract) { - int ierr; +int 
CeedTensorContractCreate_f32_Xsmm(CeedBasis basis, CeedTensorContract contract) { Ceed ceed; - ierr = CeedTensorContractGetCeed(contract, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedTensorContractGetCeed(contract, &ceed)); CeedTensorContract_Xsmm *impl; - ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(1, &impl)); // Setup kernels hash table impl->lookup_f32 = kh_init(f32); // Set up pointers to kernels - ierr = CeedBasisIsTensor(basis, &impl->is_tensor); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisIsTensor(basis, &impl->is_tensor)); if (impl->is_tensor) { - ierr = CeedBasisGetNumNodes1D(basis, &impl->P); CeedChkBackend(ierr); - ierr = CeedBasisGetNumQuadraturePoints1D(basis, &impl->Q); CeedChkBackend(ierr); - ierr = CeedBasisGetDimension(basis, &impl->dim); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &impl->P)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &impl->Q)); + CeedCallBackend(CeedBasisGetDimension(basis, &impl->dim)); // Build all required kernels - for (CeedInt num_elem = 1; num_elem <= 8; num_elem+=7) - for (CeedInt add = 0; add <= 1; add++) - for (CeedInt t_mode = 0; t_mode <= 1; t_mode++) - for (CeedInt grad = 0; grad <=1; grad++) + for (CeedInt num_elem = 1; num_elem <= 8; num_elem += 7) { + for (CeedInt add = 0; add <= 1; add++) { + for (CeedInt t_mode = 0; t_mode <= 1; t_mode++) { + for (CeedInt grad = 0; grad <= 1; grad++) { for (CeedInt dim = 0; dim < impl->dim; dim++) { const int flags = LIBXSMM_GEMM_FLAGS('N', t_mode ? 'T' : 'N'); - CeedInt B = grad ? impl->Q : (t_mode ? impl->Q : impl->P), - J = grad ? impl->Q : (t_mode ? impl->P : impl->Q), - C = num_elem*CeedIntPow(J, dim); + CeedInt B = grad ? impl->Q : (t_mode ? impl->Q : impl->P), J = grad ? impl->Q : (t_mode ? 
impl->P : impl->Q), + C = num_elem * CeedIntPow(J, dim); // Add key, kernel pair to hash table CeedHashIJKLMKey key = {B, C, J, t_mode, add}; - int new_item; - khint_t k = kh_put(f32, impl->lookup_f32, key, &new_item); + int new_item; + khint_t k = kh_put(f32, impl->lookup_f32, key, &new_item); if (new_item) { // Build kernel float alpha = 1.0, beta = 1.0; if (!add) beta = 0.0; - libxsmm_smmfunction kernel = libxsmm_smmdispatch( - C, J, B, NULL, NULL, NULL, &alpha, &beta, &flags, NULL); + libxsmm_smmfunction kernel = libxsmm_smmdispatch(C, J, B, NULL, NULL, NULL, &alpha, &beta, &flags, NULL); if (!kernel) // LCOV_EXCL_START return CeedError(ceed, CEED_ERROR_BACKEND, "LIBXSMM kernel failed to build."); @@ -134,30 +114,31 @@ int CeedTensorContractCreate_f32_Xsmm(CeedBasis basis, kh_value(impl->lookup_f32, k) = kernel; } } + } + } + } + } } else { - ierr = CeedBasisGetNumNodes(basis, &impl->P); CeedChkBackend(ierr); - ierr = CeedBasisGetNumQuadraturePoints(basis, &impl->Q); CeedChkBackend(ierr); - ierr = CeedBasisGetDimension(basis, &impl->dim); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetNumNodes(basis, &impl->P)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &impl->Q)); + CeedCallBackend(CeedBasisGetDimension(basis, &impl->dim)); // Build all required kernels - for (CeedInt num_elem = 1; num_elem <= 8; num_elem+=7) - for (CeedInt add = 0; add <= 1; add++) + for (CeedInt num_elem = 1; num_elem <= 8; num_elem += 7) { + for (CeedInt add = 0; add <= 1; add++) { for (CeedInt t_mode = 0; t_mode <= 1; t_mode++) { - CeedInt gradstride = CeedIntMax(impl->dim-1, 1); - for (CeedInt grad = 1; grad <= impl->dim; grad+=gradstride) { + CeedInt gradstride = CeedIntMax(impl->dim - 1, 1); + for (CeedInt grad = 1; grad <= impl->dim; grad += gradstride) { const int flags = LIBXSMM_GEMM_FLAGS('N', t_mode ? 'T' : 'N'); - CeedInt B = t_mode ? grad*impl->Q : impl->P, - J = t_mode ? impl->P : grad*impl->Q, - C = num_elem; + CeedInt B = t_mode ? 
grad * impl->Q : impl->P, J = t_mode ? impl->P : grad * impl->Q, C = num_elem; // Add key, kernel pair to hash table CeedHashIJKLMKey key = {B, C, J, t_mode, add}; - int new_item; - khint_t k = kh_put(f32, impl->lookup_f32, key, &new_item); + int new_item; + khint_t k = kh_put(f32, impl->lookup_f32, key, &new_item); if (new_item) { // Build kernel float alpha = 1.0, beta = 1.0; if (!add) beta = 0.0; - libxsmm_smmfunction kernel = libxsmm_smmdispatch( - C, J, B, NULL, NULL, NULL, &alpha, &beta, &flags, NULL); + libxsmm_smmfunction kernel = libxsmm_smmdispatch(C, J, B, NULL, NULL, NULL, &alpha, &beta, &flags, NULL); if (!kernel) // LCOV_EXCL_START return CeedError(ceed, CEED_ERROR_BACKEND, "LIBXSMM kernel failed to build."); @@ -167,13 +148,13 @@ int CeedTensorContractCreate_f32_Xsmm(CeedBasis basis, } } } + } + } } - ierr = CeedTensorContractSetData(contract, impl); CeedChkBackend(ierr); + CeedCallBackend(CeedTensorContractSetData(contract, impl)); - ierr = CeedSetBackendFunction(ceed, "TensorContract", contract, "Apply", - CeedTensorContractApply_Xsmm); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "TensorContract", contract, "Destroy", - CeedTensorContractDestroy_Xsmm); CeedChkBackend(ierr); + CeedCallBackend(CeedSetBackendFunction(ceed, "TensorContract", contract, "Apply", CeedTensorContractApply_Xsmm)); + CeedCallBackend(CeedSetBackendFunction(ceed, "TensorContract", contract, "Destroy", CeedTensorContractDestroy_Xsmm)); return CEED_ERROR_SUCCESS; } diff --git a/backends/xsmm/ceed-xsmm-tensor-f64.c b/backends/xsmm/ceed-xsmm-tensor-f64.c index b25da133d5..610b1fbd73 100644 --- a/backends/xsmm/ceed-xsmm-tensor-f64.c +++ b/backends/xsmm/ceed-xsmm-tensor-f64.c @@ -5,37 +5,28 @@ // // This file is part of CEED: http://github.com/ceed -#include #include +#include #include #include #include #include + #include "ceed-xsmm.h" //------------------------------------------------------------------------------ // Tensor Contract C=1 
//------------------------------------------------------------------------------ -static int CeedTensorContract_Xsmm_C1(CeedTensorContract contract, - CeedInt A, CeedInt B, CeedInt C, - CeedInt J, const double *restrict t, - CeedTransposeMode t_mode, - const CeedInt add, - const double *restrict u, - double *restrict v) { +static int CeedTensorContract_Xsmm_C1(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const double *restrict t, + CeedTransposeMode t_mode, const CeedInt add, const double *restrict u, double *restrict v) { double alpha = 1.0, beta = 1.0; - char trans_u = 'N', trans_t = 'N'; - if ((t_mode == CEED_TRANSPOSE && C != 1) || - (t_mode == CEED_NOTRANSPOSE && C == 1)) - trans_t = 'T'; + char trans_u = 'N', trans_t = 'N'; + if ((t_mode == CEED_TRANSPOSE && C != 1) || (t_mode == CEED_NOTRANSPOSE && C == 1)) trans_t = 'T'; - if (!add) - beta = 0.0; + if (!add) beta = 0.0; // libXSMM GEMM - libxsmm_dgemm(&trans_t, &trans_u, &J, &A, &B, - &alpha, &t[0], NULL, &u[0], NULL, - &beta, &v[0], NULL); + libxsmm_dgemm(&trans_t, &trans_u, &J, &A, &B, &alpha, &t[0], NULL, &u[0], NULL, &beta, &v[0], NULL); return CEED_ERROR_SUCCESS; } @@ -43,29 +34,23 @@ static int CeedTensorContract_Xsmm_C1(CeedTensorContract contract, //------------------------------------------------------------------------------ // Tensor Contract Apply //------------------------------------------------------------------------------ -static int CeedTensorContractApply_Xsmm(CeedTensorContract contract, CeedInt A, - CeedInt B, CeedInt C, CeedInt J, - const double *restrict t, - CeedTransposeMode t_mode, - const CeedInt add, - const double *restrict u, - double *restrict v) { - int ierr; +static int CeedTensorContractApply_Xsmm(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const double *restrict t, + CeedTransposeMode t_mode, const CeedInt add, const double *restrict u, double *restrict v) { CeedTensorContract_Xsmm *impl; - ierr = 
CeedTensorContractGetData(contract, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedTensorContractGetData(contract, &impl)); // Get kernel libxsmm_dmmfunction kernel; - CeedHashIJKLMKey key = {B, C, J, t_mode, add}; - khint_t k = kh_get(f64, impl->lookup_f64, key); + CeedHashIJKLMKey key = {B, C, J, t_mode, add}; + khint_t k = kh_get(f64, impl->lookup_f64, key); CeedHashGetValue(impl->lookup_f64, k, kernel); // Run kernel or fallback to default implementation - if (C != 1) - for (CeedInt a=0; alookup_f64, kernel, libxsmm_release_kernel(&kernel)); kh_destroy(f64, impl->lookup_f64); - ierr = CeedFree(&impl); CeedChkBackend(ierr); + CeedCallBackend(CeedFree(&impl)); return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Tensor Contract Create //------------------------------------------------------------------------------ -int CeedTensorContractCreate_f64_Xsmm(CeedBasis basis, - CeedTensorContract contract) { - int ierr; +int CeedTensorContractCreate_f64_Xsmm(CeedBasis basis, CeedTensorContract contract) { Ceed ceed; - ierr = CeedTensorContractGetCeed(contract, &ceed); CeedChkBackend(ierr); + CeedCallBackend(CeedTensorContractGetCeed(contract, &ceed)); CeedTensorContract_Xsmm *impl; - ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr); + CeedCallBackend(CeedCalloc(1, &impl)); // Setup kernels hash table impl->lookup_f64 = kh_init(f64); // Set up pointers to kernels - ierr = CeedBasisIsTensor(basis, &impl->is_tensor); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisIsTensor(basis, &impl->is_tensor)); if (impl->is_tensor) { - ierr = CeedBasisGetNumNodes1D(basis, &impl->P); CeedChkBackend(ierr); - ierr = CeedBasisGetNumQuadraturePoints1D(basis, &impl->Q); CeedChkBackend(ierr); - ierr = CeedBasisGetDimension(basis, &impl->dim); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &impl->P)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &impl->Q)); + 
CeedCallBackend(CeedBasisGetDimension(basis, &impl->dim)); // Build all required kernels - for (CeedInt num_elem = 1; num_elem <= 8; num_elem+=7) - for (CeedInt add = 0; add <= 1; add++) - for (CeedInt t_mode = 0; t_mode <= 1; t_mode++) - for (CeedInt grad = 0; grad <=1; grad++) + for (CeedInt num_elem = 1; num_elem <= 8; num_elem += 7) { + for (CeedInt add = 0; add <= 1; add++) { + for (CeedInt t_mode = 0; t_mode <= 1; t_mode++) { + for (CeedInt grad = 0; grad <= 1; grad++) { for (CeedInt dim = 0; dim < impl->dim; dim++) { const int flags = LIBXSMM_GEMM_FLAGS('N', t_mode ? 'T' : 'N'); - CeedInt B = grad ? impl->Q : (t_mode ? impl->Q : impl->P), - J = grad ? impl->Q : (t_mode ? impl->P : impl->Q), - C = num_elem*CeedIntPow(J, dim); + CeedInt B = grad ? impl->Q : (t_mode ? impl->Q : impl->P), J = grad ? impl->Q : (t_mode ? impl->P : impl->Q), + C = num_elem * CeedIntPow(J, dim); // Add key, kernel pair to hash table CeedHashIJKLMKey key = {B, C, J, t_mode, add}; - int new_item; - khint_t k = kh_put(f64, impl->lookup_f64, key, &new_item); + int new_item; + khint_t k = kh_put(f64, impl->lookup_f64, key, &new_item); if (new_item) { // Build kernel double alpha = 1.0, beta = 1.0; if (!add) beta = 0.0; - libxsmm_dmmfunction kernel = libxsmm_dmmdispatch( - C, J, B, NULL, NULL, NULL, &alpha, &beta, &flags, NULL); + libxsmm_dmmfunction kernel = libxsmm_dmmdispatch(C, J, B, NULL, NULL, NULL, &alpha, &beta, &flags, NULL); if (!kernel) // LCOV_EXCL_START return CeedError(ceed, CEED_ERROR_BACKEND, "LIBXSMM kernel failed to build."); @@ -134,30 +114,31 @@ int CeedTensorContractCreate_f64_Xsmm(CeedBasis basis, kh_value(impl->lookup_f64, k) = kernel; } } + } + } + } + } } else { - ierr = CeedBasisGetNumNodes(basis, &impl->P); CeedChkBackend(ierr); - ierr = CeedBasisGetNumQuadraturePoints(basis, &impl->Q); CeedChkBackend(ierr); - ierr = CeedBasisGetDimension(basis, &impl->dim); CeedChkBackend(ierr); + CeedCallBackend(CeedBasisGetNumNodes(basis, &impl->P)); + 
CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &impl->Q)); + CeedCallBackend(CeedBasisGetDimension(basis, &impl->dim)); // Build all required kernels - for (CeedInt num_elem = 1; num_elem <= 8; num_elem+=7) - for (CeedInt add = 0; add <= 1; add++) + for (CeedInt num_elem = 1; num_elem <= 8; num_elem += 7) { + for (CeedInt add = 0; add <= 1; add++) { for (CeedInt t_mode = 0; t_mode <= 1; t_mode++) { - CeedInt gradstride = CeedIntMax(impl->dim-1, 1); - for (CeedInt grad = 1; grad <= impl->dim; grad+=gradstride) { + CeedInt gradstride = CeedIntMax(impl->dim - 1, 1); + for (CeedInt grad = 1; grad <= impl->dim; grad += gradstride) { const int flags = LIBXSMM_GEMM_FLAGS('N', t_mode ? 'T' : 'N'); - CeedInt B = t_mode ? grad*impl->Q : impl->P, - J = t_mode ? impl->P : grad*impl->Q, - C = num_elem; + CeedInt B = t_mode ? grad * impl->Q : impl->P, J = t_mode ? impl->P : grad * impl->Q, C = num_elem; // Add key, kernel pair to hash table CeedHashIJKLMKey key = {B, C, J, t_mode, add}; - int new_item; - khint_t k = kh_put(f64, impl->lookup_f64, key, &new_item); + int new_item; + khint_t k = kh_put(f64, impl->lookup_f64, key, &new_item); if (new_item) { // Build kernel double alpha = 1.0, beta = 1.0; if (!add) beta = 0.0; - libxsmm_dmmfunction kernel = libxsmm_dmmdispatch( - C, J, B, NULL, NULL, NULL, &alpha, &beta, &flags, NULL); + libxsmm_dmmfunction kernel = libxsmm_dmmdispatch(C, J, B, NULL, NULL, NULL, &alpha, &beta, &flags, NULL); if (!kernel) // LCOV_EXCL_START return CeedError(ceed, CEED_ERROR_BACKEND, "LIBXSMM kernel failed to build."); @@ -167,13 +148,13 @@ int CeedTensorContractCreate_f64_Xsmm(CeedBasis basis, } } } + } + } } - ierr = CeedTensorContractSetData(contract, impl); CeedChkBackend(ierr); + CeedCallBackend(CeedTensorContractSetData(contract, impl)); - ierr = CeedSetBackendFunction(ceed, "TensorContract", contract, "Apply", - CeedTensorContractApply_Xsmm); CeedChkBackend(ierr); - ierr = CeedSetBackendFunction(ceed, "TensorContract", contract, 
"Destroy", - CeedTensorContractDestroy_Xsmm); CeedChkBackend(ierr); + CeedCallBackend(CeedSetBackendFunction(ceed, "TensorContract", contract, "Apply", CeedTensorContractApply_Xsmm)); + CeedCallBackend(CeedSetBackendFunction(ceed, "TensorContract", contract, "Destroy", CeedTensorContractDestroy_Xsmm)); return CEED_ERROR_SUCCESS; } diff --git a/backends/xsmm/ceed-xsmm.h b/backends/xsmm/ceed-xsmm.h index c71e36f2df..5c59b21c5d 100644 --- a/backends/xsmm/ceed-xsmm.h +++ b/backends/xsmm/ceed-xsmm.h @@ -8,42 +8,37 @@ #ifndef _ceed_xsmm_h #define _ceed_xsmm_h -#include #include +#include #include #include #if !defined(LIBXSMM_VERSION_GE) -#define LIBXSMM_VERSION_GE(major, minor, update, patch) \ - (LIBXSMM_VERSION_MAJOR > major || \ - (LIBXSMM_VERSION_MAJOR == major && \ - (LIBXSMM_VERSION_MINOR > minor || \ - (LIBXSMM_VERSION_MINOR == minor && \ - (LIBXSMM_VERSION_UPDATE > update || \ - (LIBXSMM_VERSION_UPDATE == update && LIBXSMM_VERSION_PATCH >= patch )))))) +#define LIBXSMM_VERSION_GE(major, minor, update, patch) \ + (LIBXSMM_VERSION_MAJOR > major || \ + (LIBXSMM_VERSION_MAJOR == major && \ + (LIBXSMM_VERSION_MINOR > minor || (LIBXSMM_VERSION_MINOR == minor && \ + (LIBXSMM_VERSION_UPDATE > update || (LIBXSMM_VERSION_UPDATE == update && LIBXSMM_VERSION_PATCH >= patch)))))) #endif #if LIBXSMM_VERSION_GE(1, 17, 0, 0) -# define LIBXSMM_MMFUNCTION_KERNEL(a, b, c) kernel(a, b, c) +#define LIBXSMM_MMFUNCTION_KERNEL(a, b, c) kernel(a, b, c) #else -# define LIBXSMM_MMFUNCTION_KERNEL(a, b, c) kernel(a, b, c, NULL, NULL, NULL) +#define LIBXSMM_MMFUNCTION_KERNEL(a, b, c) kernel(a, b, c, NULL, NULL, NULL) #endif // Instantiate khash structs and methods -CeedHashIJKLMInit(f32, libxsmm_smmfunction) -CeedHashIJKLMInit(f64, libxsmm_dmmfunction) +CeedHashIJKLMInit(f32, libxsmm_smmfunction) CeedHashIJKLMInit(f64, libxsmm_dmmfunction) -typedef struct { - bool is_tensor; + typedef struct { + bool is_tensor; CeedInt P, Q, dim; - khash_t(f32) *lookup_f32; - khash_t(f64) *lookup_f64; + 
khash_t(f32) * lookup_f32; + khash_t(f64) * lookup_f64; } CeedTensorContract_Xsmm; -CEED_INTERN int CeedTensorContractCreate_f32_Xsmm(CeedBasis basis, - CeedTensorContract contract); +CEED_INTERN int CeedTensorContractCreate_f32_Xsmm(CeedBasis basis, CeedTensorContract contract); -CEED_INTERN int CeedTensorContractCreate_f64_Xsmm(CeedBasis basis, - CeedTensorContract contract); +CEED_INTERN int CeedTensorContractCreate_f64_Xsmm(CeedBasis basis, CeedTensorContract contract); -#endif // _ceed_xsmm_h +#endif // _ceed_xsmm_h diff --git a/doc/sphinx/source/libCEEDdev.md b/doc/sphinx/source/libCEEDdev.md index 262d51b6dc..35feb27175 100644 --- a/doc/sphinx/source/libCEEDdev.md +++ b/doc/sphinx/source/libCEEDdev.md @@ -4,7 +4,7 @@ Please check your code for style issues by running -`make style` +`make format` In addition to those automatically enforced style rules, libCEED tends to follow the following code style conventions: diff --git a/doc/sphinx/source/releasenotes.md b/doc/sphinx/source/releasenotes.md index 28b84c2b57..f20184d402 100644 --- a/doc/sphinx/source/releasenotes.md +++ b/doc/sphinx/source/releasenotes.md @@ -20,6 +20,10 @@ On this page we provide a summary of the main API changes, new features and exam Due to a limitation of the OCCA parser, typedefs are required to use pointers to arrays in QFunctions with the OCCA backend. This issue will be fixed in a future OCCA release. +### Other + +- Switch to `clang-format` over `astyle` for automatic formatting; Makefile command changed to `make format` from `make style`. + ### Bugfix - Fix bug in setting device id for GPU backends. diff --git a/examples/ceed/ex1-volume.c b/examples/ceed/ex1-volume.c index 6daf25e2d1..566da82039 100644 --- a/examples/ceed/ex1-volume.c +++ b/examples/ceed/ex1-volume.c @@ -39,56 +39,53 @@ /// @file /// libCEED example using mass operator to compute volume +#include "ex1-volume.h" + #include #include #include #include -#include "ex1-volume.h" -// Auxiliary functions. 
-int GetCartesianMeshSize(CeedInt dim, CeedInt degree, CeedInt prob_size, - CeedInt num_xyz[dim]); -int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[dim], - CeedInt degree, CeedInt num_comp, CeedInt *size, - CeedInt num_qpts, CeedElemRestriction *restr, - CeedElemRestriction *restr_i); -int SetCartesianMeshCoords(CeedInt dim, CeedInt num_xyz[dim], - CeedInt mesh_degree, CeedVector mesh_coords); -CeedScalar TransformMeshCoords(CeedInt dim, CeedInt mesh_size, - CeedVector mesh_coords); +// Auxiliary functions +int GetCartesianMeshSize(CeedInt dim, CeedInt degree, CeedInt prob_size, CeedInt num_xyz[dim]); +int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[dim], CeedInt degree, CeedInt num_comp, CeedInt *size, CeedInt num_qpts, + CeedElemRestriction *restr, CeedElemRestriction *restr_i); +int SetCartesianMeshCoords(CeedInt dim, CeedInt num_xyz[dim], CeedInt mesh_degree, CeedVector mesh_coords); +CeedScalar TransformMeshCoords(CeedInt dim, CeedInt mesh_size, CeedVector mesh_coords); +// Main example int main(int argc, const char *argv[]) { - const char *ceed_spec = "/cpu/self"; - CeedInt dim = 3; // dimension of the mesh - CeedInt num_comp_x = 3; // number of x components - CeedInt mesh_degree = 4; // polynomial degree for the mesh - CeedInt sol_degree = 4; // polynomial degree for the solution - CeedInt num_qpts = sol_degree + 2; // number of 1D quadrature points - CeedInt prob_size = -1; // approximate problem size - CeedInt help = 0, test = 0, gallery = 0; + const char *ceed_spec = "/cpu/self"; + CeedInt dim = 3; // dimension of the mesh + CeedInt num_comp_x = 3; // number of x components + CeedInt mesh_degree = 4; // polynomial degree for the mesh + CeedInt sol_degree = 4; // polynomial degree for the solution + CeedInt num_qpts = sol_degree + 2; // number of 1D quadrature points + CeedInt prob_size = -1; // approximate problem size + CeedInt help = 0, test = 0, gallery = 0; // Process command line arguments. 
for (int ia = 1; ia < argc; ia++) { // LCOV_EXCL_START - int next_arg = ((ia+1) < argc), parse_error = 0; - if (!strcmp(argv[ia],"-h")) { + int next_arg = ((ia + 1) < argc), parse_error = 0; + if (!strcmp(argv[ia], "-h")) { help = 1; - } else if (!strcmp(argv[ia],"-c") || !strcmp(argv[ia],"-ceed")) { + } else if (!strcmp(argv[ia], "-c") || !strcmp(argv[ia], "-ceed")) { parse_error = next_arg ? ceed_spec = argv[++ia], 0 : 1; - } else if (!strcmp(argv[ia],"-d")) { + } else if (!strcmp(argv[ia], "-d")) { parse_error = next_arg ? dim = atoi(argv[++ia]), 0 : 1; - num_comp_x = dim; - } else if (!strcmp(argv[ia],"-m")) { + num_comp_x = dim; + } else if (!strcmp(argv[ia], "-m")) { parse_error = next_arg ? mesh_degree = atoi(argv[++ia]), 0 : 1; - } else if (!strcmp(argv[ia],"-p")) { - parse_error = next_arg ? sol_degree= atoi(argv[++ia]), 0 : 1; - } else if (!strcmp(argv[ia],"-q")) { + } else if (!strcmp(argv[ia], "-p")) { + parse_error = next_arg ? sol_degree = atoi(argv[++ia]), 0 : 1; + } else if (!strcmp(argv[ia], "-q")) { parse_error = next_arg ? num_qpts = atoi(argv[++ia]), 0 : 1; - } else if (!strcmp(argv[ia],"-s")) { + } else if (!strcmp(argv[ia], "-s")) { parse_error = next_arg ? prob_size = atoi(argv[++ia]), 0 : 1; - } else if (!strcmp(argv[ia],"-t")) { + } else if (!strcmp(argv[ia], "-t")) { test = 1; - } else if (!strcmp(argv[ia],"-g")) { + } else if (!strcmp(argv[ia], "-g")) { gallery = 1; } if (parse_error) { @@ -97,7 +94,7 @@ int main(int argc, const char *argv[]) { } // LCOV_EXCL_STOP } - if (prob_size < 0) prob_size = test ? 8*16 : 256*1024; + if (prob_size < 0) prob_size = test ? 8 * 16 : 256 * 1024; // Print the values of all options: if (!test || help) { @@ -109,9 +106,9 @@ int main(int argc, const char *argv[]) { printf(" Solution degree [-p] : %" CeedInt_FMT "\n", sol_degree); printf(" Num. 1D quadr. pts [-q] : %" CeedInt_FMT "\n", num_qpts); printf(" Approx. 
# unknowns [-s] : %" CeedInt_FMT "\n", prob_size); - printf(" QFunction source [-g] : %s\n", gallery?"gallery":"header"); + printf(" QFunction source [-g] : %s\n", gallery ? "gallery" : "header"); if (help) { - printf("Test/quiet mode is %s\n", (test?"ON":"OFF (use -t to enable)")); + printf("Test/quiet mode is %s\n", (test ? "ON" : "OFF (use -t to enable)")); return 0; } printf("\n"); @@ -125,10 +122,8 @@ int main(int argc, const char *argv[]) { // Construct the mesh and solution bases. CeedBasis mesh_basis, sol_basis; - CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp_x, mesh_degree + 1, - num_qpts, CEED_GAUSS, &mesh_basis); - CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, sol_degree + 1, num_qpts, - CEED_GAUSS, &sol_basis); + CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp_x, mesh_degree + 1, num_qpts, CEED_GAUSS, &mesh_basis); + CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, sol_degree + 1, num_qpts, CEED_GAUSS, &sol_basis); // Determine the mesh size based on the given approximate problem size. CeedInt num_xyz[dim]; @@ -136,23 +131,21 @@ int main(int argc, const char *argv[]) { if (!test) { // LCOV_EXCL_START printf("Mesh size: nx = %" CeedInt_FMT, num_xyz[0]); - if (dim > 1) { printf(", ny = %" CeedInt_FMT, num_xyz[1]); } - if (dim > 2) { printf(", nz = %" CeedInt_FMT, num_xyz[2]); } + if (dim > 1) printf(", ny = %" CeedInt_FMT, num_xyz[1]); + if (dim > 2) printf(", nz = %" CeedInt_FMT, num_xyz[2]); printf("\n"); // LCOV_EXCL_STOP } // Build CeedElemRestriction objects describing the mesh and solution discrete // representations. 
- CeedInt mesh_size, sol_size; + CeedInt mesh_size, sol_size; CeedElemRestriction mesh_restr, sol_restr, sol_restr_i; - BuildCartesianRestriction(ceed, dim, num_xyz, mesh_degree, num_comp_x, - &mesh_size, num_qpts, &mesh_restr, NULL); - BuildCartesianRestriction(ceed, dim, num_xyz, sol_degree, 1, &sol_size, - num_qpts, &sol_restr, &sol_restr_i); + BuildCartesianRestriction(ceed, dim, num_xyz, mesh_degree, num_comp_x, &mesh_size, num_qpts, &mesh_restr, NULL); + BuildCartesianRestriction(ceed, dim, num_xyz, sol_degree, 1, &sol_size, num_qpts, &sol_restr, &sol_restr_i); if (!test) { // LCOV_EXCL_START - printf("Number of mesh nodes : %" CeedInt_FMT "\n", mesh_size/dim); + printf("Number of mesh nodes : %" CeedInt_FMT "\n", mesh_size / dim); printf("Number of solution nodes : %" CeedInt_FMT "\n", sol_size); // LCOV_EXCL_STOP } @@ -167,79 +160,68 @@ int main(int argc, const char *argv[]) { // Context data to be passed to the 'f_build_mass' QFunction. CeedQFunctionContext build_ctx; - struct BuildContext build_ctx_data; + struct BuildContext build_ctx_data; build_ctx_data.dim = build_ctx_data.space_dim = dim; CeedQFunctionContextCreate(ceed, &build_ctx); - CeedQFunctionContextSetData(build_ctx, CEED_MEM_HOST, CEED_USE_POINTER, - sizeof(build_ctx_data), &build_ctx_data); + CeedQFunctionContextSetData(build_ctx, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(build_ctx_data), &build_ctx_data); // Create the QFunction that builds the mass operator (i.e. computes its // quadrature data) and set its context data. CeedQFunction qf_build; switch (gallery) { - case 0: - // This creates the QFunction directly. 
- CeedQFunctionCreateInterior(ceed, 1, f_build_mass, - f_build_mass_loc, &qf_build); - CeedQFunctionAddInput(qf_build, "dx", num_comp_x*dim, CEED_EVAL_GRAD); - CeedQFunctionAddInput(qf_build, "weights", 1, CEED_EVAL_WEIGHT); - CeedQFunctionAddOutput(qf_build, "qdata", 1, CEED_EVAL_NONE); - CeedQFunctionSetContext(qf_build, build_ctx); - break; - case 1: { - // This creates the QFunction via the gallery. - char name[13] = ""; - snprintf(name, sizeof name, "Mass%" CeedInt_FMT "DBuild", dim); - CeedQFunctionCreateInteriorByName(ceed, name, &qf_build); - break; - } + case 0: + // This creates the QFunction directly. + CeedQFunctionCreateInterior(ceed, 1, f_build_mass, f_build_mass_loc, &qf_build); + CeedQFunctionAddInput(qf_build, "dx", num_comp_x * dim, CEED_EVAL_GRAD); + CeedQFunctionAddInput(qf_build, "weights", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddOutput(qf_build, "qdata", 1, CEED_EVAL_NONE); + CeedQFunctionSetContext(qf_build, build_ctx); + break; + case 1: { + // This creates the QFunction via the gallery. + char name[13] = ""; + snprintf(name, sizeof name, "Mass%" CeedInt_FMT "DBuild", dim); + CeedQFunctionCreateInteriorByName(ceed, name, &qf_build); + break; + } } // Create the operator that builds the quadrature data for the mass operator. 
CeedOperator op_build; - CeedOperatorCreate(ceed, qf_build, CEED_QFUNCTION_NONE, - CEED_QFUNCTION_NONE, &op_build); - CeedOperatorSetField(op_build, "dx", mesh_restr, mesh_basis, - CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op_build, "weights", CEED_ELEMRESTRICTION_NONE, - mesh_basis, CEED_VECTOR_NONE); - CeedOperatorSetField(op_build, "qdata", sol_restr_i, CEED_BASIS_COLLOCATED, - CEED_VECTOR_ACTIVE); + CeedOperatorCreate(ceed, qf_build, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_build); + CeedOperatorSetField(op_build, "dx", mesh_restr, mesh_basis, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_build, "weights", CEED_ELEMRESTRICTION_NONE, mesh_basis, CEED_VECTOR_NONE); + CeedOperatorSetField(op_build, "qdata", sol_restr_i, CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE); // Compute the quadrature data for the mass operator. CeedVector q_data; - CeedInt elem_qpts = CeedIntPow(num_qpts, dim); - CeedInt num_elem = 1; - for (CeedInt d = 0; d < dim; d++) - num_elem *= num_xyz[d]; - CeedVectorCreate(ceed, num_elem*elem_qpts, &q_data); - CeedOperatorApply(op_build, mesh_coords, q_data, - CEED_REQUEST_IMMEDIATE); + CeedInt elem_qpts = CeedIntPow(num_qpts, dim); + CeedInt num_elem = 1; + for (CeedInt d = 0; d < dim; d++) num_elem *= num_xyz[d]; + CeedVectorCreate(ceed, num_elem * elem_qpts, &q_data); + CeedOperatorApply(op_build, mesh_coords, q_data, CEED_REQUEST_IMMEDIATE); // Create the QFunction that defines the action of the mass operator. CeedQFunction qf_apply; switch (gallery) { - case 0: - // This creates the QFunction directly. - CeedQFunctionCreateInterior(ceed, 1, f_apply_mass, - f_apply_mass_loc, &qf_apply); - CeedQFunctionAddInput(qf_apply, "u", 1, CEED_EVAL_INTERP); - CeedQFunctionAddInput(qf_apply, "qdata", 1, CEED_EVAL_NONE); - CeedQFunctionAddOutput(qf_apply, "v", 1, CEED_EVAL_INTERP); - break; - case 1: - // This creates the QFunction via the gallery. 
- CeedQFunctionCreateInteriorByName(ceed, "MassApply", &qf_apply); - break; + case 0: + // This creates the QFunction directly. + CeedQFunctionCreateInterior(ceed, 1, f_apply_mass, f_apply_mass_loc, &qf_apply); + CeedQFunctionAddInput(qf_apply, "u", 1, CEED_EVAL_INTERP); + CeedQFunctionAddInput(qf_apply, "qdata", 1, CEED_EVAL_NONE); + CeedQFunctionAddOutput(qf_apply, "v", 1, CEED_EVAL_INTERP); + break; + case 1: + // This creates the QFunction via the gallery. + CeedQFunctionCreateInteriorByName(ceed, "MassApply", &qf_apply); + break; } // Create the mass operator. CeedOperator op_apply; - CeedOperatorCreate(ceed, qf_apply, CEED_QFUNCTION_NONE, - CEED_QFUNCTION_NONE, &op_apply); + CeedOperatorCreate(ceed, qf_apply, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_apply); CeedOperatorSetField(op_apply, "u", sol_restr, sol_basis, CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op_apply, "qdata", sol_restr_i, CEED_BASIS_COLLOCATED, - q_data); + CeedOperatorSetField(op_apply, "qdata", sol_restr_i, CEED_BASIS_COLLOCATED, q_data); CeedOperatorSetField(op_apply, "v", sol_restr, sol_basis, CEED_VECTOR_ACTIVE); // Create auxiliary solution-size vectors. @@ -257,23 +239,18 @@ int main(int argc, const char *argv[]) { const CeedScalar *v_array; CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); CeedScalar vol = 0.; - for (CeedInt i = 0; i < sol_size; i++) { - vol += v_array[i]; - } + for (CeedInt i = 0; i < sol_size; i++) vol += v_array[i]; CeedVectorRestoreArrayRead(v, &v_array); if (!test) { // LCOV_EXCL_START printf(" done.\n"); printf("Exact mesh volume : % .14g\n", exact_vol); printf("Computed mesh volume : % .14g\n", vol); - printf("Volume error : % .14g\n", vol-exact_vol); + printf("Volume error : % .14g\n", vol - exact_vol); // LCOV_EXCL_STOP } else { - CeedScalar tol = (dim==1 ? 100.*CEED_EPSILON : dim==2 ? 1E-5 : 1E-5); - if (fabs(vol-exact_vol)>tol) - // LCOV_EXCL_START - printf("Volume error : % .1e\n", vol-exact_vol); - // LCOV_EXCL_STOP + CeedScalar tol = (dim == 1 ? 
100. * CEED_EPSILON : dim == 2 ? 1E-5 : 1E-5); + if (fabs(vol - exact_vol) > tol) printf("Volume error : % .1e\n", vol - exact_vol); } // Free dynamically allocated memory. @@ -295,47 +272,50 @@ int main(int argc, const char *argv[]) { return 0; } -int GetCartesianMeshSize(CeedInt dim, CeedInt degree, CeedInt prob_size, - CeedInt num_xyz[dim]) { +int GetCartesianMeshSize(CeedInt dim, CeedInt degree, CeedInt prob_size, CeedInt num_xyz[dim]) { // Use the approximate formula: // prob_size ~ num_elem * degree^dim CeedInt num_elem = prob_size / CeedIntPow(degree, dim); - CeedInt s = 0; // find s: num_elem/2 < 2^s <= num_elem + CeedInt s = 0; // find s: num_elem/2 < 2^s <= num_elem while (num_elem > 1) { num_elem /= 2; s++; } - CeedInt r = s%dim; + CeedInt r = s % dim; for (CeedInt d = 0; d < dim; d++) { - CeedInt sd = s/dim; - if (r > 0) { sd++; r--; } + CeedInt sd = s / dim; + if (r > 0) { + sd++; + r--; + } num_xyz[d] = 1 << sd; } return 0; } -int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[dim], - CeedInt degree, CeedInt num_comp, CeedInt *size, - CeedInt num_qpts, CeedElemRestriction *restr, - CeedElemRestriction *restr_i) { - CeedInt p = degree + 1; - CeedInt num_nodes = CeedIntPow(p, dim); // number of scalar nodes per element - CeedInt elem_qpts = CeedIntPow(num_qpts, dim); // number of qpts per element +int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[dim], CeedInt degree, CeedInt num_comp, CeedInt *size, CeedInt num_qpts, + CeedElemRestriction *restr, CeedElemRestriction *restr_i) { + CeedInt p = degree + 1; + CeedInt num_nodes = CeedIntPow(p, dim); // number of scalar nodes per element + CeedInt elem_qpts = CeedIntPow(num_qpts, dim); // number of qpts per element CeedInt nd[3], num_elem = 1, scalar_size = 1; for (CeedInt d = 0; d < dim; d++) { num_elem *= num_xyz[d]; nd[d] = num_xyz[d] * (p - 1) + 1; scalar_size *= nd[d]; } - *size = scalar_size*num_comp; + *size = scalar_size * num_comp; // elem: 0 1 n-1 // 
|---*-...-*---|---*-...-*---|- ... -|--...--| // num_nodes: 0 1 p-1 p p+1 2*p n*p - CeedInt *elem_nodes = malloc(sizeof(CeedInt)*num_elem*num_nodes); + CeedInt *elem_nodes = malloc(sizeof(CeedInt) * num_elem * num_nodes); for (CeedInt e = 0; e < num_elem; e++) { CeedInt e_xyz[3] = {1, 1, 1}, re = e; - for (CeedInt d = 0; d < dim; d++) { e_xyz[d] = re % num_xyz[d]; re /= num_xyz[d]; } - CeedInt *loc_el_nodes = elem_nodes + e*num_nodes; + for (CeedInt d = 0; d < dim; d++) { + e_xyz[d] = re % num_xyz[d]; + re /= num_xyz[d]; + } + CeedInt *loc_el_nodes = elem_nodes + e * num_nodes; for (CeedInt l_nodes = 0; l_nodes < num_nodes; l_nodes++) { CeedInt g_nodes = 0, g_nodes_stride = 1, r_nodes = l_nodes; for (CeedInt d = 0; d < dim; d++) { @@ -346,19 +326,14 @@ int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[dim], loc_el_nodes[l_nodes] = g_nodes; } } - CeedElemRestrictionCreate(ceed, num_elem, num_nodes, num_comp, scalar_size, - num_comp * scalar_size, CEED_MEM_HOST, CEED_COPY_VALUES, - elem_nodes, restr); - if (restr_i) - CeedElemRestrictionCreateStrided(ceed, num_elem, elem_qpts, - num_comp, num_comp * elem_qpts * num_elem, - CEED_STRIDES_BACKEND, restr_i); + CeedElemRestrictionCreate(ceed, num_elem, num_nodes, num_comp, scalar_size, num_comp * scalar_size, CEED_MEM_HOST, CEED_COPY_VALUES, elem_nodes, + restr); + if (restr_i) CeedElemRestrictionCreateStrided(ceed, num_elem, elem_qpts, num_comp, num_comp * elem_qpts * num_elem, CEED_STRIDES_BACKEND, restr_i); free(elem_nodes); return 0; } -int SetCartesianMeshCoords(CeedInt dim, CeedInt num_xyz[dim], - CeedInt mesh_degree, CeedVector mesh_coords) { +int SetCartesianMeshCoords(CeedInt dim, CeedInt num_xyz[dim], CeedInt mesh_degree, CeedVector mesh_coords) { CeedInt p = mesh_degree + 1; CeedInt nd[3], num_elem = 1, scalar_size = 1; for (CeedInt d = 0; d < dim; d++) { @@ -370,14 +345,15 @@ int SetCartesianMeshCoords(CeedInt dim, CeedInt num_xyz[dim], CeedVectorGetArrayWrite(mesh_coords, CEED_MEM_HOST, 
&coords); CeedScalar *nodes = malloc(sizeof(CeedScalar) * p); // The H1 basis uses Lobatto quadrature points as nodes. - CeedLobattoQuadrature(p, nodes, NULL); // nodes are in [-1,1] - for (CeedInt i = 0; i < p; i++) { nodes[i] = 0.5 + 0.5 * nodes[i]; } + CeedLobattoQuadrature(p, nodes, NULL); // nodes are in [-1,1] + for (CeedInt i = 0; i < p; i++) { + nodes[i] = 0.5 + 0.5 * nodes[i]; + } for (CeedInt gs_nodes = 0; gs_nodes < scalar_size; gs_nodes++) { CeedInt r_nodes = gs_nodes; for (CeedInt d = 0; d < dim; d++) { - CeedInt d_1d = r_nodes % nd[d]; - coords[gs_nodes + scalar_size * d] = ((d_1d / (p - 1)) + nodes[d_1d % - (p - 1)]) / num_xyz[d]; + CeedInt d_1d = r_nodes % nd[d]; + coords[gs_nodes + scalar_size * d] = ((d_1d / (p - 1)) + nodes[d_1d % (p - 1)]) / num_xyz[d]; r_nodes /= nd[d]; } } @@ -387,33 +363,32 @@ int SetCartesianMeshCoords(CeedInt dim, CeedInt num_xyz[dim], } #ifndef M_PI -#define M_PI 3.14159265358979323846 -#define M_PI_2 1.57079632679489661923 +#define M_PI 3.14159265358979323846 +#define M_PI_2 1.57079632679489661923 #endif -CeedScalar TransformMeshCoords(CeedInt dim, CeedInt mesh_size, - CeedVector mesh_coords) { - CeedScalar exact_volume; +CeedScalar TransformMeshCoords(CeedInt dim, CeedInt mesh_size, CeedVector mesh_coords) { + CeedScalar exact_volume; CeedScalar *coords; CeedVectorGetArray(mesh_coords, CEED_MEM_HOST, &coords); if (dim == 1) { for (CeedInt i = 0; i < mesh_size; i++) { // map [0,1] to [0,1] varying the mesh density - coords[i] = 0.5 + 1./sqrt(3.) * sin((2./3.) * M_PI*(coords[i] - 0.5)); + coords[i] = 0.5 + 1. / sqrt(3.) * sin((2. / 3.) * M_PI * (coords[i] - 0.5)); } exact_volume = 1.; } else { - CeedInt num_nodes = mesh_size/dim; + CeedInt num_nodes = mesh_size / dim; for (CeedInt i = 0; i < num_nodes; i++) { // map (x,y) from [0,1]x[0,1] to the quarter annulus with polar // coordinates, (r,phi) in [1,2]x[0,pi/2] with area = 3/4*pi - CeedScalar u = coords[i], v = coords[i+num_nodes]; - u = 1. 
+ u; - v = M_PI_2 * v; - coords[i] = u * cos(v); - coords[i+num_nodes] = u * sin(v); + CeedScalar u = coords[i], v = coords[i + num_nodes]; + u = 1. + u; + v = M_PI_2 * v; + coords[i] = u * cos(v); + coords[i + num_nodes] = u * sin(v); } - exact_volume = 3./4. * M_PI; + exact_volume = 3. / 4. * M_PI; } CeedVectorRestoreArray(mesh_coords, &coords); return exact_volume; diff --git a/examples/ceed/ex1-volume.h b/examples/ceed/ex1-volume.h index a9bf4f9349..1ff66713f8 100644 --- a/examples/ceed/ex1-volume.h +++ b/examples/ceed/ex1-volume.h @@ -11,62 +11,55 @@ #include /// A structure used to pass additional data to f_build_mass -struct BuildContext { CeedInt dim, space_dim; }; +struct BuildContext { + CeedInt dim, space_dim; +}; /// libCEED Q-function for building quadrature data for a mass operator -CEED_QFUNCTION(f_build_mass)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(f_build_mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // in[0] is Jacobians with shape [dim, nc=dim, Q] // in[1] is quadrature weights, size (Q) struct BuildContext *bc = (struct BuildContext *)ctx; - const CeedScalar *J = in[0], *w = in[1]; - CeedScalar *q_data = out[0]; + const CeedScalar *J = in[0], *w = in[1]; + CeedScalar *q_data = out[0]; - switch (bc->dim + 10*bc->space_dim) { - case 11: - // Quadrature Point Loop - CeedPragmaSIMD - for (CeedInt i=0; idim + 10 * bc->space_dim) { + case 11: + // Quadrature Point Loop + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { q_data[i] = J[i] * w[i]; } // End of Quadrature Point Loop + break; + case 22: + // Quadrature Point Loop + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { + // 0 2 + // 1 3 + q_data[i] = (J[i + Q * 0] * J[i + Q * 3] - J[i + Q * 1] * J[i + Q * 2]) * w[i]; + } // End of Quadrature Point Loop + break; + case 33: + // Quadrature Point Loop + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { + // 0 3 6 + // 1 4 7 + // 2 5 8 + q_data[i] = 
(J[i + Q * 0] * (J[i + Q * 4] * J[i + Q * 8] - J[i + Q * 5] * J[i + Q * 7]) - + J[i + Q * 1] * (J[i + Q * 3] * J[i + Q * 8] - J[i + Q * 5] * J[i + Q * 6]) + + J[i + Q * 2] * (J[i + Q * 3] * J[i + Q * 7] - J[i + Q * 4] * J[i + Q * 6])) * + w[i]; + } // End of Quadrature Point Loop + break; } return 0; } /// libCEED Q-function for applying a mass operator -CEED_QFUNCTION(f_apply_mass)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(f_apply_mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { const CeedScalar *u = in[0], *q_data = in[1]; - CeedScalar *v = out[0]; + CeedScalar *v = out[0]; // Quadrature Point Loop - CeedPragmaSIMD - for (CeedInt i=0; i #include #include #include -#include "ex2-surface.h" -// Auxiliary functions. -int GetCartesianMeshSize(CeedInt dim, CeedInt degree, CeedInt prob_size, - CeedInt num_xyz[3]); -int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[3], - CeedInt degree, CeedInt num_comp, CeedInt *size, - CeedInt num_qpts, CeedElemRestriction *restr, - CeedElemRestriction *restr_i); -int SetCartesianMeshCoords(CeedInt dim, CeedInt num_xyz[3], CeedInt mesh_degree, - CeedVector mesh_coords); -CeedScalar TransformMeshCoords(CeedInt dim, CeedInt mesh_size, - CeedVector mesh_coords); +// Auxiliary functions +int GetCartesianMeshSize(CeedInt dim, CeedInt degree, CeedInt prob_size, CeedInt num_xyz[3]); +int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[3], CeedInt degree, CeedInt num_comp, CeedInt *size, CeedInt num_qpts, + CeedElemRestriction *restr, CeedElemRestriction *restr_i); +int SetCartesianMeshCoords(CeedInt dim, CeedInt num_xyz[3], CeedInt mesh_degree, CeedVector mesh_coords); +CeedScalar TransformMeshCoords(CeedInt dim, CeedInt mesh_size, CeedVector mesh_coords); +// Main example int main(int argc, const char *argv[]) { - const char *ceed_spec = "/cpu/self"; - CeedInt dim = 3; // dimension of the mesh - CeedInt 
num_comp_x = 3; // number of x components - CeedInt mesh_degree = 4; // polynomial degree for the mesh - CeedInt sol_degree = 4; // polynomial degree for the solution - CeedInt num_qpts = sol_degree + 2; // number of 1D quadrature points - CeedInt prob_size = -1; // approximate problem size - CeedInt help = 0, test = 0, gallery = 0; + const char *ceed_spec = "/cpu/self"; + CeedInt dim = 3; // dimension of the mesh + CeedInt num_comp_x = 3; // number of x components + CeedInt mesh_degree = 4; // polynomial degree for the mesh + CeedInt sol_degree = 4; // polynomial degree for the solution + CeedInt num_qpts = sol_degree + 2; // number of 1D quadrature points + CeedInt prob_size = -1; // approximate problem size + CeedInt help = 0, test = 0, gallery = 0; // Process command line arguments. for (int ia = 1; ia < argc; ia++) { // LCOV_EXCL_START - int next_arg = ((ia+1) < argc), parse_error = 0; - if (!strcmp(argv[ia],"-h")) { + int next_arg = ((ia + 1) < argc), parse_error = 0; + if (!strcmp(argv[ia], "-h")) { help = 1; - } else if (!strcmp(argv[ia],"-c") || !strcmp(argv[ia],"-ceed")) { + } else if (!strcmp(argv[ia], "-c") || !strcmp(argv[ia], "-ceed")) { parse_error = next_arg ? ceed_spec = argv[++ia], 0 : 1; - } else if (!strcmp(argv[ia],"-d")) { + } else if (!strcmp(argv[ia], "-d")) { parse_error = next_arg ? dim = atoi(argv[++ia]), 0 : 1; - num_comp_x = dim; - } else if (!strcmp(argv[ia],"-m")) { + num_comp_x = dim; + } else if (!strcmp(argv[ia], "-m")) { parse_error = next_arg ? mesh_degree = atoi(argv[++ia]), 0 : 1; - } else if (!strcmp(argv[ia],"-p")) { + } else if (!strcmp(argv[ia], "-p")) { parse_error = next_arg ? sol_degree = atoi(argv[++ia]), 0 : 1; - } else if (!strcmp(argv[ia],"-q")) { + } else if (!strcmp(argv[ia], "-q")) { parse_error = next_arg ? num_qpts = atoi(argv[++ia]), 0 : 1; - } else if (!strcmp(argv[ia],"-s")) { + } else if (!strcmp(argv[ia], "-s")) { parse_error = next_arg ? 
prob_size = atoi(argv[++ia]), 0 : 1; - } else if (!strcmp(argv[ia],"-t")) { + } else if (!strcmp(argv[ia], "-t")) { test = 1; - } else if (!strcmp(argv[ia],"-g")) { + } else if (!strcmp(argv[ia], "-g")) { gallery = 1; } if (parse_error) { @@ -98,11 +95,11 @@ int main(int argc, const char *argv[]) { } // LCOV_EXCL_STOP } - if (prob_size < 0) prob_size = test ? 16*16*dim*dim : 256*1024; + if (prob_size < 0) prob_size = test ? 16 * 16 * dim * dim : 256 * 1024; // Set mesh_degree = sol_degree. mesh_degree = fmax(mesh_degree, sol_degree); - sol_degree = mesh_degree; + sol_degree = mesh_degree; // Print the values of all options: if (!test || help) { @@ -114,9 +111,9 @@ int main(int argc, const char *argv[]) { printf(" Solution degree [-p] : %" CeedInt_FMT "\n", sol_degree); printf(" Num. 1D quadr. pts [-q] : %" CeedInt_FMT "\n", num_qpts); printf(" Approx. # unknowns [-s] : %" CeedInt_FMT "\n", prob_size); - printf(" QFunction source [-g] : %s\n", gallery?"gallery":"header"); + printf(" QFunction source [-g] : %s\n", gallery ? "gallery" : "header"); if (help) { - printf("Test/quiet mode is %s\n", (test?"ON":"OFF (use -t to enable)")); + printf("Test/quiet mode is %s\n", (test ? "ON" : "OFF (use -t to enable)")); return 0; } printf("\n"); @@ -130,10 +127,8 @@ int main(int argc, const char *argv[]) { // Construct the mesh and solution bases. CeedBasis mesh_basis, sol_basis; - CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp_x, mesh_degree + 1, - num_qpts, CEED_GAUSS, &mesh_basis); - CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, sol_degree + 1, num_qpts, - CEED_GAUSS, &sol_basis); + CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp_x, mesh_degree + 1, num_qpts, CEED_GAUSS, &mesh_basis); + CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, sol_degree + 1, num_qpts, CEED_GAUSS, &sol_basis); // Determine the mesh size based on the given approximate problem size. 
CeedInt num_xyz[3]; @@ -142,25 +137,22 @@ int main(int argc, const char *argv[]) { if (!test) { // LCOV_EXCL_START printf("Mesh size: nx = %" CeedInt_FMT, num_xyz[0]); - if (dim > 1) { printf(", ny = %" CeedInt_FMT, num_xyz[1]); } - if (dim > 2) { printf(", nz = %" CeedInt_FMT, num_xyz[2]); } + if (dim > 1) printf(", ny = %" CeedInt_FMT, num_xyz[1]); + if (dim > 2) printf(", nz = %" CeedInt_FMT, num_xyz[2]); printf("\n"); // LCOV_EXCL_STOP } // Build CeedElemRestriction objects describing the mesh and solution discrete // representations. - CeedInt mesh_size, sol_size; + CeedInt mesh_size, sol_size; CeedElemRestriction mesh_restr, sol_restr, q_data_restr_i; - BuildCartesianRestriction(ceed, dim, num_xyz, mesh_degree, num_comp_x, - &mesh_size, num_qpts, &mesh_restr, NULL); - BuildCartesianRestriction(ceed, dim, num_xyz, sol_degree, dim*(dim+1)/2, - &sol_size, num_qpts, NULL, &q_data_restr_i); - BuildCartesianRestriction(ceed, dim, num_xyz, sol_degree, 1, &sol_size, - num_qpts, &sol_restr, NULL); + BuildCartesianRestriction(ceed, dim, num_xyz, mesh_degree, num_comp_x, &mesh_size, num_qpts, &mesh_restr, NULL); + BuildCartesianRestriction(ceed, dim, num_xyz, sol_degree, dim * (dim + 1) / 2, &sol_size, num_qpts, NULL, &q_data_restr_i); + BuildCartesianRestriction(ceed, dim, num_xyz, sol_degree, 1, &sol_size, num_qpts, &sol_restr, NULL); if (!test) { // LCOV_EXCL_START - printf("Number of mesh nodes : %" CeedInt_FMT "\n", mesh_size/dim); + printf("Number of mesh nodes : %" CeedInt_FMT "\n", mesh_size / dim); printf("Number of solution nodes : %" CeedInt_FMT "\n", sol_size); // LCOV_EXCL_STOP } @@ -175,84 +167,73 @@ int main(int argc, const char *argv[]) { // Context data to be passed to the 'f_build_diff' QFunction. 
CeedQFunctionContext build_ctx; - struct BuildContext build_ctx_data; + struct BuildContext build_ctx_data; build_ctx_data.dim = build_ctx_data.space_dim = dim; CeedQFunctionContextCreate(ceed, &build_ctx); - CeedQFunctionContextSetData(build_ctx, CEED_MEM_HOST, CEED_USE_POINTER, - sizeof(build_ctx_data), &build_ctx_data); + CeedQFunctionContextSetData(build_ctx, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(build_ctx_data), &build_ctx_data); // Create the QFunction that builds the diffusion operator (i.e. computes its // quadrature data) and set its context data. CeedQFunction qf_build; switch (gallery) { - case 0: - // This creates the QFunction directly. - CeedQFunctionCreateInterior(ceed, 1, f_build_diff, - f_build_diff_loc, &qf_build); - CeedQFunctionAddInput(qf_build, "dx", num_comp_x*dim, CEED_EVAL_GRAD); - CeedQFunctionAddInput(qf_build, "weights", 1, CEED_EVAL_WEIGHT); - CeedQFunctionAddOutput(qf_build, "qdata", dim*(dim+1)/2, CEED_EVAL_NONE); - CeedQFunctionSetContext(qf_build, build_ctx); - break; - case 1: { - // This creates the QFunction via the gallery. - char name[16] = ""; - snprintf(name, sizeof name, "Poisson%" CeedInt_FMT "DBuild", dim); - CeedQFunctionCreateInteriorByName(ceed, name, &qf_build); - break; - } + case 0: + // This creates the QFunction directly. + CeedQFunctionCreateInterior(ceed, 1, f_build_diff, f_build_diff_loc, &qf_build); + CeedQFunctionAddInput(qf_build, "dx", num_comp_x * dim, CEED_EVAL_GRAD); + CeedQFunctionAddInput(qf_build, "weights", 1, CEED_EVAL_WEIGHT); + CeedQFunctionAddOutput(qf_build, "qdata", dim * (dim + 1) / 2, CEED_EVAL_NONE); + CeedQFunctionSetContext(qf_build, build_ctx); + break; + case 1: { + // This creates the QFunction via the gallery. + char name[16] = ""; + snprintf(name, sizeof name, "Poisson%" CeedInt_FMT "DBuild", dim); + CeedQFunctionCreateInteriorByName(ceed, name, &qf_build); + break; + } } // Create the operator that builds the quadrature data for the diffusion // operator. 
CeedOperator op_build; - CeedOperatorCreate(ceed, qf_build, CEED_QFUNCTION_NONE, - CEED_QFUNCTION_NONE, &op_build); - CeedOperatorSetField(op_build, "dx", mesh_restr, mesh_basis, - CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op_build, "weights", CEED_ELEMRESTRICTION_NONE, - mesh_basis, CEED_VECTOR_NONE); - CeedOperatorSetField(op_build, "qdata", q_data_restr_i, - CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE); + CeedOperatorCreate(ceed, qf_build, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_build); + CeedOperatorSetField(op_build, "dx", mesh_restr, mesh_basis, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_build, "weights", CEED_ELEMRESTRICTION_NONE, mesh_basis, CEED_VECTOR_NONE); + CeedOperatorSetField(op_build, "qdata", q_data_restr_i, CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE); // Compute the quadrature data for the diffusion operator. CeedVector q_data; - CeedInt elem_qpts = CeedIntPow(num_qpts, dim); - CeedInt num_elem = 1; - for (CeedInt d = 0; d < dim; d++) - num_elem *= num_xyz[d]; - CeedVectorCreate(ceed, num_elem*elem_qpts*dim*(dim+1)/2, &q_data); - CeedOperatorApply(op_build, mesh_coords, q_data, - CEED_REQUEST_IMMEDIATE); + CeedInt elem_qpts = CeedIntPow(num_qpts, dim); + CeedInt num_elem = 1; + for (CeedInt d = 0; d < dim; d++) num_elem *= num_xyz[d]; + CeedVectorCreate(ceed, num_elem * elem_qpts * dim * (dim + 1) / 2, &q_data); + CeedOperatorApply(op_build, mesh_coords, q_data, CEED_REQUEST_IMMEDIATE); // Create the QFunction that defines the action of the diffusion operator. CeedQFunction qf_apply; switch (gallery) { - case 0: - // This creates the QFunction directly. 
- CeedQFunctionCreateInterior(ceed, 1, f_apply_diff, - f_apply_diff_loc, &qf_apply); - CeedQFunctionAddInput(qf_apply, "du", dim, CEED_EVAL_GRAD); - CeedQFunctionAddInput(qf_apply, "qdata", dim*(dim+1)/2, CEED_EVAL_NONE); - CeedQFunctionAddOutput(qf_apply, "dv", dim, CEED_EVAL_GRAD); - CeedQFunctionSetContext(qf_apply, build_ctx); - break; - case 1: { - // This creates the QFunction via the gallery. - char name[16] = ""; - snprintf(name, sizeof name, "Poisson%" CeedInt_FMT "DApply", dim); - CeedQFunctionCreateInteriorByName(ceed, name, &qf_apply); - break; - } + case 0: + // This creates the QFunction directly. + CeedQFunctionCreateInterior(ceed, 1, f_apply_diff, f_apply_diff_loc, &qf_apply); + CeedQFunctionAddInput(qf_apply, "du", dim, CEED_EVAL_GRAD); + CeedQFunctionAddInput(qf_apply, "qdata", dim * (dim + 1) / 2, CEED_EVAL_NONE); + CeedQFunctionAddOutput(qf_apply, "dv", dim, CEED_EVAL_GRAD); + CeedQFunctionSetContext(qf_apply, build_ctx); + break; + case 1: { + // This creates the QFunction via the gallery. + char name[16] = ""; + snprintf(name, sizeof name, "Poisson%" CeedInt_FMT "DApply", dim); + CeedQFunctionCreateInteriorByName(ceed, name, &qf_apply); + break; + } } // Create the diffusion operator. CeedOperator op_apply; - CeedOperatorCreate(ceed, qf_apply, CEED_QFUNCTION_NONE, - CEED_QFUNCTION_NONE, &op_apply); + CeedOperatorCreate(ceed, qf_apply, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_apply); CeedOperatorSetField(op_apply, "du", sol_restr, sol_basis, CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op_apply, "qdata", q_data_restr_i, CEED_BASIS_COLLOCATED, - q_data); + CeedOperatorSetField(op_apply, "qdata", q_data_restr_i, CEED_BASIS_COLLOCATED, q_data); CeedOperatorSetField(op_apply, "dv", sol_restr, sol_basis, CEED_VECTOR_ACTIVE); // Create auxiliary solution-size vectors. @@ -261,14 +242,13 @@ int main(int argc, const char *argv[]) { CeedVectorCreate(ceed, sol_size, &v); // Initialize 'u' with sum of coordinates, x+y+z. 
- CeedScalar *u_array; + CeedScalar *u_array; const CeedScalar *x_array; CeedVectorGetArrayWrite(u, CEED_MEM_HOST, &u_array); CeedVectorGetArrayRead(mesh_coords, CEED_MEM_HOST, &x_array); for (CeedInt i = 0; i < sol_size; i++) { u_array[i] = 0; - for (CeedInt d = 0; d < dim; d++) - u_array[i] += x_array[i+d*sol_size]; + for (CeedInt d = 0; d < dim; d++) u_array[i] += x_array[i + d * sol_size]; } CeedVectorRestoreArray(u, &u_array); CeedVectorRestoreArrayRead(mesh_coords, &x_array); @@ -281,23 +261,18 @@ int main(int argc, const char *argv[]) { const CeedScalar *v_array; CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); CeedScalar sa = 0.; - for (CeedInt i = 0; i < sol_size; i++) { - sa += fabs(v_array[i]); - } + for (CeedInt i = 0; i < sol_size; i++) sa += fabs(v_array[i]); CeedVectorRestoreArrayRead(v, &v_array); if (!test) { // LCOV_EXCL_START printf(" done.\n"); printf("Exact mesh surface area : % .14g\n", exact_sa); printf("Computed mesh surface area : % .14g\n", sa); - printf("Surface area error : % .14g\n", sa-exact_sa); + printf("Surface area error : % .14g\n", sa - exact_sa); // LCOV_EXCL_STOP } else { - CeedScalar tol = (dim==1 ? 10000.*CEED_EPSILON : dim==2 ? 1E-1 : 1E-1); - if (fabs(sa-exact_sa)>tol) - // LCOV_EXCL_START - printf("Surface area error : % .14g\n", sa-exact_sa); - // LCOV_EXCL_STOP + CeedScalar tol = (dim == 1 ? 10000. * CEED_EPSILON : dim == 2 ? 1E-1 : 1E-1); + if (fabs(sa - exact_sa) > tol) printf("Surface area error : % .14g\n", sa - exact_sa); } // Free dynamically allocated memory. 
@@ -319,47 +294,50 @@ int main(int argc, const char *argv[]) { return 0; } -int GetCartesianMeshSize(CeedInt dim, CeedInt degree, CeedInt prob_size, - CeedInt num_xyz[3]) { +int GetCartesianMeshSize(CeedInt dim, CeedInt degree, CeedInt prob_size, CeedInt num_xyz[3]) { // Use the approximate formula: // prob_size ~ num_elem * degree^dim CeedInt num_elem = prob_size / CeedIntPow(degree, dim); - CeedInt s = 0; // find s: num_elem/2 < 2^s <= num_elem + CeedInt s = 0; // find s: num_elem/2 < 2^s <= num_elem while (num_elem > 1) { num_elem /= 2; s++; } - CeedInt r = s%dim; + CeedInt r = s % dim; for (CeedInt d = 0; d < dim; d++) { - CeedInt sd = s/dim; - if (r > 0) { sd++; r--; } + CeedInt sd = s / dim; + if (r > 0) { + sd++; + r--; + } num_xyz[d] = 1 << sd; } return 0; } -int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[3], - CeedInt degree, CeedInt num_comp, CeedInt *size, - CeedInt num_qpts, CeedElemRestriction *restr, - CeedElemRestriction *restr_i) { - CeedInt p = degree + 1; - CeedInt num_nodes = CeedIntPow(p, dim); // number of scalar nodes per element - CeedInt elem_qpts = CeedIntPow(num_qpts, dim); // number of qpts per element +int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[3], CeedInt degree, CeedInt num_comp, CeedInt *size, CeedInt num_qpts, + CeedElemRestriction *restr, CeedElemRestriction *restr_i) { + CeedInt p = degree + 1; + CeedInt num_nodes = CeedIntPow(p, dim); // number of scalar nodes per element + CeedInt elem_qpts = CeedIntPow(num_qpts, dim); // number of qpts per element CeedInt nd[3], num_elem = 1, scalar_size = 1; for (CeedInt d = 0; d < dim; d++) { num_elem *= num_xyz[d]; nd[d] = num_xyz[d] * (p - 1) + 1; scalar_size *= nd[d]; } - *size = scalar_size*num_comp; + *size = scalar_size * num_comp; // elem: 0 1 n-1 // |---*-...-*---|---*-...-*---|- ... 
-|--...--| // num_nodes: 0 1 p-1 p p+1 2*p n*p - CeedInt *el_nodes = malloc(sizeof(CeedInt)*num_elem*num_nodes); + CeedInt *el_nodes = malloc(sizeof(CeedInt) * num_elem * num_nodes); for (CeedInt e = 0; e < num_elem; e++) { CeedInt e_xyz[3] = {1, 1, 1}, re = e; - for (CeedInt d = 0; d < dim; d++) { e_xyz[d] = re%num_xyz[d]; re /= num_xyz[d]; } - CeedInt *loc_el_nodes = el_nodes + e*num_nodes; + for (CeedInt d = 0; d < dim; d++) { + e_xyz[d] = re % num_xyz[d]; + re /= num_xyz[d]; + } + CeedInt *loc_el_nodes = el_nodes + e * num_nodes; for (CeedInt l_nodes = 0; l_nodes < num_nodes; l_nodes++) { CeedInt g_nodes = 0, g_nodes_stride = 1, r_nodes = l_nodes; for (CeedInt d = 0; d < dim; d++) { @@ -371,22 +349,18 @@ int BuildCartesianRestriction(Ceed ceed, CeedInt dim, CeedInt num_xyz[3], } } if (restr) - CeedElemRestrictionCreate(ceed, num_elem, num_nodes, num_comp, scalar_size, - num_comp * scalar_size, CEED_MEM_HOST, - CEED_COPY_VALUES, el_nodes, restr); + CeedElemRestrictionCreate(ceed, num_elem, num_nodes, num_comp, scalar_size, num_comp * scalar_size, CEED_MEM_HOST, CEED_COPY_VALUES, el_nodes, + restr); free(el_nodes); if (restr_i) { - CeedElemRestrictionCreateStrided(ceed, num_elem, elem_qpts, - num_comp, num_comp * elem_qpts * num_elem, - CEED_STRIDES_BACKEND, restr_i); + CeedElemRestrictionCreateStrided(ceed, num_elem, elem_qpts, num_comp, num_comp * elem_qpts * num_elem, CEED_STRIDES_BACKEND, restr_i); } return 0; } -int SetCartesianMeshCoords(CeedInt dim, CeedInt num_xyz[3], CeedInt mesh_degree, - CeedVector mesh_coords) { +int SetCartesianMeshCoords(CeedInt dim, CeedInt num_xyz[3], CeedInt mesh_degree, CeedVector mesh_coords) { CeedInt p = mesh_degree + 1; CeedInt nd[3], num_elem = 1, scalar_size = 1; for (CeedInt d = 0; d < dim; d++) { @@ -398,14 +372,15 @@ int SetCartesianMeshCoords(CeedInt dim, CeedInt num_xyz[3], CeedInt mesh_degree, CeedVectorGetArrayWrite(mesh_coords, CEED_MEM_HOST, &coords); CeedScalar *nodes = malloc(sizeof(CeedScalar) * p); // The H1 
basis uses Lobatto quadrature points as nodes. - CeedLobattoQuadrature(p, nodes, NULL); // nodes are in [-1,1] - for (CeedInt i = 0; i < p; i++) { nodes[i] = 0.5 + 0.5 * nodes[i]; } + CeedLobattoQuadrature(p, nodes, NULL); // nodes are in [-1,1] + for (CeedInt i = 0; i < p; i++) { + nodes[i] = 0.5 + 0.5 * nodes[i]; + } for (CeedInt gs_nodes = 0; gs_nodes < scalar_size; gs_nodes++) { CeedInt r_nodes = gs_nodes; for (CeedInt d = 0; d < dim; d++) { - CeedInt d1d = r_nodes % nd[d]; - coords[gs_nodes + scalar_size * d] = ((d1d / (p - 1)) + nodes[d1d % - (p - 1)]) / num_xyz[d]; + CeedInt d1d = r_nodes % nd[d]; + coords[gs_nodes + scalar_size * d] = ((d1d / (p - 1)) + nodes[d1d % (p - 1)]) / num_xyz[d]; r_nodes /= nd[d]; } } @@ -415,18 +390,17 @@ int SetCartesianMeshCoords(CeedInt dim, CeedInt num_xyz[3], CeedInt mesh_degree, } #ifndef M_PI -#define M_PI 3.14159265358979323846 +#define M_PI 3.14159265358979323846 #endif -CeedScalar TransformMeshCoords(CeedInt dim, CeedInt mesh_size, - CeedVector mesh_coords) { - CeedScalar exact_sa = (dim == 1 ? 2 : dim == 2 ? 4 : 6); +CeedScalar TransformMeshCoords(CeedInt dim, CeedInt mesh_size, CeedVector mesh_coords) { + CeedScalar exact_sa = (dim == 1 ? 2 : dim == 2 ? 4 : 6); CeedScalar *coords; CeedVectorGetArray(mesh_coords, CEED_MEM_HOST, &coords); for (CeedInt i = 0; i < mesh_size; i++) { // map [0,1] to [0,1] varying the mesh density - coords[i] = 0.5 + 1./sqrt(3.) * sin((2./3.) * M_PI * (coords[i] - 0.5)); + coords[i] = 0.5 + 1. / sqrt(3.) * sin((2. / 3.) 
* M_PI * (coords[i] - 0.5)); } CeedVectorRestoreArray(mesh_coords, &coords); diff --git a/examples/ceed/ex2-surface.h b/examples/ceed/ex2-surface.h index 26ff69fd12..d5d91c44c3 100644 --- a/examples/ceed/ex2-surface.h +++ b/examples/ceed/ex2-surface.h @@ -11,11 +11,12 @@ #include /// A structure used to pass additional data to f_build_diff -struct BuildContext { CeedInt dim, space_dim; }; +struct BuildContext { + CeedInt dim, space_dim; +}; /// libCEED Q-function for building quadrature data for a diffusion operator -CEED_QFUNCTION(f_build_diff)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(f_build_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { struct BuildContext *bc = (struct BuildContext *)ctx; // in[0] is Jacobians with shape [dim, nc=dim, Q] // in[1] is quadrature weights, size (Q) @@ -23,137 +24,110 @@ CEED_QFUNCTION(f_build_diff)(void *ctx, const CeedInt Q, // At every quadrature point, compute w/det(J).adj(J).adj(J)^T and store // the symmetric part of the result. 
const CeedScalar *J = in[0], *w = in[1]; - CeedScalar *q_data = out[0]; + CeedScalar *q_data = out[0]; - switch (bc->dim + 10*bc->space_dim) { - case 11: - CeedPragmaSIMD - for (CeedInt i=0; idim + 10 * bc->space_dim) { + case 11: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { q_data[i] = w[i] / J[i]; } // End of Quadrature Point Loop + break; + case 22: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { + // J: 0 2 q_data: 0 2 adj(J): J22 -J12 + // 1 3 2 1 -J21 J11 + const CeedScalar J11 = J[i + Q * 0]; + const CeedScalar J21 = J[i + Q * 1]; + const CeedScalar J12 = J[i + Q * 2]; + const CeedScalar J22 = J[i + Q * 3]; + const CeedScalar qw = w[i] / (J11 * J22 - J21 * J12); + q_data[i + Q * 0] = qw * (J12 * J12 + J22 * J22); + q_data[i + Q * 1] = qw * (J11 * J11 + J21 * J21); + q_data[i + Q * 2] = -qw * (J11 * J12 + J21 * J22); + } // End of Quadrature Point Loop + break; + case 33: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { + // Compute the adjoint + CeedScalar A[3][3]; + for (CeedInt j = 0; j < 3; j++) + for (CeedInt k = 0; k < 3; k++) + // Equivalent code with J as a VLA and no mod operations: + // A[k][j] = J[j+1][k+1]*J[j+2][k+2] - J[j+1][k+2]*J[j+2][k+1] + A[k][j] = J[i + Q * ((j + 1) % 3 + 3 * ((k + 1) % 3))] * J[i + Q * ((j + 2) % 3 + 3 * ((k + 2) % 3))] - + J[i + Q * ((j + 1) % 3 + 3 * ((k + 2) % 3))] * J[i + Q * ((j + 2) % 3 + 3 * ((k + 1) % 3))]; - // Compute quadrature weight / det(J) - const CeedScalar qw = w[i] / (J[i+Q*0]*A[0][0] + J[i+Q*1]*A[0][1] + - J[i+Q*2]*A[0][2]); + // Compute quadrature weight / det(J) + const CeedScalar qw = w[i] / (J[i + Q * 0] * A[0][0] + J[i + Q * 1] * A[0][1] + J[i + Q * 2] * A[0][2]); - // Compute geometric factors - // Stored in Voigt convention - // 0 5 4 - // 5 1 3 - // 4 3 2 - q_data[i+Q*0] = qw * (A[0][0]*A[0][0] + A[0][1]*A[0][1] + A[0][2]*A[0][2]); - q_data[i+Q*1] = qw * (A[1][0]*A[1][0] + A[1][1]*A[1][1] + A[1][2]*A[1][2]); - q_data[i+Q*2] = qw * (A[2][0]*A[2][0] + A[2][1]*A[2][1] + A[2][2]*A[2][2]); 
- q_data[i+Q*3] = qw * (A[1][0]*A[2][0] + A[1][1]*A[2][1] + A[1][2]*A[2][2]); - q_data[i+Q*4] = qw * (A[0][0]*A[2][0] + A[0][1]*A[2][1] + A[0][2]*A[2][2]); - q_data[i+Q*5] = qw * (A[0][0]*A[1][0] + A[0][1]*A[1][1] + A[0][2]*A[1][2]); - } // End of Quadrature Point Loop - break; + // Compute geometric factors + // Stored in Voigt convention + // 0 5 4 + // 5 1 3 + // 4 3 2 + q_data[i + Q * 0] = qw * (A[0][0] * A[0][0] + A[0][1] * A[0][1] + A[0][2] * A[0][2]); + q_data[i + Q * 1] = qw * (A[1][0] * A[1][0] + A[1][1] * A[1][1] + A[1][2] * A[1][2]); + q_data[i + Q * 2] = qw * (A[2][0] * A[2][0] + A[2][1] * A[2][1] + A[2][2] * A[2][2]); + q_data[i + Q * 3] = qw * (A[1][0] * A[2][0] + A[1][1] * A[2][1] + A[1][2] * A[2][2]); + q_data[i + Q * 4] = qw * (A[0][0] * A[2][0] + A[0][1] * A[2][1] + A[0][2] * A[2][2]); + q_data[i + Q * 5] = qw * (A[0][0] * A[1][0] + A[0][1] * A[1][1] + A[0][2] * A[1][2]); + } // End of Quadrature Point Loop + break; } return 0; } /// libCEED Q-function for applying a diff operator -CEED_QFUNCTION(f_apply_diff)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(f_apply_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { struct BuildContext *bc = (struct BuildContext *)ctx; // in[0], out[0] have shape [dim, nc=1, Q] const CeedScalar *ug = in[0], *q_data = in[1]; - CeedScalar *vg = out[0]; + CeedScalar *vg = out[0]; switch (bc->dim) { - case 1: - CeedPragmaSIMD - for (CeedInt i=0; iapp_ctx = app_ctx; - user->units = units; - user->phys = phys_ctx; + user->app_ctx = app_ctx; + user->units = units; + user->phys = phys_ctx; problem->bc_from_ics = PETSC_TRUE; // --------------------------------------------------------------------------- // Process command line options // --------------------------------------------------------------------------- // -- Register problems to be available on the command line - ierr = RegisterProblems_NS(app_ctx); CHKERRQ(ierr); + 
PetscCall(RegisterProblems_NS(app_ctx)); // -- Process general command line options MPI_Comm comm = PETSC_COMM_WORLD; - user->comm = comm; - ierr = ProcessCommandLineOptions(comm, app_ctx, bc); CHKERRQ(ierr); + user->comm = comm; + PetscCall(ProcessCommandLineOptions(comm, app_ctx, bc)); // --------------------------------------------------------------------------- // Initialize libCEED @@ -109,23 +107,25 @@ int main(int argc, char **argv) { // Set up global mesh // --------------------------------------------------------------------------- // -- Create DM - DM dm; + DM dm; VecType vec_type = NULL; MatType mat_type = NULL; switch (mem_type_backend) { - case CEED_MEM_HOST: vec_type = VECSTANDARD; break; - case CEED_MEM_DEVICE: { - const char *resolved; - CeedGetResource(ceed, &resolved); - if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA; - else if (strstr(resolved, "/gpu/hip")) vec_type = VECKOKKOS; - else vec_type = VECSTANDARD; - } + case CEED_MEM_HOST: + vec_type = VECSTANDARD; + break; + case CEED_MEM_DEVICE: { + const char *resolved; + CeedGetResource(ceed, &resolved); + if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA; + else if (strstr(resolved, "/gpu/hip")) vec_type = VECKOKKOS; + else vec_type = VECSTANDARD; + } } if (strstr(vec_type, VECCUDA)) mat_type = MATAIJCUSPARSE; else if (strstr(vec_type, VECKOKKOS)) mat_type = MATAIJKOKKOS; else mat_type = MATAIJ; - ierr = CreateDM(comm, problem, mat_type, vec_type, &dm); CHKERRQ(ierr); + PetscCall(CreateDM(comm, problem, mat_type, vec_type, &dm)); user->dm = dm; PetscCall(DMSetApplicationContext(dm, user)); @@ -134,55 +134,49 @@ int main(int argc, char **argv) { // --------------------------------------------------------------------------- { PetscErrorCode (*p)(ProblemData *, DM, void *); - ierr = PetscFunctionListFind(app_ctx->problems, app_ctx->problem_name, &p); - CHKERRQ(ierr); - if (!p) SETERRQ(PETSC_COMM_SELF, 1, "Problem '%s' not found", - app_ctx->problem_name); - ierr = (*p)(problem, dm, 
&user); CHKERRQ(ierr); + PetscCall(PetscFunctionListFind(app_ctx->problems, app_ctx->problem_name, &p)); + if (!p) SETERRQ(PETSC_COMM_SELF, 1, "Problem '%s' not found", app_ctx->problem_name); + PetscCall((*p)(problem, dm, &user)); } // -- Set up DM - ierr = SetUpDM(dm, problem, app_ctx->degree, bc, phys_ctx); - CHKERRQ(ierr); + PetscCall(SetUpDM(dm, problem, app_ctx->degree, bc, phys_ctx)); // -- Refine DM for high-order viz if (app_ctx->viz_refine) { - ierr = VizRefineDM(dm, user, problem, bc, phys_ctx); - CHKERRQ(ierr); + PetscCall(VizRefineDM(dm, user, problem, bc, phys_ctx)); } // --------------------------------------------------------------------------- // Set up libCEED // --------------------------------------------------------------------------- // -- Set up libCEED objects - ierr = SetupLibceed(ceed, ceed_data, dm, user, app_ctx, problem, bc); - CHKERRQ(ierr); + PetscCall(SetupLibceed(ceed, ceed_data, dm, user, app_ctx, problem, bc)); // --------------------------------------------------------------------------- // Set up ICs // --------------------------------------------------------------------------- // -- Set up global state vector Q Vec Q; - ierr = DMCreateGlobalVector(dm, &Q); CHKERRQ(ierr); - ierr = VecZeroEntries(Q); CHKERRQ(ierr); + PetscCall(DMCreateGlobalVector(dm, &Q)); + PetscCall(VecZeroEntries(Q)); // -- Set up local state vectors Q_loc, Q_dot_loc - ierr = DMCreateLocalVector(dm, &user->Q_loc); CHKERRQ(ierr); - ierr = DMCreateLocalVector(dm, &user->Q_dot_loc); CHKERRQ(ierr); - ierr = VecZeroEntries(user->Q_dot_loc); CHKERRQ(ierr); + PetscCall(DMCreateLocalVector(dm, &user->Q_loc)); + PetscCall(DMCreateLocalVector(dm, &user->Q_dot_loc)); + PetscCall(VecZeroEntries(user->Q_dot_loc)); // -- Fix multiplicity for ICs - ierr = ICs_FixMultiplicity(dm, ceed_data, user, user->Q_loc, Q, 0.0); - CHKERRQ(ierr); + PetscCall(ICs_FixMultiplicity(dm, ceed_data, user, user->Q_loc, Q, 0.0)); // 
--------------------------------------------------------------------------- // Set up lumped mass matrix // --------------------------------------------------------------------------- // -- Set up global mass vector - ierr = VecDuplicate(Q, &user->M); CHKERRQ(ierr); + PetscCall(VecDuplicate(Q, &user->M)); // -- Compute lumped mass matrix - ierr = ComputeLumpedMassMatrix(ceed, dm, ceed_data, user->M); CHKERRQ(ierr); + PetscCall(ComputeLumpedMassMatrix(ceed, dm, ceed_data, user->M)); // --------------------------------------------------------------------------- // Record boundary values from initial condition @@ -193,7 +187,7 @@ int main(int argc, char **argv) { // still get the same results due to the problem->bc function, but with // potentially much slower execution. if (problem->bc_from_ics) { - ierr = SetBCsFromICs_NS(dm, Q, user->Q_loc); CHKERRQ(ierr); + PetscCall(SetBCsFromICs_NS(dm, Q, user->Q_loc)); } // --------------------------------------------------------------------------- @@ -201,14 +195,16 @@ int main(int argc, char **argv) { // --------------------------------------------------------------------------- PetscMPIInt rank; MPI_Comm_rank(comm, &rank); - if (!rank) {ierr = PetscMkdir(app_ctx->output_dir); CHKERRQ(ierr);} + if (!rank) { + PetscCall(PetscMkdir(app_ctx->output_dir)); + } // --------------------------------------------------------------------------- // Gather initial Q values in case of continuation of simulation // --------------------------------------------------------------------------- // -- Set up initial values from binary file if (app_ctx->cont_steps) { - ierr = SetupICsFromBinary(comm, app_ctx, Q); CHKERRQ(ierr); + PetscCall(SetupICsFromBinary(comm, app_ctx, Q)); } // --------------------------------------------------------------------------- @@ -218,84 +214,78 @@ int main(int argc, char **argv) { // Header and rank char host_name[PETSC_MAX_PATH_LEN]; int comm_size; - ierr = PetscGetHostName(host_name, sizeof host_name); 
CHKERRQ(ierr); - ierr = MPI_Comm_size(comm, &comm_size); CHKERRQ(ierr); - ierr = PetscPrintf(comm, - "\n-- Navier-Stokes solver - libCEED + PETSc --\n" - " MPI:\n" - " Host Name : %s\n" - " Total ranks : %d\n", - host_name, comm_size); CHKERRQ(ierr); + PetscCall(PetscGetHostName(host_name, sizeof host_name)); + PetscCall(MPI_Comm_size(comm, &comm_size)); + PetscCall(PetscPrintf(comm, + "\n-- Navier-Stokes solver - libCEED + PETSc --\n" + " MPI:\n" + " Host Name : %s\n" + " Total ranks : %d\n", + host_name, comm_size)); // Problem specific info - ierr = problem->print_info(problem, app_ctx); CHKERRQ(ierr); + PetscCall(problem->print_info(problem, app_ctx)); // libCEED const char *used_resource; CeedGetResource(ceed, &used_resource); - ierr = PetscPrintf(comm, - " libCEED:\n" - " libCEED Backend : %s\n" - " libCEED Backend MemType : %s\n", - used_resource, CeedMemTypes[mem_type_backend]); CHKERRQ(ierr); + PetscCall(PetscPrintf(comm, + " libCEED:\n" + " libCEED Backend : %s\n" + " libCEED Backend MemType : %s\n", + used_resource, CeedMemTypes[mem_type_backend])); // PETSc char box_faces_str[PETSC_MAX_PATH_LEN] = "3,3,3"; if (problem->dim == 2) box_faces_str[3] = '\0'; - ierr = PetscOptionsGetString(NULL, NULL, "-dm_plex_box_faces", box_faces_str, - sizeof(box_faces_str), NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsGetString(NULL, NULL, "-dm_plex_box_faces", box_faces_str, sizeof(box_faces_str), NULL)); MatType mat_type; VecType vec_type; - ierr = DMGetMatType(dm, &mat_type); CHKERRQ(ierr); - ierr = DMGetVecType(dm, &vec_type); CHKERRQ(ierr); - ierr = PetscPrintf(comm, - " PETSc:\n" - " Box Faces : %s\n" - " DM MatType : %s\n" - " DM VecType : %s\n" - " Time Stepping Scheme : %s\n", - box_faces_str, mat_type, vec_type, - phys_ctx->implicit ? 
"implicit" : "explicit"); CHKERRQ(ierr); + PetscCall(DMGetMatType(dm, &mat_type)); + PetscCall(DMGetVecType(dm, &vec_type)); + PetscCall(PetscPrintf(comm, + " PETSc:\n" + " Box Faces : %s\n" + " DM MatType : %s\n" + " DM VecType : %s\n" + " Time Stepping Scheme : %s\n", + box_faces_str, mat_type, vec_type, phys_ctx->implicit ? "implicit" : "explicit")); // Mesh const PetscInt num_comp_q = 5; CeedInt glob_dofs, owned_dofs; PetscInt glob_nodes, owned_nodes; - const CeedInt num_P = app_ctx->degree + 1, - num_Q = num_P + app_ctx->q_extra; + const CeedInt num_P = app_ctx->degree + 1, num_Q = num_P + app_ctx->q_extra; // -- Get global size - ierr = VecGetSize(Q, &glob_dofs); CHKERRQ(ierr); - ierr = VecGetLocalSize(Q, &owned_dofs); CHKERRQ(ierr); - glob_nodes = glob_dofs/num_comp_q; + PetscCall(VecGetSize(Q, &glob_dofs)); + PetscCall(VecGetLocalSize(Q, &owned_dofs)); + glob_nodes = glob_dofs / num_comp_q; // -- Get local size - ierr = VecGetSize(user->Q_loc, &owned_nodes); CHKERRQ(ierr); + PetscCall(VecGetSize(user->Q_loc, &owned_nodes)); owned_nodes /= num_comp_q; - ierr = PetscPrintf(comm, - " Mesh:\n" - " Number of 1D Basis Nodes (P) : %" CeedInt_FMT "\n" - " Number of 1D Quadrature Points (Q) : %" CeedInt_FMT "\n" - " Global DoFs : %" PetscInt_FMT "\n" - " Owned DoFs : %" PetscInt_FMT "\n" - " DoFs per node : %" PetscInt_FMT "\n" - " Global nodes : %" PetscInt_FMT "\n" - " Owned nodes : %" PetscInt_FMT "\n", - num_P, num_Q, glob_dofs, owned_dofs, num_comp_q, - glob_nodes, owned_nodes); CHKERRQ(ierr); + PetscCall(PetscPrintf(comm, + " Mesh:\n" + " Number of 1D Basis Nodes (P) : %" CeedInt_FMT "\n" + " Number of 1D Quadrature Points (Q) : %" CeedInt_FMT "\n" + " Global DoFs : %" PetscInt_FMT "\n" + " Owned DoFs : %" PetscInt_FMT "\n" + " DoFs per node : %" PetscInt_FMT "\n" + " Global nodes : %" PetscInt_FMT "\n" + " Owned nodes : %" PetscInt_FMT "\n", + num_P, num_Q, glob_dofs, owned_dofs, num_comp_q, glob_nodes, owned_nodes)); } // -- Zero Q_loc - ierr = 
VecZeroEntries(user->Q_loc); CHKERRQ(ierr); + PetscCall(VecZeroEntries(user->Q_loc)); // --------------------------------------------------------------------------- // TS: Create, setup, and solve // --------------------------------------------------------------------------- - TS ts; + TS ts; PetscScalar final_time; - ierr = TSSolve_NS(dm, user, app_ctx, phys_ctx, &Q, &final_time, &ts); - CHKERRQ(ierr); + PetscCall(TSSolve_NS(dm, user, app_ctx, phys_ctx, &Q, &final_time, &ts)); // --------------------------------------------------------------------------- // Post-processing // --------------------------------------------------------------------------- - ierr = PostProcess_NS(ts, ceed_data, dm, problem, user, Q, final_time); - CHKERRQ(ierr); + PetscCall(PostProcess_NS(ts, ceed_data, dm, problem, user, Q, final_time)); // --------------------------------------------------------------------------- // Destroy libCEED objects @@ -348,34 +338,34 @@ int main(int argc, char **argv) { // Clean up PETSc // --------------------------------------------------------------------------- // -- Vectors - ierr = VecDestroy(&Q); CHKERRQ(ierr); - ierr = VecDestroy(&user->M); CHKERRQ(ierr); - ierr = VecDestroy(&user->Q_loc); CHKERRQ(ierr); - ierr = VecDestroy(&user->Q_dot_loc); CHKERRQ(ierr); + PetscCall(VecDestroy(&Q)); + PetscCall(VecDestroy(&user->M)); + PetscCall(VecDestroy(&user->Q_loc)); + PetscCall(VecDestroy(&user->Q_dot_loc)); // -- Matrices - ierr = MatDestroy(&user->interp_viz); CHKERRQ(ierr); + PetscCall(MatDestroy(&user->interp_viz)); // -- DM - ierr = DMDestroy(&dm); CHKERRQ(ierr); - ierr = DMDestroy(&user->dm_viz); CHKERRQ(ierr); + PetscCall(DMDestroy(&dm)); + PetscCall(DMDestroy(&user->dm_viz)); // -- TS - ierr = TSDestroy(&ts); CHKERRQ(ierr); + PetscCall(TSDestroy(&ts)); // -- Function list - ierr = PetscFunctionListDestroy(&app_ctx->problems); CHKERRQ(ierr); + PetscCall(PetscFunctionListDestroy(&app_ctx->problems)); PetscCall(PetscFree(app_ctx->amat_type)); // -- 
Structs - ierr = PetscFree(units); CHKERRQ(ierr); - ierr = PetscFree(user); CHKERRQ(ierr); - ierr = PetscFree(problem); CHKERRQ(ierr); - ierr = PetscFree(bc); CHKERRQ(ierr); - ierr = PetscFree(phys_ctx); CHKERRQ(ierr); - ierr = PetscFree(app_ctx); CHKERRQ(ierr); - ierr = PetscFree(ceed_data); CHKERRQ(ierr); + PetscCall(PetscFree(units)); + PetscCall(PetscFree(user)); + PetscCall(PetscFree(problem)); + PetscCall(PetscFree(bc)); + PetscCall(PetscFree(phys_ctx)); + PetscCall(PetscFree(app_ctx)); + PetscCall(PetscFree(ceed_data)); return PetscFinalize(); } diff --git a/examples/fluids/navierstokes.h b/examples/fluids/navierstokes.h index 8356e4c000..3f3b4f9035 100644 --- a/examples/fluids/navierstokes.h +++ b/examples/fluids/navierstokes.h @@ -14,13 +14,14 @@ #include #include #include -#include "qfunctions/stabilization_types.h" + #include "qfunctions/newtonian_types.h" +#include "qfunctions/stabilization_types.h" // ----------------------------------------------------------------------------- // PETSc Version // ----------------------------------------------------------------------------- -#if PETSC_VERSION_LT(3,17,0) +#if PETSC_VERSION_LT(3, 17, 0) #error "PETSc v3.17 or later is required" #endif @@ -28,31 +29,21 @@ // Enums // ----------------------------------------------------------------------------- // Translate PetscMemType to CeedMemType -static inline CeedMemType MemTypeP2C(PetscMemType mem_type) { - return PetscMemTypeDevice(mem_type) ? CEED_MEM_DEVICE : CEED_MEM_HOST; -} +static inline CeedMemType MemTypeP2C(PetscMemType mem_type) { return PetscMemTypeDevice(mem_type) ? 
CEED_MEM_DEVICE : CEED_MEM_HOST; } // Advection - Wind Options typedef enum { WIND_ROTATION = 0, WIND_TRANSLATION = 1, } WindType; -static const char *const WindTypes[] = { - "rotation", - "translation", - "WindType", "WIND_", NULL -}; +static const char *const WindTypes[] = {"rotation", "translation", "WindType", "WIND_", NULL}; // Advection - Bubble Types typedef enum { - BUBBLE_SPHERE = 0, // dim=3 - BUBBLE_CYLINDER = 1, // dim=2 + BUBBLE_SPHERE = 0, // dim=3 + BUBBLE_CYLINDER = 1, // dim=2 } BubbleType; -static const char *const BubbleTypes[] = { - "sphere", - "cylinder", - "BubbleType", "BUBBLE_", NULL -}; +static const char *const BubbleTypes[] = {"sphere", "cylinder", "BubbleType", "BUBBLE_", NULL}; // Advection - Bubble Continuity Types typedef enum { @@ -60,89 +51,68 @@ typedef enum { BUBBLE_CONTINUITY_BACK_SHARP = 1, // Discontinuous, sharp back half shape BUBBLE_CONTINUITY_THICK = 2, // Define a finite thickness } BubbleContinuityType; -static const char *const BubbleContinuityTypes[] = { - "smooth", - "back_sharp", - "thick", - "BubbleContinuityType", "BUBBLE_CONTINUITY_", NULL -}; +static const char *const BubbleContinuityTypes[] = {"smooth", "back_sharp", "thick", "BubbleContinuityType", "BUBBLE_CONTINUITY_", NULL}; // Euler - test cases typedef enum { EULER_TEST_ISENTROPIC_VORTEX = 0, - EULER_TEST_1 = 1, - EULER_TEST_2 = 2, - EULER_TEST_3 = 3, - EULER_TEST_4 = 4, - EULER_TEST_5 = 5, + EULER_TEST_1 = 1, + EULER_TEST_2 = 2, + EULER_TEST_3 = 3, + EULER_TEST_4 = 4, + EULER_TEST_5 = 5, } EulerTestType; -static const char *const EulerTestTypes[] = { - "isentropic_vortex", - "test_1", - "test_2", - "test_3", - "test_4", - "test_5", - "EulerTestType", "EULER_TEST_", NULL -}; +static const char *const EulerTestTypes[] = {"isentropic_vortex", "test_1", "test_2", "test_3", "test_4", "test_5", + "EulerTestType", "EULER_TEST_", NULL}; // Stabilization methods -static const char *const StabilizationTypes[] = { - "none", - "SU", - "SUPG", - "StabilizationType", 
"STAB_", NULL -}; +static const char *const StabilizationTypes[] = {"none", "SU", "SUPG", "StabilizationType", "STAB_", NULL}; // ----------------------------------------------------------------------------- // Structs // ----------------------------------------------------------------------------- // Structs declarations -typedef struct AppCtx_private *AppCtx; -typedef struct CeedData_private *CeedData; -typedef struct User_private *User; -typedef struct Units_private *Units; -typedef struct SimpleBC_private *SimpleBC; -typedef struct Physics_private *Physics; +typedef struct AppCtx_private *AppCtx; +typedef struct CeedData_private *CeedData; +typedef struct User_private *User; +typedef struct Units_private *Units; +typedef struct SimpleBC_private *SimpleBC; +typedef struct Physics_private *Physics; // Application context from user command line options struct AppCtx_private { // libCEED arguments - char ceed_resource[PETSC_MAX_PATH_LEN]; // libCEED backend - PetscInt degree; - PetscInt q_extra; + char ceed_resource[PETSC_MAX_PATH_LEN]; // libCEED backend + PetscInt degree; + PetscInt q_extra; // Solver arguments - MatType amat_type; - PetscBool pmat_pbdiagonal; + MatType amat_type; + PetscBool pmat_pbdiagonal; // Post-processing arguments - PetscInt output_freq; - PetscInt viz_refine; - PetscInt cont_steps; - char cont_file[PETSC_MAX_PATH_LEN]; - char cont_time_file[PETSC_MAX_PATH_LEN]; - char output_dir[PETSC_MAX_PATH_LEN]; - PetscBool add_stepnum2bin; + PetscInt output_freq; + PetscInt viz_refine; + PetscInt cont_steps; + char cont_file[PETSC_MAX_PATH_LEN]; + char cont_time_file[PETSC_MAX_PATH_LEN]; + char output_dir[PETSC_MAX_PATH_LEN]; + PetscBool add_stepnum2bin; // Problem type arguments PetscFunctionList problems; char problem_name[PETSC_MAX_PATH_LEN]; // Test mode arguments - PetscBool test_mode; - PetscScalar test_tol; - char file_path[PETSC_MAX_PATH_LEN]; + PetscBool test_mode; + PetscScalar test_tol; + char file_path[PETSC_MAX_PATH_LEN]; }; // libCEED 
data struct struct CeedData_private { - CeedVector x_coord, q_data; - CeedQFunction qf_setup_vol, qf_ics, qf_rhs_vol, qf_ifunction_vol, - qf_setup_sur, - qf_apply_inflow, qf_apply_inflow_jacobian, - qf_apply_outflow, qf_apply_outflow_jacobian, - qf_apply_freestream, qf_apply_freestream_jacobian; - CeedBasis basis_x, basis_xc, basis_q, basis_x_sur, basis_q_sur, - basis_xc_sur; - CeedElemRestriction elem_restr_x, elem_restr_q, elem_restr_qd_i; - CeedOperator op_setup_vol, op_ics; + CeedVector x_coord, q_data; + CeedQFunction qf_setup_vol, qf_ics, qf_rhs_vol, qf_ifunction_vol, qf_setup_sur, qf_apply_inflow, qf_apply_inflow_jacobian, qf_apply_outflow, + qf_apply_outflow_jacobian, qf_apply_freestream, qf_apply_freestream_jacobian; + CeedBasis basis_x, basis_xc, basis_q, basis_x_sur, basis_q_sur, basis_xc_sur; + CeedElemRestriction elem_restr_x, elem_restr_q, elem_restr_qd_i; + CeedOperator op_setup_vol, op_ics; }; // PETSc user data @@ -156,12 +126,10 @@ struct User_private { Vec M, Q_loc, Q_dot_loc; Physics phys; AppCtx app_ctx; - CeedVector q_ceed, q_dot_ceed, g_ceed, coo_values_amat, coo_values_pmat, - x_ceed; - CeedOperator op_rhs_vol, op_rhs, op_ifunction_vol, op_ifunction, op_ijacobian, - op_dirichlet; - bool matrices_set_up; - CeedScalar time, dt; + CeedVector q_ceed, q_dot_ceed, g_ceed, coo_values_amat, coo_values_pmat, x_ceed; + CeedOperator op_rhs_vol, op_rhs, op_ifunction_vol, op_ifunction, op_ijacobian, op_dirichlet; + bool matrices_set_up; + CeedScalar time, dt; }; // Units @@ -181,38 +149,36 @@ struct Units_private { // Boundary conditions struct SimpleBC_private { - PetscInt num_wall, // Number of faces with wall BCs - wall_comps[5], // An array of constrained component numbers - num_comps, - num_slip[3], // Number of faces with slip BCs - num_inflow, - num_outflow, - num_freestream; + PetscInt num_wall, // Number of faces with wall BCs + wall_comps[5], // An array of constrained component numbers + num_comps, + num_slip[3], // Number of faces with slip 
BCs + num_inflow, num_outflow, num_freestream; PetscInt walls[16], slips[3][16], inflows[16], outflows[16], freestreams[16]; PetscBool user_bc; }; // Struct that contains all enums and structs used for the physics of all problems struct Physics_private { - WindType wind_type; - BubbleType bubble_type; - BubbleContinuityType bubble_continuity_type; - EulerTestType euler_test; - StabilizationType stab; - PetscBool implicit; - StateVariable state_var; - PetscBool has_curr_time; - PetscBool has_neumann; - CeedContextFieldLabel solution_time_label; - CeedContextFieldLabel stg_solution_time_label; - CeedContextFieldLabel timestep_size_label; - CeedContextFieldLabel ics_time_label; - CeedContextFieldLabel ijacobian_time_shift_label; + WindType wind_type; + BubbleType bubble_type; + BubbleContinuityType bubble_continuity_type; + EulerTestType euler_test; + StabilizationType stab; + PetscBool implicit; + StateVariable state_var; + PetscBool has_curr_time; + PetscBool has_neumann; + CeedContextFieldLabel solution_time_label; + CeedContextFieldLabel stg_solution_time_label; + CeedContextFieldLabel timestep_size_label; + CeedContextFieldLabel ics_time_label; + CeedContextFieldLabel ijacobian_time_shift_label; }; typedef struct { CeedQFunctionUser qfunction; - const char *qfunction_loc; + const char *qfunction_loc; CeedQFunctionContext qfunction_context; } ProblemQFunctionSpec; @@ -220,17 +186,15 @@ typedef struct { // *INDENT-OFF* typedef struct ProblemData_private ProblemData; struct ProblemData_private { - CeedInt dim, q_data_size_vol, q_data_size_sur, jac_data_size_sur; - CeedScalar dm_scale; - ProblemQFunctionSpec setup_vol, setup_sur, ics, apply_vol_rhs, apply_vol_ifunction, - apply_vol_ijacobian, apply_inflow, apply_outflow, apply_freestream, - apply_inflow_jacobian, apply_outflow_jacobian, apply_freestream_jacobian; - bool non_zero_time; - PetscErrorCode (*bc)(PetscInt, PetscReal, const PetscReal[], PetscInt, - PetscScalar[], void *); - void *bc_ctx; + CeedInt dim, 
q_data_size_vol, q_data_size_sur, jac_data_size_sur; + CeedScalar dm_scale; + ProblemQFunctionSpec setup_vol, setup_sur, ics, apply_vol_rhs, apply_vol_ifunction, apply_vol_ijacobian, apply_inflow, apply_outflow, + apply_freestream, apply_inflow_jacobian, apply_outflow_jacobian, apply_freestream_jacobian; + bool non_zero_time; + PetscErrorCode (*bc)(PetscInt, PetscReal, const PetscReal[], PetscInt, PetscScalar[], void *); + void *bc_ctx; PetscBool bc_from_ics, use_dirichlet_ceed; - PetscErrorCode (*print_info)(ProblemData*, AppCtx); + PetscErrorCode (*print_info)(ProblemData *, AppCtx); }; // *INDENT-ON* @@ -240,25 +204,16 @@ extern int FreeContextPetsc(void *); // Set up problems // ----------------------------------------------------------------------------- // Set up function for each problem -extern PetscErrorCode NS_NEWTONIAN_WAVE(ProblemData *problem, DM dm, - void *ctx); -extern PetscErrorCode NS_CHANNEL(ProblemData *problem, DM dm, - void *ctx); -extern PetscErrorCode NS_BLASIUS(ProblemData *problem, DM dm, - void *ctx); -extern PetscErrorCode NS_NEWTONIAN_IG(ProblemData *problem, DM dm, - void *ctx); -extern PetscErrorCode NS_DENSITY_CURRENT(ProblemData *problem, DM dm, - void *ctx); - -extern PetscErrorCode NS_EULER_VORTEX(ProblemData *problem, DM dm, - void *ctx); -extern PetscErrorCode NS_SHOCKTUBE(ProblemData *problem, DM dm, - void *ctx); -extern PetscErrorCode NS_ADVECTION(ProblemData *problem, DM dm, - void *ctx); -extern PetscErrorCode NS_ADVECTION2D(ProblemData *problem, DM dm, - void *ctx); +extern PetscErrorCode NS_NEWTONIAN_WAVE(ProblemData *problem, DM dm, void *ctx); +extern PetscErrorCode NS_CHANNEL(ProblemData *problem, DM dm, void *ctx); +extern PetscErrorCode NS_BLASIUS(ProblemData *problem, DM dm, void *ctx); +extern PetscErrorCode NS_NEWTONIAN_IG(ProblemData *problem, DM dm, void *ctx); +extern PetscErrorCode NS_DENSITY_CURRENT(ProblemData *problem, DM dm, void *ctx); + +extern PetscErrorCode NS_EULER_VORTEX(ProblemData *problem, DM dm, 
void *ctx); +extern PetscErrorCode NS_SHOCKTUBE(ProblemData *problem, DM dm, void *ctx); +extern PetscErrorCode NS_ADVECTION(ProblemData *problem, DM dm, void *ctx); +extern PetscErrorCode NS_ADVECTION2D(ProblemData *problem, DM dm, void *ctx); // Print function for each problem extern PetscErrorCode PRINT_NEWTONIAN(ProblemData *problem, AppCtx app_ctx); @@ -278,66 +233,48 @@ extern PetscErrorCode PRINT_ADVECTION2D(ProblemData *problem, AppCtx app_ctx); PetscInt Involute(PetscInt i); // Utility function to create local CEED restriction -PetscErrorCode CreateRestrictionFromPlex(Ceed ceed, DM dm, CeedInt height, - DMLabel domain_label, CeedInt value, CeedElemRestriction *elem_restr); +PetscErrorCode CreateRestrictionFromPlex(Ceed ceed, DM dm, CeedInt height, DMLabel domain_label, CeedInt value, CeedElemRestriction *elem_restr); // Utility function to get Ceed Restriction for each domain -PetscErrorCode GetRestrictionForDomain(Ceed ceed, DM dm, CeedInt height, - DMLabel domain_label, PetscInt value, - CeedInt Q, CeedInt q_data_size, - CeedElemRestriction *elem_restr_q, - CeedElemRestriction *elem_restr_x, - CeedElemRestriction *elem_restr_qd_i); +PetscErrorCode GetRestrictionForDomain(Ceed ceed, DM dm, CeedInt height, DMLabel domain_label, PetscInt value, CeedInt Q, CeedInt q_data_size, + CeedElemRestriction *elem_restr_q, CeedElemRestriction *elem_restr_x, CeedElemRestriction *elem_restr_qd_i); // Utility function to create CEED Composite Operator for the entire domain -PetscErrorCode CreateOperatorForDomain(Ceed ceed, DM dm, SimpleBC bc, - CeedData ceed_data, Physics phys, - CeedOperator op_apply_vol, - CeedOperator op_apply_ijacobian_vol, - CeedInt height, - CeedInt P_sur, CeedInt Q_sur, - CeedInt q_data_size_sur, CeedInt jac_data_size_sur, - CeedOperator *op_apply, CeedOperator *op_apply_ijacobian); - -PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, - AppCtx app_ctx, ProblemData *problem, SimpleBC bc); +PetscErrorCode 
CreateOperatorForDomain(Ceed ceed, DM dm, SimpleBC bc, CeedData ceed_data, Physics phys, CeedOperator op_apply_vol, + CeedOperator op_apply_ijacobian_vol, CeedInt height, CeedInt P_sur, CeedInt Q_sur, CeedInt q_data_size_sur, + CeedInt jac_data_size_sur, CeedOperator *op_apply, CeedOperator *op_apply_ijacobian); + +PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, AppCtx app_ctx, ProblemData *problem, SimpleBC bc); // ----------------------------------------------------------------------------- // Time-stepping functions // ----------------------------------------------------------------------------- // Compute mass matrix for explicit scheme -PetscErrorCode ComputeLumpedMassMatrix(Ceed ceed, DM dm, CeedData ceed_data, - Vec M); +PetscErrorCode ComputeLumpedMassMatrix(Ceed ceed, DM dm, CeedData ceed_data, Vec M); // RHS (Explicit time-stepper) function setup PetscErrorCode RHS_NS(TS ts, PetscReal t, Vec Q, Vec G, void *user_data); // Implicit time-stepper function setup -PetscErrorCode IFunction_NS(TS ts, PetscReal t, Vec Q, Vec Q_dot, Vec G, - void *user_data); +PetscErrorCode IFunction_NS(TS ts, PetscReal t, Vec Q, Vec Q_dot, Vec G, void *user_data); // User provided TS Monitor -PetscErrorCode TSMonitor_NS(TS ts, PetscInt step_no, PetscReal time, Vec Q, - void *ctx); +PetscErrorCode TSMonitor_NS(TS ts, PetscInt step_no, PetscReal time, Vec Q, void *ctx); // TS: Create, setup, and solve -PetscErrorCode TSSolve_NS(DM dm, User user, AppCtx app_ctx, Physics phys, - Vec *Q, PetscScalar *f_time, TS *ts); +PetscErrorCode TSSolve_NS(DM dm, User user, AppCtx app_ctx, Physics phys, Vec *Q, PetscScalar *f_time, TS *ts); // ----------------------------------------------------------------------------- // Setup DM // ----------------------------------------------------------------------------- // Create mesh -PetscErrorCode CreateDM(MPI_Comm comm, ProblemData *problem, - MatType, VecType, DM *dm); +PetscErrorCode CreateDM(MPI_Comm comm, ProblemData 
*problem, MatType, VecType, DM *dm); // Set up DM -PetscErrorCode SetUpDM(DM dm, ProblemData *problem, PetscInt degree, - SimpleBC bc, Physics phys); +PetscErrorCode SetUpDM(DM dm, ProblemData *problem, PetscInt degree, SimpleBC bc, Physics phys); // Refine DM for high-order viz -PetscErrorCode VizRefineDM(DM dm, User user, ProblemData *problem, - SimpleBC bc, Physics phys); +PetscErrorCode VizRefineDM(DM dm, User user, ProblemData *problem, SimpleBC bc, Physics phys); // ----------------------------------------------------------------------------- // Process command line options @@ -346,31 +283,24 @@ PetscErrorCode VizRefineDM(DM dm, User user, ProblemData *problem, PetscErrorCode RegisterProblems_NS(AppCtx app_ctx); // Process general command line options -PetscErrorCode ProcessCommandLineOptions(MPI_Comm comm, AppCtx app_ctx, - SimpleBC bc); +PetscErrorCode ProcessCommandLineOptions(MPI_Comm comm, AppCtx app_ctx, SimpleBC bc); // ----------------------------------------------------------------------------- // Miscellaneous utility functions // ----------------------------------------------------------------------------- -PetscErrorCode ICs_FixMultiplicity(DM dm, CeedData ceed_data, User user, - Vec Q_loc, Vec Q, - CeedScalar time); +PetscErrorCode ICs_FixMultiplicity(DM dm, CeedData ceed_data, User user, Vec Q_loc, Vec Q, CeedScalar time); -PetscErrorCode DMPlexInsertBoundaryValues_NS(DM dm, - PetscBool insert_essential, Vec Q_loc, PetscReal time, Vec face_geom_FVM, - Vec cell_geom_FVM, Vec grad_FVM); +PetscErrorCode DMPlexInsertBoundaryValues_NS(DM dm, PetscBool insert_essential, Vec Q_loc, PetscReal time, Vec face_geom_FVM, Vec cell_geom_FVM, + Vec grad_FVM); // Compare reference solution values with current test run for CI PetscErrorCode RegressionTests_NS(AppCtx app_ctx, Vec Q); // Get error for problems with exact solutions -PetscErrorCode GetError_NS(CeedData ceed_data, DM dm, User user, Vec Q, - PetscScalar final_time); +PetscErrorCode 
GetError_NS(CeedData ceed_data, DM dm, User user, Vec Q, PetscScalar final_time); // Post-processing -PetscErrorCode PostProcess_NS(TS ts, CeedData ceed_data, DM dm, - ProblemData *problem, User user, - Vec Q, PetscScalar final_time); +PetscErrorCode PostProcess_NS(TS ts, CeedData ceed_data, DM dm, ProblemData *problem, User user, Vec Q, PetscScalar final_time); // -- Gather initial Q values in case of continuation of simulation PetscErrorCode SetupICsFromBinary(MPI_Comm comm, AppCtx app_ctx, Vec Q); @@ -383,9 +313,9 @@ PetscErrorCode SetBCsFromICs_NS(DM dm, Vec Q, Vec Q_loc); // ----------------------------------------------------------------------------- // Setup StrongBCs that use QFunctions -PetscErrorCode SetupStrongBC_Ceed(Ceed ceed, CeedData ceed_data, DM dm, - User user, AppCtx app_ctx, ProblemData *problem, - SimpleBC bc, CeedInt Q_sur, CeedInt q_data_size_sur); +PetscErrorCode SetupStrongBC_Ceed(Ceed ceed, CeedData ceed_data, DM dm, User user, AppCtx app_ctx, ProblemData *problem, SimpleBC bc, CeedInt Q_sur, + CeedInt q_data_size_sur); PetscErrorCode FreestreamBCSetup(ProblemData *problem, DM dm, void *ctx); -#endif // libceed_fluids_examples_navier_stokes_h + +#endif // libceed_fluids_examples_navier_stokes_h diff --git a/examples/fluids/problems/advection.c b/examples/fluids/problems/advection.c index 0b13395391..ac33d4c837 100644 --- a/examples/fluids/problems/advection.c +++ b/examples/fluids/problems/advection.c @@ -8,27 +8,27 @@ /// @file /// Utility functions for setting up ADVECTION +#include "../qfunctions/advection.h" + #include "../navierstokes.h" #include "../qfunctions/setupgeo.h" -#include "../qfunctions/advection.h" PetscErrorCode NS_ADVECTION(ProblemData *problem, DM dm, void *ctx) { WindType wind_type; BubbleType bubble_type; BubbleContinuityType bubble_continuity_type; StabilizationType stab; - SetupContextAdv setup_context; + SetupContextAdv setup_context; User user = *(User *)ctx; MPI_Comm comm = PETSC_COMM_WORLD; PetscBool implicit; 
PetscBool has_curr_time = PETSC_FALSE; - PetscInt ierr; AdvectionContext advection_ctx; CeedQFunctionContext advection_context; PetscFunctionBeginUser; - ierr = PetscCalloc1(1, &setup_context); CHKERRQ(ierr); - ierr = PetscCalloc1(1, &advection_ctx); CHKERRQ(ierr); + PetscCall(PetscCalloc1(1, &setup_context)); + PetscCall(PetscCalloc1(1, &advection_ctx)); // ------------------------------------------------------ // SET UP ADVECTION @@ -56,22 +56,21 @@ PetscErrorCode NS_ADVECTION(ProblemData *problem, DM dm, void *ctx) { // ------------------------------------------------------ // Create the libCEED context // ------------------------------------------------------ - CeedScalar rc = 1000.; // m (Radius of bubble) - CeedScalar CtauS = 0.; // dimensionless - CeedScalar strong_form = 0.; // [0,1] - CeedScalar E_wind = 1.e6; // J - PetscReal wind[3] = {1., 0, 0}; // m/s - PetscReal domain_min[3], domain_max[3], domain_size[3]; - ierr = DMGetBoundingBox(dm, domain_min, domain_max); CHKERRQ(ierr); - for (PetscInt i=0; i<3; i++) domain_size[i] = domain_max[i] - domain_min[i]; - + CeedScalar rc = 1000.; // m (Radius of bubble) + CeedScalar CtauS = 0.; // dimensionless + CeedScalar strong_form = 0.; // [0,1] + CeedScalar E_wind = 1.e6; // J + PetscReal wind[3] = {1., 0, 0}; // m/s + PetscReal domain_min[3], domain_max[3], domain_size[3]; + PetscCall(DMGetBoundingBox(dm, domain_min, domain_max)); + for (PetscInt i = 0; i < 3; i++) domain_size[i] = domain_max[i] - domain_min[i]; // ------------------------------------------------------ // Create the PETSc context // ------------------------------------------------------ - PetscScalar meter = 1e-2; // 1 meter in scaled length units - PetscScalar kilogram = 1e-6; // 1 kilogram in scaled mass units - PetscScalar second = 1e-2; // 1 second in scaled time units + PetscScalar meter = 1e-2; // 1 meter in scaled length units + PetscScalar kilogram = 1e-6; // 1 kilogram in scaled mass units + PetscScalar second = 1e-2; // 1 second in 
scaled time units PetscScalar Joule; // ------------------------------------------------------ @@ -79,74 +78,46 @@ PetscErrorCode NS_ADVECTION(ProblemData *problem, DM dm, void *ctx) { // ------------------------------------------------------ PetscOptionsBegin(comm, NULL, "Options for ADVECTION problem", NULL); // -- Physics - ierr = PetscOptionsScalar("-rc", "Characteristic radius of thermal bubble", - NULL, rc, &rc, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsScalar("-rc", "Characteristic radius of thermal bubble", NULL, rc, &rc, NULL)); PetscBool translation; - ierr = PetscOptionsEnum("-wind_type", "Wind type in Advection", - NULL, WindTypes, - (PetscEnum)(wind_type = WIND_ROTATION), - (PetscEnum *)&wind_type, &translation); CHKERRQ(ierr); + PetscCall(PetscOptionsEnum("-wind_type", "Wind type in Advection", NULL, WindTypes, (PetscEnum)(wind_type = WIND_ROTATION), (PetscEnum *)&wind_type, + &translation)); if (translation) user->phys->has_neumann = PETSC_TRUE; - PetscInt n = problem->dim; + PetscInt n = problem->dim; PetscBool user_wind; - ierr = PetscOptionsRealArray("-wind_translation", "Constant wind vector", - NULL, wind, &n, &user_wind); CHKERRQ(ierr); - ierr = PetscOptionsScalar("-CtauS", - "Scale coefficient for tau (nondimensional)", - NULL, CtauS, &CtauS, NULL); CHKERRQ(ierr); - ierr = PetscOptionsScalar("-strong_form", - "Strong (1) or weak/integrated by parts (0) advection residual", - NULL, strong_form, &strong_form, NULL); CHKERRQ(ierr); - ierr = PetscOptionsScalar("-E_wind", "Total energy of inflow wind", - NULL, E_wind, &E_wind, NULL); CHKERRQ(ierr); - ierr = PetscOptionsEnum("-bubble_type", "Sphere (3D) or cylinder (2D)", - NULL, BubbleTypes, - (PetscEnum)(bubble_type = BUBBLE_SPHERE), - (PetscEnum *)&bubble_type, NULL); CHKERRQ(ierr); - ierr = PetscOptionsEnum("-bubble_continuity", "Smooth, back_sharp, or thick", - NULL, BubbleContinuityTypes, - (PetscEnum)(bubble_continuity_type = BUBBLE_CONTINUITY_SMOOTH), - (PetscEnum 
*)&bubble_continuity_type, NULL); CHKERRQ(ierr); - ierr = PetscOptionsEnum("-stab", "Stabilization method", NULL, - StabilizationTypes, (PetscEnum)(stab = STAB_NONE), - (PetscEnum *)&stab, NULL); CHKERRQ(ierr); - ierr = PetscOptionsBool("-implicit", "Use implicit (IFunction) formulation", - NULL, implicit=PETSC_FALSE, &implicit, NULL); - CHKERRQ(ierr); + PetscCall(PetscOptionsRealArray("-wind_translation", "Constant wind vector", NULL, wind, &n, &user_wind)); + PetscCall(PetscOptionsScalar("-CtauS", "Scale coefficient for tau (nondimensional)", NULL, CtauS, &CtauS, NULL)); + PetscCall( + PetscOptionsScalar("-strong_form", "Strong (1) or weak/integrated by parts (0) advection residual", NULL, strong_form, &strong_form, NULL)); + PetscCall(PetscOptionsScalar("-E_wind", "Total energy of inflow wind", NULL, E_wind, &E_wind, NULL)); + PetscCall(PetscOptionsEnum("-bubble_type", "Sphere (3D) or cylinder (2D)", NULL, BubbleTypes, (PetscEnum)(bubble_type = BUBBLE_SPHERE), + (PetscEnum *)&bubble_type, NULL)); + PetscCall(PetscOptionsEnum("-bubble_continuity", "Smooth, back_sharp, or thick", NULL, BubbleContinuityTypes, + (PetscEnum)(bubble_continuity_type = BUBBLE_CONTINUITY_SMOOTH), (PetscEnum *)&bubble_continuity_type, NULL)); + PetscCall(PetscOptionsEnum("-stab", "Stabilization method", NULL, StabilizationTypes, (PetscEnum)(stab = STAB_NONE), (PetscEnum *)&stab, NULL)); + PetscCall(PetscOptionsBool("-implicit", "Use implicit (IFunction) formulation", NULL, implicit = PETSC_FALSE, &implicit, NULL)); // -- Units - ierr = PetscOptionsScalar("-units_meter", "1 meter in scaled length units", - NULL, meter, &meter, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsScalar("-units_meter", "1 meter in scaled length units", NULL, meter, &meter, NULL)); meter = fabs(meter); - ierr = PetscOptionsScalar("-units_kilogram","1 kilogram in scaled mass units", - NULL, kilogram, &kilogram, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsScalar("-units_kilogram", "1 kilogram in scaled mass units", 
NULL, kilogram, &kilogram, NULL)); kilogram = fabs(kilogram); - ierr = PetscOptionsScalar("-units_second","1 second in scaled time units", - NULL, second, &second, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsScalar("-units_second", "1 second in scaled time units", NULL, second, &second, NULL)); second = fabs(second); // -- Warnings if (wind_type == WIND_ROTATION && user_wind) { - ierr = PetscPrintf(comm, - "Warning! Use -wind_translation only with -wind_type translation\n"); - CHKERRQ(ierr); + PetscCall(PetscPrintf(comm, "Warning! Use -wind_translation only with -wind_type translation\n")); } - if (wind_type == WIND_TRANSLATION - && bubble_type == BUBBLE_CYLINDER && wind[2] != 0.) { + if (wind_type == WIND_TRANSLATION && bubble_type == BUBBLE_CYLINDER && wind[2] != 0.) { wind[2] = 0; - ierr = PetscPrintf(comm, - "Warning! Background wind in the z direction should be zero (-wind_translation x,x,0) with -bubble_type cylinder\n"); - CHKERRQ(ierr); + PetscCall(PetscPrintf(comm, "Warning! Background wind in the z direction should be zero (-wind_translation x,x,0) with -bubble_type cylinder\n")); } if (stab == STAB_NONE && CtauS != 0) { - ierr = PetscPrintf(comm, - "Warning! Use -CtauS only with -stab su or -stab supg\n"); - CHKERRQ(ierr); + PetscCall(PetscPrintf(comm, "Warning! Use -CtauS only with -stab su or -stab supg\n")); } if (stab == STAB_SUPG && !implicit) { - ierr = PetscPrintf(comm, - "Warning! Use -stab supg only with -implicit\n"); - CHKERRQ(ierr); + PetscCall(PetscPrintf(comm, "Warning! 
Use -stab supg only with -implicit\n")); } PetscOptionsEnd(); @@ -168,8 +139,8 @@ PetscErrorCode NS_ADVECTION(ProblemData *problem, DM dm, void *ctx) { // -- Scale variables to desired units E_wind *= Joule; rc = fabs(rc) * meter; - for (PetscInt i=0; i<3; i++) { - wind[i] *= (meter/second); + for (PetscInt i = 0; i < 3; i++) { + wind[i] *= (meter / second); domain_size[i] *= meter; } problem->dm_scale = meter; @@ -185,16 +156,16 @@ PetscErrorCode NS_ADVECTION(ProblemData *problem, DM dm, void *ctx) { setup_context->wind_type = wind_type; setup_context->bubble_type = bubble_type; setup_context->bubble_continuity_type = bubble_continuity_type; - setup_context->time = 0; + setup_context->time = 0; // -- QFunction Context - user->phys->stab = stab; - user->phys->wind_type = wind_type; - user->phys->bubble_type = bubble_type; - user->phys->bubble_continuity_type = bubble_continuity_type; + user->phys->stab = stab; + user->phys->wind_type = wind_type; + user->phys->bubble_type = bubble_type; + user->phys->bubble_continuity_type = bubble_continuity_type; // if passed correctly - user->phys->implicit = implicit; - user->phys->has_curr_time = has_curr_time; + user->phys->implicit = implicit; + user->phys->has_curr_time = has_curr_time; advection_ctx->CtauS = CtauS; advection_ctx->E_wind = E_wind; advection_ctx->implicit = implicit; @@ -202,58 +173,41 @@ PetscErrorCode NS_ADVECTION(ProblemData *problem, DM dm, void *ctx) { advection_ctx->stabilization = stab; CeedQFunctionContextCreate(user->ceed, &problem->ics.qfunction_context); - CeedQFunctionContextSetData(problem->ics.qfunction_context, CEED_MEM_HOST, - CEED_USE_POINTER, sizeof(*setup_context), setup_context); - CeedQFunctionContextSetDataDestroy(problem->ics.qfunction_context, - CEED_MEM_HOST, - FreeContextPetsc); + CeedQFunctionContextSetData(problem->ics.qfunction_context, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(*setup_context), setup_context); + CeedQFunctionContextSetDataDestroy(problem->ics.qfunction_context, 
CEED_MEM_HOST, FreeContextPetsc); CeedQFunctionContextCreate(user->ceed, &advection_context); - CeedQFunctionContextSetData(advection_context, CEED_MEM_HOST, - CEED_USE_POINTER, - sizeof(*advection_ctx), advection_ctx); - CeedQFunctionContextSetDataDestroy(advection_context, CEED_MEM_HOST, - FreeContextPetsc); + CeedQFunctionContextSetData(advection_context, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(*advection_ctx), advection_ctx); + CeedQFunctionContextSetDataDestroy(advection_context, CEED_MEM_HOST, FreeContextPetsc); problem->apply_vol_rhs.qfunction_context = advection_context; - CeedQFunctionContextReferenceCopy(advection_context, - &problem->apply_vol_ifunction.qfunction_context); - CeedQFunctionContextReferenceCopy(advection_context, - &problem->apply_inflow.qfunction_context); + CeedQFunctionContextReferenceCopy(advection_context, &problem->apply_vol_ifunction.qfunction_context); + CeedQFunctionContextReferenceCopy(advection_context, &problem->apply_inflow.qfunction_context); PetscFunctionReturn(0); } PetscErrorCode PRINT_ADVECTION(ProblemData *problem, AppCtx app_ctx) { - MPI_Comm comm = PETSC_COMM_WORLD; - PetscErrorCode ierr; - SetupContextAdv setup_ctx; + MPI_Comm comm = PETSC_COMM_WORLD; + SetupContextAdv setup_ctx; AdvectionContext advection_ctx; PetscFunctionBeginUser; - CeedQFunctionContextGetData(problem->ics.qfunction_context, - CEED_MEM_HOST, &setup_ctx); - CeedQFunctionContextGetData(problem->apply_vol_rhs.qfunction_context, - CEED_MEM_HOST, &advection_ctx); - ierr = PetscPrintf(comm, - " Problem:\n" - " Problem Name : %s\n" - " Stabilization : %s\n" - " Bubble Type : %s (%" CeedInt_FMT "D)\n" - " Bubble Continuity : %s\n" - " Wind Type : %s\n", - app_ctx->problem_name, StabilizationTypes[advection_ctx->stabilization], - BubbleTypes[setup_ctx->bubble_type], - setup_ctx->bubble_type == BUBBLE_SPHERE ? 
3 : 2, - BubbleContinuityTypes[setup_ctx->bubble_continuity_type], - WindTypes[setup_ctx->wind_type]); CHKERRQ(ierr); + CeedQFunctionContextGetData(problem->ics.qfunction_context, CEED_MEM_HOST, &setup_ctx); + CeedQFunctionContextGetData(problem->apply_vol_rhs.qfunction_context, CEED_MEM_HOST, &advection_ctx); + PetscCall(PetscPrintf(comm, + " Problem:\n" + " Problem Name : %s\n" + " Stabilization : %s\n" + " Bubble Type : %s (%" CeedInt_FMT "D)\n" + " Bubble Continuity : %s\n" + " Wind Type : %s\n", + app_ctx->problem_name, StabilizationTypes[advection_ctx->stabilization], BubbleTypes[setup_ctx->bubble_type], + setup_ctx->bubble_type == BUBBLE_SPHERE ? 3 : 2, BubbleContinuityTypes[setup_ctx->bubble_continuity_type], + WindTypes[setup_ctx->wind_type])); if (setup_ctx->wind_type == WIND_TRANSLATION) { - ierr = PetscPrintf(comm, - " Background Wind : %f,%f,%f\n", - setup_ctx->wind[0], setup_ctx->wind[1], setup_ctx->wind[2]); CHKERRQ(ierr); + PetscCall(PetscPrintf(comm, " Background Wind : %f,%f,%f\n", setup_ctx->wind[0], setup_ctx->wind[1], setup_ctx->wind[2])); } - CeedQFunctionContextRestoreData(problem->ics.qfunction_context, - &setup_ctx); - CeedQFunctionContextRestoreData(problem->apply_vol_rhs.qfunction_context, - &advection_ctx); + CeedQFunctionContextRestoreData(problem->ics.qfunction_context, &setup_ctx); + CeedQFunctionContextRestoreData(problem->apply_vol_rhs.qfunction_context, &advection_ctx); PetscFunctionReturn(0); } diff --git a/examples/fluids/problems/advection2d.c b/examples/fluids/problems/advection2d.c index febb9fbf4d..1f80ef0d4a 100644 --- a/examples/fluids/problems/advection2d.c +++ b/examples/fluids/problems/advection2d.c @@ -8,27 +8,25 @@ /// @file /// Utility functions for setting up ADVECTION2D +#include "../qfunctions/advection2d.h" + #include "../navierstokes.h" #include "../qfunctions/setupgeo2d.h" -#include "../qfunctions/advection2d.h" PetscErrorCode NS_ADVECTION2D(ProblemData *problem, DM dm, void *ctx) { - - WindType wind_type; - 
StabilizationType stab; - SetupContextAdv2D setup_context; - User user = *(User *)ctx; - MPI_Comm comm = PETSC_COMM_WORLD; - PetscBool implicit; - PetscBool has_curr_time = PETSC_FALSE; - PetscInt ierr; + WindType wind_type; + StabilizationType stab; + SetupContextAdv2D setup_context; + User user = *(User *)ctx; + MPI_Comm comm = PETSC_COMM_WORLD; + PetscBool implicit; + PetscBool has_curr_time = PETSC_FALSE; AdvectionContext advection_ctx; CeedQFunctionContext advection_context; - PetscFunctionBeginUser; - ierr = PetscCalloc1(1, &advection_ctx); CHKERRQ(ierr); - ierr = PetscCalloc1(1, &setup_context); CHKERRQ(ierr); + PetscCall(PetscCalloc1(1, &advection_ctx)); + PetscCall(PetscCalloc1(1, &setup_context)); // ------------------------------------------------------ // SET UP ADVECTION2D @@ -56,22 +54,21 @@ PetscErrorCode NS_ADVECTION2D(ProblemData *problem, DM dm, void *ctx) { // ------------------------------------------------------ // Create the libCEED context // ------------------------------------------------------ - CeedScalar rc = 1000.; // m (Radius of bubble) - CeedScalar CtauS = 0.; // dimensionless - CeedScalar strong_form = 0.; // [0,1] - CeedScalar E_wind = 1.e6; // J - PetscReal wind[2] = {1., 0.}; // m/s - PetscReal domain_min[2], domain_max[2], domain_size[2]; - ierr = DMGetBoundingBox(dm, domain_min, domain_max); CHKERRQ(ierr); - for (PetscInt i=0; i<2; i++) domain_size[i] = domain_max[i] - domain_min[i]; - + CeedScalar rc = 1000.; // m (Radius of bubble) + CeedScalar CtauS = 0.; // dimensionless + CeedScalar strong_form = 0.; // [0,1] + CeedScalar E_wind = 1.e6; // J + PetscReal wind[2] = {1., 0.}; // m/s + PetscReal domain_min[2], domain_max[2], domain_size[2]; + PetscCall(DMGetBoundingBox(dm, domain_min, domain_max)); + for (PetscInt i = 0; i < 2; i++) domain_size[i] = domain_max[i] - domain_min[i]; // ------------------------------------------------------ // Create the PETSc context // ------------------------------------------------------ - 
PetscScalar meter = 1e-2; // 1 meter in scaled length units - PetscScalar kilogram = 1e-6; // 1 kilogram in scaled mass units - PetscScalar second = 1e-2; // 1 second in scaled time units + PetscScalar meter = 1e-2; // 1 meter in scaled length units + PetscScalar kilogram = 1e-6; // 1 kilogram in scaled mass units + PetscScalar second = 1e-2; // 1 second in scaled time units PetscScalar Joule; // ------------------------------------------------------ @@ -79,58 +76,38 @@ PetscErrorCode NS_ADVECTION2D(ProblemData *problem, DM dm, void *ctx) { // ------------------------------------------------------ PetscOptionsBegin(comm, NULL, "Options for ADVECTION2D problem", NULL); // -- Physics - ierr = PetscOptionsScalar("-rc", "Characteristic radius of thermal bubble", - NULL, rc, &rc, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsScalar("-rc", "Characteristic radius of thermal bubble", NULL, rc, &rc, NULL)); PetscBool translation; - ierr = PetscOptionsEnum("-wind_type", "Wind type in Advection", - NULL, WindTypes, (PetscEnum)(wind_type = WIND_ROTATION), - (PetscEnum *)&wind_type, &translation); CHKERRQ(ierr); + PetscCall(PetscOptionsEnum("-wind_type", "Wind type in Advection", NULL, WindTypes, (PetscEnum)(wind_type = WIND_ROTATION), (PetscEnum *)&wind_type, + &translation)); if (translation) user->phys->has_neumann = PETSC_TRUE; - PetscInt n = problem->dim; + PetscInt n = problem->dim; PetscBool user_wind; - ierr = PetscOptionsRealArray("-wind_translation", "Constant wind vector", - NULL, wind, &n, &user_wind); CHKERRQ(ierr); - ierr = PetscOptionsScalar("-CtauS", - "Scale coefficient for tau (nondimensional)", - NULL, CtauS, &CtauS, NULL); CHKERRQ(ierr); - ierr = PetscOptionsScalar("-strong_form", - "Strong (1) or weak/integrated by parts (0) advection residual", - NULL, strong_form, &strong_form, NULL); CHKERRQ(ierr); - ierr = PetscOptionsScalar("-E_wind", "Total energy of inflow wind", - NULL, E_wind, &E_wind, NULL); CHKERRQ(ierr); - ierr = PetscOptionsEnum("-stab", 
"Stabilization method", NULL, - StabilizationTypes, (PetscEnum)(stab = STAB_NONE), - (PetscEnum *)&stab, NULL); CHKERRQ(ierr); - ierr = PetscOptionsBool("-implicit", "Use implicit (IFunction) formulation", - NULL, implicit=PETSC_FALSE, &implicit, NULL); - CHKERRQ(ierr); + PetscCall(PetscOptionsRealArray("-wind_translation", "Constant wind vector", NULL, wind, &n, &user_wind)); + PetscCall(PetscOptionsScalar("-CtauS", "Scale coefficient for tau (nondimensional)", NULL, CtauS, &CtauS, NULL)); + PetscCall( + PetscOptionsScalar("-strong_form", "Strong (1) or weak/integrated by parts (0) advection residual", NULL, strong_form, &strong_form, NULL)); + PetscCall(PetscOptionsScalar("-E_wind", "Total energy of inflow wind", NULL, E_wind, &E_wind, NULL)); + PetscCall(PetscOptionsEnum("-stab", "Stabilization method", NULL, StabilizationTypes, (PetscEnum)(stab = STAB_NONE), (PetscEnum *)&stab, NULL)); + PetscCall(PetscOptionsBool("-implicit", "Use implicit (IFunction) formulation", NULL, implicit = PETSC_FALSE, &implicit, NULL)); // -- Units - ierr = PetscOptionsScalar("-units_meter", "1 meter in scaled length units", - NULL, meter, &meter, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsScalar("-units_meter", "1 meter in scaled length units", NULL, meter, &meter, NULL)); meter = fabs(meter); - ierr = PetscOptionsScalar("-units_kilogram","1 kilogram in scaled mass units", - NULL, kilogram, &kilogram, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsScalar("-units_kilogram", "1 kilogram in scaled mass units", NULL, kilogram, &kilogram, NULL)); kilogram = fabs(kilogram); - ierr = PetscOptionsScalar("-units_second","1 second in scaled time units", - NULL, second, &second, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsScalar("-units_second", "1 second in scaled time units", NULL, second, &second, NULL)); second = fabs(second); // -- Warnings if (wind_type == WIND_ROTATION && user_wind) { - ierr = PetscPrintf(comm, - "Warning! 
Use -wind_translation only with -wind_type translation\n"); - CHKERRQ(ierr); + PetscCall(PetscPrintf(comm, "Warning! Use -wind_translation only with -wind_type translation\n")); } if (stab == STAB_NONE && CtauS != 0) { - ierr = PetscPrintf(comm, - "Warning! Use -CtauS only with -stab su or -stab supg\n"); - CHKERRQ(ierr); + PetscCall(PetscPrintf(comm, "Warning! Use -CtauS only with -stab su or -stab supg\n")); } if (stab == STAB_SUPG && !implicit) { - ierr = PetscPrintf(comm, - "Warning! Use -stab supg only with -implicit\n"); - CHKERRQ(ierr); + PetscCall(PetscPrintf(comm, "Warning! Use -stab supg only with -implicit\n")); } PetscOptionsEnd(); @@ -152,8 +129,8 @@ PetscErrorCode NS_ADVECTION2D(ProblemData *problem, DM dm, void *ctx) { // -- Scale variables to desired units E_wind *= Joule; rc = fabs(rc) * meter; - for (PetscInt i=0; i<2; i++) { - wind[i] *= (meter/second); + for (PetscInt i = 0; i < 2; i++) { + wind[i] *= (meter / second); domain_size[i] *= meter; } problem->dm_scale = meter; @@ -168,10 +145,10 @@ PetscErrorCode NS_ADVECTION2D(ProblemData *problem, DM dm, void *ctx) { setup_context->time = 0; // -- QFunction Context - user->phys->stab = stab; - user->phys->wind_type = wind_type; - user->phys->implicit = implicit; - user->phys->has_curr_time = has_curr_time; + user->phys->stab = stab; + user->phys->wind_type = wind_type; + user->phys->implicit = implicit; + user->phys->has_curr_time = has_curr_time; advection_ctx->CtauS = CtauS; advection_ctx->E_wind = E_wind; advection_ctx->implicit = implicit; @@ -179,54 +156,37 @@ PetscErrorCode NS_ADVECTION2D(ProblemData *problem, DM dm, void *ctx) { advection_ctx->stabilization = stab; CeedQFunctionContextCreate(user->ceed, &problem->ics.qfunction_context); - CeedQFunctionContextSetData(problem->ics.qfunction_context, CEED_MEM_HOST, - CEED_USE_POINTER, sizeof(*setup_context), setup_context); - CeedQFunctionContextSetDataDestroy(problem->ics.qfunction_context, - CEED_MEM_HOST, - FreeContextPetsc); + 
CeedQFunctionContextSetData(problem->ics.qfunction_context, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(*setup_context), setup_context); + CeedQFunctionContextSetDataDestroy(problem->ics.qfunction_context, CEED_MEM_HOST, FreeContextPetsc); CeedQFunctionContextCreate(user->ceed, &advection_context); - CeedQFunctionContextSetData(advection_context, CEED_MEM_HOST, - CEED_USE_POINTER, - sizeof(*advection_ctx), advection_ctx); - CeedQFunctionContextSetDataDestroy(advection_context, CEED_MEM_HOST, - FreeContextPetsc); + CeedQFunctionContextSetData(advection_context, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(*advection_ctx), advection_ctx); + CeedQFunctionContextSetDataDestroy(advection_context, CEED_MEM_HOST, FreeContextPetsc); problem->apply_vol_rhs.qfunction_context = advection_context; - CeedQFunctionContextReferenceCopy(advection_context, - &problem->apply_vol_ifunction.qfunction_context); - CeedQFunctionContextReferenceCopy(advection_context, - &problem->apply_inflow.qfunction_context); + CeedQFunctionContextReferenceCopy(advection_context, &problem->apply_vol_ifunction.qfunction_context); + CeedQFunctionContextReferenceCopy(advection_context, &problem->apply_inflow.qfunction_context); PetscFunctionReturn(0); } -PetscErrorCode PRINT_ADVECTION2D(ProblemData *problem, - AppCtx app_ctx) { +PetscErrorCode PRINT_ADVECTION2D(ProblemData *problem, AppCtx app_ctx) { MPI_Comm comm = PETSC_COMM_WORLD; - PetscErrorCode ierr; SetupContextAdv2D setup_ctx; AdvectionContext advection_ctx; PetscFunctionBeginUser; - CeedQFunctionContextGetData(problem->ics.qfunction_context, - CEED_MEM_HOST, &setup_ctx); - CeedQFunctionContextGetData(problem->apply_vol_rhs.qfunction_context, - CEED_MEM_HOST, &advection_ctx); - ierr = PetscPrintf(comm, - " Problem:\n" - " Problem Name : %s\n" - " Stabilization : %s\n" - " Wind Type : %s\n", - app_ctx->problem_name, StabilizationTypes[advection_ctx->stabilization], - WindTypes[setup_ctx->wind_type]); CHKERRQ(ierr); + 
CeedQFunctionContextGetData(problem->ics.qfunction_context, CEED_MEM_HOST, &setup_ctx); + CeedQFunctionContextGetData(problem->apply_vol_rhs.qfunction_context, CEED_MEM_HOST, &advection_ctx); + PetscCall(PetscPrintf(comm, + " Problem:\n" + " Problem Name : %s\n" + " Stabilization : %s\n" + " Wind Type : %s\n", + app_ctx->problem_name, StabilizationTypes[advection_ctx->stabilization], WindTypes[setup_ctx->wind_type])); if (setup_ctx->wind_type == WIND_TRANSLATION) { - ierr = PetscPrintf(comm, - " Background Wind : %f,%f\n", - setup_ctx->wind[0], setup_ctx->wind[1]); CHKERRQ(ierr); + PetscCall(PetscPrintf(comm, " Background Wind : %f,%f\n", setup_ctx->wind[0], setup_ctx->wind[1])); } - CeedQFunctionContextRestoreData(problem->ics.qfunction_context, - &setup_ctx); - CeedQFunctionContextRestoreData(problem->apply_vol_rhs.qfunction_context, - &advection_ctx); + CeedQFunctionContextRestoreData(problem->ics.qfunction_context, &setup_ctx); + CeedQFunctionContextRestoreData(problem->apply_vol_rhs.qfunction_context, &advection_ctx); PetscFunctionReturn(0); } diff --git a/examples/fluids/problems/blasius.c b/examples/fluids/problems/blasius.c index 077e55eb47..5ff349b15b 100644 --- a/examples/fluids/problems/blasius.c +++ b/examples/fluids/problems/blasius.c @@ -8,17 +8,17 @@ /// @file /// Utility functions for setting up Blasius Boundary Layer -#include "../navierstokes.h" #include "../qfunctions/blasius.h" + +#include "../navierstokes.h" #include "stg_shur14.h" PetscErrorCode CompressibleBlasiusResidual(SNES snes, Vec X, Vec R, void *ctx) { const BlasiusContext blasius = (BlasiusContext)ctx; - const PetscScalar *Tf, *Th; // Chebyshev coefficients - PetscScalar *r, f[4], h[4]; - PetscInt N = blasius->n_cheb; - PetscScalar Ma = Mach(&blasius->newtonian_ctx, blasius->T_inf, blasius->U_inf), - Pr = Prandtl(&blasius->newtonian_ctx), + const PetscScalar *Tf, *Th; // Chebyshev coefficients + PetscScalar *r, f[4], h[4]; + PetscInt N = blasius->n_cheb; + PetscScalar Ma = 
Mach(&blasius->newtonian_ctx, blasius->T_inf, blasius->U_inf), Pr = Prandtl(&blasius->newtonian_ctx), gamma = HeatCapacityRatio(&blasius->newtonian_ctx); PetscFunctionBegin; PetscCall(VecGetArrayRead(X, &Tf)); @@ -32,32 +32,31 @@ PetscErrorCode CompressibleBlasiusResidual(SNES snes, Vec X, Vec R, void *ctx) { // f - right end boundary condition ChebyshevEval(N, Tf, 1., blasius->eta_max, f); - r[2] = f[1] - 1.; + r[2] = f[1] - 1.; - for (int i=0; iX[i], blasius->eta_max, f); - ChebyshevEval(N-1, Th, blasius->X[i], blasius->eta_max, h); + ChebyshevEval(N - 1, Th, blasius->X[i], blasius->eta_max, h); // mu and rho generally depend on h. We naively assume constant mu. // For an ideal gas at constant pressure, density is inversely proportional to enthalpy. // The *_tilde values are *relative* to their freestream values, and we proved first derivatives here. - const PetscScalar mu_tilde[2] = {1, 0}; - const PetscScalar rho_tilde[2] = {1 / h[0], -h[1] / PetscSqr(h[0])}; + const PetscScalar mu_tilde[2] = {1, 0}; + const PetscScalar rho_tilde[2] = {1 / h[0], -h[1] / PetscSqr(h[0])}; const PetscScalar mu_rho_tilde[2] = { - mu_tilde[0] *rho_tilde[0], - mu_tilde[1] *rho_tilde[0] + mu_tilde[0] *rho_tilde[1], + mu_tilde[0] * rho_tilde[0], + mu_tilde[1] * rho_tilde[0] + mu_tilde[0] * rho_tilde[1], }; - r[3+i] = 2*(mu_rho_tilde[0] * f[3] + mu_rho_tilde[1] * f[2]) + f[2] * f[0]; - r[N+2+i] = (mu_rho_tilde[0] * h[2] + mu_rho_tilde[1] * h[1]) + Pr * f[0] * h[1] - + Pr * (gamma - 1) * mu_rho_tilde[0] * PetscSqr(Ma * f[2]); + r[3 + i] = 2 * (mu_rho_tilde[0] * f[3] + mu_rho_tilde[1] * f[2]) + f[2] * f[0]; + r[N + 2 + i] = (mu_rho_tilde[0] * h[2] + mu_rho_tilde[1] * h[1]) + Pr * f[0] * h[1] + Pr * (gamma - 1) * mu_rho_tilde[0] * PetscSqr(Ma * f[2]); } // h - left end boundary condition - ChebyshevEval(N-1, Th, -1., blasius->eta_max, h); + ChebyshevEval(N - 1, Th, -1., blasius->eta_max, h); r[N] = h[0] - blasius->T_wall / blasius->T_inf; // h - right end boundary condition - 
ChebyshevEval(N-1, Th, 1., blasius->eta_max, h); - r[N+1] = h[0] - 1.; + ChebyshevEval(N - 1, Th, 1., blasius->eta_max, h); + r[N + 1] = h[0] - 1.; // Restore vectors PetscCall(VecRestoreArrayRead(X, &Tf)); @@ -66,22 +65,22 @@ PetscErrorCode CompressibleBlasiusResidual(SNES snes, Vec X, Vec R, void *ctx) { } PetscErrorCode ComputeChebyshevCoefficients(BlasiusContext blasius) { - SNES snes; - Vec sol, res; - PetscReal *w; - PetscInt N = blasius->n_cheb; + SNES snes; + Vec sol, res; + PetscReal *w; + PetscInt N = blasius->n_cheb; SNESConvergedReason reason; - const PetscScalar *cheb_coefs; + const PetscScalar *cheb_coefs; PetscFunctionBegin; // Allocate memory - PetscCall(PetscMalloc2(N-3, &blasius->X, N-3, &w)); - PetscCall(PetscDTGaussQuadrature(N-3, -1., 1., blasius->X, w)); + PetscCall(PetscMalloc2(N - 3, &blasius->X, N - 3, &w)); + PetscCall(PetscDTGaussQuadrature(N - 3, -1., 1., blasius->X, w)); // Snes solve PetscCall(SNESCreate(PETSC_COMM_SELF, &snes)); PetscCall(VecCreate(PETSC_COMM_SELF, &sol)); - PetscCall(VecSetSizes(sol, PETSC_DECIDE, 2*N-1)); + PetscCall(VecSetSizes(sol, PETSC_DECIDE, 2 * N - 1)); PetscCall(VecSetFromOptions(sol)); // Constant relative enthalpy 1 as initial guess PetscCall(VecSetValue(sol, N, 1., INSERT_VALUES)); @@ -91,14 +90,12 @@ PetscErrorCode ComputeChebyshevCoefficients(BlasiusContext blasius) { PetscCall(SNESSetFromOptions(snes)); PetscCall(SNESSolve(snes, NULL, sol)); PetscCall(SNESGetConvergedReason(snes, &reason)); - if (reason < 0) - SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_CONV_FAILED, - "The Chebyshev solve failed.\n"); + if (reason < 0) SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_CONV_FAILED, "The Chebyshev solve failed.\n"); // Assign Chebyshev coefficients PetscCall(VecGetArrayRead(sol, &cheb_coefs)); - for (int i=0; iTf_cheb[i] = cheb_coefs[i]; - for (int i=0; iTh_cheb[i] = cheb_coefs[i+N]; + for (int i = 0; i < N; i++) blasius->Tf_cheb[i] = cheb_coefs[i]; + for (int i = 0; i < N - 1; i++) blasius->Th_cheb[i] = cheb_coefs[i + N]; // 
Destroy objects PetscCall(PetscFree2(blasius->X, w)); @@ -108,38 +105,34 @@ PetscErrorCode ComputeChebyshevCoefficients(BlasiusContext blasius) { PetscFunctionReturn(0); } -static PetscErrorCode GetYNodeLocs(const MPI_Comm comm, - const char path[PETSC_MAX_PATH_LEN], PetscReal **pynodes, - PetscInt *nynodes) { - PetscErrorCode ierr; - PetscInt ndims, dims[2]; - FILE *fp; +static PetscErrorCode GetYNodeLocs(const MPI_Comm comm, const char path[PETSC_MAX_PATH_LEN], PetscReal **pynodes, PetscInt *nynodes) { + PetscInt ndims, dims[2]; + FILE *fp; const PetscInt char_array_len = 512; - char line[char_array_len]; - char **array; - PetscReal *node_locs; + char line[char_array_len]; + char **array; + PetscReal *node_locs; PetscFunctionBeginUser; - ierr = PetscFOpen(comm, path, "r", &fp); CHKERRQ(ierr); - ierr = PetscSynchronizedFGets(comm, fp, char_array_len, line); CHKERRQ(ierr); - ierr = PetscStrToArray(line, ' ', &ndims, &array); CHKERRQ(ierr); + PetscCall(PetscFOpen(comm, path, "r", &fp)); + PetscCall(PetscSynchronizedFGets(comm, fp, char_array_len, line)); + PetscCall(PetscStrToArray(line, ' ', &ndims, &array)); - for (PetscInt i=0; i faces[1] +1) { - ierr = PetscPrintf(comm, "WARNING: y_node_locs_path has more locations (%d) " - "than the mesh has nodes (%d). This maybe unintended.\n", - *num_node_locs, faces[1]+1); CHKERRQ(ierr); + faces[1] + 1, *num_node_locs); + } + if (*num_node_locs > faces[1] + 1) { + PetscCall(PetscPrintf(comm, + "WARNING: y_node_locs_path has more locations (%d) " + "than the mesh has nodes (%d). 
This maybe unintended.\n", + *num_node_locs, faces[1] + 1)); } PetscScalar max_y = (*node_locs)[faces[1]]; - for (PetscInt i=0; iics.qfunction = ICsBlasius; - problem->ics.qfunction_loc = ICsBlasius_loc; - - CeedScalar U_inf = 40; // m/s - CeedScalar T_inf = 288.; // K - CeedScalar T_wall = 288.; // K - CeedScalar delta0 = 4.2e-3; // m - CeedScalar P0 = 1.01e5; // Pa - CeedInt N = 20; // Number of Chebyshev terms - PetscBool weakT = PETSC_FALSE; // weak density or temperature - PetscReal mesh_refine_height = 5.9e-4; // m - PetscReal mesh_growth = 1.08; // [-] - PetscInt mesh_Ndelta = 45; // [-] - PetscReal mesh_top_angle = 5; // degrees - char mesh_ynodes_path[PETSC_MAX_PATH_LEN] = ""; + problem->ics.qfunction = ICsBlasius; + problem->ics.qfunction_loc = ICsBlasius_loc; + + CeedScalar U_inf = 40; // m/s + CeedScalar T_inf = 288.; // K + CeedScalar T_wall = 288.; // K + CeedScalar delta0 = 4.2e-3; // m + CeedScalar P0 = 1.01e5; // Pa + CeedInt N = 20; // Number of Chebyshev terms + PetscBool weakT = PETSC_FALSE; // weak density or temperature + PetscReal mesh_refine_height = 5.9e-4; // m + PetscReal mesh_growth = 1.08; // [-] + PetscInt mesh_Ndelta = 45; // [-] + PetscReal mesh_top_angle = 5; // degrees + char mesh_ynodes_path[PETSC_MAX_PATH_LEN] = ""; PetscOptionsBegin(comm, NULL, "Options for BLASIUS problem", NULL); - ierr = PetscOptionsBool("-weakT", "Change from rho weak to T weak at inflow", - NULL, weakT, &weakT, NULL); CHKERRQ(ierr); - ierr = PetscOptionsScalar("-velocity_infinity", - "Velocity at boundary layer edge", - NULL, U_inf, &U_inf, NULL); CHKERRQ(ierr); - ierr = PetscOptionsScalar("-temperature_infinity", - "Temperature at boundary layer edge", - NULL, T_inf, &T_inf, NULL); CHKERRQ(ierr); - ierr = PetscOptionsScalar("-temperature_wall", "Temperature at wall", - NULL, T_wall, &T_wall, NULL); CHKERRQ(ierr); - ierr = PetscOptionsScalar("-delta0", "Boundary layer height at inflow", - NULL, delta0, &delta0, NULL); CHKERRQ(ierr); - ierr = 
PetscOptionsScalar("-P0", "Pressure at outflow", - NULL, P0, &P0, NULL); CHKERRQ(ierr); - ierr = PetscOptionsInt("-n_chebyshev", "Number of Chebyshev terms", - NULL, N, &N, NULL); CHKERRQ(ierr); - PetscCheck(3 <= N && N <= BLASIUS_MAX_N_CHEBYSHEV, - comm, PETSC_ERR_ARG_OUTOFRANGE, - "-n_chebyshev %" PetscInt_FMT " must be in range [3, %d]", N, + PetscCall(PetscOptionsBool("-weakT", "Change from rho weak to T weak at inflow", NULL, weakT, &weakT, NULL)); + PetscCall(PetscOptionsScalar("-velocity_infinity", "Velocity at boundary layer edge", NULL, U_inf, &U_inf, NULL)); + PetscCall(PetscOptionsScalar("-temperature_infinity", "Temperature at boundary layer edge", NULL, T_inf, &T_inf, NULL)); + PetscCall(PetscOptionsScalar("-temperature_wall", "Temperature at wall", NULL, T_wall, &T_wall, NULL)); + PetscCall(PetscOptionsScalar("-delta0", "Boundary layer height at inflow", NULL, delta0, &delta0, NULL)); + PetscCall(PetscOptionsScalar("-P0", "Pressure at outflow", NULL, P0, &P0, NULL)); + PetscCall(PetscOptionsInt("-n_chebyshev", "Number of Chebyshev terms", NULL, N, &N, NULL)); + PetscCheck(3 <= N && N <= BLASIUS_MAX_N_CHEBYSHEV, comm, PETSC_ERR_ARG_OUTOFRANGE, "-n_chebyshev %" PetscInt_FMT " must be in range [3, %d]", N, BLASIUS_MAX_N_CHEBYSHEV); - ierr = PetscOptionsBoundedInt("-platemesh_Ndelta", - "Velocity at boundary layer edge", - NULL, mesh_Ndelta, &mesh_Ndelta, NULL, 1); CHKERRQ(ierr); - ierr = PetscOptionsScalar("-platemesh_refine_height", - "Height of boundary layer mesh refinement", - NULL, mesh_refine_height, &mesh_refine_height, NULL); CHKERRQ(ierr); - ierr = PetscOptionsScalar("-platemesh_growth", - "Geometric growth rate of boundary layer mesh", - NULL, mesh_growth, &mesh_growth, NULL); CHKERRQ(ierr); - ierr = PetscOptionsScalar("-platemesh_top_angle", - "Geometric top_angle rate of boundary layer mesh", - NULL, mesh_top_angle, &mesh_top_angle, NULL); CHKERRQ(ierr); - ierr = PetscOptionsString("-platemesh_y_node_locs_path", - "Path to file with y node 
locations. " - "If empty, will use the algorithmic mesh warping.", NULL, - mesh_ynodes_path, mesh_ynodes_path, - sizeof(mesh_ynodes_path), NULL); CHKERRQ(ierr); - ierr = PetscOptionsBool("-stg_use", "Use STG inflow boundary condition", - NULL, use_stg, &use_stg, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsBoundedInt("-platemesh_Ndelta", "Velocity at boundary layer edge", NULL, mesh_Ndelta, &mesh_Ndelta, NULL, 1)); + PetscCall(PetscOptionsScalar("-platemesh_refine_height", "Height of boundary layer mesh refinement", NULL, mesh_refine_height, &mesh_refine_height, + NULL)); + PetscCall(PetscOptionsScalar("-platemesh_growth", "Geometric growth rate of boundary layer mesh", NULL, mesh_growth, &mesh_growth, NULL)); + PetscCall( + PetscOptionsScalar("-platemesh_top_angle", "Geometric top_angle rate of boundary layer mesh", NULL, mesh_top_angle, &mesh_top_angle, NULL)); + PetscCall(PetscOptionsString("-platemesh_y_node_locs_path", + "Path to file with y node locations. " + "If empty, will use the algorithmic mesh warping.", + NULL, mesh_ynodes_path, mesh_ynodes_path, sizeof(mesh_ynodes_path), NULL)); + PetscCall(PetscOptionsBool("-stg_use", "Use STG inflow boundary condition", NULL, use_stg, &use_stg, NULL)); PetscOptionsEnd(); PetscScalar meter = user->units->meter; @@ -321,25 +290,21 @@ PetscErrorCode NS_BLASIUS(ProblemData *problem, DM dm, void *ctx) { PetscScalar Kelvin = user->units->Kelvin; PetscScalar Pascal = user->units->Pascal; - T_inf *= Kelvin; + T_inf *= Kelvin; T_wall *= Kelvin; - P0 *= Pascal; - U_inf *= meter / second; + P0 *= Pascal; + U_inf *= meter / second; delta0 *= meter; - PetscReal *mesh_ynodes = NULL; - PetscInt mesh_nynodes = 0; + PetscReal *mesh_ynodes = NULL; + PetscInt mesh_nynodes = 0; if (strcmp(mesh_ynodes_path, "")) { - ierr = GetYNodeLocs(comm, mesh_ynodes_path, &mesh_ynodes, &mesh_nynodes); - CHKERRQ(ierr); + PetscCall(GetYNodeLocs(comm, mesh_ynodes_path, &mesh_ynodes, &mesh_nynodes)); } - ierr = ModifyMesh(comm, dm, problem->dim, 
mesh_growth, mesh_Ndelta, - mesh_refine_height, mesh_top_angle, &mesh_ynodes, - &mesh_nynodes); CHKERRQ(ierr); + PetscCall(ModifyMesh(comm, dm, problem->dim, mesh_growth, mesh_Ndelta, mesh_refine_height, mesh_top_angle, &mesh_ynodes, &mesh_nynodes)); // Some properties depend on parameters from NewtonianIdealGas - CeedQFunctionContextGetData(problem->apply_vol_rhs.qfunction_context, - CEED_MEM_HOST, &newtonian_ig_ctx); + CeedQFunctionContextGetData(problem->apply_vol_rhs.qfunction_context, CEED_MEM_HOST, &newtonian_ig_ctx); blasius_ctx->weakT = weakT; blasius_ctx->U_inf = U_inf; @@ -354,36 +319,30 @@ PetscErrorCode NS_BLASIUS(ProblemData *problem, DM dm, void *ctx) { { PetscReal domain_min[3], domain_max[3]; - ierr = DMGetBoundingBox(dm, domain_min, domain_max); CHKERRQ(ierr); + PetscCall(DMGetBoundingBox(dm, domain_min, domain_max)); blasius_ctx->x_inflow = domain_min[0]; blasius_ctx->eta_max = 5 * domain_max[1] / blasius_ctx->delta0; } - if(!use_stg) PetscCall(ComputeChebyshevCoefficients(blasius_ctx)); + if (!use_stg) PetscCall(ComputeChebyshevCoefficients(blasius_ctx)); - CeedQFunctionContextRestoreData(problem->apply_vol_rhs.qfunction_context, - &newtonian_ig_ctx); + CeedQFunctionContextRestoreData(problem->apply_vol_rhs.qfunction_context, &newtonian_ig_ctx); CeedQFunctionContextCreate(user->ceed, &blasius_context); - CeedQFunctionContextSetData(blasius_context, CEED_MEM_HOST, CEED_USE_POINTER, - sizeof(*blasius_ctx), blasius_ctx); - CeedQFunctionContextSetDataDestroy(blasius_context, CEED_MEM_HOST, - FreeContextPetsc); + CeedQFunctionContextSetData(blasius_context, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(*blasius_ctx), blasius_ctx); + CeedQFunctionContextSetDataDestroy(blasius_context, CEED_MEM_HOST, FreeContextPetsc); CeedQFunctionContextDestroy(&problem->ics.qfunction_context); problem->ics.qfunction_context = blasius_context; if (use_stg) { - ierr = SetupSTG(comm, dm, problem, user, weakT, T_inf, P0, mesh_ynodes, - mesh_nynodes); CHKERRQ(ierr); + 
PetscCall(SetupSTG(comm, dm, problem, user, weakT, T_inf, P0, mesh_ynodes, mesh_nynodes)); } else { problem->apply_inflow.qfunction = Blasius_Inflow; problem->apply_inflow.qfunction_loc = Blasius_Inflow_loc; problem->apply_inflow_jacobian.qfunction = Blasius_Inflow_Jacobian; problem->apply_inflow_jacobian.qfunction_loc = Blasius_Inflow_Jacobian_loc; - CeedQFunctionContextReferenceCopy(blasius_context, - &problem->apply_inflow.qfunction_context); - CeedQFunctionContextReferenceCopy(blasius_context, - &problem->apply_inflow_jacobian.qfunction_context); + CeedQFunctionContextReferenceCopy(blasius_context, &problem->apply_inflow.qfunction_context); + CeedQFunctionContextReferenceCopy(blasius_context, &problem->apply_inflow_jacobian.qfunction_context); } - ierr = PetscFree(mesh_ynodes); CHKERRQ(ierr); + PetscCall(PetscFree(mesh_ynodes)); PetscFunctionReturn(0); } diff --git a/examples/fluids/problems/channel.c b/examples/fluids/problems/channel.c index 287d39caef..a7dd37dfe3 100644 --- a/examples/fluids/problems/channel.c +++ b/examples/fluids/problems/channel.c @@ -8,21 +8,20 @@ /// @file /// Utility functions for setting up Channel flow -#include "../navierstokes.h" #include "../qfunctions/channel.h" -PetscErrorCode NS_CHANNEL(ProblemData *problem, DM dm, void *ctx) { +#include "../navierstokes.h" - PetscInt ierr; - User user = *(User *)ctx; - MPI_Comm comm = PETSC_COMM_WORLD; - ChannelContext channel_ctx; +PetscErrorCode NS_CHANNEL(ProblemData *problem, DM dm, void *ctx) { + User user = *(User *)ctx; + MPI_Comm comm = PETSC_COMM_WORLD; + ChannelContext channel_ctx; NewtonianIdealGasContext newtonian_ig_ctx; - CeedQFunctionContext channel_context; + CeedQFunctionContext channel_context; PetscFunctionBeginUser; - ierr = NS_NEWTONIAN_IG(problem, dm, ctx); CHKERRQ(ierr); - ierr = PetscCalloc1(1, &channel_ctx); CHKERRQ(ierr); + PetscCall(NS_NEWTONIAN_IG(problem, dm, ctx)); + PetscCall(PetscCalloc1(1, &channel_ctx)); // 
------------------------------------------------------ // SET UP Channel @@ -38,19 +37,15 @@ PetscErrorCode NS_CHANNEL(ProblemData *problem, DM dm, void *ctx) { } // -- Command Line Options - CeedScalar umax = 10.; // m/s - CeedScalar theta0 = 300.; // K - CeedScalar P0 = 1.e5; // Pa - PetscReal body_force_scale = 1.; + CeedScalar umax = 10.; // m/s + CeedScalar theta0 = 300.; // K + CeedScalar P0 = 1.e5; // Pa + PetscReal body_force_scale = 1.; PetscOptionsBegin(comm, NULL, "Options for CHANNEL problem", NULL); - ierr = PetscOptionsScalar("-umax", "Centerline velocity of the Channel", - NULL, umax, &umax, NULL); CHKERRQ(ierr); - ierr = PetscOptionsScalar("-theta0", "Wall temperature", - NULL, theta0, &theta0, NULL); CHKERRQ(ierr); - ierr = PetscOptionsScalar("-P0", "Pressure at outflow", - NULL, P0, &P0, NULL); CHKERRQ(ierr); - ierr = PetscOptionsReal("-body_force_scale", "Multiplier for body force", - NULL, body_force_scale=1, &body_force_scale, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsScalar("-umax", "Centerline velocity of the Channel", NULL, umax, &umax, NULL)); + PetscCall(PetscOptionsScalar("-theta0", "Wall temperature", NULL, theta0, &theta0, NULL)); + PetscCall(PetscOptionsScalar("-P0", "Pressure at outflow", NULL, P0, &P0, NULL)); + PetscCall(PetscOptionsReal("-body_force_scale", "Multiplier for body force", NULL, body_force_scale = 1, &body_force_scale, NULL)); PetscOptionsEnd(); PetscScalar meter = user->units->meter; @@ -59,23 +54,22 @@ PetscErrorCode NS_CHANNEL(ProblemData *problem, DM dm, void *ctx) { PetscScalar Pascal = user->units->Pascal; theta0 *= Kelvin; - P0 *= Pascal; - umax *= meter / second; + P0 *= Pascal; + umax *= meter / second; //-- Setup Problem information CeedScalar H, center; { PetscReal domain_min[3], domain_max[3], domain_size[3]; - ierr = DMGetBoundingBox(dm, domain_min, domain_max); CHKERRQ(ierr); - for (PetscInt i=0; i<3; i++) domain_size[i] = domain_max[i] - domain_min[i]; + PetscCall(DMGetBoundingBox(dm, domain_min, 
domain_max)); + for (PetscInt i = 0; i < 3; i++) domain_size[i] = domain_max[i] - domain_min[i]; - H = 0.5*domain_size[1]*meter; - center = H + domain_min[1]*meter; + H = 0.5 * domain_size[1] * meter; + center = H + domain_min[1] * meter; } // Some properties depend on parameters from NewtonianIdealGas - CeedQFunctionContextGetData(problem->apply_vol_rhs.qfunction_context, - CEED_MEM_HOST, &newtonian_ig_ctx); + CeedQFunctionContextGetData(problem->apply_vol_rhs.qfunction_context, CEED_MEM_HOST, &newtonian_ig_ctx); channel_ctx->center = center; channel_ctx->H = H; @@ -83,32 +77,25 @@ PetscErrorCode NS_CHANNEL(ProblemData *problem, DM dm, void *ctx) { channel_ctx->P0 = P0; channel_ctx->umax = umax; channel_ctx->implicit = user->phys->implicit; - channel_ctx->B = body_force_scale * 2 * umax*newtonian_ig_ctx->mu / (H*H); + channel_ctx->B = body_force_scale * 2 * umax * newtonian_ig_ctx->mu / (H * H); { // Calculate Body force - CeedScalar cv = newtonian_ig_ctx->cv, - cp = newtonian_ig_ctx->cp; + CeedScalar cv = newtonian_ig_ctx->cv, cp = newtonian_ig_ctx->cp; CeedScalar Rd = cp - cv; - CeedScalar rho = P0 / (Rd*theta0); + CeedScalar rho = P0 / (Rd * theta0); CeedScalar g[] = {channel_ctx->B / rho, 0., 0.}; - ierr = PetscArraycpy(newtonian_ig_ctx->g, g, 3); CHKERRQ(ierr); + PetscCall(PetscArraycpy(newtonian_ig_ctx->g, g, 3)); } channel_ctx->newtonian_ctx = *newtonian_ig_ctx; - CeedQFunctionContextRestoreData(problem->apply_vol_rhs.qfunction_context, - &newtonian_ig_ctx); + CeedQFunctionContextRestoreData(problem->apply_vol_rhs.qfunction_context, &newtonian_ig_ctx); CeedQFunctionContextCreate(user->ceed, &channel_context); - CeedQFunctionContextSetData(channel_context, CEED_MEM_HOST, - CEED_USE_POINTER, - sizeof(*channel_ctx), channel_ctx); - CeedQFunctionContextSetDataDestroy(channel_context, CEED_MEM_HOST, - FreeContextPetsc); + CeedQFunctionContextSetData(channel_context, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(*channel_ctx), channel_ctx); + 
CeedQFunctionContextSetDataDestroy(channel_context, CEED_MEM_HOST, FreeContextPetsc); problem->ics.qfunction_context = channel_context; - CeedQFunctionContextReferenceCopy(channel_context, - &problem->apply_inflow.qfunction_context); - CeedQFunctionContextReferenceCopy(channel_context, - &problem->apply_outflow.qfunction_context); + CeedQFunctionContextReferenceCopy(channel_context, &problem->apply_inflow.qfunction_context); + CeedQFunctionContextReferenceCopy(channel_context, &problem->apply_outflow.qfunction_context); PetscFunctionReturn(0); } diff --git a/examples/fluids/problems/densitycurrent.c b/examples/fluids/problems/densitycurrent.c index 869d1ee213..cc811582fd 100644 --- a/examples/fluids/problems/densitycurrent.c +++ b/examples/fluids/problems/densitycurrent.c @@ -10,11 +10,10 @@ /// Utility functions for setting up DENSITY_CURRENT #include "../qfunctions/densitycurrent.h" + #include "../navierstokes.h" PetscErrorCode NS_DENSITY_CURRENT(ProblemData *problem, DM dm, void *ctx) { - - PetscInt ierr; MPI_Comm comm = PETSC_COMM_WORLD; User user = *(User *)ctx; DensityCurrentContext dc_ctx; @@ -22,8 +21,8 @@ PetscErrorCode NS_DENSITY_CURRENT(ProblemData *problem, DM dm, void *ctx) { NewtonianIdealGasContext newtonian_ig_ctx; PetscFunctionBeginUser; - ierr = NS_NEWTONIAN_IG(problem, dm, ctx); CHKERRQ(ierr); - ierr = PetscCalloc1(1, &dc_ctx); CHKERRQ(ierr); + PetscCall(NS_NEWTONIAN_IG(problem, dm, ctx)); + PetscCall(PetscCalloc1(1, &dc_ctx)); // ------------------------------------------------------ // SET UP DENSITY_CURRENT // ------------------------------------------------------ @@ -34,53 +33,37 @@ PetscErrorCode NS_DENSITY_CURRENT(ProblemData *problem, DM dm, void *ctx) { // ------------------------------------------------------ // Create the libCEED context // ------------------------------------------------------ - CeedScalar theta0 = 300.; // K - CeedScalar thetaC = -15.; // K - CeedScalar P0 = 1.e5; // Pa - CeedScalar N = 0.01; // 1/s - CeedScalar rc = 
1000.; // m (Radius of bubble) - PetscReal center[3], dc_axis[3] = {0, 0, 0}; - PetscReal domain_min[3], domain_max[3], domain_size[3]; - ierr = DMGetBoundingBox(dm, domain_min, domain_max); - CHKERRQ(ierr); - for (PetscInt i = 0; i < 3; i++) - domain_size[i] = domain_max[i] - domain_min[i]; + CeedScalar theta0 = 300.; // K + CeedScalar thetaC = -15.; // K + CeedScalar P0 = 1.e5; // Pa + CeedScalar N = 0.01; // 1/s + CeedScalar rc = 1000.; // m (Radius of bubble) + PetscReal center[3], dc_axis[3] = {0, 0, 0}; + PetscReal domain_min[3], domain_max[3], domain_size[3]; + PetscCall(DMGetBoundingBox(dm, domain_min, domain_max)); + for (PetscInt i = 0; i < 3; i++) domain_size[i] = domain_max[i] - domain_min[i]; // ------------------------------------------------------ // Command line Options // ------------------------------------------------------ PetscOptionsBegin(comm, NULL, "Options for DENSITY_CURRENT problem", NULL); - ierr = PetscOptionsScalar("-theta0", "Reference potential temperature", NULL, - theta0, &theta0, NULL); - CHKERRQ(ierr); - ierr = PetscOptionsScalar("-thetaC", "Perturbation of potential temperature", - NULL, thetaC, &thetaC, NULL); - CHKERRQ(ierr); - ierr = PetscOptionsScalar("-P0", "Atmospheric pressure", NULL, P0, &P0, NULL); - CHKERRQ(ierr); - ierr = PetscOptionsScalar("-N", "Brunt-Vaisala frequency", NULL, N, &N, NULL); - CHKERRQ(ierr); - ierr = PetscOptionsScalar("-rc", "Characteristic radius of thermal bubble", - NULL, rc, &rc, NULL); - CHKERRQ(ierr); - for (PetscInt i = 0; i < 3; i++) - center[i] = .5 * domain_size[i]; + PetscCall(PetscOptionsScalar("-theta0", "Reference potential temperature", NULL, theta0, &theta0, NULL)); + PetscCall(PetscOptionsScalar("-thetaC", "Perturbation of potential temperature", NULL, thetaC, &thetaC, NULL)); + PetscCall(PetscOptionsScalar("-P0", "Atmospheric pressure", NULL, P0, &P0, NULL)); + PetscCall(PetscOptionsScalar("-N", "Brunt-Vaisala frequency", NULL, N, &N, NULL)); + PetscCall(PetscOptionsScalar("-rc", 
"Characteristic radius of thermal bubble", NULL, rc, &rc, NULL)); + for (PetscInt i = 0; i < 3; i++) center[i] = .5 * domain_size[i]; PetscInt n = problem->dim; - ierr = PetscOptionsRealArray("-center", "Location of bubble center", NULL, - center, &n, NULL); - CHKERRQ(ierr); + PetscCall(PetscOptionsRealArray("-center", "Location of bubble center", NULL, center, &n, NULL)); n = problem->dim; - ierr = PetscOptionsRealArray("-dc_axis", - "Axis of density current cylindrical anomaly, " - "or {0,0,0} for spherically symmetric", - NULL, dc_axis, &n, NULL); - CHKERRQ(ierr); + PetscCall(PetscOptionsRealArray("-dc_axis", + "Axis of density current cylindrical anomaly, " + "or {0,0,0} for spherically symmetric", + NULL, dc_axis, &n, NULL)); { - PetscReal norm = PetscSqrtReal(PetscSqr(dc_axis[0]) + PetscSqr(dc_axis[1]) + - PetscSqr(dc_axis[2])); + PetscReal norm = PetscSqrtReal(PetscSqr(dc_axis[0]) + PetscSqr(dc_axis[1]) + PetscSqr(dc_axis[2])); if (norm > 0) { - for (PetscInt i = 0; i < 3; i++) - dc_axis[i] /= norm; + for (PetscInt i = 0; i < 3; i++) dc_axis[i] /= norm; } } @@ -90,36 +73,31 @@ PetscErrorCode NS_DENSITY_CURRENT(ProblemData *problem, DM dm, void *ctx) { PetscScalar second = user->units->second; PetscScalar Kelvin = user->units->Kelvin; PetscScalar Pascal = user->units->Pascal; - rc = fabs(rc) * meter; + rc = fabs(rc) * meter; theta0 *= Kelvin; thetaC *= Kelvin; P0 *= Pascal; N *= (1. 
/ second); - for (PetscInt i = 0; i < 3; i++) - center[i] *= meter; + for (PetscInt i = 0; i < 3; i++) center[i] *= meter; - dc_ctx->theta0 = theta0; - dc_ctx->thetaC = thetaC; - dc_ctx->P0 = P0; - dc_ctx->N = N; - dc_ctx->rc = rc; - dc_ctx->center[0] = center[0]; - dc_ctx->center[1] = center[1]; - dc_ctx->center[2] = center[2]; + dc_ctx->theta0 = theta0; + dc_ctx->thetaC = thetaC; + dc_ctx->P0 = P0; + dc_ctx->N = N; + dc_ctx->rc = rc; + dc_ctx->center[0] = center[0]; + dc_ctx->center[1] = center[1]; + dc_ctx->center[2] = center[2]; dc_ctx->dc_axis[0] = dc_axis[0]; dc_ctx->dc_axis[1] = dc_axis[1]; dc_ctx->dc_axis[2] = dc_axis[2]; - CeedQFunctionContextGetData(problem->apply_vol_rhs.qfunction_context, - CEED_MEM_HOST, &newtonian_ig_ctx); + CeedQFunctionContextGetData(problem->apply_vol_rhs.qfunction_context, CEED_MEM_HOST, &newtonian_ig_ctx); dc_ctx->newtonian_ctx = *newtonian_ig_ctx; - CeedQFunctionContextRestoreData(problem->apply_vol_rhs.qfunction_context, - &newtonian_ig_ctx); + CeedQFunctionContextRestoreData(problem->apply_vol_rhs.qfunction_context, &newtonian_ig_ctx); CeedQFunctionContextCreate(user->ceed, &density_current_context); - CeedQFunctionContextSetData(density_current_context, CEED_MEM_HOST, - CEED_USE_POINTER, sizeof(*dc_ctx), dc_ctx); - CeedQFunctionContextSetDataDestroy(density_current_context, CEED_MEM_HOST, - FreeContextPetsc); + CeedQFunctionContextSetData(density_current_context, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(*dc_ctx), dc_ctx); + CeedQFunctionContextSetDataDestroy(density_current_context, CEED_MEM_HOST, FreeContextPetsc); problem->ics.qfunction_context = density_current_context; PetscFunctionReturn(0); diff --git a/examples/fluids/problems/eulervortex.c b/examples/fluids/problems/eulervortex.c index a1fc15436e..a0a02b6f9c 100644 --- a/examples/fluids/problems/eulervortex.c +++ b/examples/fluids/problems/eulervortex.c @@ -8,25 +8,24 @@ /// @file /// Utility functions for setting up EULER_VORTEX +#include "../qfunctions/eulervortex.h" 
+ #include "../navierstokes.h" #include "../qfunctions/setupgeo.h" -#include "../qfunctions/eulervortex.h" PetscErrorCode NS_EULER_VORTEX(ProblemData *problem, DM dm, void *ctx) { - - EulerTestType euler_test; - User user = *(User *)ctx; - StabilizationType stab; - MPI_Comm comm = PETSC_COMM_WORLD; - PetscBool implicit; - PetscBool has_curr_time = PETSC_TRUE; - PetscBool has_neumann = PETSC_TRUE; - PetscInt ierr; - EulerContext euler_ctx; + EulerTestType euler_test; + User user = *(User *)ctx; + StabilizationType stab; + MPI_Comm comm = PETSC_COMM_WORLD; + PetscBool implicit; + PetscBool has_curr_time = PETSC_TRUE; + PetscBool has_neumann = PETSC_TRUE; + EulerContext euler_ctx; CeedQFunctionContext euler_context; PetscFunctionBeginUser; - ierr = PetscCalloc1(1, &euler_ctx); CHKERRQ(ierr); + PetscCall(PetscCalloc1(1, &euler_ctx)); // ------------------------------------------------------ // SET UP DENSITY_CURRENT @@ -56,68 +55,50 @@ PetscErrorCode NS_EULER_VORTEX(ProblemData *problem, DM dm, void *ctx) { // ------------------------------------------------------ // Create the libCEED context // ------------------------------------------------------ - CeedScalar vortex_strength = 5.; // - - CeedScalar c_tau = 0.5; // - + CeedScalar vortex_strength = 5.; // - + CeedScalar c_tau = 0.5; // - // c_tau = 0.5 is reported as "optimal" in Hughes et al 2010 - PetscReal center[3], // m - mean_velocity[3] = {1., 1., 0}; // m/s + PetscReal center[3], // m + mean_velocity[3] = {1., 1., 0}; // m/s PetscReal domain_min[3], domain_max[3], domain_size[3]; - ierr = DMGetBoundingBox(dm, domain_min, domain_max); CHKERRQ(ierr); - for (PetscInt i=0; i<3; i++) domain_size[i] = domain_max[i] - domain_min[i]; + PetscCall(DMGetBoundingBox(dm, domain_min, domain_max)); + for (PetscInt i = 0; i < 3; i++) domain_size[i] = domain_max[i] - domain_min[i]; // ------------------------------------------------------ // Create the PETSc context // ------------------------------------------------------ - 
PetscScalar meter = 1e-2; // 1 meter in scaled length units - PetscScalar second = 1e-2; // 1 second in scaled time units + PetscScalar meter = 1e-2; // 1 meter in scaled length units + PetscScalar second = 1e-2; // 1 second in scaled time units // ------------------------------------------------------ // Command line Options // ------------------------------------------------------ PetscOptionsBegin(comm, NULL, "Options for EULER_VORTEX problem", NULL); // -- Physics - ierr = PetscOptionsScalar("-vortex_strength", "Strength of Vortex", - NULL, vortex_strength, &vortex_strength, NULL); - CHKERRQ(ierr); - PetscInt n = problem->dim; + PetscCall(PetscOptionsScalar("-vortex_strength", "Strength of Vortex", NULL, vortex_strength, &vortex_strength, NULL)); + PetscInt n = problem->dim; PetscBool user_velocity; - ierr = PetscOptionsRealArray("-mean_velocity", "Background velocity vector", - NULL, mean_velocity, &n, &user_velocity); - CHKERRQ(ierr); - for (PetscInt i=0; i<3; i++) center[i] = .5*domain_size[i]; + PetscCall(PetscOptionsRealArray("-mean_velocity", "Background velocity vector", NULL, mean_velocity, &n, &user_velocity)); + for (PetscInt i = 0; i < 3; i++) center[i] = .5 * domain_size[i]; n = problem->dim; - ierr = PetscOptionsRealArray("-center", "Location of vortex center", - NULL, center, &n, NULL); CHKERRQ(ierr); - ierr = PetscOptionsBool("-implicit", "Use implicit (IFunction) formulation", - NULL, implicit=PETSC_FALSE, &implicit, NULL); - CHKERRQ(ierr); - ierr = PetscOptionsEnum("-euler_test", "Euler test option", NULL, - EulerTestTypes, (PetscEnum)(euler_test = EULER_TEST_ISENTROPIC_VORTEX), - (PetscEnum *)&euler_test, NULL); CHKERRQ(ierr); - ierr = PetscOptionsEnum("-stab", "Stabilization method", NULL, - StabilizationTypes, (PetscEnum)(stab = STAB_NONE), - (PetscEnum *)&stab, NULL); CHKERRQ(ierr); - ierr = PetscOptionsScalar("-c_tau", "Stabilization constant", - NULL, c_tau, &c_tau, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsRealArray("-center", 
"Location of vortex center", NULL, center, &n, NULL)); + PetscCall(PetscOptionsBool("-implicit", "Use implicit (IFunction) formulation", NULL, implicit = PETSC_FALSE, &implicit, NULL)); + PetscCall(PetscOptionsEnum("-euler_test", "Euler test option", NULL, EulerTestTypes, (PetscEnum)(euler_test = EULER_TEST_ISENTROPIC_VORTEX), + (PetscEnum *)&euler_test, NULL)); + PetscCall(PetscOptionsEnum("-stab", "Stabilization method", NULL, StabilizationTypes, (PetscEnum)(stab = STAB_NONE), (PetscEnum *)&stab, NULL)); + PetscCall(PetscOptionsScalar("-c_tau", "Stabilization constant", NULL, c_tau, &c_tau, NULL)); // -- Units - ierr = PetscOptionsScalar("-units_meter", "1 meter in scaled length units", - NULL, meter, &meter, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsScalar("-units_meter", "1 meter in scaled length units", NULL, meter, &meter, NULL)); meter = fabs(meter); - ierr = PetscOptionsScalar("-units_second","1 second in scaled time units", - NULL, second, &second, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsScalar("-units_second", "1 second in scaled time units", NULL, second, &second, NULL)); second = fabs(second); // -- Warnings if (stab == STAB_SUPG && !implicit) { - ierr = PetscPrintf(comm, - "Warning! Use -stab supg only with -implicit\n"); - CHKERRQ(ierr); + PetscCall(PetscPrintf(comm, "Warning! Use -stab supg only with -implicit\n")); } - if (user_velocity && (euler_test == EULER_TEST_1 - || euler_test == EULER_TEST_3)) { - ierr = PetscPrintf(comm, - "Warning! Background velocity vector for -euler_test t1 and -euler_test t3 is (0,0,0)\n"); - CHKERRQ(ierr); + if (user_velocity && (euler_test == EULER_TEST_1 || euler_test == EULER_TEST_3)) { + PetscCall(PetscPrintf(comm, "Warning! 
Background velocity vector for -euler_test t1 and -euler_test t3 is (0,0,0)\n")); } PetscOptionsEnd(); @@ -132,19 +113,19 @@ PetscErrorCode NS_EULER_VORTEX(ProblemData *problem, DM dm, void *ctx) { // Set up the libCEED context // ------------------------------------------------------ // -- Scale variables to desired units - for (PetscInt i=0; i<3; i++) { + for (PetscInt i = 0; i < 3; i++) { center[i] *= meter; domain_size[i] *= meter; - mean_velocity[i] *= (meter/second); + mean_velocity[i] *= (meter / second); } problem->dm_scale = meter; // -- QFunction Context - user->phys->stab = stab; - user->phys->euler_test = euler_test; - user->phys->implicit = implicit; - user->phys->has_curr_time = has_curr_time; - user->phys->has_neumann = has_neumann; + user->phys->stab = stab; + user->phys->euler_test = euler_test; + user->phys->implicit = implicit; + user->phys->has_curr_time = has_curr_time; + user->phys->has_neumann = has_neumann; euler_ctx->curr_time = 0.; euler_ctx->implicit = implicit; euler_ctx->euler_test = euler_test; @@ -159,45 +140,31 @@ PetscErrorCode NS_EULER_VORTEX(ProblemData *problem, DM dm, void *ctx) { euler_ctx->stabilization = stab; CeedQFunctionContextCreate(user->ceed, &euler_context); - CeedQFunctionContextSetData(euler_context, CEED_MEM_HOST, CEED_USE_POINTER, - sizeof(*euler_ctx), euler_ctx); - CeedQFunctionContextSetDataDestroy(euler_context, CEED_MEM_HOST, - FreeContextPetsc); - CeedQFunctionContextRegisterDouble(euler_context, "solution time", - offsetof(struct EulerContext_, curr_time), 1, "Physical time of the solution"); - CeedQFunctionContextReferenceCopy(euler_context, - &problem->ics.qfunction_context); - CeedQFunctionContextReferenceCopy(euler_context, - &problem->apply_vol_rhs.qfunction_context); - CeedQFunctionContextReferenceCopy(euler_context, - &problem->apply_vol_ifunction.qfunction_context); - CeedQFunctionContextReferenceCopy(euler_context, - &problem->apply_inflow.qfunction_context); - 
CeedQFunctionContextReferenceCopy(euler_context, - &problem->apply_outflow.qfunction_context); + CeedQFunctionContextSetData(euler_context, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(*euler_ctx), euler_ctx); + CeedQFunctionContextSetDataDestroy(euler_context, CEED_MEM_HOST, FreeContextPetsc); + CeedQFunctionContextRegisterDouble(euler_context, "solution time", offsetof(struct EulerContext_, curr_time), 1, "Physical time of the solution"); + CeedQFunctionContextReferenceCopy(euler_context, &problem->ics.qfunction_context); + CeedQFunctionContextReferenceCopy(euler_context, &problem->apply_vol_rhs.qfunction_context); + CeedQFunctionContextReferenceCopy(euler_context, &problem->apply_vol_ifunction.qfunction_context); + CeedQFunctionContextReferenceCopy(euler_context, &problem->apply_inflow.qfunction_context); + CeedQFunctionContextReferenceCopy(euler_context, &problem->apply_outflow.qfunction_context); PetscFunctionReturn(0); } -PetscErrorCode PRINT_EULER_VORTEX(ProblemData *problem, - AppCtx app_ctx) { - MPI_Comm comm = PETSC_COMM_WORLD; - PetscErrorCode ierr; - EulerContext euler_ctx; +PetscErrorCode PRINT_EULER_VORTEX(ProblemData *problem, AppCtx app_ctx) { + MPI_Comm comm = PETSC_COMM_WORLD; + EulerContext euler_ctx; PetscFunctionBeginUser; - CeedQFunctionContextGetData(problem->ics.qfunction_context, CEED_MEM_HOST, - &euler_ctx); - ierr = PetscPrintf(comm, - " Problem:\n" - " Problem Name : %s\n" - " Test Case : %s\n" - " Background Velocity : %f,%f,%f\n" - " Stabilization : %s\n", - app_ctx->problem_name, EulerTestTypes[euler_ctx->euler_test], - euler_ctx->mean_velocity[0], - euler_ctx->mean_velocity[1], - euler_ctx->mean_velocity[2], - StabilizationTypes[euler_ctx->stabilization]); CHKERRQ(ierr); + CeedQFunctionContextGetData(problem->ics.qfunction_context, CEED_MEM_HOST, &euler_ctx); + PetscCall(PetscPrintf(comm, + " Problem:\n" + " Problem Name : %s\n" + " Test Case : %s\n" + " Background Velocity : %f,%f,%f\n" + " Stabilization : %s\n", + app_ctx->problem_name, 
EulerTestTypes[euler_ctx->euler_test], euler_ctx->mean_velocity[0], euler_ctx->mean_velocity[1], + euler_ctx->mean_velocity[2], StabilizationTypes[euler_ctx->stabilization])); CeedQFunctionContextRestoreData(problem->ics.qfunction_context, &euler_ctx); PetscFunctionReturn(0); diff --git a/examples/fluids/problems/freestream_bc.c b/examples/fluids/problems/freestream_bc.c index 1bfd60b504..1ad070632b 100644 --- a/examples/fluids/problems/freestream_bc.c +++ b/examples/fluids/problems/freestream_bc.c @@ -8,53 +8,46 @@ /// @file /// Utility functions for setting up Freestream boundary condition -#include "../navierstokes.h" #include "../qfunctions/freestream_bc.h" +#include "../navierstokes.h" + PetscErrorCode FreestreamBCSetup(ProblemData *problem, DM dm, void *ctx) { - User user = *(User *)ctx; - MPI_Comm comm = PETSC_COMM_WORLD; - FreestreamContext freestream_ctx; + User user = *(User *)ctx; + MPI_Comm comm = PETSC_COMM_WORLD; + FreestreamContext freestream_ctx; NewtonianIdealGasContext newtonian_ig_ctx; - CeedQFunctionContext freestream_context; + CeedQFunctionContext freestream_context; PetscFunctionBeginUser; // *INDENT-OFF* switch (user->phys->state_var) { - case STATEVAR_CONSERVATIVE: - problem->apply_freestream.qfunction = Freestream_Conserv; - problem->apply_freestream.qfunction_loc = Freestream_Conserv_loc; - problem->apply_freestream_jacobian.qfunction = Freestream_Jacobian_Conserv; - problem->apply_freestream_jacobian.qfunction_loc = Freestream_Jacobian_Conserv_loc; - case STATEVAR_PRIMITIVE: - problem->apply_freestream.qfunction = Freestream_Prim; - problem->apply_freestream.qfunction_loc = Freestream_Prim_loc; - problem->apply_freestream_jacobian.qfunction = Freestream_Jacobian_Prim; - problem->apply_freestream_jacobian.qfunction_loc = Freestream_Jacobian_Prim_loc; + case STATEVAR_CONSERVATIVE: + problem->apply_freestream.qfunction = Freestream_Conserv; + problem->apply_freestream.qfunction_loc = Freestream_Conserv_loc; + 
problem->apply_freestream_jacobian.qfunction = Freestream_Jacobian_Conserv; + problem->apply_freestream_jacobian.qfunction_loc = Freestream_Jacobian_Conserv_loc; + case STATEVAR_PRIMITIVE: + problem->apply_freestream.qfunction = Freestream_Prim; + problem->apply_freestream.qfunction_loc = Freestream_Prim_loc; + problem->apply_freestream_jacobian.qfunction = Freestream_Jacobian_Prim; + problem->apply_freestream_jacobian.qfunction_loc = Freestream_Jacobian_Prim_loc; } // *INDENT-ON* // -- Option Defaults - CeedScalar U_inf[3] = {0.}; // m/s - CeedScalar T_inf = 288.; // K - CeedScalar P_inf = 1.01e5; // Pa + CeedScalar U_inf[3] = {0.}; // m/s + CeedScalar T_inf = 288.; // K + CeedScalar P_inf = 1.01e5; // Pa - PetscOptionsBegin(comm, NULL, "Options for Freestream boundary condition", - NULL); - PetscInt narray=3; - PetscCall(PetscOptionsScalarArray("-velocity_freestream", - "Velocity at freestream condition", - NULL, U_inf, &narray, NULL)); - PetscCheck(narray == 3, comm, PETSC_ERR_ARG_SIZ, - "-velocity_freestream should recieve array of size 3, instead recieved size %" - PetscInt_FMT".", narray); + PetscOptionsBegin(comm, NULL, "Options for Freestream boundary condition", NULL); + PetscInt narray = 3; + PetscCall(PetscOptionsScalarArray("-velocity_freestream", "Velocity at freestream condition", NULL, U_inf, &narray, NULL)); + PetscCheck(narray == 3, comm, PETSC_ERR_ARG_SIZ, "-velocity_freestream should recieve array of size 3, instead recieved size %" PetscInt_FMT ".", + narray); - PetscCall(PetscOptionsScalar("-temperature_freestream", - "Temperature at freestream condition", - NULL, T_inf, &T_inf, NULL)); - PetscCall(PetscOptionsScalar("-pressure_freestream", - "Pressure at freestream condition", - NULL, P_inf, &P_inf, NULL)); + PetscCall(PetscOptionsScalar("-temperature_freestream", "Temperature at freestream condition", NULL, T_inf, &T_inf, NULL)); + PetscCall(PetscOptionsScalar("-pressure_freestream", "Pressure at freestream condition", NULL, P_inf, &P_inf, 
NULL)); PetscOptionsEnd(); PetscScalar meter = user->units->meter; @@ -64,15 +57,14 @@ PetscErrorCode FreestreamBCSetup(ProblemData *problem, DM dm, void *ctx) { T_inf *= Kelvin; P_inf *= Pascal; - for (int i=0; i<3; i++) U_inf[i] *= meter / second; + for (int i = 0; i < 3; i++) U_inf[i] *= meter / second; - CeedQFunctionContextGetData(problem->apply_vol_rhs.qfunction_context, - CEED_MEM_HOST, &newtonian_ig_ctx); + CeedQFunctionContextGetData(problem->apply_vol_rhs.qfunction_context, CEED_MEM_HOST, &newtonian_ig_ctx); State S_infty; { CeedScalar Y[5] = {P_inf, U_inf[0], U_inf[1], U_inf[2], T_inf}; CeedScalar x[3] = {0.}; - S_infty = StateFromY(newtonian_ig_ctx, Y, x); + S_infty = StateFromY(newtonian_ig_ctx, Y, x); } // -- Set freestream_ctx struct values @@ -80,15 +72,12 @@ PetscErrorCode FreestreamBCSetup(ProblemData *problem, DM dm, void *ctx) { freestream_ctx->newtonian_ctx = *newtonian_ig_ctx; freestream_ctx->S_infty = S_infty; - CeedQFunctionContextRestoreData(problem->apply_vol_rhs.qfunction_context, - &newtonian_ig_ctx); + CeedQFunctionContextRestoreData(problem->apply_vol_rhs.qfunction_context, &newtonian_ig_ctx); CeedQFunctionContextCreate(user->ceed, &freestream_context); - CeedQFunctionContextSetData(freestream_context, CEED_MEM_HOST, CEED_USE_POINTER, - sizeof(*freestream_ctx), freestream_ctx); - CeedQFunctionContextSetDataDestroy(freestream_context, CEED_MEM_HOST, - FreeContextPetsc); - problem->apply_freestream.qfunction_context = freestream_context; + CeedQFunctionContextSetData(freestream_context, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(*freestream_ctx), freestream_ctx); + CeedQFunctionContextSetDataDestroy(freestream_context, CEED_MEM_HOST, FreeContextPetsc); + problem->apply_freestream.qfunction_context = freestream_context; problem->apply_freestream_jacobian.qfunction_context = freestream_context; PetscFunctionReturn(0); diff --git a/examples/fluids/problems/newtonian.c b/examples/fluids/problems/newtonian.c index 4898101a04..6101887ecf 100644 
--- a/examples/fluids/problems/newtonian.c +++ b/examples/fluids/problems/newtonian.c @@ -8,59 +8,48 @@ /// @file /// Utility functions for setting up problems using the Newtonian Qfunction +#include "../qfunctions/newtonian.h" + #include "../navierstokes.h" #include "../qfunctions/setupgeo.h" -#include "../qfunctions/newtonian.h" // Compute relative error |a - b|/|s| -static PetscErrorCode CheckPrimitiveWithTolerance(StatePrimitive sY, - StatePrimitive aY, StatePrimitive bY, const char *name, PetscReal rtol_pressure, - PetscReal rtol_velocity, PetscReal rtol_temperature) { - +static PetscErrorCode CheckPrimitiveWithTolerance(StatePrimitive sY, StatePrimitive aY, StatePrimitive bY, const char *name, PetscReal rtol_pressure, + PetscReal rtol_velocity, PetscReal rtol_temperature) { PetscFunctionBeginUser; - StatePrimitive eY; // relative error - eY.pressure = (aY.pressure - bY.pressure) / sY.pressure; - PetscScalar u = sqrt(Square(sY.velocity[0]) + Square(sY.velocity[1]) + Square( - sY.velocity[2])); - for (int j=0; j<3; j++) eY.velocity[j] = (aY.velocity[j] - bY.velocity[j]) / u; + StatePrimitive eY; // relative error + eY.pressure = (aY.pressure - bY.pressure) / sY.pressure; + PetscScalar u = sqrt(Square(sY.velocity[0]) + Square(sY.velocity[1]) + Square(sY.velocity[2])); + for (int j = 0; j < 3; j++) eY.velocity[j] = (aY.velocity[j] - bY.velocity[j]) / u; eY.temperature = (aY.temperature - bY.temperature) / sY.temperature; - if (fabs(eY.pressure) > rtol_pressure) - printf("%s: pressure error %g\n", name, eY.pressure); - for (int j=0; j<3; j++) - if (fabs(eY.velocity[j]) > rtol_velocity) - printf("%s: velocity[%d] error %g\n", name, j, eY.velocity[j]); - if (fabs(eY.temperature) > rtol_temperature) - printf("%s: temperature error %g\n", name, eY.temperature); + if (fabs(eY.pressure) > rtol_pressure) printf("%s: pressure error %g\n", name, eY.pressure); + for (int j = 0; j < 3; j++) { + if (fabs(eY.velocity[j]) > rtol_velocity) printf("%s: velocity[%d] error %g\n", 
name, j, eY.velocity[j]); + } + if (fabs(eY.temperature) > rtol_temperature) printf("%s: temperature error %g\n", name, eY.temperature); PetscFunctionReturn(0); } -static PetscErrorCode UnitTests_Newtonian(User user, - NewtonianIdealGasContext gas) { - - Units units = user->units; - const CeedScalar eps = 1e-6; - const CeedScalar kg = units->kilogram, - m = units->meter, - sec = units->second, - Pascal = units->Pascal; +static PetscErrorCode UnitTests_Newtonian(User user, NewtonianIdealGasContext gas) { + Units units = user->units; + const CeedScalar eps = 1e-6; + const CeedScalar kg = units->kilogram, m = units->meter, sec = units->second, Pascal = units->Pascal; PetscFunctionBeginUser; - const CeedScalar rho = 1.2 * kg / (m*m*m), - u = 40 * m/sec; - CeedScalar U[5] = {rho, rho*u, rho *u*1.1, rho *u*1.2, 250e3*Pascal + .5*rho *u*u}; + const CeedScalar rho = 1.2 * kg / (m * m * m), u = 40 * m / sec; + CeedScalar U[5] = {rho, rho * u, rho * u * 1.1, rho * u * 1.2, 250e3 * Pascal + .5 * rho * u * u}; const CeedScalar x[3] = {.1, .2, .3}; - State s = StateFromU(gas, U, x); - for (int i=0; i<8; i++) { + State s = StateFromU(gas, U, x); + for (int i = 0; i < 8; i++) { CeedScalar dU[5] = {0}, dx[3] = {0}; if (i < 5) dU[i] = U[i]; - else dx[i-5] = x[i-5]; + else dx[i - 5] = x[i - 5]; State ds = StateFromU_fwd(gas, s, dU, x, dx); - for (int j=0; j<5; j++) dU[j] = (1 + eps * (i == j)) * U[j]; - for (int j=0; j<3; j++) dx[j] = (1 + eps * (i == 5 + j)) * x[j]; - State t = StateFromU(gas, dU, dx); + for (int j = 0; j < 5; j++) dU[j] = (1 + eps * (i == j)) * U[j]; + for (int j = 0; j < 3; j++) dx[j] = (1 + eps * (i == 5 + j)) * x[j]; + State t = StateFromU(gas, dU, dx); StatePrimitive dY; dY.pressure = (t.Y.pressure - s.Y.pressure) / eps; - for (int j=0; j<3; j++) - dY.velocity[j] = (t.Y.velocity[j] - s.Y.velocity[j]) / eps; + for (int j = 0; j < 3; j++) dY.velocity[j] = (t.Y.velocity[j] - s.Y.velocity[j]) / eps; dY.temperature = (t.Y.temperature - s.Y.temperature) / eps; char 
buf[128]; snprintf(buf, sizeof buf, "StateFromU_fwd i=%d", i); @@ -70,56 +59,54 @@ static PetscErrorCode UnitTests_Newtonian(User user, } PetscErrorCode NS_NEWTONIAN_IG(ProblemData *problem, DM dm, void *ctx) { - - SetupContext setup_context; - User user = *(User *)ctx; - StabilizationType stab; - StateVariable state_var; - MPI_Comm comm = PETSC_COMM_WORLD; - PetscBool implicit; - PetscBool has_curr_time = PETSC_FALSE, unit_tests; - PetscInt ierr; + SetupContext setup_context; + User user = *(User *)ctx; + StabilizationType stab; + StateVariable state_var; + MPI_Comm comm = PETSC_COMM_WORLD; + PetscBool implicit; + PetscBool has_curr_time = PETSC_FALSE, unit_tests; NewtonianIdealGasContext newtonian_ig_ctx; - CeedQFunctionContext newtonian_ig_context; + CeedQFunctionContext newtonian_ig_context; PetscFunctionBeginUser; - ierr = PetscCalloc1(1, &setup_context); CHKERRQ(ierr); - ierr = PetscCalloc1(1, &newtonian_ig_ctx); CHKERRQ(ierr); + PetscCall(PetscCalloc1(1, &setup_context)); + PetscCall(PetscCalloc1(1, &newtonian_ig_ctx)); // ------------------------------------------------------ // Setup Generic Newtonian IG Problem // ------------------------------------------------------ - problem->dim = 3; - problem->q_data_size_vol = 10; - problem->q_data_size_sur = 10; - problem->jac_data_size_sur = 11; - problem->setup_vol.qfunction = Setup; - problem->setup_vol.qfunction_loc = Setup_loc; - problem->setup_sur.qfunction = SetupBoundary; - problem->setup_sur.qfunction_loc = SetupBoundary_loc; - problem->bc = NULL; - problem->bc_ctx = setup_context; - problem->non_zero_time = PETSC_FALSE; - problem->print_info = PRINT_NEWTONIAN; + problem->dim = 3; + problem->q_data_size_vol = 10; + problem->q_data_size_sur = 10; + problem->jac_data_size_sur = 11; + problem->setup_vol.qfunction = Setup; + problem->setup_vol.qfunction_loc = Setup_loc; + problem->setup_sur.qfunction = SetupBoundary; + problem->setup_sur.qfunction_loc = SetupBoundary_loc; + problem->bc = NULL; + 
problem->bc_ctx = setup_context; + problem->non_zero_time = PETSC_FALSE; + problem->print_info = PRINT_NEWTONIAN; // ------------------------------------------------------ // Create the libCEED context // ------------------------------------------------------ - CeedScalar cv = 717.; // J/(kg K) - CeedScalar cp = 1004.; // J/(kg K) - CeedScalar g[3] = {0, 0, -9.81}; // m/s^2 - CeedScalar lambda = -2./3.; // - - CeedScalar mu = 1.8e-5; // Pa s, dynamic viscosity - CeedScalar k = 0.02638; // W/(m K) - CeedScalar c_tau = 0.5; // - - CeedScalar Ctau_t = 1.0; // - - CeedScalar Ctau_v = 36.0; // TODO make function of degree - CeedScalar Ctau_C = 1.0; // TODO make function of degree - CeedScalar Ctau_M = 1.0; // TODO make function of degree - CeedScalar Ctau_E = 1.0; // TODO make function of degree - PetscReal domain_min[3], domain_max[3], domain_size[3]; - ierr = DMGetBoundingBox(dm, domain_min, domain_max); CHKERRQ(ierr); - for (PetscInt i=0; i<3; i++) domain_size[i] = domain_max[i] - domain_min[i]; + CeedScalar cv = 717.; // J/(kg K) + CeedScalar cp = 1004.; // J/(kg K) + CeedScalar g[3] = {0, 0, -9.81}; // m/s^2 + CeedScalar lambda = -2. 
/ 3.; // - + CeedScalar mu = 1.8e-5; // Pa s, dynamic viscosity + CeedScalar k = 0.02638; // W/(m K) + CeedScalar c_tau = 0.5; // - + CeedScalar Ctau_t = 1.0; // - + CeedScalar Ctau_v = 36.0; // TODO make function of degree + CeedScalar Ctau_C = 1.0; // TODO make function of degree + CeedScalar Ctau_M = 1.0; // TODO make function of degree + CeedScalar Ctau_E = 1.0; // TODO make function of degree + PetscReal domain_min[3], domain_max[3], domain_size[3]; + PetscCall(DMGetBoundingBox(dm, domain_min, domain_max)); + for (PetscInt i = 0; i < 3; i++) domain_size[i] = domain_max[i] - domain_min[i]; // ------------------------------------------------------ // Create the PETSc context @@ -133,115 +120,86 @@ PetscErrorCode NS_NEWTONIAN_IG(ProblemData *problem, DM dm, void *ctx) { // ------------------------------------------------------ // Command line Options // ------------------------------------------------------ - PetscOptionsBegin(comm, NULL, "Options for Newtonian Ideal Gas based problem", - NULL); + PetscOptionsBegin(comm, NULL, "Options for Newtonian Ideal Gas based problem", NULL); // -- Conservative vs Primitive variables - ierr = PetscOptionsEnum("-state_var", "State variables used", NULL, - StateVariables, (PetscEnum)(state_var = STATEVAR_CONSERVATIVE), - (PetscEnum *)&state_var, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsEnum("-state_var", "State variables used", NULL, StateVariables, (PetscEnum)(state_var = STATEVAR_CONSERVATIVE), + (PetscEnum *)&state_var, NULL)); // *INDENT-OFF* switch (state_var) { - case STATEVAR_CONSERVATIVE: - problem->ics.qfunction = ICsNewtonianIG; - problem->ics.qfunction_loc = ICsNewtonianIG_loc; - problem->apply_vol_rhs.qfunction = RHSFunction_Newtonian; - problem->apply_vol_rhs.qfunction_loc = RHSFunction_Newtonian_loc; - problem->apply_vol_ifunction.qfunction = IFunction_Newtonian_Conserv; - problem->apply_vol_ifunction.qfunction_loc = IFunction_Newtonian_Conserv_loc; - problem->apply_vol_ijacobian.qfunction = 
IJacobian_Newtonian_Conserv; - problem->apply_vol_ijacobian.qfunction_loc = IJacobian_Newtonian_Conserv_loc; - problem->apply_inflow.qfunction = BoundaryIntegral_Conserv; - problem->apply_inflow.qfunction_loc = BoundaryIntegral_Conserv_loc; - problem->apply_inflow_jacobian.qfunction = BoundaryIntegral_Jacobian_Conserv; - problem->apply_inflow_jacobian.qfunction_loc = BoundaryIntegral_Jacobian_Conserv_loc; - problem->apply_outflow.qfunction = PressureOutflow_Conserv; - problem->apply_outflow.qfunction_loc = PressureOutflow_Conserv_loc; - problem->apply_outflow_jacobian.qfunction = PressureOutflow_Jacobian_Conserv; - problem->apply_outflow_jacobian.qfunction_loc = PressureOutflow_Jacobian_Conserv_loc; - break; + case STATEVAR_CONSERVATIVE: + problem->ics.qfunction = ICsNewtonianIG; + problem->ics.qfunction_loc = ICsNewtonianIG_loc; + problem->apply_vol_rhs.qfunction = RHSFunction_Newtonian; + problem->apply_vol_rhs.qfunction_loc = RHSFunction_Newtonian_loc; + problem->apply_vol_ifunction.qfunction = IFunction_Newtonian_Conserv; + problem->apply_vol_ifunction.qfunction_loc = IFunction_Newtonian_Conserv_loc; + problem->apply_vol_ijacobian.qfunction = IJacobian_Newtonian_Conserv; + problem->apply_vol_ijacobian.qfunction_loc = IJacobian_Newtonian_Conserv_loc; + problem->apply_inflow.qfunction = BoundaryIntegral_Conserv; + problem->apply_inflow.qfunction_loc = BoundaryIntegral_Conserv_loc; + problem->apply_inflow_jacobian.qfunction = BoundaryIntegral_Jacobian_Conserv; + problem->apply_inflow_jacobian.qfunction_loc = BoundaryIntegral_Jacobian_Conserv_loc; + problem->apply_outflow.qfunction = PressureOutflow_Conserv; + problem->apply_outflow.qfunction_loc = PressureOutflow_Conserv_loc; + problem->apply_outflow_jacobian.qfunction = PressureOutflow_Jacobian_Conserv; + problem->apply_outflow_jacobian.qfunction_loc = PressureOutflow_Jacobian_Conserv_loc; + break; - case STATEVAR_PRIMITIVE: - problem->ics.qfunction = ICsNewtonianIG_Prim; - problem->ics.qfunction_loc = 
ICsNewtonianIG_Prim_loc; - problem->apply_vol_ifunction.qfunction = IFunction_Newtonian_Prim; - problem->apply_vol_ifunction.qfunction_loc = IFunction_Newtonian_Prim_loc; - problem->apply_vol_ijacobian.qfunction = IJacobian_Newtonian_Prim; - problem->apply_vol_ijacobian.qfunction_loc = IJacobian_Newtonian_Prim_loc; - problem->apply_inflow.qfunction = BoundaryIntegral_Prim; - problem->apply_inflow.qfunction_loc = BoundaryIntegral_Prim_loc; - problem->apply_inflow_jacobian.qfunction = BoundaryIntegral_Jacobian_Prim; - problem->apply_inflow_jacobian.qfunction_loc = BoundaryIntegral_Jacobian_Prim_loc; - problem->apply_outflow.qfunction = PressureOutflow_Prim; - problem->apply_outflow.qfunction_loc = PressureOutflow_Prim_loc; - problem->apply_outflow_jacobian.qfunction = PressureOutflow_Jacobian_Prim; - problem->apply_outflow_jacobian.qfunction_loc = PressureOutflow_Jacobian_Prim_loc; - break; + case STATEVAR_PRIMITIVE: + problem->ics.qfunction = ICsNewtonianIG_Prim; + problem->ics.qfunction_loc = ICsNewtonianIG_Prim_loc; + problem->apply_vol_ifunction.qfunction = IFunction_Newtonian_Prim; + problem->apply_vol_ifunction.qfunction_loc = IFunction_Newtonian_Prim_loc; + problem->apply_vol_ijacobian.qfunction = IJacobian_Newtonian_Prim; + problem->apply_vol_ijacobian.qfunction_loc = IJacobian_Newtonian_Prim_loc; + problem->apply_inflow.qfunction = BoundaryIntegral_Prim; + problem->apply_inflow.qfunction_loc = BoundaryIntegral_Prim_loc; + problem->apply_inflow_jacobian.qfunction = BoundaryIntegral_Jacobian_Prim; + problem->apply_inflow_jacobian.qfunction_loc = BoundaryIntegral_Jacobian_Prim_loc; + problem->apply_outflow.qfunction = PressureOutflow_Prim; + problem->apply_outflow.qfunction_loc = PressureOutflow_Prim_loc; + problem->apply_outflow_jacobian.qfunction = PressureOutflow_Jacobian_Prim; + problem->apply_outflow_jacobian.qfunction_loc = PressureOutflow_Jacobian_Prim_loc; + break; } // *INDENT-ON* // -- Physics - ierr = PetscOptionsScalar("-cv", "Heat capacity at 
constant volume", - NULL, cv, &cv, NULL); CHKERRQ(ierr); - ierr = PetscOptionsScalar("-cp", "Heat capacity at constant pressure", - NULL, cp, &cp, NULL); CHKERRQ(ierr); - ierr = PetscOptionsScalar("-lambda", - "Stokes hypothesis second viscosity coefficient", - NULL, lambda, &lambda, NULL); CHKERRQ(ierr); - ierr = PetscOptionsScalar("-mu", "Shear dynamic viscosity coefficient", - NULL, mu, &mu, NULL); CHKERRQ(ierr); - ierr = PetscOptionsScalar("-k", "Thermal conductivity", - NULL, k, &k, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsScalar("-cv", "Heat capacity at constant volume", NULL, cv, &cv, NULL)); + PetscCall(PetscOptionsScalar("-cp", "Heat capacity at constant pressure", NULL, cp, &cp, NULL)); + PetscCall(PetscOptionsScalar("-lambda", "Stokes hypothesis second viscosity coefficient", NULL, lambda, &lambda, NULL)); + PetscCall(PetscOptionsScalar("-mu", "Shear dynamic viscosity coefficient", NULL, mu, &mu, NULL)); + PetscCall(PetscOptionsScalar("-k", "Thermal conductivity", NULL, k, &k, NULL)); PetscInt dim = problem->dim; - ierr = PetscOptionsRealArray("-g", "Gravitational acceleration", - NULL, g, &dim, NULL); CHKERRQ(ierr); - ierr = PetscOptionsEnum("-stab", "Stabilization method", NULL, - StabilizationTypes, (PetscEnum)(stab = STAB_NONE), - (PetscEnum *)&stab, NULL); CHKERRQ(ierr); - ierr = PetscOptionsScalar("-c_tau", "Stabilization constant", - NULL, c_tau, &c_tau, NULL); CHKERRQ(ierr); - ierr = PetscOptionsScalar("-Ctau_t", "Stabilization time constant", - NULL, Ctau_t, &Ctau_t, NULL); CHKERRQ(ierr); - ierr = PetscOptionsScalar("-Ctau_v", "Stabilization viscous constant", - NULL, Ctau_v, &Ctau_v, NULL); CHKERRQ(ierr); - ierr = PetscOptionsScalar("-Ctau_C", "Stabilization continuity constant", - NULL, Ctau_C, &Ctau_C, NULL); CHKERRQ(ierr); - ierr = PetscOptionsScalar("-Ctau_M", "Stabilization momentum constant", - NULL, Ctau_M, &Ctau_M, NULL); CHKERRQ(ierr); - ierr = PetscOptionsScalar("-Ctau_E", "Stabilization energy constant", - NULL, Ctau_E, &Ctau_E, 
NULL); CHKERRQ(ierr); - ierr = PetscOptionsBool("-implicit", "Use implicit (IFunction) formulation", - NULL, implicit=PETSC_FALSE, &implicit, NULL); - CHKERRQ(ierr); - ierr = PetscOptionsBool("-newtonian_unit_tests", "Run Newtonian unit tests", - NULL, unit_tests=PETSC_FALSE, &unit_tests, NULL); - CHKERRQ(ierr); + PetscCall(PetscOptionsRealArray("-g", "Gravitational acceleration", NULL, g, &dim, NULL)); + PetscCall(PetscOptionsEnum("-stab", "Stabilization method", NULL, StabilizationTypes, (PetscEnum)(stab = STAB_NONE), (PetscEnum *)&stab, NULL)); + PetscCall(PetscOptionsScalar("-c_tau", "Stabilization constant", NULL, c_tau, &c_tau, NULL)); + PetscCall(PetscOptionsScalar("-Ctau_t", "Stabilization time constant", NULL, Ctau_t, &Ctau_t, NULL)); + PetscCall(PetscOptionsScalar("-Ctau_v", "Stabilization viscous constant", NULL, Ctau_v, &Ctau_v, NULL)); + PetscCall(PetscOptionsScalar("-Ctau_C", "Stabilization continuity constant", NULL, Ctau_C, &Ctau_C, NULL)); + PetscCall(PetscOptionsScalar("-Ctau_M", "Stabilization momentum constant", NULL, Ctau_M, &Ctau_M, NULL)); + PetscCall(PetscOptionsScalar("-Ctau_E", "Stabilization energy constant", NULL, Ctau_E, &Ctau_E, NULL)); + PetscCall(PetscOptionsBool("-implicit", "Use implicit (IFunction) formulation", NULL, implicit = PETSC_FALSE, &implicit, NULL)); + PetscCall(PetscOptionsBool("-newtonian_unit_tests", "Run Newtonian unit tests", NULL, unit_tests = PETSC_FALSE, &unit_tests, NULL)); // -- Units - ierr = PetscOptionsScalar("-units_meter", "1 meter in scaled length units", - NULL, meter, &meter, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsScalar("-units_meter", "1 meter in scaled length units", NULL, meter, &meter, NULL)); meter = fabs(meter); - ierr = PetscOptionsScalar("-units_kilogram","1 kilogram in scaled mass units", - NULL, kilogram, &kilogram, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsScalar("-units_kilogram", "1 kilogram in scaled mass units", NULL, kilogram, &kilogram, NULL)); kilogram = fabs(kilogram); - 
ierr = PetscOptionsScalar("-units_second","1 second in scaled time units", - NULL, second, &second, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsScalar("-units_second", "1 second in scaled time units", NULL, second, &second, NULL)); second = fabs(second); - ierr = PetscOptionsScalar("-units_Kelvin", - "1 Kelvin in scaled temperature units", - NULL, Kelvin, &Kelvin, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsScalar("-units_Kelvin", "1 Kelvin in scaled temperature units", NULL, Kelvin, &Kelvin, NULL)); Kelvin = fabs(Kelvin); // -- Warnings if (stab == STAB_SUPG && !implicit) { - ierr = PetscPrintf(comm, - "Warning! Use -stab supg only with -implicit\n"); - CHKERRQ(ierr); + PetscCall(PetscPrintf(comm, "Warning! Use -stab supg only with -implicit\n")); } - if (state_var==STATEVAR_PRIMITIVE && !implicit) { - SETERRQ(comm, PETSC_ERR_ARG_NULL, - "RHSFunction is not provided for primitive variables (use -state_var primitive only with -implicit)\n"); + if (state_var == STATEVAR_PRIMITIVE && !implicit) { + SETERRQ(comm, PETSC_ERR_ARG_NULL, "RHSFunction is not provided for primitive variables (use -state_var primitive only with -implicit)\n"); } PetscOptionsEnd(); @@ -250,9 +208,9 @@ PetscErrorCode NS_NEWTONIAN_IG(ProblemData *problem, DM dm, void *ctx) { // ------------------------------------------------------ // -- Define derived units Pascal = kilogram / (meter * PetscSqr(second)); - J_per_kg_K = PetscSqr(meter) / (PetscSqr(second) * Kelvin); + J_per_kg_K = PetscSqr(meter) / (PetscSqr(second) * Kelvin); m_per_squared_s = meter / PetscSqr(second); - W_per_m_K = kilogram * meter / (pow(second,3) * Kelvin); + W_per_m_K = kilogram * meter / (pow(second, 3) * Kelvin); user->units->meter = meter; user->units->kilogram = kilogram; @@ -267,22 +225,22 @@ PetscErrorCode NS_NEWTONIAN_IG(ProblemData *problem, DM dm, void *ctx) { // Set up the libCEED context // ------------------------------------------------------ // -- Scale variables to desired units - cv *= J_per_kg_K; - cp 
*= J_per_kg_K; - mu *= Pascal * second; - k *= W_per_m_K; - for (PetscInt i=0; i<3; i++) domain_size[i] *= meter; - for (PetscInt i=0; i<3; i++) g[i] *= m_per_squared_s; + cv *= J_per_kg_K; + cp *= J_per_kg_K; + mu *= Pascal * second; + k *= W_per_m_K; + for (PetscInt i = 0; i < 3; i++) domain_size[i] *= meter; + for (PetscInt i = 0; i < 3; i++) g[i] *= m_per_squared_s; problem->dm_scale = meter; // -- Setup Context - setup_context->cv = cv; - setup_context->cp = cp; - setup_context->lx = domain_size[0]; - setup_context->ly = domain_size[1]; - setup_context->lz = domain_size[2]; - setup_context->time = 0; - ierr = PetscArraycpy(setup_context->g, g, 3); CHKERRQ(ierr); + setup_context->cv = cv; + setup_context->cp = cp; + setup_context->lx = domain_size[0]; + setup_context->ly = domain_size[1]; + setup_context->lz = domain_size[2]; + setup_context->time = 0; + PetscCall(PetscArraycpy(setup_context->g, g, 3)); // -- Solver Settings user->phys->stab = stab; @@ -304,43 +262,29 @@ PetscErrorCode NS_NEWTONIAN_IG(ProblemData *problem, DM dm, void *ctx) { newtonian_ig_ctx->Ctau_E = Ctau_E; newtonian_ig_ctx->stabilization = stab; newtonian_ig_ctx->is_implicit = implicit; - newtonian_ig_ctx->state_var = state_var; - ierr = PetscArraycpy(newtonian_ig_ctx->g, g, 3); CHKERRQ(ierr); + newtonian_ig_ctx->state_var = state_var; + PetscCall(PetscArraycpy(newtonian_ig_ctx->g, g, 3)); CeedQFunctionContextCreate(user->ceed, &problem->ics.qfunction_context); - CeedQFunctionContextSetData(problem->ics.qfunction_context, CEED_MEM_HOST, - CEED_USE_POINTER, sizeof(*setup_context), setup_context); - CeedQFunctionContextSetDataDestroy(problem->ics.qfunction_context, - CEED_MEM_HOST, - FreeContextPetsc); - CeedQFunctionContextRegisterDouble(problem->ics.qfunction_context, - "evaluation time", - (char *)&setup_context->time - (char *)setup_context, 1, "Time of evaluation"); + CeedQFunctionContextSetData(problem->ics.qfunction_context, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(*setup_context), 
setup_context); + CeedQFunctionContextSetDataDestroy(problem->ics.qfunction_context, CEED_MEM_HOST, FreeContextPetsc); + CeedQFunctionContextRegisterDouble(problem->ics.qfunction_context, "evaluation time", (char *)&setup_context->time - (char *)setup_context, 1, + "Time of evaluation"); CeedQFunctionContextCreate(user->ceed, &newtonian_ig_context); - CeedQFunctionContextSetData(newtonian_ig_context, CEED_MEM_HOST, - CEED_USE_POINTER, - sizeof(*newtonian_ig_ctx), newtonian_ig_ctx); - CeedQFunctionContextSetDataDestroy(newtonian_ig_context, CEED_MEM_HOST, - FreeContextPetsc); - CeedQFunctionContextRegisterDouble(newtonian_ig_context, "timestep size", - offsetof(struct NewtonianIdealGasContext_, dt), 1, "Size of timestep, delta t"); - CeedQFunctionContextRegisterDouble(newtonian_ig_context, "ijacobian time shift", - offsetof(struct NewtonianIdealGasContext_, ijacobian_time_shift), 1, - "Shift for mass matrix in IJacobian"); + CeedQFunctionContextSetData(newtonian_ig_context, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(*newtonian_ig_ctx), newtonian_ig_ctx); + CeedQFunctionContextSetDataDestroy(newtonian_ig_context, CEED_MEM_HOST, FreeContextPetsc); + CeedQFunctionContextRegisterDouble(newtonian_ig_context, "timestep size", offsetof(struct NewtonianIdealGasContext_, dt), 1, + "Size of timestep, delta t"); + CeedQFunctionContextRegisterDouble(newtonian_ig_context, "ijacobian time shift", offsetof(struct NewtonianIdealGasContext_, ijacobian_time_shift), + 1, "Shift for mass matrix in IJacobian"); problem->apply_vol_rhs.qfunction_context = newtonian_ig_context; - CeedQFunctionContextReferenceCopy(newtonian_ig_context, - &problem->apply_vol_ifunction.qfunction_context); - CeedQFunctionContextReferenceCopy(newtonian_ig_context, - &problem->apply_vol_ijacobian.qfunction_context); - CeedQFunctionContextReferenceCopy(newtonian_ig_context, - &problem->apply_inflow.qfunction_context); - CeedQFunctionContextReferenceCopy(newtonian_ig_context, - 
&problem->apply_inflow_jacobian.qfunction_context); - CeedQFunctionContextReferenceCopy(newtonian_ig_context, - &problem->apply_outflow.qfunction_context); - CeedQFunctionContextReferenceCopy(newtonian_ig_context, - &problem->apply_outflow_jacobian.qfunction_context); + CeedQFunctionContextReferenceCopy(newtonian_ig_context, &problem->apply_vol_ifunction.qfunction_context); + CeedQFunctionContextReferenceCopy(newtonian_ig_context, &problem->apply_vol_ijacobian.qfunction_context); + CeedQFunctionContextReferenceCopy(newtonian_ig_context, &problem->apply_inflow.qfunction_context); + CeedQFunctionContextReferenceCopy(newtonian_ig_context, &problem->apply_inflow_jacobian.qfunction_context); + CeedQFunctionContextReferenceCopy(newtonian_ig_context, &problem->apply_outflow.qfunction_context); + CeedQFunctionContextReferenceCopy(newtonian_ig_context, &problem->apply_outflow_jacobian.qfunction_context); if (unit_tests) { PetscCall(UnitTests_Newtonian(user, newtonian_ig_ctx)); @@ -349,21 +293,16 @@ PetscErrorCode NS_NEWTONIAN_IG(ProblemData *problem, DM dm, void *ctx) { } PetscErrorCode PRINT_NEWTONIAN(ProblemData *problem, AppCtx app_ctx) { - - MPI_Comm comm = PETSC_COMM_WORLD; - PetscErrorCode ierr; + MPI_Comm comm = PETSC_COMM_WORLD; NewtonianIdealGasContext newtonian_ctx; PetscFunctionBeginUser; - CeedQFunctionContextGetData(problem->apply_vol_rhs.qfunction_context, - CEED_MEM_HOST, &newtonian_ctx); - ierr = PetscPrintf(comm, - " Problem:\n" - " Problem Name : %s\n" - " Stabilization : %s\n", - app_ctx->problem_name, StabilizationTypes[newtonian_ctx->stabilization]); - CHKERRQ(ierr); - CeedQFunctionContextRestoreData(problem->apply_vol_rhs.qfunction_context, - &newtonian_ctx); + CeedQFunctionContextGetData(problem->apply_vol_rhs.qfunction_context, CEED_MEM_HOST, &newtonian_ctx); + PetscCall(PetscPrintf(comm, + " Problem:\n" + " Problem Name : %s\n" + " Stabilization : %s\n", + app_ctx->problem_name, StabilizationTypes[newtonian_ctx->stabilization])); + 
CeedQFunctionContextRestoreData(problem->apply_vol_rhs.qfunction_context, &newtonian_ctx); PetscFunctionReturn(0); } diff --git a/examples/fluids/problems/newtonianwave.c b/examples/fluids/problems/newtonianwave.c index e09f584014..8431ec8fa8 100644 --- a/examples/fluids/problems/newtonianwave.c +++ b/examples/fluids/problems/newtonianwave.c @@ -9,60 +9,53 @@ /// Utility functions for setting up Newtonian Wave problem #include "../navierstokes.h" -#include "../qfunctions/newtonwave.h" #include "../qfunctions/freestream_bc_type.h" +#include "../qfunctions/newtonwave.h" PetscErrorCode NS_NEWTONIAN_WAVE(ProblemData *problem, DM dm, void *ctx) { - User user = *(User *)ctx; - MPI_Comm comm = PETSC_COMM_WORLD; - NewtonWaveContext newtwave_ctx; - FreestreamContext freestream_ctx; + User user = *(User *)ctx; + MPI_Comm comm = PETSC_COMM_WORLD; + NewtonWaveContext newtwave_ctx; + FreestreamContext freestream_ctx; NewtonianIdealGasContext newtonian_ig_ctx; - CeedQFunctionContext newtwave_context; + CeedQFunctionContext newtwave_context; PetscFunctionBeginUser; PetscCall(NS_NEWTONIAN_IG(problem, dm, ctx)); // *INDENT-OFF* switch (user->phys->state_var) { - case STATEVAR_CONSERVATIVE: - problem->ics.qfunction = IC_NewtonianWave_Conserv; - problem->ics.qfunction_loc = IC_NewtonianWave_Conserv_loc; - case STATEVAR_PRIMITIVE: - problem->ics.qfunction = IC_NewtonianWave_Prim; - problem->ics.qfunction_loc = IC_NewtonianWave_Prim_loc; + case STATEVAR_CONSERVATIVE: + problem->ics.qfunction = IC_NewtonianWave_Conserv; + problem->ics.qfunction_loc = IC_NewtonianWave_Conserv_loc; + case STATEVAR_PRIMITIVE: + problem->ics.qfunction = IC_NewtonianWave_Prim; + problem->ics.qfunction_loc = IC_NewtonianWave_Prim_loc; } // *INDENT-ON* // -- Option Defaults - CeedScalar epicenter[3] = {0.}; // m - CeedScalar width = 0.002; // m - CeedScalar amplitude = 0.1; // - + CeedScalar epicenter[3] = {0.}; // m + CeedScalar width = 0.002; // m + CeedScalar amplitude = 0.1; // - PetscOptionsBegin(comm, 
NULL, "Options for NEWTONIAN_WAVE problem", NULL); - PetscInt narray=3; - PetscCall(PetscOptionsScalarArray("-epicenter", "Coordinates of center of wave", - NULL, epicenter, &narray, NULL)); - PetscCheck(narray == 3, comm, PETSC_ERR_ARG_SIZ, - "-epicenter should recieve array of size 3, instead recieved size %" - PetscInt_FMT".", narray); - PetscCall(PetscOptionsScalar("-width", "Width parameter for perturbation size", - NULL, width, &width, NULL)); - PetscCall(PetscOptionsScalar("-amplitude", "Amplitude of the perturbation", - NULL, amplitude, &amplitude, NULL)); + PetscInt narray = 3; + PetscCall(PetscOptionsScalarArray("-epicenter", "Coordinates of center of wave", NULL, epicenter, &narray, NULL)); + PetscCheck(narray == 3, comm, PETSC_ERR_ARG_SIZ, "-epicenter should recieve array of size 3, instead recieved size %" PetscInt_FMT ".", narray); + PetscCall(PetscOptionsScalar("-width", "Width parameter for perturbation size", NULL, width, &width, NULL)); + PetscCall(PetscOptionsScalar("-amplitude", "Amplitude of the perturbation", NULL, amplitude, &amplitude, NULL)); PetscOptionsEnd(); width *= user->units->meter; - for (int i=0; i<3; i++) epicenter[i] *= user->units->meter; + for (int i = 0; i < 3; i++) epicenter[i] *= user->units->meter; PetscCall(FreestreamBCSetup(problem, dm, ctx)); // -- Set newtwave_ctx struct values PetscCall(PetscCalloc1(1, &newtwave_ctx)); - CeedQFunctionContextGetData(problem->apply_vol_rhs.qfunction_context, - CEED_MEM_HOST, &newtonian_ig_ctx); - CeedQFunctionContextGetData(problem->apply_freestream.qfunction_context, - CEED_MEM_HOST, &freestream_ctx); + CeedQFunctionContextGetData(problem->apply_vol_rhs.qfunction_context, CEED_MEM_HOST, &newtonian_ig_ctx); + CeedQFunctionContextGetData(problem->apply_freestream.qfunction_context, CEED_MEM_HOST, &freestream_ctx); newtwave_ctx->amplitude = amplitude; newtwave_ctx->width = width; @@ -70,16 +63,12 @@ PetscErrorCode NS_NEWTONIAN_WAVE(ProblemData *problem, DM dm, void *ctx) { newtwave_ctx->newt_ctx 
= *newtonian_ig_ctx; PetscCall(PetscArraycpy(newtwave_ctx->epicenter, epicenter, 3)); - CeedQFunctionContextRestoreData(problem->apply_vol_rhs.qfunction_context, - &newtonian_ig_ctx); - CeedQFunctionContextRestoreData(problem->apply_freestream.qfunction_context, - &freestream_ctx); + CeedQFunctionContextRestoreData(problem->apply_vol_rhs.qfunction_context, &newtonian_ig_ctx); + CeedQFunctionContextRestoreData(problem->apply_freestream.qfunction_context, &freestream_ctx); CeedQFunctionContextCreate(user->ceed, &newtwave_context); - CeedQFunctionContextSetData(newtwave_context, CEED_MEM_HOST, CEED_USE_POINTER, - sizeof(*newtwave_ctx), newtwave_ctx); - CeedQFunctionContextSetDataDestroy(newtwave_context, CEED_MEM_HOST, - FreeContextPetsc); + CeedQFunctionContextSetData(newtwave_context, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(*newtwave_ctx), newtwave_ctx); + CeedQFunctionContextSetDataDestroy(newtwave_context, CEED_MEM_HOST, FreeContextPetsc); CeedQFunctionContextDestroy(&problem->ics.qfunction_context); problem->ics.qfunction_context = newtwave_context; diff --git a/examples/fluids/problems/shocktube.c b/examples/fluids/problems/shocktube.c index 1bb42e9e16..d420c74a15 100644 --- a/examples/fluids/problems/shocktube.c +++ b/examples/fluids/problems/shocktube.c @@ -17,26 +17,25 @@ /// @file /// Utility functions for setting up SHOCKTUBE +#include "../qfunctions/shocktube.h" + #include "../navierstokes.h" #include "../qfunctions/setupgeo.h" -#include "../qfunctions/shocktube.h" PetscErrorCode NS_SHOCKTUBE(ProblemData *problem, DM dm, void *ctx) { - - SetupContextShock setup_context; - User user = *(User *)ctx; - MPI_Comm comm = PETSC_COMM_WORLD; - PetscBool implicit; - PetscBool yzb; - PetscInt stab; - PetscBool has_curr_time = PETSC_FALSE; - PetscInt ierr; - ShockTubeContext shocktube_ctx; + SetupContextShock setup_context; + User user = *(User *)ctx; + MPI_Comm comm = PETSC_COMM_WORLD; + PetscBool implicit; + PetscBool yzb; + PetscInt stab; + PetscBool has_curr_time 
= PETSC_FALSE; + ShockTubeContext shocktube_ctx; CeedQFunctionContext shocktube_context; PetscFunctionBeginUser; - ierr = PetscCalloc1(1, &setup_context); CHKERRQ(ierr); - ierr = PetscCalloc1(1, &shocktube_ctx); CHKERRQ(ierr); + PetscCall(PetscCalloc1(1, &setup_context)); + PetscCall(PetscCalloc1(1, &shocktube_ctx)); // ------------------------------------------------------ // SET UP SHOCKTUBE @@ -63,26 +62,26 @@ PetscErrorCode NS_SHOCKTUBE(ProblemData *problem, DM dm, void *ctx) { // Create the libCEED context // ------------------------------------------------------ // Driver section initial conditions - CeedScalar P_high = 1.0; // Pa - CeedScalar rho_high = 1.0; // kg/m^3 + CeedScalar P_high = 1.0; // Pa + CeedScalar rho_high = 1.0; // kg/m^3 // Driven section initial conditions - CeedScalar P_low = 0.1; // Pa - CeedScalar rho_low = 0.125; // kg/m^3 + CeedScalar P_low = 0.1; // Pa + CeedScalar rho_low = 0.125; // kg/m^3 // Stabilization parameter - CeedScalar c_tau = 0.5; // -, based on Hughes et al (2010) + CeedScalar c_tau = 0.5; // -, based on Hughes et al (2010) // Tuning parameters for the YZB shock capturing - CeedScalar Cyzb = 0.1; // -, used in approximation of (Na),x - CeedScalar Byzb = 2.0; // -, 1 for smooth shocks + CeedScalar Cyzb = 0.1; // -, used in approximation of (Na),x + CeedScalar Byzb = 2.0; // -, 1 for smooth shocks // 2 for sharp shocks PetscReal domain_min[3], domain_max[3], domain_size[3]; - ierr = DMGetBoundingBox(dm, domain_min, domain_max); CHKERRQ(ierr); - for (PetscInt i=0; i<3; i++) domain_size[i] = domain_max[i] - domain_min[i]; + PetscCall(DMGetBoundingBox(dm, domain_min, domain_max)); + for (PetscInt i = 0; i < 3; i++) domain_size[i] = domain_max[i] - domain_min[i]; // ------------------------------------------------------ // Create the PETSc context // ------------------------------------------------------ - PetscScalar meter = 1e-2; // 1 meter in scaled length units - PetscScalar second = 1e-2; // 1 second in scaled time units 
+ PetscScalar meter = 1e-2; // 1 meter in scaled length units + PetscScalar second = 1e-2; // 1 second in scaled time units // ------------------------------------------------------ // Command line Options @@ -90,37 +89,25 @@ PetscErrorCode NS_SHOCKTUBE(ProblemData *problem, DM dm, void *ctx) { PetscOptionsBegin(comm, NULL, "Options for SHOCKTUBE problem", NULL); // -- Numerical formulation options - ierr = PetscOptionsBool("-implicit", "Use implicit (IFunction) formulation", - NULL, implicit=PETSC_FALSE, &implicit, NULL); CHKERRQ(ierr); - ierr = PetscOptionsEnum("-stab", "Stabilization method", NULL, - StabilizationTypes, (PetscEnum)(stab = STAB_NONE), - (PetscEnum *)&stab, NULL); CHKERRQ(ierr); - ierr = PetscOptionsScalar("-c_tau", "Stabilization constant", - NULL, c_tau, &c_tau, NULL); CHKERRQ(ierr); - ierr = PetscOptionsBool("-yzb", "Use YZB discontinuity capturing", - NULL, yzb=PETSC_FALSE, &yzb, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsBool("-implicit", "Use implicit (IFunction) formulation", NULL, implicit = PETSC_FALSE, &implicit, NULL)); + PetscCall(PetscOptionsEnum("-stab", "Stabilization method", NULL, StabilizationTypes, (PetscEnum)(stab = STAB_NONE), (PetscEnum *)&stab, NULL)); + PetscCall(PetscOptionsScalar("-c_tau", "Stabilization constant", NULL, c_tau, &c_tau, NULL)); + PetscCall(PetscOptionsBool("-yzb", "Use YZB discontinuity capturing", NULL, yzb = PETSC_FALSE, &yzb, NULL)); // -- Units - ierr = PetscOptionsScalar("-units_meter", "1 meter in scaled length units", - NULL, meter, &meter, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsScalar("-units_meter", "1 meter in scaled length units", NULL, meter, &meter, NULL)); meter = fabs(meter); - ierr = PetscOptionsScalar("-units_second","1 second in scaled time units", - NULL, second, &second, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsScalar("-units_second", "1 second in scaled time units", NULL, second, &second, NULL)); second = fabs(second); // -- Warnings if (stab == STAB_SUPG) { - ierr = 
PetscPrintf(comm, - "Warning! -stab supg not implemented for the shocktube problem. \n"); - CHKERRQ(ierr); + PetscCall(PetscPrintf(comm, "Warning! -stab supg not implemented for the shocktube problem. \n")); } if (yzb && implicit) { - ierr = PetscPrintf(comm, - "Warning! -yzb only implemented for explicit timestepping. \n"); - CHKERRQ(ierr); + PetscCall(PetscPrintf(comm, "Warning! -yzb only implemented for explicit timestepping. \n")); } - PetscOptionsEnd(); // ------------------------------------------------------ @@ -133,12 +120,12 @@ PetscErrorCode NS_SHOCKTUBE(ProblemData *problem, DM dm, void *ctx) { // Set up the libCEED context // ------------------------------------------------------ // -- Scale variables to desired units - for (PetscInt i=0; i<3; i++) { + for (PetscInt i = 0; i < 3; i++) { domain_size[i] *= meter; domain_min[i] *= meter; } - problem->dm_scale = meter; - CeedScalar mid_point = 0.5*(domain_size[0]+domain_min[0]); + problem->dm_scale = meter; + CeedScalar mid_point = 0.5 * (domain_size[0] + domain_min[0]); // -- Setup Context setup_context->mid_point = mid_point; @@ -149,42 +136,34 @@ PetscErrorCode NS_SHOCKTUBE(ProblemData *problem, DM dm, void *ctx) { setup_context->rho_low = rho_low; // -- QFunction Context - user->phys->implicit = implicit; - user->phys->has_curr_time = has_curr_time; - shocktube_ctx->implicit = implicit; - shocktube_ctx->stabilization = stab; - shocktube_ctx->yzb = yzb; - shocktube_ctx->Cyzb = Cyzb; - shocktube_ctx->Byzb = Byzb; - shocktube_ctx->c_tau = c_tau; + user->phys->implicit = implicit; + user->phys->has_curr_time = has_curr_time; + shocktube_ctx->implicit = implicit; + shocktube_ctx->stabilization = stab; + shocktube_ctx->yzb = yzb; + shocktube_ctx->Cyzb = Cyzb; + shocktube_ctx->Byzb = Byzb; + shocktube_ctx->c_tau = c_tau; CeedQFunctionContextCreate(user->ceed, &problem->ics.qfunction_context); - CeedQFunctionContextSetData(problem->ics.qfunction_context, CEED_MEM_HOST, - CEED_USE_POINTER, 
sizeof(*setup_context), setup_context); - CeedQFunctionContextSetDataDestroy(problem->ics.qfunction_context, - CEED_MEM_HOST, - FreeContextPetsc); + CeedQFunctionContextSetData(problem->ics.qfunction_context, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(*setup_context), setup_context); + CeedQFunctionContextSetDataDestroy(problem->ics.qfunction_context, CEED_MEM_HOST, FreeContextPetsc); CeedQFunctionContextCreate(user->ceed, &shocktube_context); - CeedQFunctionContextSetData(shocktube_context, CEED_MEM_HOST, - CEED_USE_POINTER, - sizeof(*shocktube_ctx), shocktube_ctx); - CeedQFunctionContextSetDataDestroy(shocktube_context, CEED_MEM_HOST, - FreeContextPetsc); + CeedQFunctionContextSetData(shocktube_context, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(*shocktube_ctx), shocktube_ctx); + CeedQFunctionContextSetDataDestroy(shocktube_context, CEED_MEM_HOST, FreeContextPetsc); problem->apply_vol_rhs.qfunction_context = shocktube_context; PetscFunctionReturn(0); } PetscErrorCode PRINT_SHOCKTUBE(ProblemData *problem, AppCtx app_ctx) { - - MPI_Comm comm = PETSC_COMM_WORLD; - PetscErrorCode ierr; + MPI_Comm comm = PETSC_COMM_WORLD; PetscFunctionBeginUser; - ierr = PetscPrintf(comm, - " Problem:\n" - " Problem Name : %s\n", - app_ctx->problem_name); CHKERRQ(ierr); + PetscCall(PetscPrintf(comm, + " Problem:\n" + " Problem Name : %s\n", + app_ctx->problem_name)); PetscFunctionReturn(0); } diff --git a/examples/fluids/problems/stg_shur14.c b/examples/fluids/problems/stg_shur14.c index 4dcbf6c966..3812b1bec6 100644 --- a/examples/fluids/problems/stg_shur14.c +++ b/examples/fluids/problems/stg_shur14.c @@ -9,11 +9,13 @@ /// Implementation of the Synthetic Turbulence Generation (STG) algorithm /// presented in Shur et al. 
2014 -#include <stdlib.h> +#include "stg_shur14.h" + #include <math.h> #include <petsc.h> +#include <stdlib.h> + #include "../navierstokes.h" -#include "stg_shur14.h" #include "../qfunctions/stg_shur14.h" STGShur14Context global_stg_ctx; @@ -29,25 +31,24 @@ STGShur14Context global_stg_ctx; * @param[in] Rij Array of the symmetric matrices [6,nprofs] * @param[out] Cij Array of the Cholesky Decomposition matrices, [6,nprofs] */ -PetscErrorCode CalcCholeskyDecomp(MPI_Comm comm, PetscInt nprofs, - const CeedScalar Rij[6][nprofs], CeedScalar Cij[6][nprofs]) { +PetscErrorCode CalcCholeskyDecomp(MPI_Comm comm, PetscInt nprofs, const CeedScalar Rij[6][nprofs], CeedScalar Cij[6][nprofs]) { PetscFunctionBeginUser; - for (PetscInt i=0; inprofs]; - CeedScalar *wall_dist = &stg_ctx->data[stg_ctx->offsets.wall_dist]; - CeedScalar *eps = &stg_ctx->data[stg_ctx->offsets.eps]; - CeedScalar *lt = &stg_ctx->data[stg_ctx->offsets.lt]; - CeedScalar (*ubar)[stg_ctx->nprofs] = (CeedScalar (*)[stg_ctx->nprofs]) - &stg_ctx->data[stg_ctx->offsets.ubar]; + CeedScalar rij[6][stg_ctx->nprofs]; + CeedScalar *wall_dist = &stg_ctx->data[stg_ctx->offsets.wall_dist]; + CeedScalar *eps = &stg_ctx->data[stg_ctx->offsets.eps]; + CeedScalar *lt = &stg_ctx->data[stg_ctx->offsets.lt]; + CeedScalar(*ubar)[stg_ctx->nprofs] = (CeedScalar(*)[stg_ctx->nprofs]) & stg_ctx->data[stg_ctx->offsets.ubar]; - for (PetscInt i=0; i<stg_ctx->nprofs; i++) { + for (PetscInt i = 0; i < stg_ctx->nprofs; i++) { PetscCall(PetscSynchronizedFGets(comm, fp, char_array_len, line)); PetscCall(PetscStrToArray(line, ' ', &ndims, &array)); - if (ndims < dims[1]) SETERRQ(comm, -1, - "Line %" PetscInt_FMT" of %s does not contain enough columns (%" - PetscInt_FMT" instead of %" PetscInt_FMT")", i, - path, ndims, dims[1]); - - wall_dist[i] = (CeedScalar) atof(array[0]); - ubar[0][i] = (CeedScalar) atof(array[1]); - ubar[1][i] = (CeedScalar) atof(array[2]); - ubar[2][i] = (CeedScalar) atof(array[3]); - rij[0][i] = (CeedScalar) atof(array[4]); - rij[1][i] = (CeedScalar) atof(array[5]); - 
rij[2][i] = (CeedScalar) atof(array[6]); - rij[3][i] = (CeedScalar) atof(array[7]); - rij[4][i] = (CeedScalar) atof(array[8]); - rij[5][i] = (CeedScalar) atof(array[9]); - lt[i] = (CeedScalar) atof(array[12]); - eps[i] = (CeedScalar) atof(array[13]); - - if (wall_dist[i] < 0) SETERRQ(comm, -1, - "Distance to wall in %s cannot be negative", path); - if (lt[i] < 0) SETERRQ(comm, -1, - "Turbulent length scale in %s cannot be negative", path); - if (eps[i] < 0) SETERRQ(comm, -1, - "Turbulent dissipation in %s cannot be negative", path); + if (ndims < dims[1]) { + SETERRQ(comm, -1, "Line %" PetscInt_FMT " of %s does not contain enough columns (%" PetscInt_FMT " instead of %" PetscInt_FMT ")", i, path, + ndims, dims[1]); + } + wall_dist[i] = (CeedScalar)atof(array[0]); + ubar[0][i] = (CeedScalar)atof(array[1]); + ubar[1][i] = (CeedScalar)atof(array[2]); + ubar[2][i] = (CeedScalar)atof(array[3]); + rij[0][i] = (CeedScalar)atof(array[4]); + rij[1][i] = (CeedScalar)atof(array[5]); + rij[2][i] = (CeedScalar)atof(array[6]); + rij[3][i] = (CeedScalar)atof(array[7]); + rij[4][i] = (CeedScalar)atof(array[8]); + rij[5][i] = (CeedScalar)atof(array[9]); + lt[i] = (CeedScalar)atof(array[12]); + eps[i] = (CeedScalar)atof(array[13]); + + if (wall_dist[i] < 0) SETERRQ(comm, -1, "Distance to wall in %s cannot be negative", path); + if (lt[i] < 0) SETERRQ(comm, -1, "Turbulent length scale in %s cannot be negative", path); + if (eps[i] < 0) SETERRQ(comm, -1, "Turbulent dissipation in %s cannot be negative", path); } - CeedScalar (*cij)[stg_ctx->nprofs] = (CeedScalar (*)[stg_ctx->nprofs]) - &stg_ctx->data[stg_ctx->offsets.cij]; + CeedScalar(*cij)[stg_ctx->nprofs] = (CeedScalar(*)[stg_ctx->nprofs]) & stg_ctx->data[stg_ctx->offsets.cij]; PetscCall(CalcCholeskyDecomp(comm, stg_ctx->nprofs, rij, cij)); PetscCall(PetscFClose(comm, fp)); + PetscFunctionReturn(0); } @@ -188,41 +183,38 @@ static PetscErrorCode ReadSTGInflow(const MPI_Comm comm, * @param[in] path Path to the STGRand.dat file * 
@param[inout] stg_ctx STGShur14Context where the data will be loaded into */ -static PetscErrorCode ReadSTGRand(const MPI_Comm comm, - const char path[PETSC_MAX_PATH_LEN], - STGShur14Context stg_ctx) { - PetscInt ndims, dims[2]; - FILE *fp; +static PetscErrorCode ReadSTGRand(const MPI_Comm comm, const char path[PETSC_MAX_PATH_LEN], STGShur14Context stg_ctx) { + PetscInt ndims, dims[2]; + FILE *fp; const PetscInt char_array_len = 512; - char line[char_array_len]; - char **array; + char line[char_array_len]; + char **array; PetscFunctionBeginUser; PetscCall(OpenPHASTADatFile(comm, path, char_array_len, dims, &fp)); - CeedScalar *phi = &stg_ctx->data[stg_ctx->offsets.phi]; - CeedScalar (*d)[stg_ctx->nmodes] = (CeedScalar (*)[stg_ctx->nmodes]) - &stg_ctx->data[stg_ctx->offsets.d]; - CeedScalar (*sigma)[stg_ctx->nmodes] = (CeedScalar (*)[stg_ctx->nmodes]) - &stg_ctx->data[stg_ctx->offsets.sigma]; + CeedScalar *phi = &stg_ctx->data[stg_ctx->offsets.phi]; + CeedScalar(*d)[stg_ctx->nmodes] = (CeedScalar(*)[stg_ctx->nmodes]) & stg_ctx->data[stg_ctx->offsets.d]; + CeedScalar(*sigma)[stg_ctx->nmodes] = (CeedScalar(*)[stg_ctx->nmodes]) & stg_ctx->data[stg_ctx->offsets.sigma]; - for (PetscInt i=0; i<stg_ctx->nmodes; i++) { + for (PetscInt i = 0; i < stg_ctx->nmodes; i++) { PetscCall(PetscSynchronizedFGets(comm, fp, char_array_len, line)); PetscCall(PetscStrToArray(line, ' ', &ndims, &array)); - if (ndims < dims[1]) SETERRQ(comm, -1, - "Line %" PetscInt_FMT" of %s does not contain enough columns (%" - PetscInt_FMT" instead of %" PetscInt_FMT")", i, - path, ndims, dims[1]); - - d[0][i] = (CeedScalar) atof(array[0]); - d[1][i] = (CeedScalar) atof(array[1]); - d[2][i] = (CeedScalar) atof(array[2]); - phi[i] = (CeedScalar) atof(array[3]); - sigma[0][i] = (CeedScalar) atof(array[4]); - sigma[1][i] = (CeedScalar) atof(array[5]); - sigma[2][i] = (CeedScalar) atof(array[6]); + if (ndims < dims[1]) { + SETERRQ(comm, -1, "Line %" PetscInt_FMT " of %s does not contain enough columns (%" PetscInt_FMT 
" instead of %" PetscInt_FMT ")", i, path, + ndims, dims[1]); + } + + d[0][i] = (CeedScalar)atof(array[0]); + d[1][i] = (CeedScalar)atof(array[1]); + d[2][i] = (CeedScalar)atof(array[2]); + phi[i] = (CeedScalar)atof(array[3]); + sigma[0][i] = (CeedScalar)atof(array[4]); + sigma[1][i] = (CeedScalar)atof(array[5]); + sigma[2][i] = (CeedScalar)atof(array[6]); } PetscCall(PetscFClose(comm, fp)); + PetscFunctionReturn(0); } @@ -239,12 +231,9 @@ static PetscErrorCode ReadSTGRand(const MPI_Comm comm, * @param[in] stg_rand_path Path to STGRand.dat file * @param[inout] pstg_ctx Pointer to STGShur14Context where the data will be loaded into */ -PetscErrorCode GetSTGContextData(const MPI_Comm comm, const DM dm, - char stg_inflow_path[PETSC_MAX_PATH_LEN], - char stg_rand_path[PETSC_MAX_PATH_LEN], - STGShur14Context *pstg_ctx, - const CeedScalar ynodes[]) { - PetscInt nmodes, nprofs; +PetscErrorCode GetSTGContextData(const MPI_Comm comm, const DM dm, char stg_inflow_path[PETSC_MAX_PATH_LEN], char stg_rand_path[PETSC_MAX_PATH_LEN], + STGShur14Context *pstg_ctx, const CeedScalar ynodes[]) { + PetscInt nmodes, nprofs; STGShur14Context stg_ctx; PetscFunctionBeginUser; @@ -252,29 +241,30 @@ PetscErrorCode GetSTGContextData(const MPI_Comm comm, const DM dm, PetscCall(GetNRows(comm, stg_rand_path, &nmodes)); PetscCall(GetNRows(comm, stg_inflow_path, &nprofs)); if (nmodes > STG_NMODES_MAX) - SETERRQ(comm, 1, "Number of wavemodes in %s (%" - PetscInt_FMT") exceeds STG_NMODES_MAX (%" PetscInt_FMT"). " - "Change size of STG_NMODES_MAX and recompile", stg_rand_path, nmodes, - STG_NMODES_MAX); + SETERRQ(comm, 1, + "Number of wavemodes in %s (%" PetscInt_FMT ") exceeds STG_NMODES_MAX (%" PetscInt_FMT + "). 
" + "Change size of STG_NMODES_MAX and recompile", + stg_rand_path, nmodes, STG_NMODES_MAX); { STGShur14Context s; PetscCall(PetscCalloc1(1, &s)); - *s = **pstg_ctx; - s->nmodes = nmodes; - s->nprofs = nprofs; - s->offsets.sigma = 0; - s->offsets.d = nmodes*3; - s->offsets.phi = s->offsets.d + nmodes*3; - s->offsets.kappa = s->offsets.phi + nmodes; - s->offsets.wall_dist = s->offsets.kappa + nmodes; - s->offsets.ubar = s->offsets.wall_dist + nprofs; - s->offsets.cij = s->offsets.ubar + nprofs*3; - s->offsets.eps = s->offsets.cij + nprofs*6; - s->offsets.lt = s->offsets.eps + nprofs; - s->offsets.ynodes = s->offsets.lt + nprofs; + *s = **pstg_ctx; + s->nmodes = nmodes; + s->nprofs = nprofs; + s->offsets.sigma = 0; + s->offsets.d = nmodes * 3; + s->offsets.phi = s->offsets.d + nmodes * 3; + s->offsets.kappa = s->offsets.phi + nmodes; + s->offsets.wall_dist = s->offsets.kappa + nmodes; + s->offsets.ubar = s->offsets.wall_dist + nprofs; + s->offsets.cij = s->offsets.ubar + nprofs * 3; + s->offsets.eps = s->offsets.cij + nprofs * 6; + s->offsets.lt = s->offsets.eps + nprofs; + s->offsets.ynodes = s->offsets.lt + nprofs; PetscInt total_num_scalars = s->offsets.ynodes + s->nynodes; - s->total_bytes = sizeof(*stg_ctx) + total_num_scalars*sizeof(stg_ctx->data[0]); + s->total_bytes = sizeof(*stg_ctx) + total_num_scalars * sizeof(stg_ctx->data[0]); PetscCall(PetscMalloc(s->total_bytes, &stg_ctx)); *stg_ctx = *s; PetscCall(PetscFree(s)); @@ -285,68 +275,50 @@ PetscErrorCode GetSTGContextData(const MPI_Comm comm, const DM dm, if (stg_ctx->nynodes > 0) { CeedScalar *ynodes_ctx = &stg_ctx->data[stg_ctx->offsets.ynodes]; - for (PetscInt i=0; i<stg_ctx->nynodes; i++) ynodes_ctx[i] = ynodes[i]; + for (PetscInt i = 0; i < stg_ctx->nynodes; i++) ynodes_ctx[i] = ynodes[i]; } // -- Calculate kappa { - CeedScalar *kappa = &stg_ctx->data[stg_ctx->offsets.kappa]; + CeedScalar *kappa = &stg_ctx->data[stg_ctx->offsets.kappa]; CeedScalar *wall_dist = &stg_ctx->data[stg_ctx->offsets.wall_dist]; - 
CeedScalar *lt = &stg_ctx->data[stg_ctx->offsets.lt]; - CeedScalar le, le_max=0; + CeedScalar *lt = &stg_ctx->data[stg_ctx->offsets.lt]; + CeedScalar le, le_max = 0; - CeedPragmaSIMD - for (PetscInt i=0; i<stg_ctx->nprofs; i++) { - le = PetscMin(2*wall_dist[i], 3*lt[i]); + CeedPragmaSIMD for (PetscInt i = 0; i < stg_ctx->nprofs; i++) { + le = PetscMin(2 * wall_dist[i], 3 * lt[i]); if (le_max < le) le_max = le; } - CeedScalar kmin = M_PI/le_max; + CeedScalar kmin = M_PI / le_max; - CeedPragmaSIMD - for (PetscInt i=0; i<stg_ctx->nmodes; i++) { - kappa[i] = kmin*pow(stg_ctx->alpha, i); - } - } //end calculate kappa + CeedPragmaSIMD for (PetscInt i = 0; i < stg_ctx->nmodes; i++) { kappa[i] = kmin * pow(stg_ctx->alpha, i); } + } // end calculate kappa PetscCall(PetscFree(*pstg_ctx)); *pstg_ctx = stg_ctx; PetscFunctionReturn(0); } -PetscErrorCode SetupSTG(const MPI_Comm comm, const DM dm, ProblemData *problem, - User user, const bool prescribe_T, - const CeedScalar theta0, const CeedScalar P0, - const CeedScalar ynodes[], const CeedInt nynodes) { - char stg_inflow_path[PETSC_MAX_PATH_LEN] = "./STGInflow.dat"; - char stg_rand_path[PETSC_MAX_PATH_LEN] = "./STGRand.dat"; - PetscBool mean_only = PETSC_FALSE, - use_stgstrong = PETSC_FALSE, - use_fluctuating_IC = PETSC_FALSE; - CeedScalar u0 = 0.0, - alpha = 1.01; - CeedQFunctionContext stg_context; +PetscErrorCode SetupSTG(const MPI_Comm comm, const DM dm, ProblemData *problem, User user, const bool prescribe_T, const CeedScalar theta0, + const CeedScalar P0, const CeedScalar ynodes[], const CeedInt nynodes) { + char stg_inflow_path[PETSC_MAX_PATH_LEN] = "./STGInflow.dat"; + char stg_rand_path[PETSC_MAX_PATH_LEN] = "./STGRand.dat"; + PetscBool mean_only = PETSC_FALSE, use_stgstrong = PETSC_FALSE, use_fluctuating_IC = PETSC_FALSE; + CeedScalar u0 = 0.0, alpha = 1.01; + CeedQFunctionContext stg_context; NewtonianIdealGasContext newtonian_ig_ctx; PetscFunctionBeginUser; // Get options PetscOptionsBegin(comm, NULL, "STG Boundary Condition Options", 
NULL); - PetscCall(PetscOptionsString("-stg_inflow_path", "Path to STGInflow.dat", NULL, - stg_inflow_path, stg_inflow_path, - sizeof(stg_inflow_path), NULL)); - PetscCall(PetscOptionsString("-stg_rand_path", "Path to STGInflow.dat", NULL, - stg_rand_path,stg_rand_path, - sizeof(stg_rand_path), NULL)); - PetscCall(PetscOptionsReal("-stg_alpha", "Growth rate of the wavemodes", NULL, - alpha, &alpha, NULL)); - PetscCall(PetscOptionsReal("-stg_u0", "Advective velocity for the fluctuations", - NULL, u0, &u0, NULL)); - PetscCall(PetscOptionsBool("-stg_mean_only", "Only apply mean profile", - NULL, mean_only, &mean_only, NULL)); - PetscCall(PetscOptionsBool("-stg_strong", "Enforce STG inflow strongly", - NULL, use_stgstrong, &use_stgstrong, NULL)); - PetscCall(PetscOptionsBool("-stg_fluctuating_IC", - "\"Extrude\" the fluctuations through the domain as an initial condition", - NULL, use_fluctuating_IC, &use_fluctuating_IC, NULL)); + PetscCall(PetscOptionsString("-stg_inflow_path", "Path to STGInflow.dat", NULL, stg_inflow_path, stg_inflow_path, sizeof(stg_inflow_path), NULL)); + PetscCall(PetscOptionsString("-stg_rand_path", "Path to STGInflow.dat", NULL, stg_rand_path, stg_rand_path, sizeof(stg_rand_path), NULL)); + PetscCall(PetscOptionsReal("-stg_alpha", "Growth rate of the wavemodes", NULL, alpha, &alpha, NULL)); + PetscCall(PetscOptionsReal("-stg_u0", "Advective velocity for the fluctuations", NULL, u0, &u0, NULL)); + PetscCall(PetscOptionsBool("-stg_mean_only", "Only apply mean profile", NULL, mean_only, &mean_only, NULL)); + PetscCall(PetscOptionsBool("-stg_strong", "Enforce STG inflow strongly", NULL, use_stgstrong, &use_stgstrong, NULL)); + PetscCall(PetscOptionsBool("-stg_fluctuating_IC", "\"Extrude\" the fluctuations through the domain as an initial condition", NULL, + use_fluctuating_IC, &use_fluctuating_IC, NULL)); PetscOptionsEnd(); PetscCall(PetscCalloc1(1, &global_stg_ctx)); @@ -364,32 +336,24 @@ PetscErrorCode SetupSTG(const MPI_Comm comm, const DM dm, 
ProblemData *problem, // Calculate dx assuming constant spacing PetscReal domain_min[3], domain_max[3], domain_size[3]; PetscCall(DMGetBoundingBox(dm, domain_min, domain_max)); - for (PetscInt i=0; i<3; i++) domain_size[i] = domain_max[i] - domain_min[i]; + for (PetscInt i = 0; i < 3; i++) domain_size[i] = domain_max[i] - domain_min[i]; PetscInt nmax = 3, faces[3]; - PetscCall(PetscOptionsGetIntArray(NULL, NULL, "-dm_plex_box_faces", faces, - &nmax, NULL)); - global_stg_ctx->dx = domain_size[0]/faces[0]; - global_stg_ctx->dz = domain_size[2]/faces[2]; + PetscCall(PetscOptionsGetIntArray(NULL, NULL, "-dm_plex_box_faces", faces, &nmax, NULL)); + global_stg_ctx->dx = domain_size[0] / faces[0]; + global_stg_ctx->dz = domain_size[2] / faces[2]; } - CeedQFunctionContextGetData(problem->apply_vol_rhs.qfunction_context, - CEED_MEM_HOST, &newtonian_ig_ctx); + CeedQFunctionContextGetData(problem->apply_vol_rhs.qfunction_context, CEED_MEM_HOST, &newtonian_ig_ctx); global_stg_ctx->newtonian_ctx = *newtonian_ig_ctx; - CeedQFunctionContextRestoreData(problem->apply_vol_rhs.qfunction_context, - &newtonian_ig_ctx); + CeedQFunctionContextRestoreData(problem->apply_vol_rhs.qfunction_context, &newtonian_ig_ctx); - PetscCall(GetSTGContextData(comm, dm, stg_inflow_path, stg_rand_path, - &global_stg_ctx, ynodes)); + PetscCall(GetSTGContextData(comm, dm, stg_inflow_path, stg_rand_path, &global_stg_ctx, ynodes)); CeedQFunctionContextCreate(user->ceed, &stg_context); - CeedQFunctionContextSetData(stg_context, CEED_MEM_HOST, - CEED_USE_POINTER, global_stg_ctx->total_bytes, global_stg_ctx); - CeedQFunctionContextSetDataDestroy(stg_context, CEED_MEM_HOST, - FreeContextPetsc); - CeedQFunctionContextRegisterDouble(stg_context, "solution time", - offsetof(struct STGShur14Context_, time), 1, - "Physical time of the solution"); + CeedQFunctionContextSetData(stg_context, CEED_MEM_HOST, CEED_USE_POINTER, global_stg_ctx->total_bytes, global_stg_ctx); + CeedQFunctionContextSetDataDestroy(stg_context, 
CEED_MEM_HOST, FreeContextPetsc); + CeedQFunctionContextRegisterDouble(stg_context, "solution time", offsetof(struct STGShur14Context_, time), 1, "Physical time of the solution"); CeedQFunctionContextDestroy(&problem->ics.qfunction_context); problem->ics.qfunction = ICsSTG; @@ -405,59 +369,57 @@ PetscErrorCode SetupSTG(const MPI_Comm comm, const DM dm, ProblemData *problem, problem->apply_inflow.qfunction_loc = STGShur14_Inflow_loc; problem->apply_inflow_jacobian.qfunction = STGShur14_Inflow_Jacobian; problem->apply_inflow_jacobian.qfunction_loc = STGShur14_Inflow_Jacobian_loc; - CeedQFunctionContextReferenceCopy(stg_context, - &problem->apply_inflow.qfunction_context); - CeedQFunctionContextReferenceCopy(stg_context, - &problem->apply_inflow_jacobian.qfunction_context); + CeedQFunctionContextReferenceCopy(stg_context, &problem->apply_inflow.qfunction_context); + CeedQFunctionContextReferenceCopy(stg_context, &problem->apply_inflow_jacobian.qfunction_context); problem->bc_from_ics = PETSC_TRUE; } PetscFunctionReturn(0); } -static inline PetscScalar FindDy(const PetscScalar ynodes[], - const PetscInt nynodes, const PetscScalar y) { +static inline PetscScalar FindDy(const PetscScalar ynodes[], const PetscInt nynodes, const PetscScalar y) { const PetscScalar half_mindy = 0.5 * (ynodes[1] - ynodes[0]); // ^^assuming min(dy) is first element off the wall - PetscInt idx = -1; // Index + PetscInt idx = -1; // Index - for (PetscInt i=0; inmodes], u[3], ubar[3], cij[6], eps, lt; - const bool mean_only = stg_ctx->mean_only; - const PetscScalar dx = stg_ctx->dx; - const PetscScalar dz = stg_ctx->dz; - const PetscScalar mu = stg_ctx->newtonian_ctx.mu; - const PetscScalar theta0 = stg_ctx->theta0; - const PetscScalar P0 = stg_ctx->P0; - const PetscScalar cv = stg_ctx->newtonian_ctx.cv; - const PetscScalar cp = stg_ctx->newtonian_ctx.cp; - const PetscScalar Rd = cp - cv; + + const STGShur14Context stg_ctx = (STGShur14Context)ctx; + PetscScalar qn[stg_ctx->nmodes], u[3], ubar[3], 
cij[6], eps, lt; + const bool mean_only = stg_ctx->mean_only; + const PetscScalar dx = stg_ctx->dx; + const PetscScalar dz = stg_ctx->dz; + const PetscScalar mu = stg_ctx->newtonian_ctx.mu; + const PetscScalar theta0 = stg_ctx->theta0; + const PetscScalar P0 = stg_ctx->P0; + const PetscScalar cv = stg_ctx->newtonian_ctx.cv; + const PetscScalar cp = stg_ctx->newtonian_ctx.cp; + const PetscScalar Rd = cp - cv; const CeedScalar rho = P0 / (Rd * theta0); InterpolateProfile(x[1], ubar, cij, &eps, <, stg_ctx); if (!mean_only) { - const PetscInt nynodes = stg_ctx->nynodes; - const PetscScalar *ynodes = &stg_ctx->data[stg_ctx->offsets.ynodes]; - const PetscScalar h[3] = {dx, FindDy(ynodes, nynodes, x[1]), dz}; - CalcSpectrum(x[1], eps, lt, h, mu/rho, qn, stg_ctx); + const PetscInt nynodes = stg_ctx->nynodes; + const PetscScalar *ynodes = &stg_ctx->data[stg_ctx->offsets.ynodes]; + const PetscScalar h[3] = {dx, FindDy(ynodes, nynodes, x[1]), dz}; + CalcSpectrum(x[1], eps, lt, h, mu / rho, qn, stg_ctx); STGShur14_Calc(x, time, ubar, cij, qn, u, stg_ctx); } else { - for (CeedInt j=0; j<3; j++) u[j] = ubar[j]; + for (CeedInt j = 0; j < 3; j++) u[j] = ubar[j]; } bcval[0] = rho; @@ -467,67 +429,56 @@ PetscErrorCode StrongSTGbcFunc(PetscInt dim, PetscReal time, PetscFunctionReturn(0); } -PetscErrorCode SetupStrongSTG(DM dm, SimpleBC bc, ProblemData *problem, - Physics phys) { +PetscErrorCode SetupStrongSTG(DM dm, SimpleBC bc, ProblemData *problem, Physics phys) { DMLabel label; PetscFunctionBeginUser; - PetscInt comps[5], num_comps=4; + PetscInt comps[5], num_comps = 4; switch (phys->state_var) { - case STATEVAR_CONSERVATIVE: - // {0,1,2,3} for rho, rho*u, rho*v, rho*w - for(int i=0; i<4; i++) comps[i] = i; - break; - - case STATEVAR_PRIMITIVE: - // {1,2,3,4} for u, v, w, T - for(int i=0; i<4; i++) comps[i] = i+1; - break; + case STATEVAR_CONSERVATIVE: + // {0,1,2,3} for rho, rho*u, rho*v, rho*w + for (int i = 0; i < 4; i++) comps[i] = i; + break; + + case STATEVAR_PRIMITIVE: + // 
{1,2,3,4} for u, v, w, T + for (int i = 0; i < 4; i++) comps[i] = i + 1; + break; } PetscCall(DMGetLabel(dm, "Face Sets", &label)); // Set wall BCs if (bc->num_inflow > 0) { - PetscCall(DMAddBoundary(dm, DM_BC_ESSENTIAL, "STG", label, - bc->num_inflow, bc->inflows, 0, num_comps, - comps, (void(*)(void))StrongSTGbcFunc, + PetscCall(DMAddBoundary(dm, DM_BC_ESSENTIAL, "STG", label, bc->num_inflow, bc->inflows, 0, num_comps, comps, (void (*)(void))StrongSTGbcFunc, NULL, global_stg_ctx, NULL)); } PetscFunctionReturn(0); } -PetscErrorCode SetupStrongSTG_QF(Ceed ceed, ProblemData *problem, - CeedInt num_comp_x, CeedInt num_comp_q, CeedInt stg_data_size, +PetscErrorCode SetupStrongSTG_QF(Ceed ceed, ProblemData *problem, CeedInt num_comp_x, CeedInt num_comp_q, CeedInt stg_data_size, CeedInt q_data_size_sur, CeedQFunction *pqf_strongbc) { - CeedQFunction qf_strongbc; PetscFunctionBeginUser; - CeedQFunctionCreateInterior(ceed, 1, STGShur14_Inflow_StrongQF, - STGShur14_Inflow_StrongQF_loc, &qf_strongbc); - CeedQFunctionAddInput(qf_strongbc, "surface qdata", q_data_size_sur, - CEED_EVAL_NONE); - CeedQFunctionAddInput(qf_strongbc, "x", num_comp_x, CEED_EVAL_NONE); - CeedQFunctionAddInput(qf_strongbc, "scale", 1, CEED_EVAL_NONE); - CeedQFunctionAddInput(qf_strongbc, "stg data", stg_data_size, CEED_EVAL_NONE); - CeedQFunctionAddOutput(qf_strongbc, "q", num_comp_q, CEED_EVAL_NONE); + CeedQFunctionCreateInterior(ceed, 1, STGShur14_Inflow_StrongQF, STGShur14_Inflow_StrongQF_loc, &qf_strongbc); + CeedQFunctionAddInput(qf_strongbc, "surface qdata", q_data_size_sur, CEED_EVAL_NONE); + CeedQFunctionAddInput(qf_strongbc, "x", num_comp_x, CEED_EVAL_NONE); + CeedQFunctionAddInput(qf_strongbc, "scale", 1, CEED_EVAL_NONE); + CeedQFunctionAddInput(qf_strongbc, "stg data", stg_data_size, CEED_EVAL_NONE); + CeedQFunctionAddOutput(qf_strongbc, "q", num_comp_q, CEED_EVAL_NONE); CeedQFunctionSetContext(qf_strongbc, problem->ics.qfunction_context); *pqf_strongbc = qf_strongbc; 
PetscFunctionReturn(0); } -PetscErrorCode SetupStrongSTG_PreProcessing(Ceed ceed, ProblemData *problem, - CeedInt num_comp_x, CeedInt stg_data_size, CeedInt q_data_size_sur, - CeedQFunction *pqf_strongbc) { - +PetscErrorCode SetupStrongSTG_PreProcessing(Ceed ceed, ProblemData *problem, CeedInt num_comp_x, CeedInt stg_data_size, CeedInt q_data_size_sur, + CeedQFunction *pqf_strongbc) { CeedQFunction qf_strongbc; PetscFunctionBeginUser; - CeedQFunctionCreateInterior(ceed, 1, Preprocess_STGShur14, - Preprocess_STGShur14_loc, &qf_strongbc); - CeedQFunctionAddInput(qf_strongbc, "surface qdata", q_data_size_sur, - CEED_EVAL_NONE); - CeedQFunctionAddInput(qf_strongbc, "x", num_comp_x, CEED_EVAL_NONE); + CeedQFunctionCreateInterior(ceed, 1, Preprocess_STGShur14, Preprocess_STGShur14_loc, &qf_strongbc); + CeedQFunctionAddInput(qf_strongbc, "surface qdata", q_data_size_sur, CEED_EVAL_NONE); + CeedQFunctionAddInput(qf_strongbc, "x", num_comp_x, CEED_EVAL_NONE); CeedQFunctionAddOutput(qf_strongbc, "stg data", stg_data_size, CEED_EVAL_NONE); CeedQFunctionSetContext(qf_strongbc, problem->ics.qfunction_context); diff --git a/examples/fluids/problems/stg_shur14.h b/examples/fluids/problems/stg_shur14.h index e7be94741a..93a8cc2257 100644 --- a/examples/fluids/problems/stg_shur14.h +++ b/examples/fluids/problems/stg_shur14.h @@ -7,22 +7,17 @@ #include #include -#include "../qfunctions/stg_shur14_type.h" + #include "../navierstokes.h" +#include "../qfunctions/stg_shur14_type.h" -extern PetscErrorCode SetupSTG(const MPI_Comm comm, const DM dm, - ProblemData *problem, User user, - const bool prescribe_T, const CeedScalar theta0, - const CeedScalar P0, - const CeedScalar ynodes[], const CeedInt nynodes); +extern PetscErrorCode SetupSTG(const MPI_Comm comm, const DM dm, ProblemData *problem, User user, const bool prescribe_T, const CeedScalar theta0, + const CeedScalar P0, const CeedScalar ynodes[], const CeedInt nynodes); -extern PetscErrorCode SetupStrongSTG(DM dm, SimpleBC bc, 
ProblemData *problem, - Physics phys); +extern PetscErrorCode SetupStrongSTG(DM dm, SimpleBC bc, ProblemData *problem, Physics phys); -extern PetscErrorCode SetupStrongSTG_QF(Ceed ceed, ProblemData *problem, - CeedInt num_comp_x, CeedInt num_comp_q, CeedInt stg_data_size, +extern PetscErrorCode SetupStrongSTG_QF(Ceed ceed, ProblemData *problem, CeedInt num_comp_x, CeedInt num_comp_q, CeedInt stg_data_size, CeedInt q_data_size_sur, CeedQFunction *qf_strongbc); -extern PetscErrorCode SetupStrongSTG_PreProcessing(Ceed ceed, - ProblemData *problem, CeedInt num_comp_x, CeedInt stg_data_size, - CeedInt q_data_size_sur, CeedQFunction *pqf_strongbc); +extern PetscErrorCode SetupStrongSTG_PreProcessing(Ceed ceed, ProblemData *problem, CeedInt num_comp_x, CeedInt stg_data_size, + CeedInt q_data_size_sur, CeedQFunction *pqf_strongbc); diff --git a/examples/fluids/qfunctions/advection.h b/examples/fluids/qfunctions/advection.h index c3501eec53..c398e0d600 100644 --- a/examples/fluids/qfunctions/advection.h +++ b/examples/fluids/qfunctions/advection.h @@ -22,9 +22,9 @@ struct SetupContextAdv_ { CeedScalar lz; CeedScalar wind[3]; CeedScalar time; - int wind_type; // See WindType: 0=ROTATION, 1=TRANSLATION - int bubble_type; // See BubbleType: 0=SPHERE, 1=CYLINDER - int bubble_continuity_type; // See BubbleContinuityType: 0=SMOOTH, 1=BACK_SHARP 2=THICK + int wind_type; // See WindType: 0=ROTATION, 1=TRANSLATION + int bubble_type; // See BubbleType: 0=SPHERE, 1=CYLINDER + int bubble_continuity_type; // See BubbleContinuityType: 0=SMOOTH, 1=BACK_SHARP 2=THICK }; typedef struct AdvectionContext_ *AdvectionContext; @@ -32,11 +32,11 @@ struct AdvectionContext_ { CeedScalar CtauS; CeedScalar strong_form; CeedScalar E_wind; - bool implicit; - int stabilization; // See StabilizationType: 0=none, 1=SU, 2=SUPG + bool implicit; + int stabilization; // See StabilizationType: 0=none, 1=SU, 2=SUPG }; -CEED_QFUNCTION_HELPER CeedScalar Square(CeedScalar x) { return x*x; } +CEED_QFUNCTION_HELPER 
CeedScalar Square(CeedScalar x) { return x * x; } // ***************************************************************************** // This QFunction sets the initial conditions and the boundary conditions @@ -89,18 +89,17 @@ CEED_QFUNCTION_HELPER CeedScalar Square(CeedScalar x) { return x*x; } // This helper function provides support for the exact, time-dependent solution // (currently not implemented) and IC formulation for 3D advection // ***************************************************************************** -CEED_QFUNCTION_HELPER CeedInt Exact_Advection(CeedInt dim, CeedScalar time, - const CeedScalar X[], CeedInt Nf, CeedScalar q[], void *ctx) { +CEED_QFUNCTION_HELPER CeedInt Exact_Advection(CeedInt dim, CeedScalar time, const CeedScalar X[], CeedInt Nf, CeedScalar q[], void *ctx) { const SetupContextAdv context = (SetupContextAdv)ctx; - const CeedScalar rc = context->rc; - const CeedScalar lx = context->lx; - const CeedScalar ly = context->ly; - const CeedScalar lz = context->lz; - const CeedScalar *wind = context->wind; + const CeedScalar rc = context->rc; + const CeedScalar lx = context->lx; + const CeedScalar ly = context->ly; + const CeedScalar lz = context->lz; + const CeedScalar *wind = context->wind; // Setup - const CeedScalar x0[3] = {0.25*lx, 0.5*ly, 0.5*lz}; - const CeedScalar center[3] = {0.5*lx, 0.5*ly, 0.5*lz}; + const CeedScalar x0[3] = {0.25 * lx, 0.5 * ly, 0.5 * lz}; + const CeedScalar center[3] = {0.5 * lx, 0.5 * ly, 0.5 * lz}; // -- Coordinates const CeedScalar x = X[0]; @@ -110,48 +109,45 @@ CEED_QFUNCTION_HELPER CeedInt Exact_Advection(CeedInt dim, CeedScalar time, // -- Energy CeedScalar r = 0.; switch (context->bubble_type) { - // original sphere - case 0: { // (dim=3) - r = sqrt(Square(x - x0[0]) + - Square(y - x0[1]) + - Square(z - x0[2])); - } break; - // cylinder (needs periodicity to work properly) - case 1: { // (dim=2) - r = sqrt(Square(x - x0[0]) + Square(y - x0[1])); - } break; + // original sphere + case 0: { // (dim=3) 
+ r = sqrt(Square(x - x0[0]) + Square(y - x0[1]) + Square(z - x0[2])); + } break; + // cylinder (needs periodicity to work properly) + case 1: { // (dim=2) + r = sqrt(Square(x - x0[0]) + Square(y - x0[1])); + } break; } // Initial Conditions switch (context->wind_type) { - case 0: // Rotation - q[0] = 1.; - q[1] = -(y - center[1]); - q[2] = (x - center[0]); - q[3] = 0; - break; - case 1: // Translation - q[0] = 1.; - q[1] = wind[0]; - q[2] = wind[1]; - q[3] = wind[2]; - break; + case 0: // Rotation + q[0] = 1.; + q[1] = -(y - center[1]); + q[2] = (x - center[0]); + q[3] = 0; + break; + case 1: // Translation + q[0] = 1.; + q[1] = wind[0]; + q[2] = wind[1]; + q[3] = wind[2]; + break; } switch (context->bubble_continuity_type) { - // original continuous, smooth shape - case 0: { - q[4] = r <= rc ? (1.-r/rc) : 0.; - } break; - // discontinuous, sharp back half shape - case 1: { - q[4] = ((r <= rc) && (yCtauS; const CeedScalar strong_form = context->strong_form; CeedPragmaSIMD - // Quadrature Point Loop - for (CeedInt i=0; iCtauS; const CeedScalar strong_form = context->strong_form; CeedPragmaSIMD - // Quadrature Point Loop - for (CeedInt i=0; istabilization) { - case 0: - break; - case 1: dv[j][4][i] += wdetJ * TauS * strong_conv * uX[j]; //SU - break; - case 2: dv[j][4][i] += wdetJ * TauS * strong_res * uX[j]; //SUPG - break; + for (CeedInt j = 0; j < 3; j++) uX[j] = dXdx[j][0] * u[0] + dXdx[j][1] * u[1] + dXdx[j][2] * u[2]; + const CeedScalar TauS = CtauS / sqrt(uX[0] * uX[0] + uX[1] * uX[1] + uX[2] * uX[2]); + + for (CeedInt j = 0; j < 3; j++) switch (context->stabilization) { + case 0: + break; + case 1: + dv[j][4][i] += wdetJ * TauS * strong_conv * uX[j]; // SU + break; + case 2: + dv[j][4][i] += wdetJ * TauS * strong_res * uX[j]; // SUPG + break; } - } // End Quadrature Point Loop + } // End Quadrature Point Loop return 0; } @@ -459,60 +397,51 @@ CEED_QFUNCTION(IFunction_Advection)(void *ctx, CeedInt Q, // A prescribed Total Energy (E_wind) is applied weakly. 
// // ***************************************************************************** -CEED_QFUNCTION(Advection_InOutFlow)(void *ctx, CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) { +CEED_QFUNCTION(Advection_InOutFlow)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // *INDENT-OFF* // Inputs - const CeedScalar (*q)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], - (*q_data_sur)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2]; + const CeedScalar(*q)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], (*q_data_sur)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2]; // Outputs - CeedScalar (*v)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; + CeedScalar(*v)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; // *INDENT-ON* - AdvectionContext context = (AdvectionContext)ctx; + AdvectionContext context = (AdvectionContext)ctx; const CeedScalar E_wind = context->E_wind; const CeedScalar strong_form = context->strong_form; - const bool implicit = context->implicit; + const bool implicit = context->implicit; CeedPragmaSIMD - // Quadrature Point Loop - for (CeedInt i=0; i 0) { // outflow + if (u_normal > 0) { // outflow v[4][i] = -(1 - strong_form) * wdetJb * E * u_normal; - } else { // inflow + } else { // inflow v[4][i] = -(1 - strong_form) * wdetJb * E_wind * u_normal; } - } // End Quadrature Point Loop + } // End Quadrature Point Loop return 0; } // ***************************************************************************** -#endif // advection_h +#endif // advection_h diff --git a/examples/fluids/qfunctions/advection2d.h b/examples/fluids/qfunctions/advection2d.h index 2688b5905d..cd14fe9d0c 100644 --- a/examples/fluids/qfunctions/advection2d.h +++ b/examples/fluids/qfunctions/advection2d.h @@ -13,6 +13,7 @@ #include #include + #include "utils.h" typedef struct SetupContextAdv2D_ *SetupContextAdv2D; @@ -22,7 +23,7 @@ struct SetupContextAdv2D_ { CeedScalar ly; CeedScalar wind[3]; CeedScalar time; - int 
wind_type; // See WindType: 0=ROTATION, 1=TRANSLATION + int wind_type; // See WindType: 0=ROTATION, 1=TRANSLATION }; typedef struct AdvectionContext_ *AdvectionContext; @@ -30,8 +31,8 @@ struct AdvectionContext_ { CeedScalar CtauS; CeedScalar strong_form; CeedScalar E_wind; - bool implicit; - int stabilization; // See StabilizationType: 0=none, 1=SU, 2=SUPG + bool implicit; + int stabilization; // See StabilizationType: 0=none, 1=SU, 2=SUPG }; // ***************************************************************************** @@ -85,53 +86,52 @@ struct AdvectionContext_ { // This helper function provides the exact, time-dependent solution // and IC formulation for 2D advection // ***************************************************************************** -CEED_QFUNCTION_HELPER CeedInt Exact_Advection2d(CeedInt dim, CeedScalar time, - const CeedScalar X[], CeedInt Nf, CeedScalar q[], void *ctx) { +CEED_QFUNCTION_HELPER CeedInt Exact_Advection2d(CeedInt dim, CeedScalar time, const CeedScalar X[], CeedInt Nf, CeedScalar q[], void *ctx) { const SetupContextAdv2D context = (SetupContextAdv2D)ctx; - const CeedScalar rc = context->rc; - const CeedScalar lx = context->lx; - const CeedScalar ly = context->ly; - const CeedScalar *wind = context->wind; + const CeedScalar rc = context->rc; + const CeedScalar lx = context->lx; + const CeedScalar ly = context->ly; + const CeedScalar *wind = context->wind; // Setup - const CeedScalar center[2] = {0.5*lx, 0.5*ly}; - const CeedScalar theta[] = {M_PI, -M_PI/3, M_PI/3}; - const CeedScalar x0[2] = {center[0] + .25*lx*cos(theta[0] + time), center[1] + .25*ly*sin(theta[0] + time)}; - const CeedScalar x1[2] = {center[0] + .25*lx*cos(theta[1] + time), center[1] + .25*ly*sin(theta[1] + time)}; - const CeedScalar x2[2] = {center[0] + .25*lx*cos(theta[2] + time), center[1] + .25*ly*sin(theta[2] + time)}; + const CeedScalar center[2] = {0.5 * lx, 0.5 * ly}; + const CeedScalar theta[] = {M_PI, -M_PI / 3, M_PI / 3}; + const CeedScalar x0[2] = 
{center[0] + .25 * lx * cos(theta[0] + time), center[1] + .25 * ly * sin(theta[0] + time)}; + const CeedScalar x1[2] = {center[0] + .25 * lx * cos(theta[1] + time), center[1] + .25 * ly * sin(theta[1] + time)}; + const CeedScalar x2[2] = {center[0] + .25 * lx * cos(theta[2] + time), center[1] + .25 * ly * sin(theta[2] + time)}; const CeedScalar x = X[0], y = X[1]; // Initial/Boundary Conditions switch (context->wind_type) { - case 0: // Rotation - q[0] = 1.; - q[1] = -(y - center[1]); - q[2] = (x - center[0]); - q[3] = 0; - q[4] = 0; - break; - case 1: // Translation - q[0] = 1.; - q[1] = wind[0]; - q[2] = wind[1]; - q[3] = 0; - q[4] = 0; - break; - default: - return 1; + case 0: // Rotation + q[0] = 1.; + q[1] = -(y - center[1]); + q[2] = (x - center[0]); + q[3] = 0; + q[4] = 0; + break; + case 1: // Translation + q[0] = 1.; + q[1] = wind[0]; + q[2] = wind[1]; + q[3] = 0; + q[4] = 0; + break; + default: + return 1; } CeedScalar r = sqrt(Square(x - x0[0]) + Square(y - x0[1])); - CeedScalar E = 1 - r/rc; + CeedScalar E = 1 - r / rc; - if (0) { // non-smooth initial conditions + if (0) { // non-smooth initial conditions if (q[4] < E) q[4] = E; r = sqrt(Square(x - x1[0]) + Square(y - x1[1])); if (r <= rc) q[4] = 1; } r = sqrt(Square(x - x2[0]) + Square(y - x2[1])); - E = (r <= rc) ? .5 + .5*cos(r*M_PI/rc) : 0; + E = (r <= rc) ? 
.5 + .5 * cos(r * M_PI / rc) : 0; if (q[4] < E) q[4] = E; return 0; @@ -140,23 +140,22 @@ CEED_QFUNCTION_HELPER CeedInt Exact_Advection2d(CeedInt dim, CeedScalar time, // ***************************************************************************** // This QFunction sets the initial conditions for 2D advection // ***************************************************************************** -CEED_QFUNCTION(ICsAdvection2d)(void *ctx, CeedInt Q, - const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(ICsAdvection2d)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // Inputs - const CeedScalar (*X)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0]; + const CeedScalar(*X)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0]; // Outputs - CeedScalar (*q0)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; + CeedScalar(*q0)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; const SetupContextAdv2D context = (SetupContextAdv2D)ctx; CeedPragmaSIMD - // Quadrature Point Loop - for (CeedInt i=0; itime, x, 5, q, ctx); - for (CeedInt j=0; j<5; j++) q0[j][i] = q[j]; - } // End of Quadrature Point Loop + for (CeedInt j = 0; j < 5; j++) q0[j][i] = q[j]; + } // End of Quadrature Point Loop // Return return 0; @@ -176,67 +175,58 @@ CEED_QFUNCTION(ICsAdvection2d)(void *ctx, CeedInt Q, // dE/dt + div( E u ) = 0 // // ***************************************************************************** -CEED_QFUNCTION(Advection2d)(void *ctx, CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) { +CEED_QFUNCTION(Advection2d)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // *INDENT-OFF* // Inputs - const CeedScalar (*q)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], - (*dq)[5][CEED_Q_VLA] = (const CeedScalar(*)[5][CEED_Q_VLA])in[1], - (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2]; + const CeedScalar(*q)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], (*dq)[5][CEED_Q_VLA] = 
(const CeedScalar(*)[5][CEED_Q_VLA])in[1], + (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2]; // Outputs - CeedScalar (*v)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0], - (*dv)[5][CEED_Q_VLA] = (CeedScalar(*)[5][CEED_Q_VLA])out[1]; + CeedScalar(*v)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0], (*dv)[5][CEED_Q_VLA] = (CeedScalar(*)[5][CEED_Q_VLA])out[1]; // *INDENT-ON* - AdvectionContext context = (AdvectionContext)ctx; - const CeedScalar CtauS = context->CtauS; + AdvectionContext context = (AdvectionContext)ctx; + const CeedScalar CtauS = context->CtauS; const CeedScalar strong_form = context->strong_form; CeedPragmaSIMD - // Quadrature Point Loop - for (CeedInt i=0; iCtauS; const CeedScalar strong_form = context->strong_form; CeedPragmaSIMD - // Quadrature Point Loop - for (CeedInt i=0; istabilization) { - case 0: - break; - case 1: dv[j][4][i] += wdetJ * TauS * strong_conv * uX[j]; - break; - case 2: dv[j][4][i] += wdetJ * TauS * strong_res * uX[j]; - break; + for (CeedInt j = 0; j < 2; j++) uX[j] = dXdx[j][0] * u[0] + dXdx[j][1] * u[1]; + const CeedScalar TauS = CtauS / sqrt(uX[0] * uX[0] + uX[1] * uX[1]); + + for (CeedInt j = 0; j < 2; j++) switch (context->stabilization) { + case 0: + break; + case 1: + dv[j][4][i] += wdetJ * TauS * strong_conv * uX[j]; + break; + case 2: + dv[j][4][i] += wdetJ * TauS * strong_res * uX[j]; + break; } - } // End Quadrature Point Loop + } // End Quadrature Point Loop return 0; } @@ -403,60 +379,52 @@ CEED_QFUNCTION(IFunction_Advection2d)(void *ctx, CeedInt Q, // A prescribed Total Energy (E_wind) is applied weakly. 
// // ***************************************************************************** -CEED_QFUNCTION(Advection2d_InOutFlow)(void *ctx, CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) { +CEED_QFUNCTION(Advection2d_InOutFlow)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // *INDENT-OFF* // Inputs - const CeedScalar (*q)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], - (*q_data_sur)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2]; + const CeedScalar(*q)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], (*q_data_sur)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2]; // Outputs - CeedScalar (*v)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; + CeedScalar(*v)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; // *INDENT-ON* - AdvectionContext context = (AdvectionContext)ctx; + AdvectionContext context = (AdvectionContext)ctx; const CeedScalar E_wind = context->E_wind; const CeedScalar strong_form = context->strong_form; - const bool implicit = context->implicit; + const bool implicit = context->implicit; CeedPragmaSIMD - // Quadrature Point Loop - for (CeedInt i=0; i 0) { // outflow + if (u_normal > 0) { // outflow v[4][i] = -(1 - strong_form) * wdetJb * E * u_normal; - } else { // inflow + } else { // inflow v[4][i] = -(1 - strong_form) * wdetJb * E_wind * u_normal; } - } // End Quadrature Point Loop + } // End Quadrature Point Loop return 0; } // ***************************************************************************** -#endif // advection2d_h +#endif // advection2d_h diff --git a/examples/fluids/qfunctions/blasius.h b/examples/fluids/qfunctions/blasius.h index 0b18a2dad7..cacb0efaca 100644 --- a/examples/fluids/qfunctions/blasius.h +++ b/examples/fluids/qfunctions/blasius.h @@ -8,11 +8,11 @@ /// @file /// Operator for Navier-Stokes example using PETSc - #ifndef blasius_h #define blasius_h #include + #include "newtonian_state.h" #include "newtonian_types.h" #include "utils.h" @@ -21,19 +21,19 @@ 
typedef struct BlasiusContext_ *BlasiusContext; struct BlasiusContext_ { - bool implicit; // !< Using implicit timesteping or not - bool weakT; // !< flag to set Temperature weakly at inflow - CeedScalar delta0; // !< Boundary layer height at inflow - CeedScalar U_inf; // !< Velocity at boundary layer edge - CeedScalar T_inf; // !< Temperature at boundary layer edge - CeedScalar T_wall; // !< Temperature at the wall - CeedScalar P0; // !< Pressure at outflow - CeedScalar x_inflow; // !< Location of inflow in x - CeedScalar n_cheb; // !< Number of Chebyshev terms - CeedScalar *X; // !< Chebyshev polynomial coordinate vector (CPU only) - CeedScalar eta_max; // !< Maximum eta in the domain - CeedScalar Tf_cheb[BLASIUS_MAX_N_CHEBYSHEV]; // !< Chebyshev coefficient for f - CeedScalar Th_cheb[BLASIUS_MAX_N_CHEBYSHEV-1]; // !< Chebyshev coefficient for h + bool implicit; // !< Using implicit timesteping or not + bool weakT; // !< flag to set Temperature weakly at inflow + CeedScalar delta0; // !< Boundary layer height at inflow + CeedScalar U_inf; // !< Velocity at boundary layer edge + CeedScalar T_inf; // !< Temperature at boundary layer edge + CeedScalar T_wall; // !< Temperature at the wall + CeedScalar P0; // !< Pressure at outflow + CeedScalar x_inflow; // !< Location of inflow in x + CeedScalar n_cheb; // !< Number of Chebyshev terms + CeedScalar *X; // !< Chebyshev polynomial coordinate vector (CPU only) + CeedScalar eta_max; // !< Maximum eta in the domain + CeedScalar Tf_cheb[BLASIUS_MAX_N_CHEBYSHEV]; // !< Chebyshev coefficient for f + CeedScalar Th_cheb[BLASIUS_MAX_N_CHEBYSHEV - 1]; // !< Chebyshev coefficient for h struct NewtonianIdealGasContext_ newtonian_ctx; }; @@ -41,58 +41,59 @@ struct BlasiusContext_ { // This helper function evaluates Chebyshev polynomials with a set of // coefficients with all their derivatives represented as a recurrence table. 
// ***************************************************************************** -CEED_QFUNCTION_HELPER void ChebyshevEval(int N, const double *Tf, double x, - double eta_max, double *f) { - double dX_deta = 2 / eta_max; +CEED_QFUNCTION_HELPER void ChebyshevEval(int N, const double *Tf, double x, double eta_max, double *f) { + double dX_deta = 2 / eta_max; double table[4][3] = { - // Chebyshev polynomials T_0, T_1, T_2 of the first kind in (-1,1) - {1, x, 2*x *x - 1}, {0, 1, 4*x}, {0, 0, 4}, {0, 0, 0} + // Chebyshev polynomials T_0, T_1, T_2 of the first kind in (-1,1) + {1, x, 2 * x * x - 1}, + {0, 1, 4 * x }, + {0, 0, 4 }, + {0, 0, 0 } }; - for (int i=0; i<4; i++) { + for (int i = 0; i < 4; i++) { // i-th derivative of f f[i] = table[i][0] * Tf[0] + table[i][1] * Tf[1] + table[i][2] * Tf[2]; } - for (int i=3; in_cheb; CeedScalar mu = blasius->newtonian_ctx.mu; CeedScalar nu = mu / rho_infty; - CeedScalar eta = x[1]*sqrt(blasius->U_inf/(nu*(x0+x[0]-x_inflow))); + CeedScalar eta = x[1] * sqrt(blasius->U_inf / (nu * (x0 + x[0] - x_inflow))); CeedScalar X = 2 * (eta / blasius->eta_max) - 1.; CeedScalar U_inf = blasius->U_inf; CeedScalar Rd = GasConstant(&blasius->newtonian_ctx); CeedScalar f[4], h[4]; ChebyshevEval(N, blasius->Tf_cheb, X, blasius->eta_max, f); - ChebyshevEval(N-1, blasius->Th_cheb, X, blasius->eta_max, h); + ChebyshevEval(N - 1, blasius->Th_cheb, X, blasius->eta_max, h); - *t12 = mu*U_inf*f[2]*sqrt(U_inf/(nu*(x0+x[0]-x_inflow))); + *t12 = mu * U_inf * f[2] * sqrt(U_inf / (nu * (x0 + x[0] - x_inflow))); CeedScalar Y[5]; Y[1] = U_inf * f[1]; - Y[2] = 0.5*sqrt(nu*U_inf/(x0+x[0]-x_inflow))*(eta*f[1] - f[0]); + Y[2] = 0.5 * sqrt(nu * U_inf / (x0 + x[0] - x_inflow)) * (eta * f[1] - f[0]); Y[3] = 0.; Y[4] = blasius->T_inf * h[0]; Y[0] = rho_infty / h[0] * Rd * Y[4]; @@ -102,183 +103,173 @@ State CEED_QFUNCTION_HELPER(BlasiusSolution)(const BlasiusContext blasius, // ***************************************************************************** // This 
QFunction sets a Blasius boundary layer for the initial condition // ***************************************************************************** -CEED_QFUNCTION(ICsBlasius)(void *ctx, CeedInt Q, - const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(ICsBlasius)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // Inputs - const CeedScalar (*X)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0]; + const CeedScalar(*X)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0]; // Outputs - CeedScalar (*q0)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; - - const BlasiusContext context = (BlasiusContext)ctx; - const CeedScalar cv = context->newtonian_ctx.cv; - const CeedScalar mu = context->newtonian_ctx.mu; - const CeedScalar T_inf = context->T_inf; - const CeedScalar P0 = context->P0; - const CeedScalar delta0 = context->delta0; - const CeedScalar U_inf = context->U_inf; - const CeedScalar x_inflow = context->x_inflow; - const CeedScalar gamma = HeatCapacityRatio(&context->newtonian_ctx); - const CeedScalar e_internal = cv * T_inf; - const CeedScalar rho = P0 / ((gamma - 1) * e_internal); - const CeedScalar x0 = U_inf*rho / (mu*25/(delta0*delta0)); - CeedScalar t12; + CeedScalar(*q0)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; + + const BlasiusContext context = (BlasiusContext)ctx; + const CeedScalar cv = context->newtonian_ctx.cv; + const CeedScalar mu = context->newtonian_ctx.mu; + const CeedScalar T_inf = context->T_inf; + const CeedScalar P0 = context->P0; + const CeedScalar delta0 = context->delta0; + const CeedScalar U_inf = context->U_inf; + const CeedScalar x_inflow = context->x_inflow; + const CeedScalar gamma = HeatCapacityRatio(&context->newtonian_ctx); + const CeedScalar e_internal = cv * T_inf; + const CeedScalar rho = P0 / ((gamma - 1) * e_internal); + const CeedScalar x0 = U_inf * rho / (mu * 25 / (delta0 * delta0)); + CeedScalar t12; // Quadrature Point Loop - CeedPragmaSIMD - for (CeedInt i=0; 
iimplicit; - NewtonianIdealGasContext gas = &context->newtonian_ctx; - const CeedScalar mu = context->newtonian_ctx.mu; - const CeedScalar Rd = GasConstant(&context->newtonian_ctx); - const CeedScalar T_inf = context->T_inf; - const CeedScalar P0 = context->P0; - const CeedScalar delta0 = context->delta0; - const CeedScalar U_inf = context->U_inf; - const CeedScalar x_inflow = context->x_inflow; - const bool weakT = context->weakT; - const CeedScalar rho_0 = P0 / (Rd * T_inf); - const CeedScalar x0 = U_inf*rho_0 / (mu*25/ Square(delta0)); + const BlasiusContext context = (BlasiusContext)ctx; + const bool implicit = context->implicit; + NewtonianIdealGasContext gas = &context->newtonian_ctx; + const CeedScalar mu = context->newtonian_ctx.mu; + const CeedScalar Rd = GasConstant(&context->newtonian_ctx); + const CeedScalar T_inf = context->T_inf; + const CeedScalar P0 = context->P0; + const CeedScalar delta0 = context->delta0; + const CeedScalar U_inf = context->U_inf; + const CeedScalar x_inflow = context->x_inflow; + const bool weakT = context->weakT; + const CeedScalar rho_0 = P0 / (Rd * T_inf); + const CeedScalar x0 = U_inf * rho_0 / (mu * 25 / Square(delta0)); CeedPragmaSIMD - // Quadrature Point Loop - for (CeedInt i=0; inewtonian_ctx, s, Flux_inviscid); - const CeedScalar stress[3][3] = {{0, t12, 0}, {t12, 0, 0}, {0, 0, 0}}; - const CeedScalar Fe[3] = {0}; // TODO: viscous energy flux needs grad temperature - CeedScalar Flux[5]; + const CeedScalar stress[3][3] = { + {0, t12, 0}, + {t12, 0, 0}, + {0, 0, 0} + }; + const CeedScalar Fe[3] = {0}; // TODO: viscous energy flux needs grad temperature + CeedScalar Flux[5]; FluxTotal_Boundary(Flux_inviscid, stress, Fe, norm, Flux); - for (CeedInt j=0; j<5; j++) - v[j][i] = -wdetJb * Flux[j]; - } // End Quadrature Point Loop + for (CeedInt j = 0; j < 5; j++) v[j][i] = -wdetJb * Flux[j]; + } // End Quadrature Point Loop return 0; } // ***************************************************************************** 
-CEED_QFUNCTION(Blasius_Inflow_Jacobian)(void *ctx, CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) { +CEED_QFUNCTION(Blasius_Inflow_Jacobian)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // *INDENT-OFF* // Inputs - const CeedScalar (*dq)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], - (*q_data_sur)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2], - (*X)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[3]; + const CeedScalar(*dq)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], (*q_data_sur)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2], + (*X)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[3]; // Outputs - CeedScalar (*v)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; + CeedScalar(*v)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; // *INDENT-ON* - const BlasiusContext context = (BlasiusContext)ctx; - const bool implicit = context->implicit; - const CeedScalar mu = context->newtonian_ctx.mu; - const CeedScalar cv = context->newtonian_ctx.cv; - const CeedScalar Rd = GasConstant(&context->newtonian_ctx); - const CeedScalar gamma = HeatCapacityRatio(&context->newtonian_ctx); - const CeedScalar T_inf = context->T_inf; - const CeedScalar P0 = context->P0; - const CeedScalar delta0 = context->delta0; - const CeedScalar U_inf = context->U_inf; - const bool weakT = context->weakT; - const CeedScalar rho_0 = P0 / (Rd * T_inf); - const CeedScalar x0 = U_inf*rho_0 / (mu*25/ (delta0*delta0)); + const BlasiusContext context = (BlasiusContext)ctx; + const bool implicit = context->implicit; + const CeedScalar mu = context->newtonian_ctx.mu; + const CeedScalar cv = context->newtonian_ctx.cv; + const CeedScalar Rd = GasConstant(&context->newtonian_ctx); + const CeedScalar gamma = HeatCapacityRatio(&context->newtonian_ctx); + const CeedScalar T_inf = context->T_inf; + const CeedScalar P0 = context->P0; + const CeedScalar delta0 = context->delta0; + const CeedScalar U_inf = context->U_inf; + const 
bool weakT = context->weakT; + const CeedScalar rho_0 = P0 / (Rd * T_inf); + const CeedScalar x0 = U_inf * rho_0 / (mu * 25 / (delta0 * delta0)); CeedPragmaSIMD - // Quadrature Point Loop - for (CeedInt i=0; i #include + #include "newtonian_state.h" #include "newtonian_types.h" #include "utils.h" typedef struct ChannelContext_ *ChannelContext; struct ChannelContext_ { - bool implicit; // !< Using implicit timesteping or not - CeedScalar theta0; // !< Reference temperature - CeedScalar P0; // !< Reference Pressure - CeedScalar umax; // !< Centerline velocity - CeedScalar center; // !< Y Coordinate for center of channel - CeedScalar H; // !< Channel half-height - CeedScalar B; // !< Body-force driving the flow + bool implicit; // !< Using implicit timesteping or not + CeedScalar theta0; // !< Reference temperature + CeedScalar P0; // !< Reference Pressure + CeedScalar umax; // !< Centerline velocity + CeedScalar center; // !< Y Coordinate for center of channel + CeedScalar H; // !< Channel half-height + CeedScalar B; // !< Body-force driving the flow struct NewtonianIdealGasContext_ newtonian_ctx; }; -CEED_QFUNCTION_HELPER State Exact_Channel(CeedInt dim, CeedScalar time, - const CeedScalar X[], CeedInt Nf, void *ctx) { - - const ChannelContext context = (ChannelContext)ctx; - const CeedScalar theta0 = context->theta0; - const CeedScalar P0 = context->P0; - const CeedScalar umax = context->umax; - const CeedScalar center = context->center; - const CeedScalar H = context->H; - NewtonianIdealGasContext gas = &context->newtonian_ctx; - const CeedScalar cp = gas->cp; - const CeedScalar mu = gas->mu; - const CeedScalar k = gas->k; +CEED_QFUNCTION_HELPER State Exact_Channel(CeedInt dim, CeedScalar time, const CeedScalar X[], CeedInt Nf, void *ctx) { + const ChannelContext context = (ChannelContext)ctx; + const CeedScalar theta0 = context->theta0; + const CeedScalar P0 = context->P0; + const CeedScalar umax = context->umax; + const CeedScalar center = context->center; + 
const CeedScalar H = context->H; + NewtonianIdealGasContext gas = &context->newtonian_ctx; + const CeedScalar cp = gas->cp; + const CeedScalar mu = gas->mu; + const CeedScalar k = gas->k; // There is a gravity body force but it is excluded from // the potential energy due to periodicity. // g = (g, 0, 0) @@ -50,16 +48,15 @@ CEED_QFUNCTION_HELPER State Exact_Channel(CeedInt dim, CeedScalar time, // e_potential = dot(g, x) = 0 const CeedScalar x[3] = {0, X[1], X[2]}; - const CeedScalar Pr = mu / (cp*k); - const CeedScalar Ec = (umax*umax) / (cp*theta0); - const CeedScalar theta = theta0*(1 + (Pr*Ec/3) - * (1 - Square(Square((x[1]-center)/H)))); - CeedScalar Y[5] = {0.}; - Y[0] = P0; - Y[1] = umax*(1 - Square((x[1]-center)/H)); - Y[2] = 0.; - Y[3] = 0.; - Y[4] = theta; + const CeedScalar Pr = mu / (cp * k); + const CeedScalar Ec = (umax * umax) / (cp * theta0); + const CeedScalar theta = theta0 * (1 + (Pr * Ec / 3) * (1 - Square(Square((x[1] - center) / H)))); + CeedScalar Y[5] = {0.}; + Y[0] = P0; + Y[1] = umax * (1 - Square((x[1] - center) / H)); + Y[2] = 0.; + Y[3] = 0.; + Y[4] = theta; return StateFromY(gas, Y, x); } @@ -67,70 +64,64 @@ CEED_QFUNCTION_HELPER State Exact_Channel(CeedInt dim, CeedScalar time, // ***************************************************************************** // This QFunction set the initial condition // ***************************************************************************** -CEED_QFUNCTION(ICsChannel)(void *ctx, CeedInt Q, - const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(ICsChannel)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // Inputs - const CeedScalar (*X)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0]; + const CeedScalar(*X)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0]; // Outputs - CeedScalar (*q0)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; + CeedScalar(*q0)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; // Context const ChannelContext context = 
(ChannelContext)ctx; // Quadrature Point Loop - CeedPragmaSIMD - for (CeedInt i=0; inewtonian_ctx.state_var) { - case STATEVAR_CONSERVATIVE: - UnpackState_U(s.U, q); - break; - case STATEVAR_PRIMITIVE: - UnpackState_Y(s.Y, q); - break; + case STATEVAR_CONSERVATIVE: + UnpackState_U(s.U, q); + break; + case STATEVAR_PRIMITIVE: + UnpackState_Y(s.Y, q); + break; } - for (CeedInt j=0; j<5; j++) - q0[j][i] = q[j]; + for (CeedInt j = 0; j < 5; j++) q0[j][i] = q[j]; - } // End of Quadrature Point Loop + } // End of Quadrature Point Loop return 0; } // ***************************************************************************** // This QFunction set the inflow boundary condition for conservative variables // ***************************************************************************** -CEED_QFUNCTION(Channel_Inflow)(void *ctx, CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) { +CEED_QFUNCTION(Channel_Inflow)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // *INDENT-OFF* // Inputs - const CeedScalar (*q)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], - (*q_data_sur)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2], - (*X)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[3]; + const CeedScalar(*q)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], (*q_data_sur)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2], + (*X)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[3]; // Outputs - CeedScalar (*v)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; + CeedScalar(*v)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; // *INDENT-ON* - const ChannelContext context = (ChannelContext)ctx; - const bool implicit = context->implicit; - NewtonianIdealGasContext gas = &context->newtonian_ctx; - const CeedScalar cv = gas->cv; - const CeedScalar cp = gas->cp; - const CeedScalar gamma = cp / cv; + const ChannelContext context = (ChannelContext)ctx; + const bool implicit = context->implicit; + NewtonianIdealGasContext gas = 
&context->newtonian_ctx; + const CeedScalar cv = gas->cv; + const CeedScalar cp = gas->cp; + const CeedScalar gamma = cp / cv; CeedPragmaSIMD - // Quadrature Point Loop - for (CeedInt i=0; iimplicit; - const CeedScalar P0 = context->P0; + const ChannelContext context = (ChannelContext)ctx; + const bool implicit = context->implicit; + const CeedScalar P0 = context->P0; CeedPragmaSIMD - // Quadrature Point Loop - for (CeedInt i=0; i #include + #include "newtonian_state.h" #include "newtonian_types.h" #include "utils.h" typedef struct DensityCurrentContext_ *DensityCurrentContext; struct DensityCurrentContext_ { - CeedScalar theta0; - CeedScalar thetaC; - CeedScalar P0; - CeedScalar N; - CeedScalar rc; - CeedScalar center[3]; - CeedScalar dc_axis[3]; + CeedScalar theta0; + CeedScalar thetaC; + CeedScalar P0; + CeedScalar N; + CeedScalar rc; + CeedScalar center[3]; + CeedScalar dc_axis[3]; struct NewtonianIdealGasContext_ newtonian_ctx; }; @@ -85,23 +86,22 @@ struct DensityCurrentContext_ { // This helper function provides support for the exact, time-dependent solution // (currently not implemented) and IC formulation for density current // ***************************************************************************** -CEED_QFUNCTION_HELPER State Exact_DC(CeedInt dim, CeedScalar time, - const CeedScalar X[], CeedInt Nf, void *ctx) { +CEED_QFUNCTION_HELPER State Exact_DC(CeedInt dim, CeedScalar time, const CeedScalar X[], CeedInt Nf, void *ctx) { // Context const DensityCurrentContext context = (DensityCurrentContext)ctx; - const CeedScalar theta0 = context->theta0; - const CeedScalar thetaC = context->thetaC; - const CeedScalar P0 = context->P0; - const CeedScalar N = context->N; - const CeedScalar rc = context->rc; - const CeedScalar *center = context->center; - const CeedScalar *dc_axis = context->dc_axis; - NewtonianIdealGasContext gas = &context->newtonian_ctx; - const CeedScalar cp = gas->cp; - const CeedScalar cv = gas->cv; - const CeedScalar Rd = cp - cv; - const 
CeedScalar *g_vec = gas->g; - const CeedScalar g = -g_vec[2]; + const CeedScalar theta0 = context->theta0; + const CeedScalar thetaC = context->thetaC; + const CeedScalar P0 = context->P0; + const CeedScalar N = context->N; + const CeedScalar rc = context->rc; + const CeedScalar *center = context->center; + const CeedScalar *dc_axis = context->dc_axis; + NewtonianIdealGasContext gas = &context->newtonian_ctx; + const CeedScalar cp = gas->cp; + const CeedScalar cv = gas->cv; + const CeedScalar Rd = cp - cv; + const CeedScalar *g_vec = gas->g; + const CeedScalar g = -g_vec[2]; // Setup // -- Coordinates @@ -112,23 +112,21 @@ CEED_QFUNCTION_HELPER State Exact_DC(CeedInt dim, CeedScalar time, // -- Potential temperature, density current CeedScalar rr[3] = {x - center[0], y - center[1], z - center[2]}; // (I - q q^T) r: distance from dc_axis (or from center if dc_axis is the zero vector) - for (CeedInt i=0; i<3; i++) - rr[i] -= dc_axis[i] * Dot3(dc_axis, rr); - const CeedScalar r = sqrt(Dot3(rr, rr)); - const CeedScalar delta_theta = r <= rc ? thetaC*(1. + cos(M_PI*r/rc))/2. : 0.; - const CeedScalar theta = theta0*exp(Square(N)*z/g) + delta_theta; + for (CeedInt i = 0; i < 3; i++) rr[i] -= dc_axis[i] * Dot3(dc_axis, rr); + const CeedScalar r = sqrt(Dot3(rr, rr)); + const CeedScalar delta_theta = r <= rc ? thetaC * (1. + cos(M_PI * r / rc)) / 2. : 0.; + const CeedScalar theta = theta0 * exp(Square(N) * z / g) + delta_theta; // -- Exner pressure, hydrostatic balance - const CeedScalar Pi = 1. + Square(g)*(exp(-Square(N)*z/g) - 1.) / - (cp*theta0*Square(N)); + const CeedScalar Pi = 1. + Square(g) * (exp(-Square(N) * z / g) - 1.) 
/ (cp * theta0 * Square(N)); // Initial Conditions CeedScalar Y[5] = {0.}; - Y[0] = P0 * pow(Pi, cp/Rd); - Y[1] = 0.0; - Y[2] = 0.0; - Y[3] = 0.0; - Y[4] = Pi * theta; + Y[0] = P0 * pow(Pi, cp / Rd); + Y[1] = 0.0; + Y[2] = 0.0; + Y[3] = 0.0; + Y[4] = Pi * theta; return StateFromY(gas, Y, X); } @@ -136,40 +134,38 @@ CEED_QFUNCTION_HELPER State Exact_DC(CeedInt dim, CeedScalar time, // ***************************************************************************** // This QFunction sets the initial conditions for density current // ***************************************************************************** -CEED_QFUNCTION(ICsDC)(void *ctx, CeedInt Q, - const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(ICsDC)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // Inputs - const CeedScalar (*X)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0]; + const CeedScalar(*X)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0]; // Outputs - CeedScalar (*q0)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; + CeedScalar(*q0)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; // Context const DensityCurrentContext context = (DensityCurrentContext)ctx; CeedPragmaSIMD - // Quadrature Point Loop - for (CeedInt i=0; inewtonian_ctx.state_var) { - case STATEVAR_CONSERVATIVE: - UnpackState_U(s.U, q); - break; - case STATEVAR_PRIMITIVE: - UnpackState_Y(s.Y, q); - break; + case STATEVAR_CONSERVATIVE: + UnpackState_U(s.U, q); + break; + case STATEVAR_PRIMITIVE: + UnpackState_Y(s.Y, q); + break; } - for (CeedInt j=0; j<5; j++) - q0[j][i] = q[j]; + for (CeedInt j = 0; j < 5; j++) q0[j][i] = q[j]; - } // End of Quadrature Point Loop + } // End of Quadrature Point Loop return 0; } // ***************************************************************************** -#endif // densitycurrent_h +#endif // densitycurrent_h diff --git a/examples/fluids/qfunctions/dirichlet_boundary.h b/examples/fluids/qfunctions/dirichlet_boundary.h index 
0e87d3731d..f00e61fbb4 100644 --- a/examples/fluids/qfunctions/dirichlet_boundary.h +++ b/examples/fluids/qfunctions/dirichlet_boundary.h @@ -10,25 +10,22 @@ #include -CEED_QFUNCTION(SetupDirichletBC)(void *ctx, CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) { +CEED_QFUNCTION(SetupDirichletBC)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // Inputs // *INDENT-OFF* - const CeedScalar (*coords)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0]; - const CeedScalar (*multiplicity)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1]; + const CeedScalar(*coords)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0]; + const CeedScalar(*multiplicity)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1]; // *INDENT-ON* // Outputs - CeedScalar (*coords_stored)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; - CeedScalar (*scale_stored) = (CeedScalar(*))out[1]; + CeedScalar(*coords_stored)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; + CeedScalar(*scale_stored) = (CeedScalar(*))out[1]; - CeedPragmaSIMD - for(CeedInt i=0; i #include + #include "utils.h" typedef struct EulerContext_ *EulerContext; @@ -27,9 +28,9 @@ struct EulerContext_ { CeedScalar vortex_strength; CeedScalar c_tau; CeedScalar mean_velocity[3]; - bool implicit; - int euler_test; - int stabilization; // See StabilizationType: 0=none, 1=SU, 2=SUPG + bool implicit; + int euler_test; + int stabilization; // See StabilizationType: 0=none, 1=SU, 2=SUPG }; // ***************************************************************************** @@ -62,123 +63,119 @@ struct EulerContext_ { // This helper function provides support for the exact, time-dependent solution // (currently not implemented) and IC formulation for Euler traveling vortex // ***************************************************************************** -CEED_QFUNCTION_HELPER int Exact_Euler(CeedInt dim, CeedScalar time, - const CeedScalar X[], CeedInt Nf, CeedScalar q[], - void *ctx) { 
+CEED_QFUNCTION_HELPER int Exact_Euler(CeedInt dim, CeedScalar time, const CeedScalar X[], CeedInt Nf, CeedScalar q[], void *ctx) { // Context - const EulerContext context = (EulerContext)ctx; - const CeedScalar vortex_strength = context->vortex_strength; - const CeedScalar *center = context->center; // Center of the domain - const CeedScalar *mean_velocity = context->mean_velocity; + const EulerContext context = (EulerContext)ctx; + const CeedScalar vortex_strength = context->vortex_strength; + const CeedScalar *center = context->center; // Center of the domain + const CeedScalar *mean_velocity = context->mean_velocity; // Setup const CeedScalar gamma = 1.4; const CeedScalar cv = 2.5; const CeedScalar R = 1.; - const CeedScalar x = X[0], y = X[1]; // Coordinates + const CeedScalar x = X[0], y = X[1]; // Coordinates // Vortex center const CeedScalar xc = center[0] + mean_velocity[0] * time; const CeedScalar yc = center[1] + mean_velocity[1] * time; const CeedScalar x0 = x - xc; const CeedScalar y0 = y - yc; - const CeedScalar r = sqrt( x0*x0 + y0*y0 ); - const CeedScalar C = vortex_strength * exp((1. - r*r)/2.) / (2. * M_PI); - const CeedScalar delta_T = - (gamma - 1.) * vortex_strength * vortex_strength * - exp(1 - r*r) / (8. * gamma * M_PI * M_PI); - const CeedScalar S_vortex = 1; // no perturbation in the entropy P / rho^gamma - const CeedScalar S_bubble = (gamma - 1.) * vortex_strength * vortex_strength / - (8.*gamma*M_PI*M_PI); - CeedScalar rho, P, T, E, u[3] = {0.}; + const CeedScalar r = sqrt(x0 * x0 + y0 * y0); + const CeedScalar C = vortex_strength * exp((1. - r * r) / 2.) / (2. * M_PI); + const CeedScalar delta_T = -(gamma - 1.) * vortex_strength * vortex_strength * exp(1 - r * r) / (8. * gamma * M_PI * M_PI); + const CeedScalar S_vortex = 1; // no perturbation in the entropy P / rho^gamma + const CeedScalar S_bubble = (gamma - 1.) * vortex_strength * vortex_strength / (8. 
* gamma * M_PI * M_PI); + CeedScalar rho, P, T, E, u[3] = {0.}; // Initial Conditions switch (context->euler_test) { - case 0: // Traveling vortex - T = 1 + delta_T; - // P = rho * T - // P = S * rho^gamma - // Solve for rho, then substitute for P - rho = pow(T/S_vortex, 1 / (gamma - 1.)); - P = rho * T; - u[0] = mean_velocity[0] - C*y0; - u[1] = mean_velocity[1] + C*x0; - - // Assign exact solution - q[0] = rho; - q[1] = rho * u[0]; - q[2] = rho * u[1]; - q[3] = rho * u[2]; - q[4] = P / (gamma - 1.) + rho * (u[0]*u[0] + u[1]*u[1]) / 2.; - break; - case 1: // Constant zero velocity, density constant, total energy constant - rho = 1.; - E = 2.; - - // Assign exact solution - q[0] = rho; - q[1] = rho * u[0]; - q[2] = rho * u[1]; - q[3] = rho * u[2]; - q[4] = E; - break; - case 2: // Constant nonzero velocity, density constant, total energy constant - rho = 1.; - E = 2.; - u[0] = mean_velocity[0]; - u[1] = mean_velocity[1]; - - // Assign exact solution - q[0] = rho; - q[1] = rho * u[0]; - q[2] = rho * u[1]; - q[3] = rho * u[2]; - q[4] = E; - break; - case 3: // Velocity zero, pressure constant - // (so density and internal energy will be non-constant), - // but the velocity should stay zero and the bubble won't diffuse - // (for Euler, where there is no thermal conductivity) - P = 1.; - T = 1. - S_bubble * exp(1. - r*r); - rho = P / (R*T); - - // Assign exact solution - q[0] = rho; - q[1] = rho * u[0]; - q[2] = rho * u[1]; - q[3] = rho * u[2]; - q[4] = rho * (cv * T + (u[0]*u[0] + u[1]*u[1])/2.); - break; - case 4: // Constant nonzero velocity, pressure constant - // (so density and internal energy will be non-constant), - // it should be transported across the domain, but velocity stays constant - P = 1.; - T = 1. - S_bubble * exp(1. 
- r*r); - rho = P / (R*T); - u[0] = mean_velocity[0]; - u[1] = mean_velocity[1]; - - // Assign exact solution - q[0] = rho; - q[1] = rho * u[0]; - q[2] = rho * u[1]; - q[3] = rho * u[2]; - q[4] = rho * (cv * T + (u[0]*u[0] + u[1]*u[1])/2.); - break; - case 5: // non-smooth thermal bubble - cylinder - P = 1.; - T = 1. - (r < 1. ? S_bubble : 0.); - rho = P / (R*T); - u[0] = mean_velocity[0]; - u[1] = mean_velocity[1]; - - // Assign exact solution - q[0] = rho; - q[1] = rho * u[0]; - q[2] = rho * u[1]; - q[3] = rho * u[2]; - q[4] = rho * (cv * T + (u[0]*u[0] + u[1]*u[1])/2.); - break; + case 0: // Traveling vortex + T = 1 + delta_T; + // P = rho * T + // P = S * rho^gamma + // Solve for rho, then substitute for P + rho = pow(T / S_vortex, 1 / (gamma - 1.)); + P = rho * T; + u[0] = mean_velocity[0] - C * y0; + u[1] = mean_velocity[1] + C * x0; + + // Assign exact solution + q[0] = rho; + q[1] = rho * u[0]; + q[2] = rho * u[1]; + q[3] = rho * u[2]; + q[4] = P / (gamma - 1.) + rho * (u[0] * u[0] + u[1] * u[1]) / 2.; + break; + case 1: // Constant zero velocity, density constant, total energy constant + rho = 1.; + E = 2.; + + // Assign exact solution + q[0] = rho; + q[1] = rho * u[0]; + q[2] = rho * u[1]; + q[3] = rho * u[2]; + q[4] = E; + break; + case 2: // Constant nonzero velocity, density constant, total energy constant + rho = 1.; + E = 2.; + u[0] = mean_velocity[0]; + u[1] = mean_velocity[1]; + + // Assign exact solution + q[0] = rho; + q[1] = rho * u[0]; + q[2] = rho * u[1]; + q[3] = rho * u[2]; + q[4] = E; + break; + case 3: // Velocity zero, pressure constant + // (so density and internal energy will be non-constant), + // but the velocity should stay zero and the bubble won't diffuse + // (for Euler, where there is no thermal conductivity) + P = 1.; + T = 1. - S_bubble * exp(1. 
- r * r); + rho = P / (R * T); + + // Assign exact solution + q[0] = rho; + q[1] = rho * u[0]; + q[2] = rho * u[1]; + q[3] = rho * u[2]; + q[4] = rho * (cv * T + (u[0] * u[0] + u[1] * u[1]) / 2.); + break; + case 4: // Constant nonzero velocity, pressure constant + // (so density and internal energy will be non-constant), + // it should be transported across the domain, but velocity stays constant + P = 1.; + T = 1. - S_bubble * exp(1. - r * r); + rho = P / (R * T); + u[0] = mean_velocity[0]; + u[1] = mean_velocity[1]; + + // Assign exact solution + q[0] = rho; + q[1] = rho * u[0]; + q[2] = rho * u[1]; + q[3] = rho * u[2]; + q[4] = rho * (cv * T + (u[0] * u[0] + u[1] * u[1]) / 2.); + break; + case 5: // non-smooth thermal bubble - cylinder + P = 1.; + T = 1. - (r < 1. ? S_bubble : 0.); + rho = P / (R * T); + u[0] = mean_velocity[0]; + u[1] = mean_velocity[1]; + + // Assign exact solution + q[0] = rho; + q[1] = rho * u[0]; + q[2] = rho * u[1]; + q[3] = rho * u[2]; + q[4] = rho * (cv * T + (u[0] * u[0] + u[1] * u[1]) / 2.); + break; } // Return return 0; @@ -187,24 +184,20 @@ CEED_QFUNCTION_HELPER int Exact_Euler(CeedInt dim, CeedScalar time, // ***************************************************************************** // Helper function for computing flux Jacobian // ***************************************************************************** -CEED_QFUNCTION_HELPER void ConvectiveFluxJacobian_Euler(CeedScalar dF[3][5][5], - const CeedScalar rho, const CeedScalar u[3], const CeedScalar E, - const CeedScalar gamma) { - CeedScalar u_sq = u[0]*u[0] + u[1]*u[1] + u[2]*u[2]; // Velocity square - for (CeedInt i=0; i<3; i++) { // Jacobian matrices for 3 directions - for (CeedInt j=0; j<3; j++) { // Rows of each Jacobian matrix - dF[i][j+1][0] = ((i==j) ? ((gamma-1.)*(u_sq/2.)) : 0.) - u[i]*u[j]; - for (CeedInt k=0; k<3; k++) { // Columns of each Jacobian matrix - dF[i][0][k+1] = ((i==k) ? 1. : 0.); - dF[i][j+1][k+1] = ((j==k) ? u[i] : 0.) + - ((i==k) ? u[j] : 0.) 
- - ((i==j) ? u[k] : 0.) * (gamma-1.); - dF[i][4][k+1] = ((i==k) ? (E*gamma/rho - (gamma-1.)*u_sq/2.) : 0.) - - (gamma-1.)*u[i]*u[k]; +CEED_QFUNCTION_HELPER void ConvectiveFluxJacobian_Euler(CeedScalar dF[3][5][5], const CeedScalar rho, const CeedScalar u[3], const CeedScalar E, + const CeedScalar gamma) { + CeedScalar u_sq = u[0] * u[0] + u[1] * u[1] + u[2] * u[2]; // Velocity square + for (CeedInt i = 0; i < 3; i++) { // Jacobian matrices for 3 directions + for (CeedInt j = 0; j < 3; j++) { // Rows of each Jacobian matrix + dF[i][j + 1][0] = ((i == j) ? ((gamma - 1.) * (u_sq / 2.)) : 0.) - u[i] * u[j]; + for (CeedInt k = 0; k < 3; k++) { // Columns of each Jacobian matrix + dF[i][0][k + 1] = ((i == k) ? 1. : 0.); + dF[i][j + 1][k + 1] = ((j == k) ? u[i] : 0.) + ((i == k) ? u[j] : 0.) - ((i == j) ? u[k] : 0.) * (gamma - 1.); + dF[i][4][k + 1] = ((i == k) ? (E * gamma / rho - (gamma - 1.) * u_sq / 2.) : 0.) - (gamma - 1.) * u[i] * u[k]; } - dF[i][j+1][4] = ((i==j) ? (gamma-1.) : 0.); + dF[i][j + 1][4] = ((i == j) ? (gamma - 1.) : 0.); } - dF[i][4][0] = u[i] * ((gamma-1.)*u_sq - E*gamma/rho); + dF[i][4][0] = u[i] * ((gamma - 1.) 
* u_sq - E * gamma / rho); dF[i][4][4] = u[i] * gamma; } } @@ -225,42 +218,38 @@ CEED_QFUNCTION_HELPER void ConvectiveFluxJacobian_Euler(CeedScalar dF[3][5][5], // rho(A[i]) = spectral radius of the convective flux Jacobian i, // wave speed in direction i // ***************************************************************************** -CEED_QFUNCTION_HELPER void Tau_spatial(CeedScalar Tau_x[3], - const CeedScalar dXdx[3][3], const CeedScalar u[3], - const CeedScalar sound_speed, const CeedScalar c_tau) { - for (CeedInt i=0; i<3; i++) { +CEED_QFUNCTION_HELPER void Tau_spatial(CeedScalar Tau_x[3], const CeedScalar dXdx[3][3], const CeedScalar u[3], const CeedScalar sound_speed, + const CeedScalar c_tau) { + for (CeedInt i = 0; i < 3; i++) { // length of element in direction i - CeedScalar h = 2 / sqrt(dXdx[0][i]*dXdx[0][i] + dXdx[1][i]*dXdx[1][i] + - dXdx[2][i]*dXdx[2][i]); + CeedScalar h = 2 / sqrt(dXdx[0][i] * dXdx[0][i] + dXdx[1][i] * dXdx[1][i] + dXdx[2][i] * dXdx[2][i]); // fastest wave in direction i CeedScalar fastest_wave = fabs(u[i]) + sound_speed; - Tau_x[i] = c_tau * h / fastest_wave; + Tau_x[i] = c_tau * h / fastest_wave; } } // ***************************************************************************** // This QFunction sets the initial conditions for Euler traveling vortex // ***************************************************************************** -CEED_QFUNCTION(ICsEuler)(void *ctx, CeedInt Q, - const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(ICsEuler)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // Inputs - const CeedScalar (*X)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0]; + const CeedScalar(*X)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0]; // Outputs - CeedScalar (*q0)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; - const EulerContext context = (EulerContext)ctx; + CeedScalar(*q0)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; + const EulerContext context = 
(EulerContext)ctx; CeedPragmaSIMD - // Quadrature Point Loop - for (CeedInt i=0; icurr_time, x, 5, q, ctx); - for (CeedInt j=0; j<5; j++) - q0[j][i] = q[j]; - } // End of Quadrature Point Loop + for (CeedInt j = 0; j < 5; j++) q0[j][i] = q[j]; + } // End of Quadrature Point Loop // Return return 0; @@ -293,113 +282,85 @@ CEED_QFUNCTION(ICsEuler)(void *ctx, CeedInt Q, // g , Gravity // gamma = cp / cv, Specific heat ratio // ***************************************************************************** -CEED_QFUNCTION(Euler)(void *ctx, CeedInt Q, - const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(Euler)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // *INDENT-OFF* // Inputs - const CeedScalar (*q)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], - (*dq)[5][CEED_Q_VLA] = (const CeedScalar(*)[5][CEED_Q_VLA])in[1], - (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2]; + const CeedScalar(*q)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], (*dq)[5][CEED_Q_VLA] = (const CeedScalar(*)[5][CEED_Q_VLA])in[1], + (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2]; // Outputs - CeedScalar (*v)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0], - (*dv)[5][CEED_Q_VLA] = (CeedScalar(*)[5][CEED_Q_VLA])out[1]; + CeedScalar(*v)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0], (*dv)[5][CEED_Q_VLA] = (CeedScalar(*)[5][CEED_Q_VLA])out[1]; - EulerContext context = (EulerContext)ctx; - const CeedScalar c_tau = context->c_tau; - const CeedScalar gamma = 1.4; + EulerContext context = (EulerContext)ctx; + const CeedScalar c_tau = context->c_tau; + const CeedScalar gamma = 1.4; CeedPragmaSIMD - // Quadrature Point Loop - for (CeedInt i=0; istabilization) { - case 0: // Galerkin - break; - case 1: // SU - for (CeedInt j=0; j<3; j++) - for (CeedInt k=0; k<5; k++) - for (CeedInt l=0; l<5; l++) - stab[k][j] += jacob_F_conv[j][k][l] * Tau_x[j] * strong_conv[l]; - - for (CeedInt j=0; j<5; j++) - for (CeedInt k=0; 
k<3; k++) - dv[k][j][i] -= wdetJ*(stab[j][0] * dXdx[k][0] + - stab[j][1] * dXdx[k][1] + - stab[j][2] * dXdx[k][2]); - break; - case 2: // SUPG is not implemented for explicit scheme - break; + case 0: // Galerkin + break; + case 1: // SU + for (CeedInt j = 0; j < 3; j++) { + for (CeedInt k = 0; k < 5; k++) { + for (CeedInt l = 0; l < 5; l++) stab[k][j] += jacob_F_conv[j][k][l] * Tau_x[j] * strong_conv[l]; + } + } + + for (CeedInt j = 0; j < 5; j++) { + for (CeedInt k = 0; k < 3; k++) dv[k][j][i] -= wdetJ * (stab[j][0] * dXdx[k][0] + stab[j][1] * dXdx[k][1] + stab[j][2] * dXdx[k][2]); + } + break; + case 2: // SUPG is not implemented for explicit scheme + break; } - } // End Quadrature Point Loop + } // End Quadrature Point Loop // Return return 0; @@ -460,116 +420,86 @@ CEED_QFUNCTION(Euler)(void *ctx, CeedInt Q, // with implicit time stepping method // // ***************************************************************************** -CEED_QFUNCTION(IFunction_Euler)(void *ctx, CeedInt Q, - const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(IFunction_Euler)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // *INDENT-OFF* // Inputs - const CeedScalar (*q)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], - (*dq)[5][CEED_Q_VLA] = (const CeedScalar(*)[5][CEED_Q_VLA])in[1], - (*q_dot)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2], - (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[3]; + const CeedScalar(*q)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], (*dq)[5][CEED_Q_VLA] = (const CeedScalar(*)[5][CEED_Q_VLA])in[1], + (*q_dot)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[3]; // Outputs - CeedScalar (*v)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0], - (*dv)[5][CEED_Q_VLA] = (CeedScalar(*)[5][CEED_Q_VLA])out[1]; + CeedScalar(*v)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0], (*dv)[5][CEED_Q_VLA] = 
(CeedScalar(*)[5][CEED_Q_VLA])out[1]; - EulerContext context = (EulerContext)ctx; - const CeedScalar c_tau = context->c_tau; - const CeedScalar gamma = 1.4; + EulerContext context = (EulerContext)ctx; + const CeedScalar c_tau = context->c_tau; + const CeedScalar gamma = 1.4; CeedPragmaSIMD - // Quadrature Point Loop - for (CeedInt i=0; istabilization) { - case 0: // Galerkin - break; - case 1: // SU - for (CeedInt j=0; j<3; j++) - for (CeedInt k=0; k<5; k++) - for (CeedInt l=0; l<5; l++) - stab[k][j] += jacob_F_conv[j][k][l] * Tau_x[j] * strong_conv[l]; - - for (CeedInt j=0; j<5; j++) - for (CeedInt k=0; k<3; k++) - dv[k][j][i] += wdetJ*(stab[j][0] * dXdx[k][0] + - stab[j][1] * dXdx[k][1] + - stab[j][2] * dXdx[k][2]); - break; - case 2: // SUPG - for (CeedInt j=0; j<3; j++) - for (CeedInt k=0; k<5; k++) - for (CeedInt l=0; l<5; l++) - stab[k][j] = jacob_F_conv[j][k][l] * Tau_x[j] * strong_res[l]; - - for (CeedInt j=0; j<5; j++) - for (CeedInt k=0; k<3; k++) - dv[k][j][i] += wdetJ*(stab[j][0] * dXdx[k][0] + - stab[j][1] * dXdx[k][1] + - stab[j][2] * dXdx[k][2]); - break; + case 0: // Galerkin + break; + case 1: // SU + for (CeedInt j = 0; j < 3; j++) { + for (CeedInt k = 0; k < 5; k++) { + for (CeedInt l = 0; l < 5; l++) stab[k][j] += jacob_F_conv[j][k][l] * Tau_x[j] * strong_conv[l]; + } + } + + for (CeedInt j = 0; j < 5; j++) { + for (CeedInt k = 0; k < 3; k++) dv[k][j][i] += wdetJ * (stab[j][0] * dXdx[k][0] + stab[j][1] * dXdx[k][1] + stab[j][2] * dXdx[k][2]); + } + break; + case 2: // SUPG + for (CeedInt j = 0; j < 3; j++) { + for (CeedInt k = 0; k < 5; k++) { + for (CeedInt l = 0; l < 5; l++) stab[k][j] = jacob_F_conv[j][k][l] * Tau_x[j] * strong_res[l]; + } + } + + for (CeedInt j = 0; j < 5; j++) { + for (CeedInt k = 0; k < 3; k++) dv[k][j][i] += wdetJ * (stab[j][0] * dXdx[k][0] + stab[j][1] * dXdx[k][1] + stab[j][2] * dXdx[k][2]); + } + break; } - } // End Quadrature Point Loop + } // End Quadrature Point Loop // Return return 0; @@ -646,61 +573,54 @@ 
CEED_QFUNCTION(IFunction_Euler)(void *ctx, CeedInt Q, // and applied weakly. // // ***************************************************************************** -CEED_QFUNCTION(TravelingVortex_Inflow)(void *ctx, CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) { +CEED_QFUNCTION(TravelingVortex_Inflow)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // *INDENT-OFF* // Inputs - const CeedScalar (*q_data_sur)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2]; + const CeedScalar(*q_data_sur)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2]; // Outputs - CeedScalar (*v)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; + CeedScalar(*v)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; // *INDENT-ON* - EulerContext context = (EulerContext)ctx; - const int euler_test = context->euler_test; - const bool implicit = context->implicit; - CeedScalar *mean_velocity = context->mean_velocity; - const CeedScalar cv = 2.5; - const CeedScalar R = 1.; - CeedScalar T_inlet; - CeedScalar P_inlet; + EulerContext context = (EulerContext)ctx; + const int euler_test = context->euler_test; + const bool implicit = context->implicit; + CeedScalar *mean_velocity = context->mean_velocity; + const CeedScalar cv = 2.5; + const CeedScalar R = 1.; + CeedScalar T_inlet; + CeedScalar P_inlet; // For test cases 1 and 3 the background velocity is zero - if (euler_test == 1 || euler_test == 3) - for (CeedInt i=0; i<3; i++) mean_velocity[i] = 0.; + if (euler_test == 1 || euler_test == 3) { + for (CeedInt i = 0; i < 3; i++) mean_velocity[i] = 0.; + } // For test cases 1 and 2, T_inlet = T_inlet = 0.4 if (euler_test == 1 || euler_test == 2) T_inlet = P_inlet = .4; else T_inlet = P_inlet = 1.; CeedPragmaSIMD - // Quadrature Point Loop - for (CeedInt i=0; i 0) { - } else { // inflow - const CeedScalar rho_inlet = P_inlet/(R*T_inlet); - const CeedScalar E_kinetic_inlet = (mean_velocity[0]*mean_velocity[0] + - mean_velocity[1]*mean_velocity[1]) / 2.; + } 
else { // inflow + const CeedScalar rho_inlet = P_inlet / (R * T_inlet); + const CeedScalar E_kinetic_inlet = (mean_velocity[0] * mean_velocity[0] + mean_velocity[1] * mean_velocity[1]) / 2.; // incoming total energy const CeedScalar E_inlet = rho_inlet * (cv * T_inlet + E_kinetic_inlet); @@ -709,15 +629,13 @@ CEED_QFUNCTION(TravelingVortex_Inflow)(void *ctx, CeedInt Q, v[0][i] -= wdetJb * rho_inlet * face_normal; // -- Momentum - for (CeedInt j=0; j<3; j++) - v[j+1][i] -= wdetJb *(rho_inlet * face_normal * mean_velocity[j] + - norm[j] * P_inlet); + for (CeedInt j = 0; j < 3; j++) v[j + 1][i] -= wdetJb * (rho_inlet * face_normal * mean_velocity[j] + norm[j] * P_inlet); // -- Total Energy Density v[4][i] -= wdetJb * face_normal * (E_inlet + P_inlet); } - } // End Quadrature Point Loop + } // End Quadrature Point Loop return 0; } @@ -730,74 +648,61 @@ CEED_QFUNCTION(TravelingVortex_Inflow)(void *ctx, CeedInt Q, // extended to the outflow. // // ***************************************************************************** -CEED_QFUNCTION(Euler_Outflow)(void *ctx, CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) { +CEED_QFUNCTION(Euler_Outflow)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // *INDENT-OFF* // Inputs - const CeedScalar (*q)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], - (*q_data_sur)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2]; + const CeedScalar(*q)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], (*q_data_sur)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2]; // Outputs - CeedScalar (*v)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; + CeedScalar(*v)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; // *INDENT-ON* - EulerContext context = (EulerContext)ctx; - const bool implicit = context->implicit; - CeedScalar *mean_velocity = context->mean_velocity; + EulerContext context = (EulerContext)ctx; + const bool implicit = context->implicit; + CeedScalar *mean_velocity = 
context->mean_velocity; const CeedScalar gamma = 1.4; CeedPragmaSIMD - // Quadrature Point Loop - for (CeedInt i=0; i 0) { // outflow - const CeedScalar E_kinetic = (u[0]*u[0] + u[1]*u[1]) / 2.; - const CeedScalar P = (E - E_kinetic * rho) * (gamma - 1.); // pressure - const CeedScalar u_normal = norm[0]*u[0] + norm[1]*u[1] + - norm[2]*u[2]; // Normal velocity + if (face_normal > 0) { // outflow + const CeedScalar E_kinetic = (u[0] * u[0] + u[1] * u[1]) / 2.; + const CeedScalar P = (E - E_kinetic * rho) * (gamma - 1.); // pressure + const CeedScalar u_normal = norm[0] * u[0] + norm[1] * u[1] + norm[2] * u[2]; // Normal velocity // The Physics // -- Density v[0][i] -= wdetJb * rho * u_normal; // -- Momentum - for (CeedInt j=0; j<3; j++) - v[j+1][i] -= wdetJb *(rho * u_normal * u[j] + norm[j] * P); + for (CeedInt j = 0; j < 3; j++) v[j + 1][i] -= wdetJb * (rho * u_normal * u[j] + norm[j] * P); // -- Total Energy Density v[4][i] -= wdetJb * u_normal * (E + P); } - } // End Quadrature Point Loop + } // End Quadrature Point Loop return 0; } // ***************************************************************************** -#endif // eulervortex_h +#endif // eulervortex_h diff --git a/examples/fluids/qfunctions/freestream_bc.h b/examples/fluids/qfunctions/freestream_bc.h index 58aee20d48..6e9b38ffc3 100644 --- a/examples/fluids/qfunctions/freestream_bc.h +++ b/examples/fluids/qfunctions/freestream_bc.h @@ -8,79 +8,77 @@ /// @file /// Operator for Navier-Stokes example using PETSc -#include "newtonian_types.h" -#include "newtonian_state.h" #include "freestream_bc_type.h" +#include "newtonian_state.h" +#include "newtonian_types.h" typedef struct { CeedScalar left, right; } RoeWeights; -CEED_QFUNCTION_HELPER RoeWeights RoeSetup(CeedScalar rho_left, - CeedScalar rho_right) { +CEED_QFUNCTION_HELPER RoeWeights RoeSetup(CeedScalar rho_left, CeedScalar rho_right) { CeedScalar sqrt_left = sqrt(rho_left), sqrt_right = sqrt(rho_right); - return (RoeWeights) {sqrt_left / (sqrt_left + 
sqrt_right), sqrt_right / (sqrt_left + sqrt_right)}; + return (RoeWeights){sqrt_left / (sqrt_left + sqrt_right), sqrt_right / (sqrt_left + sqrt_right)}; } -CEED_QFUNCTION_HELPER RoeWeights RoeSetup_fwd(CeedScalar rho_left, - CeedScalar rho_right, - CeedScalar drho_left, CeedScalar drho_right) { +CEED_QFUNCTION_HELPER RoeWeights RoeSetup_fwd(CeedScalar rho_left, CeedScalar rho_right, CeedScalar drho_left, CeedScalar drho_right) { CeedScalar sqrt_left = sqrt(rho_left), sqrt_right = sqrt(rho_right); CeedScalar square_sum_root = Square(sqrt_left + sqrt_right); - CeedScalar r_right = (sqrt_left/(2*sqrt_right*square_sum_root)) * drho_right - - (sqrt_right/(2*sqrt_left*square_sum_root)) * drho_left; - CeedScalar r_left = (sqrt_right/(2*sqrt_left*square_sum_root)) * drho_left - - (sqrt_left/(2*sqrt_right*square_sum_root)) * drho_right; - return (RoeWeights) {r_left, r_right}; + CeedScalar r_right = (sqrt_left / (2 * sqrt_right * square_sum_root)) * drho_right - (sqrt_right / (2 * sqrt_left * square_sum_root)) * drho_left; + CeedScalar r_left = (sqrt_right / (2 * sqrt_left * square_sum_root)) * drho_left - (sqrt_left / (2 * sqrt_right * square_sum_root)) * drho_right; + return (RoeWeights){r_left, r_right}; } -CEED_QFUNCTION_HELPER CeedScalar RoeAverage(RoeWeights r, CeedScalar q_left, - CeedScalar q_right) { - return r.left * q_left + r.right * q_right; -} +CEED_QFUNCTION_HELPER CeedScalar RoeAverage(RoeWeights r, CeedScalar q_left, CeedScalar q_right) { return r.left * q_left + r.right * q_right; } -CEED_QFUNCTION_HELPER CeedScalar RoeAverage_fwd(RoeWeights r, RoeWeights dr, - CeedScalar q_left, CeedScalar q_right, - CeedScalar dq_left, CeedScalar dq_right) { - return q_right*dr.right + q_left*dr.left + r.right*dq_right + r.left*dq_left; +CEED_QFUNCTION_HELPER CeedScalar RoeAverage_fwd(RoeWeights r, RoeWeights dr, CeedScalar q_left, CeedScalar q_right, CeedScalar dq_left, + CeedScalar dq_right) { + return q_right * dr.right + q_left * dr.left + r.right * dq_right + r.left 
* dq_left; } -CEED_QFUNCTION_HELPER StateConservative Flux_HLL(State left, State right, - StateConservative flux_left, StateConservative flux_right, - CeedScalar s_left, CeedScalar s_right) { +CEED_QFUNCTION_HELPER StateConservative Flux_HLL(State left, State right, StateConservative flux_left, StateConservative flux_right, + CeedScalar s_left, CeedScalar s_right) { CeedScalar U_left[5], U_right[5], F_right[5], F_left[5], F_hll[5]; - UnpackState_U(left.U, U_left); UnpackState_U(right.U, U_right); - UnpackState_U(flux_left, F_left); UnpackState_U(flux_right, F_right); - for (int i=0; i<5; i++) { - F_hll[i] = (s_right * F_left[i] - s_left * F_right[i] + - s_left * s_right * (U_right[i] - U_left[i])) / (s_right - s_left); + UnpackState_U(left.U, U_left); + UnpackState_U(right.U, U_right); + UnpackState_U(flux_left, F_left); + UnpackState_U(flux_right, F_right); + for (int i = 0; i < 5; i++) { + F_hll[i] = (s_right * F_left[i] - s_left * F_right[i] + s_left * s_right * (U_right[i] - U_left[i])) / (s_right - s_left); } - return (StateConservative) {F_hll[0], {F_hll[1], F_hll[2], F_hll[3]}, F_hll[4]}; + return (StateConservative){ + F_hll[0], {F_hll[1], F_hll[2], F_hll[3]}, + F_hll[4] + }; } -CEED_QFUNCTION_HELPER StateConservative Flux_HLL_fwd(State left, State right, - State dleft, State dright, - StateConservative flux_left, StateConservative flux_right, - StateConservative dflux_left, StateConservative dflux_right, - CeedScalar S_l, CeedScalar S_r, CeedScalar dS_l, CeedScalar dS_r) { +CEED_QFUNCTION_HELPER StateConservative Flux_HLL_fwd(State left, State right, State dleft, State dright, StateConservative flux_left, + StateConservative flux_right, StateConservative dflux_left, StateConservative dflux_right, + CeedScalar S_l, CeedScalar S_r, CeedScalar dS_l, CeedScalar dS_r) { CeedScalar U_l[5], U_r[5], F_r[5], F_l[5]; - UnpackState_U(left.U, U_l); UnpackState_U(right.U, U_r); - UnpackState_U(flux_left, F_l); UnpackState_U(flux_right, F_r); + UnpackState_U(left.U, 
U_l); + UnpackState_U(right.U, U_r); + UnpackState_U(flux_left, F_l); + UnpackState_U(flux_right, F_r); CeedScalar dU_l[5], dU_r[5], dF_r[5], dF_l[5], dF_hll[5] = {0.}; - UnpackState_U(dleft.U, dU_l); UnpackState_U(dright.U, dU_r); - UnpackState_U(dflux_left, dF_l); UnpackState_U(dflux_right, dF_r); - for (int i=0; i<5; i++) { - const CeedScalar U_diff = U_r[i] - U_l[i]; - const CeedScalar S_diff = S_r - S_l; - const CeedScalar F_hll_denom = S_r*F_l[i] - S_l*F_r[i] + S_l*S_r*U_diff; - - dF_hll[i] += (( F_l[i] + S_r*U_diff)*S_diff - F_hll_denom)/Square(S_diff)*dS_r; - dF_hll[i] += ((-F_r[i] + S_r*U_diff)*S_diff + F_hll_denom)/Square(S_diff)*dS_l; - dF_hll[i] += (S_r*dF_l[i] - S_l*dF_r[i] + S_r*S_l*dU_r[i] - S_r*S_l*dU_l[i]) - / S_diff; + UnpackState_U(dleft.U, dU_l); + UnpackState_U(dright.U, dU_r); + UnpackState_U(dflux_left, dF_l); + UnpackState_U(dflux_right, dF_r); + for (int i = 0; i < 5; i++) { + const CeedScalar U_diff = U_r[i] - U_l[i]; + const CeedScalar S_diff = S_r - S_l; + const CeedScalar F_hll_denom = S_r * F_l[i] - S_l * F_r[i] + S_l * S_r * U_diff; + + dF_hll[i] += ((F_l[i] + S_r * U_diff) * S_diff - F_hll_denom) / Square(S_diff) * dS_r; + dF_hll[i] += ((-F_r[i] + S_r * U_diff) * S_diff + F_hll_denom) / Square(S_diff) * dS_l; + dF_hll[i] += (S_r * dF_l[i] - S_l * dF_r[i] + S_r * S_l * dU_r[i] - S_r * S_l * dU_l[i]) / S_diff; } - return (StateConservative) {dF_hll[0], {dF_hll[1], dF_hll[2], dF_hll[3]}, dF_hll[4]}; + return (StateConservative){ + dF_hll[0], {dF_hll[1], dF_hll[2], dF_hll[3]}, + dF_hll[4] + }; } // ***************************************************************************** @@ -94,17 +92,14 @@ CEED_QFUNCTION_HELPER StateConservative Flux_HLL_fwd(State left, State right, // @param right Fluid state of the domain exterior (free stream conditions) // @param normal Normalized, outward facing boundary normal vector // ***************************************************************************** -CEED_QFUNCTION_HELPER StateConservative 
Harten_Lax_VanLeer_Flux( - NewtonianIdealGasContext gas, State left, State right, - const CeedScalar normal[3]) { - +CEED_QFUNCTION_HELPER StateConservative Harten_Lax_VanLeer_Flux(NewtonianIdealGasContext gas, State left, State right, const CeedScalar normal[3]) { const CeedScalar gamma = HeatCapacityRatio(gas); StateConservative flux_left = FluxInviscidDotNormal(gas, left, normal); StateConservative flux_right = FluxInviscidDotNormal(gas, right, normal); StateConservative RiemannFlux_HLL; - CeedScalar u_left = Dot3(left.Y.velocity, normal); + CeedScalar u_left = Dot3(left.Y.velocity, normal); CeedScalar u_right = Dot3(right.Y.velocity, normal); RoeWeights r = RoeSetup(left.U.density, right.U.density); @@ -120,7 +115,7 @@ CEED_QFUNCTION_HELPER StateConservative Harten_Lax_VanLeer_Flux( CeedScalar a_roe = sqrt((gamma - 1) * (H_roe - 0.5 * Square(u_roe))); // Einfeldt (1988) justifies (and Toro's book repeats) that Roe speeds can be used here. - CeedScalar s_left = u_roe - a_roe; + CeedScalar s_left = u_roe - a_roe; CeedScalar s_right = u_roe + a_roe; // Compute HLL flux @@ -144,18 +139,14 @@ CEED_QFUNCTION_HELPER StateConservative Harten_Lax_VanLeer_Flux( // @param dright Derivative of fluid state of the domain exterior (free stream conditions) // @param normal Normalized, outward facing boundary normal vector // ***************************************************************************** -CEED_QFUNCTION_HELPER StateConservative Harten_Lax_VanLeer_Flux_fwd( - NewtonianIdealGasContext gas, State left, State right, - State dleft, State dright, const CeedScalar normal[3]) { - +CEED_QFUNCTION_HELPER StateConservative Harten_Lax_VanLeer_Flux_fwd(NewtonianIdealGasContext gas, State left, State right, State dleft, State dright, + const CeedScalar normal[3]) { const CeedScalar gamma = HeatCapacityRatio(gas); StateConservative flux_left = FluxInviscidDotNormal(gas, left, normal); StateConservative flux_right = FluxInviscidDotNormal(gas, right, normal); - StateConservative 
dflux_left = FluxInviscidDotNormal_fwd(gas, left, dleft, - normal); - StateConservative dflux_right = FluxInviscidDotNormal_fwd(gas, right, dright, - normal); + StateConservative dflux_left = FluxInviscidDotNormal_fwd(gas, left, dleft, normal); + StateConservative dflux_right = FluxInviscidDotNormal_fwd(gas, right, dright, normal); CeedScalar u_left = Dot3(left.Y.velocity, normal); CeedScalar u_right = Dot3(right.Y.velocity, normal); @@ -163,13 +154,12 @@ CEED_QFUNCTION_HELPER StateConservative Harten_Lax_VanLeer_Flux_fwd( CeedScalar du_right = Dot3(dright.Y.velocity, normal); RoeWeights r = RoeSetup(left.U.density, right.U.density); - RoeWeights dr = RoeSetup_fwd(left.U.density, right.U.density, - dleft.U.density, dright.U.density); + RoeWeights dr = RoeSetup_fwd(left.U.density, right.U.density, dleft.U.density, dright.U.density); // Speed estimate // Roe average eigenvalues for left and right non-linear waves // Stability requires that these speed estimates are *at least* // as fast as the physical wave speeds. - CeedScalar u_roe = RoeAverage(r, u_left, u_right); + CeedScalar u_roe = RoeAverage(r, u_left, u_right); CeedScalar du_roe = RoeAverage_fwd(r, dr, u_left, u_right, du_left, du_right); CeedScalar H_left = TotalSpecificEnthalpy(gas, left); @@ -180,8 +170,7 @@ CEED_QFUNCTION_HELPER StateConservative Harten_Lax_VanLeer_Flux_fwd( CeedScalar H_roe = RoeAverage(r, H_left, H_right); CeedScalar dH_roe = RoeAverage_fwd(r, dr, H_left, H_right, dH_left, dH_right); CeedScalar a_roe = sqrt((gamma - 1) * (H_roe - 0.5 * Square(u_roe))); - CeedScalar da_roe = 0.5*(gamma-1)/sqrt(H_roe) * dH_roe - - 0.5*sqrt(gamma-1)*u_roe/sqrt(H_roe - 0.5*Square(u_roe)) * du_roe; + CeedScalar da_roe = 0.5 * (gamma - 1) / sqrt(H_roe) * dH_roe - 0.5 * sqrt(gamma - 1) * u_roe / sqrt(H_roe - 0.5 * Square(u_roe)) * du_roe; // Einfeldt (1988) justifies (and Toro's book repeats) that Roe speeds can be used here. 
CeedScalar s_left = u_roe - a_roe; @@ -196,9 +185,7 @@ CEED_QFUNCTION_HELPER StateConservative Harten_Lax_VanLeer_Flux_fwd( } else if (s_right <= 0) { dRiemannFlux_HLL = dflux_right; } else { - dRiemannFlux_HLL = Flux_HLL_fwd(left, right, dleft, dright, - flux_left, flux_right, dflux_left, dflux_right, - s_left, s_right, ds_left, ds_right); + dRiemannFlux_HLL = Flux_HLL_fwd(left, right, dleft, dright, flux_left, flux_right, dflux_left, dflux_right, s_left, s_right, ds_left, ds_right); } return dRiemannFlux_HLL; } @@ -206,103 +193,83 @@ CEED_QFUNCTION_HELPER StateConservative Harten_Lax_VanLeer_Flux_fwd( // ***************************************************************************** // Freestream Boundary Condition // ***************************************************************************** -CEED_QFUNCTION_HELPER int Freestream(void *ctx, CeedInt Q, - const CeedScalar *const *in, CeedScalar *const *out, - StateFromQi_t StateFromQi, StateFromQi_fwd_t StateFromQi_fwd) { +CEED_QFUNCTION_HELPER int Freestream(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out, StateFromQi_t StateFromQi, + StateFromQi_fwd_t StateFromQi_fwd) { //*INDENT-OFF* - const CeedScalar (*q)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], - (*q_data_sur)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2], - (*x)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[3]; + const CeedScalar(*q)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], (*q_data_sur)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2], + (*x)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[3]; - CeedScalar (*v)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA]) out[0], - (*jac_data_sur)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA]) out[1]; + CeedScalar(*v)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0], (*jac_data_sur)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[1]; //*INDENT-ON* - const FreestreamContext context = (FreestreamContext) ctx; - const NewtonianIdealGasContext newt_ctx = 
&context->newtonian_ctx; - const bool is_implicit = newt_ctx->is_implicit; + const FreestreamContext context = (FreestreamContext)ctx; + const NewtonianIdealGasContext newt_ctx = &context->newtonian_ctx; + const bool is_implicit = newt_ctx->is_implicit; - CeedPragmaSIMD - for(CeedInt i=0; iS_infty, norm); - CeedScalar Flux[5]; + const CeedScalar norm[3] = {q_data_sur[1][i], q_data_sur[2][i], q_data_sur[3][i]}; + + StateConservative HLL_flux = Harten_Lax_VanLeer_Flux(newt_ctx, s, context->S_infty, norm); + CeedScalar Flux[5]; UnpackState_U(HLL_flux, Flux); - for (CeedInt j=0; j<5; j++) v[j][i] = -wdetJb * Flux[j]; + for (CeedInt j = 0; j < 5; j++) v[j][i] = -wdetJb * Flux[j]; - for (int j=0; j<5; j++) jac_data_sur[j][i] = qi[j]; + for (int j = 0; j < 5; j++) jac_data_sur[j][i] = qi[j]; } return 0; } -CEED_QFUNCTION(Freestream_Conserv)(void *ctx, CeedInt Q, - const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(Freestream_Conserv)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { return Freestream(ctx, Q, in, out, StateFromU, StateFromU_fwd); } -CEED_QFUNCTION(Freestream_Prim)(void *ctx, CeedInt Q, - const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(Freestream_Prim)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { return Freestream(ctx, Q, in, out, StateFromY, StateFromY_fwd); } -CEED_QFUNCTION_HELPER int Freestream_Jacobian(void *ctx, CeedInt Q, - const CeedScalar *const *in, CeedScalar *const *out, - StateFromQi_t StateFromQi, StateFromQi_fwd_t StateFromQi_fwd) { +CEED_QFUNCTION_HELPER int Freestream_Jacobian(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out, StateFromQi_t StateFromQi, + StateFromQi_fwd_t StateFromQi_fwd) { //*INDENT-OFF* - const CeedScalar (*dq)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], - (*q_data_sur)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2], - (*x)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[3], - 
(*jac_data_sur)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[4]; + const CeedScalar(*dq)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], (*q_data_sur)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2], + (*x)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[3], (*jac_data_sur)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[4]; - CeedScalar (*v)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA]) out[0]; + CeedScalar(*v)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; //*INDENT-ON* - const FreestreamContext context = (FreestreamContext) ctx; - const NewtonianIdealGasContext newt_ctx = &context->newtonian_ctx; - const bool is_implicit = newt_ctx->is_implicit; - const State dS_infty = {{0}}; + const FreestreamContext context = (FreestreamContext)ctx; + const NewtonianIdealGasContext newt_ctx = &context->newtonian_ctx; + const bool is_implicit = newt_ctx->is_implicit; + const State dS_infty = {{0}}; - CeedPragmaSIMD - for(CeedInt i=0; iS_infty, ds, dS_infty, norm); - CeedScalar Flux[5]; + StateConservative dHLL_flux = Harten_Lax_VanLeer_Flux_fwd(newt_ctx, s, context->S_infty, ds, dS_infty, norm); + CeedScalar Flux[5]; UnpackState_U(dHLL_flux, Flux); - for (CeedInt j=0; j<5; j++) v[j][i] = -wdetJb * Flux[j]; + for (CeedInt j = 0; j < 5; j++) v[j][i] = -wdetJb * Flux[j]; } return 0; } -CEED_QFUNCTION(Freestream_Jacobian_Conserv)(void *ctx, CeedInt Q, - const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(Freestream_Jacobian_Conserv)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { return Freestream_Jacobian(ctx, Q, in, out, StateFromU, StateFromU_fwd); } -CEED_QFUNCTION(Freestream_Jacobian_Prim)(void *ctx, CeedInt Q, - const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(Freestream_Jacobian_Prim)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { return Freestream_Jacobian(ctx, Q, in, out, StateFromY, StateFromY_fwd); } diff --git 
a/examples/fluids/qfunctions/freestream_bc_type.h b/examples/fluids/qfunctions/freestream_bc_type.h index 276353518d..88c678ed0d 100644 --- a/examples/fluids/qfunctions/freestream_bc_type.h +++ b/examples/fluids/qfunctions/freestream_bc_type.h @@ -8,13 +8,13 @@ #ifndef freestream_bc_type_h #define freestream_bc_type_h -#include "newtonian_types.h" #include "newtonian_state.h" +#include "newtonian_types.h" typedef struct FreestreamContext_ *FreestreamContext; struct FreestreamContext_ { struct NewtonianIdealGasContext_ newtonian_ctx; - State S_infty; + State S_infty; }; #endif diff --git a/examples/fluids/qfunctions/mass.h b/examples/fluids/qfunctions/mass.h index ab8c1d23d2..1713712308 100644 --- a/examples/fluids/qfunctions/mass.h +++ b/examples/fluids/qfunctions/mass.h @@ -25,19 +25,16 @@ // v - Output vector at quadrature points // // ***************************************************************************** -CEED_QFUNCTION(Mass)(void *ctx, CeedInt Q, - const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(Mass)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // *INDENT-OFF* // Inputs - const CeedScalar (*u)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], - (*q_data) = in[1]; + const CeedScalar(*u)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], (*q_data) = in[1]; // Outputs - CeedScalar (*v)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; + CeedScalar(*v)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; // *INDENT-ON* - CeedPragmaSIMD - for (CeedInt i=0; i #include #include + #include "newtonian_state.h" #include "newtonian_types.h" #include "stabilization.h" @@ -23,47 +23,44 @@ // ***************************************************************************** // This QFunction sets a "still" initial condition for generic Newtonian IG problems // ***************************************************************************** -CEED_QFUNCTION(ICsNewtonianIG)(void *ctx, CeedInt Q, - const CeedScalar *const 
*in, CeedScalar *const *out) { +CEED_QFUNCTION(ICsNewtonianIG)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // Inputs - const CeedScalar (*X)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0]; + const CeedScalar(*X)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0]; // Outputs - CeedScalar (*q0)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; + CeedScalar(*q0)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; // Context const SetupContext context = (SetupContext)ctx; - const CeedScalar theta0 = context->theta0; - const CeedScalar P0 = context->P0; - const CeedScalar cv = context->cv; - const CeedScalar cp = context->cp; - const CeedScalar *g = context->g; - const CeedScalar Rd = cp - cv; + const CeedScalar theta0 = context->theta0; + const CeedScalar P0 = context->P0; + const CeedScalar cv = context->cv; + const CeedScalar cp = context->cp; + const CeedScalar *g = context->g; + const CeedScalar Rd = cp - cv; // Quadrature Point Loop - CeedPragmaSIMD - for (CeedInt i=0; itheta0; - const CeedScalar P0 = context->P0; + const CeedScalar theta0 = context->theta0; + const CeedScalar P0 = context->P0; // Quadrature Point Loop - CeedPragmaSIMD - for (CeedInt i=0; ig; - const CeedScalar dt = context->dt; + const CeedScalar *g = context->g; + const CeedScalar dt = context->dt; CeedPragmaSIMD - // Quadrature Point Loop - for (CeedInt i=0; ig; - const CeedScalar dt = context->dt; + const CeedScalar *g = context->g; + const CeedScalar dt = context->dt; CeedPragmaSIMD - // Quadrature Point Loop - for (CeedInt i=0; ig; + const CeedScalar *g = context->g; CeedPragmaSIMD - // Quadrature Point Loop - for (CeedInt i=0; iijacobian_time_shift * dU[j] - dbody_force[j]); + for (int j = 0; j < 5; j++) v[j][i] = wdetJ * (context->ijacobian_time_shift * dU[j] - dbody_force[j]); // -- Stabilization method: none (Galerkin), SU, or SUPG CeedScalar dstab[5][3], U_dot[5] = {0}; - for (CeedInt j=0; j<5; j++) U_dot[j] = context->ijacobian_time_shift * dU[j]; + 
for (CeedInt j = 0; j < 5; j++) U_dot[j] = context->ijacobian_time_shift * dU[j]; Stabilization(context, s, Tau_d, grad_ds, U_dot, dbody_force, x_i, dstab); - for (int j=0; j<5; j++) - for (int k=0; k<3; k++) - Grad_v[k][j][i] += wdetJ*(dstab[j][0] * dXdx[k][0] + - dstab[j][1] * dXdx[k][1] + - dstab[j][2] * dXdx[k][2]); - - } // End Quadrature Point Loop + for (int j = 0; j < 5; j++) { + for (int k = 0; k < 3; k++) Grad_v[k][j][i] += wdetJ * (dstab[j][0] * dXdx[k][0] + dstab[j][1] * dXdx[k][1] + dstab[j][2] * dXdx[k][2]); + } + } // End Quadrature Point Loop return 0; } -CEED_QFUNCTION(IJacobian_Newtonian_Conserv)(void *ctx, CeedInt Q, - const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(IJacobian_Newtonian_Conserv)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { return IJacobian_Newtonian(ctx, Q, in, out, StateFromU, StateFromU_fwd); } -CEED_QFUNCTION(IJacobian_Newtonian_Prim)(void *ctx, CeedInt Q, - const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(IJacobian_Newtonian_Prim)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { return IJacobian_Newtonian(ctx, Q, in, out, StateFromY, StateFromY_fwd); } // ***************************************************************************** // Compute boundary integral (ie. 
for strongly set inflows) // ***************************************************************************** -CEED_QFUNCTION_HELPER int BoundaryIntegral(void *ctx, CeedInt Q, - const CeedScalar *const *in, CeedScalar *const *out, - StateFromQi_t StateFromQi, StateFromQi_fwd_t StateFromQi_fwd) { - +CEED_QFUNCTION_HELPER int BoundaryIntegral(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out, StateFromQi_t StateFromQi, + StateFromQi_fwd_t StateFromQi_fwd) { //*INDENT-OFF* - const CeedScalar (*q)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], - (*Grad_q)[5][CEED_Q_VLA] = (const CeedScalar(*)[5][CEED_Q_VLA])in[1], - (*q_data_sur)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2], - (*x)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[3]; + const CeedScalar(*q)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], (*Grad_q)[5][CEED_Q_VLA] = (const CeedScalar(*)[5][CEED_Q_VLA])in[1], + (*q_data_sur)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2], (*x)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[3]; - CeedScalar (*v)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA]) out[0], - (*jac_data_sur)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA]) out[1]; + CeedScalar(*v)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0], (*jac_data_sur)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[1]; //*INDENT-ON* - const NewtonianIdealGasContext context = (NewtonianIdealGasContext) ctx; - const bool is_implicit = context->is_implicit; + const NewtonianIdealGasContext context = (NewtonianIdealGasContext)ctx; + const bool is_implicit = context->is_implicit; - CeedPragmaSIMD - for(CeedInt i=0; iis_implicit; + const NewtonianIdealGasContext context = (NewtonianIdealGasContext)ctx; + const bool implicit = context->is_implicit; CeedPragmaSIMD - // Quadrature Point Loop - for (CeedInt i=0; iis_implicit; - const CeedScalar P0 = context->P0; + const NewtonianIdealGasContext context = (NewtonianIdealGasContext)ctx; + const bool implicit = context->is_implicit; + const 
CeedScalar P0 = context->P0; CeedPragmaSIMD - // Quadrature Point Loop - for (CeedInt i=0; iis_implicit; + const NewtonianIdealGasContext context = (NewtonianIdealGasContext)ctx; + const bool implicit = context->is_implicit; CeedPragmaSIMD - // Quadrature Point Loop - for (CeedInt i=0; iP0; ds.Y.pressure = 0.; State grad_ds[3]; - for (CeedInt j=0; j<3; j++) { + for (CeedInt j = 0; j < 3; j++) { CeedScalar dx_i[3] = {0}, dqi_j[5]; - for (CeedInt k=0; k<5; k++) - dqi_j[k] = Grad_dq[0][k][i] * dXdx[0][j] + - Grad_dq[1][k][i] * dXdx[1][j]; - dx_i[j] = 1.; + for (CeedInt k = 0; k < 5; k++) dqi_j[k] = Grad_dq[0][k][i] * dXdx[0][j] + Grad_dq[1][k][i] * dXdx[1][j]; + dx_i[j] = 1.; grad_ds[j] = StateFromQi_fwd(context, s, dqi_j, x_i, dx_i); } @@ -769,18 +687,17 @@ CEED_QFUNCTION_HELPER int PressureOutflow_Jacobian(void *ctx, CeedInt Q, CeedScalar dFlux[5]; FluxTotal_Boundary(dF_inviscid, dstress, dFe, norm, dFlux); - for (int j=0; j<5; j++) v[j][i] = -wdetJb * dFlux[j]; - } // End Quadrature Point Loop + for (int j = 0; j < 5; j++) v[j][i] = -wdetJb * dFlux[j]; + } // End Quadrature Point Loop return 0; } -CEED_QFUNCTION(PressureOutflow_Jacobian_Conserv)(void *ctx, CeedInt Q, - const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(PressureOutflow_Jacobian_Conserv)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { return PressureOutflow_Jacobian(ctx, Q, in, out, StateFromU, StateFromU_fwd); } -CEED_QFUNCTION(PressureOutflow_Jacobian_Prim)(void *ctx, CeedInt Q, - const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(PressureOutflow_Jacobian_Prim)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { return PressureOutflow_Jacobian(ctx, Q, in, out, StateFromY, StateFromY_fwd); } -#endif // newtonian_h + +#endif // newtonian_h diff --git a/examples/fluids/qfunctions/newtonian_state.h b/examples/fluids/qfunctions/newtonian_state.h index 86b8190e84..fb51421d59 100644 --- 
a/examples/fluids/qfunctions/newtonian_state.h +++ b/examples/fluids/qfunctions/newtonian_state.h @@ -8,12 +8,12 @@ /// @file /// Structs and helper functions regarding the state of a newtonian simulation - #ifndef newtonian_state_h #define newtonian_state_h #include #include + #include "newtonian_types.h" #include "utils.h" @@ -31,80 +31,61 @@ typedef struct { typedef struct { StateConservative U; - StatePrimitive Y; + StatePrimitive Y; } State; CEED_QFUNCTION_HELPER void UnpackState_U(StateConservative s, CeedScalar U[5]) { U[0] = s.density; - for (int i=0; i<3; i++) U[i+1] = s.momentum[i]; + for (int i = 0; i < 3; i++) U[i + 1] = s.momentum[i]; U[4] = s.E_total; } CEED_QFUNCTION_HELPER void UnpackState_Y(StatePrimitive s, CeedScalar Y[5]) { Y[0] = s.pressure; - for (int i=0; i<3; i++) Y[i+1] = s.velocity[i]; + for (int i = 0; i < 3; i++) Y[i + 1] = s.velocity[i]; Y[4] = s.temperature; } -CEED_QFUNCTION_HELPER CeedScalar HeatCapacityRatio( - NewtonianIdealGasContext gas) { - return gas->cp / gas->cv; -} +CEED_QFUNCTION_HELPER CeedScalar HeatCapacityRatio(NewtonianIdealGasContext gas) { return gas->cp / gas->cv; } -CEED_QFUNCTION_HELPER CeedScalar GasConstant( - NewtonianIdealGasContext gas) { - return gas->cp - gas->cv; -} +CEED_QFUNCTION_HELPER CeedScalar GasConstant(NewtonianIdealGasContext gas) { return gas->cp - gas->cv; } -CEED_QFUNCTION_HELPER CeedScalar Prandtl(NewtonianIdealGasContext gas) { - return gas->cp * gas->mu / gas->k; -} +CEED_QFUNCTION_HELPER CeedScalar Prandtl(NewtonianIdealGasContext gas) { return gas->cp * gas->mu / gas->k; } -CEED_QFUNCTION_HELPER CeedScalar SoundSpeed(NewtonianIdealGasContext gas, - CeedScalar T) { - return sqrt(gas->cp * (HeatCapacityRatio(gas) - 1.) * T); -} +CEED_QFUNCTION_HELPER CeedScalar SoundSpeed(NewtonianIdealGasContext gas, CeedScalar T) { return sqrt(gas->cp * (HeatCapacityRatio(gas) - 1.) 
* T); } -CEED_QFUNCTION_HELPER CeedScalar Mach(NewtonianIdealGasContext gas, - CeedScalar T, CeedScalar u) { - return u / SoundSpeed(gas, T); -} +CEED_QFUNCTION_HELPER CeedScalar Mach(NewtonianIdealGasContext gas, CeedScalar T, CeedScalar u) { return u / SoundSpeed(gas, T); } -CEED_QFUNCTION_HELPER CeedScalar TotalSpecificEnthalpy( - NewtonianIdealGasContext gas, const State s) { +CEED_QFUNCTION_HELPER CeedScalar TotalSpecificEnthalpy(NewtonianIdealGasContext gas, const State s) { // Ignoring potential energy - CeedScalar e_internal = gas->cv*s.Y.temperature; - CeedScalar e_kinetic = 0.5*Dot3(s.Y.velocity, s.Y.velocity); - return e_internal + e_kinetic + s.Y.pressure/s.U.density; + CeedScalar e_internal = gas->cv * s.Y.temperature; + CeedScalar e_kinetic = 0.5 * Dot3(s.Y.velocity, s.Y.velocity); + return e_internal + e_kinetic + s.Y.pressure / s.U.density; } -CEED_QFUNCTION_HELPER CeedScalar TotalSpecificEnthalpy_fwd( - NewtonianIdealGasContext gas, const State s, const State ds) { +CEED_QFUNCTION_HELPER CeedScalar TotalSpecificEnthalpy_fwd(NewtonianIdealGasContext gas, const State s, const State ds) { // Ignoring potential energy CeedScalar de_kinetic = Dot3(ds.Y.velocity, s.Y.velocity); CeedScalar de_internal = gas->cv * ds.Y.temperature; - return de_internal + de_kinetic + ds.Y.pressure/s.U.density - - s.Y.pressure/Square(s.U.density)*ds.U.density; + return de_internal + de_kinetic + ds.Y.pressure / s.U.density - s.Y.pressure / Square(s.U.density) * ds.U.density; } -CEED_QFUNCTION_HELPER StatePrimitive StatePrimitiveFromConservative( - NewtonianIdealGasContext gas, StateConservative U, const CeedScalar x[3]) { +CEED_QFUNCTION_HELPER StatePrimitive StatePrimitiveFromConservative(NewtonianIdealGasContext gas, StateConservative U, const CeedScalar x[3]) { StatePrimitive Y; - for (CeedInt i=0; i<3; i++) Y.velocity[i] = U.momentum[i] / U.density; + for (CeedInt i = 0; i < 3; i++) Y.velocity[i] = U.momentum[i] / U.density; CeedScalar e_kinetic = .5 * Dot3(Y.velocity, 
Y.velocity); CeedScalar e_potential = -Dot3(gas->g, x); CeedScalar e_total = U.E_total / U.density; CeedScalar e_internal = e_total - e_kinetic - e_potential; Y.temperature = e_internal / gas->cv; - Y.pressure = (HeatCapacityRatio(gas) - 1) * U.density * e_internal; + Y.pressure = (HeatCapacityRatio(gas) - 1) * U.density * e_internal; return Y; } -CEED_QFUNCTION_HELPER StatePrimitive StatePrimitiveFromConservative_fwd( - NewtonianIdealGasContext gas, State s, StateConservative dU, - const CeedScalar x[3], const CeedScalar dx[3]) { +CEED_QFUNCTION_HELPER StatePrimitive StatePrimitiveFromConservative_fwd(NewtonianIdealGasContext gas, State s, StateConservative dU, + const CeedScalar x[3], const CeedScalar dx[3]) { StatePrimitive dY; - for (CeedInt i=0; i<3; i++) { + for (CeedInt i = 0; i < 3; i++) { dY.velocity[i] = (dU.momentum[i] - s.Y.velocity[i] * dU.density) / s.U.density; } CeedScalar e_kinetic = .5 * Dot3(s.Y.velocity, s.Y.velocity); @@ -116,31 +97,27 @@ CEED_QFUNCTION_HELPER StatePrimitive StatePrimitiveFromConservative_fwd( CeedScalar e_internal = e_total - e_kinetic - e_potential; CeedScalar de_internal = de_total - de_kinetic - de_potential; dY.temperature = de_internal / gas->cv; - dY.pressure = (HeatCapacityRatio(gas) - 1) - * (dU.density * e_internal + s.U.density * de_internal); + dY.pressure = (HeatCapacityRatio(gas) - 1) * (dU.density * e_internal + s.U.density * de_internal); return dY; } -CEED_QFUNCTION_HELPER StateConservative StateConservativeFromPrimitive( - NewtonianIdealGasContext gas, StatePrimitive Y, const CeedScalar x[3]) { +CEED_QFUNCTION_HELPER StateConservative StateConservativeFromPrimitive(NewtonianIdealGasContext gas, StatePrimitive Y, const CeedScalar x[3]) { StateConservative U; U.density = Y.pressure / (GasConstant(gas) * Y.temperature); - for (int i=0; i<3; i++) U.momentum[i] = U.density*Y.velocity[i]; + for (int i = 0; i < 3; i++) U.momentum[i] = U.density * Y.velocity[i]; CeedScalar e_internal = gas->cv * Y.temperature; 
CeedScalar e_kinetic = .5 * Dot3(Y.velocity, Y.velocity); CeedScalar e_potential = -Dot3(gas->g, x); CeedScalar e_total = e_internal + e_kinetic + e_potential; - U.E_total = U.density*e_total; + U.E_total = U.density * e_total; return U; } -CEED_QFUNCTION_HELPER StateConservative StateConservativeFromPrimitive_fwd( - NewtonianIdealGasContext gas, State s, StatePrimitive dY, - const CeedScalar x[3], const CeedScalar dx[3]) { +CEED_QFUNCTION_HELPER StateConservative StateConservativeFromPrimitive_fwd(NewtonianIdealGasContext gas, State s, StatePrimitive dY, + const CeedScalar x[3], const CeedScalar dx[3]) { StateConservative dU; - dU.density = (dY.pressure * s.Y.temperature - s.Y.pressure * dY.temperature) / - (GasConstant(gas) * s.Y.temperature * s.Y.temperature); - for (int i=0; i<3; i++) { + dU.density = (dY.pressure * s.Y.temperature - s.Y.pressure * dY.temperature) / (GasConstant(gas) * s.Y.temperature * s.Y.temperature); + for (int i = 0; i < 3; i++) { dU.momentum[i] = dU.density * s.Y.velocity[i] + s.U.density * dY.velocity[i]; } CeedScalar e_kinetic = .5 * Dot3(s.Y.velocity, s.Y.velocity); @@ -151,208 +128,169 @@ CEED_QFUNCTION_HELPER StateConservative StateConservativeFromPrimitive_fwd( CeedScalar de_internal = gas->cv * dY.temperature; CeedScalar e_total = e_internal + e_kinetic + e_potential; CeedScalar de_total = de_internal + de_kinetic + de_potential; - dU.E_total = dU.density*e_total + s.U.density*de_total; + dU.E_total = dU.density * e_total + s.U.density * de_total; return dU; } // Function pointer types for generic state array -> State struct functions -typedef State (*StateFromQi_t)(NewtonianIdealGasContext gas, - const CeedScalar qi[5], const CeedScalar x[3]); -typedef State (*StateFromQi_fwd_t)(NewtonianIdealGasContext gas, - State s, const CeedScalar dqi[5], - const CeedScalar x[3], const CeedScalar dx[3]); - -CEED_QFUNCTION_HELPER State StateFromU(NewtonianIdealGasContext gas, - const CeedScalar U[5], const CeedScalar x[3]) { +typedef State 
(*StateFromQi_t)(NewtonianIdealGasContext gas, const CeedScalar qi[5], const CeedScalar x[3]); +typedef State (*StateFromQi_fwd_t)(NewtonianIdealGasContext gas, State s, const CeedScalar dqi[5], const CeedScalar x[3], const CeedScalar dx[3]); + +CEED_QFUNCTION_HELPER State StateFromU(NewtonianIdealGasContext gas, const CeedScalar U[5], const CeedScalar x[3]) { State s; s.U.density = U[0]; s.U.momentum[0] = U[1]; s.U.momentum[1] = U[2]; s.U.momentum[2] = U[3]; s.U.E_total = U[4]; - s.Y = StatePrimitiveFromConservative(gas, s.U, x); + s.Y = StatePrimitiveFromConservative(gas, s.U, x); return s; } -CEED_QFUNCTION_HELPER State StateFromU_fwd(NewtonianIdealGasContext gas, - State s, const CeedScalar dU[5], - const CeedScalar x[3], const CeedScalar dx[3]) { +CEED_QFUNCTION_HELPER State StateFromU_fwd(NewtonianIdealGasContext gas, State s, const CeedScalar dU[5], const CeedScalar x[3], + const CeedScalar dx[3]) { State ds; ds.U.density = dU[0]; ds.U.momentum[0] = dU[1]; ds.U.momentum[1] = dU[2]; ds.U.momentum[2] = dU[3]; ds.U.E_total = dU[4]; - ds.Y = StatePrimitiveFromConservative_fwd(gas, s, ds.U, x, dx); + ds.Y = StatePrimitiveFromConservative_fwd(gas, s, ds.U, x, dx); return ds; } -CEED_QFUNCTION_HELPER State StateFromY(NewtonianIdealGasContext gas, - const CeedScalar Y[5], const CeedScalar x[3]) { +CEED_QFUNCTION_HELPER State StateFromY(NewtonianIdealGasContext gas, const CeedScalar Y[5], const CeedScalar x[3]) { State s; s.Y.pressure = Y[0]; s.Y.velocity[0] = Y[1]; s.Y.velocity[1] = Y[2]; s.Y.velocity[2] = Y[3]; s.Y.temperature = Y[4]; - s.U = StateConservativeFromPrimitive(gas, s.Y, x); + s.U = StateConservativeFromPrimitive(gas, s.Y, x); return s; } -CEED_QFUNCTION_HELPER State StateFromY_fwd(NewtonianIdealGasContext gas, - State s, const CeedScalar dY[5], - const CeedScalar x[3], const CeedScalar dx[3]) { +CEED_QFUNCTION_HELPER State StateFromY_fwd(NewtonianIdealGasContext gas, State s, const CeedScalar dY[5], const CeedScalar x[3], + const CeedScalar dx[3]) { 
State ds; ds.Y.pressure = dY[0]; ds.Y.velocity[0] = dY[1]; ds.Y.velocity[1] = dY[2]; ds.Y.velocity[2] = dY[3]; ds.Y.temperature = dY[4]; - ds.U = StateConservativeFromPrimitive_fwd(gas, s, ds.Y, x, dx); + ds.U = StateConservativeFromPrimitive_fwd(gas, s, ds.Y, x, dx); return ds; } // Function pointer types for State struct -> generic state array -typedef void (*StateToQi_t)(NewtonianIdealGasContext gas, - const State input, CeedScalar qi[5]); +typedef void (*StateToQi_t)(NewtonianIdealGasContext gas, const State input, CeedScalar qi[5]); -CEED_QFUNCTION_HELPER void StateToU(NewtonianIdealGasContext gas, - const State input, CeedScalar U[5]) { - UnpackState_U(input.U, U); -} +CEED_QFUNCTION_HELPER void StateToU(NewtonianIdealGasContext gas, const State input, CeedScalar U[5]) { UnpackState_U(input.U, U); } -CEED_QFUNCTION_HELPER void StateToY(NewtonianIdealGasContext gas, - const State input, CeedScalar Y[5]) { - UnpackState_Y(input.Y, Y); -} +CEED_QFUNCTION_HELPER void StateToY(NewtonianIdealGasContext gas, const State input, CeedScalar Y[5]) { UnpackState_Y(input.Y, Y); } -CEED_QFUNCTION_HELPER void FluxInviscid(NewtonianIdealGasContext gas, State s, - StateConservative Flux[3]) { - for (CeedInt i=0; i<3; i++) { +CEED_QFUNCTION_HELPER void FluxInviscid(NewtonianIdealGasContext gas, State s, StateConservative Flux[3]) { + for (CeedInt i = 0; i < 3; i++) { Flux[i].density = s.U.momentum[i]; - for (CeedInt j=0; j<3; j++) - Flux[i].momentum[j] = s.U.momentum[i] * s.Y.velocity[j] - + s.Y.pressure * (i == j); + for (CeedInt j = 0; j < 3; j++) Flux[i].momentum[j] = s.U.momentum[i] * s.Y.velocity[j] + s.Y.pressure * (i == j); Flux[i].E_total = (s.U.E_total + s.Y.pressure) * s.Y.velocity[i]; } } -CEED_QFUNCTION_HELPER void FluxInviscid_fwd(NewtonianIdealGasContext gas, - State s, State ds, StateConservative dFlux[3]) { - for (CeedInt i=0; i<3; i++) { +CEED_QFUNCTION_HELPER void FluxInviscid_fwd(NewtonianIdealGasContext gas, State s, State ds, StateConservative dFlux[3]) { 
+ for (CeedInt i = 0; i < 3; i++) { dFlux[i].density = ds.U.momentum[i]; - for (CeedInt j=0; j<3; j++) - dFlux[i].momentum[j] = ds.U.momentum[i] * s.Y.velocity[j] + - s.U.momentum[i] * ds.Y.velocity[j] + ds.Y.pressure * (i == j); - dFlux[i].E_total = (ds.U.E_total + ds.Y.pressure) * s.Y.velocity[i] + - (s.U.E_total + s.Y.pressure) * ds.Y.velocity[i]; + for (CeedInt j = 0; j < 3; j++) { + dFlux[i].momentum[j] = ds.U.momentum[i] * s.Y.velocity[j] + s.U.momentum[i] * ds.Y.velocity[j] + ds.Y.pressure * (i == j); + } + dFlux[i].E_total = (ds.U.E_total + ds.Y.pressure) * s.Y.velocity[i] + (s.U.E_total + s.Y.pressure) * ds.Y.velocity[i]; } } -CEED_QFUNCTION_HELPER StateConservative FluxInviscidDotNormal( - NewtonianIdealGasContext gas, State s, const CeedScalar normal[3]) { +CEED_QFUNCTION_HELPER StateConservative FluxInviscidDotNormal(NewtonianIdealGasContext gas, State s, const CeedScalar normal[3]) { StateConservative Flux[3], Flux_dot_n = {0}; FluxInviscid(gas, s, Flux); - for (CeedInt i=0; i<3; i++) { + for (CeedInt i = 0; i < 3; i++) { Flux_dot_n.density += Flux[i].density * normal[i]; - for (CeedInt j=0; j<3; j++) - Flux_dot_n.momentum[j] += Flux[i].momentum[j] * normal[i]; + for (CeedInt j = 0; j < 3; j++) Flux_dot_n.momentum[j] += Flux[i].momentum[j] * normal[i]; Flux_dot_n.E_total += Flux[i].E_total * normal[i]; } return Flux_dot_n; } -CEED_QFUNCTION_HELPER StateConservative FluxInviscidDotNormal_fwd( - NewtonianIdealGasContext gas, State s, State ds, const CeedScalar normal[3]) { +CEED_QFUNCTION_HELPER StateConservative FluxInviscidDotNormal_fwd(NewtonianIdealGasContext gas, State s, State ds, const CeedScalar normal[3]) { StateConservative dFlux[3], Flux_dot_n = {0}; FluxInviscid_fwd(gas, s, ds, dFlux); - for (CeedInt i=0; i<3; i++) { + for (CeedInt i = 0; i < 3; i++) { Flux_dot_n.density += dFlux[i].density * normal[i]; - for (CeedInt j=0; j<3; j++) - Flux_dot_n.momentum[j] += dFlux[i].momentum[j] * normal[i]; + for (CeedInt j = 0; j < 3; j++) 
Flux_dot_n.momentum[j] += dFlux[i].momentum[j] * normal[i]; Flux_dot_n.E_total += dFlux[i].E_total * normal[i]; } return Flux_dot_n; } -CEED_QFUNCTION_HELPER void FluxInviscidStrong(NewtonianIdealGasContext gas, - State s, State ds[3], CeedScalar strong_conv[5]) { - for (CeedInt i=0; i<5; i++) strong_conv[i] = 0; - for (CeedInt i=0; i<3; i++) { +CEED_QFUNCTION_HELPER void FluxInviscidStrong(NewtonianIdealGasContext gas, State s, State ds[3], CeedScalar strong_conv[5]) { + for (CeedInt i = 0; i < 5; i++) strong_conv[i] = 0; + for (CeedInt i = 0; i < 3; i++) { StateConservative dF[3]; FluxInviscid_fwd(gas, s, ds[i], dF); CeedScalar dF_i[5]; UnpackState_U(dF[i], dF_i); - for (CeedInt j=0; j<5; j++) - strong_conv[j] += dF_i[j]; + for (CeedInt j = 0; j < 5; j++) strong_conv[j] += dF_i[j]; } } -CEED_QFUNCTION_HELPER void FluxTotal(const StateConservative F_inviscid[3], - CeedScalar stress[3][3], CeedScalar Fe[3], CeedScalar Flux[5][3]) { - for (CeedInt j=0; j<3; j++) { +CEED_QFUNCTION_HELPER void FluxTotal(const StateConservative F_inviscid[3], CeedScalar stress[3][3], CeedScalar Fe[3], CeedScalar Flux[5][3]) { + for (CeedInt j = 0; j < 3; j++) { Flux[0][j] = F_inviscid[j].density; - for (CeedInt k=0; k<3; k++) - Flux[k+1][j] = F_inviscid[j].momentum[k] - stress[k][j]; + for (CeedInt k = 0; k < 3; k++) Flux[k + 1][j] = F_inviscid[j].momentum[k] - stress[k][j]; Flux[4][j] = F_inviscid[j].E_total + Fe[j]; } } -CEED_QFUNCTION_HELPER void FluxTotal_Boundary( - const StateConservative F_inviscid[3], const CeedScalar stress[3][3], - const CeedScalar Fe[3], const CeedScalar normal[3], CeedScalar Flux[5]) { - - for (CeedInt j=0; j<5; j++) Flux[j] = 0.; - for (CeedInt j=0; j<3; j++) { +CEED_QFUNCTION_HELPER void FluxTotal_Boundary(const StateConservative F_inviscid[3], const CeedScalar stress[3][3], const CeedScalar Fe[3], + const CeedScalar normal[3], CeedScalar Flux[5]) { + for (CeedInt j = 0; j < 5; j++) Flux[j] = 0.; + for (CeedInt j = 0; j < 3; j++) { Flux[0] += 
F_inviscid[j].density * normal[j]; - for (CeedInt k=0; k<3; k++) { - Flux[k+1] += (F_inviscid[j].momentum[k] - stress[k][j]) * normal[j]; + for (CeedInt k = 0; k < 3; k++) { + Flux[k + 1] += (F_inviscid[j].momentum[k] - stress[k][j]) * normal[j]; } Flux[4] += (F_inviscid[j].E_total + Fe[j]) * normal[j]; } } // Kelvin-Mandel notation -CEED_QFUNCTION_HELPER void KMStrainRate(const State grad_s[3], - CeedScalar strain_rate[6]) { +CEED_QFUNCTION_HELPER void KMStrainRate(const State grad_s[3], CeedScalar strain_rate[6]) { const CeedScalar weight = 1 / sqrt(2.); - strain_rate[0] = grad_s[0].Y.velocity[0]; - strain_rate[1] = grad_s[1].Y.velocity[1]; - strain_rate[2] = grad_s[2].Y.velocity[2]; - strain_rate[3] = weight * (grad_s[2].Y.velocity[1] + grad_s[1].Y.velocity[2]); - strain_rate[4] = weight * (grad_s[2].Y.velocity[0] + grad_s[0].Y.velocity[2]); - strain_rate[5] = weight * (grad_s[1].Y.velocity[0] + grad_s[0].Y.velocity[1]); + strain_rate[0] = grad_s[0].Y.velocity[0]; + strain_rate[1] = grad_s[1].Y.velocity[1]; + strain_rate[2] = grad_s[2].Y.velocity[2]; + strain_rate[3] = weight * (grad_s[2].Y.velocity[1] + grad_s[1].Y.velocity[2]); + strain_rate[4] = weight * (grad_s[2].Y.velocity[0] + grad_s[0].Y.velocity[2]); + strain_rate[5] = weight * (grad_s[1].Y.velocity[0] + grad_s[0].Y.velocity[1]); } -CEED_QFUNCTION_HELPER void NewtonianStress(NewtonianIdealGasContext gas, - const CeedScalar strain_rate[6], CeedScalar stress[6]) { +CEED_QFUNCTION_HELPER void NewtonianStress(NewtonianIdealGasContext gas, const CeedScalar strain_rate[6], CeedScalar stress[6]) { CeedScalar div_u = strain_rate[0] + strain_rate[1] + strain_rate[2]; - for (CeedInt i=0; i<6; i++) { + for (CeedInt i = 0; i < 6; i++) { stress[i] = gas->mu * (2 * strain_rate[i] + gas->lambda * div_u * (i < 3)); } } -CEED_QFUNCTION_HELPER void ViscousEnergyFlux(NewtonianIdealGasContext gas, - StatePrimitive Y, const State grad_s[3], const CeedScalar stress[3][3], - CeedScalar Fe[3]) { - for (CeedInt i=0; i<3; i++) { 
- Fe[i] = - Y.velocity[0] * stress[0][i] - - Y.velocity[1] * stress[1][i] - - Y.velocity[2] * stress[2][i] - - gas->k * grad_s[i].Y.temperature; +CEED_QFUNCTION_HELPER void ViscousEnergyFlux(NewtonianIdealGasContext gas, StatePrimitive Y, const State grad_s[3], const CeedScalar stress[3][3], + CeedScalar Fe[3]) { + for (CeedInt i = 0; i < 3; i++) { + Fe[i] = -Y.velocity[0] * stress[0][i] - Y.velocity[1] * stress[1][i] - Y.velocity[2] * stress[2][i] - gas->k * grad_s[i].Y.temperature; } } -CEED_QFUNCTION_HELPER void ViscousEnergyFlux_fwd(NewtonianIdealGasContext gas, - StatePrimitive Y, StatePrimitive dY, const State grad_ds[3], - const CeedScalar stress[3][3], const CeedScalar dstress[3][3], - CeedScalar dFe[3]) { - for (CeedInt i=0; i<3; i++) { - dFe[i] = - Y.velocity[0] * dstress[0][i] - dY.velocity[0] * stress[0][i] - - Y.velocity[1] * dstress[1][i] - dY.velocity[1] * stress[1][i] - - Y.velocity[2] * dstress[2][i] - dY.velocity[2] * stress[2][i] - - gas->k * grad_ds[i].Y.temperature; +CEED_QFUNCTION_HELPER void ViscousEnergyFlux_fwd(NewtonianIdealGasContext gas, StatePrimitive Y, StatePrimitive dY, const State grad_ds[3], + const CeedScalar stress[3][3], const CeedScalar dstress[3][3], CeedScalar dFe[3]) { + for (CeedInt i = 0; i < 3; i++) { + dFe[i] = -Y.velocity[0] * dstress[0][i] - dY.velocity[0] * stress[0][i] - Y.velocity[1] * dstress[1][i] - dY.velocity[1] * stress[1][i] - + Y.velocity[2] * dstress[2][i] - dY.velocity[2] * stress[2][i] - gas->k * grad_ds[i].Y.temperature; } } -#endif // newtonian_state_h +#endif // newtonian_state_h diff --git a/examples/fluids/qfunctions/newtonian_types.h b/examples/fluids/qfunctions/newtonian_types.h index dbc8c5a57c..0104a0f47f 100644 --- a/examples/fluids/qfunctions/newtonian_types.h +++ b/examples/fluids/qfunctions/newtonian_types.h @@ -9,19 +9,16 @@ #define newtonian_types_h #include + #include "stabilization_types.h" typedef enum { STATEVAR_CONSERVATIVE = 0, - STATEVAR_PRIMITIVE = 1, + STATEVAR_PRIMITIVE = 1, } 
StateVariable; // For use with PetscOptionsEnum -static const char *const StateVariables[] = { - "CONSERVATIVE", - "PRIMITIVE", - "StateVariable", "STATEVAR_", NULL -}; +static const char *const StateVariables[] = {"CONSERVATIVE", "PRIMITIVE", "StateVariable", "STATEVAR_", NULL}; typedef struct SetupContext_ *SetupContext; struct SetupContext_ { @@ -39,31 +36,31 @@ struct SetupContext_ { CeedScalar center[3]; CeedScalar dc_axis[3]; CeedScalar time; - int wind_type; // See WindType: 0=ROTATION, 1=TRANSLATION - int bubble_type; // See BubbleType: 0=SPHERE, 1=CYLINDER - int bubble_continuity_type; // See BubbleContinuityType: 0=SMOOTH, 1=BACK_SHARP 2=THICK + int wind_type; // See WindType: 0=ROTATION, 1=TRANSLATION + int bubble_type; // See BubbleType: 0=SPHERE, 1=CYLINDER + int bubble_continuity_type; // See BubbleContinuityType: 0=SMOOTH, 1=BACK_SHARP 2=THICK }; typedef struct NewtonianIdealGasContext_ *NewtonianIdealGasContext; struct NewtonianIdealGasContext_ { - CeedScalar lambda; - CeedScalar mu; - CeedScalar k; - CeedScalar cv; - CeedScalar cp; - CeedScalar g[3]; - CeedScalar c_tau; - CeedScalar Ctau_t; - CeedScalar Ctau_v; - CeedScalar Ctau_C; - CeedScalar Ctau_M; - CeedScalar Ctau_E; - CeedScalar dt; - CeedScalar ijacobian_time_shift; - CeedScalar P0; - bool is_implicit; - StateVariable state_var; + CeedScalar lambda; + CeedScalar mu; + CeedScalar k; + CeedScalar cv; + CeedScalar cp; + CeedScalar g[3]; + CeedScalar c_tau; + CeedScalar Ctau_t; + CeedScalar Ctau_v; + CeedScalar Ctau_C; + CeedScalar Ctau_M; + CeedScalar Ctau_E; + CeedScalar dt; + CeedScalar ijacobian_time_shift; + CeedScalar P0; + bool is_implicit; + StateVariable state_var; StabilizationType stabilization; }; -#endif // newtonian_types_h +#endif // newtonian_types_h diff --git a/examples/fluids/qfunctions/newtonwave.h b/examples/fluids/qfunctions/newtonwave.h index 9765359619..cb91a7effa 100644 --- a/examples/fluids/qfunctions/newtonwave.h +++ b/examples/fluids/qfunctions/newtonwave.h @@ -11,75 
+11,69 @@ #include #include -#include "utils.h" + #include "newtonian_state.h" +#include "utils.h" typedef struct NewtonWaveContext_ *NewtonWaveContext; struct NewtonWaveContext_ { - CeedScalar epicenter[2]; // Location of the perturbation - CeedScalar width; // Controls width of the perturbation - CeedScalar amplitude; // Amplitude of the perturbation - State S_infty; // Flow state at infinity + CeedScalar epicenter[2]; // Location of the perturbation + CeedScalar width; // Controls width of the perturbation + CeedScalar amplitude; // Amplitude of the perturbation + State S_infty; // Flow state at infinity struct NewtonianIdealGasContext_ newt_ctx; }; -CEED_QFUNCTION_HELPER int IC_NewtonianWave(void *ctx, CeedInt Q, - const CeedScalar *const *in, CeedScalar *const *out, - StateToQi_t StateToQi) { - const CeedScalar (*X)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0]; +CEED_QFUNCTION_HELPER int IC_NewtonianWave(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out, StateToQi_t StateToQi) { + const CeedScalar(*X)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0]; - CeedScalar (*q0)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; + CeedScalar(*q0)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; - const NewtonWaveContext context = (NewtonWaveContext)ctx; + const NewtonWaveContext context = (NewtonWaveContext)ctx; const NewtonianIdealGasContext newt_ctx = &context->newt_ctx; - const CeedScalar amplitude = context->amplitude; - const CeedScalar width = context->width; - const State S_infty = context->S_infty; - const CeedScalar xc = context->epicenter[0]; - const CeedScalar yc = context->epicenter[1]; + const CeedScalar amplitude = context->amplitude; + const CeedScalar width = context->width; + const State S_infty = context->S_infty; + const CeedScalar xc = context->epicenter[0]; + const CeedScalar yc = context->epicenter[1]; const CeedScalar gamma = HeatCapacityRatio(newt_ctx); - for (CeedInt i=0; i #include + #include "utils.h" typedef 
struct SetupContextShock_ *SetupContextShock; @@ -43,9 +44,9 @@ struct SetupContextShock_ { CeedScalar rho_high; CeedScalar P_low; CeedScalar rho_low; - int wind_type; // See WindType: 0=ROTATION, 1=TRANSLATION - int bubble_type; // See BubbleType: 0=SPHERE, 1=CYLINDER - int bubble_continuity_type; // See BubbleContinuityType: 0=SMOOTH, 1=BACK_SHARP 2=THICK + int wind_type; // See WindType: 0=ROTATION, 1=TRANSLATION + int bubble_type; // See BubbleType: 0=SPHERE, 1=CYLINDER + int bubble_continuity_type; // See BubbleContinuityType: 0=SMOOTH, 1=BACK_SHARP 2=THICK }; typedef struct ShockTubeContext_ *ShockTubeContext; @@ -53,9 +54,9 @@ struct ShockTubeContext_ { CeedScalar Cyzb; CeedScalar Byzb; CeedScalar c_tau; - bool implicit; - bool yzb; - int stabilization; + bool implicit; + bool yzb; + int stabilization; }; // ***************************************************************************** @@ -88,20 +89,18 @@ struct ShockTubeContext_ { // This helper function provides support for the exact, time-dependent solution // (currently not implemented) and IC formulation for Euler traveling vortex // ***************************************************************************** -CEED_QFUNCTION_HELPER CeedInt Exact_ShockTube(CeedInt dim, CeedScalar time, - const CeedScalar X[], CeedInt Nf, CeedScalar q[], void *ctx) { - +CEED_QFUNCTION_HELPER CeedInt Exact_ShockTube(CeedInt dim, CeedScalar time, const CeedScalar X[], CeedInt Nf, CeedScalar q[], void *ctx) { // Context - const SetupContextShock context = (SetupContextShock)ctx; - const CeedScalar mid_point = context->mid_point; // Midpoint of the domain - const CeedScalar P_high = context->P_high; // Driver section pressure - const CeedScalar rho_high = context->rho_high; // Driver section density - const CeedScalar P_low = context->P_low; // Driven section pressure - const CeedScalar rho_low = context->rho_low; // Driven section density + const SetupContextShock context = (SetupContextShock)ctx; + const CeedScalar 
mid_point = context->mid_point; // Midpoint of the domain + const CeedScalar P_high = context->P_high; // Driver section pressure + const CeedScalar rho_high = context->rho_high; // Driver section density + const CeedScalar P_low = context->P_low; // Driven section pressure + const CeedScalar rho_low = context->rho_low; // Driven section density // Setup - const CeedScalar gamma = 1.4; // ratio of specific heats - const CeedScalar x = X[0]; // Coordinates + const CeedScalar gamma = 1.4; // ratio of specific heats + const CeedScalar x = X[0]; // Coordinates CeedScalar rho, P, u[3] = {0.}; @@ -119,7 +118,7 @@ CEED_QFUNCTION_HELPER CeedInt Exact_ShockTube(CeedInt dim, CeedScalar time, q[1] = rho * u[0]; q[2] = rho * u[1]; q[3] = rho * u[2]; - q[4] = P / (gamma-1.0) + rho * (u[0]*u[0]) / 2.; + q[4] = P / (gamma - 1.0) + rho * (u[0] * u[0]) / 2.; // Return return 0; @@ -128,24 +127,20 @@ CEED_QFUNCTION_HELPER CeedInt Exact_ShockTube(CeedInt dim, CeedScalar time, // ***************************************************************************** // Helper function for computing flux Jacobian // ***************************************************************************** -CEED_QFUNCTION_HELPER void ConvectiveFluxJacobian_Euler(CeedScalar dF[3][5][5], - const CeedScalar rho, const CeedScalar u[3], const CeedScalar E, - const CeedScalar gamma) { - CeedScalar u_sq = u[0]*u[0] + u[1]*u[1] + u[2]*u[2]; // Velocity square - for (CeedInt i=0; i<3; i++) { // Jacobian matrices for 3 directions - for (CeedInt j=0; j<3; j++) { // Rows of each Jacobian matrix - dF[i][j+1][0] = ((i==j) ? ((gamma-1.)*(u_sq/2.)) : 0.) - u[i]*u[j]; - for (CeedInt k=0; k<3; k++) { // Columns of each Jacobian matrix - dF[i][0][k+1] = ((i==k) ? 1. : 0.); - dF[i][j+1][k+1] = ((j==k) ? u[i] : 0.) + - ((i==k) ? u[j] : 0.) - - ((i==j) ? u[k] : 0.) * (gamma-1.); - dF[i][4][k+1] = ((i==k) ? (E*gamma/rho - (gamma-1.)*u_sq/2.) : 0.) 
- - (gamma-1.)*u[i]*u[k]; +CEED_QFUNCTION_HELPER void ConvectiveFluxJacobian_Euler(CeedScalar dF[3][5][5], const CeedScalar rho, const CeedScalar u[3], const CeedScalar E, + const CeedScalar gamma) { + CeedScalar u_sq = u[0] * u[0] + u[1] * u[1] + u[2] * u[2]; // Velocity square + for (CeedInt i = 0; i < 3; i++) { // Jacobian matrices for 3 directions + for (CeedInt j = 0; j < 3; j++) { // Rows of each Jacobian matrix + dF[i][j + 1][0] = ((i == j) ? ((gamma - 1.) * (u_sq / 2.)) : 0.) - u[i] * u[j]; + for (CeedInt k = 0; k < 3; k++) { // Columns of each Jacobian matrix + dF[i][0][k + 1] = ((i == k) ? 1. : 0.); + dF[i][j + 1][k + 1] = ((j == k) ? u[i] : 0.) + ((i == k) ? u[j] : 0.) - ((i == j) ? u[k] : 0.) * (gamma - 1.); + dF[i][4][k + 1] = ((i == k) ? (E * gamma / rho - (gamma - 1.) * u_sq / 2.) : 0.) - (gamma - 1.) * u[i] * u[k]; } - dF[i][j+1][4] = ((i==j) ? (gamma-1.) : 0.); + dF[i][j + 1][4] = ((i == j) ? (gamma - 1.) : 0.); } - dF[i][4][0] = u[i] * ((gamma-1.)*u_sq - E*gamma/rho); + dF[i][4][0] = u[i] * ((gamma - 1.) 
* u_sq - E * gamma / rho); dF[i][4][4] = u[i] * gamma; } } @@ -158,24 +153,20 @@ CEED_QFUNCTION_HELPER void ConvectiveFluxJacobian_Euler(CeedScalar dF[3][5][5], // vec = vector that length is measured in the direction of // h = covariant element length along vec // ***************************************************************************** -CEED_QFUNCTION_HELPER CeedScalar Covariant_length_along_vector( - CeedScalar vec[3], const CeedScalar dXdx[3][3]) { - - CeedScalar vec_norm = sqrt(vec[0]*vec[0] + vec[1]*vec[1] + vec[2]*vec[2]); +CEED_QFUNCTION_HELPER CeedScalar Covariant_length_along_vector(CeedScalar vec[3], const CeedScalar dXdx[3][3]) { + CeedScalar vec_norm = sqrt(vec[0] * vec[0] + vec[1] * vec[1] + vec[2] * vec[2]); CeedScalar vec_dot_jacobian[3] = {0.0}; - for (CeedInt i=0; i<3; i++) { - for (CeedInt j=0; j<3; j++) { - vec_dot_jacobian[i] += dXdx[j][i]*vec[i]; + for (CeedInt i = 0; i < 3; i++) { + for (CeedInt j = 0; j < 3; j++) { + vec_dot_jacobian[i] += dXdx[j][i] * vec[i]; } } - CeedScalar norm_vec_dot_jacobian = sqrt(vec_dot_jacobian[0]*vec_dot_jacobian[0]+ - vec_dot_jacobian[1]*vec_dot_jacobian[1]+ - vec_dot_jacobian[2]*vec_dot_jacobian[2]); + CeedScalar norm_vec_dot_jacobian = + sqrt(vec_dot_jacobian[0] * vec_dot_jacobian[0] + vec_dot_jacobian[1] * vec_dot_jacobian[1] + vec_dot_jacobian[2] * vec_dot_jacobian[2]); CeedScalar h = 2.0 * vec_norm / norm_vec_dot_jacobian; return h; } - // ***************************************************************************** // Helper function for computing Tau elements (stabilization constant) // Model from: @@ -192,41 +183,37 @@ CEED_QFUNCTION_HELPER CeedScalar Covariant_length_along_vector( // rho(A[i]) = spectral radius of the convective flux Jacobian i, // wave speed in direction i // ***************************************************************************** -CEED_QFUNCTION_HELPER void Tau_spatial(CeedScalar Tau_x[3], - const CeedScalar dXdx[3][3], const CeedScalar u[3], - const CeedScalar sound_speed, 
const CeedScalar c_tau) { - for (CeedInt i=0; i<3; i++) { +CEED_QFUNCTION_HELPER void Tau_spatial(CeedScalar Tau_x[3], const CeedScalar dXdx[3][3], const CeedScalar u[3], const CeedScalar sound_speed, + const CeedScalar c_tau) { + for (CeedInt i = 0; i < 3; i++) { // length of element in direction i - CeedScalar h = 2 / sqrt(dXdx[0][i]*dXdx[0][i] + dXdx[1][i]*dXdx[1][i] + - dXdx[2][i]*dXdx[2][i]); + CeedScalar h = 2 / sqrt(dXdx[0][i] * dXdx[0][i] + dXdx[1][i] * dXdx[1][i] + dXdx[2][i] * dXdx[2][i]); // fastest wave in direction i CeedScalar fastest_wave = fabs(u[i]) + sound_speed; - Tau_x[i] = c_tau * h / fastest_wave; + Tau_x[i] = c_tau * h / fastest_wave; } } // ***************************************************************************** // This QFunction sets the initial conditions for shock tube // ***************************************************************************** -CEED_QFUNCTION(ICsShockTube)(void *ctx, CeedInt Q, - const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(ICsShockTube)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // Inputs - const CeedScalar (*X)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0]; + const CeedScalar(*X)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0]; // Outputs - CeedScalar (*q0)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; + CeedScalar(*q0)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; CeedPragmaSIMD - // Quadrature Point Loop - for (CeedInt i=0; iCyzb; - const CeedScalar Byzb = context->Byzb; - const CeedScalar c_tau = context->c_tau; + const CeedScalar Cyzb = context->Cyzb; + const CeedScalar Byzb = context->Byzb; + const CeedScalar c_tau = context->c_tau; CeedPragmaSIMD - // Quadrature Point Loop - for (CeedInt i=0; iyzb) { - CeedScalar drho_norm = 0.0; // magnitude of the density gradient - CeedScalar j_vec[3] = {0.0}; // unit vector aligned with the density gradient - CeedScalar h_shock = 0.0; // element lengthscale - CeedScalar acoustic_vel = 0.0; 
// characteristic velocity, acoustic speed - CeedScalar tau_shock = 0.0; // timescale - CeedScalar nu_shock = 0.0; // artificial diffusion + CeedScalar drho_norm = 0.0; // magnitude of the density gradient + CeedScalar j_vec[3] = {0.0}; // unit vector aligned with the density gradient + CeedScalar h_shock = 0.0; // element lengthscale + CeedScalar acoustic_vel = 0.0; // characteristic velocity, acoustic speed + CeedScalar tau_shock = 0.0; // timescale + CeedScalar nu_shock = 0.0; // artificial diffusion // Unit vector aligned with the density gradient - drho_norm = sqrt(drhodx[0]*drhodx[0] + drhodx[1]*drhodx[1] + - drhodx[2]*drhodx[2]); - for (CeedInt j=0; j<3; j++) - j_vec[j] = drhodx[j] / (drho_norm + 1e-20); + drho_norm = sqrt(drhodx[0] * drhodx[0] + drhodx[1] * drhodx[1] + drhodx[2] * drhodx[2]); + for (CeedInt j = 0; j < 3; j++) j_vec[j] = drhodx[j] / (drho_norm + 1e-20); if (drho_norm == 0.0) { nu_shock = 0.0; } else { h_shock = Covariant_length_along_vector(j_vec, dXdx); h_shock /= Cyzb; - acoustic_vel = sqrt(gamma*P/rho); - tau_shock = h_shock / (2*acoustic_vel) * pow(drho_norm * h_shock / rho, Byzb); - nu_shock = fabs(tau_shock * acoustic_vel * acoustic_vel); + acoustic_vel = sqrt(gamma * P / rho); + tau_shock = h_shock / (2 * acoustic_vel) * pow(drho_norm * h_shock / rho, Byzb); + nu_shock = fabs(tau_shock * acoustic_vel * acoustic_vel); } - for (CeedInt j=0; j<3; j++) - dv[j][0][i] -= wdetJ * nu_shock * drhodx[j]; + for (CeedInt j = 0; j < 3; j++) dv[j][0][i] -= wdetJ * nu_shock * drhodx[j]; - for (CeedInt k=0; k<3; k++) - for (CeedInt j=0; j<3; j++) - dv[j][k][i] -= wdetJ * nu_shock * du[k][j]; + for (CeedInt k = 0; k < 3; k++) { + for (CeedInt j = 0; j < 3; j++) dv[j][k][i] -= wdetJ * nu_shock * du[k][j]; + } - for (CeedInt j=0; j<3; j++) - dv[j][4][i] -= wdetJ * nu_shock * dEdx[j]; + for (CeedInt j = 0; j < 3; j++) dv[j][4][i] -= wdetJ * nu_shock * dEdx[j]; } // Stabilization @@ -414,51 +369,50 @@ CEED_QFUNCTION(EulerShockTube)(void *ctx, CeedInt Q, 
CeedScalar jacob_F_conv[3][5][5] = {{{0.}}}; ConvectiveFluxJacobian_Euler(jacob_F_conv, rho, u, E, gamma); - // dqdx collects drhodx, dUdx and dEdx in one vector CeedScalar dqdx[5][3]; - for (CeedInt j=0; j<3; j++) { + for (CeedInt j = 0; j < 3; j++) { dqdx[0][j] = drhodx[j]; dqdx[4][j] = dEdx[j]; - for (CeedInt k=0; k<3; k++) - dqdx[k+1][j] = dUdx[k][j]; + for (CeedInt k = 0; k < 3; k++) dqdx[k + 1][j] = dUdx[k][j]; } // strong_conv = dF/dq * dq/dx (Strong convection) CeedScalar strong_conv[5] = {0}; - for (CeedInt j=0; j<3; j++) - for (CeedInt k=0; k<5; k++) - for (CeedInt l=0; l<5; l++) - strong_conv[k] += jacob_F_conv[j][k][l] * dqdx[l][j]; + for (CeedInt j = 0; j < 3; j++) { + for (CeedInt k = 0; k < 5; k++) { + for (CeedInt l = 0; l < 5; l++) strong_conv[k] += jacob_F_conv[j][k][l] * dqdx[l][j]; + } + } // Stabilization // -- Tau elements const CeedScalar sound_speed = sqrt(gamma * P / rho); - CeedScalar Tau_x[3] = {0.}; + CeedScalar Tau_x[3] = {0.}; Tau_spatial(Tau_x, dXdx, u, sound_speed, c_tau); CeedScalar stab[5][3] = {0}; switch (context->stabilization) { - case 0: // Galerkin - break; - case 1: // SU - for (CeedInt j=0; j<3; j++) - for (CeedInt k=0; k<5; k++) - for (CeedInt l=0; l<5; l++) { - stab[k][j] += jacob_F_conv[j][k][l] * Tau_x[j] * strong_conv[l]; + case 0: // Galerkin + break; + case 1: // SU + for (CeedInt j = 0; j < 3; j++) { + for (CeedInt k = 0; k < 5; k++) { + for (CeedInt l = 0; l < 5; l++) { + stab[k][j] += jacob_F_conv[j][k][l] * Tau_x[j] * strong_conv[l]; + } } - for (CeedInt j=0; j<5; j++) - for (CeedInt k=0; k<3; k++) - dv[k][j][i] -= wdetJ*(stab[j][0] * dXdx[k][0] + - stab[j][1] * dXdx[k][1] + - stab[j][2] * dXdx[k][2]); - break; + } + for (CeedInt j = 0; j < 5; j++) { + for (CeedInt k = 0; k < 3; k++) dv[k][j][i] -= wdetJ * (stab[j][0] * dXdx[k][0] + stab[j][1] * dXdx[k][1] + stab[j][2] * dXdx[k][2]); + } + break; } - } // End Quadrature Point Loop + } // End Quadrature Point Loop // Return return 0; } -#endif // shocktube_h 
+#endif // shocktube_h diff --git a/examples/fluids/qfunctions/stabilization.h b/examples/fluids/qfunctions/stabilization.h index c7aa59291a..5bdf2cbbec 100644 --- a/examples/fluids/qfunctions/stabilization.h +++ b/examples/fluids/qfunctions/stabilization.h @@ -12,14 +12,14 @@ #define stabilization_h #include + #include "newtonian_state.h" // ***************************************************************************** // Helper function for computing the variation in primitive variables, // given Tau_d // ***************************************************************************** -CEED_QFUNCTION_HELPER void dYFromTau(CeedScalar Y[5], CeedScalar Tau_d[3], - CeedScalar dY[5]) { +CEED_QFUNCTION_HELPER void dYFromTau(CeedScalar Y[5], CeedScalar Tau_d[3], CeedScalar dY[5]) { dY[0] = Tau_d[0] * Y[0]; dY[1] = Tau_d[1] * Y[1]; dY[2] = Tau_d[1] * Y[2]; @@ -30,42 +30,39 @@ CEED_QFUNCTION_HELPER void dYFromTau(CeedScalar Y[5], CeedScalar Tau_d[3], // ***************************************************************************** // Helper functions for computing the stabilization terms // ***************************************************************************** -CEED_QFUNCTION_HELPER void StabilizationMatrix(NewtonianIdealGasContext gas, - State s, CeedScalar Tau_d[3], CeedScalar R[5], const CeedScalar x[3], - CeedScalar stab[5][3]) { - CeedScalar dY[5]; - const CeedScalar dx_i[3] = {0}; +CEED_QFUNCTION_HELPER void StabilizationMatrix(NewtonianIdealGasContext gas, State s, CeedScalar Tau_d[3], CeedScalar R[5], const CeedScalar x[3], + CeedScalar stab[5][3]) { + CeedScalar dY[5]; + const CeedScalar dx_i[3] = {0}; StateConservative dF[3]; // Zero stab so all future terms can safely sum into it - for (CeedInt i=0; i<5; i++) - for (CeedInt j=0; j<3; j++) - stab[i][j] = 0; + for (CeedInt i = 0; i < 5; i++) { + for (CeedInt j = 0; j < 3; j++) stab[i][j] = 0; + } dYFromTau(R, Tau_d, dY); State ds = StateFromY_fwd(gas, s, dY, x, dx_i); FluxInviscid_fwd(gas, s, ds, dF); - for 
(CeedInt i=0; i<3; i++) { + for (CeedInt i = 0; i < 3; i++) { CeedScalar dF_i[5]; UnpackState_U(dF[i], dF_i); - for (CeedInt j=0; j<5; j++) - stab[j][i] += dF_i[j]; + for (CeedInt j = 0; j < 5; j++) stab[j][i] += dF_i[j]; } } -CEED_QFUNCTION_HELPER void Stabilization(NewtonianIdealGasContext gas, State s, - CeedScalar Tau_d[3], State ds[3], CeedScalar U_dot[5], - const CeedScalar body_force[5], const CeedScalar x[3], CeedScalar stab[5][3]) { +CEED_QFUNCTION_HELPER void Stabilization(NewtonianIdealGasContext gas, State s, CeedScalar Tau_d[3], State ds[3], CeedScalar U_dot[5], + const CeedScalar body_force[5], const CeedScalar x[3], CeedScalar stab[5][3]) { // -- Stabilization method: none (Galerkin), SU, or SUPG CeedScalar R[5] = {0}; switch (gas->stabilization) { - case STAB_NONE: - break; - case STAB_SU: - FluxInviscidStrong(gas, s, ds, R); - break; - case STAB_SUPG: - FluxInviscidStrong(gas, s, ds, R); - for (CeedInt j=0; j<5; j++) R[j] += U_dot[j] - body_force[j]; - break; + case STAB_NONE: + break; + case STAB_SU: + FluxInviscidStrong(gas, s, ds, R); + break; + case STAB_SUPG: + FluxInviscidStrong(gas, s, ds, R); + for (CeedInt j = 0; j < 5; j++) R[j] += U_dot[j] - body_force[j]; + break; } StabilizationMatrix(gas, s, Tau_d, R, x, stab); } @@ -78,19 +75,18 @@ CEED_QFUNCTION_HELPER void Stabilization(NewtonianIdealGasContext gas, State s, // Tau[i] = itau=0 which is diagonal-Shakib (3 values still but not spatial) // // ***************************************************************************** -CEED_QFUNCTION_HELPER void Tau_diagPrim(NewtonianIdealGasContext gas, State s, - const CeedScalar dXdx[3][3], - const CeedScalar dt, CeedScalar Tau_d[3]) { +CEED_QFUNCTION_HELPER void Tau_diagPrim(NewtonianIdealGasContext gas, State s, const CeedScalar dXdx[3][3], const CeedScalar dt, + CeedScalar Tau_d[3]) { // Context const CeedScalar Ctau_t = gas->Ctau_t; const CeedScalar Ctau_v = gas->Ctau_v; const CeedScalar Ctau_C = gas->Ctau_C; const CeedScalar Ctau_M = 
gas->Ctau_M; const CeedScalar Ctau_E = gas->Ctau_E; - const CeedScalar cv = gas->cv; - const CeedScalar mu = gas->mu; - const CeedScalar u[3] = {s.Y.velocity[0], s.Y.velocity[1], s.Y.velocity[2]}; - const CeedScalar rho = s.U.density; + const CeedScalar cv = gas->cv; + const CeedScalar mu = gas->mu; + const CeedScalar u[3] = {s.Y.velocity[0], s.Y.velocity[1], s.Y.velocity[2]}; + const CeedScalar rho = s.U.density; CeedScalar gijd[6]; CeedScalar tau; @@ -98,47 +94,33 @@ CEED_QFUNCTION_HELPER void Tau_diagPrim(NewtonianIdealGasContext gas, State s, CeedScalar fact; //*INDENT-OFF* - gijd[0] = dXdx[0][0] * dXdx[0][0] - + dXdx[1][0] * dXdx[1][0] - + dXdx[2][0] * dXdx[2][0]; + gijd[0] = dXdx[0][0] * dXdx[0][0] + dXdx[1][0] * dXdx[1][0] + dXdx[2][0] * dXdx[2][0]; - gijd[1] = dXdx[0][0] * dXdx[0][1] - + dXdx[1][0] * dXdx[1][1] - + dXdx[2][0] * dXdx[2][1]; + gijd[1] = dXdx[0][0] * dXdx[0][1] + dXdx[1][0] * dXdx[1][1] + dXdx[2][0] * dXdx[2][1]; - gijd[2] = dXdx[0][1] * dXdx[0][1] - + dXdx[1][1] * dXdx[1][1] - + dXdx[2][1] * dXdx[2][1]; + gijd[2] = dXdx[0][1] * dXdx[0][1] + dXdx[1][1] * dXdx[1][1] + dXdx[2][1] * dXdx[2][1]; - gijd[3] = dXdx[0][0] * dXdx[0][2] - + dXdx[1][0] * dXdx[1][2] - + dXdx[2][0] * dXdx[2][2]; + gijd[3] = dXdx[0][0] * dXdx[0][2] + dXdx[1][0] * dXdx[1][2] + dXdx[2][0] * dXdx[2][2]; - gijd[4] = dXdx[0][1] * dXdx[0][2] - + dXdx[1][1] * dXdx[1][2] - + dXdx[2][1] * dXdx[2][2]; + gijd[4] = dXdx[0][1] * dXdx[0][2] + dXdx[1][1] * dXdx[1][2] + dXdx[2][1] * dXdx[2][2]; - gijd[5] = dXdx[0][2] * dXdx[0][2] - + dXdx[1][2] * dXdx[1][2] - + dXdx[2][2] * dXdx[2][2]; + gijd[5] = dXdx[0][2] * dXdx[0][2] + dXdx[1][2] * dXdx[1][2] + dXdx[2][2] * dXdx[2][2]; //*INDENT-ON* - dts = Ctau_t / dt ; + dts = Ctau_t / dt; - tau = rho*rho*((4. * dts * dts) - + u[0] * ( u[0] * gijd[0] + 2. * ( u[1] * gijd[1] + u[2] * gijd[3])) - + u[1] * ( u[1] * gijd[2] + 2. 
* u[2] * gijd[4]) - + u[2] * u[2] * gijd[5]) - + Ctau_v* mu * mu * - (gijd[0]*gijd[0] + gijd[2]*gijd[2] + gijd[5]*gijd[5] + - + 2. * (gijd[1]*gijd[1] + gijd[3]*gijd[3] + gijd[4]*gijd[4])); + tau = rho * rho * + ((4. * dts * dts) + u[0] * (u[0] * gijd[0] + 2. * (u[1] * gijd[1] + u[2] * gijd[3])) + u[1] * (u[1] * gijd[2] + 2. * u[2] * gijd[4]) + + u[2] * u[2] * gijd[5]) + + Ctau_v * mu * mu * + (gijd[0] * gijd[0] + gijd[2] * gijd[2] + gijd[5] * gijd[5] + +2. * (gijd[1] * gijd[1] + gijd[3] * gijd[3] + gijd[4] * gijd[4])); fact = sqrt(tau); - Tau_d[0] = Ctau_C * fact / (rho*(gijd[0] + gijd[2] + gijd[5]))*0.125; + Tau_d[0] = Ctau_C * fact / (rho * (gijd[0] + gijd[2] + gijd[5])) * 0.125; Tau_d[1] = Ctau_M / fact; - Tau_d[2] = Ctau_E / ( fact * cv ); + Tau_d[2] = Ctau_E / (fact * cv); // consider putting back the way I initially had it Ctau_E * Tau_d[1] /cv // to avoid a division if the compiler is smart enough to see that cv IS @@ -151,4 +133,4 @@ CEED_QFUNCTION_HELPER void Tau_diagPrim(NewtonianIdealGasContext gas, State s, // ***************************************************************************** -#endif // stabilization_h +#endif // stabilization_h diff --git a/examples/fluids/qfunctions/stabilization_types.h b/examples/fluids/qfunctions/stabilization_types.h index 83908cf731..490c67c440 100644 --- a/examples/fluids/qfunctions/stabilization_types.h +++ b/examples/fluids/qfunctions/stabilization_types.h @@ -10,8 +10,8 @@ typedef enum { STAB_NONE = 0, - STAB_SU = 1, // Streamline Upwind - STAB_SUPG = 2, // Streamline Upwind Petrov-Galerkin + STAB_SU = 1, // Streamline Upwind + STAB_SUPG = 2, // Streamline Upwind Petrov-Galerkin } StabilizationType; -#endif // stabilization_types_h +#endif // stabilization_types_h diff --git a/examples/fluids/qfunctions/stg_shur14.h b/examples/fluids/qfunctions/stg_shur14.h index e183445f53..fddb065ec5 100644 --- a/examples/fluids/qfunctions/stg_shur14.h +++ b/examples/fluids/qfunctions/stg_shur14.h @@ -19,6 +19,7 @@ #include 
#include #include + #include "stg_shur14_type.h" #include "utils.h" @@ -37,53 +38,51 @@ * @param[out] lt Turbulent length scale at wall_dist * @param[in] stg_ctx STGShur14Context for the problem */ -CEED_QFUNCTION_HELPER void InterpolateProfile(const CeedScalar wall_dist, - CeedScalar ubar[3], CeedScalar cij[6], CeedScalar *eps, CeedScalar *lt, - const STGShur14Context stg_ctx) { - - const CeedInt nprofs = stg_ctx->nprofs; +CEED_QFUNCTION_HELPER void InterpolateProfile(const CeedScalar wall_dist, CeedScalar ubar[3], CeedScalar cij[6], CeedScalar *eps, CeedScalar *lt, + const STGShur14Context stg_ctx) { + const CeedInt nprofs = stg_ctx->nprofs; const CeedScalar *prof_wd = &stg_ctx->data[stg_ctx->offsets.wall_dist]; const CeedScalar *prof_eps = &stg_ctx->data[stg_ctx->offsets.eps]; const CeedScalar *prof_lt = &stg_ctx->data[stg_ctx->offsets.lt]; const CeedScalar *prof_ubar = &stg_ctx->data[stg_ctx->offsets.ubar]; const CeedScalar *prof_cij = &stg_ctx->data[stg_ctx->offsets.cij]; - CeedInt idx=-1; + CeedInt idx = -1; - for(CeedInt i=0; i 0) { // y within the bounds of prof_wd + if (idx > 0) { // y within the bounds of prof_wd //*INDENT-OFF* - CeedScalar coeff = (wall_dist - prof_wd[idx-1]) / (prof_wd[idx] - prof_wd[idx -1]); - - ubar[0] = prof_ubar[0*nprofs+idx-1] + coeff*( prof_ubar[0*nprofs+idx] - prof_ubar[0*nprofs+idx-1] ); - ubar[1] = prof_ubar[1*nprofs+idx-1] + coeff*( prof_ubar[1*nprofs+idx] - prof_ubar[1*nprofs+idx-1] ); - ubar[2] = prof_ubar[2*nprofs+idx-1] + coeff*( prof_ubar[2*nprofs+idx] - prof_ubar[2*nprofs+idx-1] ); - cij[0] = prof_cij[0*nprofs+idx-1] + coeff*( prof_cij[0*nprofs+idx] - prof_cij[0*nprofs+idx-1] ); - cij[1] = prof_cij[1*nprofs+idx-1] + coeff*( prof_cij[1*nprofs+idx] - prof_cij[1*nprofs+idx-1] ); - cij[2] = prof_cij[2*nprofs+idx-1] + coeff*( prof_cij[2*nprofs+idx] - prof_cij[2*nprofs+idx-1] ); - cij[3] = prof_cij[3*nprofs+idx-1] + coeff*( prof_cij[3*nprofs+idx] - prof_cij[3*nprofs+idx-1] ); - cij[4] = prof_cij[4*nprofs+idx-1] + coeff*( 
prof_cij[4*nprofs+idx] - prof_cij[4*nprofs+idx-1] ); - cij[5] = prof_cij[5*nprofs+idx-1] + coeff*( prof_cij[5*nprofs+idx] - prof_cij[5*nprofs+idx-1] ); - *eps = prof_eps[idx-1] + coeff*( prof_eps[idx] - prof_eps[idx-1] ); - *lt = prof_lt[idx-1] + coeff*( prof_lt[idx] - prof_lt[idx-1] ); + CeedScalar coeff = (wall_dist - prof_wd[idx - 1]) / (prof_wd[idx] - prof_wd[idx - 1]); + + ubar[0] = prof_ubar[0 * nprofs + idx - 1] + coeff * (prof_ubar[0 * nprofs + idx] - prof_ubar[0 * nprofs + idx - 1]); + ubar[1] = prof_ubar[1 * nprofs + idx - 1] + coeff * (prof_ubar[1 * nprofs + idx] - prof_ubar[1 * nprofs + idx - 1]); + ubar[2] = prof_ubar[2 * nprofs + idx - 1] + coeff * (prof_ubar[2 * nprofs + idx] - prof_ubar[2 * nprofs + idx - 1]); + cij[0] = prof_cij[0 * nprofs + idx - 1] + coeff * (prof_cij[0 * nprofs + idx] - prof_cij[0 * nprofs + idx - 1]); + cij[1] = prof_cij[1 * nprofs + idx - 1] + coeff * (prof_cij[1 * nprofs + idx] - prof_cij[1 * nprofs + idx - 1]); + cij[2] = prof_cij[2 * nprofs + idx - 1] + coeff * (prof_cij[2 * nprofs + idx] - prof_cij[2 * nprofs + idx - 1]); + cij[3] = prof_cij[3 * nprofs + idx - 1] + coeff * (prof_cij[3 * nprofs + idx] - prof_cij[3 * nprofs + idx - 1]); + cij[4] = prof_cij[4 * nprofs + idx - 1] + coeff * (prof_cij[4 * nprofs + idx] - prof_cij[4 * nprofs + idx - 1]); + cij[5] = prof_cij[5 * nprofs + idx - 1] + coeff * (prof_cij[5 * nprofs + idx] - prof_cij[5 * nprofs + idx - 1]); + *eps = prof_eps[idx - 1] + coeff * (prof_eps[idx] - prof_eps[idx - 1]); + *lt = prof_lt[idx - 1] + coeff * (prof_lt[idx] - prof_lt[idx - 1]); //*INDENT-ON* - } else { // y outside bounds of prof_wd - ubar[0] = prof_ubar[1*nprofs-1]; - ubar[1] = prof_ubar[2*nprofs-1]; - ubar[2] = prof_ubar[3*nprofs-1]; - cij[0] = prof_cij[1*nprofs-1]; - cij[1] = prof_cij[2*nprofs-1]; - cij[2] = prof_cij[3*nprofs-1]; - cij[3] = prof_cij[4*nprofs-1]; - cij[4] = prof_cij[5*nprofs-1]; - cij[5] = prof_cij[6*nprofs-1]; - *eps = prof_eps[nprofs-1]; - *lt = prof_lt[nprofs-1]; + } else { // 
y outside bounds of prof_wd + ubar[0] = prof_ubar[1 * nprofs - 1]; + ubar[1] = prof_ubar[2 * nprofs - 1]; + ubar[2] = prof_ubar[3 * nprofs - 1]; + cij[0] = prof_cij[1 * nprofs - 1]; + cij[1] = prof_cij[2 * nprofs - 1]; + cij[2] = prof_cij[3 * nprofs - 1]; + cij[3] = prof_cij[4 * nprofs - 1]; + cij[4] = prof_cij[5 * nprofs - 1]; + cij[5] = prof_cij[6 * nprofs - 1]; + *eps = prof_eps[nprofs - 1]; + *lt = prof_lt[nprofs - 1]; } } @@ -100,24 +99,19 @@ CEED_QFUNCTION_HELPER void InterpolateProfile(const CeedScalar wall_dist, * @param[in] Ektot Total turbulent kinetic energy of spectrum * @returns qn Spectrum coefficient */ -CEED_QFUNCTION_HELPER CeedScalar Calc_qn (const CeedScalar kappa, - const CeedScalar dkappa, const CeedScalar keta, const CeedScalar kcut, - const CeedScalar ke, const CeedScalar Ektot_inv) { - const CeedScalar feta_x_fcut = exp(-Square(12*kappa/keta) - -Cube(4*Max(kappa - 0.9*kcut, 0)/kcut) ); - return pow(kappa/ke, 4.) * pow(1 + 2.4*Square(kappa/ke),-17./6) - *feta_x_fcut*dkappa * Ektot_inv; +CEED_QFUNCTION_HELPER CeedScalar Calc_qn(const CeedScalar kappa, const CeedScalar dkappa, const CeedScalar keta, const CeedScalar kcut, + const CeedScalar ke, const CeedScalar Ektot_inv) { + const CeedScalar feta_x_fcut = exp(-Square(12 * kappa / keta) - Cube(4 * Max(kappa - 0.9 * kcut, 0) / kcut)); + return pow(kappa / ke, 4.) * pow(1 + 2.4 * Square(kappa / ke), -17. / 6) * feta_x_fcut * dkappa * Ektot_inv; } // Calculate hmax, ke, keta, and kcut -CEED_QFUNCTION_HELPER void SpectrumConstants (const CeedScalar wall_dist, - const CeedScalar eps, const CeedScalar lt, const CeedScalar h[3], - const CeedScalar nu, CeedScalar *hmax, CeedScalar *ke, - CeedScalar *keta, CeedScalar *kcut) { - *hmax = Max( Max(h[0], h[1]), h[2]); - *ke = wall_dist==0 ? 
1e16 : 2*M_PI/Min(2*wall_dist, 3*lt); - *keta = 2*M_PI*pow(Cube(nu)/eps, -0.25); - *kcut = M_PI/ Min( Max(Max(h[1], h[2]), 0.3*(*hmax)) + 0.1*wall_dist, *hmax ); +CEED_QFUNCTION_HELPER void SpectrumConstants(const CeedScalar wall_dist, const CeedScalar eps, const CeedScalar lt, const CeedScalar h[3], + const CeedScalar nu, CeedScalar *hmax, CeedScalar *ke, CeedScalar *keta, CeedScalar *kcut) { + *hmax = Max(Max(h[0], h[1]), h[2]); + *ke = wall_dist == 0 ? 1e16 : 2 * M_PI / Min(2 * wall_dist, 3 * lt); + *keta = 2 * M_PI * pow(Cube(nu) / eps, -0.25); + *kcut = M_PI / Min(Max(Max(h[1], h[2]), 0.3 * (*hmax)) + 0.1 * wall_dist, *hmax); } /* @@ -133,23 +127,22 @@ CEED_QFUNCTION_HELPER void SpectrumConstants (const CeedScalar wall_dist, * @param[in] stg_ctx STGShur14Context for the problem * @param[out] qn Spectrum coefficients, [nmodes] */ -CEED_QFUNCTION_HELPER void CalcSpectrum (const CeedScalar wall_dist, - const CeedScalar eps, const CeedScalar lt, const CeedScalar h[3], - const CeedScalar nu, CeedScalar qn[], const STGShur14Context stg_ctx) { +CEED_QFUNCTION_HELPER void CalcSpectrum(const CeedScalar wall_dist, const CeedScalar eps, const CeedScalar lt, const CeedScalar h[3], + const CeedScalar nu, CeedScalar qn[], const STGShur14Context stg_ctx) { + const CeedInt nmodes = stg_ctx->nmodes; + const CeedScalar *kappa = &stg_ctx->data[stg_ctx->offsets.kappa]; + CeedScalar hmax, ke, keta, kcut, Ektot = 0.0; - const CeedInt nmodes = stg_ctx->nmodes; - const CeedScalar *kappa = &stg_ctx->data[stg_ctx->offsets.kappa]; - CeedScalar hmax, ke, keta, kcut, Ektot=0.0; SpectrumConstants(wall_dist, eps, lt, h, nu, &hmax, &ke, &keta, &kcut); - for(CeedInt n=0; nnmodes; - const CeedScalar *kappa = &stg_ctx->data[stg_ctx->offsets.kappa]; - const CeedScalar *phi = &stg_ctx->data[stg_ctx->offsets.phi]; - const CeedScalar *sigma = &stg_ctx->data[stg_ctx->offsets.sigma]; - const CeedScalar *d = &stg_ctx->data[stg_ctx->offsets.d]; + const CeedInt nmodes = stg_ctx->nmodes; + const 
CeedScalar *kappa = &stg_ctx->data[stg_ctx->offsets.kappa]; + const CeedScalar *phi = &stg_ctx->data[stg_ctx->offsets.phi]; + const CeedScalar *sigma = &stg_ctx->data[stg_ctx->offsets.sigma]; + const CeedScalar *d = &stg_ctx->data[stg_ctx->offsets.d]; //*INDENT-ON* CeedScalar xdotd, vp[3] = {0.}; CeedScalar xhat[] = {0., X[1], X[2]}; - CeedPragmaSIMD - for(CeedInt n=0; nu0*t)*Max(2*kappa[0]/kappa[n], 0.1); - xdotd = 0.; - for(CeedInt i=0; i<3; i++) xdotd += d[i*nmodes+n]*xhat[i]; - const CeedScalar cos_kxdp = cos(kappa[n]*xdotd + phi[n]); - vp[0] += sqrt(qn[n])*sigma[0*nmodes+n] * cos_kxdp; - vp[1] += sqrt(qn[n])*sigma[1*nmodes+n] * cos_kxdp; - vp[2] += sqrt(qn[n])*sigma[2*nmodes+n] * cos_kxdp; + CeedPragmaSIMD for (CeedInt n = 0; n < nmodes; n++) { + xhat[0] = (X[0] - stg_ctx->u0 * t) * Max(2 * kappa[0] / kappa[n], 0.1); + xdotd = 0.; + for (CeedInt i = 0; i < 3; i++) xdotd += d[i * nmodes + n] * xhat[i]; + const CeedScalar cos_kxdp = cos(kappa[n] * xdotd + phi[n]); + vp[0] += sqrt(qn[n]) * sigma[0 * nmodes + n] * cos_kxdp; + vp[1] += sqrt(qn[n]) * sigma[1 * nmodes + n] * cos_kxdp; + vp[2] += sqrt(qn[n]) * sigma[2 * nmodes + n] * cos_kxdp; } - for(CeedInt i=0; i<3; i++) vp[i] *= 2*sqrt(1.5); + for (CeedInt i = 0; i < 3; i++) vp[i] *= 2 * sqrt(1.5); - u[0] = ubar[0] + cij[0]*vp[0]; - u[1] = ubar[1] + cij[3]*vp[0] + cij[1]*vp[1]; - u[2] = ubar[2] + cij[4]*vp[0] + cij[5]*vp[1] + cij[2]*vp[2]; + u[0] = ubar[0] + cij[0] * vp[0]; + u[1] = ubar[1] + cij[3] * vp[0] + cij[1] * vp[1]; + u[2] = ubar[2] + cij[4] * vp[0] + cij[5] * vp[1] + cij[2] * vp[2]; } /****************************************************** @@ -210,162 +199,151 @@ CEED_QFUNCTION_HELPER void STGShur14_Calc (const CeedScalar X[3], * @param[out] u Velocity at X and t * @param[in] stg_ctx STGShur14Context for the problem */ -CEED_QFUNCTION_HELPER void STGShur14_Calc_PrecompEktot(const CeedScalar X[3], - const CeedScalar t, const CeedScalar ubar[3], const CeedScalar cij[6], - const CeedScalar Ektot, const 
CeedScalar h[3], const CeedScalar wall_dist, - const CeedScalar eps, const CeedScalar lt, const CeedScalar nu, CeedScalar u[3], - const STGShur14Context stg_ctx) { - +CEED_QFUNCTION_HELPER void STGShur14_Calc_PrecompEktot(const CeedScalar X[3], const CeedScalar t, const CeedScalar ubar[3], const CeedScalar cij[6], + const CeedScalar Ektot, const CeedScalar h[3], const CeedScalar wall_dist, + const CeedScalar eps, const CeedScalar lt, const CeedScalar nu, CeedScalar u[3], + const STGShur14Context stg_ctx) { //*INDENT-OFF* - const CeedInt nmodes = stg_ctx->nmodes; - const CeedScalar *kappa = &stg_ctx->data[stg_ctx->offsets.kappa]; - const CeedScalar *phi = &stg_ctx->data[stg_ctx->offsets.phi]; - const CeedScalar *sigma = &stg_ctx->data[stg_ctx->offsets.sigma]; - const CeedScalar *d = &stg_ctx->data[stg_ctx->offsets.d]; + const CeedInt nmodes = stg_ctx->nmodes; + const CeedScalar *kappa = &stg_ctx->data[stg_ctx->offsets.kappa]; + const CeedScalar *phi = &stg_ctx->data[stg_ctx->offsets.phi]; + const CeedScalar *sigma = &stg_ctx->data[stg_ctx->offsets.sigma]; + const CeedScalar *d = &stg_ctx->data[stg_ctx->offsets.d]; //*INDENT-ON* CeedScalar hmax, ke, keta, kcut; SpectrumConstants(wall_dist, eps, lt, h, nu, &hmax, &ke, &keta, &kcut); CeedScalar xdotd, vp[3] = {0.}; CeedScalar xhat[] = {0., X[1], X[2]}; - CeedPragmaSIMD - for(CeedInt n=0; nu0*t)*Max(2*kappa[0]/kappa[n], 0.1); - xdotd = 0.; - for(CeedInt i=0; i<3; i++) xdotd += d[i*nmodes+n]*xhat[i]; - const CeedScalar cos_kxdp = cos(kappa[n]*xdotd + phi[n]); - const CeedScalar dkappa = n==0 ? kappa[0] : kappa[n] - kappa[n-1]; + CeedPragmaSIMD for (CeedInt n = 0; n < nmodes; n++) { + xhat[0] = (X[0] - stg_ctx->u0 * t) * Max(2 * kappa[0] / kappa[n], 0.1); + xdotd = 0.; + for (CeedInt i = 0; i < 3; i++) xdotd += d[i * nmodes + n] * xhat[i]; + const CeedScalar cos_kxdp = cos(kappa[n] * xdotd + phi[n]); + const CeedScalar dkappa = n == 0 ? 
kappa[0] : kappa[n] - kappa[n - 1]; const CeedScalar qn = Calc_qn(kappa[n], dkappa, keta, kcut, ke, Ektot); - vp[0] += sqrt(qn)*sigma[0*nmodes+n] * cos_kxdp; - vp[1] += sqrt(qn)*sigma[1*nmodes+n] * cos_kxdp; - vp[2] += sqrt(qn)*sigma[2*nmodes+n] * cos_kxdp; + vp[0] += sqrt(qn) * sigma[0 * nmodes + n] * cos_kxdp; + vp[1] += sqrt(qn) * sigma[1 * nmodes + n] * cos_kxdp; + vp[2] += sqrt(qn) * sigma[2 * nmodes + n] * cos_kxdp; } - for(CeedInt i=0; i<3; i++) vp[i] *= 2*sqrt(1.5); + for (CeedInt i = 0; i < 3; i++) vp[i] *= 2 * sqrt(1.5); - u[0] = ubar[0] + cij[0]*vp[0]; - u[1] = ubar[1] + cij[3]*vp[0] + cij[1]*vp[1]; - u[2] = ubar[2] + cij[4]*vp[0] + cij[5]*vp[1] + cij[2]*vp[2]; + u[0] = ubar[0] + cij[0] * vp[0]; + u[1] = ubar[1] + cij[3] * vp[0] + cij[1] * vp[1]; + u[2] = ubar[2] + cij[4] * vp[0] + cij[5] * vp[1] + cij[2] * vp[2]; } // Create preprocessed input for the stg calculation // // stg_data[0] = 1 / Ektot (inverse of total spectrum energy) -CEED_QFUNCTION(Preprocess_STGShur14)(void *ctx, CeedInt Q, - const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(Preprocess_STGShur14)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { //*INDENT-OFF* - const CeedScalar (*q_data_sur)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA]) in[0], - (*x)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA]) in[1]; + const CeedScalar(*q_data_sur)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], (*x)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1]; - CeedScalar (*stg_data) = (CeedScalar(*)) out[0]; + CeedScalar(*stg_data) = (CeedScalar(*))out[0]; //*INDENT-ON* - CeedScalar ubar[3], cij[6], eps, lt; - const STGShur14Context stg_ctx = (STGShur14Context) ctx; - const CeedScalar dx = stg_ctx->dx; - const CeedScalar mu = stg_ctx->newtonian_ctx.mu; - const CeedScalar theta0 = stg_ctx->theta0; - const CeedScalar P0 = stg_ctx->P0; - const CeedScalar cv = stg_ctx->newtonian_ctx.cv; - const CeedScalar cp = stg_ctx->newtonian_ctx.cp; - const 
CeedScalar Rd = cp - cv; - const CeedScalar rho = P0 / (Rd * theta0); - const CeedScalar nu = mu / rho; - - const CeedInt nmodes = stg_ctx->nmodes; - const CeedScalar *kappa = &stg_ctx->data[stg_ctx->offsets.kappa]; - CeedScalar hmax, ke, keta, kcut; - - CeedPragmaSIMD - for(CeedInt i=0; idx; + const CeedScalar mu = stg_ctx->newtonian_ctx.mu; + const CeedScalar theta0 = stg_ctx->theta0; + const CeedScalar P0 = stg_ctx->P0; + const CeedScalar cv = stg_ctx->newtonian_ctx.cv; + const CeedScalar cp = stg_ctx->newtonian_ctx.cp; + const CeedScalar Rd = cp - cv; + const CeedScalar rho = P0 / (Rd * theta0); + const CeedScalar nu = mu / rho; + + const CeedInt nmodes = stg_ctx->nmodes; + const CeedScalar *kappa = &stg_ctx->data[stg_ctx->offsets.kappa]; + CeedScalar hmax, ke, keta, kcut; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { + const CeedScalar wall_dist = x[1][i]; const CeedScalar dXdx[2][3] = { - {q_data_sur[4][i], q_data_sur[5][i], q_data_sur[6][i]}, - {q_data_sur[7][i], q_data_sur[8][i], q_data_sur[9][i]} + {q_data_sur[4][i], q_data_sur[5][i], q_data_sur[6][i]}, + {q_data_sur[7][i], q_data_sur[8][i], q_data_sur[9][i]} }; CeedScalar h[3]; h[0] = dx; - for (CeedInt j=1; j<3; j++) - h[j] = 2/sqrt(dXdx[0][j]*dXdx[0][j] + dXdx[1][j]*dXdx[1][j]); + for (CeedInt j = 1; j < 3; j++) h[j] = 2 / sqrt(dXdx[0][j] * dXdx[0][j] + dXdx[1][j] * dXdx[1][j]); InterpolateProfile(wall_dist, ubar, cij, &eps, <, stg_ctx); SpectrumConstants(wall_dist, eps, lt, h, nu, &hmax, &ke, &keta, &kcut); // Calculate total TKE per spectrum - CeedScalar Ek_tot=0; - CeedPragmaSIMD - for(CeedInt n=0; ndx; - const CeedScalar time = stg_ctx->time; - const CeedScalar theta0 = stg_ctx->theta0; - const CeedScalar P0 = stg_ctx->P0; - const CeedScalar mu = stg_ctx->newtonian_ctx.mu; - const CeedScalar cv = stg_ctx->newtonian_ctx.cv; - const CeedScalar cp = stg_ctx->newtonian_ctx.cp; - const CeedScalar Rd = cp - cv; - const CeedScalar rho = P0 / (Rd * theta0); + CeedScalar(*q0)[CEED_Q_VLA] = 
(CeedScalar(*)[CEED_Q_VLA])out[0]; - CeedPragmaSIMD - for(CeedInt i=0; idx; + const CeedScalar time = stg_ctx->time; + const CeedScalar theta0 = stg_ctx->theta0; + const CeedScalar P0 = stg_ctx->P0; + const CeedScalar mu = stg_ctx->newtonian_ctx.mu; + const CeedScalar cv = stg_ctx->newtonian_ctx.cv; + const CeedScalar cp = stg_ctx->newtonian_ctx.cp; + const CeedScalar Rd = cp - cv; + const CeedScalar rho = P0 / (Rd * theta0); + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { const CeedScalar x_i[3] = {x[0][i], x[1][i], x[2][i]}; // *INDENT-OFF* - const CeedScalar dXdx[3][3] = {{q_data[1][i], q_data[2][i], q_data[3][i]}, - {q_data[4][i], q_data[5][i], q_data[6][i]}, - {q_data[7][i], q_data[8][i], q_data[9][i]} - }; + const CeedScalar dXdx[3][3] = { + {q_data[1][i], q_data[2][i], q_data[3][i]}, + {q_data[4][i], q_data[5][i], q_data[6][i]}, + {q_data[7][i], q_data[8][i], q_data[9][i]} + }; // *INDENT-ON* CeedScalar h[3]; h[0] = dx; - for (CeedInt j=1; j<3; j++) - h[j] = 2/sqrt(Square(dXdx[0][j]) + Square(dXdx[1][j]) + Square(dXdx[2][j])); + for (CeedInt j = 1; j < 3; j++) h[j] = 2 / sqrt(Square(dXdx[0][j]) + Square(dXdx[1][j]) + Square(dXdx[2][j])); InterpolateProfile(x_i[1], ubar, cij, &eps, <, stg_ctx); if (stg_ctx->use_fluctuating_IC) { - CalcSpectrum(x_i[1], eps, lt, h, mu/rho, qn, stg_ctx); + CalcSpectrum(x_i[1], eps, lt, h, mu / rho, qn, stg_ctx); STGShur14_Calc(x_i, time, ubar, cij, qn, u, stg_ctx); } else { - for (CeedInt j=0; j<3; j++) u[j] = ubar[j]; + for (CeedInt j = 0; j < 3; j++) u[j] = ubar[j]; } switch (stg_ctx->newtonian_ctx.state_var) { - case STATEVAR_CONSERVATIVE: - q0[0][i] = rho; - q0[1][i] = u[0] * rho; - q0[2][i] = u[1] * rho; - q0[3][i] = u[2] * rho; - q0[4][i] = rho * (0.5 * Dot3(u, u) + cv * theta0); - break; - - case STATEVAR_PRIMITIVE: - q0[0][i] = P0; - q0[1][i] = u[0]; - q0[2][i] = u[1]; - q0[3][i] = u[2]; - q0[4][i] = theta0; - break; + case STATEVAR_CONSERVATIVE: + q0[0][i] = rho; + q0[1][i] = u[0] * rho; + q0[2][i] = u[1] * rho; + 
q0[3][i] = u[2] * rho; + q0[4][i] = rho * (0.5 * Dot3(u, u) + cv * theta0); + break; + + case STATEVAR_PRIMITIVE: + q0[0][i] = P0; + q0[1][i] = u[0]; + q0[2][i] = u[1]; + q0[3][i] = u[2]; + q0[4][i] = theta0; + break; } - } // End of Quadrature Point Loop + } // End of Quadrature Point Loop return 0; } @@ -375,74 +353,65 @@ CEED_QFUNCTION(ICsSTG)(void *ctx, CeedInt Q, * This will loop through quadrature points, calculate the wavemode amplitudes * at each location, then calculate the actual velocity. */ -CEED_QFUNCTION(STGShur14_Inflow)(void *ctx, CeedInt Q, - const CeedScalar *const *in, CeedScalar *const *out) { - +CEED_QFUNCTION(STGShur14_Inflow)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { //*INDENT-OFF* - const CeedScalar (*q)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA]) in[0], - (*q_data_sur)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA]) in[2], - (*X)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA]) in[3]; + const CeedScalar(*q)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], (*q_data_sur)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2], + (*X)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[3]; - CeedScalar(*v)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA]) out[0], - (*jac_data_sur)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA]) out[1]; + CeedScalar(*v)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0], (*jac_data_sur)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[1]; //*INDENT-ON* - const STGShur14Context stg_ctx = (STGShur14Context) ctx; - CeedScalar qn[STG_NMODES_MAX], u[3], ubar[3], cij[6], eps, lt; - const bool is_implicit = stg_ctx->is_implicit; - const bool mean_only = stg_ctx->mean_only; - const bool prescribe_T = stg_ctx->prescribe_T; - const CeedScalar dx = stg_ctx->dx; - const CeedScalar mu = stg_ctx->newtonian_ctx.mu; - const CeedScalar time = stg_ctx->time; - const CeedScalar theta0 = stg_ctx->theta0; - const CeedScalar P0 = stg_ctx->P0; - const CeedScalar cv = stg_ctx->newtonian_ctx.cv; - const CeedScalar cp = 
stg_ctx->newtonian_ctx.cp; - const CeedScalar Rd = cp - cv; - const CeedScalar gamma = cp/cv; - - CeedPragmaSIMD - for(CeedInt i=0; iis_implicit; + const bool mean_only = stg_ctx->mean_only; + const bool prescribe_T = stg_ctx->prescribe_T; + const CeedScalar dx = stg_ctx->dx; + const CeedScalar mu = stg_ctx->newtonian_ctx.mu; + const CeedScalar time = stg_ctx->time; + const CeedScalar theta0 = stg_ctx->theta0; + const CeedScalar P0 = stg_ctx->P0; + const CeedScalar cv = stg_ctx->newtonian_ctx.cv; + const CeedScalar cp = stg_ctx->newtonian_ctx.cp; + const CeedScalar Rd = cp - cv; + const CeedScalar gamma = cp / cv; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { + const CeedScalar rho = prescribe_T ? q[0][i] : P0 / (Rd * theta0); + const CeedScalar x[] = {X[0][i], X[1][i], X[2][i]}; const CeedScalar dXdx[2][3] = { - {q_data_sur[4][i], q_data_sur[5][i], q_data_sur[6][i]}, - {q_data_sur[7][i], q_data_sur[8][i], q_data_sur[9][i]} + {q_data_sur[4][i], q_data_sur[5][i], q_data_sur[6][i]}, + {q_data_sur[7][i], q_data_sur[8][i], q_data_sur[9][i]} }; CeedScalar h[3]; h[0] = dx; - for (CeedInt j=1; j<3; j++) - h[j] = 2/sqrt(Square(dXdx[0][j]) + Square(dXdx[1][j])); + for (CeedInt j = 1; j < 3; j++) h[j] = 2 / sqrt(Square(dXdx[0][j]) + Square(dXdx[1][j])); InterpolateProfile(X[1][i], ubar, cij, &eps, <, stg_ctx); if (!mean_only) { - CalcSpectrum(X[1][i], eps, lt, h, mu/rho, qn, stg_ctx); + CalcSpectrum(X[1][i], eps, lt, h, mu / rho, qn, stg_ctx); STGShur14_Calc(x, time, ubar, cij, qn, u, stg_ctx); } else { - for (CeedInt j=0; j<3; j++) u[j] = ubar[j]; + for (CeedInt j = 0; j < 3; j++) u[j] = ubar[j]; } const CeedScalar E_kinetic = .5 * rho * Dot3(u, u); - CeedScalar E_internal, P; + CeedScalar E_internal, P; if (prescribe_T) { // Temperature is being set weakly (theta0) and for constant cv this sets E_internal E_internal = rho * cv * theta0; // Find pressure using - P = rho * Rd * theta0; // interior rho with exterior T + P = rho * Rd * theta0; // interior rho with 
exterior T } else { - E_internal = q[4][i] - E_kinetic; // uses prescribed rho and u, E from solution - P = E_internal * (gamma - 1.); + E_internal = q[4][i] - E_kinetic; // uses prescribed rho and u, E from solution + P = E_internal * (gamma - 1.); } - const CeedScalar wdetJb = (is_implicit ? -1. : 1.) * q_data_sur[0][i]; + const CeedScalar wdetJb = (is_implicit ? -1. : 1.) * q_data_sur[0][i]; // ---- Normal vect - const CeedScalar norm[3] = {q_data_sur[1][i], - q_data_sur[2][i], - q_data_sur[3][i] - }; + const CeedScalar norm[3] = {q_data_sur[1][i], q_data_sur[2][i], q_data_sur[3][i]}; const CeedScalar E = E_internal + E_kinetic; @@ -451,16 +420,14 @@ CEED_QFUNCTION(STGShur14_Inflow)(void *ctx, CeedInt Q, // The Physics // Zero v so all future terms can safely sum into it - for (CeedInt j=0; j<5; j++) v[j][i] = 0.; + for (CeedInt j = 0; j < 5; j++) v[j][i] = 0.; // The Physics // -- Density v[0][i] -= wdetJb * rho * u_normal; // -- Momentum - for (CeedInt j=0; j<3; j++) - v[j+1][i] -= wdetJb *(rho * u_normal * u[j] + - norm[j] * P); + for (CeedInt j = 0; j < 3; j++) v[j + 1][i] -= wdetJb * (rho * u_normal * u[j] + norm[j] * P); // -- Total Energy Density v[4][i] -= wdetJb * u_normal * (E + P); @@ -470,71 +437,65 @@ CEED_QFUNCTION(STGShur14_Inflow)(void *ctx, CeedInt Q, jac_data_sur[2][i] = u[1]; jac_data_sur[3][i] = u[2]; jac_data_sur[4][i] = E; - for (int j=0; j<6; j++) jac_data_sur[5+j][i] = 0.; + for (int j = 0; j < 6; j++) jac_data_sur[5 + j][i] = 0.; } return 0; } -CEED_QFUNCTION(STGShur14_Inflow_Jacobian)(void *ctx, CeedInt Q, - const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(STGShur14_Inflow_Jacobian)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // *INDENT-OFF* // Inputs - const CeedScalar (*dq)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], - (*q_data_sur)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2], - (*jac_data_sur)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[4]; + const 
CeedScalar(*dq)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], (*q_data_sur)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[2], + (*jac_data_sur)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[4]; // Outputs - CeedScalar (*v)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; + CeedScalar(*v)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; // *INDENT-ON* - const STGShur14Context stg_ctx = (STGShur14Context)ctx; - const bool implicit = stg_ctx->is_implicit; - const CeedScalar cv = stg_ctx->newtonian_ctx.cv; - const CeedScalar cp = stg_ctx->newtonian_ctx.cp; - const CeedScalar Rd = cp - cv; - const CeedScalar gamma = cp/cv; + const STGShur14Context stg_ctx = (STGShur14Context)ctx; + const bool implicit = stg_ctx->is_implicit; + const CeedScalar cv = stg_ctx->newtonian_ctx.cv; + const CeedScalar cp = stg_ctx->newtonian_ctx.cp; + const CeedScalar Rd = cp - cv; + const CeedScalar gamma = cp / cv; - const CeedScalar theta0 = stg_ctx->theta0; - const bool prescribe_T = stg_ctx->prescribe_T; + const CeedScalar theta0 = stg_ctx->theta0; + const bool prescribe_T = stg_ctx->prescribe_T; CeedPragmaSIMD - // Quadrature Point Loop - for (CeedInt i=0; i `bcFunc` method. 
*/ -CEED_QFUNCTION(STGShur14_Inflow_StrongQF)(void *ctx, CeedInt Q, - const CeedScalar *const *in, CeedScalar *const *out) { - +CEED_QFUNCTION(STGShur14_Inflow_StrongQF)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { //*INDENT-OFF* - const CeedScalar (*q_data_sur)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA]) in[0], - (*coords)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA]) in[1], - (*scale) = (const CeedScalar(*)) in[2], - (*stg_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA]) in[3]; + const CeedScalar(*q_data_sur)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], (*coords)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1], + (*scale) = (const CeedScalar(*))in[2], (*stg_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[3]; - CeedScalar(*bcval)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA]) out[0]; + CeedScalar(*bcval)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; //*INDENT-ON* - const STGShur14Context stg_ctx = (STGShur14Context) ctx; - CeedScalar u[3], ubar[3], cij[6], eps, lt; - const bool mean_only = stg_ctx->mean_only; - const CeedScalar dx = stg_ctx->dx; - const CeedScalar mu = stg_ctx->newtonian_ctx.mu; - const CeedScalar time = stg_ctx->time; - const CeedScalar theta0 = stg_ctx->theta0; - const CeedScalar P0 = stg_ctx->P0; - const CeedScalar cv = stg_ctx->newtonian_ctx.cv; - const CeedScalar cp = stg_ctx->newtonian_ctx.cp; - const CeedScalar Rd = cp - cv; - const CeedScalar rho = P0 / (Rd * theta0); - - CeedPragmaSIMD - for(CeedInt i=0; imean_only; + const CeedScalar dx = stg_ctx->dx; + const CeedScalar mu = stg_ctx->newtonian_ctx.mu; + const CeedScalar time = stg_ctx->time; + const CeedScalar theta0 = stg_ctx->theta0; + const CeedScalar P0 = stg_ctx->P0; + const CeedScalar cv = stg_ctx->newtonian_ctx.cv; + const CeedScalar cp = stg_ctx->newtonian_ctx.cp; + const CeedScalar Rd = cp - cv; + const CeedScalar rho = P0 / (Rd * theta0); + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { + const CeedScalar x[] = 
{coords[0][i], coords[1][i], coords[2][i]}; const CeedScalar dXdx[2][3] = { - {q_data_sur[4][i], q_data_sur[5][i], q_data_sur[6][i]}, - {q_data_sur[7][i], q_data_sur[8][i], q_data_sur[9][i]} + {q_data_sur[4][i], q_data_sur[5][i], q_data_sur[6][i]}, + {q_data_sur[7][i], q_data_sur[8][i], q_data_sur[9][i]} }; CeedScalar h[3]; h[0] = dx; - for (CeedInt j=1; j<3; j++) - h[j] = 2/sqrt(Square(dXdx[0][j]) + Square(dXdx[1][j])); + for (CeedInt j = 1; j < 3; j++) h[j] = 2 / sqrt(Square(dXdx[0][j]) + Square(dXdx[1][j])); InterpolateProfile(coords[1][i], ubar, cij, &eps, <, stg_ctx); if (!mean_only) { if (1) { - STGShur14_Calc_PrecompEktot(x, time, ubar, cij, stg_data[0][i], - h, x[1], eps, lt, mu/rho, u, stg_ctx); - } else { // Original way + STGShur14_Calc_PrecompEktot(x, time, ubar, cij, stg_data[0][i], h, x[1], eps, lt, mu / rho, u, stg_ctx); + } else { // Original way CeedScalar qn[STG_NMODES_MAX]; - CalcSpectrum(coords[1][i], eps, lt, h, mu/rho, qn, stg_ctx); + CalcSpectrum(coords[1][i], eps, lt, h, mu / rho, qn, stg_ctx); STGShur14_Calc(x, time, ubar, cij, qn, u, stg_ctx); } } else { - for (CeedInt j=0; j<3; j++) u[j] = ubar[j]; + for (CeedInt j = 0; j < 3; j++) u[j] = ubar[j]; } switch (stg_ctx->newtonian_ctx.state_var) { - case STATEVAR_CONSERVATIVE: - bcval[0][i] = scale[i] * rho; - bcval[1][i] = scale[i] * rho * u[0]; - bcval[2][i] = scale[i] * rho * u[1]; - bcval[3][i] = scale[i] * rho * u[2]; - bcval[4][i] = 0.; - break; - - case STATEVAR_PRIMITIVE: - bcval[0][i] = 0; - bcval[1][i] = scale[i] * u[0]; - bcval[2][i] = scale[i] * u[1]; - bcval[3][i] = scale[i] * u[2]; - bcval[4][i] = scale[i] * theta0; - break; + case STATEVAR_CONSERVATIVE: + bcval[0][i] = scale[i] * rho; + bcval[1][i] = scale[i] * rho * u[0]; + bcval[2][i] = scale[i] * rho * u[1]; + bcval[3][i] = scale[i] * rho * u[2]; + bcval[4][i] = 0.; + break; + + case STATEVAR_PRIMITIVE: + bcval[0][i] = 0; + bcval[1][i] = scale[i] * u[0]; + bcval[2][i] = scale[i] * u[1]; + bcval[3][i] = scale[i] * u[2]; + 
bcval[4][i] = scale[i] * theta0; + break; } } return 0; } -#endif // stg_shur14_h +#endif // stg_shur14_h diff --git a/examples/fluids/qfunctions/stg_shur14_type.h b/examples/fluids/qfunctions/stg_shur14_type.h index 59f26c5564..95114718d6 100644 --- a/examples/fluids/qfunctions/stg_shur14_type.h +++ b/examples/fluids/qfunctions/stg_shur14_type.h @@ -9,6 +9,7 @@ #define stg_shur14_type_h #include + #include "newtonian_types.h" /* Access data arrays via: @@ -16,34 +17,34 @@ * CeedScalar *eps = &ctx->data[ctx->offsets.eps]; */ typedef struct STGShur14Context_ *STGShur14Context; struct STGShur14Context_ { - CeedInt nmodes; // !< Number of wavemodes - CeedInt nprofs; // !< Number of profile points in STGInflow.dat - CeedInt nynodes; // !< Number of mesh nodes in the y direction - CeedScalar alpha; // !< Geometric growth rate of kappa - CeedScalar u0; // !< Convective velocity - CeedScalar time; // !< Solution time - CeedScalar P0; // !< Inlet pressure - CeedScalar theta0; // !< Inlet temperature - bool is_implicit; // !< Whether using implicit time integration - bool mean_only; // !< Only apply the mean profile - CeedScalar dx; // !< dx used for h calculation - CeedScalar dz; // !< dz used for h calculation - bool prescribe_T; // !< Prescribe temperature weakly - bool use_fluctuating_IC; // !< Only apply the mean profile + CeedInt nmodes; // !< Number of wavemodes + CeedInt nprofs; // !< Number of profile points in STGInflow.dat + CeedInt nynodes; // !< Number of mesh nodes in the y direction + CeedScalar alpha; // !< Geometric growth rate of kappa + CeedScalar u0; // !< Convective velocity + CeedScalar time; // !< Solution time + CeedScalar P0; // !< Inlet pressure + CeedScalar theta0; // !< Inlet temperature + bool is_implicit; // !< Whether using implicit time integration + bool mean_only; // !< Only apply the mean profile + CeedScalar dx; // !< dx used for h calculation + CeedScalar dz; // !< dz used for h calculation + bool prescribe_T; // !< Prescribe temperature 
weakly + bool use_fluctuating_IC; // !< Only apply the mean profile struct NewtonianIdealGasContext_ newtonian_ctx; struct { - size_t sigma, d, phi; // !< Random number set, [nmodes,3], [nmodes,3], [nmodes] - size_t kappa; // !< Wavemode frequencies in increasing order, [nmodes] - size_t wall_dist; // !< Distance to wall for Inflow Profie, [nprof] - size_t ubar; // !< Mean velocity, [nprof, 3] - size_t cij; // !< Cholesky decomposition [nprof, 6] - size_t eps; // !< Turbulent Disspation [nprof, 6] - size_t lt; // !< Tubulent Length Scale [nprof, 6] - size_t ynodes; // !< Locations of nodes in y direction [nynodes] - } offsets; // !< Holds offsets for each array in data - size_t total_bytes; // !< Total size of struct plus array - CeedScalar data[1]; // !< Holds concatenated scalar array data + size_t sigma, d, phi; // !< Random number set, [nmodes,3], [nmodes,3], [nmodes] + size_t kappa; // !< Wavemode frequencies in increasing order, [nmodes] + size_t wall_dist; // !< Distance to wall for Inflow Profie, [nprof] + size_t ubar; // !< Mean velocity, [nprof, 3] + size_t cij; // !< Cholesky decomposition [nprof, 6] + size_t eps; // !< Turbulent Disspation [nprof, 6] + size_t lt; // !< Tubulent Length Scale [nprof, 6] + size_t ynodes; // !< Locations of nodes in y direction [nynodes] + } offsets; // !< Holds offsets for each array in data + size_t total_bytes; // !< Total size of struct plus array + CeedScalar data[1]; // !< Holds concatenated scalar array data }; #endif diff --git a/examples/fluids/qfunctions/utils.h b/examples/fluids/qfunctions/utils.h index de8efa7afe..dc3752d04d 100644 --- a/examples/fluids/qfunctions/utils.h +++ b/examples/fluids/qfunctions/utils.h @@ -12,30 +12,27 @@ #include #ifndef M_PI -#define M_PI 3.14159265358979323846 +#define M_PI 3.14159265358979323846 #endif CEED_QFUNCTION_HELPER CeedScalar Max(CeedScalar a, CeedScalar b) { return a < b ? b : a; } CEED_QFUNCTION_HELPER CeedScalar Min(CeedScalar a, CeedScalar b) { return a < b ? 
a : b; } -CEED_QFUNCTION_HELPER CeedScalar Square(CeedScalar x) { return x*x; } -CEED_QFUNCTION_HELPER CeedScalar Cube(CeedScalar x) { return x*x*x; } +CEED_QFUNCTION_HELPER CeedScalar Square(CeedScalar x) { return x * x; } +CEED_QFUNCTION_HELPER CeedScalar Cube(CeedScalar x) { return x * x * x; } // @brief Dot product of 3 element vectors -CEED_QFUNCTION_HELPER CeedScalar Dot3(const CeedScalar u[3], - const CeedScalar v[3]) { - return u[0]*v[0] + u[1]*v[1] + u[2]*v[2]; -} +CEED_QFUNCTION_HELPER CeedScalar Dot3(const CeedScalar u[3], const CeedScalar v[3]) { return u[0] * v[0] + u[1] * v[1] + u[2] * v[2]; } // @brief Unpack Kelvin-Mandel notation symmetric tensor into full tensor CEED_QFUNCTION_HELPER void KMUnpack(const CeedScalar v[6], CeedScalar A[3][3]) { const CeedScalar weight = 1 / sqrt(2.); - A[0][0] = v[0]; - A[1][1] = v[1]; - A[2][2] = v[2]; + A[0][0] = v[0]; + A[1][1] = v[1]; + A[2][2] = v[2]; A[2][1] = A[1][2] = weight * v[3]; A[2][0] = A[0][2] = weight * v[4]; A[1][0] = A[0][1] = weight * v[5]; } -#endif // utils_h +#endif // utils_h diff --git a/examples/fluids/src/cloptions.c b/examples/fluids/src/cloptions.c index 5c584af3f3..d7ef52147a 100644 --- a/examples/fluids/src/cloptions.c +++ b/examples/fluids/src/cloptions.c @@ -12,133 +12,85 @@ // Register problems to be available on the command line PetscErrorCode RegisterProblems_NS(AppCtx app_ctx) { - - app_ctx->problems = NULL; PetscFunctionBeginUser; - PetscCall(PetscFunctionListAdd(&app_ctx->problems, "density_current", - NS_DENSITY_CURRENT)); - - PetscCall(PetscFunctionListAdd(&app_ctx->problems, "euler_vortex", - NS_EULER_VORTEX)); - - PetscCall(PetscFunctionListAdd(&app_ctx->problems, "shocktube", - NS_SHOCKTUBE)); - - PetscCall(PetscFunctionListAdd(&app_ctx->problems, "advection", - NS_ADVECTION)); - - PetscCall(PetscFunctionListAdd(&app_ctx->problems, "advection2d", - NS_ADVECTION2D)); - - PetscCall(PetscFunctionListAdd(&app_ctx->problems, "blasius", - NS_BLASIUS)); - - 
PetscCall(PetscFunctionListAdd(&app_ctx->problems, "channel", - NS_CHANNEL)); - - PetscCall(PetscFunctionListAdd(&app_ctx->problems, "newtonian_wave", - NS_NEWTONIAN_WAVE)); + PetscCall(PetscFunctionListAdd(&app_ctx->problems, "density_current", NS_DENSITY_CURRENT)); + PetscCall(PetscFunctionListAdd(&app_ctx->problems, "euler_vortex", NS_EULER_VORTEX)); + PetscCall(PetscFunctionListAdd(&app_ctx->problems, "shocktube", NS_SHOCKTUBE)); + PetscCall(PetscFunctionListAdd(&app_ctx->problems, "advection", NS_ADVECTION)); + PetscCall(PetscFunctionListAdd(&app_ctx->problems, "advection2d", NS_ADVECTION2D)); + PetscCall(PetscFunctionListAdd(&app_ctx->problems, "blasius", NS_BLASIUS)); + PetscCall(PetscFunctionListAdd(&app_ctx->problems, "channel", NS_CHANNEL)); + PetscCall(PetscFunctionListAdd(&app_ctx->problems, "newtonian_wave", NS_NEWTONIAN_WAVE)); PetscFunctionReturn(0); } // Process general command line options -PetscErrorCode ProcessCommandLineOptions(MPI_Comm comm, AppCtx app_ctx, - SimpleBC bc) { - +PetscErrorCode ProcessCommandLineOptions(MPI_Comm comm, AppCtx app_ctx, SimpleBC bc) { PetscBool ceed_flag = PETSC_FALSE; PetscBool problem_flag = PETSC_FALSE; PetscBool option_set = PETSC_FALSE; - PetscErrorCode ierr; + PetscFunctionBeginUser; - PetscOptionsBegin(comm, NULL, "Navier-Stokes in PETSc with libCEED", - NULL); + PetscOptionsBegin(comm, NULL, "Navier-Stokes in PETSc with libCEED", NULL); - ierr = PetscOptionsString("-ceed", "CEED resource specifier", - NULL, app_ctx->ceed_resource, app_ctx->ceed_resource, - sizeof(app_ctx->ceed_resource), &ceed_flag); CHKERRQ(ierr); + PetscCall(PetscOptionsString("-ceed", "CEED resource specifier", NULL, app_ctx->ceed_resource, app_ctx->ceed_resource, + sizeof(app_ctx->ceed_resource), &ceed_flag)); app_ctx->test_mode = PETSC_FALSE; - ierr = PetscOptionsBool("-test", "Run in test mode", - NULL, app_ctx->test_mode, &app_ctx->test_mode, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsBool("-test", "Run in test mode", NULL, 
app_ctx->test_mode, &app_ctx->test_mode, NULL)); app_ctx->test_tol = 1E-11; - ierr = PetscOptionsScalar("-compare_final_state_atol", - "Test absolute tolerance", - NULL, app_ctx->test_tol, &app_ctx->test_tol, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsScalar("-compare_final_state_atol", "Test absolute tolerance", NULL, app_ctx->test_tol, &app_ctx->test_tol, NULL)); - ierr = PetscOptionsString("-compare_final_state_filename", "Test filename", - NULL, app_ctx->file_path, app_ctx->file_path, - sizeof(app_ctx->file_path), NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsString("-compare_final_state_filename", "Test filename", NULL, app_ctx->file_path, app_ctx->file_path, + sizeof(app_ctx->file_path), NULL)); - ierr = PetscOptionsFList("-problem", "Problem to solve", NULL, - app_ctx->problems, - app_ctx->problem_name, app_ctx->problem_name, sizeof(app_ctx->problem_name), - &problem_flag); CHKERRQ(ierr); + PetscCall(PetscOptionsFList("-problem", "Problem to solve", NULL, app_ctx->problems, app_ctx->problem_name, app_ctx->problem_name, + sizeof(app_ctx->problem_name), &problem_flag)); app_ctx->viz_refine = 0; - ierr = PetscOptionsInt("-viz_refine", - "Regular refinement levels for visualization", - NULL, app_ctx->viz_refine, &app_ctx->viz_refine, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsInt("-viz_refine", "Regular refinement levels for visualization", NULL, app_ctx->viz_refine, &app_ctx->viz_refine, NULL)); app_ctx->output_freq = 10; - ierr = PetscOptionsInt("-output_freq", - "Frequency of output, in number of steps", - NULL, app_ctx->output_freq, &app_ctx->output_freq, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsInt("-output_freq", "Frequency of output, in number of steps", NULL, app_ctx->output_freq, &app_ctx->output_freq, NULL)); - PetscCall(PetscOptionsBool("-output_add_stepnum2bin", - "Add step number to the binary outputs", - NULL, app_ctx->add_stepnum2bin, &app_ctx->add_stepnum2bin, NULL)); + PetscCall(PetscOptionsBool("-output_add_stepnum2bin", "Add step 
number to the binary outputs", NULL, app_ctx->add_stepnum2bin, + &app_ctx->add_stepnum2bin, NULL)); PetscCall(PetscStrncpy(app_ctx->output_dir, ".", 2)); - PetscCall(PetscOptionsString("-output_dir", "Output directory", - NULL, app_ctx->output_dir, app_ctx->output_dir, - sizeof(app_ctx->output_dir), NULL)); + PetscCall(PetscOptionsString("-output_dir", "Output directory", NULL, app_ctx->output_dir, app_ctx->output_dir, sizeof(app_ctx->output_dir), NULL)); app_ctx->cont_steps = 0; - ierr = PetscOptionsInt("-continue", "Continue from previous solution", - NULL, app_ctx->cont_steps, &app_ctx->cont_steps, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsInt("-continue", "Continue from previous solution", NULL, app_ctx->cont_steps, &app_ctx->cont_steps, NULL)); PetscCall(PetscStrcpy(app_ctx->cont_file, "[output_dir]/ns-solution.bin")); - PetscCall(PetscOptionsString("-continue_filename", - "Filename to get initial condition from", - NULL, app_ctx->cont_file, app_ctx->cont_file, + PetscCall(PetscOptionsString("-continue_filename", "Filename to get initial condition from", NULL, app_ctx->cont_file, app_ctx->cont_file, sizeof(app_ctx->cont_file), &option_set)); - if(!option_set) PetscCall(PetscSNPrintf(app_ctx->cont_file, - sizeof app_ctx->cont_file, "%s/ns-solution.bin", - app_ctx->output_dir)); + if (!option_set) PetscCall(PetscSNPrintf(app_ctx->cont_file, sizeof app_ctx->cont_file, "%s/ns-solution.bin", app_ctx->output_dir)); PetscCall(PetscStrcpy(app_ctx->cont_time_file, "[output_dir]/ns-time.bin")); - PetscCall(PetscOptionsString("-continue_time_filename", - "Filename to get initial condition time from", - NULL, app_ctx->cont_time_file, app_ctx->cont_time_file, - sizeof(app_ctx->cont_time_file), &option_set)); - if(!option_set) PetscCall(PetscSNPrintf(app_ctx->cont_time_file, - sizeof app_ctx->cont_time_file, "%s/ns-time.bin", - app_ctx->output_dir)); + PetscCall(PetscOptionsString("-continue_time_filename", "Filename to get initial condition time from", NULL, 
app_ctx->cont_time_file, + app_ctx->cont_time_file, sizeof(app_ctx->cont_time_file), &option_set)); + if (!option_set) PetscCall(PetscSNPrintf(app_ctx->cont_time_file, sizeof app_ctx->cont_time_file, "%s/ns-time.bin", app_ctx->output_dir)); app_ctx->degree = 1; - ierr = PetscOptionsInt("-degree", "Polynomial degree of finite elements", - NULL, app_ctx->degree, &app_ctx->degree, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsInt("-degree", "Polynomial degree of finite elements", NULL, app_ctx->degree, &app_ctx->degree, NULL)); app_ctx->q_extra = 2; - ierr = PetscOptionsInt("-q_extra", "Number of extra quadrature points", - NULL, app_ctx->q_extra, &app_ctx->q_extra, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsInt("-q_extra", "Number of extra quadrature points", NULL, app_ctx->q_extra, &app_ctx->q_extra, NULL)); { PetscBool option_set; - char amat_type[256] = ""; - PetscCall(PetscOptionsFList("-amat_type", - "Set the type of Amat distinct from Pmat (-dm_mat_type)", - NULL, MatList, amat_type, amat_type, sizeof(amat_type), &option_set)); - if (option_set) PetscCall(PetscStrallocpy(amat_type, - (char **)&app_ctx->amat_type)); + char amat_type[256] = ""; + PetscCall(PetscOptionsFList("-amat_type", "Set the type of Amat distinct from Pmat (-dm_mat_type)", NULL, MatList, amat_type, amat_type, + sizeof(amat_type), &option_set)); + if (option_set) PetscCall(PetscStrallocpy(amat_type, (char **)&app_ctx->amat_type)); } - PetscCall(PetscOptionsBool("-pmat_pbdiagonal", - "Assemble only point-block diagonal for Pmat", NULL, app_ctx->pmat_pbdiagonal, + PetscCall(PetscOptionsBool("-pmat_pbdiagonal", "Assemble only point-block diagonal for Pmat", NULL, app_ctx->pmat_pbdiagonal, &app_ctx->pmat_pbdiagonal, NULL)); // Provide default ceed resource if not specified @@ -156,21 +108,15 @@ PetscErrorCode ProcessCommandLineOptions(MPI_Comm comm, AppCtx app_ctx, // Wall Boundary Conditions bc->num_wall = 16; PetscBool flg; - ierr = PetscOptionsIntArray("-bc_wall", - "Face IDs to apply wall 
BC", - NULL, bc->walls, &bc->num_wall, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsIntArray("-bc_wall", "Face IDs to apply wall BC", NULL, bc->walls, &bc->num_wall, NULL)); bc->num_comps = 5; - ierr = PetscOptionsIntArray("-wall_comps", - "An array of constrained component numbers", - NULL, bc->wall_comps, &bc->num_comps, &flg); CHKERRQ(ierr); + PetscCall(PetscOptionsIntArray("-wall_comps", "An array of constrained component numbers", NULL, bc->wall_comps, &bc->num_comps, &flg)); // Slip Boundary Conditions - for (PetscInt j=0; j<3; j++) { + for (PetscInt j = 0; j < 3; j++) { bc->num_slip[j] = 16; - PetscBool flg; + PetscBool flg; const char *flags[3] = {"-bc_slip_x", "-bc_slip_y", "-bc_slip_z"}; - ierr = PetscOptionsIntArray(flags[j], - "Face IDs to apply slip BC", - NULL, bc->slips[j], &bc->num_slip[j], &flg); CHKERRQ(ierr); + PetscCall(PetscOptionsIntArray(flags[j], "Face IDs to apply slip BC", NULL, bc->slips[j], &bc->num_slip[j], &flg)); if (flg) bc->user_bc = PETSC_TRUE; } @@ -180,25 +126,17 @@ PetscErrorCode ProcessCommandLineOptions(MPI_Comm comm, AppCtx app_ctx, for (PetscInt s = 0; s < bc->num_slip[c]; s++) for (PetscInt w = 0; w < bc->num_wall; w++) if (bc->slips[c][s] == bc->walls[w]) - SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, - "Boundary condition already set on face %" PetscInt_FMT "!\n", - bc->walls[w]); + SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Boundary condition already set on face %" PetscInt_FMT "!\n", bc->walls[w]); // Inflow BCs bc->num_inflow = 16; - ierr = PetscOptionsIntArray("-bc_inflow", - "Face IDs to apply inflow BC", - NULL, bc->inflows, &bc->num_inflow, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsIntArray("-bc_inflow", "Face IDs to apply inflow BC", NULL, bc->inflows, &bc->num_inflow, NULL)); // Outflow BCs bc->num_outflow = 16; - ierr = PetscOptionsIntArray("-bc_outflow", - "Face IDs to apply outflow BC", - NULL, bc->outflows, &bc->num_outflow, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsIntArray("-bc_outflow", "Face 
IDs to apply outflow BC", NULL, bc->outflows, &bc->num_outflow, NULL)); // Freestream BCs bc->num_freestream = 16; - ierr = PetscOptionsIntArray("-bc_freestream", - "Face IDs to apply freestream BC", - NULL, bc->freestreams, &bc->num_freestream, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsIntArray("-bc_freestream", "Face IDs to apply freestream BC", NULL, bc->freestreams, &bc->num_freestream, NULL)); PetscOptionsEnd(); diff --git a/examples/fluids/src/dirichlet.c b/examples/fluids/src/dirichlet.c index dfe77c6543..13833febe4 100644 --- a/examples/fluids/src/dirichlet.c +++ b/examples/fluids/src/dirichlet.c @@ -6,20 +6,17 @@ // This file is part of CEED: http://github.com/ceed #include "../navierstokes.h" -#include "../qfunctions/dirichlet_boundary.h" #include "../problems/stg_shur14.h" +#include "../qfunctions/dirichlet_boundary.h" -PetscErrorCode SetupStrongSTG_Ceed(Ceed ceed, CeedData ceed_data, DM dm, - AppCtx app_ctx, ProblemData *problem, SimpleBC bc, Physics phys, +PetscErrorCode SetupStrongSTG_Ceed(Ceed ceed, CeedData ceed_data, DM dm, AppCtx app_ctx, ProblemData *problem, SimpleBC bc, Physics phys, CeedInt Q_sur, CeedInt q_data_size_sur, CeedOperator op_dirichlet) { - CeedInt num_comp_x=problem->dim, num_comp_q = 5, num_elem, - elem_size, stg_data_size=1; - CeedVector multiplicity, x_stored, scale_stored, q_data_sur, stg_data; - CeedBasis basis_x_to_q_sur; - CeedElemRestriction elem_restr_x_sur, elem_restr_q_sur, elem_restr_x_stored, - elem_restr_scale, elem_restr_qd_sur, elem_restr_stgdata; - CeedQFunction qf_setup, qf_strongbc, qf_stgdata; - CeedOperator op_setup, op_dirichlet_sub, op_setup_sur, op_stgdata; + CeedInt num_comp_x = problem->dim, num_comp_q = 5, num_elem, elem_size, stg_data_size = 1; + CeedVector multiplicity, x_stored, scale_stored, q_data_sur, stg_data; + CeedBasis basis_x_to_q_sur; + CeedElemRestriction elem_restr_x_sur, elem_restr_q_sur, elem_restr_x_stored, elem_restr_scale, elem_restr_qd_sur, elem_restr_stgdata; + CeedQFunction 
qf_setup, qf_strongbc, qf_stgdata; + CeedOperator op_setup, op_dirichlet_sub, op_setup_sur, op_stgdata; PetscFunctionBeginUser; DMLabel domain_label; @@ -27,105 +24,77 @@ PetscErrorCode SetupStrongSTG_Ceed(Ceed ceed, CeedData ceed_data, DM dm, // Basis CeedInt height = 1; - PetscCall(CeedBasisCreateProjection(ceed_data->basis_x_sur, - ceed_data->basis_q_sur, &basis_x_to_q_sur)); + PetscCall(CeedBasisCreateProjection(ceed_data->basis_x_sur, ceed_data->basis_q_sur, &basis_x_to_q_sur)); // Setup QFunction - CeedQFunctionCreateInterior(ceed, 1, SetupDirichletBC, SetupDirichletBC_loc, - &qf_setup); + CeedQFunctionCreateInterior(ceed, 1, SetupDirichletBC, SetupDirichletBC_loc, &qf_setup); CeedQFunctionAddInput(qf_setup, "x", num_comp_x, CEED_EVAL_INTERP); CeedQFunctionAddInput(qf_setup, "multiplicity", num_comp_q, CEED_EVAL_NONE); CeedQFunctionAddOutput(qf_setup, "x stored", num_comp_x, CEED_EVAL_NONE); CeedQFunctionAddOutput(qf_setup, "scale", 1, CEED_EVAL_NONE); // Setup STG Setup QFunction - PetscCall(SetupStrongSTG_PreProcessing(ceed, problem, num_comp_x, stg_data_size, - q_data_size_sur, &qf_stgdata)); + PetscCall(SetupStrongSTG_PreProcessing(ceed, problem, num_comp_x, stg_data_size, q_data_size_sur, &qf_stgdata)); // Compute contribution on each boundary face - for (CeedInt i=0; i < bc->num_inflow; i++) { + for (CeedInt i = 0; i < bc->num_inflow; i++) { // -- Restrictions - PetscCall(GetRestrictionForDomain(ceed, dm, height, domain_label, - bc->inflows[i], - Q_sur, q_data_size_sur, &elem_restr_q_sur, &elem_restr_x_sur, + PetscCall(GetRestrictionForDomain(ceed, dm, height, domain_label, bc->inflows[i], Q_sur, q_data_size_sur, &elem_restr_q_sur, &elem_restr_x_sur, &elem_restr_qd_sur)); CeedElemRestrictionCreateVector(elem_restr_q_sur, &multiplicity, NULL); CeedElemRestrictionGetMultiplicity(elem_restr_q_sur, multiplicity); CeedElemRestrictionGetNumElements(elem_restr_q_sur, &num_elem); CeedElemRestrictionGetElementSize(elem_restr_q_sur, &elem_size); - 
CeedElemRestrictionCreateStrided(ceed, num_elem, elem_size, num_comp_x, - num_elem * elem_size * num_comp_x, - CEED_STRIDES_BACKEND, &elem_restr_x_stored); + CeedElemRestrictionCreateStrided(ceed, num_elem, elem_size, num_comp_x, num_elem * elem_size * num_comp_x, CEED_STRIDES_BACKEND, + &elem_restr_x_stored); CeedElemRestrictionCreateVector(elem_restr_x_stored, &x_stored, NULL); - CeedElemRestrictionCreateStrided(ceed, num_elem, elem_size, 1, - num_elem * elem_size, - CEED_STRIDES_BACKEND, &elem_restr_scale); + CeedElemRestrictionCreateStrided(ceed, num_elem, elem_size, 1, num_elem * elem_size, CEED_STRIDES_BACKEND, &elem_restr_scale); CeedElemRestrictionCreateVector(elem_restr_scale, &scale_stored, NULL); - CeedElemRestrictionCreateStrided(ceed, num_elem, elem_size, stg_data_size, - num_elem * elem_size, - CEED_STRIDES_BACKEND, &elem_restr_stgdata); + CeedElemRestrictionCreateStrided(ceed, num_elem, elem_size, stg_data_size, num_elem * elem_size, CEED_STRIDES_BACKEND, &elem_restr_stgdata); CeedElemRestrictionCreateVector(elem_restr_stgdata, &stg_data, NULL); - CeedVectorCreate(ceed, q_data_size_sur*num_elem*elem_size, &q_data_sur); + CeedVectorCreate(ceed, q_data_size_sur * num_elem * elem_size, &q_data_sur); // -- Setup Operator CeedOperatorCreate(ceed, qf_setup, NULL, NULL, &op_setup); CeedOperatorSetName(op_setup, "surface geometric data"); - CeedOperatorSetField(op_setup, "x", elem_restr_x_sur, basis_x_to_q_sur, - CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op_setup, "multiplicity", elem_restr_q_sur, - CEED_BASIS_COLLOCATED, multiplicity); - CeedOperatorSetField(op_setup, "x stored", elem_restr_x_stored, - CEED_BASIS_COLLOCATED, x_stored); - CeedOperatorSetField(op_setup, "scale", elem_restr_scale, CEED_BASIS_COLLOCATED, - scale_stored); + CeedOperatorSetField(op_setup, "x", elem_restr_x_sur, basis_x_to_q_sur, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup, "multiplicity", elem_restr_q_sur, CEED_BASIS_COLLOCATED, multiplicity); + 
CeedOperatorSetField(op_setup, "x stored", elem_restr_x_stored, CEED_BASIS_COLLOCATED, x_stored); + CeedOperatorSetField(op_setup, "scale", elem_restr_scale, CEED_BASIS_COLLOCATED, scale_stored); // -- Compute geometric factors - CeedOperatorApply(op_setup, ceed_data->x_coord, CEED_VECTOR_NONE, - CEED_REQUEST_IMMEDIATE); + CeedOperatorApply(op_setup, ceed_data->x_coord, CEED_VECTOR_NONE, CEED_REQUEST_IMMEDIATE); // -- Compute QData for the surface CeedOperatorCreate(ceed, ceed_data->qf_setup_sur, NULL, NULL, &op_setup_sur); - CeedOperatorSetField(op_setup_sur, "dx", elem_restr_x_sur, - ceed_data->basis_xc_sur, CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op_setup_sur, "weight", CEED_ELEMRESTRICTION_NONE, - ceed_data->basis_xc_sur, CEED_VECTOR_NONE); - CeedOperatorSetField(op_setup_sur, "surface qdata", elem_restr_qd_sur, - CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup_sur, "dx", elem_restr_x_sur, ceed_data->basis_xc_sur, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup_sur, "weight", CEED_ELEMRESTRICTION_NONE, ceed_data->basis_xc_sur, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup_sur, "surface qdata", elem_restr_qd_sur, CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE); - CeedOperatorApply(op_setup_sur, ceed_data->x_coord, q_data_sur, - CEED_REQUEST_IMMEDIATE); + CeedOperatorApply(op_setup_sur, ceed_data->x_coord, q_data_sur, CEED_REQUEST_IMMEDIATE); // -- Compute STGData CeedOperatorCreate(ceed, qf_stgdata, NULL, NULL, &op_stgdata); - CeedOperatorSetField(op_stgdata, "surface qdata", elem_restr_qd_sur, - CEED_BASIS_COLLOCATED, q_data_sur); - CeedOperatorSetField(op_stgdata, "x", elem_restr_x_stored, - CEED_BASIS_COLLOCATED, x_stored); - CeedOperatorSetField(op_stgdata, "stg data", elem_restr_stgdata, - CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_stgdata, "surface qdata", elem_restr_qd_sur, CEED_BASIS_COLLOCATED, q_data_sur); + CeedOperatorSetField(op_stgdata, "x", elem_restr_x_stored, CEED_BASIS_COLLOCATED, 
x_stored); + CeedOperatorSetField(op_stgdata, "stg data", elem_restr_stgdata, CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE); CeedOperatorSetNumQuadraturePoints(op_stgdata, elem_size); CeedOperatorApply(op_stgdata, NULL, stg_data, CEED_REQUEST_IMMEDIATE); // -- Setup BC QFunctions - SetupStrongSTG_QF(ceed, problem, num_comp_x, num_comp_q, stg_data_size, - q_data_size_sur, &qf_strongbc); + SetupStrongSTG_QF(ceed, problem, num_comp_x, num_comp_q, stg_data_size, q_data_size_sur, &qf_strongbc); CeedOperatorCreate(ceed, qf_strongbc, NULL, NULL, &op_dirichlet_sub); CeedOperatorSetName(op_dirichlet_sub, "Strong STG"); - CeedOperatorSetField(op_dirichlet_sub, "surface qdata", elem_restr_qd_sur, - CEED_BASIS_COLLOCATED, q_data_sur); - CeedOperatorSetField(op_dirichlet_sub, "x", elem_restr_x_stored, - CEED_BASIS_COLLOCATED, x_stored); - CeedOperatorSetField(op_dirichlet_sub, "scale", elem_restr_scale, - CEED_BASIS_COLLOCATED, scale_stored); - CeedOperatorSetField(op_dirichlet_sub, "stg data", elem_restr_stgdata, - CEED_BASIS_COLLOCATED, stg_data); - CeedOperatorSetField(op_dirichlet_sub, "q", elem_restr_q_sur, - CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_dirichlet_sub, "surface qdata", elem_restr_qd_sur, CEED_BASIS_COLLOCATED, q_data_sur); + CeedOperatorSetField(op_dirichlet_sub, "x", elem_restr_x_stored, CEED_BASIS_COLLOCATED, x_stored); + CeedOperatorSetField(op_dirichlet_sub, "scale", elem_restr_scale, CEED_BASIS_COLLOCATED, scale_stored); + CeedOperatorSetField(op_dirichlet_sub, "stg data", elem_restr_stgdata, CEED_BASIS_COLLOCATED, stg_data); + CeedOperatorSetField(op_dirichlet_sub, "q", elem_restr_q_sur, CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE); CeedOperatorSetNumQuadraturePoints(op_dirichlet_sub, elem_size); // -- Add to composite operator @@ -150,8 +119,7 @@ PetscErrorCode SetupStrongSTG_Ceed(Ceed ceed, CeedData ceed_data, DM dm, CeedOperatorDestroy(&op_stgdata); } - CeedOperatorContextGetFieldLabel(op_dirichlet, "solution time", - 
&phys->stg_solution_time_label); + CeedOperatorContextGetFieldLabel(op_dirichlet, "solution time", &phys->stg_solution_time_label); CeedBasisDestroy(&basis_x_to_q_sur); CeedQFunctionDestroy(&qf_setup); @@ -159,11 +127,10 @@ PetscErrorCode SetupStrongSTG_Ceed(Ceed ceed, CeedData ceed_data, DM dm, PetscFunctionReturn(0); } -PetscErrorCode DMPlexInsertBoundaryValues_StrongBCCeed(DM dm, - PetscBool insert_essential, Vec Q_loc, PetscReal time, Vec face_geom_FVM, - Vec cell_geom_FVM, Vec grad_FVM) { - Vec boundary_mask; - User user; +PetscErrorCode DMPlexInsertBoundaryValues_StrongBCCeed(DM dm, PetscBool insert_essential, Vec Q_loc, PetscReal time, Vec face_geom_FVM, + Vec cell_geom_FVM, Vec grad_FVM) { + Vec boundary_mask; + User user; PetscScalar *q; PetscMemType q_mem_type; PetscFunctionBeginUser; @@ -171,8 +138,7 @@ PetscErrorCode DMPlexInsertBoundaryValues_StrongBCCeed(DM dm, PetscCall(DMGetApplicationContext(dm, &user)); if (user->phys->stg_solution_time_label) { - CeedOperatorContextSetDouble(user->op_dirichlet, - user->phys->stg_solution_time_label, &time); + CeedOperatorContextSetDouble(user->op_dirichlet, user->phys->stg_solution_time_label, &time); } // Mask Dirichlet entries @@ -185,8 +151,7 @@ PetscErrorCode DMPlexInsertBoundaryValues_StrongBCCeed(DM dm, CeedVectorSetArray(user->q_ceed, MemTypeP2C(q_mem_type), CEED_USE_POINTER, q); // Apply libCEED operator - CeedOperatorApplyAdd(user->op_dirichlet, CEED_VECTOR_NONE, user->q_ceed, - CEED_REQUEST_IMMEDIATE); + CeedOperatorApplyAdd(user->op_dirichlet, CEED_VECTOR_NONE, user->q_ceed, CEED_REQUEST_IMMEDIATE); // Restore PETSc vectors CeedVectorTakeArray(user->q_ceed, MemTypeP2C(q_mem_type), NULL); @@ -195,9 +160,8 @@ PetscErrorCode DMPlexInsertBoundaryValues_StrongBCCeed(DM dm, PetscFunctionReturn(0); } -PetscErrorCode SetupStrongBC_Ceed(Ceed ceed, CeedData ceed_data, DM dm, - User user, AppCtx app_ctx, ProblemData *problem, - SimpleBC bc, CeedInt Q_sur, CeedInt q_data_size_sur) { +PetscErrorCode 
SetupStrongBC_Ceed(Ceed ceed, CeedData ceed_data, DM dm, User user, AppCtx app_ctx, ProblemData *problem, SimpleBC bc, CeedInt Q_sur, + CeedInt q_data_size_sur) { PetscFunctionBeginUser; { @@ -217,13 +181,11 @@ PetscErrorCode SetupStrongBC_Ceed(Ceed ceed, CeedData ceed_data, DM dm, PetscBool use_strongstg = PETSC_FALSE; PetscCall(PetscOptionsGetBool(NULL, NULL, "-stg_strong", &use_strongstg, NULL)); - if (use_strongstg) - PetscCall(SetupStrongSTG_Ceed(ceed, ceed_data, dm, app_ctx, problem, bc, - user->phys, Q_sur, q_data_size_sur, user->op_dirichlet)); + if (use_strongstg) { + PetscCall(SetupStrongSTG_Ceed(ceed, ceed_data, dm, app_ctx, problem, bc, user->phys, Q_sur, q_data_size_sur, user->op_dirichlet)); + } } - PetscCall(PetscObjectComposeFunction((PetscObject)dm, - "DMPlexInsertBoundaryValues_C", - DMPlexInsertBoundaryValues_StrongBCCeed)); + PetscCall(PetscObjectComposeFunction((PetscObject)dm, "DMPlexInsertBoundaryValues_C", DMPlexInsertBoundaryValues_StrongBCCeed)); PetscFunctionReturn(0); } diff --git a/examples/fluids/src/misc.c b/examples/fluids/src/misc.c index 083a8fae1b..d6c7b9cc1b 100644 --- a/examples/fluids/src/misc.c +++ b/examples/fluids/src/misc.c @@ -10,18 +10,13 @@ #include "../navierstokes.h" -PetscErrorCode ICs_FixMultiplicity(DM dm, CeedData ceed_data, User user, - Vec Q_loc, Vec Q, - CeedScalar time) { - PetscErrorCode ierr; +PetscErrorCode ICs_FixMultiplicity(DM dm, CeedData ceed_data, User user, Vec Q_loc, Vec Q, CeedScalar time) { PetscFunctionBeginUser; // --------------------------------------------------------------------------- // Update time for evaluation // --------------------------------------------------------------------------- - if (user->phys->ics_time_label) - CeedOperatorContextSetDouble(ceed_data->op_ics, user->phys->ics_time_label, - &time); + if (user->phys->ics_time_label) CeedOperatorContextSetDouble(ceed_data->op_ics, user->phys->ics_time_label, &time); // 
--------------------------------------------------------------------------- // ICs @@ -31,24 +26,21 @@ PetscErrorCode ICs_FixMultiplicity(DM dm, CeedData ceed_data, User user, CeedElemRestrictionCreateVector(ceed_data->elem_restr_q, &q0_ceed, NULL); // -- Place PETSc vector in CEED vector - CeedScalar *q0; + CeedScalar *q0; PetscMemType q0_mem_type; - ierr = VecGetArrayAndMemType(Q_loc, (PetscScalar **)&q0, &q0_mem_type); - CHKERRQ(ierr); + PetscCall(VecGetArrayAndMemType(Q_loc, (PetscScalar **)&q0, &q0_mem_type)); CeedVectorSetArray(q0_ceed, MemTypeP2C(q0_mem_type), CEED_USE_POINTER, q0); // -- Apply CEED Operator - CeedOperatorApply(ceed_data->op_ics, ceed_data->x_coord, q0_ceed, - CEED_REQUEST_IMMEDIATE); + CeedOperatorApply(ceed_data->op_ics, ceed_data->x_coord, q0_ceed, CEED_REQUEST_IMMEDIATE); // -- Restore vectors CeedVectorTakeArray(q0_ceed, MemTypeP2C(q0_mem_type), NULL); - ierr = VecRestoreArrayReadAndMemType(Q_loc, (const PetscScalar **)&q0); - CHKERRQ(ierr); + PetscCall(VecRestoreArrayReadAndMemType(Q_loc, (const PetscScalar **)&q0)); // -- Local-to-Global - ierr = VecZeroEntries(Q); CHKERRQ(ierr); - ierr = DMLocalToGlobal(dm, Q_loc, ADD_VALUES, Q); CHKERRQ(ierr); + PetscCall(VecZeroEntries(Q)); + PetscCall(DMLocalToGlobal(dm, Q_loc, ADD_VALUES, Q)); // --------------------------------------------------------------------------- // Fix multiplicity for output of ICs @@ -58,38 +50,33 @@ PetscErrorCode ICs_FixMultiplicity(DM dm, CeedData ceed_data, User user, CeedElemRestrictionCreateVector(ceed_data->elem_restr_q, &mult_vec, NULL); // -- Place PETSc vector in CEED vector - CeedScalar *mult; + CeedScalar *mult; PetscMemType m_mem_type; - Vec multiplicity_loc; - ierr = DMGetLocalVector(dm, &multiplicity_loc); CHKERRQ(ierr); - ierr = VecGetArrayAndMemType(multiplicity_loc, (PetscScalar **)&mult, - &m_mem_type); - CHKERRQ(ierr); + Vec multiplicity_loc; + PetscCall(DMGetLocalVector(dm, &multiplicity_loc)); + PetscCall(VecGetArrayAndMemType(multiplicity_loc, 
(PetscScalar **)&mult, &m_mem_type)); CeedVectorSetArray(mult_vec, MemTypeP2C(m_mem_type), CEED_USE_POINTER, mult); - CHKERRQ(ierr); // -- Get multiplicity CeedElemRestrictionGetMultiplicity(ceed_data->elem_restr_q, mult_vec); // -- Restore vectors CeedVectorTakeArray(mult_vec, MemTypeP2C(m_mem_type), NULL); - ierr = VecRestoreArrayReadAndMemType(multiplicity_loc, - (const PetscScalar **)&mult); CHKERRQ(ierr); + PetscCall(VecRestoreArrayReadAndMemType(multiplicity_loc, (const PetscScalar **)&mult)); // -- Local-to-Global Vec multiplicity; - ierr = DMGetGlobalVector(dm, &multiplicity); CHKERRQ(ierr); - ierr = VecZeroEntries(multiplicity); CHKERRQ(ierr); - ierr = DMLocalToGlobal(dm, multiplicity_loc, ADD_VALUES, multiplicity); - CHKERRQ(ierr); + PetscCall(DMGetGlobalVector(dm, &multiplicity)); + PetscCall(VecZeroEntries(multiplicity)); + PetscCall(DMLocalToGlobal(dm, multiplicity_loc, ADD_VALUES, multiplicity)); // -- Fix multiplicity - ierr = VecPointwiseDivide(Q, Q, multiplicity); CHKERRQ(ierr); - ierr = VecPointwiseDivide(Q_loc, Q_loc, multiplicity_loc); CHKERRQ(ierr); + PetscCall(VecPointwiseDivide(Q, Q, multiplicity)); + PetscCall(VecPointwiseDivide(Q_loc, Q_loc, multiplicity_loc)); // -- Restore vectors - ierr = DMRestoreLocalVector(dm, &multiplicity_loc); CHKERRQ(ierr); - ierr = DMRestoreGlobalVector(dm, &multiplicity); CHKERRQ(ierr); + PetscCall(DMRestoreLocalVector(dm, &multiplicity_loc)); + PetscCall(DMRestoreGlobalVector(dm, &multiplicity)); // Cleanup CeedVectorDestroy(&mult_vec); @@ -98,12 +85,9 @@ PetscErrorCode ICs_FixMultiplicity(DM dm, CeedData ceed_data, User user, PetscFunctionReturn(0); } -PetscErrorCode DMPlexInsertBoundaryValues_NS(DM dm, - PetscBool insert_essential, Vec Q_loc, PetscReal time, Vec face_geom_FVM, - Vec cell_geom_FVM, Vec grad_FVM) { - - Vec Qbc, boundary_mask; - PetscErrorCode ierr; +PetscErrorCode DMPlexInsertBoundaryValues_NS(DM dm, PetscBool insert_essential, Vec Q_loc, PetscReal time, Vec face_geom_FVM, Vec cell_geom_FVM, + 
Vec grad_FVM) { + Vec Qbc, boundary_mask; PetscFunctionBegin; // Mask (zero) Dirichlet entries @@ -111,112 +95,95 @@ PetscErrorCode DMPlexInsertBoundaryValues_NS(DM dm, PetscCall(VecPointwiseMult(Q_loc, Q_loc, boundary_mask)); PetscCall(DMRestoreNamedLocalVector(dm, "boundary mask", &boundary_mask)); - ierr = DMGetNamedLocalVector(dm, "Qbc", &Qbc); CHKERRQ(ierr); - ierr = VecAXPY(Q_loc, 1., Qbc); CHKERRQ(ierr); - ierr = DMRestoreNamedLocalVector(dm, "Qbc", &Qbc); CHKERRQ(ierr); + PetscCall(DMGetNamedLocalVector(dm, "Qbc", &Qbc)); + PetscCall(VecAXPY(Q_loc, 1., Qbc)); + PetscCall(DMRestoreNamedLocalVector(dm, "Qbc", &Qbc)); PetscFunctionReturn(0); } // Compare reference solution values with current test run for CI PetscErrorCode RegressionTests_NS(AppCtx app_ctx, Vec Q) { - - Vec Qref; - PetscViewer viewer; - PetscReal error, Qrefnorm; - PetscErrorCode ierr; + Vec Qref; + PetscViewer viewer; + PetscReal error, Qrefnorm; PetscFunctionBegin; // Read reference file - ierr = VecDuplicate(Q, &Qref); CHKERRQ(ierr); - ierr = PetscViewerBinaryOpen(PetscObjectComm((PetscObject)Q), - app_ctx->file_path, FILE_MODE_READ, - &viewer); CHKERRQ(ierr); - ierr = VecLoad(Qref, viewer); CHKERRQ(ierr); + PetscCall(VecDuplicate(Q, &Qref)); + PetscCall(PetscViewerBinaryOpen(PetscObjectComm((PetscObject)Q), app_ctx->file_path, FILE_MODE_READ, &viewer)); + PetscCall(VecLoad(Qref, viewer)); // Compute error with respect to reference solution - ierr = VecAXPY(Q, -1.0, Qref); CHKERRQ(ierr); - ierr = VecNorm(Qref, NORM_MAX, &Qrefnorm); CHKERRQ(ierr); - ierr = VecScale(Q, 1./Qrefnorm); CHKERRQ(ierr); - ierr = VecNorm(Q, NORM_MAX, &error); CHKERRQ(ierr); + PetscCall(VecAXPY(Q, -1.0, Qref)); + PetscCall(VecNorm(Qref, NORM_MAX, &Qrefnorm)); + PetscCall(VecScale(Q, 1. 
/ Qrefnorm)); + PetscCall(VecNorm(Q, NORM_MAX, &error)); // Check error if (error > app_ctx->test_tol) { - ierr = PetscPrintf(PETSC_COMM_WORLD, - "Test failed with error norm %g\n", - (double)error); CHKERRQ(ierr); + PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Test failed with error norm %g\n", (double)error)); } // Cleanup - ierr = PetscViewerDestroy(&viewer); CHKERRQ(ierr); - ierr = VecDestroy(&Qref); CHKERRQ(ierr); + PetscCall(PetscViewerDestroy(&viewer)); + PetscCall(VecDestroy(&Qref)); PetscFunctionReturn(0); } // Get error for problems with exact solutions -PetscErrorCode GetError_NS(CeedData ceed_data, DM dm, User user, Vec Q, - PetscScalar final_time) { - PetscInt loc_nodes; - Vec Q_exact, Q_exact_loc; - PetscReal rel_error, norm_error, norm_exact; - PetscErrorCode ierr; +PetscErrorCode GetError_NS(CeedData ceed_data, DM dm, User user, Vec Q, PetscScalar final_time) { + PetscInt loc_nodes; + Vec Q_exact, Q_exact_loc; + PetscReal rel_error, norm_error, norm_exact; PetscFunctionBegin; // Get exact solution at final time - ierr = DMCreateGlobalVector(dm, &Q_exact); CHKERRQ(ierr); - ierr = DMGetLocalVector(dm, &Q_exact_loc); CHKERRQ(ierr); - ierr = VecGetSize(Q_exact_loc, &loc_nodes); CHKERRQ(ierr); - ierr = ICs_FixMultiplicity(dm, ceed_data, user, Q_exact_loc, Q_exact, - final_time); - CHKERRQ(ierr); + PetscCall(DMCreateGlobalVector(dm, &Q_exact)); + PetscCall(DMGetLocalVector(dm, &Q_exact_loc)); + PetscCall(VecGetSize(Q_exact_loc, &loc_nodes)); + PetscCall(ICs_FixMultiplicity(dm, ceed_data, user, Q_exact_loc, Q_exact, final_time)); // Get |exact solution - obtained solution| - ierr = VecNorm(Q_exact, NORM_1, &norm_exact); CHKERRQ(ierr); - ierr = VecAXPY(Q, -1.0, Q_exact); CHKERRQ(ierr); - ierr = VecNorm(Q, NORM_1, &norm_error); CHKERRQ(ierr); + PetscCall(VecNorm(Q_exact, NORM_1, &norm_exact)); + PetscCall(VecAXPY(Q, -1.0, Q_exact)); + PetscCall(VecNorm(Q, NORM_1, &norm_error)); // Compute relative error rel_error = norm_error / norm_exact; // Output relative 
error - ierr = PetscPrintf(PETSC_COMM_WORLD, - "Relative Error: %g\n", - (double)rel_error); CHKERRQ(ierr); + PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Relative Error: %g\n", (double)rel_error)); // Cleanup - ierr = DMRestoreLocalVector(dm, &Q_exact_loc); CHKERRQ(ierr); - ierr = VecDestroy(&Q_exact); CHKERRQ(ierr); + PetscCall(DMRestoreLocalVector(dm, &Q_exact_loc)); + PetscCall(VecDestroy(&Q_exact)); PetscFunctionReturn(0); } // Post-processing -PetscErrorCode PostProcess_NS(TS ts, CeedData ceed_data, DM dm, - ProblemData *problem, User user, - Vec Q, PetscScalar final_time) { - PetscInt steps; - PetscErrorCode ierr; +PetscErrorCode PostProcess_NS(TS ts, CeedData ceed_data, DM dm, ProblemData *problem, User user, Vec Q, PetscScalar final_time) { + PetscInt steps; PetscFunctionBegin; // Print relative error if (problem->non_zero_time && !user->app_ctx->test_mode) { - ierr = GetError_NS(ceed_data, dm, user, Q, final_time); CHKERRQ(ierr); + PetscCall(GetError_NS(ceed_data, dm, user, Q, final_time)); } // Print final time and number of steps - ierr = TSGetStepNumber(ts, &steps); CHKERRQ(ierr); + PetscCall(TSGetStepNumber(ts, &steps)); if (!user->app_ctx->test_mode) { - ierr = PetscPrintf(PETSC_COMM_WORLD, - "Time integrator took %" PetscInt_FMT " time steps to reach final time %g\n", - steps, (double)final_time); CHKERRQ(ierr); + PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Time integrator took %" PetscInt_FMT " time steps to reach final time %g\n", steps, (double)final_time)); } // Output numerical values from command line - ierr = VecViewFromOptions(Q, NULL, "-vec_view"); CHKERRQ(ierr); + PetscCall(VecViewFromOptions(Q, NULL, "-vec_view")); // Compare reference solution values with current test run for CI if (user->app_ctx->test_mode) { - ierr = RegressionTests_NS(user->app_ctx, Q); CHKERRQ(ierr); + PetscCall(RegressionTests_NS(user->app_ctx, Q)); } PetscFunctionReturn(0); @@ -224,40 +191,34 @@ PetscErrorCode PostProcess_NS(TS ts, CeedData ceed_data, DM dm, // Gather 
initial Q values in case of continuation of simulation PetscErrorCode SetupICsFromBinary(MPI_Comm comm, AppCtx app_ctx, Vec Q) { + PetscViewer viewer; - PetscViewer viewer; - PetscErrorCode ierr; PetscFunctionBegin; // Read input - ierr = PetscViewerBinaryOpen(comm, app_ctx->cont_file, FILE_MODE_READ, &viewer); - CHKERRQ(ierr); + PetscCall(PetscViewerBinaryOpen(comm, app_ctx->cont_file, FILE_MODE_READ, &viewer)); // Load Q from existent solution - ierr = VecLoad(Q, viewer); CHKERRQ(ierr); + PetscCall(VecLoad(Q, viewer)); // Cleanup - ierr = PetscViewerDestroy(&viewer); CHKERRQ(ierr); + PetscCall(PetscViewerDestroy(&viewer)); PetscFunctionReturn(0); } // Record boundary values from initial condition PetscErrorCode SetBCsFromICs_NS(DM dm, Vec Q, Vec Q_loc) { - - Vec Qbc, boundary_mask; - PetscErrorCode ierr; + Vec Qbc, boundary_mask; PetscFunctionBegin; - ierr = DMGetNamedLocalVector(dm, "Qbc", &Qbc); CHKERRQ(ierr); - ierr = VecCopy(Q_loc, Qbc); CHKERRQ(ierr); - ierr = VecZeroEntries(Q_loc); CHKERRQ(ierr); - ierr = DMGlobalToLocal(dm, Q, INSERT_VALUES, Q_loc); CHKERRQ(ierr); - ierr = VecAXPY(Qbc, -1., Q_loc); CHKERRQ(ierr); - ierr = DMRestoreNamedLocalVector(dm, "Qbc", &Qbc); CHKERRQ(ierr); - ierr = PetscObjectComposeFunction((PetscObject)dm, - "DMPlexInsertBoundaryValues_C", DMPlexInsertBoundaryValues_NS); - CHKERRQ(ierr); + PetscCall(DMGetNamedLocalVector(dm, "Qbc", &Qbc)); + PetscCall(VecCopy(Q_loc, Qbc)); + PetscCall(VecZeroEntries(Q_loc)); + PetscCall(DMGlobalToLocal(dm, Q, INSERT_VALUES, Q_loc)); + PetscCall(VecAXPY(Qbc, -1., Q_loc)); + PetscCall(DMRestoreNamedLocalVector(dm, "Qbc", &Qbc)); + PetscCall(PetscObjectComposeFunction((PetscObject)dm, "DMPlexInsertBoundaryValues_C", DMPlexInsertBoundaryValues_NS)); PetscCall(DMGetNamedLocalVector(dm, "boundary mask", &boundary_mask)); PetscCall(DMGetGlobalVector(dm, &Q)); @@ -271,7 +232,6 @@ PetscErrorCode SetBCsFromICs_NS(DM dm, Vec Q, Vec Q_loc) { // Free a plain data context that was allocated using PETSc; 
returning libCEED error codes int FreeContextPetsc(void *data) { - if (PetscFree(data)) return CeedError(NULL, CEED_ERROR_ACCESS, - "PetscFree failed"); + if (PetscFree(data)) return CeedError(NULL, CEED_ERROR_ACCESS, "PetscFree failed"); return CEED_ERROR_SUCCESS; } diff --git a/examples/fluids/src/setupdm.c b/examples/fluids/src/setupdm.c index 5996d4f4da..893e9426ad 100644 --- a/examples/fluids/src/setupdm.c +++ b/examples/fluids/src/setupdm.c @@ -12,165 +12,133 @@ #include "../problems/stg_shur14.h" // Create mesh -PetscErrorCode CreateDM(MPI_Comm comm, ProblemData *problem, - MatType mat_type, VecType vec_type, - DM *dm) { - PetscErrorCode ierr; +PetscErrorCode CreateDM(MPI_Comm comm, ProblemData *problem, MatType mat_type, VecType vec_type, DM *dm) { PetscFunctionBeginUser; // Create DMPLEX - ierr = DMCreate(comm, dm); CHKERRQ(ierr); - ierr = DMSetType(*dm, DMPLEX); CHKERRQ(ierr); + PetscCall(DMCreate(comm, dm)); + PetscCall(DMSetType(*dm, DMPLEX)); { PetscBool skip = PETSC_TRUE; - PetscCall(PetscOptionsGetBool(NULL, NULL, "-dm_mat_preallocate_skip", &skip, - NULL)); + PetscCall(PetscOptionsGetBool(NULL, NULL, "-dm_mat_preallocate_skip", &skip, NULL)); PetscCall(DMSetMatrixPreallocateSkip(*dm, skip)); } - ierr = DMSetMatType(*dm, mat_type); CHKERRQ(ierr); - ierr = DMSetVecType(*dm, vec_type); CHKERRQ(ierr); + PetscCall(DMSetMatType(*dm, mat_type)); + PetscCall(DMSetVecType(*dm, vec_type)); // Set Tensor elements - ierr = PetscOptionsSetValue(NULL, "-dm_plex_simplex", "0"); CHKERRQ(ierr); - ierr = PetscOptionsSetValue(NULL, "-dm_sparse_localize", "0"); CHKERRQ(ierr); + PetscCall(PetscOptionsSetValue(NULL, "-dm_plex_simplex", "0")); + PetscCall(PetscOptionsSetValue(NULL, "-dm_sparse_localize", "0")); // Set CL options - ierr = DMSetFromOptions(*dm); CHKERRQ(ierr); - ierr = DMViewFromOptions(*dm, NULL, "-dm_view"); CHKERRQ(ierr); + PetscCall(DMSetFromOptions(*dm)); + PetscCall(DMViewFromOptions(*dm, NULL, "-dm_view")); PetscFunctionReturn(0); } // Setup DM 
-PetscErrorCode SetUpDM(DM dm, ProblemData *problem, PetscInt degree, - SimpleBC bc, Physics phys) { - PetscErrorCode ierr; +PetscErrorCode SetUpDM(DM dm, ProblemData *problem, PetscInt degree, SimpleBC bc, Physics phys) { PetscFunctionBeginUser; { // Configure the finite element space and boundary conditions PetscFE fe; PetscInt num_comp_q = 5; - DMLabel label; - ierr = PetscFECreateLagrange(PETSC_COMM_SELF, problem->dim, num_comp_q, - PETSC_FALSE, degree, PETSC_DECIDE, - &fe); CHKERRQ(ierr); - ierr = PetscObjectSetName((PetscObject)fe, "Q"); CHKERRQ(ierr); - ierr = DMAddField(dm, NULL,(PetscObject)fe); CHKERRQ(ierr); - ierr = DMCreateDS(dm); CHKERRQ(ierr); - ierr = DMGetLabel(dm, "Face Sets", &label); CHKERRQ(ierr); + DMLabel label; + PetscCall(PetscFECreateLagrange(PETSC_COMM_SELF, problem->dim, num_comp_q, PETSC_FALSE, degree, PETSC_DECIDE, &fe)); + PetscCall(PetscObjectSetName((PetscObject)fe, "Q")); + PetscCall(DMAddField(dm, NULL, (PetscObject)fe)); + PetscCall(DMCreateDS(dm)); + PetscCall(DMGetLabel(dm, "Face Sets", &label)); // Set wall BCs if (bc->num_wall > 0) { - ierr = DMAddBoundary(dm, DM_BC_ESSENTIAL, "wall", label, - bc->num_wall, bc->walls, 0, bc->num_comps, - bc->wall_comps, (void(*)(void))problem->bc, - NULL, problem->bc_ctx, NULL); CHKERRQ(ierr); + PetscCall(DMAddBoundary(dm, DM_BC_ESSENTIAL, "wall", label, bc->num_wall, bc->walls, 0, bc->num_comps, bc->wall_comps, + (void (*)(void))problem->bc, NULL, problem->bc_ctx, NULL)); } // Set slip BCs in the x direction if (bc->num_slip[0] > 0) { PetscInt comps[1] = {1}; - ierr = DMAddBoundary(dm, DM_BC_ESSENTIAL, "slipx", label, - bc->num_slip[0], bc->slips[0], 0, 1, comps, - (void(*)(void))NULL, NULL, problem->bc_ctx, NULL); CHKERRQ(ierr); + PetscCall(DMAddBoundary(dm, DM_BC_ESSENTIAL, "slipx", label, bc->num_slip[0], bc->slips[0], 0, 1, comps, (void (*)(void))NULL, NULL, + problem->bc_ctx, NULL)); } // Set slip BCs in the y direction if (bc->num_slip[1] > 0) { PetscInt comps[1] = {2}; - ierr = 
DMAddBoundary(dm, DM_BC_ESSENTIAL, "slipy", label, - bc->num_slip[1], bc->slips[1], 0, 1, comps, - (void(*)(void))NULL, NULL, problem->bc_ctx, NULL); CHKERRQ(ierr); + PetscCall(DMAddBoundary(dm, DM_BC_ESSENTIAL, "slipy", label, bc->num_slip[1], bc->slips[1], 0, 1, comps, (void (*)(void))NULL, NULL, + problem->bc_ctx, NULL)); } // Set slip BCs in the z direction if (bc->num_slip[2] > 0) { PetscInt comps[1] = {3}; - ierr = DMAddBoundary(dm, DM_BC_ESSENTIAL, "slipz", label, - bc->num_slip[2], bc->slips[2], 0, 1, comps, - (void(*)(void))NULL, NULL, problem->bc_ctx, NULL); CHKERRQ(ierr); + PetscCall(DMAddBoundary(dm, DM_BC_ESSENTIAL, "slipz", label, bc->num_slip[2], bc->slips[2], 0, 1, comps, (void (*)(void))NULL, NULL, + problem->bc_ctx, NULL)); } { PetscBool use_strongstg = PETSC_FALSE; - ierr = PetscOptionsGetBool(NULL, NULL, "-stg_strong", &use_strongstg, NULL); - CHKERRQ(ierr); + PetscCall(PetscOptionsGetBool(NULL, NULL, "-stg_strong", &use_strongstg, NULL)); if (use_strongstg) { - ierr = SetupStrongSTG(dm, bc, problem, phys); CHKERRQ(ierr); + PetscCall(SetupStrongSTG(dm, bc, problem, phys)); } } - ierr = DMPlexSetClosurePermutationTensor(dm, PETSC_DETERMINE, NULL); - CHKERRQ(ierr); - ierr = PetscFEDestroy(&fe); CHKERRQ(ierr); + PetscCall(DMPlexSetClosurePermutationTensor(dm, PETSC_DETERMINE, NULL)); + PetscCall(PetscFEDestroy(&fe)); } // Empty name for conserved field (because there is only one field) PetscSection section; - ierr = DMGetLocalSection(dm, §ion); CHKERRQ(ierr); - ierr = PetscSectionSetFieldName(section, 0, ""); CHKERRQ(ierr); + PetscCall(DMGetLocalSection(dm, §ion)); + PetscCall(PetscSectionSetFieldName(section, 0, "")); switch (phys->state_var) { - case STATEVAR_CONSERVATIVE: - ierr = PetscSectionSetComponentName(section, 0, 0, "Density"); - CHKERRQ(ierr); - ierr = PetscSectionSetComponentName(section, 0, 1, "Momentum X"); - CHKERRQ(ierr); - ierr = PetscSectionSetComponentName(section, 0, 2, "Momentum Y"); - CHKERRQ(ierr); - ierr = 
PetscSectionSetComponentName(section, 0, 3, "Momentum Z"); - CHKERRQ(ierr); - ierr = PetscSectionSetComponentName(section, 0, 4, "Energy Density"); - CHKERRQ(ierr); - break; + case STATEVAR_CONSERVATIVE: + PetscCall(PetscSectionSetComponentName(section, 0, 0, "Density")); + PetscCall(PetscSectionSetComponentName(section, 0, 1, "Momentum X")); + PetscCall(PetscSectionSetComponentName(section, 0, 2, "Momentum Y")); + PetscCall(PetscSectionSetComponentName(section, 0, 3, "Momentum Z")); + PetscCall(PetscSectionSetComponentName(section, 0, 4, "Energy Density")); + break; - case STATEVAR_PRIMITIVE: - ierr = PetscSectionSetComponentName(section, 0, 0, "Pressure"); - CHKERRQ(ierr); - ierr = PetscSectionSetComponentName(section, 0, 1, "Velocity X"); - CHKERRQ(ierr); - ierr = PetscSectionSetComponentName(section, 0, 2, "Velocity Y"); - CHKERRQ(ierr); - ierr = PetscSectionSetComponentName(section, 0, 3, "Velocity Z"); - CHKERRQ(ierr); - ierr = PetscSectionSetComponentName(section, 0, 4, "Temperature"); - CHKERRQ(ierr); - break; + case STATEVAR_PRIMITIVE: + PetscCall(PetscSectionSetComponentName(section, 0, 0, "Pressure")); + PetscCall(PetscSectionSetComponentName(section, 0, 1, "Velocity X")); + PetscCall(PetscSectionSetComponentName(section, 0, 2, "Velocity Y")); + PetscCall(PetscSectionSetComponentName(section, 0, 3, "Velocity Z")); + PetscCall(PetscSectionSetComponentName(section, 0, 4, "Temperature")); + break; } PetscFunctionReturn(0); } // Refine DM for high-order viz -PetscErrorCode VizRefineDM(DM dm, User user, ProblemData *problem, - SimpleBC bc, Physics phys) { - PetscErrorCode ierr; - DM dm_hierarchy[user->app_ctx->viz_refine + 1]; - VecType vec_type; +PetscErrorCode VizRefineDM(DM dm, User user, ProblemData *problem, SimpleBC bc, Physics phys) { + DM dm_hierarchy[user->app_ctx->viz_refine + 1]; + VecType vec_type; PetscFunctionBeginUser; - ierr = DMPlexSetRefinementUniform(dm, PETSC_TRUE); CHKERRQ(ierr); + PetscCall(DMPlexSetRefinementUniform(dm, PETSC_TRUE)); 
dm_hierarchy[0] = dm; - for (PetscInt i = 0, d = user->app_ctx->degree; - i < user->app_ctx->viz_refine; i++) { + for (PetscInt i = 0, d = user->app_ctx->degree; i < user->app_ctx->viz_refine; i++) { Mat interp_next; - ierr = DMRefine(dm_hierarchy[i], MPI_COMM_NULL, &dm_hierarchy[i+1]); - CHKERRQ(ierr); - ierr = DMClearDS(dm_hierarchy[i+1]); CHKERRQ(ierr); - ierr = DMClearFields(dm_hierarchy[i+1]); CHKERRQ(ierr); - ierr = DMSetCoarseDM(dm_hierarchy[i+1], dm_hierarchy[i]); CHKERRQ(ierr); + PetscCall(DMRefine(dm_hierarchy[i], MPI_COMM_NULL, &dm_hierarchy[i + 1])); + PetscCall(DMClearDS(dm_hierarchy[i + 1])); + PetscCall(DMClearFields(dm_hierarchy[i + 1])); + PetscCall(DMSetCoarseDM(dm_hierarchy[i + 1], dm_hierarchy[i])); d = (d + 1) / 2; if (i + 1 == user->app_ctx->viz_refine) d = 1; - ierr = DMGetVecType(dm, &vec_type); CHKERRQ(ierr); - ierr = DMSetVecType(dm_hierarchy[i+1], vec_type); CHKERRQ(ierr); - ierr = SetUpDM(dm_hierarchy[i+1], problem, d, bc, phys); - CHKERRQ(ierr); - ierr = DMCreateInterpolation(dm_hierarchy[i], dm_hierarchy[i+1], &interp_next, - NULL); CHKERRQ(ierr); + PetscCall(DMGetVecType(dm, &vec_type)); + PetscCall(DMSetVecType(dm_hierarchy[i + 1], vec_type)); + PetscCall(SetUpDM(dm_hierarchy[i + 1], problem, d, bc, phys)); + PetscCall(DMCreateInterpolation(dm_hierarchy[i], dm_hierarchy[i + 1], &interp_next, NULL)); if (!i) user->interp_viz = interp_next; else { Mat C; - ierr = MatMatMult(interp_next, user->interp_viz, MAT_INITIAL_MATRIX, - PETSC_DECIDE, &C); CHKERRQ(ierr); - ierr = MatDestroy(&interp_next); CHKERRQ(ierr); - ierr = MatDestroy(&user->interp_viz); CHKERRQ(ierr); + PetscCall(MatMatMult(interp_next, user->interp_viz, MAT_INITIAL_MATRIX, PETSC_DECIDE, &C)); + PetscCall(MatDestroy(&interp_next)); + PetscCall(MatDestroy(&user->interp_viz)); user->interp_viz = C; } } - for (PetscInt i=1; iapp_ctx->viz_refine; i++) { - ierr = DMDestroy(&dm_hierarchy[i]); CHKERRQ(ierr); + for (PetscInt i = 1; i < user->app_ctx->viz_refine; i++) { + 
PetscCall(DMDestroy(&dm_hierarchy[i])); } user->dm_viz = dm_hierarchy[user->app_ctx->viz_refine]; diff --git a/examples/fluids/src/setuplibceed.c b/examples/fluids/src/setuplibceed.c index 14e5655fd7..fb36866e7b 100644 --- a/examples/fluids/src/setuplibceed.c +++ b/examples/fluids/src/setuplibceed.c @@ -11,160 +11,113 @@ #include "../navierstokes.h" // Utility function - essential BC dofs are encoded in closure indices as -(i+1). -PetscInt Involute(PetscInt i) { - return i >= 0 ? i : -(i+1); -} +PetscInt Involute(PetscInt i) { return i >= 0 ? i : -(i + 1); } // Utility function to create local CEED restriction -PetscErrorCode CreateRestrictionFromPlex(Ceed ceed, DM dm, CeedInt height, - DMLabel domain_label, CeedInt value, CeedElemRestriction *elem_restr) { +PetscErrorCode CreateRestrictionFromPlex(Ceed ceed, DM dm, CeedInt height, DMLabel domain_label, CeedInt value, CeedElemRestriction *elem_restr) { PetscInt num_elem, elem_size, num_dof, num_comp, *elem_restr_offsets; - PetscErrorCode ierr; PetscFunctionBeginUser; - ierr = DMPlexGetLocalOffsets(dm, domain_label, value, height, 0, &num_elem, - &elem_size, &num_comp, &num_dof, &elem_restr_offsets); - CHKERRQ(ierr); + PetscCall(DMPlexGetLocalOffsets(dm, domain_label, value, height, 0, &num_elem, &elem_size, &num_comp, &num_dof, &elem_restr_offsets)); - CeedElemRestrictionCreate(ceed, num_elem, elem_size, num_comp, - 1, num_dof, CEED_MEM_HOST, CEED_COPY_VALUES, - elem_restr_offsets, elem_restr); - ierr = PetscFree(elem_restr_offsets); CHKERRQ(ierr); + CeedElemRestrictionCreate(ceed, num_elem, elem_size, num_comp, 1, num_dof, CEED_MEM_HOST, CEED_COPY_VALUES, elem_restr_offsets, elem_restr); + PetscCall(PetscFree(elem_restr_offsets)); PetscFunctionReturn(0); } // Utility function to get Ceed Restriction for each domain -PetscErrorCode GetRestrictionForDomain(Ceed ceed, DM dm, CeedInt height, - DMLabel domain_label, PetscInt value, - CeedInt Q, CeedInt q_data_size, - CeedElemRestriction *elem_restr_q, - 
CeedElemRestriction *elem_restr_x, - CeedElemRestriction *elem_restr_qd_i) { - DM dm_coord; - CeedInt dim, loc_num_elem; - CeedInt Q_dim; +PetscErrorCode GetRestrictionForDomain(Ceed ceed, DM dm, CeedInt height, DMLabel domain_label, PetscInt value, CeedInt Q, CeedInt q_data_size, + CeedElemRestriction *elem_restr_q, CeedElemRestriction *elem_restr_x, CeedElemRestriction *elem_restr_qd_i) { + DM dm_coord; + CeedInt dim, loc_num_elem; + CeedInt Q_dim; CeedElemRestriction elem_restr_tmp; - PetscErrorCode ierr; PetscFunctionBeginUser; - ierr = DMGetDimension(dm, &dim); CHKERRQ(ierr); + PetscCall(DMGetDimension(dm, &dim)); dim -= height; Q_dim = CeedIntPow(Q, dim); - ierr = CreateRestrictionFromPlex(ceed, dm, height, domain_label, value, - &elem_restr_tmp); - CHKERRQ(ierr); + PetscCall(CreateRestrictionFromPlex(ceed, dm, height, domain_label, value, &elem_restr_tmp)); if (elem_restr_q) *elem_restr_q = elem_restr_tmp; if (elem_restr_x) { - ierr = DMGetCellCoordinateDM(dm, &dm_coord); CHKERRQ(ierr); + PetscCall(DMGetCellCoordinateDM(dm, &dm_coord)); if (!dm_coord) { - ierr = DMGetCoordinateDM(dm, &dm_coord); CHKERRQ(ierr); + PetscCall(DMGetCoordinateDM(dm, &dm_coord)); } - ierr = DMPlexSetClosurePermutationTensor(dm_coord, PETSC_DETERMINE, NULL); - CHKERRQ(ierr); - ierr = CreateRestrictionFromPlex(ceed, dm_coord, height, domain_label, value, - elem_restr_x); - CHKERRQ(ierr); + PetscCall(DMPlexSetClosurePermutationTensor(dm_coord, PETSC_DETERMINE, NULL)); + PetscCall(CreateRestrictionFromPlex(ceed, dm_coord, height, domain_label, value, elem_restr_x)); } if (elem_restr_qd_i) { CeedElemRestrictionGetNumElements(elem_restr_tmp, &loc_num_elem); - CeedElemRestrictionCreateStrided(ceed, loc_num_elem, Q_dim, - q_data_size, q_data_size*loc_num_elem*Q_dim, - CEED_STRIDES_BACKEND, elem_restr_qd_i); + CeedElemRestrictionCreateStrided(ceed, loc_num_elem, Q_dim, q_data_size, q_data_size * loc_num_elem * Q_dim, CEED_STRIDES_BACKEND, + elem_restr_qd_i); } if (!elem_restr_q) 
CeedElemRestrictionDestroy(&elem_restr_tmp); PetscFunctionReturn(0); } -PetscErrorCode AddBCSubOperator(Ceed ceed, DM dm, CeedData ceed_data, - DMLabel domain_label, PetscInt value, - CeedInt height, CeedInt Q_sur, - CeedInt q_data_size_sur, CeedInt jac_data_size_sur, - CeedQFunction qf_apply_bc, CeedQFunction qf_apply_bc_jacobian, +PetscErrorCode AddBCSubOperator(Ceed ceed, DM dm, CeedData ceed_data, DMLabel domain_label, PetscInt value, CeedInt height, CeedInt Q_sur, + CeedInt q_data_size_sur, CeedInt jac_data_size_sur, CeedQFunction qf_apply_bc, CeedQFunction qf_apply_bc_jacobian, CeedOperator *op_apply, CeedOperator *op_apply_ijacobian) { CeedVector q_data_sur, jac_data_sur; - CeedOperator op_setup_sur, op_apply_bc, - op_apply_bc_jacobian = NULL; - CeedElemRestriction elem_restr_x_sur, elem_restr_q_sur, elem_restr_qd_i_sur, - elem_restr_jd_i_sur; - CeedInt num_qpts_sur; + CeedOperator op_setup_sur, op_apply_bc, op_apply_bc_jacobian = NULL; + CeedElemRestriction elem_restr_x_sur, elem_restr_q_sur, elem_restr_qd_i_sur, elem_restr_jd_i_sur; + CeedInt num_qpts_sur; PetscFunctionBeginUser; // --- Get number of quadrature points for the boundaries CeedBasisGetNumQuadraturePoints(ceed_data->basis_q_sur, &num_qpts_sur); - // ---- CEED Restriction - PetscCall(GetRestrictionForDomain(ceed, dm, height, domain_label, value, Q_sur, - q_data_size_sur, &elem_restr_q_sur, &elem_restr_x_sur, &elem_restr_qd_i_sur)); + PetscCall(GetRestrictionForDomain(ceed, dm, height, domain_label, value, Q_sur, q_data_size_sur, &elem_restr_q_sur, &elem_restr_x_sur, + &elem_restr_qd_i_sur)); if (jac_data_size_sur > 0) { // State-dependent data will be passed from residual to Jacobian. This will be collocated. 
- PetscCall(GetRestrictionForDomain(ceed, dm, height, domain_label, value, Q_sur, - jac_data_size_sur, NULL, NULL, &elem_restr_jd_i_sur)); + PetscCall(GetRestrictionForDomain(ceed, dm, height, domain_label, value, Q_sur, jac_data_size_sur, NULL, NULL, &elem_restr_jd_i_sur)); CeedElemRestrictionCreateVector(elem_restr_jd_i_sur, &jac_data_sur, NULL); } else { elem_restr_jd_i_sur = NULL; - jac_data_sur = NULL; + jac_data_sur = NULL; } // ---- CEED Vector PetscInt loc_num_elem_sur; CeedElemRestrictionGetNumElements(elem_restr_q_sur, &loc_num_elem_sur); - CeedVectorCreate(ceed, q_data_size_sur*loc_num_elem_sur*num_qpts_sur, - &q_data_sur); + CeedVectorCreate(ceed, q_data_size_sur * loc_num_elem_sur * num_qpts_sur, &q_data_sur); // ---- CEED Operator // ----- CEED Operator for Setup (geometric factors) CeedOperatorCreate(ceed, ceed_data->qf_setup_sur, NULL, NULL, &op_setup_sur); - CeedOperatorSetField(op_setup_sur, "dx", elem_restr_x_sur, - ceed_data->basis_x_sur, CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op_setup_sur, "weight", CEED_ELEMRESTRICTION_NONE, - ceed_data->basis_x_sur, CEED_VECTOR_NONE); - CeedOperatorSetField(op_setup_sur, "surface qdata", elem_restr_qd_i_sur, - CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup_sur, "dx", elem_restr_x_sur, ceed_data->basis_x_sur, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup_sur, "weight", CEED_ELEMRESTRICTION_NONE, ceed_data->basis_x_sur, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup_sur, "surface qdata", elem_restr_qd_i_sur, CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE); // ----- CEED Operator for Physics CeedOperatorCreate(ceed, qf_apply_bc, NULL, NULL, &op_apply_bc); - CeedOperatorSetField(op_apply_bc, "q", elem_restr_q_sur, ceed_data->basis_q_sur, - CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op_apply_bc, "Grad_q", elem_restr_q_sur, - ceed_data->basis_q_sur, CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op_apply_bc, "surface qdata", elem_restr_qd_i_sur, - CEED_BASIS_COLLOCATED, 
q_data_sur); - CeedOperatorSetField(op_apply_bc, "x", elem_restr_x_sur, ceed_data->basis_x_sur, - ceed_data->x_coord); - CeedOperatorSetField(op_apply_bc, "v", elem_restr_q_sur, ceed_data->basis_q_sur, - CEED_VECTOR_ACTIVE); - if (elem_restr_jd_i_sur) - CeedOperatorSetField(op_apply_bc, "surface jacobian data", - elem_restr_jd_i_sur, - CEED_BASIS_COLLOCATED, jac_data_sur); + CeedOperatorSetField(op_apply_bc, "q", elem_restr_q_sur, ceed_data->basis_q_sur, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_apply_bc, "Grad_q", elem_restr_q_sur, ceed_data->basis_q_sur, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_apply_bc, "surface qdata", elem_restr_qd_i_sur, CEED_BASIS_COLLOCATED, q_data_sur); + CeedOperatorSetField(op_apply_bc, "x", elem_restr_x_sur, ceed_data->basis_x_sur, ceed_data->x_coord); + CeedOperatorSetField(op_apply_bc, "v", elem_restr_q_sur, ceed_data->basis_q_sur, CEED_VECTOR_ACTIVE); + if (elem_restr_jd_i_sur) CeedOperatorSetField(op_apply_bc, "surface jacobian data", elem_restr_jd_i_sur, CEED_BASIS_COLLOCATED, jac_data_sur); if (qf_apply_bc_jacobian) { - CeedOperatorCreate(ceed, qf_apply_bc_jacobian, NULL, NULL, - &op_apply_bc_jacobian); - CeedOperatorSetField(op_apply_bc_jacobian, "dq", elem_restr_q_sur, - ceed_data->basis_q_sur, CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op_apply_bc_jacobian, "Grad_dq", elem_restr_q_sur, - ceed_data->basis_q_sur, CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op_apply_bc_jacobian, "surface qdata", elem_restr_qd_i_sur, - CEED_BASIS_COLLOCATED, q_data_sur); - CeedOperatorSetField(op_apply_bc_jacobian, "x", elem_restr_x_sur, - ceed_data->basis_x_sur, ceed_data->x_coord); - CeedOperatorSetField(op_apply_bc_jacobian, "surface jacobian data", - elem_restr_jd_i_sur, CEED_BASIS_COLLOCATED, jac_data_sur); - CeedOperatorSetField(op_apply_bc_jacobian, "v", elem_restr_q_sur, - ceed_data->basis_q_sur, CEED_VECTOR_ACTIVE); + CeedOperatorCreate(ceed, qf_apply_bc_jacobian, NULL, NULL, &op_apply_bc_jacobian); + 
CeedOperatorSetField(op_apply_bc_jacobian, "dq", elem_restr_q_sur, ceed_data->basis_q_sur, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_apply_bc_jacobian, "Grad_dq", elem_restr_q_sur, ceed_data->basis_q_sur, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_apply_bc_jacobian, "surface qdata", elem_restr_qd_i_sur, CEED_BASIS_COLLOCATED, q_data_sur); + CeedOperatorSetField(op_apply_bc_jacobian, "x", elem_restr_x_sur, ceed_data->basis_x_sur, ceed_data->x_coord); + CeedOperatorSetField(op_apply_bc_jacobian, "surface jacobian data", elem_restr_jd_i_sur, CEED_BASIS_COLLOCATED, jac_data_sur); + CeedOperatorSetField(op_apply_bc_jacobian, "v", elem_restr_q_sur, ceed_data->basis_q_sur, CEED_VECTOR_ACTIVE); } // ----- Apply CEED operator for Setup - CeedOperatorApply(op_setup_sur, ceed_data->x_coord, q_data_sur, - CEED_REQUEST_IMMEDIATE); + CeedOperatorApply(op_setup_sur, ceed_data->x_coord, q_data_sur, CEED_REQUEST_IMMEDIATE); // ----- Apply Sub-Operator for Physics CeedCompositeOperatorAddSub(*op_apply, op_apply_bc); - if (op_apply_bc_jacobian) - CeedCompositeOperatorAddSub(*op_apply_ijacobian, op_apply_bc_jacobian); + if (op_apply_bc_jacobian) CeedCompositeOperatorAddSub(*op_apply_ijacobian, op_apply_bc_jacobian); // ----- Cleanup CeedVectorDestroy(&q_data_sur); @@ -181,71 +134,53 @@ PetscErrorCode AddBCSubOperator(Ceed ceed, DM dm, CeedData ceed_data, } // Utility function to create CEED Composite Operator for the entire domain -PetscErrorCode CreateOperatorForDomain(Ceed ceed, DM dm, SimpleBC bc, - CeedData ceed_data, Physics phys, - CeedOperator op_apply_vol, - CeedOperator op_apply_ijacobian_vol, - CeedInt height, - CeedInt P_sur, CeedInt Q_sur, - CeedInt q_data_size_sur, CeedInt jac_data_size_sur, - CeedOperator *op_apply, CeedOperator *op_apply_ijacobian) { - DMLabel domain_label; - PetscErrorCode ierr; +PetscErrorCode CreateOperatorForDomain(Ceed ceed, DM dm, SimpleBC bc, CeedData ceed_data, Physics phys, CeedOperator op_apply_vol, + CeedOperator 
op_apply_ijacobian_vol, CeedInt height, CeedInt P_sur, CeedInt Q_sur, CeedInt q_data_size_sur, + CeedInt jac_data_size_sur, CeedOperator *op_apply, CeedOperator *op_apply_ijacobian) { + DMLabel domain_label; PetscFunctionBeginUser; // Create Composite Operaters CeedCompositeOperatorCreate(ceed, op_apply); - if (op_apply_ijacobian) - CeedCompositeOperatorCreate(ceed, op_apply_ijacobian); + if (op_apply_ijacobian) CeedCompositeOperatorCreate(ceed, op_apply_ijacobian); // --Apply Sub-Operator for the volume CeedCompositeOperatorAddSub(*op_apply, op_apply_vol); - if (op_apply_ijacobian) - CeedCompositeOperatorAddSub(*op_apply_ijacobian, op_apply_ijacobian_vol); + if (op_apply_ijacobian) CeedCompositeOperatorAddSub(*op_apply_ijacobian, op_apply_ijacobian_vol); // -- Create Sub-Operator for in/outflow BCs if (phys->has_neumann || 1) { // --- Setup - ierr = DMGetLabel(dm, "Face Sets", &domain_label); CHKERRQ(ierr); + PetscCall(DMGetLabel(dm, "Face Sets", &domain_label)); // --- Create Sub-Operator for inflow boundaries - for (CeedInt i=0; i < bc->num_inflow; i++) { - PetscCall(AddBCSubOperator(ceed, dm, ceed_data, domain_label, bc->inflows[i], - height, Q_sur, q_data_size_sur, jac_data_size_sur, - ceed_data->qf_apply_inflow, ceed_data->qf_apply_inflow_jacobian, - op_apply, op_apply_ijacobian)); + for (CeedInt i = 0; i < bc->num_inflow; i++) { + PetscCall(AddBCSubOperator(ceed, dm, ceed_data, domain_label, bc->inflows[i], height, Q_sur, q_data_size_sur, jac_data_size_sur, + ceed_data->qf_apply_inflow, ceed_data->qf_apply_inflow_jacobian, op_apply, op_apply_ijacobian)); } // --- Create Sub-Operator for outflow boundaries - for (CeedInt i=0; i < bc->num_outflow; i++) { - PetscCall(AddBCSubOperator(ceed, dm, ceed_data, domain_label, bc->outflows[i], - height, Q_sur, q_data_size_sur, jac_data_size_sur, - ceed_data->qf_apply_outflow, ceed_data->qf_apply_outflow_jacobian, - op_apply, op_apply_ijacobian)); + for (CeedInt i = 0; i < bc->num_outflow; i++) { + 
PetscCall(AddBCSubOperator(ceed, dm, ceed_data, domain_label, bc->outflows[i], height, Q_sur, q_data_size_sur, jac_data_size_sur, + ceed_data->qf_apply_outflow, ceed_data->qf_apply_outflow_jacobian, op_apply, op_apply_ijacobian)); } // --- Create Sub-Operator for freestream boundaries - for (CeedInt i=0; i < bc->num_freestream; i++) { - PetscCall(AddBCSubOperator(ceed, dm, ceed_data, domain_label, - bc->freestreams[i], height, Q_sur, q_data_size_sur, jac_data_size_sur, - ceed_data->qf_apply_freestream, ceed_data->qf_apply_freestream_jacobian, - op_apply, op_apply_ijacobian)); + for (CeedInt i = 0; i < bc->num_freestream; i++) { + PetscCall(AddBCSubOperator(ceed, dm, ceed_data, domain_label, bc->freestreams[i], height, Q_sur, q_data_size_sur, jac_data_size_sur, + ceed_data->qf_apply_freestream, ceed_data->qf_apply_freestream_jacobian, op_apply, op_apply_ijacobian)); } } // ----- Get Context Labels for Operator - CeedOperatorContextGetFieldLabel(*op_apply, "solution time", - &phys->solution_time_label); - CeedOperatorContextGetFieldLabel(*op_apply, "timestep size", - &phys->timestep_size_label); + CeedOperatorContextGetFieldLabel(*op_apply, "solution time", &phys->solution_time_label); + CeedOperatorContextGetFieldLabel(*op_apply, "timestep size", &phys->timestep_size_label); PetscFunctionReturn(0); } -PetscErrorCode SetupBCQFunctions(Ceed ceed, PetscInt dim_sur, - PetscInt num_comp_x, PetscInt num_comp_q, - PetscInt q_data_size_sur, PetscInt jac_data_size_sur, - ProblemQFunctionSpec apply_bc, ProblemQFunctionSpec apply_bc_jacobian, +PetscErrorCode SetupBCQFunctions(Ceed ceed, PetscInt dim_sur, PetscInt num_comp_x, PetscInt num_comp_q, PetscInt q_data_size_sur, + PetscInt jac_data_size_sur, ProblemQFunctionSpec apply_bc, ProblemQFunctionSpec apply_bc_jacobian, CeedQFunction *qf_apply_bc, CeedQFunction *qf_apply_bc_jacobian) { PetscFunctionBeginUser; @@ -255,195 +190,147 @@ PetscErrorCode SetupBCQFunctions(Ceed ceed, PetscInt dim_sur, 
CeedQFunctionSetContext(*qf_apply_bc, apply_bc.qfunction_context); CeedQFunctionContextDestroy(&apply_bc.qfunction_context); CeedQFunctionAddInput(*qf_apply_bc, "q", num_comp_q, CEED_EVAL_INTERP); - CeedQFunctionAddInput(*qf_apply_bc, "Grad_q", num_comp_q*dim_sur, CEED_EVAL_GRAD); + CeedQFunctionAddInput(*qf_apply_bc, "Grad_q", num_comp_q * dim_sur, CEED_EVAL_GRAD); CeedQFunctionAddInput(*qf_apply_bc, "surface qdata", q_data_size_sur, CEED_EVAL_NONE); CeedQFunctionAddInput(*qf_apply_bc, "x", num_comp_x, CEED_EVAL_INTERP); CeedQFunctionAddOutput(*qf_apply_bc, "v", num_comp_q, CEED_EVAL_INTERP); // *INDENT-ON* - if (jac_data_size_sur) - CeedQFunctionAddOutput(*qf_apply_bc, "surface jacobian data", jac_data_size_sur, - CEED_EVAL_NONE); + if (jac_data_size_sur) CeedQFunctionAddOutput(*qf_apply_bc, "surface jacobian data", jac_data_size_sur, CEED_EVAL_NONE); } if (apply_bc_jacobian.qfunction) { // *INDENT-OFF* - CeedQFunctionCreateInterior(ceed, 1, apply_bc_jacobian.qfunction, - apply_bc_jacobian.qfunction_loc, qf_apply_bc_jacobian); + CeedQFunctionCreateInterior(ceed, 1, apply_bc_jacobian.qfunction, apply_bc_jacobian.qfunction_loc, qf_apply_bc_jacobian); CeedQFunctionSetContext(*qf_apply_bc_jacobian, apply_bc_jacobian.qfunction_context); CeedQFunctionContextDestroy(&apply_bc_jacobian.qfunction_context); CeedQFunctionAddInput(*qf_apply_bc_jacobian, "dq", num_comp_q, CEED_EVAL_INTERP); - CeedQFunctionAddInput(*qf_apply_bc_jacobian, "Grad_dq", num_comp_q*dim_sur, CEED_EVAL_GRAD); + CeedQFunctionAddInput(*qf_apply_bc_jacobian, "Grad_dq", num_comp_q * dim_sur, CEED_EVAL_GRAD); CeedQFunctionAddInput(*qf_apply_bc_jacobian, "surface qdata", q_data_size_sur, CEED_EVAL_NONE); CeedQFunctionAddInput(*qf_apply_bc_jacobian, "x", num_comp_x, CEED_EVAL_INTERP); - CeedQFunctionAddInput(*qf_apply_bc_jacobian, "surface jacobian data", - jac_data_size_sur, CEED_EVAL_NONE); + CeedQFunctionAddInput(*qf_apply_bc_jacobian, "surface jacobian data", jac_data_size_sur, CEED_EVAL_NONE); 
CeedQFunctionAddOutput(*qf_apply_bc_jacobian, "v", num_comp_q, CEED_EVAL_INTERP); // *INDENT-ON* } PetscFunctionReturn(0); } -PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, - AppCtx app_ctx, ProblemData *problem, SimpleBC bc) { - PetscErrorCode ierr; +PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, AppCtx app_ctx, ProblemData *problem, SimpleBC bc) { PetscFunctionBeginUser; // ***************************************************************************** // Set up CEED objects for the interior domain (volume) // ***************************************************************************** - const PetscInt num_comp_q = 5; - const CeedInt dim = problem->dim, - num_comp_x = problem->dim, - q_data_size_vol = problem->q_data_size_vol, - jac_data_size_vol = num_comp_q + 6 + 3, - P = app_ctx->degree + 1, - Q = P + app_ctx->q_extra; + const PetscInt num_comp_q = 5; + const CeedInt dim = problem->dim, num_comp_x = problem->dim, q_data_size_vol = problem->q_data_size_vol, jac_data_size_vol = num_comp_q + 6 + 3, + P = app_ctx->degree + 1, Q = P + app_ctx->q_extra; CeedElemRestriction elem_restr_jd_i; - CeedVector jac_data; + CeedVector jac_data; // ----------------------------------------------------------------------------- // CEED Bases // ----------------------------------------------------------------------------- - CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp_q, P, Q, CEED_GAUSS, - &ceed_data->basis_q); - CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp_x, 2, Q, CEED_GAUSS, - &ceed_data->basis_x); - CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp_x, 2, P, - CEED_GAUSS_LOBATTO, &ceed_data->basis_xc); + CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp_q, P, Q, CEED_GAUSS, &ceed_data->basis_q); + CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp_x, 2, Q, CEED_GAUSS, &ceed_data->basis_x); + CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp_x, 2, P, CEED_GAUSS_LOBATTO, &ceed_data->basis_xc); // 
----------------------------------------------------------------------------- // CEED Restrictions // ----------------------------------------------------------------------------- // -- Create restriction - ierr = GetRestrictionForDomain(ceed, dm, 0, 0, 0, Q, q_data_size_vol, - &ceed_data->elem_restr_q, &ceed_data->elem_restr_x, - &ceed_data->elem_restr_qd_i); CHKERRQ(ierr); - - ierr = GetRestrictionForDomain(ceed, dm, 0, 0, 0, Q, jac_data_size_vol, - NULL, NULL, - &elem_restr_jd_i); CHKERRQ(ierr); -// -- Create E vectors + PetscCall(GetRestrictionForDomain(ceed, dm, 0, 0, 0, Q, q_data_size_vol, &ceed_data->elem_restr_q, &ceed_data->elem_restr_x, + &ceed_data->elem_restr_qd_i)); + + PetscCall(GetRestrictionForDomain(ceed, dm, 0, 0, 0, Q, jac_data_size_vol, NULL, NULL, &elem_restr_jd_i)); + // -- Create E vectors CeedElemRestrictionCreateVector(ceed_data->elem_restr_q, &user->q_ceed, NULL); - CeedElemRestrictionCreateVector(ceed_data->elem_restr_q, &user->q_dot_ceed, - NULL); + CeedElemRestrictionCreateVector(ceed_data->elem_restr_q, &user->q_dot_ceed, NULL); CeedElemRestrictionCreateVector(ceed_data->elem_restr_q, &user->g_ceed, NULL); // ----------------------------------------------------------------------------- // CEED QFunctions // ----------------------------------------------------------------------------- // -- Create QFunction for quadrature data - CeedQFunctionCreateInterior(ceed, 1, problem->setup_vol.qfunction, - problem->setup_vol.qfunction_loc, - &ceed_data->qf_setup_vol); + CeedQFunctionCreateInterior(ceed, 1, problem->setup_vol.qfunction, problem->setup_vol.qfunction_loc, &ceed_data->qf_setup_vol); if (problem->setup_vol.qfunction_context) { - CeedQFunctionSetContext(ceed_data->qf_setup_vol, - problem->setup_vol.qfunction_context); + CeedQFunctionSetContext(ceed_data->qf_setup_vol, problem->setup_vol.qfunction_context); CeedQFunctionContextDestroy(&problem->setup_vol.qfunction_context); } - CeedQFunctionAddInput(ceed_data->qf_setup_vol, "dx", 
num_comp_x*dim, - CEED_EVAL_GRAD); + CeedQFunctionAddInput(ceed_data->qf_setup_vol, "dx", num_comp_x * dim, CEED_EVAL_GRAD); CeedQFunctionAddInput(ceed_data->qf_setup_vol, "weight", 1, CEED_EVAL_WEIGHT); - CeedQFunctionAddOutput(ceed_data->qf_setup_vol, "qdata", q_data_size_vol, - CEED_EVAL_NONE); + CeedQFunctionAddOutput(ceed_data->qf_setup_vol, "qdata", q_data_size_vol, CEED_EVAL_NONE); // -- Create QFunction for ICs - CeedQFunctionCreateInterior(ceed, 1, problem->ics.qfunction, - problem->ics.qfunction_loc, - &ceed_data->qf_ics); + CeedQFunctionCreateInterior(ceed, 1, problem->ics.qfunction, problem->ics.qfunction_loc, &ceed_data->qf_ics); CeedQFunctionSetContext(ceed_data->qf_ics, problem->ics.qfunction_context); CeedQFunctionContextDestroy(&problem->ics.qfunction_context); CeedQFunctionAddInput(ceed_data->qf_ics, "x", num_comp_x, CEED_EVAL_INTERP); - CeedQFunctionAddInput(ceed_data->qf_ics, "qdata", q_data_size_vol, - CEED_EVAL_NONE); + CeedQFunctionAddInput(ceed_data->qf_ics, "qdata", q_data_size_vol, CEED_EVAL_NONE); CeedQFunctionAddOutput(ceed_data->qf_ics, "q0", num_comp_q, CEED_EVAL_NONE); // -- Create QFunction for RHS if (problem->apply_vol_rhs.qfunction) { - CeedQFunctionCreateInterior(ceed, 1, problem->apply_vol_rhs.qfunction, - problem->apply_vol_rhs.qfunction_loc, &ceed_data->qf_rhs_vol); - CeedQFunctionSetContext(ceed_data->qf_rhs_vol, - problem->apply_vol_rhs.qfunction_context); + CeedQFunctionCreateInterior(ceed, 1, problem->apply_vol_rhs.qfunction, problem->apply_vol_rhs.qfunction_loc, &ceed_data->qf_rhs_vol); + CeedQFunctionSetContext(ceed_data->qf_rhs_vol, problem->apply_vol_rhs.qfunction_context); CeedQFunctionContextDestroy(&problem->apply_vol_rhs.qfunction_context); CeedQFunctionAddInput(ceed_data->qf_rhs_vol, "q", num_comp_q, CEED_EVAL_INTERP); - CeedQFunctionAddInput(ceed_data->qf_rhs_vol, "Grad_q", num_comp_q*dim, - CEED_EVAL_GRAD); - CeedQFunctionAddInput(ceed_data->qf_rhs_vol, "qdata", q_data_size_vol, - CEED_EVAL_NONE); + 
CeedQFunctionAddInput(ceed_data->qf_rhs_vol, "Grad_q", num_comp_q * dim, CEED_EVAL_GRAD); + CeedQFunctionAddInput(ceed_data->qf_rhs_vol, "qdata", q_data_size_vol, CEED_EVAL_NONE); CeedQFunctionAddInput(ceed_data->qf_rhs_vol, "x", num_comp_x, CEED_EVAL_INTERP); - CeedQFunctionAddOutput(ceed_data->qf_rhs_vol, "v", num_comp_q, - CEED_EVAL_INTERP); - CeedQFunctionAddOutput(ceed_data->qf_rhs_vol, "Grad_v", num_comp_q*dim, - CEED_EVAL_GRAD); + CeedQFunctionAddOutput(ceed_data->qf_rhs_vol, "v", num_comp_q, CEED_EVAL_INTERP); + CeedQFunctionAddOutput(ceed_data->qf_rhs_vol, "Grad_v", num_comp_q * dim, CEED_EVAL_GRAD); } // -- Create QFunction for IFunction if (problem->apply_vol_ifunction.qfunction) { - CeedQFunctionCreateInterior(ceed, 1, problem->apply_vol_ifunction.qfunction, - problem->apply_vol_ifunction.qfunction_loc, &ceed_data->qf_ifunction_vol); - CeedQFunctionSetContext(ceed_data->qf_ifunction_vol, - problem->apply_vol_ifunction.qfunction_context); + CeedQFunctionCreateInterior(ceed, 1, problem->apply_vol_ifunction.qfunction, problem->apply_vol_ifunction.qfunction_loc, + &ceed_data->qf_ifunction_vol); + CeedQFunctionSetContext(ceed_data->qf_ifunction_vol, problem->apply_vol_ifunction.qfunction_context); CeedQFunctionContextDestroy(&problem->apply_vol_ifunction.qfunction_context); - CeedQFunctionAddInput(ceed_data->qf_ifunction_vol, "q", num_comp_q, - CEED_EVAL_INTERP); - CeedQFunctionAddInput(ceed_data->qf_ifunction_vol, "Grad_q", num_comp_q*dim, - CEED_EVAL_GRAD); - CeedQFunctionAddInput(ceed_data->qf_ifunction_vol, "q dot", num_comp_q, - CEED_EVAL_INTERP); - CeedQFunctionAddInput(ceed_data->qf_ifunction_vol, "qdata", q_data_size_vol, - CEED_EVAL_NONE); - CeedQFunctionAddInput(ceed_data->qf_ifunction_vol, "x", num_comp_x, - CEED_EVAL_INTERP); - CeedQFunctionAddOutput(ceed_data->qf_ifunction_vol, "v", num_comp_q, - CEED_EVAL_INTERP); - CeedQFunctionAddOutput(ceed_data->qf_ifunction_vol, "Grad_v", num_comp_q*dim, - CEED_EVAL_GRAD); - 
CeedQFunctionAddOutput(ceed_data->qf_ifunction_vol, "jac_data", - jac_data_size_vol, CEED_EVAL_NONE); + CeedQFunctionAddInput(ceed_data->qf_ifunction_vol, "q", num_comp_q, CEED_EVAL_INTERP); + CeedQFunctionAddInput(ceed_data->qf_ifunction_vol, "Grad_q", num_comp_q * dim, CEED_EVAL_GRAD); + CeedQFunctionAddInput(ceed_data->qf_ifunction_vol, "q dot", num_comp_q, CEED_EVAL_INTERP); + CeedQFunctionAddInput(ceed_data->qf_ifunction_vol, "qdata", q_data_size_vol, CEED_EVAL_NONE); + CeedQFunctionAddInput(ceed_data->qf_ifunction_vol, "x", num_comp_x, CEED_EVAL_INTERP); + CeedQFunctionAddOutput(ceed_data->qf_ifunction_vol, "v", num_comp_q, CEED_EVAL_INTERP); + CeedQFunctionAddOutput(ceed_data->qf_ifunction_vol, "Grad_v", num_comp_q * dim, CEED_EVAL_GRAD); + CeedQFunctionAddOutput(ceed_data->qf_ifunction_vol, "jac_data", jac_data_size_vol, CEED_EVAL_NONE); } CeedQFunction qf_ijacobian_vol = NULL; if (problem->apply_vol_ijacobian.qfunction) { - CeedQFunctionCreateInterior(ceed, 1, problem->apply_vol_ijacobian.qfunction, - problem->apply_vol_ijacobian.qfunction_loc, &qf_ijacobian_vol); - CeedQFunctionSetContext(qf_ijacobian_vol, - problem->apply_vol_ijacobian.qfunction_context); + CeedQFunctionCreateInterior(ceed, 1, problem->apply_vol_ijacobian.qfunction, problem->apply_vol_ijacobian.qfunction_loc, &qf_ijacobian_vol); + CeedQFunctionSetContext(qf_ijacobian_vol, problem->apply_vol_ijacobian.qfunction_context); CeedQFunctionContextDestroy(&problem->apply_vol_ijacobian.qfunction_context); - CeedQFunctionAddInput(qf_ijacobian_vol, "dq", num_comp_q, - CEED_EVAL_INTERP); - CeedQFunctionAddInput(qf_ijacobian_vol, "Grad_dq", num_comp_q*dim, - CEED_EVAL_GRAD); - CeedQFunctionAddInput(qf_ijacobian_vol, "qdata", q_data_size_vol, - CEED_EVAL_NONE); - CeedQFunctionAddInput(qf_ijacobian_vol, "x", num_comp_x, - CEED_EVAL_INTERP); - CeedQFunctionAddInput(qf_ijacobian_vol, "jac_data", - jac_data_size_vol, CEED_EVAL_NONE); - CeedQFunctionAddOutput(qf_ijacobian_vol, "v", num_comp_q, - 
CEED_EVAL_INTERP); - CeedQFunctionAddOutput(qf_ijacobian_vol, "Grad_v", num_comp_q*dim, - CEED_EVAL_GRAD); + CeedQFunctionAddInput(qf_ijacobian_vol, "dq", num_comp_q, CEED_EVAL_INTERP); + CeedQFunctionAddInput(qf_ijacobian_vol, "Grad_dq", num_comp_q * dim, CEED_EVAL_GRAD); + CeedQFunctionAddInput(qf_ijacobian_vol, "qdata", q_data_size_vol, CEED_EVAL_NONE); + CeedQFunctionAddInput(qf_ijacobian_vol, "x", num_comp_x, CEED_EVAL_INTERP); + CeedQFunctionAddInput(qf_ijacobian_vol, "jac_data", jac_data_size_vol, CEED_EVAL_NONE); + CeedQFunctionAddOutput(qf_ijacobian_vol, "v", num_comp_q, CEED_EVAL_INTERP); + CeedQFunctionAddOutput(qf_ijacobian_vol, "Grad_v", num_comp_q * dim, CEED_EVAL_GRAD); } // --------------------------------------------------------------------------- // Element coordinates // --------------------------------------------------------------------------- // -- Create CEED vector - CeedElemRestrictionCreateVector(ceed_data->elem_restr_x, &ceed_data->x_coord, - NULL); + CeedElemRestrictionCreateVector(ceed_data->elem_restr_x, &ceed_data->x_coord, NULL); // -- Copy PETSc vector in CEED vector - Vec X_loc; + Vec X_loc; const PetscScalar *X_loc_array; { DM cdm; - ierr = DMGetCellCoordinateDM(dm, &cdm); CHKERRQ(ierr); - if (cdm) {ierr = DMGetCellCoordinatesLocal(dm, &X_loc); CHKERRQ(ierr);} - else {ierr = DMGetCoordinatesLocal(dm, &X_loc); CHKERRQ(ierr);} + PetscCall(DMGetCellCoordinateDM(dm, &cdm)); + if (cdm) { + PetscCall(DMGetCellCoordinatesLocal(dm, &X_loc)); + } else { + PetscCall(DMGetCoordinatesLocal(dm, &X_loc)); + } } - ierr = VecScale(X_loc, problem->dm_scale); CHKERRQ(ierr); - ierr = VecGetArrayRead(X_loc, &X_loc_array); CHKERRQ(ierr); - CeedVectorSetArray(ceed_data->x_coord, CEED_MEM_HOST, CEED_COPY_VALUES, - (PetscScalar *)X_loc_array); - ierr = VecRestoreArrayRead(X_loc, &X_loc_array); CHKERRQ(ierr); + PetscCall(VecScale(X_loc, problem->dm_scale)); + PetscCall(VecGetArrayRead(X_loc, &X_loc_array)); + CeedVectorSetArray(ceed_data->x_coord, 
CEED_MEM_HOST, CEED_COPY_VALUES, (PetscScalar *)X_loc_array); + PetscCall(VecRestoreArrayRead(X_loc, &X_loc_array)); // ----------------------------------------------------------------------------- // CEED vectors @@ -453,50 +340,35 @@ PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, PetscInt loc_num_elem_vol; CeedBasisGetNumQuadraturePoints(ceed_data->basis_q, &num_qpts_vol); CeedElemRestrictionGetNumElements(ceed_data->elem_restr_q, &loc_num_elem_vol); - CeedVectorCreate(ceed, q_data_size_vol*loc_num_elem_vol*num_qpts_vol, - &ceed_data->q_data); + CeedVectorCreate(ceed, q_data_size_vol * loc_num_elem_vol * num_qpts_vol, &ceed_data->q_data); CeedElemRestrictionCreateVector(elem_restr_jd_i, &jac_data, NULL); // ----------------------------------------------------------------------------- // CEED Operators // ----------------------------------------------------------------------------- // -- Create CEED operator for quadrature data - CeedOperatorCreate(ceed, ceed_data->qf_setup_vol, NULL, NULL, - &ceed_data->op_setup_vol); - CeedOperatorSetField(ceed_data->op_setup_vol, "dx", ceed_data->elem_restr_x, - ceed_data->basis_x, CEED_VECTOR_ACTIVE); - CeedOperatorSetField(ceed_data->op_setup_vol, "weight", - CEED_ELEMRESTRICTION_NONE, ceed_data->basis_x, CEED_VECTOR_NONE); - CeedOperatorSetField(ceed_data->op_setup_vol, "qdata", - ceed_data->elem_restr_qd_i, CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE); + CeedOperatorCreate(ceed, ceed_data->qf_setup_vol, NULL, NULL, &ceed_data->op_setup_vol); + CeedOperatorSetField(ceed_data->op_setup_vol, "dx", ceed_data->elem_restr_x, ceed_data->basis_x, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(ceed_data->op_setup_vol, "weight", CEED_ELEMRESTRICTION_NONE, ceed_data->basis_x, CEED_VECTOR_NONE); + CeedOperatorSetField(ceed_data->op_setup_vol, "qdata", ceed_data->elem_restr_qd_i, CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE); // -- Create CEED operator for ICs CeedOperatorCreate(ceed, ceed_data->qf_ics, NULL, NULL, 
&ceed_data->op_ics); - CeedOperatorSetField(ceed_data->op_ics, "x", ceed_data->elem_restr_x, - ceed_data->basis_xc, CEED_VECTOR_ACTIVE); - CeedOperatorSetField(ceed_data->op_ics, "qdata", ceed_data->elem_restr_qd_i, - CEED_BASIS_COLLOCATED, ceed_data->q_data); - CeedOperatorSetField(ceed_data->op_ics, "q0", ceed_data->elem_restr_q, - CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE); - CeedOperatorContextGetFieldLabel(ceed_data->op_ics, "evaluation time", - &user->phys->ics_time_label); + CeedOperatorSetField(ceed_data->op_ics, "x", ceed_data->elem_restr_x, ceed_data->basis_xc, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(ceed_data->op_ics, "qdata", ceed_data->elem_restr_qd_i, CEED_BASIS_COLLOCATED, ceed_data->q_data); + CeedOperatorSetField(ceed_data->op_ics, "q0", ceed_data->elem_restr_q, CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE); + CeedOperatorContextGetFieldLabel(ceed_data->op_ics, "evaluation time", &user->phys->ics_time_label); // Create CEED operator for RHS if (ceed_data->qf_rhs_vol) { CeedOperator op; CeedOperatorCreate(ceed, ceed_data->qf_rhs_vol, NULL, NULL, &op); - CeedOperatorSetField(op, "q", ceed_data->elem_restr_q, ceed_data->basis_q, - CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op, "Grad_q", ceed_data->elem_restr_q, ceed_data->basis_q, - CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op, "qdata", ceed_data->elem_restr_qd_i, - CEED_BASIS_COLLOCATED, ceed_data->q_data); - CeedOperatorSetField(op, "x", ceed_data->elem_restr_x, ceed_data->basis_x, - ceed_data->x_coord); - CeedOperatorSetField(op, "v", ceed_data->elem_restr_q, ceed_data->basis_q, - CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op, "Grad_v", ceed_data->elem_restr_q, ceed_data->basis_q, - CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op, "q", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op, "Grad_q", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op, "qdata", ceed_data->elem_restr_qd_i, CEED_BASIS_COLLOCATED, 
ceed_data->q_data); + CeedOperatorSetField(op, "x", ceed_data->elem_restr_x, ceed_data->basis_x, ceed_data->x_coord); + CeedOperatorSetField(op, "v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op, "Grad_v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE); user->op_rhs_vol = op; } @@ -504,22 +376,14 @@ PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, if (ceed_data->qf_ifunction_vol) { CeedOperator op; CeedOperatorCreate(ceed, ceed_data->qf_ifunction_vol, NULL, NULL, &op); - CeedOperatorSetField(op, "q", ceed_data->elem_restr_q, ceed_data->basis_q, - CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op, "Grad_q", ceed_data->elem_restr_q, ceed_data->basis_q, - CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op, "q dot", ceed_data->elem_restr_q, ceed_data->basis_q, - user->q_dot_ceed); - CeedOperatorSetField(op, "qdata", ceed_data->elem_restr_qd_i, - CEED_BASIS_COLLOCATED, ceed_data->q_data); - CeedOperatorSetField(op, "x", ceed_data->elem_restr_x, ceed_data->basis_x, - ceed_data->x_coord); - CeedOperatorSetField(op, "v", ceed_data->elem_restr_q, ceed_data->basis_q, - CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op, "Grad_v", ceed_data->elem_restr_q, ceed_data->basis_q, - CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op, "jac_data", elem_restr_jd_i, - CEED_BASIS_COLLOCATED, jac_data); + CeedOperatorSetField(op, "q", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op, "Grad_q", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op, "q dot", ceed_data->elem_restr_q, ceed_data->basis_q, user->q_dot_ceed); + CeedOperatorSetField(op, "qdata", ceed_data->elem_restr_qd_i, CEED_BASIS_COLLOCATED, ceed_data->q_data); + CeedOperatorSetField(op, "x", ceed_data->elem_restr_x, ceed_data->basis_x, ceed_data->x_coord); + CeedOperatorSetField(op, "v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE); + 
CeedOperatorSetField(op, "Grad_v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op, "jac_data", elem_restr_jd_i, CEED_BASIS_COLLOCATED, jac_data); user->op_ifunction_vol = op; } @@ -528,20 +392,13 @@ PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, if (qf_ijacobian_vol) { CeedOperator op; CeedOperatorCreate(ceed, qf_ijacobian_vol, NULL, NULL, &op); - CeedOperatorSetField(op, "dq", ceed_data->elem_restr_q, ceed_data->basis_q, - CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op, "Grad_dq", ceed_data->elem_restr_q, ceed_data->basis_q, - CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op, "qdata", ceed_data->elem_restr_qd_i, - CEED_BASIS_COLLOCATED, ceed_data->q_data); - CeedOperatorSetField(op, "x", ceed_data->elem_restr_x, ceed_data->basis_x, - ceed_data->x_coord); - CeedOperatorSetField(op, "jac_data", elem_restr_jd_i, - CEED_BASIS_COLLOCATED, jac_data); - CeedOperatorSetField(op, "v", ceed_data->elem_restr_q, ceed_data->basis_q, - CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op, "Grad_v", ceed_data->elem_restr_q, ceed_data->basis_q, - CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op, "dq", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op, "Grad_dq", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op, "qdata", ceed_data->elem_restr_qd_i, CEED_BASIS_COLLOCATED, ceed_data->q_data); + CeedOperatorSetField(op, "x", ceed_data->elem_restr_x, ceed_data->basis_x, ceed_data->x_coord); + CeedOperatorSetField(op, "jac_data", elem_restr_jd_i, CEED_BASIS_COLLOCATED, jac_data); + CeedOperatorSetField(op, "v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op, "Grad_v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE); op_ijacobian_vol = op; CeedQFunctionDestroy(&qf_ijacobian_vol); } @@ -549,85 +406,57 @@ PetscErrorCode SetupLibceed(Ceed ceed, CeedData ceed_data, DM dm, User user, 
// ***************************************************************************** // Set up CEED objects for the exterior domain (surface) // ***************************************************************************** - CeedInt height = 1, - dim_sur = dim - height, - P_sur = app_ctx->degree + 1, - Q_sur = P_sur + app_ctx->q_extra; - const CeedInt q_data_size_sur = problem->q_data_size_sur, - jac_data_size_sur = problem->jac_data_size_sur; + CeedInt height = 1, dim_sur = dim - height, P_sur = app_ctx->degree + 1, Q_sur = P_sur + app_ctx->q_extra; + const CeedInt q_data_size_sur = problem->q_data_size_sur, jac_data_size_sur = problem->jac_data_size_sur; // ----------------------------------------------------------------------------- // CEED Bases // ----------------------------------------------------------------------------- - CeedBasisCreateTensorH1Lagrange(ceed, dim_sur, num_comp_q, P_sur, Q_sur, - CEED_GAUSS, &ceed_data->basis_q_sur); - CeedBasisCreateTensorH1Lagrange(ceed, dim_sur, num_comp_x, 2, Q_sur, CEED_GAUSS, - &ceed_data->basis_x_sur); - CeedBasisCreateTensorH1Lagrange(ceed, dim_sur, num_comp_x, 2, P_sur, - CEED_GAUSS_LOBATTO, &ceed_data->basis_xc_sur); + CeedBasisCreateTensorH1Lagrange(ceed, dim_sur, num_comp_q, P_sur, Q_sur, CEED_GAUSS, &ceed_data->basis_q_sur); + CeedBasisCreateTensorH1Lagrange(ceed, dim_sur, num_comp_x, 2, Q_sur, CEED_GAUSS, &ceed_data->basis_x_sur); + CeedBasisCreateTensorH1Lagrange(ceed, dim_sur, num_comp_x, 2, P_sur, CEED_GAUSS_LOBATTO, &ceed_data->basis_xc_sur); // ----------------------------------------------------------------------------- // CEED QFunctions // ----------------------------------------------------------------------------- // -- Create QFunction for quadrature data - CeedQFunctionCreateInterior(ceed, 1, problem->setup_sur.qfunction, - problem->setup_sur.qfunction_loc, - &ceed_data->qf_setup_sur); + CeedQFunctionCreateInterior(ceed, 1, problem->setup_sur.qfunction, problem->setup_sur.qfunction_loc, 
&ceed_data->qf_setup_sur); if (problem->setup_sur.qfunction_context) { - CeedQFunctionSetContext(ceed_data->qf_setup_sur, - problem->setup_sur.qfunction_context); + CeedQFunctionSetContext(ceed_data->qf_setup_sur, problem->setup_sur.qfunction_context); CeedQFunctionContextDestroy(&problem->setup_sur.qfunction_context); } - CeedQFunctionAddInput(ceed_data->qf_setup_sur, "dx", num_comp_x*dim_sur, - CEED_EVAL_GRAD); + CeedQFunctionAddInput(ceed_data->qf_setup_sur, "dx", num_comp_x * dim_sur, CEED_EVAL_GRAD); CeedQFunctionAddInput(ceed_data->qf_setup_sur, "weight", 1, CEED_EVAL_WEIGHT); - CeedQFunctionAddOutput(ceed_data->qf_setup_sur, "surface qdata", - q_data_size_sur, CEED_EVAL_NONE); + CeedQFunctionAddOutput(ceed_data->qf_setup_sur, "surface qdata", q_data_size_sur, CEED_EVAL_NONE); - PetscCall(SetupBCQFunctions(ceed, dim_sur, num_comp_x, num_comp_q, - q_data_size_sur, jac_data_size_sur, - problem->apply_inflow, problem->apply_inflow_jacobian, - &ceed_data->qf_apply_inflow, &ceed_data->qf_apply_inflow_jacobian)); + PetscCall(SetupBCQFunctions(ceed, dim_sur, num_comp_x, num_comp_q, q_data_size_sur, jac_data_size_sur, problem->apply_inflow, + problem->apply_inflow_jacobian, &ceed_data->qf_apply_inflow, &ceed_data->qf_apply_inflow_jacobian)); - PetscCall(SetupBCQFunctions(ceed, dim_sur, num_comp_x, num_comp_q, - q_data_size_sur, jac_data_size_sur, - problem->apply_outflow, problem->apply_outflow_jacobian, - &ceed_data->qf_apply_outflow, &ceed_data->qf_apply_outflow_jacobian)); + PetscCall(SetupBCQFunctions(ceed, dim_sur, num_comp_x, num_comp_q, q_data_size_sur, jac_data_size_sur, problem->apply_outflow, + problem->apply_outflow_jacobian, &ceed_data->qf_apply_outflow, &ceed_data->qf_apply_outflow_jacobian)); - PetscCall(SetupBCQFunctions(ceed, dim_sur, num_comp_x, num_comp_q, - q_data_size_sur, jac_data_size_sur, - problem->apply_freestream, problem->apply_freestream_jacobian, - &ceed_data->qf_apply_freestream, &ceed_data->qf_apply_freestream_jacobian)); + 
PetscCall(SetupBCQFunctions(ceed, dim_sur, num_comp_x, num_comp_q, q_data_size_sur, jac_data_size_sur, problem->apply_freestream, + problem->apply_freestream_jacobian, &ceed_data->qf_apply_freestream, &ceed_data->qf_apply_freestream_jacobian)); // ***************************************************************************** // CEED Operator Apply // ***************************************************************************** // -- Apply CEED Operator for the geometric data - CeedOperatorApply(ceed_data->op_setup_vol, ceed_data->x_coord, - ceed_data->q_data, CEED_REQUEST_IMMEDIATE); + CeedOperatorApply(ceed_data->op_setup_vol, ceed_data->x_coord, ceed_data->q_data, CEED_REQUEST_IMMEDIATE); // -- Create and apply CEED Composite Operator for the entire domain - if (!user->phys->implicit) { // RHS - ierr = CreateOperatorForDomain(ceed, dm, bc, ceed_data, user->phys, - user->op_rhs_vol, NULL, height, P_sur, Q_sur, - q_data_size_sur, 0, - &user->op_rhs, NULL); CHKERRQ(ierr); - } else { // IFunction - ierr = CreateOperatorForDomain(ceed, dm, bc, ceed_data, user->phys, - user->op_ifunction_vol, op_ijacobian_vol, - height, P_sur, Q_sur, - q_data_size_sur, jac_data_size_sur, - &user->op_ifunction, - op_ijacobian_vol ? &user->op_ijacobian : NULL); CHKERRQ(ierr); + if (!user->phys->implicit) { // RHS + PetscCall(CreateOperatorForDomain(ceed, dm, bc, ceed_data, user->phys, user->op_rhs_vol, NULL, height, P_sur, Q_sur, q_data_size_sur, 0, + &user->op_rhs, NULL)); + } else { // IFunction + PetscCall(CreateOperatorForDomain(ceed, dm, bc, ceed_data, user->phys, user->op_ifunction_vol, op_ijacobian_vol, height, P_sur, Q_sur, + q_data_size_sur, jac_data_size_sur, &user->op_ifunction, op_ijacobian_vol ? 
&user->op_ijacobian : NULL)); if (user->op_ijacobian) { - CeedOperatorContextGetFieldLabel(user->op_ijacobian, "ijacobian time shift", - &user->phys->ijacobian_time_shift_label); + CeedOperatorContextGetFieldLabel(user->op_ijacobian, "ijacobian time shift", &user->phys->ijacobian_time_shift_label); } if (problem->use_dirichlet_ceed) { - PetscCall(SetupStrongBC_Ceed(ceed, ceed_data, dm, user, app_ctx, problem, bc, - Q_sur, q_data_size_sur)); + PetscCall(SetupStrongBC_Ceed(ceed, ceed_data, dm, user, app_ctx, problem, bc, Q_sur, q_data_size_sur)); } - } CeedElemRestrictionDestroy(&elem_restr_jd_i); CeedOperatorDestroy(&op_ijacobian_vol); diff --git a/examples/fluids/src/setupts.c b/examples/fluids/src/setupts.c index 4fa25ec9a7..1103745049 100644 --- a/examples/fluids/src/setupts.c +++ b/examples/fluids/src/setupts.c @@ -12,14 +12,12 @@ #include "../qfunctions/mass.h" // Compute mass matrix for explicit scheme -PetscErrorCode ComputeLumpedMassMatrix(Ceed ceed, DM dm, CeedData ceed_data, - Vec M) { - Vec M_loc; - CeedQFunction qf_mass; - CeedOperator op_mass; - CeedVector m_ceed, ones_vec; - CeedInt num_comp_q, q_data_size; - PetscErrorCode ierr; +PetscErrorCode ComputeLumpedMassMatrix(Ceed ceed, DM dm, CeedData ceed_data, Vec M) { + Vec M_loc; + CeedQFunction qf_mass; + CeedOperator op_mass; + CeedVector m_ceed, ones_vec; + CeedInt num_comp_q, q_data_size; PetscFunctionBeginUser; // CEED Restriction @@ -37,19 +35,15 @@ PetscErrorCode ComputeLumpedMassMatrix(Ceed ceed, DM dm, CeedData ceed_data, // CEED Operator CeedOperatorCreate(ceed, qf_mass, NULL, NULL, &op_mass); - CeedOperatorSetField(op_mass, "q", ceed_data->elem_restr_q, ceed_data->basis_q, - CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op_mass, "qdata", ceed_data->elem_restr_qd_i, - CEED_BASIS_COLLOCATED, ceed_data->q_data); - CeedOperatorSetField(op_mass, "v", ceed_data->elem_restr_q, ceed_data->basis_q, - CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_mass, "q", ceed_data->elem_restr_q, ceed_data->basis_q, 
CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_mass, "qdata", ceed_data->elem_restr_qd_i, CEED_BASIS_COLLOCATED, ceed_data->q_data); + CeedOperatorSetField(op_mass, "v", ceed_data->elem_restr_q, ceed_data->basis_q, CEED_VECTOR_ACTIVE); // Place PETSc vector in CEED vector - CeedScalar *m; + CeedScalar *m; PetscMemType m_mem_type; - ierr = DMGetLocalVector(dm, &M_loc); CHKERRQ(ierr); - ierr = VecGetArrayAndMemType(M_loc, (PetscScalar **)&m, &m_mem_type); - CHKERRQ(ierr); + PetscCall(DMGetLocalVector(dm, &M_loc)); + PetscCall(VecGetArrayAndMemType(M_loc, (PetscScalar **)&m, &m_mem_type)); CeedVectorSetArray(m_ceed, MemTypeP2C(m_mem_type), CEED_USE_POINTER, m); // Apply CEED Operator @@ -57,16 +51,15 @@ PetscErrorCode ComputeLumpedMassMatrix(Ceed ceed, DM dm, CeedData ceed_data, // Restore vectors CeedVectorTakeArray(m_ceed, MemTypeP2C(m_mem_type), NULL); - ierr = VecRestoreArrayReadAndMemType(M_loc, (const PetscScalar **)&m); - CHKERRQ(ierr); + PetscCall(VecRestoreArrayReadAndMemType(M_loc, (const PetscScalar **)&m)); // Local-to-Global - ierr = VecZeroEntries(M); CHKERRQ(ierr); - ierr = DMLocalToGlobal(dm, M_loc, ADD_VALUES, M); CHKERRQ(ierr); - ierr = DMRestoreLocalVector(dm, &M_loc); CHKERRQ(ierr); + PetscCall(VecZeroEntries(M)); + PetscCall(DMLocalToGlobal(dm, M_loc, ADD_VALUES, M)); + PetscCall(DMRestoreLocalVector(dm, &M_loc)); // Invert diagonally lumped mass vector for RHS function - ierr = VecReciprocal(M); CHKERRQ(ierr); + PetscCall(VecReciprocal(M)); // Cleanup CeedVectorDestroy(&ones_vec); @@ -81,20 +74,18 @@ PetscErrorCode ComputeLumpedMassMatrix(Ceed ceed, DM dm, CeedData ceed_data, // This is the RHS of the ODE, given as u_t = G(t,u) // This function takes in a state vector Q and writes into G PetscErrorCode RHS_NS(TS ts, PetscReal t, Vec Q, Vec G, void *user_data) { - User user = *(User *)user_data; - PetscScalar *q, *g; - Vec Q_loc = user->Q_loc, G_loc; - PetscMemType q_mem_type, g_mem_type; - PetscErrorCode ierr; + User user = *(User *)user_data; + 
PetscScalar *q, *g; + Vec Q_loc = user->Q_loc, G_loc; + PetscMemType q_mem_type, g_mem_type; PetscFunctionBeginUser; // Get local vector - ierr = DMGetLocalVector(user->dm, &G_loc); CHKERRQ(ierr); + PetscCall(DMGetLocalVector(user->dm, &G_loc)); // Update time dependent data if (user->time != t) { - ierr = DMPlexInsertBoundaryValues(user->dm, PETSC_TRUE, Q_loc, t, - NULL, NULL, NULL); CHKERRQ(ierr); + PetscCall(DMPlexInsertBoundaryValues(user->dm, PETSC_TRUE, Q_loc, t, NULL, NULL, NULL)); if (user->phys->solution_time_label) { CeedOperatorContextSetDouble(user->op_rhs, user->phys->solution_time_label, &t); } @@ -102,166 +93,149 @@ PetscErrorCode RHS_NS(TS ts, PetscReal t, Vec Q, Vec G, void *user_data) { } if (user->phys->timestep_size_label) { PetscScalar dt; - ierr = TSGetTimeStep(ts, &dt); CHKERRQ(ierr); + PetscCall(TSGetTimeStep(ts, &dt)); if (user->dt != dt) { - CeedOperatorContextSetDouble(user->op_rhs, user->phys->timestep_size_label, - &dt); + CeedOperatorContextSetDouble(user->op_rhs, user->phys->timestep_size_label, &dt); user->dt = dt; } } // Global-to-local - ierr = DMGlobalToLocal(user->dm, Q, INSERT_VALUES, Q_loc); CHKERRQ(ierr); + PetscCall(DMGlobalToLocal(user->dm, Q, INSERT_VALUES, Q_loc)); // Place PETSc vectors in CEED vectors - ierr = VecGetArrayReadAndMemType(Q_loc, (const PetscScalar **)&q, &q_mem_type); - CHKERRQ(ierr); - ierr = VecGetArrayAndMemType(G_loc, &g, &g_mem_type); CHKERRQ(ierr); + PetscCall(VecGetArrayReadAndMemType(Q_loc, (const PetscScalar **)&q, &q_mem_type)); + PetscCall(VecGetArrayAndMemType(G_loc, &g, &g_mem_type)); CeedVectorSetArray(user->q_ceed, MemTypeP2C(q_mem_type), CEED_USE_POINTER, q); CeedVectorSetArray(user->g_ceed, MemTypeP2C(g_mem_type), CEED_USE_POINTER, g); // Apply CEED operator - CeedOperatorApply(user->op_rhs, user->q_ceed, user->g_ceed, - CEED_REQUEST_IMMEDIATE); + CeedOperatorApply(user->op_rhs, user->q_ceed, user->g_ceed, CEED_REQUEST_IMMEDIATE); // Restore vectors CeedVectorTakeArray(user->q_ceed, 
MemTypeP2C(q_mem_type), NULL); CeedVectorTakeArray(user->g_ceed, MemTypeP2C(g_mem_type), NULL); - ierr = VecRestoreArrayReadAndMemType(Q_loc, (const PetscScalar **)&q); - CHKERRQ(ierr); - ierr = VecRestoreArrayAndMemType(G_loc, &g); CHKERRQ(ierr); + PetscCall(VecRestoreArrayReadAndMemType(Q_loc, (const PetscScalar **)&q)); + PetscCall(VecRestoreArrayAndMemType(G_loc, &g)); // Local-to-Global - ierr = VecZeroEntries(G); CHKERRQ(ierr); - ierr = DMLocalToGlobal(user->dm, G_loc, ADD_VALUES, G); CHKERRQ(ierr); + PetscCall(VecZeroEntries(G)); + PetscCall(DMLocalToGlobal(user->dm, G_loc, ADD_VALUES, G)); // Inverse of the lumped mass matrix (M is Minv) - ierr = VecPointwiseMult(G, G, user->M); CHKERRQ(ierr); + PetscCall(VecPointwiseMult(G, G, user->M)); // Restore vectors - ierr = DMRestoreLocalVector(user->dm, &G_loc); CHKERRQ(ierr); + PetscCall(DMRestoreLocalVector(user->dm, &G_loc)); PetscFunctionReturn(0); } // Implicit time-stepper function setup -PetscErrorCode IFunction_NS(TS ts, PetscReal t, Vec Q, Vec Q_dot, Vec G, - void *user_data) { - User user = *(User *)user_data; +PetscErrorCode IFunction_NS(TS ts, PetscReal t, Vec Q, Vec Q_dot, Vec G, void *user_data) { + User user = *(User *)user_data; const PetscScalar *q, *q_dot; PetscScalar *g; - Vec Q_loc = user->Q_loc, Q_dot_loc = user->Q_dot_loc, G_loc; - PetscMemType q_mem_type, q_dot_mem_type, g_mem_type; - PetscErrorCode ierr; + Vec Q_loc = user->Q_loc, Q_dot_loc = user->Q_dot_loc, G_loc; + PetscMemType q_mem_type, q_dot_mem_type, g_mem_type; PetscFunctionBeginUser; // Get local vectors - ierr = DMGetLocalVector(user->dm, &G_loc); CHKERRQ(ierr); + PetscCall(DMGetLocalVector(user->dm, &G_loc)); // Update time dependent data if (user->time != t) { - ierr = DMPlexInsertBoundaryValues(user->dm, PETSC_TRUE, Q_loc, t, - NULL, NULL, NULL); CHKERRQ(ierr); + PetscCall(DMPlexInsertBoundaryValues(user->dm, PETSC_TRUE, Q_loc, t, NULL, NULL, NULL)); if (user->phys->solution_time_label) { - 
CeedOperatorContextSetDouble(user->op_ifunction, - user->phys->solution_time_label, &t); + CeedOperatorContextSetDouble(user->op_ifunction, user->phys->solution_time_label, &t); } user->time = t; } if (user->phys->timestep_size_label) { PetscScalar dt; - ierr = TSGetTimeStep(ts, &dt); CHKERRQ(ierr); + PetscCall(TSGetTimeStep(ts, &dt)); if (user->dt != dt) { - CeedOperatorContextSetDouble(user->op_ifunction, - user->phys->timestep_size_label, &dt); + CeedOperatorContextSetDouble(user->op_ifunction, user->phys->timestep_size_label, &dt); user->dt = dt; } } // Global-to-local - ierr = DMGlobalToLocal(user->dm, Q, INSERT_VALUES, Q_loc); CHKERRQ(ierr); - ierr = DMGlobalToLocal(user->dm, Q_dot, INSERT_VALUES, Q_dot_loc); - CHKERRQ(ierr); + PetscCall(DMGlobalToLocal(user->dm, Q, INSERT_VALUES, Q_loc)); + PetscCall(DMGlobalToLocal(user->dm, Q_dot, INSERT_VALUES, Q_dot_loc)); // Place PETSc vectors in CEED vectors - ierr = VecGetArrayReadAndMemType(Q_loc, &q, &q_mem_type); CHKERRQ(ierr); - ierr = VecGetArrayReadAndMemType(Q_dot_loc, &q_dot, &q_dot_mem_type); - CHKERRQ(ierr); - ierr = VecGetArrayAndMemType(G_loc, &g, &g_mem_type); CHKERRQ(ierr); - CeedVectorSetArray(user->q_ceed, MemTypeP2C(q_mem_type), CEED_USE_POINTER, - (PetscScalar *)q); - CeedVectorSetArray(user->q_dot_ceed, MemTypeP2C(q_dot_mem_type), - CEED_USE_POINTER, (PetscScalar *)q_dot); + PetscCall(VecGetArrayReadAndMemType(Q_loc, &q, &q_mem_type)); + PetscCall(VecGetArrayReadAndMemType(Q_dot_loc, &q_dot, &q_dot_mem_type)); + PetscCall(VecGetArrayAndMemType(G_loc, &g, &g_mem_type)); + CeedVectorSetArray(user->q_ceed, MemTypeP2C(q_mem_type), CEED_USE_POINTER, (PetscScalar *)q); + CeedVectorSetArray(user->q_dot_ceed, MemTypeP2C(q_dot_mem_type), CEED_USE_POINTER, (PetscScalar *)q_dot); CeedVectorSetArray(user->g_ceed, MemTypeP2C(g_mem_type), CEED_USE_POINTER, g); // Apply CEED operator - CeedOperatorApply(user->op_ifunction, user->q_ceed, user->g_ceed, - CEED_REQUEST_IMMEDIATE); + 
CeedOperatorApply(user->op_ifunction, user->q_ceed, user->g_ceed, CEED_REQUEST_IMMEDIATE); // Restore vectors CeedVectorTakeArray(user->q_ceed, MemTypeP2C(q_mem_type), NULL); CeedVectorTakeArray(user->q_dot_ceed, MemTypeP2C(q_dot_mem_type), NULL); CeedVectorTakeArray(user->g_ceed, MemTypeP2C(g_mem_type), NULL); - ierr = VecRestoreArrayReadAndMemType(Q_loc, &q); CHKERRQ(ierr); - ierr = VecRestoreArrayReadAndMemType(Q_dot_loc, &q_dot); CHKERRQ(ierr); - ierr = VecRestoreArrayAndMemType(G_loc, &g); CHKERRQ(ierr); + PetscCall(VecRestoreArrayReadAndMemType(Q_loc, &q)); + PetscCall(VecRestoreArrayReadAndMemType(Q_dot_loc, &q_dot)); + PetscCall(VecRestoreArrayAndMemType(G_loc, &g)); // Local-to-Global - ierr = VecZeroEntries(G); CHKERRQ(ierr); - ierr = DMLocalToGlobal(user->dm, G_loc, ADD_VALUES, G); CHKERRQ(ierr); + PetscCall(VecZeroEntries(G)); + PetscCall(DMLocalToGlobal(user->dm, G_loc, ADD_VALUES, G)); // Restore vectors - ierr = DMRestoreLocalVector(user->dm, &G_loc); CHKERRQ(ierr); + PetscCall(DMRestoreLocalVector(user->dm, &G_loc)); PetscFunctionReturn(0); } static PetscErrorCode MatMult_NS_IJacobian(Mat J, Vec Q, Vec G) { - User user; + User user; const PetscScalar *q; PetscScalar *g; - PetscMemType q_mem_type, g_mem_type; - PetscErrorCode ierr; + PetscMemType q_mem_type, g_mem_type; PetscFunctionBeginUser; - ierr = MatShellGetContext(J, &user); CHKERRQ(ierr); - Vec Q_loc = user->Q_dot_loc, // Note - Q_dot_loc has zero BCs - G_loc; + PetscCall(MatShellGetContext(J, &user)); + Vec Q_loc = user->Q_dot_loc, // Note - Q_dot_loc has zero BCs + G_loc; // Get local vectors - ierr = DMGetLocalVector(user->dm, &G_loc); CHKERRQ(ierr); + PetscCall(DMGetLocalVector(user->dm, &G_loc)); // Global-to-local - ierr = DMGlobalToLocal(user->dm, Q, INSERT_VALUES, Q_loc); CHKERRQ(ierr); + PetscCall(DMGlobalToLocal(user->dm, Q, INSERT_VALUES, Q_loc)); // Place PETSc vectors in CEED vectors - ierr = VecGetArrayReadAndMemType(Q_loc, &q, &q_mem_type); CHKERRQ(ierr); - ierr = 
VecGetArrayAndMemType(G_loc, &g, &g_mem_type); CHKERRQ(ierr); - CeedVectorSetArray(user->q_ceed, MemTypeP2C(q_mem_type), CEED_USE_POINTER, - (PetscScalar *)q); + PetscCall(VecGetArrayReadAndMemType(Q_loc, &q, &q_mem_type)); + PetscCall(VecGetArrayAndMemType(G_loc, &g, &g_mem_type)); + CeedVectorSetArray(user->q_ceed, MemTypeP2C(q_mem_type), CEED_USE_POINTER, (PetscScalar *)q); CeedVectorSetArray(user->g_ceed, MemTypeP2C(g_mem_type), CEED_USE_POINTER, g); // Apply CEED operator - CeedOperatorApply(user->op_ijacobian, user->q_ceed, user->g_ceed, - CEED_REQUEST_IMMEDIATE); + CeedOperatorApply(user->op_ijacobian, user->q_ceed, user->g_ceed, CEED_REQUEST_IMMEDIATE); // Restore vectors CeedVectorTakeArray(user->q_ceed, MemTypeP2C(q_mem_type), NULL); CeedVectorTakeArray(user->g_ceed, MemTypeP2C(g_mem_type), NULL); - ierr = VecRestoreArrayReadAndMemType(Q_loc, &q); CHKERRQ(ierr); - ierr = VecRestoreArrayAndMemType(G_loc, &g); CHKERRQ(ierr); + PetscCall(VecRestoreArrayReadAndMemType(Q_loc, &q)); + PetscCall(VecRestoreArrayAndMemType(G_loc, &g)); // Local-to-Global - ierr = VecZeroEntries(G); CHKERRQ(ierr); - ierr = DMLocalToGlobal(user->dm, G_loc, ADD_VALUES, G); CHKERRQ(ierr); + PetscCall(VecZeroEntries(G)); + PetscCall(DMLocalToGlobal(user->dm, G_loc, ADD_VALUES, G)); // Restore vectors - ierr = DMRestoreLocalVector(user->dm, &G_loc); CHKERRQ(ierr); + PetscCall(DMRestoreLocalVector(user->dm, &G_loc)); PetscFunctionReturn(0); } PetscErrorCode MatGetDiagonal_NS_IJacobian(Mat A, Vec D) { - User user; - Vec D_loc; + User user; + Vec D_loc; PetscScalar *d; PetscMemType mem_type; @@ -270,8 +244,7 @@ PetscErrorCode MatGetDiagonal_NS_IJacobian(Mat A, Vec D) { PetscCall(DMGetLocalVector(user->dm, &D_loc)); PetscCall(VecGetArrayAndMemType(D_loc, &d, &mem_type)); CeedVectorSetArray(user->g_ceed, MemTypeP2C(mem_type), CEED_USE_POINTER, d); - CeedOperatorLinearAssembleDiagonal(user->op_ijacobian, user->g_ceed, - CEED_REQUEST_IMMEDIATE); + 
CeedOperatorLinearAssembleDiagonal(user->op_ijacobian, user->g_ceed, CEED_REQUEST_IMMEDIATE); CeedVectorTakeArray(user->g_ceed, MemTypeP2C(mem_type), NULL); PetscCall(VecRestoreArrayAndMemType(D_loc, &d)); PetscCall(VecZeroEntries(D)); @@ -281,29 +254,27 @@ PetscErrorCode MatGetDiagonal_NS_IJacobian(Mat A, Vec D) { PetscFunctionReturn(0); } -static PetscErrorCode FormPreallocation(User user, PetscBool pbdiagonal, Mat J, - CeedVector *coo_values) { +static PetscErrorCode FormPreallocation(User user, PetscBool pbdiagonal, Mat J, CeedVector *coo_values) { PetscCount ncoo; - PetscInt *rows, *cols; + PetscInt *rows, *cols; PetscFunctionBeginUser; if (pbdiagonal) { CeedSize l_size; CeedOperatorGetActiveVectorLengths(user->op_ijacobian, &l_size, NULL); ncoo = l_size * 5; - rows = malloc(ncoo*sizeof(rows[0])); - cols = malloc(ncoo*sizeof(cols[0])); - for (PetscCount n=0; nop_ijacobian, &ncoo, &rows, - &cols)); + PetscCall(CeedOperatorLinearAssembleSymbolic(user->op_ijacobian, &ncoo, &rows, &cols)); } PetscCall(MatSetPreallocationCOOLocal(J, ncoo, rows, cols)); free(rows); @@ -312,19 +283,16 @@ static PetscErrorCode FormPreallocation(User user, PetscBool pbdiagonal, Mat J, PetscFunctionReturn(0); } -static PetscErrorCode FormSetValues(User user, PetscBool pbdiagonal, Mat J, - CeedVector coo_values) { - CeedMemType mem_type = CEED_MEM_HOST; +static PetscErrorCode FormSetValues(User user, PetscBool pbdiagonal, Mat J, CeedVector coo_values) { + CeedMemType mem_type = CEED_MEM_HOST; const PetscScalar *values; - MatType mat_type; + MatType mat_type; PetscFunctionBeginUser; PetscCall(MatGetType(J, &mat_type)); - if (strstr(mat_type, "kokkos") || strstr(mat_type, "cusparse")) - mem_type = CEED_MEM_DEVICE; + if (strstr(mat_type, "kokkos") || strstr(mat_type, "cusparse")) mem_type = CEED_MEM_DEVICE; if (user->app_ctx->pmat_pbdiagonal) { - CeedOperatorLinearAssemblePointBlockDiagonal(user->op_ijacobian, - coo_values, CEED_REQUEST_IMMEDIATE); + 
CeedOperatorLinearAssemblePointBlockDiagonal(user->op_ijacobian, coo_values, CEED_REQUEST_IMMEDIATE); } else { CeedOperatorLinearAssemble(user->op_ijacobian, coo_values); } @@ -334,40 +302,31 @@ static PetscErrorCode FormSetValues(User user, PetscBool pbdiagonal, Mat J, PetscFunctionReturn(0); } -PetscErrorCode FormIJacobian_NS(TS ts, PetscReal t, Vec Q, Vec Q_dot, - PetscReal shift, Mat J, Mat J_pre, - void *user_data) { - User user = *(User *)user_data; +PetscErrorCode FormIJacobian_NS(TS ts, PetscReal t, Vec Q, Vec Q_dot, PetscReal shift, Mat J, Mat J_pre, void *user_data) { + User user = *(User *)user_data; PetscBool J_is_shell, J_is_mffd, J_pre_is_shell; PetscFunctionBeginUser; - if (user->phys->ijacobian_time_shift_label) - CeedOperatorContextSetDouble(user->op_ijacobian, - user->phys->ijacobian_time_shift_label, &shift); + if (user->phys->ijacobian_time_shift_label) CeedOperatorContextSetDouble(user->op_ijacobian, user->phys->ijacobian_time_shift_label, &shift); PetscCall(PetscObjectTypeCompare((PetscObject)J, MATMFFD, &J_is_mffd)); PetscCall(PetscObjectTypeCompare((PetscObject)J, MATSHELL, &J_is_shell)); - PetscCall(PetscObjectTypeCompare((PetscObject)J_pre, MATSHELL, - &J_pre_is_shell)); + PetscCall(PetscObjectTypeCompare((PetscObject)J_pre, MATSHELL, &J_pre_is_shell)); if (!user->matrices_set_up) { if (J_is_shell) { PetscCall(MatShellSetContext(J, user)); - PetscCall(MatShellSetOperation(J, MATOP_MULT, - (void (*)(void))MatMult_NS_IJacobian)); - PetscCall(MatShellSetOperation(J, MATOP_GET_DIAGONAL, - (void (*)(void))MatGetDiagonal_NS_IJacobian)); + PetscCall(MatShellSetOperation(J, MATOP_MULT, (void (*)(void))MatMult_NS_IJacobian)); + PetscCall(MatShellSetOperation(J, MATOP_GET_DIAGONAL, (void (*)(void))MatGetDiagonal_NS_IJacobian)); PetscCall(MatSetUp(J)); } if (!J_pre_is_shell) { - PetscCall(FormPreallocation(user,user->app_ctx->pmat_pbdiagonal,J_pre, - &user->coo_values_pmat)); + PetscCall(FormPreallocation(user, user->app_ctx->pmat_pbdiagonal, J_pre, 
&user->coo_values_pmat)); } if (J != J_pre && !J_is_shell && !J_is_mffd) { - PetscCall(FormPreallocation(user,PETSC_FALSE,J, &user->coo_values_amat)); + PetscCall(FormPreallocation(user, PETSC_FALSE, J, &user->coo_values_amat)); } user->matrices_set_up = true; } if (!J_pre_is_shell) { - PetscCall(FormSetValues(user, user->app_ctx->pmat_pbdiagonal, J_pre, - user->coo_values_pmat)); + PetscCall(FormSetValues(user, user->app_ctx->pmat_pbdiagonal, J_pre, user->coo_values_pmat)); } if (user->coo_values_amat) { PetscCall(FormSetValues(user, PETSC_FALSE, J, user->coo_values_amat)); @@ -378,11 +337,10 @@ PetscErrorCode FormIJacobian_NS(TS ts, PetscReal t, Vec Q, Vec Q_dot, PetscFunctionReturn(0); } -PetscErrorCode WriteOutput(User user, Vec Q, PetscInt step_no, - PetscScalar time) { - Vec Q_loc; - char file_path[PETSC_MAX_PATH_LEN]; - PetscViewer viewer; +PetscErrorCode WriteOutput(User user, Vec Q, PetscInt step_no, PetscScalar time) { + Vec Q_loc; + char file_path[PETSC_MAX_PATH_LEN]; + PetscViewer viewer; PetscFunctionBeginUser; // Set up output @@ -392,12 +350,10 @@ PetscErrorCode WriteOutput(User user, Vec Q, PetscInt step_no, PetscCall(DMGlobalToLocal(user->dm, Q, INSERT_VALUES, Q_loc)); // Output - PetscCall(PetscSNPrintf(file_path, sizeof file_path, - "%s/ns-%03" PetscInt_FMT ".vtu", - user->app_ctx->output_dir, step_no + user->app_ctx->cont_steps)); + PetscCall( + PetscSNPrintf(file_path, sizeof file_path, "%s/ns-%03" PetscInt_FMT ".vtu", user->app_ctx->output_dir, step_no + user->app_ctx->cont_steps)); - PetscCall(PetscViewerVTKOpen(PetscObjectComm((PetscObject)Q), file_path, - FILE_MODE_WRITE, &viewer)); + PetscCall(PetscViewerVTKOpen(PetscObjectComm((PetscObject)Q), file_path, FILE_MODE_WRITE, &viewer)); PetscCall(VecView(Q_loc, viewer)); PetscCall(PetscViewerDestroy(&viewer)); if (user->dm_viz) { @@ -411,15 +367,12 @@ PetscErrorCode WriteOutput(User user, Vec Q, PetscInt step_no, PetscCall(MatInterpolate(user->interp_viz, Q, Q_refined)); 
PetscCall(VecZeroEntries(Q_refined_loc)); - PetscCall(DMGlobalToLocal(user->dm_viz, Q_refined, INSERT_VALUES, - Q_refined_loc)); + PetscCall(DMGlobalToLocal(user->dm_viz, Q_refined, INSERT_VALUES, Q_refined_loc)); - PetscCall(PetscSNPrintf(file_path_refined, sizeof file_path_refined, - "%s/nsrefined-%03" PetscInt_FMT ".vtu", user->app_ctx->output_dir, + PetscCall(PetscSNPrintf(file_path_refined, sizeof file_path_refined, "%s/nsrefined-%03" PetscInt_FMT ".vtu", user->app_ctx->output_dir, step_no + user->app_ctx->cont_steps)); - PetscCall(PetscViewerVTKOpen(PetscObjectComm((PetscObject)Q_refined), - file_path_refined, FILE_MODE_WRITE, &viewer_refined)); + PetscCall(PetscViewerVTKOpen(PetscObjectComm((PetscObject)Q_refined), file_path_refined, FILE_MODE_WRITE, &viewer_refined)); PetscCall(VecView(Q_refined_loc, viewer_refined)); PetscCall(DMRestoreLocalVector(user->dm_viz, &Q_refined_loc)); PetscCall(DMRestoreGlobalVector(user->dm_viz, &Q_refined)); @@ -429,15 +382,12 @@ PetscErrorCode WriteOutput(User user, Vec Q, PetscInt step_no, // Save data in a binary file for continuation of simulations if (user->app_ctx->add_stepnum2bin) { - PetscCall(PetscSNPrintf(file_path, sizeof file_path, - "%s/ns-solution-%" PetscInt_FMT ".bin", - user->app_ctx->output_dir, step_no + user->app_ctx->cont_steps)); + PetscCall(PetscSNPrintf(file_path, sizeof file_path, "%s/ns-solution-%" PetscInt_FMT ".bin", user->app_ctx->output_dir, + step_no + user->app_ctx->cont_steps)); } else { - PetscCall(PetscSNPrintf(file_path, sizeof file_path, "%s/ns-solution.bin", - user->app_ctx->output_dir)); + PetscCall(PetscSNPrintf(file_path, sizeof file_path, "%s/ns-solution.bin", user->app_ctx->output_dir)); } - PetscCall(PetscViewerBinaryOpen(user->comm, file_path, FILE_MODE_WRITE, - &viewer)); + PetscCall(PetscViewerBinaryOpen(user->comm, file_path, FILE_MODE_WRITE, &viewer)); PetscCall(VecView(Q, viewer)); PetscCall(PetscViewerDestroy(&viewer)); @@ -446,36 +396,30 @@ PetscErrorCode WriteOutput(User 
user, Vec Q, PetscInt step_no, // Dimensionalize time back time /= user->units->second; if (user->app_ctx->add_stepnum2bin) { - PetscCall(PetscSNPrintf(file_path, sizeof file_path, - "%s/ns-time-%" PetscInt_FMT ".bin", - user->app_ctx->output_dir, step_no + user->app_ctx->cont_steps)); + PetscCall(PetscSNPrintf(file_path, sizeof file_path, "%s/ns-time-%" PetscInt_FMT ".bin", user->app_ctx->output_dir, + step_no + user->app_ctx->cont_steps)); } else { - PetscCall(PetscSNPrintf(file_path, sizeof file_path, "%s/ns-time.bin", - user->app_ctx->output_dir)); + PetscCall(PetscSNPrintf(file_path, sizeof file_path, "%s/ns-time.bin", user->app_ctx->output_dir)); } - PetscCall(PetscViewerBinaryOpen(user->comm, file_path, FILE_MODE_WRITE, - &viewer)); + PetscCall(PetscViewerBinaryOpen(user->comm, file_path, FILE_MODE_WRITE, &viewer)); - #if PETSC_VERSION_GE(3,13,0) +#if PETSC_VERSION_GE(3, 13, 0) PetscCall(PetscViewerBinaryWrite(viewer, &time, 1, PETSC_REAL)); - #else +#else PetscCall(PetscViewerBinaryWrite(viewer, &time, 1, PETSC_REAL, true)); - #endif +#endif PetscCall(PetscViewerDestroy(&viewer)); PetscFunctionReturn(0); } // User provided TS Monitor -PetscErrorCode TSMonitor_NS(TS ts, PetscInt step_no, PetscReal time, - Vec Q, void *ctx) { - User user = ctx; +PetscErrorCode TSMonitor_NS(TS ts, PetscInt step_no, PetscReal time, Vec Q, void *ctx) { + User user = ctx; PetscFunctionBeginUser; // Print every 'output_freq' steps - if (user->app_ctx->output_freq <= 0 - || step_no % user->app_ctx->output_freq != 0) - PetscFunctionReturn(0); + if (user->app_ctx->output_freq <= 0 || step_no % user->app_ctx->output_freq != 0) PetscFunctionReturn(0); PetscCall(WriteOutput(user, Q, step_no, time)); @@ -483,76 +427,71 @@ PetscErrorCode TSMonitor_NS(TS ts, PetscInt step_no, PetscReal time, } // TS: Create, setup, and solve -PetscErrorCode TSSolve_NS(DM dm, User user, AppCtx app_ctx, Physics phys, - Vec *Q, PetscScalar *f_time, TS *ts) { - MPI_Comm comm = user->comm; - TSAdapt adapt; - 
PetscScalar final_time; - PetscErrorCode ierr; +PetscErrorCode TSSolve_NS(DM dm, User user, AppCtx app_ctx, Physics phys, Vec *Q, PetscScalar *f_time, TS *ts) { + MPI_Comm comm = user->comm; + TSAdapt adapt; + PetscScalar final_time; PetscFunctionBeginUser; - ierr = TSCreate(comm, ts); CHKERRQ(ierr); - ierr = TSSetDM(*ts, dm); CHKERRQ(ierr); + PetscCall(TSCreate(comm, ts)); + PetscCall(TSSetDM(*ts, dm)); if (phys->implicit) { - ierr = TSSetType(*ts, TSBDF); CHKERRQ(ierr); + PetscCall(TSSetType(*ts, TSBDF)); if (user->op_ifunction) { - ierr = TSSetIFunction(*ts, NULL, IFunction_NS, &user); CHKERRQ(ierr); - } else { // Implicit integrators can fall back to using an RHSFunction - ierr = TSSetRHSFunction(*ts, NULL, RHS_NS, &user); CHKERRQ(ierr); + PetscCall(TSSetIFunction(*ts, NULL, IFunction_NS, &user)); + } else { // Implicit integrators can fall back to using an RHSFunction + PetscCall(TSSetRHSFunction(*ts, NULL, RHS_NS, &user)); } if (user->op_ijacobian) { - ierr = DMTSSetIJacobian(dm, FormIJacobian_NS, &user); CHKERRQ(ierr); + PetscCall(DMTSSetIJacobian(dm, FormIJacobian_NS, &user)); if (app_ctx->amat_type) { - Mat Pmat,Amat; - ierr = DMCreateMatrix(dm, &Pmat); CHKERRQ(ierr); - ierr = DMSetMatType(dm, app_ctx->amat_type); CHKERRQ(ierr); - ierr = DMCreateMatrix(dm, &Amat); CHKERRQ(ierr); - ierr = TSSetIJacobian(*ts, Amat, Pmat, NULL, NULL); CHKERRQ(ierr); - ierr = MatDestroy(&Amat); CHKERRQ(ierr); - ierr = MatDestroy(&Pmat); CHKERRQ(ierr); + Mat Pmat, Amat; + PetscCall(DMCreateMatrix(dm, &Pmat)); + PetscCall(DMSetMatType(dm, app_ctx->amat_type)); + PetscCall(DMCreateMatrix(dm, &Amat)); + PetscCall(TSSetIJacobian(*ts, Amat, Pmat, NULL, NULL)); + PetscCall(MatDestroy(&Amat)); + PetscCall(MatDestroy(&Pmat)); } } } else { - if (!user->op_rhs) SETERRQ(comm, PETSC_ERR_ARG_NULL, - "Problem does not provide RHSFunction"); - ierr = TSSetType(*ts, TSRK); CHKERRQ(ierr); - ierr = TSRKSetType(*ts, TSRK5F); CHKERRQ(ierr); - ierr = TSSetRHSFunction(*ts, NULL, RHS_NS, &user); 
CHKERRQ(ierr); + if (!user->op_rhs) SETERRQ(comm, PETSC_ERR_ARG_NULL, "Problem does not provide RHSFunction"); + PetscCall(TSSetType(*ts, TSRK)); + PetscCall(TSRKSetType(*ts, TSRK5F)); + PetscCall(TSSetRHSFunction(*ts, NULL, RHS_NS, &user)); + } + PetscCall(TSSetMaxTime(*ts, 500. * user->units->second)); + PetscCall(TSSetExactFinalTime(*ts, TS_EXACTFINALTIME_STEPOVER)); + PetscCall(TSSetTimeStep(*ts, 1.e-2 * user->units->second)); + if (app_ctx->test_mode) { + PetscCall(TSSetMaxSteps(*ts, 10)); } - ierr = TSSetMaxTime(*ts, 500. * user->units->second); CHKERRQ(ierr); - ierr = TSSetExactFinalTime(*ts, TS_EXACTFINALTIME_STEPOVER); CHKERRQ(ierr); - ierr = TSSetTimeStep(*ts, 1.e-2 * user->units->second); CHKERRQ(ierr); - if (app_ctx->test_mode) {ierr = TSSetMaxSteps(*ts, 10); CHKERRQ(ierr);} - ierr = TSGetAdapt(*ts, &adapt); CHKERRQ(ierr); - ierr = TSAdaptSetStepLimits(adapt, - 1.e-12 * user->units->second, - 1.e2 * user->units->second); CHKERRQ(ierr); - ierr = TSSetFromOptions(*ts); CHKERRQ(ierr); - user->time = -1.0; // require all BCs and ctx to be updated + PetscCall(TSGetAdapt(*ts, &adapt)); + PetscCall(TSAdaptSetStepLimits(adapt, 1.e-12 * user->units->second, 1.e2 * user->units->second)); + PetscCall(TSSetFromOptions(*ts)); + user->time = -1.0; // require all BCs and ctx to be updated user->dt = -1.0; - if (!app_ctx->cont_steps) { // print initial condition + if (!app_ctx->cont_steps) { // print initial condition if (!app_ctx->test_mode) { - ierr = TSMonitor_NS(*ts, 0, 0., *Q, user); CHKERRQ(ierr); + PetscCall(TSMonitor_NS(*ts, 0, 0., *Q, user)); } - } else { // continue from time of last output + } else { // continue from time of last output PetscReal time; PetscInt count; PetscViewer viewer; - ierr = PetscViewerBinaryOpen(comm, app_ctx->cont_time_file, FILE_MODE_READ, - &viewer); - CHKERRQ(ierr); - ierr = PetscViewerBinaryRead(viewer, &time, 1, &count, PETSC_REAL); - CHKERRQ(ierr); - ierr = PetscViewerDestroy(&viewer); CHKERRQ(ierr); - ierr = TSSetTime(*ts, time 
* user->units->second); CHKERRQ(ierr); + + PetscCall(PetscViewerBinaryOpen(comm, app_ctx->cont_time_file, FILE_MODE_READ, &viewer)); + PetscCall(PetscViewerBinaryRead(viewer, &time, 1, &count, PETSC_REAL)); + PetscCall(PetscViewerDestroy(&viewer)); + PetscCall(TSSetTime(*ts, time * user->units->second)); } if (!app_ctx->test_mode) { - ierr = TSMonitorSet(*ts, TSMonitor_NS, user, NULL); CHKERRQ(ierr); + PetscCall(TSMonitorSet(*ts, TSMonitor_NS, user, NULL)); } // Solve PetscScalar start_time; - ierr = TSGetTime(*ts, &start_time); CHKERRQ(ierr); + PetscCall(TSGetTime(*ts, &start_time)); PetscPreLoadBegin(PETSC_FALSE, "Fluids Solve"); PetscCall(TSSetTime(*ts, start_time)); @@ -566,24 +505,21 @@ PetscErrorCode TSSolve_NS(DM dm, User user, AppCtx app_ctx, Physics phys, PetscCall(VecCopy(*Q, Q_preload)); PetscCall(TSGetSNES(*ts, &snes)); PetscCall(SNESGetTolerances(snes, NULL, &rtol, NULL, NULL, NULL)); - PetscCall(SNESSetTolerances(snes, PETSC_DEFAULT, .99, PETSC_DEFAULT, - PETSC_DEFAULT, PETSC_DEFAULT)); + PetscCall(SNESSetTolerances(snes, PETSC_DEFAULT, .99, PETSC_DEFAULT, PETSC_DEFAULT, PETSC_DEFAULT)); PetscCall(TSSetSolution(*ts, *Q)); PetscCall(TSStep(*ts)); - PetscCall(SNESSetTolerances(snes, PETSC_DEFAULT, rtol, PETSC_DEFAULT, - PETSC_DEFAULT, PETSC_DEFAULT)); + PetscCall(SNESSetTolerances(snes, PETSC_DEFAULT, rtol, PETSC_DEFAULT, PETSC_DEFAULT, PETSC_DEFAULT)); PetscCall(VecDestroy(&Q_preload)); // LCOV_EXCL_STOP } else { - ierr = PetscBarrier((PetscObject) *ts); CHKERRQ(ierr); - ierr = TSSolve(*ts, *Q); CHKERRQ(ierr); + PetscCall(PetscBarrier((PetscObject)*ts)); + PetscCall(TSSolve(*ts, *Q)); } PetscPreLoadEnd(); PetscCall(TSGetSolveTime(*ts, &final_time)); *f_time = final_time; - if (!app_ctx->test_mode) { if (user->app_ctx->output_freq > 0 || user->app_ctx->output_freq == -1) { PetscInt step_no; @@ -596,9 +532,7 @@ PetscErrorCode TSSolve_NS(DM dm, User user, AppCtx app_ctx, Physics phys, PetscCall(PetscLogStageGetId("Fluids Solve", &stage_id)); 
PetscCall(PetscLogGetStageLog(&stage_log)); - PetscCall(PetscPrintf(PETSC_COMM_WORLD, - "Time taken for solution (sec): %g\n", - stage_log->stageInfo[stage_id].perfInfo.time)); + PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Time taken for solution (sec): %g\n", stage_log->stageInfo[stage_id].perfInfo.time)); } PetscFunctionReturn(0); } diff --git a/examples/mfem/bp1.cpp b/examples/mfem/bp1.cpp index 44b9919e23..120125572b 100644 --- a/examples/mfem/bp1.cpp +++ b/examples/mfem/bp1.cpp @@ -31,41 +31,38 @@ /// @file /// MFEM mass operator based on libCEED +#include "bp1.hpp" + #include + #include -#include "bp1.hpp" /// Continuous function to project on the discrete FE space double solution(const mfem::Vector &pt) { - return pt.Norml2(); // distance to the origin + return pt.Norml2(); // distance to the origin } //TESTARGS -ceed {ceed_resource} -t -no-vis --size 2000 --order 4 int main(int argc, char *argv[]) { // 1. Parse command-line options. const char *ceed_spec = "/cpu/self"; - #ifndef MFEM_DIR +#ifndef MFEM_DIR const char *mesh_file = "../../../mfem/data/star.mesh"; - #else +#else const char *mesh_file = MFEM_DIR "/data/star.mesh"; - #endif - int order = 1; - bool visualization = true; - bool test = false; - double max_nnodes = 50000; +#endif + int order = 1; + bool visualization = true; + bool test = false; + double max_nnodes = 50000; mfem::OptionsParser args(argc, argv); args.AddOption(&ceed_spec, "-c", "-ceed", "Ceed specification."); args.AddOption(&mesh_file, "-m", "--mesh", "Mesh file to use."); - args.AddOption(&order, "-o", "--order", - "Finite element order (polynomial degree)."); + args.AddOption(&order, "-o", "--order", "Finite element order (polynomial degree)."); args.AddOption(&max_nnodes, "-s", "--size", "Maximum size (number of DoFs)"); - args.AddOption(&visualization, "-vis", "--visualization", "-no-vis", - "--no-visualization", - "Enable or disable GLVis visualization."); - args.AddOption(&test, "-t", "--test", "-no-test", - "--no-test", - "Enable 
or disable test mode."); + args.AddOption(&visualization, "-vis", "--visualization", "-no-vis", "--no-visualization", "Enable or disable GLVis visualization."); + args.AddOption(&test, "-t", "--test", "-no-test", "--no-test", "Enable or disable test mode."); args.Parse(); if (!args.Good()) { args.PrintUsage(std::cout); @@ -81,15 +78,14 @@ int main(int argc, char *argv[]) { // 3. Read the mesh from the given mesh file. mfem::Mesh *mesh = new mfem::Mesh(mesh_file, 1, 1); - int dim = mesh->Dimension(); + int dim = mesh->Dimension(); // 4. Refine the mesh to increase the resolution. In this example we do // 'ref_levels' of uniform refinement. We choose 'ref_levels' to be the // largest number that gives a final system with no more than 50,000 // unknowns, approximately. { - int ref_levels = - (int)floor((log(max_nnodes/mesh->GetNE())-dim*log(order))/log(2.)/dim); + int ref_levels = (int)floor((log(max_nnodes / mesh->GetNE()) - dim * log(order)) / log(2.) / dim); for (int l = 0; l < ref_levels; l++) { mesh->UniformRefinement(); } @@ -104,16 +100,15 @@ int main(int argc, char *argv[]) { // 5. Define a finite element space on the mesh. Here we use continuous // Lagrange finite elements of the specified order. MFEM_VERIFY(order > 0, "invalid order"); - mfem::FiniteElementCollection *fec = new mfem::H1_FECollection(order, dim); - mfem::FiniteElementSpace *fespace = new mfem::FiniteElementSpace(mesh, fec); + mfem::FiniteElementCollection *fec = new mfem::H1_FECollection(order, dim); + mfem::FiniteElementSpace *fespace = new mfem::FiniteElementSpace(mesh, fec); if (!test) { - std::cout << "Number of finite element unknowns: " - << fespace->GetTrueVSize() << std::endl; + std::cout << "Number of finite element unknowns: " << fespace->GetTrueVSize() << std::endl; } // 6. Construct a rhs vector using the linear form f(v) = (solution, v), where // v is a test function. 
- mfem::LinearForm b(fespace); + mfem::LinearForm b(fespace); mfem::FunctionCoefficient sol_coeff(solution); b.AddDomainIntegrator(new mfem::DomainLFIntegrator(sol_coeff)); b.Assemble(); @@ -140,10 +135,9 @@ int main(int argc, char *argv[]) { // 9. Compute and print the L2 projection error. double err_l2 = sol.ComputeL2Error(sol_coeff); if (!test) { - std::cout << "L2 projection error: " << err_l2 - << std::endl; + std::cout << "L2 projection error: " << err_l2 << std::endl; } else { - if (fabs(sol.ComputeL2Error(sol_coeff))>2e-4) { + if (fabs(sol.ComputeL2Error(sol_coeff)) > 2e-4) { std::cout << "Error too large: " << err_l2 << std::endl; } } @@ -151,8 +145,8 @@ int main(int argc, char *argv[]) { // 10. Open a socket connection to GLVis and send the mesh and solution for // visualization. if (visualization) { - char vishost[] = "localhost"; - int visport = 19916; + char vishost[] = "localhost"; + int visport = 19916; mfem::socketstream sol_sock(vishost, visport); sol_sock.precision(8); sol_sock << "solution\n" << *mesh << sol << std::flush; diff --git a/examples/mfem/bp1.h b/examples/mfem/bp1.h index fdfb020d6b..dbaa08dcf4 100644 --- a/examples/mfem/bp1.h +++ b/examples/mfem/bp1.h @@ -11,62 +11,55 @@ #include /// A structure used to pass additional data to f_build_mass -struct BuildContext { CeedInt dim, space_dim; }; +struct BuildContext { + CeedInt dim, space_dim; +}; /// libCEED Q-function for building quadrature data for a mass operator -CEED_QFUNCTION(f_build_mass)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(f_build_mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // in[0] is Jacobians with shape [dim, nc=dim, Q] // in[1] is quadrature weights, size (Q) - BuildContext *bc = (BuildContext *)ctx; + BuildContext *bc = (BuildContext *)ctx; const CeedScalar *J = in[0], *w = in[1]; - CeedScalar *qdata = out[0]; + CeedScalar *qdata = out[0]; - switch (bc->dim + 
10*bc->space_dim) { - case 11: - // Quadrature Point Loop - CeedPragmaSIMD - for (CeedInt i=0; idim + 10 * bc->space_dim) { + case 11: + // Quadrature Point Loop + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { qdata[i] = J[i] * w[i]; } + break; + case 22: + // Quadrature Point Loop + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { + // 0 2 + // 1 3 + qdata[i] = (J[i + Q * 0] * J[i + Q * 3] - J[i + Q * 1] * J[i + Q * 2]) * w[i]; + } + break; + case 33: + // Quadrature Point Loop + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { + // 0 3 6 + // 1 4 7 + // 2 5 8 + qdata[i] = (J[i + Q * 0] * (J[i + Q * 4] * J[i + Q * 8] - J[i + Q * 5] * J[i + Q * 7]) - + J[i + Q * 1] * (J[i + Q * 3] * J[i + Q * 8] - J[i + Q * 5] * J[i + Q * 6]) + + J[i + Q * 2] * (J[i + Q * 3] * J[i + Q * 7] - J[i + Q * 4] * J[i + Q * 6])) * + w[i]; + } + break; } return 0; } /// libCEED Q-function for applying a mass operator -CEED_QFUNCTION(f_apply_mass)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(f_apply_mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { const CeedScalar *u = in[0], *qdata = in[1]; - CeedScalar *v = out[0]; + CeedScalar *v = out[0]; // Quadrature Point Loop - CeedPragmaSIMD - for (CeedInt i=0; i + #include + #include "bp1.h" /// Wrapper for a mass CeedOperator as an mfem::Operator class CeedMassOperator : public mfem::Operator { protected: const mfem::FiniteElementSpace *fes; - CeedOperator build_oper, oper; - CeedBasis basis, mesh_basis; - CeedElemRestriction restr, mesh_restr, restr_i, mesh_restr_i; - CeedQFunction apply_qfunc, build_qfunc; - CeedQFunctionContext build_ctx; - CeedVector node_coords, qdata; - CeedVector u, v; + CeedOperator build_oper, oper; + CeedBasis basis, mesh_basis; + CeedElemRestriction restr, mesh_restr, restr_i, mesh_restr_i; + CeedQFunction apply_qfunc, build_qfunc; + CeedQFunctionContext build_ctx; + CeedVector node_coords, qdata; + CeedVector u, v; 
BuildContext build_ctx_data; - static void FESpace2Ceed(const mfem::FiniteElementSpace *fes, - const mfem::IntegrationRule &ir, - Ceed ceed, CeedBasis *basis, + static void FESpace2Ceed(const mfem::FiniteElementSpace *fes, const mfem::IntegrationRule &ir, Ceed ceed, CeedBasis *basis, CeedElemRestriction *restr) { - mfem::Mesh *mesh = fes->GetMesh(); - const mfem::FiniteElement *fe = fes->GetFE(0); - const int order = fes->GetOrder(0); - mfem::Array dof_map; + mfem::Mesh *mesh = fes->GetMesh(); + const mfem::FiniteElement *fe = fes->GetFE(0); + const int order = fes->GetOrder(0); + mfem::Array dof_map; switch (mesh->Dimension()) { - case 1: { - const mfem::H1_SegmentElement *h1_fe = - dynamic_cast(fe); - MFEM_VERIFY(h1_fe, "invalid FE"); - h1_fe->GetDofMap().Copy(dof_map); - break; - } - case 2: { - const mfem::H1_QuadrilateralElement *h1_fe = - dynamic_cast(fe); - MFEM_VERIFY(h1_fe, "invalid FE"); - h1_fe->GetDofMap().Copy(dof_map); - break; - } - case 3: { - const mfem::H1_HexahedronElement *h1_fe = - dynamic_cast(fe); - MFEM_VERIFY(h1_fe, "invalid FE"); - h1_fe->GetDofMap().Copy(dof_map); - break; - } + case 1: { + const mfem::H1_SegmentElement *h1_fe = dynamic_cast(fe); + MFEM_VERIFY(h1_fe, "invalid FE"); + h1_fe->GetDofMap().Copy(dof_map); + break; + } + case 2: { + const mfem::H1_QuadrilateralElement *h1_fe = dynamic_cast(fe); + MFEM_VERIFY(h1_fe, "invalid FE"); + h1_fe->GetDofMap().Copy(dof_map); + break; + } + case 3: { + const mfem::H1_HexahedronElement *h1_fe = dynamic_cast(fe); + MFEM_VERIFY(h1_fe, "invalid FE"); + h1_fe->GetDofMap().Copy(dof_map); + break; + } } - const mfem::FiniteElement *fe1d = - fes->FEColl()->FiniteElementForGeometry(mfem::Geometry::SEGMENT); - mfem::DenseMatrix shape1d(fe1d->GetDof(), ir.GetNPoints()); - mfem::DenseMatrix grad_1d(fe1d->GetDof(), ir.GetNPoints()); - mfem::Vector q_ref_1d(ir.GetNPoints()), q_weight_1d(ir.GetNPoints()); - mfem::Vector shape_i(shape1d.Height()); - mfem::DenseMatrix grad_i(grad_1d.Height(), 1); - const 
mfem::H1_SegmentElement *h1_fe1d = - dynamic_cast(fe1d); + const mfem::FiniteElement *fe1d = fes->FEColl()->FiniteElementForGeometry(mfem::Geometry::SEGMENT); + mfem::DenseMatrix shape1d(fe1d->GetDof(), ir.GetNPoints()); + mfem::DenseMatrix grad_1d(fe1d->GetDof(), ir.GetNPoints()); + mfem::Vector q_ref_1d(ir.GetNPoints()), q_weight_1d(ir.GetNPoints()); + mfem::Vector shape_i(shape1d.Height()); + mfem::DenseMatrix grad_i(grad_1d.Height(), 1); + const mfem::H1_SegmentElement *h1_fe1d = dynamic_cast(fe1d); MFEM_VERIFY(h1_fe1d, "invalid FE"); const mfem::Array &dof_map_1d = h1_fe1d->GetDofMap(); for (int i = 0; i < ir.GetNPoints(); i++) { const mfem::IntegrationPoint &ip = ir.IntPoint(i); - q_ref_1d(i) = ip.x; - q_weight_1d(i) = ip.weight; + q_ref_1d(i) = ip.x; + q_weight_1d(i) = ip.weight; fe1d->CalcShape(ip, shape_i); fe1d->CalcDShape(ip, grad_i); for (int j = 0; j < shape1d.Height(); j++) { - shape1d(j,i) = shape_i(dof_map_1d[j]); - grad_1d(j,i) = grad_i(dof_map_1d[j],0); + shape1d(j, i) = shape_i(dof_map_1d[j]); + grad_1d(j, i) = grad_i(dof_map_1d[j], 0); } } - CeedBasisCreateTensorH1(ceed, mesh->Dimension(), fes->GetVDim(), order+1, - ir.GetNPoints(), shape1d.GetData(), - grad_1d.GetData(), q_ref_1d.GetData(), - q_weight_1d.GetData(), basis); + CeedBasisCreateTensorH1(ceed, mesh->Dimension(), fes->GetVDim(), order + 1, ir.GetNPoints(), shape1d.GetData(), grad_1d.GetData(), + q_ref_1d.GetData(), q_weight_1d.GetData(), basis); const mfem::Table &el_dof = fes->GetElementToDofTable(); - mfem::Array tp_el_dof(el_dof.Size_of_connections()); + mfem::Array tp_el_dof(el_dof.Size_of_connections()); for (int i = 0; i < mesh->GetNE(); i++) { - const int el_offset = fe->GetDof()*i; + const int el_offset = fe->GetDof() * i; for (int j = 0; j < fe->GetDof(); j++) { tp_el_dof[j + el_offset] = el_dof.GetJ()[dof_map[j] + el_offset]; } } - CeedElemRestrictionCreate(ceed, mesh->GetNE(), fe->GetDof(), - fes->GetVDim(), fes->GetNDofs(), - (fes->GetVDim())*(fes->GetNDofs()), - 
CEED_MEM_HOST, CEED_COPY_VALUES, - tp_el_dof.GetData(), restr); + CeedElemRestrictionCreate(ceed, mesh->GetNE(), fe->GetDof(), fes->GetVDim(), fes->GetNDofs(), (fes->GetVDim()) * (fes->GetNDofs()), CEED_MEM_HOST, + CEED_COPY_VALUES, tp_el_dof.GetData(), restr); } public: /// Constructor. Assumes @a fes is a scalar FE space. - CeedMassOperator(Ceed ceed, const mfem::FiniteElementSpace *fes) - : Operator(fes->GetNDofs()), - fes(fes) { - mfem::Mesh *mesh = fes->GetMesh(); - const int order = fes->GetOrder(0); - const int ir_order = 2*(order + 2) - 1; // <----- - const mfem::IntegrationRule &ir = - mfem::IntRules.Get(mfem::Geometry::SEGMENT, ir_order); - CeedInt num_elem = mesh->GetNE(), dim = mesh->SpaceDimension(), - ncompx = dim, nqpts; + CeedMassOperator(Ceed ceed, const mfem::FiniteElementSpace *fes) : Operator(fes->GetNDofs()), fes(fes) { + mfem::Mesh *mesh = fes->GetMesh(); + const int order = fes->GetOrder(0); + const int ir_order = 2 * (order + 2) - 1; // <----- + const mfem::IntegrationRule &ir = mfem::IntRules.Get(mfem::Geometry::SEGMENT, ir_order); + CeedInt num_elem = mesh->GetNE(), dim = mesh->SpaceDimension(), ncompx = dim, nqpts; FESpace2Ceed(fes, ir, ceed, &basis, &restr); @@ -120,56 +106,44 @@ class CeedMassOperator : public mfem::Operator { CeedBasisGetNumQuadraturePoints(basis, &nqpts); CeedInt strides[3] = {1, nqpts, nqpts}; - CeedElemRestrictionCreateStrided(ceed, num_elem, nqpts, 1, nqpts*num_elem, - strides, &restr_i); + CeedElemRestrictionCreateStrided(ceed, num_elem, nqpts, 1, nqpts * num_elem, strides, &restr_i); CeedVectorCreate(ceed, mesh->GetNodes()->Size(), &node_coords); - CeedVectorSetArray(node_coords, CEED_MEM_HOST, CEED_USE_POINTER, - mesh->GetNodes()->GetData()); + CeedVectorSetArray(node_coords, CEED_MEM_HOST, CEED_USE_POINTER, mesh->GetNodes()->GetData()); - CeedVectorCreate(ceed, num_elem*nqpts, &qdata); + CeedVectorCreate(ceed, num_elem * nqpts, &qdata); // Context data to be passed to the 'f_build_mass' Q-function. 
- build_ctx_data.dim = mesh->Dimension(); + build_ctx_data.dim = mesh->Dimension(); build_ctx_data.space_dim = dim; CeedQFunctionContextCreate(ceed, &build_ctx); - CeedQFunctionContextSetData(build_ctx, CEED_MEM_HOST, CEED_USE_POINTER, - sizeof(build_ctx_data), &build_ctx_data); + CeedQFunctionContextSetData(build_ctx, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(build_ctx_data), &build_ctx_data); // Create the Q-function that builds the mass operator (i.e. computes its // quadrature data) and set its context data. - CeedQFunctionCreateInterior(ceed, 1, f_build_mass, - f_build_mass_loc, &build_qfunc); - CeedQFunctionAddInput(build_qfunc, "dx", ncompx*dim, - CEED_EVAL_GRAD); + CeedQFunctionCreateInterior(ceed, 1, f_build_mass, f_build_mass_loc, &build_qfunc); + CeedQFunctionAddInput(build_qfunc, "dx", ncompx * dim, CEED_EVAL_GRAD); CeedQFunctionAddInput(build_qfunc, "weights", 1, CEED_EVAL_WEIGHT); CeedQFunctionAddOutput(build_qfunc, "qdata", 1, CEED_EVAL_NONE); CeedQFunctionSetContext(build_qfunc, build_ctx); // Create the operator that builds the quadrature data for the mass operator. - CeedOperatorCreate(ceed, build_qfunc, CEED_QFUNCTION_NONE, - CEED_QFUNCTION_NONE, &build_oper); - CeedOperatorSetField(build_oper, "dx", mesh_restr, mesh_basis, - CEED_VECTOR_ACTIVE); - CeedOperatorSetField(build_oper, "weights", CEED_ELEMRESTRICTION_NONE, - mesh_basis, CEED_VECTOR_NONE); - CeedOperatorSetField(build_oper, "qdata", restr_i, CEED_BASIS_COLLOCATED, - CEED_VECTOR_ACTIVE); + CeedOperatorCreate(ceed, build_qfunc, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &build_oper); + CeedOperatorSetField(build_oper, "dx", mesh_restr, mesh_basis, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(build_oper, "weights", CEED_ELEMRESTRICTION_NONE, mesh_basis, CEED_VECTOR_NONE); + CeedOperatorSetField(build_oper, "qdata", restr_i, CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE); // Compute the quadrature data for the mass operator. 
- CeedOperatorApply(build_oper, node_coords, qdata, - CEED_REQUEST_IMMEDIATE); + CeedOperatorApply(build_oper, node_coords, qdata, CEED_REQUEST_IMMEDIATE); // Create the Q-function that defines the action of the mass operator. - CeedQFunctionCreateInterior(ceed, 1, f_apply_mass, - f_apply_mass_loc, &apply_qfunc); + CeedQFunctionCreateInterior(ceed, 1, f_apply_mass, f_apply_mass_loc, &apply_qfunc); CeedQFunctionAddInput(apply_qfunc, "u", 1, CEED_EVAL_INTERP); CeedQFunctionAddInput(apply_qfunc, "qdata", 1, CEED_EVAL_NONE); CeedQFunctionAddOutput(apply_qfunc, "v", 1, CEED_EVAL_INTERP); // Create the mass operator. - CeedOperatorCreate(ceed, apply_qfunc, CEED_QFUNCTION_NONE, - CEED_QFUNCTION_NONE, &oper); + CeedOperatorCreate(ceed, apply_qfunc, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &oper); CeedOperatorSetField(oper, "u", restr, basis, CEED_VECTOR_ACTIVE); CeedOperatorSetField(oper, "qdata", restr_i, CEED_BASIS_COLLOCATED, qdata); CeedOperatorSetField(oper, "v", restr, basis, CEED_VECTOR_ACTIVE); diff --git a/examples/mfem/bp3.cpp b/examples/mfem/bp3.cpp index 0216c061d1..8e90def7f5 100644 --- a/examples/mfem/bp3.cpp +++ b/examples/mfem/bp3.cpp @@ -31,34 +31,35 @@ /// @file /// MFEM diffusion operator based on libCEED +#include "bp3.hpp" + #include + #include -#include "bp3.hpp" /// Exact solution double solution(const mfem::Vector &pt) { - static const double x[3] = { -0.32, 0.15, 0.24 }; - static const double k[3] = { 1.21, 1.45, 1.37 }; - double val = sin(M_PI*(x[0]+k[0]*pt(0))); - for (int d = 1; d < pt.Size(); d++) - val *= sin(M_PI*(x[d]+k[d]*pt(d))); + static const double x[3] = {-0.32, 0.15, 0.24}; + static const double k[3] = {1.21, 1.45, 1.37}; + double val = sin(M_PI * (x[0] + k[0] * pt(0))); + for (int d = 1; d < pt.Size(); d++) val *= sin(M_PI * (x[d] + k[d] * pt(d))); return val; } /// Right-hand side double rhs(const mfem::Vector &pt) { - static const double x[3] = { -0.32, 0.15, 0.24 }; - static const double k[3] = { 1.21, 1.45, 1.37 }; - double 
f[3], l[3], val, lap; - f[0] = sin(M_PI*(x[0]+k[0]*pt(0))); - l[0] = M_PI*M_PI*k[0]*k[0]*f[0]; - val = f[0]; - lap = l[0]; + static const double x[3] = {-0.32, 0.15, 0.24}; + static const double k[3] = {1.21, 1.45, 1.37}; + double f[3], l[3], val, lap; + f[0] = sin(M_PI * (x[0] + k[0] * pt(0))); + l[0] = M_PI * M_PI * k[0] * k[0] * f[0]; + val = f[0]; + lap = l[0]; for (int d = 1; d < pt.Size(); d++) { - f[d] = sin(M_PI*(x[d]+k[d]*pt(d))); - l[d] = M_PI*M_PI*k[d]*k[d]*f[d]; - lap = lap*f[d] + val*l[d]; - val = val*f[d]; + f[d] = sin(M_PI * (x[d] + k[d] * pt(d))); + l[d] = M_PI * M_PI * k[d] * k[d] * f[d]; + lap = lap * f[d] + val * l[d]; + val = val * f[d]; } return lap; } @@ -67,28 +68,23 @@ double rhs(const mfem::Vector &pt) { int main(int argc, char *argv[]) { // 1. Parse command-line options. const char *ceed_spec = "/cpu/self"; - #ifndef MFEM_DIR +#ifndef MFEM_DIR const char *mesh_file = "../../../mfem/data/star.mesh"; - #else +#else const char *mesh_file = MFEM_DIR "/data/star.mesh"; - #endif - int order = 2; - bool visualization = true; - bool test = false; - double max_nnodes = 50000; +#endif + int order = 2; + bool visualization = true; + bool test = false; + double max_nnodes = 50000; mfem::OptionsParser args(argc, argv); args.AddOption(&ceed_spec, "-c", "-ceed", "Ceed specification."); args.AddOption(&mesh_file, "-m", "--mesh", "Mesh file to use."); - args.AddOption(&order, "-o", "--order", - "Finite element order (polynomial degree)."); + args.AddOption(&order, "-o", "--order", "Finite element order (polynomial degree)."); args.AddOption(&max_nnodes, "-s", "--size", "Maximum size (number of DoFs)"); - args.AddOption(&visualization, "-vis", "--visualization", "-no-vis", - "--no-visualization", - "Enable or disable GLVis visualization."); - args.AddOption(&test, "-t", "--test", "-no-test", - "--no-test", - "Enable or disable test mode."); + args.AddOption(&visualization, "-vis", "--visualization", "-no-vis", "--no-visualization", "Enable or disable GLVis 
visualization."); + args.AddOption(&test, "-t", "--test", "-no-test", "--no-test", "Enable or disable test mode."); args.Parse(); if (!args.Good()) { args.PrintUsage(std::cout); @@ -104,15 +100,14 @@ int main(int argc, char *argv[]) { // 3. Read the mesh from the given mesh file. mfem::Mesh *mesh = new mfem::Mesh(mesh_file, 1, 1); - int dim = mesh->Dimension(); + int dim = mesh->Dimension(); // 4. Refine the mesh to increase the resolution. In this example we do // 'ref_levels' of uniform refinement. We choose 'ref_levels' to be the // largest number that gives a final system with no more than 50,000 // unknowns, approximately. { - int ref_levels = - (int)floor((log(max_nnodes/mesh->GetNE())-dim*log(order))/log(2.)/dim); + int ref_levels = (int)floor((log(max_nnodes / mesh->GetNE()) - dim * log(order)) / log(2.) / dim); for (int l = 0; l < ref_levels; l++) { mesh->UniformRefinement(); } @@ -127,16 +122,15 @@ int main(int argc, char *argv[]) { // 5. Define a finite element space on the mesh. Here we use continuous // Lagrange finite elements of the specified order. MFEM_VERIFY(order > 0, "invalid order"); - mfem::FiniteElementCollection *fec = new mfem::H1_FECollection(order, dim); - mfem::FiniteElementSpace *fespace = new mfem::FiniteElementSpace(mesh, fec); + mfem::FiniteElementCollection *fec = new mfem::H1_FECollection(order, dim); + mfem::FiniteElementSpace *fespace = new mfem::FiniteElementSpace(mesh, fec); if (!test) { - std::cout << "Number of finite element unknowns: " - << fespace->GetTrueVSize() << std::endl; + std::cout << "Number of finite element unknowns: " << fespace->GetTrueVSize() << std::endl; } mfem::FunctionCoefficient sol_coeff(solution); - mfem::Array ess_tdof_list; - mfem::GridFunction sol(fespace); + mfem::Array ess_tdof_list; + mfem::GridFunction sol(fespace); if (mesh->bdr_attributes.Size()) { mfem::Array ess_bdr(mesh->bdr_attributes.Max()); ess_bdr = 1; @@ -146,7 +140,7 @@ int main(int argc, char *argv[]) { // 6. 
Construct a rhs vector using the linear form f(v) = (rhs, v), where // v is a test function. - mfem::LinearForm b(fespace); + mfem::LinearForm b(fespace); mfem::FunctionCoefficient rhs_coeff(rhs); b.AddDomainIntegrator(new mfem::DomainLFIntegrator(rhs_coeff)); b.Assemble(); @@ -156,7 +150,7 @@ int main(int argc, char *argv[]) { CeedDiffusionOperator diff(ceed, fespace); mfem::Operator *D; - mfem::Vector X, B; + mfem::Vector X, B; diff.FormLinearSystem(ess_tdof_list, sol, b, D, X, B); // 8. Solve the discrete system using the conjugate gradients (CG) method. @@ -175,10 +169,9 @@ int main(int argc, char *argv[]) { // 9. Compute and print the L2 norm of the error. double err_l2 = sol.ComputeL2Error(sol_coeff); if (!test) { - std::cout << "L2 projection error: " << err_l2 - << std::endl; + std::cout << "L2 projection error: " << err_l2 << std::endl; } else { - if (fabs(sol.ComputeL2Error(sol_coeff))>2e-3) { + if (fabs(sol.ComputeL2Error(sol_coeff)) > 2e-3) { std::cout << "Error too large: " << err_l2 << std::endl; } } @@ -186,8 +179,8 @@ int main(int argc, char *argv[]) { // 10. Open a socket connection to GLVis and send the mesh and solution for // visualization. 
if (visualization) { - char vishost[] = "localhost"; - int visport = 19916; + char vishost[] = "localhost"; + int visport = 19916; mfem::socketstream sol_sock(vishost, visport); sol_sock.precision(8); sol_sock << "solution\n" << *mesh << sol << std::flush; diff --git a/examples/mfem/bp3.h b/examples/mfem/bp3.h index 10c95fae8c..ddc06443de 100644 --- a/examples/mfem/bp3.h +++ b/examples/mfem/bp3.h @@ -11,11 +11,12 @@ #include /// A structure used to pass additional data to f_build_diff and f_apply_diff -struct BuildContext { CeedInt dim, space_dim; }; +struct BuildContext { + CeedInt dim, space_dim; +}; /// libCEED Q-function for building quadrature data for a diffusion operator -CEED_QFUNCTION(f_build_diff)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(f_build_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { BuildContext *bc = (BuildContext *)ctx; // in[0] is Jacobians with shape [dim, nc=dim, Q] // in[1] is quadrature weights, size (Q) @@ -23,110 +24,99 @@ CEED_QFUNCTION(f_build_diff)(void *ctx, const CeedInt Q, // At every quadrature point, compute w/det(J).adj(J).adj(J)^T and store // the symmetric part of the result. 
const CeedScalar *J = in[0], *w = in[1]; - CeedScalar *qdata = out[0]; + CeedScalar *qdata = out[0]; - switch (bc->dim + 10*bc->space_dim) { - case 11: - // Quadrature Point Loop - CeedPragmaSIMD - for (CeedInt i=0; idim + 10 * bc->space_dim) { + case 11: + // Quadrature Point Loop + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { qdata[i] = w[i] / J[i]; } + break; + case 22: + // Quadrature Point Loop + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { + // J: 0 2 qdata: 0 2 adj(J): J22 -J12 + // 1 3 2 1 -J21 J11 + const CeedScalar J11 = J[i + Q * 0]; + const CeedScalar J21 = J[i + Q * 1]; + const CeedScalar J12 = J[i + Q * 2]; + const CeedScalar J22 = J[i + Q * 3]; + const CeedScalar qw = w[i] / (J11 * J22 - J21 * J12); + qdata[i + Q * 0] = qw * (J12 * J12 + J22 * J22); + qdata[i + Q * 1] = qw * (J11 * J11 + J21 * J21); + qdata[i + Q * 2] = -qw * (J11 * J12 + J21 * J22); + } + break; + case 33: + // Quadrature Point Loop + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) { + // J: 0 3 6 qdata: 0 5 4 + // 1 4 7 5 1 3 + // 2 5 8 4 3 2 + const CeedScalar J11 = J[i + Q * 0]; + const CeedScalar J21 = J[i + Q * 1]; + const CeedScalar J31 = J[i + Q * 2]; + const CeedScalar J12 = J[i + Q * 3]; + const CeedScalar J22 = J[i + Q * 4]; + const CeedScalar J32 = J[i + Q * 5]; + const CeedScalar J13 = J[i + Q * 6]; + const CeedScalar J23 = J[i + Q * 7]; + const CeedScalar J33 = J[i + Q * 8]; + const CeedScalar A11 = J22 * J33 - J23 * J32; + const CeedScalar A12 = J13 * J32 - J12 * J33; + const CeedScalar A13 = J12 * J23 - J13 * J22; + const CeedScalar A21 = J23 * J31 - J21 * J33; + const CeedScalar A22 = J11 * J33 - J13 * J31; + const CeedScalar A23 = J13 * J21 - J11 * J23; + const CeedScalar A31 = J21 * J32 - J22 * J31; + const CeedScalar A32 = J12 * J31 - J11 * J32; + const CeedScalar A33 = J11 * J22 - J12 * J21; + const CeedScalar qw = w[i] / (J11 * A11 + J21 * A12 + J31 * A13); + qdata[i + Q * 0] = qw * (A11 * A11 + A12 * A12 + A13 * A13); + qdata[i + Q * 1] = qw * (A21 * A21 
+ A22 * A22 + A23 * A23); + qdata[i + Q * 2] = qw * (A31 * A31 + A32 * A32 + A33 * A33); + qdata[i + Q * 3] = qw * (A21 * A31 + A22 * A32 + A23 * A33); + qdata[i + Q * 4] = qw * (A11 * A31 + A12 * A32 + A13 * A33); + qdata[i + Q * 5] = qw * (A11 * A21 + A12 * A22 + A13 * A23); + } + break; } return 0; } /// libCEED Q-function for applying a diff operator -CEED_QFUNCTION(f_apply_diff)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(f_apply_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { BuildContext *bc = (BuildContext *)ctx; // in[0], out[0] have shape [dim, nc=1, Q] const CeedScalar *ug = in[0], *qdata = in[1]; - CeedScalar *vg = out[0]; + CeedScalar *vg = out[0]; switch (bc->dim) { - case 1: - // Quadrature Point Loop - CeedPragmaSIMD - for (CeedInt i=0; i + #include + #include "bp3.h" /// Wrapper for a diffusion CeedOperator as an mfem::Operator class CeedDiffusionOperator : public mfem::Operator { protected: const mfem::FiniteElementSpace *fes; - CeedOperator build_oper, oper; - CeedBasis basis, mesh_basis; - CeedElemRestriction restr, mesh_restr, restr_i, mesh_restr_i; - CeedQFunction apply_qfunc, build_qfunc; - CeedQFunctionContext build_ctx; - CeedVector node_coords, qdata; + CeedOperator build_oper, oper; + CeedBasis basis, mesh_basis; + CeedElemRestriction restr, mesh_restr, restr_i, mesh_restr_i; + CeedQFunction apply_qfunc, build_qfunc; + CeedQFunctionContext build_ctx; + CeedVector node_coords, qdata; BuildContext build_ctx_data; CeedVector u, v; - static void FESpace2Ceed(const mfem::FiniteElementSpace *fes, - const mfem::IntegrationRule &ir, - Ceed ceed, CeedBasis *basis, + static void FESpace2Ceed(const mfem::FiniteElementSpace *fes, const mfem::IntegrationRule &ir, Ceed ceed, CeedBasis *basis, CeedElemRestriction *restr) { - mfem::Mesh *mesh = fes->GetMesh(); - const mfem::FiniteElement *fe = fes->GetFE(0); - const int order = fes->GetOrder(0); - 
mfem::Array dof_map; + mfem::Mesh *mesh = fes->GetMesh(); + const mfem::FiniteElement *fe = fes->GetFE(0); + const int order = fes->GetOrder(0); + mfem::Array dof_map; switch (mesh->Dimension()) { - case 1: { - const mfem::H1_SegmentElement *h1_fe = - dynamic_cast(fe); - MFEM_VERIFY(h1_fe, "invalid FE"); - h1_fe->GetDofMap().Copy(dof_map); - break; - } - case 2: { - const mfem::H1_QuadrilateralElement *h1_fe = - dynamic_cast(fe); - MFEM_VERIFY(h1_fe, "invalid FE"); - h1_fe->GetDofMap().Copy(dof_map); - break; - } - case 3: { - const mfem::H1_HexahedronElement *h1_fe = - dynamic_cast(fe); - MFEM_VERIFY(h1_fe, "invalid FE"); - h1_fe->GetDofMap().Copy(dof_map); - break; - } + case 1: { + const mfem::H1_SegmentElement *h1_fe = dynamic_cast(fe); + MFEM_VERIFY(h1_fe, "invalid FE"); + h1_fe->GetDofMap().Copy(dof_map); + break; + } + case 2: { + const mfem::H1_QuadrilateralElement *h1_fe = dynamic_cast(fe); + MFEM_VERIFY(h1_fe, "invalid FE"); + h1_fe->GetDofMap().Copy(dof_map); + break; + } + case 3: { + const mfem::H1_HexahedronElement *h1_fe = dynamic_cast(fe); + MFEM_VERIFY(h1_fe, "invalid FE"); + h1_fe->GetDofMap().Copy(dof_map); + break; + } } - const mfem::FiniteElement *fe1d = - fes->FEColl()->FiniteElementForGeometry(mfem::Geometry::SEGMENT); - mfem::DenseMatrix shape1d(fe1d->GetDof(), ir.GetNPoints()); - mfem::DenseMatrix grad_1d(fe1d->GetDof(), ir.GetNPoints()); - mfem::Vector q_ref_1d(ir.GetNPoints()), q_weight_1d(ir.GetNPoints()); - mfem::Vector shape_i(shape1d.Height()); - mfem::DenseMatrix grad_i(grad_1d.Height(), 1); - const mfem::H1_SegmentElement *h1_fe1d = - dynamic_cast(fe1d); + const mfem::FiniteElement *fe1d = fes->FEColl()->FiniteElementForGeometry(mfem::Geometry::SEGMENT); + mfem::DenseMatrix shape1d(fe1d->GetDof(), ir.GetNPoints()); + mfem::DenseMatrix grad_1d(fe1d->GetDof(), ir.GetNPoints()); + mfem::Vector q_ref_1d(ir.GetNPoints()), q_weight_1d(ir.GetNPoints()); + mfem::Vector shape_i(shape1d.Height()); + mfem::DenseMatrix grad_i(grad_1d.Height(), 
1); + const mfem::H1_SegmentElement *h1_fe1d = dynamic_cast(fe1d); MFEM_VERIFY(h1_fe1d, "invalid FE"); const mfem::Array &dof_map_1d = h1_fe1d->GetDofMap(); for (int i = 0; i < ir.GetNPoints(); i++) { const mfem::IntegrationPoint &ip = ir.IntPoint(i); - q_ref_1d(i) = ip.x; - q_weight_1d(i) = ip.weight; + q_ref_1d(i) = ip.x; + q_weight_1d(i) = ip.weight; fe1d->CalcShape(ip, shape_i); fe1d->CalcDShape(ip, grad_i); for (int j = 0; j < shape1d.Height(); j++) { - shape1d(j,i) = shape_i(dof_map_1d[j]); - grad_1d(j,i) = grad_i(dof_map_1d[j],0); + shape1d(j, i) = shape_i(dof_map_1d[j]); + grad_1d(j, i) = grad_i(dof_map_1d[j], 0); } } - CeedBasisCreateTensorH1(ceed, mesh->Dimension(), fes->GetVDim(), order+1, - ir.GetNPoints(), shape1d.GetData(), - grad_1d.GetData(), q_ref_1d.GetData(), - q_weight_1d.GetData(), basis); + CeedBasisCreateTensorH1(ceed, mesh->Dimension(), fes->GetVDim(), order + 1, ir.GetNPoints(), shape1d.GetData(), grad_1d.GetData(), + q_ref_1d.GetData(), q_weight_1d.GetData(), basis); const mfem::Table &el_dof = fes->GetElementToDofTable(); - mfem::Array tp_el_dof(el_dof.Size_of_connections()); + mfem::Array tp_el_dof(el_dof.Size_of_connections()); for (int i = 0; i < mesh->GetNE(); i++) { - const int el_offset = fe->GetDof()*i; + const int el_offset = fe->GetDof() * i; for (int j = 0; j < fe->GetDof(); j++) { tp_el_dof[j + el_offset] = el_dof.GetJ()[dof_map[j] + el_offset]; } } - CeedElemRestrictionCreate(ceed, mesh->GetNE(), fe->GetDof(), - fes->GetVDim(), fes->GetNDofs(), - (fes->GetVDim())*(fes->GetNDofs()), - CEED_MEM_HOST, CEED_COPY_VALUES, - tp_el_dof.GetData(), restr); + CeedElemRestrictionCreate(ceed, mesh->GetNE(), fe->GetDof(), fes->GetVDim(), fes->GetNDofs(), (fes->GetVDim()) * (fes->GetNDofs()), CEED_MEM_HOST, + CEED_COPY_VALUES, tp_el_dof.GetData(), restr); } public: /// Constructor. Assumes @a fes is a scalar FE space. 
- CeedDiffusionOperator(Ceed ceed, const mfem::FiniteElementSpace *fes) - : Operator(fes->GetNDofs()), - fes(fes) { - mfem::Mesh *mesh = fes->GetMesh(); - const int order = fes->GetOrder(0); - const int ir_order = 2*(order + 2) - 1; // <----- - const mfem::IntegrationRule &ir = - mfem::IntRules.Get(mfem::Geometry::SEGMENT, ir_order); - CeedInt num_elem = mesh->GetNE(), dim = mesh->SpaceDimension(), - ncompx = dim, nqpts; + CeedDiffusionOperator(Ceed ceed, const mfem::FiniteElementSpace *fes) : Operator(fes->GetNDofs()), fes(fes) { + mfem::Mesh *mesh = fes->GetMesh(); + const int order = fes->GetOrder(0); + const int ir_order = 2 * (order + 2) - 1; // <----- + const mfem::IntegrationRule &ir = mfem::IntRules.Get(mfem::Geometry::SEGMENT, ir_order); + CeedInt num_elem = mesh->GetNE(), dim = mesh->SpaceDimension(), ncompx = dim, nqpts; FESpace2Ceed(fes, ir, ceed, &basis, &restr); @@ -120,58 +106,46 @@ class CeedDiffusionOperator : public mfem::Operator { FESpace2Ceed(mesh_fes, ir, ceed, &mesh_basis, &mesh_restr); CeedBasisGetNumQuadraturePoints(basis, &nqpts); - CeedInt strides[3] = {1, nqpts, nqpts *dim *(dim+1)/2}; - CeedElemRestrictionCreateStrided(ceed, num_elem, nqpts, dim*(dim+1)/2, - dim*(dim+1)/2*nqpts*num_elem, strides, - &restr_i); + CeedInt strides[3] = {1, nqpts, nqpts * dim * (dim + 1) / 2}; + CeedElemRestrictionCreateStrided(ceed, num_elem, nqpts, dim * (dim + 1) / 2, dim * (dim + 1) / 2 * nqpts * num_elem, strides, &restr_i); CeedVectorCreate(ceed, mesh->GetNodes()->Size(), &node_coords); - CeedVectorSetArray(node_coords, CEED_MEM_HOST, CEED_USE_POINTER, - mesh->GetNodes()->GetData()); + CeedVectorSetArray(node_coords, CEED_MEM_HOST, CEED_USE_POINTER, mesh->GetNodes()->GetData()); - CeedVectorCreate(ceed, num_elem*nqpts*dim*(dim+1)/2, &qdata); + CeedVectorCreate(ceed, num_elem * nqpts * dim * (dim + 1) / 2, &qdata); // Context data to be passed to the 'f_build_diff' Q-function. 
- build_ctx_data.dim = mesh->Dimension(); + build_ctx_data.dim = mesh->Dimension(); build_ctx_data.space_dim = dim; CeedQFunctionContextCreate(ceed, &build_ctx); - CeedQFunctionContextSetData(build_ctx, CEED_MEM_HOST, CEED_USE_POINTER, - sizeof(build_ctx_data), &build_ctx_data); + CeedQFunctionContextSetData(build_ctx, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(build_ctx_data), &build_ctx_data); // Create the Q-function that builds the diff operator (i.e. computes its // quadrature data) and set its context data. - CeedQFunctionCreateInterior(ceed, 1, f_build_diff, - f_build_diff_loc, &build_qfunc); - CeedQFunctionAddInput(build_qfunc, "dx", ncompx*dim, CEED_EVAL_GRAD); + CeedQFunctionCreateInterior(ceed, 1, f_build_diff, f_build_diff_loc, &build_qfunc); + CeedQFunctionAddInput(build_qfunc, "dx", ncompx * dim, CEED_EVAL_GRAD); CeedQFunctionAddInput(build_qfunc, "weights", 1, CEED_EVAL_WEIGHT); - CeedQFunctionAddOutput(build_qfunc, "qdata", dim*(dim+1)/2, CEED_EVAL_NONE); + CeedQFunctionAddOutput(build_qfunc, "qdata", dim * (dim + 1) / 2, CEED_EVAL_NONE); CeedQFunctionSetContext(build_qfunc, build_ctx); // Create the operator that builds the quadrature data for the diff operator. 
- CeedOperatorCreate(ceed, build_qfunc, CEED_QFUNCTION_NONE, - CEED_QFUNCTION_NONE, &build_oper); - CeedOperatorSetField(build_oper, "dx", mesh_restr, mesh_basis, - CEED_VECTOR_ACTIVE); - CeedOperatorSetField(build_oper, "weights", CEED_ELEMRESTRICTION_NONE, - mesh_basis, CEED_VECTOR_NONE); - CeedOperatorSetField(build_oper, "qdata", restr_i, CEED_BASIS_COLLOCATED, - CEED_VECTOR_ACTIVE); + CeedOperatorCreate(ceed, build_qfunc, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &build_oper); + CeedOperatorSetField(build_oper, "dx", mesh_restr, mesh_basis, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(build_oper, "weights", CEED_ELEMRESTRICTION_NONE, mesh_basis, CEED_VECTOR_NONE); + CeedOperatorSetField(build_oper, "qdata", restr_i, CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE); // Compute the quadrature data for the diff operator. - CeedOperatorApply(build_oper, node_coords, qdata, - CEED_REQUEST_IMMEDIATE); + CeedOperatorApply(build_oper, node_coords, qdata, CEED_REQUEST_IMMEDIATE); // Create the Q-function that defines the action of the diff operator. - CeedQFunctionCreateInterior(ceed, 1, f_apply_diff, - f_apply_diff_loc, &apply_qfunc); + CeedQFunctionCreateInterior(ceed, 1, f_apply_diff, f_apply_diff_loc, &apply_qfunc); CeedQFunctionAddInput(apply_qfunc, "u", dim, CEED_EVAL_GRAD); - CeedQFunctionAddInput(apply_qfunc, "qdata", dim*(dim+1)/2, CEED_EVAL_NONE); + CeedQFunctionAddInput(apply_qfunc, "qdata", dim * (dim + 1) / 2, CEED_EVAL_NONE); CeedQFunctionAddOutput(apply_qfunc, "v", dim, CEED_EVAL_GRAD); CeedQFunctionSetContext(apply_qfunc, build_ctx); // Create the diff operator. 
- CeedOperatorCreate(ceed, apply_qfunc, CEED_QFUNCTION_NONE, - CEED_QFUNCTION_NONE, &oper); + CeedOperatorCreate(ceed, apply_qfunc, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &oper); CeedOperatorSetField(oper, "u", restr, basis, CEED_VECTOR_ACTIVE); CeedOperatorSetField(oper, "qdata", restr_i, CEED_BASIS_COLLOCATED, qdata); CeedOperatorSetField(oper, "v", restr, basis, CEED_VECTOR_ACTIVE); diff --git a/examples/nek/bps/bps.h b/examples/nek/bps/bps.h index e878d5dac8..835121a1a6 100644 --- a/examples/nek/bps/bps.h +++ b/examples/nek/bps/bps.h @@ -12,110 +12,103 @@ #include #ifndef M_PI -#define M_PI 3.14159265358979323846 +#define M_PI 3.14159265358979323846 #endif // ***************************************************************************** // BP 1 // ***************************************************************************** -CEED_QFUNCTION(masssetupf)(void *ctx, CeedInt Q, const CeedScalar *const *in, - CeedScalar *const *out) { - CeedScalar *qdata = out[0], *rhs = out[1]; +CEED_QFUNCTION(masssetupf)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + CeedScalar *qdata = out[0], *rhs = out[1]; const CeedScalar *x = in[0]; const CeedScalar *J = in[1]; const CeedScalar *w = in[2]; // Quadrature Point Loop - for (CeedInt i=0; i -#include #include #include #include +#include +#include -#include "area.h" #include "include/areaproblemdata.h" +#include "include/libceedsetup.h" +#include "include/matops.h" #include "include/petscutils.h" #include "include/petscversion.h" -#include "include/matops.h" #include "include/structs.h" -#include "include/libceedsetup.h" -#if PETSC_VERSION_LT(3,12,0) +#if PETSC_VERSION_LT(3, 12, 0) #ifdef PETSC_HAVE_CUDA #include // Note: With PETSc prior to version 3.12.0, providing the source path to @@ -61,211 +61,174 @@ static const char help[] = #endif #ifndef M_PI -# define M_PI 3.14159265358979323846 +#define M_PI 3.14159265358979323846 #endif int main(int argc, char **argv) { - PetscInt ierr; MPI_Comm comm; 
- char filename[PETSC_MAX_PATH_LEN], - ceed_resource[PETSC_MAX_PATH_LEN] = "/cpu/self"; + char filename[PETSC_MAX_PATH_LEN], ceed_resource[PETSC_MAX_PATH_LEN] = "/cpu/self"; PetscInt l_size, g_size, xl_size, - q_extra = 1, // default number of extra quadrature points - num_comp_x = 3, // number of components of 3D physical coordinates - num_comp_u = 1, // dimension of field to which apply mass operator - topo_dim = 2, // topological dimension of manifold - degree = 3; // default degree for finite element bases - PetscBool read_mesh = PETSC_FALSE, - test_mode = PETSC_FALSE, - simplex = PETSC_FALSE; - Vec U, U_loc, V, V_loc; - DM dm; + q_extra = 1, // default number of extra quadrature points + num_comp_x = 3, // number of components of 3D physical coordinates + num_comp_u = 1, // dimension of field to which apply mass operator + topo_dim = 2, // topological dimension of manifold + degree = 3; // default degree for finite element bases + PetscBool read_mesh = PETSC_FALSE, test_mode = PETSC_FALSE, simplex = PETSC_FALSE; + Vec U, U_loc, V, V_loc; + DM dm; OperatorApplyContext op_apply_ctx; - Ceed ceed; - CeedData ceed_data; - ProblemType problem_choice; - VecType vec_type; - PetscMemType mem_type; + Ceed ceed; + CeedData ceed_data; + ProblemType problem_choice; + VecType vec_type; + PetscMemType mem_type; - ierr = PetscInitialize(&argc, &argv, NULL, help); - if (ierr) return ierr; + PetscCall(PetscInitialize(&argc, &argv, NULL, help)); comm = PETSC_COMM_WORLD; // Read command line options PetscOptionsBegin(comm, NULL, "CEED surface area problem with PETSc", NULL); problem_choice = SPHERE; - ierr = PetscOptionsEnum("-problem", - "Problem to solve", NULL, - problem_types, (PetscEnum)problem_choice, - (PetscEnum *)&problem_choice, - NULL); CHKERRQ(ierr); - ierr = PetscOptionsInt("-q_extra", "Number of extra quadrature points", - NULL, q_extra, &q_extra, NULL); CHKERRQ(ierr); - ierr = PetscOptionsString("-ceed", "CEED resource specifier", - NULL, ceed_resource, 
ceed_resource, - sizeof(ceed_resource), NULL); CHKERRQ(ierr); - ierr = PetscOptionsBool("-test", - "Testing mode (do not print unless error is large)", - NULL, test_mode, &test_mode, NULL); CHKERRQ(ierr); - ierr = PetscOptionsString("-mesh", "Read mesh from file", NULL, - filename, filename, sizeof(filename), &read_mesh); - CHKERRQ(ierr); - ierr = PetscOptionsBool("-simplex", "Use simplices, or tensor product cells", - NULL, simplex, &simplex, NULL); CHKERRQ(ierr); - ierr = PetscOptionsInt("-degree", "Polynomial degree of tensor product basis", - NULL, degree, °ree, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsEnum("-problem", "Problem to solve", NULL, problem_types, (PetscEnum)problem_choice, (PetscEnum *)&problem_choice, NULL)); + PetscCall(PetscOptionsInt("-q_extra", "Number of extra quadrature points", NULL, q_extra, &q_extra, NULL)); + PetscCall(PetscOptionsString("-ceed", "CEED resource specifier", NULL, ceed_resource, ceed_resource, sizeof(ceed_resource), NULL)); + PetscCall(PetscOptionsBool("-test", "Testing mode (do not print unless error is large)", NULL, test_mode, &test_mode, NULL)); + PetscCall(PetscOptionsString("-mesh", "Read mesh from file", NULL, filename, filename, sizeof(filename), &read_mesh)); + PetscCall(PetscOptionsBool("-simplex", "Use simplices, or tensor product cells", NULL, simplex, &simplex, NULL)); + PetscCall(PetscOptionsInt("-degree", "Polynomial degree of tensor product basis", NULL, degree, °ree, NULL)); PetscOptionsEnd(); // Setup DM if (read_mesh) { - ierr = DMPlexCreateFromFile(PETSC_COMM_WORLD, filename, NULL, PETSC_TRUE, - &dm); - CHKERRQ(ierr); + PetscCall(DMPlexCreateFromFile(PETSC_COMM_WORLD, filename, NULL, PETSC_TRUE, &dm)); } else { // Create the mesh as a 0-refined sphere. 
This will create a cubic surface, not a box - ierr = DMPlexCreateSphereMesh(PETSC_COMM_WORLD, topo_dim, simplex, 1., &dm); - CHKERRQ(ierr); + PetscCall(DMPlexCreateSphereMesh(PETSC_COMM_WORLD, topo_dim, simplex, 1., &dm)); if (problem_choice == CUBE) { - ierr = DMPlexCreateCoordinateSpace(dm, 1, NULL); CHKERRQ(ierr); + PetscCall(DMPlexCreateCoordinateSpace(dm, 1, NULL)); } // Set the object name - ierr = PetscObjectSetName((PetscObject)dm, problem_types[problem_choice]); - CHKERRQ(ierr); + PetscCall(PetscObjectSetName((PetscObject)dm, problem_types[problem_choice])); // Refine DMPlex with uniform refinement using runtime option -dm_refine - ierr = DMPlexSetRefinementUniform(dm, PETSC_TRUE); CHKERRQ(ierr); - ierr = DMSetFromOptions(dm); CHKERRQ(ierr); + PetscCall(DMPlexSetRefinementUniform(dm, PETSC_TRUE)); + PetscCall(DMSetFromOptions(dm)); // View DMPlex via runtime option - ierr = DMViewFromOptions(dm, NULL, "-dm_view"); CHKERRQ(ierr); + PetscCall(DMViewFromOptions(dm, NULL, "-dm_view")); } // Create DM - ierr = SetupDMByDegree(dm, degree, q_extra, num_comp_u, topo_dim, false); - CHKERRQ(ierr); + PetscCall(SetupDMByDegree(dm, degree, q_extra, num_comp_u, topo_dim, false)); // Create vectors - ierr = DMCreateGlobalVector(dm, &U); CHKERRQ(ierr); - ierr = VecGetLocalSize(U, &l_size); CHKERRQ(ierr); - ierr = VecGetSize(U, &g_size); CHKERRQ(ierr); - ierr = DMCreateLocalVector(dm, &U_loc); CHKERRQ(ierr); - ierr = VecGetSize(U_loc, &xl_size); CHKERRQ(ierr); - ierr = VecDuplicate(U, &V); CHKERRQ(ierr); - ierr = VecDuplicate(U_loc, &V_loc); CHKERRQ(ierr); + PetscCall(DMCreateGlobalVector(dm, &U)); + PetscCall(VecGetLocalSize(U, &l_size)); + PetscCall(VecGetSize(U, &g_size)); + PetscCall(DMCreateLocalVector(dm, &U_loc)); + PetscCall(VecGetSize(U_loc, &xl_size)); + PetscCall(VecDuplicate(U, &V)); + PetscCall(VecDuplicate(U_loc, &V_loc)); // Setup op_apply_ctx structure - ierr = PetscMalloc1(1, &op_apply_ctx); CHKERRQ(ierr); + PetscCall(PetscMalloc1(1, &op_apply_ctx)); // 
Set up libCEED CeedInit(ceed_resource, &ceed); CeedMemType mem_type_backend; CeedGetPreferredMemType(ceed, &mem_type_backend); - ierr = DMGetVecType(dm, &vec_type); CHKERRQ(ierr); - if (!vec_type) { // Not yet set by op_apply_ctx -dm_vec_type + PetscCall(DMGetVecType(dm, &vec_type)); + if (!vec_type) { // Not yet set by op_apply_ctx -dm_vec_type switch (mem_type_backend) { - case CEED_MEM_HOST: vec_type = VECSTANDARD; break; - case CEED_MEM_DEVICE: { - const char *resolved; - CeedGetResource(ceed, &resolved); - if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA; - else if (strstr(resolved, "/gpu/hip/occa")) - vec_type = VECSTANDARD; // https://github.com/CEED/libCEED/issues/678 - else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP; - else vec_type = VECSTANDARD; - } + case CEED_MEM_HOST: + vec_type = VECSTANDARD; + break; + case CEED_MEM_DEVICE: { + const char *resolved; + CeedGetResource(ceed, &resolved); + if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA; + else if (strstr(resolved, "/gpu/hip/occa")) vec_type = VECSTANDARD; // https://github.com/CEED/libCEED/issues/678 + else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP; + else vec_type = VECSTANDARD; + } } - ierr = DMSetVecType(dm, vec_type); CHKERRQ(ierr); + PetscCall(DMSetVecType(dm, vec_type)); } // Print summary if (!test_mode) { - PetscInt P = degree + 1, Q = P + q_extra; + PetscInt P = degree + 1, Q = P + q_extra; const char *used_resource; CeedGetResource(ceed, &used_resource); - ierr = PetscPrintf(comm, - "\n-- libCEED + PETSc Surface Area of a Manifold --\n" - " libCEED:\n" - " libCEED Backend : %s\n" - " libCEED Backend MemType : %s\n" - " Mesh:\n" - " Solution Order (P) : %" CeedInt_FMT "\n" - " Quadrature Order (Q) : %" CeedInt_FMT "\n" - " Additional quadrature points (q_extra) : %" CeedInt_FMT "\n" - " Global nodes : %" PetscInt_FMT "\n" - " DoF per node : %" PetscInt_FMT "\n" - " Global DoFs : %" PetscInt_FMT "\n", - used_resource, CeedMemTypes[mem_type_backend], P, Q, q_extra, 
- g_size/num_comp_u, num_comp_u, g_size); CHKERRQ(ierr); + PetscCall(PetscPrintf(comm, + "\n-- libCEED + PETSc Surface Area of a Manifold --\n" + " libCEED:\n" + " libCEED Backend : %s\n" + " libCEED Backend MemType : %s\n" + " Mesh:\n" + " Solution Order (P) : %" CeedInt_FMT "\n" + " Quadrature Order (Q) : %" CeedInt_FMT "\n" + " Additional quadrature points (q_extra) : %" CeedInt_FMT "\n" + " Global nodes : %" PetscInt_FMT "\n" + " DoF per node : %" PetscInt_FMT "\n" + " Global DoFs : %" PetscInt_FMT "\n", + used_resource, CeedMemTypes[mem_type_backend], P, Q, q_extra, g_size / num_comp_u, num_comp_u, g_size)); } // Setup libCEED's objects and apply setup operator - ierr = PetscMalloc1(1, &ceed_data); CHKERRQ(ierr); - ierr = SetupLibceedByDegree(dm, ceed, degree, topo_dim, q_extra, num_comp_x, - num_comp_u, g_size, xl_size, - problem_options[problem_choice], ceed_data, - false, (CeedVector)NULL, (CeedVector *)NULL); - CHKERRQ(ierr); + PetscCall(PetscMalloc1(1, &ceed_data)); + PetscCall(SetupLibceedByDegree(dm, ceed, degree, topo_dim, q_extra, num_comp_x, num_comp_u, g_size, xl_size, problem_options[problem_choice], + ceed_data, false, (CeedVector)NULL, (CeedVector *)NULL)); // Setup output vector PetscScalar *v; - ierr = VecZeroEntries(V_loc); CHKERRQ(ierr); - ierr = VecGetArrayAndMemType(V_loc, &v, &mem_type); CHKERRQ(ierr); - CeedVectorSetArray(ceed_data->y_ceed, MemTypeP2C(mem_type), CEED_USE_POINTER, - v); + PetscCall(VecZeroEntries(V_loc)); + PetscCall(VecGetArrayAndMemType(V_loc, &v, &mem_type)); + CeedVectorSetArray(ceed_data->y_ceed, MemTypeP2C(mem_type), CEED_USE_POINTER, v); // Compute the mesh volume using the mass operator: area = 1^T \cdot M \cdot 1 if (!test_mode) { - ierr = PetscPrintf(comm, - "Computing the mesh area using the formula: area = 1^T M 1\n"); - CHKERRQ(ierr); + PetscCall(PetscPrintf(comm, "Computing the mesh area using the formula: area = 1^T M 1\n")); } // Initialize u with ones CeedVectorSetValue(ceed_data->x_ceed, 1.0); // Apply 
the mass operator: 'u' -> 'v' - CeedOperatorApply(ceed_data->op_apply, ceed_data->x_ceed, ceed_data->y_ceed, - CEED_REQUEST_IMMEDIATE); + CeedOperatorApply(ceed_data->op_apply, ceed_data->x_ceed, ceed_data->y_ceed, CEED_REQUEST_IMMEDIATE); // Gather output vector CeedVectorTakeArray(ceed_data->y_ceed, CEED_MEM_HOST, NULL); - ierr = VecRestoreArrayAndMemType(V_loc, &v); CHKERRQ(ierr); - ierr = VecZeroEntries(V); CHKERRQ(ierr); - ierr = DMLocalToGlobalBegin(dm, V_loc, ADD_VALUES, V); CHKERRQ(ierr); - ierr = DMLocalToGlobalEnd(dm, V_loc, ADD_VALUES, V); CHKERRQ(ierr); + PetscCall(VecRestoreArrayAndMemType(V_loc, &v)); + PetscCall(VecZeroEntries(V)); + PetscCall(DMLocalToGlobalBegin(dm, V_loc, ADD_VALUES, V)); + PetscCall(DMLocalToGlobalEnd(dm, V_loc, ADD_VALUES, V)); // Compute and print the sum of the entries of 'v' giving the mesh surface area PetscScalar area; - ierr = VecSum(V, &area); CHKERRQ(ierr); + PetscCall(VecSum(V, &area)); // Compute the exact surface area and print the result CeedScalar exact_surface_area = 4 * M_PI; if (problem_choice == CUBE) { - exact_surface_area = 6 * 2 * 2; // surface of [-1, 1]^3 + exact_surface_area = 6 * 2 * 2; // surface of [-1, 1]^3 } PetscReal error = fabs(area - exact_surface_area); - PetscReal tol = 5e-6; + PetscReal tol = 5e-6; if (!test_mode || error > tol) { - ierr = PetscPrintf(comm, - "Exact mesh surface area : % .14g\n", - exact_surface_area); - CHKERRQ(ierr); - ierr = PetscPrintf(comm, - "Computed mesh surface area : % .14g\n", area); - CHKERRQ(ierr); - ierr = PetscPrintf(comm, - "Area error : % .14g\n", error); - CHKERRQ(ierr); + PetscCall(PetscPrintf(comm, "Exact mesh surface area : % .14g\n", exact_surface_area)); + PetscCall(PetscPrintf(comm, "Computed mesh surface area : % .14g\n", area)); + PetscCall(PetscPrintf(comm, "Area error : % .14g\n", error)); } // Cleanup - ierr = DMDestroy(&dm); CHKERRQ(ierr); - ierr = VecDestroy(&U); CHKERRQ(ierr); - ierr = VecDestroy(&U_loc); CHKERRQ(ierr); - ierr = VecDestroy(&V); 
CHKERRQ(ierr); - ierr = VecDestroy(&V_loc); CHKERRQ(ierr); - ierr = PetscFree(op_apply_ctx); CHKERRQ(ierr); - ierr = CeedDataDestroy(0, ceed_data); CHKERRQ(ierr); + PetscCall(DMDestroy(&dm)); + PetscCall(VecDestroy(&U)); + PetscCall(VecDestroy(&U_loc)); + PetscCall(VecDestroy(&V)); + PetscCall(VecDestroy(&V_loc)); + PetscCall(PetscFree(op_apply_ctx)); + PetscCall(CeedDataDestroy(0, ceed_data)); CeedDestroy(&ceed); return PetscFinalize(); } diff --git a/examples/petsc/area.h b/examples/petsc/area.h index 233b408f9a..8445abbae1 100644 --- a/examples/petsc/area.h +++ b/examples/petsc/area.h @@ -8,12 +8,12 @@ #ifndef libceed_petsc_examples_area_h #define libceed_petsc_examples_area_h +#include + // ----------------------------------------------------------------------------- // Command Line Options // ----------------------------------------------------------------------------- -static const char *const problem_types[] = {"cube", "sphere", - "ProblemType", "AREA", NULL - }; +static const char *const problem_types[] = {"cube", "sphere", "ProblemType", "AREA", NULL}; -#endif // libceed_petsc_examples_area_h +#endif // libceed_petsc_examples_area_h diff --git a/examples/petsc/bps.c b/examples/petsc/bps.c index f305a00703..e6be5b88d8 100644 --- a/examples/petsc/bps.c +++ b/examples/petsc/bps.c @@ -33,23 +33,24 @@ /// See bpsraw.c for a "raw" implementation using a structured grid. 
const char help[] = "Solve CEED BPs using PETSc with DMPlex\n"; -#include -#include +#include "bps.h" + #include #include #include #include #include +#include +#include -#include "bps.h" #include "include/bpsproblemdata.h" +#include "include/libceedsetup.h" +#include "include/matops.h" #include "include/petscutils.h" #include "include/petscversion.h" -#include "include/matops.h" #include "include/structs.h" -#include "include/libceedsetup.h" -#if PETSC_VERSION_LT(3,12,0) +#if PETSC_VERSION_LT(3, 12, 0) #ifdef PETSC_HAVE_CUDA #include // Note: With PETSc prior to version 3.12.0, providing the source path to @@ -60,23 +61,21 @@ const char help[] = "Solve CEED BPs using PETSc with DMPlex\n"; // ----------------------------------------------------------------------------- // Main body of program, called in a loop for performance benchmarking purposes // ----------------------------------------------------------------------------- -static PetscErrorCode RunWithDM(RunParams rp, DM dm, - const char *ceed_resource) { - PetscErrorCode ierr; - double my_rt_start, my_rt, rt_min, rt_max; - PetscInt xl_size, l_size, g_size; - PetscScalar *r; - Vec X, X_loc, rhs, rhs_loc; - Mat mat_O; - KSP ksp; +static PetscErrorCode RunWithDM(RunParams rp, DM dm, const char *ceed_resource) { + double my_rt_start, my_rt, rt_min, rt_max; + PetscInt xl_size, l_size, g_size; + PetscScalar *r; + Vec X, X_loc, rhs, rhs_loc; + Mat mat_O; + KSP ksp; OperatorApplyContext op_apply_ctx, op_error_ctx; - Ceed ceed; - CeedData ceed_data; - CeedQFunction qf_error; - CeedOperator op_error; - CeedVector rhs_ceed, target; - VecType vec_type; - PetscMemType mem_type; + Ceed ceed; + CeedData ceed_data; + CeedQFunction qf_error; + CeedOperator op_error; + CeedVector rhs_ceed, target; + VecType vec_type; + PetscMemType mem_type; PetscFunctionBeginUser; // Set up libCEED @@ -84,41 +83,39 @@ static PetscErrorCode RunWithDM(RunParams rp, DM dm, CeedMemType mem_type_backend; CeedGetPreferredMemType(ceed, 
&mem_type_backend); - ierr = DMGetVecType(dm, &vec_type); CHKERRQ(ierr); - if (!vec_type) { // Not yet set by user -dm_vec_type + PetscCall(DMGetVecType(dm, &vec_type)); + if (!vec_type) { // Not yet set by user -dm_vec_type switch (mem_type_backend) { - case CEED_MEM_HOST: vec_type = VECSTANDARD; break; - case CEED_MEM_DEVICE: { - const char *resolved; - CeedGetResource(ceed, &resolved); - if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA; - else if (strstr(resolved, "/gpu/hip/occa")) - vec_type = VECSTANDARD; // https://github.com/CEED/libCEED/issues/678 - else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP; - else vec_type = VECSTANDARD; - } + case CEED_MEM_HOST: + vec_type = VECSTANDARD; + break; + case CEED_MEM_DEVICE: { + const char *resolved; + CeedGetResource(ceed, &resolved); + if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA; + else if (strstr(resolved, "/gpu/hip/occa")) vec_type = VECSTANDARD; // https://github.com/CEED/libCEED/issues/678 + else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP; + else vec_type = VECSTANDARD; + } } - ierr = DMSetVecType(dm, vec_type); CHKERRQ(ierr); + PetscCall(DMSetVecType(dm, vec_type)); } // Create global and local solution vectors - ierr = DMCreateGlobalVector(dm, &X); CHKERRQ(ierr); - ierr = VecGetLocalSize(X, &l_size); CHKERRQ(ierr); - ierr = VecGetSize(X, &g_size); CHKERRQ(ierr); - ierr = DMCreateLocalVector(dm, &X_loc); CHKERRQ(ierr); - ierr = VecGetSize(X_loc, &xl_size); CHKERRQ(ierr); - ierr = VecDuplicate(X, &rhs); CHKERRQ(ierr); + PetscCall(DMCreateGlobalVector(dm, &X)); + PetscCall(VecGetLocalSize(X, &l_size)); + PetscCall(VecGetSize(X, &g_size)); + PetscCall(DMCreateLocalVector(dm, &X_loc)); + PetscCall(VecGetSize(X_loc, &xl_size)); + PetscCall(VecDuplicate(X, &rhs)); // Operator - ierr = PetscMalloc1(1, &op_apply_ctx); CHKERRQ(ierr); - ierr = PetscMalloc1(1, &op_error_ctx); CHKERRQ(ierr); - ierr = MatCreateShell(rp->comm, l_size, l_size, g_size, g_size, - op_apply_ctx, &mat_O); 
CHKERRQ(ierr); - ierr = MatShellSetOperation(mat_O, MATOP_MULT, - (void(*)(void))MatMult_Ceed); CHKERRQ(ierr); - ierr = MatShellSetOperation(mat_O, MATOP_GET_DIAGONAL, - (void(*)(void))MatGetDiag); CHKERRQ(ierr); - ierr = MatShellSetVecType(mat_O, vec_type); CHKERRQ(ierr); + PetscCall(PetscMalloc1(1, &op_apply_ctx)); + PetscCall(PetscMalloc1(1, &op_error_ctx)); + PetscCall(MatCreateShell(rp->comm, l_size, l_size, g_size, g_size, op_apply_ctx, &mat_O)); + PetscCall(MatShellSetOperation(mat_O, MATOP_MULT, (void (*)(void))MatMult_Ceed)); + PetscCall(MatShellSetOperation(mat_O, MATOP_GET_DIAGONAL, (void (*)(void))MatGetDiag)); + PetscCall(MatShellSetVecType(mat_O, vec_type)); // Print summary if (!rp->test_mode) { @@ -128,217 +125,189 @@ static PetscErrorCode RunWithDM(RunParams rp, DM dm, CeedGetResource(ceed, &used_resource); VecType vec_type; - ierr = VecGetType(X, &vec_type); CHKERRQ(ierr); + PetscCall(VecGetType(X, &vec_type)); PetscInt c_start, c_end; - ierr = DMPlexGetHeightStratum(dm, 0, &c_start, &c_end); CHKERRQ(ierr); - DMPolytopeType cell_type; - ierr = DMPlexGetCellType(dm, c_start, &cell_type); CHKERRQ(ierr); + PetscCall(DMPlexGetHeightStratum(dm, 0, &c_start, &c_end)); + DMPolytopeType cell_type; + PetscCall(DMPlexGetCellType(dm, c_start, &cell_type)); CeedElemTopology elem_topo = ElemTopologyP2C(cell_type); - PetscMPIInt comm_size; - ierr = MPI_Comm_size(rp->comm, &comm_size); CHKERRQ(ierr); - ierr = PetscPrintf(rp->comm, - "\n-- CEED Benchmark Problem %" CeedInt_FMT " -- libCEED + PETSc --\n" - " MPI:\n" - " Hostname : %s\n" - " Total ranks : %d\n" - " Ranks per compute node : %d\n" - " PETSc:\n" - " PETSc Vec Type : %s\n" - " libCEED:\n" - " libCEED Backend : %s\n" - " libCEED Backend MemType : %s\n" - " Mesh:\n" - " Solution Order (P) : %" CeedInt_FMT "\n" - " Quadrature Order (Q) : %" CeedInt_FMT "\n" - " Additional quadrature points (q_extra) : %" CeedInt_FMT "\n" - " Global nodes : %" PetscInt_FMT "\n" - " Local Elements : %" PetscInt_FMT "\n" - " 
Element topology : %s\n" - " Owned nodes : %" PetscInt_FMT "\n" - " DoF per node : %" PetscInt_FMT "\n", - rp->bp_choice+1, rp->hostname, comm_size, - rp->ranks_per_node, vec_type, used_resource, - CeedMemTypes[mem_type_backend], P, Q, rp->q_extra, - g_size/rp->num_comp_u, c_end - c_start, - CeedElemTopologies[elem_topo], - l_size/rp->num_comp_u, rp->num_comp_u); - CHKERRQ(ierr); + PetscMPIInt comm_size; + PetscCall(MPI_Comm_size(rp->comm, &comm_size)); + PetscCall(PetscPrintf(rp->comm, + "\n-- CEED Benchmark Problem %" CeedInt_FMT " -- libCEED + PETSc --\n" + " MPI:\n" + " Hostname : %s\n" + " Total ranks : %d\n" + " Ranks per compute node : %d\n" + " PETSc:\n" + " PETSc Vec Type : %s\n" + " libCEED:\n" + " libCEED Backend : %s\n" + " libCEED Backend MemType : %s\n" + " Mesh:\n" + " Solution Order (P) : %" CeedInt_FMT "\n" + " Quadrature Order (Q) : %" CeedInt_FMT "\n" + " Additional quadrature points (q_extra) : %" CeedInt_FMT "\n" + " Global nodes : %" PetscInt_FMT "\n" + " Local Elements : %" PetscInt_FMT "\n" + " Element topology : %s\n" + " Owned nodes : %" PetscInt_FMT "\n" + " DoF per node : %" PetscInt_FMT "\n", + rp->bp_choice + 1, rp->hostname, comm_size, rp->ranks_per_node, vec_type, used_resource, CeedMemTypes[mem_type_backend], P, + Q, rp->q_extra, g_size / rp->num_comp_u, c_end - c_start, CeedElemTopologies[elem_topo], l_size / rp->num_comp_u, + rp->num_comp_u)); } // Create RHS vector - ierr = VecDuplicate(X_loc, &rhs_loc); CHKERRQ(ierr); - ierr = VecZeroEntries(rhs_loc); CHKERRQ(ierr); - ierr = VecGetArrayAndMemType(rhs_loc, &r, &mem_type); CHKERRQ(ierr); + PetscCall(VecDuplicate(X_loc, &rhs_loc)); + PetscCall(VecZeroEntries(rhs_loc)); + PetscCall(VecGetArrayAndMemType(rhs_loc, &r, &mem_type)); CeedVectorCreate(ceed, xl_size, &rhs_ceed); CeedVectorSetArray(rhs_ceed, MemTypeP2C(mem_type), CEED_USE_POINTER, r); - ierr = PetscMalloc1(1, &ceed_data); CHKERRQ(ierr); - ierr = SetupLibceedByDegree(dm, ceed, rp->degree, rp->dim, rp->q_extra, - rp->dim, 
rp->num_comp_u, g_size, xl_size, bp_options[rp->bp_choice], - ceed_data, true, rhs_ceed, &target); CHKERRQ(ierr); + PetscCall(PetscMalloc1(1, &ceed_data)); + PetscCall(SetupLibceedByDegree(dm, ceed, rp->degree, rp->dim, rp->q_extra, rp->dim, rp->num_comp_u, g_size, xl_size, bp_options[rp->bp_choice], + ceed_data, true, rhs_ceed, &target)); // Gather RHS CeedVectorTakeArray(rhs_ceed, MemTypeP2C(mem_type), NULL); - ierr = VecRestoreArrayAndMemType(rhs_loc, &r); CHKERRQ(ierr); - ierr = VecZeroEntries(rhs); CHKERRQ(ierr); - ierr = DMLocalToGlobal(dm, rhs_loc, ADD_VALUES, rhs); CHKERRQ(ierr); + PetscCall(VecRestoreArrayAndMemType(rhs_loc, &r)); + PetscCall(VecZeroEntries(rhs)); + PetscCall(DMLocalToGlobal(dm, rhs_loc, ADD_VALUES, rhs)); CeedVectorDestroy(&rhs_ceed); // Create the error QFunction - CeedQFunctionCreateInterior(ceed, 1, bp_options[rp->bp_choice].error, - bp_options[rp->bp_choice].error_loc, &qf_error); + CeedQFunctionCreateInterior(ceed, 1, bp_options[rp->bp_choice].error, bp_options[rp->bp_choice].error_loc, &qf_error); CeedQFunctionAddInput(qf_error, "u", rp->num_comp_u, CEED_EVAL_INTERP); CeedQFunctionAddInput(qf_error, "true_soln", rp->num_comp_u, CEED_EVAL_NONE); - CeedQFunctionAddInput(qf_error, "qdata", ceed_data->q_data_size, - CEED_EVAL_NONE); + CeedQFunctionAddInput(qf_error, "qdata", ceed_data->q_data_size, CEED_EVAL_NONE); CeedQFunctionAddOutput(qf_error, "error", rp->num_comp_u, CEED_EVAL_INTERP); // Create the error operator - CeedOperatorCreate(ceed, qf_error, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, - &op_error); - CeedOperatorSetField(op_error, "u", ceed_data->elem_restr_u, - ceed_data->basis_u, CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op_error, "true_soln", ceed_data->elem_restr_u_i, - CEED_BASIS_COLLOCATED, target); - CeedOperatorSetField(op_error, "qdata", ceed_data->elem_restr_qd_i, - CEED_BASIS_COLLOCATED, ceed_data->q_data); - CeedOperatorSetField(op_error, "error", ceed_data->elem_restr_u, - ceed_data->basis_u, 
CEED_VECTOR_ACTIVE); + CeedOperatorCreate(ceed, qf_error, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_error); + CeedOperatorSetField(op_error, "u", ceed_data->elem_restr_u, ceed_data->basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_error, "true_soln", ceed_data->elem_restr_u_i, CEED_BASIS_COLLOCATED, target); + CeedOperatorSetField(op_error, "qdata", ceed_data->elem_restr_qd_i, CEED_BASIS_COLLOCATED, ceed_data->q_data); + CeedOperatorSetField(op_error, "error", ceed_data->elem_restr_u, ceed_data->basis_u, CEED_VECTOR_ACTIVE); // Set up apply operator context - ierr = SetupApplyOperatorCtx(rp->comm, dm, ceed, - ceed_data, X_loc, - op_apply_ctx); CHKERRQ(ierr); - ierr = KSPCreate(rp->comm, &ksp); CHKERRQ(ierr); + PetscCall(SetupApplyOperatorCtx(rp->comm, dm, ceed, ceed_data, X_loc, op_apply_ctx)); + PetscCall(KSPCreate(rp->comm, &ksp)); { PC pc; - ierr = KSPGetPC(ksp, &pc); CHKERRQ(ierr); + PetscCall(KSPGetPC(ksp, &pc)); if (rp->bp_choice == CEED_BP1 || rp->bp_choice == CEED_BP2) { - ierr = PCSetType(pc, PCJACOBI); CHKERRQ(ierr); + PetscCall(PCSetType(pc, PCJACOBI)); if (rp->simplex) { - ierr = PCJacobiSetType(pc, PC_JACOBI_DIAGONAL); CHKERRQ(ierr); + PetscCall(PCJacobiSetType(pc, PC_JACOBI_DIAGONAL)); } else { - ierr = PCJacobiSetType(pc, PC_JACOBI_ROWSUM); CHKERRQ(ierr); + PetscCall(PCJacobiSetType(pc, PC_JACOBI_ROWSUM)); } } else { - ierr = PCSetType(pc, PCNONE); CHKERRQ(ierr); + PetscCall(PCSetType(pc, PCNONE)); } - ierr = KSPSetType(ksp, KSPCG); CHKERRQ(ierr); - ierr = KSPSetNormType(ksp, KSP_NORM_NATURAL); CHKERRQ(ierr); - ierr = KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT, - PETSC_DEFAULT); CHKERRQ(ierr); + PetscCall(KSPSetType(ksp, KSPCG)); + PetscCall(KSPSetNormType(ksp, KSP_NORM_NATURAL)); + PetscCall(KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT, PETSC_DEFAULT)); } - ierr = KSPSetOperators(ksp, mat_O, mat_O); CHKERRQ(ierr); + PetscCall(KSPSetOperators(ksp, mat_O, mat_O)); // First run's performance log is not considered 
for benchmarking purposes - ierr = KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT, 1); - CHKERRQ(ierr); + PetscCall(KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT, 1)); my_rt_start = MPI_Wtime(); - ierr = KSPSolve(ksp, rhs, X); CHKERRQ(ierr); + PetscCall(KSPSolve(ksp, rhs, X)); my_rt = MPI_Wtime() - my_rt_start; - ierr = MPI_Allreduce(MPI_IN_PLACE, &my_rt, 1, MPI_DOUBLE, MPI_MIN, rp->comm); - CHKERRQ(ierr); + PetscCall(MPI_Allreduce(MPI_IN_PLACE, &my_rt, 1, MPI_DOUBLE, MPI_MIN, rp->comm)); // Set maxits based on first iteration timing if (my_rt > 0.02) { - ierr = KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT, - rp->ksp_max_it_clip[0]); - CHKERRQ(ierr); + PetscCall(KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT, rp->ksp_max_it_clip[0])); } else { - ierr = KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT, - rp->ksp_max_it_clip[1]); - CHKERRQ(ierr); + PetscCall(KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT, rp->ksp_max_it_clip[1])); } - ierr = KSPSetFromOptions(ksp); CHKERRQ(ierr); + PetscCall(KSPSetFromOptions(ksp)); // Timed solve - ierr = VecZeroEntries(X); CHKERRQ(ierr); - ierr = PetscBarrier((PetscObject)ksp); CHKERRQ(ierr); + PetscCall(VecZeroEntries(X)); + PetscCall(PetscBarrier((PetscObject)ksp)); // -- Performance logging - ierr = PetscLogStagePush(rp->solve_stage); CHKERRQ(ierr); + PetscCall(PetscLogStagePush(rp->solve_stage)); // -- Solve my_rt_start = MPI_Wtime(); - ierr = KSPSolve(ksp, rhs, X); CHKERRQ(ierr); + PetscCall(KSPSolve(ksp, rhs, X)); my_rt = MPI_Wtime() - my_rt_start; // -- Performance logging - ierr = PetscLogStagePop(); + PetscCall(PetscLogStagePop()); // Output results { - KSPType ksp_type; + KSPType ksp_type; KSPConvergedReason reason; - PetscReal rnorm; - PetscInt its; - ierr = KSPGetType(ksp, &ksp_type); CHKERRQ(ierr); - ierr = KSPGetConvergedReason(ksp, &reason); CHKERRQ(ierr); - ierr = KSPGetIterationNumber(ksp, &its); CHKERRQ(ierr); - ierr = KSPGetResidualNorm(ksp, 
&rnorm); CHKERRQ(ierr); + PetscReal rnorm; + PetscInt its; + PetscCall(KSPGetType(ksp, &ksp_type)); + PetscCall(KSPGetConvergedReason(ksp, &reason)); + PetscCall(KSPGetIterationNumber(ksp, &its)); + PetscCall(KSPGetResidualNorm(ksp, &rnorm)); if (!rp->test_mode || reason < 0 || rnorm > 1e-8) { - ierr = PetscPrintf(rp->comm, - " KSP:\n" - " KSP Type : %s\n" - " KSP Convergence : %s\n" - " Total KSP Iterations : %" PetscInt_FMT "\n" - " Final rnorm : %e\n", - ksp_type, KSPConvergedReasons[reason], its, - (double)rnorm); CHKERRQ(ierr); + PetscCall(PetscPrintf(rp->comm, + " KSP:\n" + " KSP Type : %s\n" + " KSP Convergence : %s\n" + " Total KSP Iterations : %" PetscInt_FMT "\n" + " Final rnorm : %e\n", + ksp_type, KSPConvergedReasons[reason], its, (double)rnorm)); } if (!rp->test_mode) { - ierr = PetscPrintf(rp->comm," Performance:\n"); CHKERRQ(ierr); + PetscCall(PetscPrintf(rp->comm, " Performance:\n")); } { // Set up error operator context - ierr = SetupErrorOperatorCtx(rp->comm, dm, ceed, - ceed_data, X_loc, op_error, - op_error_ctx); CHKERRQ(ierr); + PetscCall(SetupErrorOperatorCtx(rp->comm, dm, ceed, ceed_data, X_loc, op_error, op_error_ctx)); PetscScalar l2_error; - ierr = ComputeL2Error(X, &l2_error, op_error_ctx); CHKERRQ(ierr); + PetscCall(ComputeL2Error(X, &l2_error, op_error_ctx)); PetscReal tol = 5e-2; if (!rp->test_mode || l2_error > tol) { - ierr = MPI_Allreduce(&my_rt, &rt_min, 1, MPI_DOUBLE, MPI_MIN, rp->comm); - CHKERRQ(ierr); - ierr = MPI_Allreduce(&my_rt, &rt_max, 1, MPI_DOUBLE, MPI_MAX, rp->comm); - CHKERRQ(ierr); - ierr = PetscPrintf(rp->comm, - " L2 Error : %e\n" - " CG Solve Time : %g (%g) sec\n", - (double)l2_error, rt_max, rt_min); CHKERRQ(ierr); + PetscCall(MPI_Allreduce(&my_rt, &rt_min, 1, MPI_DOUBLE, MPI_MIN, rp->comm)); + PetscCall(MPI_Allreduce(&my_rt, &rt_max, 1, MPI_DOUBLE, MPI_MAX, rp->comm)); + PetscCall(PetscPrintf(rp->comm, + " L2 Error : %e\n" + " CG Solve Time : %g (%g) sec\n", + (double)l2_error, rt_max, rt_min)); } } if 
(!rp->test_mode) { - ierr = PetscPrintf(rp->comm, - " DoFs/Sec in CG : %g (%g) million\n", - 1e-6*g_size*its/rt_max, - 1e-6*g_size*its/rt_min); CHKERRQ(ierr); + PetscCall(PetscPrintf(rp->comm, " DoFs/Sec in CG : %g (%g) million\n", 1e-6 * g_size * its / rt_max, + 1e-6 * g_size * its / rt_min)); } } if (rp->write_solution) { PetscViewer vtk_viewer_soln; - ierr = PetscViewerCreate(rp->comm, &vtk_viewer_soln); CHKERRQ(ierr); - ierr = PetscViewerSetType(vtk_viewer_soln, PETSCVIEWERVTK); CHKERRQ(ierr); - ierr = PetscViewerFileSetName(vtk_viewer_soln, "solution.vtu"); CHKERRQ(ierr); - ierr = VecView(X, vtk_viewer_soln); CHKERRQ(ierr); - ierr = PetscViewerDestroy(&vtk_viewer_soln); CHKERRQ(ierr); + PetscCall(PetscViewerCreate(rp->comm, &vtk_viewer_soln)); + PetscCall(PetscViewerSetType(vtk_viewer_soln, PETSCVIEWERVTK)); + PetscCall(PetscViewerFileSetName(vtk_viewer_soln, "solution.vtu")); + PetscCall(VecView(X, vtk_viewer_soln)); + PetscCall(PetscViewerDestroy(&vtk_viewer_soln)); } // Cleanup - ierr = VecDestroy(&X); CHKERRQ(ierr); - ierr = VecDestroy(&X_loc); CHKERRQ(ierr); - ierr = VecDestroy(&op_apply_ctx->Y_loc); CHKERRQ(ierr); - ierr = VecDestroy(&op_error_ctx->Y_loc); CHKERRQ(ierr); - ierr = MatDestroy(&mat_O); CHKERRQ(ierr); - ierr = PetscFree(op_apply_ctx); CHKERRQ(ierr); - ierr = PetscFree(op_error_ctx); CHKERRQ(ierr); - ierr = CeedDataDestroy(0, ceed_data); CHKERRQ(ierr); - - ierr = VecDestroy(&rhs); CHKERRQ(ierr); - ierr = VecDestroy(&rhs_loc); CHKERRQ(ierr); - ierr = KSPDestroy(&ksp); CHKERRQ(ierr); + PetscCall(VecDestroy(&X)); + PetscCall(VecDestroy(&X_loc)); + PetscCall(VecDestroy(&op_apply_ctx->Y_loc)); + PetscCall(VecDestroy(&op_error_ctx->Y_loc)); + PetscCall(MatDestroy(&mat_O)); + PetscCall(PetscFree(op_apply_ctx)); + PetscCall(PetscFree(op_error_ctx)); + PetscCall(CeedDataDestroy(0, ceed_data)); + + PetscCall(VecDestroy(&rhs)); + PetscCall(VecDestroy(&rhs_loc)); + PetscCall(KSPDestroy(&ksp)); CeedVectorDestroy(&target); CeedQFunctionDestroy(&qf_error); 
CeedOperatorDestroy(&op_error); @@ -346,203 +315,162 @@ static PetscErrorCode RunWithDM(RunParams rp, DM dm, PetscFunctionReturn(0); } -static PetscErrorCode Run(RunParams rp, PetscInt num_resources, - char *const *ceed_resources, PetscInt num_bp_choices, - const BPType *bp_choices) { - PetscInt ierr; +static PetscErrorCode Run(RunParams rp, PetscInt num_resources, char *const *ceed_resources, PetscInt num_bp_choices, const BPType *bp_choices) { DM dm; PetscFunctionBeginUser; // Setup DM - ierr = CreateDistributedDM(rp, &dm); CHKERRQ(ierr); + PetscCall(CreateDistributedDM(rp, &dm)); for (PetscInt b = 0; b < num_bp_choices; b++) { - DM dm_deg; - VecType vec_type; + DM dm_deg; + VecType vec_type; PetscInt q_extra = rp->q_extra; - rp->bp_choice = bp_choices[b]; - rp->num_comp_u = bp_options[rp->bp_choice].num_comp_u; - rp->q_extra = q_extra < 0 ? bp_options[rp->bp_choice].q_extra : q_extra; - ierr = DMClone(dm, &dm_deg); CHKERRQ(ierr); - ierr = DMGetVecType(dm, &vec_type); CHKERRQ(ierr); - ierr = DMSetVecType(dm_deg, vec_type); CHKERRQ(ierr); + rp->bp_choice = bp_choices[b]; + rp->num_comp_u = bp_options[rp->bp_choice].num_comp_u; + rp->q_extra = q_extra < 0 ? 
bp_options[rp->bp_choice].q_extra : q_extra; + PetscCall(DMClone(dm, &dm_deg)); + PetscCall(DMGetVecType(dm, &vec_type)); + PetscCall(DMSetVecType(dm_deg, vec_type)); // Create DM PetscInt dim; - ierr = DMGetDimension(dm_deg, &dim); CHKERRQ(ierr); - ierr = SetupDMByDegree(dm_deg, rp->degree, rp->q_extra, rp->num_comp_u, dim, - bp_options[rp->bp_choice].enforce_bc); CHKERRQ(ierr); + PetscCall(DMGetDimension(dm_deg, &dim)); + PetscCall(SetupDMByDegree(dm_deg, rp->degree, rp->q_extra, rp->num_comp_u, dim, bp_options[rp->bp_choice].enforce_bc)); for (PetscInt r = 0; r < num_resources; r++) { - ierr = RunWithDM(rp, dm_deg, ceed_resources[r]); CHKERRQ(ierr); + PetscCall(RunWithDM(rp, dm_deg, ceed_resources[r])); } - ierr = DMDestroy(&dm_deg); CHKERRQ(ierr); + PetscCall(DMDestroy(&dm_deg)); rp->q_extra = q_extra; } - ierr = DMDestroy(&dm); CHKERRQ(ierr); + PetscCall(DMDestroy(&dm)); PetscFunctionReturn(0); } int main(int argc, char **argv) { - PetscInt ierr, comm_size; + PetscInt comm_size; RunParams rp; - MPI_Comm comm; - char filename[PETSC_MAX_PATH_LEN]; - char *ceed_resources[30]; - PetscInt num_ceed_resources = 30; - char hostname[PETSC_MAX_PATH_LEN]; - - PetscInt dim = 3, mesh_elem[3] = {3, 3, 3}; - PetscInt num_degrees = 30, degree[30] = {}, num_local_nodes = 2, - local_nodes[2] = {}; + MPI_Comm comm; + char filename[PETSC_MAX_PATH_LEN]; + char *ceed_resources[30]; + PetscInt num_ceed_resources = 30; + char hostname[PETSC_MAX_PATH_LEN]; + + PetscInt dim = 3, mesh_elem[3] = {3, 3, 3}; + PetscInt num_degrees = 30, degree[30] = {}, num_local_nodes = 2, local_nodes[2] = {}; PetscMPIInt ranks_per_node; - PetscBool degree_set; - BPType bp_choices[10]; - PetscInt num_bp_choices = 10; + PetscBool degree_set; + BPType bp_choices[10]; + PetscInt num_bp_choices = 10; // Initialize PETSc - ierr = PetscInitialize(&argc, &argv, NULL, help); - if (ierr) return ierr; + PetscCall(PetscInitialize(&argc, &argv, NULL, help)); comm = PETSC_COMM_WORLD; - ierr = MPI_Comm_size(comm, 
&comm_size); - if (ierr != MPI_SUCCESS) return ierr; - #if defined(PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY) + PetscCall(MPI_Comm_size(comm, &comm_size)); +#if defined(PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY) { MPI_Comm splitcomm; - ierr = MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, - &splitcomm); - CHKERRQ(ierr); - ierr = MPI_Comm_size(splitcomm, &ranks_per_node); CHKERRQ(ierr); - ierr = MPI_Comm_free(&splitcomm); CHKERRQ(ierr); + PetscCall(MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &splitcomm)); + PetscCall(MPI_Comm_size(splitcomm, &ranks_per_node)); + PetscCall(MPI_Comm_free(&splitcomm)); } - #else - ranks_per_node = -1; // Unknown - #endif +#else + ranks_per_node = -1; // Unknown +#endif // Setup all parameters needed in Run() - ierr = PetscMalloc1(1, &rp); CHKERRQ(ierr); + PetscCall(PetscMalloc1(1, &rp)); rp->comm = comm; // Read command line options PetscOptionsBegin(comm, NULL, "CEED BPs in PETSc", NULL); { PetscBool set; - ierr = PetscOptionsEnumArray("-problem", "CEED benchmark problem to solve", - NULL, - bp_types, (PetscEnum *)bp_choices, &num_bp_choices, &set); - CHKERRQ(ierr); + PetscCall(PetscOptionsEnumArray("-problem", "CEED benchmark problem to solve", NULL, bp_types, (PetscEnum *)bp_choices, &num_bp_choices, &set)); if (!set) { - bp_choices[0] = CEED_BP1; + bp_choices[0] = CEED_BP1; num_bp_choices = 1; } } rp->test_mode = PETSC_FALSE; - ierr = PetscOptionsBool("-test", - "Testing mode (do not print unless error is large)", - NULL, rp->test_mode, &rp->test_mode, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsBool("-test", "Testing mode (do not print unless error is large)", NULL, rp->test_mode, &rp->test_mode, NULL)); rp->write_solution = PETSC_FALSE; - ierr = PetscOptionsBool("-write_solution", "Write solution for visualization", - NULL, rp->write_solution, &rp->write_solution, NULL); - CHKERRQ(ierr); + PetscCall(PetscOptionsBool("-write_solution", "Write solution for visualization", NULL, rp->write_solution, 
&rp->write_solution, NULL)); rp->simplex = PETSC_FALSE; - ierr = PetscOptionsBool("-simplex", "Element topology (default:hex)", - NULL, rp->simplex, &rp->simplex, NULL); - CHKERRQ(ierr); + PetscCall(PetscOptionsBool("-simplex", "Element topology (default:hex)", NULL, rp->simplex, &rp->simplex, NULL)); if ((bp_choices[0] == CEED_BP5 || bp_choices[0] == CEED_BP6) && (rp->simplex)) { - SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, - "BP5/6 is not supported with simplex"); + SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "BP5/6 is not supported with simplex"); } degree[0] = rp->test_mode ? 3 : 2; - ierr = PetscOptionsIntArray("-degree", - "Polynomial degree of tensor product basis", NULL, - degree, &num_degrees, &degree_set); CHKERRQ(ierr); - if (!degree_set) - num_degrees = 1; + PetscCall(PetscOptionsIntArray("-degree", "Polynomial degree of tensor product basis", NULL, degree, &num_degrees, &degree_set)); + if (!degree_set) num_degrees = 1; rp->q_extra = PETSC_DECIDE; - ierr = PetscOptionsInt("-q_extra", - "Number of extra quadrature points (-1 for auto)", NULL, - rp->q_extra, &rp->q_extra, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsInt("-q_extra", "Number of extra quadrature points (-1 for auto)", NULL, rp->q_extra, &rp->q_extra, NULL)); { PetscBool set; - ierr = PetscOptionsStringArray("-ceed", - "CEED resource specifier (comma-separated list)", NULL, - ceed_resources, &num_ceed_resources, &set); CHKERRQ(ierr); + PetscCall(PetscOptionsStringArray("-ceed", "CEED resource specifier (comma-separated list)", NULL, ceed_resources, &num_ceed_resources, &set)); if (!set) { - ierr = PetscStrallocpy( "/cpu/self", &ceed_resources[0]); CHKERRQ(ierr); + PetscCall(PetscStrallocpy("/cpu/self", &ceed_resources[0])); num_ceed_resources = 1; } } - ierr = PetscGetHostName(hostname, sizeof hostname); CHKERRQ(ierr); - ierr = PetscOptionsString("-hostname", "Hostname for output", NULL, hostname, - hostname, sizeof(hostname), NULL); CHKERRQ(ierr); + PetscCall(PetscGetHostName(hostname, sizeof
hostname)); + PetscCall(PetscOptionsString("-hostname", "Hostname for output", NULL, hostname, hostname, sizeof(hostname), NULL)); rp->read_mesh = PETSC_FALSE; - ierr = PetscOptionsString("-mesh", "Read mesh from file", NULL, filename, - filename, sizeof(filename), &rp->read_mesh); - CHKERRQ(ierr); + PetscCall(PetscOptionsString("-mesh", "Read mesh from file", NULL, filename, filename, sizeof(filename), &rp->read_mesh)); rp->filename = filename; if (!rp->read_mesh) { PetscInt tmp = dim; - ierr = PetscOptionsIntArray("-cells", "Number of cells per dimension", NULL, - mesh_elem, &tmp, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsIntArray("-cells", "Number of cells per dimension", NULL, mesh_elem, &tmp, NULL)); } local_nodes[0] = 1000; - ierr = PetscOptionsIntArray("-local_nodes", - "Target number of locally owned nodes per " - "process (single value or min,max)", - NULL, local_nodes, &num_local_nodes, &rp->user_l_nodes); - CHKERRQ(ierr); - if (num_local_nodes < 2) - local_nodes[1] = 2 * local_nodes[0]; + PetscCall(PetscOptionsIntArray("-local_nodes", + "Target number of locally owned nodes per " + "process (single value or min,max)", + NULL, local_nodes, &num_local_nodes, &rp->user_l_nodes)); + if (num_local_nodes < 2) local_nodes[1] = 2 * local_nodes[0]; { - PetscInt two = 2; + PetscInt two = 2; rp->ksp_max_it_clip[0] = 5; rp->ksp_max_it_clip[1] = 20; - ierr = PetscOptionsIntArray("-ksp_max_it_clip", - "Min and max number of iterations to use during benchmarking", - NULL, rp->ksp_max_it_clip, &two, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsIntArray("-ksp_max_it_clip", "Min and max number of iterations to use during benchmarking", NULL, rp->ksp_max_it_clip, &two, + NULL)); } if (!degree_set) { PetscInt max_degree = 8; - ierr = PetscOptionsInt("-max_degree", - "Range of degrees [1, max_degree] to run with", - NULL, max_degree, &max_degree, NULL); - CHKERRQ(ierr); - for (PetscInt i = 0; i < max_degree; i++) - degree[i] = i + 1; + 
PetscCall(PetscOptionsInt("-max_degree", "Range of degrees [1, max_degree] to run with", NULL, max_degree, &max_degree, NULL)); + for (PetscInt i = 0; i < max_degree; i++) degree[i] = i + 1; num_degrees = max_degree; } { PetscBool flg; - PetscInt p = ranks_per_node; - ierr = PetscOptionsInt("-p", "Number of MPI ranks per node", NULL, - p, &p, &flg); - CHKERRQ(ierr); + PetscInt p = ranks_per_node; + PetscCall(PetscOptionsInt("-p", "Number of MPI ranks per node", NULL, p, &p, &flg)); if (flg) ranks_per_node = p; } PetscOptionsEnd(); // Register PETSc logging stage - ierr = PetscLogStageRegister("Solve Stage", &rp->solve_stage); - CHKERRQ(ierr); + PetscCall(PetscLogStageRegister("Solve Stage", &rp->solve_stage)); - rp->hostname = hostname; - rp->dim = dim; - rp->mesh_elem = mesh_elem; + rp->hostname = hostname; + rp->dim = dim; + rp->mesh_elem = mesh_elem; rp->ranks_per_node = ranks_per_node; for (PetscInt d = 0; d < num_degrees; d++) { PetscInt deg = degree[d]; for (PetscInt n = local_nodes[0]; n < local_nodes[1]; n *= 2) { - rp->degree = deg; + rp->degree = deg; rp->local_nodes = n; - ierr = Run(rp, num_ceed_resources, ceed_resources, - num_bp_choices, bp_choices); CHKERRQ(ierr); + PetscCall(Run(rp, num_ceed_resources, ceed_resources, num_bp_choices, bp_choices)); } } // Clear memory - ierr = PetscFree(rp); CHKERRQ(ierr); - for (PetscInt i=0; i #include #include + #include "qfunctions/bps/bp1.h" #include "qfunctions/bps/bp2.h" #include "qfunctions/bps/bp3.h" #include "qfunctions/bps/bp4.h" #include "qfunctions/bps/common.h" -#if PETSC_VERSION_LT(3,12,0) +#if PETSC_VERSION_LT(3, 12, 0) #ifdef PETSC_HAVE_CUDA #include // Note: With PETSc prior to version 3.12.0, providing the source path to @@ -52,39 +53,29 @@ const char help[] = "Solve CEED BPs using PETSc\n"; #endif #endif -static CeedMemType MemTypeP2C(PetscMemType mem_type) { - return PetscMemTypeDevice(mem_type) ? 
CEED_MEM_DEVICE : CEED_MEM_HOST; -} +static CeedMemType MemTypeP2C(PetscMemType mem_type) { return PetscMemTypeDevice(mem_type) ? CEED_MEM_DEVICE : CEED_MEM_HOST; } static void Split3(PetscInt size, PetscInt m[3], bool reverse) { - for (PetscInt d=0, size_left=size; d<3; d++) { - PetscInt try = (PetscInt)PetscCeilReal(PetscPowReal(size_left, 1./(3 - d))); + for (PetscInt d = 0, size_left = size; d < 3; d++) { + PetscInt try = (PetscInt)PetscCeilReal(PetscPowReal(size_left, 1. / (3 - d))); while (try * (size_left / try) != size_left) try++; - m[reverse ? 2-d : d] = try; + m[reverse ? 2 - d : d] = try; size_left /= try; } } -static PetscInt Max3(const PetscInt a[3]) { - return PetscMax(a[0], PetscMax(a[1], a[2])); -} -static PetscInt Min3(const PetscInt a[3]) { - return PetscMin(a[0], PetscMin(a[1], a[2])); -} -static void GlobalNodes(const PetscInt p[3], const PetscInt i_rank[3], - PetscInt degree, const PetscInt mesh_elem[3], - PetscInt m_nodes[3]) { - for (int d=0; d<3; d++) - m_nodes[d] = degree*mesh_elem[d] + (i_rank[d] == p[d]-1); +static PetscInt Max3(const PetscInt a[3]) { return PetscMax(a[0], PetscMax(a[1], a[2])); } +static PetscInt Min3(const PetscInt a[3]) { return PetscMin(a[0], PetscMin(a[1], a[2])); } +static void GlobalNodes(const PetscInt p[3], const PetscInt i_rank[3], PetscInt degree, const PetscInt mesh_elem[3], PetscInt m_nodes[3]) { + for (int d = 0; d < 3; d++) m_nodes[d] = degree * mesh_elem[d] + (i_rank[d] == p[d] - 1); } -static PetscInt GlobalStart(const PetscInt p[3], const PetscInt i_rank[3], - PetscInt degree, const PetscInt mesh_elem[3]) { +static PetscInt GlobalStart(const PetscInt p[3], const PetscInt i_rank[3], PetscInt degree, const PetscInt mesh_elem[3]) { PetscInt start = 0; // Dumb brute-force is easier to read - for (PetscInt i=0; il_to_g, X, op_apply_ctx->X_loc, - INSERT_VALUES, SCATTER_REVERSE); CHKERRQ(ierr); - ierr = VecScatterEnd(op_apply_ctx->l_to_g, X, op_apply_ctx->X_loc, - INSERT_VALUES, SCATTER_REVERSE); 
CHKERRQ(ierr); + PetscCall(VecScatterBegin(op_apply_ctx->l_to_g, X, op_apply_ctx->X_loc, INSERT_VALUES, SCATTER_REVERSE)); + PetscCall(VecScatterEnd(op_apply_ctx->l_to_g, X, op_apply_ctx->X_loc, INSERT_VALUES, SCATTER_REVERSE)); // Setup libCEED vectors - ierr = VecGetArrayReadAndMemType(op_apply_ctx->X_loc, (const PetscScalar **)&x, - &x_mem_type); CHKERRQ(ierr); - ierr = VecGetArrayAndMemType(op_apply_ctx->Y_loc, &y, &y_mem_type); - CHKERRQ(ierr); - CeedVectorSetArray(op_apply_ctx->x_ceed, MemTypeP2C(x_mem_type), - CEED_USE_POINTER, x); - CeedVectorSetArray(op_apply_ctx->y_ceed, MemTypeP2C(y_mem_type), - CEED_USE_POINTER, y); + PetscCall(VecGetArrayReadAndMemType(op_apply_ctx->X_loc, (const PetscScalar **)&x, &x_mem_type)); + PetscCall(VecGetArrayAndMemType(op_apply_ctx->Y_loc, &y, &y_mem_type)); + CeedVectorSetArray(op_apply_ctx->x_ceed, MemTypeP2C(x_mem_type), CEED_USE_POINTER, x); + CeedVectorSetArray(op_apply_ctx->y_ceed, MemTypeP2C(y_mem_type), CEED_USE_POINTER, y); // Apply libCEED operator - CeedOperatorApply(op_apply_ctx->op, op_apply_ctx->x_ceed, op_apply_ctx->y_ceed, - CEED_REQUEST_IMMEDIATE); + CeedOperatorApply(op_apply_ctx->op, op_apply_ctx->x_ceed, op_apply_ctx->y_ceed, CEED_REQUEST_IMMEDIATE); // Restore PETSc vectors CeedVectorTakeArray(op_apply_ctx->x_ceed, MemTypeP2C(x_mem_type), NULL); CeedVectorTakeArray(op_apply_ctx->y_ceed, MemTypeP2C(y_mem_type), NULL); - ierr = VecRestoreArrayReadAndMemType(op_apply_ctx->X_loc, - (const PetscScalar **)&x); CHKERRQ(ierr); - ierr = VecRestoreArrayAndMemType(op_apply_ctx->Y_loc, &y); CHKERRQ(ierr); + PetscCall(VecRestoreArrayReadAndMemType(op_apply_ctx->X_loc, (const PetscScalar **)&x)); + PetscCall(VecRestoreArrayAndMemType(op_apply_ctx->Y_loc, &y)); // Local-to-global if (Y) { - ierr = VecZeroEntries(Y); CHKERRQ(ierr); - ierr = VecScatterBegin(op_apply_ctx->l_to_g, op_apply_ctx->Y_loc, Y, ADD_VALUES, - SCATTER_FORWARD); CHKERRQ(ierr); - ierr = VecScatterEnd(op_apply_ctx->l_to_g, op_apply_ctx->Y_loc, Y, 
ADD_VALUES, - SCATTER_FORWARD); CHKERRQ(ierr); + PetscCall(VecZeroEntries(Y)); + PetscCall(VecScatterBegin(op_apply_ctx->l_to_g, op_apply_ctx->Y_loc, Y, ADD_VALUES, SCATTER_FORWARD)); + PetscCall(VecScatterEnd(op_apply_ctx->l_to_g, op_apply_ctx->Y_loc, Y, ADD_VALUES, SCATTER_FORWARD)); } PetscFunctionReturn(0); } @@ -311,68 +272,49 @@ static PetscErrorCode MatMult_Mass(Mat A, Vec X, Vec Y) { // This function uses libCEED to compute the action of the Laplacian with // Dirichlet boundary conditions static PetscErrorCode MatMult_Diff(Mat A, Vec X, Vec Y) { - PetscErrorCode ierr; OperatorApplyContext op_apply_ctx; - PetscScalar *x, *y; - PetscMemType x_mem_type, y_mem_type; + PetscScalar *x, *y; + PetscMemType x_mem_type, y_mem_type; PetscFunctionBeginUser; - ierr = MatShellGetContext(A, &op_apply_ctx); CHKERRQ(ierr); + PetscCall(MatShellGetContext(A, &op_apply_ctx)); // Global-to-local - ierr = VecScatterBegin(op_apply_ctx->l_to_g_0, X, op_apply_ctx->X_loc, - INSERT_VALUES, SCATTER_REVERSE); CHKERRQ(ierr); - ierr = VecScatterEnd(op_apply_ctx->l_to_g_0, X, op_apply_ctx->X_loc, - INSERT_VALUES, SCATTER_REVERSE); CHKERRQ(ierr); + PetscCall(VecScatterBegin(op_apply_ctx->l_to_g_0, X, op_apply_ctx->X_loc, INSERT_VALUES, SCATTER_REVERSE)); + PetscCall(VecScatterEnd(op_apply_ctx->l_to_g_0, X, op_apply_ctx->X_loc, INSERT_VALUES, SCATTER_REVERSE)); // Setup libCEED vectors - ierr = VecGetArrayReadAndMemType(op_apply_ctx->X_loc, (const PetscScalar **)&x, - &x_mem_type); CHKERRQ(ierr); - ierr = VecGetArrayAndMemType(op_apply_ctx->Y_loc, &y, &y_mem_type); - CHKERRQ(ierr); - CeedVectorSetArray(op_apply_ctx->x_ceed, MemTypeP2C(x_mem_type), - CEED_USE_POINTER, x); - CeedVectorSetArray(op_apply_ctx->y_ceed, MemTypeP2C(y_mem_type), - CEED_USE_POINTER, y); + PetscCall(VecGetArrayReadAndMemType(op_apply_ctx->X_loc, (const PetscScalar **)&x, &x_mem_type)); + PetscCall(VecGetArrayAndMemType(op_apply_ctx->Y_loc, &y, &y_mem_type)); + CeedVectorSetArray(op_apply_ctx->x_ceed, 
MemTypeP2C(x_mem_type), CEED_USE_POINTER, x); + CeedVectorSetArray(op_apply_ctx->y_ceed, MemTypeP2C(y_mem_type), CEED_USE_POINTER, y); // Apply libCEED operator - CeedOperatorApply(op_apply_ctx->op, op_apply_ctx->x_ceed, op_apply_ctx->y_ceed, - CEED_REQUEST_IMMEDIATE); + CeedOperatorApply(op_apply_ctx->op, op_apply_ctx->x_ceed, op_apply_ctx->y_ceed, CEED_REQUEST_IMMEDIATE); // Restore PETSc vectors CeedVectorTakeArray(op_apply_ctx->x_ceed, MemTypeP2C(x_mem_type), NULL); CeedVectorTakeArray(op_apply_ctx->y_ceed, MemTypeP2C(y_mem_type), NULL); - ierr = VecRestoreArrayReadAndMemType(op_apply_ctx->X_loc, - (const PetscScalar **)&x); - CHKERRQ(ierr); - ierr = VecRestoreArrayAndMemType(op_apply_ctx->Y_loc, &y); CHKERRQ(ierr); + PetscCall(VecRestoreArrayReadAndMemType(op_apply_ctx->X_loc, (const PetscScalar **)&x)); + PetscCall(VecRestoreArrayAndMemType(op_apply_ctx->Y_loc, &y)); // Local-to-global - ierr = VecZeroEntries(Y); CHKERRQ(ierr); - ierr = VecScatterBegin(op_apply_ctx->g_to_g_D, X, Y, INSERT_VALUES, - SCATTER_FORWARD); - CHKERRQ(ierr); - ierr = VecScatterEnd(op_apply_ctx->g_to_g_D, X, Y, INSERT_VALUES, - SCATTER_FORWARD); CHKERRQ(ierr); - ierr = VecScatterBegin(op_apply_ctx->l_to_g_0, op_apply_ctx->Y_loc, Y, - ADD_VALUES, SCATTER_FORWARD); CHKERRQ(ierr); - ierr = VecScatterEnd(op_apply_ctx->l_to_g_0, op_apply_ctx->Y_loc, Y, ADD_VALUES, - SCATTER_FORWARD); - CHKERRQ(ierr); + PetscCall(VecZeroEntries(Y)); + PetscCall(VecScatterBegin(op_apply_ctx->g_to_g_D, X, Y, INSERT_VALUES, SCATTER_FORWARD)); + PetscCall(VecScatterEnd(op_apply_ctx->g_to_g_D, X, Y, INSERT_VALUES, SCATTER_FORWARD)); + PetscCall(VecScatterBegin(op_apply_ctx->l_to_g_0, op_apply_ctx->Y_loc, Y, ADD_VALUES, SCATTER_FORWARD)); + PetscCall(VecScatterEnd(op_apply_ctx->l_to_g_0, op_apply_ctx->Y_loc, Y, ADD_VALUES, SCATTER_FORWARD)); PetscFunctionReturn(0); } // This function calculates the error in the final solution -static PetscErrorCode ComputeErrorMax(OperatorApplyContext op_apply_ctx, - CeedOperator 
op_error, Vec X, - CeedVector target, PetscReal *max_error) { - PetscErrorCode ierr; +static PetscErrorCode ComputeErrorMax(OperatorApplyContext op_apply_ctx, CeedOperator op_error, Vec X, CeedVector target, PetscReal *max_error) { PetscScalar *x; PetscMemType mem_type; - CeedVector collocated_error; - CeedSize length; + CeedVector collocated_error; + CeedSize length; PetscFunctionBeginUser; @@ -380,36 +322,29 @@ static PetscErrorCode ComputeErrorMax(OperatorApplyContext op_apply_ctx, CeedVectorCreate(op_apply_ctx->ceed, length, &collocated_error); // Global-to-local - ierr = VecScatterBegin(op_apply_ctx->l_to_g, X, op_apply_ctx->X_loc, - INSERT_VALUES, SCATTER_REVERSE); CHKERRQ(ierr); - ierr = VecScatterEnd(op_apply_ctx->l_to_g, X, op_apply_ctx->X_loc, - INSERT_VALUES, SCATTER_REVERSE); CHKERRQ(ierr); + PetscCall(VecScatterBegin(op_apply_ctx->l_to_g, X, op_apply_ctx->X_loc, INSERT_VALUES, SCATTER_REVERSE)); + PetscCall(VecScatterEnd(op_apply_ctx->l_to_g, X, op_apply_ctx->X_loc, INSERT_VALUES, SCATTER_REVERSE)); // Setup libCEED vector - ierr = VecGetArrayReadAndMemType(op_apply_ctx->X_loc, (const PetscScalar **)&x, - &mem_type); CHKERRQ(ierr); - CeedVectorSetArray(op_apply_ctx->x_ceed, MemTypeP2C(mem_type), - CEED_USE_POINTER, x); + PetscCall(VecGetArrayReadAndMemType(op_apply_ctx->X_loc, (const PetscScalar **)&x, &mem_type)); + CeedVectorSetArray(op_apply_ctx->x_ceed, MemTypeP2C(mem_type), CEED_USE_POINTER, x); // Apply libCEED operator - CeedOperatorApply(op_error, op_apply_ctx->x_ceed, collocated_error, - CEED_REQUEST_IMMEDIATE); + CeedOperatorApply(op_error, op_apply_ctx->x_ceed, collocated_error, CEED_REQUEST_IMMEDIATE); // Restore PETSc vector CeedVectorTakeArray(op_apply_ctx->x_ceed, MemTypeP2C(mem_type), NULL); - ierr = VecRestoreArrayReadAndMemType(op_apply_ctx->X_loc, - (const PetscScalar **)&x); CHKERRQ(ierr); + PetscCall(VecRestoreArrayReadAndMemType(op_apply_ctx->X_loc, (const PetscScalar **)&x)); // Reduce max error *max_error = 0; const CeedScalar 
*e; CeedVectorGetArrayRead(collocated_error, CEED_MEM_HOST, &e); - for (CeedInt i=0; icomm); CHKERRQ(ierr); + PetscCall(MPI_Allreduce(MPI_IN_PLACE, max_error, 1, MPIU_REAL, MPIU_MAX, op_apply_ctx->comm)); // Cleanup CeedVectorDestroy(&collocated_error); @@ -418,79 +353,57 @@ static PetscErrorCode ComputeErrorMax(OperatorApplyContext op_apply_ctx, } int main(int argc, char **argv) { - PetscInt ierr; MPI_Comm comm; - char ceed_resource[PETSC_MAX_PATH_LEN] = "/cpu/self"; - double my_rt_start, my_rt, rt_min, rt_max; - PetscInt degree, q_extra, local_nodes, local_elem, mesh_elem[3], m_nodes[3], - p[3], - i_rank[3], l_nodes[3], l_size, num_comp_u = 1, ksp_max_it_clip[2]; - PetscScalar *r; - PetscBool test_mode, benchmark_mode, write_solution; - PetscMPIInt size, rank; - PetscLogStage solve_stage; - Vec X, X_loc, rhs, rhs_loc; - Mat mat; - KSP ksp; - VecScatter l_to_g, l_to_g_0, g_to_g_D; - PetscMemType mem_type; + char ceed_resource[PETSC_MAX_PATH_LEN] = "/cpu/self"; + double my_rt_start, my_rt, rt_min, rt_max; + PetscInt degree, q_extra, local_nodes, local_elem, mesh_elem[3], m_nodes[3], p[3], i_rank[3], l_nodes[3], l_size, num_comp_u = 1, + ksp_max_it_clip[2]; + PetscScalar *r; + PetscBool test_mode, benchmark_mode, write_solution; + PetscMPIInt size, rank; + PetscLogStage solve_stage; + Vec X, X_loc, rhs, rhs_loc; + Mat mat; + KSP ksp; + VecScatter l_to_g, l_to_g_0, g_to_g_D; + PetscMemType mem_type; OperatorApplyContext op_apply_ctx; - Ceed ceed; - CeedBasis basis_x, basis_u; - CeedElemRestriction elem_restr_x, elem_restr_u, elem_restr_u_i, elem_restr_qd_i; - CeedQFunction qf_setup_geo, qf_setup_rhs, qf_apply, qf_error; - CeedOperator op_setup_geo, op_setup_rhs, op_apply, op_error; - CeedVector x_coord, q_data, rhs_ceed, target; - CeedInt P, Q; - const CeedInt dim = 3, num_comp_x = 3; - BPType bp_choice; - - ierr = PetscInitialize(&argc, &argv, NULL, help); - if (ierr) return ierr; + Ceed ceed; + CeedBasis basis_x, basis_u; + CeedElemRestriction elem_restr_x, 
elem_restr_u, elem_restr_u_i, elem_restr_qd_i; + CeedQFunction qf_setup_geo, qf_setup_rhs, qf_apply, qf_error; + CeedOperator op_setup_geo, op_setup_rhs, op_apply, op_error; + CeedVector x_coord, q_data, rhs_ceed, target; + CeedInt P, Q; + const CeedInt dim = 3, num_comp_x = 3; + BPType bp_choice; + + PetscCall(PetscInitialize(&argc, &argv, NULL, help)); comm = PETSC_COMM_WORLD; // Read command line options PetscOptionsBegin(comm, NULL, "CEED BPs in PETSc", NULL); bp_choice = CEED_BP1; - ierr = PetscOptionsEnum("-problem", - "CEED benchmark problem to solve", NULL, - bp_types, (PetscEnum)bp_choice, (PetscEnum *)&bp_choice, - NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsEnum("-problem", "CEED benchmark problem to solve", NULL, bp_types, (PetscEnum)bp_choice, (PetscEnum *)&bp_choice, NULL)); num_comp_u = bp_options[bp_choice].num_comp_u; - test_mode = PETSC_FALSE; - ierr = PetscOptionsBool("-test", - "Testing mode (do not print unless error is large)", - NULL, test_mode, &test_mode, NULL); CHKERRQ(ierr); + test_mode = PETSC_FALSE; + PetscCall(PetscOptionsBool("-test", "Testing mode (do not print unless error is large)", NULL, test_mode, &test_mode, NULL)); benchmark_mode = PETSC_FALSE; - ierr = PetscOptionsBool("-benchmark", - "Benchmarking mode (prints benchmark statistics)", - NULL, benchmark_mode, &benchmark_mode, NULL); - CHKERRQ(ierr); + PetscCall(PetscOptionsBool("-benchmark", "Benchmarking mode (prints benchmark statistics)", NULL, benchmark_mode, &benchmark_mode, NULL)); write_solution = PETSC_FALSE; - ierr = PetscOptionsBool("-write_solution", - "Write solution for visualization", - NULL, write_solution, &write_solution, NULL); - CHKERRQ(ierr); + PetscCall(PetscOptionsBool("-write_solution", "Write solution for visualization", NULL, write_solution, &write_solution, NULL)); degree = test_mode ? 
3 : 1; - ierr = PetscOptionsInt("-degree", "Polynomial degree of tensor product basis", - NULL, degree, &degree, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsInt("-degree", "Polynomial degree of tensor product basis", NULL, degree, &degree, NULL)); q_extra = bp_options[bp_choice].q_extra; - ierr = PetscOptionsInt("-q_extra", "Number of extra quadrature points", - NULL, q_extra, &q_extra, NULL); CHKERRQ(ierr); - ierr = PetscOptionsString("-ceed", "CEED resource specifier", - NULL, ceed_resource, ceed_resource, - sizeof(ceed_resource), NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsInt("-q_extra", "Number of extra quadrature points", NULL, q_extra, &q_extra, NULL)); + PetscCall(PetscOptionsString("-ceed", "CEED resource specifier", NULL, ceed_resource, ceed_resource, sizeof(ceed_resource), NULL)); local_nodes = 1000; - ierr = PetscOptionsInt("-local", - "Target number of locally owned nodes per process", - NULL, local_nodes, &local_nodes, NULL); CHKERRQ(ierr); - PetscInt two = 2; + PetscCall(PetscOptionsInt("-local", "Target number of locally owned nodes per process", NULL, local_nodes, &local_nodes, NULL)); + PetscInt two = 2; ksp_max_it_clip[0] = 5; ksp_max_it_clip[1] = 20; - ierr = PetscOptionsIntArray("-ksp_max_it_clip", - "Min and max number of iterations to use during benchmarking", - NULL, ksp_max_it_clip, &two, NULL); - CHKERRQ(ierr); + PetscCall( + PetscOptionsIntArray("-ksp_max_it_clip", "Min and max number of iterations to use during benchmarking", NULL, ksp_max_it_clip, &two, NULL)); PetscOptionsEnd(); P = degree + 1; Q = P + q_extra; @@ -502,343 +415,278 @@ int main(int argc, char **argv) { VecType default_vec_type = NULL, vec_type; switch (mem_type_backend) { - case CEED_MEM_HOST: default_vec_type = VECSTANDARD; break; - case CEED_MEM_DEVICE: { - const char *resolved; - CeedGetResource(ceed, &resolved); - if (strstr(resolved, "/gpu/cuda")) default_vec_type = VECCUDA; - else if (strstr(resolved, "/gpu/hip/occa")) - default_vec_type = VECSTANDARD; // 
https://github.com/CEED/libCEED/issues/678 - else if (strstr(resolved, "/gpu/hip")) default_vec_type = VECHIP; - else default_vec_type = VECSTANDARD; - } + case CEED_MEM_HOST: + default_vec_type = VECSTANDARD; + break; + case CEED_MEM_DEVICE: { + const char *resolved; + CeedGetResource(ceed, &resolved); + if (strstr(resolved, "/gpu/cuda")) default_vec_type = VECCUDA; + else if (strstr(resolved, "/gpu/hip/occa")) default_vec_type = VECSTANDARD; // https://github.com/CEED/libCEED/issues/678 + else if (strstr(resolved, "/gpu/hip")) default_vec_type = VECHIP; + else default_vec_type = VECSTANDARD; + } } // Determine size of process grid - ierr = MPI_Comm_size(comm, &size); CHKERRQ(ierr); + PetscCall(MPI_Comm_size(comm, &size)); Split3(size, p, false); // Find a nicely composite number of elements no less than local_nodes - for (local_elem = PetscMax(1, local_nodes / (degree*degree*degree)); ; - local_elem++) { + for (local_elem = PetscMax(1, local_nodes / (degree * degree * degree));; local_elem++) { Split3(local_elem, mesh_elem, true); if (Max3(mesh_elem) / Min3(mesh_elem) <= 2) break; } // Find my location in the process grid - ierr = MPI_Comm_rank(comm, &rank); CHKERRQ(ierr); - for (int d=0, rank_left=rank; d=m_nodes[0], ii=i-ir*m_nodes[0], i=m_nodes[1], jj=j-jr*m_nodes[1], j=m_nodes[2], kk=k-kr*m_nodes[2], k= m_nodes[0], ii = i - ir * m_nodes[0], i < l_nodes[0]; i++) { + for (PetscInt j = 0, jr, jj; jr = j >= m_nodes[1], jj = j - jr * m_nodes[1], j < l_nodes[1]; j++) { + for (PetscInt k = 0, kr, kk; kr = k >= m_nodes[2], kk = k - kr * m_nodes[2], k < l_nodes[2]; k++) { + PetscInt here = (i * l_nodes[1] + j) * l_nodes[2] + k; + l_to_g_ind[here] = g_start[ir][jr][kr] + (ii * g_m_nodes[ir][jr][kr][1] + jj) * g_m_nodes[ir][jr][kr][2] + kk; + if ((i_rank[0] == 0 && i == 0) || (i_rank[1] == 0 && j == 0) || (i_rank[2] == 0 && k == 0) || + (i_rank[0] + 1 == p[0] && i + 1 == l_nodes[0]) || (i_rank[1] + 1 == p[1] && j + 1 == l_nodes[1]) || + (i_rank[2] + 1 == p[2] && k + 1 
== l_nodes[2])) continue; l_to_g_ind_0[l_0_count] = l_to_g_ind[here]; - loc_ind[l_0_count++] = here; + loc_ind[l_0_count++] = here; } - ierr = ISCreateBlock(comm, num_comp_u, l_size, l_to_g_ind, PETSC_OWN_POINTER, - &l_to_g_is); CHKERRQ(ierr); - ierr = VecScatterCreate(X_loc, NULL, X, l_to_g_is, &l_to_g); CHKERRQ(ierr); - CHKERRQ(ierr); - ierr = ISCreateBlock(comm, num_comp_u, l_0_count, l_to_g_ind_0, - PETSC_OWN_POINTER, - &l_to_g_is_0); CHKERRQ(ierr); - ierr = ISCreateBlock(comm, num_comp_u, l_0_count, loc_ind, PETSC_OWN_POINTER, - &loc_is); CHKERRQ(ierr); - ierr = VecScatterCreate(X_loc, loc_is, X, l_to_g_is_0, &l_to_g_0); - CHKERRQ(ierr); + } + } + PetscCall(ISCreateBlock(comm, num_comp_u, l_size, l_to_g_ind, PETSC_OWN_POINTER, &l_to_g_is)); + PetscCall(VecScatterCreate(X_loc, NULL, X, l_to_g_is, &l_to_g)); + PetscCall(ISCreateBlock(comm, num_comp_u, l_0_count, l_to_g_ind_0, PETSC_OWN_POINTER, &l_to_g_is_0)); + PetscCall(ISCreateBlock(comm, num_comp_u, l_0_count, loc_ind, PETSC_OWN_POINTER, &loc_is)); + PetscCall(VecScatterCreate(X_loc, loc_is, X, l_to_g_is_0, &l_to_g_0)); { // Create global-to-global scatter for Dirichlet values (everything not in // l_to_g_is_0, which is the range of l_to_g_0) - PetscInt x_start, x_end, *ind_D, count_D = 0; - IS is_D; + PetscInt x_start, x_end, *ind_D, count_D = 0; + IS is_D; const PetscScalar *x; - ierr = VecZeroEntries(X_loc); CHKERRQ(ierr); - ierr = VecSet(X, 1.0); CHKERRQ(ierr); - ierr = VecScatterBegin(l_to_g_0, X_loc, X, INSERT_VALUES, SCATTER_FORWARD); - CHKERRQ(ierr); - ierr = VecScatterEnd(l_to_g_0, X_loc, X, INSERT_VALUES, SCATTER_FORWARD); - CHKERRQ(ierr); - ierr = VecGetOwnershipRange(X, &x_start, &x_end); CHKERRQ(ierr); - ierr = PetscMalloc1(x_end-x_start, &ind_D); CHKERRQ(ierr); - ierr = VecGetArrayRead(X, &x); CHKERRQ(ierr); - for (PetscInt i=0; icomm = comm; + PetscCall(PetscMalloc1(1, &op_apply_ctx)); + op_apply_ctx->comm = comm; op_apply_ctx->l_to_g = l_to_g; if (bp_choice != CEED_BP1 && bp_choice != 
CEED_BP2) { op_apply_ctx->l_to_g_0 = l_to_g_0; op_apply_ctx->g_to_g_D = g_to_g_D; } op_apply_ctx->X_loc = X_loc; - ierr = VecDuplicate(X_loc, &op_apply_ctx->Y_loc); CHKERRQ(ierr); - CeedVectorCreate(ceed, l_size*num_comp_u, &op_apply_ctx->x_ceed); - CeedVectorCreate(ceed, l_size*num_comp_u, &op_apply_ctx->y_ceed); - op_apply_ctx->op = op_apply; + PetscCall(VecDuplicate(X_loc, &op_apply_ctx->Y_loc)); + CeedVectorCreate(ceed, l_size * num_comp_u, &op_apply_ctx->x_ceed); + CeedVectorCreate(ceed, l_size * num_comp_u, &op_apply_ctx->y_ceed); + op_apply_ctx->op = op_apply; op_apply_ctx->q_data = q_data; - op_apply_ctx->ceed = ceed; + op_apply_ctx->ceed = ceed; - ierr = MatCreateShell(comm, m_nodes[0]*m_nodes[1]*m_nodes[2]*num_comp_u, - m_nodes[0]*m_nodes[1]*m_nodes[2]*num_comp_u, - PETSC_DECIDE, PETSC_DECIDE, op_apply_ctx, &mat); CHKERRQ(ierr); + PetscCall(MatCreateShell(comm, m_nodes[0] * m_nodes[1] * m_nodes[2] * num_comp_u, m_nodes[0] * m_nodes[1] * m_nodes[2] * num_comp_u, PETSC_DECIDE, + PETSC_DECIDE, op_apply_ctx, &mat)); if (bp_choice == CEED_BP1 || bp_choice == CEED_BP2) { - ierr = MatShellSetOperation(mat, MATOP_MULT, (void(*)(void))MatMult_Mass); - CHKERRQ(ierr); + PetscCall(MatShellSetOperation(mat, MATOP_MULT, (void (*)(void))MatMult_Mass)); } else { - ierr = MatShellSetOperation(mat, MATOP_MULT, (void(*)(void))MatMult_Diff); - CHKERRQ(ierr); + PetscCall(MatShellSetOperation(mat, MATOP_MULT, (void (*)(void))MatMult_Diff)); } - ierr = VecGetType(X, &vec_type); CHKERRQ(ierr); - ierr = MatShellSetVecType(mat, vec_type); CHKERRQ(ierr); + PetscCall(VecGetType(X, &vec_type)); + PetscCall(MatShellSetVecType(mat, vec_type)); // Get RHS vector - ierr = VecDuplicate(X, &rhs); CHKERRQ(ierr); - ierr = VecDuplicate(X_loc, &rhs_loc); CHKERRQ(ierr); - ierr = VecZeroEntries(rhs_loc); CHKERRQ(ierr); - ierr = VecGetArrayAndMemType(rhs_loc, &r, &mem_type); CHKERRQ(ierr); + PetscCall(VecDuplicate(X, &rhs)); + PetscCall(VecDuplicate(X_loc, &rhs_loc)); + 
PetscCall(VecZeroEntries(rhs_loc)); + PetscCall(VecGetArrayAndMemType(rhs_loc, &r, &mem_type)); CeedVectorSetArray(rhs_ceed, MemTypeP2C(mem_type), CEED_USE_POINTER, r); // Setup q_data, rhs, and target @@ -847,132 +695,119 @@ int main(int argc, char **argv) { CeedVectorDestroy(&x_coord); // Gather RHS - ierr = CeedVectorTakeArray(rhs_ceed, MemTypeP2C(mem_type), NULL); CHKERRQ(ierr); - ierr = VecRestoreArrayAndMemType(rhs_loc, &r); CHKERRQ(ierr); - ierr = VecZeroEntries(rhs); CHKERRQ(ierr); - ierr = VecScatterBegin(l_to_g, rhs_loc, rhs, ADD_VALUES, SCATTER_FORWARD); - CHKERRQ(ierr); - ierr = VecScatterEnd(l_to_g, rhs_loc, rhs, ADD_VALUES, SCATTER_FORWARD); - CHKERRQ(ierr); + PetscCall(CeedVectorTakeArray(rhs_ceed, MemTypeP2C(mem_type), NULL)); + PetscCall(VecRestoreArrayAndMemType(rhs_loc, &r)); + PetscCall(VecZeroEntries(rhs)); + PetscCall(VecScatterBegin(l_to_g, rhs_loc, rhs, ADD_VALUES, SCATTER_FORWARD)); + PetscCall(VecScatterEnd(l_to_g, rhs_loc, rhs, ADD_VALUES, SCATTER_FORWARD)); CeedVectorDestroy(&rhs_ceed); - ierr = KSPCreate(comm, &ksp); CHKERRQ(ierr); + PetscCall(KSPCreate(comm, &ksp)); { PC pc; - ierr = KSPGetPC(ksp, &pc); CHKERRQ(ierr); + PetscCall(KSPGetPC(ksp, &pc)); if (bp_choice == CEED_BP1 || bp_choice == CEED_BP2) { - ierr = PCSetType(pc, PCJACOBI); CHKERRQ(ierr); - ierr = PCJacobiSetType(pc, PC_JACOBI_ROWSUM); CHKERRQ(ierr); + PetscCall(PCSetType(pc, PCJACOBI)); + PetscCall(PCJacobiSetType(pc, PC_JACOBI_ROWSUM)); } else { - ierr = PCSetType(pc, PCNONE); CHKERRQ(ierr); + PetscCall(PCSetType(pc, PCNONE)); } - ierr = KSPSetType(ksp, KSPCG); CHKERRQ(ierr); - ierr = KSPSetNormType(ksp, KSP_NORM_NATURAL); CHKERRQ(ierr); - ierr = KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT, - PETSC_DEFAULT); CHKERRQ(ierr); + PetscCall(KSPSetType(ksp, KSPCG)); + PetscCall(KSPSetNormType(ksp, KSP_NORM_NATURAL)); + PetscCall(KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT, PETSC_DEFAULT)); } - ierr = KSPSetOperators(ksp, mat, mat); CHKERRQ(ierr); + 
PetscCall(KSPSetOperators(ksp, mat, mat)); // First run's performance log is not considered for benchmarking purposes - ierr = KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT, 1); - CHKERRQ(ierr); + PetscCall(KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT, 1)); my_rt_start = MPI_Wtime(); - ierr = KSPSolve(ksp, rhs, X); CHKERRQ(ierr); + PetscCall(KSPSolve(ksp, rhs, X)); my_rt = MPI_Wtime() - my_rt_start; - ierr = MPI_Allreduce(MPI_IN_PLACE, &my_rt, 1, MPI_DOUBLE, MPI_MIN, comm); - CHKERRQ(ierr); + PetscCall(MPI_Allreduce(MPI_IN_PLACE, &my_rt, 1, MPI_DOUBLE, MPI_MIN, comm)); // Set maxits based on first iteration timing if (my_rt > 0.02) { - ierr = KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT, - ksp_max_it_clip[0]); CHKERRQ(ierr); + PetscCall(KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT, ksp_max_it_clip[0])); } else { - ierr = KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT, - ksp_max_it_clip[1]); CHKERRQ(ierr); + PetscCall(KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT, ksp_max_it_clip[1])); } - ierr = KSPSetFromOptions(ksp); CHKERRQ(ierr); + PetscCall(KSPSetFromOptions(ksp)); // Timed solve - ierr = VecZeroEntries(X); CHKERRQ(ierr); - ierr = PetscBarrier((PetscObject)ksp); CHKERRQ(ierr); + PetscCall(VecZeroEntries(X)); + PetscCall(PetscBarrier((PetscObject)ksp)); // -- Performance logging - ierr = PetscLogStageRegister("Solve Stage", &solve_stage); CHKERRQ(ierr); - ierr = PetscLogStagePush(solve_stage); CHKERRQ(ierr); + PetscCall(PetscLogStageRegister("Solve Stage", &solve_stage)); + PetscCall(PetscLogStagePush(solve_stage)); // -- Solve my_rt_start = MPI_Wtime(); - ierr = KSPSolve(ksp, rhs, X); CHKERRQ(ierr); + PetscCall(KSPSolve(ksp, rhs, X)); my_rt = MPI_Wtime() - my_rt_start; // -- Performance logging - ierr = PetscLogStagePop(); + PetscCall(PetscLogStagePop()); // Output results { - KSPType ksp_type; + KSPType ksp_type; KSPConvergedReason reason; - PetscReal rnorm; - PetscInt its; - ierr = 
KSPGetType(ksp, &ksp_type); CHKERRQ(ierr); - ierr = KSPGetConvergedReason(ksp, &reason); CHKERRQ(ierr); - ierr = KSPGetIterationNumber(ksp, &its); CHKERRQ(ierr); - ierr = KSPGetResidualNorm(ksp, &rnorm); CHKERRQ(ierr); + PetscReal rnorm; + PetscInt its; + PetscCall(KSPGetType(ksp, &ksp_type)); + PetscCall(KSPGetConvergedReason(ksp, &reason)); + PetscCall(KSPGetIterationNumber(ksp, &its)); + PetscCall(KSPGetResidualNorm(ksp, &rnorm)); if (!test_mode || reason < 0 || rnorm > 1e-8) { - ierr = PetscPrintf(comm, - " KSP:\n" - " KSP Type : %s\n" - " KSP Convergence : %s\n" - " Total KSP Iterations : %" PetscInt_FMT "\n" - " Final rnorm : %e\n", - ksp_type, KSPConvergedReasons[reason], its, - (double)rnorm); CHKERRQ(ierr); + PetscCall(PetscPrintf(comm, + " KSP:\n" + " KSP Type : %s\n" + " KSP Convergence : %s\n" + " Total KSP Iterations : %" PetscInt_FMT "\n" + " Final rnorm : %e\n", + ksp_type, KSPConvergedReasons[reason], its, (double)rnorm)); } if (!test_mode) { - ierr = PetscPrintf(comm," Performance:\n"); CHKERRQ(ierr); + PetscCall(PetscPrintf(comm, " Performance:\n")); } { PetscReal max_error; - ierr = ComputeErrorMax(op_apply_ctx, op_error, X, target, &max_error); - CHKERRQ(ierr); + PetscCall(ComputeErrorMax(op_apply_ctx, op_error, X, target, &max_error)); PetscReal tol = 5e-2; if (!test_mode || max_error > tol) { - ierr = MPI_Allreduce(&my_rt, &rt_min, 1, MPI_DOUBLE, MPI_MIN, comm); - CHKERRQ(ierr); - ierr = MPI_Allreduce(&my_rt, &rt_max, 1, MPI_DOUBLE, MPI_MAX, comm); - CHKERRQ(ierr); - ierr = PetscPrintf(comm, - " Pointwise Error (max) : %e\n" - " CG Solve Time : %g (%g) sec\n", - (double)max_error, rt_max, rt_min); CHKERRQ(ierr); + PetscCall(MPI_Allreduce(&my_rt, &rt_min, 1, MPI_DOUBLE, MPI_MIN, comm)); + PetscCall(MPI_Allreduce(&my_rt, &rt_max, 1, MPI_DOUBLE, MPI_MAX, comm)); + PetscCall(PetscPrintf(comm, + " Pointwise Error (max) : %e\n" + " CG Solve Time : %g (%g) sec\n", + (double)max_error, rt_max, rt_min)); } } if (!test_mode) { - ierr = PetscPrintf(comm, 
- " DoFs/Sec in CG : %g (%g) million\n", - 1e-6*gsize*its/rt_max, - 1e-6*gsize*its/rt_min); CHKERRQ(ierr); + PetscCall( + PetscPrintf(comm, " DoFs/Sec in CG : %g (%g) million\n", 1e-6 * gsize * its / rt_max, 1e-6 * gsize * its / rt_min)); } } if (write_solution) { PetscViewer vtk_viewer_soln; - ierr = PetscViewerCreate(comm, &vtk_viewer_soln); CHKERRQ(ierr); - ierr = PetscViewerSetType(vtk_viewer_soln, PETSCVIEWERVTK); CHKERRQ(ierr); - ierr = PetscViewerFileSetName(vtk_viewer_soln, "solution.vtu"); CHKERRQ(ierr); - ierr = VecView(X, vtk_viewer_soln); CHKERRQ(ierr); - ierr = PetscViewerDestroy(&vtk_viewer_soln); CHKERRQ(ierr); + PetscCall(PetscViewerCreate(comm, &vtk_viewer_soln)); + PetscCall(PetscViewerSetType(vtk_viewer_soln, PETSCVIEWERVTK)); + PetscCall(PetscViewerFileSetName(vtk_viewer_soln, "solution.vtu")); + PetscCall(VecView(X, vtk_viewer_soln)); + PetscCall(PetscViewerDestroy(&vtk_viewer_soln)); } - ierr = VecDestroy(&rhs); CHKERRQ(ierr); - ierr = VecDestroy(&rhs_loc); CHKERRQ(ierr); - ierr = VecDestroy(&X); CHKERRQ(ierr); - ierr = VecDestroy(&op_apply_ctx->X_loc); CHKERRQ(ierr); - ierr = VecDestroy(&op_apply_ctx->Y_loc); CHKERRQ(ierr); - ierr = VecScatterDestroy(&l_to_g); CHKERRQ(ierr); - ierr = VecScatterDestroy(&l_to_g_0); CHKERRQ(ierr); - ierr = VecScatterDestroy(&g_to_g_D); CHKERRQ(ierr); - ierr = MatDestroy(&mat); CHKERRQ(ierr); - ierr = KSPDestroy(&ksp); CHKERRQ(ierr); + PetscCall(VecDestroy(&rhs)); + PetscCall(VecDestroy(&rhs_loc)); + PetscCall(VecDestroy(&X)); + PetscCall(VecDestroy(&op_apply_ctx->X_loc)); + PetscCall(VecDestroy(&op_apply_ctx->Y_loc)); + PetscCall(VecScatterDestroy(&l_to_g)); + PetscCall(VecScatterDestroy(&l_to_g_0)); + PetscCall(VecScatterDestroy(&g_to_g_D)); + PetscCall(MatDestroy(&mat)); + PetscCall(KSPDestroy(&ksp)); CeedVectorDestroy(&op_apply_ctx->x_ceed); CeedVectorDestroy(&op_apply_ctx->y_ceed); @@ -993,6 +828,6 @@ int main(int argc, char **argv) { CeedBasisDestroy(&basis_u); CeedBasisDestroy(&basis_x); 
CeedDestroy(&ceed); - ierr = PetscFree(op_apply_ctx); CHKERRQ(ierr); + PetscCall(PetscFree(op_apply_ctx)); return PetscFinalize(); } diff --git a/examples/petsc/bpssphere.c b/examples/petsc/bpssphere.c index c1bc4c3583..97c2e62069 100644 --- a/examples/petsc/bpssphere.c +++ b/examples/petsc/bpssphere.c @@ -34,22 +34,22 @@ /// and bpsdmplex.c for an implementation using an unstructured grid. static const char help[] = "Solve CEED BPs on a sphere using DMPlex in PETSc\n"; -#include -#include +#include "bpssphere.h" + #include #include #include #include +#include +#include -#include "bpssphere.h" -#include "include/sphereproblemdata.h" +#include "include/libceedsetup.h" +#include "include/matops.h" #include "include/petscutils.h" #include "include/petscversion.h" -#include "include/matops.h" -#include "include/libceedsetup.h" - +#include "include/sphereproblemdata.h" -#if PETSC_VERSION_LT(3,12,0) +#if PETSC_VERSION_LT(3, 12, 0) #ifdef PETSC_HAVE_CUDA #include // Note: With PETSc prior to version 3.12.0, providing the source path to @@ -58,306 +58,256 @@ static const char help[] = "Solve CEED BPs on a sphere using DMPlex in PETSc\n"; #endif int main(int argc, char **argv) { - PetscInt ierr; - MPI_Comm comm; - char ceed_resource[PETSC_MAX_PATH_LEN] = "/cpu/self", - filename[PETSC_MAX_PATH_LEN]; - double my_rt_start, my_rt, rt_min, rt_max; - PetscInt degree = 3, q_extra, l_size, g_size, topo_dim = 2, num_comp_x = 3, - num_comp_u = 1, xl_size; - PetscScalar *r; - PetscBool test_mode, benchmark_mode, read_mesh, write_solution, simplex; - PetscLogStage solve_stage; - Vec X, X_loc, rhs, rhs_loc; - Mat mat_O; - KSP ksp; - DM dm; + MPI_Comm comm; + char ceed_resource[PETSC_MAX_PATH_LEN] = "/cpu/self", filename[PETSC_MAX_PATH_LEN]; + double my_rt_start, my_rt, rt_min, rt_max; + PetscInt degree = 3, q_extra, l_size, g_size, topo_dim = 2, num_comp_x = 3, num_comp_u = 1, xl_size; + PetscScalar *r; + PetscBool test_mode, benchmark_mode, read_mesh, write_solution, simplex; + 
PetscLogStage solve_stage; + Vec X, X_loc, rhs, rhs_loc; + Mat mat_O; + KSP ksp; + DM dm; OperatorApplyContext op_apply_ctx, op_error_ctx; - Ceed ceed; - CeedData ceed_data; - CeedQFunction qf_error; - CeedOperator op_error; - CeedVector rhs_ceed, target; - BPType bp_choice; - VecType vec_type; - PetscMemType mem_type; - - ierr = PetscInitialize(&argc, &argv, NULL, help); - if (ierr) return ierr; + Ceed ceed; + CeedData ceed_data; + CeedQFunction qf_error; + CeedOperator op_error; + CeedVector rhs_ceed, target; + BPType bp_choice; + VecType vec_type; + PetscMemType mem_type; + + PetscCall(PetscInitialize(&argc, &argv, NULL, help)); comm = PETSC_COMM_WORLD; // Read command line options PetscOptionsBegin(comm, NULL, "CEED BPs in PETSc", NULL); bp_choice = CEED_BP1; - ierr = PetscOptionsEnum("-problem", - "CEED benchmark problem to solve", NULL, - bp_types, (PetscEnum)bp_choice, (PetscEnum *)&bp_choice, - NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsEnum("-problem", "CEED benchmark problem to solve", NULL, bp_types, (PetscEnum)bp_choice, (PetscEnum *)&bp_choice, NULL)); num_comp_u = bp_options[bp_choice].num_comp_u; - test_mode = PETSC_FALSE; - ierr = PetscOptionsBool("-test", - "Testing mode (do not print unless error is large)", - NULL, test_mode, &test_mode, NULL); CHKERRQ(ierr); + test_mode = PETSC_FALSE; + PetscCall(PetscOptionsBool("-test", "Testing mode (do not print unless error is large)", NULL, test_mode, &test_mode, NULL)); benchmark_mode = PETSC_FALSE; - ierr = PetscOptionsBool("-benchmark", - "Benchmarking mode (prints benchmark statistics)", - NULL, benchmark_mode, &benchmark_mode, NULL); - CHKERRQ(ierr); + PetscCall(PetscOptionsBool("-benchmark", "Benchmarking mode (prints benchmark statistics)", NULL, benchmark_mode, &benchmark_mode, NULL)); write_solution = PETSC_FALSE; - ierr = PetscOptionsBool("-write_solution", - "Write solution for visualization", - NULL, write_solution, &write_solution, NULL); - CHKERRQ(ierr); + 
PetscCall(PetscOptionsBool("-write_solution", "Write solution for visualization", NULL, write_solution, &write_solution, NULL)); degree = test_mode ? 3 : 2; - ierr = PetscOptionsInt("-degree", "Polynomial degree of tensor product basis", - NULL, degree, &degree, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsInt("-degree", "Polynomial degree of tensor product basis", NULL, degree, &degree, NULL)); q_extra = bp_options[bp_choice].q_extra; - ierr = PetscOptionsInt("-q_extra", "Number of extra quadrature points", - NULL, q_extra, &q_extra, NULL); CHKERRQ(ierr); - ierr = PetscOptionsString("-ceed", "CEED resource specifier", - NULL, ceed_resource, ceed_resource, - sizeof(ceed_resource), NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsInt("-q_extra", "Number of extra quadrature points", NULL, q_extra, &q_extra, NULL)); + PetscCall(PetscOptionsString("-ceed", "CEED resource specifier", NULL, ceed_resource, ceed_resource, sizeof(ceed_resource), NULL)); read_mesh = PETSC_FALSE; - ierr = PetscOptionsString("-mesh", "Read mesh from file", NULL, - filename, filename, sizeof(filename), &read_mesh); - CHKERRQ(ierr); + PetscCall(PetscOptionsString("-mesh", "Read mesh from file", NULL, filename, filename, sizeof(filename), &read_mesh)); simplex = PETSC_FALSE; - ierr = PetscOptionsBool("-simplex", "Use simplices, or tensor product cells", - NULL, simplex, &simplex, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsBool("-simplex", "Use simplices, or tensor product cells", NULL, simplex, &simplex, NULL)); PetscOptionsEnd(); // Setup DM if (read_mesh) { - ierr = DMPlexCreateFromFile(PETSC_COMM_WORLD, filename, NULL, PETSC_TRUE, - &dm); - CHKERRQ(ierr); + PetscCall(DMPlexCreateFromFile(PETSC_COMM_WORLD, filename, NULL, PETSC_TRUE, &dm)); } else { // Create the mesh as a 0-refined sphere. This will create a cubic surface, // not a box, and will snap to the unit sphere upon refinement. 
- ierr = DMPlexCreateSphereMesh(PETSC_COMM_WORLD, topo_dim, simplex, 1., &dm); - CHKERRQ(ierr); + PetscCall(DMPlexCreateSphereMesh(PETSC_COMM_WORLD, topo_dim, simplex, 1., &dm)); // Set the object name - ierr = PetscObjectSetName((PetscObject)dm, "Sphere"); CHKERRQ(ierr); + PetscCall(PetscObjectSetName((PetscObject)dm, "Sphere")); // Refine DMPlex with uniform refinement using runtime option -dm_refine - ierr = DMPlexSetRefinementUniform(dm, PETSC_TRUE); CHKERRQ(ierr); + PetscCall(DMPlexSetRefinementUniform(dm, PETSC_TRUE)); } - ierr = DMSetFromOptions(dm); CHKERRQ(ierr); + PetscCall(DMSetFromOptions(dm)); // View DMPlex via runtime option - ierr = DMViewFromOptions(dm, NULL, "-dm_view"); CHKERRQ(ierr); + PetscCall(DMViewFromOptions(dm, NULL, "-dm_view")); // Create DM - ierr = SetupDMByDegree(dm, degree, q_extra, num_comp_u, topo_dim, false); - CHKERRQ(ierr); + PetscCall(SetupDMByDegree(dm, degree, q_extra, num_comp_u, topo_dim, false)); // Create vectors - ierr = DMCreateGlobalVector(dm, &X); CHKERRQ(ierr); - ierr = VecGetLocalSize(X, &l_size); CHKERRQ(ierr); - ierr = VecGetSize(X, &g_size); CHKERRQ(ierr); - ierr = DMCreateLocalVector(dm, &X_loc); CHKERRQ(ierr); - ierr = VecGetSize(X_loc, &xl_size); CHKERRQ(ierr); - ierr = VecDuplicate(X, &rhs); CHKERRQ(ierr); + PetscCall(DMCreateGlobalVector(dm, &X)); + PetscCall(VecGetLocalSize(X, &l_size)); + PetscCall(VecGetSize(X, &g_size)); + PetscCall(DMCreateLocalVector(dm, &X_loc)); + PetscCall(VecGetSize(X_loc, &xl_size)); + PetscCall(VecDuplicate(X, &rhs)); // Operator - ierr = PetscMalloc1(1, &op_apply_ctx); CHKERRQ(ierr); - ierr = PetscMalloc1(1, &op_error_ctx); CHKERRQ(ierr); - ierr = MatCreateShell(comm, l_size, l_size, g_size, g_size, - op_apply_ctx, &mat_O); CHKERRQ(ierr); - ierr = MatShellSetOperation(mat_O, MATOP_MULT, - (void(*)(void))MatMult_Ceed); CHKERRQ(ierr); + PetscCall(PetscMalloc1(1, &op_apply_ctx)); + PetscCall(PetscMalloc1(1, &op_error_ctx)); + PetscCall(MatCreateShell(comm, l_size, l_size, g_size, 
g_size, op_apply_ctx, &mat_O)); + PetscCall(MatShellSetOperation(mat_O, MATOP_MULT, (void (*)(void))MatMult_Ceed)); // Set up libCEED CeedInit(ceed_resource, &ceed); CeedMemType mem_type_backend; CeedGetPreferredMemType(ceed, &mem_type_backend); - ierr = DMGetVecType(dm, &vec_type); CHKERRQ(ierr); - if (!vec_type) { // Not yet set by user -dm_vec_type + PetscCall(DMGetVecType(dm, &vec_type)); + if (!vec_type) { // Not yet set by user -dm_vec_type switch (mem_type_backend) { - case CEED_MEM_HOST: vec_type = VECSTANDARD; break; - case CEED_MEM_DEVICE: { - const char *resolved; - CeedGetResource(ceed, &resolved); - if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA; - else if (strstr(resolved, "/gpu/hip/occa")) - vec_type = VECSTANDARD; // https://github.com/CEED/libCEED/issues/678 - else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP; - else vec_type = VECSTANDARD; - } + case CEED_MEM_HOST: + vec_type = VECSTANDARD; + break; + case CEED_MEM_DEVICE: { + const char *resolved; + CeedGetResource(ceed, &resolved); + if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA; + else if (strstr(resolved, "/gpu/hip/occa")) vec_type = VECSTANDARD; // https://github.com/CEED/libCEED/issues/678 + else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP; + else vec_type = VECSTANDARD; + } } - ierr = DMSetVecType(dm, vec_type); CHKERRQ(ierr); + PetscCall(DMSetVecType(dm, vec_type)); } // Print summary if (!test_mode) { - PetscInt P = degree + 1, Q = P + q_extra; + PetscInt P = degree + 1, Q = P + q_extra; const char *used_resource; CeedGetResource(ceed, &used_resource); - ierr = PetscPrintf(comm, - "\n-- CEED Benchmark Problem %" CeedInt_FMT - " on the Sphere -- libCEED + PETSc --\n" - " libCEED:\n" - " libCEED Backend : %s\n" - " libCEED Backend MemType : %s\n" - " Mesh:\n" - " Solution Order (P) : %" CeedInt_FMT "\n" - " Quadrature Order (Q) : %" CeedInt_FMT "\n" - " Additional quadrature points (q_extra) : %" CeedInt_FMT "\n" - " Global nodes : %" PetscInt_FMT "\n", - 
bp_choice+1, ceed_resource, CeedMemTypes[mem_type_backend], P, Q, - q_extra, g_size/num_comp_u); CHKERRQ(ierr); + PetscCall(PetscPrintf(comm, + "\n-- CEED Benchmark Problem %" CeedInt_FMT " on the Sphere -- libCEED + PETSc --\n" + " libCEED:\n" + " libCEED Backend : %s\n" + " libCEED Backend MemType : %s\n" + " Mesh:\n" + " Solution Order (P) : %" CeedInt_FMT "\n" + " Quadrature Order (Q) : %" CeedInt_FMT "\n" + " Additional quadrature points (q_extra) : %" CeedInt_FMT "\n" + " Global nodes : %" PetscInt_FMT "\n", + bp_choice + 1, ceed_resource, CeedMemTypes[mem_type_backend], P, Q, q_extra, g_size / num_comp_u)); } // Create RHS vector - ierr = VecDuplicate(X_loc, &rhs_loc); CHKERRQ(ierr); - ierr = VecZeroEntries(rhs_loc); CHKERRQ(ierr); - ierr = VecGetArrayAndMemType(rhs_loc, &r, &mem_type); CHKERRQ(ierr); + PetscCall(VecDuplicate(X_loc, &rhs_loc)); + PetscCall(VecZeroEntries(rhs_loc)); + PetscCall(VecGetArrayAndMemType(rhs_loc, &r, &mem_type)); CeedVectorCreate(ceed, xl_size, &rhs_ceed); CeedVectorSetArray(rhs_ceed, MemTypeP2C(mem_type), CEED_USE_POINTER, r); // Setup libCEED's objects - ierr = PetscMalloc1(1, &ceed_data); CHKERRQ(ierr); - ierr = SetupLibceedByDegree(dm, ceed, degree, topo_dim, q_extra, num_comp_x, - num_comp_u, g_size, xl_size, bp_options[bp_choice], - ceed_data, true, rhs_ceed, &target); CHKERRQ(ierr); + PetscCall(PetscMalloc1(1, &ceed_data)); + PetscCall(SetupLibceedByDegree(dm, ceed, degree, topo_dim, q_extra, num_comp_x, num_comp_u, g_size, xl_size, bp_options[bp_choice], ceed_data, true, + rhs_ceed, &target)); // Gather RHS CeedVectorTakeArray(rhs_ceed, MemTypeP2C(mem_type), NULL); - ierr = VecRestoreArrayAndMemType(rhs_loc, &r); CHKERRQ(ierr); - ierr = VecZeroEntries(rhs); CHKERRQ(ierr); - ierr = DMLocalToGlobal(dm, rhs_loc, ADD_VALUES, rhs); CHKERRQ(ierr); + PetscCall(VecRestoreArrayAndMemType(rhs_loc, &r)); + PetscCall(VecZeroEntries(rhs)); + PetscCall(DMLocalToGlobal(dm, rhs_loc, ADD_VALUES, rhs)); CeedVectorDestroy(&rhs_ceed); // 
Create the error Q-function - CeedQFunctionCreateInterior(ceed, 1, bp_options[bp_choice].error, - bp_options[bp_choice].error_loc, &qf_error); + CeedQFunctionCreateInterior(ceed, 1, bp_options[bp_choice].error, bp_options[bp_choice].error_loc, &qf_error); CeedQFunctionAddInput(qf_error, "u", num_comp_u, CEED_EVAL_INTERP); CeedQFunctionAddInput(qf_error, "true_soln", num_comp_u, CEED_EVAL_NONE); - CeedQFunctionAddInput(qf_error, "qdata", ceed_data->q_data_size, - CEED_EVAL_NONE); + CeedQFunctionAddInput(qf_error, "qdata", ceed_data->q_data_size, CEED_EVAL_NONE); CeedQFunctionAddOutput(qf_error, "error", num_comp_u, CEED_EVAL_INTERP); // Create the error operator CeedOperatorCreate(ceed, qf_error, NULL, NULL, &op_error); - CeedOperatorSetField(op_error, "u", ceed_data->elem_restr_u, - ceed_data->basis_u, CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op_error, "true_soln", ceed_data->elem_restr_u_i, - CEED_BASIS_COLLOCATED, target); - CeedOperatorSetField(op_error, "qdata", ceed_data->elem_restr_qd_i, - CEED_BASIS_COLLOCATED, ceed_data->q_data); - CeedOperatorSetField(op_error, "error", ceed_data->elem_restr_u, - ceed_data->basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_error, "u", ceed_data->elem_restr_u, ceed_data->basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_error, "true_soln", ceed_data->elem_restr_u_i, CEED_BASIS_COLLOCATED, target); + CeedOperatorSetField(op_error, "qdata", ceed_data->elem_restr_qd_i, CEED_BASIS_COLLOCATED, ceed_data->q_data); + CeedOperatorSetField(op_error, "error", ceed_data->elem_restr_u, ceed_data->basis_u, CEED_VECTOR_ACTIVE); // Set up apply operator context - ierr = SetupApplyOperatorCtx(comm, dm, ceed, - ceed_data, X_loc, - op_apply_ctx); CHKERRQ(ierr); + PetscCall(SetupApplyOperatorCtx(comm, dm, ceed, ceed_data, X_loc, op_apply_ctx)); // Setup solver - ierr = KSPCreate(comm, &ksp); CHKERRQ(ierr); + PetscCall(KSPCreate(comm, &ksp)); { PC pc; - ierr = KSPGetPC(ksp, &pc); CHKERRQ(ierr); + PetscCall(KSPGetPC(ksp, &pc)); 
if (bp_choice == CEED_BP1 || bp_choice == CEED_BP2) { - ierr = PCSetType(pc, PCJACOBI); CHKERRQ(ierr); - ierr = PCJacobiSetType(pc, PC_JACOBI_ROWSUM); CHKERRQ(ierr); + PetscCall(PCSetType(pc, PCJACOBI)); + PetscCall(PCJacobiSetType(pc, PC_JACOBI_ROWSUM)); } else { - ierr = PCSetType(pc, PCNONE); CHKERRQ(ierr); + PetscCall(PCSetType(pc, PCNONE)); MatNullSpace nullspace; - ierr = MatNullSpaceCreate(PETSC_COMM_WORLD, PETSC_TRUE, 0, 0, &nullspace); - CHKERRQ(ierr); - ierr = MatSetNullSpace(mat_O, nullspace); CHKERRQ(ierr); - ierr = MatNullSpaceDestroy(&nullspace); CHKERRQ(ierr); + PetscCall(MatNullSpaceCreate(PETSC_COMM_WORLD, PETSC_TRUE, 0, 0, &nullspace)); + PetscCall(MatSetNullSpace(mat_O, nullspace)); + PetscCall(MatNullSpaceDestroy(&nullspace)); } - ierr = KSPSetType(ksp, KSPCG); CHKERRQ(ierr); - ierr = KSPSetNormType(ksp, KSP_NORM_NATURAL); CHKERRQ(ierr); - ierr = KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT, - PETSC_DEFAULT); CHKERRQ(ierr); + PetscCall(KSPSetType(ksp, KSPCG)); + PetscCall(KSPSetNormType(ksp, KSP_NORM_NATURAL)); + PetscCall(KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT, PETSC_DEFAULT)); } - ierr = KSPSetFromOptions(ksp); CHKERRQ(ierr); - ierr = KSPSetOperators(ksp, mat_O, mat_O); CHKERRQ(ierr); + PetscCall(KSPSetFromOptions(ksp)); + PetscCall(KSPSetOperators(ksp, mat_O, mat_O)); // First run, if benchmarking if (benchmark_mode) { - ierr = KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT, 1); - CHKERRQ(ierr); + PetscCall(KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT, 1)); my_rt_start = MPI_Wtime(); - ierr = KSPSolve(ksp, rhs, X); CHKERRQ(ierr); + PetscCall(KSPSolve(ksp, rhs, X)); my_rt = MPI_Wtime() - my_rt_start; - ierr = MPI_Allreduce(MPI_IN_PLACE, &my_rt, 1, MPI_DOUBLE, MPI_MIN, comm); - CHKERRQ(ierr); + PetscCall(MPI_Allreduce(MPI_IN_PLACE, &my_rt, 1, MPI_DOUBLE, MPI_MIN, comm)); // Set maxits based on first iteration timing if (my_rt > 0.02) { - ierr = KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, 
PETSC_DEFAULT, 5); - CHKERRQ(ierr); + PetscCall(KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT, 5)); } else { - ierr = KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT, 20); - CHKERRQ(ierr); + PetscCall(KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT, 20)); } } // Timed solve - ierr = VecZeroEntries(X); CHKERRQ(ierr); - ierr = PetscBarrier((PetscObject)ksp); CHKERRQ(ierr); + PetscCall(VecZeroEntries(X)); + PetscCall(PetscBarrier((PetscObject)ksp)); // -- Performance logging - ierr = PetscLogStageRegister("Solve Stage", &solve_stage); CHKERRQ(ierr); - ierr = PetscLogStagePush(solve_stage); CHKERRQ(ierr); + PetscCall(PetscLogStageRegister("Solve Stage", &solve_stage)); + PetscCall(PetscLogStagePush(solve_stage)); // -- Solve my_rt_start = MPI_Wtime(); - ierr = KSPSolve(ksp, rhs, X); CHKERRQ(ierr); + PetscCall(KSPSolve(ksp, rhs, X)); my_rt = MPI_Wtime() - my_rt_start; // -- Performance logging - ierr = PetscLogStagePop(); + PetscCall(PetscLogStagePop()); // Output results { - KSPType ksp_type; + KSPType ksp_type; KSPConvergedReason reason; - PetscReal rnorm; - PetscInt its; - ierr = KSPGetType(ksp, &ksp_type); CHKERRQ(ierr); - ierr = KSPGetConvergedReason(ksp, &reason); CHKERRQ(ierr); - ierr = KSPGetIterationNumber(ksp, &its); CHKERRQ(ierr); - ierr = KSPGetResidualNorm(ksp, &rnorm); CHKERRQ(ierr); + PetscReal rnorm; + PetscInt its; + PetscCall(KSPGetType(ksp, &ksp_type)); + PetscCall(KSPGetConvergedReason(ksp, &reason)); + PetscCall(KSPGetIterationNumber(ksp, &its)); + PetscCall(KSPGetResidualNorm(ksp, &rnorm)); if (!test_mode || reason < 0 || rnorm > 1e-8) { - ierr = PetscPrintf(comm, - " KSP:\n" - " KSP Type : %s\n" - " KSP Convergence : %s\n" - " Total KSP Iterations : %" PetscInt_FMT "\n" - " Final rnorm : %e\n", - ksp_type, KSPConvergedReasons[reason], its, - (double)rnorm); CHKERRQ(ierr); + PetscCall(PetscPrintf(comm, + " KSP:\n" + " KSP Type : %s\n" + " KSP Convergence : %s\n" + " Total KSP Iterations : %" PetscInt_FMT "\n" + " 
Final rnorm : %e\n", + ksp_type, KSPConvergedReasons[reason], its, (double)rnorm)); } if (!test_mode) { - ierr = PetscPrintf(comm," Performance:\n"); CHKERRQ(ierr); + PetscCall(PetscPrintf(comm, " Performance:\n")); } { // Set up error operator context - ierr = SetupErrorOperatorCtx(comm, dm, ceed, - ceed_data, X_loc, op_error, - op_error_ctx); CHKERRQ(ierr); + PetscCall(SetupErrorOperatorCtx(comm, dm, ceed, ceed_data, X_loc, op_error, op_error_ctx)); PetscScalar l2_error; - ierr = ComputeL2Error(X, &l2_error, op_error_ctx); CHKERRQ(ierr); + PetscCall(ComputeL2Error(X, &l2_error, op_error_ctx)); PetscReal tol = 5e-4; if (!test_mode || l2_error > tol) { - ierr = MPI_Allreduce(&my_rt, &rt_min, 1, MPI_DOUBLE, MPI_MIN, comm); - CHKERRQ(ierr); - ierr = MPI_Allreduce(&my_rt, &rt_max, 1, MPI_DOUBLE, MPI_MAX, comm); - CHKERRQ(ierr); - ierr = PetscPrintf(comm, - " L2 Error : %e\n" - " CG Solve Time : %g (%g) sec\n", - (double)l2_error, rt_max, rt_min); CHKERRQ(ierr); + PetscCall(MPI_Allreduce(&my_rt, &rt_min, 1, MPI_DOUBLE, MPI_MIN, comm)); + PetscCall(MPI_Allreduce(&my_rt, &rt_max, 1, MPI_DOUBLE, MPI_MAX, comm)); + PetscCall(PetscPrintf(comm, + " L2 Error : %e\n" + " CG Solve Time : %g (%g) sec\n", + (double)l2_error, rt_max, rt_min)); } } if (benchmark_mode && (!test_mode)) { - ierr = PetscPrintf(comm, - " DoFs/Sec in CG : %g (%g) million\n", - 1e-6*g_size*its/rt_max, 1e-6*g_size*its/rt_min); CHKERRQ(ierr); + PetscCall(PetscPrintf(comm, " DoFs/Sec in CG : %g (%g) million\n", 1e-6 * g_size * its / rt_max, + 1e-6 * g_size * its / rt_min)); } } @@ -365,27 +315,27 @@ int main(int argc, char **argv) { if (write_solution) { PetscViewer vtk_viewer_soln; - ierr = PetscViewerCreate(comm, &vtk_viewer_soln); CHKERRQ(ierr); - ierr = PetscViewerSetType(vtk_viewer_soln, PETSCVIEWERVTK); CHKERRQ(ierr); - ierr = PetscViewerFileSetName(vtk_viewer_soln, "solution.vtu"); CHKERRQ(ierr); - ierr = VecView(X, vtk_viewer_soln); CHKERRQ(ierr); - ierr = PetscViewerDestroy(&vtk_viewer_soln); 
CHKERRQ(ierr); + PetscCall(PetscViewerCreate(comm, &vtk_viewer_soln)); + PetscCall(PetscViewerSetType(vtk_viewer_soln, PETSCVIEWERVTK)); + PetscCall(PetscViewerFileSetName(vtk_viewer_soln, "solution.vtu")); + PetscCall(VecView(X, vtk_viewer_soln)); + PetscCall(PetscViewerDestroy(&vtk_viewer_soln)); } // Cleanup - ierr = VecDestroy(&X); CHKERRQ(ierr); - ierr = VecDestroy(&X_loc); CHKERRQ(ierr); - ierr = VecDestroy(&op_apply_ctx->Y_loc); CHKERRQ(ierr); - ierr = VecDestroy(&op_error_ctx->Y_loc); CHKERRQ(ierr); - ierr = MatDestroy(&mat_O); CHKERRQ(ierr); - ierr = PetscFree(op_apply_ctx); CHKERRQ(ierr); - ierr = PetscFree(op_error_ctx); CHKERRQ(ierr); - ierr = CeedDataDestroy(0, ceed_data); CHKERRQ(ierr); - ierr = DMDestroy(&dm); CHKERRQ(ierr); - - ierr = VecDestroy(&rhs); CHKERRQ(ierr); - ierr = VecDestroy(&rhs_loc); CHKERRQ(ierr); - ierr = KSPDestroy(&ksp); CHKERRQ(ierr); + PetscCall(VecDestroy(&X)); + PetscCall(VecDestroy(&X_loc)); + PetscCall(VecDestroy(&op_apply_ctx->Y_loc)); + PetscCall(VecDestroy(&op_error_ctx->Y_loc)); + PetscCall(MatDestroy(&mat_O)); + PetscCall(PetscFree(op_apply_ctx)); + PetscCall(PetscFree(op_error_ctx)); + PetscCall(CeedDataDestroy(0, ceed_data)); + PetscCall(DMDestroy(&dm)); + + PetscCall(VecDestroy(&rhs)); + PetscCall(VecDestroy(&rhs_loc)); + PetscCall(KSPDestroy(&ksp)); CeedVectorDestroy(&target); CeedQFunctionDestroy(&qf_error); CeedOperatorDestroy(&op_error); diff --git a/examples/petsc/bpssphere.h b/examples/petsc/bpssphere.h index e9754ebd28..17da1c0620 100644 --- a/examples/petsc/bpssphere.h +++ b/examples/petsc/bpssphere.h @@ -12,8 +12,6 @@ // Command Line Options // ----------------------------------------------------------------------------- -static const char *const bp_types[] = {"bp1", "bp2", "bp3", "bp4", "bp5", "bp6", - "BPType", "CEED_BP", 0 - }; +static const char *const bp_types[] = {"bp1", "bp2", "bp3", "bp4", "bp5", "bp6", "BPType", "CEED_BP", 0}; -#endif // libceed_petsc_examples_sphere_h +#endif // 
libceed_petsc_examples_sphere_h diff --git a/examples/petsc/include/areaproblemdata.h b/examples/petsc/include/areaproblemdata.h index e79fac6f80..699ac6a70a 100644 --- a/examples/petsc/include/areaproblemdata.h +++ b/examples/petsc/include/areaproblemdata.h @@ -3,6 +3,7 @@ #include #include + #include "../include/structs.h" #include "../qfunctions/area/areacube.h" #include "../qfunctions/area/areasphere.h" @@ -12,41 +13,40 @@ // ----------------------------------------------------------------------------- // Problem options -typedef enum { - CUBE = 0, SPHERE = 1 -} ProblemType; +typedef enum { CUBE = 0, SPHERE = 1 } ProblemType; static BPData problem_options[6] = { - [CUBE] = { - .num_comp_x = 3, - .num_comp_u = 1, - .topo_dim = 2, - .q_data_size = 1, - .q_extra = 1, - .setup_geo = SetupMassGeoCube, - .apply = Mass, - .setup_geo_loc = SetupMassGeoCube_loc, - .apply_loc = Mass_loc, - .in_mode = CEED_EVAL_INTERP, - .out_mode = CEED_EVAL_INTERP, - .q_mode = CEED_GAUSS, - .enforce_bc = PETSC_FALSE, - }, - [SPHERE] = { - .num_comp_x = 3, - .num_comp_u = 1, - .topo_dim = 2, - .q_data_size = 1, - .q_extra = 1, - .setup_geo = SetupMassGeoSphere, - .apply = Mass, - .setup_geo_loc = SetupMassGeoSphere_loc, - .apply_loc = Mass_loc, - .in_mode = CEED_EVAL_INTERP, - .out_mode = CEED_EVAL_INTERP, - .q_mode = CEED_GAUSS, - .enforce_bc = PETSC_FALSE, - } + [CUBE] = + { + .num_comp_x = 3, + .num_comp_u = 1, + .topo_dim = 2, + .q_data_size = 1, + .q_extra = 1, + .setup_geo = SetupMassGeoCube, + .apply = Mass, + .setup_geo_loc = SetupMassGeoCube_loc, + .apply_loc = Mass_loc, + .in_mode = CEED_EVAL_INTERP, + .out_mode = CEED_EVAL_INTERP, + .q_mode = CEED_GAUSS, + .enforce_bc = PETSC_FALSE, + }, + [SPHERE] = { + .num_comp_x = 3, + .num_comp_u = 1, + .topo_dim = 2, + .q_data_size = 1, + .q_extra = 1, + .setup_geo = SetupMassGeoSphere, + .apply = Mass, + .setup_geo_loc = SetupMassGeoSphere_loc, + .apply_loc = Mass_loc, + .in_mode = CEED_EVAL_INTERP, + .out_mode = CEED_EVAL_INTERP, + 
.q_mode = CEED_GAUSS, + .enforce_bc = PETSC_FALSE, + } }; -#endif // libceed_petsc_examples_area_problem_data_h +#endif // libceed_petsc_examples_area_problem_data_h diff --git a/examples/petsc/include/bpsproblemdata.h b/examples/petsc/include/bpsproblemdata.h index 5ee8ffee33..2df9c8b5ce 100644 --- a/examples/petsc/include/bpsproblemdata.h +++ b/examples/petsc/include/bpsproblemdata.h @@ -3,6 +3,7 @@ #include #include + #include "../include/structs.h" #include "../qfunctions/bps/bp1.h" #include "../qfunctions/bps/bp2.h" @@ -15,120 +16,108 @@ // ----------------------------------------------------------------------------- BPData bp_options[6] = { - [CEED_BP1] = { - .num_comp_u = 1, - .num_comp_x = 3, - .topo_dim = 3, - .q_data_size = 1, - .q_extra = 1, - .setup_geo = SetupMassGeo, - .setup_rhs = SetupMassRhs, - .apply = Mass, - .error = Error, - .setup_geo_loc = SetupMassGeo_loc, - .setup_rhs_loc = SetupMassRhs_loc, - .apply_loc = Mass_loc, - .error_loc = Error_loc, - .in_mode = CEED_EVAL_INTERP, - .out_mode = CEED_EVAL_INTERP, - .q_mode = CEED_GAUSS, - .enforce_bc = PETSC_FALSE - }, - [CEED_BP2] = { - .num_comp_u = 3, - .num_comp_x = 3, - .topo_dim = 3, - .q_data_size = 1, - .q_extra = 1, - .setup_geo = SetupMassGeo, - .setup_rhs = SetupMassRhs3, - .apply = Mass3, - .error = Error3, - .setup_geo_loc = SetupMassGeo_loc, - .setup_rhs_loc = SetupMassRhs3_loc, - .apply_loc = Mass3_loc, - .error_loc = Error3_loc, - .in_mode = CEED_EVAL_INTERP, - .out_mode = CEED_EVAL_INTERP, - .q_mode = CEED_GAUSS, - .enforce_bc = PETSC_FALSE - }, - [CEED_BP3] = { - .num_comp_u = 1, - .num_comp_x = 3, - .topo_dim = 3, - .q_data_size = 7, - .q_extra = 1, - .setup_geo = SetupDiffGeo, - .setup_rhs = SetupDiffRhs, - .apply = Diff, - .error = Error, - .setup_geo_loc = SetupDiffGeo_loc, - .setup_rhs_loc = SetupDiffRhs_loc, - .apply_loc = Diff_loc, - .error_loc = Error_loc, - .in_mode = CEED_EVAL_GRAD, - .out_mode = CEED_EVAL_GRAD, - .q_mode = CEED_GAUSS, - .enforce_bc = PETSC_TRUE - }, - 
[CEED_BP4] = { - .num_comp_u = 3, - .num_comp_x = 3, - .topo_dim = 3, - .q_data_size = 7, - .q_extra = 1, - .setup_geo = SetupDiffGeo, - .setup_rhs = SetupDiffRhs3, - .apply = Diff3, - .error = Error3, - .setup_geo_loc = SetupDiffGeo_loc, - .setup_rhs_loc = SetupDiffRhs3_loc, - .apply_loc = Diff3_loc, - .error_loc = Error3_loc, - .in_mode = CEED_EVAL_GRAD, - .out_mode = CEED_EVAL_GRAD, - .q_mode = CEED_GAUSS, - .enforce_bc = PETSC_TRUE - }, - [CEED_BP5] = { - .num_comp_u = 1, - .num_comp_x = 3, - .topo_dim = 3, - .q_data_size = 7, - .q_extra = 0, - .setup_geo = SetupDiffGeo, - .setup_rhs = SetupDiffRhs, - .apply = Diff, - .error = Error, - .setup_geo_loc = SetupDiffGeo_loc, - .setup_rhs_loc = SetupDiffRhs_loc, - .apply_loc = Diff_loc, - .error_loc = Error_loc, - .in_mode = CEED_EVAL_GRAD, - .out_mode = CEED_EVAL_GRAD, - .q_mode = CEED_GAUSS_LOBATTO, - .enforce_bc = PETSC_TRUE - }, - [CEED_BP6] = { - .num_comp_u = 3, - .num_comp_x = 3, - .topo_dim = 3, - .q_data_size = 7, - .q_extra = 0, - .setup_geo = SetupDiffGeo, - .setup_rhs = SetupDiffRhs3, - .apply = Diff3, - .error = Error3, - .setup_geo_loc = SetupDiffGeo_loc, - .setup_rhs_loc = SetupDiffRhs3_loc, - .apply_loc = Diff3_loc, - .error_loc = Error3_loc, - .in_mode = CEED_EVAL_GRAD, - .out_mode = CEED_EVAL_GRAD, - .q_mode = CEED_GAUSS_LOBATTO, - .enforce_bc = PETSC_TRUE - } + [CEED_BP1] = {.num_comp_u = 1, + .num_comp_x = 3, + .topo_dim = 3, + .q_data_size = 1, + .q_extra = 1, + .setup_geo = SetupMassGeo, + .setup_rhs = SetupMassRhs, + .apply = Mass, + .error = Error, + .setup_geo_loc = SetupMassGeo_loc, + .setup_rhs_loc = SetupMassRhs_loc, + .apply_loc = Mass_loc, + .error_loc = Error_loc, + .in_mode = CEED_EVAL_INTERP, + .out_mode = CEED_EVAL_INTERP, + .q_mode = CEED_GAUSS, + .enforce_bc = PETSC_FALSE}, + [CEED_BP2] = {.num_comp_u = 3, + .num_comp_x = 3, + .topo_dim = 3, + .q_data_size = 1, + .q_extra = 1, + .setup_geo = SetupMassGeo, + .setup_rhs = SetupMassRhs3, + .apply = Mass3, + .error = Error3, + 
.setup_geo_loc = SetupMassGeo_loc, + .setup_rhs_loc = SetupMassRhs3_loc, + .apply_loc = Mass3_loc, + .error_loc = Error3_loc, + .in_mode = CEED_EVAL_INTERP, + .out_mode = CEED_EVAL_INTERP, + .q_mode = CEED_GAUSS, + .enforce_bc = PETSC_FALSE}, + [CEED_BP3] = {.num_comp_u = 1, + .num_comp_x = 3, + .topo_dim = 3, + .q_data_size = 7, + .q_extra = 1, + .setup_geo = SetupDiffGeo, + .setup_rhs = SetupDiffRhs, + .apply = Diff, + .error = Error, + .setup_geo_loc = SetupDiffGeo_loc, + .setup_rhs_loc = SetupDiffRhs_loc, + .apply_loc = Diff_loc, + .error_loc = Error_loc, + .in_mode = CEED_EVAL_GRAD, + .out_mode = CEED_EVAL_GRAD, + .q_mode = CEED_GAUSS, + .enforce_bc = PETSC_TRUE }, + [CEED_BP4] = {.num_comp_u = 3, + .num_comp_x = 3, + .topo_dim = 3, + .q_data_size = 7, + .q_extra = 1, + .setup_geo = SetupDiffGeo, + .setup_rhs = SetupDiffRhs3, + .apply = Diff3, + .error = Error3, + .setup_geo_loc = SetupDiffGeo_loc, + .setup_rhs_loc = SetupDiffRhs3_loc, + .apply_loc = Diff3_loc, + .error_loc = Error3_loc, + .in_mode = CEED_EVAL_GRAD, + .out_mode = CEED_EVAL_GRAD, + .q_mode = CEED_GAUSS, + .enforce_bc = PETSC_TRUE }, + [CEED_BP5] = {.num_comp_u = 1, + .num_comp_x = 3, + .topo_dim = 3, + .q_data_size = 7, + .q_extra = 0, + .setup_geo = SetupDiffGeo, + .setup_rhs = SetupDiffRhs, + .apply = Diff, + .error = Error, + .setup_geo_loc = SetupDiffGeo_loc, + .setup_rhs_loc = SetupDiffRhs_loc, + .apply_loc = Diff_loc, + .error_loc = Error_loc, + .in_mode = CEED_EVAL_GRAD, + .out_mode = CEED_EVAL_GRAD, + .q_mode = CEED_GAUSS_LOBATTO, + .enforce_bc = PETSC_TRUE }, + [CEED_BP6] = {.num_comp_u = 3, + .num_comp_x = 3, + .topo_dim = 3, + .q_data_size = 7, + .q_extra = 0, + .setup_geo = SetupDiffGeo, + .setup_rhs = SetupDiffRhs3, + .apply = Diff3, + .error = Error3, + .setup_geo_loc = SetupDiffGeo_loc, + .setup_rhs_loc = SetupDiffRhs3_loc, + .apply_loc = Diff3_loc, + .error_loc = Error3_loc, + .in_mode = CEED_EVAL_GRAD, + .out_mode = CEED_EVAL_GRAD, + .q_mode = CEED_GAUSS_LOBATTO, + .enforce_bc 
= PETSC_TRUE } }; -#endif // libceed_petsc_examples_bps_problem_data_h +#endif // libceed_petsc_examples_bps_problem_data_h diff --git a/examples/petsc/include/libceedsetup.h b/examples/petsc/include/libceedsetup.h index d60c144949..aedd38003e 100644 --- a/examples/petsc/include/libceedsetup.h +++ b/examples/petsc/include/libceedsetup.h @@ -7,15 +7,9 @@ #include "structs.h" PetscErrorCode CeedDataDestroy(CeedInt i, CeedData data); -PetscErrorCode SetupLibceedByDegree(DM dm, Ceed ceed, CeedInt degree, - CeedInt topo_dim, CeedInt q_extra, - PetscInt num_comp_x, PetscInt num_comp_u, - PetscInt g_size, PetscInt xl_size, - BPData bp_data, CeedData data, - PetscBool setup_rhs, CeedVector rhs_ceed, +PetscErrorCode SetupLibceedByDegree(DM dm, Ceed ceed, CeedInt degree, CeedInt topo_dim, CeedInt q_extra, PetscInt num_comp_x, PetscInt num_comp_u, + PetscInt g_size, PetscInt xl_size, BPData bp_data, CeedData data, PetscBool setup_rhs, CeedVector rhs_ceed, CeedVector *target); -PetscErrorCode CeedLevelTransferSetup(DM dm, Ceed ceed, CeedInt level, - CeedInt num_comp_u, CeedData *data, - BPData bp_data, Vec fine_mult); +PetscErrorCode CeedLevelTransferSetup(DM dm, Ceed ceed, CeedInt level, CeedInt num_comp_u, CeedData *data, BPData bp_data, Vec fine_mult); -#endif // libceed_petsc_examples_setup_h +#endif // libceed_petsc_examples_setup_h diff --git a/examples/petsc/include/matops.h b/examples/petsc/include/matops.h index 20c0799a9e..c8d2945350 100644 --- a/examples/petsc/include/matops.h +++ b/examples/petsc/include/matops.h @@ -7,11 +7,8 @@ #include "structs.h" -PetscErrorCode SetupApplyOperatorCtx(MPI_Comm comm, DM dm, Ceed ceed, - CeedData ceed_data, Vec X_loc, - OperatorApplyContext op_apply_ctx); -PetscErrorCode SetupErrorOperatorCtx(MPI_Comm comm, DM dm, Ceed ceed, - CeedData ceed_data, Vec X_loc, CeedOperator op_error, +PetscErrorCode SetupApplyOperatorCtx(MPI_Comm comm, DM dm, Ceed ceed, CeedData ceed_data, Vec X_loc, OperatorApplyContext op_apply_ctx); +PetscErrorCode 
SetupErrorOperatorCtx(MPI_Comm comm, DM dm, Ceed ceed, CeedData ceed_data, Vec X_loc, CeedOperator op_error, OperatorApplyContext op_error_ctx); PetscErrorCode MatGetDiag(Mat A, Vec D); PetscErrorCode ApplyLocal_Ceed(Vec X, Vec Y, OperatorApplyContext op_apply_ctx); @@ -19,7 +16,6 @@ PetscErrorCode MatMult_Ceed(Mat A, Vec X, Vec Y); PetscErrorCode FormResidual_Ceed(SNES snes, Vec X, Vec Y, void *ctx); PetscErrorCode MatMult_Prolong(Mat A, Vec X, Vec Y); PetscErrorCode MatMult_Restrict(Mat A, Vec X, Vec Y); -PetscErrorCode ComputeL2Error(Vec X, PetscScalar *l2_error, - OperatorApplyContext op_error_ctx); +PetscErrorCode ComputeL2Error(Vec X, PetscScalar *l2_error, OperatorApplyContext op_error_ctx); -#endif // libceed_petsc_examples_matops_h +#endif // libceed_petsc_examples_matops_h diff --git a/examples/petsc/include/petscutils.h b/examples/petsc/include/petscutils.h index 05ff9361ec..ba974b6621 100644 --- a/examples/petsc/include/petscutils.h +++ b/examples/petsc/include/petscutils.h @@ -5,25 +5,19 @@ #include #include #include + #include "structs.h" -CeedMemType MemTypeP2C(PetscMemType mtype); -PetscErrorCode Kershaw(DM dm_orig, PetscScalar eps); -PetscErrorCode SetupDMByDegree(DM dm, PetscInt p_degree, PetscInt q_extra, - PetscInt num_comp_u, PetscInt topo_dim, - bool enforce_bc); -PetscErrorCode CreateRestrictionFromPlex(Ceed ceed, DM dm, CeedInt height, - DMLabel domain_label, CeedInt value, CeedElemRestriction *elem_restr); +CeedMemType MemTypeP2C(PetscMemType mtype); +PetscErrorCode Kershaw(DM dm_orig, PetscScalar eps); +PetscErrorCode SetupDMByDegree(DM dm, PetscInt p_degree, PetscInt q_extra, PetscInt num_comp_u, PetscInt topo_dim, bool enforce_bc); +PetscErrorCode CreateRestrictionFromPlex(Ceed ceed, DM dm, CeedInt height, DMLabel domain_label, CeedInt value, CeedElemRestriction *elem_restr); CeedElemTopology ElemTopologyP2C(DMPolytopeType cell_type); -PetscErrorCode DMFieldToDSField(DM dm, DMLabel domain_label, PetscInt dm_field, - PetscInt *ds_field); 
-PetscErrorCode BasisCreateFromTabulation(Ceed ceed, DM dm, DMLabel domain_label, - PetscInt label_value, PetscInt height, PetscInt face, - PetscFE fe, PetscTabulation basis_tabulation, PetscQuadrature quadrature, - CeedBasis *basis); -PetscErrorCode CreateBasisFromPlex(Ceed ceed, DM dm, DMLabel domain_label, - CeedInt label_value, CeedInt height, - CeedInt dm_field, BPData bp_data, CeedBasis *basis); -PetscErrorCode CreateDistributedDM(RunParams rp, DM *dm); +PetscErrorCode DMFieldToDSField(DM dm, DMLabel domain_label, PetscInt dm_field, PetscInt *ds_field); +PetscErrorCode BasisCreateFromTabulation(Ceed ceed, DM dm, DMLabel domain_label, PetscInt label_value, PetscInt height, PetscInt face, PetscFE fe, + PetscTabulation basis_tabulation, PetscQuadrature quadrature, CeedBasis *basis); +PetscErrorCode CreateBasisFromPlex(Ceed ceed, DM dm, DMLabel domain_label, CeedInt label_value, CeedInt height, CeedInt dm_field, BPData bp_data, + CeedBasis *basis); +PetscErrorCode CreateDistributedDM(RunParams rp, DM *dm); -#endif // libceed_petsc_examples_utils_h +#endif // libceed_petsc_examples_utils_h diff --git a/examples/petsc/include/petscversion.h b/examples/petsc/include/petscversion.h index d808934ad4..6d0e248459 100644 --- a/examples/petsc/include/petscversion.h +++ b/examples/petsc/include/petscversion.h @@ -1,7 +1,7 @@ #ifndef libceed_petsc_examples_version_h #define libceed_petsc_examples_version_h -#if PETSC_VERSION_LT(3,17,0) +#if PETSC_VERSION_LT(3, 17, 0) #error "PETSc v3.17 or later is required" #endif diff --git a/examples/petsc/include/sphereproblemdata.h b/examples/petsc/include/sphereproblemdata.h index b7d2dffea9..2e1a1d552c 100644 --- a/examples/petsc/include/sphereproblemdata.h +++ b/examples/petsc/include/sphereproblemdata.h @@ -3,6 +3,7 @@ #include #include + #include "../include/structs.h" #include "../qfunctions/bps/bp1sphere.h" #include "../qfunctions/bps/bp2sphere.h" @@ -15,114 +16,102 @@ // 
----------------------------------------------------------------------------- static BPData bp_options[6] = { - [CEED_BP1] = { - .num_comp_u = 1, - .num_comp_x = 3, - .topo_dim = 3, - .q_data_size = 1, - .q_extra = 1, - .setup_geo = SetupMassGeo, - .setup_rhs = SetupMassRhs, - .apply = Mass, - .error = Error, - .setup_geo_loc = SetupMassGeo_loc, - .setup_rhs_loc = SetupMassRhs_loc, - .apply_loc = Mass_loc, - .error_loc = Error_loc, - .in_mode = CEED_EVAL_INTERP, - .out_mode = CEED_EVAL_INTERP, - .q_mode = CEED_GAUSS - }, - [CEED_BP2] = { - .num_comp_u = 3, - .num_comp_x = 3, - .topo_dim = 3, - .q_data_size = 1, - .q_extra = 1, - .setup_geo = SetupMassGeo, - .setup_rhs = SetupMassRhs3, - .apply = Mass3, - .error = Error3, - .setup_geo_loc = SetupMassGeo_loc, - .setup_rhs_loc = SetupMassRhs3_loc, - .apply_loc = Mass3_loc, - .error_loc = Error3_loc, - .in_mode = CEED_EVAL_INTERP, - .out_mode = CEED_EVAL_INTERP, - .q_mode = CEED_GAUSS - }, - [CEED_BP3] = { - .num_comp_u = 1, - .num_comp_x = 3, - .topo_dim = 3, - .q_data_size = 4, - .q_extra = 1, - .setup_geo = SetupDiffGeo, - .setup_rhs = SetupDiffRhs, - .apply = Diff, - .error = Error, - .setup_geo_loc = SetupDiffGeo_loc, - .setup_rhs_loc = SetupDiffRhs_loc, - .apply_loc = Diff_loc, - .error_loc = Error_loc, - .in_mode = CEED_EVAL_GRAD, - .out_mode = CEED_EVAL_GRAD, - .q_mode = CEED_GAUSS - }, - [CEED_BP4] = { - .num_comp_u = 3, - .num_comp_x = 3, - .topo_dim = 3, - .q_data_size = 4, - .q_extra = 1, - .setup_geo = SetupDiffGeo, - .setup_rhs = SetupDiffRhs3, - .apply = Diff3, - .error = Error3, - .setup_geo_loc = SetupDiffGeo_loc, - .setup_rhs_loc = SetupDiffRhs3_loc, - .apply_loc = Diff_loc, - .error_loc = Error3_loc, - .in_mode = CEED_EVAL_GRAD, - .out_mode = CEED_EVAL_GRAD, - .q_mode = CEED_GAUSS - }, - [CEED_BP5] = { - .num_comp_u = 1, - .num_comp_x = 3, - .topo_dim = 3, - .q_data_size = 4, - .q_extra = 0, - .setup_geo = SetupDiffGeo, - .setup_rhs = SetupDiffRhs, - .apply = Diff, - .error = Error, - .setup_geo_loc 
= SetupDiffGeo_loc, - .setup_rhs_loc = SetupDiffRhs_loc, - .apply_loc = Diff_loc, - .error_loc = Error_loc, - .in_mode = CEED_EVAL_GRAD, - .out_mode = CEED_EVAL_GRAD, - .q_mode = CEED_GAUSS_LOBATTO - }, - [CEED_BP6] = { - .num_comp_u = 3, - .num_comp_x = 3, - .topo_dim = 3, - .q_data_size = 4, - .q_extra = 0, - .setup_geo = SetupDiffGeo, - .setup_rhs = SetupDiffRhs3, - .apply = Diff3, - .error = Error3, - .setup_geo_loc = SetupDiffGeo_loc, - .setup_rhs_loc = SetupDiffRhs3_loc, - .apply_loc = Diff_loc, - .error_loc = Error3_loc, - .in_mode = CEED_EVAL_GRAD, - .out_mode = CEED_EVAL_GRAD, - .q_mode = CEED_GAUSS_LOBATTO - } + [CEED_BP1] = {.num_comp_u = 1, + .num_comp_x = 3, + .topo_dim = 3, + .q_data_size = 1, + .q_extra = 1, + .setup_geo = SetupMassGeo, + .setup_rhs = SetupMassRhs, + .apply = Mass, + .error = Error, + .setup_geo_loc = SetupMassGeo_loc, + .setup_rhs_loc = SetupMassRhs_loc, + .apply_loc = Mass_loc, + .error_loc = Error_loc, + .in_mode = CEED_EVAL_INTERP, + .out_mode = CEED_EVAL_INTERP, + .q_mode = CEED_GAUSS }, + [CEED_BP2] = {.num_comp_u = 3, + .num_comp_x = 3, + .topo_dim = 3, + .q_data_size = 1, + .q_extra = 1, + .setup_geo = SetupMassGeo, + .setup_rhs = SetupMassRhs3, + .apply = Mass3, + .error = Error3, + .setup_geo_loc = SetupMassGeo_loc, + .setup_rhs_loc = SetupMassRhs3_loc, + .apply_loc = Mass3_loc, + .error_loc = Error3_loc, + .in_mode = CEED_EVAL_INTERP, + .out_mode = CEED_EVAL_INTERP, + .q_mode = CEED_GAUSS }, + [CEED_BP3] = {.num_comp_u = 1, + .num_comp_x = 3, + .topo_dim = 3, + .q_data_size = 4, + .q_extra = 1, + .setup_geo = SetupDiffGeo, + .setup_rhs = SetupDiffRhs, + .apply = Diff, + .error = Error, + .setup_geo_loc = SetupDiffGeo_loc, + .setup_rhs_loc = SetupDiffRhs_loc, + .apply_loc = Diff_loc, + .error_loc = Error_loc, + .in_mode = CEED_EVAL_GRAD, + .out_mode = CEED_EVAL_GRAD, + .q_mode = CEED_GAUSS }, + [CEED_BP4] = {.num_comp_u = 3, + .num_comp_x = 3, + .topo_dim = 3, + .q_data_size = 4, + .q_extra = 1, + .setup_geo = SetupDiffGeo, 
+ .setup_rhs = SetupDiffRhs3, + .apply = Diff3, + .error = Error3, + .setup_geo_loc = SetupDiffGeo_loc, + .setup_rhs_loc = SetupDiffRhs3_loc, + .apply_loc = Diff_loc, + .error_loc = Error3_loc, + .in_mode = CEED_EVAL_GRAD, + .out_mode = CEED_EVAL_GRAD, + .q_mode = CEED_GAUSS }, + [CEED_BP5] = {.num_comp_u = 1, + .num_comp_x = 3, + .topo_dim = 3, + .q_data_size = 4, + .q_extra = 0, + .setup_geo = SetupDiffGeo, + .setup_rhs = SetupDiffRhs, + .apply = Diff, + .error = Error, + .setup_geo_loc = SetupDiffGeo_loc, + .setup_rhs_loc = SetupDiffRhs_loc, + .apply_loc = Diff_loc, + .error_loc = Error_loc, + .in_mode = CEED_EVAL_GRAD, + .out_mode = CEED_EVAL_GRAD, + .q_mode = CEED_GAUSS_LOBATTO}, + [CEED_BP6] = {.num_comp_u = 3, + .num_comp_x = 3, + .topo_dim = 3, + .q_data_size = 4, + .q_extra = 0, + .setup_geo = SetupDiffGeo, + .setup_rhs = SetupDiffRhs3, + .apply = Diff3, + .error = Error3, + .setup_geo_loc = SetupDiffGeo_loc, + .setup_rhs_loc = SetupDiffRhs3_loc, + .apply_loc = Diff_loc, + .error_loc = Error3_loc, + .in_mode = CEED_EVAL_GRAD, + .out_mode = CEED_EVAL_GRAD, + .q_mode = CEED_GAUSS_LOBATTO} }; -#endif // libceed_petsc_examples_sphere_problem_data_h +#endif // libceed_petsc_examples_sphere_problem_data_h diff --git a/examples/petsc/include/structs.h b/examples/petsc/include/structs.h index f2ac5c5417..bad0a900a4 100644 --- a/examples/petsc/include/structs.h +++ b/examples/petsc/include/structs.h @@ -11,23 +11,23 @@ // Data for PETSc Matshell typedef struct OperatorApplyContext_ *OperatorApplyContext; struct OperatorApplyContext_ { - MPI_Comm comm; - DM dm; - Vec X_loc, Y_loc, diag; - CeedVector x_ceed, y_ceed; + MPI_Comm comm; + DM dm; + Vec X_loc, Y_loc, diag; + CeedVector x_ceed, y_ceed; CeedOperator op; - Ceed ceed; + Ceed ceed; }; // Data for PETSc Prolong/Restrict Matshells typedef struct ProlongRestrContext_ *ProlongRestrContext; struct ProlongRestrContext_ { - MPI_Comm comm; - DM dmc, dmf; - Vec loc_vec_c, loc_vec_f, mult_vec; - CeedVector ceed_vec_c, 
ceed_vec_f; + MPI_Comm comm; + DM dmc, dmf; + Vec loc_vec_c, loc_vec_f, mult_vec; + CeedVector ceed_vec_c, ceed_vec_f; CeedOperator op_prolong, op_restrict; - Ceed ceed; + Ceed ceed; }; // ----------------------------------------------------------------------------- @@ -37,45 +37,41 @@ struct ProlongRestrContext_ { // libCEED data struct for level typedef struct CeedData_ *CeedData; struct CeedData_ { - Ceed ceed; - CeedBasis basis_x, basis_u; + Ceed ceed; + CeedBasis basis_x, basis_u; CeedElemRestriction elem_restr_x, elem_restr_u, elem_restr_u_i, elem_restr_qd_i; - CeedQFunction qf_apply; - CeedOperator op_apply, op_restrict, op_prolong; - CeedVector q_data, x_ceed, y_ceed; - CeedInt q_data_size; + CeedQFunction qf_apply; + CeedOperator op_apply, op_restrict, op_prolong; + CeedVector q_data, x_ceed, y_ceed; + CeedInt q_data_size; }; // BP specific data typedef struct { - CeedInt num_comp_x, num_comp_u, topo_dim, q_data_size, q_extra; + CeedInt num_comp_x, num_comp_u, topo_dim, q_data_size, q_extra; CeedQFunctionUser setup_geo, setup_rhs, apply, error; - const char *setup_geo_loc, *setup_rhs_loc, *apply_loc, *error_loc; - CeedEvalMode in_mode, out_mode; - CeedQuadMode q_mode; - PetscBool enforce_bc; + const char *setup_geo_loc, *setup_rhs_loc, *apply_loc, *error_loc; + CeedEvalMode in_mode, out_mode; + CeedQuadMode q_mode; + PetscBool enforce_bc; } BPData; // BP options -typedef enum { - CEED_BP1 = 0, CEED_BP2 = 1, CEED_BP3 = 2, - CEED_BP4 = 3, CEED_BP5 = 4, CEED_BP6 = 5 -} BPType; +typedef enum { CEED_BP1 = 0, CEED_BP2 = 1, CEED_BP3 = 2, CEED_BP4 = 3, CEED_BP5 = 4, CEED_BP6 = 5 } BPType; // ----------------------------------------------------------------------------- // Parameter structure for running problems // ----------------------------------------------------------------------------- typedef struct RunParams_ *RunParams; struct RunParams_ { - MPI_Comm comm; - PetscBool test_mode, read_mesh, user_l_nodes, write_solution, simplex; - char *filename, *hostname; 
- PetscInt local_nodes, degree, q_extra, dim, num_comp_u, *mesh_elem; - PetscInt ksp_max_it_clip[2]; - PetscMPIInt ranks_per_node; - BPType bp_choice; + MPI_Comm comm; + PetscBool test_mode, read_mesh, user_l_nodes, write_solution, simplex; + char *filename, *hostname; + PetscInt local_nodes, degree, q_extra, dim, num_comp_u, *mesh_elem; + PetscInt ksp_max_it_clip[2]; + PetscMPIInt ranks_per_node; + BPType bp_choice; PetscLogStage solve_stage; - }; -#endif // libceed_petsc_examples_structs_h +#endif // libceed_petsc_examples_structs_h diff --git a/examples/petsc/multigrid.c b/examples/petsc/multigrid.c index 27bc185b08..a2ff0613e9 100644 --- a/examples/petsc/multigrid.c +++ b/examples/petsc/multigrid.c @@ -30,23 +30,23 @@ /// CEED BPs 1-6 multigrid example using PETSc const char help[] = "Solve CEED BPs using p-multigrid with PETSc and DMPlex\n"; -#include -#include #include #include #include #include #include +#include +#include #include "bps.h" #include "include/bpsproblemdata.h" +#include "include/libceedsetup.h" +#include "include/matops.h" #include "include/petscutils.h" #include "include/petscversion.h" -#include "include/matops.h" #include "include/structs.h" -#include "include/libceedsetup.h" -#if PETSC_VERSION_LT(3,12,0) +#if PETSC_VERSION_LT(3, 12, 0) #ifdef PETSC_HAVE_CUDA #include // Note: With PETSc prior to version 3.12.0, providing the source path to @@ -55,98 +55,66 @@ const char help[] = "Solve CEED BPs using p-multigrid with PETSc and DMPlex\n"; #endif int main(int argc, char **argv) { - PetscInt ierr; MPI_Comm comm; - char filename[PETSC_MAX_PATH_LEN], - ceed_resource[PETSC_MAX_PATH_LEN] = "/cpu/self"; - double my_rt_start, my_rt, rt_min, rt_max; - PetscInt degree = 3, q_extra, *l_size, *xl_size, *g_size, dim = 3, fine_level, - mesh_elem[3] = {3, 3, 3}, num_comp_u = 1, num_levels = degree, *level_degrees; - PetscScalar *r; - PetscScalar eps = 1.0; - PetscBool test_mode, benchmark_mode, read_mesh, write_solution, simplex; - PetscLogStage 
solve_stage; - PetscLogEvent assemble_event; - DM *dm, dm_orig; - KSP ksp; - PC pc; - Mat *mat_O, *mat_pr, mat_coarse; - Vec *X, *X_loc, *mult, rhs, rhs_loc; - PetscMemType mem_type; + char filename[PETSC_MAX_PATH_LEN], ceed_resource[PETSC_MAX_PATH_LEN] = "/cpu/self"; + double my_rt_start, my_rt, rt_min, rt_max; + PetscInt degree = 3, q_extra, *l_size, *xl_size, *g_size, dim = 3, fine_level, mesh_elem[3] = {3, 3, 3}, num_comp_u = 1, num_levels = degree, + *level_degrees; + PetscScalar *r; + PetscScalar eps = 1.0; + PetscBool test_mode, benchmark_mode, read_mesh, write_solution, simplex; + PetscLogStage solve_stage; + PetscLogEvent assemble_event; + DM *dm, dm_orig; + KSP ksp; + PC pc; + Mat *mat_O, *mat_pr, mat_coarse; + Vec *X, *X_loc, *mult, rhs, rhs_loc; + PetscMemType mem_type; OperatorApplyContext *op_apply_ctx, op_error_ctx; - ProlongRestrContext *pr_restr_ctx; - Ceed ceed; - CeedData *ceed_data; - CeedVector rhs_ceed, target; - CeedQFunction qf_error; - CeedOperator op_error; - BPType bp_choice; - CoarsenType coarsen; - - ierr = PetscInitialize(&argc, &argv, NULL, help); - if (ierr) return ierr; + ProlongRestrContext *pr_restr_ctx; + Ceed ceed; + CeedData *ceed_data; + CeedVector rhs_ceed, target; + CeedQFunction qf_error; + CeedOperator op_error; + BPType bp_choice; + CoarsenType coarsen; + + PetscCall(PetscInitialize(&argc, &argv, NULL, help)); comm = PETSC_COMM_WORLD; // Parse command line options PetscOptionsBegin(comm, NULL, "CEED BPs in PETSc", NULL); bp_choice = CEED_BP3; - ierr = PetscOptionsEnum("-problem", - "CEED benchmark problem to solve", NULL, - bp_types, (PetscEnum)bp_choice, (PetscEnum *)&bp_choice, - NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsEnum("-problem", "CEED benchmark problem to solve", NULL, bp_types, (PetscEnum)bp_choice, (PetscEnum *)&bp_choice, NULL)); num_comp_u = bp_options[bp_choice].num_comp_u; - test_mode = PETSC_FALSE; - ierr = PetscOptionsBool("-test", - "Testing mode (do not print unless error is large)", - NULL, 
test_mode, &test_mode, NULL); CHKERRQ(ierr); + test_mode = PETSC_FALSE; + PetscCall(PetscOptionsBool("-test", "Testing mode (do not print unless error is large)", NULL, test_mode, &test_mode, NULL)); benchmark_mode = PETSC_FALSE; - ierr = PetscOptionsBool("-benchmark", - "Benchmarking mode (prints benchmark statistics)", - NULL, benchmark_mode, &benchmark_mode, NULL); - CHKERRQ(ierr); + PetscCall(PetscOptionsBool("-benchmark", "Benchmarking mode (prints benchmark statistics)", NULL, benchmark_mode, &benchmark_mode, NULL)); write_solution = PETSC_FALSE; - ierr = PetscOptionsBool("-write_solution", - "Write solution for visualization", - NULL, write_solution, &write_solution, NULL); - CHKERRQ(ierr); + PetscCall(PetscOptionsBool("-write_solution", "Write solution for visualization", NULL, write_solution, &write_solution, NULL)); simplex = PETSC_FALSE; - ierr = PetscOptionsBool("-simplex", - "Element topology (default:hex)", - NULL, simplex, &simplex, NULL); - CHKERRQ(ierr); + PetscCall(PetscOptionsBool("-simplex", "Element topology (default:hex)", NULL, simplex, &simplex, NULL)); if ((bp_choice == CEED_BP5 || bp_choice == CEED_BP6) && (simplex)) { - SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, - "BP5/6 is not supported with simplex"); + SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "BP5/6 is not supported with simplex"); } - ierr = PetscOptionsScalar("-eps", - "Epsilon parameter for Kershaw mesh transformation", - NULL, eps, &eps, NULL); - if (eps > 1 || eps <= 0) SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_OUTOFRANGE, - "-eps %g must be (0,1]", (double)PetscRealPart(eps)); + PetscCall(PetscOptionsScalar("-eps", "Epsilon parameter for Kershaw mesh transformation", NULL, eps, &eps, NULL)); + if (eps > 1 || eps <= 0) SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_OUTOFRANGE, "-eps %g must be (0,1]", (double)PetscRealPart(eps)); degree = test_mode ? 
3 : 2; - ierr = PetscOptionsInt("-degree", "Polynomial degree of tensor product basis", - NULL, degree, &degree, NULL); CHKERRQ(ierr); - if (degree < 1) SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_OUTOFRANGE, - "-degree %" PetscInt_FMT " must be at least 1", degree); + PetscCall(PetscOptionsInt("-degree", "Polynomial degree of tensor product basis", NULL, degree, &degree, NULL)); + if (degree < 1) SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_OUTOFRANGE, "-degree %" PetscInt_FMT " must be at least 1", degree); q_extra = bp_options[bp_choice].q_extra; - ierr = PetscOptionsInt("-q_extra", "Number of extra quadrature points", - NULL, q_extra, &q_extra, NULL); CHKERRQ(ierr); - ierr = PetscOptionsString("-ceed", "CEED resource specifier", - NULL, ceed_resource, ceed_resource, - sizeof(ceed_resource), NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsInt("-q_extra", "Number of extra quadrature points", NULL, q_extra, &q_extra, NULL)); + PetscCall(PetscOptionsString("-ceed", "CEED resource specifier", NULL, ceed_resource, ceed_resource, sizeof(ceed_resource), NULL)); coarsen = COARSEN_UNIFORM; - ierr = PetscOptionsEnum("-coarsen", - "Coarsening strategy to use", NULL, - coarsen_types, (PetscEnum)coarsen, - (PetscEnum *)&coarsen, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsEnum("-coarsen", "Coarsening strategy to use", NULL, coarsen_types, (PetscEnum)coarsen, (PetscEnum *)&coarsen, NULL)); read_mesh = PETSC_FALSE; - ierr = PetscOptionsString("-mesh", "Read mesh from file", NULL, - filename, filename, sizeof(filename), &read_mesh); - CHKERRQ(ierr); + PetscCall(PetscOptionsString("-mesh", "Read mesh from file", NULL, filename, filename, sizeof(filename), &read_mesh)); if (!read_mesh) { PetscInt tmp = dim; - ierr = PetscOptionsIntArray("-cells","Number of cells per dimension", NULL, - mesh_elem, &tmp, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsIntArray("-cells", "Number of cells per dimension", NULL, mesh_elem, &tmp, NULL)); } PetscOptionsEnd(); @@ -157,119 +125,107 @@ int main(int argc, 
char **argv) { // Setup DM if (read_mesh) { - ierr = DMPlexCreateFromFile(PETSC_COMM_WORLD, filename, NULL, PETSC_TRUE, - &dm_orig); - CHKERRQ(ierr); + PetscCall(DMPlexCreateFromFile(PETSC_COMM_WORLD, filename, NULL, PETSC_TRUE, &dm_orig)); } else { - ierr = DMPlexCreateBoxMesh(PETSC_COMM_WORLD, dim, simplex, mesh_elem, NULL, - NULL, NULL, PETSC_TRUE, &dm_orig); CHKERRQ(ierr); + PetscCall(DMPlexCreateBoxMesh(PETSC_COMM_WORLD, dim, simplex, mesh_elem, NULL, NULL, NULL, PETSC_TRUE, &dm_orig)); } VecType vec_type; switch (mem_type_backend) { - case CEED_MEM_HOST: vec_type = VECSTANDARD; break; - case CEED_MEM_DEVICE: { - const char *resolved; - CeedGetResource(ceed, &resolved); - if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA; - else if (strstr(resolved, "/gpu/hip/occa")) - vec_type = VECSTANDARD; // https://github.com/CEED/libCEED/issues/678 - else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP; - else vec_type = VECSTANDARD; - } + case CEED_MEM_HOST: + vec_type = VECSTANDARD; + break; + case CEED_MEM_DEVICE: { + const char *resolved; + CeedGetResource(ceed, &resolved); + if (strstr(resolved, "/gpu/cuda")) vec_type = VECCUDA; + else if (strstr(resolved, "/gpu/hip/occa")) vec_type = VECSTANDARD; // https://github.com/CEED/libCEED/issues/678 + else if (strstr(resolved, "/gpu/hip")) vec_type = VECHIP; + else vec_type = VECSTANDARD; + } } - ierr = DMSetVecType(dm_orig, vec_type); CHKERRQ(ierr); - ierr = DMSetFromOptions(dm_orig); CHKERRQ(ierr); - ierr = DMViewFromOptions(dm_orig, NULL, "-dm_view"); CHKERRQ(ierr); + PetscCall(DMSetVecType(dm_orig, vec_type)); + PetscCall(DMSetFromOptions(dm_orig)); + PetscCall(DMViewFromOptions(dm_orig, NULL, "-dm_view")); // Apply Kershaw mesh transformation - ierr = Kershaw(dm_orig, eps); CHKERRQ(ierr); + PetscCall(Kershaw(dm_orig, eps)); // Allocate arrays for PETSc objects for each level switch (coarsen) { - case COARSEN_UNIFORM: - num_levels = degree; - break; - case COARSEN_LOGARITHMIC: - num_levels = 
ceil(log(degree)/log(2)) + 1; - break; + case COARSEN_UNIFORM: + num_levels = degree; + break; + case COARSEN_LOGARITHMIC: + num_levels = ceil(log(degree) / log(2)) + 1; + break; } - ierr = PetscMalloc1(num_levels, &level_degrees); CHKERRQ(ierr); + PetscCall(PetscMalloc1(num_levels, &level_degrees)); fine_level = num_levels - 1; switch (coarsen) { - case COARSEN_UNIFORM: - for (int i=0; i 0) { // Interp - ierr = PetscMalloc1(1, &pr_restr_ctx[i]); CHKERRQ(ierr); - ierr = MatCreateShell(comm, l_size[i], l_size[i-1], g_size[i], g_size[i-1], - pr_restr_ctx[i], &mat_pr[i]); CHKERRQ(ierr); - ierr = MatShellSetOperation(mat_pr[i], MATOP_MULT, - (void(*)(void))MatMult_Prolong); - CHKERRQ(ierr); - ierr = MatShellSetOperation(mat_pr[i], MATOP_MULT_TRANSPOSE, - (void(*)(void))MatMult_Restrict); - CHKERRQ(ierr); - ierr = MatShellSetVecType(mat_pr[i], vec_type); CHKERRQ(ierr); + PetscCall(PetscMalloc1(1, &pr_restr_ctx[i])); + PetscCall(MatCreateShell(comm, l_size[i], l_size[i - 1], g_size[i], g_size[i - 1], pr_restr_ctx[i], &mat_pr[i])); + PetscCall(MatShellSetOperation(mat_pr[i], MATOP_MULT, (void (*)(void))MatMult_Prolong)); + PetscCall(MatShellSetOperation(mat_pr[i], MATOP_MULT_TRANSPOSE, (void (*)(void))MatMult_Restrict)); + PetscCall(MatShellSetVecType(mat_pr[i], vec_type)); } } - ierr = VecDuplicate(X[fine_level], &rhs); CHKERRQ(ierr); + PetscCall(VecDuplicate(X[fine_level], &rhs)); // Print global grid information if (!test_mode) { @@ -278,388 +234,347 @@ int main(int argc, char **argv) { const char *used_resource; CeedGetResource(ceed, &used_resource); - ierr = VecGetType(X[0], &vec_type); CHKERRQ(ierr); - - ierr = PetscPrintf(comm, - "\n-- CEED Benchmark Problem %" CeedInt_FMT " -- libCEED + PETSc + PCMG --\n" - " PETSc:\n" - " PETSc Vec Type : %s\n" - " libCEED:\n" - " libCEED Backend : %s\n" - " libCEED Backend MemType : %s\n" - " Mesh:\n" - " Solution Order (P) : %" CeedInt_FMT "\n" - " Quadrature Order (Q) : %" CeedInt_FMT "\n" - " Additional quadrature points 
(q_extra) : %" CeedInt_FMT "\n" - " Global Nodes : %" PetscInt_FMT "\n" - " Owned Nodes : %" PetscInt_FMT "\n" - " DoF per node : %" PetscInt_FMT "\n" - " Element topology : %s\n" - " Multigrid:\n" - " Number of Levels : %" CeedInt_FMT "\n", - bp_choice+1, vec_type, used_resource, - CeedMemTypes[mem_type_backend], P, Q, q_extra, - g_size[fine_level]/num_comp_u, l_size[fine_level]/num_comp_u, - num_comp_u, CeedElemTopologies[elem_topo], - num_levels); CHKERRQ(ierr); + PetscCall(VecGetType(X[0], &vec_type)); + + PetscCall(PetscPrintf(comm, + "\n-- CEED Benchmark Problem %" CeedInt_FMT " -- libCEED + PETSc + PCMG --\n" + " PETSc:\n" + " PETSc Vec Type : %s\n" + " libCEED:\n" + " libCEED Backend : %s\n" + " libCEED Backend MemType : %s\n" + " Mesh:\n" + " Solution Order (P) : %" CeedInt_FMT "\n" + " Quadrature Order (Q) : %" CeedInt_FMT "\n" + " Additional quadrature points (q_extra) : %" CeedInt_FMT "\n" + " Global Nodes : %" PetscInt_FMT "\n" + " Owned Nodes : %" PetscInt_FMT "\n" + " DoF per node : %" PetscInt_FMT "\n" + " Element topology : %s\n" + " Multigrid:\n" + " Number of Levels : %" CeedInt_FMT "\n", + bp_choice + 1, vec_type, used_resource, CeedMemTypes[mem_type_backend], P, Q, q_extra, g_size[fine_level] / num_comp_u, + l_size[fine_level] / num_comp_u, num_comp_u, CeedElemTopologies[elem_topo], num_levels)); } // Create RHS vector - ierr = VecDuplicate(X_loc[fine_level], &rhs_loc); CHKERRQ(ierr); - ierr = VecZeroEntries(rhs_loc); CHKERRQ(ierr); - ierr = VecGetArrayAndMemType(rhs_loc, &r, &mem_type); CHKERRQ(ierr); + PetscCall(VecDuplicate(X_loc[fine_level], &rhs_loc)); + PetscCall(VecZeroEntries(rhs_loc)); + PetscCall(VecGetArrayAndMemType(rhs_loc, &r, &mem_type)); CeedVectorCreate(ceed, xl_size[fine_level], &rhs_ceed); CeedVectorSetArray(rhs_ceed, MemTypeP2C(mem_type), CEED_USE_POINTER, r); // Set up libCEED operators on each level - ierr = PetscMalloc1(num_levels, &ceed_data); CHKERRQ(ierr); - for (PetscInt i=0; iq_data_size, - CEED_EVAL_NONE); + 
CeedQFunctionAddInput(qf_error, "qdata", ceed_data[fine_level]->q_data_size, CEED_EVAL_NONE); CeedQFunctionAddOutput(qf_error, "error", num_comp_u, CEED_EVAL_INTERP); // Create the error operator - CeedOperatorCreate(ceed, qf_error, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, - &op_error); - CeedOperatorSetField(op_error, "u", ceed_data[fine_level]->elem_restr_u, - ceed_data[fine_level]->basis_u, CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op_error, "true_soln", - ceed_data[fine_level]->elem_restr_u_i, - CEED_BASIS_COLLOCATED, target); - CeedOperatorSetField(op_error, "qdata", ceed_data[fine_level]->elem_restr_qd_i, - CEED_BASIS_COLLOCATED, ceed_data[fine_level]->q_data); - CeedOperatorSetField(op_error, "error", ceed_data[fine_level]->elem_restr_u, - ceed_data[fine_level]->basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorCreate(ceed, qf_error, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_error); + CeedOperatorSetField(op_error, "u", ceed_data[fine_level]->elem_restr_u, ceed_data[fine_level]->basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_error, "true_soln", ceed_data[fine_level]->elem_restr_u_i, CEED_BASIS_COLLOCATED, target); + CeedOperatorSetField(op_error, "qdata", ceed_data[fine_level]->elem_restr_qd_i, CEED_BASIS_COLLOCATED, ceed_data[fine_level]->q_data); + CeedOperatorSetField(op_error, "error", ceed_data[fine_level]->elem_restr_u, ceed_data[fine_level]->basis_u, CEED_VECTOR_ACTIVE); // Calculate multiplicity - for (int i=0; ix_ceed, CEED_MEM_HOST, CEED_USE_POINTER, x); // Multiplicity - CeedElemRestrictionGetMultiplicity(ceed_data[i]->elem_restr_u, - ceed_data[i]->x_ceed); + CeedElemRestrictionGetMultiplicity(ceed_data[i]->elem_restr_u, ceed_data[i]->x_ceed); CeedVectorSyncArray(ceed_data[i]->x_ceed, CEED_MEM_HOST); // Restore vector - ierr = VecRestoreArray(X_loc[i], &x); CHKERRQ(ierr); + PetscCall(VecRestoreArray(X_loc[i], &x)); // Creat mult vector - ierr = VecDuplicate(X_loc[i], &mult[i]); CHKERRQ(ierr); + PetscCall(VecDuplicate(X_loc[i], 
&mult[i])); // Local-to-global - ierr = VecZeroEntries(X[i]); CHKERRQ(ierr); - ierr = DMLocalToGlobal(dm[i], X_loc[i], ADD_VALUES, X[i]); - CHKERRQ(ierr); - ierr = VecZeroEntries(X_loc[i]); CHKERRQ(ierr); + PetscCall(VecZeroEntries(X[i])); + PetscCall(DMLocalToGlobal(dm[i], X_loc[i], ADD_VALUES, X[i])); + PetscCall(VecZeroEntries(X_loc[i])); // Global-to-local - ierr = DMGlobalToLocal(dm[i], X[i], INSERT_VALUES, mult[i]); - CHKERRQ(ierr); - ierr = VecZeroEntries(X[i]); CHKERRQ(ierr); + PetscCall(DMGlobalToLocal(dm[i], X[i], INSERT_VALUES, mult[i])); + PetscCall(VecZeroEntries(X[i])); // Multiplicity scaling - ierr = VecReciprocal(mult[i]); + PetscCall(VecReciprocal(mult[i])); } // Set up Mat - for (int i=0; i 0) { // Prolongation/Restriction Operator - ierr = CeedLevelTransferSetup(dm[i-1], ceed, i, num_comp_u, ceed_data, - bp_options[bp_choice], mult[i]); CHKERRQ(ierr); - pr_restr_ctx[i]->comm = comm; - pr_restr_ctx[i]->dmf = dm[i]; - pr_restr_ctx[i]->dmc = dm[i-1]; - pr_restr_ctx[i]->loc_vec_c = X_loc[i-1]; - pr_restr_ctx[i]->loc_vec_f = op_apply_ctx[i]->Y_loc; - pr_restr_ctx[i]->mult_vec = mult[i]; - pr_restr_ctx[i]->ceed_vec_c = op_apply_ctx[i-1]->x_ceed; - pr_restr_ctx[i]->ceed_vec_f = op_apply_ctx[i]->y_ceed; - pr_restr_ctx[i]->op_prolong = ceed_data[i]->op_prolong; + PetscCall(CeedLevelTransferSetup(dm[i - 1], ceed, i, num_comp_u, ceed_data, bp_options[bp_choice], mult[i])); + pr_restr_ctx[i]->comm = comm; + pr_restr_ctx[i]->dmf = dm[i]; + pr_restr_ctx[i]->dmc = dm[i - 1]; + pr_restr_ctx[i]->loc_vec_c = X_loc[i - 1]; + pr_restr_ctx[i]->loc_vec_f = op_apply_ctx[i]->Y_loc; + pr_restr_ctx[i]->mult_vec = mult[i]; + pr_restr_ctx[i]->ceed_vec_c = op_apply_ctx[i - 1]->x_ceed; + pr_restr_ctx[i]->ceed_vec_f = op_apply_ctx[i]->y_ceed; + pr_restr_ctx[i]->op_prolong = ceed_data[i]->op_prolong; pr_restr_ctx[i]->op_restrict = ceed_data[i]->op_restrict; - pr_restr_ctx[i]->ceed = ceed; + pr_restr_ctx[i]->ceed = ceed; } } // Assemble coarse grid Jacobian for AMG (or other 
sparse matrix) solve - ierr = DMCreateMatrix(dm[0], &mat_coarse); CHKERRQ(ierr); + PetscCall(DMCreateMatrix(dm[0], &mat_coarse)); - ierr = PetscLogEventRegister("AssembleMatrix", MAT_CLASSID, &assemble_event); - CHKERRQ(ierr); + PetscCall(PetscLogEventRegister("AssembleMatrix", MAT_CLASSID, &assemble_event)); { // Assemble matrix analytically PetscCount num_entries; - CeedInt *rows, *cols; + CeedInt *rows, *cols; CeedVector coo_values; - CeedOperatorLinearAssembleSymbolic(op_apply_ctx[0]->op, &num_entries, &rows, - &cols); + CeedOperatorLinearAssembleSymbolic(op_apply_ctx[0]->op, &num_entries, &rows, &cols); ISLocalToGlobalMapping ltog_row, ltog_col; - ierr = MatGetLocalToGlobalMapping(mat_coarse, &ltog_row, &ltog_col); - CHKERRQ(ierr); - ierr = ISLocalToGlobalMappingApply(ltog_row, num_entries, rows, rows); - CHKERRQ(ierr); - ierr = ISLocalToGlobalMappingApply(ltog_col, num_entries, cols, cols); - CHKERRQ(ierr); - ierr = MatSetPreallocationCOO(mat_coarse, num_entries, rows, cols); - CHKERRQ(ierr); + PetscCall(MatGetLocalToGlobalMapping(mat_coarse, &ltog_row, &ltog_col)); + PetscCall(ISLocalToGlobalMappingApply(ltog_row, num_entries, rows, rows)); + PetscCall(ISLocalToGlobalMappingApply(ltog_col, num_entries, cols, cols)); + PetscCall(MatSetPreallocationCOO(mat_coarse, num_entries, rows, cols)); free(rows); free(cols); CeedVectorCreate(ceed, num_entries, &coo_values); - ierr = PetscLogEventBegin(assemble_event, mat_coarse, 0, 0, 0); CHKERRQ(ierr); + PetscCall(PetscLogEventBegin(assemble_event, mat_coarse, 0, 0, 0)); CeedOperatorLinearAssemble(op_apply_ctx[0]->op, coo_values); const CeedScalar *values; CeedVectorGetArrayRead(coo_values, CEED_MEM_HOST, &values); - ierr = MatSetValuesCOO(mat_coarse, values, ADD_VALUES); CHKERRQ(ierr); + PetscCall(MatSetValuesCOO(mat_coarse, values, ADD_VALUES)); CeedVectorRestoreArrayRead(coo_values, &values); - ierr = PetscLogEventEnd(assemble_event, mat_coarse, 0, 0, 0); CHKERRQ(ierr); + PetscCall(PetscLogEventEnd(assemble_event, 
mat_coarse, 0, 0, 0)); CeedVectorDestroy(&coo_values); } // Set up KSP - ierr = KSPCreate(comm, &ksp); CHKERRQ(ierr); + PetscCall(KSPCreate(comm, &ksp)); { - ierr = KSPSetType(ksp, KSPCG); CHKERRQ(ierr); - ierr = KSPSetNormType(ksp, KSP_NORM_NATURAL); CHKERRQ(ierr); - ierr = KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT, - PETSC_DEFAULT); CHKERRQ(ierr); + PetscCall(KSPSetType(ksp, KSPCG)); + PetscCall(KSPSetNormType(ksp, KSP_NORM_NATURAL)); + PetscCall(KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT, PETSC_DEFAULT)); } - ierr = KSPSetFromOptions(ksp); CHKERRQ(ierr); - ierr = KSPSetOperators(ksp, mat_O[fine_level], mat_O[fine_level]); - CHKERRQ(ierr); + PetscCall(KSPSetFromOptions(ksp)); + PetscCall(KSPSetOperators(ksp, mat_O[fine_level], mat_O[fine_level])); // Set up PCMG - ierr = KSPGetPC(ksp, &pc); CHKERRQ(ierr); + PetscCall(KSPGetPC(ksp, &pc)); PCMGCycleType pcmg_cycle_type = PC_MG_CYCLE_V; { - ierr = PCSetType(pc, PCMG); CHKERRQ(ierr); + PetscCall(PCSetType(pc, PCMG)); // PCMG levels - ierr = PCMGSetLevels(pc, num_levels, NULL); CHKERRQ(ierr); - for (int i=0; i 0) { // Interpolation - ierr = PCMGSetInterpolation(pc, i, mat_pr[i]); CHKERRQ(ierr); + PetscCall(PCMGSetInterpolation(pc, i, mat_pr[i])); } // Coarse solve KSP coarse; - PC coarse_pc; - ierr = PCMGGetCoarseSolve(pc, &coarse); CHKERRQ(ierr); - ierr = KSPSetType(coarse, KSPPREONLY); CHKERRQ(ierr); - ierr = KSPSetOperators(coarse, mat_coarse, mat_coarse); CHKERRQ(ierr); - - ierr = KSPGetPC(coarse, &coarse_pc); CHKERRQ(ierr); - ierr = PCSetType(coarse_pc, PCGAMG); CHKERRQ(ierr); - - ierr = KSPSetOptionsPrefix(coarse, "coarse_"); CHKERRQ(ierr); - ierr = PCSetOptionsPrefix(coarse_pc, "coarse_"); CHKERRQ(ierr); - ierr = KSPSetFromOptions(coarse); CHKERRQ(ierr); - ierr = PCSetFromOptions(coarse_pc); CHKERRQ(ierr); + PC coarse_pc; + PetscCall(PCMGGetCoarseSolve(pc, &coarse)); + PetscCall(KSPSetType(coarse, KSPPREONLY)); + PetscCall(KSPSetOperators(coarse, mat_coarse, mat_coarse)); + + 
PetscCall(KSPGetPC(coarse, &coarse_pc)); + PetscCall(PCSetType(coarse_pc, PCGAMG)); + + PetscCall(KSPSetOptionsPrefix(coarse, "coarse_")); + PetscCall(PCSetOptionsPrefix(coarse_pc, "coarse_")); + PetscCall(KSPSetFromOptions(coarse)); + PetscCall(PCSetFromOptions(coarse_pc)); } // PCMG options - ierr = PCMGSetType(pc, PC_MG_MULTIPLICATIVE); CHKERRQ(ierr); - ierr = PCMGSetNumberSmooth(pc, 3); CHKERRQ(ierr); - ierr = PCMGSetCycleType(pc, pcmg_cycle_type); CHKERRQ(ierr); + PetscCall(PCMGSetType(pc, PC_MG_MULTIPLICATIVE)); + PetscCall(PCMGSetNumberSmooth(pc, 3)); + PetscCall(PCMGSetCycleType(pc, pcmg_cycle_type)); } // First run, if benchmarking if (benchmark_mode) { - ierr = KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT, 1); - CHKERRQ(ierr); - ierr = VecZeroEntries(X[fine_level]); CHKERRQ(ierr); + PetscCall(KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT, 1)); + PetscCall(VecZeroEntries(X[fine_level])); my_rt_start = MPI_Wtime(); - ierr = KSPSolve(ksp, rhs, X[fine_level]); CHKERRQ(ierr); + PetscCall(KSPSolve(ksp, rhs, X[fine_level])); my_rt = MPI_Wtime() - my_rt_start; - ierr = MPI_Allreduce(MPI_IN_PLACE, &my_rt, 1, MPI_DOUBLE, MPI_MIN, comm); - CHKERRQ(ierr); + PetscCall(MPI_Allreduce(MPI_IN_PLACE, &my_rt, 1, MPI_DOUBLE, MPI_MIN, comm)); // Set maxits based on first iteration timing if (my_rt > 0.02) { - ierr = KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT, 5); - CHKERRQ(ierr); + PetscCall(KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT, 5)); } else { - ierr = KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT, 20); - CHKERRQ(ierr); + PetscCall(KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT, 20)); } } // Timed solve - ierr = VecZeroEntries(X[fine_level]); CHKERRQ(ierr); - ierr = PetscBarrier((PetscObject)ksp); CHKERRQ(ierr); + PetscCall(VecZeroEntries(X[fine_level])); + PetscCall(PetscBarrier((PetscObject)ksp)); // -- Performance logging - ierr = PetscLogStageRegister("Solve Stage", &solve_stage); 
CHKERRQ(ierr); - ierr = PetscLogStagePush(solve_stage); CHKERRQ(ierr); + PetscCall(PetscLogStageRegister("Solve Stage", &solve_stage)); + PetscCall(PetscLogStagePush(solve_stage)); // -- Solve my_rt_start = MPI_Wtime(); - ierr = KSPSolve(ksp, rhs, X[fine_level]); CHKERRQ(ierr); + PetscCall(KSPSolve(ksp, rhs, X[fine_level])); my_rt = MPI_Wtime() - my_rt_start; - // -- Performance logging - ierr = PetscLogStagePop(); + PetscCall(PetscLogStagePop()); // Output results { - KSPType ksp_type; - PCMGType pcmg_type; + KSPType ksp_type; + PCMGType pcmg_type; KSPConvergedReason reason; - PetscReal rnorm; - PetscInt its; - ierr = KSPGetType(ksp, &ksp_type); CHKERRQ(ierr); - ierr = KSPGetConvergedReason(ksp, &reason); CHKERRQ(ierr); - ierr = KSPGetIterationNumber(ksp, &its); CHKERRQ(ierr); - ierr = KSPGetResidualNorm(ksp, &rnorm); CHKERRQ(ierr); - ierr = PCMGGetType(pc, &pcmg_type); CHKERRQ(ierr); + PetscReal rnorm; + PetscInt its; + PetscCall(KSPGetType(ksp, &ksp_type)); + PetscCall(KSPGetConvergedReason(ksp, &reason)); + PetscCall(KSPGetIterationNumber(ksp, &its)); + PetscCall(KSPGetResidualNorm(ksp, &rnorm)); + PetscCall(PCMGGetType(pc, &pcmg_type)); if (!test_mode || reason < 0 || rnorm > 1e-8) { - ierr = PetscPrintf(comm, - " KSP:\n" - " KSP Type : %s\n" - " KSP Convergence : %s\n" - " Total KSP Iterations : %" PetscInt_FMT "\n" - " Final rnorm : %e\n", - ksp_type, KSPConvergedReasons[reason], its, - (double)rnorm); CHKERRQ(ierr); - ierr = PetscPrintf(comm, - " PCMG:\n" - " PCMG Type : %s\n" - " PCMG Cycle Type : %s\n", - PCMGTypes[pcmg_type], - PCMGCycleTypes[pcmg_cycle_type]); CHKERRQ(ierr); + PetscCall(PetscPrintf(comm, + " KSP:\n" + " KSP Type : %s\n" + " KSP Convergence : %s\n" + " Total KSP Iterations : %" PetscInt_FMT "\n" + " Final rnorm : %e\n", + ksp_type, KSPConvergedReasons[reason], its, (double)rnorm)); + PetscCall(PetscPrintf(comm, + " PCMG:\n" + " PCMG Type : %s\n" + " PCMG Cycle Type : %s\n", + PCMGTypes[pcmg_type], PCMGCycleTypes[pcmg_cycle_type])); } if 
(!test_mode) { - ierr = PetscPrintf(comm," Performance:\n"); CHKERRQ(ierr); + PetscCall(PetscPrintf(comm, " Performance:\n")); } { // Set up error operator context - ierr = SetupErrorOperatorCtx(comm, dm[fine_level], ceed, - ceed_data[fine_level], X_loc[fine_level], - op_error, op_error_ctx); CHKERRQ(ierr); + PetscCall(SetupErrorOperatorCtx(comm, dm[fine_level], ceed, ceed_data[fine_level], X_loc[fine_level], op_error, op_error_ctx)); PetscScalar l2_error; - ierr = ComputeL2Error(X[fine_level], &l2_error, op_error_ctx); CHKERRQ(ierr); + PetscCall(ComputeL2Error(X[fine_level], &l2_error, op_error_ctx)); PetscReal tol = 5e-2; if (!test_mode || l2_error > tol) { - ierr = MPI_Allreduce(&my_rt, &rt_min, 1, MPI_DOUBLE, MPI_MIN, comm); - CHKERRQ(ierr); - ierr = MPI_Allreduce(&my_rt, &rt_max, 1, MPI_DOUBLE, MPI_MAX, comm); - CHKERRQ(ierr); - ierr = PetscPrintf(comm, - " L2 Error : %e\n" - " CG Solve Time : %g (%g) sec\n", - (double)l2_error, rt_max, rt_min); CHKERRQ(ierr); + PetscCall(MPI_Allreduce(&my_rt, &rt_min, 1, MPI_DOUBLE, MPI_MIN, comm)); + PetscCall(MPI_Allreduce(&my_rt, &rt_max, 1, MPI_DOUBLE, MPI_MAX, comm)); + PetscCall(PetscPrintf(comm, + " L2 Error : %e\n" + " CG Solve Time : %g (%g) sec\n", + (double)l2_error, rt_max, rt_min)); } } if (benchmark_mode && (!test_mode)) { - ierr = PetscPrintf(comm, - " DoFs/Sec in CG : %g (%g) million\n", - 1e-6*g_size[fine_level]*its/rt_max, - 1e-6*g_size[fine_level]*its/rt_min); - CHKERRQ(ierr); + PetscCall(PetscPrintf(comm, " DoFs/Sec in CG : %g (%g) million\n", 1e-6 * g_size[fine_level] * its / rt_max, + 1e-6 * g_size[fine_level] * its / rt_min)); } } if (write_solution) { PetscViewer vtk_viewer_soln; - ierr = PetscViewerCreate(comm, &vtk_viewer_soln); CHKERRQ(ierr); - ierr = PetscViewerSetType(vtk_viewer_soln, PETSCVIEWERVTK); CHKERRQ(ierr); - ierr = PetscViewerFileSetName(vtk_viewer_soln, "solution.vtu"); CHKERRQ(ierr); - ierr = VecView(X[fine_level], vtk_viewer_soln); CHKERRQ(ierr); - ierr = 
PetscViewerDestroy(&vtk_viewer_soln); CHKERRQ(ierr); + PetscCall(PetscViewerCreate(comm, &vtk_viewer_soln)); + PetscCall(PetscViewerSetType(vtk_viewer_soln, PETSCVIEWERVTK)); + PetscCall(PetscViewerFileSetName(vtk_viewer_soln, "solution.vtu")); + PetscCall(VecView(X[fine_level], vtk_viewer_soln)); + PetscCall(PetscViewerDestroy(&vtk_viewer_soln)); } // Cleanup - for (int i=0; iY_loc); CHKERRQ(ierr); - ierr = MatDestroy(&mat_O[i]); CHKERRQ(ierr); - ierr = PetscFree(op_apply_ctx[i]); CHKERRQ(ierr); + for (int i = 0; i < num_levels; i++) { + PetscCall(VecDestroy(&X[i])); + PetscCall(VecDestroy(&X_loc[i])); + PetscCall(VecDestroy(&mult[i])); + PetscCall(VecDestroy(&op_apply_ctx[i]->Y_loc)); + PetscCall(MatDestroy(&mat_O[i])); + PetscCall(PetscFree(op_apply_ctx[i])); if (i > 0) { - ierr = MatDestroy(&mat_pr[i]); CHKERRQ(ierr); - ierr = PetscFree(pr_restr_ctx[i]); CHKERRQ(ierr); + PetscCall(MatDestroy(&mat_pr[i])); + PetscCall(PetscFree(pr_restr_ctx[i])); } - ierr = CeedDataDestroy(i, ceed_data[i]); CHKERRQ(ierr); - ierr = DMDestroy(&dm[i]); CHKERRQ(ierr); + PetscCall(CeedDataDestroy(i, ceed_data[i])); + PetscCall(DMDestroy(&dm[i])); } - ierr = PetscFree(level_degrees); CHKERRQ(ierr); - ierr = PetscFree(dm); CHKERRQ(ierr); - ierr = PetscFree(X); CHKERRQ(ierr); - ierr = PetscFree(X_loc); CHKERRQ(ierr); - ierr = VecDestroy(&op_error_ctx->Y_loc); CHKERRQ(ierr); - ierr = PetscFree(mult); CHKERRQ(ierr); - ierr = PetscFree(mat_O); CHKERRQ(ierr); - ierr = PetscFree(mat_pr); CHKERRQ(ierr); - ierr = PetscFree(ceed_data); CHKERRQ(ierr); - ierr = PetscFree(op_apply_ctx); CHKERRQ(ierr); - ierr = PetscFree(op_error_ctx); CHKERRQ(ierr); - ierr = PetscFree(pr_restr_ctx); CHKERRQ(ierr); - ierr = PetscFree(l_size); CHKERRQ(ierr); - ierr = PetscFree(xl_size); CHKERRQ(ierr); - ierr = PetscFree(g_size); CHKERRQ(ierr); - ierr = VecDestroy(&rhs); CHKERRQ(ierr); - ierr = VecDestroy(&rhs_loc); CHKERRQ(ierr); - ierr = MatDestroy(&mat_coarse); CHKERRQ(ierr); - ierr = KSPDestroy(&ksp); 
CHKERRQ(ierr); - ierr = DMDestroy(&dm_orig); CHKERRQ(ierr); + PetscCall(PetscFree(level_degrees)); + PetscCall(PetscFree(dm)); + PetscCall(PetscFree(X)); + PetscCall(PetscFree(X_loc)); + PetscCall(VecDestroy(&op_error_ctx->Y_loc)); + PetscCall(PetscFree(mult)); + PetscCall(PetscFree(mat_O)); + PetscCall(PetscFree(mat_pr)); + PetscCall(PetscFree(ceed_data)); + PetscCall(PetscFree(op_apply_ctx)); + PetscCall(PetscFree(op_error_ctx)); + PetscCall(PetscFree(pr_restr_ctx)); + PetscCall(PetscFree(l_size)); + PetscCall(PetscFree(xl_size)); + PetscCall(PetscFree(g_size)); + PetscCall(VecDestroy(&rhs)); + PetscCall(VecDestroy(&rhs_loc)); + PetscCall(MatDestroy(&mat_coarse)); + PetscCall(KSPDestroy(&ksp)); + PetscCall(DMDestroy(&dm_orig)); CeedVectorDestroy(&target); CeedQFunctionDestroy(&qf_error); CeedOperatorDestroy(&op_error); diff --git a/examples/petsc/qfunctions/area/areacube.h b/examples/petsc/qfunctions/area/areacube.h index 5cbc30d3d9..036d63491b 100644 --- a/examples/petsc/qfunctions/area/areacube.h +++ b/examples/petsc/qfunctions/area/areacube.h @@ -44,57 +44,45 @@ // Qdata: w * det(dx_i/dX_j) // // ----------------------------------------------------------------------------- -CEED_QFUNCTION(SetupMassGeoCube)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) { +CEED_QFUNCTION(SetupMassGeoCube)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // Inputs const CeedScalar *J = in[1], *w = in[2]; // Outputs CeedScalar *q_data = out[0]; // Quadrature Point Loop - CeedPragmaSIMD - for (CeedInt i=0; i // ----------------------------------------------------------------------------- -CEED_QFUNCTION(Error)(void *ctx, CeedInt Q, - const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(Error)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { const CeedScalar *u = in[0], *target = in[1], *q_data = in[2]; - CeedScalar *error = out[0]; - for (CeedInt i=0; i #include 
"../include/libceedsetup.h" + +#include + #include "../include/petscutils.h" // ----------------------------------------------------------------------------- // Destroy libCEED operator objects // ----------------------------------------------------------------------------- PetscErrorCode CeedDataDestroy(CeedInt i, CeedData data) { - int ierr; - PetscFunctionBeginUser; + CeedVectorDestroy(&data->q_data); CeedVectorDestroy(&data->x_ceed); CeedVectorDestroy(&data->y_ceed); @@ -24,7 +25,7 @@ PetscErrorCode CeedDataDestroy(CeedInt i, CeedData data) { CeedOperatorDestroy(&data->op_prolong); CeedOperatorDestroy(&data->op_restrict); } - ierr = PetscFree(data); CHKERRQ(ierr); + PetscCall(PetscFree(data)); PetscFunctionReturn(0); }; @@ -32,136 +33,107 @@ PetscErrorCode CeedDataDestroy(CeedInt i, CeedData data) { // ----------------------------------------------------------------------------- // Set up libCEED for a given degree // ----------------------------------------------------------------------------- -PetscErrorCode SetupLibceedByDegree(DM dm, Ceed ceed, CeedInt degree, - CeedInt topo_dim, CeedInt q_extra, - PetscInt num_comp_x, PetscInt num_comp_u, - PetscInt g_size, PetscInt xl_size, - BPData bp_data, CeedData data, - PetscBool setup_rhs, CeedVector rhs_ceed, +PetscErrorCode SetupLibceedByDegree(DM dm, Ceed ceed, CeedInt degree, CeedInt topo_dim, CeedInt q_extra, PetscInt num_comp_x, PetscInt num_comp_u, + PetscInt g_size, PetscInt xl_size, BPData bp_data, CeedData data, PetscBool setup_rhs, CeedVector rhs_ceed, CeedVector *target) { - int ierr; - DM dm_coord; - Vec coords; - const PetscScalar *coord_array; - CeedBasis basis_x, basis_u; + DM dm_coord; + Vec coords; + const PetscScalar *coord_array; + CeedBasis basis_x, basis_u; CeedElemRestriction elem_restr_x, elem_restr_u, elem_restr_u_i, elem_restr_qd_i; - CeedQFunction qf_setup_geo, qf_apply; - CeedOperator op_setup_geo, op_apply; - CeedVector x_coord, q_data, x_ceed, y_ceed; - CeedInt num_qpts, c_start, c_end, 
num_elem, - q_data_size = bp_data.q_data_size; - CeedScalar R = 1, // radius of the sphere - l = 1.0/PetscSqrtReal(3.0); // half edge of the inscribed cube + CeedQFunction qf_setup_geo, qf_apply; + CeedOperator op_setup_geo, op_apply; + CeedVector x_coord, q_data, x_ceed, y_ceed; + CeedInt num_qpts, c_start, c_end, num_elem, q_data_size = bp_data.q_data_size; + CeedScalar R = 1; // radius of the sphere + CeedScalar l = 1.0 / PetscSqrtReal(3.0); // half edge of the inscribed cube PetscFunctionBeginUser; - ierr = DMGetCoordinateDM(dm, &dm_coord); CHKERRQ(ierr); + PetscCall(DMGetCoordinateDM(dm, &dm_coord)); // CEED bases - ierr = CreateBasisFromPlex(ceed, dm_coord, 0, 0, 0, 0, bp_data, &basis_x); - CHKERRQ(ierr); - ierr = CreateBasisFromPlex(ceed, dm, 0, 0, 0, 0, bp_data, &basis_u); - CHKERRQ(ierr); + PetscCall(CreateBasisFromPlex(ceed, dm_coord, 0, 0, 0, 0, bp_data, &basis_x)); + PetscCall(CreateBasisFromPlex(ceed, dm, 0, 0, 0, 0, bp_data, &basis_u)); // CEED restrictions - ierr = CreateRestrictionFromPlex(ceed, dm_coord, 0, 0, 0, &elem_restr_x); - CHKERRQ(ierr); - ierr = CreateRestrictionFromPlex(ceed, dm, 0, 0, 0, &elem_restr_u); - CHKERRQ(ierr); + PetscCall(CreateRestrictionFromPlex(ceed, dm_coord, 0, 0, 0, &elem_restr_x)); + PetscCall(CreateRestrictionFromPlex(ceed, dm, 0, 0, 0, &elem_restr_u)); - ierr = DMPlexGetHeightStratum(dm, 0, &c_start, &c_end); CHKERRQ(ierr); + PetscCall(DMPlexGetHeightStratum(dm, 0, &c_start, &c_end)); num_elem = c_end - c_start; CeedBasisGetNumQuadraturePoints(basis_u, &num_qpts); - CeedElemRestrictionCreateStrided(ceed, num_elem, num_qpts, num_comp_u, - num_comp_u*num_elem*num_qpts, - CEED_STRIDES_BACKEND, &elem_restr_u_i); - CeedElemRestrictionCreateStrided(ceed, num_elem, num_qpts, q_data_size, - q_data_size*num_elem*num_qpts, - CEED_STRIDES_BACKEND, &elem_restr_qd_i); + CeedElemRestrictionCreateStrided(ceed, num_elem, num_qpts, num_comp_u, num_comp_u * num_elem * num_qpts, CEED_STRIDES_BACKEND, &elem_restr_u_i); + 
CeedElemRestrictionCreateStrided(ceed, num_elem, num_qpts, q_data_size, q_data_size * num_elem * num_qpts, CEED_STRIDES_BACKEND, &elem_restr_qd_i); // Element coordinates - ierr = DMGetCoordinatesLocal(dm, &coords); CHKERRQ(ierr); - ierr = VecGetArrayRead(coords, &coord_array); CHKERRQ(ierr); + PetscCall(DMGetCoordinatesLocal(dm, &coords)); + PetscCall(VecGetArrayRead(coords, &coord_array)); CeedElemRestrictionCreateVector(elem_restr_x, &x_coord, NULL); - CeedVectorSetArray(x_coord, CEED_MEM_HOST, CEED_COPY_VALUES, - (PetscScalar *)coord_array); - ierr = VecRestoreArrayRead(coords, &coord_array); + CeedVectorSetArray(x_coord, CEED_MEM_HOST, CEED_COPY_VALUES, (PetscScalar *)coord_array); + PetscCall(VecRestoreArrayRead(coords, &coord_array)); // Create the persistent vectors that will be needed in setup and apply - CeedVectorCreate(ceed, q_data_size*num_elem*num_qpts, &q_data); + CeedVectorCreate(ceed, q_data_size * num_elem * num_qpts, &q_data); CeedVectorCreate(ceed, xl_size, &x_ceed); CeedVectorCreate(ceed, xl_size, &y_ceed); // Create the QFunction that builds the context data - CeedQFunctionCreateInterior(ceed, 1, bp_data.setup_geo, bp_data.setup_geo_loc, - &qf_setup_geo); + CeedQFunctionCreateInterior(ceed, 1, bp_data.setup_geo, bp_data.setup_geo_loc, &qf_setup_geo); CeedQFunctionAddInput(qf_setup_geo, "x", num_comp_x, CEED_EVAL_INTERP); - CeedQFunctionAddInput(qf_setup_geo, "dx", num_comp_x*topo_dim, CEED_EVAL_GRAD); + CeedQFunctionAddInput(qf_setup_geo, "dx", num_comp_x * topo_dim, CEED_EVAL_GRAD); CeedQFunctionAddInput(qf_setup_geo, "weight", 1, CEED_EVAL_WEIGHT); CeedQFunctionAddOutput(qf_setup_geo, "qdata", q_data_size, CEED_EVAL_NONE); // Create the operator that builds the quadrature data CeedOperatorCreate(ceed, qf_setup_geo, NULL, NULL, &op_setup_geo); - CeedOperatorSetField(op_setup_geo, "x", elem_restr_x, basis_x, - CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op_setup_geo, "dx", elem_restr_x, basis_x, - CEED_VECTOR_ACTIVE); - 
CeedOperatorSetField(op_setup_geo, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, - CEED_VECTOR_NONE); - CeedOperatorSetField(op_setup_geo, "qdata", elem_restr_qd_i, - CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup_geo, "x", elem_restr_x, basis_x, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup_geo, "dx", elem_restr_x, basis_x, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup_geo, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup_geo, "qdata", elem_restr_qd_i, CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE); // Setup q_data CeedOperatorApply(op_setup_geo, x_coord, q_data, CEED_REQUEST_IMMEDIATE); // Set up PDE operator - CeedInt in_scale = bp_data.in_mode == CEED_EVAL_GRAD ? topo_dim : 1; + CeedInt in_scale = bp_data.in_mode == CEED_EVAL_GRAD ? topo_dim : 1; CeedInt out_scale = bp_data.out_mode == CEED_EVAL_GRAD ? topo_dim : 1; - CeedQFunctionCreateInterior(ceed, 1, bp_data.apply, bp_data.apply_loc, - &qf_apply); - CeedQFunctionAddInput(qf_apply, "u", num_comp_u*in_scale, bp_data.in_mode); + CeedQFunctionCreateInterior(ceed, 1, bp_data.apply, bp_data.apply_loc, &qf_apply); + CeedQFunctionAddInput(qf_apply, "u", num_comp_u * in_scale, bp_data.in_mode); CeedQFunctionAddInput(qf_apply, "qdata", q_data_size, CEED_EVAL_NONE); - CeedQFunctionAddOutput(qf_apply, "v", num_comp_u*out_scale, bp_data.out_mode); + CeedQFunctionAddOutput(qf_apply, "v", num_comp_u * out_scale, bp_data.out_mode); // Create the mass or diff operator CeedOperatorCreate(ceed, qf_apply, NULL, NULL, &op_apply); CeedOperatorSetField(op_apply, "u", elem_restr_u, basis_u, CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op_apply, "qdata", elem_restr_qd_i, CEED_BASIS_COLLOCATED, - q_data); + CeedOperatorSetField(op_apply, "qdata", elem_restr_qd_i, CEED_BASIS_COLLOCATED, q_data); CeedOperatorSetField(op_apply, "v", elem_restr_u, basis_u, CEED_VECTOR_ACTIVE); // Set up RHS if needed if (setup_rhs) { CeedQFunction qf_setup_rhs; - 
CeedOperator op_setup_rhs; - CeedVectorCreate(ceed, num_elem*num_qpts*num_comp_u, target); + CeedOperator op_setup_rhs; + CeedVectorCreate(ceed, num_elem * num_qpts * num_comp_u, target); // Create the q-function that sets up the RHS and true solution - CeedQFunctionCreateInterior(ceed, 1, bp_data.setup_rhs, bp_data.setup_rhs_loc, - &qf_setup_rhs); + CeedQFunctionCreateInterior(ceed, 1, bp_data.setup_rhs, bp_data.setup_rhs_loc, &qf_setup_rhs); CeedQFunctionAddInput(qf_setup_rhs, "x", num_comp_x, CEED_EVAL_INTERP); CeedQFunctionAddInput(qf_setup_rhs, "qdata", q_data_size, CEED_EVAL_NONE); - CeedQFunctionAddOutput(qf_setup_rhs, "true solution", num_comp_u, - CEED_EVAL_NONE); + CeedQFunctionAddOutput(qf_setup_rhs, "true solution", num_comp_u, CEED_EVAL_NONE); CeedQFunctionAddOutput(qf_setup_rhs, "rhs", num_comp_u, CEED_EVAL_INTERP); // Create the operator that builds the RHS and true solution CeedOperatorCreate(ceed, qf_setup_rhs, NULL, NULL, &op_setup_rhs); - CeedOperatorSetField(op_setup_rhs, "x", elem_restr_x, basis_x, - CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op_setup_rhs, "qdata", elem_restr_qd_i, - CEED_BASIS_COLLOCATED, q_data); - CeedOperatorSetField(op_setup_rhs, "true solution", elem_restr_u_i, - CEED_BASIS_COLLOCATED, *target); - CeedOperatorSetField(op_setup_rhs, "rhs", elem_restr_u, basis_u, - CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup_rhs, "x", elem_restr_x, basis_x, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup_rhs, "qdata", elem_restr_qd_i, CEED_BASIS_COLLOCATED, q_data); + CeedOperatorSetField(op_setup_rhs, "true solution", elem_restr_u_i, CEED_BASIS_COLLOCATED, *target); + CeedOperatorSetField(op_setup_rhs, "rhs", elem_restr_u, basis_u, CEED_VECTOR_ACTIVE); // Set up the libCEED context CeedQFunctionContext ctx_rhs_setup; CeedQFunctionContextCreate(ceed, &ctx_rhs_setup); CeedScalar rhs_setup_data[2] = {R, l}; - CeedQFunctionContextSetData(ctx_rhs_setup, CEED_MEM_HOST, CEED_COPY_VALUES, - sizeof rhs_setup_data, &rhs_setup_data); + 
CeedQFunctionContextSetData(ctx_rhs_setup, CEED_MEM_HOST, CEED_COPY_VALUES, sizeof rhs_setup_data, &rhs_setup_data); CeedQFunctionSetContext(qf_setup_rhs, ctx_rhs_setup); CeedQFunctionContextDestroy(&ctx_rhs_setup); @@ -179,17 +151,18 @@ PetscErrorCode SetupLibceedByDegree(DM dm, Ceed ceed, CeedInt degree, CeedVectorDestroy(&x_coord); // Save libCEED data required for level - data->basis_x = basis_x; data->basis_u = basis_u; - data->elem_restr_x = elem_restr_x; - data->elem_restr_u = elem_restr_u; - data->elem_restr_u_i = elem_restr_u_i; + data->basis_x = basis_x; + data->basis_u = basis_u; + data->elem_restr_x = elem_restr_x; + data->elem_restr_u = elem_restr_u; + data->elem_restr_u_i = elem_restr_u_i; data->elem_restr_qd_i = elem_restr_qd_i; - data->qf_apply = qf_apply; - data->op_apply = op_apply; - data->q_data = q_data; - data->x_ceed = x_ceed; - data->y_ceed = y_ceed; - data->q_data_size = q_data_size; + data->qf_apply = qf_apply; + data->op_apply = op_apply; + data->q_data = q_data; + data->x_ceed = x_ceed; + data->y_ceed = y_ceed; + data->q_data_size = q_data_size; PetscFunctionReturn(0); }; @@ -197,11 +170,7 @@ PetscErrorCode SetupLibceedByDegree(DM dm, Ceed ceed, CeedInt degree, // ----------------------------------------------------------------------------- // Setup libCEED level transfer operator objects // ----------------------------------------------------------------------------- -PetscErrorCode CeedLevelTransferSetup(DM dm, Ceed ceed, CeedInt level, - CeedInt num_comp_u, CeedData *data, - BPData bp_data, Vec fine_mult) { - int ierr; - +PetscErrorCode CeedLevelTransferSetup(DM dm, Ceed ceed, CeedInt level, CeedInt num_comp_u, CeedData *data, BPData bp_data, Vec fine_mult) { PetscFunctionBeginUser; // Restriction - Fine to corse CeedOperator op_restrict; @@ -211,8 +180,7 @@ PetscErrorCode CeedLevelTransferSetup(DM dm, Ceed ceed, CeedInt level, CeedOperator op_apply; // Basis CeedBasis basis_u; - ierr = CreateBasisFromPlex(ceed, dm, 0, 0, 0, 0, 
bp_data, &basis_u); - CHKERRQ(ierr); + PetscCall(CreateBasisFromPlex(ceed, dm, 0, 0, 0, 0, bp_data, &basis_u)); // --------------------------------------------------------------------------- // Coarse Grid, Prolongation, and Restriction Operators @@ -222,25 +190,21 @@ PetscErrorCode CeedLevelTransferSetup(DM dm, Ceed ceed, CeedInt level, // --------------------------------------------------------------------------- // Place in libCEED array const PetscScalar *m; - PetscMemType m_mem_type; - ierr = VecGetArrayReadAndMemType(fine_mult, &m, &m_mem_type); - CHKERRQ(ierr); - CeedVectorSetArray(data[level]->x_ceed, MemTypeP2C(m_mem_type), - CEED_USE_POINTER, (CeedScalar *)m); + PetscMemType m_mem_type; + PetscCall(VecGetArrayReadAndMemType(fine_mult, &m, &m_mem_type)); + CeedVectorSetArray(data[level]->x_ceed, MemTypeP2C(m_mem_type), CEED_USE_POINTER, (CeedScalar *)m); - CeedOperatorMultigridLevelCreate(data[level]->op_apply, data[level]->x_ceed, - data[level-1]->elem_restr_u, basis_u, - &op_apply, &op_prolong, &op_restrict); + CeedOperatorMultigridLevelCreate(data[level]->op_apply, data[level]->x_ceed, data[level - 1]->elem_restr_u, basis_u, &op_apply, &op_prolong, + &op_restrict); // Restore PETSc vector - CeedVectorTakeArray(data[level]->x_ceed, MemTypeP2C(m_mem_type), - (CeedScalar **)&m); - ierr = VecRestoreArrayReadAndMemType(fine_mult, &m); CHKERRQ(ierr); - ierr = VecZeroEntries(fine_mult); CHKERRQ(ierr); + CeedVectorTakeArray(data[level]->x_ceed, MemTypeP2C(m_mem_type), (CeedScalar **)&m); + PetscCall(VecRestoreArrayReadAndMemType(fine_mult, &m)); + PetscCall(VecZeroEntries(fine_mult)); // -- Save libCEED data - data[level-1]->op_apply = op_apply; - data[level]->op_prolong = op_prolong; - data[level]->op_restrict = op_restrict; + data[level - 1]->op_apply = op_apply; + data[level]->op_prolong = op_prolong; + data[level]->op_restrict = op_restrict; CeedBasisDestroy(&basis_u); PetscFunctionReturn(0); diff --git a/examples/petsc/src/matops.c 
b/examples/petsc/src/matops.c index 8c21d81a52..c4159334e1 100644 --- a/examples/petsc/src/matops.c +++ b/examples/petsc/src/matops.c @@ -1,43 +1,41 @@ #include "../include/matops.h" + #include "../include/petscutils.h" // ----------------------------------------------------------------------------- // Setup apply operator context data // ----------------------------------------------------------------------------- -PetscErrorCode SetupApplyOperatorCtx(MPI_Comm comm, DM dm, Ceed ceed, - CeedData ceed_data, Vec X_loc, - OperatorApplyContext op_apply_ctx) { - PetscErrorCode ierr; +PetscErrorCode SetupApplyOperatorCtx(MPI_Comm comm, DM dm, Ceed ceed, CeedData ceed_data, Vec X_loc, OperatorApplyContext op_apply_ctx) { PetscFunctionBeginUser; - op_apply_ctx->comm = comm; - op_apply_ctx->dm = dm; + op_apply_ctx->comm = comm; + op_apply_ctx->dm = dm; op_apply_ctx->X_loc = X_loc; - ierr = VecDuplicate(X_loc, &op_apply_ctx->Y_loc); CHKERRQ(ierr); + PetscCall(VecDuplicate(X_loc, &op_apply_ctx->Y_loc)); op_apply_ctx->x_ceed = ceed_data->x_ceed; op_apply_ctx->y_ceed = ceed_data->y_ceed; - op_apply_ctx->op = ceed_data->op_apply; - op_apply_ctx->ceed = ceed; + op_apply_ctx->op = ceed_data->op_apply; + op_apply_ctx->ceed = ceed; + PetscFunctionReturn(0); } // ----------------------------------------------------------------------------- // Setup error operator context data // ----------------------------------------------------------------------------- -PetscErrorCode SetupErrorOperatorCtx(MPI_Comm comm, DM dm, Ceed ceed, - CeedData ceed_data, Vec X_loc, CeedOperator op_error, +PetscErrorCode SetupErrorOperatorCtx(MPI_Comm comm, DM dm, Ceed ceed, CeedData ceed_data, Vec X_loc, CeedOperator op_error, OperatorApplyContext op_error_ctx) { - PetscErrorCode ierr; PetscFunctionBeginUser; - op_error_ctx->comm = comm; - op_error_ctx->dm = dm; + op_error_ctx->comm = comm; + op_error_ctx->dm = dm; op_error_ctx->X_loc = X_loc; - ierr = VecDuplicate(X_loc, &op_error_ctx->Y_loc); 
CHKERRQ(ierr); + PetscCall(VecDuplicate(X_loc, &op_error_ctx->Y_loc)); op_error_ctx->x_ceed = ceed_data->x_ceed; op_error_ctx->y_ceed = ceed_data->y_ceed; - op_error_ctx->op = op_error; - op_error_ctx->ceed = ceed; + op_error_ctx->op = op_error; + op_error_ctx->ceed = ceed; + PetscFunctionReturn(0); } @@ -45,32 +43,28 @@ PetscErrorCode SetupErrorOperatorCtx(MPI_Comm comm, DM dm, Ceed ceed, // This function returns the computed diagonal of the operator // ----------------------------------------------------------------------------- PetscErrorCode MatGetDiag(Mat A, Vec D) { - PetscErrorCode ierr; OperatorApplyContext op_apply_ctx; PetscFunctionBeginUser; - ierr = MatShellGetContext(A, &op_apply_ctx); CHKERRQ(ierr); + PetscCall(MatShellGetContext(A, &op_apply_ctx)); // Compute Diagonal via libCEED PetscScalar *y; PetscMemType mem_type; // -- Place PETSc vector in libCEED vector - ierr = VecGetArrayAndMemType(op_apply_ctx->Y_loc, &y, &mem_type); CHKERRQ(ierr); - CeedVectorSetArray(op_apply_ctx->y_ceed, MemTypeP2C(mem_type), - CEED_USE_POINTER, y); + PetscCall(VecGetArrayAndMemType(op_apply_ctx->Y_loc, &y, &mem_type)); + CeedVectorSetArray(op_apply_ctx->y_ceed, MemTypeP2C(mem_type), CEED_USE_POINTER, y); // -- Compute Diagonal - CeedOperatorLinearAssembleDiagonal(op_apply_ctx->op, op_apply_ctx->y_ceed, - CEED_REQUEST_IMMEDIATE); + CeedOperatorLinearAssembleDiagonal(op_apply_ctx->op, op_apply_ctx->y_ceed, CEED_REQUEST_IMMEDIATE); // -- Local-to-Global CeedVectorTakeArray(op_apply_ctx->y_ceed, MemTypeP2C(mem_type), NULL); - ierr = VecRestoreArrayAndMemType(op_apply_ctx->Y_loc, &y); CHKERRQ(ierr); - ierr = VecZeroEntries(D); CHKERRQ(ierr); - ierr = DMLocalToGlobal(op_apply_ctx->dm, op_apply_ctx->Y_loc, ADD_VALUES, D); - CHKERRQ(ierr); + PetscCall(VecRestoreArrayAndMemType(op_apply_ctx->Y_loc, &y)); + PetscCall(VecZeroEntries(D)); + PetscCall(DMLocalToGlobal(op_apply_ctx->dm, op_apply_ctx->Y_loc, ADD_VALUES, D)); PetscFunctionReturn(0); }; @@ -79,44 +73,33 @@ PetscErrorCode 
MatGetDiag(Mat A, Vec D) { // This function uses libCEED to compute the action of the Laplacian with // Dirichlet boundary conditions // ----------------------------------------------------------------------------- -PetscErrorCode ApplyLocal_Ceed(Vec X, Vec Y, - OperatorApplyContext op_apply_ctx) { - PetscErrorCode ierr; +PetscErrorCode ApplyLocal_Ceed(Vec X, Vec Y, OperatorApplyContext op_apply_ctx) { PetscScalar *x, *y; PetscMemType x_mem_type, y_mem_type; PetscFunctionBeginUser; // Global-to-local - ierr = DMGlobalToLocal(op_apply_ctx->dm, X, INSERT_VALUES, op_apply_ctx->X_loc); - CHKERRQ(ierr); + PetscCall(DMGlobalToLocal(op_apply_ctx->dm, X, INSERT_VALUES, op_apply_ctx->X_loc)); // Setup libCEED vectors - ierr = VecGetArrayReadAndMemType(op_apply_ctx->X_loc, (const PetscScalar **)&x, - &x_mem_type); CHKERRQ(ierr); - ierr = VecGetArrayAndMemType(op_apply_ctx->Y_loc, &y, &y_mem_type); - CHKERRQ(ierr); - CeedVectorSetArray(op_apply_ctx->x_ceed, MemTypeP2C(x_mem_type), - CEED_USE_POINTER, x); - CeedVectorSetArray(op_apply_ctx->y_ceed, MemTypeP2C(y_mem_type), - CEED_USE_POINTER, y); + PetscCall(VecGetArrayReadAndMemType(op_apply_ctx->X_loc, (const PetscScalar **)&x, &x_mem_type)); + PetscCall(VecGetArrayAndMemType(op_apply_ctx->Y_loc, &y, &y_mem_type)); + CeedVectorSetArray(op_apply_ctx->x_ceed, MemTypeP2C(x_mem_type), CEED_USE_POINTER, x); + CeedVectorSetArray(op_apply_ctx->y_ceed, MemTypeP2C(y_mem_type), CEED_USE_POINTER, y); // Apply libCEED operator - CeedOperatorApply(op_apply_ctx->op, op_apply_ctx->x_ceed, op_apply_ctx->y_ceed, - CEED_REQUEST_IMMEDIATE); + CeedOperatorApply(op_apply_ctx->op, op_apply_ctx->x_ceed, op_apply_ctx->y_ceed, CEED_REQUEST_IMMEDIATE); // Restore PETSc vectors CeedVectorTakeArray(op_apply_ctx->x_ceed, MemTypeP2C(x_mem_type), NULL); CeedVectorTakeArray(op_apply_ctx->y_ceed, MemTypeP2C(y_mem_type), NULL); - ierr = VecRestoreArrayReadAndMemType(op_apply_ctx->X_loc, - (const PetscScalar **)&x); - CHKERRQ(ierr); - ierr = 
VecRestoreArrayAndMemType(op_apply_ctx->Y_loc, &y); CHKERRQ(ierr); + PetscCall(VecRestoreArrayReadAndMemType(op_apply_ctx->X_loc, (const PetscScalar **)&x)); + PetscCall(VecRestoreArrayAndMemType(op_apply_ctx->Y_loc, &y)); // Local-to-global - ierr = VecZeroEntries(Y); CHKERRQ(ierr); - ierr = DMLocalToGlobal(op_apply_ctx->dm, op_apply_ctx->Y_loc, ADD_VALUES, Y); - CHKERRQ(ierr); + PetscCall(VecZeroEntries(Y)); + PetscCall(DMLocalToGlobal(op_apply_ctx->dm, op_apply_ctx->Y_loc, ADD_VALUES, Y)); PetscFunctionReturn(0); }; @@ -125,15 +108,14 @@ PetscErrorCode ApplyLocal_Ceed(Vec X, Vec Y, // This function wraps the libCEED operator for a MatShell // ----------------------------------------------------------------------------- PetscErrorCode MatMult_Ceed(Mat A, Vec X, Vec Y) { - PetscErrorCode ierr; OperatorApplyContext op_apply_ctx; PetscFunctionBeginUser; - ierr = MatShellGetContext(A, &op_apply_ctx); CHKERRQ(ierr); + PetscCall(MatShellGetContext(A, &op_apply_ctx)); // libCEED for local action of residual evaluator - ierr = ApplyLocal_Ceed(X, Y, op_apply_ctx); CHKERRQ(ierr); + PetscCall(ApplyLocal_Ceed(X, Y, op_apply_ctx)); PetscFunctionReturn(0); }; @@ -142,52 +124,39 @@ PetscErrorCode MatMult_Ceed(Mat A, Vec X, Vec Y) { // This function uses libCEED to compute the action of the prolongation operator // ----------------------------------------------------------------------------- PetscErrorCode MatMult_Prolong(Mat A, Vec X, Vec Y) { - PetscErrorCode ierr; ProlongRestrContext pr_restr_ctx; - PetscScalar *c, *f; - PetscMemType c_mem_type, f_mem_type; + PetscScalar *c, *f; + PetscMemType c_mem_type, f_mem_type; PetscFunctionBeginUser; - ierr = MatShellGetContext(A, &pr_restr_ctx); CHKERRQ(ierr); + PetscCall(MatShellGetContext(A, &pr_restr_ctx)); // Global-to-local - ierr = VecZeroEntries(pr_restr_ctx->loc_vec_c); CHKERRQ(ierr); - ierr = DMGlobalToLocal(pr_restr_ctx->dmc, X, INSERT_VALUES, - pr_restr_ctx->loc_vec_c); - CHKERRQ(ierr); + 
PetscCall(VecZeroEntries(pr_restr_ctx->loc_vec_c)); + PetscCall(DMGlobalToLocal(pr_restr_ctx->dmc, X, INSERT_VALUES, pr_restr_ctx->loc_vec_c)); // Setup libCEED vectors - ierr = VecGetArrayReadAndMemType(pr_restr_ctx->loc_vec_c, - (const PetscScalar **)&c, - &c_mem_type); CHKERRQ(ierr); - ierr = VecGetArrayAndMemType(pr_restr_ctx->loc_vec_f, &f, &f_mem_type); - CHKERRQ(ierr); - CeedVectorSetArray(pr_restr_ctx->ceed_vec_c, MemTypeP2C(c_mem_type), - CEED_USE_POINTER, c); - CeedVectorSetArray(pr_restr_ctx->ceed_vec_f, MemTypeP2C(f_mem_type), - CEED_USE_POINTER, f); + PetscCall(VecGetArrayReadAndMemType(pr_restr_ctx->loc_vec_c, (const PetscScalar **)&c, &c_mem_type)); + PetscCall(VecGetArrayAndMemType(pr_restr_ctx->loc_vec_f, &f, &f_mem_type)); + CeedVectorSetArray(pr_restr_ctx->ceed_vec_c, MemTypeP2C(c_mem_type), CEED_USE_POINTER, c); + CeedVectorSetArray(pr_restr_ctx->ceed_vec_f, MemTypeP2C(f_mem_type), CEED_USE_POINTER, f); // Apply libCEED operator - CeedOperatorApply(pr_restr_ctx->op_prolong, pr_restr_ctx->ceed_vec_c, - pr_restr_ctx->ceed_vec_f, CEED_REQUEST_IMMEDIATE); + CeedOperatorApply(pr_restr_ctx->op_prolong, pr_restr_ctx->ceed_vec_c, pr_restr_ctx->ceed_vec_f, CEED_REQUEST_IMMEDIATE); // Restore PETSc vectors CeedVectorTakeArray(pr_restr_ctx->ceed_vec_c, MemTypeP2C(c_mem_type), NULL); CeedVectorTakeArray(pr_restr_ctx->ceed_vec_f, MemTypeP2C(f_mem_type), NULL); - ierr = VecRestoreArrayReadAndMemType(pr_restr_ctx->loc_vec_c, - (const PetscScalar **)&c); - CHKERRQ(ierr); - ierr = VecRestoreArrayAndMemType(pr_restr_ctx->loc_vec_f, &f); CHKERRQ(ierr); + PetscCall(VecRestoreArrayReadAndMemType(pr_restr_ctx->loc_vec_c, (const PetscScalar **)&c)); + PetscCall(VecRestoreArrayAndMemType(pr_restr_ctx->loc_vec_f, &f)); // Multiplicity - ierr = VecPointwiseMult(pr_restr_ctx->loc_vec_f, pr_restr_ctx->loc_vec_f, - pr_restr_ctx->mult_vec); + PetscCall(VecPointwiseMult(pr_restr_ctx->loc_vec_f, pr_restr_ctx->loc_vec_f, pr_restr_ctx->mult_vec)); // Local-to-global - ierr = 
VecZeroEntries(Y); CHKERRQ(ierr); - ierr = DMLocalToGlobal(pr_restr_ctx->dmf, pr_restr_ctx->loc_vec_f, ADD_VALUES, - Y); CHKERRQ(ierr); + PetscCall(VecZeroEntries(Y)); + PetscCall(DMLocalToGlobal(pr_restr_ctx->dmf, pr_restr_ctx->loc_vec_f, ADD_VALUES, Y)); PetscFunctionReturn(0); }; @@ -196,53 +165,39 @@ PetscErrorCode MatMult_Prolong(Mat A, Vec X, Vec Y) { // This function uses libCEED to compute the action of the restriction operator // ----------------------------------------------------------------------------- PetscErrorCode MatMult_Restrict(Mat A, Vec X, Vec Y) { - PetscErrorCode ierr; ProlongRestrContext pr_restr_ctx; - PetscScalar *c, *f; - PetscMemType c_mem_type, f_mem_type; + PetscScalar *c, *f; + PetscMemType c_mem_type, f_mem_type; PetscFunctionBeginUser; - ierr = MatShellGetContext(A, &pr_restr_ctx); CHKERRQ(ierr); + PetscCall(MatShellGetContext(A, &pr_restr_ctx)); // Global-to-local - ierr = VecZeroEntries(pr_restr_ctx->loc_vec_f); CHKERRQ(ierr); - ierr = DMGlobalToLocal(pr_restr_ctx->dmf, X, INSERT_VALUES, - pr_restr_ctx->loc_vec_f); - CHKERRQ(ierr); + PetscCall(VecZeroEntries(pr_restr_ctx->loc_vec_f)); + PetscCall(DMGlobalToLocal(pr_restr_ctx->dmf, X, INSERT_VALUES, pr_restr_ctx->loc_vec_f)); // Multiplicity - ierr = VecPointwiseMult(pr_restr_ctx->loc_vec_f, pr_restr_ctx->loc_vec_f, - pr_restr_ctx->mult_vec); - CHKERRQ(ierr); + PetscCall(VecPointwiseMult(pr_restr_ctx->loc_vec_f, pr_restr_ctx->loc_vec_f, pr_restr_ctx->mult_vec)); // Setup libCEED vectors - ierr = VecGetArrayReadAndMemType(pr_restr_ctx->loc_vec_f, - (const PetscScalar **)&f, - &f_mem_type); CHKERRQ(ierr); - ierr = VecGetArrayAndMemType(pr_restr_ctx->loc_vec_c, &c, &c_mem_type); - CHKERRQ(ierr); - CeedVectorSetArray(pr_restr_ctx->ceed_vec_f, MemTypeP2C(f_mem_type), - CEED_USE_POINTER, f); - CeedVectorSetArray(pr_restr_ctx->ceed_vec_c, MemTypeP2C(c_mem_type), - CEED_USE_POINTER, c); + PetscCall(VecGetArrayReadAndMemType(pr_restr_ctx->loc_vec_f, (const PetscScalar **)&f, &f_mem_type)); 
+ PetscCall(VecGetArrayAndMemType(pr_restr_ctx->loc_vec_c, &c, &c_mem_type)); + CeedVectorSetArray(pr_restr_ctx->ceed_vec_f, MemTypeP2C(f_mem_type), CEED_USE_POINTER, f); + CeedVectorSetArray(pr_restr_ctx->ceed_vec_c, MemTypeP2C(c_mem_type), CEED_USE_POINTER, c); // Apply CEED operator - CeedOperatorApply(pr_restr_ctx->op_restrict, pr_restr_ctx->ceed_vec_f, - pr_restr_ctx->ceed_vec_c, CEED_REQUEST_IMMEDIATE); + CeedOperatorApply(pr_restr_ctx->op_restrict, pr_restr_ctx->ceed_vec_f, pr_restr_ctx->ceed_vec_c, CEED_REQUEST_IMMEDIATE); // Restore PETSc vectors CeedVectorTakeArray(pr_restr_ctx->ceed_vec_c, MemTypeP2C(c_mem_type), NULL); CeedVectorTakeArray(pr_restr_ctx->ceed_vec_f, MemTypeP2C(f_mem_type), NULL); - ierr = VecRestoreArrayReadAndMemType(pr_restr_ctx->loc_vec_f, - (const PetscScalar **)&f); CHKERRQ(ierr); - ierr = VecRestoreArrayAndMemType(pr_restr_ctx->loc_vec_c, &c); CHKERRQ(ierr); + PetscCall(VecRestoreArrayReadAndMemType(pr_restr_ctx->loc_vec_f, (const PetscScalar **)&f)); + PetscCall(VecRestoreArrayAndMemType(pr_restr_ctx->loc_vec_c, &c)); // Local-to-global - ierr = VecZeroEntries(Y); CHKERRQ(ierr); - ierr = DMLocalToGlobal(pr_restr_ctx->dmc, pr_restr_ctx->loc_vec_c, ADD_VALUES, - Y); - CHKERRQ(ierr); + PetscCall(VecZeroEntries(Y)); + PetscCall(DMLocalToGlobal(pr_restr_ctx->dmc, pr_restr_ctx->loc_vec_c, ADD_VALUES, Y)); PetscFunctionReturn(0); }; @@ -250,13 +205,11 @@ PetscErrorCode MatMult_Restrict(Mat A, Vec X, Vec Y) { // ----------------------------------------------------------------------------- // This function calculates the error in the final solution // ----------------------------------------------------------------------------- -PetscErrorCode ComputeL2Error(Vec X, PetscScalar *l2_error, - OperatorApplyContext op_error_ctx) { - - Vec E; +PetscErrorCode ComputeL2Error(Vec X, PetscScalar *l2_error, OperatorApplyContext op_error_ctx) { + Vec E; PetscFunctionBeginUser; PetscCall(VecDuplicate(X, &E)); - PetscCall(ApplyLocal_Ceed(X, E, 
op_error_ctx) ); + PetscCall(ApplyLocal_Ceed(X, E, op_error_ctx)); PetscScalar error_sq = 1.0; PetscCall(VecSum(E, &error_sq)); *l2_error = sqrt(error_sq); diff --git a/examples/petsc/src/petscutils.c b/examples/petsc/src/petscutils.c index 52f6313fe6..f31d9eeb02 100644 --- a/examples/petsc/src/petscutils.c +++ b/examples/petsc/src/petscutils.c @@ -3,9 +3,7 @@ // ----------------------------------------------------------------------------- // Convert PETSc MemType to libCEED MemType // ----------------------------------------------------------------------------- -CeedMemType MemTypeP2C(PetscMemType mem_type) { - return PetscMemTypeDevice(mem_type) ? CEED_MEM_DEVICE : CEED_MEM_HOST; -} +CeedMemType MemTypeP2C(PetscMemType mem_type) { return PetscMemTypeDevice(mem_type) ? CEED_MEM_DEVICE : CEED_MEM_HOST; } // ----------------------------------------------------------------------------- // Apply 3D Kershaw mesh transformation @@ -15,63 +13,59 @@ CeedMemType MemTypeP2C(PetscMemType mem_type) { static double step(const double a, const double b, double x) { if (x <= 0) return a; if (x >= 1) return b; - return a + (b-a) * (x); + return a + (b - a) * (x); } // 1D transformation at the right boundary -static double right(const double eps, const double x) { - return (x <= 0.5) ? (2-eps) * x : 1 + eps*(x-1); -} +static double right(const double eps, const double x) { return (x <= 0.5) ? 
(2 - eps) * x : 1 + eps * (x - 1); } // 1D transformation at the left boundary -static double left(const double eps, const double x) { - return 1-right(eps,1-x); -} +static double left(const double eps, const double x) { return 1 - right(eps, 1 - x); } // Apply 3D Kershaw mesh transformation // The eps parameters are in (0, 1] // Uniform mesh is recovered for eps=1 PetscErrorCode Kershaw(DM dm_orig, PetscScalar eps) { - PetscErrorCode ierr; - Vec coord; - PetscInt ncoord; + Vec coord; + PetscInt ncoord; PetscScalar *c; PetscFunctionBeginUser; - ierr = DMGetCoordinatesLocal(dm_orig, &coord); CHKERRQ(ierr); - ierr = VecGetLocalSize(coord, &ncoord); CHKERRQ(ierr); - ierr = VecGetArray(coord, &c); CHKERRQ(ierr); + + PetscCall(DMGetCoordinatesLocal(dm_orig, &coord)); + PetscCall(VecGetLocalSize(coord, &ncoord)); + PetscCall(VecGetArray(coord, &c)); for (PetscInt i = 0; i < ncoord; i += 3) { - PetscScalar x = c[i], y = c[i+1], z = c[i+2]; - PetscInt layer = x*6; - PetscScalar lambda = (x-layer/6.0)*6; - c[i] = x; + PetscScalar x = c[i], y = c[i + 1], z = c[i + 2]; + PetscInt layer = x * 6; + PetscScalar lambda = (x - layer / 6.0) * 6; + c[i] = x; switch (layer) { - case 0: - c[i+1] = left(eps, y); - c[i+2] = left(eps, z); - break; - case 1: - case 4: - c[i+1] = step(left(eps, y), right(eps, y), lambda); - c[i+2] = step(left(eps, z), right(eps, z), lambda); - break; - case 2: - c[i+1] = step(right(eps, y), left(eps, y), lambda/2); - c[i+2] = step(right(eps, z), left(eps, z), lambda/2); - break; - case 3: - c[i+1] = step(right(eps, y), left(eps, y), (1+lambda)/2); - c[i+2] = step(right(eps, z), left(eps, z), (1+lambda)/2); - break; - default: - c[i+1] = right(eps, y); - c[i+2] = right(eps, z); + case 0: + c[i + 1] = left(eps, y); + c[i + 2] = left(eps, z); + break; + case 1: + case 4: + c[i + 1] = step(left(eps, y), right(eps, y), lambda); + c[i + 2] = step(left(eps, z), right(eps, z), lambda); + break; + case 2: + c[i + 1] = step(right(eps, y), left(eps, y), lambda / 2); 
+ c[i + 2] = step(right(eps, z), left(eps, z), lambda / 2); + break; + case 3: + c[i + 1] = step(right(eps, y), left(eps, y), (1 + lambda) / 2); + c[i + 2] = step(right(eps, z), left(eps, z), (1 + lambda) / 2); + break; + default: + c[i + 1] = right(eps, y); + c[i + 2] = right(eps, z); } } - ierr = VecRestoreArray(coord, &c); CHKERRQ(ierr); + PetscCall(VecRestoreArray(coord, &c)); PetscFunctionReturn(0); } @@ -79,7 +73,6 @@ PetscErrorCode Kershaw(DM dm_orig, PetscScalar eps) { // Create BC label // ----------------------------------------------------------------------------- static PetscErrorCode CreateBCLabel(DM dm, const char name[]) { - DMLabel label; PetscFunctionBeginUser; @@ -95,34 +88,31 @@ static PetscErrorCode CreateBCLabel(DM dm, const char name[]) { // ----------------------------------------------------------------------------- // This function sets up a DM for a given degree // ----------------------------------------------------------------------------- -PetscErrorCode SetupDMByDegree(DM dm, PetscInt p_degree, PetscInt q_extra, - PetscInt num_comp_u, PetscInt dim, bool enforce_bc) { - PetscInt ierr, marker_ids[1] = {1}; - PetscInt q_degree = p_degree + q_extra; - PetscFE fe; - MPI_Comm comm; - PetscBool is_simplex = PETSC_TRUE; +PetscErrorCode SetupDMByDegree(DM dm, PetscInt p_degree, PetscInt q_extra, PetscInt num_comp_u, PetscInt dim, bool enforce_bc) { + PetscInt marker_ids[1] = {1}; + PetscInt q_degree = p_degree + q_extra; + PetscFE fe; + MPI_Comm comm; + PetscBool is_simplex = PETSC_TRUE; PetscFunctionBeginUser; // Check if simplex or tensor-product mesh - ierr = DMPlexIsSimplex(dm, &is_simplex); CHKERRQ(ierr); + PetscCall(DMPlexIsSimplex(dm, &is_simplex)); // Setup FE - ierr = PetscObjectGetComm((PetscObject)dm, &comm); CHKERRQ(ierr); - ierr = PetscFECreateLagrange(comm, dim, num_comp_u, is_simplex, p_degree, - q_degree, &fe); CHKERRQ(ierr); - ierr = DMAddField(dm, NULL, (PetscObject)fe); CHKERRQ(ierr); - ierr = DMCreateDS(dm); CHKERRQ(ierr); + 
PetscCall(PetscObjectGetComm((PetscObject)dm, &comm)); + PetscCall(PetscFECreateLagrange(comm, dim, num_comp_u, is_simplex, p_degree, q_degree, &fe)); + PetscCall(DMAddField(dm, NULL, (PetscObject)fe)); + PetscCall(DMCreateDS(dm)); { // create FE field for coordinates - PetscFE fe_coords; + PetscFE fe_coords; PetscInt num_comp_coord; - ierr = DMGetCoordinateDim(dm, &num_comp_coord); CHKERRQ(ierr); - ierr = PetscFECreateLagrange(comm, dim, num_comp_coord, is_simplex, 1, q_degree, - &fe_coords); CHKERRQ(ierr); - ierr = DMProjectCoordinates(dm, fe_coords); CHKERRQ(ierr); - ierr = PetscFEDestroy(&fe_coords); CHKERRQ(ierr); + PetscCall(DMGetCoordinateDim(dm, &num_comp_coord)); + PetscCall(PetscFECreateLagrange(comm, dim, num_comp_coord, is_simplex, 1, q_degree, &fe_coords)); + PetscCall(DMProjectCoordinates(dm, fe_coords)); + PetscCall(PetscFEDestroy(&fe_coords)); } // Setup Dirichlet BC @@ -132,25 +122,23 @@ PetscErrorCode SetupDMByDegree(DM dm, PetscInt p_degree, PetscInt q_extra, if (enforce_bc) { PetscBool has_label; DMHasLabel(dm, "marker", &has_label); - if (!has_label) {CreateBCLabel(dm, "marker");} + if (!has_label) { + CreateBCLabel(dm, "marker"); + } DMLabel label; - ierr = DMGetLabel(dm, "marker", &label); CHKERRQ(ierr); - ierr = DMAddBoundary(dm, DM_BC_ESSENTIAL, "wall", label, 1, - marker_ids, 0, 0, NULL, NULL, - NULL, NULL, NULL); CHKERRQ(ierr); + PetscCall(DMGetLabel(dm, "marker", &label)); + PetscCall(DMAddBoundary(dm, DM_BC_ESSENTIAL, "wall", label, 1, marker_ids, 0, 0, NULL, NULL, NULL, NULL, NULL)); PetscCall(DMSetOptionsPrefix(dm, "final_")); PetscCall(DMViewFromOptions(dm, NULL, "-dm_view")); } if (!is_simplex) { DM dm_coord; - ierr = DMGetCoordinateDM(dm, &dm_coord); CHKERRQ(ierr); - ierr = DMPlexSetClosurePermutationTensor(dm, PETSC_DETERMINE, NULL); - CHKERRQ(ierr); - ierr = DMPlexSetClosurePermutationTensor(dm_coord, PETSC_DETERMINE, NULL); - CHKERRQ(ierr); + PetscCall(DMGetCoordinateDM(dm, &dm_coord)); + 
PetscCall(DMPlexSetClosurePermutationTensor(dm, PETSC_DETERMINE, NULL)); + PetscCall(DMPlexSetClosurePermutationTensor(dm_coord, PETSC_DETERMINE, NULL)); } - ierr = PetscFEDestroy(&fe); CHKERRQ(ierr); + PetscCall(PetscFEDestroy(&fe)); PetscFunctionReturn(0); }; @@ -158,21 +146,15 @@ PetscErrorCode SetupDMByDegree(DM dm, PetscInt p_degree, PetscInt q_extra, // ----------------------------------------------------------------------------- // Get CEED restriction data from DMPlex // ----------------------------------------------------------------------------- -PetscErrorCode CreateRestrictionFromPlex(Ceed ceed, DM dm, CeedInt height, - DMLabel domain_label, CeedInt value, CeedElemRestriction *elem_restr) { +PetscErrorCode CreateRestrictionFromPlex(Ceed ceed, DM dm, CeedInt height, DMLabel domain_label, CeedInt value, CeedElemRestriction *elem_restr) { PetscInt num_elem, elem_size, num_dof, num_comp, *elem_restr_offsets; - PetscErrorCode ierr; PetscFunctionBeginUser; - ierr = DMPlexGetLocalOffsets(dm, domain_label, value, height, 0, &num_elem, - &elem_size, &num_comp, &num_dof, &elem_restr_offsets); - CHKERRQ(ierr); + PetscCall(DMPlexGetLocalOffsets(dm, domain_label, value, height, 0, &num_elem, &elem_size, &num_comp, &num_dof, &elem_restr_offsets)); - CeedElemRestrictionCreate(ceed, num_elem, elem_size, num_comp, - 1, num_dof, CEED_MEM_HOST, CEED_COPY_VALUES, - elem_restr_offsets, elem_restr); - ierr = PetscFree(elem_restr_offsets); CHKERRQ(ierr); + CeedElemRestrictionCreate(ceed, num_elem, elem_size, num_comp, 1, num_dof, CEED_MEM_HOST, CEED_COPY_VALUES, elem_restr_offsets, elem_restr); + PetscCall(PetscFree(elem_restr_offsets)); PetscFunctionReturn(0); }; @@ -182,19 +164,23 @@ PetscErrorCode CreateRestrictionFromPlex(Ceed ceed, DM dm, CeedInt height, // ----------------------------------------------------------------------------- CeedElemTopology ElemTopologyP2C(DMPolytopeType cell_type) { switch (cell_type) { - case DM_POLYTOPE_TRIANGLE: return 
CEED_TOPOLOGY_TRIANGLE; - case DM_POLYTOPE_QUADRILATERAL: return CEED_TOPOLOGY_QUAD; - case DM_POLYTOPE_TETRAHEDRON: return CEED_TOPOLOGY_TET; - case DM_POLYTOPE_HEXAHEDRON: return CEED_TOPOLOGY_HEX; - default: return 0; + case DM_POLYTOPE_TRIANGLE: + return CEED_TOPOLOGY_TRIANGLE; + case DM_POLYTOPE_QUADRILATERAL: + return CEED_TOPOLOGY_QUAD; + case DM_POLYTOPE_TETRAHEDRON: + return CEED_TOPOLOGY_TET; + case DM_POLYTOPE_HEXAHEDRON: + return CEED_TOPOLOGY_HEX; + default: + return 0; } } // ----------------------------------------------------------------------------- // Convert DM field to DS field // ----------------------------------------------------------------------------- -PetscErrorCode DMFieldToDSField(DM dm, DMLabel domain_label, PetscInt dm_field, - PetscInt *ds_field) { +PetscErrorCode DMFieldToDSField(DM dm, DMLabel domain_label, PetscInt dm_field, PetscInt *ds_field) { PetscDS ds; IS field_is; const PetscInt *fields; @@ -214,8 +200,7 @@ PetscErrorCode DMFieldToDSField(DM dm, DMLabel domain_label, PetscInt dm_field, } PetscCall(ISRestoreIndices(field_is, &fields)); - if (*ds_field == -1) SETERRQ(PetscObjectComm((PetscObject)dm), PETSC_ERR_SUP, - "Could not find dm_field %" PetscInt_FMT " in DS", dm_field); + if (*ds_field == -1) SETERRQ(PetscObjectComm((PetscObject)dm), PETSC_ERR_SUP, "Could not find dm_field %" PetscInt_FMT " in DS", dm_field); PetscFunctionReturn(0); } @@ -223,11 +208,8 @@ PetscErrorCode DMFieldToDSField(DM dm, DMLabel domain_label, PetscInt dm_field, // ----------------------------------------------------------------------------- // Create libCEED Basis from PetscTabulation // ----------------------------------------------------------------------------- -PetscErrorCode BasisCreateFromTabulation(Ceed ceed, DM dm, DMLabel domain_label, - PetscInt label_value, PetscInt height, PetscInt face, - PetscFE fe, PetscTabulation basis_tabulation, PetscQuadrature quadrature, - CeedBasis *basis) { - +PetscErrorCode BasisCreateFromTabulation(Ceed 
ceed, DM dm, DMLabel domain_label, PetscInt label_value, PetscInt height, PetscInt face, PetscFE fe, + PetscTabulation basis_tabulation, PetscQuadrature quadrature, CeedBasis *basis) { PetscInt first_point; PetscInt ids[1] = {label_value}; DMLabel depth_label; @@ -258,24 +240,20 @@ PetscErrorCode BasisCreateFromTabulation(Ceed ceed, DM dm, DMLabel domain_label, } // Get cell interp, grad, and quadrature data - PetscCall(DMGetFirstLabeledPoint(dm, dm, - domain_label ? domain_label : depth_label, 1, ids, height, &first_point, NULL)); + PetscCall(DMGetFirstLabeledPoint(dm, dm, domain_label ? domain_label : depth_label, 1, ids, height, &first_point, NULL)); PetscCall(DMPlexGetCellType(dm, first_point, &cell_type)); elem_topo = ElemTopologyP2C(cell_type); - if (!elem_topo) SETERRQ(PetscObjectComm((PetscObject)dm), PETSC_ERR_SUP, - "DMPlex topology not supported"); + if (!elem_topo) SETERRQ(PetscObjectComm((PetscObject)dm), PETSC_ERR_SUP, "DMPlex topology not supported"); { size_t q_points_size; const PetscScalar *q_points_petsc; PetscInt q_dim; - PetscCall(PetscQuadratureGetData(quadrature, &q_dim, NULL, &Q, &q_points_petsc, - &q_weights)); + PetscCall(PetscQuadratureGetData(quadrature, &q_dim, NULL, &Q, &q_points_petsc, &q_weights)); q_points_size = Q * dim * sizeof(CeedScalar); PetscCall(PetscCalloc(q_points_size, &q_points)); for (PetscInt q = 0; q < Q; q++) { - for (PetscInt d = 0; d < q_dim; - d++) q_points[q * dim + d] = q_points_petsc[q * q_dim + d]; + for (PetscInt d = 0; d < q_dim; d++) q_points[q * dim + d] = q_points_petsc[q * q_dim + d]; } } @@ -291,8 +269,7 @@ PetscErrorCode BasisCreateFromTabulation(Ceed ceed, DM dm, DMLabel domain_label, // -- Get permutation PetscCall(DMGetLocalSection(dm, &section)); - PetscCall(PetscSectionGetClosurePermutation(section, (PetscObject)dm, dim, - num_comp * P, &permutation)); + PetscCall(PetscSectionGetClosurePermutation(section, (PetscObject)dm, dim, num_comp * P, &permutation)); PetscCall(ISGetIndices(permutation,
&permutation_indices)); } @@ -302,14 +279,11 @@ PetscErrorCode BasisCreateFromTabulation(Ceed ceed, DM dm, DMLabel domain_label, const CeedInt c = 0; for (CeedInt q = 0; q < Q; q++) { for (CeedInt p_ceed = 0; p_ceed < P; p_ceed++) { - CeedInt p_petsc = is_simplex ? (p_ceed * num_comp) : permutation_indices[p_ceed - * num_comp]; + CeedInt p_petsc = is_simplex ? (p_ceed * num_comp) : permutation_indices[p_ceed * num_comp]; - interp[q * P + p_ceed] = basis_tabulation->T[0][((face * Q + q) * P * num_comp + - p_petsc) * num_comp + c]; + interp[q * P + p_ceed] = basis_tabulation->T[0][((face * Q + q) * P * num_comp + p_petsc) * num_comp + c]; for (CeedInt d = 0; d < dim; d++) { - grad[(d * Q + q) * P + p_ceed] = basis_tabulation->T[1][((( - face * Q + q) * P * num_comp + p_petsc) * num_comp + c) * dim + d]; + grad[(d * Q + q) * P + p_ceed] = basis_tabulation->T[1][(((face * Q + q) * P * num_comp + p_petsc) * num_comp + c) * dim + d]; } } } @@ -320,8 +294,7 @@ PetscErrorCode BasisCreateFromTabulation(Ceed ceed, DM dm, DMLabel domain_label, } // Finally, create libCEED basis - CeedBasisCreateH1(ceed, elem_topo, num_comp, P, Q, interp, grad, q_points, - q_weights, basis); + CeedBasisCreateH1(ceed, elem_topo, num_comp, P, Q, interp, grad, q_points, q_weights, basis); PetscCall(PetscFree(q_points)); PetscCall(PetscFree(interp)); PetscCall(PetscFree(grad)); @@ -332,9 +305,8 @@ PetscErrorCode BasisCreateFromTabulation(Ceed ceed, DM dm, DMLabel domain_label, // ----------------------------------------------------------------------------- // Get CEED Basis from DMPlex // ----------------------------------------------------------------------------- -PetscErrorCode CreateBasisFromPlex(Ceed ceed, DM dm, DMLabel domain_label, - CeedInt label_value, CeedInt height, - CeedInt dm_field, BPData bp_data, CeedBasis *basis) { +PetscErrorCode CreateBasisFromPlex(Ceed ceed, DM dm, DMLabel domain_label, CeedInt label_value, CeedInt height, CeedInt dm_field, BPData bp_data, + CeedBasis *basis) 
{ PetscDS ds; PetscFE fe; PetscQuadrature quadrature; @@ -359,8 +331,7 @@ PetscErrorCode CreateBasisFromPlex(Ceed ceed, DM dm, DMLabel domain_label, PetscInt num_derivatives = 1, face = 0; PetscCall(PetscFEGetCellTabulation(fe, num_derivatives, &basis_tabulation)); - PetscCall(BasisCreateFromTabulation(ceed, dm, domain_label, label_value, height, - face, fe, basis_tabulation, quadrature, basis)); + PetscCall(BasisCreateFromTabulation(ceed, dm, domain_label, label_value, height, face, fe, basis_tabulation, quadrature, basis)); } else { PetscDualSpace dual_space; PetscInt num_dual_basis_vectors; @@ -376,8 +347,7 @@ PetscErrorCode CreateBasisFromPlex(Ceed ceed, DM dm, DMLabel domain_label, CeedInt P_1d = (CeedInt)round(pow(P, 1.0 / dim)); CeedInt Q_1d = (CeedInt)round(pow(Q, 1.0 / dim)); - CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp, P_1d, Q_1d, - bp_data.q_mode, basis); + CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp, P_1d, Q_1d, bp_data.q_mode, basis); } PetscFunctionReturn(0); @@ -389,55 +359,42 @@ PetscErrorCode CreateBasisFromPlex(Ceed ceed, DM dm, DMLabel domain_label, // Utility function, compute three factors of an integer static void Split3(PetscInt size, PetscInt m[3], bool reverse) { - for (PetscInt d=0, size_left=size; d<3; d++) { - PetscInt try = (PetscInt)PetscCeilReal(PetscPowReal(size_left, 1./(3 - d))); + for (PetscInt d = 0, size_left = size; d < 3; d++) { + PetscInt try = (PetscInt)PetscCeilReal(PetscPowReal(size_left, 1. / (3 - d))); while (try * (size_left / try) != size_left) try++; - m[reverse ? 2-d : d] = try; + m[reverse ? 
2 - d : d] = try; size_left /= try; } } -static int Max3(const PetscInt a[3]) { - return PetscMax(a[0], PetscMax(a[1], a[2])); -} +static int Max3(const PetscInt a[3]) { return PetscMax(a[0], PetscMax(a[1], a[2])); } -static int Min3(const PetscInt a[3]) { - return PetscMin(a[0], PetscMin(a[1], a[2])); -} +static int Min3(const PetscInt a[3]) { return PetscMin(a[0], PetscMin(a[1], a[2])); } // ----------------------------------------------------------------------------- // Create distribute dm // ----------------------------------------------------------------------------- PetscErrorCode CreateDistributedDM(RunParams rp, DM *dm) { - PetscErrorCode ierr; - PetscFunctionBeginUser; // Setup DM if (rp->read_mesh) { - ierr = DMPlexCreateFromFile(PETSC_COMM_WORLD, rp->filename, NULL, PETSC_TRUE, - dm); - CHKERRQ(ierr); + PetscCall(DMPlexCreateFromFile(PETSC_COMM_WORLD, rp->filename, NULL, PETSC_TRUE, dm)); } else { if (rp->user_l_nodes) { // Find a nicely composite number of elements no less than global nodes PetscMPIInt size; - ierr = MPI_Comm_size(rp->comm, &size); CHKERRQ(ierr); - for (PetscInt g_elem = - PetscMax(1, size * rp->local_nodes / PetscPowInt(rp->degree, rp->dim)); - ; - g_elem++) { + PetscCall(MPI_Comm_size(rp->comm, &size)); + for (PetscInt g_elem = PetscMax(1, size * rp->local_nodes / PetscPowInt(rp->degree, rp->dim));; g_elem++) { Split3(g_elem, rp->mesh_elem, true); if (Max3(rp->mesh_elem) / Min3(rp->mesh_elem) <= 2) break; } } - ierr = DMPlexCreateBoxMesh(PETSC_COMM_WORLD, rp->dim, rp->simplex, - rp->mesh_elem, - NULL, NULL, NULL, PETSC_TRUE, dm); CHKERRQ(ierr); + PetscCall(DMPlexCreateBoxMesh(PETSC_COMM_WORLD, rp->dim, rp->simplex, rp->mesh_elem, NULL, NULL, NULL, PETSC_TRUE, dm)); } - ierr = DMSetFromOptions(*dm); CHKERRQ(ierr); - ierr = DMViewFromOptions(*dm, NULL, "-dm_view"); CHKERRQ(ierr); + PetscCall(DMSetFromOptions(*dm)); + PetscCall(DMViewFromOptions(*dm, NULL, "-dm_view")); PetscFunctionReturn(0); } diff --git a/examples/solids/elasticity.c 
b/examples/solids/elasticity.c index d330858a8b..7bec7cf7bc 100644 --- a/examples/solids/elasticity.c +++ b/examples/solids/elasticity.c @@ -19,8 +19,10 @@ // Sample runs: // // ./elasticity -mesh [.exo file] -degree 2 -E 1 -nu 0.3 -problem Linear -forcing mms -// ./elasticity -mesh [.exo file] -degree 2 -E 1 -nu 0.3 -bc_clamp 998,999 -bc_clamp_998_translate 0.1,0.2,0.3 -problem SS-NH -forcing none -ceed /cpu/self -// ./elasticity -mesh [.exo file] -degree 2 -E 1 -nu 0.3 -bc_clamp 998,999 -bc_clamp_998_rotate 1,0,0,0.2 -problem FSInitial-NH1 -forcing none -ceed /gpu/cuda +// ./elasticity -mesh [.exo file] -degree 2 -E 1 -nu 0.3 -bc_clamp 998,999 -bc_clamp_998_translate 0.1,0.2,0.3 -problem SS-NH -forcing none -ceed +// /cpu/self +// ./elasticity -mesh [.exo file] -degree 2 -E 1 -nu 0.3 -bc_clamp 998,999 -bc_clamp_998_rotate 1,0,0,0.2 -problem FSInitial-NH1 -forcing none +// -ceed /gpu/cuda // // Sample meshes can be found at https://github.com/jeremylt/ceedSampleMeshes // @@ -36,42 +38,39 @@ const char help[] = "Solve solid Problems with CEED and PETSc DMPlex\n"; #include "elasticity.h" int main(int argc, char **argv) { - PetscInt ierr; - MPI_Comm comm; + MPI_Comm comm; // Context structs - AppCtx app_ctx; // Contains problem options - ProblemFunctions problem_functions; // Setup functions for each problem - Units units; // Contains units scaling + AppCtx app_ctx; // Contains problem options + ProblemFunctions problem_functions; // Setup functions for each problem + Units units; // Contains units scaling // PETSc objects - PetscLogStage stage_dm_setup, stage_libceed_setup, - stage_snes_setup, stage_snes_solve; - DM dm_orig; // Distributed DM to clone - DM dm_energy, dm_diagnostic; // DMs for postprocessing - DM *level_dms; - Vec U, *U_g, *U_loc; // U: solution, R: residual, F: forcing - Vec R, R_loc, F, F_loc; // g: global, loc: local - Vec neumann_bcs = NULL, bcs_loc = NULL; - SNES snes; - Mat *jacob_mat, jacob_mat_coarse, *prolong_restr_mat; + PetscLogStage 
stage_dm_setup, stage_libceed_setup, stage_snes_setup, stage_snes_solve; + DM dm_orig; // Distributed DM to clone + DM dm_energy, dm_diagnostic; // DMs for postprocessing + DM *level_dms; + Vec U, *U_g, *U_loc; // U: solution, R: residual, F: forcing + Vec R, R_loc, F, F_loc; // g: global, loc: local + Vec neumann_bcs = NULL, bcs_loc = NULL; + SNES snes; + Mat *jacob_mat, jacob_mat_coarse, *prolong_restr_mat; // PETSc data - UserMult res_ctx, jacob_coarse_ctx = NULL, *jacob_ctx; - FormJacobCtx form_jacob_ctx; + UserMult res_ctx, jacob_coarse_ctx = NULL, *jacob_ctx; + FormJacobCtx form_jacob_ctx; UserMultProlongRestr *prolong_restr_ctx; - PCMGCycleType pcmg_cycle_type = PC_MG_CYCLE_V; + PCMGCycleType pcmg_cycle_type = PC_MG_CYCLE_V; // libCEED objects - Ceed ceed; - CeedData *ceed_data; + Ceed ceed; + CeedData *ceed_data; CeedQFunctionContext ctx_phys, ctx_phys_smoother = NULL; // Parameters - PetscInt num_comp_u = 3; // 3 DoFs in 3D - PetscInt num_comp_e = 1, num_comp_d = 5; // 1 energy output, 5 diagnostic - PetscInt num_levels = 1, fine_level = 0; - PetscInt *U_g_size, *U_l_size, *U_loc_size; - PetscInt snes_its = 0, ksp_its = 0; - double start_time, elapsed_time, min_time, max_time; + PetscInt num_comp_u = 3; // 3 DoFs in 3D + PetscInt num_comp_e = 1, num_comp_d = 5; // 1 energy output, 5 diagnostic + PetscInt num_levels = 1, fine_level = 0; + PetscInt *U_g_size, *U_l_size, *U_loc_size; + PetscInt snes_its = 0, ksp_its = 0; + double start_time, elapsed_time, min_time, max_time; - ierr = PetscInitialize(&argc, &argv, NULL, help); - if (ierr) return ierr; + PetscCall(PetscInitialize(&argc, &argv, NULL, help)); // --------------------------------------------------------------------------- // Process command line options @@ -79,10 +78,10 @@ int main(int argc, char **argv) { comm = PETSC_COMM_WORLD; // -- Set mesh file, polynomial degree, problem type - ierr = PetscCalloc1(1, &app_ctx); CHKERRQ(ierr); - ierr = ProcessCommandLineOptions(comm, app_ctx); CHKERRQ(ierr); 
- ierr = PetscCalloc1(1, &problem_functions); CHKERRQ(ierr); - ierr = RegisterProblems(problem_functions); CHKERRQ(ierr); + PetscCall(PetscCalloc1(1, &app_ctx)); + PetscCall(ProcessCommandLineOptions(comm, app_ctx)); + PetscCall(PetscCalloc1(1, &problem_functions)); + PetscCall(RegisterProblems(problem_functions)); num_levels = app_ctx->num_levels; fine_level = num_levels - 1; @@ -98,256 +97,206 @@ int main(int argc, char **argv) { // Setup physics context and wrap in libCEED object { PetscErrorCode (*SetupPhysics)(MPI_Comm, Ceed, Units *, CeedQFunctionContext *); - ierr = PetscFunctionListFind(problem_functions->setupPhysics, app_ctx->name, - &SetupPhysics); CHKERRQ(ierr); - if (!SetupPhysics) - SETERRQ(PETSC_COMM_SELF, 1, "Physics setup for '%s' not found", - app_ctx->name); - ierr = (*SetupPhysics)(comm, ceed, &units, &ctx_phys); CHKERRQ(ierr); - PetscErrorCode (*SetupSmootherPhysics)(MPI_Comm, Ceed, CeedQFunctionContext, - CeedQFunctionContext *); - ierr = PetscFunctionListFind(problem_functions->setupSmootherPhysics, - app_ctx->name, &SetupSmootherPhysics); - CHKERRQ(ierr); - if (!SetupSmootherPhysics) - SETERRQ(PETSC_COMM_SELF, 1, "Smoother physics setup for '%s' not found", - app_ctx->name); - ierr = (*SetupSmootherPhysics)(comm, ceed, ctx_phys, &ctx_phys_smoother); - CHKERRQ(ierr); + PetscCall(PetscFunctionListFind(problem_functions->setupPhysics, app_ctx->name, &SetupPhysics)); + if (!SetupPhysics) SETERRQ(PETSC_COMM_SELF, 1, "Physics setup for '%s' not found", app_ctx->name); + PetscCall((*SetupPhysics)(comm, ceed, &units, &ctx_phys)); + PetscErrorCode (*SetupSmootherPhysics)(MPI_Comm, Ceed, CeedQFunctionContext, CeedQFunctionContext *); + PetscCall(PetscFunctionListFind(problem_functions->setupSmootherPhysics, app_ctx->name, &SetupSmootherPhysics)); + if (!SetupSmootherPhysics) SETERRQ(PETSC_COMM_SELF, 1, "Smoother physics setup for '%s' not found", app_ctx->name); + PetscCall((*SetupSmootherPhysics)(comm, ceed, ctx_phys, &ctx_phys_smoother)); } // 
--------------------------------------------------------------------------- // Setup DM // --------------------------------------------------------------------------- // Performance logging - ierr = PetscLogStageRegister("DM and Vector Setup Stage", &stage_dm_setup); - CHKERRQ(ierr); - ierr = PetscLogStagePush(stage_dm_setup); CHKERRQ(ierr); + PetscCall(PetscLogStageRegister("DM and Vector Setup Stage", &stage_dm_setup)); + PetscCall(PetscLogStagePush(stage_dm_setup)); // -- Create distributed DM from mesh file - ierr = CreateDistributedDM(comm, app_ctx, &dm_orig); CHKERRQ(ierr); + PetscCall(CreateDistributedDM(comm, app_ctx, &dm_orig)); VecType vectype; switch (mem_type_backend) { - case CEED_MEM_HOST: vectype = VECSTANDARD; break; - case CEED_MEM_DEVICE: { - const char *resolved; - CeedGetResource(ceed, &resolved); - if (strstr(resolved, "/gpu/cuda")) vectype = VECCUDA; - else if (strstr(resolved, "/gpu/hip")) vectype = VECHIP; - else vectype = VECSTANDARD; - } + case CEED_MEM_HOST: + vectype = VECSTANDARD; + break; + case CEED_MEM_DEVICE: { + const char *resolved; + CeedGetResource(ceed, &resolved); + if (strstr(resolved, "/gpu/cuda")) vectype = VECCUDA; + else if (strstr(resolved, "/gpu/hip")) vectype = VECHIP; + else vectype = VECSTANDARD; + } } - ierr = DMSetVecType(dm_orig, vectype); CHKERRQ(ierr); - ierr = DMPlexDistributeSetDefault(dm_orig, PETSC_FALSE); CHKERRQ(ierr); - ierr = DMSetFromOptions(dm_orig); CHKERRQ(ierr); + PetscCall(DMSetVecType(dm_orig, vectype)); + PetscCall(DMPlexDistributeSetDefault(dm_orig, PETSC_FALSE)); + PetscCall(DMSetFromOptions(dm_orig)); // -- Setup DM by polynomial degree - ierr = PetscMalloc1(num_levels, &level_dms); CHKERRQ(ierr); + PetscCall(PetscMalloc1(num_levels, &level_dms)); for (PetscInt level = 0; level < num_levels; level++) { - ierr = DMClone(dm_orig, &level_dms[level]); CHKERRQ(ierr); - ierr = DMGetVecType(dm_orig, &vectype); CHKERRQ(ierr); - ierr = DMSetVecType(level_dms[level], vectype); CHKERRQ(ierr); - ierr = 
SetupDMByDegree(level_dms[level], app_ctx, app_ctx->level_degrees[level], - PETSC_TRUE, num_comp_u); CHKERRQ(ierr); + PetscCall(DMClone(dm_orig, &level_dms[level])); + PetscCall(DMGetVecType(dm_orig, &vectype)); + PetscCall(DMSetVecType(level_dms[level], vectype)); + PetscCall(SetupDMByDegree(level_dms[level], app_ctx, app_ctx->level_degrees[level], PETSC_TRUE, num_comp_u)); // -- Label field components for viewing // Empty name for conserved field (because there is only one field) PetscSection section; - ierr = DMGetLocalSection(level_dms[level], &section); CHKERRQ(ierr); - ierr = PetscSectionSetFieldName(section, 0, "Displacement"); CHKERRQ(ierr); - ierr = PetscSectionSetComponentName(section, 0, 0, "DisplacementX"); - CHKERRQ(ierr); - ierr = PetscSectionSetComponentName(section, 0, 1, "DisplacementY"); - CHKERRQ(ierr); - ierr = PetscSectionSetComponentName(section, 0, 2, "DisplacementZ"); - CHKERRQ(ierr); + PetscCall(DMGetLocalSection(level_dms[level], &section)); + PetscCall(PetscSectionSetFieldName(section, 0, "Displacement")); + PetscCall(PetscSectionSetComponentName(section, 0, 0, "DisplacementX")); + PetscCall(PetscSectionSetComponentName(section, 0, 1, "DisplacementY")); + PetscCall(PetscSectionSetComponentName(section, 0, 2, "DisplacementZ")); } // -- Setup postprocessing DMs - ierr = DMClone(dm_orig, &dm_energy); CHKERRQ(ierr); - ierr = SetupDMByDegree(dm_energy, app_ctx, app_ctx->level_degrees[fine_level], - PETSC_FALSE, num_comp_e); CHKERRQ(ierr); - ierr = DMClone(dm_orig, &dm_diagnostic); CHKERRQ(ierr); - ierr = SetupDMByDegree(dm_diagnostic, app_ctx, - app_ctx->level_degrees[fine_level], - PETSC_FALSE, num_comp_u + num_comp_d); CHKERRQ(ierr); - ierr = DMSetVecType(dm_energy, vectype); CHKERRQ(ierr); - ierr = DMSetVecType(dm_diagnostic, vectype); CHKERRQ(ierr); + PetscCall(DMClone(dm_orig, &dm_energy)); + PetscCall(SetupDMByDegree(dm_energy, app_ctx, app_ctx->level_degrees[fine_level], PETSC_FALSE, num_comp_e)); + PetscCall(DMClone(dm_orig, &dm_diagnostic));
+ PetscCall(SetupDMByDegree(dm_diagnostic, app_ctx, app_ctx->level_degrees[fine_level], PETSC_FALSE, num_comp_u + num_comp_d)); + PetscCall(DMSetVecType(dm_energy, vectype)); + PetscCall(DMSetVecType(dm_diagnostic, vectype)); { // -- Label field components for viewing // Empty name for conserved field (because there is only one field) PetscSection section; - ierr = DMGetLocalSection(dm_diagnostic, &section); CHKERRQ(ierr); - ierr = PetscSectionSetFieldName(section, 0, "Diagnostics"); CHKERRQ(ierr); - ierr = PetscSectionSetComponentName(section, 0, 0, "DisplacementX"); - CHKERRQ(ierr); - ierr = PetscSectionSetComponentName(section, 0, 1, "DisplacementY"); - CHKERRQ(ierr); - ierr = PetscSectionSetComponentName(section, 0, 2, "DisplacementZ"); - CHKERRQ(ierr); - ierr = PetscSectionSetComponentName(section, 0, 3, "Pressure"); - CHKERRQ(ierr); - ierr = PetscSectionSetComponentName(section, 0, 4, "VolumentricStrain"); - CHKERRQ(ierr); - ierr = PetscSectionSetComponentName(section, 0, 5, "TraceE2"); - CHKERRQ(ierr); - ierr = PetscSectionSetComponentName(section, 0, 6, "detJ"); - CHKERRQ(ierr); - ierr = PetscSectionSetComponentName(section, 0, 7, "StrainEnergyDensity"); - CHKERRQ(ierr); + PetscCall(DMGetLocalSection(dm_diagnostic, &section)); + PetscCall(PetscSectionSetFieldName(section, 0, "Diagnostics")); + PetscCall(PetscSectionSetComponentName(section, 0, 0, "DisplacementX")); + PetscCall(PetscSectionSetComponentName(section, 0, 1, "DisplacementY")); + PetscCall(PetscSectionSetComponentName(section, 0, 2, "DisplacementZ")); + PetscCall(PetscSectionSetComponentName(section, 0, 3, "Pressure")); + PetscCall(PetscSectionSetComponentName(section, 0, 4, "VolumentricStrain")); + PetscCall(PetscSectionSetComponentName(section, 0, 5, "TraceE2")); + PetscCall(PetscSectionSetComponentName(section, 0, 6, "detJ")); + PetscCall(PetscSectionSetComponentName(section, 0, 7, "StrainEnergyDensity")); } // --------------------------------------------------------------------------- // Setup
solution and work vectors // --------------------------------------------------------------------------- // Allocate arrays - ierr = PetscMalloc1(num_levels, &U_g); CHKERRQ(ierr); - ierr = PetscMalloc1(num_levels, &U_loc); CHKERRQ(ierr); - ierr = PetscMalloc1(num_levels, &U_g_size); CHKERRQ(ierr); - ierr = PetscMalloc1(num_levels, &U_l_size); CHKERRQ(ierr); - ierr = PetscMalloc1(num_levels, &U_loc_size); CHKERRQ(ierr); + PetscCall(PetscMalloc1(num_levels, &U_g)); + PetscCall(PetscMalloc1(num_levels, &U_loc)); + PetscCall(PetscMalloc1(num_levels, &U_g_size)); + PetscCall(PetscMalloc1(num_levels, &U_l_size)); + PetscCall(PetscMalloc1(num_levels, &U_loc_size)); // -- Setup solution vectors for each level for (PetscInt level = 0; level < num_levels; level++) { // -- Create global unknown vector U - ierr = DMCreateGlobalVector(level_dms[level], &U_g[level]); CHKERRQ(ierr); - ierr = VecGetSize(U_g[level], &U_g_size[level]); CHKERRQ(ierr); + PetscCall(DMCreateGlobalVector(level_dms[level], &U_g[level])); + PetscCall(VecGetSize(U_g[level], &U_g_size[level])); // Note: Local size for matShell - ierr = VecGetLocalSize(U_g[level], &U_l_size[level]); CHKERRQ(ierr); + PetscCall(VecGetLocalSize(U_g[level], &U_l_size[level])); // -- Create local unknown vector U_loc - ierr = DMCreateLocalVector(level_dms[level], &U_loc[level]); CHKERRQ(ierr); + PetscCall(DMCreateLocalVector(level_dms[level], &U_loc[level])); // Note: local size for libCEED - ierr = VecGetSize(U_loc[level], &U_loc_size[level]); CHKERRQ(ierr); + PetscCall(VecGetSize(U_loc[level], &U_loc_size[level])); } // -- Create residual and forcing vectors - ierr = VecDuplicate(U_g[fine_level], &U); CHKERRQ(ierr); - ierr = VecDuplicate(U_g[fine_level], &R); CHKERRQ(ierr); - ierr = VecDuplicate(U_g[fine_level], &F); CHKERRQ(ierr); - ierr = VecDuplicate(U_loc[fine_level], &R_loc); CHKERRQ(ierr); - ierr = VecDuplicate(U_loc[fine_level], &F_loc); CHKERRQ(ierr); + PetscCall(VecDuplicate(U_g[fine_level], &U)); + 
PetscCall(VecDuplicate(U_g[fine_level], &R)); + PetscCall(VecDuplicate(U_g[fine_level], &F)); + PetscCall(VecDuplicate(U_loc[fine_level], &R_loc)); + PetscCall(VecDuplicate(U_loc[fine_level], &F_loc)); // Performance logging - ierr = PetscLogStagePop(); + PetscCall(PetscLogStagePop()); // --------------------------------------------------------------------------- // Set up libCEED // --------------------------------------------------------------------------- // Performance logging - ierr = PetscLogStageRegister("libCEED Setup Stage", &stage_libceed_setup); - CHKERRQ(ierr); - ierr = PetscLogStagePush(stage_libceed_setup); CHKERRQ(ierr); + PetscCall(PetscLogStageRegister("libCEED Setup Stage", &stage_libceed_setup)); + PetscCall(PetscLogStagePush(stage_libceed_setup)); // -- Create libCEED local forcing vector - CeedVector force_ceed; - CeedScalar *f; + CeedVector force_ceed; + CeedScalar *f; PetscMemType force_mem_type; if (app_ctx->forcing_choice != FORCE_NONE) { - ierr = VecGetArrayAndMemType(F_loc, &f, &force_mem_type); CHKERRQ(ierr); + PetscCall(VecGetArrayAndMemType(F_loc, &f, &force_mem_type)); CeedVectorCreate(ceed, U_loc_size[fine_level], &force_ceed); CeedVectorSetArray(force_ceed, MemTypeP2C(force_mem_type), CEED_USE_POINTER, f); } // -- Create libCEED local Neumann BCs vector - CeedVector neumann_ceed; - CeedScalar *n; + CeedVector neumann_ceed; + CeedScalar *n; PetscMemType nummann_mem_type; if (app_ctx->bc_traction_count > 0) { - ierr = VecDuplicate(U, &neumann_bcs); CHKERRQ(ierr); - ierr = VecDuplicate(U_loc[fine_level], &bcs_loc); CHKERRQ(ierr); - ierr = VecGetArrayAndMemType(bcs_loc, &n, &nummann_mem_type); CHKERRQ(ierr); + PetscCall(VecDuplicate(U, &neumann_bcs)); + PetscCall(VecDuplicate(U_loc[fine_level], &bcs_loc)); + PetscCall(VecGetArrayAndMemType(bcs_loc, &n, &nummann_mem_type)); CeedVectorCreate(ceed, U_loc_size[fine_level], &neumann_ceed); - CeedVectorSetArray(neumann_ceed, MemTypeP2C(nummann_mem_type), - CEED_USE_POINTER, n); + 
CeedVectorSetArray(neumann_ceed, MemTypeP2C(nummann_mem_type), CEED_USE_POINTER, n); } // -- Setup libCEED objects - ierr = PetscMalloc1(num_levels, &ceed_data); CHKERRQ(ierr); + PetscCall(PetscMalloc1(num_levels, &ceed_data)); // ---- Setup residual, Jacobian evaluator and geometric information - ierr = PetscCalloc1(1, &ceed_data[fine_level]); CHKERRQ(ierr); + PetscCall(PetscCalloc1(1, &ceed_data[fine_level])); { - PetscErrorCode (*SetupLibceedFineLevel)(DM, DM, DM, Ceed, AppCtx, - CeedQFunctionContext, PetscInt, - PetscInt, PetscInt, PetscInt, - CeedVector, CeedVector, CeedData *); - ierr = PetscFunctionListFind(problem_functions->setupLibceedFineLevel, - app_ctx->name, &SetupLibceedFineLevel); - CHKERRQ(ierr); - if (!SetupLibceedFineLevel) - SETERRQ(PETSC_COMM_SELF, 1, "Fine grid setup for '%s' not found", - app_ctx->name); - ierr = (*SetupLibceedFineLevel)(level_dms[fine_level], dm_energy, dm_diagnostic, - ceed, app_ctx, ctx_phys, fine_level, - num_comp_u, U_g_size[fine_level], - U_loc_size[fine_level], - force_ceed, neumann_ceed, ceed_data); - CHKERRQ(ierr); + PetscErrorCode (*SetupLibceedFineLevel)(DM, DM, DM, Ceed, AppCtx, CeedQFunctionContext, PetscInt, PetscInt, PetscInt, PetscInt, CeedVector, + CeedVector, CeedData *); + PetscCall(PetscFunctionListFind(problem_functions->setupLibceedFineLevel, app_ctx->name, &SetupLibceedFineLevel)); + if (!SetupLibceedFineLevel) SETERRQ(PETSC_COMM_SELF, 1, "Fine grid setup for '%s' not found", app_ctx->name); + PetscCall((*SetupLibceedFineLevel)(level_dms[fine_level], dm_energy, dm_diagnostic, ceed, app_ctx, ctx_phys, fine_level, num_comp_u, + U_g_size[fine_level], U_loc_size[fine_level], force_ceed, neumann_ceed, ceed_data)); } // ---- Setup coarse Jacobian evaluator and prolongation/restriction for (PetscInt level = num_levels - 2; level >= 0; level--) { - ierr = PetscCalloc1(1, &ceed_data[level]); CHKERRQ(ierr); + PetscCall(PetscCalloc1(1, &ceed_data[level])); // Get global communication restriction - ierr = 
VecZeroEntries(U_g[level+1]); CHKERRQ(ierr); - ierr = VecSet(U_loc[level+1], 1.0); CHKERRQ(ierr); - ierr = DMLocalToGlobal(level_dms[level+1], U_loc[level+1], ADD_VALUES, - U_g[level+1]); CHKERRQ(ierr); - ierr = DMGlobalToLocal(level_dms[level+1], U_g[level+1], INSERT_VALUES, - U_loc[level+1]); CHKERRQ(ierr); + PetscCall(VecZeroEntries(U_g[level + 1])); + PetscCall(VecSet(U_loc[level + 1], 1.0)); + PetscCall(DMLocalToGlobal(level_dms[level + 1], U_loc[level + 1], ADD_VALUES, U_g[level + 1])); + PetscCall(DMGlobalToLocal(level_dms[level + 1], U_g[level + 1], INSERT_VALUES, U_loc[level + 1])); // Place in libCEED array const PetscScalar *m; - PetscMemType m_mem_type; - ierr = VecGetArrayReadAndMemType(U_loc[level+1], &m, &m_mem_type); - CHKERRQ(ierr); - CeedVectorSetArray(ceed_data[level+1]->x_ceed, MemTypeP2C(m_mem_type), - CEED_USE_POINTER, (CeedScalar *)m); + PetscMemType m_mem_type; + PetscCall(VecGetArrayReadAndMemType(U_loc[level + 1], &m, &m_mem_type)); + CeedVectorSetArray(ceed_data[level + 1]->x_ceed, MemTypeP2C(m_mem_type), CEED_USE_POINTER, (CeedScalar *)m); // Note: use high order ceed, if specified and degree > 4 - PetscErrorCode (*SetupLibceedLevel)(DM, Ceed, AppCtx, PetscInt, - PetscInt, PetscInt, PetscInt, CeedVector, CeedData *); - ierr = PetscFunctionListFind(problem_functions->setupLibceedLevel, - app_ctx->name, &SetupLibceedLevel); - CHKERRQ(ierr); - if (!SetupLibceedLevel) - SETERRQ(PETSC_COMM_SELF, 1, "Coarse grid setup for '%s' not found", - app_ctx->name); - ierr = (*SetupLibceedLevel)(level_dms[level], ceed, app_ctx, - level, num_comp_u, U_g_size[level], - U_loc_size[level], ceed_data[level+1]->x_ceed, - ceed_data); - CHKERRQ(ierr); + PetscErrorCode (*SetupLibceedLevel)(DM, Ceed, AppCtx, PetscInt, PetscInt, PetscInt, PetscInt, CeedVector, CeedData *); + PetscCall(PetscFunctionListFind(problem_functions->setupLibceedLevel, app_ctx->name, &SetupLibceedLevel)); + if (!SetupLibceedLevel) SETERRQ(PETSC_COMM_SELF, 1, "Coarse grid setup for '%s' not 
found", app_ctx->name); + PetscCall((*SetupLibceedLevel)(level_dms[level], ceed, app_ctx, level, num_comp_u, U_g_size[level], U_loc_size[level], + ceed_data[level + 1]->x_ceed, ceed_data)); // Restore PETSc vector - CeedVectorTakeArray(ceed_data[level+1]->x_ceed, MemTypeP2C(m_mem_type), - (CeedScalar **)&m); - ierr = VecRestoreArrayReadAndMemType(U_loc[level+1], &m); CHKERRQ(ierr); - ierr = VecZeroEntries(U_g[level+1]); CHKERRQ(ierr); - ierr = VecZeroEntries(U_loc[level+1]); CHKERRQ(ierr); + CeedVectorTakeArray(ceed_data[level + 1]->x_ceed, MemTypeP2C(m_mem_type), (CeedScalar **)&m); + PetscCall(VecRestoreArrayReadAndMemType(U_loc[level + 1], &m)); + PetscCall(VecZeroEntries(U_g[level + 1])); + PetscCall(VecZeroEntries(U_loc[level + 1])); } // Performance logging - ierr = PetscLogStagePop(); + PetscCall(PetscLogStagePop()); // --------------------------------------------------------------------------- // Setup global forcing and Neumann BC vectors // --------------------------------------------------------------------------- - ierr = VecZeroEntries(F); CHKERRQ(ierr); + PetscCall(VecZeroEntries(F)); if (app_ctx->forcing_choice != FORCE_NONE) { CeedVectorTakeArray(force_ceed, MemTypeP2C(force_mem_type), NULL); - ierr = VecRestoreArrayAndMemType(F_loc, &f); CHKERRQ(ierr); - ierr = DMLocalToGlobal(level_dms[fine_level], F_loc, ADD_VALUES, F); - CHKERRQ(ierr); + PetscCall(VecRestoreArrayAndMemType(F_loc, &f)); + PetscCall(DMLocalToGlobal(level_dms[fine_level], F_loc, ADD_VALUES, F)); CeedVectorDestroy(&force_ceed); } if (app_ctx->bc_traction_count > 0) { - ierr = VecZeroEntries(neumann_bcs); CHKERRQ(ierr); + PetscCall(VecZeroEntries(neumann_bcs)); CeedVectorTakeArray(neumann_ceed, MemTypeP2C(nummann_mem_type), NULL); - ierr = VecRestoreArrayAndMemType(bcs_loc, &n); CHKERRQ(ierr); - ierr = DMLocalToGlobal(level_dms[fine_level], bcs_loc, ADD_VALUES, neumann_bcs); - CHKERRQ(ierr); + PetscCall(VecRestoreArrayAndMemType(bcs_loc, &n)); + 
PetscCall(DMLocalToGlobal(level_dms[fine_level], bcs_loc, ADD_VALUES, neumann_bcs)); CeedVectorDestroy(&neumann_ceed); } @@ -358,68 +307,58 @@ int main(int argc, char **argv) { const char *usedresource; CeedGetResource(ceed, &usedresource); char hostname[PETSC_MAX_PATH_LEN]; - ierr = PetscGetHostName(hostname, sizeof hostname); CHKERRQ(ierr); + PetscCall(PetscGetHostName(hostname, sizeof hostname)); PetscInt comm_size; - ierr = MPI_Comm_size(comm, &comm_size); CHKERRQ(ierr); - - ierr = PetscPrintf(comm, - "\n-- Elasticity Example - libCEED + PETSc --\n" - " MPI:\n" - " Hostname : %s\n" - " Total ranks : %d\n" - " libCEED:\n" - " libCEED Backend : %s\n" - " libCEED Backend MemType : %s\n", - hostname, comm_size, usedresource, CeedMemTypes[mem_type_backend]); - CHKERRQ(ierr); + PetscCall(MPI_Comm_size(comm, &comm_size)); + + PetscCall(PetscPrintf(comm, + "\n-- Elasticity Example - libCEED + PETSc --\n" + " MPI:\n" + " Hostname : %s\n" + " Total ranks : %d\n" + " libCEED:\n" + " libCEED Backend : %s\n" + " libCEED Backend MemType : %s\n", + hostname, comm_size, usedresource, CeedMemTypes[mem_type_backend])); VecType vecType; - ierr = VecGetType(U, &vecType); CHKERRQ(ierr); - ierr = PetscPrintf(comm, - " PETSc:\n" - " PETSc Vec Type : %s\n", - vecType); CHKERRQ(ierr); - - ierr = PetscPrintf(comm, - " Problem:\n" - " Problem Name : %s\n" - " Forcing Function : %s\n" - " Mesh:\n" - " File : %s\n" - " Number of 1D Basis Nodes (p) : %" CeedInt_FMT "\n" - " Number of 1D Quadrature Points (q) : %" CeedInt_FMT "\n" - " Global nodes : %" PetscInt_FMT "\n" - " Owned nodes : %" PetscInt_FMT "\n" - " DoF per node : %" PetscInt_FMT "\n" - " Multigrid:\n" - " Type : %s\n" - " Number of Levels : %" CeedInt_FMT "\n", - app_ctx->name_for_disp, - forcing_types_for_disp[app_ctx->forcing_choice], - app_ctx->mesh_file[0] ? 
app_ctx->mesh_file : "Box Mesh", - app_ctx->degree + 1, app_ctx->degree + 1, - U_g_size[fine_level]/num_comp_u, U_l_size[fine_level]/num_comp_u, - num_comp_u, - (app_ctx->degree == 1 && - app_ctx->multigrid_choice != MULTIGRID_NONE) ? - "Algebraic multigrid" : - multigrid_types_for_disp[app_ctx->multigrid_choice], - (app_ctx->degree == 1 || - app_ctx->multigrid_choice == MULTIGRID_NONE) ? - 0 : num_levels); CHKERRQ(ierr); + PetscCall(VecGetType(U, &vecType)); + PetscCall(PetscPrintf(comm, + " PETSc:\n" + " PETSc Vec Type : %s\n", + vecType)); + + PetscCall(PetscPrintf(comm, + " Problem:\n" + " Problem Name : %s\n" + " Forcing Function : %s\n" + " Mesh:\n" + " File : %s\n" + " Number of 1D Basis Nodes (p) : %" CeedInt_FMT "\n" + " Number of 1D Quadrature Points (q) : %" CeedInt_FMT "\n" + " Global nodes : %" PetscInt_FMT "\n" + " Owned nodes : %" PetscInt_FMT "\n" + " DoF per node : %" PetscInt_FMT "\n" + " Multigrid:\n" + " Type : %s\n" + " Number of Levels : %" CeedInt_FMT "\n", + app_ctx->name_for_disp, forcing_types_for_disp[app_ctx->forcing_choice], + app_ctx->mesh_file[0] ? app_ctx->mesh_file : "Box Mesh", app_ctx->degree + 1, app_ctx->degree + 1, + U_g_size[fine_level] / num_comp_u, U_l_size[fine_level] / num_comp_u, num_comp_u, + (app_ctx->degree == 1 && app_ctx->multigrid_choice != MULTIGRID_NONE) ? "Algebraic multigrid" + : multigrid_types_for_disp[app_ctx->multigrid_choice], + (app_ctx->degree == 1 || app_ctx->multigrid_choice == MULTIGRID_NONE) ? 0 : num_levels)); if (app_ctx->multigrid_choice != MULTIGRID_NONE) { for (PetscInt i = 0; i < 2; i++) { CeedInt level = i ? fine_level : 0; - ierr = PetscPrintf(comm, - " Level %" PetscInt_FMT " (%s):\n" - " Number of 1D Basis Nodes (p) : %" CeedInt_FMT "\n" - " Global Nodes : %" PetscInt_FMT "\n" - " Owned Nodes : %" PetscInt_FMT "\n", - level, i ? 
"fine" : "coarse", - app_ctx->level_degrees[level] + 1, - U_g_size[level]/num_comp_u, U_l_size[level]/num_comp_u); - CHKERRQ(ierr); + PetscCall(PetscPrintf(comm, + " Level %" PetscInt_FMT " (%s):\n" + " Number of 1D Basis Nodes (p) : %" CeedInt_FMT "\n" + " Global Nodes : %" PetscInt_FMT "\n" + " Owned Nodes : %" PetscInt_FMT "\n", + level, i ? "fine" : "coarse", app_ctx->level_degrees[level] + 1, U_g_size[level] / num_comp_u, + U_l_size[level] / num_comp_u)); } } } @@ -428,77 +367,60 @@ int main(int argc, char **argv) { // Setup SNES // --------------------------------------------------------------------------- // Performance logging - ierr = PetscLogStageRegister("SNES Setup Stage", &stage_snes_setup); - CHKERRQ(ierr); - ierr = PetscLogStagePush(stage_snes_setup); CHKERRQ(ierr); + PetscCall(PetscLogStageRegister("SNES Setup Stage", &stage_snes_setup)); + PetscCall(PetscLogStagePush(stage_snes_setup)); // Create SNES - ierr = SNESCreate(comm, &snes); CHKERRQ(ierr); - ierr = SNESSetDM(snes, level_dms[fine_level]); CHKERRQ(ierr); + PetscCall(SNESCreate(comm, &snes)); + PetscCall(SNESSetDM(snes, level_dms[fine_level])); // -- Jacobian evaluators - ierr = PetscMalloc1(num_levels, &jacob_ctx); CHKERRQ(ierr); - ierr = PetscMalloc1(num_levels, &jacob_mat); CHKERRQ(ierr); + PetscCall(PetscMalloc1(num_levels, &jacob_ctx)); + PetscCall(PetscMalloc1(num_levels, &jacob_mat)); for (PetscInt level = 0; level < num_levels; level++) { // -- Jacobian context for level - ierr = PetscMalloc1(1, &jacob_ctx[level]); CHKERRQ(ierr); - ierr = SetupJacobianCtx(comm, app_ctx, level_dms[level], U_g[level], - U_loc[level], ceed_data[level], ceed, ctx_phys, - ctx_phys_smoother, jacob_ctx[level]); CHKERRQ(ierr); + PetscCall(PetscMalloc1(1, &jacob_ctx[level])); + PetscCall(SetupJacobianCtx(comm, app_ctx, level_dms[level], U_g[level], U_loc[level], ceed_data[level], ceed, ctx_phys, ctx_phys_smoother, + jacob_ctx[level])); // -- Form Action of Jacobian on delta_u - ierr = MatCreateShell(comm, 
U_l_size[level], U_l_size[level], U_g_size[level], - U_g_size[level], jacob_ctx[level], &jacob_mat[level]); - CHKERRQ(ierr); - ierr = MatShellSetOperation(jacob_mat[level], MATOP_MULT, - (void (*)(void))ApplyJacobian_Ceed); - CHKERRQ(ierr); - ierr = MatShellSetOperation(jacob_mat[level], MATOP_GET_DIAGONAL, - (void(*)(void))GetDiag_Ceed); - ierr = MatShellSetVecType(jacob_mat[level], vectype); CHKERRQ(ierr); + PetscCall(MatCreateShell(comm, U_l_size[level], U_l_size[level], U_g_size[level], U_g_size[level], jacob_ctx[level], &jacob_mat[level])); + PetscCall(MatShellSetOperation(jacob_mat[level], MATOP_MULT, (void (*)(void))ApplyJacobian_Ceed)); + PetscCall(MatShellSetOperation(jacob_mat[level], MATOP_GET_DIAGONAL, (void (*)(void))GetDiag_Ceed)); + PetscCall(MatShellSetVecType(jacob_mat[level], vectype)); } // Note: FormJacobian updates Jacobian matrices on each level // and assembles the Jpre matrix, if needed - ierr = PetscMalloc1(1, &form_jacob_ctx); CHKERRQ(ierr); - form_jacob_ctx->jacob_ctx = jacob_ctx; + PetscCall(PetscMalloc1(1, &form_jacob_ctx)); + form_jacob_ctx->jacob_ctx = jacob_ctx; form_jacob_ctx->num_levels = num_levels; - form_jacob_ctx->jacob_mat = jacob_mat; + form_jacob_ctx->jacob_mat = jacob_mat; // -- Residual evaluation function - ierr = PetscCalloc1(1, &res_ctx); CHKERRQ(ierr); - ierr = PetscMemcpy(res_ctx, jacob_ctx[fine_level], - sizeof(*jacob_ctx[fine_level])); CHKERRQ(ierr); + PetscCall(PetscCalloc1(1, &res_ctx)); + PetscCall(PetscMemcpy(res_ctx, jacob_ctx[fine_level], sizeof(*jacob_ctx[fine_level]))); res_ctx->op = ceed_data[fine_level]->op_residual; res_ctx->qf = ceed_data[fine_level]->qf_residual; - if (app_ctx->bc_traction_count > 0) - res_ctx->neumann_bcs = neumann_bcs; - else - res_ctx->neumann_bcs = NULL; - ierr = SNESSetFunction(snes, R, FormResidual_Ceed, res_ctx); CHKERRQ(ierr); + if (app_ctx->bc_traction_count > 0) res_ctx->neumann_bcs = neumann_bcs; + else res_ctx->neumann_bcs = NULL; + PetscCall(SNESSetFunction(snes, R, 
FormResidual_Ceed, res_ctx)); // -- Prolongation/Restriction evaluation - ierr = PetscMalloc1(num_levels, &prolong_restr_ctx); CHKERRQ(ierr); - ierr = PetscMalloc1(num_levels, &prolong_restr_mat); CHKERRQ(ierr); + PetscCall(PetscMalloc1(num_levels, &prolong_restr_ctx)); + PetscCall(PetscMalloc1(num_levels, &prolong_restr_mat)); for (PetscInt level = 1; level < num_levels; level++) { // ---- Prolongation/restriction context for level - ierr = PetscMalloc1(1, &prolong_restr_ctx[level]); CHKERRQ(ierr); - ierr = SetupProlongRestrictCtx(comm, app_ctx, level_dms[level-1], - level_dms[level], U_g[level], U_loc[level-1], - U_loc[level], ceed_data[level-1], - ceed_data[level], ceed, - prolong_restr_ctx[level]); CHKERRQ(ierr); + PetscCall(PetscMalloc1(1, &prolong_restr_ctx[level])); + PetscCall(SetupProlongRestrictCtx(comm, app_ctx, level_dms[level - 1], level_dms[level], U_g[level], U_loc[level - 1], U_loc[level], + ceed_data[level - 1], ceed_data[level], ceed, prolong_restr_ctx[level])); // ---- Form Action of Jacobian on delta_u - ierr = MatCreateShell(comm, U_l_size[level], U_l_size[level-1], U_g_size[level], - U_g_size[level-1], prolong_restr_ctx[level], - &prolong_restr_mat[level]); CHKERRQ(ierr); + PetscCall(MatCreateShell(comm, U_l_size[level], U_l_size[level - 1], U_g_size[level], U_g_size[level - 1], prolong_restr_ctx[level], + &prolong_restr_mat[level])); // Note: In PCMG, restriction is the transpose of prolongation - ierr = MatShellSetOperation(prolong_restr_mat[level], MATOP_MULT, - (void (*)(void))Prolong_Ceed); - ierr = MatShellSetOperation(prolong_restr_mat[level], MATOP_MULT_TRANSPOSE, - (void (*)(void))Restrict_Ceed); - CHKERRQ(ierr); - ierr = MatShellSetVecType(prolong_restr_mat[level], vectype); CHKERRQ(ierr); + PetscCall(MatShellSetOperation(prolong_restr_mat[level], MATOP_MULT, (void (*)(void))Prolong_Ceed)); + PetscCall(MatShellSetOperation(prolong_restr_mat[level], MATOP_MULT_TRANSPOSE, (void (*)(void))Restrict_Ceed)); + 
PetscCall(MatShellSetVecType(prolong_restr_mat[level], vectype)); } // --------------------------------------------------------------------------- @@ -506,183 +428,166 @@ int main(int argc, char **argv) { // --------------------------------------------------------------------------- if (app_ctx->multigrid_choice != MULTIGRID_NONE) { // -- Jacobian Matrix - ierr = DMCreateMatrix(level_dms[0], &jacob_mat_coarse); CHKERRQ(ierr); + PetscCall(DMCreateMatrix(level_dms[0], &jacob_mat_coarse)); if (app_ctx->degree > 1) { // -- Assemble sparsity pattern PetscCount num_entries; - CeedInt *rows, *cols; + CeedInt *rows, *cols; CeedVector coo_values; - CeedOperatorLinearAssembleSymbolic(ceed_data[0]->op_jacobian, &num_entries, - &rows, &cols); + CeedOperatorLinearAssembleSymbolic(ceed_data[0]->op_jacobian, &num_entries, &rows, &cols); ISLocalToGlobalMapping ltog_row, ltog_col; - ierr = MatGetLocalToGlobalMapping(jacob_mat_coarse, <og_row, <og_col); - CHKERRQ(ierr); - ierr = ISLocalToGlobalMappingApply(ltog_row, num_entries, rows, rows); - CHKERRQ(ierr); - ierr = ISLocalToGlobalMappingApply(ltog_col, num_entries, cols, cols); - CHKERRQ(ierr); - ierr = MatSetPreallocationCOO(jacob_mat_coarse, num_entries, rows, cols); - CHKERRQ(ierr); + PetscCall(MatGetLocalToGlobalMapping(jacob_mat_coarse, <og_row, <og_col)); + PetscCall(ISLocalToGlobalMappingApply(ltog_row, num_entries, rows, rows)); + PetscCall(ISLocalToGlobalMappingApply(ltog_col, num_entries, cols, cols)); + PetscCall(MatSetPreallocationCOO(jacob_mat_coarse, num_entries, rows, cols)); free(rows); free(cols); CeedVectorCreate(ceed, num_entries, &coo_values); // -- Update form_jacob_ctx - form_jacob_ctx->coo_values = coo_values; - form_jacob_ctx->op_coarse = ceed_data[0]->op_jacobian; + form_jacob_ctx->coo_values = coo_values; + form_jacob_ctx->op_coarse = ceed_data[0]->op_jacobian; form_jacob_ctx->jacob_mat_coarse = jacob_mat_coarse; } } // Set Jacobian function if (app_ctx->degree > 1) { - ierr = SNESSetJacobian(snes, 
jacob_mat[fine_level], jacob_mat[fine_level], - FormJacobian, form_jacob_ctx); CHKERRQ(ierr); + PetscCall(SNESSetJacobian(snes, jacob_mat[fine_level], jacob_mat[fine_level], FormJacobian, form_jacob_ctx)); } else { - ierr = SNESSetJacobian(snes, jacob_mat[0], jacob_mat_coarse, - SNESComputeJacobianDefaultColor, NULL); - CHKERRQ(ierr); + PetscCall(SNESSetJacobian(snes, jacob_mat[0], jacob_mat_coarse, SNESComputeJacobianDefaultColor, NULL)); } // --------------------------------------------------------------------------- // Setup KSP // --------------------------------------------------------------------------- { - PC pc; + PC pc; KSP ksp; // -- KSP - ierr = SNESGetKSP(snes, &ksp); CHKERRQ(ierr); - ierr = KSPSetType(ksp, KSPCG); CHKERRQ(ierr); - ierr = KSPSetNormType(ksp, KSP_NORM_NATURAL); CHKERRQ(ierr); - ierr = KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT, - PETSC_DEFAULT); CHKERRQ(ierr); - ierr = KSPSetOptionsPrefix(ksp, "outer_"); CHKERRQ(ierr); + PetscCall(SNESGetKSP(snes, &ksp)); + PetscCall(KSPSetType(ksp, KSPCG)); + PetscCall(KSPSetNormType(ksp, KSP_NORM_NATURAL)); + PetscCall(KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT, PETSC_DEFAULT)); + PetscCall(KSPSetOptionsPrefix(ksp, "outer_")); // -- Preconditioning - ierr = KSPGetPC(ksp, &pc); CHKERRQ(ierr); - ierr = PCSetDM(pc, level_dms[fine_level]); CHKERRQ(ierr); - ierr = PCSetOptionsPrefix(pc, "outer_"); CHKERRQ(ierr); + PetscCall(KSPGetPC(ksp, &pc)); + PetscCall(PCSetDM(pc, level_dms[fine_level])); + PetscCall(PCSetOptionsPrefix(pc, "outer_")); if (app_ctx->multigrid_choice == MULTIGRID_NONE) { // ---- No Multigrid - ierr = PCSetType(pc, PCJACOBI); CHKERRQ(ierr); - ierr = PCJacobiSetType(pc, PC_JACOBI_DIAGONAL); CHKERRQ(ierr); + PetscCall(PCSetType(pc, PCJACOBI)); + PetscCall(PCJacobiSetType(pc, PC_JACOBI_DIAGONAL)); } else if (app_ctx->degree == 1) { // ---- AMG for degree 1 - ierr = PCSetType(pc, PCGAMG); CHKERRQ(ierr); + PetscCall(PCSetType(pc, PCGAMG)); } else { // ---- PCMG - 
ierr = PCSetType(pc, PCMG); CHKERRQ(ierr); + PetscCall(PCSetType(pc, PCMG)); // ------ PCMG levels - ierr = PCMGSetLevels(pc, num_levels, NULL); CHKERRQ(ierr); + PetscCall(PCMGSetLevels(pc, num_levels, NULL)); for (PetscInt level = 0; level < num_levels; level++) { // -------- Smoother KSP ksp_smoother, ksp_est; - PC pc_smoother; + PC pc_smoother; // ---------- Smoother KSP - ierr = PCMGGetSmoother(pc, level, &ksp_smoother); CHKERRQ(ierr); - ierr = KSPSetDM(ksp_smoother, level_dms[level]); CHKERRQ(ierr); - ierr = KSPSetDMActive(ksp_smoother, PETSC_FALSE); CHKERRQ(ierr); + PetscCall(PCMGGetSmoother(pc, level, &ksp_smoother)); + PetscCall(KSPSetDM(ksp_smoother, level_dms[level])); + PetscCall(KSPSetDMActive(ksp_smoother, PETSC_FALSE)); // ---------- Chebyshev options - ierr = KSPSetType(ksp_smoother, KSPCHEBYSHEV); CHKERRQ(ierr); - ierr = KSPChebyshevEstEigSet(ksp_smoother, 0, 0.1, 0, 1.1); - CHKERRQ(ierr); - ierr = KSPChebyshevEstEigGetKSP(ksp_smoother, &ksp_est); CHKERRQ(ierr); - ierr = KSPSetType(ksp_est, KSPCG); CHKERRQ(ierr); - ierr = KSPChebyshevEstEigSetUseNoisy(ksp_smoother, PETSC_TRUE); - CHKERRQ(ierr); - ierr = KSPSetOperators(ksp_smoother, jacob_mat[level], jacob_mat[level]); - CHKERRQ(ierr); + PetscCall(KSPSetType(ksp_smoother, KSPCHEBYSHEV)); + PetscCall(KSPChebyshevEstEigSet(ksp_smoother, 0, 0.1, 0, 1.1)); + PetscCall(KSPChebyshevEstEigGetKSP(ksp_smoother, &ksp_est)); + PetscCall(KSPSetType(ksp_est, KSPCG)); + PetscCall(KSPChebyshevEstEigSetUseNoisy(ksp_smoother, PETSC_TRUE)); + PetscCall(KSPSetOperators(ksp_smoother, jacob_mat[level], jacob_mat[level])); // ---------- Smoother preconditioner - ierr = KSPGetPC(ksp_smoother, &pc_smoother); CHKERRQ(ierr); - ierr = PCSetType(pc_smoother, PCJACOBI); CHKERRQ(ierr); - ierr = PCJacobiSetType(pc_smoother, PC_JACOBI_DIAGONAL); CHKERRQ(ierr); + PetscCall(KSPGetPC(ksp_smoother, &pc_smoother)); + PetscCall(PCSetType(pc_smoother, PCJACOBI)); + PetscCall(PCJacobiSetType(pc_smoother, PC_JACOBI_DIAGONAL)); // -------- 
Work vector if (level != fine_level) { - ierr = PCMGSetX(pc, level, U_g[level]); CHKERRQ(ierr); + PetscCall(PCMGSetX(pc, level, U_g[level])); } // -------- Level prolongation/restriction operator if (level > 0) { - ierr = PCMGSetInterpolation(pc, level, prolong_restr_mat[level]); - CHKERRQ(ierr); - ierr = PCMGSetRestriction(pc, level, prolong_restr_mat[level]); - CHKERRQ(ierr); + PetscCall(PCMGSetInterpolation(pc, level, prolong_restr_mat[level])); + PetscCall(PCMGSetRestriction(pc, level, prolong_restr_mat[level])); } } // ------ PCMG coarse solve KSP ksp_coarse; - PC pc_coarse; + PC pc_coarse; // -------- Coarse KSP - ierr = PCMGGetCoarseSolve(pc, &ksp_coarse); CHKERRQ(ierr); - ierr = KSPSetType(ksp_coarse, KSPPREONLY); CHKERRQ(ierr); - ierr = KSPSetOperators(ksp_coarse, jacob_mat_coarse, jacob_mat_coarse); - CHKERRQ(ierr); - ierr = KSPSetOptionsPrefix(ksp_coarse, "coarse_"); CHKERRQ(ierr); + PetscCall(PCMGGetCoarseSolve(pc, &ksp_coarse)); + PetscCall(KSPSetType(ksp_coarse, KSPPREONLY)); + PetscCall(KSPSetOperators(ksp_coarse, jacob_mat_coarse, jacob_mat_coarse)); + PetscCall(KSPSetOptionsPrefix(ksp_coarse, "coarse_")); // -------- Coarse preconditioner - ierr = KSPGetPC(ksp_coarse, &pc_coarse); CHKERRQ(ierr); - ierr = PCSetType(pc_coarse, PCGAMG); CHKERRQ(ierr); - ierr = PCSetOptionsPrefix(pc_coarse, "coarse_"); CHKERRQ(ierr); + PetscCall(KSPGetPC(ksp_coarse, &pc_coarse)); + PetscCall(PCSetType(pc_coarse, PCGAMG)); + PetscCall(PCSetOptionsPrefix(pc_coarse, "coarse_")); - ierr = KSPSetFromOptions(ksp_coarse); CHKERRQ(ierr); - ierr = PCSetFromOptions(pc_coarse); CHKERRQ(ierr); + PetscCall(KSPSetFromOptions(ksp_coarse)); + PetscCall(PCSetFromOptions(pc_coarse)); // ------ PCMG options - ierr = PCMGSetType(pc, PC_MG_MULTIPLICATIVE); CHKERRQ(ierr); - ierr = PCMGSetNumberSmooth(pc, 3); CHKERRQ(ierr); - ierr = PCMGSetCycleType(pc, pcmg_cycle_type); CHKERRQ(ierr); + PetscCall(PCMGSetType(pc, PC_MG_MULTIPLICATIVE)); + PetscCall(PCMGSetNumberSmooth(pc, 3)); + 
PetscCall(PCMGSetCycleType(pc, pcmg_cycle_type)); } - ierr = KSPSetFromOptions(ksp); - ierr = PCSetFromOptions(pc); + PetscCall(KSPSetFromOptions(ksp)); + PetscCall(PCSetFromOptions(pc)); } { // Default to critical-point (CP) line search (related to Wolfe's curvature condition) SNESLineSearch line_search; - ierr = SNESGetLineSearch(snes, &line_search); CHKERRQ(ierr); - ierr = SNESLineSearchSetType(line_search, SNESLINESEARCHCP); CHKERRQ(ierr); + PetscCall(SNESGetLineSearch(snes, &line_search)); + PetscCall(SNESLineSearchSetType(line_search, SNESLINESEARCHCP)); } - ierr = SNESSetFromOptions(snes); CHKERRQ(ierr); + PetscCall(SNESSetFromOptions(snes)); // Performance logging - ierr = PetscLogStagePop(); + PetscCall(PetscLogStagePop()); // --------------------------------------------------------------------------- // Set initial guess // --------------------------------------------------------------------------- - ierr = PetscObjectSetName((PetscObject)U, ""); CHKERRQ(ierr); - ierr = VecSet(U, 0.0); CHKERRQ(ierr); + PetscCall(PetscObjectSetName((PetscObject)U, "")); + PetscCall(VecSet(U, 0.0)); // View solution if (app_ctx->view_soln) { - ierr = ViewSolution(comm, app_ctx, U, 0, 0.0); CHKERRQ(ierr); + PetscCall(ViewSolution(comm, app_ctx, U, 0, 0.0)); } // --------------------------------------------------------------------------- // Solve SNES // --------------------------------------------------------------------------- PetscBool snes_monitor = PETSC_FALSE; - ierr = PetscOptionsHasName(NULL, NULL, "-snes_monitor", &snes_monitor); - CHKERRQ(ierr); + PetscCall(PetscOptionsHasName(NULL, NULL, "-snes_monitor", &snes_monitor)); // Performance logging - ierr = PetscLogStageRegister("SNES Solve Stage", &stage_snes_solve); - CHKERRQ(ierr); - ierr = PetscLogStagePush(stage_snes_solve); CHKERRQ(ierr); + PetscCall(PetscLogStageRegister("SNES Solve Stage", &stage_snes_solve)); + PetscCall(PetscLogStagePush(stage_snes_solve)); // Timing - ierr = PetscBarrier((PetscObject)snes); 
CHKERRQ(ierr); + PetscCall(PetscBarrier((PetscObject)snes)); start_time = MPI_Wtime(); // Solve for each load increment @@ -690,53 +595,46 @@ int main(int argc, char **argv) { for (increment = 1; increment <= app_ctx->num_increments; increment++) { // -- Log increment count if (snes_monitor) { - ierr = PetscPrintf(comm, "%" PetscInt_FMT " Load Increment\n", increment - 1); - CHKERRQ(ierr); + PetscCall(PetscPrintf(comm, "%" PetscInt_FMT " Load Increment\n", increment - 1)); } // -- Scale the problem - PetscScalar load_increment = 1.0*increment / app_ctx->num_increments, - scalingFactor = load_increment / - (increment == 1 ? 1 : res_ctx->load_increment); - res_ctx->load_increment = load_increment; + PetscScalar load_increment = 1.0 * increment / app_ctx->num_increments, + scalingFactor = load_increment / (increment == 1 ? 1 : res_ctx->load_increment); + res_ctx->load_increment = load_increment; if (app_ctx->num_increments > 1 && app_ctx->forcing_choice != FORCE_NONE) { - ierr = VecScale(F, scalingFactor); CHKERRQ(ierr); + PetscCall(VecScale(F, scalingFactor)); } // -- Solve - ierr = SNESSolve(snes, F, U); CHKERRQ(ierr); + PetscCall(SNESSolve(snes, F, U)); // -- View solution if (app_ctx->view_soln) { - ierr = ViewSolution(comm, app_ctx, U, increment, load_increment); CHKERRQ(ierr); + PetscCall(ViewSolution(comm, app_ctx, U, increment, load_increment)); } // -- Update SNES iteration count PetscInt its; - ierr = SNESGetIterationNumber(snes, &its); CHKERRQ(ierr); + PetscCall(SNESGetIterationNumber(snes, &its)); snes_its += its; - ierr = SNESGetLinearSolveIterations(snes, &its); CHKERRQ(ierr); + PetscCall(SNESGetLinearSolveIterations(snes, &its)); ksp_its += its; // -- Check for divergence SNESConvergedReason reason; - ierr = SNESGetConvergedReason(snes, &reason); CHKERRQ(ierr); - if (reason < 0) - break; + PetscCall(SNESGetConvergedReason(snes, &reason)); + if (reason < 0) break; if (app_ctx->energy_viewer) { // -- Log strain energy for current load increment CeedScalar 
energy; - ierr = ComputeStrainEnergy(dm_energy, res_ctx, ceed_data[fine_level]->op_energy, - U, &energy); CHKERRQ(ierr); + PetscCall(ComputeStrainEnergy(dm_energy, res_ctx, ceed_data[fine_level]->op_energy, U, &energy)); if (!app_ctx->test_mode) { // -- Output - ierr = PetscPrintf(comm, - " Strain Energy : %.12e\n", - energy); CHKERRQ(ierr); + PetscCall(PetscPrintf(comm, " Strain Energy : %.12e\n", energy)); } - ierr = PetscViewerASCIIPrintf(app_ctx->energy_viewer, "%f,%e\n", load_increment, - energy); CHKERRQ(ierr); + PetscCall(PetscViewerASCIIPrintf(app_ctx->energy_viewer, "%f,%e\n", load_increment, energy)); } } @@ -744,75 +642,70 @@ int main(int argc, char **argv) { elapsed_time = MPI_Wtime() - start_time; // Performance logging - ierr = PetscLogStagePop(); + PetscCall(PetscLogStagePop()); // --------------------------------------------------------------------------- // Output summary // --------------------------------------------------------------------------- if (!app_ctx->test_mode) { // -- SNES - SNESType snes_type; + SNESType snes_type; SNESConvergedReason reason; - PetscReal rnorm; - ierr = SNESGetType(snes, &snes_type); CHKERRQ(ierr); - ierr = SNESGetConvergedReason(snes, &reason); CHKERRQ(ierr); - ierr = SNESGetFunctionNorm(snes, &rnorm); CHKERRQ(ierr); - ierr = PetscPrintf(comm, - " SNES:\n" - " SNES Type : %s\n" - " SNES Convergence : %s\n" - " Number of Load Increments : %" PetscInt_FMT "\n" - " Completed Load Increments : %" PetscInt_FMT "\n" - " Total SNES Iterations : %" PetscInt_FMT "\n" - " Final rnorm : %e\n", - snes_type, SNESConvergedReasons[reason], - app_ctx->num_increments, increment - 1, - snes_its, (double)rnorm); CHKERRQ(ierr); + PetscReal rnorm; + PetscCall(SNESGetType(snes, &snes_type)); + PetscCall(SNESGetConvergedReason(snes, &reason)); + PetscCall(SNESGetFunctionNorm(snes, &rnorm)); + PetscCall(PetscPrintf(comm, + " SNES:\n" + " SNES Type : %s\n" + " SNES Convergence : %s\n" + " Number of Load Increments : %" PetscInt_FMT "\n" + " 
Completed Load Increments : %" PetscInt_FMT "\n" + " Total SNES Iterations : %" PetscInt_FMT "\n" + " Final rnorm : %e\n", + snes_type, SNESConvergedReasons[reason], app_ctx->num_increments, increment - 1, snes_its, (double)rnorm)); // -- KSP - KSP ksp; + KSP ksp; KSPType ksp_type; - ierr = SNESGetKSP(snes, &ksp); CHKERRQ(ierr); - ierr = KSPGetType(ksp, &ksp_type); CHKERRQ(ierr); - ierr = PetscPrintf(comm, - " Linear Solver:\n" - " KSP Type : %s\n" - " Total KSP Iterations : %" PetscInt_FMT "\n", - ksp_type, ksp_its); CHKERRQ(ierr); + PetscCall(SNESGetKSP(snes, &ksp)); + PetscCall(KSPGetType(ksp, &ksp_type)); + PetscCall(PetscPrintf(comm, + " Linear Solver:\n" + " KSP Type : %s\n" + " Total KSP Iterations : %" PetscInt_FMT "\n", + ksp_type, ksp_its)); // -- PC - PC pc; + PC pc; PCType pc_type; - ierr = KSPGetPC(ksp, &pc); CHKERRQ(ierr); - ierr = PCGetType(pc, &pc_type); CHKERRQ(ierr); - ierr = PetscPrintf(comm, - " PC Type : %s\n", - pc_type); CHKERRQ(ierr); + PetscCall(KSPGetPC(ksp, &pc)); + PetscCall(PCGetType(pc, &pc_type)); + PetscCall(PetscPrintf(comm, " PC Type : %s\n", pc_type)); if (!strcmp(pc_type, PCMG)) { PCMGType pcmg_type; - ierr = PCMGGetType(pc, &pcmg_type); CHKERRQ(ierr); - ierr = PetscPrintf(comm, - " P-Multigrid:\n" - " PCMG Type : %s\n" - " PCMG Cycle Type : %s\n", - PCMGTypes[pcmg_type], - PCMGCycleTypes[pcmg_cycle_type]); CHKERRQ(ierr); + PetscCall(PCMGGetType(pc, &pcmg_type)); + PetscCall(PetscPrintf(comm, + " P-Multigrid:\n" + " PCMG Type : %s\n" + " PCMG Cycle Type : %s\n", + PCMGTypes[pcmg_type], PCMGCycleTypes[pcmg_cycle_type])); // -- Coarse Solve - KSP ksp_coarse; - PC pc_coarse; + KSP ksp_coarse; + PC pc_coarse; PCType pc_type; - ierr = PCMGGetCoarseSolve(pc, &ksp_coarse); CHKERRQ(ierr); - ierr = KSPGetType(ksp_coarse, &ksp_type); CHKERRQ(ierr); - ierr = KSPGetPC(ksp_coarse, &pc_coarse); CHKERRQ(ierr); - ierr = PCGetType(pc_coarse, &pc_type); CHKERRQ(ierr); - ierr = PetscPrintf(comm, - " Coarse Solve:\n" - " KSP Type : %s\n" - " PC Type 
: %s\n", - ksp_type, pc_type); CHKERRQ(ierr); + PetscCall(PCMGGetCoarseSolve(pc, &ksp_coarse)); + PetscCall(KSPGetType(ksp_coarse, &ksp_type)); + PetscCall(KSPGetPC(ksp_coarse, &pc_coarse)); + PetscCall(PCGetType(pc_coarse, &pc_type)); + PetscCall(PetscPrintf(comm, + " Coarse Solve:\n" + " KSP Type : %s\n" + " PC Type : %s\n", + ksp_type, pc_type)); } } @@ -820,73 +713,62 @@ int main(int argc, char **argv) { // Compute solve time // --------------------------------------------------------------------------- if (!app_ctx->test_mode) { - ierr = MPI_Allreduce(&elapsed_time, &min_time, 1, MPI_DOUBLE, MPI_MIN, comm); - CHKERRQ(ierr); - ierr = MPI_Allreduce(&elapsed_time, &max_time, 1, MPI_DOUBLE, MPI_MAX, comm); - CHKERRQ(ierr); - ierr = PetscPrintf(comm, - " Performance:\n" - " SNES Solve Time : %g (%g) sec\n" - " DoFs/Sec in SNES : %g (%g) million\n", - max_time, min_time, 1e-6*U_g_size[fine_level]*ksp_its/max_time, - 1e-6*U_g_size[fine_level]*ksp_its/min_time); CHKERRQ(ierr); + PetscCall(MPI_Allreduce(&elapsed_time, &min_time, 1, MPI_DOUBLE, MPI_MIN, comm)); + PetscCall(MPI_Allreduce(&elapsed_time, &max_time, 1, MPI_DOUBLE, MPI_MAX, comm)); + PetscCall(PetscPrintf(comm, + " Performance:\n" + " SNES Solve Time : %g (%g) sec\n" + " DoFs/Sec in SNES : %g (%g) million\n", + max_time, min_time, 1e-6 * U_g_size[fine_level] * ksp_its / max_time, 1e-6 * U_g_size[fine_level] * ksp_its / min_time)); } // --------------------------------------------------------------------------- // Compute error // --------------------------------------------------------------------------- if (app_ctx->forcing_choice == FORCE_MMS) { - CeedScalar l2_error = 1., l2_U_norm = 1.; + CeedScalar l2_error = 1., l2_U_norm = 1.; const CeedScalar *true_array; - Vec error_vec, true_vec; + Vec error_vec, true_vec; // -- Work vectors - ierr = VecDuplicate(U, &error_vec); CHKERRQ(ierr); - ierr = VecSet(error_vec, 0.0); CHKERRQ(ierr); - ierr = VecDuplicate(U, &true_vec); CHKERRQ(ierr); - ierr = 
VecSet(true_vec, 0.0); CHKERRQ(ierr); + PetscCall(VecDuplicate(U, &error_vec)); + PetscCall(VecSet(error_vec, 0.0)); + PetscCall(VecDuplicate(U, &true_vec)); + PetscCall(VecSet(true_vec, 0.0)); // -- Assemble global true solution vector - CeedVectorGetArrayRead(ceed_data[fine_level]->true_soln, - CEED_MEM_HOST, &true_array); - ierr = VecPlaceArray(res_ctx->Y_loc, (PetscScalar *)true_array); - CHKERRQ(ierr); - ierr = DMLocalToGlobal(res_ctx->dm, res_ctx->Y_loc, INSERT_VALUES, true_vec); - CHKERRQ(ierr); - ierr = VecResetArray(res_ctx->Y_loc); CHKERRQ(ierr); + CeedVectorGetArrayRead(ceed_data[fine_level]->true_soln, CEED_MEM_HOST, &true_array); + PetscCall(VecPlaceArray(res_ctx->Y_loc, (PetscScalar *)true_array)); + PetscCall(DMLocalToGlobal(res_ctx->dm, res_ctx->Y_loc, INSERT_VALUES, true_vec)); + PetscCall(VecResetArray(res_ctx->Y_loc)); CeedVectorRestoreArrayRead(ceed_data[fine_level]->true_soln, &true_array); // -- Compute L2 error - ierr = VecWAXPY(error_vec, -1.0, U, true_vec); CHKERRQ(ierr); - ierr = VecNorm(error_vec, NORM_2, &l2_error); CHKERRQ(ierr); - ierr = VecNorm(U, NORM_2, &l2_U_norm); CHKERRQ(ierr); + PetscCall(VecWAXPY(error_vec, -1.0, U, true_vec)); + PetscCall(VecNorm(error_vec, NORM_2, &l2_error)); + PetscCall(VecNorm(U, NORM_2, &l2_U_norm)); l2_error /= l2_U_norm; // -- Output if (!app_ctx->test_mode || l2_error > 0.05) { - ierr = PetscPrintf(comm, - " L2 Error : %e\n", - l2_error); CHKERRQ(ierr); + PetscCall(PetscPrintf(comm, " L2 Error : %e\n", l2_error)); } // -- Cleanup - ierr = VecDestroy(&error_vec); CHKERRQ(ierr); - ierr = VecDestroy(&true_vec); CHKERRQ(ierr); + PetscCall(VecDestroy(&error_vec)); + PetscCall(VecDestroy(&true_vec)); } // --------------------------------------------------------------------------- // Compute energy // --------------------------------------------------------------------------- PetscReal energy; - ierr = ComputeStrainEnergy(dm_energy, res_ctx, ceed_data[fine_level]->op_energy, - U, &energy); CHKERRQ(ierr); + 
PetscCall(ComputeStrainEnergy(dm_energy, res_ctx, ceed_data[fine_level]->op_energy, U, &energy)); if (!app_ctx->test_mode) { // -- Output - ierr = PetscPrintf(comm, - " Strain Energy : %.12e\n", - energy); CHKERRQ(ierr); + PetscCall(PetscPrintf(comm, " Strain Energy : %.12e\n", energy)); } - ierr = RegressionTests_solids(app_ctx, energy); CHKERRQ(ierr); + PetscCall(RegressionTests_solids(app_ctx, energy)); // --------------------------------------------------------------------------- // Output diagnostic quantities @@ -894,19 +776,16 @@ int main(int argc, char **argv) { if (app_ctx->view_soln || app_ctx->view_final_soln) { // -- Setup context UserMult diagnostic_ctx; - ierr = PetscMalloc1(1, &diagnostic_ctx); CHKERRQ(ierr); - ierr = PetscMemcpy(diagnostic_ctx, res_ctx, sizeof(*res_ctx)); CHKERRQ(ierr); + PetscCall(PetscMalloc1(1, &diagnostic_ctx)); + PetscCall(PetscMemcpy(diagnostic_ctx, res_ctx, sizeof(*res_ctx))); diagnostic_ctx->dm = dm_diagnostic; diagnostic_ctx->op = ceed_data[fine_level]->op_diagnostic; // -- Compute and output - ierr = ViewDiagnosticQuantities(comm, level_dms[fine_level], diagnostic_ctx, - app_ctx, U, - ceed_data[fine_level]->elem_restr_diagnostic); - CHKERRQ(ierr); + PetscCall(ViewDiagnosticQuantities(comm, level_dms[fine_level], diagnostic_ctx, app_ctx, U, ceed_data[fine_level]->elem_restr_diagnostic)); // -- Cleanup - ierr = PetscFree(diagnostic_ctx); CHKERRQ(ierr); + PetscCall(PetscFree(diagnostic_ctx)); } // --------------------------------------------------------------------------- @@ -915,41 +794,41 @@ int main(int argc, char **argv) { // Data in arrays per level for (PetscInt level = 0; level < num_levels; level++) { // Vectors - ierr = VecDestroy(&U_g[level]); CHKERRQ(ierr); - ierr = VecDestroy(&U_loc[level]); CHKERRQ(ierr); + PetscCall(VecDestroy(&U_g[level])); + PetscCall(VecDestroy(&U_loc[level])); // Jacobian matrix and data - ierr = VecDestroy(&jacob_ctx[level]->Y_loc); CHKERRQ(ierr); - ierr = MatDestroy(&jacob_mat[level]); 
CHKERRQ(ierr); - ierr = PetscFree(jacob_ctx[level]); CHKERRQ(ierr); + PetscCall(VecDestroy(&jacob_ctx[level]->Y_loc)); + PetscCall(MatDestroy(&jacob_mat[level])); + PetscCall(PetscFree(jacob_ctx[level])); // Prolongation/Restriction matrix and data if (level > 0) { - ierr = PetscFree(prolong_restr_ctx[level]); CHKERRQ(ierr); - ierr = MatDestroy(&prolong_restr_mat[level]); CHKERRQ(ierr); + PetscCall(PetscFree(prolong_restr_ctx[level])); + PetscCall(MatDestroy(&prolong_restr_mat[level])); } // DM - ierr = DMDestroy(&level_dms[level]); CHKERRQ(ierr); + PetscCall(DMDestroy(&level_dms[level])); // libCEED objects - ierr = CeedDataDestroy(level, ceed_data[level]); CHKERRQ(ierr); + PetscCall(CeedDataDestroy(level, ceed_data[level])); } - ierr = PetscViewerDestroy(&app_ctx->energy_viewer); CHKERRQ(ierr); + PetscCall(PetscViewerDestroy(&app_ctx->energy_viewer)); // Arrays - ierr = PetscFree(U_g); CHKERRQ(ierr); - ierr = PetscFree(U_loc); CHKERRQ(ierr); - ierr = PetscFree(U_g_size); CHKERRQ(ierr); - ierr = PetscFree(U_l_size); CHKERRQ(ierr); - ierr = PetscFree(U_loc_size); CHKERRQ(ierr); - ierr = PetscFree(jacob_ctx); CHKERRQ(ierr); - ierr = PetscFree(jacob_mat); CHKERRQ(ierr); - ierr = PetscFree(prolong_restr_ctx); CHKERRQ(ierr); - ierr = PetscFree(prolong_restr_mat); CHKERRQ(ierr); - ierr = PetscFree(app_ctx->level_degrees); CHKERRQ(ierr); - ierr = PetscFree(ceed_data); CHKERRQ(ierr); + PetscCall(PetscFree(U_g)); + PetscCall(PetscFree(U_loc)); + PetscCall(PetscFree(U_g_size)); + PetscCall(PetscFree(U_l_size)); + PetscCall(PetscFree(U_loc_size)); + PetscCall(PetscFree(jacob_ctx)); + PetscCall(PetscFree(jacob_mat)); + PetscCall(PetscFree(prolong_restr_ctx)); + PetscCall(PetscFree(prolong_restr_mat)); + PetscCall(PetscFree(app_ctx->level_degrees)); + PetscCall(PetscFree(ceed_data)); // libCEED objects CeedVectorDestroy(&form_jacob_ctx->coo_values); @@ -958,37 +837,33 @@ int main(int argc, char **argv) { CeedDestroy(&ceed); // PETSc objects - ierr = VecDestroy(&U); 
CHKERRQ(ierr); - ierr = VecDestroy(&R); CHKERRQ(ierr); - ierr = VecDestroy(&R_loc); CHKERRQ(ierr); - ierr = VecDestroy(&F); CHKERRQ(ierr); - ierr = VecDestroy(&F_loc); CHKERRQ(ierr); - ierr = VecDestroy(&neumann_bcs); CHKERRQ(ierr); - ierr = VecDestroy(&bcs_loc); CHKERRQ(ierr); - ierr = MatDestroy(&jacob_mat_coarse); CHKERRQ(ierr); - ierr = SNESDestroy(&snes); CHKERRQ(ierr); - ierr = DMDestroy(&dm_orig); CHKERRQ(ierr); - ierr = DMDestroy(&dm_energy); CHKERRQ(ierr); - ierr = DMDestroy(&dm_diagnostic); CHKERRQ(ierr); - ierr = PetscFree(level_dms); CHKERRQ(ierr); + PetscCall(VecDestroy(&U)); + PetscCall(VecDestroy(&R)); + PetscCall(VecDestroy(&R_loc)); + PetscCall(VecDestroy(&F)); + PetscCall(VecDestroy(&F_loc)); + PetscCall(VecDestroy(&neumann_bcs)); + PetscCall(VecDestroy(&bcs_loc)); + PetscCall(MatDestroy(&jacob_mat_coarse)); + PetscCall(SNESDestroy(&snes)); + PetscCall(DMDestroy(&dm_orig)); + PetscCall(DMDestroy(&dm_energy)); + PetscCall(DMDestroy(&dm_diagnostic)); + PetscCall(PetscFree(level_dms)); // -- Function list - ierr = PetscFunctionListDestroy(&problem_functions->setupPhysics); - CHKERRQ(ierr); - ierr = PetscFunctionListDestroy(&problem_functions->setupSmootherPhysics); - CHKERRQ(ierr); - ierr = PetscFunctionListDestroy(&problem_functions->setupLibceedFineLevel); - CHKERRQ(ierr); - ierr = PetscFunctionListDestroy(&problem_functions->setupLibceedLevel); - CHKERRQ(ierr); + PetscCall(PetscFunctionListDestroy(&problem_functions->setupPhysics)); + PetscCall(PetscFunctionListDestroy(&problem_functions->setupSmootherPhysics)); + PetscCall(PetscFunctionListDestroy(&problem_functions->setupLibceedFineLevel)); + PetscCall(PetscFunctionListDestroy(&problem_functions->setupLibceedLevel)); // Structs - ierr = PetscFree(res_ctx); CHKERRQ(ierr); - ierr = PetscFree(form_jacob_ctx); CHKERRQ(ierr); - ierr = PetscFree(jacob_coarse_ctx); CHKERRQ(ierr); - ierr = PetscFree(app_ctx); CHKERRQ(ierr); - ierr = PetscFree(problem_functions); CHKERRQ(ierr); - ierr = PetscFree(units); 
CHKERRQ(ierr); + PetscCall(PetscFree(res_ctx)); + PetscCall(PetscFree(form_jacob_ctx)); + PetscCall(PetscFree(jacob_coarse_ctx)); + PetscCall(PetscFree(app_ctx)); + PetscCall(PetscFree(problem_functions)); + PetscCall(PetscFree(units)); return PetscFinalize(); } diff --git a/examples/solids/elasticity.h b/examples/solids/elasticity.h index 3d76e6d069..e9ae84e225 100644 --- a/examples/solids/elasticity.h +++ b/examples/solids/elasticity.h @@ -15,17 +15,18 @@ #include #include #include -#include "problems/problems.h" + #include "include/cl-options.h" #include "include/matops.h" #include "include/misc.h" -#include "include/structs.h" #include "include/setup-dm.h" #include "include/setup-libceed.h" +#include "include/structs.h" #include "include/utils.h" +#include "problems/problems.h" -#if PETSC_VERSION_LT(3,17,0) +#if PETSC_VERSION_LT(3, 17, 0) #error "PETSc v3.17 or later is required" #endif -#endif // libceed_solids_examples_setup_h +#endif // libceed_solids_examples_setup_h diff --git a/examples/solids/include/boundary.h b/examples/solids/include/boundary.h index 4fc732459f..e30327d1f9 100644 --- a/examples/solids/include/boundary.h +++ b/examples/solids/include/boundary.h @@ -10,9 +10,8 @@ #include -typedef PetscErrorCode BCFunc(PetscInt, PetscReal, const PetscReal *, PetscInt, - PetscScalar *, void *); -BCFunc BCMMS, BCZero, BCClamp; +typedef PetscErrorCode BCFunc(PetscInt, PetscReal, const PetscReal *, PetscInt, PetscScalar *, void *); +BCFunc BCMMS, BCZero, BCClamp; // ----------------------------------------------------------------------------- // Boundary Functions @@ -23,14 +22,10 @@ BCFunc BCMMS, BCZero, BCClamp; // BCMMS - boundary function // Values on all points of the mesh is set based on given solution below // for u[0], u[1], u[2] -PetscErrorCode BCMMS(PetscInt dim, PetscReal load_increment, - const PetscReal coords[], PetscInt num_comp_u, - PetscScalar *u, void *ctx); +PetscErrorCode BCMMS(PetscInt dim, PetscReal load_increment, const PetscReal 
coords[], PetscInt num_comp_u, PetscScalar *u, void *ctx); // BCClamp - fix boundary values with affine transformation at fraction of load // increment -PetscErrorCode BCClamp(PetscInt dim, PetscReal load_increment, - const PetscReal coords[], PetscInt num_comp_u, - PetscScalar *u, void *ctx); +PetscErrorCode BCClamp(PetscInt dim, PetscReal load_increment, const PetscReal coords[], PetscInt num_comp_u, PetscScalar *u, void *ctx); -#endif // libceed_solids_examples_boundary_h +#endif // libceed_solids_examples_boundary_h diff --git a/examples/solids/include/cl-options.h b/examples/solids/include/cl-options.h index eda1664092..90a5fabe83 100644 --- a/examples/solids/include/cl-options.h +++ b/examples/solids/include/cl-options.h @@ -9,9 +9,10 @@ #define libceed_solids_examples_cl_options_h #include + #include "../include/structs.h" // Process general command line options PetscErrorCode ProcessCommandLineOptions(MPI_Comm comm, AppCtx app_ctx); -#endif // libceed_solids_examples_cl_options_h +#endif // libceed_solids_examples_cl_options_h diff --git a/examples/solids/include/matops.h b/examples/solids/include/matops.h index 65d01bd3cd..11dc5b4fe8 100644 --- a/examples/solids/include/matops.h +++ b/examples/solids/include/matops.h @@ -10,6 +10,7 @@ #include #include + #include "../include/structs.h" // This function uses libCEED to compute the local action of an operator @@ -34,11 +35,9 @@ PetscErrorCode Restrict_Ceed(Mat A, Vec X, Vec Y); PetscErrorCode GetDiag_Ceed(Mat A, Vec D); // This function calculates the strain energy in the final solution -PetscErrorCode ComputeStrainEnergy(DM dm_energy, UserMult user, - CeedOperator op_energy, Vec X, - PetscReal *energy); +PetscErrorCode ComputeStrainEnergy(DM dm_energy, UserMult user, CeedOperator op_energy, Vec X, PetscReal *energy); // this function checks to see if the computed energy is close enough to reference file energy. 
PetscErrorCode RegressionTests_solids(AppCtx app_ctx, PetscReal energy); -#endif // libceed_solids_examples_matopts_h +#endif // libceed_solids_examples_matopts_h diff --git a/examples/solids/include/misc.h b/examples/solids/include/misc.h index 568866f244..712d4487d8 100644 --- a/examples/solids/include/misc.h +++ b/examples/solids/include/misc.h @@ -10,24 +10,19 @@ #include #include + #include "../include/structs.h" // ----------------------------------------------------------------------------- // Context setup // ----------------------------------------------------------------------------- // Setup context data for Jacobian evaluation -PetscErrorCode SetupJacobianCtx(MPI_Comm comm, AppCtx app_ctx, DM dm, Vec V, - Vec V_loc, CeedData ceed_data, Ceed ceed, - CeedQFunctionContext ctx_phys, - CeedQFunctionContext ctx_phys_smoother, - UserMult jacobian_ctx); +PetscErrorCode SetupJacobianCtx(MPI_Comm comm, AppCtx app_ctx, DM dm, Vec V, Vec V_loc, CeedData ceed_data, Ceed ceed, CeedQFunctionContext ctx_phys, + CeedQFunctionContext ctx_phys_smoother, UserMult jacobian_ctx); // Setup context data for prolongation and restriction operators -PetscErrorCode SetupProlongRestrictCtx(MPI_Comm comm, AppCtx app_ctx, DM dm_c, - DM dm_f, Vec V_f, Vec V_loc_c, Vec V_loc_f, - CeedData ceed_data_c, CeedData ceed_data_f, - Ceed ceed, - UserMultProlongRestr prolong_restr_ctx); +PetscErrorCode SetupProlongRestrictCtx(MPI_Comm comm, AppCtx app_ctx, DM dm_c, DM dm_f, Vec V_f, Vec V_loc_c, Vec V_loc_f, CeedData ceed_data_c, + CeedData ceed_data_f, Ceed ceed, UserMultProlongRestr prolong_restr_ctx); // ----------------------------------------------------------------------------- // Jacobian setup @@ -37,16 +32,13 @@ PetscErrorCode FormJacobian(SNES snes, Vec U, Mat J, Mat J_pre, void *ctx); // ----------------------------------------------------------------------------- // Solution output // ----------------------------------------------------------------------------- -PetscErrorCode 
ViewSolution(MPI_Comm comm, AppCtx app_ctx, Vec U, - PetscInt increment, PetscScalar load_increment); +PetscErrorCode ViewSolution(MPI_Comm comm, AppCtx app_ctx, Vec U, PetscInt increment, PetscScalar load_increment); -PetscErrorCode ViewDiagnosticQuantities(MPI_Comm comm, DM dm_U, - UserMult user, AppCtx app_ctx, Vec U, - CeedElemRestriction elem_restr_diagnostic); +PetscErrorCode ViewDiagnosticQuantities(MPI_Comm comm, DM dm_U, UserMult user, AppCtx app_ctx, Vec U, CeedElemRestriction elem_restr_diagnostic); // ----------------------------------------------------------------------------- // Regression testing // ----------------------------------------------------------------------------- PetscErrorCode RegressionTests_solids(AppCtx app_ctx, PetscReal energy); -#endif // libceed_solids_examples_misc_h +#endif // libceed_solids_examples_misc_h diff --git a/examples/solids/include/setup-dm.h b/examples/solids/include/setup-dm.h index 781db3f43f..880b8f6d4a 100644 --- a/examples/solids/include/setup-dm.h +++ b/examples/solids/include/setup-dm.h @@ -12,6 +12,7 @@ #include #include #include + #include "../include/structs.h" // ----------------------------------------------------------------------------- @@ -23,7 +24,6 @@ PetscErrorCode CreateBCLabel(DM dm, const char name[]); PetscErrorCode CreateDistributedDM(MPI_Comm comm, AppCtx app_ctx, DM *dm); // Setup DM with FE space of appropriate degree -PetscErrorCode SetupDMByDegree(DM dm, AppCtx app_ctx, PetscInt order, - PetscBool boundary, PetscInt num_comp_u); +PetscErrorCode SetupDMByDegree(DM dm, AppCtx app_ctx, PetscInt order, PetscBool boundary, PetscInt num_comp_u); -#endif // libceed_solids_examples_setup_dm_h +#endif // libceed_solids_examples_setup_dm_h diff --git a/examples/solids/include/setup-libceed.h b/examples/solids/include/setup-libceed.h index d537909a8d..f686f92d4b 100644 --- a/examples/solids/include/setup-libceed.h +++ b/examples/solids/include/setup-libceed.h @@ -10,6 +10,7 @@ #include #include + 
#include "../include/structs.h" // ----------------------------------------------------------------------------- @@ -22,32 +23,19 @@ PetscErrorCode CeedDataDestroy(CeedInt level, CeedData data); PetscInt Involute(PetscInt i); // Utility function to create local CEED restriction from DMPlex -PetscErrorCode CreateRestrictionFromPlex(Ceed ceed, DM dm, CeedInt height, - DMLabel domain_label, CeedInt value, CeedElemRestriction *elem_restr); +PetscErrorCode CreateRestrictionFromPlex(Ceed ceed, DM dm, CeedInt height, DMLabel domain_label, CeedInt value, CeedElemRestriction *elem_restr); // Utility function to get Ceed Restriction for each domain -PetscErrorCode GetRestrictionForDomain(Ceed ceed, DM dm, CeedInt height, - DMLabel domain_label, PetscInt value, - CeedInt Q, CeedInt q_data_size, - CeedElemRestriction *elem_restr_q, - CeedElemRestriction *elem_restr_x, - CeedElemRestriction *elem_restr_qd_i); +PetscErrorCode GetRestrictionForDomain(Ceed ceed, DM dm, CeedInt height, DMLabel domain_label, PetscInt value, CeedInt Q, CeedInt q_data_size, + CeedElemRestriction *elem_restr_q, CeedElemRestriction *elem_restr_x, CeedElemRestriction *elem_restr_qd_i); // Set up libCEED for a given degree -PetscErrorCode SetupLibceedFineLevel(DM dm, DM dm_energy, DM dm_diagnostic, - Ceed ceed, AppCtx app_ctx, - CeedQFunctionContext phys_ctx, - ProblemData problem_data, - PetscInt fine_level, PetscInt num_comp_u, - PetscInt U_g_size, PetscInt U_loc_size, - CeedVector force_ceed, - CeedVector neumann_ceed, CeedData *data); +PetscErrorCode SetupLibceedFineLevel(DM dm, DM dm_energy, DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx, + ProblemData problem_data, PetscInt fine_level, PetscInt num_comp_u, PetscInt U_g_size, PetscInt U_loc_size, + CeedVector force_ceed, CeedVector neumann_ceed, CeedData *data); // Set up libCEED multigrid level for a given degree -PetscErrorCode SetupLibceedLevel(DM dm, Ceed ceed, AppCtx app_ctx, - ProblemData problem_data, PetscInt 
level, - PetscInt num_comp_u, PetscInt U_g_size, - PetscInt U_loc_size, CeedVector fine_mult, - CeedData *data); +PetscErrorCode SetupLibceedLevel(DM dm, Ceed ceed, AppCtx app_ctx, ProblemData problem_data, PetscInt level, PetscInt num_comp_u, PetscInt U_g_size, + PetscInt U_loc_size, CeedVector fine_mult, CeedData *data); -#endif // libceed_solids_examples_setup_libceed_h +#endif // libceed_solids_examples_setup_libceed_h diff --git a/examples/solids/include/structs.h b/examples/solids/include/structs.h index 0ecb6ee777..ff47c41f1a 100644 --- a/examples/solids/include/structs.h +++ b/examples/solids/include/structs.h @@ -10,38 +10,21 @@ #include #include + #include "../problems/cl-problems.h" // ----------------------------------------------------------------------------- // Command Line Options // ----------------------------------------------------------------------------- // Forcing function options -typedef enum { - FORCE_NONE = 0, FORCE_CONST = 1, FORCE_MMS = 2 -} forcingType; -static const char *const forcing_types[] = {"none", - "constant", - "mms", - "forcingType","FORCE_",0 - }; -static const char *const forcing_types_for_disp[] = {"None", - "Constant", - "Manufactured solution" - }; +typedef enum { FORCE_NONE = 0, FORCE_CONST = 1, FORCE_MMS = 2 } forcingType; +static const char *const forcing_types[] = {"none", "constant", "mms", "forcingType", "FORCE_", 0}; +static const char *const forcing_types_for_disp[] = {"None", "Constant", "Manufactured solution"}; // Multigrid options -typedef enum { - MULTIGRID_LOGARITHMIC = 0, MULTIGRID_UNIFORM = 1, MULTIGRID_NONE = 2 -} multigridType; -static const char *const multigrid_types [] = {"logarithmic", - "uniform", - "none", - "multigridType","MULTIGRID",0 - }; -static const char *const multigrid_types_for_disp[] = {"P-multigrid, logarithmic coarsening", - "P-multigrind, uniform coarsening", - "No multigrid" - }; +typedef enum { MULTIGRID_LOGARITHMIC = 0, MULTIGRID_UNIFORM = 1, MULTIGRID_NONE = 2 } multigridType; 
+static const char *const multigrid_types[] = {"logarithmic", "uniform", "none", "multigridType", "MULTIGRID", 0}; +static const char *const multigrid_types_for_disp[] = {"P-multigrid, logarithmic coarsening", "P-multigrind, uniform coarsening", "No multigrid"}; // ----------------------------------------------------------------------------- // Application data structs @@ -60,9 +43,9 @@ struct Units_ { // Application context from user command line options typedef struct AppCtx_ *AppCtx; struct AppCtx_ { - const char *name, *name_for_disp; // problem name - char ceed_resource[PETSC_MAX_PATH_LEN]; // libCEED backend - char mesh_file[PETSC_MAX_PATH_LEN]; // exodusII mesh file + const char *name, *name_for_disp; // problem name + char ceed_resource[PETSC_MAX_PATH_LEN]; // libCEED backend + char mesh_file[PETSC_MAX_PATH_LEN]; // exodusII mesh file char output_dir[PETSC_MAX_PATH_LEN]; PetscBool test_mode; PetscBool view_soln; @@ -74,25 +57,25 @@ struct AppCtx_ { PetscInt degree; PetscInt q_extra; PetscInt num_levels; - PetscInt *level_degrees; - PetscInt num_increments; // Number of steps + PetscInt *level_degrees; + PetscInt num_increments; // Number of steps PetscInt bc_clamp_count; PetscInt bc_clamp_faces[16]; // [translation; 3] [rotation axis; 3] [rotation magnitude c_0, c_1] // The rotations are (c_0 + c_1 s) \pi, where s = x · axis - PetscScalar bc_clamp_max[16][8]; - PetscInt bc_traction_count; - PetscInt bc_traction_faces[16]; - PetscScalar bc_traction_vector[16][3]; - PetscScalar forcing_vector[3]; - PetscReal test_tol; - PetscReal expect_final_strain; + PetscScalar bc_clamp_max[16][8]; + PetscInt bc_traction_count; + PetscInt bc_traction_faces[16]; + PetscScalar bc_traction_vector[16][3]; + PetscScalar forcing_vector[3]; + PetscReal test_tol; + PetscReal expect_final_strain; }; // Forcing function data typedef struct { CeedQFunctionUser setup_forcing; - const char *setup_forcing_loc; + const char *setup_forcing_loc; } forcingData; extern forcingData 
forcing_options[3]; @@ -100,23 +83,23 @@ extern forcingData forcing_options[3]; // Data for PETSc Matshell typedef struct UserMult_ *UserMult; struct UserMult_ { - MPI_Comm comm; - DM dm; - Vec X_loc, Y_loc, neumann_bcs; - CeedVector x_ceed, y_ceed; - CeedOperator op; - CeedQFunction qf; - Ceed ceed; - PetscScalar load_increment; + MPI_Comm comm; + DM dm; + Vec X_loc, Y_loc, neumann_bcs; + CeedVector x_ceed, y_ceed; + CeedOperator op; + CeedQFunction qf; + Ceed ceed; + PetscScalar load_increment; CeedQFunctionContext ctx_phys, ctx_phys_smoother; }; // Data for Jacobian setup routine typedef struct FormJacobCtx_ *FormJacobCtx; struct FormJacobCtx_ { - UserMult *jacob_ctx; + UserMult *jacob_ctx; PetscInt num_levels; - Mat *jacob_mat, jacob_mat_coarse; + Mat *jacob_mat, jacob_mat_coarse; CeedVector coo_values; CeedOperator op_coarse; }; @@ -138,29 +121,21 @@ struct UserMultProlongRestr_ { typedef struct CeedData_ *CeedData; struct CeedData_ { Ceed ceed; - CeedBasis basis_x, basis_u, basis_c_to_f, basis_energy, - basis_diagnostic; - CeedElemRestriction elem_restr_x, elem_restr_u, elem_restr_geo_data_i, - elem_restr_energy, elem_restr_diagnostic, - elem_restr_geo_data_diagnostic_i, - elem_restr_stored_fields_i[SOLIDS_MAX_NUMBER_FIELDS]; - CeedQFunction qf_residual, qf_jacobian, qf_energy, qf_diagnostic; - CeedOperator op_residual, op_jacobian, op_restrict, op_prolong, - op_energy, - op_diagnostic; - CeedVector geo_data, geo_data_diagnostic, x_ceed, y_ceed, - true_soln, stored_fields[SOLIDS_MAX_NUMBER_FIELDS]; + CeedBasis basis_x, basis_u, basis_c_to_f, basis_energy, basis_diagnostic; + CeedElemRestriction elem_restr_x, elem_restr_u, elem_restr_geo_data_i, elem_restr_energy, elem_restr_diagnostic, elem_restr_geo_data_diagnostic_i, + elem_restr_stored_fields_i[SOLIDS_MAX_NUMBER_FIELDS]; + CeedQFunction qf_residual, qf_jacobian, qf_energy, qf_diagnostic; + CeedOperator op_residual, op_jacobian, op_restrict, op_prolong, op_energy, op_diagnostic; + CeedVector geo_data, 
geo_data_diagnostic, x_ceed, y_ceed, true_soln, stored_fields[SOLIDS_MAX_NUMBER_FIELDS]; }; typedef struct { - CeedQFunctionUser setup_geo, residual, jacobian, energy, - diagnostic, true_soln; - const char *setup_geo_loc, *residual_loc, *jacobian_loc, *energy_loc, - *diagnostic_loc, *true_soln_loc; - CeedQuadMode quadrature_mode; - CeedInt q_data_size, number_fields_stored; - CeedInt *field_sizes; + CeedQFunctionUser setup_geo, residual, jacobian, energy, diagnostic, true_soln; + const char *setup_geo_loc, *residual_loc, *jacobian_loc, *energy_loc, *diagnostic_loc, *true_soln_loc; + CeedQuadMode quadrature_mode; + CeedInt q_data_size, number_fields_stored; + CeedInt *field_sizes; const char *const *field_names; } ProblemData; -#endif // libceed_solids_examples_structs_h +#endif // libceed_solids_examples_structs_h diff --git a/examples/solids/include/utils.h b/examples/solids/include/utils.h index 0f08547a1e..6ea14c074d 100644 --- a/examples/solids/include/utils.h +++ b/examples/solids/include/utils.h @@ -12,8 +12,6 @@ #include // Translate PetscMemType to CeedMemType -static inline CeedMemType MemTypeP2C(PetscMemType mem_type) { - return PetscMemTypeDevice(mem_type) ? CEED_MEM_DEVICE : CEED_MEM_HOST; -} +static inline CeedMemType MemTypeP2C(PetscMemType mem_type) { return PetscMemTypeDevice(mem_type) ? 
CEED_MEM_DEVICE : CEED_MEM_HOST; } -#endif // libceed_solids_examples_utils_h +#endif // libceed_solids_examples_utils_h diff --git a/examples/solids/problems/cl-problems.h b/examples/solids/problems/cl-problems.h index 53b8d90d51..a6f1a85d50 100644 --- a/examples/solids/problems/cl-problems.h +++ b/examples/solids/problems/cl-problems.h @@ -10,25 +10,23 @@ // Problem options typedef enum { - ELAS_LINEAR = 0, ELAS_SS_NH = 1, ELAS_FSInitial_NH1 = 2, ELAS_FSInitial_NH2 = 3, - ELAS_FSCurrent_NH1 = 4, ELAS_FSCurrent_NH2 = 5, ELAS_FSInitial_MR1 = 6 + ELAS_LINEAR = 0, + ELAS_SS_NH = 1, + ELAS_FSInitial_NH1 = 2, + ELAS_FSInitial_NH2 = 3, + ELAS_FSCurrent_NH1 = 4, + ELAS_FSCurrent_NH2 = 5, + ELAS_FSInitial_MR1 = 6 } problemType; -static const char *const problemTypes[] = {"Linear", - "SS-NH", - "FSInitial-NH1", - "FSInitial-NH2", - "FSCurrent-NH1", - "FSCurrent-NH2", - "FSInitial-MR1", - "problemType","ELAS_",0 - }; -static const char *const problemTypesForDisp[] = {"Linear elasticity", - "Hyperelasticity small strain, Neo-Hookean", - "Hyperelasticity finite strain Initial configuration Neo-Hookean w/ dXref_dxinit, Grad(u) storage", - "Hyperelasticity finite strain Initial configuration Neo-Hookean w/ dXref_dxinit, Grad(u), C_inv, constant storage", - "Hyperelasticity finite strain Current configuration Neo-Hookean w/ dXref_dxinit, Grad(u) storage", - "Hyperelasticity finite strain Current configuration Neo-Hookean w/ dXref_dxcurr, tau, constant storage", - "Hyperelasticity finite strain Initial configuration Moony-Rivlin w/ dXref_dxinit, Grad(u) storage" - }; +static const char *const problemTypes[] = {"Linear", "SS-NH", "FSInitial-NH1", "FSInitial-NH2", "FSCurrent-NH1", + "FSCurrent-NH2", "FSInitial-MR1", "problemType", "ELAS_", 0}; +static const char *const problemTypesForDisp[] = { + "Linear elasticity", + "Hyperelasticity small strain, Neo-Hookean", + "Hyperelasticity finite strain Initial configuration Neo-Hookean w/ dXref_dxinit, Grad(u) storage", + "Hyperelasticity 
finite strain Initial configuration Neo-Hookean w/ dXref_dxinit, Grad(u), C_inv, constant storage", + "Hyperelasticity finite strain Current configuration Neo-Hookean w/ dXref_dxinit, Grad(u) storage", + "Hyperelasticity finite strain Current configuration Neo-Hookean w/ dXref_dxcurr, tau, constant storage", + "Hyperelasticity finite strain Initial configuration Moony-Rivlin w/ dXref_dxinit, Grad(u) storage"}; -#endif // cl_problems_h \ No newline at end of file +#endif // cl_problems_h \ No newline at end of file diff --git a/examples/solids/problems/finite-strain-mooney-rivlin-initial-1.c b/examples/solids/problems/finite-strain-mooney-rivlin-initial-1.c index f1b16c64be..836e75fb55 100644 --- a/examples/solids/problems/finite-strain-mooney-rivlin-initial-1.c +++ b/examples/solids/problems/finite-strain-mooney-rivlin-initial-1.c @@ -5,63 +5,53 @@ // // This file is part of CEED: http://github.com/ceed +#include "../qfunctions/finite-strain-mooney-rivlin-initial-1.h" + #include -#include "../include/structs.h" + #include "../include/setup-libceed.h" -#include "../problems/problems.h" +#include "../include/structs.h" #include "../problems/mooney-rivlin.h" +#include "../problems/problems.h" #include "../qfunctions/common.h" -#include "../qfunctions/finite-strain-mooney-rivlin-initial-1.h" static const char *const field_names[] = {"gradu"}; -static CeedInt field_sizes[] = {9}; +static CeedInt field_sizes[] = {9}; ProblemData finite_strain_Mooney_Rivlin_initial_1 = { - .setup_geo = SetupGeo, - .setup_geo_loc = SetupGeo_loc, - .q_data_size = 10, - .quadrature_mode = CEED_GAUSS, - .residual = ElasFSInitialMR1F, - .residual_loc = ElasFSInitialMR1F_loc, - .number_fields_stored = 1, - .field_names = field_names, - .field_sizes = field_sizes, - .jacobian = ElasFSInitialMR1dF, - .jacobian_loc = ElasFSInitialMR1dF_loc, - .energy = ElasFSInitialMR1Energy, - .energy_loc = ElasFSInitialMR1Energy_loc, - .diagnostic = ElasFSInitialMR1Diagnostic, - .diagnostic_loc = 
ElasFSInitialMR1Diagnostic_loc, + .setup_geo = SetupGeo, + .setup_geo_loc = SetupGeo_loc, + .q_data_size = 10, + .quadrature_mode = CEED_GAUSS, + .residual = ElasFSInitialMR1F, + .residual_loc = ElasFSInitialMR1F_loc, + .number_fields_stored = 1, + .field_names = field_names, + .field_sizes = field_sizes, + .jacobian = ElasFSInitialMR1dF, + .jacobian_loc = ElasFSInitialMR1dF_loc, + .energy = ElasFSInitialMR1Energy, + .energy_loc = ElasFSInitialMR1Energy_loc, + .diagnostic = ElasFSInitialMR1Diagnostic, + .diagnostic_loc = ElasFSInitialMR1Diagnostic_loc, }; -PetscErrorCode SetupLibceedFineLevel_ElasFSInitialMR1(DM dm, DM dm_energy, - DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx, - PetscInt fine_level, PetscInt num_comp_u, PetscInt U_g_size, - PetscInt U_loc_size, CeedVector force_ceed, CeedVector neumann_ceed, - CeedData *data) { - PetscErrorCode ierr; - +PetscErrorCode SetupLibceedFineLevel_ElasFSInitialMR1(DM dm, DM dm_energy, DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx, + PetscInt fine_level, PetscInt num_comp_u, PetscInt U_g_size, PetscInt U_loc_size, + CeedVector force_ceed, CeedVector neumann_ceed, CeedData *data) { PetscFunctionBegin; - ierr = SetupLibceedFineLevel(dm, dm_energy, dm_diagnostic, ceed, app_ctx, - phys_ctx, finite_strain_Mooney_Rivlin_initial_1, - fine_level, num_comp_u, U_g_size, U_loc_size, - force_ceed, neumann_ceed, data); CHKERRQ(ierr); + PetscCall(SetupLibceedFineLevel(dm, dm_energy, dm_diagnostic, ceed, app_ctx, phys_ctx, finite_strain_Mooney_Rivlin_initial_1, fine_level, + num_comp_u, U_g_size, U_loc_size, force_ceed, neumann_ceed, data)); PetscFunctionReturn(0); }; -PetscErrorCode SetupLibceedLevel_ElasFSInitialMR1(DM dm, Ceed ceed, - AppCtx app_ctx, PetscInt level, PetscInt num_comp_u, - PetscInt U_g_size, PetscInt U_loc_size, CeedVector fine_mult, CeedData *data) { - PetscErrorCode ierr; - +PetscErrorCode SetupLibceedLevel_ElasFSInitialMR1(DM dm, Ceed ceed, AppCtx app_ctx, 
PetscInt level, PetscInt num_comp_u, PetscInt U_g_size, + PetscInt U_loc_size, CeedVector fine_mult, CeedData *data) { PetscFunctionBegin; - ierr = SetupLibceedLevel(dm, ceed, app_ctx, - finite_strain_Mooney_Rivlin_initial_1, - level, num_comp_u, U_g_size, U_loc_size, fine_mult, data); - CHKERRQ(ierr); + PetscCall(SetupLibceedLevel(dm, ceed, app_ctx, finite_strain_Mooney_Rivlin_initial_1, level, num_comp_u, U_g_size, U_loc_size, fine_mult, data)); PetscFunctionReturn(0); }; diff --git a/examples/solids/problems/finite-strain-neo-hookean-current-1.c b/examples/solids/problems/finite-strain-neo-hookean-current-1.c index ede53ac0a4..c799491db4 100644 --- a/examples/solids/problems/finite-strain-neo-hookean-current-1.c +++ b/examples/solids/problems/finite-strain-neo-hookean-current-1.c @@ -5,63 +5,53 @@ // // This file is part of CEED: http://github.com/ceed +#include "../qfunctions/finite-strain-neo-hookean-current-1.h" + #include -#include "../include/structs.h" + #include "../include/setup-libceed.h" -#include "../problems/problems.h" +#include "../include/structs.h" #include "../problems/neo-hookean.h" +#include "../problems/problems.h" #include "../qfunctions/common.h" -#include "../qfunctions/finite-strain-neo-hookean-current-1.h" static const char *const field_names[] = {"gradu"}; -static CeedInt field_sizes[] = {9}; +static CeedInt field_sizes[] = {9}; ProblemData finite_strain_neo_Hookean_current_1 = { - .setup_geo = SetupGeo, - .setup_geo_loc = SetupGeo_loc, - .q_data_size = 10, - .quadrature_mode = CEED_GAUSS, - .residual = ElasFSCurrentNH1F, - .residual_loc = ElasFSCurrentNH1F_loc, - .number_fields_stored = 1, - .field_names = field_names, - .field_sizes = field_sizes, - .jacobian = ElasFSCurrentNH1dF, - .jacobian_loc = ElasFSCurrentNH1dF_loc, - .energy = ElasFSCurrentNH1Energy, - .energy_loc = ElasFSCurrentNH1Energy_loc, - .diagnostic = ElasFSCurrentNH1Diagnostic, - .diagnostic_loc = ElasFSCurrentNH1Diagnostic_loc, + .setup_geo = SetupGeo, + 
.setup_geo_loc = SetupGeo_loc, + .q_data_size = 10, + .quadrature_mode = CEED_GAUSS, + .residual = ElasFSCurrentNH1F, + .residual_loc = ElasFSCurrentNH1F_loc, + .number_fields_stored = 1, + .field_names = field_names, + .field_sizes = field_sizes, + .jacobian = ElasFSCurrentNH1dF, + .jacobian_loc = ElasFSCurrentNH1dF_loc, + .energy = ElasFSCurrentNH1Energy, + .energy_loc = ElasFSCurrentNH1Energy_loc, + .diagnostic = ElasFSCurrentNH1Diagnostic, + .diagnostic_loc = ElasFSCurrentNH1Diagnostic_loc, }; -PetscErrorCode SetupLibceedFineLevel_ElasFSCurrentNH1(DM dm, DM dm_energy, - DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx, - PetscInt fine_level, PetscInt num_comp_u, PetscInt U_g_size, - PetscInt U_loc_size, CeedVector force_ceed, CeedVector neumann_ceed, - CeedData *data) { - PetscErrorCode ierr; - +PetscErrorCode SetupLibceedFineLevel_ElasFSCurrentNH1(DM dm, DM dm_energy, DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx, + PetscInt fine_level, PetscInt num_comp_u, PetscInt U_g_size, PetscInt U_loc_size, + CeedVector force_ceed, CeedVector neumann_ceed, CeedData *data) { PetscFunctionBegin; - ierr = SetupLibceedFineLevel(dm, dm_energy, dm_diagnostic, ceed, app_ctx, - phys_ctx, finite_strain_neo_Hookean_current_1, - fine_level, num_comp_u, U_g_size, U_loc_size, - force_ceed, neumann_ceed, data); CHKERRQ(ierr); + PetscCall(SetupLibceedFineLevel(dm, dm_energy, dm_diagnostic, ceed, app_ctx, phys_ctx, finite_strain_neo_Hookean_current_1, fine_level, num_comp_u, + U_g_size, U_loc_size, force_ceed, neumann_ceed, data)); PetscFunctionReturn(0); }; -PetscErrorCode SetupLibceedLevel_ElasFSCurrentNH1(DM dm, Ceed ceed, - AppCtx app_ctx, PetscInt level, PetscInt num_comp_u, PetscInt U_g_size, - PetscInt U_loc_size, CeedVector fine_mult, CeedData *data) { - PetscErrorCode ierr; - +PetscErrorCode SetupLibceedLevel_ElasFSCurrentNH1(DM dm, Ceed ceed, AppCtx app_ctx, PetscInt level, PetscInt num_comp_u, PetscInt U_g_size, + 
PetscInt U_loc_size, CeedVector fine_mult, CeedData *data) { PetscFunctionBegin; - ierr = SetupLibceedLevel(dm, ceed, app_ctx, - finite_strain_neo_Hookean_current_1, - level, num_comp_u, U_g_size, U_loc_size, fine_mult, data); - CHKERRQ(ierr); + PetscCall(SetupLibceedLevel(dm, ceed, app_ctx, finite_strain_neo_Hookean_current_1, level, num_comp_u, U_g_size, U_loc_size, fine_mult, data)); PetscFunctionReturn(0); }; diff --git a/examples/solids/problems/finite-strain-neo-hookean-current-2.c b/examples/solids/problems/finite-strain-neo-hookean-current-2.c index c547dc59e6..9f143f15d0 100644 --- a/examples/solids/problems/finite-strain-neo-hookean-current-2.c +++ b/examples/solids/problems/finite-strain-neo-hookean-current-2.c @@ -5,63 +5,53 @@ // // This file is part of CEED: http://github.com/ceed +#include "../qfunctions/finite-strain-neo-hookean-current-2.h" + #include -#include "../include/structs.h" + #include "../include/setup-libceed.h" -#include "../problems/problems.h" +#include "../include/structs.h" #include "../problems/neo-hookean.h" +#include "../problems/problems.h" #include "../qfunctions/common.h" -#include "../qfunctions/finite-strain-neo-hookean-current-2.h" static const char *const field_names[] = {"dXdx", "tau", "lambda_log_J"}; -static CeedInt field_sizes[] = {9, 6, 1}; +static CeedInt field_sizes[] = {9, 6, 1}; ProblemData finite_strain_neo_Hookean_current_2 = { - .setup_geo = SetupGeo, - .setup_geo_loc = SetupGeo_loc, - .q_data_size = 10, - .quadrature_mode = CEED_GAUSS, - .residual = ElasFSCurrentNH2F, - .residual_loc = ElasFSCurrentNH2F_loc, - .number_fields_stored = 3, - .field_names = field_names, - .field_sizes = field_sizes, - .jacobian = ElasFSCurrentNH2dF, - .jacobian_loc = ElasFSCurrentNH2dF_loc, - .energy = ElasFSCurrentNH2Energy, - .energy_loc = ElasFSCurrentNH2Energy_loc, - .diagnostic = ElasFSCurrentNH2Diagnostic, - .diagnostic_loc = ElasFSCurrentNH2Diagnostic_loc, + .setup_geo = SetupGeo, + .setup_geo_loc = SetupGeo_loc, + 
.q_data_size = 10, + .quadrature_mode = CEED_GAUSS, + .residual = ElasFSCurrentNH2F, + .residual_loc = ElasFSCurrentNH2F_loc, + .number_fields_stored = 3, + .field_names = field_names, + .field_sizes = field_sizes, + .jacobian = ElasFSCurrentNH2dF, + .jacobian_loc = ElasFSCurrentNH2dF_loc, + .energy = ElasFSCurrentNH2Energy, + .energy_loc = ElasFSCurrentNH2Energy_loc, + .diagnostic = ElasFSCurrentNH2Diagnostic, + .diagnostic_loc = ElasFSCurrentNH2Diagnostic_loc, }; -PetscErrorCode SetupLibceedFineLevel_ElasFSCurrentNH2(DM dm, DM dm_energy, - DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx, - PetscInt fine_level, PetscInt num_comp_u, PetscInt U_g_size, - PetscInt U_loc_size, CeedVector force_ceed, CeedVector neumann_ceed, - CeedData *data) { - PetscErrorCode ierr; - +PetscErrorCode SetupLibceedFineLevel_ElasFSCurrentNH2(DM dm, DM dm_energy, DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx, + PetscInt fine_level, PetscInt num_comp_u, PetscInt U_g_size, PetscInt U_loc_size, + CeedVector force_ceed, CeedVector neumann_ceed, CeedData *data) { PetscFunctionBegin; - ierr = SetupLibceedFineLevel(dm, dm_energy, dm_diagnostic, ceed, app_ctx, - phys_ctx, finite_strain_neo_Hookean_current_2, - fine_level, num_comp_u, U_g_size, U_loc_size, - force_ceed, neumann_ceed, data); CHKERRQ(ierr); + PetscCall(SetupLibceedFineLevel(dm, dm_energy, dm_diagnostic, ceed, app_ctx, phys_ctx, finite_strain_neo_Hookean_current_2, fine_level, num_comp_u, + U_g_size, U_loc_size, force_ceed, neumann_ceed, data)); PetscFunctionReturn(0); }; -PetscErrorCode SetupLibceedLevel_ElasFSCurrentNH2(DM dm, Ceed ceed, - AppCtx app_ctx, PetscInt level, PetscInt num_comp_u, PetscInt U_g_size, - PetscInt U_loc_size, CeedVector fine_mult, CeedData *data) { - PetscErrorCode ierr; - +PetscErrorCode SetupLibceedLevel_ElasFSCurrentNH2(DM dm, Ceed ceed, AppCtx app_ctx, PetscInt level, PetscInt num_comp_u, PetscInt U_g_size, + PetscInt U_loc_size, CeedVector 
fine_mult, CeedData *data) { PetscFunctionBegin; - ierr = SetupLibceedLevel(dm, ceed, app_ctx, - finite_strain_neo_Hookean_current_2, - level, num_comp_u, U_g_size, U_loc_size, fine_mult, data); - CHKERRQ(ierr); + PetscCall(SetupLibceedLevel(dm, ceed, app_ctx, finite_strain_neo_Hookean_current_2, level, num_comp_u, U_g_size, U_loc_size, fine_mult, data)); PetscFunctionReturn(0); }; diff --git a/examples/solids/problems/finite-strain-neo-hookean-initial-1.c b/examples/solids/problems/finite-strain-neo-hookean-initial-1.c index faa7a0c01a..dde396a2ad 100644 --- a/examples/solids/problems/finite-strain-neo-hookean-initial-1.c +++ b/examples/solids/problems/finite-strain-neo-hookean-initial-1.c @@ -5,63 +5,53 @@ // // This file is part of CEED: http://github.com/ceed +#include "../qfunctions/finite-strain-neo-hookean-initial-1.h" + #include -#include "../include/structs.h" + #include "../include/setup-libceed.h" -#include "../problems/problems.h" +#include "../include/structs.h" #include "../problems/neo-hookean.h" +#include "../problems/problems.h" #include "../qfunctions/common.h" -#include "../qfunctions/finite-strain-neo-hookean-initial-1.h" static const char *const field_names[] = {"gradu"}; -static CeedInt field_sizes[] = {9}; +static CeedInt field_sizes[] = {9}; ProblemData finite_strain_neo_Hookean_initial_1 = { - .setup_geo = SetupGeo, - .setup_geo_loc = SetupGeo_loc, - .q_data_size = 10, - .quadrature_mode = CEED_GAUSS, - .residual = ElasFSInitialNH1F, - .residual_loc = ElasFSInitialNH1F_loc, - .number_fields_stored = 1, - .field_names = field_names, - .field_sizes = field_sizes, - .jacobian = ElasFSInitialNH1dF, - .jacobian_loc = ElasFSInitialNH1dF_loc, - .energy = ElasFSInitialNH1Energy, - .energy_loc = ElasFSInitialNH1Energy_loc, - .diagnostic = ElasFSInitialNH1Diagnostic, - .diagnostic_loc = ElasFSInitialNH1Diagnostic_loc, + .setup_geo = SetupGeo, + .setup_geo_loc = SetupGeo_loc, + .q_data_size = 10, + .quadrature_mode = CEED_GAUSS, + .residual = 
ElasFSInitialNH1F, + .residual_loc = ElasFSInitialNH1F_loc, + .number_fields_stored = 1, + .field_names = field_names, + .field_sizes = field_sizes, + .jacobian = ElasFSInitialNH1dF, + .jacobian_loc = ElasFSInitialNH1dF_loc, + .energy = ElasFSInitialNH1Energy, + .energy_loc = ElasFSInitialNH1Energy_loc, + .diagnostic = ElasFSInitialNH1Diagnostic, + .diagnostic_loc = ElasFSInitialNH1Diagnostic_loc, }; -PetscErrorCode SetupLibceedFineLevel_ElasFSInitialNH1(DM dm, DM dm_energy, - DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx, - PetscInt fine_level, PetscInt num_comp_u, PetscInt U_g_size, - PetscInt U_loc_size, CeedVector force_ceed, CeedVector neumann_ceed, - CeedData *data) { - PetscErrorCode ierr; - +PetscErrorCode SetupLibceedFineLevel_ElasFSInitialNH1(DM dm, DM dm_energy, DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx, + PetscInt fine_level, PetscInt num_comp_u, PetscInt U_g_size, PetscInt U_loc_size, + CeedVector force_ceed, CeedVector neumann_ceed, CeedData *data) { PetscFunctionBegin; - ierr = SetupLibceedFineLevel(dm, dm_energy, dm_diagnostic, ceed, app_ctx, - phys_ctx, finite_strain_neo_Hookean_initial_1, - fine_level, num_comp_u, U_g_size, U_loc_size, - force_ceed, neumann_ceed, data); CHKERRQ(ierr); + PetscCall(SetupLibceedFineLevel(dm, dm_energy, dm_diagnostic, ceed, app_ctx, phys_ctx, finite_strain_neo_Hookean_initial_1, fine_level, num_comp_u, + U_g_size, U_loc_size, force_ceed, neumann_ceed, data)); PetscFunctionReturn(0); }; -PetscErrorCode SetupLibceedLevel_ElasFSInitialNH1(DM dm, Ceed ceed, - AppCtx app_ctx, PetscInt level, PetscInt num_comp_u, PetscInt U_g_size, - PetscInt U_loc_size, CeedVector fine_mult, CeedData *data) { - PetscErrorCode ierr; - +PetscErrorCode SetupLibceedLevel_ElasFSInitialNH1(DM dm, Ceed ceed, AppCtx app_ctx, PetscInt level, PetscInt num_comp_u, PetscInt U_g_size, + PetscInt U_loc_size, CeedVector fine_mult, CeedData *data) { PetscFunctionBegin; - ierr = 
SetupLibceedLevel(dm, ceed, app_ctx, - finite_strain_neo_Hookean_initial_1, - level, num_comp_u, U_g_size, U_loc_size, fine_mult, data); - CHKERRQ(ierr); + PetscCall(SetupLibceedLevel(dm, ceed, app_ctx, finite_strain_neo_Hookean_initial_1, level, num_comp_u, U_g_size, U_loc_size, fine_mult, data)); PetscFunctionReturn(0); }; diff --git a/examples/solids/problems/finite-strain-neo-hookean-initial-2.c b/examples/solids/problems/finite-strain-neo-hookean-initial-2.c index c511068c5c..8155cc9218 100644 --- a/examples/solids/problems/finite-strain-neo-hookean-initial-2.c +++ b/examples/solids/problems/finite-strain-neo-hookean-initial-2.c @@ -5,63 +5,53 @@ // // This file is part of CEED: http://github.com/ceed +#include "../qfunctions/finite-strain-neo-hookean-initial-2.h" + #include -#include "../include/structs.h" + #include "../include/setup-libceed.h" -#include "../problems/problems.h" +#include "../include/structs.h" #include "../problems/neo-hookean.h" +#include "../problems/problems.h" #include "../qfunctions/common.h" -#include "../qfunctions/finite-strain-neo-hookean-initial-2.h" static const char *const field_names[] = {"gradu", "C_inv", "lambda_log_J"}; -static CeedInt field_sizes[] = {9, 6, 1}; +static CeedInt field_sizes[] = {9, 6, 1}; ProblemData finite_strain_neo_Hookean_initial_2 = { - .setup_geo = SetupGeo, - .setup_geo_loc = SetupGeo_loc, - .q_data_size = 10, - .quadrature_mode = CEED_GAUSS, - .residual = ElasFSInitialNH2F, - .residual_loc = ElasFSInitialNH2F_loc, - .number_fields_stored = 3, - .field_names = field_names, - .field_sizes = field_sizes, - .jacobian = ElasFSInitialNH2dF, - .jacobian_loc = ElasFSInitialNH2dF_loc, - .energy = ElasFSInitialNH2Energy, - .energy_loc = ElasFSInitialNH2Energy_loc, - .diagnostic = ElasFSInitialNH2Diagnostic, - .diagnostic_loc = ElasFSInitialNH2Diagnostic_loc, + .setup_geo = SetupGeo, + .setup_geo_loc = SetupGeo_loc, + .q_data_size = 10, + .quadrature_mode = CEED_GAUSS, + .residual = ElasFSInitialNH2F, + 
.residual_loc = ElasFSInitialNH2F_loc, + .number_fields_stored = 3, + .field_names = field_names, + .field_sizes = field_sizes, + .jacobian = ElasFSInitialNH2dF, + .jacobian_loc = ElasFSInitialNH2dF_loc, + .energy = ElasFSInitialNH2Energy, + .energy_loc = ElasFSInitialNH2Energy_loc, + .diagnostic = ElasFSInitialNH2Diagnostic, + .diagnostic_loc = ElasFSInitialNH2Diagnostic_loc, }; -PetscErrorCode SetupLibceedFineLevel_ElasFSInitialNH2(DM dm, DM dm_energy, - DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx, - PetscInt fine_level, PetscInt num_comp_u, PetscInt U_g_size, - PetscInt U_loc_size, CeedVector force_ceed, CeedVector neumann_ceed, - CeedData *data) { - PetscErrorCode ierr; - +PetscErrorCode SetupLibceedFineLevel_ElasFSInitialNH2(DM dm, DM dm_energy, DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx, + PetscInt fine_level, PetscInt num_comp_u, PetscInt U_g_size, PetscInt U_loc_size, + CeedVector force_ceed, CeedVector neumann_ceed, CeedData *data) { PetscFunctionBegin; - ierr = SetupLibceedFineLevel(dm, dm_energy, dm_diagnostic, ceed, app_ctx, - phys_ctx, finite_strain_neo_Hookean_initial_2, - fine_level, num_comp_u, U_g_size, U_loc_size, - force_ceed, neumann_ceed, data); CHKERRQ(ierr); + PetscCall(SetupLibceedFineLevel(dm, dm_energy, dm_diagnostic, ceed, app_ctx, phys_ctx, finite_strain_neo_Hookean_initial_2, fine_level, num_comp_u, + U_g_size, U_loc_size, force_ceed, neumann_ceed, data)); PetscFunctionReturn(0); }; -PetscErrorCode SetupLibceedLevel_ElasFSInitialNH2(DM dm, Ceed ceed, - AppCtx app_ctx, PetscInt level, PetscInt num_comp_u, PetscInt U_g_size, - PetscInt U_loc_size, CeedVector fine_mult, CeedData *data) { - PetscErrorCode ierr; - +PetscErrorCode SetupLibceedLevel_ElasFSInitialNH2(DM dm, Ceed ceed, AppCtx app_ctx, PetscInt level, PetscInt num_comp_u, PetscInt U_g_size, + PetscInt U_loc_size, CeedVector fine_mult, CeedData *data) { PetscFunctionBegin; - ierr = SetupLibceedLevel(dm, ceed, app_ctx, 
- finite_strain_neo_Hookean_initial_2, - level, num_comp_u, U_g_size, U_loc_size, fine_mult, data); - CHKERRQ(ierr); + PetscCall(SetupLibceedLevel(dm, ceed, app_ctx, finite_strain_neo_Hookean_initial_2, level, num_comp_u, U_g_size, U_loc_size, fine_mult, data)); PetscFunctionReturn(0); }; diff --git a/examples/solids/problems/linear.c b/examples/solids/problems/linear.c index 76144af744..5b659501fb 100644 --- a/examples/solids/problems/linear.c +++ b/examples/solids/problems/linear.c @@ -5,60 +5,51 @@ // // This file is part of CEED: http://github.com/ceed +#include "../qfunctions/linear.h" + #include -#include "../include/structs.h" + #include "../include/setup-libceed.h" -#include "../problems/problems.h" +#include "../include/structs.h" #include "../problems/neo-hookean.h" +#include "../problems/problems.h" #include "../qfunctions/common.h" -#include "../qfunctions/linear.h" #include "../qfunctions/manufactured-true.h" ProblemData linear_elasticity = { - .setup_geo = SetupGeo, - .setup_geo_loc = SetupGeo_loc, - .q_data_size = 10, - .quadrature_mode = CEED_GAUSS, - .residual = ElasLinearF, - .residual_loc = ElasLinearF_loc, - .number_fields_stored = 0, - .jacobian = ElasLineardF, - .jacobian_loc = ElasLineardF_loc, - .energy = ElasLinearEnergy, - .energy_loc = ElasLinearEnergy_loc, - .diagnostic = ElasLinearDiagnostic, - .diagnostic_loc = ElasLinearDiagnostic_loc, - .true_soln = MMSTrueSoln, - .true_soln_loc = MMSTrueSoln_loc, + .setup_geo = SetupGeo, + .setup_geo_loc = SetupGeo_loc, + .q_data_size = 10, + .quadrature_mode = CEED_GAUSS, + .residual = ElasLinearF, + .residual_loc = ElasLinearF_loc, + .number_fields_stored = 0, + .jacobian = ElasLineardF, + .jacobian_loc = ElasLineardF_loc, + .energy = ElasLinearEnergy, + .energy_loc = ElasLinearEnergy_loc, + .diagnostic = ElasLinearDiagnostic, + .diagnostic_loc = ElasLinearDiagnostic_loc, + .true_soln = MMSTrueSoln, + .true_soln_loc = MMSTrueSoln_loc, }; -PetscErrorCode SetupLibceedFineLevel_ElasLinear(DM dm, DM 
dm_energy, - DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx, - PetscInt fine_level, PetscInt num_comp_u, PetscInt U_g_size, - PetscInt U_loc_size, CeedVector force_ceed, CeedVector neumann_ceed, - CeedData *data) { - PetscErrorCode ierr; - +PetscErrorCode SetupLibceedFineLevel_ElasLinear(DM dm, DM dm_energy, DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx, + PetscInt fine_level, PetscInt num_comp_u, PetscInt U_g_size, PetscInt U_loc_size, + CeedVector force_ceed, CeedVector neumann_ceed, CeedData *data) { PetscFunctionBegin; - ierr = SetupLibceedFineLevel(dm, dm_energy, dm_diagnostic, ceed, app_ctx, - phys_ctx, linear_elasticity, - fine_level, num_comp_u, U_g_size, U_loc_size, - force_ceed, neumann_ceed, data); CHKERRQ(ierr); + PetscCall(SetupLibceedFineLevel(dm, dm_energy, dm_diagnostic, ceed, app_ctx, phys_ctx, linear_elasticity, fine_level, num_comp_u, U_g_size, + U_loc_size, force_ceed, neumann_ceed, data)); PetscFunctionReturn(0); }; -PetscErrorCode SetupLibceedLevel_ElasLinear(DM dm, Ceed ceed, AppCtx app_ctx, - PetscInt level, PetscInt num_comp_u, PetscInt U_g_size, PetscInt U_loc_size, - CeedVector fine_mult, CeedData *data) { - PetscErrorCode ierr; - +PetscErrorCode SetupLibceedLevel_ElasLinear(DM dm, Ceed ceed, AppCtx app_ctx, PetscInt level, PetscInt num_comp_u, PetscInt U_g_size, + PetscInt U_loc_size, CeedVector fine_mult, CeedData *data) { PetscFunctionBegin; - ierr = SetupLibceedLevel(dm, ceed, app_ctx, linear_elasticity, - level, num_comp_u, U_g_size, U_loc_size, fine_mult, data); - CHKERRQ(ierr); + PetscCall(SetupLibceedLevel(dm, ceed, app_ctx, linear_elasticity, level, num_comp_u, U_g_size, U_loc_size, fine_mult, data)); PetscFunctionReturn(0); }; diff --git a/examples/solids/problems/mooney-rivlin.c b/examples/solids/problems/mooney-rivlin.c index 712a6b707f..7b9e45279e 100644 --- a/examples/solids/problems/mooney-rivlin.c +++ b/examples/solids/problems/mooney-rivlin.c @@ -5,64 +5,54 @@ // // 
This file is part of CEED: http://github.com/ceed +#include "../problems/mooney-rivlin.h" + #include #include -#include "../problems/mooney-rivlin.h" // Build libCEED context object -PetscErrorCode PhysicsContext_MR(MPI_Comm comm, Ceed ceed, Units *units, - CeedQFunctionContext *ctx) { - PetscErrorCode ierr; +PetscErrorCode PhysicsContext_MR(MPI_Comm comm, Ceed ceed, Units *units, CeedQFunctionContext *ctx) { Physics_MR phys; PetscFunctionBegin; - ierr = PetscMalloc1(1, units); CHKERRQ(ierr); - ierr = PetscMalloc1(1, &phys); CHKERRQ(ierr); - ierr = ProcessPhysics_MR(comm, phys, *units); CHKERRQ(ierr); + PetscCall(PetscMalloc1(1, units)); + PetscCall(PetscMalloc1(1, &phys)); + PetscCall(ProcessPhysics_MR(comm, phys, *units)); CeedQFunctionContextCreate(ceed, ctx); - CeedQFunctionContextSetData(*ctx, CEED_MEM_HOST, CEED_COPY_VALUES, - sizeof(*phys), phys); - ierr = PetscFree(phys); CHKERRQ(ierr); + CeedQFunctionContextSetData(*ctx, CEED_MEM_HOST, CEED_COPY_VALUES, sizeof(*phys), phys); + PetscCall(PetscFree(phys)); PetscFunctionReturn(0); } // Build libCEED smoother context object -PetscErrorCode PhysicsSmootherContext_MR(MPI_Comm comm, Ceed ceed, - CeedQFunctionContext ctx, CeedQFunctionContext *ctx_smoother) { - PetscErrorCode ierr; +PetscErrorCode PhysicsSmootherContext_MR(MPI_Comm comm, Ceed ceed, CeedQFunctionContext ctx, CeedQFunctionContext *ctx_smoother) { PetscScalar nu_smoother = 0; - PetscBool nu_flag = PETSC_FALSE; - Physics_MR phys, phys_smoother; + PetscBool nu_flag = PETSC_FALSE; + Physics_MR phys, phys_smoother; PetscFunctionBegin; - PetscOptionsBegin(comm, NULL, "Mooney Rivlin physical parameters for smoother", - NULL); + PetscOptionsBegin(comm, NULL, "Mooney Rivlin physical parameters for smoother", NULL); - ierr = PetscOptionsScalar("-nu_smoother", "Poisson's ratio for smoother", - NULL, nu_smoother, &nu_smoother, &nu_flag); - CHKERRQ(ierr); - if (nu_smoother < 0 || - nu_smoother >= 0.5) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, - "Mooney-Rivlin 
model requires Poisson ratio -nu option in [0, .5)"); + PetscCall(PetscOptionsScalar("-nu_smoother", "Poisson's ratio for smoother", NULL, nu_smoother, &nu_smoother, &nu_flag)); + if (nu_smoother < 0 || nu_smoother >= 0.5) + SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mooney-Rivlin model requires Poisson ratio -nu option in [0, .5)"); - PetscOptionsEnd(); // End of setting Physics + PetscOptionsEnd(); // End of setting Physics if (nu_flag) { // Copy context CeedQFunctionContextGetData(ctx, CEED_MEM_HOST, &phys); - ierr = PetscMalloc1(1, &phys_smoother); CHKERRQ(ierr); - ierr = PetscMemcpy(phys_smoother, phys, sizeof(*phys)); CHKERRQ(ierr); + PetscCall(PetscMalloc1(1, &phys_smoother)); + PetscCall(PetscMemcpy(phys_smoother, phys, sizeof(*phys))); CeedQFunctionContextRestoreData(ctx, &phys); // Create smoother context CeedQFunctionContextCreate(ceed, ctx_smoother); - phys_smoother->lambda = 2 * (phys_smoother->mu_1 + phys_smoother->mu_2) * - nu_smoother / (1 - 2*nu_smoother); - CeedQFunctionContextSetData(*ctx_smoother, CEED_MEM_HOST, CEED_COPY_VALUES, - sizeof(*phys_smoother), phys_smoother); - ierr = PetscFree(phys_smoother); CHKERRQ(ierr); + phys_smoother->lambda = 2 * (phys_smoother->mu_1 + phys_smoother->mu_2) * nu_smoother / (1 - 2 * nu_smoother); + CeedQFunctionContextSetData(*ctx_smoother, CEED_MEM_HOST, CEED_COPY_VALUES, sizeof(*phys_smoother), phys_smoother); + PetscCall(PetscFree(phys_smoother)); } else { *ctx_smoother = NULL; } @@ -72,51 +62,38 @@ PetscErrorCode PhysicsSmootherContext_MR(MPI_Comm comm, Ceed ceed, // Process physics options - Mooney-Rivlin PetscErrorCode ProcessPhysics_MR(MPI_Comm comm, Physics_MR phys, Units units) { - PetscErrorCode ierr; - PetscReal nu = -1; - phys->mu_1 = -1; - phys->mu_2 = -1; - phys->lambda = -1; - units->meter = 1; // 1 meter in scaled length units - units->second = 1; // 1 second in scaled time units - units->kilogram = 1; // 1 kilogram in scaled mass units + PetscReal nu = -1; + phys->mu_1 = -1; + phys->mu_2 = -1; + 
phys->lambda = -1; + units->meter = 1; // 1 meter in scaled length units + units->second = 1; // 1 second in scaled time units + units->kilogram = 1; // 1 kilogram in scaled mass units PetscFunctionBeginUser; PetscOptionsBegin(comm, NULL, "Mooney Rivlin physical parameters", NULL); - ierr = PetscOptionsScalar("-mu_1", "Material Property mu_1", NULL, - phys->mu_1, &phys->mu_1, NULL); CHKERRQ(ierr); - if (phys->mu_1 < 0) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, - "Mooney-Rivlin model requires non-negative -mu_1 option (Pa)"); - - ierr = PetscOptionsScalar("-mu_2", "Material Property mu_2", NULL, - phys->mu_2, &phys->mu_2, NULL); CHKERRQ(ierr); - if (phys->mu_2 < 0) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, - "Mooney-Rivlin model requires non-negative -mu_2 option (Pa)"); - - ierr = PetscOptionsScalar("-nu", "Poisson ratio", NULL, - nu, &nu, NULL); CHKERRQ(ierr); - if (nu < 0 || nu >= 0.5) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, - "Mooney-Rivlin model requires Poisson ratio -nu option in [0, .5)"); - phys->lambda = 2 * (phys->mu_1 + phys->mu_2) * nu / (1 - 2*nu); - - ierr = PetscOptionsScalar("-units_meter", "1 meter in scaled length units", - NULL, units->meter, &units->meter, NULL); - CHKERRQ(ierr); + PetscCall(PetscOptionsScalar("-mu_1", "Material Property mu_1", NULL, phys->mu_1, &phys->mu_1, NULL)); + if (phys->mu_1 < 0) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mooney-Rivlin model requires non-negative -mu_1 option (Pa)"); + + PetscCall(PetscOptionsScalar("-mu_2", "Material Property mu_2", NULL, phys->mu_2, &phys->mu_2, NULL)); + if (phys->mu_2 < 0) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mooney-Rivlin model requires non-negative -mu_2 option (Pa)"); + + PetscCall(PetscOptionsScalar("-nu", "Poisson ratio", NULL, nu, &nu, NULL)); + if (nu < 0 || nu >= 0.5) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mooney-Rivlin model requires Poisson ratio -nu option in [0, .5)"); + phys->lambda = 2 * (phys->mu_1 + phys->mu_2) * nu / (1 - 2 * nu); + + 
PetscCall(PetscOptionsScalar("-units_meter", "1 meter in scaled length units", NULL, units->meter, &units->meter, NULL)); units->meter = fabs(units->meter); - ierr = PetscOptionsScalar("-units_second", "1 second in scaled time units", - NULL, units->second, &units->second, NULL); - CHKERRQ(ierr); + PetscCall(PetscOptionsScalar("-units_second", "1 second in scaled time units", NULL, units->second, &units->second, NULL)); units->second = fabs(units->second); - ierr = PetscOptionsScalar("-units_kilogram", "1 kilogram in scaled mass units", - NULL, units->kilogram, &units->kilogram, NULL); - CHKERRQ(ierr); + PetscCall(PetscOptionsScalar("-units_kilogram", "1 kilogram in scaled mass units", NULL, units->kilogram, &units->kilogram, NULL)); units->kilogram = fabs(units->kilogram); - PetscOptionsEnd(); // End of setting Physics + PetscOptionsEnd(); // End of setting Physics // Define derived units units->Pascal = units->kilogram / (units->meter * PetscSqr(units->second)); diff --git a/examples/solids/problems/mooney-rivlin.h b/examples/solids/problems/mooney-rivlin.h index 4ae112e560..f0843cb183 100644 --- a/examples/solids/problems/mooney-rivlin.h +++ b/examples/solids/problems/mooney-rivlin.h @@ -9,6 +9,7 @@ #define mooney_rivlin_h #include + #include "../include/structs.h" #ifndef PHYSICS_STRUCT_MR @@ -20,15 +21,13 @@ struct Physics_MR_ { CeedScalar mu_2; CeedScalar lambda; }; -#endif // PHYSICS_STRUCT_MR +#endif // PHYSICS_STRUCT_MR // Create context object -PetscErrorCode PhysicsContext_MR(MPI_Comm comm, Ceed ceed, Units *units, - CeedQFunctionContext *ctx); -PetscErrorCode PhysicsSmootherContext_MR(MPI_Comm comm, Ceed ceed, - CeedQFunctionContext ctx, CeedQFunctionContext *ctx_smoother); +PetscErrorCode PhysicsContext_MR(MPI_Comm comm, Ceed ceed, Units *units, CeedQFunctionContext *ctx); +PetscErrorCode PhysicsSmootherContext_MR(MPI_Comm comm, Ceed ceed, CeedQFunctionContext ctx, CeedQFunctionContext *ctx_smoother); // Process physics options - Mooney-Rivlin 
PetscErrorCode ProcessPhysics_MR(MPI_Comm comm, Physics_MR phys, Units units); -#endif // mooney_rivlin_h +#endif // mooney_rivlin_h diff --git a/examples/solids/problems/neo-hookean.c b/examples/solids/problems/neo-hookean.c index 5fd17a8c52..f9e7cf0fee 100644 --- a/examples/solids/problems/neo-hookean.c +++ b/examples/solids/problems/neo-hookean.c @@ -5,60 +5,52 @@ // // This file is part of CEED: http://github.com/ceed +#include "../problems/neo-hookean.h" + #include #include -#include "../problems/neo-hookean.h" // Build libCEED context object -PetscErrorCode PhysicsContext_NH(MPI_Comm comm, Ceed ceed, Units *units, - CeedQFunctionContext *ctx) { - PetscErrorCode ierr; +PetscErrorCode PhysicsContext_NH(MPI_Comm comm, Ceed ceed, Units *units, CeedQFunctionContext *ctx) { Physics_NH phys; PetscFunctionBegin; - ierr = PetscMalloc1(1, units); CHKERRQ(ierr); - ierr = PetscMalloc1(1, &phys); CHKERRQ(ierr); - ierr = ProcessPhysics_NH(comm, phys, *units); CHKERRQ(ierr); + PetscCall(PetscMalloc1(1, units)); + PetscCall(PetscMalloc1(1, &phys)); + PetscCall(ProcessPhysics_NH(comm, phys, *units)); CeedQFunctionContextCreate(ceed, ctx); - CeedQFunctionContextSetData(*ctx, CEED_MEM_HOST, CEED_COPY_VALUES, - sizeof(*phys), phys); - ierr = PetscFree(phys); CHKERRQ(ierr); + CeedQFunctionContextSetData(*ctx, CEED_MEM_HOST, CEED_COPY_VALUES, sizeof(*phys), phys); + PetscCall(PetscFree(phys)); PetscFunctionReturn(0); } // Build libCEED smoother context object -PetscErrorCode PhysicsSmootherContext_NH(MPI_Comm comm, Ceed ceed, - CeedQFunctionContext ctx, CeedQFunctionContext *ctx_smoother) { - PetscErrorCode ierr; +PetscErrorCode PhysicsSmootherContext_NH(MPI_Comm comm, Ceed ceed, CeedQFunctionContext ctx, CeedQFunctionContext *ctx_smoother) { PetscScalar nu_smoother = 0; - PetscBool nu_flag = PETSC_FALSE; - Physics_NH phys, phys_smoother; + PetscBool nu_flag = PETSC_FALSE; + Physics_NH phys, phys_smoother; PetscFunctionBegin; - PetscOptionsBegin(comm, NULL, "Neo-Hookean physical 
parameters for smoother", - NULL); + PetscOptionsBegin(comm, NULL, "Neo-Hookean physical parameters for smoother", NULL); - ierr = PetscOptionsScalar("-nu_smoother", "Poisson's ratio for smoother", - NULL, nu_smoother, &nu_smoother, &nu_flag); - CHKERRQ(ierr); + PetscCall(PetscOptionsScalar("-nu_smoother", "Poisson's ratio for smoother", NULL, nu_smoother, &nu_smoother, &nu_flag)); - PetscOptionsEnd(); // End of setting Physics + PetscOptionsEnd(); // End of setting Physics if (nu_flag) { // Copy context CeedQFunctionContextGetData(ctx, CEED_MEM_HOST, &phys); - ierr = PetscMalloc1(1, &phys_smoother); CHKERRQ(ierr); - ierr = PetscMemcpy(phys_smoother, phys, sizeof(*phys)); CHKERRQ(ierr); + PetscCall(PetscMalloc1(1, &phys_smoother)); + PetscCall(PetscMemcpy(phys_smoother, phys, sizeof(*phys))); CeedQFunctionContextRestoreData(ctx, &phys); // Create smoother context CeedQFunctionContextCreate(ceed, ctx_smoother); phys_smoother->nu = nu_smoother; - CeedQFunctionContextSetData(*ctx_smoother, CEED_MEM_HOST, CEED_COPY_VALUES, - sizeof(*phys_smoother), phys_smoother); - ierr = PetscFree(phys_smoother); CHKERRQ(ierr); + CeedQFunctionContextSetData(*ctx_smoother, CEED_MEM_HOST, CEED_COPY_VALUES, sizeof(*phys_smoother), phys_smoother); + PetscCall(PetscFree(phys_smoother)); } else { *ctx_smoother = NULL; } @@ -68,41 +60,32 @@ PetscErrorCode PhysicsSmootherContext_NH(MPI_Comm comm, Ceed ceed, // Process physics options - Neo-Hookean PetscErrorCode ProcessPhysics_NH(MPI_Comm comm, Physics_NH phys, Units units) { - PetscErrorCode ierr; - PetscBool nu_flag = PETSC_FALSE; + PetscBool nu_flag = PETSC_FALSE; PetscBool Young_flag = PETSC_FALSE; - phys->nu = 0; - phys->E = 0; - units->meter = 1; // 1 meter in scaled length units - units->second = 1; // 1 second in scaled time units - units->kilogram = 1; // 1 kilogram in scaled mass units + phys->nu = 0; + phys->E = 0; + units->meter = 1; // 1 meter in scaled length units + units->second = 1; // 1 second in scaled time units + 
units->kilogram = 1; // 1 kilogram in scaled mass units PetscFunctionBeginUser; PetscOptionsBegin(comm, NULL, "Neo-Hookean physical parameters", NULL); - ierr = PetscOptionsScalar("-nu", "Poisson's ratio", NULL, phys->nu, &phys->nu, - &nu_flag); CHKERRQ(ierr); + PetscCall(PetscOptionsScalar("-nu", "Poisson's ratio", NULL, phys->nu, &phys->nu, &nu_flag)); - ierr = PetscOptionsScalar("-E", "Young's Modulus", NULL, phys->E, &phys->E, - &Young_flag); CHKERRQ(ierr); + PetscCall(PetscOptionsScalar("-E", "Young's Modulus", NULL, phys->E, &phys->E, &Young_flag)); - ierr = PetscOptionsScalar("-units_meter", "1 meter in scaled length units", - NULL, units->meter, &units->meter, NULL); - CHKERRQ(ierr); + PetscCall(PetscOptionsScalar("-units_meter", "1 meter in scaled length units", NULL, units->meter, &units->meter, NULL)); units->meter = fabs(units->meter); - ierr = PetscOptionsScalar("-units_second", "1 second in scaled time units", - NULL, units->second, &units->second, NULL); - CHKERRQ(ierr); + PetscCall(PetscOptionsScalar("-units_second", "1 second in scaled time units", NULL, units->second, &units->second, NULL)); units->second = fabs(units->second); - ierr = PetscOptionsScalar("-units_kilogram", "1 kilogram in scaled mass units", - NULL, units->kilogram, &units->kilogram, NULL); - CHKERRQ(ierr); + PetscCall(PetscOptionsScalar("-units_kilogram", "1 kilogram in scaled mass units", NULL, units->kilogram, &units->kilogram, NULL)); units->kilogram = fabs(units->kilogram); - PetscOptionsEnd(); // End of setting Physics + PetscOptionsEnd(); // End of setting Physics // Check for all required options to be set if (!nu_flag) { diff --git a/examples/solids/problems/neo-hookean.h b/examples/solids/problems/neo-hookean.h index 431f3306fd..5b00dd6966 100644 --- a/examples/solids/problems/neo-hookean.h +++ b/examples/solids/problems/neo-hookean.h @@ -9,24 +9,23 @@ #define neo_hookean_h #include + #include "../include/structs.h" #ifndef PHYSICS_STRUCT_NH #define PHYSICS_STRUCT_NH 
typedef struct Physics_NH_ *Physics_NH; struct Physics_NH_ { - CeedScalar nu; // Poisson's ratio - CeedScalar E; // Young's Modulus + CeedScalar nu; // Poisson's ratio + CeedScalar E; // Young's Modulus }; -#endif // PHYSICS_STRUCT_NH +#endif // PHYSICS_STRUCT_NH // Create context object -PetscErrorCode PhysicsContext_NH(MPI_Comm comm, Ceed ceed, Units *units, - CeedQFunctionContext *ctx); -PetscErrorCode PhysicsSmootherContext_NH(MPI_Comm comm, Ceed ceed, - CeedQFunctionContext ctx, CeedQFunctionContext *ctx_smoother); +PetscErrorCode PhysicsContext_NH(MPI_Comm comm, Ceed ceed, Units *units, CeedQFunctionContext *ctx); +PetscErrorCode PhysicsSmootherContext_NH(MPI_Comm comm, Ceed ceed, CeedQFunctionContext ctx, CeedQFunctionContext *ctx_smoother); // Process physics options PetscErrorCode ProcessPhysics_NH(MPI_Comm comm, Physics_NH phys, Units units); -#endif // neo_hookean_h +#endif // neo_hookean_h diff --git a/examples/solids/problems/problems.c b/examples/solids/problems/problems.c index 8685bdc635..0127e52077 100644 --- a/examples/solids/problems/problems.c +++ b/examples/solids/problems/problems.c @@ -5,27 +5,21 @@ // // This file is part of CEED: http://github.com/ceed +#include "../problems/problems.h" + #include #include -#include "../problems/problems.h" PetscErrorCode RegisterProblems(ProblemFunctions problem_functions) { - PetscErrorCode ierr; - PetscFunctionBegin; SOLIDS_PROBLEM_REGISTER(problem_functions, "Linear", ElasLinear, NH); SOLIDS_PROBLEM_REGISTER(problem_functions, "SS-NH", ElasSSNH, NH); - SOLIDS_PROBLEM_REGISTER(problem_functions, "FSCurrent-NH1", ElasFSCurrentNH1, - NH); - SOLIDS_PROBLEM_REGISTER(problem_functions, "FSCurrent-NH2", ElasFSCurrentNH2, - NH); - SOLIDS_PROBLEM_REGISTER(problem_functions, "FSInitial-NH1", ElasFSInitialNH1, - NH); - SOLIDS_PROBLEM_REGISTER(problem_functions, "FSInitial-NH2", ElasFSInitialNH2, - NH); - SOLIDS_PROBLEM_REGISTER(problem_functions, "FSInitial-MR1", ElasFSInitialMR1, - MR); + 
SOLIDS_PROBLEM_REGISTER(problem_functions, "FSCurrent-NH1", ElasFSCurrentNH1, NH); + SOLIDS_PROBLEM_REGISTER(problem_functions, "FSCurrent-NH2", ElasFSCurrentNH2, NH); + SOLIDS_PROBLEM_REGISTER(problem_functions, "FSInitial-NH1", ElasFSInitialNH1, NH); + SOLIDS_PROBLEM_REGISTER(problem_functions, "FSInitial-NH2", ElasFSInitialNH2, NH); + SOLIDS_PROBLEM_REGISTER(problem_functions, "FSInitial-MR1", ElasFSInitialMR1, MR); PetscFunctionReturn(0); }; diff --git a/examples/solids/problems/problems.h b/examples/solids/problems/problems.h index 6a4f568896..418c21a08a 100644 --- a/examples/solids/problems/problems.h +++ b/examples/solids/problems/problems.h @@ -10,38 +10,31 @@ #include #include + #include "../problems/cl-problems.h" -#include "../problems/neo-hookean.h" #include "../problems/mooney-rivlin.h" +#include "../problems/neo-hookean.h" // Physics options -#define SOLIDS_PROBLEM_REGISTER(list, name, fname, physics) \ - ierr = PetscFunctionListAdd(&list->setupPhysics, name, \ - PhysicsContext_ ## physics); CHKERRQ(ierr); \ - ierr = PetscFunctionListAdd(&list->setupSmootherPhysics, name, \ - PhysicsSmootherContext_ ## physics); CHKERRQ(ierr); \ - ierr = PetscFunctionListAdd(&list->setupLibceedFineLevel, name, \ - SetupLibceedFineLevel_ ## fname); CHKERRQ(ierr); \ - ierr = PetscFunctionListAdd(&list->setupLibceedLevel, name, \ - SetupLibceedLevel_ ## fname); CHKERRQ(ierr); \ +#define SOLIDS_PROBLEM_REGISTER(list, name, fname, physics) \ + PetscCall(PetscFunctionListAdd(&list->setupPhysics, name, PhysicsContext_##physics)); \ + PetscCall(PetscFunctionListAdd(&list->setupSmootherPhysics, name, PhysicsSmootherContext_##physics)); \ + PetscCall(PetscFunctionListAdd(&list->setupLibceedFineLevel, name, SetupLibceedFineLevel_##fname)); \ + PetscCall(PetscFunctionListAdd(&list->setupLibceedLevel, name, SetupLibceedLevel_##fname)); typedef struct ProblemFunctions_ *ProblemFunctions; struct ProblemFunctions_ { - PetscFunctionList setupPhysics, setupSmootherPhysics, 
setupLibceedFineLevel, - setupLibceedLevel; + PetscFunctionList setupPhysics, setupSmootherPhysics, setupLibceedFineLevel, setupLibceedLevel; }; PetscErrorCode RegisterProblems(ProblemFunctions problem_functions); -#define SOLIDS_PROBLEM(name) \ - PetscErrorCode SetupLibceedFineLevel_ ## name (DM dm, DM dm_energy, \ - DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx, \ - PetscInt fine_level, PetscInt num_comp_u, PetscInt U_g_size, \ - PetscInt U_loc_size, CeedVector force_ceed, CeedVector neumann_ceed, \ - CeedData *data); \ - PetscErrorCode SetupLibceedLevel_ ## name (DM dm, Ceed ceed, \ - AppCtx app_ctx, PetscInt level, PetscInt num_comp_u, PetscInt U_g_size, \ - PetscInt u_loc_size, CeedVector fine_mult, CeedData *data); \ +#define SOLIDS_PROBLEM(name) \ + PetscErrorCode SetupLibceedFineLevel_##name(DM dm, DM dm_energy, DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx, \ + PetscInt fine_level, PetscInt num_comp_u, PetscInt U_g_size, PetscInt U_loc_size, \ + CeedVector force_ceed, CeedVector neumann_ceed, CeedData *data); \ + PetscErrorCode SetupLibceedLevel_##name(DM dm, Ceed ceed, AppCtx app_ctx, PetscInt level, PetscInt num_comp_u, PetscInt U_g_size, \ + PetscInt u_loc_size, CeedVector fine_mult, CeedData *data); SOLIDS_PROBLEM(ElasLinear); SOLIDS_PROBLEM(ElasSSNH); @@ -51,4 +44,4 @@ SOLIDS_PROBLEM(ElasFSInitialNH1); SOLIDS_PROBLEM(ElasFSInitialNH2); SOLIDS_PROBLEM(ElasFSInitialMR1); -#endif //problems_h +#endif // problems_h diff --git a/examples/solids/problems/small-strain-neo-hookean.c b/examples/solids/problems/small-strain-neo-hookean.c index d5c9cd8a69..cec654b3cf 100644 --- a/examples/solids/problems/small-strain-neo-hookean.c +++ b/examples/solids/problems/small-strain-neo-hookean.c @@ -5,62 +5,53 @@ // // This file is part of CEED: http://github.com/ceed +#include "../qfunctions/small-strain-neo-hookean.h" + #include -#include "../include/structs.h" + #include "../include/setup-libceed.h" -#include 
"../problems/problems.h" +#include "../include/structs.h" #include "../problems/neo-hookean.h" +#include "../problems/problems.h" #include "../qfunctions/common.h" -#include "../qfunctions/small-strain-neo-hookean.h" static const char *const field_names[] = {"gradu"}; -static CeedInt field_sizes[] = {9}; +static CeedInt field_sizes[] = {9}; ProblemData small_strain_neo_Hookean = { - .setup_geo = SetupGeo, - .setup_geo_loc = SetupGeo_loc, - .q_data_size = 10, - .quadrature_mode = CEED_GAUSS, - .residual = ElasSSNHF, - .residual_loc = ElasSSNHF_loc, - .number_fields_stored = 1, - .field_names = field_names, - .field_sizes = field_sizes, - .jacobian = ElasSSNHdF, - .jacobian_loc = ElasSSNHdF_loc, - .energy = ElasSSNHEnergy, - .energy_loc = ElasSSNHEnergy_loc, - .diagnostic = ElasSSNHDiagnostic, - .diagnostic_loc = ElasSSNHDiagnostic_loc, + .setup_geo = SetupGeo, + .setup_geo_loc = SetupGeo_loc, + .q_data_size = 10, + .quadrature_mode = CEED_GAUSS, + .residual = ElasSSNHF, + .residual_loc = ElasSSNHF_loc, + .number_fields_stored = 1, + .field_names = field_names, + .field_sizes = field_sizes, + .jacobian = ElasSSNHdF, + .jacobian_loc = ElasSSNHdF_loc, + .energy = ElasSSNHEnergy, + .energy_loc = ElasSSNHEnergy_loc, + .diagnostic = ElasSSNHDiagnostic, + .diagnostic_loc = ElasSSNHDiagnostic_loc, }; -PetscErrorCode SetupLibceedFineLevel_ElasSSNH(DM dm, DM dm_energy, - DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx, - PetscInt fine_level, PetscInt num_comp_u, PetscInt U_g_size, - PetscInt U_loc_size, CeedVector force_ceed, CeedVector neumann_ceed, - CeedData *data) { - PetscErrorCode ierr; - +PetscErrorCode SetupLibceedFineLevel_ElasSSNH(DM dm, DM dm_energy, DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx, + PetscInt fine_level, PetscInt num_comp_u, PetscInt U_g_size, PetscInt U_loc_size, CeedVector force_ceed, + CeedVector neumann_ceed, CeedData *data) { PetscFunctionBegin; - ierr = SetupLibceedFineLevel(dm, dm_energy, 
dm_diagnostic, ceed, app_ctx, - phys_ctx, small_strain_neo_Hookean, - fine_level, num_comp_u, U_g_size, U_loc_size, - force_ceed, neumann_ceed, data); CHKERRQ(ierr); + PetscCall(SetupLibceedFineLevel(dm, dm_energy, dm_diagnostic, ceed, app_ctx, phys_ctx, small_strain_neo_Hookean, fine_level, num_comp_u, U_g_size, + U_loc_size, force_ceed, neumann_ceed, data)); PetscFunctionReturn(0); }; -PetscErrorCode SetupLibceedLevel_ElasSSNH(DM dm, Ceed ceed, AppCtx app_ctx, - PetscInt level, PetscInt num_comp_u, PetscInt U_g_size, PetscInt U_loc_size, - CeedVector fine_mult, CeedData *data) { - PetscErrorCode ierr; - +PetscErrorCode SetupLibceedLevel_ElasSSNH(DM dm, Ceed ceed, AppCtx app_ctx, PetscInt level, PetscInt num_comp_u, PetscInt U_g_size, + PetscInt U_loc_size, CeedVector fine_mult, CeedData *data) { PetscFunctionBegin; - ierr = SetupLibceedLevel(dm, ceed, app_ctx, small_strain_neo_Hookean, - level, num_comp_u, U_g_size, U_loc_size, fine_mult, data); - CHKERRQ(ierr); + PetscCall(SetupLibceedLevel(dm, ceed, app_ctx, small_strain_neo_Hookean, level, num_comp_u, U_g_size, U_loc_size, fine_mult, data)); PetscFunctionReturn(0); }; diff --git a/examples/solids/qfunctions/common.h b/examples/solids/qfunctions/common.h index b49a8ea256..f589b920fa 100644 --- a/examples/solids/qfunctions/common.h +++ b/examples/solids/qfunctions/common.h @@ -37,40 +37,38 @@ // [A31 A32 A33] // // ----------------------------------------------------------------------------- -CEED_QFUNCTION(SetupGeo)(void *ctx, CeedInt Q, const CeedScalar *const *in, - CeedScalar *const *out) { - // *INDENT-OFF* - // Inputs - const CeedScalar (*J)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], - (*w) = in[1]; +CEED_QFUNCTION(SetupGeo)(void *ctx, CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + // *INDENT-OFF* + // Inputs + const CeedScalar(*J)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*w) = in[1]; - // Outputs - CeedScalar (*q_data)[CEED_Q_VLA] = 
(CeedScalar(*)[CEED_Q_VLA])out[0]; - // *INDENT-ON* + // Outputs + CeedScalar(*q_data)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; + // *INDENT-ON* CeedPragmaSIMD - // Quadrature Point Loop - for (CeedInt i=0; imu_1; - const CeedScalar mu_2 = context->mu_2; - const CeedScalar lambda = context->lambda; + const CeedScalar mu_1 = context->mu_1; + const CeedScalar mu_2 = context->mu_2; + const CeedScalar lambda = context->lambda; // Formulation Terminology: // I3 : 3x3 Identity matrix @@ -193,98 +186,77 @@ CEED_QFUNCTION(ElasFSInitialMR1F)(void *ctx, CeedInt Q, // P : 1st Piola-Kirchhoff // Quadrature Point Loop - CeedPragmaSIMD - for (CeedInt i=0; imu_1; - const CeedScalar mu_2 = context->mu_2; - const CeedScalar lambda = context->lambda; + const CeedScalar mu_1 = context->mu_1; + const CeedScalar mu_2 = context->mu_2; + const CeedScalar lambda = context->lambda; // Quadrature Point Loop - CeedPragmaSIMD - for (CeedInt i=0; imu_1; - const CeedScalar mu_2 = context->mu_2; - const CeedScalar lambda = context->lambda; + const CeedScalar mu_1 = context->mu_1; + const CeedScalar mu_2 = context->mu_2; + const CeedScalar lambda = context->lambda; // Quadrature Point Loop - CeedPragmaSIMD - for (CeedInt i=0; imu_1; - const CeedScalar mu_2 = context->mu_2; - const CeedScalar lambda = context->lambda; + const CeedScalar mu_1 = context->mu_1; + const CeedScalar mu_2 = context->mu_2; + const CeedScalar lambda = context->lambda; // Quadrature Point Loop - CeedPragmaSIMD - for (CeedInt i=0; iE; - const CeedScalar nu = context->nu; - const CeedScalar TwoMu = E / (1 + nu); - const CeedScalar mu = TwoMu / 2; - const CeedScalar Kbulk = E / (3*(1 - 2*nu)); // Bulk Modulus - const CeedScalar lambda = (3*Kbulk - TwoMu) / 3; + const Physics context = (Physics)ctx; + const CeedScalar E = context->E; + const CeedScalar nu = context->nu; + const CeedScalar TwoMu = E / (1 + nu); + const CeedScalar mu = TwoMu / 2; + const CeedScalar Kbulk = E / (3 * (1 - 2 * nu)); // Bulk Modulus + const 
CeedScalar lambda = (3 * Kbulk - TwoMu) / 3; // Formulation Terminology: // I3 : 3x3 Identity matrix @@ -201,58 +185,42 @@ CEED_QFUNCTION(ElasFSCurrentNH1F)(void *ctx, CeedInt Q, // tau = mu*(b - I3) + lambda*log(J)*I3; // Quadrature Point Loop - CeedPragmaSIMD - for (CeedInt i=0; iE; - const CeedScalar nu = context->nu; + const Physics context = (Physics)ctx; + const CeedScalar E = context->E; + const CeedScalar nu = context->nu; // Constants - const CeedScalar TwoMu = E / (1 + nu); - const CeedScalar mu = TwoMu / 2; - const CeedScalar Kbulk = E / (3*(1 - 2*nu)); // Bulk Modulus - const CeedScalar lambda = (3*Kbulk - TwoMu) / 3; + const CeedScalar TwoMu = E / (1 + nu); + const CeedScalar mu = TwoMu / 2; + const CeedScalar Kbulk = E / (3 * (1 - 2 * nu)); // Bulk Modulus + const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3; // Quadrature Point Loop - CeedPragmaSIMD - for (CeedInt i=0; iE; - const CeedScalar nu = context->nu; - const CeedScalar TwoMu = E / (1 + nu); - const CeedScalar mu = TwoMu / 2; - const CeedScalar Kbulk = E / (3*(1 - 2*nu)); // Bulk Modulus - const CeedScalar lambda = (3*Kbulk - TwoMu) / 3; + const Physics context = (Physics)ctx; + const CeedScalar E = context->E; + const CeedScalar nu = context->nu; + const CeedScalar TwoMu = E / (1 + nu); + const CeedScalar mu = TwoMu / 2; + const CeedScalar Kbulk = E / (3 * (1 - 2 * nu)); // Bulk Modulus + const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3; // Quadrature Point Loop - CeedPragmaSIMD - for (CeedInt i=0; iE; - const CeedScalar nu = context->nu; - const CeedScalar TwoMu = E / (1 + nu); - const CeedScalar mu = TwoMu / 2; - const CeedScalar Kbulk = E / (3*(1 - 2*nu)); // Bulk Modulus - const CeedScalar lambda = (3*Kbulk - TwoMu) / 3; + const Physics context = (Physics)ctx; + const CeedScalar E = context->E; + const CeedScalar nu = context->nu; + const CeedScalar TwoMu = E / (1 + nu); + const CeedScalar mu = TwoMu / 2; + const CeedScalar Kbulk = E / (3 * (1 - 2 * nu)); // Bulk Modulus + const 
CeedScalar lambda = (3 * Kbulk - TwoMu) / 3; // Quadrature Point Loop - CeedPragmaSIMD - for (CeedInt i=0; iE; - const CeedScalar nu = context->nu; - const CeedScalar TwoMu = E / (1 + nu); - const CeedScalar mu = TwoMu / 2; - const CeedScalar Kbulk = E / (3*(1 - 2*nu)); // Bulk Modulus - const CeedScalar lambda = (3*Kbulk - TwoMu) / 3; + const Physics context = (Physics)ctx; + const CeedScalar E = context->E; + const CeedScalar nu = context->nu; + const CeedScalar TwoMu = E / (1 + nu); + const CeedScalar mu = TwoMu / 2; + const CeedScalar Kbulk = E / (3 * (1 - 2 * nu)); // Bulk Modulus + const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3; // Formulation Terminology: // I3 : 3x3 Identity matrix @@ -142,88 +136,71 @@ CEED_QFUNCTION(ElasFSCurrentNH2F)(void *ctx, CeedInt Q, // tau = mu*b - (mu - lambda*log(J))*I3; // Quadrature Point Loop - CeedPragmaSIMD - for (CeedInt i=0; iE; - const CeedScalar nu = context->nu; + const Physics context = (Physics)ctx; + const CeedScalar E = context->E; + const CeedScalar nu = context->nu; // Constants - const CeedScalar TwoMu = E / (1 + nu); - const CeedScalar mu = TwoMu / 2; - const CeedScalar Kbulk = E / (3*(1 - 2*nu)); // Bulk Modulus - const CeedScalar lambda = (3*Kbulk - TwoMu) / 3; + const CeedScalar TwoMu = E / (1 + nu); + const CeedScalar mu = TwoMu / 2; + const CeedScalar Kbulk = E / (3 * (1 - 2 * nu)); // Bulk Modulus + const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3; // Quadrature Point Loop - CeedPragmaSIMD - for (CeedInt i=0; iE; - const CeedScalar nu = context->nu; - const CeedScalar TwoMu = E / (1 + nu); - const CeedScalar mu = TwoMu / 2; - const CeedScalar Kbulk = E / (3*(1 - 2*nu)); // Bulk Modulus - const CeedScalar lambda = (3*Kbulk - TwoMu) / 3; + const Physics context = (Physics)ctx; + const CeedScalar E = context->E; + const CeedScalar nu = context->nu; + const CeedScalar TwoMu = E / (1 + nu); + const CeedScalar mu = TwoMu / 2; + const CeedScalar Kbulk = E / (3 * (1 - 2 * nu)); // Bulk Modulus + const 
CeedScalar lambda = (3 * Kbulk - TwoMu) / 3; // Quadrature Point Loop - CeedPragmaSIMD - for (CeedInt i=0; iE; - const CeedScalar nu = context->nu; - const CeedScalar TwoMu = E / (1 + nu); - const CeedScalar mu = TwoMu / 2; - const CeedScalar Kbulk = E / (3*(1 - 2*nu)); // Bulk Modulus - const CeedScalar lambda = (3*Kbulk - TwoMu) / 3; + const Physics context = (Physics)ctx; + const CeedScalar E = context->E; + const CeedScalar nu = context->nu; + const CeedScalar TwoMu = E / (1 + nu); + const CeedScalar mu = TwoMu / 2; + const CeedScalar Kbulk = E / (3 * (1 - 2 * nu)); // Bulk Modulus + const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3; // Quadrature Point Loop - CeedPragmaSIMD - for (CeedInt i=0; iE; - const CeedScalar nu = context->nu; - const CeedScalar TwoMu = E / (1 + nu); - const CeedScalar mu = TwoMu / 2; - const CeedScalar Kbulk = E / (3*(1 - 2*nu)); // Bulk Modulus - const CeedScalar lambda = (3*Kbulk - TwoMu) / 3; + const Physics context = (Physics)ctx; + const CeedScalar E = context->E; + const CeedScalar nu = context->nu; + const CeedScalar TwoMu = E / (1 + nu); + const CeedScalar mu = TwoMu / 2; + const CeedScalar Kbulk = E / (3 * (1 - 2 * nu)); // Bulk Modulus + const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3; // Formulation Terminology: // I3 : 3x3 Identity matrix @@ -193,101 +187,80 @@ CEED_QFUNCTION(ElasFSInitialNH1F)(void *ctx, CeedInt Q, // P = F*S // Quadrature Point Loop - CeedPragmaSIMD - for (CeedInt i=0; iE; - const CeedScalar nu = context->nu; + const Physics context = (Physics)ctx; + const CeedScalar E = context->E; + const CeedScalar nu = context->nu; // Constants - const CeedScalar TwoMu = E / (1 + nu); - const CeedScalar mu = TwoMu / 2; - const CeedScalar Kbulk = E / (3*(1 - 2*nu)); // Bulk Modulus - const CeedScalar lambda = (3*Kbulk - TwoMu) / 3; + const CeedScalar TwoMu = E / (1 + nu); + const CeedScalar mu = TwoMu / 2; + const CeedScalar Kbulk = E / (3 * (1 - 2 * nu)); // Bulk Modulus + const CeedScalar lambda = (3 * Kbulk - 
TwoMu) / 3; // Quadrature Point Loop - CeedPragmaSIMD - for (CeedInt i=0; iE; - const CeedScalar nu = context->nu; - const CeedScalar TwoMu = E / (1 + nu); - const CeedScalar mu = TwoMu / 2; - const CeedScalar Kbulk = E / (3*(1 - 2*nu)); // Bulk Modulus - const CeedScalar lambda = (3*Kbulk - TwoMu) / 3; + const Physics context = (Physics)ctx; + const CeedScalar E = context->E; + const CeedScalar nu = context->nu; + const CeedScalar TwoMu = E / (1 + nu); + const CeedScalar mu = TwoMu / 2; + const CeedScalar Kbulk = E / (3 * (1 - 2 * nu)); // Bulk Modulus + const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3; // Quadrature Point Loop - CeedPragmaSIMD - for (CeedInt i=0; iE; - const CeedScalar nu = context->nu; - const CeedScalar TwoMu = E / (1 + nu); - const CeedScalar mu = TwoMu / 2; - const CeedScalar Kbulk = E / (3*(1 - 2*nu)); // Bulk Modulus - const CeedScalar lambda = (3*Kbulk - TwoMu) / 3; + const Physics context = (Physics)ctx; + const CeedScalar E = context->E; + const CeedScalar nu = context->nu; + const CeedScalar TwoMu = E / (1 + nu); + const CeedScalar mu = TwoMu / 2; + const CeedScalar Kbulk = E / (3 * (1 - 2 * nu)); // Bulk Modulus + const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3; // Quadrature Point Loop - CeedPragmaSIMD - for (CeedInt i=0; iE; - const CeedScalar nu = context->nu; - const CeedScalar TwoMu = E / (1 + nu); - const CeedScalar mu = TwoMu / 2; - const CeedScalar Kbulk = E / (3*(1 - 2*nu)); // Bulk Modulus - const CeedScalar lambda = (3*Kbulk - TwoMu) / 3; + const Physics context = (Physics)ctx; + const CeedScalar E = context->E; + const CeedScalar nu = context->nu; + const CeedScalar TwoMu = E / (1 + nu); + const CeedScalar mu = TwoMu / 2; + const CeedScalar Kbulk = E / (3 * (1 - 2 * nu)); // Bulk Modulus + const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3; // Formulation Terminology: // I3 : 3x3 Identity matrix @@ -142,100 +136,80 @@ CEED_QFUNCTION(ElasFSInitialNH2F)(void *ctx, CeedInt Q, // P = F*S // Quadrature Point Loop - 
CeedPragmaSIMD - for (CeedInt i=0; iE; - const CeedScalar nu = context->nu; + const Physics context = (Physics)ctx; + const CeedScalar E = context->E; + const CeedScalar nu = context->nu; // Constants - const CeedScalar TwoMu = E / (1 + nu); - const CeedScalar mu = TwoMu / 2; - const CeedScalar Kbulk = E / (3*(1 - 2*nu)); // Bulk Modulus - const CeedScalar lambda = (3*Kbulk - TwoMu) / 3; + const CeedScalar TwoMu = E / (1 + nu); + const CeedScalar mu = TwoMu / 2; + const CeedScalar Kbulk = E / (3 * (1 - 2 * nu)); // Bulk Modulus + const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3; // Quadrature Point Loop - CeedPragmaSIMD - for (CeedInt i=0; iE; - const CeedScalar nu = context->nu; - const CeedScalar TwoMu = E / (1 + nu); - const CeedScalar mu = TwoMu / 2; - const CeedScalar Kbulk = E / (3*(1 - 2*nu)); // Bulk Modulus - const CeedScalar lambda = (3*Kbulk - TwoMu) / 3; + const Physics context = (Physics)ctx; + const CeedScalar E = context->E; + const CeedScalar nu = context->nu; + const CeedScalar TwoMu = E / (1 + nu); + const CeedScalar mu = TwoMu / 2; + const CeedScalar Kbulk = E / (3 * (1 - 2 * nu)); // Bulk Modulus + const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3; // Quadrature Point Loop - CeedPragmaSIMD - for (CeedInt i=0; iE; - const CeedScalar nu = context->nu; - const CeedScalar TwoMu = E / (1 + nu); - const CeedScalar mu = TwoMu / 2; - const CeedScalar Kbulk = E / (3*(1 - 2*nu)); // Bulk Modulus - const CeedScalar lambda = (3*Kbulk - TwoMu) / 3; + const Physics context = (Physics)ctx; + const CeedScalar E = context->E; + const CeedScalar nu = context->nu; + const CeedScalar TwoMu = E / (1 + nu); + const CeedScalar mu = TwoMu / 2; + const CeedScalar Kbulk = E / (3 * (1 - 2 * nu)); // Bulk Modulus + const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3; // Quadrature Point Loop - CeedPragmaSIMD - for (CeedInt i=0; iE; - const CeedScalar nu = context->nu; + const Physics context = (Physics)ctx; + const CeedScalar E = context->E; + const CeedScalar nu = 
context->nu; // Quadrature Point Loop - CeedPragmaSIMD - for (CeedInt i=0; iE; - const CeedScalar nu = context->nu; + const Physics context = (Physics)ctx; + const CeedScalar E = context->E; + const CeedScalar nu = context->nu; // Quadrature Point Loop - CeedPragmaSIMD - for (CeedInt i=0; iE; - const CeedScalar nu = context->nu; + const Physics context = (Physics)ctx; + const CeedScalar E = context->E; + const CeedScalar nu = context->nu; // Constants - const CeedScalar TwoMu = E / (1 + nu); - const CeedScalar mu = TwoMu / 2; - const CeedScalar Kbulk = E / (3*(1 - 2*nu)); // Bulk Modulus - const CeedScalar lambda = (3*Kbulk - TwoMu) / 3; + const CeedScalar TwoMu = E / (1 + nu); + const CeedScalar mu = TwoMu / 2; + const CeedScalar Kbulk = E / (3 * (1 - 2 * nu)); // Bulk Modulus + const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3; // Quadrature Point Loop - CeedPragmaSIMD - for (CeedInt i=0; iE; - const CeedScalar nu = context->nu; + const Physics context = (Physics)ctx; + const CeedScalar E = context->E; + const CeedScalar nu = context->nu; // Constants - const CeedScalar TwoMu = E / (1 + nu); - const CeedScalar mu = TwoMu / 2; - const CeedScalar Kbulk = E / (3*(1 - 2*nu)); // Bulk Modulus - const CeedScalar lambda = (3*Kbulk - TwoMu) / 3; + const CeedScalar TwoMu = E / (1 + nu); + const CeedScalar mu = TwoMu / 2; + const CeedScalar Kbulk = E / (3 * (1 - 2 * nu)); // Bulk Modulus + const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3; // Quadrature Point Loop - CeedPragmaSIMD - for (CeedInt i=0; iE; - const CeedScalar nu = context->nu; + const Physics context = (Physics)ctx; + const CeedScalar E = context->E; + const CeedScalar nu = context->nu; // Quadrature Point Loop - CeedPragmaSIMD - for (CeedInt i=0; iE; - const CeedScalar nu = context->nu; + const Physics context = (Physics)ctx; + const CeedScalar E = context->E; + const CeedScalar nu = context->nu; // Constants - const CeedScalar TwoMu = E / (1 + nu); - const CeedScalar Kbulk = E / (3*(1 - 2*nu)); // Bulk 
modulus - const CeedScalar lambda = (3*Kbulk - TwoMu) / 3; + const CeedScalar TwoMu = E / (1 + nu); + const CeedScalar Kbulk = E / (3 * (1 - 2 * nu)); // Bulk modulus + const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3; // Quadrature Point Loop - CeedPragmaSIMD - for (CeedInt i=0; iE; - const CeedScalar nu = context->nu; + const Physics context = (Physics)ctx; + const CeedScalar E = context->E; + const CeedScalar nu = context->nu; // Constants - const CeedScalar TwoMu = E / (1 + nu); - const CeedScalar Kbulk = E / (3*(1 - 2*nu)); // Bulk modulus - const CeedScalar lambda = (3*Kbulk - TwoMu) / 3; + const CeedScalar TwoMu = E / (1 + nu); + const CeedScalar Kbulk = E / (3 * (1 - 2 * nu)); // Bulk modulus + const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3; // Quadrature Point Loop - CeedPragmaSIMD - for (CeedInt i=0; iE; - const CeedScalar nu = context->nu; + const Physics context = (Physics)ctx; + const CeedScalar E = context->E; + const CeedScalar nu = context->nu; // Constants - const CeedScalar TwoMu = E / (1 + nu); - const CeedScalar mu = TwoMu / 2; - const CeedScalar Kbulk = E / (3*(1 - 2*nu)); // Bulk Modulus - const CeedScalar lambda = (3*Kbulk - TwoMu) / 3; + const CeedScalar TwoMu = E / (1 + nu); + const CeedScalar mu = TwoMu / 2; + const CeedScalar Kbulk = E / (3 * (1 - 2 * nu)); // Bulk Modulus + const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3; // Quadrature Point Loop - CeedPragmaSIMD - for (CeedInt i=0; iE; - const CeedScalar nu = context->nu; + const Physics context = (Physics)ctx; + const CeedScalar E = context->E; + const CeedScalar nu = context->nu; // Constants - const CeedScalar TwoMu = E / (1 + nu); - const CeedScalar mu = TwoMu / 2; - const CeedScalar Kbulk = E / (3*(1 - 2*nu)); // Bulk Modulus - const CeedScalar lambda = (3*Kbulk - TwoMu) / 3; + const CeedScalar TwoMu = E / (1 + nu); + const CeedScalar mu = TwoMu / 2; + const CeedScalar Kbulk = E / (3 * (1 - 2 * nu)); // Bulk Modulus + const CeedScalar lambda = (3 * Kbulk - TwoMu) / 3; // 
Quadrature Point Loop - CeedPragmaSIMD - for (CeedInt i=0; iceed_resource, app_ctx->ceed_resource, - sizeof(app_ctx->ceed_resource), &ceed_flag); - CHKERRQ(ierr); - - ierr = PetscStrncpy(app_ctx->output_dir, ".", 2); - CHKERRQ(ierr); // Default - current directory - ierr = PetscOptionsString("-output_dir", "Output directory", - NULL, app_ctx->output_dir, app_ctx->output_dir, - sizeof(app_ctx->output_dir), NULL); CHKERRQ(ierr); - - app_ctx->degree = 3; - ierr = PetscOptionsInt("-degree", "Polynomial degree of tensor product basis", - NULL, app_ctx->degree, &app_ctx->degree, NULL); - CHKERRQ(ierr); - - app_ctx->q_extra = 0; - ierr = PetscOptionsInt("-q_extra", "Number of extra quadrature points", - NULL, app_ctx->q_extra, &app_ctx->q_extra, NULL); - CHKERRQ(ierr); - - ierr = PetscOptionsString("-mesh", "Read mesh from file", NULL, - app_ctx->mesh_file, app_ctx->mesh_file, - sizeof(app_ctx->mesh_file), NULL); CHKERRQ(ierr); - - app_ctx->problem_choice = ELAS_LINEAR; // Default - Linear Elasticity - ierr = PetscOptionsEnum("-problem", - "Solves Elasticity & Hyperelasticity Problems", - NULL, problemTypes, (PetscEnum)app_ctx->problem_choice, - (PetscEnum *)&app_ctx->problem_choice, NULL); - CHKERRQ(ierr); - app_ctx->name = problemTypes[app_ctx->problem_choice]; + PetscOptionsBegin(comm, NULL, "Elasticity / Hyperelasticity in PETSc with libCEED", NULL); + + PetscCall(PetscOptionsString("-ceed", "CEED resource specifier", NULL, app_ctx->ceed_resource, app_ctx->ceed_resource, + sizeof(app_ctx->ceed_resource), &ceed_flag)); + + PetscCall(PetscStrncpy(app_ctx->output_dir, ".", 2)); // Default - current directory + PetscCall(PetscOptionsString("-output_dir", "Output directory", NULL, app_ctx->output_dir, app_ctx->output_dir, sizeof(app_ctx->output_dir), NULL)); + + app_ctx->degree = 3; + PetscCall(PetscOptionsInt("-degree", "Polynomial degree of tensor product basis", NULL, app_ctx->degree, &app_ctx->degree, NULL)); + + app_ctx->q_extra = 0; + 
PetscCall(PetscOptionsInt("-q_extra", "Number of extra quadrature points", NULL, app_ctx->q_extra, &app_ctx->q_extra, NULL)); + + PetscCall(PetscOptionsString("-mesh", "Read mesh from file", NULL, app_ctx->mesh_file, app_ctx->mesh_file, sizeof(app_ctx->mesh_file), NULL)); + + app_ctx->problem_choice = ELAS_LINEAR; // Default - Linear Elasticity + PetscCall(PetscOptionsEnum("-problem", "Solves Elasticity & Hyperelasticity Problems", NULL, problemTypes, (PetscEnum)app_ctx->problem_choice, + (PetscEnum *)&app_ctx->problem_choice, NULL)); + app_ctx->name = problemTypes[app_ctx->problem_choice]; app_ctx->name_for_disp = problemTypesForDisp[app_ctx->problem_choice]; app_ctx->num_increments = app_ctx->problem_choice == ELAS_LINEAR ? 1 : 10; - ierr = PetscOptionsInt("-num_steps", "Number of pseudo-time steps", - NULL, app_ctx->num_increments, &app_ctx->num_increments, - NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsInt("-num_steps", "Number of pseudo-time steps", NULL, app_ctx->num_increments, &app_ctx->num_increments, NULL)); - app_ctx->forcing_choice = FORCE_NONE; // Default - no forcing term - ierr = PetscOptionsEnum("-forcing", "Set forcing function option", NULL, - forcing_types, (PetscEnum)app_ctx->forcing_choice, - (PetscEnum *)&app_ctx->forcing_choice, NULL); - CHKERRQ(ierr); + app_ctx->forcing_choice = FORCE_NONE; // Default - no forcing term + PetscCall(PetscOptionsEnum("-forcing", "Set forcing function option", NULL, forcing_types, (PetscEnum)app_ctx->forcing_choice, + (PetscEnum *)&app_ctx->forcing_choice, NULL)); - PetscInt max_n = 3; + PetscInt max_n = 3; app_ctx->forcing_vector[0] = 0; app_ctx->forcing_vector[1] = -1; app_ctx->forcing_vector[2] = 0; - ierr = PetscOptionsScalarArray("-forcing_vec", - "Direction to apply constant force", NULL, - app_ctx->forcing_vector, &max_n, NULL); - CHKERRQ(ierr); - - if ((app_ctx->problem_choice == ELAS_FSInitial_NH1 || - app_ctx->problem_choice == ELAS_FSInitial_NH2 || - app_ctx->problem_choice == ELAS_FSCurrent_NH1 || - 
app_ctx->problem_choice == ELAS_FSCurrent_NH2 || + PetscCall(PetscOptionsScalarArray("-forcing_vec", "Direction to apply constant force", NULL, app_ctx->forcing_vector, &max_n, NULL)); + + if ((app_ctx->problem_choice == ELAS_FSInitial_NH1 || app_ctx->problem_choice == ELAS_FSInitial_NH2 || + app_ctx->problem_choice == ELAS_FSCurrent_NH1 || app_ctx->problem_choice == ELAS_FSCurrent_NH2 || app_ctx->problem_choice == ELAS_FSInitial_MR1) && app_ctx->forcing_choice == FORCE_CONST) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, @@ -89,133 +64,88 @@ PetscErrorCode ProcessCommandLineOptions(MPI_Comm comm, AppCtx app_ctx) { // Dirichlet boundary conditions app_ctx->bc_clamp_count = 16; - ierr = PetscOptionsIntArray("-bc_clamp", - "Face IDs to apply incremental Dirichlet BC", - NULL, app_ctx->bc_clamp_faces, &app_ctx->bc_clamp_count, - NULL); CHKERRQ(ierr); + PetscCall( + PetscOptionsIntArray("-bc_clamp", "Face IDs to apply incremental Dirichlet BC", NULL, app_ctx->bc_clamp_faces, &app_ctx->bc_clamp_count, NULL)); // Set vector for each clamped BC for (PetscInt i = 0; i < app_ctx->bc_clamp_count; i++) { // Translation vector - char option_name[25]; - const size_t nclamp_params = sizeof(app_ctx->bc_clamp_max[0])/sizeof( - app_ctx->bc_clamp_max[0][0]); - for (PetscInt j = 0; j < nclamp_params; j++) - app_ctx->bc_clamp_max[i][j] = 0.; - - snprintf(option_name, sizeof option_name, - "-bc_clamp_%" PetscInt_FMT "_translate", - app_ctx->bc_clamp_faces[i]); + char option_name[25]; + const size_t nclamp_params = sizeof(app_ctx->bc_clamp_max[0]) / sizeof(app_ctx->bc_clamp_max[0][0]); + for (PetscInt j = 0; j < nclamp_params; j++) app_ctx->bc_clamp_max[i][j] = 0.; + + snprintf(option_name, sizeof option_name, "-bc_clamp_%" PetscInt_FMT "_translate", app_ctx->bc_clamp_faces[i]); max_n = 3; - ierr = PetscOptionsScalarArray(option_name, - "Vector to translate clamped end by", NULL, - app_ctx->bc_clamp_max[i], &max_n, NULL); - CHKERRQ(ierr); + PetscCall(PetscOptionsScalarArray(option_name, 
"Vector to translate clamped end by", NULL, app_ctx->bc_clamp_max[i], &max_n, NULL)); // Rotation vector max_n = 5; - snprintf(option_name, sizeof option_name, "-bc_clamp_%" PetscInt_FMT "_rotate", - app_ctx->bc_clamp_faces[i]); - ierr = PetscOptionsScalarArray(option_name, - "Vector with axis of rotation and rotation, in radians", - NULL, &app_ctx->bc_clamp_max[i][3], &max_n, NULL); - CHKERRQ(ierr); + snprintf(option_name, sizeof option_name, "-bc_clamp_%" PetscInt_FMT "_rotate", app_ctx->bc_clamp_faces[i]); + PetscCall(PetscOptionsScalarArray(option_name, "Vector with axis of rotation and rotation, in radians", NULL, &app_ctx->bc_clamp_max[i][3], + &max_n, NULL)); // Normalize - PetscScalar norm = sqrt(app_ctx->bc_clamp_max[i][3]*app_ctx->bc_clamp_max[i][3] - + app_ctx->bc_clamp_max[i][4]*app_ctx->bc_clamp_max[i][4] - + app_ctx->bc_clamp_max[i][5]*app_ctx->bc_clamp_max[i][5]); - if (fabs(norm) < 1e-16) - norm = 1; - for (PetscInt j = 0; j < 3; j++) - app_ctx->bc_clamp_max[i][3 + j] /= norm; + PetscScalar norm = sqrt(app_ctx->bc_clamp_max[i][3] * app_ctx->bc_clamp_max[i][3] + app_ctx->bc_clamp_max[i][4] * app_ctx->bc_clamp_max[i][4] + + app_ctx->bc_clamp_max[i][5] * app_ctx->bc_clamp_max[i][5]); + if (fabs(norm) < 1e-16) norm = 1; + for (PetscInt j = 0; j < 3; j++) app_ctx->bc_clamp_max[i][3 + j] /= norm; } // Neumann boundary conditions app_ctx->bc_traction_count = 16; - ierr = PetscOptionsIntArray("-bc_traction", - "Face IDs to apply traction (Neumann) BC", - NULL, app_ctx->bc_traction_faces, - &app_ctx->bc_traction_count, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsIntArray("-bc_traction", "Face IDs to apply traction (Neumann) BC", NULL, app_ctx->bc_traction_faces, + &app_ctx->bc_traction_count, NULL)); // Set vector for each traction BC for (PetscInt i = 0; i < app_ctx->bc_traction_count; i++) { // Translation vector char option_name[25]; - for (PetscInt j = 0; j < 3; j++) - app_ctx->bc_traction_vector[i][j] = 0.; + for (PetscInt j = 0; j < 3; j++) 
app_ctx->bc_traction_vector[i][j] = 0.; - snprintf(option_name, sizeof option_name, "-bc_traction_%" PetscInt_FMT, - app_ctx->bc_traction_faces[i]); - max_n = 3; + snprintf(option_name, sizeof option_name, "-bc_traction_%" PetscInt_FMT, app_ctx->bc_traction_faces[i]); + max_n = 3; PetscBool set = false; - ierr = PetscOptionsScalarArray(option_name, - "Traction vector for constrained face", NULL, - app_ctx->bc_traction_vector[i], &max_n, &set); - CHKERRQ(ierr); - - if (!set) - SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, - "Traction vector must be set for all traction boundary conditions."); + PetscCall(PetscOptionsScalarArray(option_name, "Traction vector for constrained face", NULL, app_ctx->bc_traction_vector[i], &max_n, &set)); + + if (!set) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Traction vector must be set for all traction boundary conditions."); } app_ctx->multigrid_choice = MULTIGRID_LOGARITHMIC; - ierr = PetscOptionsEnum("-multigrid", "Set multigrid type option", NULL, - multigrid_types, (PetscEnum)app_ctx->multigrid_choice, - (PetscEnum *)&app_ctx->multigrid_choice, NULL); - CHKERRQ(ierr); + PetscCall(PetscOptionsEnum("-multigrid", "Set multigrid type option", NULL, multigrid_types, (PetscEnum)app_ctx->multigrid_choice, + (PetscEnum *)&app_ctx->multigrid_choice, NULL)); app_ctx->test_mode = PETSC_FALSE; - ierr = PetscOptionsBool("-test", - "Testing mode (do not print unless error is large)", - NULL, app_ctx->test_mode, &(app_ctx->test_mode), NULL); - CHKERRQ(ierr); + PetscCall(PetscOptionsBool("-test", "Testing mode (do not print unless error is large)", NULL, app_ctx->test_mode, &(app_ctx->test_mode), NULL)); app_ctx->expect_final_strain = -1.; - ierr = PetscOptionsReal("-expect_final_strain_energy", - "Expect final strain energy close to this value.", - NULL, app_ctx->expect_final_strain, &app_ctx->expect_final_strain, NULL); - CHKERRQ(ierr); + PetscCall(PetscOptionsReal("-expect_final_strain_energy", "Expect final strain energy close to this value.", NULL, 
app_ctx->expect_final_strain, + &app_ctx->expect_final_strain, NULL)); app_ctx->test_tol = 1e-8; - ierr = PetscOptionsReal("-expect_final_state_rtol", - "Relative tolerance for final strain energy test", - NULL, app_ctx->test_tol, &app_ctx->test_tol, NULL); - CHKERRQ(ierr); + PetscCall(PetscOptionsReal("-expect_final_state_rtol", "Relative tolerance for final strain energy test", NULL, app_ctx->test_tol, + &app_ctx->test_tol, NULL)); app_ctx->view_soln = PETSC_FALSE; - ierr = PetscOptionsBool("-view_soln", "Write out solution vector for viewing", - NULL, app_ctx->view_soln, &(app_ctx->view_soln), NULL); - CHKERRQ(ierr); + PetscCall(PetscOptionsBool("-view_soln", "Write out solution vector for viewing", NULL, app_ctx->view_soln, &(app_ctx->view_soln), NULL)); app_ctx->view_final_soln = PETSC_FALSE; - ierr = PetscOptionsBool("-view_final_soln", - "Write out final solution vector for viewing", - NULL, app_ctx->view_final_soln, &(app_ctx->view_final_soln), - NULL); CHKERRQ(ierr); - CHKERRQ(ierr); + PetscCall(PetscOptionsBool("-view_final_soln", "Write out final solution vector for viewing", NULL, app_ctx->view_final_soln, + &(app_ctx->view_final_soln), NULL)); PetscBool set; - char energy_viewer_filename[PETSC_MAX_PATH_LEN] = ""; - ierr = PetscOptionsString("-strain_energy_monitor", - "Print out current strain energy at every load increment", - NULL, energy_viewer_filename, - energy_viewer_filename, sizeof(energy_viewer_filename), - &set); CHKERRQ(ierr); + char energy_viewer_filename[PETSC_MAX_PATH_LEN] = ""; + PetscCall(PetscOptionsString("-strain_energy_monitor", "Print out current strain energy at every load increment", NULL, energy_viewer_filename, + energy_viewer_filename, sizeof(energy_viewer_filename), &set)); if (set) { - ierr = PetscViewerASCIIOpen(comm, energy_viewer_filename, - &app_ctx->energy_viewer); CHKERRQ(ierr); - ierr = PetscViewerASCIIPrintf(app_ctx->energy_viewer, "increment,energy\n"); - CHKERRQ(ierr); + PetscCall(PetscViewerASCIIOpen(comm, 
energy_viewer_filename, &app_ctx->energy_viewer)); + PetscCall(PetscViewerASCIIPrintf(app_ctx->energy_viewer, "increment,energy\n")); // Initial configuration is base energy state; this may not be true if we extend in the future to // initially loaded configurations (because a truly at-rest initial state may not be realizable). - ierr = PetscViewerASCIIPrintf(app_ctx->energy_viewer, "%f,%e\n", 0., 0.); - CHKERRQ(ierr); + PetscCall(PetscViewerASCIIPrintf(app_ctx->energy_viewer, "%f,%e\n", 0., 0.)); } - PetscOptionsEnd(); // End of setting AppCtx + PetscOptionsEnd(); // End of setting AppCtx // Check for all required values set if (app_ctx->test_mode) { - if (app_ctx->forcing_choice == FORCE_NONE && !app_ctx->bc_clamp_count) - app_ctx->forcing_choice = FORCE_MMS; + if (app_ctx->forcing_choice == FORCE_NONE && !app_ctx->bc_clamp_count) app_ctx->forcing_choice = FORCE_MMS; } if (!app_ctx->bc_clamp_count && app_ctx->forcing_choice != FORCE_MMS) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "-boundary options needed"); @@ -229,34 +159,31 @@ PetscErrorCode ProcessCommandLineOptions(MPI_Comm comm, AppCtx app_ctx) { // Determine number of levels switch (app_ctx->multigrid_choice) { - case MULTIGRID_LOGARITHMIC: - app_ctx->num_levels = ceil(log(app_ctx->degree)/log(2)) + 1; - break; - case MULTIGRID_UNIFORM: - app_ctx->num_levels = app_ctx->degree; - break; - case MULTIGRID_NONE: - app_ctx->num_levels = 1; - break; + case MULTIGRID_LOGARITHMIC: + app_ctx->num_levels = ceil(log(app_ctx->degree) / log(2)) + 1; + break; + case MULTIGRID_UNIFORM: + app_ctx->num_levels = app_ctx->degree; + break; + case MULTIGRID_NONE: + app_ctx->num_levels = 1; + break; } // Populate array of degrees for each level for multigrid - ierr = PetscMalloc1(app_ctx->num_levels, &(app_ctx->level_degrees)); - CHKERRQ(ierr); + PetscCall(PetscMalloc1(app_ctx->num_levels, &(app_ctx->level_degrees))); switch (app_ctx->multigrid_choice) { - case MULTIGRID_LOGARITHMIC: - for (int i = 0; i < app_ctx->num_levels-1; 
i++) - app_ctx->level_degrees[i] = pow(2,i); - app_ctx->level_degrees[app_ctx->num_levels-1] = app_ctx->degree; - break; - case MULTIGRID_UNIFORM: - for (int i = 0; i < app_ctx->num_levels; i++) - app_ctx->level_degrees[i] = i + 1; - break; - case MULTIGRID_NONE: - app_ctx->level_degrees[0] = app_ctx->degree; - break; + case MULTIGRID_LOGARITHMIC: + for (int i = 0; i < app_ctx->num_levels - 1; i++) app_ctx->level_degrees[i] = pow(2, i); + app_ctx->level_degrees[app_ctx->num_levels - 1] = app_ctx->degree; + break; + case MULTIGRID_UNIFORM: + for (int i = 0; i < app_ctx->num_levels; i++) app_ctx->level_degrees[i] = i + 1; + break; + case MULTIGRID_NONE: + app_ctx->level_degrees[0] = app_ctx->degree; + break; } PetscFunctionReturn(0); diff --git a/examples/solids/src/matops.c b/examples/solids/src/matops.c index f209d7f9d4..b61e7abba1 100644 --- a/examples/solids/src/matops.c +++ b/examples/solids/src/matops.c @@ -9,6 +9,7 @@ /// Matrix shell operations for solid mechanics example using PETSc #include "../include/matops.h" + #include "../include/structs.h" #include "../include/utils.h" @@ -17,20 +18,18 @@ // ----------------------------------------------------------------------------- // This function uses libCEED to compute the local action of an operator PetscErrorCode ApplyLocalCeedOp(Vec X, Vec Y, UserMult user) { - PetscErrorCode ierr; PetscScalar *x, *y; PetscMemType x_mem_type, y_mem_type; PetscFunctionBeginUser; // Global-to-local - ierr = DMGlobalToLocal(user->dm, X, INSERT_VALUES, user->X_loc); CHKERRQ(ierr); - ierr = VecZeroEntries(user->Y_loc); CHKERRQ(ierr); + PetscCall(DMGlobalToLocal(user->dm, X, INSERT_VALUES, user->X_loc)); + PetscCall(VecZeroEntries(user->Y_loc)); // Setup CEED vectors - ierr = VecGetArrayReadAndMemType(user->X_loc, (const PetscScalar **)&x, - &x_mem_type); CHKERRQ(ierr); - ierr = VecGetArrayAndMemType(user->Y_loc, &y, &y_mem_type); CHKERRQ(ierr); + PetscCall(VecGetArrayReadAndMemType(user->X_loc, (const PetscScalar **)&x, 
&x_mem_type)); + PetscCall(VecGetArrayAndMemType(user->Y_loc, &y, &y_mem_type)); CeedVectorSetArray(user->x_ceed, MemTypeP2C(x_mem_type), CEED_USE_POINTER, x); CeedVectorSetArray(user->y_ceed, MemTypeP2C(y_mem_type), CEED_USE_POINTER, y); @@ -40,36 +39,32 @@ PetscErrorCode ApplyLocalCeedOp(Vec X, Vec Y, UserMult user) { // Restore PETSc vectors CeedVectorTakeArray(user->x_ceed, MemTypeP2C(x_mem_type), NULL); CeedVectorTakeArray(user->y_ceed, MemTypeP2C(y_mem_type), NULL); - ierr = VecRestoreArrayReadAndMemType(user->X_loc, (const PetscScalar **)&x); - CHKERRQ(ierr); - ierr = VecRestoreArrayAndMemType(user->Y_loc, &y); CHKERRQ(ierr); + PetscCall(VecRestoreArrayReadAndMemType(user->X_loc, (const PetscScalar **)&x)); + PetscCall(VecRestoreArrayAndMemType(user->Y_loc, &y)); // Local-to-global - ierr = VecZeroEntries(Y); CHKERRQ(ierr); - ierr = DMLocalToGlobal(user->dm, user->Y_loc, ADD_VALUES, Y); CHKERRQ(ierr); + PetscCall(VecZeroEntries(Y)); + PetscCall(DMLocalToGlobal(user->dm, user->Y_loc, ADD_VALUES, Y)); PetscFunctionReturn(0); }; // This function uses libCEED to compute the non-linear residual PetscErrorCode FormResidual_Ceed(SNES snes, Vec X, Vec Y, void *ctx) { - PetscErrorCode ierr; UserMult user = (UserMult)ctx; PetscFunctionBeginUser; // Use computed BCs - ierr = VecZeroEntries(user->X_loc); CHKERRQ(ierr); - ierr = DMPlexInsertBoundaryValues(user->dm, PETSC_TRUE, user->X_loc, - user->load_increment, NULL, NULL, NULL); - CHKERRQ(ierr); + PetscCall(VecZeroEntries(user->X_loc)); + PetscCall(DMPlexInsertBoundaryValues(user->dm, PETSC_TRUE, user->X_loc, user->load_increment, NULL, NULL, NULL)); // libCEED for local action of residual evaluator - ierr = ApplyLocalCeedOp(X, Y, user); CHKERRQ(ierr); + PetscCall(ApplyLocalCeedOp(X, Y, user)); // Neumann BCs if (user->neumann_bcs) { - ierr = VecAXPY(Y, -user->load_increment, user->neumann_bcs); CHKERRQ(ierr); + PetscCall(VecAXPY(Y, -user->load_increment, user->neumann_bcs)); } PetscFunctionReturn(0); @@ -77,197 
+72,169 @@ PetscErrorCode FormResidual_Ceed(SNES snes, Vec X, Vec Y, void *ctx) { // This function uses libCEED to apply the Jacobian for assembly via a SNES PetscErrorCode ApplyJacobianCoarse_Ceed(SNES snes, Vec X, Vec Y, void *ctx) { - PetscErrorCode ierr; UserMult user = (UserMult)ctx; PetscFunctionBeginUser; // Zero boundary values - ierr = VecZeroEntries(user->X_loc); CHKERRQ(ierr); + PetscCall(VecZeroEntries(user->X_loc)); // libCEED for local action of residual evaluator - ierr = ApplyLocalCeedOp(X, Y, user); CHKERRQ(ierr); + PetscCall(ApplyLocalCeedOp(X, Y, user)); PetscFunctionReturn(0); }; // This function uses libCEED to compute the action of the Jacobian PetscErrorCode ApplyJacobian_Ceed(Mat A, Vec X, Vec Y) { - PetscErrorCode ierr; UserMult user; PetscFunctionBeginUser; // Zero boundary values - ierr = MatShellGetContext(A, &user); CHKERRQ(ierr); - ierr = VecZeroEntries(user->X_loc); CHKERRQ(ierr); + PetscCall(MatShellGetContext(A, &user)); + PetscCall(VecZeroEntries(user->X_loc)); // libCEED for local action of Jacobian - ierr = ApplyLocalCeedOp(X, Y, user); CHKERRQ(ierr); + PetscCall(ApplyLocalCeedOp(X, Y, user)); PetscFunctionReturn(0); }; // This function uses libCEED to compute the action of the prolongation operator PetscErrorCode Prolong_Ceed(Mat A, Vec X, Vec Y) { - PetscErrorCode ierr; UserMultProlongRestr user; - PetscScalar *c, *f; - PetscMemType c_mem_type, f_mem_type; + PetscScalar *c, *f; + PetscMemType c_mem_type, f_mem_type; PetscFunctionBeginUser; - ierr = MatShellGetContext(A, &user); CHKERRQ(ierr); + PetscCall(MatShellGetContext(A, &user)); // Global-to-local - ierr = VecZeroEntries(user->loc_vec_c); CHKERRQ(ierr); - ierr = DMGlobalToLocal(user->dm_c, X, INSERT_VALUES, user->loc_vec_c); - CHKERRQ(ierr); - ierr = VecZeroEntries(user->loc_vec_f); CHKERRQ(ierr); + PetscCall(VecZeroEntries(user->loc_vec_c)); + PetscCall(DMGlobalToLocal(user->dm_c, X, INSERT_VALUES, user->loc_vec_c)); + PetscCall(VecZeroEntries(user->loc_vec_f)); // Setup 
CEED vectors - ierr = VecGetArrayReadAndMemType(user->loc_vec_c, (const PetscScalar **)&c, - &c_mem_type); CHKERRQ(ierr); - ierr = VecGetArrayAndMemType(user->loc_vec_f, &f, &f_mem_type); CHKERRQ(ierr); - CeedVectorSetArray(user->ceed_vec_c, MemTypeP2C(c_mem_type), CEED_USE_POINTER, - c); - CeedVectorSetArray(user->ceed_vec_f, MemTypeP2C(f_mem_type), CEED_USE_POINTER, - f); + PetscCall(VecGetArrayReadAndMemType(user->loc_vec_c, (const PetscScalar **)&c, &c_mem_type)); + PetscCall(VecGetArrayAndMemType(user->loc_vec_f, &f, &f_mem_type)); + CeedVectorSetArray(user->ceed_vec_c, MemTypeP2C(c_mem_type), CEED_USE_POINTER, c); + CeedVectorSetArray(user->ceed_vec_f, MemTypeP2C(f_mem_type), CEED_USE_POINTER, f); // Apply CEED operator - CeedOperatorApply(user->op_prolong, user->ceed_vec_c, user->ceed_vec_f, - CEED_REQUEST_IMMEDIATE); + CeedOperatorApply(user->op_prolong, user->ceed_vec_c, user->ceed_vec_f, CEED_REQUEST_IMMEDIATE); // Restore PETSc vectors CeedVectorTakeArray(user->ceed_vec_c, MemTypeP2C(c_mem_type), NULL); CeedVectorTakeArray(user->ceed_vec_f, MemTypeP2C(f_mem_type), NULL); - ierr = VecRestoreArrayReadAndMemType(user->loc_vec_c, (const PetscScalar **)&c); - CHKERRQ(ierr); - ierr = VecRestoreArrayAndMemType(user->loc_vec_f, &f); CHKERRQ(ierr); + PetscCall(VecRestoreArrayReadAndMemType(user->loc_vec_c, (const PetscScalar **)&c)); + PetscCall(VecRestoreArrayAndMemType(user->loc_vec_f, &f)); // Local-to-global - ierr = VecZeroEntries(Y); CHKERRQ(ierr); - ierr = DMLocalToGlobal(user->dm_f, user->loc_vec_f, ADD_VALUES, Y); - CHKERRQ(ierr); + PetscCall(VecZeroEntries(Y)); + PetscCall(DMLocalToGlobal(user->dm_f, user->loc_vec_f, ADD_VALUES, Y)); PetscFunctionReturn(0); } // This function uses libCEED to compute the action of the restriction operator PetscErrorCode Restrict_Ceed(Mat A, Vec X, Vec Y) { - PetscErrorCode ierr; UserMultProlongRestr user; - PetscScalar *c, *f; - PetscMemType c_mem_type, f_mem_type; + PetscScalar *c, *f; + PetscMemType c_mem_type, 
f_mem_type; PetscFunctionBeginUser; - ierr = MatShellGetContext(A, &user); CHKERRQ(ierr); + PetscCall(MatShellGetContext(A, &user)); // Global-to-local - ierr = VecZeroEntries(user->loc_vec_f); CHKERRQ(ierr); - ierr = DMGlobalToLocal(user->dm_f, X, INSERT_VALUES, user->loc_vec_f); - CHKERRQ(ierr); - ierr = VecZeroEntries(user->loc_vec_c); CHKERRQ(ierr); + PetscCall(VecZeroEntries(user->loc_vec_f)); + PetscCall(DMGlobalToLocal(user->dm_f, X, INSERT_VALUES, user->loc_vec_f)); + PetscCall(VecZeroEntries(user->loc_vec_c)); // Setup CEED vectors - ierr = VecGetArrayReadAndMemType(user->loc_vec_f, (const PetscScalar **)&f, - &f_mem_type); CHKERRQ(ierr); - ierr = VecGetArrayAndMemType(user->loc_vec_c, &c, &c_mem_type); CHKERRQ(ierr); - CeedVectorSetArray(user->ceed_vec_f, MemTypeP2C(f_mem_type), CEED_USE_POINTER, - f); - CeedVectorSetArray(user->ceed_vec_c, MemTypeP2C(c_mem_type), CEED_USE_POINTER, - c); + PetscCall(VecGetArrayReadAndMemType(user->loc_vec_f, (const PetscScalar **)&f, &f_mem_type)); + PetscCall(VecGetArrayAndMemType(user->loc_vec_c, &c, &c_mem_type)); + CeedVectorSetArray(user->ceed_vec_f, MemTypeP2C(f_mem_type), CEED_USE_POINTER, f); + CeedVectorSetArray(user->ceed_vec_c, MemTypeP2C(c_mem_type), CEED_USE_POINTER, c); // Apply CEED operator - CeedOperatorApply(user->op_restrict, user->ceed_vec_f, user->ceed_vec_c, - CEED_REQUEST_IMMEDIATE); + CeedOperatorApply(user->op_restrict, user->ceed_vec_f, user->ceed_vec_c, CEED_REQUEST_IMMEDIATE); // Restore PETSc vectors CeedVectorTakeArray(user->ceed_vec_f, MemTypeP2C(f_mem_type), NULL); CeedVectorTakeArray(user->ceed_vec_c, MemTypeP2C(c_mem_type), NULL); - ierr = VecRestoreArrayReadAndMemType(user->loc_vec_f, (const PetscScalar **)&f); - CHKERRQ(ierr); - ierr = VecRestoreArrayAndMemType(user->loc_vec_c, &c); CHKERRQ(ierr); + PetscCall(VecRestoreArrayReadAndMemType(user->loc_vec_f, (const PetscScalar **)&f)); + PetscCall(VecRestoreArrayAndMemType(user->loc_vec_c, &c)); // Local-to-global - ierr = 
VecZeroEntries(Y); CHKERRQ(ierr); - ierr = DMLocalToGlobal(user->dm_c, user->loc_vec_c, ADD_VALUES, Y); - CHKERRQ(ierr); + PetscCall(VecZeroEntries(Y)); + PetscCall(DMLocalToGlobal(user->dm_c, user->loc_vec_c, ADD_VALUES, Y)); PetscFunctionReturn(0); }; // This function returns the computed diagonal of the operator PetscErrorCode GetDiag_Ceed(Mat A, Vec D) { - PetscErrorCode ierr; UserMult user; PetscFunctionBeginUser; - ierr = MatShellGetContext(A, &user); CHKERRQ(ierr); + PetscCall(MatShellGetContext(A, &user)); // -- Set physics context - if (user->ctx_phys_smoother) - CeedQFunctionSetContext(user->qf, user->ctx_phys_smoother); + if (user->ctx_phys_smoother) CeedQFunctionSetContext(user->qf, user->ctx_phys_smoother); // Compute Diagonal via libCEED PetscScalar *x; PetscMemType x_mem_type; // -- Place PETSc vector in libCEED vector - ierr = VecGetArrayAndMemType(user->X_loc, &x, &x_mem_type); CHKERRQ(ierr); + PetscCall(VecGetArrayAndMemType(user->X_loc, &x, &x_mem_type)); CeedVectorSetArray(user->x_ceed, MemTypeP2C(x_mem_type), CEED_USE_POINTER, x); // -- Compute Diagonal - CeedOperatorLinearAssembleDiagonal(user->op, user->x_ceed, - CEED_REQUEST_IMMEDIATE); + CeedOperatorLinearAssembleDiagonal(user->op, user->x_ceed, CEED_REQUEST_IMMEDIATE); // -- Reset physics context - if (user->ctx_phys_smoother) - CeedQFunctionSetContext(user->qf, user->ctx_phys); + if (user->ctx_phys_smoother) CeedQFunctionSetContext(user->qf, user->ctx_phys); // -- Local-to-Global CeedVectorTakeArray(user->x_ceed, MemTypeP2C(x_mem_type), NULL); - ierr = VecRestoreArrayAndMemType(user->X_loc, &x); CHKERRQ(ierr); - ierr = VecZeroEntries(D); CHKERRQ(ierr); - ierr = DMLocalToGlobal(user->dm, user->X_loc, ADD_VALUES, D); CHKERRQ(ierr); + PetscCall(VecRestoreArrayAndMemType(user->X_loc, &x)); + PetscCall(VecZeroEntries(D)); + PetscCall(DMLocalToGlobal(user->dm, user->X_loc, ADD_VALUES, D)); // Cleanup - ierr = VecZeroEntries(user->X_loc); CHKERRQ(ierr); + PetscCall(VecZeroEntries(user->X_loc)); 
PetscFunctionReturn(0); }; // This function calculates the strain energy in the final solution -PetscErrorCode ComputeStrainEnergy(DM dmEnergy, UserMult user, - CeedOperator op_energy, Vec X, - PetscReal *energy) { - PetscErrorCode ierr; +PetscErrorCode ComputeStrainEnergy(DM dmEnergy, UserMult user, CeedOperator op_energy, Vec X, PetscReal *energy) { PetscScalar *x; PetscMemType x_mem_type; - CeedInt length; + CeedInt length; PetscFunctionBeginUser; // Global-to-local - ierr = VecZeroEntries(user->X_loc); CHKERRQ(ierr); - ierr = DMGlobalToLocal(user->dm, X, INSERT_VALUES, user->X_loc); CHKERRQ(ierr); - ierr = DMPlexInsertBoundaryValues(user->dm, PETSC_TRUE, user->X_loc, - user->load_increment, NULL, NULL, NULL); - CHKERRQ(ierr); + PetscCall(VecZeroEntries(user->X_loc)); + PetscCall(DMGlobalToLocal(user->dm, X, INSERT_VALUES, user->X_loc)); + PetscCall(DMPlexInsertBoundaryValues(user->dm, PETSC_TRUE, user->X_loc, user->load_increment, NULL, NULL, NULL)); // Setup libCEED input vector - ierr = VecGetArrayReadAndMemType(user->X_loc, (const PetscScalar **)&x, - &x_mem_type); CHKERRQ(ierr); + PetscCall(VecGetArrayReadAndMemType(user->X_loc, (const PetscScalar **)&x, &x_mem_type)); CeedVectorSetArray(user->x_ceed, MemTypeP2C(x_mem_type), CEED_USE_POINTER, x); // Setup libCEED output vector - Vec E_loc; + Vec E_loc; CeedVector e_loc; - ierr = DMCreateLocalVector(dmEnergy, &E_loc); CHKERRQ(ierr); - ierr = VecGetSize(E_loc, &length); CHKERRQ(ierr); - ierr = VecDestroy(&E_loc); CHKERRQ(ierr); + PetscCall(DMCreateLocalVector(dmEnergy, &E_loc)); + PetscCall(VecGetSize(E_loc, &length)); + PetscCall(VecDestroy(&E_loc)); CeedVectorCreate(user->ceed, length, &e_loc); // Apply libCEED operator @@ -275,20 +242,17 @@ PetscErrorCode ComputeStrainEnergy(DM dmEnergy, UserMult user, // Restore PETSc vector CeedVectorTakeArray(user->x_ceed, MemTypeP2C(x_mem_type), NULL); - ierr = VecRestoreArrayRead(user->X_loc, (const PetscScalar **)&x); - CHKERRQ(ierr); + 
PetscCall(VecRestoreArrayRead(user->X_loc, (const PetscScalar **)&x)); // Reduce max error const CeedScalar *e; CeedVectorGetArrayRead(e_loc, CEED_MEM_HOST, &e); (*energy) = 0; - for (CeedInt i=0; i<length; i++) - (*energy) += e[i]; + for (CeedInt i = 0; i < length; i++) (*energy) += e[i]; CeedVectorRestoreArrayRead(e_loc, &e); CeedVectorDestroy(&e_loc); - ierr = MPI_Allreduce(MPI_IN_PLACE, energy, 1, MPIU_REAL, MPIU_SUM, - user->comm); CHKERRQ(ierr); + PetscCall(MPI_Allreduce(MPI_IN_PLACE, energy, 1, MPIU_REAL, MPIU_SUM, user->comm)); PetscFunctionReturn(0); }; diff --git a/examples/solids/src/misc.c b/examples/solids/src/misc.c index 8658d4c17e..c39dd5159f 100644 --- a/examples/solids/src/misc.c +++ b/examples/solids/src/misc.c @@ -9,28 +9,24 @@ /// Helper functions for solid mechanics example using PETSc #include "../include/misc.h" + #include "../include/utils.h" // ----------------------------------------------------------------------------- // Create libCEED operator context // ----------------------------------------------------------------------------- // Setup context data for Jacobian evaluation -PetscErrorCode SetupJacobianCtx(MPI_Comm comm, AppCtx app_ctx, DM dm, Vec V, - Vec V_loc, CeedData ceed_data, Ceed ceed, - CeedQFunctionContext ctx_phys, - CeedQFunctionContext ctx_phys_smoother, - UserMult jacobian_ctx) { - PetscErrorCode ierr; - +PetscErrorCode SetupJacobianCtx(MPI_Comm comm, AppCtx app_ctx, DM dm, Vec V, Vec V_loc, CeedData ceed_data, Ceed ceed, CeedQFunctionContext ctx_phys, + CeedQFunctionContext ctx_phys_smoother, UserMult jacobian_ctx) { PetscFunctionBeginUser; // PETSc objects jacobian_ctx->comm = comm; - jacobian_ctx->dm = dm; + jacobian_ctx->dm = dm; // Work vectors jacobian_ctx->X_loc = V_loc; - ierr = VecDuplicate(V_loc, &jacobian_ctx->Y_loc); CHKERRQ(ierr); + PetscCall(VecDuplicate(V_loc, &jacobian_ctx->Y_loc)); jacobian_ctx->x_ceed = ceed_data->x_ceed; jacobian_ctx->y_ceed = ceed_data->y_ceed; @@ -42,16 +38,14 @@ PetscErrorCode SetupJacobianCtx(MPI_Comm comm, AppCtx app_ctx, DM dm, Vec V, jacobian_ctx->ceed = ceed; // Physics - jacobian_ctx->ctx_phys = ctx_phys; + jacobian_ctx->ctx_phys = ctx_phys; jacobian_ctx->ctx_phys_smoother = ctx_phys_smoother; 
PetscFunctionReturn(0); }; // Setup context data for prolongation and restriction operators -PetscErrorCode SetupProlongRestrictCtx(MPI_Comm comm, AppCtx app_ctx, DM dm_c, - DM dm_f, Vec V_f, Vec V_loc_c, Vec V_loc_f, - CeedData ceed_data_c, CeedData ceed_data_f, - Ceed ceed, UserMultProlongRestr prolong_restr_ctx) { +PetscErrorCode SetupProlongRestrictCtx(MPI_Comm comm, AppCtx app_ctx, DM dm_c, DM dm_f, Vec V_f, Vec V_loc_c, Vec V_loc_f, CeedData ceed_data_c, + CeedData ceed_data_f, Ceed ceed, UserMultProlongRestr prolong_restr_ctx) { PetscFunctionBeginUser; // PETSc objects @@ -60,13 +54,13 @@ PetscErrorCode SetupProlongRestrictCtx(MPI_Comm comm, AppCtx app_ctx, DM dm_c, prolong_restr_ctx->dm_f = dm_f; // Work vectors - prolong_restr_ctx->loc_vec_c = V_loc_c; - prolong_restr_ctx->loc_vec_f = V_loc_f; + prolong_restr_ctx->loc_vec_c = V_loc_c; + prolong_restr_ctx->loc_vec_f = V_loc_f; prolong_restr_ctx->ceed_vec_c = ceed_data_c->x_ceed; prolong_restr_ctx->ceed_vec_f = ceed_data_f->x_ceed; // libCEED operators - prolong_restr_ctx->op_prolong = ceed_data_f->op_prolong; + prolong_restr_ctx->op_prolong = ceed_data_f->op_prolong; prolong_restr_ctx->op_restrict = ceed_data_f->op_restrict; // Ceed @@ -78,36 +72,32 @@ PetscErrorCode SetupProlongRestrictCtx(MPI_Comm comm, AppCtx app_ctx, DM dm_c, // Jacobian setup // ----------------------------------------------------------------------------- PetscErrorCode FormJacobian(SNES snes, Vec U, Mat J, Mat J_pre, void *ctx) { - PetscErrorCode ierr; - PetscFunctionBeginUser; // Context data - FormJacobCtx form_jacob_ctx = (FormJacobCtx)ctx; - PetscInt num_levels = form_jacob_ctx->num_levels; - Mat *jacob_mat = form_jacob_ctx->jacob_mat; + FormJacobCtx form_jacob_ctx = (FormJacobCtx)ctx; + PetscInt num_levels = form_jacob_ctx->num_levels; + Mat *jacob_mat = form_jacob_ctx->jacob_mat; // Update Jacobian on each level for (PetscInt level = 0; level < num_levels; level++) { - ierr = MatAssemblyBegin(jacob_mat[level], 
MAT_FINAL_ASSEMBLY); CHKERRQ(ierr); - ierr = MatAssemblyEnd(jacob_mat[level], MAT_FINAL_ASSEMBLY); CHKERRQ(ierr); + PetscCall(MatAssemblyBegin(jacob_mat[level], MAT_FINAL_ASSEMBLY)); + PetscCall(MatAssemblyEnd(jacob_mat[level], MAT_FINAL_ASSEMBLY)); } // Form coarse assembled matrix - CeedOperatorLinearAssemble(form_jacob_ctx->op_coarse, - form_jacob_ctx->coo_values); + CeedOperatorLinearAssemble(form_jacob_ctx->op_coarse, form_jacob_ctx->coo_values); const CeedScalar *values; CeedVectorGetArrayRead(form_jacob_ctx->coo_values, CEED_MEM_HOST, &values); - ierr = MatSetValuesCOO(form_jacob_ctx->jacob_mat_coarse, values, ADD_VALUES); - CHKERRQ(ierr); + PetscCall(MatSetValuesCOO(form_jacob_ctx->jacob_mat_coarse, values, ADD_VALUES)); CeedVectorRestoreArrayRead(form_jacob_ctx->coo_values, &values); // J_pre might be AIJ (e.g., when using coloring), so we need to assemble it - ierr = MatAssemblyBegin(J_pre, MAT_FINAL_ASSEMBLY); CHKERRQ(ierr); - ierr = MatAssemblyEnd(J_pre, MAT_FINAL_ASSEMBLY); CHKERRQ(ierr); + PetscCall(MatAssemblyBegin(J_pre, MAT_FINAL_ASSEMBLY)); + PetscCall(MatAssemblyEnd(J_pre, MAT_FINAL_ASSEMBLY)); if (J != J_pre) { - ierr = MatAssemblyBegin(J, MAT_FINAL_ASSEMBLY); CHKERRQ(ierr); - ierr = MatAssemblyEnd(J, MAT_FINAL_ASSEMBLY); CHKERRQ(ierr); + PetscCall(MatAssemblyBegin(J, MAT_FINAL_ASSEMBLY)); + PetscCall(MatAssemblyEnd(J, MAT_FINAL_ASSEMBLY)); } PetscFunctionReturn(0); }; @@ -115,34 +105,31 @@ PetscErrorCode FormJacobian(SNES snes, Vec U, Mat J, Mat J_pre, void *ctx) { // ----------------------------------------------------------------------------- // Output solution for visualization // ----------------------------------------------------------------------------- -PetscErrorCode ViewSolution(MPI_Comm comm, AppCtx app_ctx, Vec U, - PetscInt increment, PetscScalar load_increment) { - PetscErrorCode ierr; - DM dm; +PetscErrorCode ViewSolution(MPI_Comm comm, AppCtx app_ctx, Vec U, PetscInt increment, PetscScalar load_increment) { + DM dm; PetscViewer 
viewer; - char output_filename[PETSC_MAX_PATH_LEN]; + char output_filename[PETSC_MAX_PATH_LEN]; PetscMPIInt rank; PetscFunctionBeginUser; // Create output directory MPI_Comm_rank(comm, &rank); - if (!rank) {ierr = PetscMkdir(app_ctx->output_dir); CHKERRQ(ierr);} + if (!rank) { + PetscCall(PetscMkdir(app_ctx->output_dir)); + } // Build file name - ierr = PetscSNPrintf(output_filename, sizeof output_filename, - "%s/solution-%03" PetscInt_FMT ".vtu", app_ctx->output_dir, - increment); CHKERRQ(ierr); + PetscCall(PetscSNPrintf(output_filename, sizeof output_filename, "%s/solution-%03" PetscInt_FMT ".vtu", app_ctx->output_dir, increment)); // Increment sequence - ierr = VecGetDM(U, &dm); CHKERRQ(ierr); - ierr = DMSetOutputSequenceNumber(dm, increment, load_increment); CHKERRQ(ierr); + PetscCall(VecGetDM(U, &dm)); + PetscCall(DMSetOutputSequenceNumber(dm, increment, load_increment)); // Output solution vector - ierr = PetscViewerVTKOpen(comm, output_filename, FILE_MODE_WRITE, &viewer); - CHKERRQ(ierr); - ierr = VecView(U, viewer); CHKERRQ(ierr); - ierr = PetscViewerDestroy(&viewer); CHKERRQ(ierr); + PetscCall(PetscViewerVTKOpen(comm, output_filename, FILE_MODE_WRITE, &viewer)); + PetscCall(VecView(U, viewer)); + PetscCall(PetscViewerDestroy(&viewer)); PetscFunctionReturn(0); }; @@ -150,44 +137,38 @@ PetscErrorCode ViewSolution(MPI_Comm comm, AppCtx app_ctx, Vec U, // ----------------------------------------------------------------------------- // Output diagnostic quantities for visualization // ----------------------------------------------------------------------------- -PetscErrorCode ViewDiagnosticQuantities(MPI_Comm comm, DM dmU, - UserMult user, AppCtx app_ctx, Vec U, - CeedElemRestriction elem_restr_diagnostic) { - PetscErrorCode ierr; - Vec Diagnostic, Y_loc, mult_vec; - CeedVector y_ceed; - CeedScalar *x, *y; +PetscErrorCode ViewDiagnosticQuantities(MPI_Comm comm, DM dmU, UserMult user, AppCtx app_ctx, Vec U, CeedElemRestriction elem_restr_diagnostic) { + Vec 
Diagnostic, Y_loc, mult_vec; + CeedVector y_ceed; + CeedScalar *x, *y; PetscMemType x_mem_type, y_mem_type; - PetscInt loc_size; - PetscViewer viewer; - char output_filename[PETSC_MAX_PATH_LEN]; + PetscInt loc_size; + PetscViewer viewer; + char output_filename[PETSC_MAX_PATH_LEN]; PetscFunctionBeginUser; // --------------------------------------------------------------------------- // PETSc and libCEED vectors // --------------------------------------------------------------------------- - ierr = DMCreateGlobalVector(user->dm, &Diagnostic); CHKERRQ(ierr); - ierr = PetscObjectSetName((PetscObject)Diagnostic, ""); CHKERRQ(ierr); - ierr = DMCreateLocalVector(user->dm, &Y_loc); CHKERRQ(ierr); - ierr = VecGetSize(Y_loc, &loc_size); CHKERRQ(ierr); + PetscCall(DMCreateGlobalVector(user->dm, &Diagnostic)); + PetscCall(PetscObjectSetName((PetscObject)Diagnostic, "")); + PetscCall(DMCreateLocalVector(user->dm, &Y_loc)); + PetscCall(VecGetSize(Y_loc, &loc_size)); CeedVectorCreate(user->ceed, loc_size, &y_ceed); // --------------------------------------------------------------------------- // Compute quantities // --------------------------------------------------------------------------- // -- Global-to-local - ierr = VecZeroEntries(user->X_loc); CHKERRQ(ierr); - ierr = DMPlexInsertBoundaryValues(dmU, PETSC_TRUE, user->X_loc, - user->load_increment, NULL, NULL, NULL); - CHKERRQ(ierr); - ierr = DMGlobalToLocal(dmU, U, INSERT_VALUES, user->X_loc); CHKERRQ(ierr); - ierr = VecZeroEntries(Y_loc); CHKERRQ(ierr); + PetscCall(VecZeroEntries(user->X_loc)); + PetscCall(DMPlexInsertBoundaryValues(dmU, PETSC_TRUE, user->X_loc, user->load_increment, NULL, NULL, NULL)); + PetscCall(DMGlobalToLocal(dmU, U, INSERT_VALUES, user->X_loc)); + PetscCall(VecZeroEntries(Y_loc)); // -- Setup CEED vectors - ierr = VecGetArrayReadAndMemType(user->X_loc, (const PetscScalar **)&x, - &x_mem_type); CHKERRQ(ierr); - ierr = VecGetArrayAndMemType(Y_loc, &y, &y_mem_type); CHKERRQ(ierr); + 
PetscCall(VecGetArrayReadAndMemType(user->X_loc, (const PetscScalar **)&x, &x_mem_type)); + PetscCall(VecGetArrayAndMemType(Y_loc, &y, &y_mem_type)); CeedVectorSetArray(user->x_ceed, MemTypeP2C(x_mem_type), CEED_USE_POINTER, x); CeedVectorSetArray(y_ceed, MemTypeP2C(y_mem_type), CEED_USE_POINTER, y); @@ -196,54 +177,48 @@ PetscErrorCode ViewDiagnosticQuantities(MPI_Comm comm, DM dmU, // -- Restore PETSc vector; keep y_ceed viewing memory of Y_loc for use below CeedVectorTakeArray(user->x_ceed, MemTypeP2C(x_mem_type), NULL); - ierr = VecRestoreArrayReadAndMemType(user->X_loc, (const PetscScalar **)&x); - CHKERRQ(ierr); + PetscCall(VecRestoreArrayReadAndMemType(user->X_loc, (const PetscScalar **)&x)); // -- Local-to-global - ierr = VecZeroEntries(Diagnostic); CHKERRQ(ierr); - ierr = DMLocalToGlobal(user->dm, Y_loc, ADD_VALUES, Diagnostic); - CHKERRQ(ierr); + PetscCall(VecZeroEntries(Diagnostic)); + PetscCall(DMLocalToGlobal(user->dm, Y_loc, ADD_VALUES, Diagnostic)); // --------------------------------------------------------------------------- // Scale for multiplicity // --------------------------------------------------------------------------- // -- Setup vectors - ierr = VecDuplicate(Diagnostic, &mult_vec); CHKERRQ(ierr); - ierr = VecZeroEntries(Y_loc); CHKERRQ(ierr); + PetscCall(VecDuplicate(Diagnostic, &mult_vec)); + PetscCall(VecZeroEntries(Y_loc)); // -- Compute multiplicity CeedElemRestrictionGetMultiplicity(elem_restr_diagnostic, y_ceed); // -- Restore vectors CeedVectorTakeArray(y_ceed, MemTypeP2C(y_mem_type), NULL); - ierr = VecRestoreArrayAndMemType(Y_loc, &y); CHKERRQ(ierr); + PetscCall(VecRestoreArrayAndMemType(Y_loc, &y)); // -- Local-to-global - ierr = VecZeroEntries(mult_vec); CHKERRQ(ierr); - ierr = DMLocalToGlobal(user->dm, Y_loc, ADD_VALUES, mult_vec); - CHKERRQ(ierr); + PetscCall(VecZeroEntries(mult_vec)); + PetscCall(DMLocalToGlobal(user->dm, Y_loc, ADD_VALUES, mult_vec)); // -- Scale - ierr = VecReciprocal(mult_vec); CHKERRQ(ierr); - ierr = 
VecPointwiseMult(Diagnostic, Diagnostic, mult_vec); + PetscCall(VecReciprocal(mult_vec)); + PetscCall(VecPointwiseMult(Diagnostic, Diagnostic, mult_vec)); // --------------------------------------------------------------------------- // Output solution vector // --------------------------------------------------------------------------- - ierr = PetscSNPrintf(output_filename, sizeof output_filename, - "%s/diagnostic_quantities.vtu", - app_ctx->output_dir); CHKERRQ(ierr); - ierr = PetscViewerVTKOpen(comm, output_filename, FILE_MODE_WRITE, &viewer); - CHKERRQ(ierr); - ierr = VecView(Diagnostic, viewer); CHKERRQ(ierr); - ierr = PetscViewerDestroy(&viewer); CHKERRQ(ierr); + PetscCall(PetscSNPrintf(output_filename, sizeof output_filename, "%s/diagnostic_quantities.vtu", app_ctx->output_dir)); + PetscCall(PetscViewerVTKOpen(comm, output_filename, FILE_MODE_WRITE, &viewer)); + PetscCall(VecView(Diagnostic, viewer)); + PetscCall(PetscViewerDestroy(&viewer)); // --------------------------------------------------------------------------- // Cleanup // --------------------------------------------------------------------------- - ierr = VecDestroy(&Diagnostic); CHKERRQ(ierr); - ierr = VecDestroy(&mult_vec); CHKERRQ(ierr); - ierr = VecDestroy(&Y_loc); CHKERRQ(ierr); + PetscCall(VecDestroy(&Diagnostic)); + PetscCall(VecDestroy(&mult_vec)); + PetscCall(VecDestroy(&Y_loc)); CeedVectorDestroy(&y_ceed); PetscFunctionReturn(0); @@ -253,20 +228,18 @@ PetscErrorCode ViewDiagnosticQuantities(MPI_Comm comm, DM dmU, // Regression testing // ----------------------------------------------------------------------------- // test option change. could remove the loading step. Run only with one loading step and compare relatively to ref file -// option: expect_final_strain_energy and check against the relative error to ref is within tolerance (10^-5) I.e. 
one Newton solve then check final energy +// option: expect_final_strain_energy and check against the relative error to ref is within tolerance (10^-5) I.e. one Newton solve then check final +// energy PetscErrorCode RegressionTests_solids(AppCtx app_ctx, PetscReal energy) { PetscFunctionBegin; if (app_ctx->expect_final_strain >= 0.) { PetscReal energy_ref = app_ctx->expect_final_strain; - PetscReal error = PetscAbsReal(energy - energy_ref) / energy_ref; + PetscReal error = PetscAbsReal(energy - energy_ref) / energy_ref; if (error > app_ctx->test_tol) { - PetscErrorCode ierr; - ierr = PetscPrintf(PETSC_COMM_WORLD, - "Energy %e does not match expected energy %e: relative tolerance %e > %e\n", - (double)energy, (double)energy_ref, (double)error, app_ctx->test_tol); - CHKERRQ(ierr); + PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Energy %e does not match expected energy %e: relative tolerance %e > %e\n", (double)energy, + (double)energy_ref, (double)error, app_ctx->test_tol)); } } PetscFunctionReturn(0); diff --git a/examples/solids/src/setup-dm.c b/examples/solids/src/setup-dm.c index 8e805c4fb6..f50fb9cde8 100644 --- a/examples/solids/src/setup-dm.c +++ b/examples/solids/src/setup-dm.c @@ -8,91 +8,83 @@ /// @file /// DM setup for solid mechanics example using PETSc -#include "../include/boundary.h" #include "../include/setup-dm.h" +#include "../include/boundary.h" + // ----------------------------------------------------------------------------- // Setup DM // ----------------------------------------------------------------------------- PetscErrorCode CreateBCLabel(DM dm, const char name[]) { - PetscErrorCode ierr; DMLabel label; PetscFunctionBeginUser; - ierr = DMCreateLabel(dm, name); CHKERRQ(ierr); - ierr = DMGetLabel(dm, name, &label); CHKERRQ(ierr); - ierr = DMPlexMarkBoundaryFaces(dm, 1, label); CHKERRQ(ierr); + PetscCall(DMCreateLabel(dm, name)); + PetscCall(DMGetLabel(dm, name, &label)); + PetscCall(DMPlexMarkBoundaryFaces(dm, 1, label)); PetscFunctionReturn(0); 
}; // Read mesh and distribute DM in parallel PetscErrorCode CreateDistributedDM(MPI_Comm comm, AppCtx app_ctx, DM *dm) { - PetscErrorCode ierr; - const char *filename = app_ctx->mesh_file; - PetscBool interpolate = PETSC_TRUE; - DM distributed_mesh = NULL; + const char *filename = app_ctx->mesh_file; + PetscBool interpolate = PETSC_TRUE; + DM distributed_mesh = NULL; PetscPartitioner part; PetscFunctionBeginUser; if (!*filename) { PetscInt dim = 3, faces[3] = {3, 3, 3}; - ierr = PetscOptionsGetIntArray(NULL, NULL, "-dm_plex_box_faces", - faces, &dim, NULL); CHKERRQ(ierr); + PetscCall(PetscOptionsGetIntArray(NULL, NULL, "-dm_plex_box_faces", faces, &dim, NULL)); if (!dim) dim = 3; - ierr = DMPlexCreateBoxMesh(comm, dim, PETSC_FALSE, faces, NULL, - NULL, NULL, interpolate, dm); CHKERRQ(ierr); + PetscCall(DMPlexCreateBoxMesh(comm, dim, PETSC_FALSE, faces, NULL, NULL, NULL, interpolate, dm)); } else { - ierr = DMPlexCreateFromFile(comm, filename, NULL, interpolate, dm); - CHKERRQ(ierr); + PetscCall(DMPlexCreateFromFile(comm, filename, NULL, interpolate, dm)); } // Distribute DM in parallel - ierr = DMPlexGetPartitioner(*dm, &part); CHKERRQ(ierr); - ierr = PetscPartitionerSetFromOptions(part); CHKERRQ(ierr); - ierr = DMPlexDistribute(*dm, 0, NULL, &distributed_mesh); CHKERRQ(ierr); + PetscCall(DMPlexGetPartitioner(*dm, &part)); + PetscCall(PetscPartitionerSetFromOptions(part)); + PetscCall(DMPlexDistribute(*dm, 0, NULL, &distributed_mesh)); if (distributed_mesh) { - ierr = DMDestroy(dm); CHKERRQ(ierr); - *dm = distributed_mesh; + PetscCall(DMDestroy(dm)); + *dm = distributed_mesh; } - ierr = DMViewFromOptions(*dm, NULL, "-dm_view"); CHKERRQ(ierr); + PetscCall(DMViewFromOptions(*dm, NULL, "-dm_view")); PetscFunctionReturn(0); }; // Setup DM with FE space of appropriate degree -PetscErrorCode SetupDMByDegree(DM dm, AppCtx app_ctx, PetscInt order, - PetscBool boundary, PetscInt num_comp_u) { - PetscErrorCode ierr; +PetscErrorCode SetupDMByDegree(DM dm, AppCtx app_ctx, 
PetscInt order, PetscBool boundary, PetscInt num_comp_u) { MPI_Comm comm; PetscInt dim; PetscFE fe; IS face_set_is; // Index Set for Face Sets - const char *name = "Face Sets"; // PETSc internal requirement + const char *name = "Face Sets"; // PETSc internal requirement PetscInt num_face_sets; // Number of FaceSets in face_set_is - const PetscInt *face_set_ids; // id of each FaceSet + const PetscInt *face_set_ids; // id of each FaceSet PetscFunctionBeginUser; // Setup DM - ierr = DMGetDimension(dm, &dim); CHKERRQ(ierr); - ierr = PetscObjectGetComm((PetscObject)dm, &comm); CHKERRQ(ierr); - ierr = PetscFECreateLagrange(comm, dim, num_comp_u, PETSC_FALSE, order, order, - &fe); CHKERRQ(ierr); - ierr = DMSetFromOptions(dm); CHKERRQ(ierr); - ierr = DMAddField(dm, NULL, (PetscObject)fe); CHKERRQ(ierr); - ierr = DMCreateDS(dm); CHKERRQ(ierr); + PetscCall(DMGetDimension(dm, &dim)); + PetscCall(PetscObjectGetComm((PetscObject)dm, &comm)); + PetscCall(PetscFECreateLagrange(comm, dim, num_comp_u, PETSC_FALSE, order, order, &fe)); + PetscCall(DMSetFromOptions(dm)); + PetscCall(DMAddField(dm, NULL, (PetscObject)fe)); + PetscCall(DMCreateDS(dm)); { /* create FE field for coordinates */ - PetscFE fe_coords; + PetscFE fe_coords; PetscInt num_comp_coord; - ierr = DMGetCoordinateDim(dm, &num_comp_coord); CHKERRQ(ierr); - ierr = PetscFECreateLagrange(comm, dim, num_comp_coord, PETSC_FALSE, 1, 1, - &fe_coords); CHKERRQ(ierr); - ierr = DMProjectCoordinates(dm, fe_coords); CHKERRQ(ierr); - ierr = PetscFEDestroy(&fe_coords); CHKERRQ(ierr); + PetscCall(DMGetCoordinateDim(dm, &num_comp_coord)); + PetscCall(PetscFECreateLagrange(comm, dim, num_comp_coord, PETSC_FALSE, 1, 1, &fe_coords)); + PetscCall(DMProjectCoordinates(dm, fe_coords)); + PetscCall(PetscFEDestroy(&fe_coords)); } // Add Dirichlet (Essential) boundary @@ -101,52 +93,42 @@ PetscErrorCode SetupDMByDegree(DM dm, AppCtx app_ctx, PetscInt order, if (app_ctx->test_mode) { // -- Test mode - box mesh PetscBool has_label; - PetscInt 
marker_ids[1] = {1}; - ierr = DMHasLabel(dm, "marker", &has_label); CHKERRQ(ierr); + PetscInt marker_ids[1] = {1}; + PetscCall(DMHasLabel(dm, "marker", &has_label)); if (!has_label) { - ierr = CreateBCLabel(dm, "marker"); CHKERRQ(ierr); + PetscCall(CreateBCLabel(dm, "marker")); } DMLabel label; - ierr = DMGetLabel(dm, "marker", &label); CHKERRQ(ierr); - ierr = DMAddBoundary(dm, DM_BC_ESSENTIAL, "mms", label, 1, marker_ids, - 0, 0, NULL, (void(*)(void))BCMMS, NULL, NULL, NULL); - CHKERRQ(ierr); + PetscCall(DMGetLabel(dm, "marker", &label)); + PetscCall(DMAddBoundary(dm, DM_BC_ESSENTIAL, "mms", label, 1, marker_ids, 0, 0, NULL, (void (*)(void))BCMMS, NULL, NULL, NULL)); } else { // -- ExodusII mesh with MMS - ierr = DMGetLabelIdIS(dm, name, &face_set_is); CHKERRQ(ierr); - ierr = ISGetSize(face_set_is,&num_face_sets); CHKERRQ(ierr); - ierr = ISGetIndices(face_set_is, &face_set_ids); CHKERRQ(ierr); + PetscCall(DMGetLabelIdIS(dm, name, &face_set_is)); + PetscCall(ISGetSize(face_set_is, &num_face_sets)); + PetscCall(ISGetIndices(face_set_is, &face_set_ids)); DMLabel label; - ierr = DMGetLabel(dm, "Face Sets", &label); CHKERRQ(ierr); - ierr = DMAddBoundary(dm, DM_BC_ESSENTIAL, "mms", label, - num_face_sets, face_set_ids, 0, 0, NULL, - (void(*)(void))BCMMS, NULL, NULL, NULL); - CHKERRQ(ierr); - ierr = ISRestoreIndices(face_set_is, &face_set_ids); CHKERRQ(ierr); - ierr = ISDestroy(&face_set_is); CHKERRQ(ierr); + PetscCall(DMGetLabel(dm, "Face Sets", &label)); + PetscCall(DMAddBoundary(dm, DM_BC_ESSENTIAL, "mms", label, num_face_sets, face_set_ids, 0, 0, NULL, (void (*)(void))BCMMS, NULL, NULL, NULL)); + PetscCall(ISRestoreIndices(face_set_is, &face_set_ids)); + PetscCall(ISDestroy(&face_set_is)); } } else { // -- Mesh with user specified BCs DMLabel label; - ierr = DMGetLabel(dm, "Face Sets", &label); CHKERRQ(ierr); + PetscCall(DMGetLabel(dm, "Face Sets", &label)); // -- Clamp BCs for (PetscInt i = 0; i < app_ctx->bc_clamp_count; i++) { char bcName[25]; - snprintf(bcName, 
sizeof bcName, "clamp_%" PetscInt_FMT, - app_ctx->bc_clamp_faces[i]); - ierr = DMAddBoundary(dm, DM_BC_ESSENTIAL, bcName, label, 1, - &app_ctx->bc_clamp_faces[i], 0, 0, - NULL, (void(*)(void))BCClamp, NULL, - (void *)&app_ctx->bc_clamp_max[i], NULL); - CHKERRQ(ierr); + snprintf(bcName, sizeof bcName, "clamp_%" PetscInt_FMT, app_ctx->bc_clamp_faces[i]); + PetscCall(DMAddBoundary(dm, DM_BC_ESSENTIAL, bcName, label, 1, &app_ctx->bc_clamp_faces[i], 0, 0, NULL, (void (*)(void))BCClamp, NULL, + (void *)&app_ctx->bc_clamp_max[i], NULL)); } } } - ierr = DMPlexSetClosurePermutationTensor(dm, PETSC_DETERMINE, NULL); - CHKERRQ(ierr); + PetscCall(DMPlexSetClosurePermutationTensor(dm, PETSC_DETERMINE, NULL)); // Cleanup - ierr = PetscFEDestroy(&fe); CHKERRQ(ierr); + PetscCall(PetscFEDestroy(&fe)); PetscFunctionReturn(0); }; diff --git a/examples/solids/src/setup-libceed.c b/examples/solids/src/setup-libceed.c index ac455a5f8c..aa218ab548 100644 --- a/examples/solids/src/setup-libceed.c +++ b/examples/solids/src/setup-libceed.c @@ -9,15 +9,16 @@ /// libCEED setup for solid mechanics example using PETSc #include "../include/setup-libceed.h" + #include "../include/structs.h" #include "../include/utils.h" -#include "../qfunctions/traction-boundary.h" // Traction boundaries -#include "../qfunctions/constant-force.h" // Constant forcing function -#include "../qfunctions/manufactured-force.h" // Manufactured solution forcing +#include "../qfunctions/constant-force.h" // Constant forcing function +#include "../qfunctions/manufactured-force.h" // Manufactured solution forcing +#include "../qfunctions/traction-boundary.h" // Traction boundaries -#if PETSC_VERSION_LT(3,14,0) -# define DMPlexGetClosureIndices(a,b,c,d,e,f,g,h,i) DMPlexGetClosureIndices(a,b,c,d,f,g,i) -# define DMPlexRestoreClosureIndices(a,b,c,d,e,f,g,h,i) DMPlexRestoreClosureIndices(a,b,c,d,f,g,i) +#if PETSC_VERSION_LT(3, 14, 0) +#define DMPlexGetClosureIndices(a, b, c, d, e, f, g, h, i) DMPlexGetClosureIndices(a, b, c, d, 
f, g, i) +#define DMPlexRestoreClosureIndices(a, b, c, d, e, f, g, h, i) DMPlexRestoreClosureIndices(a, b, c, d, f, g, i) #endif // ----------------------------------------------------------------------------- @@ -25,18 +26,9 @@ // ----------------------------------------------------------------------------- // Forcing function data forcingData forcing_options[3] = { - [FORCE_NONE] = { - .setup_forcing = NULL, - .setup_forcing_loc = NULL - }, - [FORCE_CONST] = { - .setup_forcing = SetupConstantForce, - .setup_forcing_loc = SetupConstantForce_loc - }, - [FORCE_MMS] = { - .setup_forcing = SetupMMSForce, - .setup_forcing_loc = SetupMMSForce_loc - } + [FORCE_NONE] = {.setup_forcing = NULL, .setup_forcing_loc = NULL }, + [FORCE_CONST] = {.setup_forcing = SetupConstantForce, .setup_forcing_loc = SetupConstantForce_loc}, + [FORCE_MMS] = {.setup_forcing = SetupMMSForce, .setup_forcing_loc = SetupMMSForce_loc } }; // ----------------------------------------------------------------------------- @@ -44,24 +36,20 @@ forcingData forcing_options[3] = { // ----------------------------------------------------------------------------- // Destroy libCEED objects PetscErrorCode CeedDataDestroy(CeedInt level, CeedData data) { - PetscErrorCode ierr; - PetscFunctionBegin; // Vectors CeedVectorDestroy(&data->x_ceed); CeedVectorDestroy(&data->y_ceed); CeedVectorDestroy(&data->geo_data); - for (CeedInt i = 0; i < SOLIDS_MAX_NUMBER_FIELDS; i++) - CeedVectorDestroy(&data->stored_fields[i]); + for (CeedInt i = 0; i < SOLIDS_MAX_NUMBER_FIELDS; i++) CeedVectorDestroy(&data->stored_fields[i]); CeedVectorDestroy(&data->geo_data_diagnostic); CeedVectorDestroy(&data->true_soln); // Restrictions CeedElemRestrictionDestroy(&data->elem_restr_x); CeedElemRestrictionDestroy(&data->elem_restr_u); CeedElemRestrictionDestroy(&data->elem_restr_geo_data_i); - for (CeedInt i = 0; i < SOLIDS_MAX_NUMBER_FIELDS; i++) - CeedElemRestrictionDestroy(&data->elem_restr_stored_fields_i[i]); + for (CeedInt i = 0; i < 
SOLIDS_MAX_NUMBER_FIELDS; i++) CeedElemRestrictionDestroy(&data->elem_restr_stored_fields_i[i]); CeedElemRestrictionDestroy(&data->elem_restr_energy); CeedElemRestrictionDestroy(&data->elem_restr_diagnostic); CeedElemRestrictionDestroy(&data->elem_restr_geo_data_diagnostic_i); @@ -85,173 +73,127 @@ PetscErrorCode CeedDataDestroy(CeedInt level, CeedData data) { CeedOperatorDestroy(&data->op_prolong); CeedOperatorDestroy(&data->op_restrict); - ierr = PetscFree(data); CHKERRQ(ierr); + PetscCall(PetscFree(data)); PetscFunctionReturn(0); }; // Utility function to create local CEED restriction from DMPlex -PetscErrorCode CreateRestrictionFromPlex(Ceed ceed, DM dm, CeedInt height, - DMLabel domain_label, CeedInt value, CeedElemRestriction *elem_restr) { +PetscErrorCode CreateRestrictionFromPlex(Ceed ceed, DM dm, CeedInt height, DMLabel domain_label, CeedInt value, CeedElemRestriction *elem_restr) { PetscInt num_elem, elem_size, num_dof, num_comp, *elem_restr_offsets; - PetscErrorCode ierr; PetscFunctionBeginUser; - ierr = DMPlexGetLocalOffsets(dm, domain_label, value, height, 0, &num_elem, - &elem_size, &num_comp, &num_dof, &elem_restr_offsets); - CHKERRQ(ierr); + PetscCall(DMPlexGetLocalOffsets(dm, domain_label, value, height, 0, &num_elem, &elem_size, &num_comp, &num_dof, &elem_restr_offsets)); - CeedElemRestrictionCreate(ceed, num_elem, elem_size, num_comp, - 1, num_dof, CEED_MEM_HOST, CEED_COPY_VALUES, - elem_restr_offsets, elem_restr); - ierr = PetscFree(elem_restr_offsets); CHKERRQ(ierr); + CeedElemRestrictionCreate(ceed, num_elem, elem_size, num_comp, 1, num_dof, CEED_MEM_HOST, CEED_COPY_VALUES, elem_restr_offsets, elem_restr); + PetscCall(PetscFree(elem_restr_offsets)); PetscFunctionReturn(0); }; // Utility function to get Ceed Restriction for each domain -PetscErrorCode GetRestrictionForDomain(Ceed ceed, DM dm, CeedInt height, - DMLabel domain_label, PetscInt value, - CeedInt Q, CeedInt q_data_size, - CeedElemRestriction *elem_restr_q, - CeedElemRestriction 
*elem_restr_x, - CeedElemRestriction *elem_restr_qd_i) { - - DM dm_coord; +PetscErrorCode GetRestrictionForDomain(Ceed ceed, DM dm, CeedInt height, DMLabel domain_label, PetscInt value, CeedInt Q, CeedInt q_data_size, + CeedElemRestriction *elem_restr_q, CeedElemRestriction *elem_restr_x, CeedElemRestriction *elem_restr_qd_i) { + DM dm_coord; CeedInt dim, num_local_elem; CeedInt Q_dim; - PetscErrorCode ierr; PetscFunctionBeginUser; - ierr = DMGetDimension(dm, &dim); CHKERRQ(ierr); + PetscCall(DMGetDimension(dm, &dim)); dim -= height; Q_dim = CeedIntPow(Q, dim); - ierr = DMGetCoordinateDM(dm, &dm_coord); CHKERRQ(ierr); - ierr = DMPlexSetClosurePermutationTensor(dm_coord, PETSC_DETERMINE, NULL); - CHKERRQ(ierr); + PetscCall(DMGetCoordinateDM(dm, &dm_coord)); + PetscCall(DMPlexSetClosurePermutationTensor(dm_coord, PETSC_DETERMINE, NULL)); if (elem_restr_q) { - ierr = CreateRestrictionFromPlex(ceed, dm, height, domain_label, value, - elem_restr_q); CHKERRQ(ierr); + PetscCall(CreateRestrictionFromPlex(ceed, dm, height, domain_label, value, elem_restr_q)); } if (elem_restr_x) { - ierr = CreateRestrictionFromPlex(ceed, dm_coord, height, domain_label, - value, elem_restr_x); CHKERRQ(ierr); + PetscCall(CreateRestrictionFromPlex(ceed, dm_coord, height, domain_label, value, elem_restr_x)); } if (elem_restr_qd_i) { CeedElemRestrictionGetNumElements(*elem_restr_q, &num_local_elem); - CeedElemRestrictionCreateStrided(ceed, num_local_elem, Q_dim, - q_data_size, q_data_size*num_local_elem*Q_dim, - CEED_STRIDES_BACKEND, elem_restr_qd_i); + CeedElemRestrictionCreateStrided(ceed, num_local_elem, Q_dim, q_data_size, q_data_size * num_local_elem * Q_dim, CEED_STRIDES_BACKEND, + elem_restr_qd_i); } PetscFunctionReturn(0); }; // Set up libCEED on the fine grid for a given degree -PetscErrorCode SetupLibceedFineLevel(DM dm, DM dm_energy, DM dm_diagnostic, - Ceed ceed, AppCtx app_ctx, - CeedQFunctionContext phys_ctx, - ProblemData problem_data, - PetscInt fine_level, PetscInt num_comp_u, - 
PetscInt U_g_size, PetscInt U_loc_size, - CeedVector force_ceed, - CeedVector neumann_ceed, CeedData *data) { - int ierr; - CeedInt P = app_ctx->level_degrees[fine_level] + 1; - CeedInt Q = app_ctx->level_degrees[fine_level] + 1 + app_ctx->q_extra; - CeedInt dim, num_comp_x, num_comp_e = 1, num_comp_d = 5; - CeedInt num_qpts; - CeedInt q_data_size = problem_data.q_data_size; - forcingType forcing_choice = app_ctx->forcing_choice; - DM dm_coord; - Vec coords; - PetscInt c_start, c_end, num_elem; +PetscErrorCode SetupLibceedFineLevel(DM dm, DM dm_energy, DM dm_diagnostic, Ceed ceed, AppCtx app_ctx, CeedQFunctionContext phys_ctx, + ProblemData problem_data, PetscInt fine_level, PetscInt num_comp_u, PetscInt U_g_size, PetscInt U_loc_size, + CeedVector force_ceed, CeedVector neumann_ceed, CeedData *data) { + CeedInt P = app_ctx->level_degrees[fine_level] + 1; + CeedInt Q = app_ctx->level_degrees[fine_level] + 1 + app_ctx->q_extra; + CeedInt dim, num_comp_x, num_comp_e = 1, num_comp_d = 5; + CeedInt num_qpts; + CeedInt q_data_size = problem_data.q_data_size; + forcingType forcing_choice = app_ctx->forcing_choice; + DM dm_coord; + Vec coords; + PetscInt c_start, c_end, num_elem; const PetscScalar *coordArray; - CeedVector x_coord; - CeedQFunction qf_setup_geo, qf_residual, qf_jacobian, qf_energy, qf_diagnostic; - CeedOperator op_setup_geo, op_residual, op_jacobian, op_energy, op_diagnostic; + CeedVector x_coord; + CeedQFunction qf_setup_geo, qf_residual, qf_jacobian, qf_energy, qf_diagnostic; + CeedOperator op_setup_geo, op_residual, op_jacobian, op_energy, op_diagnostic; PetscFunctionBeginUser; // --------------------------------------------------------------------------- // libCEED bases // --------------------------------------------------------------------------- - ierr = DMGetDimension(dm, &dim); CHKERRQ(ierr); + PetscCall(DMGetDimension(dm, &dim)); num_comp_x = dim; // -- Coordinate basis - CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp_x, 2, Q, - 
problem_data.quadrature_mode, - &data[fine_level]->basis_x); + CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp_x, 2, Q, problem_data.quadrature_mode, &data[fine_level]->basis_x); // -- Solution basis - CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp_u, P, Q, - problem_data.quadrature_mode, - &data[fine_level]->basis_u); + CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp_u, P, Q, problem_data.quadrature_mode, &data[fine_level]->basis_u); // -- Energy basis - CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp_e, P, Q, - problem_data.quadrature_mode, - &data[fine_level]->basis_energy); + CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp_e, P, Q, problem_data.quadrature_mode, &data[fine_level]->basis_energy); // -- Diagnostic output basis - CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp_u, P, P, CEED_GAUSS_LOBATTO, - &data[fine_level]->basis_diagnostic); + CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp_u, P, P, CEED_GAUSS_LOBATTO, &data[fine_level]->basis_diagnostic); // --------------------------------------------------------------------------- // libCEED restrictions // --------------------------------------------------------------------------- - ierr = DMGetCoordinateDM(dm, &dm_coord); CHKERRQ(ierr); - ierr = DMPlexSetClosurePermutationTensor(dm_coord, PETSC_DETERMINE, NULL); - CHKERRQ(ierr); + PetscCall(DMGetCoordinateDM(dm, &dm_coord)); + PetscCall(DMPlexSetClosurePermutationTensor(dm_coord, PETSC_DETERMINE, NULL)); // -- Coordinate restriction - ierr = CreateRestrictionFromPlex(ceed, dm_coord, 0, 0, 0, - &(data[fine_level]->elem_restr_x)); - CHKERRQ(ierr); + PetscCall(CreateRestrictionFromPlex(ceed, dm_coord, 0, 0, 0, &(data[fine_level]->elem_restr_x))); // -- Solution restriction - ierr = CreateRestrictionFromPlex(ceed, dm, 0, 0, 0, - &data[fine_level]->elem_restr_u); - CHKERRQ(ierr); + PetscCall(CreateRestrictionFromPlex(ceed, dm, 0, 0, 0, &data[fine_level]->elem_restr_u)); // -- Energy restriction - ierr = 
CreateRestrictionFromPlex(ceed, dm_energy, 0, 0, 0, - &data[fine_level]->elem_restr_energy); - CHKERRQ(ierr); + PetscCall(CreateRestrictionFromPlex(ceed, dm_energy, 0, 0, 0, &data[fine_level]->elem_restr_energy)); // -- Diagnostic data restriction - ierr = CreateRestrictionFromPlex(ceed, dm_diagnostic, 0, 0, 0, - &data[fine_level]->elem_restr_diagnostic); - CHKERRQ(ierr); + PetscCall(CreateRestrictionFromPlex(ceed, dm_diagnostic, 0, 0, 0, &data[fine_level]->elem_restr_diagnostic)); // -- Stored data at quadrature points - ierr = DMPlexGetHeightStratum(dm, 0, &c_start, &c_end); CHKERRQ(ierr); + PetscCall(DMPlexGetHeightStratum(dm, 0, &c_start, &c_end)); num_elem = c_end - c_start; CeedBasisGetNumQuadraturePoints(data[fine_level]->basis_u, &num_qpts); // ---- Geometric data restriction, residual and Jacobian operators - CeedElemRestrictionCreateStrided(ceed, num_elem, num_qpts, q_data_size, - num_elem*num_qpts*q_data_size, - CEED_STRIDES_BACKEND, + CeedElemRestrictionCreateStrided(ceed, num_elem, num_qpts, q_data_size, num_elem * num_qpts * q_data_size, CEED_STRIDES_BACKEND, &data[fine_level]->elem_restr_geo_data_i); // ---- Stored field restrictions for (CeedInt i = 0; i < problem_data.number_fields_stored; i++) { - CeedElemRestrictionCreateStrided(ceed, num_elem, num_qpts, - problem_data.field_sizes[i], - num_elem*num_qpts*problem_data.field_sizes[i], - CEED_STRIDES_BACKEND, - &data[fine_level]->elem_restr_stored_fields_i[i]); + CeedElemRestrictionCreateStrided(ceed, num_elem, num_qpts, problem_data.field_sizes[i], num_elem * num_qpts * problem_data.field_sizes[i], + CEED_STRIDES_BACKEND, &data[fine_level]->elem_restr_stored_fields_i[i]); } // ---- Geometric data restriction, diagnostic operator - CeedElemRestrictionCreateStrided(ceed, num_elem, P*P*P, q_data_size, - num_elem*P*P*P*q_data_size, - CEED_STRIDES_BACKEND, + CeedElemRestrictionCreateStrided(ceed, num_elem, P * P * P, q_data_size, num_elem * P * P * P * q_data_size, CEED_STRIDES_BACKEND, 
&data[fine_level]->elem_restr_geo_data_diagnostic_i); // --------------------------------------------------------------------------- // Element coordinates // --------------------------------------------------------------------------- - ierr = DMGetCoordinatesLocal(dm, &coords); CHKERRQ(ierr); - ierr = VecGetArrayRead(coords, &coordArray); CHKERRQ(ierr); + PetscCall(DMGetCoordinatesLocal(dm, &coords)); + PetscCall(VecGetArrayRead(coords, &coordArray)); CeedElemRestrictionCreateVector(data[fine_level]->elem_restr_x, &x_coord, NULL); - CeedVectorSetArray(x_coord, CEED_MEM_HOST, CEED_COPY_VALUES, - (PetscScalar *)coordArray); - ierr = VecRestoreArrayRead(coords, &coordArray); CHKERRQ(ierr); + CeedVectorSetArray(x_coord, CEED_MEM_HOST, CEED_COPY_VALUES, (PetscScalar *)coordArray); + PetscCall(VecRestoreArrayRead(coords, &coordArray)); // --------------------------------------------------------------------------- // Persistent libCEED vectors @@ -260,16 +202,13 @@ PetscErrorCode SetupLibceedFineLevel(DM dm, DM dm_energy, DM dm_diagnostic, CeedVectorCreate(ceed, U_loc_size, &data[fine_level]->x_ceed); CeedVectorCreate(ceed, U_loc_size, &data[fine_level]->y_ceed); // -- Geometric data vector - CeedVectorCreate(ceed, num_elem*num_qpts*q_data_size, - &data[fine_level]->geo_data); + CeedVectorCreate(ceed, num_elem * num_qpts * q_data_size, &data[fine_level]->geo_data); // -- Stored field vectors for (CeedInt i = 0; i < problem_data.number_fields_stored; i++) { - CeedVectorCreate(ceed, num_elem*num_qpts*problem_data.field_sizes[i], - &data[fine_level]->stored_fields[i]); + CeedVectorCreate(ceed, num_elem * num_qpts * problem_data.field_sizes[i], &data[fine_level]->stored_fields[i]); } // -- Collocated geometric data vector - CeedVectorCreate(ceed, num_elem*P*P*P*q_data_size, - &data[fine_level]->geo_data_diagnostic); + CeedVectorCreate(ceed, num_elem * P * P * P * q_data_size, &data[fine_level]->geo_data_diagnostic); // 
--------------------------------------------------------------------------- // Geometric factor computation @@ -278,24 +217,17 @@ PetscErrorCode SetupLibceedFineLevel(DM dm, DM dm_energy, DM dm_diagnostic, // geo_data returns dXdx_i,j and w * det. // --------------------------------------------------------------------------- // -- QFunction - CeedQFunctionCreateInterior(ceed, 1, problem_data.setup_geo, - problem_data.setup_geo_loc, &qf_setup_geo); - CeedQFunctionAddInput(qf_setup_geo, "dx", num_comp_x*dim, CEED_EVAL_GRAD); + CeedQFunctionCreateInterior(ceed, 1, problem_data.setup_geo, problem_data.setup_geo_loc, &qf_setup_geo); + CeedQFunctionAddInput(qf_setup_geo, "dx", num_comp_x * dim, CEED_EVAL_GRAD); CeedQFunctionAddInput(qf_setup_geo, "weight", 1, CEED_EVAL_WEIGHT); CeedQFunctionAddOutput(qf_setup_geo, "qdata", q_data_size, CEED_EVAL_NONE); // -- Operator - CeedOperatorCreate(ceed, qf_setup_geo, CEED_QFUNCTION_NONE, - CEED_QFUNCTION_NONE, &op_setup_geo); - CeedOperatorSetField(op_setup_geo, "dx", data[fine_level]->elem_restr_x, - data[fine_level]->basis_x, CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op_setup_geo, "weight", CEED_ELEMRESTRICTION_NONE, - data[fine_level]->basis_x, CEED_VECTOR_NONE); - CeedOperatorSetField(op_setup_geo, "qdata", - data[fine_level]->elem_restr_geo_data_i, - CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE); + CeedOperatorCreate(ceed, qf_setup_geo, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup_geo); + CeedOperatorSetField(op_setup_geo, "dx", data[fine_level]->elem_restr_x, data[fine_level]->basis_x, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup_geo, "weight", CEED_ELEMRESTRICTION_NONE, data[fine_level]->basis_x, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup_geo, "qdata", data[fine_level]->elem_restr_geo_data_i, CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE); // -- Compute the quadrature data - CeedOperatorApply(op_setup_geo, x_coord, data[fine_level]->geo_data, - CEED_REQUEST_IMMEDIATE); + CeedOperatorApply(op_setup_geo, 
x_coord, data[fine_level]->geo_data, CEED_REQUEST_IMMEDIATE); // -- Cleanup CeedQFunctionDestroy(&qf_setup_geo); CeedOperatorDestroy(&op_setup_geo); @@ -307,30 +239,21 @@ PetscErrorCode SetupLibceedFineLevel(DM dm, DM dm_energy, DM dm_diagnostic, // non-linear PDE. // --------------------------------------------------------------------------- // -- QFunction - CeedQFunctionCreateInterior(ceed, 1, problem_data.residual, - problem_data.residual_loc, &qf_residual); - CeedQFunctionAddInput(qf_residual, "du", num_comp_u*dim, CEED_EVAL_GRAD); + CeedQFunctionCreateInterior(ceed, 1, problem_data.residual, problem_data.residual_loc, &qf_residual); + CeedQFunctionAddInput(qf_residual, "du", num_comp_u * dim, CEED_EVAL_GRAD); CeedQFunctionAddInput(qf_residual, "qdata", q_data_size, CEED_EVAL_NONE); - CeedQFunctionAddOutput(qf_residual, "dv", num_comp_u*dim, CEED_EVAL_GRAD); + CeedQFunctionAddOutput(qf_residual, "dv", num_comp_u * dim, CEED_EVAL_GRAD); for (CeedInt i = 0; i < problem_data.number_fields_stored; i++) { - CeedQFunctionAddOutput(qf_residual, problem_data.field_names[i], - problem_data.field_sizes[i], CEED_EVAL_NONE); + CeedQFunctionAddOutput(qf_residual, problem_data.field_names[i], problem_data.field_sizes[i], CEED_EVAL_NONE); } CeedQFunctionSetContext(qf_residual, phys_ctx); // -- Operator - CeedOperatorCreate(ceed, qf_residual, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, - &op_residual); - CeedOperatorSetField(op_residual, "du", data[fine_level]->elem_restr_u, - data[fine_level]->basis_u, CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op_residual, "qdata", - data[fine_level]->elem_restr_geo_data_i, - CEED_BASIS_COLLOCATED, data[fine_level]->geo_data); - CeedOperatorSetField(op_residual, "dv", data[fine_level]->elem_restr_u, - data[fine_level]->basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorCreate(ceed, qf_residual, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_residual); + CeedOperatorSetField(op_residual, "du", data[fine_level]->elem_restr_u, 
data[fine_level]->basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_residual, "qdata", data[fine_level]->elem_restr_geo_data_i, CEED_BASIS_COLLOCATED, data[fine_level]->geo_data); + CeedOperatorSetField(op_residual, "dv", data[fine_level]->elem_restr_u, data[fine_level]->basis_u, CEED_VECTOR_ACTIVE); for (CeedInt i = 0; i < problem_data.number_fields_stored; i++) { - CeedOperatorSetField(op_residual, problem_data.field_names[i], - data[fine_level]->elem_restr_stored_fields_i[i], - CEED_BASIS_COLLOCATED, + CeedOperatorSetField(op_residual, problem_data.field_names[i], data[fine_level]->elem_restr_stored_fields_i[i], CEED_BASIS_COLLOCATED, data[fine_level]->stored_fields[i]); } // -- Save libCEED data @@ -344,30 +267,21 @@ PetscErrorCode SetupLibceedFineLevel(DM dm, DM dm_energy, DM dm_diagnostic, // Jacobian for each linear solve. // --------------------------------------------------------------------------- // -- QFunction - CeedQFunctionCreateInterior(ceed, 1, problem_data.jacobian, - problem_data.jacobian_loc, &qf_jacobian); - CeedQFunctionAddInput(qf_jacobian, "delta du", num_comp_u*dim, CEED_EVAL_GRAD); + CeedQFunctionCreateInterior(ceed, 1, problem_data.jacobian, problem_data.jacobian_loc, &qf_jacobian); + CeedQFunctionAddInput(qf_jacobian, "delta du", num_comp_u * dim, CEED_EVAL_GRAD); CeedQFunctionAddInput(qf_jacobian, "qdata", q_data_size, CEED_EVAL_NONE); for (CeedInt i = 0; i < problem_data.number_fields_stored; i++) { - CeedQFunctionAddInput(qf_jacobian, problem_data.field_names[i], - problem_data.field_sizes[i], CEED_EVAL_NONE); + CeedQFunctionAddInput(qf_jacobian, problem_data.field_names[i], problem_data.field_sizes[i], CEED_EVAL_NONE); } - CeedQFunctionAddOutput(qf_jacobian, "delta dv", num_comp_u*dim, CEED_EVAL_GRAD); + CeedQFunctionAddOutput(qf_jacobian, "delta dv", num_comp_u * dim, CEED_EVAL_GRAD); CeedQFunctionSetContext(qf_jacobian, phys_ctx); // -- Operator - CeedOperatorCreate(ceed, qf_jacobian, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, 
- &op_jacobian); - CeedOperatorSetField(op_jacobian, "delta du", data[fine_level]->elem_restr_u, - data[fine_level]->basis_u, CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op_jacobian, "qdata", - data[fine_level]->elem_restr_geo_data_i, - CEED_BASIS_COLLOCATED, data[fine_level]->geo_data); - CeedOperatorSetField(op_jacobian, "delta dv", data[fine_level]->elem_restr_u, - data[fine_level]->basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorCreate(ceed, qf_jacobian, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_jacobian); + CeedOperatorSetField(op_jacobian, "delta du", data[fine_level]->elem_restr_u, data[fine_level]->basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_jacobian, "qdata", data[fine_level]->elem_restr_geo_data_i, CEED_BASIS_COLLOCATED, data[fine_level]->geo_data); + CeedOperatorSetField(op_jacobian, "delta dv", data[fine_level]->elem_restr_u, data[fine_level]->basis_u, CEED_VECTOR_ACTIVE); for (CeedInt i = 0; i < problem_data.number_fields_stored; i++) { - CeedOperatorSetField(op_jacobian, problem_data.field_names[i], - data[fine_level]->elem_restr_stored_fields_i[i], - CEED_BASIS_COLLOCATED, + CeedOperatorSetField(op_jacobian, problem_data.field_names[i], data[fine_level]->elem_restr_stored_fields_i[i], CEED_BASIS_COLLOCATED, data[fine_level]->stored_fields[i]); } // -- Save libCEED data @@ -380,51 +294,39 @@ PetscErrorCode SetupLibceedFineLevel(DM dm, DM dm_energy, DM dm_diagnostic, if (app_ctx->bc_traction_count > 0) { // -- Setup DMLabel domain_label; - ierr = DMGetLabel(dm, "Face Sets", &domain_label); CHKERRQ(ierr); - ierr = DMGetDimension(dm, &dim); CHKERRQ(ierr); + PetscCall(DMGetLabel(dm, "Face Sets", &domain_label)); + PetscCall(DMGetDimension(dm, &dim)); // -- Basis - CeedInt height = 1; + CeedInt height = 1; CeedBasis basis_x_face, basis_u_face; - CeedBasisCreateTensorH1Lagrange(ceed, dim - height, num_comp_x, 2, Q, - problem_data.quadrature_mode, &basis_x_face); - CeedBasisCreateTensorH1Lagrange(ceed, dim - height, num_comp_u, P, Q, - 
problem_data.quadrature_mode, &basis_u_face); + CeedBasisCreateTensorH1Lagrange(ceed, dim - height, num_comp_x, 2, Q, problem_data.quadrature_mode, &basis_x_face); + CeedBasisCreateTensorH1Lagrange(ceed, dim - height, num_comp_u, P, Q, problem_data.quadrature_mode, &basis_u_face); // -- QFunction - CeedQFunction qf_traction; + CeedQFunction qf_traction; CeedQFunctionContext traction_ctx; - CeedQFunctionCreateInterior(ceed, 1, SetupTractionBCs, SetupTractionBCs_loc, - &qf_traction); + CeedQFunctionCreateInterior(ceed, 1, SetupTractionBCs, SetupTractionBCs_loc, &qf_traction); CeedQFunctionContextCreate(ceed, &traction_ctx); CeedQFunctionSetContext(qf_traction, traction_ctx); - CeedQFunctionAddInput(qf_traction, "dx", num_comp_x*(num_comp_x - height), - CEED_EVAL_GRAD); + CeedQFunctionAddInput(qf_traction, "dx", num_comp_x * (num_comp_x - height), CEED_EVAL_GRAD); CeedQFunctionAddInput(qf_traction, "weight", 1, CEED_EVAL_WEIGHT); CeedQFunctionAddOutput(qf_traction, "v", num_comp_u, CEED_EVAL_INTERP); // -- Compute contribution on each boundary face for (CeedInt i = 0; i < app_ctx->bc_traction_count; i++) { CeedElemRestriction elem_restr_x_face, elem_restr_u_face; - CeedOperator op_traction; - CeedQFunctionContextSetData(traction_ctx, CEED_MEM_HOST, CEED_USE_POINTER, - 3 * sizeof(CeedScalar), - app_ctx->bc_traction_vector[i]); + CeedOperator op_traction; + CeedQFunctionContextSetData(traction_ctx, CEED_MEM_HOST, CEED_USE_POINTER, 3 * sizeof(CeedScalar), app_ctx->bc_traction_vector[i]); // Setup restriction - ierr = GetRestrictionForDomain(ceed, dm, 1, domain_label, - app_ctx->bc_traction_faces[i], Q, - 0, &elem_restr_u_face, &elem_restr_x_face, NULL); - CHKERRQ(ierr); + PetscCall( + GetRestrictionForDomain(ceed, dm, 1, domain_label, app_ctx->bc_traction_faces[i], Q, 0, &elem_restr_u_face, &elem_restr_x_face, NULL)); // ---- Create boundary Operator CeedOperatorCreate(ceed, qf_traction, NULL, NULL, &op_traction); - CeedOperatorSetField(op_traction, "dx", 
elem_restr_x_face, basis_x_face, - CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op_traction, "weight", CEED_ELEMRESTRICTION_NONE, - basis_x_face, CEED_VECTOR_NONE); - CeedOperatorSetField(op_traction, "v", elem_restr_u_face, - basis_u_face, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_traction, "dx", elem_restr_x_face, basis_x_face, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_traction, "weight", CEED_ELEMRESTRICTION_NONE, basis_x_face, CEED_VECTOR_NONE); + CeedOperatorSetField(op_traction, "v", elem_restr_u_face, basis_u_face, CEED_VECTOR_ACTIVE); // ---- Compute traction on face - CeedOperatorApplyAdd(op_traction, x_coord, neumann_ceed, - CEED_REQUEST_IMMEDIATE); + CeedOperatorApplyAdd(op_traction, x_coord, neumann_ceed, CEED_REQUEST_IMMEDIATE); // ---- Cleanup CeedElemRestrictionDestroy(&elem_restr_x_face); CeedElemRestrictionDestroy(&elem_restr_u_face); @@ -445,38 +347,28 @@ PetscErrorCode SetupLibceedFineLevel(DM dm, DM dm_energy, DM dm_diagnostic, // --------------------------------------------------------------------------- if (forcing_choice != FORCE_NONE) { CeedQFunction qf_setup_force; - CeedOperator op_setup_force; + CeedOperator op_setup_force; // -- QFunction - CeedQFunctionCreateInterior(ceed, 1, - forcing_options[forcing_choice].setup_forcing, - forcing_options[forcing_choice].setup_forcing_loc, + CeedQFunctionCreateInterior(ceed, 1, forcing_options[forcing_choice].setup_forcing, forcing_options[forcing_choice].setup_forcing_loc, &qf_setup_force); CeedQFunctionAddInput(qf_setup_force, "x", num_comp_x, CEED_EVAL_INTERP); - CeedQFunctionAddInput(qf_setup_force, "qdata", q_data_size, - CEED_EVAL_NONE); + CeedQFunctionAddInput(qf_setup_force, "qdata", q_data_size, CEED_EVAL_NONE); CeedQFunctionAddOutput(qf_setup_force, "force", num_comp_u, CEED_EVAL_INTERP); if (forcing_choice == FORCE_MMS) { CeedQFunctionSetContext(qf_setup_force, phys_ctx); } else { CeedQFunctionContext ctxForcing; CeedQFunctionContextCreate(ceed, &ctxForcing); - 
CeedQFunctionContextSetData(ctxForcing, CEED_MEM_HOST, CEED_USE_POINTER, - sizeof(*app_ctx->forcing_vector), - app_ctx->forcing_vector); + CeedQFunctionContextSetData(ctxForcing, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(*app_ctx->forcing_vector), app_ctx->forcing_vector); CeedQFunctionSetContext(qf_setup_force, ctxForcing); CeedQFunctionContextDestroy(&ctxForcing); } // -- Operator - CeedOperatorCreate(ceed, qf_setup_force, CEED_QFUNCTION_NONE, - CEED_QFUNCTION_NONE, &op_setup_force); - CeedOperatorSetField(op_setup_force, "x", data[fine_level]->elem_restr_x, - data[fine_level]->basis_x, CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op_setup_force, "qdata", - data[fine_level]->elem_restr_geo_data_i, - CEED_BASIS_COLLOCATED, data[fine_level]->geo_data); - CeedOperatorSetField(op_setup_force, "force", data[fine_level]->elem_restr_u, - data[fine_level]->basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorCreate(ceed, qf_setup_force, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup_force); + CeedOperatorSetField(op_setup_force, "x", data[fine_level]->elem_restr_x, data[fine_level]->basis_x, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup_force, "qdata", data[fine_level]->elem_restr_geo_data_i, CEED_BASIS_COLLOCATED, data[fine_level]->geo_data); + CeedOperatorSetField(op_setup_force, "force", data[fine_level]->elem_restr_u, data[fine_level]->basis_u, CEED_VECTOR_ACTIVE); // -- Compute forcing term CeedOperatorApply(op_setup_force, x_coord, force_ceed, CEED_REQUEST_IMMEDIATE); // -- Cleanup @@ -491,43 +383,35 @@ PetscErrorCode SetupLibceedFineLevel(DM dm, DM dm_energy, DM dm_diagnostic, // the mesh nodes for validation with the manufactured solution. 
// --------------------------------------------------------------------------- if (problem_data.true_soln) { - CeedScalar *true_array; + CeedScalar *true_array; const CeedScalar *mult_array; - CeedVector mult_vec; - CeedBasis basis_x_true; - CeedQFunction qf_true; - CeedOperator op_true; + CeedVector mult_vec; + CeedBasis basis_x_true; + CeedQFunction qf_true; + CeedOperator op_true; // -- Solution vector CeedVectorCreate(ceed, U_loc_size, &(data[fine_level]->true_soln)); // -- Basis - CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp_x, 2, P, CEED_GAUSS_LOBATTO, - &basis_x_true); + CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp_x, 2, P, CEED_GAUSS_LOBATTO, &basis_x_true); // QFunction - CeedQFunctionCreateInterior(ceed, 1, problem_data.true_soln, - problem_data.true_soln_loc, &qf_true); + CeedQFunctionCreateInterior(ceed, 1, problem_data.true_soln, problem_data.true_soln_loc, &qf_true); CeedQFunctionAddInput(qf_true, "x", num_comp_x, CEED_EVAL_INTERP); CeedQFunctionAddOutput(qf_true, "true solution", num_comp_u, CEED_EVAL_NONE); // Operator - CeedOperatorCreate(ceed, qf_true, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, - &op_true); - CeedOperatorSetField(op_true, "x", data[fine_level]->elem_restr_x, basis_x_true, - CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op_true, "true solution", data[fine_level]->elem_restr_u, - CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE); + CeedOperatorCreate(ceed, qf_true, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_true); + CeedOperatorSetField(op_true, "x", data[fine_level]->elem_restr_x, basis_x_true, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_true, "true solution", data[fine_level]->elem_restr_u, CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE); // -- Compute true solution - CeedOperatorApply(op_true, x_coord, data[fine_level]->true_soln, - CEED_REQUEST_IMMEDIATE); + CeedOperatorApply(op_true, x_coord, data[fine_level]->true_soln, CEED_REQUEST_IMMEDIATE); // -- Multiplicity calculation - 
CeedElemRestrictionCreateVector(data[fine_level]->elem_restr_u, &mult_vec, - NULL); + CeedElemRestrictionCreateVector(data[fine_level]->elem_restr_u, &mult_vec, NULL); CeedVectorSetValue(mult_vec, 0.); CeedElemRestrictionGetMultiplicity(data[fine_level]->elem_restr_u, mult_vec); // -- Multiplicity correction CeedVectorGetArray(data[fine_level]->true_soln, CEED_MEM_HOST, &true_array); CeedVectorGetArrayRead(mult_vec, CEED_MEM_HOST, &mult_array); - for (CeedInt i = 0; i < U_loc_size; i++) - true_array[i] /= mult_array[i]; + for (CeedInt i = 0; i < U_loc_size; i++) true_array[i] /= mult_array[i]; CeedVectorRestoreArray(data[fine_level]->true_soln, &true_array); CeedVectorRestoreArrayRead(mult_vec, &mult_array); // -- Cleanup @@ -543,22 +427,16 @@ PetscErrorCode SetupLibceedFineLevel(DM dm, DM dm_energy, DM dm_diagnostic, // Create the QFunction and Operator that computes the strain energy // --------------------------------------------------------------------------- // -- QFunction - CeedQFunctionCreateInterior(ceed, 1, problem_data.energy, - problem_data.energy_loc, &qf_energy); - CeedQFunctionAddInput(qf_energy, "du", num_comp_u*dim, CEED_EVAL_GRAD); + CeedQFunctionCreateInterior(ceed, 1, problem_data.energy, problem_data.energy_loc, &qf_energy); + CeedQFunctionAddInput(qf_energy, "du", num_comp_u * dim, CEED_EVAL_GRAD); CeedQFunctionAddInput(qf_energy, "qdata", q_data_size, CEED_EVAL_NONE); CeedQFunctionAddOutput(qf_energy, "energy", num_comp_e, CEED_EVAL_INTERP); CeedQFunctionSetContext(qf_energy, phys_ctx); // -- Operator - CeedOperatorCreate(ceed, qf_energy, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, - &op_energy); - CeedOperatorSetField(op_energy, "du", data[fine_level]->elem_restr_u, - data[fine_level]->basis_u, CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op_energy, "qdata", - data[fine_level]->elem_restr_geo_data_i, - CEED_BASIS_COLLOCATED, data[fine_level]->geo_data); - CeedOperatorSetField(op_energy, "energy", data[fine_level]->elem_restr_energy, - 
data[fine_level]->basis_energy, CEED_VECTOR_ACTIVE); + CeedOperatorCreate(ceed, qf_energy, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_energy); + CeedOperatorSetField(op_energy, "du", data[fine_level]->elem_restr_u, data[fine_level]->basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_energy, "qdata", data[fine_level]->elem_restr_geo_data_i, CEED_BASIS_COLLOCATED, data[fine_level]->geo_data); + CeedOperatorSetField(op_energy, "energy", data[fine_level]->elem_restr_energy, data[fine_level]->basis_energy, CEED_VECTOR_ACTIVE); // -- Save libCEED data data[fine_level]->qf_energy = qf_energy; data[fine_level]->op_energy = op_energy; @@ -571,27 +449,19 @@ PetscErrorCode SetupLibceedFineLevel(DM dm, DM dm_energy, DM dm_diagnostic, // Geometric factors // -- Coordinate basis CeedBasis basis_x; - CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp_x, 2, Q, CEED_GAUSS_LOBATTO, - &basis_x); + CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp_x, 2, Q, CEED_GAUSS_LOBATTO, &basis_x); // -- QFunction - CeedQFunctionCreateInterior(ceed, 1, problem_data.setup_geo, - problem_data.setup_geo_loc, &qf_setup_geo); - CeedQFunctionAddInput(qf_setup_geo, "dx", num_comp_x*dim, CEED_EVAL_GRAD); + CeedQFunctionCreateInterior(ceed, 1, problem_data.setup_geo, problem_data.setup_geo_loc, &qf_setup_geo); + CeedQFunctionAddInput(qf_setup_geo, "dx", num_comp_x * dim, CEED_EVAL_GRAD); CeedQFunctionAddInput(qf_setup_geo, "weight", 1, CEED_EVAL_WEIGHT); CeedQFunctionAddOutput(qf_setup_geo, "qdata", q_data_size, CEED_EVAL_NONE); // -- Operator - CeedOperatorCreate(ceed, qf_setup_geo, CEED_QFUNCTION_NONE, - CEED_QFUNCTION_NONE, &op_setup_geo); - CeedOperatorSetField(op_setup_geo, "dx", data[fine_level]->elem_restr_x, - basis_x, CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op_setup_geo, "weight", CEED_ELEMRESTRICTION_NONE, - basis_x, CEED_VECTOR_NONE); - CeedOperatorSetField(op_setup_geo, "qdata", - data[fine_level]->elem_restr_geo_data_diagnostic_i, - CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE); + 
CeedOperatorCreate(ceed, qf_setup_geo, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup_geo); + CeedOperatorSetField(op_setup_geo, "dx", data[fine_level]->elem_restr_x, basis_x, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup_geo, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup_geo, "qdata", data[fine_level]->elem_restr_geo_data_diagnostic_i, CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE); // -- Compute the quadrature data - CeedOperatorApply(op_setup_geo, x_coord, data[fine_level]->geo_data_diagnostic, - CEED_REQUEST_IMMEDIATE); + CeedOperatorApply(op_setup_geo, x_coord, data[fine_level]->geo_data_diagnostic, CEED_REQUEST_IMMEDIATE); // -- Cleanup CeedBasisDestroy(&basis_x); CeedQFunctionDestroy(&qf_setup_geo); @@ -599,27 +469,19 @@ PetscErrorCode SetupLibceedFineLevel(DM dm, DM dm_energy, DM dm_diagnostic, // Diagnostic quantities // -- QFunction - CeedQFunctionCreateInterior(ceed, 1, problem_data.diagnostic, - problem_data.diagnostic_loc, &qf_diagnostic); + CeedQFunctionCreateInterior(ceed, 1, problem_data.diagnostic, problem_data.diagnostic_loc, &qf_diagnostic); CeedQFunctionAddInput(qf_diagnostic, "u", num_comp_u, CEED_EVAL_INTERP); - CeedQFunctionAddInput(qf_diagnostic, "du", num_comp_u*dim, CEED_EVAL_GRAD); + CeedQFunctionAddInput(qf_diagnostic, "du", num_comp_u * dim, CEED_EVAL_GRAD); CeedQFunctionAddInput(qf_diagnostic, "qdata", q_data_size, CEED_EVAL_NONE); - CeedQFunctionAddOutput(qf_diagnostic, "diagnostic values", - num_comp_u + num_comp_d, CEED_EVAL_NONE); + CeedQFunctionAddOutput(qf_diagnostic, "diagnostic values", num_comp_u + num_comp_d, CEED_EVAL_NONE); CeedQFunctionSetContext(qf_diagnostic, phys_ctx); // -- Operator - CeedOperatorCreate(ceed, qf_diagnostic, CEED_QFUNCTION_NONE, - CEED_QFUNCTION_NONE, &op_diagnostic); - CeedOperatorSetField(op_diagnostic, "u", data[fine_level]->elem_restr_u, - data[fine_level]->basis_diagnostic, CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op_diagnostic, "du", 
data[fine_level]->elem_restr_u, - data[fine_level]->basis_diagnostic, CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op_diagnostic, "qdata", - data[fine_level]->elem_restr_geo_data_diagnostic_i, - CEED_BASIS_COLLOCATED, data[fine_level]->geo_data_diagnostic); - CeedOperatorSetField(op_diagnostic, "diagnostic values", - data[fine_level]->elem_restr_diagnostic, - CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE); + CeedOperatorCreate(ceed, qf_diagnostic, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_diagnostic); + CeedOperatorSetField(op_diagnostic, "u", data[fine_level]->elem_restr_u, data[fine_level]->basis_diagnostic, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_diagnostic, "du", data[fine_level]->elem_restr_u, data[fine_level]->basis_diagnostic, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_diagnostic, "qdata", data[fine_level]->elem_restr_geo_data_diagnostic_i, CEED_BASIS_COLLOCATED, + data[fine_level]->geo_data_diagnostic); + CeedOperatorSetField(op_diagnostic, "diagnostic values", data[fine_level]->elem_restr_diagnostic, CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE); // -- Save libCEED data data[fine_level]->qf_diagnostic = qf_diagnostic; data[fine_level]->op_diagnostic = op_diagnostic; @@ -634,37 +496,29 @@ PetscErrorCode SetupLibceedFineLevel(DM dm, DM dm_energy, DM dm_diagnostic, // Set up libCEED multigrid level for a given degree // Prolongation and Restriction are between level and level+1 -PetscErrorCode SetupLibceedLevel(DM dm, Ceed ceed, AppCtx app_ctx, - ProblemData problem_data, PetscInt level, - PetscInt num_comp_u, PetscInt U_g_size, - PetscInt U_loc_size, CeedVector fine_mult, - CeedData *data) { - PetscErrorCode ierr; - CeedInt fine_level = app_ctx->num_levels - 1; - CeedInt P = app_ctx->level_degrees[level] + 1; - CeedInt Q = app_ctx->level_degrees[fine_level] + 1 + app_ctx->q_extra; - CeedInt dim; - CeedOperator op_jacobian, op_prolong, op_restrict; +PetscErrorCode SetupLibceedLevel(DM dm, Ceed ceed, AppCtx app_ctx, ProblemData problem_data, PetscInt 
level, PetscInt num_comp_u, PetscInt U_g_size, + PetscInt U_loc_size, CeedVector fine_mult, CeedData *data) { + CeedInt fine_level = app_ctx->num_levels - 1; + CeedInt P = app_ctx->level_degrees[level] + 1; + CeedInt Q = app_ctx->level_degrees[fine_level] + 1 + app_ctx->q_extra; + CeedInt dim; + CeedOperator op_jacobian, op_prolong, op_restrict; PetscFunctionBeginUser; - ierr = DMGetDimension(dm, &dim); CHKERRQ(ierr); + PetscCall(DMGetDimension(dm, &dim)); // --------------------------------------------------------------------------- // libCEED restrictions // --------------------------------------------------------------------------- // -- Solution restriction - ierr = CreateRestrictionFromPlex(ceed, dm, 0, 0, 0, - &data[level]->elem_restr_u); - CHKERRQ(ierr); + PetscCall(CreateRestrictionFromPlex(ceed, dm, 0, 0, 0, &data[level]->elem_restr_u)); // --------------------------------------------------------------------------- // libCEED bases // --------------------------------------------------------------------------- // -- Solution basis - CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp_u, P, Q, - problem_data.quadrature_mode, - &data[level]->basis_u); + CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp_u, P, Q, problem_data.quadrature_mode, &data[level]->basis_u); // --------------------------------------------------------------------------- // Persistent libCEED vectors @@ -678,14 +532,13 @@ PetscErrorCode SetupLibceedLevel(DM dm, Ceed ceed, AppCtx app_ctx, // Create the Operators that compute the prolongation and // restriction between the p-multigrid levels and the coarse grid eval. 
// --------------------------------------------------------------------------- - CeedOperatorMultigridLevelCreate(data[level+1]->op_jacobian, fine_mult, - data[level]->elem_restr_u, data[level]->basis_u, - &op_jacobian, &op_prolong, &op_restrict); + CeedOperatorMultigridLevelCreate(data[level + 1]->op_jacobian, fine_mult, data[level]->elem_restr_u, data[level]->basis_u, &op_jacobian, + &op_prolong, &op_restrict); // -- Save libCEED data - data[level]->op_jacobian = op_jacobian; - data[level+1]->op_prolong = op_prolong; - data[level+1]->op_restrict = op_restrict; + data[level]->op_jacobian = op_jacobian; + data[level + 1]->op_prolong = op_prolong; + data[level + 1]->op_restrict = op_restrict; PetscFunctionReturn(0); }; diff --git a/gallery/ceed-gallery-weak.c b/gallery/ceed-gallery-weak.c index 62a887edc0..a9114c52f0 100644 --- a/gallery/ceed-gallery-weak.c +++ b/gallery/ceed-gallery-weak.c @@ -17,8 +17,8 @@ static int CeedQFunctionRegister_Weak(const char *name) { } // LCOV_EXCL_STOP -#define MACRO(name) \ - CEED_INTERN int name(void) __attribute__((weak)); \ - int name(void) { return CeedQFunctionRegister_Weak(__func__); } +#define MACRO(name) \ + CEED_INTERN int name(void) __attribute__((weak)); \ + int name(void) { return CeedQFunctionRegister_Weak(__func__); } #include "ceed-gallery-list.h" #undef MACRO diff --git a/gallery/identity/ceed-identity.c b/gallery/identity/ceed-identity.c index 857a2ed808..68091ecebf 100644 --- a/gallery/identity/ceed-identity.c +++ b/gallery/identity/ceed-identity.c @@ -5,45 +5,37 @@ // // This file is part of CEED: http://github.com/ceed -#include #include +#include +#include #include #include -#include /** @brief Set fields identity QFunction that copies inputs directly into outputs **/ -static int CeedQFunctionInit_Identity(Ceed ceed, const char *requested, - CeedQFunction qf) { - int ierr; - +static int CeedQFunctionInit_Identity(Ceed ceed, const char *requested, CeedQFunction qf) { // Check QFunction name const char *name = 
"Identity"; - if (strcmp(name, requested)) + if (strcmp(name, requested)) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, - "QFunction '%s' does not match requested name: %s", - name, requested); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested); + // LCOV_EXCL_STOP + } // QFunction fields 'input' and 'output' with requested emodes added // by the library rather than being added here - ierr = CeedQFunctionSetUserFlopsEstimate(qf, 0); CeedChk(ierr); + CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 0)); // Context data CeedQFunctionContext ctx; - IdentityCtx ctx_data = {.size = 1}; - ierr = CeedQFunctionContextCreate(ceed, &ctx); CeedChk(ierr); - ierr = CeedQFunctionContextSetData(ctx, CEED_MEM_HOST, CEED_COPY_VALUES, - sizeof(ctx_data), (void *)&ctx_data); - CeedChk(ierr); - ierr = CeedQFunctionContextRegisterInt32(ctx, "size", - offsetof(IdentityCtx, size), 1, "field size of identity QFunction"); - CeedChk(ierr); - ierr = CeedQFunctionSetContext(qf, ctx); CeedChk(ierr); - ierr = CeedQFunctionContextDestroy(&ctx); CeedChk(ierr); + IdentityCtx ctx_data = {.size = 1}; + CeedCall(CeedQFunctionContextCreate(ceed, &ctx)); + CeedCall(CeedQFunctionContextSetData(ctx, CEED_MEM_HOST, CEED_COPY_VALUES, sizeof(ctx_data), (void *)&ctx_data)); + CeedCall(CeedQFunctionContextRegisterInt32(ctx, "size", offsetof(IdentityCtx, size), 1, "field size of identity QFunction")); + CeedCall(CeedQFunctionSetContext(qf, ctx)); + CeedCall(CeedQFunctionContextDestroy(&ctx)); return CEED_ERROR_SUCCESS; } @@ -52,6 +44,5 @@ static int CeedQFunctionInit_Identity(Ceed ceed, const char *requested, @brief Register identity QFunction that copies inputs directly into outputs **/ CEED_INTERN int CeedQFunctionRegister_Identity(void) { - return CeedQFunctionRegister("Identity", Identity_loc, 1, Identity, - CeedQFunctionInit_Identity); + return CeedQFunctionRegister("Identity", Identity_loc, 1, 
Identity, CeedQFunctionInit_Identity); } diff --git a/gallery/mass-vector/ceed-vectormassapply.c b/gallery/mass-vector/ceed-vectormassapply.c index 29d9e5e0a7..c1c413aa8c 100644 --- a/gallery/mass-vector/ceed-vectormassapply.c +++ b/gallery/mass-vector/ceed-vectormassapply.c @@ -5,37 +5,31 @@ // // This file is part of CEED: http://github.com/ceed -#include #include -#include +#include #include +#include /** @brief Set fields for Ceed QFunction for applying the mass matrix on a vector system with three components **/ -static int CeedQFunctionInit_Vector3MassApply(Ceed ceed, const char *requested, - CeedQFunction qf) { - int ierr; - +static int CeedQFunctionInit_Vector3MassApply(Ceed ceed, const char *requested, CeedQFunction qf) { // Check QFunction name const char *name = "Vector3MassApply"; - if (strcmp(name, requested)) + if (strcmp(name, requested)) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, - "QFunction '%s' does not match requested name: %s", - name, requested); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested); + // LCOV_EXCL_STOP + } // Add QFunction fields const CeedInt num_comp = 3; - ierr = CeedQFunctionAddInput(qf, "u", num_comp, CEED_EVAL_INTERP); - CeedChk(ierr); - ierr = CeedQFunctionAddInput(qf, "qdata", 1, CEED_EVAL_NONE); CeedChk(ierr); - ierr = CeedQFunctionAddOutput(qf, "v", num_comp, CEED_EVAL_INTERP); - CeedChk(ierr); + CeedCall(CeedQFunctionAddInput(qf, "u", num_comp, CEED_EVAL_INTERP)); + CeedCall(CeedQFunctionAddInput(qf, "qdata", 1, CEED_EVAL_NONE)); + CeedCall(CeedQFunctionAddOutput(qf, "v", num_comp, CEED_EVAL_INTERP)); - ierr = CeedQFunctionSetUserFlopsEstimate(qf, num_comp); CeedChk(ierr); + CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, num_comp)); return CEED_ERROR_SUCCESS; } @@ -45,6 +39,5 @@ static int CeedQFunctionInit_Vector3MassApply(Ceed ceed, const char *requested, with three components **/ CEED_INTERN int 
CeedQFunctionRegister_Vector3MassApply(void) { - return CeedQFunctionRegister("Vector3MassApply", Vector3MassApply_loc, 1, - Vector3MassApply, CeedQFunctionInit_Vector3MassApply); + return CeedQFunctionRegister("Vector3MassApply", Vector3MassApply_loc, 1, Vector3MassApply, CeedQFunctionInit_Vector3MassApply); } diff --git a/gallery/mass/ceed-mass1dbuild.c b/gallery/mass/ceed-mass1dbuild.c index 6506125a82..a96b6871ef 100644 --- a/gallery/mass/ceed-mass1dbuild.c +++ b/gallery/mass/ceed-mass1dbuild.c @@ -5,37 +5,31 @@ // // This file is part of CEED: http://github.com/ceed -#include #include -#include +#include #include +#include /** @brief Set fields for Ceed QFunction building the geometric data for the 1D mass matrix **/ -static int CeedQFunctionInit_Mass1DBuild(Ceed ceed, const char *requested, - CeedQFunction qf) { - int ierr; - +static int CeedQFunctionInit_Mass1DBuild(Ceed ceed, const char *requested, CeedQFunction qf) { // Check QFunction name const char *name = "Mass1DBuild"; - if (strcmp(name, requested)) + if (strcmp(name, requested)) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, - "QFunction '%s' does not match requested name: %s", - name, requested); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested); + // LCOV_EXCL_STOP + } // Add QFunction fields const CeedInt dim = 1; - ierr = CeedQFunctionAddInput(qf, "dx", dim*dim, CEED_EVAL_GRAD); - CeedChk(ierr); - ierr = CeedQFunctionAddInput(qf, "weights", 1, CEED_EVAL_WEIGHT); - CeedChk(ierr); - ierr = CeedQFunctionAddOutput(qf, "qdata", 1, CEED_EVAL_NONE); CeedChk(ierr); + CeedCall(CeedQFunctionAddInput(qf, "dx", dim * dim, CEED_EVAL_GRAD)); + CeedCall(CeedQFunctionAddInput(qf, "weights", 1, CEED_EVAL_WEIGHT)); + CeedCall(CeedQFunctionAddOutput(qf, "qdata", 1, CEED_EVAL_NONE)); - ierr = CeedQFunctionSetUserFlopsEstimate(qf, 1); CeedChk(ierr); + CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 1)); 
return CEED_ERROR_SUCCESS; } @@ -45,6 +39,5 @@ static int CeedQFunctionInit_Mass1DBuild(Ceed ceed, const char *requested, matrix **/ CEED_INTERN int CeedQFunctionRegister_Mass1DBuild(void) { - return CeedQFunctionRegister("Mass1DBuild", Mass1DBuild_loc, 1, Mass1DBuild, - CeedQFunctionInit_Mass1DBuild); + return CeedQFunctionRegister("Mass1DBuild", Mass1DBuild_loc, 1, Mass1DBuild, CeedQFunctionInit_Mass1DBuild); } diff --git a/gallery/mass/ceed-mass2dbuild.c b/gallery/mass/ceed-mass2dbuild.c index ce3aeb00f2..527f9964fe 100644 --- a/gallery/mass/ceed-mass2dbuild.c +++ b/gallery/mass/ceed-mass2dbuild.c @@ -5,37 +5,31 @@ // // This file is part of CEED: http://github.com/ceed -#include #include -#include +#include #include +#include /** @brief Set fields for Ceed QFunction building the geometric data for the 2D mass matrix **/ -static int CeedQFunctionInit_Mass2DBuild(Ceed ceed, const char *requested, - CeedQFunction qf) { - int ierr; - +static int CeedQFunctionInit_Mass2DBuild(Ceed ceed, const char *requested, CeedQFunction qf) { // Check QFunction name const char *name = "Mass2DBuild"; - if (strcmp(name, requested)) + if (strcmp(name, requested)) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, - "QFunction '%s' does not match requested name: %s", - name, requested); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested); + // LCOV_EXCL_STOP + } // Add QFunction fields const CeedInt dim = 2; - ierr = CeedQFunctionAddInput(qf, "dx", dim*dim, CEED_EVAL_GRAD); - CeedChk(ierr); - ierr = CeedQFunctionAddInput(qf, "weights", 1, CEED_EVAL_WEIGHT); - CeedChk(ierr); - ierr = CeedQFunctionAddOutput(qf, "qdata", 1, CEED_EVAL_NONE); CeedChk(ierr); + CeedCall(CeedQFunctionAddInput(qf, "dx", dim * dim, CEED_EVAL_GRAD)); + CeedCall(CeedQFunctionAddInput(qf, "weights", 1, CEED_EVAL_WEIGHT)); + CeedCall(CeedQFunctionAddOutput(qf, "qdata", 1, CEED_EVAL_NONE)); - ierr = 
CeedQFunctionSetUserFlopsEstimate(qf, 4); CeedChk(ierr); + CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 4)); return CEED_ERROR_SUCCESS; } @@ -45,6 +39,5 @@ static int CeedQFunctionInit_Mass2DBuild(Ceed ceed, const char *requested, matrix **/ CEED_INTERN int CeedQFunctionRegister_Mass2DBuild(void) { - return CeedQFunctionRegister("Mass2DBuild", Mass2DBuild_loc, 1, Mass2DBuild, - CeedQFunctionInit_Mass2DBuild); + return CeedQFunctionRegister("Mass2DBuild", Mass2DBuild_loc, 1, Mass2DBuild, CeedQFunctionInit_Mass2DBuild); } diff --git a/gallery/mass/ceed-mass3dbuild.c b/gallery/mass/ceed-mass3dbuild.c index ec8de0c671..d722986168 100644 --- a/gallery/mass/ceed-mass3dbuild.c +++ b/gallery/mass/ceed-mass3dbuild.c @@ -5,37 +5,31 @@ // // This file is part of CEED: http://github.com/ceed -#include #include -#include +#include #include +#include /** @brief Set fields for Ceed QFunction building the geometric data for the 3D mass matrix **/ -static int CeedQFunctionInit_Mass3DBuild(Ceed ceed, const char *requested, - CeedQFunction qf) { - int ierr; - +static int CeedQFunctionInit_Mass3DBuild(Ceed ceed, const char *requested, CeedQFunction qf) { // Check QFunction name const char *name = "Mass3DBuild"; - if (strcmp(name, requested)) + if (strcmp(name, requested)) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, - "QFunction '%s' does not match requested name: %s", - name, requested); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested); + // LCOV_EXCL_STOP + } // Add QFunction fields const CeedInt dim = 3; - ierr = CeedQFunctionAddInput(qf, "dx", dim*dim, CEED_EVAL_GRAD); - CeedChk(ierr); - ierr = CeedQFunctionAddInput(qf, "weights", 1, CEED_EVAL_WEIGHT); - CeedChk(ierr); - ierr = CeedQFunctionAddOutput(qf, "qdata", 1, CEED_EVAL_NONE); CeedChk(ierr); + CeedCall(CeedQFunctionAddInput(qf, "dx", dim * dim, CEED_EVAL_GRAD)); + CeedCall(CeedQFunctionAddInput(qf, "weights", 
1, CEED_EVAL_WEIGHT)); + CeedCall(CeedQFunctionAddOutput(qf, "qdata", 1, CEED_EVAL_NONE)); - ierr = CeedQFunctionSetUserFlopsEstimate(qf, 15); CeedChk(ierr); + CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 15)); return CEED_ERROR_SUCCESS; } @@ -45,6 +39,5 @@ static int CeedQFunctionInit_Mass3DBuild(Ceed ceed, const char *requested, matrix **/ CEED_INTERN int CeedQFunctionRegister_Mass3DBuild(void) { - return CeedQFunctionRegister("Mass3DBuild", Mass3DBuild_loc, 1, Mass3DBuild, - CeedQFunctionInit_Mass3DBuild); + return CeedQFunctionRegister("Mass3DBuild", Mass3DBuild_loc, 1, Mass3DBuild, CeedQFunctionInit_Mass3DBuild); } diff --git a/gallery/mass/ceed-massapply.c b/gallery/mass/ceed-massapply.c index a110cfd52f..99180d9ec3 100644 --- a/gallery/mass/ceed-massapply.c +++ b/gallery/mass/ceed-massapply.c @@ -5,33 +5,29 @@ // // This file is part of CEED: http://github.com/ceed -#include #include -#include +#include #include +#include /** @brief Set fields for Ceed QFunction for applying the mass matrix **/ -static int CeedQFunctionInit_MassApply(Ceed ceed, const char *requested, - CeedQFunction qf) { - int ierr; - +static int CeedQFunctionInit_MassApply(Ceed ceed, const char *requested, CeedQFunction qf) { // Check QFunction name const char *name = "MassApply"; - if (strcmp(name, requested)) + if (strcmp(name, requested)) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, - "QFunction '%s' does not match requested name: %s", - name, requested); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested); + // LCOV_EXCL_STOP + } // Add QFunction fields - ierr = CeedQFunctionAddInput(qf, "u", 1, CEED_EVAL_INTERP); CeedChk(ierr); - ierr = CeedQFunctionAddInput(qf, "qdata", 1, CEED_EVAL_NONE); CeedChk(ierr); - ierr = CeedQFunctionAddOutput(qf, "v", 1, CEED_EVAL_INTERP); CeedChk(ierr); + CeedCall(CeedQFunctionAddInput(qf, "u", 1, CEED_EVAL_INTERP)); + 
CeedCall(CeedQFunctionAddInput(qf, "qdata", 1, CEED_EVAL_NONE)); + CeedCall(CeedQFunctionAddOutput(qf, "v", 1, CEED_EVAL_INTERP)); - ierr = CeedQFunctionSetUserFlopsEstimate(qf, 1); CeedChk(ierr); + CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 1)); return CEED_ERROR_SUCCESS; } @@ -40,6 +36,5 @@ static int CeedQFunctionInit_MassApply(Ceed ceed, const char *requested, @brief Register Ceed QFunction for applying the mass matrix **/ CEED_INTERN int CeedQFunctionRegister_MassApply(void) { - return CeedQFunctionRegister("MassApply", MassApply_loc, 1, MassApply, - CeedQFunctionInit_MassApply); + return CeedQFunctionRegister("MassApply", MassApply_loc, 1, MassApply, CeedQFunctionInit_MassApply); } diff --git a/gallery/poisson-vector/ceed-vectorpoisson1dapply.c b/gallery/poisson-vector/ceed-vectorpoisson1dapply.c index cd54a03446..050004f252 100644 --- a/gallery/poisson-vector/ceed-vectorpoisson1dapply.c +++ b/gallery/poisson-vector/ceed-vectorpoisson1dapply.c @@ -5,39 +5,31 @@ // // This file is part of CEED: http://github.com/ceed -#include #include -#include +#include #include +#include /** @brief Set fields for Ceed QFunction applying the 1D Poisson operator on a vector system with three components **/ -static int CeedQFunctionInit_Vector3Poisson1DApply(Ceed ceed, - const char *requested, - CeedQFunction qf) { - int ierr; - +static int CeedQFunctionInit_Vector3Poisson1DApply(Ceed ceed, const char *requested, CeedQFunction qf) { // Check QFunction name const char *name = "Vector3Poisson1DApply"; - if (strcmp(name, requested)) + if (strcmp(name, requested)) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, - "QFunction '%s' does not match requested name: %s", - name, requested); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested); + // LCOV_EXCL_STOP + } // Add QFunction fields const CeedInt dim = 1, num_comp = 3; - ierr = CeedQFunctionAddInput(qf, "du", 
num_comp*dim, CEED_EVAL_GRAD); - CeedChk(ierr); - ierr = CeedQFunctionAddInput(qf, "qdata", dim*(dim+1)/2, CEED_EVAL_NONE); - CeedChk(ierr); - ierr = CeedQFunctionAddOutput(qf, "dv", num_comp*dim, CEED_EVAL_GRAD); - CeedChk(ierr); + CeedCall(CeedQFunctionAddInput(qf, "du", num_comp * dim, CEED_EVAL_GRAD)); + CeedCall(CeedQFunctionAddInput(qf, "qdata", dim * (dim + 1) / 2, CEED_EVAL_NONE)); + CeedCall(CeedQFunctionAddOutput(qf, "dv", num_comp * dim, CEED_EVAL_GRAD)); - ierr = CeedQFunctionSetUserFlopsEstimate(qf, num_comp); CeedChk(ierr); + CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, num_comp)); return CEED_ERROR_SUCCESS; } @@ -47,7 +39,5 @@ static int CeedQFunctionInit_Vector3Poisson1DApply(Ceed ceed, on a vector system with three components **/ CEED_INTERN int CeedQFunctionRegister_Vector3Poisson1DApply(void) { - return CeedQFunctionRegister("Vector3Poisson1DApply", Vector3Poisson1DApply_loc, - 1, Vector3Poisson1DApply, - CeedQFunctionInit_Vector3Poisson1DApply); + return CeedQFunctionRegister("Vector3Poisson1DApply", Vector3Poisson1DApply_loc, 1, Vector3Poisson1DApply, CeedQFunctionInit_Vector3Poisson1DApply); } diff --git a/gallery/poisson-vector/ceed-vectorpoisson2dapply.c b/gallery/poisson-vector/ceed-vectorpoisson2dapply.c index 66fc448ae5..88c842a49c 100644 --- a/gallery/poisson-vector/ceed-vectorpoisson2dapply.c +++ b/gallery/poisson-vector/ceed-vectorpoisson2dapply.c @@ -5,39 +5,31 @@ // // This file is part of CEED: http://github.com/ceed -#include #include -#include +#include #include +#include /** @brief Set fields for Ceed QFunction applying the 2D Poisson operator on a vector system with three components **/ -static int CeedQFunctionInit_Vector3Poisson2DApply(Ceed ceed, - const char *requested, - CeedQFunction qf) { - int ierr; - +static int CeedQFunctionInit_Vector3Poisson2DApply(Ceed ceed, const char *requested, CeedQFunction qf) { // Check QFunction name const char *name = "Vector3Poisson2DApply"; - if (strcmp(name, requested)) + if 
(strcmp(name, requested)) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, - "QFunction '%s' does not match requested name: %s", - name, requested); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested); + // LCOV_EXCL_STOP + } // Add QFunction fields const CeedInt dim = 2, num_comp = 3; - ierr = CeedQFunctionAddInput(qf, "du", num_comp*dim, CEED_EVAL_GRAD); - CeedChk(ierr); - ierr = CeedQFunctionAddInput(qf, "qdata", dim*(dim+1)/2, CEED_EVAL_NONE); - CeedChk(ierr); - ierr = CeedQFunctionAddOutput(qf, "dv", num_comp*dim, CEED_EVAL_GRAD); - CeedChk(ierr); + CeedCall(CeedQFunctionAddInput(qf, "du", num_comp * dim, CEED_EVAL_GRAD)); + CeedCall(CeedQFunctionAddInput(qf, "qdata", dim * (dim + 1) / 2, CEED_EVAL_NONE)); + CeedCall(CeedQFunctionAddOutput(qf, "dv", num_comp * dim, CEED_EVAL_GRAD)); - ierr = CeedQFunctionSetUserFlopsEstimate(qf, num_comp * 6); CeedChk(ierr); + CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, num_comp * 6)); return CEED_ERROR_SUCCESS; } @@ -47,7 +39,5 @@ static int CeedQFunctionInit_Vector3Poisson2DApply(Ceed ceed, on a vector system with three components **/ CEED_INTERN int CeedQFunctionRegister_Vector3Poisson2DApply(void) { - return CeedQFunctionRegister("Vector3Poisson2DApply", Vector3Poisson2DApply_loc, - 1, Vector3Poisson2DApply, - CeedQFunctionInit_Vector3Poisson2DApply); + return CeedQFunctionRegister("Vector3Poisson2DApply", Vector3Poisson2DApply_loc, 1, Vector3Poisson2DApply, CeedQFunctionInit_Vector3Poisson2DApply); } diff --git a/gallery/poisson-vector/ceed-vectorpoisson3dapply.c b/gallery/poisson-vector/ceed-vectorpoisson3dapply.c index bf924d492b..9c5d982c2e 100644 --- a/gallery/poisson-vector/ceed-vectorpoisson3dapply.c +++ b/gallery/poisson-vector/ceed-vectorpoisson3dapply.c @@ -5,39 +5,31 @@ // // This file is part of CEED: http://github.com/ceed -#include #include -#include +#include #include +#include /** @brief Set fields 
for Ceed QFunction applying the 3D Poisson operator on a vector system with three components **/ -static int CeedQFunctionInit_Vector3Poisson3DApply(Ceed ceed, - const char *requested, - CeedQFunction qf) { - int ierr; - +static int CeedQFunctionInit_Vector3Poisson3DApply(Ceed ceed, const char *requested, CeedQFunction qf) { // Check QFunction name const char *name = "Vector3Poisson3DApply"; - if (strcmp(name, requested)) + if (strcmp(name, requested)) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, - "QFunction '%s' does not match requested name: %s", - name, requested); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested); + // LCOV_EXCL_STOP + } // Add QFunction fields const CeedInt dim = 3, num_comp = 3; - ierr = CeedQFunctionAddInput(qf, "du", num_comp*dim, CEED_EVAL_GRAD); - CeedChk(ierr); - ierr = CeedQFunctionAddInput(qf, "qdata", dim*(dim+1)/2, CEED_EVAL_NONE); - CeedChk(ierr); - ierr = CeedQFunctionAddOutput(qf, "dv", num_comp*dim, CEED_EVAL_GRAD); - CeedChk(ierr); + CeedCall(CeedQFunctionAddInput(qf, "du", num_comp * dim, CEED_EVAL_GRAD)); + CeedCall(CeedQFunctionAddInput(qf, "qdata", dim * (dim + 1) / 2, CEED_EVAL_NONE)); + CeedCall(CeedQFunctionAddOutput(qf, "dv", num_comp * dim, CEED_EVAL_GRAD)); - ierr = CeedQFunctionSetUserFlopsEstimate(qf, num_comp * 15); CeedChk(ierr); + CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, num_comp * 15)); return CEED_ERROR_SUCCESS; } @@ -47,7 +39,5 @@ static int CeedQFunctionInit_Vector3Poisson3DApply(Ceed ceed, on a vector system with three components **/ CEED_INTERN int CeedQFunctionRegister_Vector3Poisson3DApply(void) { - return CeedQFunctionRegister("Vector3Poisson3DApply", Vector3Poisson3DApply_loc, - 1, Vector3Poisson3DApply, - CeedQFunctionInit_Vector3Poisson3DApply); + return CeedQFunctionRegister("Vector3Poisson3DApply", Vector3Poisson3DApply_loc, 1, Vector3Poisson3DApply, 
CeedQFunctionInit_Vector3Poisson3DApply); } diff --git a/gallery/poisson/ceed-poisson1dapply.c b/gallery/poisson/ceed-poisson1dapply.c index c6e7d7cdfa..bf470b00ba 100644 --- a/gallery/poisson/ceed-poisson1dapply.c +++ b/gallery/poisson/ceed-poisson1dapply.c @@ -5,35 +5,30 @@ // // This file is part of CEED: http://github.com/ceed -#include #include -#include +#include #include +#include /** @brief Set fields for Ceed QFunction applying the 1D Poisson operator **/ -static int CeedQFunctionInit_Poisson1DApply(Ceed ceed, const char *requested, - CeedQFunction qf) { - int ierr; - +static int CeedQFunctionInit_Poisson1DApply(Ceed ceed, const char *requested, CeedQFunction qf) { // Check QFunction name const char *name = "Poisson1DApply"; - if (strcmp(name, requested)) + if (strcmp(name, requested)) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, - "QFunction '%s' does not match requested name: %s", - name, requested); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested); + // LCOV_EXCL_STOP + } // Add QFunction fields const CeedInt dim = 1; - ierr = CeedQFunctionAddInput(qf, "du", dim, CEED_EVAL_GRAD); CeedChk(ierr); - ierr = CeedQFunctionAddInput(qf, "qdata", dim*(dim+1)/2, CEED_EVAL_NONE); - CeedChk(ierr); - ierr = CeedQFunctionAddOutput(qf, "dv", dim, CEED_EVAL_GRAD); CeedChk(ierr); + CeedCall(CeedQFunctionAddInput(qf, "du", dim, CEED_EVAL_GRAD)); + CeedCall(CeedQFunctionAddInput(qf, "qdata", dim * (dim + 1) / 2, CEED_EVAL_NONE)); + CeedCall(CeedQFunctionAddOutput(qf, "dv", dim, CEED_EVAL_GRAD)); - ierr = CeedQFunctionSetUserFlopsEstimate(qf, 1); CeedChk(ierr); + CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 1)); return CEED_ERROR_SUCCESS; } @@ -42,6 +37,5 @@ static int CeedQFunctionInit_Poisson1DApply(Ceed ceed, const char *requested, @brief Register Ceed QFunction for applying the 1D Poisson operator **/ CEED_INTERN int 
CeedQFunctionRegister_Poisson1DApply(void) { - return CeedQFunctionRegister("Poisson1DApply", Poisson1DApply_loc, 1, - Poisson1DApply, CeedQFunctionInit_Poisson1DApply); + return CeedQFunctionRegister("Poisson1DApply", Poisson1DApply_loc, 1, Poisson1DApply, CeedQFunctionInit_Poisson1DApply); } diff --git a/gallery/poisson/ceed-poisson1dbuild.c b/gallery/poisson/ceed-poisson1dbuild.c index 20d418aa68..47f000d4f1 100644 --- a/gallery/poisson/ceed-poisson1dbuild.c +++ b/gallery/poisson/ceed-poisson1dbuild.c @@ -5,38 +5,31 @@ // // This file is part of CEED: http://github.com/ceed -#include #include -#include +#include #include +#include /** @brief Set fields for Ceed QFunction building the geometric data for the 1D Poisson operator **/ -static int CeedQFunctionInit_Poisson1DBuild(Ceed ceed, const char *requested, - CeedQFunction qf) { - int ierr; - +static int CeedQFunctionInit_Poisson1DBuild(Ceed ceed, const char *requested, CeedQFunction qf) { // Check QFunction name const char *name = "Poisson1DBuild"; - if (strcmp(name, requested)) + if (strcmp(name, requested)) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, - "QFunction '%s' does not match requested name: %s", - name, requested); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested); + // LCOV_EXCL_STOP + } // Add QFunction fields const CeedInt dim = 1; - ierr = CeedQFunctionAddInput(qf, "dx", dim*dim, CEED_EVAL_GRAD); - CeedChk(ierr); - ierr = CeedQFunctionAddInput(qf, "weights", 1, CEED_EVAL_WEIGHT); - CeedChk(ierr); - ierr = CeedQFunctionAddOutput(qf, "qdata", dim*(dim+1)/2, CEED_EVAL_NONE); - CeedChk(ierr); + CeedCall(CeedQFunctionAddInput(qf, "dx", dim * dim, CEED_EVAL_GRAD)); + CeedCall(CeedQFunctionAddInput(qf, "weights", 1, CEED_EVAL_WEIGHT)); + CeedCall(CeedQFunctionAddOutput(qf, "qdata", dim * (dim + 1) / 2, CEED_EVAL_NONE)); - ierr = CeedQFunctionSetUserFlopsEstimate(qf, 1); CeedChk(ierr); + 
CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 1)); return CEED_ERROR_SUCCESS; } @@ -46,6 +39,5 @@ static int CeedQFunctionInit_Poisson1DBuild(Ceed ceed, const char *requested, Poisson operator **/ CEED_INTERN int CeedQFunctionRegister_Poisson1DBuild(void) { - return CeedQFunctionRegister("Poisson1DBuild", Poisson1DBuild_loc, 1, - Poisson1DBuild, CeedQFunctionInit_Poisson1DBuild); + return CeedQFunctionRegister("Poisson1DBuild", Poisson1DBuild_loc, 1, Poisson1DBuild, CeedQFunctionInit_Poisson1DBuild); } diff --git a/gallery/poisson/ceed-poisson2dapply.c b/gallery/poisson/ceed-poisson2dapply.c index e1f47b359d..374c62eed2 100644 --- a/gallery/poisson/ceed-poisson2dapply.c +++ b/gallery/poisson/ceed-poisson2dapply.c @@ -5,35 +5,30 @@ // // This file is part of CEED: http://github.com/ceed -#include #include -#include +#include #include +#include /** @brief Set fields for Ceed QFunction applying the 2D Poisson operator **/ -static int CeedQFunctionInit_Poisson2DApply(Ceed ceed, const char *requested, - CeedQFunction qf) { - int ierr; - +static int CeedQFunctionInit_Poisson2DApply(Ceed ceed, const char *requested, CeedQFunction qf) { // Check QFunction name const char *name = "Poisson2DApply"; - if (strcmp(name, requested)) + if (strcmp(name, requested)) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, - "QFunction '%s' does not match requested name: %s", - name, requested); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested); + // LCOV_EXCL_STOP + } // Add QFunction fields const CeedInt dim = 2; - ierr = CeedQFunctionAddInput(qf, "du", dim, CEED_EVAL_GRAD); CeedChk(ierr); - ierr = CeedQFunctionAddInput(qf, "qdata", dim*(dim+1)/2, CEED_EVAL_NONE); - CeedChk(ierr); - ierr = CeedQFunctionAddOutput(qf, "dv", dim, CEED_EVAL_GRAD); CeedChk(ierr); + CeedCall(CeedQFunctionAddInput(qf, "du", dim, CEED_EVAL_GRAD)); + CeedCall(CeedQFunctionAddInput(qf, "qdata", dim * (dim + 
1) / 2, CEED_EVAL_NONE)); + CeedCall(CeedQFunctionAddOutput(qf, "dv", dim, CEED_EVAL_GRAD)); - ierr = CeedQFunctionSetUserFlopsEstimate(qf, 6); CeedChk(ierr); + CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 6)); return CEED_ERROR_SUCCESS; } @@ -42,6 +37,5 @@ static int CeedQFunctionInit_Poisson2DApply(Ceed ceed, const char *requested, @brief Register Ceed QFunction for applying the 2D Poisson operator **/ CEED_INTERN int CeedQFunctionRegister_Poisson2DApply(void) { - return CeedQFunctionRegister("Poisson2DApply", Poisson2DApply_loc, 1, - Poisson2DApply, CeedQFunctionInit_Poisson2DApply); + return CeedQFunctionRegister("Poisson2DApply", Poisson2DApply_loc, 1, Poisson2DApply, CeedQFunctionInit_Poisson2DApply); } diff --git a/gallery/poisson/ceed-poisson2dbuild.c b/gallery/poisson/ceed-poisson2dbuild.c index f79896baca..c47d11d0b5 100644 --- a/gallery/poisson/ceed-poisson2dbuild.c +++ b/gallery/poisson/ceed-poisson2dbuild.c @@ -5,38 +5,31 @@ // // This file is part of CEED: http://github.com/ceed -#include #include -#include +#include #include +#include /** @brief Set fields for Ceed QFunction building the geometric data for the 2D Poisson operator **/ -static int CeedQFunctionInit_Poisson2DBuild(Ceed ceed, const char *requested, - CeedQFunction qf) { - int ierr; - +static int CeedQFunctionInit_Poisson2DBuild(Ceed ceed, const char *requested, CeedQFunction qf) { // Check QFunction name const char *name = "Poisson2DBuild"; - if (strcmp(name, requested)) + if (strcmp(name, requested)) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, - "QFunction '%s' does not match requested name: %s", - name, requested); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested); + // LCOV_EXCL_STOP + } // Add QFunction fields const CeedInt dim = 2; - ierr = CeedQFunctionAddInput(qf, "dx", dim*dim, CEED_EVAL_GRAD); - CeedChk(ierr); - ierr = CeedQFunctionAddInput(qf, "weights", 1, 
CEED_EVAL_WEIGHT); - CeedChk(ierr); - ierr = CeedQFunctionAddOutput(qf, "qdata", dim*(dim+1)/2, CEED_EVAL_NONE); - CeedChk(ierr); + CeedCall(CeedQFunctionAddInput(qf, "dx", dim * dim, CEED_EVAL_GRAD)); + CeedCall(CeedQFunctionAddInput(qf, "weights", 1, CEED_EVAL_WEIGHT)); + CeedCall(CeedQFunctionAddOutput(qf, "qdata", dim * (dim + 1) / 2, CEED_EVAL_NONE)); - ierr = CeedQFunctionSetUserFlopsEstimate(qf, 17); CeedChk(ierr); + CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 17)); return CEED_ERROR_SUCCESS; } @@ -46,6 +39,5 @@ static int CeedQFunctionInit_Poisson2DBuild(Ceed ceed, const char *requested, Poisson operator **/ CEED_INTERN int CeedQFunctionRegister_Poisson2DBuild(void) { - return CeedQFunctionRegister("Poisson2DBuild", Poisson2DBuild_loc, 1, - Poisson2DBuild, CeedQFunctionInit_Poisson2DBuild); + return CeedQFunctionRegister("Poisson2DBuild", Poisson2DBuild_loc, 1, Poisson2DBuild, CeedQFunctionInit_Poisson2DBuild); } diff --git a/gallery/poisson/ceed-poisson3dapply.c b/gallery/poisson/ceed-poisson3dapply.c index 682c1ee3c3..efb7c362d7 100644 --- a/gallery/poisson/ceed-poisson3dapply.c +++ b/gallery/poisson/ceed-poisson3dapply.c @@ -5,35 +5,30 @@ // // This file is part of CEED: http://github.com/ceed -#include #include -#include +#include #include +#include /** @brief Set fields for Ceed QFunction applying the 3D Poisson operator **/ -static int CeedQFunctionInit_Poisson3DApply(Ceed ceed, const char *requested, - CeedQFunction qf) { - int ierr; - +static int CeedQFunctionInit_Poisson3DApply(Ceed ceed, const char *requested, CeedQFunction qf) { // Check QFunction name const char *name = "Poisson3DApply"; - if (strcmp(name, requested)) + if (strcmp(name, requested)) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, - "QFunction '%s' does not match requested name: %s", - name, requested); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested); + // 
LCOV_EXCL_STOP + } // Add QFunction fields const CeedInt dim = 3; - ierr = CeedQFunctionAddInput(qf, "du", dim, CEED_EVAL_GRAD); CeedChk(ierr); - ierr = CeedQFunctionAddInput(qf, "qdata", dim*(dim+1)/2, CEED_EVAL_NONE); - CeedChk(ierr); - ierr = CeedQFunctionAddOutput(qf, "dv", dim, CEED_EVAL_GRAD); CeedChk(ierr); + CeedCall(CeedQFunctionAddInput(qf, "du", dim, CEED_EVAL_GRAD)); + CeedCall(CeedQFunctionAddInput(qf, "qdata", dim * (dim + 1) / 2, CEED_EVAL_NONE)); + CeedCall(CeedQFunctionAddOutput(qf, "dv", dim, CEED_EVAL_GRAD)); - ierr = CeedQFunctionSetUserFlopsEstimate(qf, 15); CeedChk(ierr); + CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 15)); return CEED_ERROR_SUCCESS; } @@ -42,6 +37,5 @@ static int CeedQFunctionInit_Poisson3DApply(Ceed ceed, const char *requested, @brief Register Ceed QFunction for applying the 3D Poisson operator **/ CEED_INTERN int CeedQFunctionRegister_Poisson3DApply(void) { - return CeedQFunctionRegister("Poisson3DApply", Poisson3DApply_loc, 1, - Poisson3DApply, CeedQFunctionInit_Poisson3DApply); + return CeedQFunctionRegister("Poisson3DApply", Poisson3DApply_loc, 1, Poisson3DApply, CeedQFunctionInit_Poisson3DApply); } diff --git a/gallery/poisson/ceed-poisson3dbuild.c b/gallery/poisson/ceed-poisson3dbuild.c index 5bed48856a..fa5cde6325 100644 --- a/gallery/poisson/ceed-poisson3dbuild.c +++ b/gallery/poisson/ceed-poisson3dbuild.c @@ -5,38 +5,31 @@ // // This file is part of CEED: http://github.com/ceed -#include #include -#include +#include #include +#include /** @brief Set fields for Ceed QFunction building the geometric data for the 3D Poisson operator **/ -static int CeedQFunctionInit_Poisson3DBuild(Ceed ceed, const char *requested, - CeedQFunction qf) { - int ierr; - +static int CeedQFunctionInit_Poisson3DBuild(Ceed ceed, const char *requested, CeedQFunction qf) { // Check QFunction name const char *name = "Poisson3DBuild"; - if (strcmp(name, requested)) + if (strcmp(name, requested)) { // LCOV_EXCL_START - return CeedError(ceed, 
CEED_ERROR_UNSUPPORTED, - "QFunction '%s' does not match requested name: %s", - name, requested); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested); + // LCOV_EXCL_STOP + } // Add QFunction fields const CeedInt dim = 3; - ierr = CeedQFunctionAddInput(qf, "dx", dim*dim, CEED_EVAL_GRAD); - CeedChk(ierr); - ierr = CeedQFunctionAddInput(qf, "weights", 1, CEED_EVAL_WEIGHT); - CeedChk(ierr); - ierr = CeedQFunctionAddOutput(qf, "qdata", dim*(dim+1)/2, CEED_EVAL_NONE); - CeedChk(ierr); + CeedCall(CeedQFunctionAddInput(qf, "dx", dim * dim, CEED_EVAL_GRAD)); + CeedCall(CeedQFunctionAddInput(qf, "weights", 1, CEED_EVAL_WEIGHT)); + CeedCall(CeedQFunctionAddOutput(qf, "qdata", dim * (dim + 1) / 2, CEED_EVAL_NONE)); - ierr = CeedQFunctionSetUserFlopsEstimate(qf, 69); CeedChk(ierr); + CeedCall(CeedQFunctionSetUserFlopsEstimate(qf, 69)); return CEED_ERROR_SUCCESS; } @@ -46,6 +39,5 @@ static int CeedQFunctionInit_Poisson3DBuild(Ceed ceed, const char *requested, Poisson operator **/ CEED_INTERN int CeedQFunctionRegister_Poisson3DBuild(void) { - return CeedQFunctionRegister("Poisson3DBuild", Poisson3DBuild_loc, 1, - Poisson3DBuild, CeedQFunctionInit_Poisson3DBuild); + return CeedQFunctionRegister("Poisson3DBuild", Poisson3DBuild_loc, 1, Poisson3DBuild, CeedQFunctionInit_Poisson3DBuild); } diff --git a/gallery/scale/ceed-scale.c b/gallery/scale/ceed-scale.c index 14d24f9084..377fe3ab2b 100644 --- a/gallery/scale/ceed-scale.c +++ b/gallery/scale/ceed-scale.c @@ -5,24 +5,22 @@ // // This file is part of CEED: http://github.com/ceed -#include #include -#include +#include #include +#include /** @brief Set fields for vector scaling QFunction that scales inputs **/ -static int CeedQFunctionInit_Scale(Ceed ceed, const char *requested, - CeedQFunction qf) { +static int CeedQFunctionInit_Scale(Ceed ceed, const char *requested, CeedQFunction qf) { // Check QFunction name const char *name = "Scale"; - if 
(strcmp(name, requested)) + if (strcmp(name, requested)) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, - "QFunction '%s' does not match requested name: %s", - name, requested); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "QFunction '%s' does not match requested name: %s", name, requested); + // LCOV_EXCL_STOP + } // QFunction fields 'input' and 'output' with requested emodes added // by the library rather than being added here @@ -33,7 +31,4 @@ static int CeedQFunctionInit_Scale(Ceed ceed, const char *requested, /** @brief Register scaling QFunction **/ -CEED_INTERN int CeedQFunctionRegister_Scale(void) { - return CeedQFunctionRegister("Scale", Scale_loc, 1, Scale, - CeedQFunctionInit_Scale); -} +CEED_INTERN int CeedQFunctionRegister_Scale(void) { return CeedQFunctionRegister("Scale", Scale_loc, 1, Scale, CeedQFunctionInit_Scale); } diff --git a/include/ceed-fortran-name.h b/include/ceed-fortran-name.h index 492cf243b2..942b5b60c4 100644 --- a/include/ceed-fortran-name.h +++ b/include/ceed-fortran-name.h @@ -21,30 +21,30 @@ /* the following macro functions like a##b, but will expand a and/or b if they are themselves macros */ -#define TOKEN_PASTE_(a,b) a##b -#define TOKEN_PASTE(a,b) TOKEN_PASTE_(a,b) +#define TOKEN_PASTE_(a, b) a##b +#define TOKEN_PASTE(a, b) TOKEN_PASTE_(a, b) #ifdef PREFIX -# define PREFIXED_NAME(x) TOKEN_PASTE(PREFIX,x) +#define PREFIXED_NAME(x) TOKEN_PASTE(PREFIX, x) #else -# define PREFIXED_NAME(x) x +#define PREFIXED_NAME(x) x #endif #ifdef FPREFIX -# define FPREFIXED_NAME(x) TOKEN_PASTE(FPREFIX,x) +#define FPREFIXED_NAME(x) TOKEN_PASTE(FPREFIX, x) #else -# define FPREFIXED_NAME(x) x +#define FPREFIXED_NAME(x) x #endif #if defined(UPCASE) -# define FORTRAN_NAME(low,up) FPREFIXED_NAME(up) -# define FORTRAN_UNPREFIXED(low,up) up +#define FORTRAN_NAME(low, up) FPREFIXED_NAME(up) +#define FORTRAN_UNPREFIXED(low, up) up #elif defined(UNDERSCORE) -# define FORTRAN_NAME(low,up) 
FPREFIXED_NAME(TOKEN_PASTE(low,_)) -# define FORTRAN_UNPREFIXED(low,up) TOKEN_PASTE(low,_) +#define FORTRAN_NAME(low, up) FPREFIXED_NAME(TOKEN_PASTE(low, _)) +#define FORTRAN_UNPREFIXED(low, up) TOKEN_PASTE(low, _) #else -# define FORTRAN_NAME(low,up) FPREFIXED_NAME(low) -# define FORTRAN_UNPREFIXED(low,up) low +#define FORTRAN_NAME(low, up) FPREFIXED_NAME(low) +#define FORTRAN_UNPREFIXED(low, up) low #endif #endif diff --git a/include/ceed-impl.h b/include/ceed-impl.h index ef53339139..4810184383 100644 --- a/include/ceed-impl.h +++ b/include/ceed-impl.h @@ -10,11 +10,11 @@ #ifndef _ceed_impl_h #define _ceed_impl_h -#include #include +#include #include -CEED_INTERN const char CeedJitSourceRootDefault[]; +CEED_INTERN const char *CeedJitSourceRootDefault; /** @defgroup CeedUser Public API for Ceed @ingroup Ceed @@ -74,59 +74,48 @@ CEED_INTERN const char CeedJitSourceRootDefault[]; // Lookup table field for backend functions typedef struct { const char *func_name; - size_t offset; + size_t offset; } FOffset; // Lookup table field for object delegates typedef struct { char *obj_name; - Ceed delegate; + Ceed delegate; } ObjDelegate; struct Ceed_private { - const char *resource; - Ceed delegate; - Ceed parent; + const char *resource; + Ceed delegate; + Ceed parent; ObjDelegate *obj_delegates; - int obj_delegate_count; - Ceed op_fallback_ceed, op_fallback_parent; - const char *op_fallback_resource; - char **jit_source_roots; - CeedInt num_jit_source_roots; - int (*Error)(Ceed, const char *, int, const char *, int, const char *, - va_list *); + int obj_delegate_count; + Ceed op_fallback_ceed, op_fallback_parent; + const char *op_fallback_resource; + char **jit_source_roots; + CeedInt num_jit_source_roots; + int (*Error)(Ceed, const char *, int, const char *, int, const char *, va_list *); int (*GetPreferredMemType)(CeedMemType *); int (*Destroy)(Ceed); int (*VectorCreate)(CeedSize, CeedVector); - int (*ElemRestrictionCreate)(CeedMemType, CeedCopyMode, - const CeedInt *, 
CeedElemRestriction); - int (*ElemRestrictionCreateOriented)(CeedMemType, CeedCopyMode, - const CeedInt *, const bool *, - CeedElemRestriction); - int (*ElemRestrictionCreateBlocked)(CeedMemType, CeedCopyMode, - const CeedInt *, CeedElemRestriction); - int (*BasisCreateTensorH1)(CeedInt, CeedInt, CeedInt, const CeedScalar *, - const CeedScalar *, const CeedScalar *, - const CeedScalar *, CeedBasis); - int (*BasisCreateH1)(CeedElemTopology, CeedInt, CeedInt, CeedInt, - const CeedScalar *, - const CeedScalar *, const CeedScalar *, - const CeedScalar *, CeedBasis); - int (*BasisCreateHdiv)(CeedElemTopology, CeedInt, CeedInt, CeedInt, - const CeedScalar *, - const CeedScalar *, const CeedScalar *, - const CeedScalar *, CeedBasis); + int (*ElemRestrictionCreate)(CeedMemType, CeedCopyMode, const CeedInt *, CeedElemRestriction); + int (*ElemRestrictionCreateOriented)(CeedMemType, CeedCopyMode, const CeedInt *, const bool *, CeedElemRestriction); + int (*ElemRestrictionCreateBlocked)(CeedMemType, CeedCopyMode, const CeedInt *, CeedElemRestriction); + int (*BasisCreateTensorH1)(CeedInt, CeedInt, CeedInt, const CeedScalar *, const CeedScalar *, const CeedScalar *, const CeedScalar *, CeedBasis); + int (*BasisCreateH1)(CeedElemTopology, CeedInt, CeedInt, CeedInt, const CeedScalar *, const CeedScalar *, const CeedScalar *, const CeedScalar *, + CeedBasis); + int (*BasisCreateHdiv)(CeedElemTopology, CeedInt, CeedInt, CeedInt, const CeedScalar *, const CeedScalar *, const CeedScalar *, const CeedScalar *, + CeedBasis); int (*TensorContractCreate)(CeedBasis, CeedTensorContract); int (*QFunctionCreate)(CeedQFunction); int (*QFunctionContextCreate)(CeedQFunctionContext); int (*OperatorCreate)(CeedOperator); int (*CompositeOperatorCreate)(CeedOperator); - int ref_count; - void *data; - bool is_debug; - bool has_valid_op_fallback_resource; - bool is_deterministic; - char err_msg[CEED_MAX_RESOURCE_LEN]; + int ref_count; + void *data; + bool is_debug; + bool 
has_valid_op_fallback_resource; + bool is_deterministic; + char err_msg[CEED_MAX_RESOURCE_LEN]; FOffset *f_offsets; }; @@ -149,92 +138,82 @@ struct CeedVector_private { int (*PointwiseMult)(CeedVector, CeedVector, CeedVector); int (*Reciprocal)(CeedVector); int (*Destroy)(CeedVector); - int ref_count; + int ref_count; CeedSize length; uint64_t state; uint64_t num_readers; - void *data; + void *data; }; struct CeedElemRestriction_private { Ceed ceed; - int (*Apply)(CeedElemRestriction, CeedTransposeMode, CeedVector, CeedVector, - CeedRequest *); - int (*ApplyBlock)(CeedElemRestriction, CeedInt, CeedTransposeMode, CeedVector, - CeedVector, CeedRequest *); + int (*Apply)(CeedElemRestriction, CeedTransposeMode, CeedVector, CeedVector, CeedRequest *); + int (*ApplyBlock)(CeedElemRestriction, CeedInt, CeedTransposeMode, CeedVector, CeedVector, CeedRequest *); int (*GetOffsets)(CeedElemRestriction, CeedMemType, const CeedInt **); int (*Destroy)(CeedElemRestriction); - int ref_count; - CeedInt num_elem; /* number of elements */ - CeedInt elem_size; /* number of nodes per element */ - CeedInt num_comp; /* number of components */ - CeedInt comp_stride; /* Component stride for L-vector ordering */ - CeedSize l_size; /* size of the L-vector, can be used for checking - for correct vector sizes */ - CeedInt blk_size; /* number of elements in a batch */ - CeedInt num_blk; /* number of blocks of elements */ - CeedInt *strides; /* strides between [nodes, components, elements] */ - CeedInt layout[3]; /* E-vector layout [nodes, components, elements] */ - uint64_t num_readers; /* number of instances of offset read only access */ - bool is_oriented; /* flag for oriented restriction */ - void *data; /* place for the backend to store any data */ + int ref_count; + CeedInt num_elem; /* number of elements */ + CeedInt elem_size; /* number of nodes per element */ + CeedInt num_comp; /* number of components */ + CeedInt comp_stride; /* Component stride for L-vector ordering */ + CeedSize 
l_size; /* size of the L-vector, can be used for checking + for correct vector sizes */ + CeedInt blk_size; /* number of elements in a batch */ + CeedInt num_blk; /* number of blocks of elements */ + CeedInt *strides; /* strides between [nodes, components, elements] */ + CeedInt layout[3]; /* E-vector layout [nodes, components, elements] */ + uint64_t num_readers; /* number of instances of offset read only access */ + bool is_oriented; /* flag for oriented restriction */ + void *data; /* place for the backend to store any data */ }; struct CeedBasis_private { Ceed ceed; - int (*Apply)(CeedBasis, CeedInt, CeedTransposeMode, CeedEvalMode, - CeedVector, CeedVector); + int (*Apply)(CeedBasis, CeedInt, CeedTransposeMode, CeedEvalMode, CeedVector, CeedVector); int (*Destroy)(CeedBasis); - int ref_count; - bool tensor_basis; /* flag for tensor basis */ - CeedInt dim; /* topological dimension */ - CeedElemTopology topo; /* element topology */ - CeedInt num_comp; /* number of field components (1 for scalar fields) */ - CeedInt Q_comp; /* number of Q-vector components (1 for H^1, dim for H(div)) */ - CeedInt P_1d; /* number of nodes in one dimension */ - CeedInt Q_1d; /* number of quadrature points in one dimension */ - CeedInt P; /* total number of nodes */ - CeedInt Q; /* total number of quadrature points */ - CeedScalar *q_ref_1d; /* Array of length Q1d holding the locations of - quadrature points on the 1D reference - element [-1, 1] */ - CeedScalar - *q_weight_1d; /* array of length Q1d holding the quadrature weights on - the reference element */ - CeedScalar - *interp; /* row-major matrix of shape [Q_comp*Q, P] expressing the values of - nodal basis functions at quadrature points */ - CeedScalar - *interp_1d; /* row-major matrix of shape [Q1d, P1d] expressing the values of - nodal basis functions at quadrature points */ - CeedScalar - *grad; /* row-major matrix of shape [dim*Q_comp*Q, P] matrix expressing - derivatives of nodal basis functions at quadrature points */ - 
CeedScalar - *grad_1d; /* row-major matrix of shape [Q1d, P1d] matrix expressing - derivatives of nodal basis functions at quadrature points */ - CeedTensorContract contract; /* tensor contraction object */ - CeedFESpace basis_space; /* Initialize in basis constructor - with 1,2 for H^1, H(div) FE space */ - CeedScalar *div; /* row-major matrix of shape [Q, P] expressing - the divergence of nodal basis functions - at quadrature points for H(div) discretizations */ - void *data; /* place for the backend to store any data */ + int ref_count; + bool tensor_basis; /* flag for tensor basis */ + CeedInt dim; /* topological dimension */ + CeedElemTopology topo; /* element topology */ + CeedInt num_comp; /* number of field components (1 for scalar fields) */ + CeedInt Q_comp; /* number of Q-vector components (1 for H^1, dim for H(div)) */ + CeedInt P_1d; /* number of nodes in one dimension */ + CeedInt Q_1d; /* number of quadrature points in one dimension */ + CeedInt P; /* total number of nodes */ + CeedInt Q; /* total number of quadrature points */ + CeedScalar *q_ref_1d; /* Array of length Q1d holding the locations of + quadrature points on the 1D reference element [-1, 1] */ + CeedScalar *q_weight_1d; /* array of length Q1d holding the quadrature weights on + the reference element */ + CeedScalar *interp; /* row-major matrix of shape [Q_comp*Q, P] expressing the values of + nodal basis functions at quadrature points */ + CeedScalar *interp_1d; /* row-major matrix of shape [Q1d, P1d] expressing the values of + nodal basis functions at quadrature points */ + CeedScalar *grad; /* row-major matrix of shape [dim*Q_comp*Q, P] matrix expressing + derivatives of nodal basis functions at quadrature points */ + CeedScalar *grad_1d; /* row-major matrix of shape [Q1d, P1d] matrix expressing + derivatives of nodal basis functions at quadrature points */ + CeedTensorContract contract; /* tensor contraction object */ + CeedFESpace basis_space; /* Initialize in basis constructor + 
with 1,2 for H^1, H(div) FE space */ + CeedScalar *div; /* row-major matrix of shape [Q, P] expressing + the divergence of nodal basis functions + at quadrature points for H(div) discretizations */ + void *data; /* place for the backend to store any data */ }; struct CeedTensorContract_private { Ceed ceed; - int (*Apply)(CeedTensorContract, CeedInt, CeedInt, CeedInt, CeedInt, - const CeedScalar *restrict, CeedTransposeMode, const CeedInt, + int (*Apply)(CeedTensorContract, CeedInt, CeedInt, CeedInt, CeedInt, const CeedScalar *restrict, CeedTransposeMode, const CeedInt, const CeedScalar *restrict, CeedScalar *restrict); int (*Destroy)(CeedTensorContract); - int ref_count; + int ref_count; void *data; }; struct CeedQFunctionField_private { - const char *field_name; - CeedInt size; + const char *field_name; + CeedInt size; CeedEvalMode eval_mode; }; @@ -244,30 +223,30 @@ struct CeedQFunction_private { int (*SetCUDAUserFunction)(CeedQFunction, void *); int (*SetHIPUserFunction)(CeedQFunction, void *); int (*Destroy)(CeedQFunction); - int ref_count; + int ref_count; CeedInt vec_length; /* Number of quadrature points must be padded to a multiple of vec_length */ - CeedQFunctionField *input_fields; - CeedQFunctionField *output_fields; - CeedInt num_input_fields, num_output_fields; - CeedQFunctionUser function; - CeedInt user_flop_estimate; - const char *user_source; - const char *source_path; - const char *kernel_name; - const char *gallery_name; - bool is_gallery; - bool is_identity; - bool is_fortran; - bool is_immutable; - bool is_context_writable; - CeedQFunctionContext ctx; /* user context for function */ - void *data; /* place for the backend to store any data */ + CeedQFunctionField *input_fields; + CeedQFunctionField *output_fields; + CeedInt num_input_fields, num_output_fields; + CeedQFunctionUser function; + CeedInt user_flop_estimate; + const char *user_source; + const char *source_path; + const char *kernel_name; + const char *gallery_name; + bool is_gallery; 
+ bool is_identity; + bool is_fortran; + bool is_immutable; + bool is_context_writable; + CeedQFunctionContext ctx; /* user context for function */ + void *data; /* place for the backend to store any data */ }; struct CeedQFunctionContext_private { Ceed ceed; - int ref_count; + int ref_count; int (*HasValidData)(CeedQFunctionContext, bool *); int (*HasBorrowedDataOfType)(CeedQFunctionContext, CeedMemType, bool *); int (*SetData)(CeedQFunctionContext, CeedMemType, CeedCopyMode, void *); @@ -279,92 +258,77 @@ struct CeedQFunctionContext_private { int (*DataDestroy)(CeedQFunctionContext); int (*Destroy)(CeedQFunctionContext); CeedQFunctionContextDataDestroyUser data_destroy_function; - CeedMemType data_destroy_mem_type; - CeedInt num_fields; - CeedInt max_fields; - CeedContextFieldLabel *field_labels; - uint64_t state; - uint64_t num_readers; - size_t ctx_size; - void *data; + CeedMemType data_destroy_mem_type; + CeedInt num_fields; + CeedInt max_fields; + CeedContextFieldLabel *field_labels; + uint64_t state; + uint64_t num_readers; + size_t ctx_size; + void *data; }; /// Struct to handle the context data to use the Fortran QFunction stub /// @ingroup CeedQFunction struct CeedFortranContext_private { CeedQFunctionContext inner_ctx; - void (*f)(void *ctx, int *nq, - const CeedScalar *u,const CeedScalar *u1, - const CeedScalar *u2,const CeedScalar *u3, - const CeedScalar *u4,const CeedScalar *u5, - const CeedScalar *u6,const CeedScalar *u7, - const CeedScalar *u8,const CeedScalar *u9, - const CeedScalar *u10,const CeedScalar *u11, - const CeedScalar *u12,const CeedScalar *u13, - const CeedScalar *u14,const CeedScalar *u15, - CeedScalar *v,CeedScalar *v1,CeedScalar *v2, - CeedScalar *v3,CeedScalar *v4,CeedScalar *v5, - CeedScalar *v6,CeedScalar *v7,CeedScalar *v8, - CeedScalar *v9, CeedScalar *v10,CeedScalar *v11, - CeedScalar *v12,CeedScalar *v13,CeedScalar *v14, - CeedScalar *v15, int *err); + void (*f)(void *ctx, int *nq, const CeedScalar *u, const CeedScalar *u1, 
const CeedScalar *u2, const CeedScalar *u3, const CeedScalar *u4, + const CeedScalar *u5, const CeedScalar *u6, const CeedScalar *u7, const CeedScalar *u8, const CeedScalar *u9, const CeedScalar *u10, + const CeedScalar *u11, const CeedScalar *u12, const CeedScalar *u13, const CeedScalar *u14, const CeedScalar *u15, CeedScalar *v, + CeedScalar *v1, CeedScalar *v2, CeedScalar *v3, CeedScalar *v4, CeedScalar *v5, CeedScalar *v6, CeedScalar *v7, CeedScalar *v8, + CeedScalar *v9, CeedScalar *v10, CeedScalar *v11, CeedScalar *v12, CeedScalar *v13, CeedScalar *v14, CeedScalar *v15, int *err); }; typedef struct CeedFortranContext_private *CeedFortranContext; struct CeedContextFieldLabel_private { - const char *name; - const char *description; - CeedContextFieldType type; - size_t size; - size_t num_values; - size_t offset; - CeedInt num_sub_labels; + const char *name; + const char *description; + CeedContextFieldType type; + size_t size; + size_t num_values; + size_t offset; + CeedInt num_sub_labels; CeedContextFieldLabel *sub_labels; }; struct CeedOperatorField_private { CeedElemRestriction elem_restr; /* Restriction from L-vector */ - CeedBasis basis; /* Basis or CEED_BASIS_COLLOCATED for + CeedBasis basis; /* Basis or CEED_BASIS_COLLOCATED for collocated fields */ CeedVector vec; /* State vector for passive fields or CEED_VECTOR_NONE for no vector */ - const char *field_name; /* matching QFunction field name */ + const char *field_name; /* matching QFunction field name */ }; struct CeedQFunctionAssemblyData_private { - Ceed ceed; - int ref_count; - bool is_setup; - bool reuse_data; - bool needs_data_update; - CeedVector vec; + Ceed ceed; + int ref_count; + bool is_setup; + bool reuse_data; + bool needs_data_update; + CeedVector vec; CeedElemRestriction rstr; }; struct CeedOperatorAssemblyData_private { - Ceed ceed; - CeedInt num_eval_mode_in, num_eval_mode_out; + Ceed ceed; + CeedInt num_eval_mode_in, num_eval_mode_out; CeedEvalMode *eval_mode_in, *eval_mode_out; - 
CeedScalar *B_in, *B_out; - CeedBasis basis_in, basis_out; + CeedScalar *B_in, *B_out; + CeedBasis basis_in, basis_out; }; struct CeedOperator_private { - Ceed ceed; + Ceed ceed; CeedOperator op_fallback; - int ref_count; - int (*LinearAssembleQFunction)(CeedOperator, CeedVector *, - CeedElemRestriction *, CeedRequest *); - int (*LinearAssembleQFunctionUpdate)(CeedOperator, CeedVector, - CeedElemRestriction, CeedRequest *); + int ref_count; + int (*LinearAssembleQFunction)(CeedOperator, CeedVector *, CeedElemRestriction *, CeedRequest *); + int (*LinearAssembleQFunctionUpdate)(CeedOperator, CeedVector, CeedElemRestriction, CeedRequest *); int (*LinearAssembleDiagonal)(CeedOperator, CeedVector, CeedRequest *); int (*LinearAssembleAddDiagonal)(CeedOperator, CeedVector, CeedRequest *); - int (*LinearAssemblePointBlockDiagonal)(CeedOperator, CeedVector, - CeedRequest *); - int (*LinearAssembleAddPointBlockDiagonal)(CeedOperator, CeedVector, - CeedRequest *); - int (*LinearAssembleSymbolic)(CeedOperator, CeedSize *, CeedInt **, - CeedInt **); + int (*LinearAssemblePointBlockDiagonal)(CeedOperator, CeedVector, CeedRequest *); + int (*LinearAssembleAddPointBlockDiagonal)(CeedOperator, CeedVector, CeedRequest *); + int (*LinearAssembleSymbolic)(CeedOperator, CeedSize *, CeedInt **, CeedInt **); int (*LinearAssemble)(CeedOperator, CeedVector); int (*LinearAssembleSingle)(CeedOperator, CeedInt, CeedVector); int (*CreateFDMElementInverse)(CeedOperator, CeedOperator *, CeedRequest *); @@ -372,35 +336,33 @@ struct CeedOperator_private { int (*ApplyComposite)(CeedOperator, CeedVector, CeedVector, CeedRequest *); int (*ApplyAdd)(CeedOperator, CeedVector, CeedVector, CeedRequest *); int (*ApplyAddComposite)(CeedOperator, CeedVector, CeedVector, CeedRequest *); - int (*ApplyJacobian)(CeedOperator, CeedVector, CeedVector, CeedVector, - CeedVector, CeedRequest *); + int (*ApplyJacobian)(CeedOperator, CeedVector, CeedVector, CeedVector, CeedVector, CeedRequest *); int 
(*Destroy)(CeedOperator); - CeedOperatorField *input_fields; - CeedOperatorField *output_fields; - CeedSize input_size, output_size; - CeedInt num_elem; /* Number of elements */ - CeedInt num_qpts; /* Number of quadrature points over all elements */ - CeedInt num_fields; /* Number of fields that have been set */ - CeedQFunction qf; - CeedQFunction dqf; - CeedQFunction dqfT; - const char *name; - bool is_immutable; - bool is_interface_setup; - bool is_backend_setup; - bool is_composite; - bool has_restriction; + CeedOperatorField *input_fields; + CeedOperatorField *output_fields; + CeedSize input_size, output_size; + CeedInt num_elem; /* Number of elements */ + CeedInt num_qpts; /* Number of quadrature points over all elements */ + CeedInt num_fields; /* Number of fields that have been set */ + CeedQFunction qf; + CeedQFunction dqf; + CeedQFunction dqfT; + const char *name; + bool is_immutable; + bool is_interface_setup; + bool is_backend_setup; + bool is_composite; + bool has_restriction; CeedQFunctionAssemblyData qf_assembled; - CeedOperatorAssemblyData op_assembled; - CeedOperator *sub_operators; - CeedInt num_suboperators; - void *data; - CeedInt num_context_labels; - CeedInt max_context_labels; - CeedContextFieldLabel *context_labels; + CeedOperatorAssemblyData op_assembled; + CeedOperator *sub_operators; + CeedInt num_suboperators; + void *data; + CeedInt num_context_labels; + CeedInt max_context_labels; + CeedContextFieldLabel *context_labels; }; -CEED_INTERN int CeedOperatorGetFallback(CeedOperator op, - CeedOperator *op_fallback); +CEED_INTERN int CeedOperatorGetFallback(CeedOperator op, CeedOperator *op_fallback); #endif diff --git a/include/ceed/backend.h b/include/ceed/backend.h index ecaf0b1806..94fbc7c6e9 100644 --- a/include/ceed/backend.h +++ b/include/ceed/backend.h @@ -14,10 +14,12 @@ #include #include -#ifdef __cplusplus -# define CEED_INTERN extern "C" CEED_VISIBILITY(hidden) +#if defined(__clang_analyzer__) +#define CEED_INTERN +#elif 
defined(__cplusplus) +#define CEED_INTERN extern "C" CEED_VISIBILITY(hidden) #else -# define CEED_INTERN extern CEED_VISIBILITY(hidden) +#define CEED_INTERN extern CEED_VISIBILITY(hidden) #endif #define CEED_UNUSED __attribute__((unused)) @@ -34,15 +36,15 @@ are sensitive to floting point optimizations. **/ #ifndef CeedPragmaOptimizeOff -# if defined(__clang__) -# define CeedPragmaOptimizeOff _Pragma("clang optimize off") -# elif defined(__GNUC__) -# define CeedPragmaOptimizeOff _Pragma("GCC push_options") _Pragma("GCC optimize 0") -# elif defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER) -# define CeedPragmaOptimizeOff _Pragma("optimize('', off)") -# else -# define CeedPragmaOptimizeOff -# endif +#if defined(__clang__) +#define CeedPragmaOptimizeOff _Pragma("clang optimize off") +#elif defined(__GNUC__) +#define CeedPragmaOptimizeOff _Pragma("GCC push_options") _Pragma("GCC optimize 0") +#elif defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER) +#define CeedPragmaOptimizeOff _Pragma("optimize('', off)") +#else +#define CeedPragmaOptimizeOff +#endif #endif /** @@ -50,15 +52,15 @@ This macro restores previously set optimization flags after CeedPragmaOptimizeOff. 
**/ #ifndef CeedPragmaOptimizeOn -# if defined(__clang__) -# define CeedPragmaOptimizeOn _Pragma("clang optimize on") -# elif defined(__GNUC__) -# define CeedPragmaOptimizeOn _Pragma("GCC pop_options") -# elif defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER) -# define CeedPragmaOptimizeOff _Pragma("optimize('', on)") -# else -# define CeedPragmaOptimizeOn -# endif +#if defined(__clang__) +#define CeedPragmaOptimizeOn _Pragma("clang optimize on") +#elif defined(__GNUC__) +#define CeedPragmaOptimizeOn _Pragma("GCC pop_options") +#elif defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER) +#define CeedPragmaOptimizeOff _Pragma("optimize('', on)") +#else +#define CeedPragmaOptimizeOn +#endif #endif /// CEED_DEBUG_COLOR default value, forward CeedDebug* declarations & macros @@ -67,12 +69,16 @@ CEED_EXTERN void CeedDebugImpl256(const unsigned char, const char *, ...); CEED_EXTERN bool CeedDebugFlag(const Ceed ceed); CEED_EXTERN bool CeedDebugFlagEnv(void); -#define CeedDebug256(ceed, color, ...) \ - { if (CeedDebugFlag(ceed)) CeedDebugImpl256(color, ## __VA_ARGS__); } -#define CeedDebug(ceed, ...) CeedDebug256(ceed, (unsigned char)CEED_DEBUG_COLOR_NONE, ## __VA_ARGS__) -#define CeedDebugEnv256(color, ...) \ - { if (CeedDebugFlagEnv()) CeedDebugImpl256(color, ## __VA_ARGS__); } -#define CeedDebugEnv(...) CeedDebugEnv256((unsigned char)CEED_DEBUG_COLOR_NONE, ## __VA_ARGS__) +#define CeedDebug256(ceed, color, ...) \ + { \ + if (CeedDebugFlag(ceed)) CeedDebugImpl256(color, ##__VA_ARGS__); \ + } +#define CeedDebug(ceed, ...) CeedDebug256(ceed, (unsigned char)CEED_DEBUG_COLOR_NONE, ##__VA_ARGS__) +#define CeedDebugEnv256(color, ...) \ + { \ + if (CeedDebugFlagEnv()) CeedDebugImpl256(color, ##__VA_ARGS__); \ + } +#define CeedDebugEnv(...) 
CeedDebugEnv256((unsigned char)CEED_DEBUG_COLOR_NONE, ##__VA_ARGS__) /// Handle for object handling TensorContraction /// @ingroup CeedBasis @@ -94,8 +100,31 @@ CEED_INTERN int CeedReallocArray(size_t n, size_t unit, void *p); CEED_INTERN int CeedStringAllocCopy(const char *source, char **copy); CEED_INTERN int CeedFree(void *p); -#define CeedChk(ierr) do { int ierr_ = ierr; if (ierr_) return ierr_; } while (0) -#define CeedChkBackend(ierr) do { int ierr_ = ierr; if (ierr_) { if (ierr_ > CEED_ERROR_SUCCESS) return CEED_ERROR_BACKEND; else return ierr_; } } while (0) +#define CeedChk(ierr) \ + do { \ + int ierr_ = ierr; \ + if (ierr_) return ierr_; \ + } while (0) +#define CeedChkBackend(ierr) \ + do { \ + int ierr_ = ierr; \ + if (ierr_) { \ + if (ierr_ > CEED_ERROR_SUCCESS) return CEED_ERROR_BACKEND; \ + else return ierr_; \ + } \ + } while (0) + +#define CeedCall(...) \ + do { \ + int ierr_q_ = __VA_ARGS__; \ + CeedChk(ierr_q_); \ + } while (0); +#define CeedCallBackend(...) \ + do { \ + int ierr_q_ = __VA_ARGS__; \ + CeedChkBackend(ierr_q_); \ + } while (0); + /* Note that CeedMalloc and CeedCalloc will, generally, return pointers with different memory alignments: CeedMalloc returns pointers aligned at CEED_ALIGN bytes, while CeedCalloc uses the alignment of calloc. 
*/ @@ -103,38 +132,27 @@ CEED_INTERN int CeedFree(void *p); #define CeedCalloc(n, p) CeedCallocArray((n), sizeof(**(p)), p) #define CeedRealloc(n, p) CeedReallocArray((n), sizeof(**(p)), p) -CEED_EXTERN int CeedRegister(const char *prefix, - int (*init)(const char *, Ceed), - unsigned int priority); -CEED_EXTERN int CeedRegisterImpl(const char *prefix, - int (*init)(const char *, Ceed), - unsigned int priority); +CEED_EXTERN int CeedRegister(const char *prefix, int (*init)(const char *, Ceed), unsigned int priority); +CEED_EXTERN int CeedRegisterImpl(const char *prefix, int (*init)(const char *, Ceed), unsigned int priority); CEED_EXTERN int CeedIsDebug(Ceed ceed, bool *is_debug); CEED_EXTERN int CeedGetParent(Ceed ceed, Ceed *parent); CEED_EXTERN int CeedGetDelegate(Ceed ceed, Ceed *delegate); CEED_EXTERN int CeedSetDelegate(Ceed ceed, Ceed delegate); -CEED_EXTERN int CeedGetObjectDelegate(Ceed ceed, Ceed *delegate, - const char *obj_name); -CEED_EXTERN int CeedSetObjectDelegate(Ceed ceed, Ceed delegate, - const char *obj_name); -CEED_EXTERN int CeedGetOperatorFallbackResource(Ceed ceed, - const char **resource); +CEED_EXTERN int CeedGetObjectDelegate(Ceed ceed, Ceed *delegate, const char *obj_name); +CEED_EXTERN int CeedSetObjectDelegate(Ceed ceed, Ceed delegate, const char *obj_name); +CEED_EXTERN int CeedGetOperatorFallbackResource(Ceed ceed, const char **resource); CEED_EXTERN int CeedGetOperatorFallbackCeed(Ceed ceed, Ceed *fallback_ceed); -CEED_EXTERN int CeedSetOperatorFallbackResource(Ceed ceed, - const char *resource); +CEED_EXTERN int CeedSetOperatorFallbackResource(Ceed ceed, const char *resource); CEED_EXTERN int CeedGetOperatorFallbackParentCeed(Ceed ceed, Ceed *parent); CEED_EXTERN int CeedSetDeterministic(Ceed ceed, bool is_deterministic); -CEED_EXTERN int CeedSetBackendFunction(Ceed ceed, - const char *type, void *object, - const char *func_name, int (*f)()); +CEED_EXTERN int CeedSetBackendFunction(Ceed ceed, const char *type, void *object, const 
char *func_name, int (*f)()); CEED_EXTERN int CeedGetData(Ceed ceed, void *data); CEED_EXTERN int CeedSetData(Ceed ceed, void *data); CEED_EXTERN int CeedReference(Ceed ceed); CEED_EXTERN int CeedVectorHasValidArray(CeedVector vec, bool *has_valid_array); -CEED_EXTERN int CeedVectorHasBorrowedArrayOfType(CeedVector vec, CeedMemType mem_type, - bool *has_borrowed_array_of_type); +CEED_EXTERN int CeedVectorHasBorrowedArrayOfType(CeedVector vec, CeedMemType mem_type, bool *has_borrowed_array_of_type); CEED_EXTERN int CeedVectorHasValidArray(CeedVector vec, bool *has_valid_array); CEED_EXTERN int CeedVectorGetState(CeedVector vec, uint64_t *state); CEED_EXTERN int CeedVectorAddReference(CeedVector vec); @@ -142,26 +160,16 @@ CEED_EXTERN int CeedVectorGetData(CeedVector vec, void *data); CEED_EXTERN int CeedVectorSetData(CeedVector vec, void *data); CEED_EXTERN int CeedVectorReference(CeedVector vec); -CEED_EXTERN int CeedElemRestrictionGetStrides(CeedElemRestriction rstr, - CeedInt (*strides)[3]); -CEED_EXTERN int CeedElemRestrictionGetOffsets(CeedElemRestriction rstr, - CeedMemType mem_type, const CeedInt **offsets); -CEED_EXTERN int CeedElemRestrictionRestoreOffsets(CeedElemRestriction rstr, - const CeedInt **offsets); -CEED_EXTERN int CeedElemRestrictionIsStrided(CeedElemRestriction rstr, - bool *is_strided); -CEED_EXTERN int CeedElemRestrictionIsOriented(CeedElemRestriction rstr, - bool *is_oriented); -CEED_EXTERN int CeedElemRestrictionHasBackendStrides(CeedElemRestriction rstr, - bool *has_backend_strides); -CEED_EXTERN int CeedElemRestrictionGetELayout(CeedElemRestriction rstr, - CeedInt (*layout)[3]); -CEED_EXTERN int CeedElemRestrictionSetELayout(CeedElemRestriction rstr, - CeedInt layout[3]); -CEED_EXTERN int CeedElemRestrictionGetData(CeedElemRestriction rstr, - void *data); -CEED_EXTERN int CeedElemRestrictionSetData(CeedElemRestriction rstr, - void *data); +CEED_EXTERN int CeedElemRestrictionGetStrides(CeedElemRestriction rstr, CeedInt (*strides)[3]); 
+CEED_EXTERN int CeedElemRestrictionGetOffsets(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt **offsets); +CEED_EXTERN int CeedElemRestrictionRestoreOffsets(CeedElemRestriction rstr, const CeedInt **offsets); +CEED_EXTERN int CeedElemRestrictionIsStrided(CeedElemRestriction rstr, bool *is_strided); +CEED_EXTERN int CeedElemRestrictionIsOriented(CeedElemRestriction rstr, bool *is_oriented); +CEED_EXTERN int CeedElemRestrictionHasBackendStrides(CeedElemRestriction rstr, bool *has_backend_strides); +CEED_EXTERN int CeedElemRestrictionGetELayout(CeedElemRestriction rstr, CeedInt (*layout)[3]); +CEED_EXTERN int CeedElemRestrictionSetELayout(CeedElemRestriction rstr, CeedInt layout[3]); +CEED_EXTERN int CeedElemRestrictionGetData(CeedElemRestriction rstr, void *data); +CEED_EXTERN int CeedElemRestrictionSetData(CeedElemRestriction rstr, void *data); CEED_EXTERN int CeedElemRestrictionReference(CeedElemRestriction rstr); CEED_EXTERN int CeedElemRestrictionGetFlopsEstimate(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedSize *flops); @@ -169,68 +177,46 @@ CEED_EXTERN int CeedElemRestrictionGetFlopsEstimate(CeedElemRestriction rstr, Ce /// @ingroup CeedBasis typedef enum { /// H1 FE space - CEED_FE_SPACE_H1 = 1, + CEED_FE_SPACE_H1 = 1, /// H(div) FE space CEED_FE_SPACE_HDIV = 2, } CeedFESpace; CEED_EXTERN const char *const CeedFESpaces[]; -CEED_EXTERN int CeedBasisGetCollocatedGrad(CeedBasis basis, - CeedScalar *colo_grad_1d); -CEED_EXTERN int CeedHouseholderApplyQ(CeedScalar *A, const CeedScalar *Q, - const CeedScalar *tau, CeedTransposeMode t_mode, - CeedInt m, CeedInt n, CeedInt k, - CeedInt row, CeedInt col); +CEED_EXTERN int CeedBasisGetCollocatedGrad(CeedBasis basis, CeedScalar *colo_grad_1d); +CEED_EXTERN int CeedHouseholderApplyQ(CeedScalar *A, const CeedScalar *Q, const CeedScalar *tau, CeedTransposeMode t_mode, CeedInt m, CeedInt n, + CeedInt k, CeedInt row, CeedInt col); CEED_EXTERN int CeedBasisIsTensor(CeedBasis basis, bool *is_tensor); 
CEED_EXTERN int CeedBasisGetData(CeedBasis basis, void *data); CEED_EXTERN int CeedBasisSetData(CeedBasis basis, void *data); CEED_EXTERN int CeedBasisReference(CeedBasis basis); CEED_EXTERN int CeedBasisGetFlopsEstimate(CeedBasis basis, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedSize *flops); -CEED_EXTERN int CeedBasisGetTopologyDimension(CeedElemTopology topo, - CeedInt *dim); +CEED_EXTERN int CeedBasisGetTopologyDimension(CeedElemTopology topo, CeedInt *dim); -CEED_EXTERN int CeedBasisGetTensorContract(CeedBasis basis, - CeedTensorContract *contract); -CEED_EXTERN int CeedBasisSetTensorContract(CeedBasis basis, - CeedTensorContract contract); -CEED_EXTERN int CeedTensorContractCreate(Ceed ceed, CeedBasis basis, - CeedTensorContract *contract); -CEED_EXTERN int CeedTensorContractApply(CeedTensorContract contract, CeedInt A, - CeedInt B, CeedInt C, CeedInt J, - const CeedScalar *__restrict__ t, - CeedTransposeMode t_mode, - const CeedInt Add, - const CeedScalar *__restrict__ u, - CeedScalar *__restrict__ v); -CEED_EXTERN int CeedTensorContractGetCeed(CeedTensorContract contract, - Ceed *ceed); -CEED_EXTERN int CeedTensorContractGetData(CeedTensorContract contract, - void *data); -CEED_EXTERN int CeedTensorContractSetData(CeedTensorContract contract, - void *data); +CEED_EXTERN int CeedBasisGetTensorContract(CeedBasis basis, CeedTensorContract *contract); +CEED_EXTERN int CeedBasisSetTensorContract(CeedBasis basis, CeedTensorContract contract); +CEED_EXTERN int CeedTensorContractCreate(Ceed ceed, CeedBasis basis, CeedTensorContract *contract); +CEED_EXTERN int CeedTensorContractApply(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const CeedScalar *__restrict__ t, + CeedTransposeMode t_mode, const CeedInt Add, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v); +CEED_EXTERN int CeedTensorContractGetCeed(CeedTensorContract contract, Ceed *ceed); +CEED_EXTERN int CeedTensorContractGetData(CeedTensorContract contract, 
void *data); +CEED_EXTERN int CeedTensorContractSetData(CeedTensorContract contract, void *data); CEED_EXTERN int CeedTensorContractReference(CeedTensorContract contract); CEED_EXTERN int CeedTensorContractDestroy(CeedTensorContract *contract); -CEED_EXTERN int CeedQFunctionRegister(const char *, const char *, CeedInt, - CeedQFunctionUser, int (*init)(Ceed, const char *, CeedQFunction)); +CEED_EXTERN int CeedQFunctionRegister(const char *, const char *, CeedInt, CeedQFunctionUser, int (*init)(Ceed, const char *, CeedQFunction)); CEED_EXTERN int CeedQFunctionSetFortranStatus(CeedQFunction qf, bool status); -CEED_EXTERN int CeedQFunctionGetVectorLength(CeedQFunction qf, - CeedInt *vec_length); -CEED_EXTERN int CeedQFunctionGetNumArgs(CeedQFunction qf, - CeedInt *num_input_fields, - CeedInt *num_output_fields); +CEED_EXTERN int CeedQFunctionGetVectorLength(CeedQFunction qf, CeedInt *vec_length); +CEED_EXTERN int CeedQFunctionGetNumArgs(CeedQFunction qf, CeedInt *num_input_fields, CeedInt *num_output_fields); CEED_EXTERN int CeedQFunctionGetKernelName(CeedQFunction qf, char **kernel_name); CEED_EXTERN int CeedQFunctionGetSourcePath(CeedQFunction qf, char **source_path); CEED_EXTERN int CeedQFunctionLoadSourceToBuffer(CeedQFunction qf, char **source_buffer); -CEED_EXTERN int CeedQFunctionGetUserFunction(CeedQFunction qf, - CeedQFunctionUser *f); -CEED_EXTERN int CeedQFunctionGetContext(CeedQFunction qf, - CeedQFunctionContext *ctx); +CEED_EXTERN int CeedQFunctionGetUserFunction(CeedQFunction qf, CeedQFunctionUser *f); +CEED_EXTERN int CeedQFunctionGetContext(CeedQFunction qf, CeedQFunctionContext *ctx); CEED_EXTERN int CeedQFunctionGetContextData(CeedQFunction qf, CeedMemType mem_type, void *data); CEED_EXTERN int CeedQFunctionRestoreContextData(CeedQFunction qf, void *data); -CEED_EXTERN int CeedQFunctionGetInnerContext(CeedQFunction qf, - CeedQFunctionContext *ctx); +CEED_EXTERN int CeedQFunctionGetInnerContext(CeedQFunction qf, CeedQFunctionContext *ctx); CEED_EXTERN 
int CeedQFunctionGetInnerContextData(CeedQFunction qf, CeedMemType mem_type, void *data); CEED_EXTERN int CeedQFunctionRestoreInnerContextData(CeedQFunction qf, void *data); CEED_EXTERN int CeedQFunctionIsIdentity(CeedQFunction qf, bool *is_identity); @@ -240,29 +226,18 @@ CEED_EXTERN int CeedQFunctionSetData(CeedQFunction qf, void *data); CEED_EXTERN int CeedQFunctionReference(CeedQFunction qf); CEED_EXTERN int CeedQFunctionGetFlopsEstimate(CeedQFunction qf, CeedSize *flops); -CEED_EXTERN int CeedQFunctionContextGetCeed(CeedQFunctionContext ctx, - Ceed *ceed); -CEED_EXTERN int CeedQFunctionContextHasValidData(CeedQFunctionContext ctx, - bool *has_valid_data); -CEED_EXTERN int CeedQFunctionContextHasBorrowedDataOfType(CeedQFunctionContext ctx, - CeedMemType mem_type, bool *has_borrowed_data_of_type); -CEED_EXTERN int CeedQFunctionContextGetState(CeedQFunctionContext ctx, - uint64_t *state); -CEED_EXTERN int CeedQFunctionContextGetBackendData(CeedQFunctionContext ctx, - void *data); -CEED_EXTERN int CeedQFunctionContextSetBackendData(CeedQFunctionContext ctx, - void *data); -CEED_EXTERN int CeedQFunctionContextGetFieldLabel(CeedQFunctionContext ctx, - const char *field_name, CeedContextFieldLabel *field_label); -CEED_EXTERN int CeedQFunctionContextSetGeneric(CeedQFunctionContext ctx, - CeedContextFieldLabel field_label, - CeedContextFieldType field_type, void *value); -CEED_EXTERN int CeedQFunctionContextSetDouble(CeedQFunctionContext ctx, - CeedContextFieldLabel field_label, double *values); -CEED_EXTERN int CeedQFunctionContextSetInt32(CeedQFunctionContext ctx, - CeedContextFieldLabel field_label, int *values); -CEED_EXTERN int CeedQFunctionContextGetDataDestroy(CeedQFunctionContext ctx, - CeedMemType *f_mem_type, CeedQFunctionContextDataDestroyUser *f); +CEED_EXTERN int CeedQFunctionContextGetCeed(CeedQFunctionContext ctx, Ceed *ceed); +CEED_EXTERN int CeedQFunctionContextHasValidData(CeedQFunctionContext ctx, bool *has_valid_data); +CEED_EXTERN int 
CeedQFunctionContextHasBorrowedDataOfType(CeedQFunctionContext ctx, CeedMemType mem_type, bool *has_borrowed_data_of_type); +CEED_EXTERN int CeedQFunctionContextGetState(CeedQFunctionContext ctx, uint64_t *state); +CEED_EXTERN int CeedQFunctionContextGetBackendData(CeedQFunctionContext ctx, void *data); +CEED_EXTERN int CeedQFunctionContextSetBackendData(CeedQFunctionContext ctx, void *data); +CEED_EXTERN int CeedQFunctionContextGetFieldLabel(CeedQFunctionContext ctx, const char *field_name, CeedContextFieldLabel *field_label); +CEED_EXTERN int CeedQFunctionContextSetGeneric(CeedQFunctionContext ctx, CeedContextFieldLabel field_label, CeedContextFieldType field_type, + void *value); +CEED_EXTERN int CeedQFunctionContextSetDouble(CeedQFunctionContext ctx, CeedContextFieldLabel field_label, double *values); +CEED_EXTERN int CeedQFunctionContextSetInt32(CeedQFunctionContext ctx, CeedContextFieldLabel field_label, int *values); +CEED_EXTERN int CeedQFunctionContextGetDataDestroy(CeedQFunctionContext ctx, CeedMemType *f_mem_type, CeedQFunctionContextDataDestroyUser *f); CEED_EXTERN int CeedQFunctionContextReference(CeedQFunctionContext ctx); CEED_EXTERN int CeedQFunctionAssemblyDataCreate(Ceed ceed, CeedQFunctionAssemblyData *data); @@ -277,28 +252,27 @@ CEED_EXTERN int CeedQFunctionAssemblyDataGetObjects(CeedQFunctionAssemblyData da CEED_EXTERN int CeedQFunctionAssemblyDataDestroy(CeedQFunctionAssemblyData *data); CEED_EXTERN int CeedOperatorAssemblyDataCreate(Ceed ceed, CeedOperator op, CeedOperatorAssemblyData *data); -CEED_EXTERN int CeedOperatorAssemblyDataGetEvalModes(CeedOperatorAssemblyData data, CeedInt *num_eval_mode_in, const CeedEvalMode **eval_mode_in, CeedInt *num_eval_mode_out, const CeedEvalMode **eval_mode_out); -CEED_EXTERN int CeedOperatorAssemblyDataGetBases(CeedOperatorAssemblyData data, CeedBasis *basis_in, const CeedScalar **B_in, CeedBasis *basis_out, const CeedScalar **B_out); +CEED_EXTERN int 
CeedOperatorAssemblyDataGetEvalModes(CeedOperatorAssemblyData data, CeedInt *num_eval_mode_in, const CeedEvalMode **eval_mode_in, + CeedInt *num_eval_mode_out, const CeedEvalMode **eval_mode_out); +CEED_EXTERN int CeedOperatorAssemblyDataGetBases(CeedOperatorAssemblyData data, CeedBasis *basis_in, const CeedScalar **B_in, CeedBasis *basis_out, + const CeedScalar **B_out); CEED_EXTERN int CeedOperatorAssemblyDataDestroy(CeedOperatorAssemblyData *data); CEED_EXTERN int CeedOperatorGetOperatorAssemblyData(CeedOperator op, CeedOperatorAssemblyData *data); -CEED_EXTERN int CeedOperatorGetActiveBasis(CeedOperator op, - CeedBasis *active_basis); +CEED_EXTERN int CeedOperatorGetActiveBasis(CeedOperator op, CeedBasis *active_basis); CEED_EXTERN int CeedOperatorGetActiveElemRestriction(CeedOperator op, CeedElemRestriction *active_rstr); CEED_EXTERN int CeedOperatorGetNumArgs(CeedOperator op, CeedInt *num_args); CEED_EXTERN int CeedOperatorIsSetupDone(CeedOperator op, bool *is_setup_done); CEED_EXTERN int CeedOperatorGetQFunction(CeedOperator op, CeedQFunction *qf); CEED_EXTERN int CeedOperatorIsComposite(CeedOperator op, bool *is_composite); CEED_EXTERN int CeedOperatorGetNumSub(CeedOperator op, CeedInt *num_suboperators); -CEED_EXTERN int CeedOperatorGetSubList(CeedOperator op, - CeedOperator **sub_operators); +CEED_EXTERN int CeedOperatorGetSubList(CeedOperator op, CeedOperator **sub_operators); CEED_EXTERN int CeedOperatorGetData(CeedOperator op, void *data); CEED_EXTERN int CeedOperatorSetData(CeedOperator op, void *data); CEED_EXTERN int CeedOperatorReference(CeedOperator op); CEED_EXTERN int CeedOperatorSetSetupDone(CeedOperator op); -CEED_INTERN int CeedMatrixMatrixMultiply(Ceed ceed, const CeedScalar *mat_A, - const CeedScalar *mat_B, CeedScalar *mat_C, - CeedInt m, CeedInt n, CeedInt kk); +CEED_INTERN int CeedMatrixMatrixMultiply(Ceed ceed, const CeedScalar *mat_A, const CeedScalar *mat_B, CeedScalar *mat_C, CeedInt m, CeedInt n, + CeedInt kk); #endif diff --git 
a/include/ceed/ceed-f32.h b/include/ceed/ceed-f32.h index 7ee7b43f8c..0946e1c152 100644 --- a/include/ceed/ceed-f32.h +++ b/include/ceed/ceed-f32.h @@ -7,8 +7,8 @@ /// @file /// Public header for definitions related to using FP32 floating point (single -/// precision) for CeedScalar. Include this header in ceed/ceed.h to use -/// float instead of double. +/// precision) for CeedScalar. Include this header in ceed/ceed.h to use +/// float instead of double. #ifndef _ceed_f32_h #define _ceed_f32_h diff --git a/include/ceed/ceed-f64.h b/include/ceed/ceed-f64.h index fb557df17d..bc556dd11d 100644 --- a/include/ceed/ceed-f64.h +++ b/include/ceed/ceed-f64.h @@ -7,7 +7,7 @@ /// @file /// Public header for definitions related to using FP64 floating point (double -/// precision) for CeedScalar. This is the default header included in ceed/ceed.h. +/// precision) for CeedScalar. This is the default header included in ceed/ceed.h. #ifndef _ceed_f64_h #define _ceed_f64_h diff --git a/include/ceed/ceed.h b/include/ceed/ceed.h index 47f52a7b10..b7d8b5d36f 100644 --- a/include/ceed/ceed.h +++ b/include/ceed/ceed.h @@ -47,9 +47,9 @@ /// libCEED and can generally be found in "ceed-impl.h". #if !defined(CEED_SKIP_VISIBILITY) -# define CEED_VISIBILITY(mode) __attribute__((visibility(#mode))) +#define CEED_VISIBILITY(mode) __attribute__((visibility(#mode))) #else -# define CEED_VISIBILITY(mode) +#define CEED_VISIBILITY(mode) #endif /** @@ -58,10 +58,12 @@ No other file should declare publicly visible symbols, thus it should never be used outside ceed.h. 
*/ -#ifdef __cplusplus -# define CEED_EXTERN extern "C" CEED_VISIBILITY(default) +#if defined(__clang_analyzer__) +#define CEED_EXTERN extern +#elif defined(__cplusplus) +#define CEED_EXTERN extern "C" CEED_VISIBILITY(default) #else -# define CEED_EXTERN extern CEED_VISIBILITY(default) +#define CEED_EXTERN extern CEED_VISIBILITY(default) #endif #include @@ -120,8 +122,7 @@ CEED_EXTERN int CeedAddJitSourceRoot(Ceed ceed, const char *jit_source_root); CEED_EXTERN int CeedView(Ceed ceed, FILE *stream); CEED_EXTERN int CeedDestroy(Ceed *ceed); -CEED_EXTERN int CeedErrorImpl(Ceed, const char *, int, const char *, int, - const char *, ...); +CEED_EXTERN int CeedErrorImpl(Ceed, const char *, int, const char *, int, const char *, ...); /// Raise an error on ceed object /// /// @param ceed Ceed library context or NULL @@ -133,25 +134,17 @@ CEED_EXTERN int CeedErrorImpl(Ceed, const char *, int, const char *, int, #if defined(__clang__) /// Use nonstandard ternary to convince the compiler/clang-tidy that this /// function never returns zero. -# define CeedError(ceed, ecode, ...) \ - (CeedErrorImpl((ceed), __FILE__, __LINE__, __func__, (ecode), __VA_ARGS__), (ecode)) +#define CeedError(ceed, ecode, ...) (CeedErrorImpl((ceed), __FILE__, __LINE__, __func__, (ecode), __VA_ARGS__), (ecode)) #else -# define CeedError(ceed, ecode, ...) \ - CeedErrorImpl((ceed), __FILE__, __LINE__, __func__, (ecode), __VA_ARGS__) ?: (ecode) +#define CeedError(ceed, ecode, ...) 
CeedErrorImpl((ceed), __FILE__, __LINE__, __func__, (ecode), __VA_ARGS__) ?: (ecode) #endif /// Ceed error handlers -CEED_EXTERN int CeedErrorReturn(Ceed, const char *, int, const char *, int, - const char *, va_list *); -CEED_EXTERN int CeedErrorStore(Ceed, const char *, int, const char *, int, - const char *, va_list *); -CEED_EXTERN int CeedErrorAbort(Ceed, const char *, int, const char *, int, - const char *, va_list *); -CEED_EXTERN int CeedErrorExit(Ceed, const char *, int, const char *, int, - const char *, va_list *); -typedef int (*CeedErrorHandler)(Ceed, const char *, int, - const char *, int, const char *, - va_list *); +CEED_EXTERN int CeedErrorReturn(Ceed, const char *, int, const char *, int, const char *, va_list *); +CEED_EXTERN int CeedErrorStore(Ceed, const char *, int, const char *, int, const char *, va_list *); +CEED_EXTERN int CeedErrorAbort(Ceed, const char *, int, const char *, int, const char *, va_list *); +CEED_EXTERN int CeedErrorExit(Ceed, const char *, int, const char *, int, const char *, va_list *); +typedef int (*CeedErrorHandler)(Ceed, const char *, int, const char *, int, const char *, va_list *); CEED_EXTERN int CeedSetErrorHandler(Ceed ceed, CeedErrorHandler eh); CEED_EXTERN int CeedGetErrorMessage(Ceed, const char **err_msg); CEED_EXTERN int CeedResetErrorMessage(Ceed, const char **err_msg); @@ -181,15 +174,12 @@ CEED_EXTERN int CeedResetErrorMessage(Ceed, const char **err_msg); /// /// @ingroup Ceed /// @sa CeedGetVersion() -#define CEED_VERSION_GE(major, minor, patch) \ - (!CEED_VERSION_RELEASE || \ - (CEED_VERSION_MAJOR > major || \ - (CEED_VERSION_MAJOR == major && \ - (CEED_VERSION_MINOR > minor || \ - (CEED_VERSION_MINOR == minor && CEED_VERSION_PATCH >= patch))))) +#define CEED_VERSION_GE(major, minor, patch) \ + (!CEED_VERSION_RELEASE || \ + (CEED_VERSION_MAJOR > major || \ + (CEED_VERSION_MAJOR == major && (CEED_VERSION_MINOR > minor || (CEED_VERSION_MINOR == minor && CEED_VERSION_PATCH >= patch))))) -CEED_EXTERN int 
CeedGetVersion(int *major, int *minor, int *patch, - bool *release); +CEED_EXTERN int CeedGetVersion(int *major, int *minor, int *patch, bool *release); CEED_EXTERN int CeedGetScalarType(CeedScalarType *scalar_type); @@ -241,23 +231,16 @@ typedef enum { CEED_EXTERN int CeedVectorCreate(Ceed ceed, CeedSize len, CeedVector *vec); CEED_EXTERN int CeedVectorReferenceCopy(CeedVector vec, CeedVector *vec_copy); -CEED_EXTERN int CeedVectorSetArray(CeedVector vec, CeedMemType mem_type, - CeedCopyMode copy_mode, CeedScalar *array); +CEED_EXTERN int CeedVectorSetArray(CeedVector vec, CeedMemType mem_type, CeedCopyMode copy_mode, CeedScalar *array); CEED_EXTERN int CeedVectorSetValue(CeedVector vec, CeedScalar value); CEED_EXTERN int CeedVectorSyncArray(CeedVector vec, CeedMemType mem_type); -CEED_EXTERN int CeedVectorTakeArray(CeedVector vec, CeedMemType mem_type, - CeedScalar **array); -CEED_EXTERN int CeedVectorGetArray(CeedVector vec, CeedMemType mem_type, - CeedScalar **array); -CEED_EXTERN int CeedVectorGetArrayRead(CeedVector vec, CeedMemType mem_type, - const CeedScalar **array); -CEED_EXTERN int CeedVectorGetArrayWrite(CeedVector vec, CeedMemType mem_type, - CeedScalar **array); +CEED_EXTERN int CeedVectorTakeArray(CeedVector vec, CeedMemType mem_type, CeedScalar **array); +CEED_EXTERN int CeedVectorGetArray(CeedVector vec, CeedMemType mem_type, CeedScalar **array); +CEED_EXTERN int CeedVectorGetArrayRead(CeedVector vec, CeedMemType mem_type, const CeedScalar **array); +CEED_EXTERN int CeedVectorGetArrayWrite(CeedVector vec, CeedMemType mem_type, CeedScalar **array); CEED_EXTERN int CeedVectorRestoreArray(CeedVector vec, CeedScalar **array); -CEED_EXTERN int CeedVectorRestoreArrayRead(CeedVector vec, - const CeedScalar **array); -CEED_EXTERN int CeedVectorNorm(CeedVector vec, CeedNormType type, - CeedScalar *norm); +CEED_EXTERN int CeedVectorRestoreArrayRead(CeedVector vec, const CeedScalar **array); +CEED_EXTERN int CeedVectorNorm(CeedVector vec, CeedNormType type, 
CeedScalar *norm); CEED_EXTERN int CeedVectorScale(CeedVector x, CeedScalar alpha); CEED_EXTERN int CeedVectorAXPY(CeedVector y, CeedScalar alpha, CeedVector x); CEED_EXTERN int CeedVectorPointwiseMult(CeedVector w, CeedVector x, CeedVector y); @@ -269,7 +252,7 @@ CEED_EXTERN int CeedVectorDestroy(CeedVector *vec); CEED_EXTERN CeedRequest *const CEED_REQUEST_IMMEDIATE; CEED_EXTERN CeedRequest *const CEED_REQUEST_ORDERED; -CEED_EXTERN int CeedRequestWait(CeedRequest *req); +CEED_EXTERN int CeedRequestWait(CeedRequest *req); /// Argument for CeedOperatorSetField that vector is collocated with /// quadrature points, only used with CeedEvalMode CEED_EVAL_NONE @@ -312,51 +295,32 @@ CEED_EXTERN const char *const CeedTransposeModes[]; /// @ingroup CeedElemRestriction CEED_EXTERN const CeedInt CEED_STRIDES_BACKEND[3]; -CEED_EXTERN int CeedElemRestrictionCreate(Ceed ceed, CeedInt num_elem, - CeedInt elem_size, CeedInt num_comp, CeedInt comp_stride, CeedSize l_size, - CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *offsets, - CeedElemRestriction *rstr); -CEED_EXTERN int CeedElemRestrictionCreateOriented(Ceed ceed, CeedInt num_elem, - CeedInt elem_size, CeedInt num_comp, CeedInt comp_stride, CeedSize l_size, - CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *offsets, - const bool *orient, CeedElemRestriction *rstr); -CEED_EXTERN int CeedElemRestrictionCreateStrided(Ceed ceed, - CeedInt num_elem, CeedInt elem_size, CeedInt num_comp, CeedSize l_size, - const CeedInt strides[3], CeedElemRestriction *rstr); -CEED_EXTERN int CeedElemRestrictionCreateBlocked(Ceed ceed, CeedInt num_elem, - CeedInt elem_size, CeedInt blk_size, CeedInt num_comp, CeedInt comp_stride, - CeedSize l_size, CeedMemType mem_type, CeedCopyMode copy_mode, - const CeedInt *offsets, CeedElemRestriction *rstr); -CEED_EXTERN int CeedElemRestrictionCreateBlockedStrided(Ceed ceed, - CeedInt num_elem, CeedInt elem_size, CeedInt blk_size, CeedInt num_comp, - CeedSize l_size, const CeedInt 
strides[3], CeedElemRestriction *rstr); -CEED_EXTERN int CeedElemRestrictionReferenceCopy(CeedElemRestriction rstr, - CeedElemRestriction *rstr_copy); -CEED_EXTERN int CeedElemRestrictionCreateVector(CeedElemRestriction rstr, - CeedVector *lvec, CeedVector *evec); -CEED_EXTERN int CeedElemRestrictionApply(CeedElemRestriction rstr, - CeedTransposeMode t_mode, CeedVector u, CeedVector ru, CeedRequest *request); -CEED_EXTERN int CeedElemRestrictionApplyBlock(CeedElemRestriction rstr, - CeedInt block, CeedTransposeMode t_mode, CeedVector u, CeedVector ru, - CeedRequest *request); -CEED_EXTERN int CeedElemRestrictionGetCeed(CeedElemRestriction rstr, - Ceed *ceed); -CEED_EXTERN int CeedElemRestrictionGetCompStride(CeedElemRestriction rstr, - CeedInt *comp_stride); -CEED_EXTERN int CeedElemRestrictionGetNumElements(CeedElemRestriction rstr, - CeedInt *num_elem); -CEED_EXTERN int CeedElemRestrictionGetElementSize(CeedElemRestriction rstr, - CeedInt *elem_size); -CEED_EXTERN int CeedElemRestrictionGetLVectorSize(CeedElemRestriction rstr, - CeedSize *l_size); -CEED_EXTERN int CeedElemRestrictionGetNumComponents(CeedElemRestriction rstr, - CeedInt *num_comp); -CEED_EXTERN int CeedElemRestrictionGetNumBlocks(CeedElemRestriction rstr, - CeedInt *num_blk); -CEED_EXTERN int CeedElemRestrictionGetBlockSize(CeedElemRestriction rstr, - CeedInt *blk_size); -CEED_EXTERN int CeedElemRestrictionGetMultiplicity(CeedElemRestriction rstr, - CeedVector mult); +CEED_EXTERN int CeedElemRestrictionCreate(Ceed ceed, CeedInt num_elem, CeedInt elem_size, CeedInt num_comp, CeedInt comp_stride, CeedSize l_size, + CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *offsets, CeedElemRestriction *rstr); +CEED_EXTERN int CeedElemRestrictionCreateOriented(Ceed ceed, CeedInt num_elem, CeedInt elem_size, CeedInt num_comp, CeedInt comp_stride, + CeedSize l_size, CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *offsets, + const bool *orient, CeedElemRestriction *rstr); +CEED_EXTERN int 
CeedElemRestrictionCreateStrided(Ceed ceed, CeedInt num_elem, CeedInt elem_size, CeedInt num_comp, CeedSize l_size, + const CeedInt strides[3], CeedElemRestriction *rstr); +CEED_EXTERN int CeedElemRestrictionCreateBlocked(Ceed ceed, CeedInt num_elem, CeedInt elem_size, CeedInt blk_size, CeedInt num_comp, + CeedInt comp_stride, CeedSize l_size, CeedMemType mem_type, CeedCopyMode copy_mode, + const CeedInt *offsets, CeedElemRestriction *rstr); +CEED_EXTERN int CeedElemRestrictionCreateBlockedStrided(Ceed ceed, CeedInt num_elem, CeedInt elem_size, CeedInt blk_size, CeedInt num_comp, + CeedSize l_size, const CeedInt strides[3], CeedElemRestriction *rstr); +CEED_EXTERN int CeedElemRestrictionReferenceCopy(CeedElemRestriction rstr, CeedElemRestriction *rstr_copy); +CEED_EXTERN int CeedElemRestrictionCreateVector(CeedElemRestriction rstr, CeedVector *lvec, CeedVector *evec); +CEED_EXTERN int CeedElemRestrictionApply(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector ru, CeedRequest *request); +CEED_EXTERN int CeedElemRestrictionApplyBlock(CeedElemRestriction rstr, CeedInt block, CeedTransposeMode t_mode, CeedVector u, CeedVector ru, + CeedRequest *request); +CEED_EXTERN int CeedElemRestrictionGetCeed(CeedElemRestriction rstr, Ceed *ceed); +CEED_EXTERN int CeedElemRestrictionGetCompStride(CeedElemRestriction rstr, CeedInt *comp_stride); +CEED_EXTERN int CeedElemRestrictionGetNumElements(CeedElemRestriction rstr, CeedInt *num_elem); +CEED_EXTERN int CeedElemRestrictionGetElementSize(CeedElemRestriction rstr, CeedInt *elem_size); +CEED_EXTERN int CeedElemRestrictionGetLVectorSize(CeedElemRestriction rstr, CeedSize *l_size); +CEED_EXTERN int CeedElemRestrictionGetNumComponents(CeedElemRestriction rstr, CeedInt *num_comp); +CEED_EXTERN int CeedElemRestrictionGetNumBlocks(CeedElemRestriction rstr, CeedInt *num_blk); +CEED_EXTERN int CeedElemRestrictionGetBlockSize(CeedElemRestriction rstr, CeedInt *blk_size); +CEED_EXTERN int 
CeedElemRestrictionGetMultiplicity(CeedElemRestriction rstr, CeedVector mult); CEED_EXTERN int CeedElemRestrictionView(CeedElemRestriction rstr, FILE *stream); CEED_EXTERN int CeedElemRestrictionDestroy(CeedElemRestriction *rstr); @@ -371,15 +335,15 @@ CEED_EXTERN int CeedElemRestrictionDestroy(CeedElemRestriction *rstr); typedef enum { /// Perform no evaluation (either because there is no data or it is already at /// quadrature points) - CEED_EVAL_NONE = 0, + CEED_EVAL_NONE = 0, /// Interpolate from nodes to quadrature points CEED_EVAL_INTERP = 1, /// Evaluate gradients at quadrature points from input in a nodal basis - CEED_EVAL_GRAD = 2, + CEED_EVAL_GRAD = 2, /// Evaluate divergence at quadrature points from input in a nodal basis - CEED_EVAL_DIV = 4, + CEED_EVAL_DIV = 4, /// Evaluate curl at quadrature points from input in a nodal basis - CEED_EVAL_CURL = 8, + CEED_EVAL_CURL = 8, /// Using no input, evaluate quadrature weights on the reference element CEED_EVAL_WEIGHT = 16, } CeedEvalMode; @@ -389,7 +353,7 @@ CEED_EXTERN const char *const CeedEvalModes[]; /// @ingroup CeedBasis typedef enum { /// Gauss-Legendre quadrature - CEED_GAUSS = 0, + CEED_GAUSS = 0, /// Gauss-Legendre-Lobatto quadrature CEED_GAUSS_LOBATTO = 1, } CeedQuadMode; @@ -402,51 +366,34 @@ CEED_EXTERN const char *const CeedQuadModes[]; /// @ingroup CeedBasis typedef enum { /// Line - CEED_TOPOLOGY_LINE = 1 << 16 | 0, + CEED_TOPOLOGY_LINE = 1 << 16 | 0, /// Triangle - 2D shape CEED_TOPOLOGY_TRIANGLE = 2 << 16 | 1, /// Quadralateral - 2D shape - CEED_TOPOLOGY_QUAD = 2 << 16 | 2, + CEED_TOPOLOGY_QUAD = 2 << 16 | 2, /// Tetrahedron - 3D shape - CEED_TOPOLOGY_TET = 3 << 16 | 3, + CEED_TOPOLOGY_TET = 3 << 16 | 3, /// Pyramid - 3D shape - CEED_TOPOLOGY_PYRAMID = 3 << 16 | 4, + CEED_TOPOLOGY_PYRAMID = 3 << 16 | 4, /// Prism - 3D shape - CEED_TOPOLOGY_PRISM = 3 << 16 | 5, + CEED_TOPOLOGY_PRISM = 3 << 16 | 5, /// Hexehedron - 3D shape - CEED_TOPOLOGY_HEX = 3 << 16 | 6, + CEED_TOPOLOGY_HEX = 3 << 16 | 6, } 
CeedElemTopology; CEED_EXTERN const char *const CeedElemTopologies[]; -CEED_EXTERN int CeedBasisCreateTensorH1Lagrange(Ceed ceed, CeedInt dim, - CeedInt num_comp, CeedInt P, CeedInt Q, CeedQuadMode quad_mode, CeedBasis *basis); -CEED_EXTERN int CeedBasisCreateTensorH1(Ceed ceed, CeedInt dim, CeedInt num_comp, - CeedInt P_1d, CeedInt Q_1d, - const CeedScalar *interp_1d, - const CeedScalar *grad_1d, - const CeedScalar *q_ref_1d, - const CeedScalar *q_weight_1d, - CeedBasis *basis); -CEED_EXTERN int CeedBasisCreateH1(Ceed ceed, CeedElemTopology topo, - CeedInt num_comp, - CeedInt num_nodes, CeedInt nqpts, - const CeedScalar *interp, - const CeedScalar *grad, - const CeedScalar *q_ref, - const CeedScalar *q_weights, CeedBasis *basis); -CEED_EXTERN int CeedBasisCreateHdiv(Ceed ceed, CeedElemTopology topo, - CeedInt num_comp, - CeedInt num_nodes, CeedInt nqpts, - const CeedScalar *interp, - const CeedScalar *div, - const CeedScalar *q_ref, - const CeedScalar *q_weights, CeedBasis *basis); +CEED_EXTERN int CeedBasisCreateTensorH1Lagrange(Ceed ceed, CeedInt dim, CeedInt num_comp, CeedInt P, CeedInt Q, CeedQuadMode quad_mode, + CeedBasis *basis); +CEED_EXTERN int CeedBasisCreateTensorH1(Ceed ceed, CeedInt dim, CeedInt num_comp, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, + const CeedScalar *grad_1d, const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis *basis); +CEED_EXTERN int CeedBasisCreateH1(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, CeedInt num_nodes, CeedInt nqpts, const CeedScalar *interp, + const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weights, CeedBasis *basis); +CEED_EXTERN int CeedBasisCreateHdiv(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, CeedInt num_nodes, CeedInt nqpts, const CeedScalar *interp, + const CeedScalar *div, const CeedScalar *q_ref, const CeedScalar *q_weights, CeedBasis *basis); CEED_EXTERN int CeedBasisCreateProjection(CeedBasis basis_from, CeedBasis basis_to, CeedBasis 
*basis_project); CEED_EXTERN int CeedBasisReferenceCopy(CeedBasis basis, CeedBasis *basis_copy); CEED_EXTERN int CeedBasisView(CeedBasis basis, FILE *stream); -CEED_EXTERN int CeedBasisApply(CeedBasis basis, CeedInt num_elem, - CeedTransposeMode t_mode, - CeedEvalMode eval_mode, CeedVector u, CeedVector v); +CEED_EXTERN int CeedBasisApply(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v); CEED_EXTERN int CeedBasisGetCeed(CeedBasis basis, Ceed *ceed); CEED_EXTERN int CeedBasisGetDimension(CeedBasis basis, CeedInt *dim); CEED_EXTERN int CeedBasisGetTopology(CeedBasis basis, CeedElemTopology *topo); @@ -455,29 +402,21 @@ CEED_EXTERN int CeedBasisGetNumComponents(CeedBasis basis, CeedInt *num_comp); CEED_EXTERN int CeedBasisGetNumNodes(CeedBasis basis, CeedInt *P); CEED_EXTERN int CeedBasisGetNumNodes1D(CeedBasis basis, CeedInt *P_1d); CEED_EXTERN int CeedBasisGetNumQuadraturePoints(CeedBasis basis, CeedInt *Q); -CEED_EXTERN int CeedBasisGetNumQuadraturePoints1D(CeedBasis basis, - CeedInt *Q_1d); +CEED_EXTERN int CeedBasisGetNumQuadraturePoints1D(CeedBasis basis, CeedInt *Q_1d); CEED_EXTERN int CeedBasisGetQRef(CeedBasis basis, const CeedScalar **q_ref); -CEED_EXTERN int CeedBasisGetQWeights(CeedBasis basis, - const CeedScalar **q_weights); +CEED_EXTERN int CeedBasisGetQWeights(CeedBasis basis, const CeedScalar **q_weights); CEED_EXTERN int CeedBasisGetInterp(CeedBasis basis, const CeedScalar **interp); -CEED_EXTERN int CeedBasisGetInterp1D(CeedBasis basis, - const CeedScalar **interp_1d); +CEED_EXTERN int CeedBasisGetInterp1D(CeedBasis basis, const CeedScalar **interp_1d); CEED_EXTERN int CeedBasisGetGrad(CeedBasis basis, const CeedScalar **grad); CEED_EXTERN int CeedBasisGetGrad1D(CeedBasis basis, const CeedScalar **grad_1d); CEED_EXTERN int CeedBasisGetDiv(CeedBasis basis, const CeedScalar **div); CEED_EXTERN int CeedBasisDestroy(CeedBasis *basis); -CEED_EXTERN int CeedGaussQuadrature(CeedInt Q, 
CeedScalar *q_ref_1d, - CeedScalar *q_weight_1d); -CEED_EXTERN int CeedLobattoQuadrature(CeedInt Q, CeedScalar *q_ref_1d, - CeedScalar *q_weight_1d); -CEED_EXTERN int CeedQRFactorization(Ceed ceed, CeedScalar *mat, CeedScalar *tau, - CeedInt m, CeedInt n); -CEED_EXTERN int CeedSymmetricSchurDecomposition(Ceed ceed, CeedScalar *mat, - CeedScalar *lambda, CeedInt n); -CEED_EXTERN int CeedSimultaneousDiagonalization(Ceed ceed, CeedScalar *mat_A, - CeedScalar *mat_B, CeedScalar *x, CeedScalar *lambda, CeedInt n); +CEED_EXTERN int CeedGaussQuadrature(CeedInt Q, CeedScalar *q_ref_1d, CeedScalar *q_weight_1d); +CEED_EXTERN int CeedLobattoQuadrature(CeedInt Q, CeedScalar *q_ref_1d, CeedScalar *q_weight_1d); +CEED_EXTERN int CeedQRFactorization(Ceed ceed, CeedScalar *mat, CeedScalar *tau, CeedInt m, CeedInt n); +CEED_EXTERN int CeedSymmetricSchurDecomposition(Ceed ceed, CeedScalar *mat, CeedScalar *lambda, CeedInt n); +CEED_EXTERN int CeedSimultaneousDiagonalization(Ceed ceed, CeedScalar *mat_A, CeedScalar *mat_B, CeedScalar *x, CeedScalar *lambda, CeedInt n); /** Handle for the user provided CeedQFunction callback function @@ -498,42 +437,27 @@ CEED_EXTERN int CeedSimultaneousDiagonalization(Ceed ceed, CeedScalar *mat_A, @ingroup CeedQFunction **/ -typedef int (*CeedQFunctionUser)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out); - -CEED_EXTERN int CeedQFunctionCreateInterior(Ceed ceed, CeedInt vec_length, - CeedQFunctionUser f, const char *source, CeedQFunction *qf); -CEED_EXTERN int CeedQFunctionCreateInteriorByName(Ceed ceed, const char *name, - CeedQFunction *qf); -CEED_EXTERN int CeedQFunctionCreateIdentity(Ceed ceed, CeedInt size, - CeedEvalMode in_mode, CeedEvalMode out_mode, CeedQFunction *qf); +typedef int (*CeedQFunctionUser)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out); + +CEED_EXTERN int CeedQFunctionCreateInterior(Ceed ceed, CeedInt vec_length, CeedQFunctionUser f, const char *source, 
CeedQFunction *qf); +CEED_EXTERN int CeedQFunctionCreateInteriorByName(Ceed ceed, const char *name, CeedQFunction *qf); +CEED_EXTERN int CeedQFunctionCreateIdentity(Ceed ceed, CeedInt size, CeedEvalMode in_mode, CeedEvalMode out_mode, CeedQFunction *qf); CEED_EXTERN int CeedQFunctionReferenceCopy(CeedQFunction qf, CeedQFunction *qf_copy); -CEED_EXTERN int CeedQFunctionAddInput(CeedQFunction qf, const char *field_name, - CeedInt size, CeedEvalMode eval_mode); -CEED_EXTERN int CeedQFunctionAddOutput(CeedQFunction qf, const char *field_name, - CeedInt size, CeedEvalMode eval_mode); -CEED_EXTERN int CeedQFunctionGetFields(CeedQFunction qf, - CeedInt *num_input_fields, - CeedQFunctionField **input_fields, - CeedInt *num_output_fields, +CEED_EXTERN int CeedQFunctionAddInput(CeedQFunction qf, const char *field_name, CeedInt size, CeedEvalMode eval_mode); +CEED_EXTERN int CeedQFunctionAddOutput(CeedQFunction qf, const char *field_name, CeedInt size, CeedEvalMode eval_mode); +CEED_EXTERN int CeedQFunctionGetFields(CeedQFunction qf, CeedInt *num_input_fields, CeedQFunctionField **input_fields, CeedInt *num_output_fields, CeedQFunctionField **output_fields); -CEED_EXTERN int CeedQFunctionSetContext(CeedQFunction qf, - CeedQFunctionContext ctx); +CEED_EXTERN int CeedQFunctionSetContext(CeedQFunction qf, CeedQFunctionContext ctx); CEED_EXTERN int CeedQFunctionSetContextWritable(CeedQFunction qf, bool is_writable); CEED_EXTERN int CeedQFunctionSetUserFlopsEstimate(CeedQFunction qf, CeedSize flops); CEED_EXTERN int CeedQFunctionView(CeedQFunction qf, FILE *stream); CEED_EXTERN int CeedQFunctionGetCeed(CeedQFunction qf, Ceed *ceed); -CEED_EXTERN int CeedQFunctionApply(CeedQFunction qf, CeedInt Q, - CeedVector *u, CeedVector *v); +CEED_EXTERN int CeedQFunctionApply(CeedQFunction qf, CeedInt Q, CeedVector *u, CeedVector *v); CEED_EXTERN int CeedQFunctionDestroy(CeedQFunction *qf); -CEED_EXTERN int CeedQFunctionFieldGetName(CeedQFunctionField qf_field, - char **field_name); 
-CEED_EXTERN int CeedQFunctionFieldGetSize(CeedQFunctionField qf_field, - CeedInt *size); -CEED_EXTERN int CeedQFunctionFieldGetEvalMode(CeedQFunctionField qf_field, - CeedEvalMode *eval_mode); +CEED_EXTERN int CeedQFunctionFieldGetName(CeedQFunctionField qf_field, char **field_name); +CEED_EXTERN int CeedQFunctionFieldGetSize(CeedQFunctionField qf_field, CeedInt *size); +CEED_EXTERN int CeedQFunctionFieldGetEvalMode(CeedQFunctionField qf_field, CeedEvalMode *eval_mode); /// Denotes type of data stored in a CeedQFunctionContext field /// @ingroup CeedQFunction @@ -541,7 +465,7 @@ typedef enum { /// Double precision value CEED_CONTEXT_FIELD_DOUBLE = 1, /// 32 bit integer value - CEED_CONTEXT_FIELD_INT32 = 2, + CEED_CONTEXT_FIELD_INT32 = 2, } CeedContextFieldType; CEED_EXTERN const char *const CeedContextFieldTypes[]; @@ -555,116 +479,74 @@ CEED_EXTERN const char *const CeedContextFieldTypes[]; **/ typedef int (*CeedQFunctionContextDataDestroyUser)(void *data); -CEED_EXTERN int CeedQFunctionContextCreate(Ceed ceed, - CeedQFunctionContext *ctx); -CEED_EXTERN int CeedQFunctionContextReferenceCopy(CeedQFunctionContext ctx, - CeedQFunctionContext *ctx_copy); -CEED_EXTERN int CeedQFunctionContextSetData(CeedQFunctionContext ctx, - CeedMemType mem_type, CeedCopyMode copy_mode, size_t size, void *data); -CEED_EXTERN int CeedQFunctionContextTakeData(CeedQFunctionContext ctx, - CeedMemType mem_type, void *data); -CEED_EXTERN int CeedQFunctionContextGetData(CeedQFunctionContext ctx, - CeedMemType mem_type, void *data); -CEED_EXTERN int CeedQFunctionContextGetDataRead(CeedQFunctionContext ctx, - CeedMemType mem_type, void *data); -CEED_EXTERN int CeedQFunctionContextRestoreData(CeedQFunctionContext ctx, - void *data); -CEED_EXTERN int CeedQFunctionContextRestoreDataRead(CeedQFunctionContext ctx, - void *data); -CEED_EXTERN int CeedQFunctionContextRegisterDouble(CeedQFunctionContext ctx, - const char *field_name, size_t field_offset, size_t num_values, - const char 
*field_description); -CEED_EXTERN int CeedQFunctionContextRegisterInt32(CeedQFunctionContext ctx, - const char *field_name, size_t field_offset, size_t num_values, - const char *field_description); -CEED_EXTERN int CeedQFunctionContextGetAllFieldLabels(CeedQFunctionContext ctx, - const CeedContextFieldLabel **field_labels, CeedInt *num_fields); -CEED_EXTERN int CeedContextFieldLabelGetDescription(CeedContextFieldLabel label, - const char **field_name, const char **field_description, size_t *num_values, - CeedContextFieldType *field_type); -CEED_EXTERN int CeedQFunctionContextGetContextSize(CeedQFunctionContext ctx, - size_t *ctx_size); -CEED_EXTERN int CeedQFunctionContextView(CeedQFunctionContext ctx, - FILE *stream); -CEED_EXTERN int CeedQFunctionContextSetDataDestroy(CeedQFunctionContext ctx, - CeedMemType f_mem_type, CeedQFunctionContextDataDestroyUser f); +CEED_EXTERN int CeedQFunctionContextCreate(Ceed ceed, CeedQFunctionContext *ctx); +CEED_EXTERN int CeedQFunctionContextReferenceCopy(CeedQFunctionContext ctx, CeedQFunctionContext *ctx_copy); +CEED_EXTERN int CeedQFunctionContextSetData(CeedQFunctionContext ctx, CeedMemType mem_type, CeedCopyMode copy_mode, size_t size, void *data); +CEED_EXTERN int CeedQFunctionContextTakeData(CeedQFunctionContext ctx, CeedMemType mem_type, void *data); +CEED_EXTERN int CeedQFunctionContextGetData(CeedQFunctionContext ctx, CeedMemType mem_type, void *data); +CEED_EXTERN int CeedQFunctionContextGetDataRead(CeedQFunctionContext ctx, CeedMemType mem_type, void *data); +CEED_EXTERN int CeedQFunctionContextRestoreData(CeedQFunctionContext ctx, void *data); +CEED_EXTERN int CeedQFunctionContextRestoreDataRead(CeedQFunctionContext ctx, void *data); +CEED_EXTERN int CeedQFunctionContextRegisterDouble(CeedQFunctionContext ctx, const char *field_name, size_t field_offset, size_t num_values, + const char *field_description); +CEED_EXTERN int CeedQFunctionContextRegisterInt32(CeedQFunctionContext ctx, const char *field_name, size_t 
field_offset, size_t num_values, + const char *field_description); +CEED_EXTERN int CeedQFunctionContextGetAllFieldLabels(CeedQFunctionContext ctx, const CeedContextFieldLabel **field_labels, CeedInt *num_fields); +CEED_EXTERN int CeedContextFieldLabelGetDescription(CeedContextFieldLabel label, const char **field_name, const char **field_description, + size_t *num_values, CeedContextFieldType *field_type); +CEED_EXTERN int CeedQFunctionContextGetContextSize(CeedQFunctionContext ctx, size_t *ctx_size); +CEED_EXTERN int CeedQFunctionContextView(CeedQFunctionContext ctx, FILE *stream); +CEED_EXTERN int CeedQFunctionContextSetDataDestroy(CeedQFunctionContext ctx, CeedMemType f_mem_type, CeedQFunctionContextDataDestroyUser f); CEED_EXTERN int CeedQFunctionContextDestroy(CeedQFunctionContext *ctx); -CEED_EXTERN int CeedOperatorCreate(Ceed ceed, CeedQFunction qf, - CeedQFunction dqf, CeedQFunction dqfT, - CeedOperator *op); +CEED_EXTERN int CeedOperatorCreate(Ceed ceed, CeedQFunction qf, CeedQFunction dqf, CeedQFunction dqfT, CeedOperator *op); CEED_EXTERN int CeedCompositeOperatorCreate(Ceed ceed, CeedOperator *op); CEED_EXTERN int CeedOperatorReferenceCopy(CeedOperator op, CeedOperator *op_copy); -CEED_EXTERN int CeedOperatorSetField(CeedOperator op, const char *field_name, - CeedElemRestriction r, CeedBasis b, - CeedVector v); -CEED_EXTERN int CeedOperatorGetFields(CeedOperator op, - CeedInt *num_input_fields, - CeedOperatorField **input_fields, - CeedInt *num_output_fields, +CEED_EXTERN int CeedOperatorSetField(CeedOperator op, const char *field_name, CeedElemRestriction r, CeedBasis b, CeedVector v); +CEED_EXTERN int CeedOperatorGetFields(CeedOperator op, CeedInt *num_input_fields, CeedOperatorField **input_fields, CeedInt *num_output_fields, CeedOperatorField **output_fields); -CEED_EXTERN int CeedCompositeOperatorAddSub(CeedOperator composite_op, - CeedOperator sub_op); +CEED_EXTERN int CeedCompositeOperatorAddSub(CeedOperator composite_op, CeedOperator sub_op); 
CEED_EXTERN int CeedOperatorCheckReady(CeedOperator op); CEED_EXTERN int CeedOperatorGetActiveVectorLengths(CeedOperator op, CeedSize *input_size, CeedSize *output_size); CEED_EXTERN int CeedOperatorSetQFunctionAssemblyReuse(CeedOperator op, bool reuse_assembly_data); CEED_EXTERN int CeedOperatorSetQFunctionAssemblyDataUpdateNeeded(CeedOperator op, bool needs_data_update); -CEED_EXTERN int CeedOperatorLinearAssembleQFunction(CeedOperator op, - CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request); -CEED_EXTERN int CeedOperatorLinearAssembleQFunctionBuildOrUpdate(CeedOperator op, - CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request); -CEED_EXTERN int CeedOperatorLinearAssembleDiagonal(CeedOperator op, - CeedVector assembled, CeedRequest *request); -CEED_EXTERN int CeedOperatorLinearAssembleAddDiagonal(CeedOperator op, - CeedVector assembled, CeedRequest *request); -CEED_EXTERN int CeedOperatorLinearAssemblePointBlockDiagonal(CeedOperator op, - CeedVector assembled, CeedRequest *request); -CEED_EXTERN int CeedOperatorLinearAssembleAddPointBlockDiagonal(CeedOperator op, - CeedVector assembled, CeedRequest *request); -CEED_EXTERN int CeedOperatorLinearAssembleSymbolic(CeedOperator op, - CeedSize *num_entries, CeedInt **rows, CeedInt **cols); +CEED_EXTERN int CeedOperatorLinearAssembleQFunction(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request); +CEED_EXTERN int CeedOperatorLinearAssembleQFunctionBuildOrUpdate(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, + CeedRequest *request); +CEED_EXTERN int CeedOperatorLinearAssembleDiagonal(CeedOperator op, CeedVector assembled, CeedRequest *request); +CEED_EXTERN int CeedOperatorLinearAssembleAddDiagonal(CeedOperator op, CeedVector assembled, CeedRequest *request); +CEED_EXTERN int CeedOperatorLinearAssemblePointBlockDiagonal(CeedOperator op, CeedVector assembled, CeedRequest *request); +CEED_EXTERN int 
CeedOperatorLinearAssembleAddPointBlockDiagonal(CeedOperator op, CeedVector assembled, CeedRequest *request); +CEED_EXTERN int CeedOperatorLinearAssembleSymbolic(CeedOperator op, CeedSize *num_entries, CeedInt **rows, CeedInt **cols); CEED_EXTERN int CeedOperatorLinearAssemble(CeedOperator op, CeedVector values); -CEED_EXTERN int CeedOperatorMultigridLevelCreate(CeedOperator op_fine, - CeedVector p_mult_fine, CeedElemRestriction rstr_coarse, CeedBasis basis_coarse, - CeedOperator *op_coarse, CeedOperator *op_prolong, CeedOperator *op_restrict); -CEED_EXTERN int CeedOperatorMultigridLevelCreateTensorH1( - CeedOperator op_fine, CeedVector p_mult_fine, CeedElemRestriction rstr_coarse, - CeedBasis basis_coarse, const CeedScalar *interp_c_to_f, CeedOperator *op_coarse, - CeedOperator *op_prolong, CeedOperator *op_restrict); -CEED_EXTERN int CeedOperatorMultigridLevelCreateH1(CeedOperator op_fine, - CeedVector p_mult_fine, CeedElemRestriction rstr_coarse, CeedBasis basis_coarse, - const CeedScalar *interp_c_to_f, CeedOperator *op_coarse, - CeedOperator *op_prolong, CeedOperator *op_restrict); -CEED_EXTERN int CeedOperatorCreateFDMElementInverse(CeedOperator op, - CeedOperator *fdm_inv, CeedRequest *request); +CEED_EXTERN int CeedOperatorMultigridLevelCreate(CeedOperator op_fine, CeedVector p_mult_fine, CeedElemRestriction rstr_coarse, + CeedBasis basis_coarse, CeedOperator *op_coarse, CeedOperator *op_prolong, + CeedOperator *op_restrict); +CEED_EXTERN int CeedOperatorMultigridLevelCreateTensorH1(CeedOperator op_fine, CeedVector p_mult_fine, CeedElemRestriction rstr_coarse, + CeedBasis basis_coarse, const CeedScalar *interp_c_to_f, CeedOperator *op_coarse, + CeedOperator *op_prolong, CeedOperator *op_restrict); +CEED_EXTERN int CeedOperatorMultigridLevelCreateH1(CeedOperator op_fine, CeedVector p_mult_fine, CeedElemRestriction rstr_coarse, + CeedBasis basis_coarse, const CeedScalar *interp_c_to_f, CeedOperator *op_coarse, + CeedOperator *op_prolong, CeedOperator 
*op_restrict); +CEED_EXTERN int CeedOperatorCreateFDMElementInverse(CeedOperator op, CeedOperator *fdm_inv, CeedRequest *request); CEED_EXTERN int CeedOperatorSetNumQuadraturePoints(CeedOperator op, CeedInt num_qpts); CEED_EXTERN int CeedOperatorSetName(CeedOperator op, const char *name); CEED_EXTERN int CeedOperatorView(CeedOperator op, FILE *stream); CEED_EXTERN int CeedOperatorGetCeed(CeedOperator op, Ceed *ceed); CEED_EXTERN int CeedOperatorGetNumElements(CeedOperator op, CeedInt *num_elem); -CEED_EXTERN int CeedOperatorGetNumQuadraturePoints(CeedOperator op, - CeedInt *num_qpts); +CEED_EXTERN int CeedOperatorGetNumQuadraturePoints(CeedOperator op, CeedInt *num_qpts); CEED_EXTERN int CeedOperatorGetFlopsEstimate(CeedOperator op, CeedSize *flops); -CEED_EXTERN int CeedOperatorContextGetFieldLabel(CeedOperator op, - const char *field_name, CeedContextFieldLabel *field_label); -CEED_EXTERN int CeedOperatorContextSetDouble(CeedOperator op, - CeedContextFieldLabel field_label, double *values); -CEED_EXTERN int CeedOperatorContextSetInt32(CeedOperator op, - CeedContextFieldLabel field_label, int *values); -CEED_EXTERN int CeedOperatorApply(CeedOperator op, CeedVector in, - CeedVector out, CeedRequest *request); -CEED_EXTERN int CeedOperatorApplyAdd(CeedOperator op, CeedVector in, - CeedVector out, CeedRequest *request); +CEED_EXTERN int CeedOperatorContextGetFieldLabel(CeedOperator op, const char *field_name, CeedContextFieldLabel *field_label); +CEED_EXTERN int CeedOperatorContextSetDouble(CeedOperator op, CeedContextFieldLabel field_label, double *values); +CEED_EXTERN int CeedOperatorContextSetInt32(CeedOperator op, CeedContextFieldLabel field_label, int *values); +CEED_EXTERN int CeedOperatorApply(CeedOperator op, CeedVector in, CeedVector out, CeedRequest *request); +CEED_EXTERN int CeedOperatorApplyAdd(CeedOperator op, CeedVector in, CeedVector out, CeedRequest *request); CEED_EXTERN int CeedOperatorDestroy(CeedOperator *op); -CEED_EXTERN int 
CeedOperatorFieldGetName(CeedOperatorField op_field, - char **field_name); -CEED_EXTERN int CeedOperatorFieldGetElemRestriction(CeedOperatorField op_field, - CeedElemRestriction *rstr); -CEED_EXTERN int CeedOperatorFieldGetBasis(CeedOperatorField op_field, - CeedBasis *basis); -CEED_EXTERN int CeedOperatorFieldGetVector(CeedOperatorField op_field, - CeedVector *vec); +CEED_EXTERN int CeedOperatorFieldGetName(CeedOperatorField op_field, char **field_name); +CEED_EXTERN int CeedOperatorFieldGetElemRestriction(CeedOperatorField op_field, CeedElemRestriction *rstr); +CEED_EXTERN int CeedOperatorFieldGetBasis(CeedOperatorField op_field, CeedBasis *basis); +CEED_EXTERN int CeedOperatorFieldGetVector(CeedOperatorField op_field, CeedVector *vec); /** @brief Return integer power diff --git a/include/ceed/cuda.h b/include/ceed/cuda.h index 3a22ec96cf..1bdd641ade 100644 --- a/include/ceed/cuda.h +++ b/include/ceed/cuda.h @@ -11,7 +11,6 @@ #include #include -CEED_EXTERN int CeedQFunctionSetCUDAUserFunction(CeedQFunction qf, - CUfunction f); +CEED_EXTERN int CeedQFunctionSetCUDAUserFunction(CeedQFunction qf, CUfunction f); #endif diff --git a/include/ceed/hash.h b/include/ceed/hash.h index 769edce798..8cf0bcdddb 100644 --- a/include/ceed/hash.h +++ b/include/ceed/hash.h @@ -15,21 +15,21 @@ /* Required for khash <= 0.2.5 */ #if !defined(kcalloc) -#define kcalloc(N,Z) calloc(N,Z) +#define kcalloc(N, Z) calloc(N, Z) #endif #if !defined(kmalloc) #define kmalloc(Z) malloc(Z) #endif #if !defined(krealloc) -#define krealloc(P,Z) realloc(P,Z) +#define krealloc(P, Z) realloc(P, Z) #endif #if !defined(kfree) #define kfree(P) free(P) #endif -#define CeedHashGetValue(ht,k,v) ((v) = kh_value((ht),(k))) +#define CeedHashGetValue(ht, k, v) ((v) = kh_value((ht), (k))) -#define CeedHashMissing(ht,k) ((k) == kh_end((ht))) +#define CeedHashMissing(ht, k) ((k) == kh_end((ht))) /* --- Thomas Wang integer hash functions --- */ @@ -40,17 +40,15 @@ typedef khint_t CeedHash_t; /* Thomas Wang's second 
version for 32bit integers */ static inline CeedHash_t CeedHash_UInt32(CeedHash32_t key) { key = ~key + (key << 15); /* key = (key << 15) - key - 1; */ - key = key ^ (key >> 12); - key = key + (key << 2); - key = key ^ (key >> 4); - key = key * 2057; /* key = (key + (key << 3)) + (key << 11); */ - key = key ^ (key >> 16); + key = key ^ (key >> 12); + key = key + (key << 2); + key = key ^ (key >> 4); + key = key * 2057; /* key = (key + (key << 3)) + (key << 11); */ + key = key ^ (key >> 16); return key; } -static inline CeedHash_t CeedHashInt(CeedInt key) { - return CeedHash_UInt32((CeedHash32_t)key); -} +static inline CeedHash_t CeedHashInt(CeedInt key) { return CeedHash_UInt32((CeedHash32_t)key); } static inline CeedHash_t CeedHashCombine(CeedHash_t seed, CeedHash_t hash) { /* https://doi.org/10.1002/asi.10170 */ @@ -59,52 +57,47 @@ static inline CeedHash_t CeedHashCombine(CeedHash_t seed, CeedHash_t hash) { } // IJ - two keys -typedef struct _CeedHashIJKey { CeedInt i, j; } CeedHashIJKey; -#define CeedHashIJKeyHash(key) \ - CeedHashCombine(CeedHashInt((key).i),CeedHashInt((key).j)) +typedef struct _CeedHashIJKey { + CeedInt i, j; +} CeedHashIJKey; +#define CeedHashIJKeyHash(key) CeedHashCombine(CeedHashInt((key).i), CeedHashInt((key).j)) -#define CeedHashIJKeyEqual(k1,k2) \ - ((k1).i==(k2).i && (k1).j==(k2).j) +#define CeedHashIJKeyEqual(k1, k2) ((k1).i == (k2).i && (k1).j == (k2).j) -#define CeedHashIJInit(name, value) \ - KHASH_INIT(name,CeedHashIJKey,value,1,CeedHashIJKeyHash,CeedHashIJKeyEqual) +#define CeedHashIJInit(name, value) KHASH_INIT(name, CeedHashIJKey, value, 1, CeedHashIJKeyHash, CeedHashIJKeyEqual) -typedef struct _CeedHashIJKKey { CeedInt i, j, k; } CeedHashIJKKey; -#define CeedHashIJKKeyHash(key) \ - CeedHashCombine(CeedHashCombine(CeedHashInt((key).i),CeedHashInt((key).j)), \ - CeedHashInt((key).k)) +typedef struct _CeedHashIJKKey { + CeedInt i, j, k; +} CeedHashIJKKey; +#define CeedHashIJKKeyHash(key) 
CeedHashCombine(CeedHashCombine(CeedHashInt((key).i), CeedHashInt((key).j)), CeedHashInt((key).k)) // IJK - three keys -#define CeedHashIJKKeyEqual(k1,k2) \ - ((k1).i==(k2).i && (k1).j==(k2).j && (k1).k==(k2).k) +#define CeedHashIJKKeyEqual(k1, k2) ((k1).i == (k2).i && (k1).j == (k2).j && (k1).k == (k2).k) -#define CeedHashIJKInit(name, value) \ - KHASH_INIT(name,CeedHashIJKKey,value,1,CeedHashIJKKeyHash,CeedHashIJKKeyEqual) +#define CeedHashIJKInit(name, value) KHASH_INIT(name, CeedHashIJKKey, value, 1, CeedHashIJKKeyHash, CeedHashIJKKeyEqual) // IJKL - four keys -typedef struct _CeedHashIJKLKey { CeedInt i, j, k, l; } CeedHashIJKLKey; +typedef struct _CeedHashIJKLKey { + CeedInt i, j, k, l; +} CeedHashIJKLKey; #define CeedHashIJKLKeyHash(key) \ - CeedHashCombine(CeedHashCombine(CeedHashInt((key).i),CeedHashInt((key).j)), \ - CeedHashCombine(CeedHashInt((key).k),CeedHashInt((key).l))) + CeedHashCombine(CeedHashCombine(CeedHashInt((key).i), CeedHashInt((key).j)), CeedHashCombine(CeedHashInt((key).k), CeedHashInt((key).l))) -#define CeedHashIJKLKeyEqual(k1,k2) \ - ((k1).i==(k2).i && (k1).j==(k2).j && (k1).k==(k2).k && (k1).l==(k2).l) +#define CeedHashIJKLKeyEqual(k1, k2) ((k1).i == (k2).i && (k1).j == (k2).j && (k1).k == (k2).k && (k1).l == (k2).l) -#define CeedHashIJKLInit(name, value) \ - KHASH_INIT(name,CeedHashIJKLKey,value,1,CeedHashIJKLKeyHash,CeedHashIJKLKeyEqual) +#define CeedHashIJKLInit(name, value) KHASH_INIT(name, CeedHashIJKLKey, value, 1, CeedHashIJKLKeyHash, CeedHashIJKLKeyEqual) // IJKLM - five keys -typedef struct _CeedHashIJKLMKey { CeedInt i, j, k, l, m; } CeedHashIJKLMKey; -#define CeedHashIJKLMKeyHash(key) \ - CeedHashCombine( \ - CeedHashCombine(CeedHashCombine(CeedHashInt((key).i),CeedHashInt((key).j)), \ - CeedHashCombine(CeedHashInt((key).k),CeedHashInt((key).l))), \ - CeedHashInt((key).m)) +typedef struct _CeedHashIJKLMKey { + CeedInt i, j, k, l, m; +} CeedHashIJKLMKey; +#define CeedHashIJKLMKeyHash(key) \ + CeedHashCombine( \ + 
CeedHashCombine(CeedHashCombine(CeedHashInt((key).i), CeedHashInt((key).j)), CeedHashCombine(CeedHashInt((key).k), CeedHashInt((key).l))), \ + CeedHashInt((key).m)) -#define CeedHashIJKLMKeyEqual(k1,k2) \ - ((k1).i==(k2).i && (k1).j==(k2).j && (k1).k==(k2).k && (k1).l==(k2).l && (k1).m==(k2).m) +#define CeedHashIJKLMKeyEqual(k1, k2) ((k1).i == (k2).i && (k1).j == (k2).j && (k1).k == (k2).k && (k1).l == (k2).l && (k1).m == (k2).m) -#define CeedHashIJKLMInit(name, value) \ - KHASH_INIT(name,CeedHashIJKLMKey,value,1,CeedHashIJKLMKeyHash,CeedHashIJKLMKeyEqual) +#define CeedHashIJKLMInit(name, value) KHASH_INIT(name, CeedHashIJKLMKey, value, 1, CeedHashIJKLMKeyHash, CeedHashIJKLMKeyEqual) -#endif // _ceed_hash_h +#endif // _ceed_hash_h diff --git a/include/ceed/hip.h b/include/ceed/hip.h index 1b3419ca4a..8db995c093 100644 --- a/include/ceed/hip.h +++ b/include/ceed/hip.h @@ -11,7 +11,6 @@ #include #include -CEED_EXTERN int CeedQFunctionSetHIPUserFunction(CeedQFunction qf, - hipFunction_t f); +CEED_EXTERN int CeedQFunctionSetHIPUserFunction(CeedQFunction qf, hipFunction_t f); #endif diff --git a/include/ceed/jit-source/cuda/cuda-atomic-add-fallback.h b/include/ceed/jit-source/cuda/cuda-atomic-add-fallback.h index 448ac31cda..365fd766f6 100644 --- a/include/ceed/jit-source/cuda/cuda-atomic-add-fallback.h +++ b/include/ceed/jit-source/cuda/cuda-atomic-add-fallback.h @@ -17,13 +17,10 @@ //------------------------------------------------------------------------------ __device__ CeedScalar atomicAdd(CeedScalar *address, CeedScalar val) { unsigned long long int *address_as_ull = (unsigned long long int *)address; - unsigned long long int old = *address_as_ull, assumed; + unsigned long long int old = *address_as_ull, assumed; do { assumed = old; - old = - atomicCAS(address_as_ull, assumed, - __double_as_longlong(val + - __longlong_as_double(assumed))); + old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); // Note: uses integer 
comparison to avoid hang in case of NaN // (since NaN != NaN) } while (assumed != old); diff --git a/include/ceed/jit-source/cuda/cuda-gen-templates.h b/include/ceed/jit-source/cuda/cuda-gen-templates.h index 0dafacd2c1..621b3f6439 100644 --- a/include/ceed/jit-source/cuda/cuda-gen-templates.h +++ b/include/ceed/jit-source/cuda/cuda-gen-templates.h @@ -17,8 +17,7 @@ //------------------------------------------------------------------------------ template inline __device__ void loadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) { - for (CeedInt i = data.t_id; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z) - B[i] = d_B[i]; + for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i]; } //------------------------------------------------------------------------------ @@ -29,12 +28,12 @@ inline __device__ void loadMatrix(SharedData_Cuda &data, const CeedScalar *__res // L-vector -> E-vector, offsets provided //------------------------------------------------------------------------------ template -inline __device__ void readDofsOffset1d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { +inline __device__ void readDofsOffset1d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, + const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { if (data.t_id_x < P1d) { const CeedInt node = data.t_id_x; - const CeedInt ind = indices[node + elem * P1d]; - for (CeedInt comp = 0; comp < NCOMP; ++comp) - r_u[comp] = d_u[ind + COMPSTRIDE * comp]; + const CeedInt ind = indices[node + elem * P1d]; + for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[comp] = d_u[ind + COMPSTRIDE * comp]; } } @@ -45,9 +44,8 @@ template L-vector, offsets provided //------------------------------------------------------------------------------ template -inline __device__ void 
writeDofsOffset1d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) { +inline __device__ void writeDofsOffset1d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, + const CeedScalar *r_v, CeedScalar *d_v) { if (data.t_id_x < P1d) { const CeedInt node = data.t_id_x; - const CeedInt ind = indices[node + elem * P1d]; - for (CeedInt comp = 0; comp < NCOMP; ++comp) - atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); + const CeedInt ind = indices[node + elem * P1d]; + for (CeedInt comp = 0; comp < NCOMP; ++comp) atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); } } @@ -71,9 +69,8 @@ template E-vector, offsets provided //------------------------------------------------------------------------------ template -inline __device__ void readDofsOffset2d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { +inline __device__ void readDofsOffset2d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, + const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { if (data.t_id_x < P1d && data.t_id_y < P1d) { - const CeedInt node = data.t_id_x + data.t_id_y*P1d; - const CeedInt ind = indices[node + elem * P1d*P1d]; - for (CeedInt comp = 0; comp < NCOMP; ++comp) - r_u[comp] = d_u[ind + COMPSTRIDE * comp]; + const CeedInt node = data.t_id_x + data.t_id_y * P1d; + const CeedInt ind = indices[node + elem * P1d * P1d]; + for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[comp] = d_u[ind + COMPSTRIDE * comp]; } } @@ -100,10 +97,9 @@ inline __device__ void readDofsOffset2d(SharedData_Cuda &data, const CeedInt nno template inline __device__ void readDofsStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { if (data.t_id_x < P1d && data.t_id_y < P1d) 
{ - const CeedInt node = data.t_id_x + data.t_id_y*P1d; - const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; - for (CeedInt comp = 0; comp < NCOMP; ++comp) - r_u[comp] = d_u[ind + comp * STRIDES_COMP]; + const CeedInt node = data.t_id_x + data.t_id_y * P1d; + const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; + for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[comp] = d_u[ind + comp * STRIDES_COMP]; } } @@ -111,12 +107,12 @@ inline __device__ void readDofsStrided2d(SharedData_Cuda &data, const CeedInt el // E-vector -> L-vector, offsets provided //------------------------------------------------------------------------------ template -inline __device__ void writeDofsOffset2d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) { +inline __device__ void writeDofsOffset2d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, + const CeedScalar *r_v, CeedScalar *d_v) { if (data.t_id_x < P1d && data.t_id_y < P1d) { - const CeedInt node = data.t_id_x + data.t_id_y*P1d; - const CeedInt ind = indices[node + elem * P1d*P1d]; - for (CeedInt comp = 0; comp < NCOMP; ++comp) - atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); + const CeedInt node = data.t_id_x + data.t_id_y * P1d; + const CeedInt ind = indices[node + elem * P1d * P1d]; + for (CeedInt comp = 0; comp < NCOMP; ++comp) atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); } } @@ -126,10 +122,9 @@ inline __device__ void writeDofsOffset2d(SharedData_Cuda &data, const CeedInt nn template inline __device__ void writeDofsStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) { if (data.t_id_x < P1d && data.t_id_y < P1d) { - const CeedInt node = data.t_id_x + data.t_id_y*P1d; - const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; - for (CeedInt comp = 0; comp < NCOMP; ++comp) - d_v[ind + comp * STRIDES_COMP] += 
r_v[comp]; + const CeedInt node = data.t_id_x + data.t_id_y * P1d; + const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; + for (CeedInt comp = 0; comp < NCOMP; ++comp) d_v[ind + comp * STRIDES_COMP] += r_v[comp]; } } @@ -148,13 +143,13 @@ inline __device__ void writeDofsStrided2d(SharedData_Cuda &data, const CeedInt e // - writeDofsOffset3d -> writeOffset3d ? // - writeDofsStrided3d -> writeStrided3d ? template -inline __device__ void readDofsOffset3d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { +inline __device__ void readDofsOffset3d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, + const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { if (data.t_id_x < P1d && data.t_id_y < P1d) for (CeedInt z = 0; z < P1d; ++z) { - const CeedInt node = data.t_id_x + data.t_id_y*P1d + z*P1d*P1d; - const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; - for (CeedInt comp = 0; comp < NCOMP; ++comp) - r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp]; + const CeedInt node = data.t_id_x + data.t_id_y * P1d + z * P1d * P1d; + const CeedInt ind = indices[node + elem * P1d * P1d * P1d]; + for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[z + comp * P1d] = d_u[ind + COMPSTRIDE * comp]; } } @@ -165,10 +160,9 @@ template Q-vector, offests provided //------------------------------------------------------------------------------ template -inline __device__ void readSliceQuadsOffset3d(SharedData_Cuda &data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { +inline __device__ void readSliceQuadsOffset3d(SharedData_Cuda &data, const CeedInt nquads, const CeedInt elem, const CeedInt q, + const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { if (data.t_id_x < Q1d && data.t_id_y < Q1d) { 
- const CeedInt node = data.t_id_x + data.t_id_y*Q1d + q*Q1d*Q1d; - const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];; - for (CeedInt comp = 0; comp < NCOMP; ++comp) - r_u[comp] = d_u[ind + COMPSTRIDE * comp]; + const CeedInt node = data.t_id_x + data.t_id_y * Q1d + q * Q1d * Q1d; + const CeedInt ind = indices[node + elem * Q1d * Q1d * Q1d]; + ; + for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[comp] = d_u[ind + COMPSTRIDE * comp]; } } @@ -189,12 +184,12 @@ inline __device__ void readSliceQuadsOffset3d(SharedData_Cuda &data, const CeedI // E-vector -> Q-vector, strided //------------------------------------------------------------------------------ template -inline __device__ void readSliceQuadsStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { +inline __device__ void readSliceQuadsStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u, + CeedScalar *r_u) { if (data.t_id_x < Q1d && data.t_id_y < Q1d) { - const CeedInt node = data.t_id_x + data.t_id_y*Q1d + q*Q1d*Q1d; - const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; - for (CeedInt comp = 0; comp < NCOMP; ++comp) - r_u[comp] = d_u[ind + comp * STRIDES_COMP]; + const CeedInt node = data.t_id_x + data.t_id_y * Q1d + q * Q1d * Q1d; + const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; + for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[comp] = d_u[ind + comp * STRIDES_COMP]; } } @@ -202,13 +197,13 @@ inline __device__ void readSliceQuadsStrided3d(SharedData_Cuda &data, const Ceed // E-vector -> L-vector, offsets provided //------------------------------------------------------------------------------ template -inline __device__ void writeDofsOffset3d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) { +inline __device__ void writeDofsOffset3d(SharedData_Cuda &data, const 
CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, + const CeedScalar *r_v, CeedScalar *d_v) { if (data.t_id_x < P1d && data.t_id_y < P1d) for (CeedInt z = 0; z < P1d; ++z) { - const CeedInt node = data.t_id_x + data.t_id_y*P1d + z*P1d*P1d; - const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; - for (CeedInt comp = 0; comp < NCOMP; ++comp) - atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]); + const CeedInt node = data.t_id_x + data.t_id_y * P1d + z * P1d * P1d; + const CeedInt ind = indices[node + elem * P1d * P1d * P1d]; + for (CeedInt comp = 0; comp < NCOMP; ++comp) atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z + comp * P1d]); } } @@ -219,10 +214,9 @@ template -inline __device__ void gradCollo3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +inline __device__ void gradCollo3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, + CeedScalar *__restrict__ r_V) { if (data.t_id_x < Q1d && data.t_id_y < Q1d) { for (CeedInt comp = 0; comp < NCOMP; ++comp) { - data.slice[data.t_id_x + data.t_id_y*T_1D] = r_U[q + comp*Q1d]; + data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q1d]; __syncthreads(); // X derivative - r_V[comp+0*NCOMP] = 0.0; + r_V[comp + 0 * NCOMP] = 0.0; for (CeedInt i = 0; i < Q1d; ++i) - r_V[comp+0*NCOMP] += c_G[i + data.t_id_x*Q1d] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction (X derivative) + r_V[comp + 0 * NCOMP] += c_G[i + data.t_id_x * Q1d] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction (X derivative) // Y derivative - r_V[comp+1*NCOMP] = 0.0; + r_V[comp + 1 * NCOMP] = 0.0; for (CeedInt i = 0; i < Q1d; ++i) - r_V[comp+1*NCOMP] += c_G[i + data.t_id_y*Q1d] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction (Y derivative) + r_V[comp + 1 * NCOMP] += c_G[i + data.t_id_y * Q1d] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 
(Y derivative) // Z derivative - r_V[comp+2*NCOMP] = 0.0; - for (CeedInt i = 0; i < Q1d; ++i) - r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative) + r_V[comp + 2 * NCOMP] = 0.0; + for (CeedInt i = 0; i < Q1d; ++i) r_V[comp + 2 * NCOMP] += c_G[i + q * Q1d] * r_U[i + comp * Q1d]; // Contract z direction (Z derivative) __syncthreads(); } } @@ -256,24 +250,25 @@ inline __device__ void gradCollo3d(SharedData_Cuda &data, const CeedInt q, const // 3D collocated derivatives transpose //------------------------------------------------------------------------------ template -inline __device__ void gradColloTranspose3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +inline __device__ void gradColloTranspose3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, + CeedScalar *__restrict__ r_V) { if (data.t_id_x < Q1d && data.t_id_y < Q1d) { for (CeedInt comp = 0; comp < NCOMP; ++comp) { // X derivative - data.slice[data.t_id_x + data.t_id_y*T_1D] = r_U[comp + 0*NCOMP]; + data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NCOMP]; __syncthreads(); for (CeedInt i = 0; i < Q1d; ++i) - r_V[q+comp*Q1d] += c_G[data.t_id_x + i*Q1d] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction (X derivative) + r_V[q + comp * Q1d] += c_G[data.t_id_x + i * Q1d] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction (X derivative) __syncthreads(); // Y derivative - data.slice[data.t_id_x + data.t_id_y*T_1D] = r_U[comp + 1*NCOMP]; + data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NCOMP]; __syncthreads(); for (CeedInt i = 0; i < Q1d; ++i) - r_V[q+comp*Q1d] += c_G[data.t_id_y + i*Q1d] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction (Y derivative) + r_V[q + comp * Q1d] += c_G[data.t_id_y + i * Q1d] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction (Y derivative) 
__syncthreads(); // Z derivative for (CeedInt i = 0; i < Q1d; ++i) - r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative) + r_V[i + comp * Q1d] += c_G[i + q * Q1d] * r_U[comp + 2 * NCOMP]; // PARTIAL contract z direction (Z derivative) } } } diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h index 1a21204720..af5ec9c8d3 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h +++ b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h @@ -14,33 +14,28 @@ //------------------------------------------------------------------------------ // Interp //------------------------------------------------------------------------------ -extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpose, - const CeedScalar *d_B, - const CeedScalar *__restrict__ d_U, +extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *d_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { const CeedInt t_id = threadIdx.x; const CeedScalar *U; - CeedScalar V; - //TODO load B in shared memory if blockDim.z > 1? + CeedScalar V; + // TODO load B in shared memory if blockDim.z > 1? 
- for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; - elem += gridDim.x*blockDim.z) { + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { - if (transpose) { // run with P threads - U = d_U + elem*BASIS_Q + comp*num_elem*BASIS_Q; + if (transpose) { // run with P threads + U = d_U + elem * BASIS_Q + comp * num_elem * BASIS_Q; V = 0.0; - for (CeedInt i = 0; i < BASIS_Q; i++) - V += d_B[t_id + i*BASIS_P]*U[i]; + for (CeedInt i = 0; i < BASIS_Q; i++) V += d_B[t_id + i * BASIS_P] * U[i]; - d_V[elem*BASIS_P + comp*num_elem*BASIS_P + t_id] = V; - } else { // run with Q threads - U = d_U + elem*BASIS_P + comp*num_elem*BASIS_P; + d_V[elem * BASIS_P + comp * num_elem * BASIS_P + t_id] = V; + } else { // run with Q threads + U = d_U + elem * BASIS_P + comp * num_elem * BASIS_P; V = 0.0; - for (CeedInt i = 0; i < BASIS_P; i++) - V += d_B[i + t_id*BASIS_P]*U[i]; + for (CeedInt i = 0; i < BASIS_P; i++) V += d_B[i + t_id * BASIS_P] * U[i]; - d_V[elem*BASIS_Q + comp*num_elem*BASIS_Q + t_id] = V; + d_V[elem * BASIS_Q + comp * num_elem * BASIS_Q + t_id] = V; } } } @@ -49,41 +44,34 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpos //------------------------------------------------------------------------------ // Grad //------------------------------------------------------------------------------ -extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, - const CeedScalar *d_G, - const CeedScalar *__restrict__ d_U, +extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *d_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { const CeedInt t_id = threadIdx.x; const CeedScalar *U; - //TODO load G in shared memory if blockDim.z > 1? + // TODO load G in shared memory if blockDim.z > 1? 
- for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; - elem += gridDim.x*blockDim.z) { + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { - if (transpose) { // run with P threads + if (transpose) { // run with P threads CeedScalar V = 0.0; for (CeedInt dim = 0; dim < BASIS_DIM; dim++) { - U = d_U + elem*BASIS_Q + comp*num_elem*BASIS_Q + - dim*BASIS_NUM_COMP*num_elem*BASIS_Q; - for (CeedInt i = 0; i < BASIS_Q; i++) - V += d_G[t_id + i*BASIS_P + dim*BASIS_P*BASIS_Q]*U[i]; + U = d_U + elem * BASIS_Q + comp * num_elem * BASIS_Q + dim * BASIS_NUM_COMP * num_elem * BASIS_Q; + for (CeedInt i = 0; i < BASIS_Q; i++) V += d_G[t_id + i * BASIS_P + dim * BASIS_P * BASIS_Q] * U[i]; } - d_V[elem*BASIS_P + comp*num_elem*BASIS_P + t_id] = V; - } else { // run with Q threads + d_V[elem * BASIS_P + comp * num_elem * BASIS_P + t_id] = V; + } else { // run with Q threads CeedScalar V[BASIS_DIM]; - U = d_U + elem*BASIS_P + comp*num_elem*BASIS_P; - for (CeedInt dim = 0; dim < BASIS_DIM; dim++) - V[dim] = 0.0; + U = d_U + elem * BASIS_P + comp * num_elem * BASIS_P; + for (CeedInt dim = 0; dim < BASIS_DIM; dim++) V[dim] = 0.0; for (CeedInt i = 0; i < BASIS_P; i++) { const CeedScalar val = U[i]; - for(CeedInt dim = 0; dim < BASIS_DIM; dim++) - V[dim] += d_G[i + t_id*BASIS_P + dim*BASIS_P*BASIS_Q]*val; + for (CeedInt dim = 0; dim < BASIS_DIM; dim++) V[dim] += d_G[i + t_id * BASIS_P + dim * BASIS_P * BASIS_Q] * val; } for (CeedInt dim = 0; dim < BASIS_DIM; dim++) { - d_V[elem*BASIS_Q + comp*num_elem*BASIS_Q + dim*BASIS_NUM_COMP*num_elem*BASIS_Q + t_id] = V[dim]; + d_V[elem * BASIS_Q + comp * num_elem * BASIS_Q + dim * BASIS_NUM_COMP * num_elem * BASIS_Q + t_id] = V[dim]; } } } @@ -93,14 +81,11 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, 
//------------------------------------------------------------------------------ // Weight //------------------------------------------------------------------------------ -extern "C" __global__ void Weight(const CeedInt num_elem, - const CeedScalar *__restrict__ q_weight, - CeedScalar *__restrict__ d_V) { +extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight, CeedScalar *__restrict__ d_V) { const CeedInt t_id = threadIdx.x; - //TODO load q_weight in shared memory if blockDim.z > 1? - for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; - elem += gridDim.x*blockDim.z) { - d_V[elem*BASIS_Q + t_id] = q_weight[t_id]; + // TODO load q_weight in shared memory if blockDim.z > 1? + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + d_V[elem * BASIS_Q + t_id] = q_weight[t_id]; } } diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h index 6898ef8987..18f6bcffad 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h +++ b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h @@ -14,46 +14,44 @@ //------------------------------------------------------------------------------ // Interp //------------------------------------------------------------------------------ -extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpose, - const CeedScalar *__restrict__ interp_1d, - const CeedScalar *__restrict__ u, - CeedScalar *__restrict__ v) { +extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *__restrict__ interp_1d, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { const CeedInt i = threadIdx.x; __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN]; - CeedScalar *s_interp_1d = s_mem; - CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D; - CeedScalar *s_buffer_2 = 
s_buffer_1 + BASIS_BUF_LEN; + CeedScalar *s_interp_1d = s_mem; + CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D; + CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN; for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) { s_interp_1d[k] = interp_1d[k]; } - const CeedInt P = transpose ? BASIS_Q_1D : BASIS_P_1D; - const CeedInt Q = transpose ? BASIS_P_1D : BASIS_Q_1D; - const CeedInt stride_0 = transpose ? 1 : BASIS_P_1D; - const CeedInt stride_1 = transpose ? BASIS_P_1D : 1; - const CeedInt u_stride = transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES; - const CeedInt v_stride = transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS; + const CeedInt P = transpose ? BASIS_Q_1D : BASIS_P_1D; + const CeedInt Q = transpose ? BASIS_P_1D : BASIS_Q_1D; + const CeedInt stride_0 = transpose ? 1 : BASIS_P_1D; + const CeedInt stride_1 = transpose ? BASIS_P_1D : 1; + const CeedInt u_stride = transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES; + const CeedInt v_stride = transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS; const CeedInt u_comp_stride = num_elem * (transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES); const CeedInt v_comp_stride = num_elem * (transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS); - const CeedInt u_size = transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES; + const CeedInt u_size = transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES; // Apply basis element by element for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride; - CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride; + CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride; for (CeedInt k = i; k < u_size; k += blockDim.x) { s_buffer_1[k] = cur_u[k]; } - CeedInt pre = u_size; + CeedInt pre = u_size; CeedInt post = 1; for (CeedInt d = 0; d < BASIS_DIM; d++) { __syncthreads(); // Update bufferfers used pre /= P; - const CeedScalar *in = d % 2 ? 
s_buffer_2 : s_buffer_1; - CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2); + const CeedScalar *in = d % 2 ? s_buffer_2 : s_buffer_1; + CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2); // Contract along middle index const CeedInt writeLen = pre * post * Q; @@ -63,8 +61,7 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpos const CeedInt a = k / (post * Q); CeedScalar vk = 0; - for (CeedInt b = 0; b < P; b++) - vk += s_interp_1d[j*stride_0 + b*stride_1] * in[(a*P + b)*post + c]; + for (CeedInt b = 0; b < P; b++) vk += s_interp_1d[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c]; out[k] = vk; } @@ -78,72 +75,59 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpos //------------------------------------------------------------------------------ // Grad //------------------------------------------------------------------------------ -extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, - const CeedScalar *__restrict__ interp_1d, - const CeedScalar *__restrict__ grad_1d, - const CeedScalar *__restrict__ u, - CeedScalar *__restrict__ v) { +extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *__restrict__ interp_1d, + const CeedScalar *__restrict__ grad_1d, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { const CeedInt i = threadIdx.x; __shared__ CeedScalar s_mem[2 * (BASIS_Q_1D * BASIS_P_1D + BASIS_BUF_LEN)]; - CeedScalar *s_interp_1d = s_mem; - CeedScalar *s_grad_1d = s_interp_1d + BASIS_Q_1D * BASIS_P_1D; - CeedScalar *s_buffer_1 = s_grad_1d + BASIS_Q_1D * BASIS_P_1D; - CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN; + CeedScalar *s_interp_1d = s_mem; + CeedScalar *s_grad_1d = s_interp_1d + BASIS_Q_1D * BASIS_P_1D; + CeedScalar *s_buffer_1 = s_grad_1d + BASIS_Q_1D * BASIS_P_1D; + CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN; for 
(CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) { s_interp_1d[k] = interp_1d[k]; - s_grad_1d[k] = grad_1d[k]; + s_grad_1d[k] = grad_1d[k]; } - const CeedInt P = transpose ? BASIS_Q_1D : BASIS_P_1D; - const CeedInt Q = transpose ? BASIS_P_1D : BASIS_Q_1D; - const CeedInt stride_0 = transpose ? 1 : BASIS_P_1D; - const CeedInt stride_1 = transpose ? BASIS_P_1D : 1; - const CeedInt u_stride = transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES; - const CeedInt v_stride = transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS; + const CeedInt P = transpose ? BASIS_Q_1D : BASIS_P_1D; + const CeedInt Q = transpose ? BASIS_P_1D : BASIS_Q_1D; + const CeedInt stride_0 = transpose ? 1 : BASIS_P_1D; + const CeedInt stride_1 = transpose ? BASIS_P_1D : 1; + const CeedInt u_stride = transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES; + const CeedInt v_stride = transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS; const CeedInt u_comp_stride = num_elem * (transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES); const CeedInt v_comp_stride = num_elem * (transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS); - const CeedInt u_dim_stride = transpose ? num_elem * BASIS_NUM_QPTS * BASIS_NUM_COMP : 0; - const CeedInt v_dim_stride = transpose ? 0 : num_elem * BASIS_NUM_QPTS * BASIS_NUM_COMP; + const CeedInt u_dim_stride = transpose ? num_elem * BASIS_NUM_QPTS * BASIS_NUM_COMP : 0; + const CeedInt v_dim_stride = transpose ? 0 : num_elem * BASIS_NUM_QPTS * BASIS_NUM_COMP; // Apply basis element by element for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { - // dim*dim contractions for grad for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { - CeedInt pre = transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES; - CeedInt post = 1; - const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + - comp * u_comp_stride; - CeedScalar *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * - v_comp_stride; + CeedInt pre = transpose ? 
BASIS_NUM_QPTS : BASIS_NUM_NODES; + CeedInt post = 1; + const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride; + CeedScalar *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride; for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) { __syncthreads(); // Update bufferfers used pre /= P; - const CeedScalar *op = dim_1 == dim_2 ? s_grad_1d : s_interp_1d; - const CeedScalar *in = dim_2 == 0 - ? cur_u - : (dim_2 % 2 ? s_buffer_2 : s_buffer_1); - CeedScalar *out = dim_2 == BASIS_DIM - 1 - ? cur_v - : (dim_2 % 2 ? s_buffer_1 : s_buffer_2); + const CeedScalar *op = dim_1 == dim_2 ? s_grad_1d : s_interp_1d; + const CeedScalar *in = dim_2 == 0 ? cur_u : (dim_2 % 2 ? s_buffer_2 : s_buffer_1); + CeedScalar *out = dim_2 == BASIS_DIM - 1 ? cur_v : (dim_2 % 2 ? s_buffer_1 : s_buffer_2); // Contract along middle index const CeedInt writeLen = pre * post * Q; for (CeedInt k = i; k < writeLen; k += blockDim.x) { - const CeedInt c = k % post; - const CeedInt j = (k / post) % Q; - const CeedInt a = k / (post * Q); - CeedScalar v_k = 0; - for (CeedInt b = 0; b < P; b++) - v_k += op[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c]; - - if (transpose && dim_2 == BASIS_DIM - 1) - out[k] += v_k; - else - out[k] = v_k; + const CeedInt c = k % post; + const CeedInt j = (k / post) % Q; + const CeedInt a = k / (post * Q); + CeedScalar v_k = 0; + for (CeedInt b = 0; b < P; b++) v_k += op[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c]; + + if (transpose && dim_2 == BASIS_DIM - 1) out[k] += v_k; + else out[k] = v_k; } post *= Q; @@ -156,29 +140,25 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, //------------------------------------------------------------------------------ // 1D quadrature weights //------------------------------------------------------------------------------ -__device__ void Weight1d(const CeedInt num_elem, const CeedScalar *q_weight_1d, - CeedScalar *w) { 
+__device__ void Weight1d(const CeedInt num_elem, const CeedScalar *q_weight_1d, CeedScalar *w) { const CeedInt i = threadIdx.x; if (i < BASIS_Q_1D) { const size_t elem = blockIdx.x; - if (elem < num_elem) - w[elem*BASIS_Q_1D + i] = q_weight_1d[i]; + if (elem < num_elem) w[elem * BASIS_Q_1D + i] = q_weight_1d[i]; } } //------------------------------------------------------------------------------ // 2D quadrature weights //------------------------------------------------------------------------------ -__device__ void Weight2d(const CeedInt num_elem, const CeedScalar *q_weight_1d, - CeedScalar *w) { - +__device__ void Weight2d(const CeedInt num_elem, const CeedScalar *q_weight_1d, CeedScalar *w) { const CeedInt i = threadIdx.x; const CeedInt j = threadIdx.y; if (i < BASIS_Q_1D && j < BASIS_Q_1D) { const size_t elem = blockIdx.x; if (elem < num_elem) { const size_t ind = (elem * BASIS_Q_1D + j) * BASIS_Q_1D + i; - w[ind] = q_weight_1d[i] * q_weight_1d[j]; + w[ind] = q_weight_1d[i] * q_weight_1d[j]; } } } @@ -186,8 +166,7 @@ __device__ void Weight2d(const CeedInt num_elem, const CeedScalar *q_weight_1d, //------------------------------------------------------------------------------ // 3D quadrature weights //------------------------------------------------------------------------------ -__device__ void Weight3d(const CeedInt num_elem, const CeedScalar *q_weight_1d, - CeedScalar *w) { +__device__ void Weight3d(const CeedInt num_elem, const CeedScalar *q_weight_1d, CeedScalar *w) { const CeedInt i = threadIdx.x; const CeedInt j = threadIdx.y; if (i < BASIS_Q_1D && j < BASIS_Q_1D) { @@ -195,7 +174,7 @@ __device__ void Weight3d(const CeedInt num_elem, const CeedScalar *q_weight_1d, if (elem < num_elem) { for (CeedInt k = 0; k < BASIS_Q_1D; k++) { const size_t ind = ((elem * BASIS_Q_1D + k) * BASIS_Q_1D + j) * BASIS_Q_1D + i; - w[ind] = q_weight_1d[i] * q_weight_1d[j] * q_weight_1d[k]; + w[ind] = q_weight_1d[i] * q_weight_1d[j] * q_weight_1d[k]; } } } @@ -204,15 +183,10 
@@ __device__ void Weight3d(const CeedInt num_elem, const CeedScalar *q_weight_1d, //------------------------------------------------------------------------------ // Quadrature weights //------------------------------------------------------------------------------ -extern "C" __global__ void Weight(const CeedInt num_elem, - const CeedScalar *__restrict__ q_weight_1d, - CeedScalar *__restrict__ v) { - if (BASIS_DIM == 1) - Weight1d(num_elem, q_weight_1d, v); - else if (BASIS_DIM == 2) - Weight2d(num_elem, q_weight_1d, v); - else if (BASIS_DIM == 3) - Weight3d(num_elem, q_weight_1d, v); +extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *__restrict__ v) { + if (BASIS_DIM == 1) Weight1d(num_elem, q_weight_1d, v); + else if (BASIS_DIM == 2) Weight2d(num_elem, q_weight_1d, v); + else if (BASIS_DIM == 3) Weight3d(num_elem, q_weight_1d, v); } //------------------------------------------------------------------------------ diff --git a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h index b7c69b0e33..f62bacd5e9 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h +++ b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h @@ -14,15 +14,15 @@ typedef enum { /// Perform no evaluation (either because there is no data or it is already at /// quadrature points) - CEED_EVAL_NONE = 0, + CEED_EVAL_NONE = 0, /// Interpolate from nodes to quadrature points CEED_EVAL_INTERP = 1, /// Evaluate gradients at quadrature points from input in a nodal basis - CEED_EVAL_GRAD = 2, + CEED_EVAL_GRAD = 2, /// Evaluate divergence at quadrature points from input in a nodal basis - CEED_EVAL_DIV = 4, + CEED_EVAL_DIV = 4, /// Evaluate curl at quadrature points from input in a nodal basis - CEED_EVAL_CURL = 8, + CEED_EVAL_CURL = 8, /// Using no input, evaluate quadrature weights on the reference element 
CEED_EVAL_WEIGHT = 16, } CeedEvalMode; @@ -30,58 +30,48 @@ typedef enum { //------------------------------------------------------------------------------ // Get Basis Emode Pointer //------------------------------------------------------------------------------ -extern "C" __device__ void CeedOperatorGetBasisPointer_Cuda(const CeedScalar **basisptr, - CeedEvalMode emode, const CeedScalar *identity, const CeedScalar *interp, - const CeedScalar *grad) { +extern "C" __device__ void CeedOperatorGetBasisPointer_Cuda(const CeedScalar **basisptr, CeedEvalMode emode, const CeedScalar *identity, + const CeedScalar *interp, const CeedScalar *grad) { switch (emode) { - case CEED_EVAL_NONE: - *basisptr = identity; - break; - case CEED_EVAL_INTERP: - *basisptr = interp; - break; - case CEED_EVAL_GRAD: - *basisptr = grad; - break; - case CEED_EVAL_WEIGHT: - case CEED_EVAL_DIV: - case CEED_EVAL_CURL: - break; // Caught by QF Assembly + case CEED_EVAL_NONE: + *basisptr = identity; + break; + case CEED_EVAL_INTERP: + *basisptr = interp; + break; + case CEED_EVAL_GRAD: + *basisptr = grad; + break; + case CEED_EVAL_WEIGHT: + case CEED_EVAL_DIV: + case CEED_EVAL_CURL: + break; // Caught by QF Assembly } } //------------------------------------------------------------------------------ // Core code for diagonal assembly //------------------------------------------------------------------------------ -__device__ void diagonalCore(const CeedInt nelem, - const bool pointBlock, const CeedScalar *identity, - const CeedScalar *interpin, const CeedScalar *gradin, - const CeedScalar *interpout, const CeedScalar *gradout, - const CeedEvalMode *emodein, const CeedEvalMode *emodeout, - const CeedScalar *__restrict__ assembledqfarray, - CeedScalar *__restrict__ elemdiagarray) { - const int tid = threadIdx.x; // running with P threads, tid is evec node +__device__ void diagonalCore(const CeedInt nelem, const bool pointBlock, const CeedScalar *identity, const CeedScalar *interpin, + const 
CeedScalar *gradin, const CeedScalar *interpout, const CeedScalar *gradout, const CeedEvalMode *emodein, + const CeedEvalMode *emodeout, const CeedScalar *__restrict__ assembledqfarray, CeedScalar *__restrict__ elemdiagarray) { + const int tid = threadIdx.x; // running with P threads, tid is evec node if (tid >= NNODES) return; // Compute the diagonal of B^T D B // Each element - for (CeedInt e = blockIdx.x*blockDim.z + threadIdx.z; e < nelem; - e += gridDim.x*blockDim.z) { + for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < nelem; e += gridDim.x * blockDim.z) { CeedInt dout = -1; // Each basis eval mode pair for (CeedInt eout = 0; eout < NUMEMODEOUT; eout++) { const CeedScalar *bt = NULL; - if (emodeout[eout] == CEED_EVAL_GRAD) - dout += 1; - CeedOperatorGetBasisPointer_Cuda(&bt, emodeout[eout], identity, interpout, - &gradout[dout*NQPTS*NNODES]); + if (emodeout[eout] == CEED_EVAL_GRAD) dout += 1; + CeedOperatorGetBasisPointer_Cuda(&bt, emodeout[eout], identity, interpout, &gradout[dout * NQPTS * NNODES]); CeedInt din = -1; for (CeedInt ein = 0; ein < NUMEMODEIN; ein++) { const CeedScalar *b = NULL; - if (emodein[ein] == CEED_EVAL_GRAD) - din += 1; - CeedOperatorGetBasisPointer_Cuda(&b, emodein[ein], identity, interpin, - &gradin[din*NQPTS*NNODES]); + if (emodein[ein] == CEED_EVAL_GRAD) din += 1; + CeedOperatorGetBasisPointer_Cuda(&b, emodein[ein], identity, interpin, &gradin[din * NQPTS * NNODES]); // Each component for (CeedInt compOut = 0; compOut < NCOMP; compOut++) { // Each qpoint/node pair @@ -91,22 +81,20 @@ __device__ void diagonalCore(const CeedInt nelem, CeedScalar evalue = 0.; for (CeedInt q = 0; q < NQPTS; q++) { const CeedScalar qfvalue = - assembledqfarray[((((ein*NCOMP+compIn)*NUMEMODEOUT+eout)* - NCOMP+compOut)*nelem+e)*NQPTS+q]; - evalue += bt[q*NNODES+tid] * qfvalue * b[q*NNODES+tid]; + assembledqfarray[((((ein * NCOMP + compIn) * NUMEMODEOUT + eout) * NCOMP + compOut) * nelem + e) * NQPTS + q]; + evalue += bt[q * NNODES + tid] * 
qfvalue * b[q * NNODES + tid]; } - elemdiagarray[((compOut*NCOMP+compIn)*nelem+e)*NNODES+tid] += evalue; + elemdiagarray[((compOut * NCOMP + compIn) * nelem + e) * NNODES + tid] += evalue; } } else { // Diagonal Only CeedScalar evalue = 0.; for (CeedInt q = 0; q < NQPTS; q++) { const CeedScalar qfvalue = - assembledqfarray[((((ein*NCOMP+compOut)*NUMEMODEOUT+eout)* - NCOMP+compOut)*nelem+e)*NQPTS+q]; - evalue += bt[q*NNODES+tid] * qfvalue * b[q*NNODES+tid]; + assembledqfarray[((((ein * NCOMP + compOut) * NUMEMODEOUT + eout) * NCOMP + compOut) * nelem + e) * NQPTS + q]; + evalue += bt[q * NNODES + tid] * qfvalue * b[q * NNODES + tid]; } - elemdiagarray[(compOut*nelem+e)*NNODES+tid] += evalue; + elemdiagarray[(compOut * nelem + e) * NNODES + tid] += evalue; } } } @@ -117,29 +105,21 @@ __device__ void diagonalCore(const CeedInt nelem, //------------------------------------------------------------------------------ // Linear diagonal //------------------------------------------------------------------------------ -extern "C" __global__ void linearDiagonal(const CeedInt nelem, - const CeedScalar *identity, - const CeedScalar *interpin, const CeedScalar *gradin, - const CeedScalar *interpout, const CeedScalar *gradout, - const CeedEvalMode *emodein, const CeedEvalMode *emodeout, - const CeedScalar *__restrict__ assembledqfarray, - CeedScalar *__restrict__ elemdiagarray) { - diagonalCore(nelem, false, identity, interpin, gradin, interpout, - gradout, emodein, emodeout, assembledqfarray, elemdiagarray); +extern "C" __global__ void linearDiagonal(const CeedInt nelem, const CeedScalar *identity, const CeedScalar *interpin, const CeedScalar *gradin, + const CeedScalar *interpout, const CeedScalar *gradout, const CeedEvalMode *emodein, + const CeedEvalMode *emodeout, const CeedScalar *__restrict__ assembledqfarray, + CeedScalar *__restrict__ elemdiagarray) { + diagonalCore(nelem, false, identity, interpin, gradin, interpout, gradout, emodein, emodeout, assembledqfarray, 
elemdiagarray); } //------------------------------------------------------------------------------ // Linear point block diagonal //------------------------------------------------------------------------------ -extern "C" __global__ void linearPointBlockDiagonal(const CeedInt nelem, - const CeedScalar *identity, - const CeedScalar *interpin, const CeedScalar *gradin, - const CeedScalar *interpout, const CeedScalar *gradout, - const CeedEvalMode *emodein, const CeedEvalMode *emodeout, - const CeedScalar *__restrict__ assembledqfarray, - CeedScalar *__restrict__ elemdiagarray) { - diagonalCore(nelem, true, identity, interpin, gradin, interpout, - gradout, emodein, emodeout, assembledqfarray, elemdiagarray); +extern "C" __global__ void linearPointBlockDiagonal(const CeedInt nelem, const CeedScalar *identity, const CeedScalar *interpin, + const CeedScalar *gradin, const CeedScalar *interpout, const CeedScalar *gradout, + const CeedEvalMode *emodein, const CeedEvalMode *emodeout, + const CeedScalar *__restrict__ assembledqfarray, CeedScalar *__restrict__ elemdiagarray) { + diagonalCore(nelem, true, identity, interpin, gradin, interpout, gradout, emodein, emodeout, assembledqfarray, elemdiagarray); } //------------------------------------------------------------------------------ diff --git a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h index 6f975c924b..c9c25d522c 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h +++ b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h @@ -10,106 +10,100 @@ //------------------------------------------------------------------------------ // Matrix assembly kernel for low-order elements (2D thread block) //------------------------------------------------------------------------------ -extern "C" __launch_bounds__(BLOCK_SIZE) - __global__ void linearAssemble(const CeedScalar *B_in, const CeedScalar *B_out, - const CeedScalar *__restrict__ 
qf_array, - CeedScalar *__restrict__ values_array) { - - // This kernel assumes B_in and B_out have the same number of quadrature points and - // basis points. +extern "C" __launch_bounds__(BLOCK_SIZE) __global__ + void linearAssemble(const CeedScalar *B_in, const CeedScalar *B_out, const CeedScalar *__restrict__ qf_array, + CeedScalar *__restrict__ values_array) { + // This kernel assumes B_in and B_out have the same number of quadrature points and + // basis points. // TODO: expand to more general cases - const int i = threadIdx.x; // The output row index of each B^TDB operation - const int l = threadIdx.y; // The output column index of each B^TDB operation - // such that we have (Bout^T)_ij D_jk Bin_kl = C_il + const int i = threadIdx.x; // The output row index of each B^TDB operation + const int l = threadIdx.y; // The output column index of each B^TDB operation + // such that we have (Bout^T)_ij D_jk Bin_kl = C_il // Strides for final output ordering, determined by the reference (interface) implementation of - // the symbolic assembly, slowest --> fastest: element, comp_in, comp_out, node_row, node_col + // the symbolic assembly, slowest --> fastest: element, comp_in, comp_out, node_row, node_col const CeedInt comp_out_stride = NNODES * NNODES; - const CeedInt comp_in_stride = comp_out_stride * NCOMP; - const CeedInt e_stride = comp_in_stride * NCOMP; - // Strides for QF array, slowest --> fastest: emode_in, comp_in, emode_out, comp_out, elem, qpt - const CeedInt qe_stride = NQPTS; - const CeedInt qcomp_out_stride = NELEM * qe_stride; + const CeedInt comp_in_stride = comp_out_stride * NCOMP; + const CeedInt e_stride = comp_in_stride * NCOMP; + // Strides for QF array, slowest --> fastest: emode_in, comp_in, emode_out, comp_out, elem, qpt + const CeedInt qe_stride = NQPTS; + const CeedInt qcomp_out_stride = NELEM * qe_stride; const CeedInt qemode_out_stride = qcomp_out_stride * NCOMP; - const CeedInt qcomp_in_stride = qemode_out_stride * NUMEMODEOUT; - const 
CeedInt qemode_in_stride = qcomp_in_stride * NCOMP; + const CeedInt qcomp_in_stride = qemode_out_stride * NUMEMODEOUT; + const CeedInt qemode_in_stride = qcomp_in_stride * NCOMP; // Loop over each element (if necessary) - for (CeedInt e = blockIdx.x*blockDim.z + threadIdx.z; e < NELEM; - e += gridDim.x*blockDim.z) { + for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < NELEM; e += gridDim.x * blockDim.z) { for (CeedInt comp_in = 0; comp_in < NCOMP; comp_in++) { for (CeedInt comp_out = 0; comp_out < NCOMP; comp_out++) { - CeedScalar result = 0.0; - CeedInt qf_index_comp = qcomp_in_stride * comp_in + qcomp_out_stride * comp_out + qe_stride * e; + CeedScalar result = 0.0; + CeedInt qf_index_comp = qcomp_in_stride * comp_in + qcomp_out_stride * comp_out + qe_stride * e; for (CeedInt emode_in = 0; emode_in < NUMEMODEIN; emode_in++) { CeedInt b_in_index = emode_in * NQPTS * NNODES; - for (CeedInt emode_out = 0; emode_out < NUMEMODEOUT; emode_out++) { + for (CeedInt emode_out = 0; emode_out < NUMEMODEOUT; emode_out++) { CeedInt b_out_index = emode_out * NQPTS * NNODES; - CeedInt qf_index = qf_index_comp + qemode_out_stride * emode_out + qemode_in_stride * emode_in; - // Perform the B^T D B operation for this 'chunk' of D (the qf_array) + CeedInt qf_index = qf_index_comp + qemode_out_stride * emode_out + qemode_in_stride * emode_in; + // Perform the B^T D B operation for this 'chunk' of D (the qf_array) for (CeedInt j = 0; j < NQPTS; j++) { - result += B_out[b_out_index + j * NNODES + i] * qf_array[qf_index + j] * B_in[b_in_index + j * NNODES + l]; - } - } // end of emode_out - } // end of emode_in - CeedInt val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + NNODES * i + l; - values_array[val_index] = result; - } // end of out component - } // end of in component - } // end of element loop + result += B_out[b_out_index + j * NNODES + i] * qf_array[qf_index + j] * B_in[b_in_index + j * NNODES + l]; + } + } // end of emode_out + } // 
end of emode_in + CeedInt val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + NNODES * i + l; + values_array[val_index] = result; + } // end of out component + } // end of in component + } // end of element loop } //------------------------------------------------------------------------------ // Fallback kernel for larger orders (1D thread block) //------------------------------------------------------------------------------ -extern "C" __launch_bounds__(BLOCK_SIZE) - __global__ void linearAssembleFallback(const CeedScalar *B_in, const CeedScalar *B_out, - const CeedScalar *__restrict__ qf_array, - CeedScalar *__restrict__ values_array) { - - // This kernel assumes B_in and B_out have the same number of quadrature points and - // basis points. +extern "C" __launch_bounds__(BLOCK_SIZE) __global__ + void linearAssembleFallback(const CeedScalar *B_in, const CeedScalar *B_out, const CeedScalar *__restrict__ qf_array, + CeedScalar *__restrict__ values_array) { + // This kernel assumes B_in and B_out have the same number of quadrature points and + // basis points. 
// TODO: expand to more general cases - const int l = threadIdx.x; // The output column index of each B^TDB operation - // such that we have (Bout^T)_ij D_jk Bin_kl = C_il + const int l = threadIdx.x; // The output column index of each B^TDB operation + // such that we have (Bout^T)_ij D_jk Bin_kl = C_il // Strides for final output ordering, determined by the reference (interface) implementation of - // the symbolic assembly, slowest --> fastest: element, comp_in, comp_out, node_row, node_col + // the symbolic assembly, slowest --> fastest: element, comp_in, comp_out, node_row, node_col const CeedInt comp_out_stride = NNODES * NNODES; - const CeedInt comp_in_stride = comp_out_stride * NCOMP; - const CeedInt e_stride = comp_in_stride * NCOMP; - // Strides for QF array, slowest --> fastest: emode_in, comp_in, emode_out, comp_out, elem, qpt - const CeedInt qe_stride = NQPTS; - const CeedInt qcomp_out_stride = NELEM * qe_stride; + const CeedInt comp_in_stride = comp_out_stride * NCOMP; + const CeedInt e_stride = comp_in_stride * NCOMP; + // Strides for QF array, slowest --> fastest: emode_in, comp_in, emode_out, comp_out, elem, qpt + const CeedInt qe_stride = NQPTS; + const CeedInt qcomp_out_stride = NELEM * qe_stride; const CeedInt qemode_out_stride = qcomp_out_stride * NCOMP; - const CeedInt qcomp_in_stride = qemode_out_stride * NUMEMODEOUT; - const CeedInt qemode_in_stride = qcomp_in_stride * NCOMP; + const CeedInt qcomp_in_stride = qemode_out_stride * NUMEMODEOUT; + const CeedInt qemode_in_stride = qcomp_in_stride * NCOMP; - // Loop over each element (if necessary) - for (CeedInt e = blockIdx.x*blockDim.z + threadIdx.z; e < NELEM; - e += gridDim.x*blockDim.z) { + // Loop over each element (if necessary) + for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < NELEM; e += gridDim.x * blockDim.z) { for (CeedInt comp_in = 0; comp_in < NCOMP; comp_in++) { for (CeedInt comp_out = 0; comp_out < NCOMP; comp_out++) { for (CeedInt i = 0; i < NNODES; i++) { - CeedScalar 
result = 0.0; - CeedInt qf_index_comp = qcomp_in_stride * comp_in + qcomp_out_stride * comp_out + qe_stride * e; + CeedScalar result = 0.0; + CeedInt qf_index_comp = qcomp_in_stride * comp_in + qcomp_out_stride * comp_out + qe_stride * e; for (CeedInt emode_in = 0; emode_in < NUMEMODEIN; emode_in++) { CeedInt b_in_index = emode_in * NQPTS * NNODES; - for (CeedInt emode_out = 0; emode_out < NUMEMODEOUT; emode_out++) { + for (CeedInt emode_out = 0; emode_out < NUMEMODEOUT; emode_out++) { CeedInt b_out_index = emode_out * NQPTS * NNODES; - CeedInt qf_index = qf_index_comp + qemode_out_stride * emode_out + qemode_in_stride * emode_in; - // Perform the B^T D B operation for this 'chunk' of D (the qf_array) + CeedInt qf_index = qf_index_comp + qemode_out_stride * emode_out + qemode_in_stride * emode_in; + // Perform the B^T D B operation for this 'chunk' of D (the qf_array) for (CeedInt j = 0; j < NQPTS; j++) { - result += B_out[b_out_index + j * NNODES + i] * qf_array[qf_index + j] * B_in[b_in_index + j * NNODES + l]; - } - } // end of emode_out - } // end of emode_in - CeedInt val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + NNODES * i + l; - values_array[val_index] = result; - } // end of loop over element node index, i - } // end of out component - } // end of in component - } // end of element loop + result += B_out[b_out_index + j * NNODES + i] * qf_array[qf_index + j] * B_in[b_in_index + j * NNODES + l]; + } + } // end of emode_out + } // end of emode_in + CeedInt val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + NNODES * i + l; + values_array[val_index] = result; + } // end of loop over element node index, i + } // end of out component + } // end of in component + } // end of element loop } //------------------------------------------------------------------------------ diff --git a/include/ceed/jit-source/cuda/cuda-ref-qfunction.h b/include/ceed/jit-source/cuda/cuda-ref-qfunction.h index 
6ad3676282..ee6aa726a3 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-qfunction.h +++ b/include/ceed/jit-source/cuda/cuda-ref-qfunction.h @@ -11,9 +11,8 @@ template //------------------------------------------------------------------------------ // Read from quadrature points //------------------------------------------------------------------------------ -inline __device__ void readQuads(const CeedInt quad, const CeedInt num_qpts, - const CeedScalar* d_u, CeedScalar* r_u) { - for(CeedInt comp = 0; comp < SIZE; comp++) { +inline __device__ void readQuads(const CeedInt quad, const CeedInt num_qpts, const CeedScalar* d_u, CeedScalar* r_u) { + for (CeedInt comp = 0; comp < SIZE; comp++) { r_u[comp] = d_u[quad + num_qpts * comp]; } } @@ -22,9 +21,8 @@ inline __device__ void readQuads(const CeedInt quad, const CeedInt num_qpts, // Write at quadrature points //------------------------------------------------------------------------------ template -inline __device__ void writeQuads(const CeedInt quad, const CeedInt num_qpts, - const CeedScalar* r_v, CeedScalar* d_v) { - for(CeedInt comp = 0; comp < SIZE; comp++) { +inline __device__ void writeQuads(const CeedInt quad, const CeedInt num_qpts, const CeedScalar* r_v, CeedScalar* d_v) { + for (CeedInt comp = 0; comp < SIZE; comp++) { d_v[quad + num_qpts * comp] = r_v[comp]; } } diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction.h b/include/ceed/jit-source/cuda/cuda-ref-restriction.h index 0f465d8f05..88526f6595 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-restriction.h +++ b/include/ceed/jit-source/cuda/cuda-ref-restriction.h @@ -10,93 +10,70 @@ //------------------------------------------------------------------------------ // L-vector -> E-vector, strided //------------------------------------------------------------------------------ -extern "C" __global__ void StridedNoTranspose(const CeedInt num_elem, - const CeedScalar *__restrict__ u, - CeedScalar *__restrict__ v) { - for (CeedInt node = 
blockIdx.x * blockDim.x + threadIdx.x; - node < num_elem*RESTR_ELEM_SIZE; - node += blockDim.x * gridDim.x) { +extern "C" __global__ void StridedNoTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RESTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { const CeedInt loc_node = node % RESTR_ELEM_SIZE; - const CeedInt elem = node / RESTR_ELEM_SIZE; + const CeedInt elem = node / RESTR_ELEM_SIZE; for (CeedInt comp = 0; comp < RESTR_NUM_COMP; comp++) - v[loc_node + comp*RESTR_ELEM_SIZE*RESTR_NUM_ELEM + - elem*RESTR_ELEM_SIZE] = - u[loc_node*RESTR_STRIDE_NODES + comp*RESTR_STRIDE_COMP + elem*RESTR_STRIDE_ELEM]; + v[loc_node + comp * RESTR_ELEM_SIZE * RESTR_NUM_ELEM + elem * RESTR_ELEM_SIZE] = + u[loc_node * RESTR_STRIDE_NODES + comp * RESTR_STRIDE_COMP + elem * RESTR_STRIDE_ELEM]; } } //------------------------------------------------------------------------------ // L-vector -> E-vector, offsets provided //------------------------------------------------------------------------------ -extern "C" __global__ void OffsetNoTranspose(const CeedInt num_elem, - const CeedInt *__restrict__ indices, - const CeedScalar *__restrict__ u, +extern "C" __global__ void OffsetNoTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; - node < num_elem*RESTR_ELEM_SIZE; - node += blockDim.x * gridDim.x) { - const CeedInt ind = indices[node]; + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RESTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt ind = indices[node]; const CeedInt loc_node = node % RESTR_ELEM_SIZE; - const CeedInt elem = node / RESTR_ELEM_SIZE; + const CeedInt elem = node / RESTR_ELEM_SIZE; for (CeedInt comp = 0; comp < RESTR_NUM_COMP; comp++) - v[loc_node + 
comp*RESTR_ELEM_SIZE*RESTR_NUM_ELEM + - elem*RESTR_ELEM_SIZE] = - u[ind + comp*RESTR_COMP_STRIDE]; + v[loc_node + comp * RESTR_ELEM_SIZE * RESTR_NUM_ELEM + elem * RESTR_ELEM_SIZE] = u[ind + comp * RESTR_COMP_STRIDE]; } } //------------------------------------------------------------------------------ // E-vector -> L-vector, strided //------------------------------------------------------------------------------ -extern "C" __global__ void StridedTranspose(const CeedInt num_elem, - const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; - node < num_elem*RESTR_ELEM_SIZE; - node += blockDim.x * gridDim.x) { +extern "C" __global__ void StridedTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RESTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { const CeedInt loc_node = node % RESTR_ELEM_SIZE; - const CeedInt elem = node / RESTR_ELEM_SIZE; + const CeedInt elem = node / RESTR_ELEM_SIZE; for (CeedInt comp = 0; comp < RESTR_NUM_COMP; comp++) - v[loc_node*RESTR_STRIDE_NODES + comp*RESTR_STRIDE_COMP + elem*RESTR_STRIDE_ELEM] += - u[loc_node + comp*RESTR_ELEM_SIZE*RESTR_NUM_ELEM + - elem*RESTR_ELEM_SIZE]; + v[loc_node * RESTR_STRIDE_NODES + comp * RESTR_STRIDE_COMP + elem * RESTR_STRIDE_ELEM] += + u[loc_node + comp * RESTR_ELEM_SIZE * RESTR_NUM_ELEM + elem * RESTR_ELEM_SIZE]; } } //------------------------------------------------------------------------------ // E-vector -> L-vector, offsets provided //------------------------------------------------------------------------------ -extern "C" __global__ void OffsetTranspose(const CeedInt *__restrict__ lvec_indices, - const CeedInt *__restrict__ t_indices, - const CeedInt *__restrict__ t_offsets, - const CeedScalar *__restrict__ u, - CeedScalar *__restrict__ v) { +extern "C" __global__ void OffsetTranspose(const CeedInt *__restrict__ 
lvec_indices, const CeedInt *__restrict__ t_indices, + const CeedInt *__restrict__ t_offsets, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { CeedScalar value[RESTR_NUM_COMP]; - for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; - i < RESTR_NUM_NODES; - i += blockDim.x * gridDim.x) { - const CeedInt ind = lvec_indices[i]; + for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RESTR_NUM_NODES; i += blockDim.x * gridDim.x) { + const CeedInt ind = lvec_indices[i]; const CeedInt range_1 = t_offsets[i]; - const CeedInt range_N = t_offsets[i+1]; + const CeedInt range_N = t_offsets[i + 1]; - for (CeedInt comp = 0; comp < RESTR_NUM_COMP; comp++) - value[comp] = 0.0; + for (CeedInt comp = 0; comp < RESTR_NUM_COMP; comp++) value[comp] = 0.0; for (CeedInt j = range_1; j < range_N; j++) { - const CeedInt t_ind = t_indices[j]; - CeedInt loc_node = t_ind % RESTR_ELEM_SIZE; - CeedInt elem = t_ind / RESTR_ELEM_SIZE; + const CeedInt t_ind = t_indices[j]; + CeedInt loc_node = t_ind % RESTR_ELEM_SIZE; + CeedInt elem = t_ind / RESTR_ELEM_SIZE; for (CeedInt comp = 0; comp < RESTR_NUM_COMP; comp++) - value[comp] += u[loc_node + comp*RESTR_ELEM_SIZE*RESTR_NUM_ELEM + - elem*RESTR_ELEM_SIZE]; + value[comp] += u[loc_node + comp * RESTR_ELEM_SIZE * RESTR_NUM_ELEM + elem * RESTR_ELEM_SIZE]; } - for (CeedInt comp = 0; comp < RESTR_NUM_COMP; comp++) - v[ind + comp*RESTR_COMP_STRIDE] += value[comp]; + for (CeedInt comp = 0; comp < RESTR_NUM_COMP; comp++) v[ind + comp * RESTR_COMP_STRIDE] += value[comp]; } } diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h index 7ec3249b7a..ef1e7289bf 100644 --- a/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h @@ -20,10 +20,11 @@ // E-vector -> single element 
//------------------------------------------------------------------------------ template -inline __device__ void ReadElementStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, const CeedInt strides_elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { +inline __device__ void ReadElementStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { if (data.t_id_x < P_1D) { const CeedInt node = data.t_id_x; - const CeedInt ind = node * strides_node + elem * strides_elem; + const CeedInt ind = node * strides_node + elem * strides_elem; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { r_u[comp] = d_u[ind + comp * strides_comp]; } @@ -34,10 +35,11 @@ inline __device__ void ReadElementStrided1d(SharedData_Cuda &data, const CeedInt // Single element -> E-vector //------------------------------------------------------------------------------ template -inline __device__ void WriteElementStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { +inline __device__ void WriteElementStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { if (data.t_id_x < P_1D) { const CeedInt node = data.t_id_x; - const CeedInt ind = node * strides_node + elem * strides_elem; + const CeedInt ind = node * strides_node + elem * strides_elem; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { d_v[ind + comp * strides_comp] = r_v[comp]; } @@ -52,10 +54,11 @@ inline __device__ void WriteElementStrided1d(SharedData_Cuda &data, const CeedIn // E-vector -> single element //------------------------------------------------------------------------------ template -inline 
__device__ void ReadElementStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, const CeedInt strides_elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { +inline __device__ void ReadElementStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { if (data.t_id_x < P_1D && data.t_id_y < P_1D) { - const CeedInt node = data.t_id_x + data.t_id_y*P_1D; - const CeedInt ind = node * strides_node + elem * strides_elem; + const CeedInt node = data.t_id_x + data.t_id_y * P_1D; + const CeedInt ind = node * strides_node + elem * strides_elem; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { r_u[comp] = d_u[ind + comp * strides_comp]; } @@ -66,10 +69,11 @@ inline __device__ void ReadElementStrided2d(SharedData_Cuda &data, const CeedInt // Single element -> E-vector //------------------------------------------------------------------------------ template -inline __device__ void WriteElementStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { +inline __device__ void WriteElementStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { if (data.t_id_x < P_1D && data.t_id_y < P_1D) { - const CeedInt node = data.t_id_x + data.t_id_y*P_1D; - const CeedInt ind = node * strides_node + elem * strides_elem; + const CeedInt node = data.t_id_x + data.t_id_y * P_1D; + const CeedInt ind = node * strides_node + elem * strides_elem; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { d_v[ind + comp * strides_comp] = r_v[comp]; } @@ -84,11 +88,12 @@ inline __device__ void WriteElementStrided2d(SharedData_Cuda &data, const CeedIn // E-vector -> single 
element //------------------------------------------------------------------------------ template -inline __device__ void ReadElementStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, const CeedInt strides_elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { +inline __device__ void ReadElementStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { if (data.t_id_x < P_1D && data.t_id_y < P_1D) { for (CeedInt z = 0; z < P_1D; z++) { - const CeedInt node = data.t_id_x + data.t_id_y*P_1D + z*P_1D*P_1D; - const CeedInt ind = node * strides_node + elem * strides_elem; + const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; + const CeedInt ind = node * strides_node + elem * strides_elem; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { r_u[z + comp * P_1D] = d_u[ind + comp * strides_comp]; } @@ -100,11 +105,12 @@ inline __device__ void ReadElementStrided3d(SharedData_Cuda &data, const CeedInt // Single element -> E-vector //------------------------------------------------------------------------------ template -inline __device__ void WriteElementStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { +inline __device__ void WriteElementStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { if (data.t_id_x < P_1D && data.t_id_y < P_1D) { for (CeedInt z = 0; z < P_1D; z++) { - const CeedInt node = data.t_id_x + data.t_id_y*P_1D + z*P_1D*P_1D; - const CeedInt ind = node * strides_node + elem * strides_elem; + const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; + const CeedInt ind = node * 
strides_node + elem * strides_elem; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { d_v[ind + comp * strides_comp] = r_v[z + comp * P_1D]; } diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h index ceb44eabf0..7d4f099f8c 100644 --- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h @@ -26,7 +26,7 @@ inline __device__ void ContractX1d(SharedData_Cuda &data, const CeedScalar *U, c *V = 0.0; if (data.t_id_x < Q_1D) { for (CeedInt i = 0; i < P_1D; i++) { - *V += B[i + data.t_id_x * P_1D] * data.slice[i]; // Contract x direction + *V += B[i + data.t_id_x * P_1D] * data.slice[i]; // Contract x direction } } __syncthreads(); @@ -42,7 +42,7 @@ inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedSca *V = 0.0; if (data.t_id_x < P_1D) { for (CeedInt i = 0; i < Q_1D; i++) { - *V += B[data.t_id_x + i * P_1D] * data.slice[i]; // Contract x direction + *V += B[data.t_id_x + i * P_1D] * data.slice[i]; // Contract x direction } } __syncthreads(); @@ -62,7 +62,8 @@ inline __device__ void Interp1d(SharedData_Cuda &data, const CeedScalar *__restr // 1D interpolate transpose //------------------------------------------------------------------------------ template -inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { +inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + CeedScalar *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractTransposeX1d(data, r_U + comp, c_B, r_V + comp); } @@ -72,7 +73,8 @@ inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar // 1D derivatives at quadrature points 
//------------------------------------------------------------------------------ template -inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, + CeedScalar *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractX1d(data, r_U + comp, c_G, r_V + comp); } @@ -82,7 +84,8 @@ inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restric // 1D derivatives transpose //------------------------------------------------------------------------------ template -inline __device__ void GradTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +inline __device__ void GradTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, + CeedScalar *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractTransposeX1d(data, r_U + comp, c_G, r_V + comp); } @@ -105,12 +108,12 @@ inline __device__ void Weight1d(SharedData_Cuda &data, const CeedScalar *__restr //------------------------------------------------------------------------------ template inline __device__ void ContractX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { - data.slice[data.t_id_x+data.t_id_y*T_1D] = *U; + data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; __syncthreads(); *V = 0.0; if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { for (CeedInt i = 0; i < P_1D; i++) { - *V += B[i + data.t_id_x*P_1D] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction + *V += B[i + data.t_id_x * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction } } __syncthreads(); @@ -121,12 +124,12 @@ inline __device__ void 
ContractX2d(SharedData_Cuda &data, const CeedScalar *U, c //------------------------------------------------------------------------------ template inline __device__ void ContractY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { - data.slice[data.t_id_x+data.t_id_y*T_1D] = *U; + data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; __syncthreads(); *V = 0.0; if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { for (CeedInt i = 0; i < P_1D; i++) { - *V += B[i + data.t_id_y*P_1D] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction + *V += B[i + data.t_id_y * P_1D] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction } } __syncthreads(); @@ -137,12 +140,12 @@ inline __device__ void ContractY2d(SharedData_Cuda &data, const CeedScalar *U, c //------------------------------------------------------------------------------ template inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { - data.slice[data.t_id_x+data.t_id_y*T_1D] = *U; + data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; __syncthreads(); *V = 0.0; if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { for (CeedInt i = 0; i < Q_1D; i++) { - *V += B[data.t_id_y + i*P_1D] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction + *V += B[data.t_id_y + i * P_1D] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction } } __syncthreads(); @@ -153,12 +156,12 @@ inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const CeedSca //------------------------------------------------------------------------------ template inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { - data.slice[data.t_id_x+data.t_id_y*T_1D] = *U; + data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; __syncthreads(); *V = 0.0; if (data.t_id_x < P_1D && data.t_id_y < P_1D) { for (CeedInt i = 0; i < Q_1D; i++) { - *V += B[data.t_id_x + i*P_1D] * 
data.slice[i + data.t_id_y*T_1D]; // Contract x direction + *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction } } __syncthreads(); @@ -169,11 +172,11 @@ inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const CeedSca //------------------------------------------------------------------------------ template inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { - data.slice[data.t_id_x+data.t_id_y*T_1D] = *U; + data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; __syncthreads(); if (data.t_id_x < P_1D && data.t_id_y < P_1D) { for (CeedInt i = 0; i < Q_1D; i++) { - *V += B[data.t_id_x + i*P_1D] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction + *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction } } __syncthreads(); @@ -183,7 +186,8 @@ inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const Ceed // 2D interpolate to quadrature points //------------------------------------------------------------------------------ template -inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { +inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + CeedScalar *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractX2d(data, r_U + comp, c_B, r_t); @@ -195,7 +199,8 @@ inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *_ // 2D interpolate transpose //------------------------------------------------------------------------------ template -inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { +inline __device__ void 
InterpTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + CeedScalar *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractTransposeY2d(data, r_U + comp, c_B, r_t); @@ -207,13 +212,14 @@ inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const Ceed // 2D derivatives at quadrature points //------------------------------------------------------------------------------ template -inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, + CeedScalar *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractX2d(data, r_U + comp, c_G, r_t); - ContractY2d(data, r_t, c_B, r_V + comp + 0*NUM_COMP); + ContractY2d(data, r_t, c_B, r_V + comp + 0 * NUM_COMP); ContractX2d(data, r_U + comp, c_B, r_t); - ContractY2d(data, r_t, c_G, r_V + comp + 1*NUM_COMP); + ContractY2d(data, r_t, c_G, r_V + comp + 1 * NUM_COMP); } } @@ -221,12 +227,13 @@ inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__r // 2D derivatives transpose //------------------------------------------------------------------------------ template -inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, + CeedScalar *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeY2d(data, r_U + comp + 0*NUM_COMP, c_B, r_t); + ContractTransposeY2d(data, r_U + comp + 0 * 
NUM_COMP, c_B, r_t); ContractTransposeX2d(data, r_t, c_G, r_V + comp); - ContractTransposeY2d(data, r_U + comp + 1*NUM_COMP, c_G, r_t); + ContractTransposeY2d(data, r_U + comp + 1 * NUM_COMP, c_G, r_t); ContractTransposeAddX2d(data, r_t, c_B, r_V + comp); } } @@ -236,8 +243,7 @@ inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedSc //------------------------------------------------------------------------------ template inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { - *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? - q_weight_1d[data.t_id_x]*q_weight_1d[data.t_id_y] : 0.0; + *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0; } //------------------------------------------------------------------------------ @@ -251,16 +257,16 @@ template inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { CeedScalar r_B[P_1D]; for (CeedInt i = 0; i < P_1D; i++) { - r_B[i] = B[i + data.t_id_x*P_1D]; + r_B[i] = B[i + data.t_id_x * P_1D]; } for (CeedInt k = 0; k < P_1D; k++) { - data.slice[data.t_id_x+data.t_id_y*T_1D] = U[k]; + data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; __syncthreads(); V[k] = 0.0; if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { for (CeedInt i = 0; i < P_1D; i++) { - V[k] += r_B[i] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction + V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction } } __syncthreads(); @@ -274,16 +280,16 @@ template inline __device__ void ContractY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { CeedScalar r_B[P_1D]; for (CeedInt i = 0; i < P_1D; i++) { - r_B[i] = B[i + data.t_id_y*P_1D]; + r_B[i] = B[i + data.t_id_y * P_1D]; } for (CeedInt k = 0; k < P_1D; k++) { - data.slice[data.t_id_x+data.t_id_y*T_1D] = U[k]; + data.slice[data.t_id_x + 
data.t_id_y * T_1D] = U[k]; __syncthreads(); V[k] = 0.0; if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { for (CeedInt i = 0; i < P_1D; i++) { - V[k] += r_B[i] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction + V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction } } __syncthreads(); @@ -299,7 +305,7 @@ inline __device__ void ContractZ3d(SharedData_Cuda &data, const CeedScalar *U, c V[k] = 0.0; if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { for (CeedInt i = 0; i < P_1D; i++) { - V[k] += B[i + k*P_1D] * U[i]; // Contract z direction + V[k] += B[i + k * P_1D] * U[i]; // Contract z direction } } } @@ -314,7 +320,7 @@ inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const CeedSca V[k] = 0.0; if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { for (CeedInt i = 0; i < Q_1D; i++) { - V[k] += B[k + i*P_1D] * U[i]; // Contract z direction + V[k] += B[k + i * P_1D] * U[i]; // Contract z direction } } } @@ -327,16 +333,16 @@ template inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { CeedScalar r_B[Q_1D]; for (CeedInt i = 0; i < Q_1D; i++) { - r_B[i] = B[data.t_id_y + i*P_1D]; + r_B[i] = B[data.t_id_y + i * P_1D]; } for (CeedInt k = 0; k < P_1D; k++) { - data.slice[data.t_id_x+data.t_id_y*T_1D] = U[k]; + data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; __syncthreads(); V[k] = 0.0; if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { for (CeedInt i = 0; i < Q_1D; i++) { - V[k] += r_B[i] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction + V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction } } __syncthreads(); @@ -350,15 +356,15 @@ template inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { CeedScalar r_B[Q_1D]; for (CeedInt i = 0; i < Q_1D; i++) { - r_B[i] = B[data.t_id_y + i*P_1D]; + r_B[i] = B[data.t_id_y + i * P_1D]; } for (CeedInt k = 0; k < 
P_1D; k++) { - data.slice[data.t_id_x+data.t_id_y*T_1D] = U[k]; + data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; __syncthreads(); if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { for (CeedInt i = 0; i < Q_1D; i++) { - V[k] += r_B[i] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction + V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction } } __syncthreads(); @@ -372,16 +378,16 @@ template inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { CeedScalar r_B[Q_1D]; for (CeedInt i = 0; i < Q_1D; i++) { - r_B[i] = B[data.t_id_x + i*P_1D]; + r_B[i] = B[data.t_id_x + i * P_1D]; } for (CeedInt k = 0; k < P_1D; k++) { - data.slice[data.t_id_x+data.t_id_y*T_1D] = U[k]; + data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; __syncthreads(); V[k] = 0.0; if (data.t_id_x < P_1D && data.t_id_y < P_1D) { for (CeedInt i = 0; i < Q_1D; i++) { - V[k] += r_B[i] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction + V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction } } __syncthreads(); @@ -395,15 +401,15 @@ template inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { CeedScalar r_B[Q_1D]; for (CeedInt i = 0; i < Q_1D; i++) { - r_B[i] = B[data.t_id_x + i*P_1D]; + r_B[i] = B[data.t_id_x + i * P_1D]; } for (CeedInt k = 0; k < P_1D; k++) { - data.slice[data.t_id_x+data.t_id_y*T_1D] = U[k]; + data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; __syncthreads(); if (data.t_id_x < P_1D && data.t_id_y < P_1D) { for (CeedInt i = 0; i < Q_1D; i++) { - V[k] += r_B[i] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction + V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction } } __syncthreads(); @@ -414,13 +420,14 @@ inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const Ceed // 3D interpolate to quadrature points 
//------------------------------------------------------------------------------ template -inline __device__ void InterpTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { +inline __device__ void InterpTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + CeedScalar *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX3d(data, r_U + comp*P_1D, c_B, r_t1); + ContractX3d(data, r_U + comp * P_1D, c_B, r_t1); ContractY3d(data, r_t1, c_B, r_t2); - ContractZ3d(data, r_t2, c_B, r_V + comp*Q_1D); + ContractZ3d(data, r_t2, c_B, r_V + comp * Q_1D); } } @@ -428,13 +435,14 @@ inline __device__ void InterpTensor3d(SharedData_Cuda &data, const CeedScalar *_ // 3D interpolate transpose //------------------------------------------------------------------------------ template -inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { +inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + CeedScalar *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeZ3d(data, r_U + comp*Q_1D, c_B, r_t1); + ContractTransposeZ3d(data, r_U + comp * Q_1D, c_B, r_t1); ContractTransposeY3d(data, r_t1, c_B, r_t2); - ContractTransposeX3d(data, r_t2, c_B, r_V + comp*P_1D); + ContractTransposeX3d(data, r_t2, c_B, r_V + comp * P_1D); } } @@ -442,19 +450,20 @@ inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const Ceed // 3D derivatives at quadrature points //------------------------------------------------------------------------------ template -inline __device__ void GradTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const 
CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +inline __device__ void GradTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, + CeedScalar *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX3d(data, r_U + comp*P_1D, c_G, r_t1); + ContractX3d(data, r_U + comp * P_1D, c_G, r_t1); ContractY3d(data, r_t1, c_B, r_t2); - ContractZ3d(data, r_t2, c_B, r_V + comp*Q_1D + 0*NUM_COMP*Q_1D); - ContractX3d(data, r_U + comp*P_1D, c_B, r_t1); + ContractZ3d(data, r_t2, c_B, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D); + ContractX3d(data, r_U + comp * P_1D, c_B, r_t1); ContractY3d(data, r_t1, c_G, r_t2); - ContractZ3d(data, r_t2, c_B, r_V + comp*Q_1D + 1*NUM_COMP*Q_1D); - ContractX3d(data, r_U + comp*P_1D, c_B, r_t1); + ContractZ3d(data, r_t2, c_B, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D); + ContractX3d(data, r_U + comp * P_1D, c_B, r_t1); ContractY3d(data, r_t1, c_B, r_t2); - ContractZ3d(data, r_t2, c_G, r_V + comp*Q_1D + 2*NUM_COMP*Q_1D); + ContractZ3d(data, r_t2, c_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D); } } @@ -462,19 +471,20 @@ inline __device__ void GradTensor3d(SharedData_Cuda &data, const CeedScalar *__r // 3D derivatives transpose //------------------------------------------------------------------------------ template -inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, + CeedScalar *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeZ3d(data, r_U + comp*Q_1D + 0*NUM_COMP*Q_1D, c_B, r_t1); + ContractTransposeZ3d(data, r_U + comp * Q_1D + 0 * NUM_COMP 
* Q_1D, c_B, r_t1); ContractTransposeY3d(data, r_t1, c_B, r_t2); - ContractTransposeX3d(data, r_t2, c_G, r_V + comp*P_1D); - ContractTransposeZ3d(data, r_U + comp*Q_1D + 1*NUM_COMP*Q_1D, c_B, r_t1); + ContractTransposeX3d(data, r_t2, c_G, r_V + comp * P_1D); + ContractTransposeZ3d(data, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, c_B, r_t1); ContractTransposeY3d(data, r_t1, c_G, r_t2); - ContractTransposeAddX3d(data, r_t2, c_B, r_V + comp*P_1D); - ContractTransposeZ3d(data, r_U + comp*Q_1D + 2*NUM_COMP*Q_1D, c_G, r_t1); + ContractTransposeAddX3d(data, r_t2, c_B, r_V + comp * P_1D); + ContractTransposeZ3d(data, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, c_G, r_t1); ContractTransposeY3d(data, r_t1, c_B, r_t2); - ContractTransposeAddX3d(data, r_t2, c_B, r_V + comp*P_1D); + ContractTransposeAddX3d(data, r_t2, c_B, r_V + comp * P_1D); } } @@ -482,16 +492,17 @@ inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const CeedSc // 3D derivatives at quadrature points //------------------------------------------------------------------------------ template -inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, + CeedScalar *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX3d(data, r_U + comp*P_1D, c_B, r_t1); + ContractX3d(data, r_U + comp * P_1D, c_B, r_t1); ContractY3d(data, r_t1, c_B, r_t2); ContractZ3d(data, r_t2, c_B, r_t1); - ContractX3d(data, r_t1, c_G, r_V + comp*Q_1D + 0*NUM_COMP*Q_1D); - ContractY3d(data, r_t1, c_G, r_V + comp*Q_1D + 1*NUM_COMP*Q_1D); - ContractZ3d(data, r_t1, c_G, r_V + comp*Q_1D + 2*NUM_COMP*Q_1D); + ContractX3d(data, r_t1, c_G, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D); + 
ContractY3d(data, r_t1, c_G, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D); + ContractZ3d(data, r_t1, c_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D); } } @@ -499,16 +510,17 @@ inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const CeedS // 3D derivatives transpose //------------------------------------------------------------------------------ template -inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeZ3d(data, r_U + comp*Q_1D + 2*NUM_COMP*Q_1D, c_G, r_t2); - ContractTransposeAddY3d(data, r_U + comp*Q_1D + 1*NUM_COMP*Q_1D, c_G, r_t2); - ContractTransposeAddX3d(data, r_U + comp*Q_1D + 0*NUM_COMP*Q_1D, c_G, r_t2); + ContractTransposeZ3d(data, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, c_G, r_t2); + ContractTransposeAddY3d(data, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, c_G, r_t2); + ContractTransposeAddX3d(data, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, c_G, r_t2); ContractTransposeZ3d(data, r_t2, c_B, r_t1); ContractTransposeY3d(data, r_t1, c_B, r_t2); - ContractTransposeX3d(data, r_t2, c_B, r_V + comp*P_1D); + ContractTransposeX3d(data, r_t2, c_B, r_V + comp * P_1D); } } @@ -517,10 +529,10 @@ inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, co //------------------------------------------------------------------------------ template inline __device__ void WeightTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { - const bool quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D); - const CeedScalar pw = quad ? 
q_weight_1d[data.t_id_x]*q_weight_1d[data.t_id_y] : 0.0; + const bool quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D); + const CeedScalar pw = quad ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0; for (CeedInt q = 0; q < Q_1D; q++) { - w[q] = quad ? pw*q_weight_1d[q] : 0.0; + w[q] = quad ? pw * q_weight_1d[q] : 0.0; } } diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h index 42f545b72e..43d3441d6a 100644 --- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h @@ -11,74 +11,74 @@ #define _ceed_cuda_shared_basis_tensor_h #include + #include "cuda-shared-basis-read-write-templates.h" #include "cuda-shared-basis-tensor-templates.h" //------------------------------------------------------------------------------ // Interp kernel by dim //------------------------------------------------------------------------------ -extern "C" __global__ void Interp(const CeedInt num_elem, - const CeedScalar *c_B, - const CeedScalar *__restrict__ d_U, - CeedScalar *__restrict__ d_V) { +extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; data.t_id_x = threadIdx.x; data.t_id_y = threadIdx.y; data.t_id_z = threadIdx.z; - data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x; - data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? 
BASIS_Q_1D : 1)]; - for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) { + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { if (BASIS_DIM == 1) { - ReadElementStrided1d(data, elem, 1, BASIS_P_1D*num_elem, BASIS_P_1D, d_U, r_U); + ReadElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U); Interp1d(data, r_U, c_B, r_V); - WriteElementStrided1d(data, elem, 1, BASIS_Q_1D*num_elem, BASIS_Q_1D, r_V, d_V); + WriteElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V); } else if (BASIS_DIM == 2) { - ReadElementStrided2d(data, elem, 1, BASIS_P_1D*BASIS_P_1D*num_elem, BASIS_P_1D*BASIS_P_1D, d_U, r_U); + ReadElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U); InterpTensor2d(data, r_U, c_B, r_V); - WriteElementStrided2d(data, elem, 1, BASIS_Q_1D*BASIS_Q_1D*num_elem, BASIS_Q_1D*BASIS_Q_1D, r_V, d_V); + WriteElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V, d_V); } else if (BASIS_DIM == 3) { - ReadElementStrided3d(data, elem, 1, BASIS_P_1D*BASIS_P_1D*BASIS_P_1D*num_elem, BASIS_P_1D*BASIS_P_1D*BASIS_P_1D, d_U, r_U); + ReadElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U); InterpTensor3d(data, r_U, c_B, r_V); - WriteElementStrided3d(data, elem, 1, BASIS_Q_1D*BASIS_Q_1D*BASIS_Q_1D*num_elem, BASIS_Q_1D*BASIS_Q_1D*BASIS_Q_1D, r_V, d_V); + WriteElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V); } } } -extern "C" __global__ void InterpTranspose(const CeedInt num_elem, - const CeedScalar *c_B, - const CeedScalar *__restrict__ d_U, - CeedScalar *__restrict__ d_V) { +extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar 
*__restrict__ d_U, + CeedScalar *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; data.t_id_x = threadIdx.x; data.t_id_y = threadIdx.y; data.t_id_z = threadIdx.z; - data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x; - data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; - for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) { + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { if (BASIS_DIM == 1) { - ReadElementStrided1d(data, elem, 1, BASIS_Q_1D*num_elem, BASIS_Q_1D, d_U, r_U); + ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U); InterpTranspose1d(data, r_U, c_B, r_V); - WriteElementStrided1d(data, elem, 1, BASIS_P_1D*num_elem, BASIS_P_1D, r_V, d_V); + WriteElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); } else if (BASIS_DIM == 2) { - ReadElementStrided2d(data, elem, 1, BASIS_Q_1D*BASIS_Q_1D*num_elem, BASIS_Q_1D*BASIS_Q_1D, d_U, r_U); + ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); InterpTransposeTensor2d(data, r_U, c_B, r_V); - WriteElementStrided2d(data, elem, 1, BASIS_P_1D*BASIS_P_1D*num_elem, BASIS_P_1D*BASIS_P_1D, r_V, d_V); + WriteElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); } else if (BASIS_DIM == 3) { - ReadElementStrided3d(data, elem, 1, BASIS_Q_1D*BASIS_Q_1D*BASIS_Q_1D*num_elem, BASIS_Q_1D*BASIS_Q_1D*BASIS_Q_1D, d_U, r_U); + ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * 
BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); InterpTransposeTensor3d(data, r_U, c_B, r_V); - WriteElementStrided3d(data, elem, 1, BASIS_P_1D*BASIS_P_1D*BASIS_P_1D*num_elem, BASIS_P_1D*BASIS_P_1D*BASIS_P_1D, r_V, d_V); + WriteElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); } } } @@ -86,9 +86,7 @@ extern "C" __global__ void InterpTranspose(const CeedInt num_elem, //------------------------------------------------------------------------------ // Grad kernel by dim //------------------------------------------------------------------------------ -extern "C" __global__ void Grad(const CeedInt num_elem, - const CeedScalar *c_B, const CeedScalar *c_G, - const CeedScalar *__restrict__ d_U, +extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; @@ -96,60 +94,64 @@ extern "C" __global__ void Grad(const CeedInt num_elem, data.t_id_x = threadIdx.x; data.t_id_y = threadIdx.y; data.t_id_z = threadIdx.z; - data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x; - data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? 
BASIS_Q_1D : 1)]; - for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) { + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { if (BASIS_DIM == 1) { - ReadElementStrided1d(data, elem, 1, BASIS_P_1D*num_elem, BASIS_P_1D, d_U, r_U); + ReadElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U); Grad1d(data, r_U, c_B, c_G, r_V); - WriteElementStrided1d(data, elem, 1, BASIS_Q_1D*num_elem, BASIS_Q_1D, r_V, d_V); + WriteElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V); } else if (BASIS_DIM == 2) { - ReadElementStrided2d(data, elem, 1, BASIS_P_1D*BASIS_P_1D*num_elem, BASIS_P_1D*BASIS_P_1D, d_U, r_U); + ReadElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U); GradTensor2d(data, r_U, c_B, c_G, r_V); - WriteElementStrided2d(data, elem, 1, BASIS_Q_1D*BASIS_Q_1D*num_elem, BASIS_Q_1D*BASIS_Q_1D, r_V, d_V); + WriteElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V, + d_V); } else if (BASIS_DIM == 3) { - ReadElementStrided3d(data, elem, 1, BASIS_P_1D*BASIS_P_1D*BASIS_P_1D*num_elem, BASIS_P_1D*BASIS_P_1D*BASIS_P_1D, d_U, r_U); + ReadElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U); if (BASIS_HAS_COLLOCATED_GRAD) GradTensorCollocated3d(data, r_U, c_B, c_G, r_V); - else GradTensor3d(data, r_U, c_B, c_G, r_V); - WriteElementStrided3d(data, elem, 1, BASIS_Q_1D*BASIS_Q_1D*BASIS_Q_1D*num_elem, BASIS_Q_1D*BASIS_Q_1D*BASIS_Q_1D, r_V, d_V); + else GradTensor3d(data, r_U, c_B, c_G, r_V); + WriteElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V); } } } -extern "C" __global__ void GradTranspose(const CeedInt num_elem, - const CeedScalar *c_B, const CeedScalar *c_G, - const 
CeedScalar *__restrict__ d_U, - CeedScalar *__restrict__ d_V) { +extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, + CeedScalar *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; data.t_id_x = threadIdx.x; data.t_id_y = threadIdx.y; data.t_id_z = threadIdx.z; - data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x; - data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; - for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) { + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { if (BASIS_DIM == 1) { - ReadElementStrided1d(data, elem, 1, BASIS_Q_1D*num_elem, BASIS_Q_1D, d_U, r_U); + ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U); GradTranspose1d(data, r_U, c_B, c_G, r_V); - WriteElementStrided1d(data, elem, 1, BASIS_P_1D*num_elem, BASIS_P_1D, r_V, d_V); + WriteElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); } else if (BASIS_DIM == 2) { - ReadElementStrided2d(data, elem, 1, BASIS_Q_1D*BASIS_Q_1D*num_elem, BASIS_Q_1D*BASIS_Q_1D, d_U, r_U); + ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, + r_U); GradTransposeTensor2d(data, r_U, c_B, c_G, r_V); - WriteElementStrided2d(data, elem, 1, BASIS_P_1D*BASIS_P_1D*num_elem, BASIS_P_1D*BASIS_P_1D, r_V, d_V); + WriteElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); } else if 
(BASIS_DIM == 3) { - ReadElementStrided3d(data, elem, 1, BASIS_Q_1D*BASIS_Q_1D*BASIS_Q_1D*num_elem, BASIS_Q_1D*BASIS_Q_1D*BASIS_Q_1D, d_U, r_U); + ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d(data, r_U, c_B, c_G, r_V); - else GradTransposeTensor3d(data, r_U, c_B, c_G, r_V); - WriteElementStrided3d(data, elem, 1, BASIS_P_1D*BASIS_P_1D*BASIS_P_1D*num_elem, BASIS_P_1D*BASIS_P_1D*BASIS_P_1D, r_V, d_V); + else GradTransposeTensor3d(data, r_U, c_B, c_G, r_V); + WriteElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); } } } @@ -157,30 +159,29 @@ extern "C" __global__ void GradTranspose(const CeedInt num_elem, //------------------------------------------------------------------------------ // Weight kernels by dim //------------------------------------------------------------------------------ -extern "C" __global__ void Weight(const CeedInt num_elem, - const CeedScalar *__restrict__ q_weight_1d, - CeedScalar *__restrict__ d_W) { +extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *__restrict__ d_W) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; data.t_id_x = threadIdx.x; data.t_id_y = threadIdx.y; data.t_id_z = threadIdx.z; - data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x; - data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); CeedScalar r_W[BASIS_DIM > 2 ? 
BASIS_Q_1D : 1]; - for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) { + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { if (BASIS_DIM == 1) { Weight1d(data, q_weight_1d, r_W); - WriteElementStrided1d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D*num_elem, BASIS_Q_1D, r_W, d_W); + WriteElementStrided1d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_W, d_W); } else if (BASIS_DIM == 2) { WeightTensor2d(data, q_weight_1d, r_W); - WriteElementStrided2d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D*BASIS_Q_1D*num_elem, BASIS_Q_1D*BASIS_Q_1D, r_W, d_W); + WriteElementStrided2d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_W, d_W); } else if (BASIS_DIM == 3) { WeightTensor3d(data, q_weight_1d, r_W); - WriteElementStrided3d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D*BASIS_Q_1D*BASIS_Q_1D*num_elem, BASIS_Q_1D*BASIS_Q_1D*BASIS_Q_1D, r_W, d_W); + WriteElementStrided3d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_W, + d_W); } } } diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis.h b/include/ceed/jit-source/cuda/cuda-shared-basis.h new file mode 100644 index 0000000000..cdddbca8dc --- /dev/null +++ b/include/ceed/jit-source/cuda/cuda-shared-basis.h @@ -0,0 +1,642 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#include + +//------------------------------------------------------------------------------ +// Shared mem kernels +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// Sum input into output +//------------------------------------------------------------------------------ +inline __device__ void add(CeedScalar *r_V, const CeedScalar *r_U) { + for (CeedInt i = 0; i < BASIS_P_1D; i++) r_V[i] += r_U[i]; +} + +//------------------------------------------------------------------------------ +// 1D +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// Read DoFs +//------------------------------------------------------------------------------ +inline __device__ void readDofs1d(const CeedInt elem, const CeedInt t_in_x, const CeedInt t_in_y, const CeedInt t_in_z, const CeedInt comp, + const CeedInt num_elem, const CeedScalar *d_U, CeedScalar *slice) { + for (CeedInt i = 0; i < BASIS_P_1D; i++) slice[i + t_in_z * BASIS_T_1D] = d_U[i + elem * BASIS_P_1D + comp * BASIS_P_1D * num_elem]; + for (CeedInt i = BASIS_P_1D; i < BASIS_Q_1D; i++) slice[i + t_in_z * BASIS_T_1D] = 0.0; +} + +//------------------------------------------------------------------------------ +// Write DoFs +//------------------------------------------------------------------------------ +inline __device__ void writeDofs1d(const CeedInt elem, const CeedInt t_in_x, const CeedInt t_in_y, const CeedInt comp, const CeedInt num_elem, + const CeedScalar &r_V, CeedScalar *d_V) { + if (t_in_x < BASIS_P_1D) d_V[t_in_x + elem * BASIS_P_1D + comp * BASIS_P_1D * num_elem] = r_V; +} + +//------------------------------------------------------------------------------ +// Read quadrature 
point data +//------------------------------------------------------------------------------ +inline __device__ void readQuads1d(const CeedInt elem, const CeedInt t_in_x, const CeedInt t_in_y, const CeedInt t_in_z, const CeedInt comp, + const CeedInt dim, const CeedInt num_elem, const CeedScalar *d_U, CeedScalar *slice) { + for (CeedInt i = 0; i < BASIS_Q_1D; i++) + slice[i + t_in_z * BASIS_T_1D] = d_U[i + elem * BASIS_Q_1D + comp * BASIS_Q_1D * num_elem + dim * BASIS_NUM_COMP * num_elem * BASIS_Q_1D]; + for (CeedInt i = BASIS_Q_1D; i < BASIS_P_1D; i++) slice[i + t_in_z * BASIS_T_1D] = 0.0; +} + +//------------------------------------------------------------------------------ +// Write quadrature point data +//------------------------------------------------------------------------------ +inline __device__ void writeQuads1d(const CeedInt elem, const CeedInt t_in_x, const CeedInt t_in_y, const CeedInt comp, const CeedInt dim, + const CeedInt num_elem, const CeedScalar &r_V, CeedScalar *d_V) { + if (t_in_x < BASIS_Q_1D) d_V[t_in_x + elem * BASIS_Q_1D + comp * BASIS_Q_1D * num_elem + dim * BASIS_NUM_COMP * num_elem * BASIS_Q_1D] = r_V; +} + +//------------------------------------------------------------------------------ +// 1D tensor contraction +//------------------------------------------------------------------------------ +inline __device__ void ContractX1d(CeedScalar *slice, const CeedInt t_in_x, const CeedInt t_in_y, const CeedInt t_in_z, const CeedScalar &U, + const CeedScalar *B, CeedScalar &V) { + V = 0.0; + for (CeedInt i = 0; i < BASIS_P_1D; i++) V += B[i + t_in_x * BASIS_P_1D] * slice[i + t_in_z * BASIS_T_1D]; // Contract x direction +} + +//------------------------------------------------------------------------------ +// 1D transpose tensor contraction +//------------------------------------------------------------------------------ +inline __device__ void ContractTransposeX1d(CeedScalar *slice, const CeedInt t_in_x, const CeedInt t_in_y, const CeedInt 
t_in_z, const CeedScalar &U, + const CeedScalar *B, CeedScalar &V) { + V = 0.0; + for (CeedInt i = 0; i < BASIS_Q_1D; i++) V += B[t_in_x + i * BASIS_P_1D] * slice[i + t_in_z * BASIS_T_1D]; // Contract x direction +} + +//------------------------------------------------------------------------------ +// 1D interpolate to quadrature points +//------------------------------------------------------------------------------ +inline __device__ void interp1d(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, + CeedScalar *__restrict__ d_V, CeedScalar *slice) { + CeedScalar r_V; + CeedScalar r_t; + + const CeedInt t_in_x = threadIdx.x; + const CeedInt t_in_y = threadIdx.y; + const CeedInt t_in_z = threadIdx.z; + + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { + if (transpose) { + readQuads1d(elem, t_in_x, t_in_y, t_in_z, comp, 0, num_elem, d_U, slice); + ContractTransposeX1d(slice, t_in_x, t_in_y, t_in_z, r_t, c_B, r_V); + writeDofs1d(elem, t_in_x, t_in_y, comp, num_elem, r_V, d_V); + } else { + readDofs1d(elem, t_in_x, t_in_y, t_in_z, comp, num_elem, d_U, slice); + ContractX1d(slice, t_in_x, t_in_y, t_in_z, r_t, c_B, r_V); + writeQuads1d(elem, t_in_x, t_in_y, comp, 0, num_elem, r_V, d_V); + } + } + } +} + +//------------------------------------------------------------------------------ +// 1D derivatives at quadrature points +//------------------------------------------------------------------------------ +inline __device__ void grad1d(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *c_B, const CeedScalar *c_G, + const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V, CeedScalar *slice) { + CeedScalar r_U; + CeedScalar r_V; + + const CeedInt t_in_x = threadIdx.x; + const CeedInt t_in_y = threadIdx.y; + const CeedInt t_in_z = threadIdx.z; + CeedInt dim; + + for (CeedInt 
elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { + if (transpose) { + dim = 0; + readQuads1d(elem, t_in_x, t_in_y, t_in_z, comp, dim, num_elem, d_U, slice); + ContractTransposeX1d(slice, t_in_x, t_in_y, t_in_z, r_U, c_G, r_V); + writeDofs1d(elem, t_in_x, t_in_y, comp, num_elem, r_V, d_V); + } else { + readDofs1d(elem, t_in_x, t_in_y, t_in_z, comp, num_elem, d_U, slice); + ContractX1d(slice, t_in_x, t_in_y, t_in_z, r_U, c_G, r_V); + dim = 0; + writeQuads1d(elem, t_in_x, t_in_y, comp, dim, num_elem, r_V, d_V); + } + } + } +} + +//------------------------------------------------------------------------------ +// 1D Quadrature weights +//------------------------------------------------------------------------------ +__device__ void weight1d(const CeedInt num_elem, const CeedScalar *q_weight_1d, CeedScalar *w) { + const CeedInt tid = threadIdx.x; + const CeedScalar weight = q_weight_1d[tid]; + for (CeedInt elem = blockIdx.x * blockDim.y + threadIdx.y; elem < num_elem; elem += gridDim.x * blockDim.y) { + const CeedInt ind = elem * BASIS_Q_1D + tid; + w[ind] = weight; + } +} + +//------------------------------------------------------------------------------ +// 2D +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// Read DoFs +//------------------------------------------------------------------------------ +inline __device__ void readDofs2d(const CeedInt elem, const CeedInt t_in_x, const CeedInt t_in_y, const CeedInt comp, const CeedInt num_elem, + const CeedScalar *d_U, CeedScalar &U) { + U = (t_in_x < BASIS_P_1D && t_in_y < BASIS_P_1D) + ? 
d_U[t_in_x + t_in_y * BASIS_P_1D + elem * BASIS_P_1D * BASIS_P_1D + comp * BASIS_P_1D * BASIS_P_1D * num_elem] + : 0.0; +} + +//------------------------------------------------------------------------------ +// Write DoFs +//------------------------------------------------------------------------------ +inline __device__ void writeDofs2d(const CeedInt elem, const CeedInt t_in_x, const CeedInt t_in_y, const CeedInt comp, const CeedInt num_elem, + const CeedScalar &r_V, CeedScalar *d_V) { + if (t_in_x < BASIS_P_1D && t_in_y < BASIS_P_1D) + d_V[t_in_x + t_in_y * BASIS_P_1D + elem * BASIS_P_1D * BASIS_P_1D + comp * BASIS_P_1D * BASIS_P_1D * num_elem] = r_V; +} + +//------------------------------------------------------------------------------ +// Read quadrature point data +//------------------------------------------------------------------------------ +inline __device__ void readQuads2d(const CeedInt elem, const CeedInt t_in_x, const CeedInt t_in_y, const CeedInt comp, const CeedInt dim, + const CeedInt num_elem, const CeedScalar *d_U, CeedScalar &U) { + U = (t_in_x < BASIS_Q_1D && t_in_y < BASIS_Q_1D) + ? 
d_U[t_in_x + t_in_y * BASIS_Q_1D + elem * BASIS_Q_1D * BASIS_Q_1D + comp * BASIS_Q_1D * BASIS_Q_1D * num_elem + + dim * BASIS_NUM_COMP * num_elem * BASIS_Q_1D * BASIS_Q_1D] + : 0.0; +} + +//------------------------------------------------------------------------------ +// Write quadrature point data +//------------------------------------------------------------------------------ +inline __device__ void writeQuads2d(const CeedInt elem, const CeedInt t_in_x, const CeedInt t_in_y, const CeedInt comp, const CeedInt dim, + const CeedInt num_elem, const CeedScalar &r_V, CeedScalar *d_V) { + if (t_in_x < BASIS_Q_1D && t_in_y < BASIS_Q_1D) + d_V[t_in_x + t_in_y * BASIS_Q_1D + elem * BASIS_Q_1D * BASIS_Q_1D + comp * BASIS_Q_1D * BASIS_Q_1D * num_elem + + dim * BASIS_NUM_COMP * num_elem * BASIS_Q_1D * BASIS_Q_1D] = r_V; +} + +//------------------------------------------------------------------------------ +// 2D tensor contraction x +//------------------------------------------------------------------------------ +inline __device__ void ContractX2d(CeedScalar *slice, const CeedInt t_in_x, const CeedInt t_in_y, const CeedInt t_in_z, const CeedScalar &U, + const CeedScalar *B, CeedScalar &V) { + slice[t_in_x + t_in_y * BASIS_T_1D + t_in_z * BASIS_T_1D * BASIS_T_1D] = U; + __syncthreads(); + V = 0.0; + if (t_in_x < BASIS_Q_1D) + for (CeedInt i = 0; i < BASIS_P_1D; i++) + V += B[i + t_in_x * BASIS_P_1D] * slice[i + t_in_y * BASIS_T_1D + t_in_z * BASIS_T_1D * BASIS_T_1D]; // Contract x direction + __syncthreads(); +} + +//------------------------------------------------------------------------------ +// 2D tensor contraction y +//------------------------------------------------------------------------------ +inline __device__ void ContractY2d(CeedScalar *slice, const CeedInt t_in_x, const CeedInt t_in_y, const CeedInt t_in_z, const CeedScalar &U, + const CeedScalar *B, CeedScalar &V) { + slice[t_in_x + t_in_y * BASIS_T_1D + t_in_z * BASIS_T_1D * BASIS_T_1D] = U; + 
__syncthreads(); + V = 0.0; + if (t_in_y < BASIS_Q_1D) + for (CeedInt i = 0; i < BASIS_P_1D; i++) + V += B[i + t_in_y * BASIS_P_1D] * slice[t_in_x + i * BASIS_T_1D + t_in_z * BASIS_T_1D * BASIS_T_1D]; // Contract y direction + __syncthreads(); +} + +//------------------------------------------------------------------------------ +// 2D transpose tensor contraction y +//------------------------------------------------------------------------------ +inline __device__ void ContractTransposeY2d(CeedScalar *slice, const CeedInt t_in_x, const CeedInt t_in_y, const CeedInt t_in_z, const CeedScalar &U, + const CeedScalar *B, CeedScalar &V) { + slice[t_in_x + t_in_y * BASIS_T_1D + t_in_z * BASIS_T_1D * BASIS_T_1D] = U; + __syncthreads(); + V = 0.0; + if (t_in_y < BASIS_P_1D) + for (CeedInt i = 0; i < BASIS_Q_1D; i++) + V += B[t_in_y + i * BASIS_P_1D] * slice[t_in_x + i * BASIS_T_1D + t_in_z * BASIS_T_1D * BASIS_T_1D]; // Contract y direction + __syncthreads(); +} + +//------------------------------------------------------------------------------ +// 2D transpose tensor contraction x +//------------------------------------------------------------------------------ +inline __device__ void ContractTransposeX2d(CeedScalar *slice, const CeedInt t_in_x, const CeedInt t_in_y, const CeedInt t_in_z, const CeedScalar &U, + const CeedScalar *B, CeedScalar &V) { + slice[t_in_x + t_in_y * BASIS_T_1D + t_in_z * BASIS_T_1D * BASIS_T_1D] = U; + __syncthreads(); + V = 0.0; + if (t_in_x < BASIS_P_1D) + for (CeedInt i = 0; i < BASIS_Q_1D; i++) + V += B[t_in_x + i * BASIS_P_1D] * slice[i + t_in_y * BASIS_T_1D + t_in_z * BASIS_T_1D * BASIS_T_1D]; // Contract x direction + __syncthreads(); +} + +//------------------------------------------------------------------------------ +// 2D interpolate to quadrature points +//------------------------------------------------------------------------------ +inline __device__ void interp2d(const CeedInt num_elem, const CeedInt transpose, const CeedScalar 
*c_B, const CeedScalar *__restrict__ d_U, + CeedScalar *__restrict__ d_V, CeedScalar *slice) { + CeedScalar r_V; + CeedScalar r_t; + + const CeedInt t_in_x = threadIdx.x; + const CeedInt t_in_y = threadIdx.y; + const CeedInt t_in_z = threadIdx.z; + const CeedInt blockElem = t_in_z / BASIS_NUM_COMP; + const CeedInt elemsPerBlock = blockDim.z / BASIS_NUM_COMP; + const CeedInt comp = t_in_z % BASIS_NUM_COMP; + + for (CeedInt elem = blockIdx.x * elemsPerBlock + blockElem; elem < num_elem; elem += gridDim.x * elemsPerBlock) { + const CeedInt comp = t_in_z % BASIS_NUM_COMP; + r_V = 0.0; + r_t = 0.0; + if (transpose) { + readQuads2d(elem, t_in_x, t_in_y, comp, 0, num_elem, d_U, r_V); + ContractTransposeY2d(slice, t_in_x, t_in_y, t_in_z, r_V, c_B, r_t); + ContractTransposeX2d(slice, t_in_x, t_in_y, t_in_z, r_t, c_B, r_V); + writeDofs2d(elem, t_in_x, t_in_y, comp, num_elem, r_V, d_V); + } else { + readDofs2d(elem, t_in_x, t_in_y, comp, num_elem, d_U, r_V); + ContractX2d(slice, t_in_x, t_in_y, t_in_z, r_V, c_B, r_t); + ContractY2d(slice, t_in_x, t_in_y, t_in_z, r_t, c_B, r_V); + writeQuads2d(elem, t_in_x, t_in_y, comp, 0, num_elem, r_V, d_V); + } + } +} + +//------------------------------------------------------------------------------ +// 2D derivatives at quadrature points +//------------------------------------------------------------------------------ +inline __device__ void grad2d(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *c_B, const CeedScalar *c_G, + const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V, CeedScalar *slice) { + CeedScalar r_U; + CeedScalar r_V; + CeedScalar r_t; + + const CeedInt t_in_x = threadIdx.x; + const CeedInt t_in_y = threadIdx.y; + const CeedInt t_in_z = threadIdx.z; + const CeedInt blockElem = t_in_z / BASIS_NUM_COMP; + const CeedInt elemsPerBlock = blockDim.z / BASIS_NUM_COMP; + const CeedInt comp = t_in_z % BASIS_NUM_COMP; + CeedInt dim; + + for (CeedInt elem = blockIdx.x * elemsPerBlock + blockElem; elem 
< num_elem; elem += gridDim.x * elemsPerBlock) { + if (transpose) { + dim = 0; + readQuads2d(elem, t_in_x, t_in_y, comp, dim, num_elem, d_U, r_U); + ContractTransposeY2d(slice, t_in_x, t_in_y, t_in_z, r_U, c_B, r_t); + ContractTransposeX2d(slice, t_in_x, t_in_y, t_in_z, r_t, c_G, r_V); + dim = 1; + readQuads2d(elem, t_in_x, t_in_y, comp, dim, num_elem, d_U, r_U); + ContractTransposeY2d(slice, t_in_x, t_in_y, t_in_z, r_U, c_G, r_t); + ContractTransposeX2d(slice, t_in_x, t_in_y, t_in_z, r_t, c_B, r_U); + r_V += r_U; + writeDofs2d(elem, t_in_x, t_in_y, comp, num_elem, r_V, d_V); + } else { + readDofs2d(elem, t_in_x, t_in_y, comp, num_elem, d_U, r_U); + ContractX2d(slice, t_in_x, t_in_y, t_in_z, r_U, c_G, r_t); + ContractY2d(slice, t_in_x, t_in_y, t_in_z, r_t, c_B, r_V); + dim = 0; + writeQuads2d(elem, t_in_x, t_in_y, comp, dim, num_elem, r_V, d_V); + ContractX2d(slice, t_in_x, t_in_y, t_in_z, r_U, c_B, r_t); + ContractY2d(slice, t_in_x, t_in_y, t_in_z, r_t, c_G, r_V); + dim = 1; + writeQuads2d(elem, t_in_x, t_in_y, comp, dim, num_elem, r_V, d_V); + } + } +} + +//------------------------------------------------------------------------------ +// 2D quadrature weights +//------------------------------------------------------------------------------ +__device__ void weight2d(const CeedInt num_elem, const CeedScalar *q_weight_1d, CeedScalar *w) { + const CeedInt i = threadIdx.x; + const CeedInt j = threadIdx.y; + const CeedScalar weight = q_weight_1d[i] * q_weight_1d[j]; + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + const CeedInt ind = elem * BASIS_Q_1D * BASIS_Q_1D + i + j * BASIS_Q_1D; + w[ind] = weight; + } +} + +//------------------------------------------------------------------------------ +// 3D +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// Read DoFs 
+//------------------------------------------------------------------------------ +inline __device__ void readDofs3d(const CeedInt elem, const CeedInt t_in_x, const CeedInt t_in_y, const CeedInt comp, const CeedInt num_elem, + const CeedScalar *d_U, CeedScalar *r_U) { + for (CeedInt i = 0; i < BASIS_P_1D; i++) + r_U[i] = (t_in_x < BASIS_P_1D && t_in_y < BASIS_P_1D) + ? d_U[t_in_x + t_in_y * BASIS_P_1D + i * BASIS_P_1D * BASIS_P_1D + elem * BASIS_P_1D * BASIS_P_1D * BASIS_P_1D + + comp * BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem] + : 0.0; + for (CeedInt i = BASIS_P_1D; i < BASIS_Q_1D; i++) r_U[i] = 0.0; +} + +//------------------------------------------------------------------------------ +// Write DoFs +//------------------------------------------------------------------------------ +inline __device__ void writeDofs3d(const CeedInt elem, const CeedInt t_in_x, const CeedInt t_in_y, const CeedInt comp, const CeedInt num_elem, + const CeedScalar *r_V, CeedScalar *d_V) { + if (t_in_x < BASIS_P_1D && t_in_y < BASIS_P_1D) { + for (CeedInt i = 0; i < BASIS_P_1D; i++) + d_V[t_in_x + t_in_y * BASIS_P_1D + i * BASIS_P_1D * BASIS_P_1D + elem * BASIS_P_1D * BASIS_P_1D * BASIS_P_1D + + comp * BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem] = r_V[i]; + } +} + +//------------------------------------------------------------------------------ +// Read quadrature point data +//------------------------------------------------------------------------------ +inline __device__ void readQuads3d(const CeedInt elem, const CeedInt t_in_x, const CeedInt t_in_y, const CeedInt comp, const CeedInt dim, + const CeedInt num_elem, const CeedScalar *d_U, CeedScalar *r_U) { + for (CeedInt i = 0; i < BASIS_Q_1D; i++) + r_U[i] = + (t_in_x < BASIS_Q_1D && t_in_y < BASIS_Q_1D) + ? 
d_U[t_in_x + t_in_y * BASIS_Q_1D + i * BASIS_Q_1D * BASIS_Q_1D + elem * BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D + + comp * BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem + dim * BASIS_NUM_COMP * num_elem * BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D] + : 0.0; + for (CeedInt i = BASIS_Q_1D; i < BASIS_P_1D; i++) r_U[i] = 0.0; +} + +//------------------------------------------------------------------------------ +// Write quadrature point data +//------------------------------------------------------------------------------ +inline __device__ void writeQuads3d(const CeedInt elem, const CeedInt t_in_x, const CeedInt t_in_y, const CeedInt comp, const CeedInt dim, + const CeedInt num_elem, const CeedScalar *r_V, CeedScalar *d_V) { + if (t_in_x < BASIS_Q_1D && t_in_y < BASIS_Q_1D) { + for (CeedInt i = 0; i < BASIS_Q_1D; i++) + d_V[t_in_x + t_in_y * BASIS_Q_1D + i * BASIS_Q_1D * BASIS_Q_1D + elem * BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D + + comp * BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem + dim * BASIS_NUM_COMP * num_elem * BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D] = r_V[i]; + } +} + +//------------------------------------------------------------------------------ +// 3D tensor contract x +//------------------------------------------------------------------------------ +inline __device__ void ContractX3d(CeedScalar *slice, const CeedInt t_in_x, const CeedInt t_in_y, const CeedInt t_in_z, const CeedScalar *U, + const CeedScalar *B, CeedScalar *V) { + for (CeedInt k = 0; k < BASIS_P_1D; k++) { + slice[t_in_x + t_in_y * BASIS_T_1D + t_in_z * BASIS_T_1D * BASIS_T_1D] = U[k]; + __syncthreads(); + V[k] = 0.0; + if (t_in_x < BASIS_Q_1D && t_in_y < BASIS_P_1D) + for (CeedInt i = 0; i < BASIS_P_1D; i++) + V[k] += B[i + t_in_x * BASIS_P_1D] * slice[i + t_in_y * BASIS_T_1D + t_in_z * BASIS_T_1D * BASIS_T_1D]; // Contract x direction + __syncthreads(); + } +} + +//------------------------------------------------------------------------------ +// 3D tensor contract y 
+//------------------------------------------------------------------------------ +inline __device__ void ContractY3d(CeedScalar *slice, const CeedInt t_in_x, const CeedInt t_in_y, const CeedInt t_in_z, const CeedScalar *U, + const CeedScalar *B, CeedScalar *V) { + for (CeedInt k = 0; k < BASIS_P_1D; k++) { + slice[t_in_x + t_in_y * BASIS_T_1D + t_in_z * BASIS_T_1D * BASIS_T_1D] = U[k]; + __syncthreads(); + V[k] = 0.0; + if (t_in_x < BASIS_Q_1D && t_in_y < BASIS_Q_1D) + for (CeedInt i = 0; i < BASIS_P_1D; i++) + V[k] += B[i + t_in_y * BASIS_P_1D] * slice[t_in_x + i * BASIS_T_1D + t_in_z * BASIS_T_1D * BASIS_T_1D]; // Contract y direction + __syncthreads(); + } +} + +//------------------------------------------------------------------------------ +// 3D tensor contract z +//------------------------------------------------------------------------------ +inline __device__ void ContractZ3d(CeedScalar *slice, const CeedInt t_in_x, const CeedInt t_in_y, const CeedInt t_in_z, const CeedScalar *U, + const CeedScalar *B, CeedScalar *V) { + for (CeedInt k = 0; k < BASIS_Q_1D; k++) { + V[k] = 0.0; + if (t_in_x < BASIS_Q_1D && t_in_y < BASIS_Q_1D) + for (CeedInt i = 0; i < BASIS_P_1D; i++) V[k] += B[i + k * BASIS_P_1D] * U[i]; // Contract z direction + } + for (CeedInt k = BASIS_Q_1D; k < BASIS_P_1D; k++) V[k] = 0.0; +} + +//------------------------------------------------------------------------------ +// 3D transpose tensor contract z +//------------------------------------------------------------------------------ +inline __device__ void ContractTransposeZ3d(CeedScalar *slice, const CeedInt t_in_x, const CeedInt t_in_y, const CeedInt t_in_z, const CeedScalar *U, + const CeedScalar *B, CeedScalar *V) { + for (CeedInt k = 0; k < BASIS_P_1D; k++) { + V[k] = 0.0; + if (t_in_x < BASIS_Q_1D && t_in_y < BASIS_Q_1D) + for (CeedInt i = 0; i < BASIS_Q_1D; i++) V[k] += B[k + i * BASIS_P_1D] * U[i]; // Contract z direction + } + for (CeedInt k = BASIS_P_1D; k < BASIS_Q_1D; k++) V[k] = 
0.0; +} + +//------------------------------------------------------------------------------ +// 3D transpose tensor contract y +//------------------------------------------------------------------------------ +inline __device__ void ContractTransposeY3d(CeedScalar *slice, const CeedInt t_in_x, const CeedInt t_in_y, const CeedInt t_in_z, const CeedScalar *U, + const CeedScalar *B, CeedScalar *V) { + for (CeedInt k = 0; k < BASIS_P_1D; k++) { + slice[t_in_x + t_in_y * BASIS_T_1D + t_in_z * BASIS_T_1D * BASIS_T_1D] = U[k]; + __syncthreads(); + V[k] = 0.0; + if (t_in_x < BASIS_Q_1D && t_in_y < BASIS_P_1D) + for (CeedInt i = 0; i < BASIS_Q_1D; i++) + V[k] += B[t_in_y + i * BASIS_P_1D] * slice[t_in_x + i * BASIS_T_1D + t_in_z * BASIS_T_1D * BASIS_T_1D]; // Contract y direction + __syncthreads(); + } +} + +//------------------------------------------------------------------------------ +// 3D transpose tensor contract x +//------------------------------------------------------------------------------ +inline __device__ void ContractTransposeX3d(CeedScalar *slice, const CeedInt t_in_x, const CeedInt t_in_y, const CeedInt t_in_z, const CeedScalar *U, + const CeedScalar *B, CeedScalar *V) { + for (CeedInt k = 0; k < BASIS_P_1D; k++) { + slice[t_in_x + t_in_y * BASIS_T_1D + t_in_z * BASIS_T_1D * BASIS_T_1D] = U[k]; + __syncthreads(); + V[k] = 0.0; + if (t_in_x < BASIS_P_1D && t_in_y < BASIS_P_1D) + for (CeedInt i = 0; i < BASIS_Q_1D; i++) + V[k] += B[t_in_x + i * BASIS_P_1D] * slice[i + t_in_y * BASIS_T_1D + t_in_z * BASIS_T_1D * BASIS_T_1D]; // Contract x direction + __syncthreads(); + } +} + +//------------------------------------------------------------------------------ +// 3D interpolate to quadrature points +//------------------------------------------------------------------------------ +inline __device__ void interp3d(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, + CeedScalar *__restrict__ d_V, CeedScalar 
*slice) { + CeedScalar r_V[BASIS_T_1D]; + CeedScalar r_t[BASIS_T_1D]; + + const CeedInt t_in_x = threadIdx.x; + const CeedInt t_in_y = threadIdx.y; + const CeedInt t_in_z = threadIdx.z; + const CeedInt blockElem = t_in_z / BASIS_NUM_COMP; + const CeedInt elemsPerBlock = blockDim.z / BASIS_NUM_COMP; + const CeedInt comp = t_in_z % BASIS_NUM_COMP; + + for (CeedInt elem = blockIdx.x * elemsPerBlock + blockElem; elem < num_elem; elem += gridDim.x * elemsPerBlock) { + for (CeedInt i = 0; i < BASIS_T_1D; i++) { + r_V[i] = 0.0; + r_t[i] = 0.0; + } + if (transpose) { + readQuads3d(elem, t_in_x, t_in_y, comp, 0, num_elem, d_U, r_V); + ContractTransposeZ3d(slice, t_in_x, t_in_y, t_in_z, r_V, c_B, r_t); + ContractTransposeY3d(slice, t_in_x, t_in_y, t_in_z, r_t, c_B, r_V); + ContractTransposeX3d(slice, t_in_x, t_in_y, t_in_z, r_V, c_B, r_t); + writeDofs3d(elem, t_in_x, t_in_y, comp, num_elem, r_t, d_V); + } else { + readDofs3d(elem, t_in_x, t_in_y, comp, num_elem, d_U, r_V); + ContractX3d(slice, t_in_x, t_in_y, t_in_z, r_V, c_B, r_t); + ContractY3d(slice, t_in_x, t_in_y, t_in_z, r_t, c_B, r_V); + ContractZ3d(slice, t_in_x, t_in_y, t_in_z, r_V, c_B, r_t); + writeQuads3d(elem, t_in_x, t_in_y, comp, 0, num_elem, r_t, d_V); + } + } +} + +//------------------------------------------------------------------------------ +// 3D derivatives at quadrature points +//------------------------------------------------------------------------------ +inline __device__ void grad3d(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *c_B, const CeedScalar *c_G, + const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V, CeedScalar *slice) { + // Use BASIS_P_1D for one of these + CeedScalar r_U[BASIS_T_1D]; + CeedScalar r_V[BASIS_T_1D]; + CeedScalar r_t[BASIS_T_1D]; + + const CeedInt t_in_x = threadIdx.x; + const CeedInt t_in_y = threadIdx.y; + const CeedInt t_in_z = threadIdx.z; + const CeedInt blockElem = t_in_z / BASIS_NUM_COMP; + const CeedInt elemsPerBlock = blockDim.z 
/ BASIS_NUM_COMP; + const CeedInt comp = t_in_z % BASIS_NUM_COMP; + CeedInt dim; + + for (CeedInt elem = blockIdx.x * elemsPerBlock + blockElem; elem < num_elem; elem += gridDim.x * elemsPerBlock) { + for (CeedInt i = 0; i < BASIS_T_1D; i++) { + r_U[i] = 0.0; + r_V[i] = 0.0; + r_t[i] = 0.0; + } + if (transpose) { + dim = 0; + readQuads3d(elem, t_in_x, t_in_y, comp, dim, num_elem, d_U, r_U); + ContractTransposeZ3d(slice, t_in_x, t_in_y, t_in_z, r_U, c_B, r_t); + ContractTransposeY3d(slice, t_in_x, t_in_y, t_in_z, r_t, c_B, r_U); + ContractTransposeX3d(slice, t_in_x, t_in_y, t_in_z, r_U, c_G, r_V); + dim = 1; + readQuads3d(elem, t_in_x, t_in_y, comp, dim, num_elem, d_U, r_U); + ContractTransposeZ3d(slice, t_in_x, t_in_y, t_in_z, r_U, c_B, r_t); + ContractTransposeY3d(slice, t_in_x, t_in_y, t_in_z, r_t, c_G, r_U); + ContractTransposeX3d(slice, t_in_x, t_in_y, t_in_z, r_U, c_B, r_t); + add(r_V, r_t); + dim = 2; + readQuads3d(elem, t_in_x, t_in_y, comp, dim, num_elem, d_U, r_U); + ContractTransposeZ3d(slice, t_in_x, t_in_y, t_in_z, r_U, c_G, r_t); + ContractTransposeY3d(slice, t_in_x, t_in_y, t_in_z, r_t, c_B, r_U); + ContractTransposeX3d(slice, t_in_x, t_in_y, t_in_z, r_U, c_B, r_t); + add(r_V, r_t); + writeDofs3d(elem, t_in_x, t_in_y, comp, num_elem, r_V, d_V); + } else { + readDofs3d(elem, t_in_x, t_in_y, comp, num_elem, d_U, r_U); + ContractX3d(slice, t_in_x, t_in_y, t_in_z, r_U, c_G, r_V); + ContractY3d(slice, t_in_x, t_in_y, t_in_z, r_V, c_B, r_t); + ContractZ3d(slice, t_in_x, t_in_y, t_in_z, r_t, c_B, r_V); + dim = 0; + writeQuads3d(elem, t_in_x, t_in_y, comp, dim, num_elem, r_V, d_V); + ContractX3d(slice, t_in_x, t_in_y, t_in_z, r_U, c_B, r_V); + ContractY3d(slice, t_in_x, t_in_y, t_in_z, r_V, c_G, r_t); + ContractZ3d(slice, t_in_x, t_in_y, t_in_z, r_t, c_B, r_V); + dim = 1; + writeQuads3d(elem, t_in_x, t_in_y, comp, dim, num_elem, r_V, d_V); + ContractX3d(slice, t_in_x, t_in_y, t_in_z, r_U, c_B, r_V); + ContractY3d(slice, t_in_x, t_in_y, t_in_z, r_V, c_B, r_t); 
+ ContractZ3d(slice, t_in_x, t_in_y, t_in_z, r_t, c_G, r_V); + dim = 2; + writeQuads3d(elem, t_in_x, t_in_y, comp, dim, num_elem, r_V, d_V); + } + } +} + +//------------------------------------------------------------------------------ +// 3D quadrature weights +//------------------------------------------------------------------------------ +__device__ void weight3d(const CeedInt num_elem, const CeedScalar *q_weight_1d, CeedScalar *w) { + const CeedInt i = threadIdx.x; + const CeedInt j = threadIdx.y; + const CeedInt k = threadIdx.z; + const CeedScalar weight = q_weight_1d[i] * q_weight_1d[j] * q_weight_1d[k]; + for (CeedInt e = blockIdx.x; e < num_elem; e += gridDim.x) { + const CeedInt ind = e * BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D + i + j * BASIS_Q_1D + k * BASIS_Q_1D * BASIS_Q_1D; + w[ind] = weight; + } +} + +//------------------------------------------------------------------------------ +// Basis kernels +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// Interp kernel by dim +//------------------------------------------------------------------------------ +extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, + CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + if (BASIS_DIM == 1) { + interp1d(num_elem, transpose, c_B, d_U, d_V, slice); + } else if (BASIS_DIM == 2) { + interp2d(num_elem, transpose, c_B, d_U, d_V, slice); + } else if (BASIS_DIM == 3) { + interp3d(num_elem, transpose, c_B, d_U, d_V, slice); + } +} + +//------------------------------------------------------------------------------ +// Grad kernel by dim +//------------------------------------------------------------------------------ +extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *c_B, const CeedScalar *c_G, + const 
CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + if (BASIS_DIM == 1) { + grad1d(num_elem, transpose, c_B, c_G, d_U, d_V, slice); + } else if (BASIS_DIM == 2) { + grad2d(num_elem, transpose, c_B, c_G, d_U, d_V, slice); + } else if (BASIS_DIM == 3) { + grad3d(num_elem, transpose, c_B, c_G, d_U, d_V, slice); + } +} + +//------------------------------------------------------------------------------ +// Weight kernels by dim +//------------------------------------------------------------------------------ +extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *__restrict__ v) { + if (BASIS_DIM == 1) { + weight1d(num_elem, q_weight_1d, v); + } else if (BASIS_DIM == 2) { + weight2d(num_elem, q_weight_1d, v); + } else if (BASIS_DIM == 3) { + weight3d(num_elem, q_weight_1d, v); + } +} + +//------------------------------------------------------------------------------ diff --git a/include/ceed/jit-source/cuda/cuda-types.h b/include/ceed/jit-source/cuda/cuda-types.h index 85c06a779d..44537fc458 100644 --- a/include/ceed/jit-source/cuda/cuda-types.h +++ b/include/ceed/jit-source/cuda/cuda-types.h @@ -16,7 +16,7 @@ typedef struct { const CeedScalar* inputs[CEED_CUDA_NUMBER_FIELDS]; - CeedScalar* outputs[CEED_CUDA_NUMBER_FIELDS]; + CeedScalar* outputs[CEED_CUDA_NUMBER_FIELDS]; } Fields_Cuda; typedef struct { @@ -25,10 +25,10 @@ typedef struct { } FieldsInt_Cuda; typedef struct { - CeedInt t_id_x; - CeedInt t_id_y; - CeedInt t_id_z; - CeedInt t_id; + CeedInt t_id_x; + CeedInt t_id_y; + CeedInt t_id_z; + CeedInt t_id; CeedScalar* slice; } SharedData_Cuda; diff --git a/include/ceed/jit-source/gallery/ceed-identity.h b/include/ceed/jit-source/gallery/ceed-identity.h index e7dbc2ddca..1996bffded 100644 --- a/include/ceed/jit-source/gallery/ceed-identity.h +++ b/include/ceed/jit-source/gallery/ceed-identity.h @@ -18,12 +18,10 @@ typedef struct { CeedInt size; } 
IdentityCtx; -CEED_QFUNCTION(Identity)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) { +CEED_QFUNCTION(Identity)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // Ctx holds field size - IdentityCtx identity_ctx = *(IdentityCtx *)ctx; - const CeedInt size = identity_ctx.size; + IdentityCtx identity_ctx = *(IdentityCtx *)ctx; + const CeedInt size = identity_ctx.size; // in[0] is input, size (Q*size) const CeedScalar *input = in[0]; @@ -31,12 +29,9 @@ CEED_QFUNCTION(Identity)(void *ctx, const CeedInt Q, CeedScalar *output = out[0]; // Quadrature point loop - CeedPragmaSIMD - for (CeedInt i=0; i -CEED_QFUNCTION(Mass1DBuild)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(Mass1DBuild)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // in[0] is Jacobians, size (Q) // in[1] is quadrature weights, size (Q) const CeedScalar *J = in[0], *w = in[1]; @@ -23,12 +22,9 @@ CEED_QFUNCTION(Mass1DBuild)(void *ctx, const CeedInt Q, CeedScalar *q_data = out[0]; // Quadrature point loop - CeedPragmaSIMD - for (CeedInt i=0; i -CEED_QFUNCTION(Mass2DBuild)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(Mass2DBuild)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // *INDENT-OFF* // in[0] is Jacobians with shape [2, nc=2, Q] // in[1] is quadrature weights, size (Q) - const CeedScalar (*J)[2][CEED_Q_VLA] = (const CeedScalar(*)[2][CEED_Q_VLA])in[0], - *w = in[1]; + const CeedScalar(*J)[2][CEED_Q_VLA] = (const CeedScalar(*)[2][CEED_Q_VLA])in[0], *w = in[1]; // out[0] is quadrature data, size (Q) - CeedScalar *q_data = out[0]; + CeedScalar *q_data = out[0]; // *INDENT-ON* // Quadrature point loop - CeedPragmaSIMD - for (CeedInt i=0; i -CEED_QFUNCTION(Mass3DBuild)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, CeedScalar *const *out) { 
+CEED_QFUNCTION(Mass3DBuild)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // *INDENT-OFF* // in[0] is Jacobians with shape [2, nc=3, Q] // in[1] is quadrature weights, size (Q) - const CeedScalar (*J)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], - *w = in[1]; + const CeedScalar(*J)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], *w = in[1]; // out[0] is quadrature data, size (Q) - CeedScalar *q_data = out[0]; + CeedScalar *q_data = out[0]; // *INDENT-ON* // Quadrature point loop - CeedPragmaSIMD - for (CeedInt i=0; i -CEED_QFUNCTION(MassApply)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(MassApply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // in[0] is u, size (Q) // in[1] is quadrature data, size (Q) const CeedScalar *u = in[0], *q_data = in[1]; @@ -23,12 +22,9 @@ CEED_QFUNCTION(MassApply)(void *ctx, const CeedInt Q, CeedScalar *v = out[0]; // Quadrature point loop - CeedPragmaSIMD - for (CeedInt i=0; i -CEED_QFUNCTION(Poisson1DApply)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) { +CEED_QFUNCTION(Poisson1DApply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // in[0] is gradient u, size (Q) // in[1] is quadrature data, size (Q) const CeedScalar *ug = in[0], *q_data = in[1]; @@ -25,12 +23,9 @@ CEED_QFUNCTION(Poisson1DApply)(void *ctx, const CeedInt Q, CeedScalar *vg = out[0]; // Quadrature point loop - CeedPragmaSIMD - for (CeedInt i=0; i -CEED_QFUNCTION(Poisson1DBuild)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) { +CEED_QFUNCTION(Poisson1DBuild)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // At every quadrature point, compute w/det(J).adj(J).adj(J)^T and store // the symmetric part of the result. 
@@ -28,12 +26,9 @@ CEED_QFUNCTION(Poisson1DBuild)(void *ctx, const CeedInt Q, CeedScalar *q_data = out[0]; // Quadrature point loop - CeedPragmaSIMD - for (CeedInt i=0; i -CEED_QFUNCTION(Poisson2DApply)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) { +CEED_QFUNCTION(Poisson2DApply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // *INDENT-OFF* // in[0] is gradient u, shape [2, nc=1, Q] // in[1] is quadrature data, size (3*Q) - const CeedScalar (*ug)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], - (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1]; + const CeedScalar(*ug)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1]; // out[0] is output to multiply against gradient v, shape [2, nc=1, Q] - CeedScalar (*vg)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; + CeedScalar(*vg)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; // *INDENT-ON* const CeedInt dim = 2; // Quadrature point loop - CeedPragmaSIMD - for (CeedInt i=0; i -CEED_QFUNCTION(Poisson2DBuild)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) { +CEED_QFUNCTION(Poisson2DBuild)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // At every quadrature point, compute w/det(J).adj(J).adj(J)^T and store // the symmetric part of the result. 
// *INDENT-OFF* // in[0] is Jacobians with shape [2, nc=2, Q] // in[1] is quadrature weights, size (Q) - const CeedScalar (*J)[2][CEED_Q_VLA] = (const CeedScalar(*)[2][CEED_Q_VLA])in[0], - *w = in[1]; + const CeedScalar(*J)[2][CEED_Q_VLA] = (const CeedScalar(*)[2][CEED_Q_VLA])in[0], *w = in[1]; // out[0] is qdata, size (3*Q) - CeedScalar (*q_data)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; + CeedScalar(*q_data)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; // *INDENT-ON* // Quadrature point loop - CeedPragmaSIMD - for (CeedInt i=0; i -CEED_QFUNCTION(Poisson3DApply)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) { +CEED_QFUNCTION(Poisson3DApply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // *INDENT-OFF* // in[0] is gradient u, shape [3, nc=1, Q] // in[1] is quadrature data, size (6*Q) - const CeedScalar (*ug)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], - (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1]; + const CeedScalar(*ug)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1]; // out[0] is output to multiply against gradient v, shape [3, nc=1, Q] - CeedScalar (*vg)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; + CeedScalar(*vg)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; // *INDENT-ON* const CeedInt dim = 3; // Quadrature point loop - CeedPragmaSIMD - for (CeedInt i=0; i -CEED_QFUNCTION(Poisson3DBuild)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) { +CEED_QFUNCTION(Poisson3DBuild)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // At every quadrature point, compute w/det(J).adj(J).adj(J)^T and store // the symmetric part of the result. 
// *INDENT-OFF* // in[0] is Jacobians with shape [3, nc=3, Q] // in[1] is quadrature weights, size (Q) - const CeedScalar (*J)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], - *w = in[1]; + const CeedScalar(*J)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], *w = in[1]; // out[0] is qdata, size (6*Q) - CeedScalar (*q_data)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; + CeedScalar(*q_data)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; // *INDENT-ON* const CeedInt dim = 3; // Quadrature point loop - CeedPragmaSIMD - for (CeedInt i=0; i -CEED_QFUNCTION(Scale)(void *ctx, const CeedInt Q, const CeedScalar *const *in, - CeedScalar *const *out) { +CEED_QFUNCTION(Scale)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // Ctx holds field size const CeedInt size = *(CeedInt *)ctx; @@ -27,11 +26,8 @@ CEED_QFUNCTION(Scale)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *output = out[0]; // Quadrature point loop - CeedPragmaSIMD - for (CeedInt i=0; i -CEED_QFUNCTION(Vector3MassApply)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, CeedScalar *const *out) { +CEED_QFUNCTION(Vector3MassApply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // *INDENT-OFF* // in[0] is u, size (Q) // in[1] is quadrature data, size (Q) - const CeedScalar (*u)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], - *q_data = in[1]; + const CeedScalar(*u)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], *q_data = in[1]; // out[0] is v, size (Q) - CeedScalar (*v)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; + CeedScalar(*v)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; // *INDENT-ON* const CeedInt num_comp = 3; // Quadrature point loop - CeedPragmaSIMD - for (CeedInt i=0; i -CEED_QFUNCTION(Vector3Poisson1DApply)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) { +CEED_QFUNCTION(Vector3Poisson1DApply)(void *ctx, const CeedInt 
Q, const CeedScalar *const *in, CeedScalar *const *out) { // *INDENT-OFF* // in[0] is gradient u, shape [1, nc=3, Q] // in[1] is quadrature data, size (Q) - const CeedScalar (*ug)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], - (*q_data) = in[1]; + const CeedScalar(*ug)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[0], (*q_data) = in[1]; // out[0] is output to multiply against gradient v, shape [1, nc=3, Q] - CeedScalar (*vg)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; + CeedScalar(*vg)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; // *INDENT-ON* const CeedInt num_comp = 3; // Quadrature point loop - CeedPragmaSIMD - for (CeedInt i=0; i -CEED_QFUNCTION(Vector3Poisson2DApply)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) { +CEED_QFUNCTION(Vector3Poisson2DApply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // *INDENT-OFF* // in[0] is gradient u, shape [2, nc=3, Q] // in[1] is quadrature data, size (3*Q) - const CeedScalar (*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], - (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1]; + const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1]; // out[0] is output to multiply against gradient v, shape [2, nc=3, Q] - CeedScalar (*vg)[3][CEED_Q_VLA] = (CeedScalar(*)[3][CEED_Q_VLA])out[0]; + CeedScalar(*vg)[3][CEED_Q_VLA] = (CeedScalar(*)[3][CEED_Q_VLA])out[0]; // *INDENT-ON* const CeedInt dim = 2, num_comp = 3; // Quadrature point loop - CeedPragmaSIMD - for (CeedInt i=0; i -CEED_QFUNCTION(Vector3Poisson3DApply)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) { +CEED_QFUNCTION(Vector3Poisson3DApply)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // *INDENT-OFF* // in[0] is gradient u, shape [3, nc=3, Q] // in[1] is quadrature data, size (6*Q) - const 
CeedScalar (*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], - (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1]; + const CeedScalar(*ug)[3][CEED_Q_VLA] = (const CeedScalar(*)[3][CEED_Q_VLA])in[0], (*q_data)[CEED_Q_VLA] = (const CeedScalar(*)[CEED_Q_VLA])in[1]; // out[0] is output to multiply against gradient v, shape [3, nc=3, Q] - CeedScalar (*vg)[3][CEED_Q_VLA] = (CeedScalar(*)[3][CEED_Q_VLA])out[0]; + CeedScalar(*vg)[3][CEED_Q_VLA] = (CeedScalar(*)[3][CEED_Q_VLA])out[0]; // *INDENT-ON* const CeedInt dim = 3, num_comp = 3; // Quadrature point loop - CeedPragmaSIMD - for (CeedInt i=0; i - //------------------------------------------------------------------------------ // Load matrices for basis actions //------------------------------------------------------------------------------ template inline __device__ void loadMatrix(SharedData_Hip& data, const CeedScalar* d_B, CeedScalar* B) { - for (CeedInt i = data.t_id; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z) - B[i] = d_B[i]; + for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i]; } //------------------------------------------------------------------------------ @@ -30,12 +28,12 @@ inline __device__ void loadMatrix(SharedData_Hip& data, const CeedScalar* d_B, C // L-vector -> E-vector, offsets provided //------------------------------------------------------------------------------ template -inline __device__ void readDofsOffset1d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { +inline __device__ void readDofsOffset1d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, + CeedScalar* r_u) { if (data.t_id_x < P1d) { const CeedInt node = data.t_id_x; - const CeedInt ind = indices[node + elem * P1d]; - for (CeedInt comp = 0; comp < NCOMP; ++comp) - r_u[comp] = d_u[ind + COMPSTRIDE * comp]; + const CeedInt 
ind = indices[node + elem * P1d]; + for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[comp] = d_u[ind + COMPSTRIDE * comp]; } } @@ -46,9 +44,8 @@ template L-vector, offsets provided //------------------------------------------------------------------------------ template -inline __device__ void writeDofsOffset1d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { +inline __device__ void writeDofsOffset1d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, + const CeedScalar* r_v, CeedScalar* d_v) { if (data.t_id_x < P1d) { const CeedInt node = data.t_id_x; - const CeedInt ind = indices[node + elem * P1d]; - for (CeedInt comp = 0; comp < NCOMP; ++comp) - atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); + const CeedInt ind = indices[node + elem * P1d]; + for (CeedInt comp = 0; comp < NCOMP; ++comp) atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); } } @@ -72,9 +69,8 @@ template E-vector, offsets provided //------------------------------------------------------------------------------ template -inline __device__ void readDofsOffset2d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { +inline __device__ void readDofsOffset2d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, + CeedScalar* r_u) { if (data.t_id_x < P1d && data.t_id_y < P1d) { - const CeedInt node = data.t_id_x + data.t_id_y*P1d; - const CeedInt ind = indices[node + elem * P1d*P1d]; - for (CeedInt comp = 0; comp < NCOMP; ++comp) - r_u[comp] = d_u[ind + COMPSTRIDE * comp]; + const CeedInt node = data.t_id_x + data.t_id_y * P1d; + const CeedInt ind = indices[node + elem * P1d * P1d]; + for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[comp] = d_u[ind + COMPSTRIDE * comp]; } } @@ -101,10 +97,9 @@ inline __device__ void readDofsOffset2d(SharedData_Hip& 
data, const CeedInt nnod template inline __device__ void readDofsStrided2d(SharedData_Hip& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { if (data.t_id_x < P1d && data.t_id_y < P1d) { - const CeedInt node = data.t_id_x + data.t_id_y*P1d; - const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; - for (CeedInt comp = 0; comp < NCOMP; ++comp) - r_u[comp] = d_u[ind + comp * STRIDES_COMP]; + const CeedInt node = data.t_id_x + data.t_id_y * P1d; + const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; + for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[comp] = d_u[ind + comp * STRIDES_COMP]; } } @@ -112,12 +107,12 @@ inline __device__ void readDofsStrided2d(SharedData_Hip& data, const CeedInt ele // E-vector -> L-vector, offsets provided //------------------------------------------------------------------------------ template -inline __device__ void writeDofsOffset2d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { +inline __device__ void writeDofsOffset2d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, + const CeedScalar* r_v, CeedScalar* d_v) { if (data.t_id_x < P1d && data.t_id_y < P1d) { - const CeedInt node = data.t_id_x + data.t_id_y*P1d; - const CeedInt ind = indices[node + elem * P1d*P1d]; - for (CeedInt comp = 0; comp < NCOMP; ++comp) - atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); + const CeedInt node = data.t_id_x + data.t_id_y * P1d; + const CeedInt ind = indices[node + elem * P1d * P1d]; + for (CeedInt comp = 0; comp < NCOMP; ++comp) atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); } } @@ -127,10 +122,9 @@ inline __device__ void writeDofsOffset2d(SharedData_Hip& data, const CeedInt nno template inline __device__ void writeDofsStrided2d(SharedData_Hip& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { if (data.t_id_x < P1d && data.t_id_y < P1d) { - const CeedInt node = 
data.t_id_x + data.t_id_y*P1d; - const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; - for (CeedInt comp = 0; comp < NCOMP; ++comp) - d_v[ind + comp * STRIDES_COMP] += r_v[comp]; + const CeedInt node = data.t_id_x + data.t_id_y * P1d; + const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; + for (CeedInt comp = 0; comp < NCOMP; ++comp) d_v[ind + comp * STRIDES_COMP] += r_v[comp]; } } @@ -142,13 +136,13 @@ inline __device__ void writeDofsStrided2d(SharedData_Hip& data, const CeedInt el // L-vector -> E-vector, offsets provided //------------------------------------------------------------------------------ template -inline __device__ void readDofsOffset3d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { +inline __device__ void readDofsOffset3d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, + CeedScalar* r_u) { if (data.t_id_x < P1d && data.t_id_y < P1d) for (CeedInt z = 0; z < P1d; ++z) { - const CeedInt node = data.t_id_x + data.t_id_y*P1d + z*P1d*P1d; - const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; - for (CeedInt comp = 0; comp < NCOMP; ++comp) - r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp]; + const CeedInt node = data.t_id_x + data.t_id_y * P1d + z * P1d * P1d; + const CeedInt ind = indices[node + elem * P1d * P1d * P1d]; + for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[z + comp * P1d] = d_u[ind + COMPSTRIDE * comp]; } } @@ -159,10 +153,9 @@ template Q-vector, offests provided //------------------------------------------------------------------------------ template -inline __device__ void readSliceQuadsOffset3d(SharedData_Hip& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { +inline __device__ void readSliceQuadsOffset3d(SharedData_Hip& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const 
CeedInt* indices, + const CeedScalar* d_u, CeedScalar* r_u) { if (data.t_id_x < Q1d && data.t_id_y < Q1d) { - const CeedInt node = data.t_id_x + data.t_id_y*Q1d + q*Q1d*Q1d; - const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];; - for (CeedInt comp = 0; comp < NCOMP; ++comp) - r_u[comp] = d_u[ind + COMPSTRIDE * comp]; + const CeedInt node = data.t_id_x + data.t_id_y * Q1d + q * Q1d * Q1d; + const CeedInt ind = indices[node + elem * Q1d * Q1d * Q1d]; + ; + for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[comp] = d_u[ind + COMPSTRIDE * comp]; } } @@ -185,10 +179,9 @@ inline __device__ void readSliceQuadsOffset3d(SharedData_Hip& data, const CeedIn template inline __device__ void readSliceQuadsStrided3d(SharedData_Hip& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) { if (data.t_id_x < Q1d && data.t_id_y < Q1d) { - const CeedInt node = data.t_id_x + data.t_id_y*Q1d + q*Q1d*Q1d; - const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; - for (CeedInt comp = 0; comp < NCOMP; ++comp) - r_u[comp] = d_u[ind + comp * STRIDES_COMP]; + const CeedInt node = data.t_id_x + data.t_id_y * Q1d + q * Q1d * Q1d; + const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; + for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[comp] = d_u[ind + comp * STRIDES_COMP]; } } @@ -196,13 +189,13 @@ inline __device__ void readSliceQuadsStrided3d(SharedData_Hip& data, const CeedI // E-vector -> L-vector, offsets provided //------------------------------------------------------------------------------ template -inline __device__ void writeDofsOffset3d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { +inline __device__ void writeDofsOffset3d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, + const CeedScalar* r_v, CeedScalar* d_v) { if (data.t_id_x < P1d && data.t_id_y < P1d) for (CeedInt z = 0; z < P1d; ++z) { - const CeedInt node = 
data.t_id_x + data.t_id_y*P1d + z*P1d*P1d; - const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; - for (CeedInt comp = 0; comp < NCOMP; ++comp) - atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]); + const CeedInt node = data.t_id_x + data.t_id_y * P1d + z * P1d * P1d; + const CeedInt ind = indices[node + elem * P1d * P1d * P1d]; + for (CeedInt comp = 0; comp < NCOMP; ++comp) atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z + comp * P1d]); } } @@ -213,10 +206,9 @@ template -inline __device__ void gradCollo3d(SharedData_Hip& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +inline __device__ void gradCollo3d(SharedData_Hip& data, const CeedInt q, const CeedScalar* __restrict__ r_U, const CeedScalar* c_G, + CeedScalar* __restrict__ r_V) { if (data.t_id_x < Q1d && data.t_id_y < Q1d) { for (CeedInt comp = 0; comp < NCOMP; ++comp) { - data.slice[data.t_id_x + data.t_id_y*T_1D] = r_U[q + comp*Q1d]; + data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q1d]; __syncthreads(); // X derivative - r_V[comp+0*NCOMP] = 0.0; + r_V[comp + 0 * NCOMP] = 0.0; for (CeedInt i = 0; i < Q1d; ++i) - r_V[comp+0*NCOMP] += c_G[i + data.t_id_x*Q1d] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction (X derivative) + r_V[comp + 0 * NCOMP] += c_G[i + data.t_id_x * Q1d] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction (X derivative) // Y derivative - r_V[comp+1*NCOMP] = 0.0; + r_V[comp + 1 * NCOMP] = 0.0; for (CeedInt i = 0; i < Q1d; ++i) - r_V[comp+1*NCOMP] += c_G[i + data.t_id_y*Q1d] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction (Y derivative) + r_V[comp + 1 * NCOMP] += c_G[i + data.t_id_y * Q1d] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction (Y derivative) // Z derivative - r_V[comp+2*NCOMP] = 0.0; - for (CeedInt i = 0; i < Q1d; ++i) - r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative) + r_V[comp + 2 * NCOMP] = 0.0; 
+ for (CeedInt i = 0; i < Q1d; ++i) r_V[comp + 2 * NCOMP] += c_G[i + q * Q1d] * r_U[i + comp * Q1d]; // Contract z direction (Z derivative) __syncthreads(); } } @@ -250,24 +242,25 @@ inline __device__ void gradCollo3d(SharedData_Hip& data, const CeedInt q, const // 3D collocated derivatives transpose //------------------------------------------------------------------------------ template -inline __device__ void gradColloTranspose3d(SharedData_Hip& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +inline __device__ void gradColloTranspose3d(SharedData_Hip& data, const CeedInt q, const CeedScalar* __restrict__ r_U, const CeedScalar* c_G, + CeedScalar* __restrict__ r_V) { if (data.t_id_x < Q1d && data.t_id_y < Q1d) { for (CeedInt comp = 0; comp < NCOMP; ++comp) { // X derivative - data.slice[data.t_id_x + data.t_id_y*T_1D] = r_U[comp + 0*NCOMP]; + data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NCOMP]; __syncthreads(); for (CeedInt i = 0; i < Q1d; ++i) - r_V[q+comp*Q1d] += c_G[data.t_id_x + i*Q1d] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction (X derivative) + r_V[q + comp * Q1d] += c_G[data.t_id_x + i * Q1d] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction (X derivative) __syncthreads(); // Y derivative - data.slice[data.t_id_x + data.t_id_y*T_1D] = r_U[comp + 1*NCOMP]; + data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NCOMP]; __syncthreads(); for (CeedInt i = 0; i < Q1d; ++i) - r_V[q+comp*Q1d] += c_G[data.t_id_y + i*Q1d] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction (Y derivative) + r_V[q + comp * Q1d] += c_G[data.t_id_y + i * Q1d] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction (Y derivative) __syncthreads(); // Z derivative for (CeedInt i = 0; i < Q1d; ++i) - r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative) + r_V[i + comp * Q1d] += c_G[i + q * Q1d] * r_U[comp + 2 * 
NCOMP]; // PARTIAL contract z direction (Z derivative) } } } diff --git a/include/ceed/jit-source/hip/hip-ref-basis-nontensor.h b/include/ceed/jit-source/hip/hip-ref-basis-nontensor.h index bc74ed38a9..1334c61bed 100644 --- a/include/ceed/jit-source/hip/hip-ref-basis-nontensor.h +++ b/include/ceed/jit-source/hip/hip-ref-basis-nontensor.h @@ -14,33 +14,28 @@ //------------------------------------------------------------------------------ // Interp //------------------------------------------------------------------------------ -extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpose, - const CeedScalar *d_B, - const CeedScalar *__restrict__ d_U, +extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *d_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { const CeedInt t_id = threadIdx.x; const CeedScalar *U; - CeedScalar V; - //TODO load B in shared memory if blockDim.z > 1? + CeedScalar V; + // TODO load B in shared memory if blockDim.z > 1? 
- for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; - elem += gridDim.x*blockDim.z) { + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { - if (transpose) { // run with P threads - U = d_U + elem*BASIS_Q + comp*num_elem*BASIS_Q; + if (transpose) { // run with P threads + U = d_U + elem * BASIS_Q + comp * num_elem * BASIS_Q; V = 0.0; - for (CeedInt i = 0; i < BASIS_Q; i++) - V += d_B[t_id + i*BASIS_P]*U[i]; + for (CeedInt i = 0; i < BASIS_Q; i++) V += d_B[t_id + i * BASIS_P] * U[i]; - d_V[elem*BASIS_P + comp*num_elem*BASIS_P + t_id] = V; - } else { // run with Q threads - U = d_U + elem*BASIS_P + comp*num_elem*BASIS_P; + d_V[elem * BASIS_P + comp * num_elem * BASIS_P + t_id] = V; + } else { // run with Q threads + U = d_U + elem * BASIS_P + comp * num_elem * BASIS_P; V = 0.0; - for (CeedInt i = 0; i < BASIS_P; i++) - V += d_B[i + t_id*BASIS_P]*U[i]; + for (CeedInt i = 0; i < BASIS_P; i++) V += d_B[i + t_id * BASIS_P] * U[i]; - d_V[elem*BASIS_Q + comp*num_elem*BASIS_Q + t_id] = V; + d_V[elem * BASIS_Q + comp * num_elem * BASIS_Q + t_id] = V; } } } @@ -49,41 +44,33 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpos //------------------------------------------------------------------------------ // Grad //------------------------------------------------------------------------------ -extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, - const CeedScalar *d_G, - const CeedScalar *__restrict__ d_U, +extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *d_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { const CeedInt t_id = threadIdx.x; const CeedScalar *U; - //TODO load G in shared memory if blockDim.z > 1? + // TODO load G in shared memory if blockDim.z > 1? 
- for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; - elem += gridDim.x*blockDim.z) { + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { - if (transpose) { // run with P threads + if (transpose) { // run with P threads CeedScalar V = 0.0; for (CeedInt dim = 0; dim < BASIS_DIM; dim++) { - U = d_U + elem*BASIS_Q + comp*num_elem*BASIS_Q + - dim*BASIS_NUM_COMP*num_elem*BASIS_Q; - for (CeedInt i = 0; i < BASIS_Q; i++) - V += d_G[t_id + i*BASIS_P + dim*BASIS_P*BASIS_Q]*U[i]; + U = d_U + elem * BASIS_Q + comp * num_elem * BASIS_Q + dim * BASIS_NUM_COMP * num_elem * BASIS_Q; + for (CeedInt i = 0; i < BASIS_Q; i++) V += d_G[t_id + i * BASIS_P + dim * BASIS_P * BASIS_Q] * U[i]; } - d_V[elem*BASIS_P + comp*num_elem*BASIS_P + t_id] = V; - } else { // run with Q threads + d_V[elem * BASIS_P + comp * num_elem * BASIS_P + t_id] = V; + } else { // run with Q threads CeedScalar V[BASIS_DIM]; - U = d_U + elem*BASIS_P + comp*num_elem*BASIS_P; - for (CeedInt dim = 0; dim < BASIS_DIM; dim++) - V[dim] = 0.0; + U = d_U + elem * BASIS_P + comp * num_elem * BASIS_P; + for (CeedInt dim = 0; dim < BASIS_DIM; dim++) V[dim] = 0.0; for (CeedInt i = 0; i < BASIS_P; i++) { const CeedScalar val = U[i]; - for(CeedInt dim = 0; dim < BASIS_DIM; dim++) - V[dim] += d_G[i + t_id*BASIS_P + dim*BASIS_P*BASIS_Q]*val; + for (CeedInt dim = 0; dim < BASIS_DIM; dim++) V[dim] += d_G[i + t_id * BASIS_P + dim * BASIS_P * BASIS_Q] * val; } for (CeedInt dim = 0; dim < BASIS_DIM; dim++) { - d_V[elem*BASIS_Q + comp*num_elem*BASIS_Q + - dim*BASIS_NUM_COMP*num_elem*BASIS_Q + t_id] = V[dim]; + d_V[elem * BASIS_Q + comp * num_elem * BASIS_Q + dim * BASIS_NUM_COMP * num_elem * BASIS_Q + t_id] = V[dim]; } } } @@ -93,14 +80,11 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, 
//------------------------------------------------------------------------------ // Weight //------------------------------------------------------------------------------ -extern "C" __global__ void Weight(const CeedInt num_elem, - const CeedScalar *__restrict__ qweight, - CeedScalar *__restrict__ d_V) { +extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ qweight, CeedScalar *__restrict__ d_V) { const CeedInt t_id = threadIdx.x; - //TODO load qweight in shared memory if blockDim.z > 1? - for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; - elem += gridDim.x*blockDim.z) { - d_V[elem*BASIS_Q + t_id] = qweight[t_id]; + // TODO load qweight in shared memory if blockDim.z > 1? + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + d_V[elem * BASIS_Q + t_id] = qweight[t_id]; } } diff --git a/include/ceed/jit-source/hip/hip-ref-basis-tensor.h b/include/ceed/jit-source/hip/hip-ref-basis-tensor.h index 2dc68743cc..cba57abe59 100644 --- a/include/ceed/jit-source/hip/hip-ref-basis-tensor.h +++ b/include/ceed/jit-source/hip/hip-ref-basis-tensor.h @@ -14,46 +14,44 @@ //------------------------------------------------------------------------------ // Interp //------------------------------------------------------------------------------ -extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpose, - const CeedScalar *__restrict__ interp_1d, - const CeedScalar *__restrict__ u, - CeedScalar *__restrict__ v) { +extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *__restrict__ interp_1d, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { const CeedInt i = threadIdx.x; __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN]; - CeedScalar *s_interp_1d = s_mem; - CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D; - CeedScalar *s_buffer_2 = s_buffer_1 + 
BASIS_BUF_LEN; + CeedScalar *s_interp_1d = s_mem; + CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D; + CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN; for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) { s_interp_1d[k] = interp_1d[k]; } - const CeedInt P = transpose ? BASIS_Q_1D : BASIS_P_1D; - const CeedInt Q = transpose ? BASIS_P_1D : BASIS_Q_1D; - const CeedInt stride0 = transpose ? 1 : BASIS_P_1D; - const CeedInt stride1 = transpose ? BASIS_P_1D : 1; - const CeedInt u_stride = transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES; - const CeedInt v_stride = transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS; + const CeedInt P = transpose ? BASIS_Q_1D : BASIS_P_1D; + const CeedInt Q = transpose ? BASIS_P_1D : BASIS_Q_1D; + const CeedInt stride0 = transpose ? 1 : BASIS_P_1D; + const CeedInt stride1 = transpose ? BASIS_P_1D : 1; + const CeedInt u_stride = transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES; + const CeedInt v_stride = transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS; const CeedInt u_comp_stride = num_elem * (transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES); const CeedInt v_comp_stride = num_elem * (transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS); - const CeedInt u_size = transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES; + const CeedInt u_size = transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES; // Apply basis element by element for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride; - CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride; + CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride; for (CeedInt k = i; k < u_size; k += blockDim.x) { s_buffer_1[k] = cur_u[k]; } - CeedInt pre = u_size; + CeedInt pre = u_size; CeedInt post = 1; for (CeedInt d = 0; d < BASIS_DIM; d++) { __syncthreads(); // Update buffers used pre /= P; - const CeedScalar *in = d % 2 ? 
s_buffer_2 : s_buffer_1; - CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2); + const CeedScalar *in = d % 2 ? s_buffer_2 : s_buffer_1; + CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2); // Contract along middle index const CeedInt writeLen = pre * post * Q; @@ -63,8 +61,7 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpos const CeedInt a = k / (post * Q); CeedScalar vk = 0; - for (CeedInt b = 0; b < P; b++) - vk += s_interp_1d[j*stride0 + b*stride1] * in[(a*P + b)*post + c]; + for (CeedInt b = 0; b < P; b++) vk += s_interp_1d[j * stride0 + b * stride1] * in[(a * P + b) * post + c]; out[k] = vk; } @@ -78,72 +75,59 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpos //------------------------------------------------------------------------------ // Grad //------------------------------------------------------------------------------ -extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, - const CeedScalar *__restrict__ interp_1d, - const CeedScalar *__restrict__ grad_1d, - const CeedScalar *__restrict__ u, - CeedScalar *__restrict__ v) { +extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *__restrict__ interp_1d, + const CeedScalar *__restrict__ grad_1d, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { const CeedInt i = threadIdx.x; __shared__ CeedScalar s_mem[2 * (BASIS_Q_1D * BASIS_P_1D + BASIS_BUF_LEN)]; - CeedScalar *s_interp_1d = s_mem; - CeedScalar *s_grad_1d = s_interp_1d + BASIS_Q_1D * BASIS_P_1D; - CeedScalar *s_buffer_1 = s_grad_1d + BASIS_Q_1D * BASIS_P_1D; - CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN; + CeedScalar *s_interp_1d = s_mem; + CeedScalar *s_grad_1d = s_interp_1d + BASIS_Q_1D * BASIS_P_1D; + CeedScalar *s_buffer_1 = s_grad_1d + BASIS_Q_1D * BASIS_P_1D; + CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN; for (CeedInt k 
= i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) { s_interp_1d[k] = interp_1d[k]; - s_grad_1d[k] = grad_1d[k]; + s_grad_1d[k] = grad_1d[k]; } - const CeedInt P = transpose ? BASIS_Q_1D : BASIS_P_1D; - const CeedInt Q = transpose ? BASIS_P_1D : BASIS_Q_1D; - const CeedInt stride0 = transpose ? 1 : BASIS_P_1D; - const CeedInt stride1 = transpose ? BASIS_P_1D : 1; - const CeedInt u_stride = transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES; - const CeedInt v_stride = transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS; + const CeedInt P = transpose ? BASIS_Q_1D : BASIS_P_1D; + const CeedInt Q = transpose ? BASIS_P_1D : BASIS_Q_1D; + const CeedInt stride0 = transpose ? 1 : BASIS_P_1D; + const CeedInt stride1 = transpose ? BASIS_P_1D : 1; + const CeedInt u_stride = transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES; + const CeedInt v_stride = transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS; const CeedInt u_comp_stride = num_elem * (transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES); const CeedInt v_comp_stride = num_elem * (transpose ? BASIS_NUM_NODES : BASIS_NUM_QPTS); - const CeedInt u_dim_stride = transpose ? num_elem * BASIS_NUM_QPTS * BASIS_NUM_COMP : 0; - const CeedInt v_dim_stride = transpose ? 0 : num_elem * BASIS_NUM_QPTS * BASIS_NUM_COMP; + const CeedInt u_dim_stride = transpose ? num_elem * BASIS_NUM_QPTS * BASIS_NUM_COMP : 0; + const CeedInt v_dim_stride = transpose ? 0 : num_elem * BASIS_NUM_QPTS * BASIS_NUM_COMP; // Apply basis element by element for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { - // dim*dim contractions for grad for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { - CeedInt pre = transpose ? BASIS_NUM_QPTS : BASIS_NUM_NODES; - CeedInt post = 1; - const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + - comp * u_comp_stride; - CeedScalar *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * - v_comp_stride; + CeedInt pre = transpose ? 
BASIS_NUM_QPTS : BASIS_NUM_NODES; + CeedInt post = 1; + const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride; + CeedScalar *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride; for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) { __syncthreads(); // Update buffers used pre /= P; - const CeedScalar *op = dim_1 == dim_2 ? s_grad_1d : s_interp_1d; - const CeedScalar *in = dim_2 == 0 - ? cur_u - : (dim_2 % 2 ? s_buffer_2 : s_buffer_1); - CeedScalar *out = dim_2 == BASIS_DIM - 1 - ? cur_v - : (dim_2 % 2 ? s_buffer_1 : s_buffer_2); + const CeedScalar *op = dim_1 == dim_2 ? s_grad_1d : s_interp_1d; + const CeedScalar *in = dim_2 == 0 ? cur_u : (dim_2 % 2 ? s_buffer_2 : s_buffer_1); + CeedScalar *out = dim_2 == BASIS_DIM - 1 ? cur_v : (dim_2 % 2 ? s_buffer_1 : s_buffer_2); // Contract along middle index const CeedInt writeLen = pre * post * Q; for (CeedInt k = i; k < writeLen; k += blockDim.x) { - const CeedInt c = k % post; - const CeedInt j = (k / post) % Q; - const CeedInt a = k / (post * Q); - CeedScalar v_k = 0; - for (CeedInt b = 0; b < P; b++) - v_k += op[j * stride0 + b * stride1] * in[(a * P + b) * post + c]; - - if (transpose && dim_2 == BASIS_DIM - 1) - out[k] += v_k; - else - out[k] = v_k; + const CeedInt c = k % post; + const CeedInt j = (k / post) % Q; + const CeedInt a = k / (post * Q); + CeedScalar v_k = 0; + for (CeedInt b = 0; b < P; b++) v_k += op[j * stride0 + b * stride1] * in[(a * P + b) * post + c]; + + if (transpose && dim_2 == BASIS_DIM - 1) out[k] += v_k; + else out[k] = v_k; } post *= Q; @@ -156,73 +140,55 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, //------------------------------------------------------------------------------ // 1D quadrature weights //------------------------------------------------------------------------------ -__device__ void Weight1d(const CeedInt num_elem, const CeedScalar *q_weight_1d, - CeedScalar *w) { +__device__ 
void Weight1d(const CeedInt num_elem, const CeedScalar *q_weight_1d, CeedScalar *w) { CeedScalar w1d[BASIS_Q_1D]; - for (CeedInt i = 0; i < BASIS_Q_1D; i++) - w1d[i] = q_weight_1d[i]; + for (CeedInt i = 0; i < BASIS_Q_1D; i++) w1d[i] = q_weight_1d[i]; - for (CeedInt e = blockIdx.x * blockDim.x + threadIdx.x; - e < num_elem; - e += blockDim.x * gridDim.x) + for (CeedInt e = blockIdx.x * blockDim.x + threadIdx.x; e < num_elem; e += blockDim.x * gridDim.x) for (CeedInt i = 0; i < BASIS_Q_1D; i++) { - const CeedInt ind = e*BASIS_Q_1D + i; // sequential - w[ind] = w1d[i]; + const CeedInt ind = e * BASIS_Q_1D + i; // sequential + w[ind] = w1d[i]; } } //------------------------------------------------------------------------------ // 2D quadrature weights //------------------------------------------------------------------------------ -__device__ void Weight2d(const CeedInt num_elem, const CeedScalar *q_weight_1d, - CeedScalar *w) { +__device__ void Weight2d(const CeedInt num_elem, const CeedScalar *q_weight_1d, CeedScalar *w) { CeedScalar w1d[BASIS_Q_1D]; - for (CeedInt i = 0; i < BASIS_Q_1D; i++) - w1d[i] = q_weight_1d[i]; + for (CeedInt i = 0; i < BASIS_Q_1D; i++) w1d[i] = q_weight_1d[i]; - for (CeedInt e = blockIdx.x * blockDim.x + threadIdx.x; - e < num_elem; - e += blockDim.x * gridDim.x) + for (CeedInt e = blockIdx.x * blockDim.x + threadIdx.x; e < num_elem; e += blockDim.x * gridDim.x) for (CeedInt i = 0; i < BASIS_Q_1D; i++) for (CeedInt j = 0; j < BASIS_Q_1D; j++) { - const CeedInt ind = e*BASIS_Q_1D*BASIS_Q_1D + i + j*BASIS_Q_1D; // sequential - w[ind] = w1d[i]*w1d[j]; + const CeedInt ind = e * BASIS_Q_1D * BASIS_Q_1D + i + j * BASIS_Q_1D; // sequential + w[ind] = w1d[i] * w1d[j]; } } //------------------------------------------------------------------------------ // 3D quadrature weights //------------------------------------------------------------------------------ -__device__ void Weight3d(const CeedInt num_elem, const CeedScalar *q_weight_1d, - CeedScalar 
*w) { +__device__ void Weight3d(const CeedInt num_elem, const CeedScalar *q_weight_1d, CeedScalar *w) { CeedScalar w1d[BASIS_Q_1D]; - for (CeedInt i = 0; i < BASIS_Q_1D; i++) - w1d[i] = q_weight_1d[i]; + for (CeedInt i = 0; i < BASIS_Q_1D; i++) w1d[i] = q_weight_1d[i]; - for (CeedInt e = blockIdx.x * blockDim.x + threadIdx.x; - e < num_elem; - e += blockDim.x * gridDim.x) + for (CeedInt e = blockIdx.x * blockDim.x + threadIdx.x; e < num_elem; e += blockDim.x * gridDim.x) for (CeedInt i = 0; i < BASIS_Q_1D; i++) for (CeedInt j = 0; j < BASIS_Q_1D; j++) for (CeedInt k = 0; k < BASIS_Q_1D; k++) { - const CeedInt ind = e*BASIS_Q_1D*BASIS_Q_1D*BASIS_Q_1D + i + - j*BASIS_Q_1D + k*BASIS_Q_1D*BASIS_Q_1D; // sequential - w[ind] = w1d[i]*w1d[j]*w1d[k]; + const CeedInt ind = e * BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D + i + j * BASIS_Q_1D + k * BASIS_Q_1D * BASIS_Q_1D; // sequential + w[ind] = w1d[i] * w1d[j] * w1d[k]; } } //------------------------------------------------------------------------------ // Quadrature weights //------------------------------------------------------------------------------ -extern "C" __global__ void Weight(const CeedInt num_elem, - const CeedScalar *__restrict__ q_weight_1d, - CeedScalar *__restrict__ v) { - if (BASIS_DIM == 1) - Weight1d(num_elem, q_weight_1d, v); - else if (BASIS_DIM == 2) - Weight2d(num_elem, q_weight_1d, v); - else if (BASIS_DIM == 3) - Weight3d(num_elem, q_weight_1d, v); +extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *__restrict__ v) { + if (BASIS_DIM == 1) Weight1d(num_elem, q_weight_1d, v); + else if (BASIS_DIM == 2) Weight2d(num_elem, q_weight_1d, v); + else if (BASIS_DIM == 3) Weight3d(num_elem, q_weight_1d, v); } //------------------------------------------------------------------------------ diff --git a/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h b/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h index 
e2ac083b52..406f6ce2d6 100644 --- a/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h +++ b/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h @@ -13,15 +13,15 @@ typedef enum { /// Perform no evaluation (either because there is no data or it is already at /// quadrature points) - CEED_EVAL_NONE = 0, + CEED_EVAL_NONE = 0, /// Interpolate from nodes to quadrature points CEED_EVAL_INTERP = 1, /// Evaluate gradients at quadrature points from input in a nodal basis - CEED_EVAL_GRAD = 2, + CEED_EVAL_GRAD = 2, /// Evaluate divergence at quadrature points from input in a nodal basis - CEED_EVAL_DIV = 4, + CEED_EVAL_DIV = 4, /// Evaluate curl at quadrature points from input in a nodal basis - CEED_EVAL_CURL = 8, + CEED_EVAL_CURL = 8, /// Using no input, evaluate quadrature weights on the reference element CEED_EVAL_WEIGHT = 16, } CeedEvalMode; @@ -29,58 +29,48 @@ typedef enum { //------------------------------------------------------------------------------ // Get Basis Emode Pointer //------------------------------------------------------------------------------ -extern "C" __device__ void CeedOperatorGetBasisPointer_Hip(const CeedScalar **basisptr, - CeedEvalMode emode, const CeedScalar *identity, const CeedScalar *interp, - const CeedScalar *grad) { +extern "C" __device__ void CeedOperatorGetBasisPointer_Hip(const CeedScalar **basisptr, CeedEvalMode emode, const CeedScalar *identity, + const CeedScalar *interp, const CeedScalar *grad) { switch (emode) { - case CEED_EVAL_NONE: - *basisptr = identity; - break; - case CEED_EVAL_INTERP: - *basisptr = interp; - break; - case CEED_EVAL_GRAD: - *basisptr = grad; - break; - case CEED_EVAL_WEIGHT: - case CEED_EVAL_DIV: - case CEED_EVAL_CURL: - break; // Caught by QF Assembly + case CEED_EVAL_NONE: + *basisptr = identity; + break; + case CEED_EVAL_INTERP: + *basisptr = interp; + break; + case CEED_EVAL_GRAD: + *basisptr = grad; + break; + case CEED_EVAL_WEIGHT: + case CEED_EVAL_DIV: + case 
CEED_EVAL_CURL: + break; // Caught by QF Assembly } } //------------------------------------------------------------------------------ // Core code for diagonal assembly //------------------------------------------------------------------------------ -__device__ void diagonalCore(const CeedInt nelem, - const bool pointBlock, const CeedScalar *identity, - const CeedScalar *interpin, const CeedScalar *gradin, - const CeedScalar *interpout, const CeedScalar *gradout, - const CeedEvalMode *emodein, const CeedEvalMode *emodeout, - const CeedScalar *__restrict__ assembledqfarray, - CeedScalar *__restrict__ elemdiagarray) { - const int tid = threadIdx.x; // running with P threads, tid is evec node +__device__ void diagonalCore(const CeedInt nelem, const bool pointBlock, const CeedScalar *identity, const CeedScalar *interpin, + const CeedScalar *gradin, const CeedScalar *interpout, const CeedScalar *gradout, const CeedEvalMode *emodein, + const CeedEvalMode *emodeout, const CeedScalar *__restrict__ assembledqfarray, CeedScalar *__restrict__ elemdiagarray) { + const int tid = threadIdx.x; // running with P threads, tid is evec node if (tid >= NNODES) return; // Compute the diagonal of B^T D B // Each element - for (CeedInt e = blockIdx.x*blockDim.z + threadIdx.z; e < nelem; - e += gridDim.x*blockDim.z) { + for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < nelem; e += gridDim.x * blockDim.z) { CeedInt dout = -1; // Each basis eval mode pair for (CeedInt eout = 0; eout < NUMEMODEOUT; eout++) { const CeedScalar *bt = NULL; - if (emodeout[eout] == CEED_EVAL_GRAD) - dout += 1; - CeedOperatorGetBasisPointer_Hip(&bt, emodeout[eout], identity, interpout, - &gradout[dout*NQPTS*NNODES]); + if (emodeout[eout] == CEED_EVAL_GRAD) dout += 1; + CeedOperatorGetBasisPointer_Hip(&bt, emodeout[eout], identity, interpout, &gradout[dout * NQPTS * NNODES]); CeedInt din = -1; for (CeedInt ein = 0; ein < NUMEMODEIN; ein++) { const CeedScalar *b = NULL; - if (emodein[ein] == 
CEED_EVAL_GRAD) - din += 1; - CeedOperatorGetBasisPointer_Hip(&b, emodein[ein], identity, interpin, - &gradin[din*NQPTS*NNODES]); + if (emodein[ein] == CEED_EVAL_GRAD) din += 1; + CeedOperatorGetBasisPointer_Hip(&b, emodein[ein], identity, interpin, &gradin[din * NQPTS * NNODES]); // Each component for (CeedInt compOut = 0; compOut < NCOMP; compOut++) { // Each qpoint/node pair @@ -90,22 +80,20 @@ __device__ void diagonalCore(const CeedInt nelem, CeedScalar evalue = 0.; for (CeedInt q = 0; q < NQPTS; q++) { const CeedScalar qfvalue = - assembledqfarray[((((ein*NCOMP+compIn)*NUMEMODEOUT+eout)* - NCOMP+compOut)*nelem+e)*NQPTS+q]; - evalue += bt[q*NNODES+tid] * qfvalue * b[q*NNODES+tid]; + assembledqfarray[((((ein * NCOMP + compIn) * NUMEMODEOUT + eout) * NCOMP + compOut) * nelem + e) * NQPTS + q]; + evalue += bt[q * NNODES + tid] * qfvalue * b[q * NNODES + tid]; } - elemdiagarray[((compOut*NCOMP+compIn)*nelem+e)*NNODES+tid] += evalue; + elemdiagarray[((compOut * NCOMP + compIn) * nelem + e) * NNODES + tid] += evalue; } } else { // Diagonal Only CeedScalar evalue = 0.; for (CeedInt q = 0; q < NQPTS; q++) { const CeedScalar qfvalue = - assembledqfarray[((((ein*NCOMP+compOut)*NUMEMODEOUT+eout)* - NCOMP+compOut)*nelem+e)*NQPTS+q]; - evalue += bt[q*NNODES+tid] * qfvalue * b[q*NNODES+tid]; + assembledqfarray[((((ein * NCOMP + compOut) * NUMEMODEOUT + eout) * NCOMP + compOut) * nelem + e) * NQPTS + q]; + evalue += bt[q * NNODES + tid] * qfvalue * b[q * NNODES + tid]; } - elemdiagarray[(compOut*nelem+e)*NNODES+tid] += evalue; + elemdiagarray[(compOut * nelem + e) * NNODES + tid] += evalue; } } } @@ -116,29 +104,21 @@ __device__ void diagonalCore(const CeedInt nelem, //------------------------------------------------------------------------------ // Linear diagonal //------------------------------------------------------------------------------ -extern "C" __global__ void linearDiagonal(const CeedInt nelem, - const CeedScalar *identity, - const CeedScalar *interpin, const 
CeedScalar *gradin, - const CeedScalar *interpout, const CeedScalar *gradout, - const CeedEvalMode *emodein, const CeedEvalMode *emodeout, - const CeedScalar *__restrict__ assembledqfarray, - CeedScalar *__restrict__ elemdiagarray) { - diagonalCore(nelem, false, identity, interpin, gradin, interpout, - gradout, emodein, emodeout, assembledqfarray, elemdiagarray); +extern "C" __global__ void linearDiagonal(const CeedInt nelem, const CeedScalar *identity, const CeedScalar *interpin, const CeedScalar *gradin, + const CeedScalar *interpout, const CeedScalar *gradout, const CeedEvalMode *emodein, + const CeedEvalMode *emodeout, const CeedScalar *__restrict__ assembledqfarray, + CeedScalar *__restrict__ elemdiagarray) { + diagonalCore(nelem, false, identity, interpin, gradin, interpout, gradout, emodein, emodeout, assembledqfarray, elemdiagarray); } //------------------------------------------------------------------------------ // Linear point block diagonal //------------------------------------------------------------------------------ -extern "C" __global__ void linearPointBlockDiagonal(const CeedInt nelem, - const CeedScalar *identity, - const CeedScalar *interpin, const CeedScalar *gradin, - const CeedScalar *interpout, const CeedScalar *gradout, - const CeedEvalMode *emodein, const CeedEvalMode *emodeout, - const CeedScalar *__restrict__ assembledqfarray, - CeedScalar *__restrict__ elemdiagarray) { - diagonalCore(nelem, true, identity, interpin, gradin, interpout, - gradout, emodein, emodeout, assembledqfarray, elemdiagarray); +extern "C" __global__ void linearPointBlockDiagonal(const CeedInt nelem, const CeedScalar *identity, const CeedScalar *interpin, + const CeedScalar *gradin, const CeedScalar *interpout, const CeedScalar *gradout, + const CeedEvalMode *emodein, const CeedEvalMode *emodeout, + const CeedScalar *__restrict__ assembledqfarray, CeedScalar *__restrict__ elemdiagarray) { + diagonalCore(nelem, true, identity, interpin, gradin, interpout, gradout, 
emodein, emodeout, assembledqfarray, elemdiagarray); } //------------------------------------------------------------------------------ diff --git a/include/ceed/jit-source/hip/hip-ref-operator-assemble.h b/include/ceed/jit-source/hip/hip-ref-operator-assemble.h index 6f975c924b..c9c25d522c 100644 --- a/include/ceed/jit-source/hip/hip-ref-operator-assemble.h +++ b/include/ceed/jit-source/hip/hip-ref-operator-assemble.h @@ -10,106 +10,100 @@ //------------------------------------------------------------------------------ // Matrix assembly kernel for low-order elements (2D thread block) //------------------------------------------------------------------------------ -extern "C" __launch_bounds__(BLOCK_SIZE) - __global__ void linearAssemble(const CeedScalar *B_in, const CeedScalar *B_out, - const CeedScalar *__restrict__ qf_array, - CeedScalar *__restrict__ values_array) { - - // This kernel assumes B_in and B_out have the same number of quadrature points and - // basis points. +extern "C" __launch_bounds__(BLOCK_SIZE) __global__ + void linearAssemble(const CeedScalar *B_in, const CeedScalar *B_out, const CeedScalar *__restrict__ qf_array, + CeedScalar *__restrict__ values_array) { + // This kernel assumes B_in and B_out have the same number of quadrature points and + // basis points. 
// TODO: expand to more general cases - const int i = threadIdx.x; // The output row index of each B^TDB operation - const int l = threadIdx.y; // The output column index of each B^TDB operation - // such that we have (Bout^T)_ij D_jk Bin_kl = C_il + const int i = threadIdx.x; // The output row index of each B^TDB operation + const int l = threadIdx.y; // The output column index of each B^TDB operation + // such that we have (Bout^T)_ij D_jk Bin_kl = C_il // Strides for final output ordering, determined by the reference (interface) implementation of - // the symbolic assembly, slowest --> fastest: element, comp_in, comp_out, node_row, node_col + // the symbolic assembly, slowest --> fastest: element, comp_in, comp_out, node_row, node_col const CeedInt comp_out_stride = NNODES * NNODES; - const CeedInt comp_in_stride = comp_out_stride * NCOMP; - const CeedInt e_stride = comp_in_stride * NCOMP; - // Strides for QF array, slowest --> fastest: emode_in, comp_in, emode_out, comp_out, elem, qpt - const CeedInt qe_stride = NQPTS; - const CeedInt qcomp_out_stride = NELEM * qe_stride; + const CeedInt comp_in_stride = comp_out_stride * NCOMP; + const CeedInt e_stride = comp_in_stride * NCOMP; + // Strides for QF array, slowest --> fastest: emode_in, comp_in, emode_out, comp_out, elem, qpt + const CeedInt qe_stride = NQPTS; + const CeedInt qcomp_out_stride = NELEM * qe_stride; const CeedInt qemode_out_stride = qcomp_out_stride * NCOMP; - const CeedInt qcomp_in_stride = qemode_out_stride * NUMEMODEOUT; - const CeedInt qemode_in_stride = qcomp_in_stride * NCOMP; + const CeedInt qcomp_in_stride = qemode_out_stride * NUMEMODEOUT; + const CeedInt qemode_in_stride = qcomp_in_stride * NCOMP; // Loop over each element (if necessary) - for (CeedInt e = blockIdx.x*blockDim.z + threadIdx.z; e < NELEM; - e += gridDim.x*blockDim.z) { + for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < NELEM; e += gridDim.x * blockDim.z) { for (CeedInt comp_in = 0; comp_in < NCOMP; comp_in++) { 
for (CeedInt comp_out = 0; comp_out < NCOMP; comp_out++) { - CeedScalar result = 0.0; - CeedInt qf_index_comp = qcomp_in_stride * comp_in + qcomp_out_stride * comp_out + qe_stride * e; + CeedScalar result = 0.0; + CeedInt qf_index_comp = qcomp_in_stride * comp_in + qcomp_out_stride * comp_out + qe_stride * e; for (CeedInt emode_in = 0; emode_in < NUMEMODEIN; emode_in++) { CeedInt b_in_index = emode_in * NQPTS * NNODES; - for (CeedInt emode_out = 0; emode_out < NUMEMODEOUT; emode_out++) { + for (CeedInt emode_out = 0; emode_out < NUMEMODEOUT; emode_out++) { CeedInt b_out_index = emode_out * NQPTS * NNODES; - CeedInt qf_index = qf_index_comp + qemode_out_stride * emode_out + qemode_in_stride * emode_in; - // Perform the B^T D B operation for this 'chunk' of D (the qf_array) + CeedInt qf_index = qf_index_comp + qemode_out_stride * emode_out + qemode_in_stride * emode_in; + // Perform the B^T D B operation for this 'chunk' of D (the qf_array) for (CeedInt j = 0; j < NQPTS; j++) { - result += B_out[b_out_index + j * NNODES + i] * qf_array[qf_index + j] * B_in[b_in_index + j * NNODES + l]; - } - } // end of emode_out - } // end of emode_in - CeedInt val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + NNODES * i + l; - values_array[val_index] = result; - } // end of out component - } // end of in component - } // end of element loop + result += B_out[b_out_index + j * NNODES + i] * qf_array[qf_index + j] * B_in[b_in_index + j * NNODES + l]; + } + } // end of emode_out + } // end of emode_in + CeedInt val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + NNODES * i + l; + values_array[val_index] = result; + } // end of out component + } // end of in component + } // end of element loop } //------------------------------------------------------------------------------ // Fallback kernel for larger orders (1D thread block) //------------------------------------------------------------------------------ -extern "C" 
__launch_bounds__(BLOCK_SIZE) - __global__ void linearAssembleFallback(const CeedScalar *B_in, const CeedScalar *B_out, - const CeedScalar *__restrict__ qf_array, - CeedScalar *__restrict__ values_array) { - - // This kernel assumes B_in and B_out have the same number of quadrature points and - // basis points. +extern "C" __launch_bounds__(BLOCK_SIZE) __global__ + void linearAssembleFallback(const CeedScalar *B_in, const CeedScalar *B_out, const CeedScalar *__restrict__ qf_array, + CeedScalar *__restrict__ values_array) { + // This kernel assumes B_in and B_out have the same number of quadrature points and + // basis points. // TODO: expand to more general cases - const int l = threadIdx.x; // The output column index of each B^TDB operation - // such that we have (Bout^T)_ij D_jk Bin_kl = C_il + const int l = threadIdx.x; // The output column index of each B^TDB operation + // such that we have (Bout^T)_ij D_jk Bin_kl = C_il // Strides for final output ordering, determined by the reference (interface) implementation of - // the symbolic assembly, slowest --> fastest: element, comp_in, comp_out, node_row, node_col + // the symbolic assembly, slowest --> fastest: element, comp_in, comp_out, node_row, node_col const CeedInt comp_out_stride = NNODES * NNODES; - const CeedInt comp_in_stride = comp_out_stride * NCOMP; - const CeedInt e_stride = comp_in_stride * NCOMP; - // Strides for QF array, slowest --> fastest: emode_in, comp_in, emode_out, comp_out, elem, qpt - const CeedInt qe_stride = NQPTS; - const CeedInt qcomp_out_stride = NELEM * qe_stride; + const CeedInt comp_in_stride = comp_out_stride * NCOMP; + const CeedInt e_stride = comp_in_stride * NCOMP; + // Strides for QF array, slowest --> fastest: emode_in, comp_in, emode_out, comp_out, elem, qpt + const CeedInt qe_stride = NQPTS; + const CeedInt qcomp_out_stride = NELEM * qe_stride; const CeedInt qemode_out_stride = qcomp_out_stride * NCOMP; - const CeedInt qcomp_in_stride = qemode_out_stride * NUMEMODEOUT; - 
const CeedInt qemode_in_stride = qcomp_in_stride * NCOMP; + const CeedInt qcomp_in_stride = qemode_out_stride * NUMEMODEOUT; + const CeedInt qemode_in_stride = qcomp_in_stride * NCOMP; - // Loop over each element (if necessary) - for (CeedInt e = blockIdx.x*blockDim.z + threadIdx.z; e < NELEM; - e += gridDim.x*blockDim.z) { + // Loop over each element (if necessary) + for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < NELEM; e += gridDim.x * blockDim.z) { for (CeedInt comp_in = 0; comp_in < NCOMP; comp_in++) { for (CeedInt comp_out = 0; comp_out < NCOMP; comp_out++) { for (CeedInt i = 0; i < NNODES; i++) { - CeedScalar result = 0.0; - CeedInt qf_index_comp = qcomp_in_stride * comp_in + qcomp_out_stride * comp_out + qe_stride * e; + CeedScalar result = 0.0; + CeedInt qf_index_comp = qcomp_in_stride * comp_in + qcomp_out_stride * comp_out + qe_stride * e; for (CeedInt emode_in = 0; emode_in < NUMEMODEIN; emode_in++) { CeedInt b_in_index = emode_in * NQPTS * NNODES; - for (CeedInt emode_out = 0; emode_out < NUMEMODEOUT; emode_out++) { + for (CeedInt emode_out = 0; emode_out < NUMEMODEOUT; emode_out++) { CeedInt b_out_index = emode_out * NQPTS * NNODES; - CeedInt qf_index = qf_index_comp + qemode_out_stride * emode_out + qemode_in_stride * emode_in; - // Perform the B^T D B operation for this 'chunk' of D (the qf_array) + CeedInt qf_index = qf_index_comp + qemode_out_stride * emode_out + qemode_in_stride * emode_in; + // Perform the B^T D B operation for this 'chunk' of D (the qf_array) for (CeedInt j = 0; j < NQPTS; j++) { - result += B_out[b_out_index + j * NNODES + i] * qf_array[qf_index + j] * B_in[b_in_index + j * NNODES + l]; - } - } // end of emode_out - } // end of emode_in - CeedInt val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + NNODES * i + l; - values_array[val_index] = result; - } // end of loop over element node index, i - } // end of out component - } // end of in component - } // end of element loop + result 
+= B_out[b_out_index + j * NNODES + i] * qf_array[qf_index + j] * B_in[b_in_index + j * NNODES + l]; + } + } // end of emode_out + } // end of emode_in + CeedInt val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + NNODES * i + l; + values_array[val_index] = result; + } // end of loop over element node index, i + } // end of out component + } // end of in component + } // end of element loop } //------------------------------------------------------------------------------ diff --git a/include/ceed/jit-source/hip/hip-ref-qfunction.h b/include/ceed/jit-source/hip/hip-ref-qfunction.h index 6ad3676282..ee6aa726a3 100644 --- a/include/ceed/jit-source/hip/hip-ref-qfunction.h +++ b/include/ceed/jit-source/hip/hip-ref-qfunction.h @@ -11,9 +11,8 @@ template //------------------------------------------------------------------------------ // Read from quadrature points //------------------------------------------------------------------------------ -inline __device__ void readQuads(const CeedInt quad, const CeedInt num_qpts, - const CeedScalar* d_u, CeedScalar* r_u) { - for(CeedInt comp = 0; comp < SIZE; comp++) { +inline __device__ void readQuads(const CeedInt quad, const CeedInt num_qpts, const CeedScalar* d_u, CeedScalar* r_u) { + for (CeedInt comp = 0; comp < SIZE; comp++) { r_u[comp] = d_u[quad + num_qpts * comp]; } } @@ -22,9 +21,8 @@ inline __device__ void readQuads(const CeedInt quad, const CeedInt num_qpts, // Write at quadrature points //------------------------------------------------------------------------------ template -inline __device__ void writeQuads(const CeedInt quad, const CeedInt num_qpts, - const CeedScalar* r_v, CeedScalar* d_v) { - for(CeedInt comp = 0; comp < SIZE; comp++) { +inline __device__ void writeQuads(const CeedInt quad, const CeedInt num_qpts, const CeedScalar* r_v, CeedScalar* d_v) { + for (CeedInt comp = 0; comp < SIZE; comp++) { d_v[quad + num_qpts * comp] = r_v[comp]; } } diff --git 
a/include/ceed/jit-source/hip/hip-ref-restriction.h b/include/ceed/jit-source/hip/hip-ref-restriction.h index 598bb6b402..613b510949 100644 --- a/include/ceed/jit-source/hip/hip-ref-restriction.h +++ b/include/ceed/jit-source/hip/hip-ref-restriction.h @@ -10,93 +10,70 @@ //------------------------------------------------------------------------------ // L-vector -> E-vector, strided //------------------------------------------------------------------------------ -extern "C" __global__ void StridedNoTranspose(const CeedInt num_elem, - const CeedScalar *__restrict__ u, - CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; - node < num_elem*RESTR_ELEM_SIZE; - node += blockDim.x * gridDim.x) { +extern "C" __global__ void StridedNoTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RESTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { const CeedInt loc_node = node % RESTR_ELEM_SIZE; - const CeedInt elem = node / RESTR_ELEM_SIZE; + const CeedInt elem = node / RESTR_ELEM_SIZE; for (CeedInt comp = 0; comp < RESTR_NUM_COMP; comp++) - v[loc_node + comp*RESTR_ELEM_SIZE*RESTR_NUM_ELEM + - elem*RESTR_ELEM_SIZE] = - u[loc_node*RESTR_STRIDE_NODES + comp*RESTR_STRIDE_COMP + elem*RESTR_STRIDE_ELEM]; + v[loc_node + comp * RESTR_ELEM_SIZE * RESTR_NUM_ELEM + elem * RESTR_ELEM_SIZE] = + u[loc_node * RESTR_STRIDE_NODES + comp * RESTR_STRIDE_COMP + elem * RESTR_STRIDE_ELEM]; } } //------------------------------------------------------------------------------ // L-vector -> E-vector, offsets provided //------------------------------------------------------------------------------ -extern "C" __global__ void OffsetNoTranspose(const CeedInt num_elem, - const CeedInt *__restrict__ indices, - const CeedScalar *__restrict__ u, +extern "C" __global__ void OffsetNoTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, 
const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; - node < num_elem*RESTR_ELEM_SIZE; - node += blockDim.x * gridDim.x) { - const CeedInt ind = indices[node]; + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RESTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt ind = indices[node]; const CeedInt loc_node = node % RESTR_ELEM_SIZE; - const CeedInt elem = node / RESTR_ELEM_SIZE; + const CeedInt elem = node / RESTR_ELEM_SIZE; for (CeedInt comp = 0; comp < RESTR_NUM_COMP; comp++) - v[loc_node + comp*RESTR_ELEM_SIZE*RESTR_NUM_ELEM + - elem*RESTR_ELEM_SIZE] = - u[ind + comp*RESTR_COMP_STRIDE]; + v[loc_node + comp * RESTR_ELEM_SIZE * RESTR_NUM_ELEM + elem * RESTR_ELEM_SIZE] = u[ind + comp * RESTR_COMP_STRIDE]; } } //------------------------------------------------------------------------------ // E-vector -> L-vector, strided //------------------------------------------------------------------------------ -extern "C" __global__ void StridedTranspose(const CeedInt num_elem, - const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; - node < num_elem*RESTR_ELEM_SIZE; - node += blockDim.x * gridDim.x) { +extern "C" __global__ void StridedTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RESTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { const CeedInt loc_node = node % RESTR_ELEM_SIZE; - const CeedInt elem = node / RESTR_ELEM_SIZE; + const CeedInt elem = node / RESTR_ELEM_SIZE; for (CeedInt comp = 0; comp < RESTR_NUM_COMP; comp++) - v[loc_node*RESTR_STRIDE_NODES + comp*RESTR_STRIDE_COMP + elem*RESTR_STRIDE_ELEM] += - u[loc_node + comp*RESTR_ELEM_SIZE*RESTR_NUM_ELEM + - elem*RESTR_ELEM_SIZE]; + v[loc_node * RESTR_STRIDE_NODES + comp * RESTR_STRIDE_COMP + elem * 
RESTR_STRIDE_ELEM] += + u[loc_node + comp * RESTR_ELEM_SIZE * RESTR_NUM_ELEM + elem * RESTR_ELEM_SIZE]; } } //------------------------------------------------------------------------------ // E-vector -> L-vector, offsets provided //------------------------------------------------------------------------------ -extern "C" __global__ void OffsetTranspose(const CeedInt *__restrict__ l_vec_indices, - const CeedInt *__restrict__ t_indices, - const CeedInt *__restrict__ t_offsets, - const CeedScalar *__restrict__ u, - CeedScalar *__restrict__ v) { +extern "C" __global__ void OffsetTranspose(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, + const CeedInt *__restrict__ t_offsets, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { CeedScalar value[RESTR_NUM_COMP]; - for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; - i < RESTR_NUM_NODES; - i += blockDim.x * gridDim.x) { - const CeedInt ind = l_vec_indices[i]; + for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RESTR_NUM_NODES; i += blockDim.x * gridDim.x) { + const CeedInt ind = l_vec_indices[i]; const CeedInt range_1 = t_offsets[i]; - const CeedInt range_N = t_offsets[i+1]; + const CeedInt range_N = t_offsets[i + 1]; - for (CeedInt comp = 0; comp < RESTR_NUM_COMP; comp++) - value[comp] = 0.0; + for (CeedInt comp = 0; comp < RESTR_NUM_COMP; comp++) value[comp] = 0.0; for (CeedInt j = range_1; j < range_N; ++j) { - const CeedInt t_ind = t_indices[j]; - CeedInt loc_node = t_ind % RESTR_ELEM_SIZE; - CeedInt elem = t_ind / RESTR_ELEM_SIZE; + const CeedInt t_ind = t_indices[j]; + CeedInt loc_node = t_ind % RESTR_ELEM_SIZE; + CeedInt elem = t_ind / RESTR_ELEM_SIZE; for (CeedInt comp = 0; comp < RESTR_NUM_COMP; comp++) - value[comp] += u[loc_node + comp*RESTR_ELEM_SIZE*RESTR_NUM_ELEM + - elem*RESTR_ELEM_SIZE]; + value[comp] += u[loc_node + comp * RESTR_ELEM_SIZE * RESTR_NUM_ELEM + elem * RESTR_ELEM_SIZE]; } - for (CeedInt comp = 0; comp < RESTR_NUM_COMP; comp++) - 
v[ind + comp*RESTR_COMP_STRIDE] += value[comp]; + for (CeedInt comp = 0; comp < RESTR_NUM_COMP; comp++) v[ind + comp * RESTR_COMP_STRIDE] += value[comp]; } } diff --git a/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h index 378ab738bb..8a7ccda1eb 100644 --- a/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h +++ b/include/ceed/jit-source/hip/hip-shared-basis-read-write-templates.h @@ -12,15 +12,13 @@ #include - //------------------------------------------------------------------------------ // Helper function: load matrices for basis actions //------------------------------------------------------------------------------ template -inline __device__ void loadMatrix(const CeedScalar* d_B, CeedScalar* B) { - CeedInt tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x; - for (CeedInt i = tid; i < SIZE; i += blockDim.x*blockDim.y*blockDim.z) - B[i] = d_B[i]; +inline __device__ void loadMatrix(const CeedScalar *d_B, CeedScalar *B) { + CeedInt tid = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + for (CeedInt i = tid; i < SIZE; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i]; } //------------------------------------------------------------------------------ @@ -31,10 +29,11 @@ inline __device__ void loadMatrix(const CeedScalar* d_B, CeedScalar* B) { // E-vector -> single element //------------------------------------------------------------------------------ template -inline __device__ void ReadElementStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, const CeedInt strides_elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { +inline __device__ void ReadElementStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedScalar *__restrict__ d_u, 
CeedScalar *r_u) { if (data.t_id_x < P_1D) { const CeedInt node = data.t_id_x; - const CeedInt ind = node * strides_node + elem * strides_elem; + const CeedInt ind = node * strides_node + elem * strides_elem; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { r_u[comp] = d_u[ind + comp * strides_comp]; } @@ -45,10 +44,11 @@ inline __device__ void ReadElementStrided1d(SharedData_Hip &data, const CeedInt // Single element -> E-vector //------------------------------------------------------------------------------ template -inline __device__ void WriteElementStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { +inline __device__ void WriteElementStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { if (data.t_id_x < P_1D) { const CeedInt node = data.t_id_x; - const CeedInt ind = node * strides_node + elem * strides_elem; + const CeedInt ind = node * strides_node + elem * strides_elem; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { d_v[ind + comp * strides_comp] = r_v[comp]; } @@ -63,10 +63,11 @@ inline __device__ void WriteElementStrided1d(SharedData_Hip &data, const CeedInt // E-vector -> single element //------------------------------------------------------------------------------ template -inline __device__ void ReadElementStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, const CeedInt strides_elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { +inline __device__ void ReadElementStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { if (data.t_id_x < P_1D && data.t_id_y < P_1D) { - const CeedInt node = data.t_id_x 
+ data.t_id_y*P_1D; - const CeedInt ind = node * strides_node + elem * strides_elem; + const CeedInt node = data.t_id_x + data.t_id_y * P_1D; + const CeedInt ind = node * strides_node + elem * strides_elem; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { r_u[comp] = d_u[ind + comp * strides_comp]; } @@ -77,10 +78,11 @@ inline __device__ void ReadElementStrided2d(SharedData_Hip &data, const CeedInt // Single element -> E-vector //------------------------------------------------------------------------------ template -inline __device__ void WriteElementStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { +inline __device__ void WriteElementStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { if (data.t_id_x < P_1D && data.t_id_y < P_1D) { - const CeedInt node = data.t_id_x + data.t_id_y*P_1D; - const CeedInt ind = node * strides_node + elem * strides_elem; + const CeedInt node = data.t_id_x + data.t_id_y * P_1D; + const CeedInt ind = node * strides_node + elem * strides_elem; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { d_v[ind + comp * strides_comp] = r_v[comp]; } @@ -95,11 +97,12 @@ inline __device__ void WriteElementStrided2d(SharedData_Hip &data, const CeedInt // E-vector -> single element //------------------------------------------------------------------------------ template -inline __device__ void ReadElementStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, const CeedInt strides_elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { +inline __device__ void ReadElementStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedScalar *__restrict__ d_u, 
CeedScalar *r_u) { if (data.t_id_x < P_1D && data.t_id_y < P_1D) { for (CeedInt z = 0; z < P_1D; z++) { - const CeedInt node = data.t_id_x + data.t_id_y*P_1D + z*P_1D*P_1D; - const CeedInt ind = node * strides_node + elem * strides_elem; + const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; + const CeedInt ind = node * strides_node + elem * strides_elem; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { r_u[z + comp * P_1D] = d_u[ind + comp * strides_comp]; } @@ -111,11 +114,12 @@ inline __device__ void ReadElementStrided3d(SharedData_Hip &data, const CeedInt // Single element -> E-vector //------------------------------------------------------------------------------ template -inline __device__ void WriteElementStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { +inline __device__ void WriteElementStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, + const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { if (data.t_id_x < P_1D && data.t_id_y < P_1D) { for (CeedInt z = 0; z < P_1D; z++) { - const CeedInt node = data.t_id_x + data.t_id_y*P_1D + z*P_1D*P_1D; - const CeedInt ind = node * strides_node + elem * strides_elem; + const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; + const CeedInt ind = node * strides_node + elem * strides_elem; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { d_v[ind + comp * strides_comp] = r_v[z + comp * P_1D]; } diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h index bdd526c1d7..a871be1c4c 100644 --- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h +++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-templates.h @@ -26,7 +26,7 @@ inline __device__ void ContractX1d(SharedData_Hip &data, 
const CeedScalar *U, co *V = 0.0; if (data.t_id_x < Q_1D) { for (CeedInt i = 0; i < P_1D; i++) { - *V += B[i + data.t_id_x * P_1D] * data.slice[i]; // Contract x direction + *V += B[i + data.t_id_x * P_1D] * data.slice[i]; // Contract x direction } } __syncthreads(); @@ -42,7 +42,7 @@ inline __device__ void ContractTransposeX1d(SharedData_Hip &data, const CeedScal *V = 0.0; if (data.t_id_x < P_1D) { for (CeedInt i = 0; i < Q_1D; i++) { - *V += B[data.t_id_x + i * P_1D] * data.slice[i]; // Contract x direction + *V += B[data.t_id_x + i * P_1D] * data.slice[i]; // Contract x direction } } __syncthreads(); @@ -62,7 +62,8 @@ inline __device__ void Interp1d(SharedData_Hip &data, const CeedScalar *__restri // 1D interpolate transpose //------------------------------------------------------------------------------ template -inline __device__ void InterpTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { +inline __device__ void InterpTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + CeedScalar *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractTransposeX1d(data, r_U + comp, c_B, r_V + comp); } @@ -72,7 +73,8 @@ inline __device__ void InterpTranspose1d(SharedData_Hip &data, const CeedScalar // 1D derivatives at quadrature points //------------------------------------------------------------------------------ template -inline __device__ void Grad1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +inline __device__ void Grad1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, + CeedScalar *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractX1d(data, r_U + comp, c_G, r_V + comp); } @@ -82,7 +84,8 @@ inline __device__ void Grad1d(SharedData_Hip &data, const CeedScalar 
*__restrict // 1D derivatives transpose //------------------------------------------------------------------------------ template -inline __device__ void GradTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +inline __device__ void GradTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, + CeedScalar *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractTransposeX1d(data, r_U + comp, c_G, r_V + comp); } @@ -105,12 +108,12 @@ inline __device__ void Weight1d(SharedData_Hip &data, const CeedScalar *__restri //------------------------------------------------------------------------------ template inline __device__ void ContractX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { - data.slice[data.t_id_x+data.t_id_y*T_1D] = *U; + data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; __syncthreads(); *V = 0.0; if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { for (CeedInt i = 0; i < P_1D; i++) { - *V += B[i + data.t_id_x*P_1D] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction + *V += B[i + data.t_id_x * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction } } __syncthreads(); @@ -121,12 +124,12 @@ inline __device__ void ContractX2d(SharedData_Hip &data, const CeedScalar *U, co //------------------------------------------------------------------------------ template inline __device__ void ContractY2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { - data.slice[data.t_id_x+data.t_id_y*T_1D] = *U; + data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; __syncthreads(); *V = 0.0; if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { for (CeedInt i = 0; i < P_1D; i++) { - *V += B[i + data.t_id_y*P_1D] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction + *V += B[i + data.t_id_y * P_1D] * data.slice[data.t_id_x 
+ i * T_1D]; // Contract y direction } } __syncthreads(); @@ -137,12 +140,12 @@ inline __device__ void ContractY2d(SharedData_Hip &data, const CeedScalar *U, co //------------------------------------------------------------------------------ template inline __device__ void ContractTransposeY2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { - data.slice[data.t_id_x+data.t_id_y*T_1D] = *U; + data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; __syncthreads(); *V = 0.0; if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { for (CeedInt i = 0; i < Q_1D; i++) { - *V += B[data.t_id_y + i*P_1D] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction + *V += B[data.t_id_y + i * P_1D] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction } } __syncthreads(); @@ -153,12 +156,12 @@ inline __device__ void ContractTransposeY2d(SharedData_Hip &data, const CeedScal //------------------------------------------------------------------------------ template inline __device__ void ContractTransposeX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { - data.slice[data.t_id_x+data.t_id_y*T_1D] = *U; + data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; __syncthreads(); *V = 0.0; if (data.t_id_x < P_1D && data.t_id_y < P_1D) { for (CeedInt i = 0; i < Q_1D; i++) { - *V += B[data.t_id_x + i*P_1D] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction + *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction } } __syncthreads(); @@ -169,11 +172,11 @@ inline __device__ void ContractTransposeX2d(SharedData_Hip &data, const CeedScal //------------------------------------------------------------------------------ template inline __device__ void ContractTransposeAddX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { - data.slice[data.t_id_x+data.t_id_y*T_1D] = *U; + data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; __syncthreads(); if (data.t_id_x < 
P_1D && data.t_id_y < P_1D) { for (CeedInt i = 0; i < Q_1D; i++) { - *V += B[data.t_id_x + i*P_1D] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction + *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction } } __syncthreads(); @@ -195,7 +198,8 @@ inline __device__ void InterpTensor2d(SharedData_Hip &data, const CeedScalar *__ // 2D interpolate transpose //------------------------------------------------------------------------------ template -inline __device__ void InterpTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { +inline __device__ void InterpTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + CeedScalar *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractTransposeY2d(data, r_U + comp, c_B, r_t); @@ -207,13 +211,14 @@ inline __device__ void InterpTransposeTensor2d(SharedData_Hip &data, const CeedS // 2D derivatives at quadrature points //------------------------------------------------------------------------------ template -inline __device__ void GradTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +inline __device__ void GradTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, + CeedScalar *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractX2d(data, r_U + comp, c_G, r_t); - ContractY2d(data, r_t, c_B, r_V + comp + 0*NUM_COMP); + ContractY2d(data, r_t, c_B, r_V + comp + 0 * NUM_COMP); ContractX2d(data, r_U + comp, c_B, r_t); - ContractY2d(data, r_t, c_G, r_V + comp + 1*NUM_COMP); + ContractY2d(data, r_t, c_G, r_V + comp + 1 * NUM_COMP); } } @@ -221,12 +226,13 @@ inline __device__ void GradTensor2d(SharedData_Hip &data, const CeedScalar 
*__re // 2D derivatives transpose //------------------------------------------------------------------------------ template -inline __device__ void GradTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +inline __device__ void GradTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, + CeedScalar *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeY2d(data, r_U + comp + 0*NUM_COMP, c_B, r_t); + ContractTransposeY2d(data, r_U + comp + 0 * NUM_COMP, c_B, r_t); ContractTransposeX2d(data, r_t, c_G, r_V + comp); - ContractTransposeY2d(data, r_U + comp + 1*NUM_COMP, c_G, r_t); + ContractTransposeY2d(data, r_U + comp + 1 * NUM_COMP, c_G, r_t); ContractTransposeAddX2d(data, r_t, c_B, r_V + comp); } } @@ -236,8 +242,7 @@ inline __device__ void GradTransposeTensor2d(SharedData_Hip &data, const CeedSca //------------------------------------------------------------------------------ template inline __device__ void WeightTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { - *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? - q_weight_1d[data.t_id_x]*q_weight_1d[data.t_id_y] : 0.0; + *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? 
q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0; } //------------------------------------------------------------------------------ @@ -251,16 +256,16 @@ template inline __device__ void ContractX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { CeedScalar r_B[P_1D]; for (CeedInt i = 0; i < P_1D; i++) { - r_B[i] = B[i + data.t_id_x*P_1D]; + r_B[i] = B[i + data.t_id_x * P_1D]; } for (CeedInt k = 0; k < P_1D; k++) { - data.slice[data.t_id_x+data.t_id_y*T_1D] = U[k]; + data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; __syncthreads(); V[k] = 0.0; if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { for (CeedInt i = 0; i < P_1D; i++) { - V[k] += r_B[i] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction + V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction } } __syncthreads(); @@ -274,16 +279,16 @@ template inline __device__ void ContractY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { CeedScalar r_B[P_1D]; for (CeedInt i = 0; i < P_1D; i++) { - r_B[i] = B[i + data.t_id_y*P_1D]; + r_B[i] = B[i + data.t_id_y * P_1D]; } for (CeedInt k = 0; k < P_1D; k++) { - data.slice[data.t_id_x+data.t_id_y*T_1D] = U[k]; + data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; __syncthreads(); V[k] = 0.0; if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { for (CeedInt i = 0; i < P_1D; i++) { - V[k] += r_B[i] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction + V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction } } __syncthreads(); @@ -299,7 +304,7 @@ inline __device__ void ContractZ3d(SharedData_Hip &data, const CeedScalar *U, co V[k] = 0.0; if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { for (CeedInt i = 0; i < P_1D; i++) { - V[k] += B[i + k*P_1D] * U[i]; // Contract z direction + V[k] += B[i + k * P_1D] * U[i]; // Contract z direction } } } @@ -314,7 +319,7 @@ inline __device__ void ContractTransposeZ3d(SharedData_Hip &data, const CeedScal V[k] = 
0.0; if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { for (CeedInt i = 0; i < Q_1D; i++) { - V[k] += B[k + i*P_1D] * U[i]; // Contract z direction + V[k] += B[k + i * P_1D] * U[i]; // Contract z direction } } } @@ -327,16 +332,16 @@ template inline __device__ void ContractTransposeY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { CeedScalar r_B[Q_1D]; for (CeedInt i = 0; i < Q_1D; i++) { - r_B[i] = B[data.t_id_y + i*P_1D]; + r_B[i] = B[data.t_id_y + i * P_1D]; } for (CeedInt k = 0; k < P_1D; k++) { - data.slice[data.t_id_x+data.t_id_y*T_1D] = U[k]; + data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; __syncthreads(); V[k] = 0.0; if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { for (CeedInt i = 0; i < Q_1D; i++) { - V[k] += r_B[i] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction + V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction } } __syncthreads(); @@ -350,15 +355,15 @@ template inline __device__ void ContractTransposeAddY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { CeedScalar r_B[Q_1D]; for (CeedInt i = 0; i < Q_1D; i++) { - r_B[i] = B[data.t_id_y + i*P_1D]; + r_B[i] = B[data.t_id_y + i * P_1D]; } for (CeedInt k = 0; k < P_1D; k++) { - data.slice[data.t_id_x+data.t_id_y*T_1D] = U[k]; + data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; __syncthreads(); if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { for (CeedInt i = 0; i < Q_1D; i++) { - V[k] += r_B[i] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction + V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction } } __syncthreads(); @@ -372,16 +377,16 @@ template inline __device__ void ContractTransposeX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { CeedScalar r_B[Q_1D]; for (CeedInt i = 0; i < Q_1D; i++) { - r_B[i] = B[data.t_id_x + i*P_1D]; + r_B[i] = B[data.t_id_x + i * P_1D]; } for (CeedInt k = 0; k < P_1D; k++) { - 
data.slice[data.t_id_x+data.t_id_y*T_1D] = U[k]; + data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; __syncthreads(); V[k] = 0.0; if (data.t_id_x < P_1D && data.t_id_y < P_1D) { for (CeedInt i = 0; i < Q_1D; i++) { - V[k] += r_B[i] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction + V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction } } __syncthreads(); @@ -395,15 +400,15 @@ template inline __device__ void ContractTransposeAddX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { CeedScalar r_B[Q_1D]; for (CeedInt i = 0; i < Q_1D; i++) { - r_B[i] = B[data.t_id_x + i*P_1D]; + r_B[i] = B[data.t_id_x + i * P_1D]; } for (CeedInt k = 0; k < P_1D; k++) { - data.slice[data.t_id_x+data.t_id_y*T_1D] = U[k]; + data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; __syncthreads(); if (data.t_id_x < P_1D && data.t_id_y < P_1D) { for (CeedInt i = 0; i < Q_1D; i++) { - V[k] += r_B[i] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction + V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction } } __syncthreads(); @@ -418,9 +423,9 @@ inline __device__ void InterpTensor3d(SharedData_Hip &data, const CeedScalar *__ CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX3d(data, r_U + comp*P_1D, c_B, r_t1); + ContractX3d(data, r_U + comp * P_1D, c_B, r_t1); ContractY3d(data, r_t1, c_B, r_t2); - ContractZ3d(data, r_t2, c_B, r_V + comp*Q_1D); + ContractZ3d(data, r_t2, c_B, r_V + comp * Q_1D); } } @@ -428,13 +433,14 @@ inline __device__ void InterpTensor3d(SharedData_Hip &data, const CeedScalar *__ // 3D interpolate transpose //------------------------------------------------------------------------------ template -inline __device__ void InterpTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { +inline __device__ void InterpTransposeTensor3d(SharedData_Hip &data, 
const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + CeedScalar *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeZ3d(data, r_U + comp*Q_1D, c_B, r_t1); + ContractTransposeZ3d(data, r_U + comp * Q_1D, c_B, r_t1); ContractTransposeY3d(data, r_t1, c_B, r_t2); - ContractTransposeX3d(data, r_t2, c_B, r_V + comp*P_1D); + ContractTransposeX3d(data, r_t2, c_B, r_V + comp * P_1D); } } @@ -442,19 +448,20 @@ inline __device__ void InterpTransposeTensor3d(SharedData_Hip &data, const CeedS // 3D derivatives at quadrature points //------------------------------------------------------------------------------ template -inline __device__ void GradTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +inline __device__ void GradTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, + CeedScalar *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX3d(data, r_U + comp*P_1D, c_G, r_t1); + ContractX3d(data, r_U + comp * P_1D, c_G, r_t1); ContractY3d(data, r_t1, c_B, r_t2); - ContractZ3d(data, r_t2, c_B, r_V + comp*Q_1D + 0*NUM_COMP*Q_1D); - ContractX3d(data, r_U + comp*P_1D, c_B, r_t1); + ContractZ3d(data, r_t2, c_B, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D); + ContractX3d(data, r_U + comp * P_1D, c_B, r_t1); ContractY3d(data, r_t1, c_G, r_t2); - ContractZ3d(data, r_t2, c_B, r_V + comp*Q_1D + 1*NUM_COMP*Q_1D); - ContractX3d(data, r_U + comp*P_1D, c_B, r_t1); + ContractZ3d(data, r_t2, c_B, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D); + ContractX3d(data, r_U + comp * P_1D, c_B, r_t1); ContractY3d(data, r_t1, c_B, r_t2); - ContractZ3d(data, r_t2, c_G, r_V + comp*Q_1D + 2*NUM_COMP*Q_1D); + ContractZ3d(data, r_t2, c_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D); } } @@ -462,19 
+469,20 @@ inline __device__ void GradTensor3d(SharedData_Hip &data, const CeedScalar *__re // 3D derivatives transpose //------------------------------------------------------------------------------ template -inline __device__ void GradTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +inline __device__ void GradTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, + CeedScalar *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeZ3d(data, r_U + comp*Q_1D + 0*NUM_COMP*Q_1D, c_B, r_t1); + ContractTransposeZ3d(data, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, c_B, r_t1); ContractTransposeY3d(data, r_t1, c_B, r_t2); - ContractTransposeX3d(data, r_t2, c_G, r_V + comp*P_1D); - ContractTransposeZ3d(data, r_U + comp*Q_1D + 1*NUM_COMP*Q_1D, c_B, r_t1); + ContractTransposeX3d(data, r_t2, c_G, r_V + comp * P_1D); + ContractTransposeZ3d(data, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, c_B, r_t1); ContractTransposeY3d(data, r_t1, c_G, r_t2); - ContractTransposeAddX3d(data, r_t2, c_B, r_V + comp*P_1D); - ContractTransposeZ3d(data, r_U + comp*Q_1D + 2*NUM_COMP*Q_1D, c_G, r_t1); + ContractTransposeAddX3d(data, r_t2, c_B, r_V + comp * P_1D); + ContractTransposeZ3d(data, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, c_G, r_t1); ContractTransposeY3d(data, r_t1, c_B, r_t2); - ContractTransposeAddX3d(data, r_t2, c_B, r_V + comp*P_1D); + ContractTransposeAddX3d(data, r_t2, c_B, r_V + comp * P_1D); } } @@ -482,16 +490,17 @@ inline __device__ void GradTransposeTensor3d(SharedData_Hip &data, const CeedSca // 3D derivatives at quadrature points //------------------------------------------------------------------------------ template -inline __device__ void GradTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const 
CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +inline __device__ void GradTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, + CeedScalar *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractX3d(data, r_U + comp*P_1D, c_B, r_t1); + ContractX3d(data, r_U + comp * P_1D, c_B, r_t1); ContractY3d(data, r_t1, c_B, r_t2); ContractZ3d(data, r_t2, c_B, r_t1); - ContractX3d(data, r_t1, c_G, r_V + comp*Q_1D + 0*NUM_COMP*Q_1D); - ContractY3d(data, r_t1, c_G, r_V + comp*Q_1D + 1*NUM_COMP*Q_1D); - ContractZ3d(data, r_t1, c_G, r_V + comp*Q_1D + 2*NUM_COMP*Q_1D); + ContractX3d(data, r_t1, c_G, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D); + ContractY3d(data, r_t1, c_G, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D); + ContractZ3d(data, r_t1, c_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D); } } @@ -499,16 +508,17 @@ inline __device__ void GradTensorCollocated3d(SharedData_Hip &data, const CeedSc // 3D derivatives transpose //------------------------------------------------------------------------------ template -inline __device__ void GradTransposeTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +inline __device__ void GradTransposeTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, + const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { - ContractTransposeZ3d(data, r_U + comp*Q_1D + 2*NUM_COMP*Q_1D, c_G, r_t2); - ContractTransposeAddY3d(data, r_U + comp*Q_1D + 1*NUM_COMP*Q_1D, c_G, r_t2); - ContractTransposeAddX3d(data, r_U + comp*Q_1D + 0*NUM_COMP*Q_1D, c_G, r_t2); + ContractTransposeZ3d(data, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, c_G, r_t2); + ContractTransposeAddY3d(data, 
r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, c_G, r_t2); + ContractTransposeAddX3d(data, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, c_G, r_t2); ContractTransposeZ3d(data, r_t2, c_B, r_t1); ContractTransposeY3d(data, r_t1, c_B, r_t2); - ContractTransposeX3d(data, r_t2, c_B, r_V + comp*P_1D); + ContractTransposeX3d(data, r_t2, c_B, r_V + comp * P_1D); } } @@ -517,10 +527,10 @@ inline __device__ void GradTransposeTensorCollocated3d(SharedData_Hip &data, con //------------------------------------------------------------------------------ template inline __device__ void WeightTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { - const bool quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D); - const CeedScalar pw = quad ? q_weight_1d[data.t_id_x]*q_weight_1d[data.t_id_y] : 0.0; + const bool quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D); + const CeedScalar pw = quad ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0; for (CeedInt q = 0; q < Q_1D; q++) { - w[q] = quad ? pw*q_weight_1d[q] : 0.0; + w[q] = quad ? 
pw * q_weight_1d[q] : 0.0; } } diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor.h index bd11683665..f7d38c23ea 100644 --- a/include/ceed/jit-source/hip/hip-shared-basis-tensor.h +++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor.h @@ -11,84 +11,83 @@ #define _ceed_hip_shared_basis_tensor_h #include + #include "hip-shared-basis-read-write-templates.h" #include "hip-shared-basis-tensor-templates.h" //------------------------------------------------------------------------------ // Interp kernel by dim //------------------------------------------------------------------------------ -extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) - __global__ void Interp(const CeedInt num_elem, - const CeedScalar *d_interp_1d, - const CeedScalar *__restrict__ d_U, - CeedScalar *__restrict__ d_V) { +extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ + void Interp(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; // load interp_1d into shared memory - __shared__ CeedScalar s_B[BASIS_P_1D*BASIS_Q_1D]; - loadMatrix(d_interp_1d, s_B); + __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; + loadMatrix(d_interp_1d, s_B); __syncthreads(); SharedData_Hip data; data.t_id_x = threadIdx.x; data.t_id_y = threadIdx.y; data.t_id_z = threadIdx.z; - data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x; - data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? 
BASIS_Q_1D : 1)]; - for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) { + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { if (BASIS_DIM == 1) { - ReadElementStrided1d(data, elem, 1, BASIS_P_1D*num_elem, BASIS_P_1D, d_U, r_U); + ReadElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U); Interp1d(data, r_U, s_B, r_V); - WriteElementStrided1d(data, elem, 1, BASIS_Q_1D*num_elem, BASIS_Q_1D, r_V, d_V); + WriteElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V); } else if (BASIS_DIM == 2) { - ReadElementStrided2d(data, elem, 1, BASIS_P_1D*BASIS_P_1D*num_elem, BASIS_P_1D*BASIS_P_1D, d_U, r_U); + ReadElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U); InterpTensor2d(data, r_U, s_B, r_V); - WriteElementStrided2d(data, elem, 1, BASIS_Q_1D*BASIS_Q_1D*num_elem, BASIS_Q_1D*BASIS_Q_1D, r_V, d_V); + WriteElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V, d_V); } else if (BASIS_DIM == 3) { - ReadElementStrided3d(data, elem, 1, BASIS_P_1D*BASIS_P_1D*BASIS_P_1D*num_elem, BASIS_P_1D*BASIS_P_1D*BASIS_P_1D, d_U, r_U); + ReadElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U); InterpTensor3d(data, r_U, s_B, r_V); - WriteElementStrided3d(data, elem, 1, BASIS_Q_1D*BASIS_Q_1D*BASIS_Q_1D*num_elem, BASIS_Q_1D*BASIS_Q_1D*BASIS_Q_1D, r_V, d_V); + WriteElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V); } } } -extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) - __global__ void InterpTranspose(const CeedInt num_elem, - const CeedScalar *d_interp_1d, - const CeedScalar *__restrict__ d_U, - CeedScalar *__restrict__ d_V) { +extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) 
__global__ + void InterpTranspose(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; // load interp_1d into shared memory - __shared__ CeedScalar s_B[BASIS_P_1D*BASIS_Q_1D]; - loadMatrix(d_interp_1d, s_B); + __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; + loadMatrix(d_interp_1d, s_B); __syncthreads(); SharedData_Hip data; data.t_id_x = threadIdx.x; data.t_id_y = threadIdx.y; data.t_id_z = threadIdx.z; - data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x; - data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; - for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) { + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { if (BASIS_DIM == 1) { - ReadElementStrided1d(data, elem, 1, BASIS_Q_1D*num_elem, BASIS_Q_1D, d_U, r_U); + ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U); InterpTranspose1d(data, r_U, s_B, r_V); - WriteElementStrided1d(data, elem, 1, BASIS_P_1D*num_elem, BASIS_P_1D, r_V, d_V); + WriteElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); } else if (BASIS_DIM == 2) { - ReadElementStrided2d(data, elem, 1, BASIS_Q_1D*BASIS_Q_1D*num_elem, BASIS_Q_1D*BASIS_Q_1D, d_U, r_U); + ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); InterpTransposeTensor2d(data, r_U, s_B, r_V); - WriteElementStrided2d(data, elem, 1, BASIS_P_1D*BASIS_P_1D*num_elem, BASIS_P_1D*BASIS_P_1D, r_V, d_V); + 
WriteElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); } else if (BASIS_DIM == 3) { - ReadElementStrided3d(data, elem, 1, BASIS_Q_1D*BASIS_Q_1D*BASIS_Q_1D*num_elem, BASIS_Q_1D*BASIS_Q_1D*BASIS_Q_1D, d_U, r_U); + ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); InterpTransposeTensor3d(data, r_U, s_B, r_V); - WriteElementStrided3d(data, elem, 1, BASIS_P_1D*BASIS_P_1D*BASIS_P_1D*num_elem, BASIS_P_1D*BASIS_P_1D*BASIS_P_1D, r_V, d_V); + WriteElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); } } } @@ -96,86 +95,86 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) //------------------------------------------------------------------------------ // Grad kernel by dim //------------------------------------------------------------------------------ -extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) - __global__ void Grad(const CeedInt num_elem, - const CeedScalar *d_interp_1d, - const CeedScalar *d_grad_1d, - const CeedScalar *__restrict__ d_U, - CeedScalar *__restrict__ d_V) { +extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ + void Grad(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *d_grad_1d, const CeedScalar *__restrict__ d_U, + CeedScalar *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; // load interp_1d and grad_1d into shared memory - __shared__ CeedScalar s_B[BASIS_P_1D*BASIS_Q_1D]; - loadMatrix(d_interp_1d, s_B); + __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; + loadMatrix(d_interp_1d, s_B); __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? 
BASIS_Q_1D : BASIS_P_1D)]; - loadMatrix(d_grad_1d, s_G); + loadMatrix(d_grad_1d, s_G); __syncthreads(); SharedData_Hip data; data.t_id_x = threadIdx.x; data.t_id_y = threadIdx.y; data.t_id_z = threadIdx.z; - data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x; - data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; - for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) { + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { if (BASIS_DIM == 1) { - ReadElementStrided1d(data, elem, 1, BASIS_P_1D*num_elem, BASIS_P_1D, d_U, r_U); + ReadElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U); Grad1d(data, r_U, s_B, s_G, r_V); - WriteElementStrided1d(data, elem, 1, BASIS_Q_1D*num_elem, BASIS_Q_1D, r_V, d_V); + WriteElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V); } else if (BASIS_DIM == 2) { - ReadElementStrided2d(data, elem, 1, BASIS_P_1D*BASIS_P_1D*num_elem, BASIS_P_1D*BASIS_P_1D, d_U, r_U); + ReadElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U); GradTensor2d(data, r_U, s_B, s_G, r_V); - WriteElementStrided2d(data, elem, 1, BASIS_Q_1D*BASIS_Q_1D*num_elem, BASIS_Q_1D*BASIS_Q_1D, r_V, d_V); + WriteElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V, + d_V); } else if (BASIS_DIM == 3) { - ReadElementStrided3d(data, elem, 1, BASIS_P_1D*BASIS_P_1D*BASIS_P_1D*num_elem, BASIS_P_1D*BASIS_P_1D*BASIS_P_1D, d_U, r_U); + ReadElementStrided3d(data, elem, 1, BASIS_P_1D * 
BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U); if (BASIS_HAS_COLLOCATED_GRAD) GradTensorCollocated3d(data, r_U, s_B, s_G, r_V); - else GradTensor3d(data, r_U, s_B, s_G, r_V); - WriteElementStrided3d(data, elem, 1, BASIS_Q_1D*BASIS_Q_1D*BASIS_Q_1D*num_elem, BASIS_Q_1D*BASIS_Q_1D*BASIS_Q_1D, r_V, d_V); + else GradTensor3d(data, r_U, s_B, s_G, r_V); + WriteElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V); } } } -extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) - __global__ void GradTranspose(const CeedInt num_elem, - const CeedScalar *d_interp_1d, - const CeedScalar *d_grad_1d, - const CeedScalar *__restrict__ d_U, - CeedScalar *__restrict__ d_V) { +extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ + void GradTranspose(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *d_grad_1d, const CeedScalar *__restrict__ d_U, + CeedScalar *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; // load interp_1d and grad_1d into shared memory - __shared__ CeedScalar s_B[BASIS_P_1D*BASIS_Q_1D]; - loadMatrix(d_interp_1d, s_B); + __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; + loadMatrix(d_interp_1d, s_B); __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)]; - loadMatrix(d_grad_1d, s_G); + loadMatrix(d_grad_1d, s_G); __syncthreads(); SharedData_Hip data; data.t_id_x = threadIdx.x; data.t_id_y = threadIdx.y; data.t_id_z = threadIdx.z; - data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x; - data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? 
BASIS_Q_1D : 1)]; CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; - for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) { + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { if (BASIS_DIM == 1) { - ReadElementStrided1d(data, elem, 1, BASIS_Q_1D*num_elem, BASIS_Q_1D, d_U, r_U); + ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U); GradTranspose1d(data, r_U, s_B, s_G, r_V); - WriteElementStrided1d(data, elem, 1, BASIS_P_1D*num_elem, BASIS_P_1D, r_V, d_V); + WriteElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); } else if (BASIS_DIM == 2) { - ReadElementStrided2d(data, elem, 1, BASIS_Q_1D*BASIS_Q_1D*num_elem, BASIS_Q_1D*BASIS_Q_1D, d_U, r_U); + ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, + r_U); GradTransposeTensor2d(data, r_U, s_B, s_G, r_V); - WriteElementStrided2d(data, elem, 1, BASIS_P_1D*BASIS_P_1D*num_elem, BASIS_P_1D*BASIS_P_1D, r_V, d_V); + WriteElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); } else if (BASIS_DIM == 3) { - ReadElementStrided3d(data, elem, 1, BASIS_Q_1D*BASIS_Q_1D*BASIS_Q_1D*num_elem, BASIS_Q_1D*BASIS_Q_1D*BASIS_Q_1D, d_U, r_U); + ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, + BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d(data, r_U, s_B, s_G, r_V); - else GradTransposeTensor3d(data, r_U, s_B, s_G, r_V); - WriteElementStrided3d(data, elem, 1, BASIS_P_1D*BASIS_P_1D*BASIS_P_1D*num_elem, BASIS_P_1D*BASIS_P_1D*BASIS_P_1D, r_V, d_V); + else GradTransposeTensor3d(data, r_U, s_B, s_G, r_V); + WriteElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); } } } @@ -183,31 +182,30 
@@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) //------------------------------------------------------------------------------ // Weight kernels by dim //------------------------------------------------------------------------------ -extern "C" __launch_bounds__(BASIS_WEIGHT_BLOCK_SIZE) - __global__ void Weight(const CeedInt num_elem, - const CeedScalar *__restrict__ q_weight_1d, - CeedScalar *__restrict__ d_W) { +extern "C" __launch_bounds__(BASIS_WEIGHT_BLOCK_SIZE) __global__ + void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *__restrict__ d_W) { extern __shared__ CeedScalar slice[]; SharedData_Hip data; data.t_id_x = threadIdx.x; data.t_id_y = threadIdx.y; data.t_id_z = threadIdx.z; - data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x; - data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); CeedScalar r_W[BASIS_DIM > 2 ? 
BASIS_Q_1D : 1]; - for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) { + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { if (BASIS_DIM == 1) { Weight1d(data, q_weight_1d, r_W); - WriteElementStrided1d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D*num_elem, BASIS_Q_1D, r_W, d_W); + WriteElementStrided1d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_W, d_W); } else if (BASIS_DIM == 2) { WeightTensor2d(data, q_weight_1d, r_W); - WriteElementStrided2d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D*BASIS_Q_1D*num_elem, BASIS_Q_1D*BASIS_Q_1D, r_W, d_W); + WriteElementStrided2d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_W, d_W); } else if (BASIS_DIM == 3) { WeightTensor3d(data, q_weight_1d, r_W); - WriteElementStrided3d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D*BASIS_Q_1D*BASIS_Q_1D*num_elem, BASIS_Q_1D*BASIS_Q_1D*BASIS_Q_1D, r_W, d_W); + WriteElementStrided3d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_W, + d_W); } } } diff --git a/include/ceed/jit-source/hip/hip-shared-basis.h b/include/ceed/jit-source/hip/hip-shared-basis.h new file mode 100644 index 0000000000..41d5d2bd63 --- /dev/null +++ b/include/ceed/jit-source/hip/hip-shared-basis.h @@ -0,0 +1,664 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +#include + +//------------------------------------------------------------------------------ +// Shared mem kernels +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// Sum input into output +//------------------------------------------------------------------------------ +inline __device__ void add(CeedScalar *r_V, const CeedScalar *r_U) { + for (CeedInt i = 0; i < BASIS_P_1D; i++) r_V[i] += r_U[i]; +} + +//------------------------------------------------------------------------------ +// Load matrices for basis actions +//------------------------------------------------------------------------------ +inline __device__ void loadMatrix(const CeedScalar *d_B, CeedScalar *B) { + CeedInt tid = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + for (CeedInt i = tid; i < BASIS_P_1D * BASIS_Q_1D; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i]; +} + +//------------------------------------------------------------------------------ +// 1D +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// Read DoFs +//------------------------------------------------------------------------------ +inline __device__ void readDofs1d(const CeedInt elem, const CeedInt t_id_x, const CeedInt t_id_y, const CeedInt t_id_z, const CeedInt comp, + const CeedInt num_elem, const CeedScalar *d_U, CeedScalar *slice) { + for (CeedInt i = 0; i < BASIS_P_1D; i++) slice[i + t_id_z * BASIS_T_1D] = d_U[i + elem * BASIS_P_1D + comp * BASIS_P_1D * num_elem]; + for (CeedInt i = BASIS_P_1D; i < BASIS_Q_1D; i++) slice[i + t_id_z * BASIS_T_1D] = 0.0; +} + +//------------------------------------------------------------------------------ +// 
Write DoFs +//------------------------------------------------------------------------------ +inline __device__ void writeDofs1d(const CeedInt elem, const CeedInt t_id_x, const CeedInt t_id_y, const CeedInt comp, const CeedInt num_elem, + const CeedScalar &r_V, CeedScalar *d_V) { + if (t_id_x < BASIS_P_1D) d_V[t_id_x + elem * BASIS_P_1D + comp * BASIS_P_1D * num_elem] = r_V; +} + +//------------------------------------------------------------------------------ +// Read quadrature point data +//------------------------------------------------------------------------------ +inline __device__ void readQuads1d(const CeedInt elem, const CeedInt t_id_x, const CeedInt t_id_y, const CeedInt t_id_z, const CeedInt comp, + const CeedInt dim, const CeedInt num_elem, const CeedScalar *d_U, CeedScalar *slice) { + for (CeedInt i = 0; i < BASIS_Q_1D; i++) + slice[i + t_id_z * BASIS_T_1D] = d_U[i + elem * BASIS_Q_1D + comp * BASIS_Q_1D * num_elem + dim * BASIS_NUM_COMP * num_elem * BASIS_Q_1D]; + for (CeedInt i = BASIS_Q_1D; i < BASIS_P_1D; i++) slice[i + t_id_z * BASIS_T_1D] = 0.0; +} + +//------------------------------------------------------------------------------ +// Write quadrature point data +//------------------------------------------------------------------------------ +inline __device__ void writeQuads1d(const CeedInt elem, const CeedInt t_id_x, const CeedInt t_id_y, const CeedInt comp, const CeedInt dim, + const CeedInt num_elem, const CeedScalar &r_V, CeedScalar *d_V) { + if (t_id_x < BASIS_Q_1D) d_V[t_id_x + elem * BASIS_Q_1D + comp * BASIS_Q_1D * num_elem + dim * BASIS_NUM_COMP * num_elem * BASIS_Q_1D] = r_V; +} + +//------------------------------------------------------------------------------ +// 1D tensor contraction +//------------------------------------------------------------------------------ +inline __device__ void ContractX1d(CeedScalar *slice, const CeedInt t_id_x, const CeedInt t_id_y, const CeedInt t_id_z, const CeedScalar &U, + const CeedScalar *B, 
CeedScalar &V) { + V = 0.0; + for (CeedInt i = 0; i < BASIS_P_1D; i++) V += B[i + t_id_x * BASIS_P_1D] * slice[i + t_id_z * BASIS_T_1D]; // Contract x direction +} + +//------------------------------------------------------------------------------ +// 1D transpose tensor contraction +//------------------------------------------------------------------------------ +inline __device__ void ContractTransposeX1d(CeedScalar *slice, const CeedInt t_id_x, const CeedInt t_id_y, const CeedInt t_id_z, const CeedScalar &U, + const CeedScalar *B, CeedScalar &V) { + V = 0.0; + for (CeedInt i = 0; i < BASIS_Q_1D; i++) V += B[t_id_x + i * BASIS_P_1D] * slice[i + t_id_z * BASIS_T_1D]; // Contract x direction +} + +//------------------------------------------------------------------------------ +// 1D interpolate to quadrature points +//------------------------------------------------------------------------------ +inline __device__ void interp1d(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *s_B, const CeedScalar *__restrict__ d_U, + CeedScalar *__restrict__ d_V, CeedScalar *slice) { + CeedScalar r_V; + CeedScalar r_t; + + const CeedInt t_id_x = threadIdx.x; + const CeedInt t_id_y = threadIdx.y; + const CeedInt t_id_z = threadIdx.z; + + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { + if (transpose) { + readQuads1d(elem, t_id_x, t_id_y, t_id_z, comp, 0, num_elem, d_U, slice); + ContractTransposeX1d(slice, t_id_x, t_id_y, t_id_z, r_t, s_B, r_V); + writeDofs1d(elem, t_id_x, t_id_y, comp, num_elem, r_V, d_V); + } else { + readDofs1d(elem, t_id_x, t_id_y, t_id_z, comp, num_elem, d_U, slice); + ContractX1d(slice, t_id_x, t_id_y, t_id_z, r_t, s_B, r_V); + writeQuads1d(elem, t_id_x, t_id_y, comp, 0, num_elem, r_V, d_V); + } + } + } +} + +//------------------------------------------------------------------------------ +// 1D derivatives at quadrature 
points +//------------------------------------------------------------------------------ +inline __device__ void grad1d(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *s_B, const CeedScalar *s_G, + const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V, CeedScalar *slice) { + CeedScalar r_U; + CeedScalar r_V; + + const CeedInt t_id_x = threadIdx.x; + const CeedInt t_id_y = threadIdx.y; + const CeedInt t_id_z = threadIdx.z; + int dim; + + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { + if (transpose) { + dim = 0; + readQuads1d(elem, t_id_x, t_id_y, t_id_z, comp, dim, num_elem, d_U, slice); + ContractTransposeX1d(slice, t_id_x, t_id_y, t_id_z, r_U, s_G, r_V); + writeDofs1d(elem, t_id_x, t_id_y, comp, num_elem, r_V, d_V); + } else { + readDofs1d(elem, t_id_x, t_id_y, t_id_z, comp, num_elem, d_U, slice); + ContractX1d(slice, t_id_x, t_id_y, t_id_z, r_U, s_G, r_V); + dim = 0; + writeQuads1d(elem, t_id_x, t_id_y, comp, dim, num_elem, r_V, d_V); + } + } + } +} + +//------------------------------------------------------------------------------ +// 1D Quadrature weights +//------------------------------------------------------------------------------ +__device__ void weight1d(const CeedInt num_elem, const CeedScalar *q_weight_1d, CeedScalar *w) { + const CeedInt tid = threadIdx.x; + const CeedScalar weight = q_weight_1d[tid]; + for (CeedInt elem = blockIdx.x * blockDim.y + threadIdx.y; elem < num_elem; elem += gridDim.x * blockDim.y) { + const CeedInt ind = elem * BASIS_Q_1D + tid; + w[ind] = weight; + } +} + +//------------------------------------------------------------------------------ +// 2D +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// Read DoFs 
+//------------------------------------------------------------------------------ +inline __device__ void readDofs2d(const CeedInt elem, const CeedInt t_id_x, const CeedInt t_id_y, const CeedInt comp, const CeedInt num_elem, + const CeedScalar *d_U, CeedScalar &U) { + U = (t_id_x < BASIS_P_1D && t_id_y < BASIS_P_1D) + ? d_U[t_id_x + t_id_y * BASIS_P_1D + elem * BASIS_P_1D * BASIS_P_1D + comp * BASIS_P_1D * BASIS_P_1D * num_elem] + : 0.0; +} + +//------------------------------------------------------------------------------ +// Write DoFs +//------------------------------------------------------------------------------ +inline __device__ void writeDofs2d(const CeedInt elem, const CeedInt t_id_x, const CeedInt t_id_y, const CeedInt comp, const CeedInt num_elem, + const CeedScalar &r_V, CeedScalar *d_V) { + if (t_id_x < BASIS_P_1D && t_id_y < BASIS_P_1D) + d_V[t_id_x + t_id_y * BASIS_P_1D + elem * BASIS_P_1D * BASIS_P_1D + comp * BASIS_P_1D * BASIS_P_1D * num_elem] = r_V; +} + +//------------------------------------------------------------------------------ +// Read quadrature point data +//------------------------------------------------------------------------------ +inline __device__ void readQuads2d(const CeedInt elem, const CeedInt t_id_x, const CeedInt t_id_y, const CeedInt comp, const CeedInt dim, + const CeedInt num_elem, const CeedScalar *d_U, CeedScalar &U) { + U = (t_id_x < BASIS_Q_1D && t_id_y < BASIS_Q_1D) + ? 
d_U[t_id_x + t_id_y * BASIS_Q_1D + elem * BASIS_Q_1D * BASIS_Q_1D + comp * BASIS_Q_1D * BASIS_Q_1D * num_elem + + dim * BASIS_NUM_COMP * num_elem * BASIS_Q_1D * BASIS_Q_1D] + : 0.0; +} + +//------------------------------------------------------------------------------ +// Write quadrature point data +//------------------------------------------------------------------------------ +inline __device__ void writeQuads2d(const CeedInt elem, const CeedInt t_id_x, const CeedInt t_id_y, const CeedInt comp, const CeedInt dim, + const CeedInt num_elem, const CeedScalar &r_V, CeedScalar *d_V) { + if (t_id_x < BASIS_Q_1D && t_id_y < BASIS_Q_1D) + d_V[t_id_x + t_id_y * BASIS_Q_1D + elem * BASIS_Q_1D * BASIS_Q_1D + comp * BASIS_Q_1D * BASIS_Q_1D * num_elem + + dim * BASIS_NUM_COMP * num_elem * BASIS_Q_1D * BASIS_Q_1D] = r_V; +} + +//------------------------------------------------------------------------------ +// 2D tensor contraction x +//------------------------------------------------------------------------------ +inline __device__ void ContractX2d(CeedScalar *slice, const CeedInt t_id_x, const CeedInt t_id_y, const CeedInt t_id_z, const CeedScalar &U, + const CeedScalar *B, CeedScalar &V) { + slice[t_id_x + t_id_y * BASIS_T_1D + t_id_z * BASIS_T_1D * BASIS_T_1D] = U; + __syncthreads(); + V = 0.0; + if (t_id_x < BASIS_Q_1D) + for (CeedInt i = 0; i < BASIS_P_1D; i++) + V += B[i + t_id_x * BASIS_P_1D] * slice[i + t_id_y * BASIS_T_1D + t_id_z * BASIS_T_1D * BASIS_T_1D]; // Contract x direction + __syncthreads(); +} + +//------------------------------------------------------------------------------ +// 2D tensor contraction y +//------------------------------------------------------------------------------ +inline __device__ void ContractY2d(CeedScalar *slice, const CeedInt t_id_x, const CeedInt t_id_y, const CeedInt t_id_z, const CeedScalar &U, + const CeedScalar *B, CeedScalar &V) { + slice[t_id_x + t_id_y * BASIS_T_1D + t_id_z * BASIS_T_1D * BASIS_T_1D] = U; + 
__syncthreads(); + V = 0.0; + if (t_id_y < BASIS_Q_1D) + for (CeedInt i = 0; i < BASIS_P_1D; i++) + V += B[i + t_id_y * BASIS_P_1D] * slice[t_id_x + i * BASIS_T_1D + t_id_z * BASIS_T_1D * BASIS_T_1D]; // Contract y direction + __syncthreads(); +} + +//------------------------------------------------------------------------------ +// 2D transpose tensor contraction y +//------------------------------------------------------------------------------ +inline __device__ void ContractTransposeY2d(CeedScalar *slice, const CeedInt t_id_x, const CeedInt t_id_y, const CeedInt t_id_z, const CeedScalar &U, + const CeedScalar *B, CeedScalar &V) { + slice[t_id_x + t_id_y * BASIS_T_1D + t_id_z * BASIS_T_1D * BASIS_T_1D] = U; + __syncthreads(); + V = 0.0; + if (t_id_y < BASIS_P_1D) + for (CeedInt i = 0; i < BASIS_Q_1D; i++) + V += B[t_id_y + i * BASIS_P_1D] * slice[t_id_x + i * BASIS_T_1D + t_id_z * BASIS_T_1D * BASIS_T_1D]; // Contract y direction + __syncthreads(); +} + +//------------------------------------------------------------------------------ +// 2D transpose tensor contraction x +//------------------------------------------------------------------------------ +inline __device__ void ContractTransposeX2d(CeedScalar *slice, const CeedInt t_id_x, const CeedInt t_id_y, const CeedInt t_id_z, const CeedScalar &U, + const CeedScalar *B, CeedScalar &V) { + slice[t_id_x + t_id_y * BASIS_T_1D + t_id_z * BASIS_T_1D * BASIS_T_1D] = U; + __syncthreads(); + V = 0.0; + if (t_id_x < BASIS_P_1D) + for (CeedInt i = 0; i < BASIS_Q_1D; i++) + V += B[t_id_x + i * BASIS_P_1D] * slice[i + t_id_y * BASIS_T_1D + t_id_z * BASIS_T_1D * BASIS_T_1D]; // Contract x direction + __syncthreads(); +} + +//------------------------------------------------------------------------------ +// 2D interpolate to quadrature points +//------------------------------------------------------------------------------ +inline __device__ void interp2d(const CeedInt num_elem, const CeedInt transpose, const CeedScalar 
*s_B, const CeedScalar *__restrict__ d_U, + CeedScalar *__restrict__ d_V, CeedScalar *slice) { + CeedScalar r_V; + CeedScalar r_t; + + const CeedInt t_id_x = threadIdx.x; + const CeedInt t_id_y = threadIdx.y; + const CeedInt t_id_z = threadIdx.z; + const CeedInt blockElem = t_id_z / BASIS_NUM_COMP; + const CeedInt elemsPerBlock = blockDim.z / BASIS_NUM_COMP; + const CeedInt comp = t_id_z % BASIS_NUM_COMP; + + for (CeedInt elem = blockIdx.x * elemsPerBlock + blockElem; elem < num_elem; elem += gridDim.x * elemsPerBlock) { + const CeedInt comp = t_id_z % BASIS_NUM_COMP; + r_V = 0.0; + r_t = 0.0; + if (transpose) { + readQuads2d(elem, t_id_x, t_id_y, comp, 0, num_elem, d_U, r_V); + ContractTransposeY2d(slice, t_id_x, t_id_y, t_id_z, r_V, s_B, r_t); + ContractTransposeX2d(slice, t_id_x, t_id_y, t_id_z, r_t, s_B, r_V); + writeDofs2d(elem, t_id_x, t_id_y, comp, num_elem, r_V, d_V); + } else { + readDofs2d(elem, t_id_x, t_id_y, comp, num_elem, d_U, r_V); + ContractX2d(slice, t_id_x, t_id_y, t_id_z, r_V, s_B, r_t); + ContractY2d(slice, t_id_x, t_id_y, t_id_z, r_t, s_B, r_V); + writeQuads2d(elem, t_id_x, t_id_y, comp, 0, num_elem, r_V, d_V); + } + } +} + +//------------------------------------------------------------------------------ +// 2D derivatives at quadrature points +//------------------------------------------------------------------------------ +inline __device__ void grad2d(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *s_B, const CeedScalar *s_G, + const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V, CeedScalar *slice) { + CeedScalar r_U; + CeedScalar r_V; + CeedScalar r_t; + + const CeedInt t_id_x = threadIdx.x; + const CeedInt t_id_y = threadIdx.y; + const CeedInt t_id_z = threadIdx.z; + const CeedInt blockElem = t_id_z / BASIS_NUM_COMP; + const CeedInt elemsPerBlock = blockDim.z / BASIS_NUM_COMP; + const CeedInt comp = t_id_z % BASIS_NUM_COMP; + int dim; + + for (CeedInt elem = blockIdx.x * elemsPerBlock + blockElem; elem < 
num_elem; elem += gridDim.x * elemsPerBlock) { + if (transpose) { + dim = 0; + readQuads2d(elem, t_id_x, t_id_y, comp, dim, num_elem, d_U, r_U); + ContractTransposeY2d(slice, t_id_x, t_id_y, t_id_z, r_U, s_B, r_t); + ContractTransposeX2d(slice, t_id_x, t_id_y, t_id_z, r_t, s_G, r_V); + dim = 1; + readQuads2d(elem, t_id_x, t_id_y, comp, dim, num_elem, d_U, r_U); + ContractTransposeY2d(slice, t_id_x, t_id_y, t_id_z, r_U, s_G, r_t); + ContractTransposeX2d(slice, t_id_x, t_id_y, t_id_z, r_t, s_B, r_U); + r_V += r_U; + writeDofs2d(elem, t_id_x, t_id_y, comp, num_elem, r_V, d_V); + } else { + readDofs2d(elem, t_id_x, t_id_y, comp, num_elem, d_U, r_U); + ContractX2d(slice, t_id_x, t_id_y, t_id_z, r_U, s_G, r_t); + ContractY2d(slice, t_id_x, t_id_y, t_id_z, r_t, s_B, r_V); + dim = 0; + writeQuads2d(elem, t_id_x, t_id_y, comp, dim, num_elem, r_V, d_V); + ContractX2d(slice, t_id_x, t_id_y, t_id_z, r_U, s_B, r_t); + ContractY2d(slice, t_id_x, t_id_y, t_id_z, r_t, s_G, r_V); + dim = 1; + writeQuads2d(elem, t_id_x, t_id_y, comp, dim, num_elem, r_V, d_V); + } + } +} + +//------------------------------------------------------------------------------ +// 2D quadrature weights +//------------------------------------------------------------------------------ +__device__ void weight2d(const CeedInt num_elem, const CeedScalar *q_weight_1d, CeedScalar *w) { + const CeedInt i = threadIdx.x; + const CeedInt j = threadIdx.y; + const CeedScalar weight = q_weight_1d[i] * q_weight_1d[j]; + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + const CeedInt ind = elem * BASIS_Q_1D * BASIS_Q_1D + i + j * BASIS_Q_1D; + w[ind] = weight; + } +} + +//------------------------------------------------------------------------------ +// 3D +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// Read DoFs 
+//------------------------------------------------------------------------------ +inline __device__ void readDofs3d(const CeedInt elem, const CeedInt t_id_x, const CeedInt t_id_y, const CeedInt comp, const CeedInt num_elem, + const CeedScalar *d_U, CeedScalar *r_U) { + for (CeedInt i = 0; i < BASIS_P_1D; i++) + r_U[i] = (t_id_x < BASIS_P_1D && t_id_y < BASIS_P_1D) + ? d_U[t_id_x + t_id_y * BASIS_P_1D + i * BASIS_P_1D * BASIS_P_1D + elem * BASIS_P_1D * BASIS_P_1D * BASIS_P_1D + + comp * BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem] + : 0.0; + for (CeedInt i = BASIS_P_1D; i < BASIS_Q_1D; i++) r_U[i] = 0.0; +} + +//------------------------------------------------------------------------------ +// Write DoFs +//------------------------------------------------------------------------------ +inline __device__ void writeDofs3d(const CeedInt elem, const CeedInt t_id_x, const CeedInt t_id_y, const CeedInt comp, const CeedInt num_elem, + const CeedScalar *r_V, CeedScalar *d_V) { + if (t_id_x < BASIS_P_1D && t_id_y < BASIS_P_1D) { + for (CeedInt i = 0; i < BASIS_P_1D; i++) + d_V[t_id_x + t_id_y * BASIS_P_1D + i * BASIS_P_1D * BASIS_P_1D + elem * BASIS_P_1D * BASIS_P_1D * BASIS_P_1D + + comp * BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem] = r_V[i]; + } +} + +//------------------------------------------------------------------------------ +// Read quadrature point data +//------------------------------------------------------------------------------ +inline __device__ void readQuads3d(const CeedInt elem, const CeedInt t_id_x, const CeedInt t_id_y, const CeedInt comp, const CeedInt dim, + const CeedInt num_elem, const CeedScalar *d_U, CeedScalar *r_U) { + for (CeedInt i = 0; i < BASIS_Q_1D; i++) + r_U[i] = + (t_id_x < BASIS_Q_1D && t_id_y < BASIS_Q_1D) + ? 
d_U[t_id_x + t_id_y * BASIS_Q_1D + i * BASIS_Q_1D * BASIS_Q_1D + elem * BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D + + comp * BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem + dim * BASIS_NUM_COMP * num_elem * BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D] + : 0.0; + for (CeedInt i = BASIS_Q_1D; i < BASIS_P_1D; i++) r_U[i] = 0.0; +} + +//------------------------------------------------------------------------------ +// Write quadrature point data +//------------------------------------------------------------------------------ +inline __device__ void writeQuads3d(const CeedInt elem, const CeedInt t_id_x, const CeedInt t_id_y, const CeedInt comp, const CeedInt dim, + const CeedInt num_elem, const CeedScalar *r_V, CeedScalar *d_V) { + if (t_id_x < BASIS_Q_1D && t_id_y < BASIS_Q_1D) { + for (CeedInt i = 0; i < BASIS_Q_1D; i++) + d_V[t_id_x + t_id_y * BASIS_Q_1D + i * BASIS_Q_1D * BASIS_Q_1D + elem * BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D + + comp * BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem + dim * BASIS_NUM_COMP * num_elem * BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D] = r_V[i]; + } +} + +//------------------------------------------------------------------------------ +// 3D tensor contract x +//------------------------------------------------------------------------------ +inline __device__ void ContractX3d(CeedScalar *slice, const CeedInt t_id_x, const CeedInt t_id_y, const CeedInt t_id_z, const CeedScalar *U, + const CeedScalar *B, CeedScalar *V) { + for (CeedInt k = 0; k < BASIS_P_1D; k++) { + slice[t_id_x + t_id_y * BASIS_T_1D + t_id_z * BASIS_T_1D * BASIS_T_1D] = U[k]; + __syncthreads(); + V[k] = 0.0; + if (t_id_x < BASIS_Q_1D && t_id_y < BASIS_P_1D) + for (CeedInt i = 0; i < BASIS_P_1D; i++) + V[k] += B[i + t_id_x * BASIS_P_1D] * slice[i + t_id_y * BASIS_T_1D + t_id_z * BASIS_T_1D * BASIS_T_1D]; // Contract x direction + __syncthreads(); + } +} + +//------------------------------------------------------------------------------ +// 3D tensor contract y 
+//------------------------------------------------------------------------------ +inline __device__ void ContractY3d(CeedScalar *slice, const CeedInt t_id_x, const CeedInt t_id_y, const CeedInt t_id_z, const CeedScalar *U, + const CeedScalar *B, CeedScalar *V) { + for (CeedInt k = 0; k < BASIS_P_1D; k++) { + slice[t_id_x + t_id_y * BASIS_T_1D + t_id_z * BASIS_T_1D * BASIS_T_1D] = U[k]; + __syncthreads(); + V[k] = 0.0; + if (t_id_x < BASIS_Q_1D && t_id_y < BASIS_Q_1D) + for (CeedInt i = 0; i < BASIS_P_1D; i++) + V[k] += B[i + t_id_y * BASIS_P_1D] * slice[t_id_x + i * BASIS_T_1D + t_id_z * BASIS_T_1D * BASIS_T_1D]; // Contract y direction + __syncthreads(); + } +} + +//------------------------------------------------------------------------------ +// 3D tensor contract z +//------------------------------------------------------------------------------ +inline __device__ void ContractZ3d(CeedScalar *slice, const CeedInt t_id_x, const CeedInt t_id_y, const CeedInt t_id_z, const CeedScalar *U, + const CeedScalar *B, CeedScalar *V) { + for (CeedInt k = 0; k < BASIS_Q_1D; k++) { + V[k] = 0.0; + if (t_id_x < BASIS_Q_1D && t_id_y < BASIS_Q_1D) + for (CeedInt i = 0; i < BASIS_P_1D; i++) V[k] += B[i + k * BASIS_P_1D] * U[i]; // Contract z direction + } + for (CeedInt k = BASIS_Q_1D; k < BASIS_P_1D; k++) V[k] = 0.0; +} + +//------------------------------------------------------------------------------ +// 3D transpose tensor contract z +//------------------------------------------------------------------------------ +inline __device__ void ContractTransposeZ3d(CeedScalar *slice, const CeedInt t_id_x, const CeedInt t_id_y, const CeedInt t_id_z, const CeedScalar *U, + const CeedScalar *B, CeedScalar *V) { + for (CeedInt k = 0; k < BASIS_P_1D; k++) { + V[k] = 0.0; + if (t_id_x < BASIS_Q_1D && t_id_y < BASIS_Q_1D) + for (CeedInt i = 0; i < BASIS_Q_1D; i++) V[k] += B[k + i * BASIS_P_1D] * U[i]; // Contract z direction + } + for (CeedInt k = BASIS_P_1D; k < BASIS_Q_1D; k++) V[k] = 
0.0; +} + +//------------------------------------------------------------------------------ +// 3D transpose tensor contract y +//------------------------------------------------------------------------------ +inline __device__ void ContractTransposeY3d(CeedScalar *slice, const CeedInt t_id_x, const CeedInt t_id_y, const CeedInt t_id_z, const CeedScalar *U, + const CeedScalar *B, CeedScalar *V) { + for (CeedInt k = 0; k < BASIS_P_1D; k++) { + slice[t_id_x + t_id_y * BASIS_T_1D + t_id_z * BASIS_T_1D * BASIS_T_1D] = U[k]; + __syncthreads(); + V[k] = 0.0; + if (t_id_x < BASIS_Q_1D && t_id_y < BASIS_P_1D) + for (CeedInt i = 0; i < BASIS_Q_1D; i++) + V[k] += B[t_id_y + i * BASIS_P_1D] * slice[t_id_x + i * BASIS_T_1D + t_id_z * BASIS_T_1D * BASIS_T_1D]; // Contract y direction + __syncthreads(); + } +} + +//------------------------------------------------------------------------------ +// 3D transpose tensor contract x +//------------------------------------------------------------------------------ +inline __device__ void ContractTransposeX3d(CeedScalar *slice, const CeedInt t_id_x, const CeedInt t_id_y, const CeedInt t_id_z, const CeedScalar *U, + const CeedScalar *B, CeedScalar *V) { + for (CeedInt k = 0; k < BASIS_P_1D; k++) { + slice[t_id_x + t_id_y * BASIS_T_1D + t_id_z * BASIS_T_1D * BASIS_T_1D] = U[k]; + __syncthreads(); + V[k] = 0.0; + if (t_id_x < BASIS_P_1D && t_id_y < BASIS_P_1D) + for (CeedInt i = 0; i < BASIS_Q_1D; i++) + V[k] += B[t_id_x + i * BASIS_P_1D] * slice[i + t_id_y * BASIS_T_1D + t_id_z * BASIS_T_1D * BASIS_T_1D]; // Contract x direction + __syncthreads(); + } +} + +//------------------------------------------------------------------------------ +// 3D interpolate to quadrature points +//------------------------------------------------------------------------------ +inline __device__ void interp3d(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *s_B, const CeedScalar *__restrict__ d_U, + CeedScalar *__restrict__ d_V, CeedScalar 
*slice) { + CeedScalar r_V[BASIS_T_1D]; + CeedScalar r_t[BASIS_T_1D]; + + const CeedInt t_id_x = threadIdx.x; + const CeedInt t_id_y = threadIdx.y; + const CeedInt t_id_z = threadIdx.z; + const CeedInt blockElem = t_id_z / BASIS_NUM_COMP; + const CeedInt elemsPerBlock = blockDim.z / BASIS_NUM_COMP; + const CeedInt comp = t_id_z % BASIS_NUM_COMP; + + for (CeedInt elem = blockIdx.x * elemsPerBlock + blockElem; elem < num_elem; elem += gridDim.x * elemsPerBlock) { + for (CeedInt i = 0; i < BASIS_T_1D; i++) { + r_V[i] = 0.0; + r_t[i] = 0.0; + } + if (transpose) { + readQuads3d(elem, t_id_x, t_id_y, comp, 0, num_elem, d_U, r_V); + ContractTransposeZ3d(slice, t_id_x, t_id_y, t_id_z, r_V, s_B, r_t); + ContractTransposeY3d(slice, t_id_x, t_id_y, t_id_z, r_t, s_B, r_V); + ContractTransposeX3d(slice, t_id_x, t_id_y, t_id_z, r_V, s_B, r_t); + writeDofs3d(elem, t_id_x, t_id_y, comp, num_elem, r_t, d_V); + } else { + readDofs3d(elem, t_id_x, t_id_y, comp, num_elem, d_U, r_V); + ContractX3d(slice, t_id_x, t_id_y, t_id_z, r_V, s_B, r_t); + ContractY3d(slice, t_id_x, t_id_y, t_id_z, r_t, s_B, r_V); + ContractZ3d(slice, t_id_x, t_id_y, t_id_z, r_V, s_B, r_t); + writeQuads3d(elem, t_id_x, t_id_y, comp, 0, num_elem, r_t, d_V); + } + } +} + +//------------------------------------------------------------------------------ +// 3D derivatives at quadrature points +//------------------------------------------------------------------------------ +inline __device__ void grad3d(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *s_B, const CeedScalar *s_G, + const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V, CeedScalar *slice) { + // Use BASIS_P_1D for one of these + CeedScalar r_U[BASIS_T_1D]; + CeedScalar r_V[BASIS_T_1D]; + CeedScalar r_t[BASIS_T_1D]; + + const CeedInt t_id_x = threadIdx.x; + const CeedInt t_id_y = threadIdx.y; + const CeedInt t_id_z = threadIdx.z; + const CeedInt blockElem = t_id_z / BASIS_NUM_COMP; + const CeedInt elemsPerBlock = blockDim.z 
/ BASIS_NUM_COMP; + const CeedInt comp = t_id_z % BASIS_NUM_COMP; + int dim; + + for (CeedInt elem = blockIdx.x * elemsPerBlock + blockElem; elem < num_elem; elem += gridDim.x * elemsPerBlock) { + for (CeedInt i = 0; i < BASIS_T_1D; i++) { + r_U[i] = 0.0; + r_V[i] = 0.0; + r_t[i] = 0.0; + } + if (transpose) { + dim = 0; + readQuads3d(elem, t_id_x, t_id_y, comp, dim, num_elem, d_U, r_U); + ContractTransposeZ3d(slice, t_id_x, t_id_y, t_id_z, r_U, s_B, r_t); + ContractTransposeY3d(slice, t_id_x, t_id_y, t_id_z, r_t, s_B, r_U); + ContractTransposeX3d(slice, t_id_x, t_id_y, t_id_z, r_U, s_G, r_V); + dim = 1; + readQuads3d(elem, t_id_x, t_id_y, comp, dim, num_elem, d_U, r_U); + ContractTransposeZ3d(slice, t_id_x, t_id_y, t_id_z, r_U, s_B, r_t); + ContractTransposeY3d(slice, t_id_x, t_id_y, t_id_z, r_t, s_G, r_U); + ContractTransposeX3d(slice, t_id_x, t_id_y, t_id_z, r_U, s_B, r_t); + add(r_V, r_t); + dim = 2; + readQuads3d(elem, t_id_x, t_id_y, comp, dim, num_elem, d_U, r_U); + ContractTransposeZ3d(slice, t_id_x, t_id_y, t_id_z, r_U, s_G, r_t); + ContractTransposeY3d(slice, t_id_x, t_id_y, t_id_z, r_t, s_B, r_U); + ContractTransposeX3d(slice, t_id_x, t_id_y, t_id_z, r_U, s_B, r_t); + add(r_V, r_t); + writeDofs3d(elem, t_id_x, t_id_y, comp, num_elem, r_V, d_V); + } else { + readDofs3d(elem, t_id_x, t_id_y, comp, num_elem, d_U, r_U); + ContractX3d(slice, t_id_x, t_id_y, t_id_z, r_U, s_G, r_V); + ContractY3d(slice, t_id_x, t_id_y, t_id_z, r_V, s_B, r_t); + ContractZ3d(slice, t_id_x, t_id_y, t_id_z, r_t, s_B, r_V); + dim = 0; + writeQuads3d(elem, t_id_x, t_id_y, comp, dim, num_elem, r_V, d_V); + ContractX3d(slice, t_id_x, t_id_y, t_id_z, r_U, s_B, r_V); + ContractY3d(slice, t_id_x, t_id_y, t_id_z, r_V, s_G, r_t); + ContractZ3d(slice, t_id_x, t_id_y, t_id_z, r_t, s_B, r_V); + dim = 1; + writeQuads3d(elem, t_id_x, t_id_y, comp, dim, num_elem, r_V, d_V); + ContractX3d(slice, t_id_x, t_id_y, t_id_z, r_U, s_B, r_V); + ContractY3d(slice, t_id_x, t_id_y, t_id_z, r_V, s_B, r_t); + 
ContractZ3d(slice, t_id_x, t_id_y, t_id_z, r_t, s_G, r_V); + dim = 2; + writeQuads3d(elem, t_id_x, t_id_y, comp, dim, num_elem, r_V, d_V); + } + } +} + +//------------------------------------------------------------------------------ +// 3D quadrature weights +//------------------------------------------------------------------------------ +__device__ void weight3d(const CeedInt num_elem, const CeedScalar *q_weight_1d, CeedScalar *w) { + const CeedInt i = threadIdx.x; + const CeedInt j = threadIdx.y; + const CeedInt k = threadIdx.z; + const CeedScalar weight = q_weight_1d[i] * q_weight_1d[j] * q_weight_1d[k]; + for (CeedInt e = blockIdx.x; e < num_elem; e += gridDim.x) { + const CeedInt ind = e * BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D + i + j * BASIS_Q_1D + k * BASIS_Q_1D * BASIS_Q_1D; + w[ind] = weight; + } +} + +//------------------------------------------------------------------------------ +// Basis kernels +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// Interp kernel by dim +//------------------------------------------------------------------------------ +extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ void Interp(const CeedInt num_elem, const CeedInt transpose, CeedScalar *d_interp_1d, + const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { + HIP_DYNAMIC_SHARED(CeedScalar, slice) + // load interp_1d into shared memory + __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; + loadMatrix(d_interp_1d, s_B); + __syncthreads(); + + if (BASIS_DIM == 1) { + interp1d(num_elem, transpose, s_B, d_U, d_V, slice); + } else if (BASIS_DIM == 2) { + interp2d(num_elem, transpose, s_B, d_U, d_V, slice); + } else if (BASIS_DIM == 3) { + interp3d(num_elem, transpose, s_B, d_U, d_V, slice); + } +} + +//------------------------------------------------------------------------------ +// Grad kernel by dim 
+//------------------------------------------------------------------------------ +extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ + void Grad(const CeedInt num_elem, const CeedInt transpose, CeedScalar *d_interp_1d, CeedScalar *d_grad_1d, const CeedScalar *__restrict__ d_U, + CeedScalar *__restrict__ d_V) { + HIP_DYNAMIC_SHARED(CeedScalar, slice) + // load interp_1d and grad_1d into shared memory + __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; + loadMatrix(d_interp_1d, s_B); + __shared__ CeedScalar s_G[BASIS_P_1D * BASIS_Q_1D]; + loadMatrix(d_grad_1d, s_G); + __syncthreads(); + + if (BASIS_DIM == 1) { + grad1d(num_elem, transpose, s_B, s_G, d_U, d_V, slice); + } else if (BASIS_DIM == 2) { + grad2d(num_elem, transpose, s_B, s_G, d_U, d_V, slice); + } else if (BASIS_DIM == 3) { + grad3d(num_elem, transpose, s_B, s_G, d_U, d_V, slice); + } +} + +//------------------------------------------------------------------------------ +// Weight kernels by dim +//------------------------------------------------------------------------------ +extern "C" __launch_bounds__(BASIS_WEIGHT_BLOCK_SIZE) __global__ + void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *__restrict__ v) { + if (BASIS_DIM == 1) { + weight1d(num_elem, q_weight_1d, v); + } else if (BASIS_DIM == 2) { + weight2d(num_elem, q_weight_1d, v); + } else if (BASIS_DIM == 3) { + weight3d(num_elem, q_weight_1d, v); + } +} + +//------------------------------------------------------------------------------ diff --git a/include/ceed/jit-source/hip/hip-types.h b/include/ceed/jit-source/hip/hip-types.h index 180e5686ab..31c4de5a01 100644 --- a/include/ceed/jit-source/hip/hip-types.h +++ b/include/ceed/jit-source/hip/hip-types.h @@ -16,7 +16,7 @@ typedef struct { const CeedScalar* inputs[CEED_HIP_NUMBER_FIELDS]; - CeedScalar* outputs[CEED_HIP_NUMBER_FIELDS]; + CeedScalar* outputs[CEED_HIP_NUMBER_FIELDS]; } Fields_Hip; typedef struct { @@ -25,10 +25,10 @@ typedef 
struct { } FieldsInt_Hip; typedef struct { - CeedInt t_id_x; - CeedInt t_id_y; - CeedInt t_id_z; - CeedInt t_id; + CeedInt t_id_x; + CeedInt t_id_y; + CeedInt t_id_z; + CeedInt t_id; CeedScalar* slice; } SharedData_Hip; diff --git a/include/ceed/jit-source/magma/elem_restriction.h b/include/ceed/jit-source/magma/elem_restriction.h index 344f294fdf..d4ab6267b7 100644 --- a/include/ceed/jit-source/magma/elem_restriction.h +++ b/include/ceed/jit-source/magma/elem_restriction.h @@ -16,20 +16,18 @@ // c: component // Go from L-vector (du) to E-vector (dv): // -// dv(i, e, c) = du( offsets(i, e) + compstride * c) -extern "C" __launch_bounds__(MAGMA_ERSTR_THREADS) __global__ void -magma_readDofsOffset_kernel(const int NCOMP, const int compstride, - const int esize, const int nelem, int *offsets, - const CeedScalar *du, CeedScalar *dv) -{ - const int pid = threadIdx.x; +// dv(i, e, c) = du( offsets(i, e) + compstride * c) +extern "C" __launch_bounds__(MAGMA_ERSTR_THREADS) __global__ + void magma_readDofsOffset_kernel(const int NCOMP, const int compstride, const int esize, const int nelem, int *offsets, const CeedScalar *du, + CeedScalar *dv) { + const int pid = threadIdx.x; const int elem = blockIdx.x; - + for (CeedInt i = pid; i < esize; i += blockDim.x) { - const CeedInt ind = offsets ? offsets[i + elem * esize] : i + elem * esize; - for (CeedInt comp = 0; comp < NCOMP; ++comp) { - dv[i+elem*esize+comp*esize*nelem] = du[ind + compstride * comp]; - } + const CeedInt ind = offsets ? 
offsets[i + elem * esize] : i + elem * esize; + for (CeedInt comp = 0; comp < NCOMP; ++comp) { + dv[i + elem * esize + comp * esize * nelem] = du[ind + compstride * comp]; + } } } @@ -38,23 +36,19 @@ magma_readDofsOffset_kernel(const int NCOMP, const int compstride, // i : related to nodes // e : elements // c: component -// Go from L-vector (du) to E-vector (dv), with strides provided +// Go from L-vector (du) to E-vector (dv), with strides provided // to describe the L-vector layout // -// dv(i, e, c) = du( i * strides[0] + c * strides[1] + e * strides[2] ) -extern "C" __launch_bounds__(MAGMA_ERSTR_THREADS) __global__ void -magma_readDofsStrided_kernel(const int NCOMP, const int esize, const int nelem, - const int *strides, const CeedScalar *du, CeedScalar *dv) -{ - const int pid = threadIdx.x; +// dv(i, e, c) = du( i * strides[0] + c * strides[1] + e * strides[2] ) +extern "C" __launch_bounds__(MAGMA_ERSTR_THREADS) __global__ + void magma_readDofsStrided_kernel(const int NCOMP, const int esize, const int nelem, const int *strides, const CeedScalar *du, CeedScalar *dv) { + const int pid = threadIdx.x; const int elem = blockIdx.x; - + for (CeedInt i = pid; i < esize; i += blockDim.x) { - for (CeedInt comp = 0; comp < NCOMP; ++comp) { - dv[i+elem*esize+comp*esize*nelem] = du[i * strides[0] + - comp * strides[1] + - elem * strides[2]]; - } + for (CeedInt comp = 0; comp < NCOMP; ++comp) { + dv[i + elem * esize + comp * esize * nelem] = du[i * strides[0] + comp * strides[1] + elem * strides[2]]; + } } } @@ -65,45 +59,38 @@ magma_readDofsStrided_kernel(const int NCOMP, const int esize, const int nelem, // Go from E-vector (du) to L-vector (dv): // // dv(offsets(i, e) + compstride * c) = du(i, e, c) -extern "C" __launch_bounds__(MAGMA_ERSTR_THREADS) __global__ void -magma_writeDofsOffset_kernel(const int NCOMP, const int compstride, - const int esize, const int nelem, int *offsets, - const CeedScalar *du, CeedScalar *dv) -{ - const int pid = threadIdx.x; - const int 
elem = blockIdx.x; +extern "C" __launch_bounds__(MAGMA_ERSTR_THREADS) __global__ + void magma_writeDofsOffset_kernel(const int NCOMP, const int compstride, const int esize, const int nelem, int *offsets, const CeedScalar *du, + CeedScalar *dv) { + const int pid = threadIdx.x; + const int elem = blockIdx.x; - for (CeedInt i = pid; i < esize; i += blockDim.x) { - const CeedInt ind = offsets ? offsets[i + elem * esize] : i + elem * esize; - for (CeedInt comp = 0; comp < NCOMP; ++comp) { - atomicAdd(dv + (ind + compstride * comp), - du[i+elem*esize+comp*esize*nelem]); - } + for (CeedInt i = pid; i < esize; i += blockDim.x) { + const CeedInt ind = offsets ? offsets[i + elem * esize] : i + elem * esize; + for (CeedInt comp = 0; comp < NCOMP; ++comp) { + atomicAdd(dv + (ind + compstride * comp), du[i + elem * esize + comp * esize * nelem]); } + } } // Fastest index listed first // i : related to nodes // e : elements // c: component -// Go from E-vector (du) to L-vector (dv), with strides provided +// Go from E-vector (du) to L-vector (dv), with strides provided // to describe the L-vector layout // -// dv( i * strides[0] + c * strides[1] + e * strides[2] ) = du(i, e, c) -extern "C" __launch_bounds__(MAGMA_ERSTR_THREADS) __global__ void -magma_writeDofsStrided_kernel(const int NCOMP, const int esize, const int nelem, - const int *strides, const CeedScalar *du, CeedScalar *dv) -{ - const int pid = threadIdx.x; - const int elem = blockIdx.x; +// dv( i * strides[0] + c * strides[1] + e * strides[2] ) = du(i, e, c) +extern "C" __launch_bounds__(MAGMA_ERSTR_THREADS) __global__ + void magma_writeDofsStrided_kernel(const int NCOMP, const int esize, const int nelem, const int *strides, const CeedScalar *du, CeedScalar *dv) { + const int pid = threadIdx.x; + const int elem = blockIdx.x; - for (CeedInt i = pid; i < esize; i += blockDim.x) { - for (CeedInt comp = 0; comp < NCOMP; ++comp) { - atomicAdd(dv + (i * strides[0] + comp * strides[1] + - elem * strides[2]), - 
du[i+elem*esize+comp*esize*nelem]); - } + for (CeedInt i = pid; i < esize; i += blockDim.x) { + for (CeedInt comp = 0; comp < NCOMP; ++comp) { + atomicAdd(dv + (i * strides[0] + comp * strides[1] + elem * strides[2]), du[i + elem * esize + comp * esize * nelem]); } + } } -#endif // CEED_MAGMA_ELEM_RESTRICTION_DEVICE_H +#endif // CEED_MAGMA_ELEM_RESTRICTION_DEVICE_H diff --git a/include/ceed/jit-source/magma/grad-1d.h b/include/ceed/jit-source/magma/grad-1d.h index bd71b550de..cadbd38a2f 100644 --- a/include/ceed/jit-source/magma/grad-1d.h +++ b/include/ceed/jit-source/magma/grad-1d.h @@ -6,136 +6,124 @@ // This file is part of CEED: http://github.com/ceed // macros to abstract access of shared memory and reg. file -#define sT(i,j) sT[(j) * P_ + (i)] +#define sT(i, j) sT[(j)*P_ + (i)] ////////////////////////////////////////////////////////////////////////////////////////// // grad basis action (1D) -template -static __device__ __inline__ void -magma_grad_1d_device( - const T *sT, magma_trans_t transT, - T* sU[NCOMP_], T* sV[NCOMP_], const int tx) -{ - // Assumptions - // 1. 1D threads of size max(P_,Q_) - // 2. sU[i] is 1xP_: in shared memory - // 3. sV[i] is 1xQ_: in shared memory - // 4. P_roduct per component is one row (1xP_) times T matrix (P_xQ_) => one row (1xQ_) - // 5. Each thread computes one entry in sV[i] - // 6. Must sync before and after call - // 7. Note that the layout for U and V is different from 2D/3D problem - - T rv; - if (tx < Q_) { - for(int icomp = 0; icomp < NCOMP_; icomp++) { - rv = (transT == MagmaTrans) ? sV[icomp][tx] : 0.0; - for(int i = 0; i < P_; i++) { - rv += sU[icomp][i] * sT(i,tx); - } - sV[icomp][tx] = rv; - } +template +static __device__ __inline__ void magma_grad_1d_device(const T* sT, magma_trans_t transT, T* sU[NCOMP_], T* sV[NCOMP_], const int tx) { + // Assumptions + // 1. 1D threads of size max(P_,Q_) + // 2. sU[i] is 1xP_: in shared memory + // 3. sV[i] is 1xQ_: in shared memory + // 4. 
P_roduct per component is one row (1xP_) times T matrix (P_xQ_) => one row (1xQ_) + // 5. Each thread computes one entry in sV[i] + // 6. Must sync before and after call + // 7. Note that the layout for U and V is different from 2D/3D problem + + T rv; + if (tx < Q_) { + for (int icomp = 0; icomp < NCOMP_; icomp++) { + rv = (transT == MagmaTrans) ? sV[icomp][tx] : 0.0; + for (int i = 0; i < P_; i++) { + rv += sU[icomp][i] * sT(i, tx); + } + sV[icomp][tx] = rv; } + } } ////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ, MAGMA_MAXTHREADS_1D)) __global__ void -magma_gradn_1d_kernel( - const CeedScalar *dTinterp, const CeedScalar *dTgrad, - const CeedScalar *dU, const int estrdU, const int cstrdU, const int dstrdU, - CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, - const int nelem) -{ - MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaNoTrans; - - if (elem_id >= nelem) return; - - CeedScalar* sU[NCOMP]; - CeedScalar* sV[NCOMP]; - - // shift global memory pointers by elem stride - dU += elem_id * estrdU; - dV += elem_id * estrdV; - - // assign shared memory pointers - CeedScalar* sT = (CeedScalar*)(shared_data); - CeedScalar* sW = sT + P*Q; - sU[0] = sW + ty * NCOMP * (P + Q); - sV[0] = sU[0] + (NCOMP * 1 * P); - for(int icomp = 1; icomp < NCOMP; icomp++) { - sU[icomp] = sU[icomp-1] + (1 * P); - sV[icomp] = sV[icomp-1] + (1 * Q); - } - - // read T - if (ty == 0) { - dread_T_gm2sm(tx, transT, dTgrad, sT); - } - - // read U - read_1d(dU, cstrdU, sU, tx); - - __syncthreads(); - magma_grad_1d_device(sT, transT, sU, sV, tx); - __syncthreads(); - - // write V - write_1d(sV, dV, cstrdV, tx); +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ, MAGMA_MAXTHREADS_1D)) __global__ + void magma_gradn_1d_kernel(const 
CeedScalar* dTinterp, const CeedScalar* dTgrad, const CeedScalar* dU, const int estrdU, const int cstrdU, + const int dstrdU, CeedScalar* dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + magma_trans_t transT = MagmaNoTrans; + + if (elem_id >= nelem) return; + + CeedScalar* sU[NCOMP]; + CeedScalar* sV[NCOMP]; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar* sT = (CeedScalar*)(shared_data); + CeedScalar* sW = sT + P * Q; + sU[0] = sW + ty * NCOMP * (P + Q); + sV[0] = sU[0] + (NCOMP * 1 * P); + for (int icomp = 1; icomp < NCOMP; icomp++) { + sU[icomp] = sU[icomp - 1] + (1 * P); + sV[icomp] = sV[icomp - 1] + (1 * Q); + } + + // read T + if (ty == 0) { + dread_T_gm2sm(tx, transT, dTgrad, sT); + } + + // read U + read_1d(dU, cstrdU, sU, tx); + + __syncthreads(); + magma_grad_1d_device(sT, transT, sU, sV, tx); + __syncthreads(); + + // write V + write_1d(sV, dV, cstrdV, tx); } ////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ, MAGMA_MAXTHREADS_1D)) __global__ void -magma_gradt_1d_kernel( - const CeedScalar *dTinterp, const CeedScalar *dTgrad, - const CeedScalar *dU, const int estrdU, const int cstrdU, const int dstrdU, - CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, - const int nelem) -{ - MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaTrans; - - if (elem_id >= nelem) return; - - CeedScalar* sU[NCOMP]; - CeedScalar* sV[NCOMP]; - - // shift global memory pointers by elem stride - dU += elem_id * estrdU; - dV += elem_id * 
estrdV; - - // assign shared memory pointers - CeedScalar* sT = (CeedScalar*)(shared_data); - CeedScalar* sW = sT + Q*P; - sU[0] = sW + ty * NCOMP * (Q + P); - sV[0] = sU[0] + (NCOMP * 1 * Q); - for(int icomp = 1; icomp < NCOMP; icomp++) { - sU[icomp] = sU[icomp-1] + (1 * Q); - sV[icomp] = sV[icomp-1] + (1 * P); - } - - // read T - if (ty == 0) { - dread_T_gm2sm(tx, transT, dTgrad, sT); - } - - // read U - read_1d(dU, cstrdU, sU, tx); - - // read V - read_1d(dV, cstrdV, sV, tx); - - __syncthreads(); - magma_grad_1d_device(sT, transT, sU, sV, tx); - __syncthreads(); - - // write V - write_1d(sV, dV, cstrdV, tx); +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ, MAGMA_MAXTHREADS_1D)) __global__ + void magma_gradt_1d_kernel(const CeedScalar* dTinterp, const CeedScalar* dTgrad, const CeedScalar* dU, const int estrdU, const int cstrdU, + const int dstrdU, CeedScalar* dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + magma_trans_t transT = MagmaTrans; + + if (elem_id >= nelem) return; + + CeedScalar* sU[NCOMP]; + CeedScalar* sV[NCOMP]; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar* sT = (CeedScalar*)(shared_data); + CeedScalar* sW = sT + Q * P; + sU[0] = sW + ty * NCOMP * (Q + P); + sV[0] = sU[0] + (NCOMP * 1 * Q); + for (int icomp = 1; icomp < NCOMP; icomp++) { + sU[icomp] = sU[icomp - 1] + (1 * Q); + sV[icomp] = sV[icomp - 1] + (1 * P); + } + + // read T + if (ty == 0) { + dread_T_gm2sm(tx, transT, dTgrad, sT); + } + + // read U + read_1d(dU, cstrdU, sU, tx); + + // read V + read_1d(dV, cstrdV, sV, tx); + + __syncthreads(); + magma_grad_1d_device(sT, transT, sU, sV, tx); + __syncthreads(); + + // write V + write_1d(sV, dV, cstrdV, tx); } diff --git 
a/include/ceed/jit-source/magma/grad-2d.h b/include/ceed/jit-source/magma/grad-2d.h index f47f45dbf7..1f2763ac9f 100644 --- a/include/ceed/jit-source/magma/grad-2d.h +++ b/include/ceed/jit-source/magma/grad-2d.h @@ -6,197 +6,175 @@ // This file is part of CEED: http://github.com/ceed // macros to abstract access of shared memory and reg. file -#define sT(i,j) sT[(j) * P_ + (i)] -#define sTmp(i,j,ldw) sTmp[(j)*(ldw) + (i)] +#define sT(i, j) sT[(j)*P_ + (i)] +#define sTmp(i, j, ldw) sTmp[(j) * (ldw) + (i)] ////////////////////////////////////////////////////////////////////////////////////////// // grad basis action (2D) // This function is called two times at a higher level for 2D // DIM_U -- for the size of rU[DIM_U * NCOMP_ * MAXP_Q_] // DIM_V -- for the size of rV[DIM_V * NCOMP_ * MAXP_Q_] -// iDIM_ -- the index of the outermost loop over dimensions in grad +// iDIM_ -- the index of the outermost loop over dimensions in grad // iDIM_U -- which dim index of rU is accessed (always 0 for notrans, 0 or 1 for trans) // iDIM_V -- which dim index of rV is accessed (0 or 1 for notrans, always 0 for trans) // the scalar beta is used to specify whether to accumulate to rV, or overwrite it -template -static __device__ __inline__ void -magma_grad_2d_device( - const T *sTinterp, const T *sTgrad, - T rU[DIM_U][NCOMP_][rUsize] , T rV[DIM_V][NCOMP_][rVsize], - T beta, const int tx, T rTmp, T* swork) -{ - // Assumptions - // 0. This device routine applies grad for one dim only (iDIM_), so it should be called twice for 2D - // 1. 1D threads of size max(P_,Q_) - // 2. input: rU[DIM_U x NCOMP_ x P_] in registers (per thread) - // 3. output: rV[DIM_V x NCOMP_ x Q_] in registers (per thread) - // 4. Two products per each (dim,component) pair - // 4.1 Batch P_ of (1xP_) matrices times (P_xQ_) matrix => Batch P_ of (1xQ_) matrices - // 4.2 Batch 1 of (Q_xP_) matrix times (P_xQ_) matrix => (Q_xQ_) matrix - // 6. Each thread computes one row of the output of each product - // 7. 
Sync is recommended before and after the call - - for(int icomp = 0; icomp < NCOMP_; icomp++){ - // 1st product -- Batch P_ of (1xP_) matrices [reg] x (P_xQ_) [shmem] => Batch P_ of (1xQ_) matrices - // the batch output P_ x (1xQ_) is written on the fly to shmem - if (tx < P_) { - const int batchid = tx; - const int sld = 1; - const T *sT = (iDIM_ == 0) ? sTgrad : sTinterp; - T* sTmp = swork + batchid * (1 * Q_); - for(int j = 0; j < Q_; j++){ - rTmp = 0.0; - for(int i = 0; i < P_; i++){ - rTmp += rU[iDIM_U][icomp][i] * sT(i,j); - } - sTmp(0,j,sld) = rTmp; - } - } // end of: if (tx < P_) - __syncthreads(); - - // 2nd product -- Batch 1 of a (Q_xP_) matrix [shmem] x (P_xQ_) [shmem] => (Q_xQ_) matrix [reg] - if (tx < Q_) { - const int batchid = 0; - const int sld = Q_; - const T *sT = (iDIM_ == 1) ? sTgrad : sTinterp; - T* sTmp = swork + batchid * (Q_*P_); - for(int j = 0; j < Q_; j++){ - rTmp = 0.0; - for(int i = 0; i < P_; i++){ - rTmp += sTmp(tx,i,sld) * sT(i,j); - } - rV[iDIM_V][icomp][j] *= beta; - rV[iDIM_V][icomp][j] += rTmp; - } +template +static __device__ __inline__ void magma_grad_2d_device(const T *sTinterp, const T *sTgrad, T rU[DIM_U][NCOMP_][rUsize], T rV[DIM_V][NCOMP_][rVsize], + T beta, const int tx, T rTmp, T *swork) { + // Assumptions + // 0. This device routine applies grad for one dim only (iDIM_), so it should be called twice for 2D + // 1. 1D threads of size max(P_,Q_) + // 2. input: rU[DIM_U x NCOMP_ x P_] in registers (per thread) + // 3. output: rV[DIM_V x NCOMP_ x Q_] in registers (per thread) + // 4. Two products per each (dim,component) pair + // 4.1 Batch P_ of (1xP_) matrices times (P_xQ_) matrix => Batch P_ of (1xQ_) matrices + // 4.2 Batch 1 of (Q_xP_) matrix times (P_xQ_) matrix => (Q_xQ_) matrix + // 6. Each thread computes one row of the output of each product + // 7. 
Sync is recommended before and after the call + + for (int icomp = 0; icomp < NCOMP_; icomp++) { + // 1st product -- Batch P_ of (1xP_) matrices [reg] x (P_xQ_) [shmem] => Batch P_ of (1xQ_) matrices + // the batch output P_ x (1xQ_) is written on the fly to shmem + if (tx < P_) { + const int batchid = tx; + const int sld = 1; + const T *sT = (iDIM_ == 0) ? sTgrad : sTinterp; + T *sTmp = swork + batchid * (1 * Q_); + for (int j = 0; j < Q_; j++) { + rTmp = 0.0; + for (int i = 0; i < P_; i++) { + rTmp += rU[iDIM_U][icomp][i] * sT(i, j); } - __syncthreads(); - } // loop over NCOMP_ -} + sTmp(0, j, sld) = rTmp; + } + } // end of: if (tx < P_) + __syncthreads(); -////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ, MAGMA_MAXTHREADS_2D)) __global__ void -magma_gradn_2d_kernel( - const CeedScalar *dinterp1d, const CeedScalar *dgrad1d, - const CeedScalar *dU, const int estrdU, const int cstrdU, const int dstrdU, - CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) -{ - - MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaNoTrans; - - if (elem_id >= nelem) return; - - CeedScalar rU[1][NCOMP][P] = { 0.0 }; // here DIMU = 1, but might be different for a fused operator - CeedScalar rV[1][NCOMP][Q] = { 0.0 }; // here DIMV = 1, but might be different for a fused operator - CeedScalar rTmp = 0.0; - - // shift global memory pointers by elem stride - dU += elem_id * estrdU; - dV += elem_id * estrdV; - - // assign shared memory pointers - CeedScalar* sTinterp = (CeedScalar*)(shared_data); - CeedScalar* sTgrad = sTinterp + P*Q; - CeedScalar* sTmp = sTgrad + P*Q; - sTmp += ty * (P * MAXPQ); - - // read T - if (ty == 0) { - dread_T_gm2sm(tx, transT, dinterp1d, sTinterp); - dread_T_gm2sm(tx, transT, dgrad1d, sTgrad); + 
// 2nd product -- Batch 1 of a (Q_xP_) matrix [shmem] x (P_xQ_) [shmem] => (Q_xQ_) matrix [reg] + if (tx < Q_) { + const int batchid = 0; + const int sld = Q_; + const T *sT = (iDIM_ == 1) ? sTgrad : sTinterp; + T *sTmp = swork + batchid * (Q_ * P_); + for (int j = 0; j < Q_; j++) { + rTmp = 0.0; + for (int i = 0; i < P_; i++) { + rTmp += sTmp(tx, i, sld) * sT(i, j); + } + rV[iDIM_V][icomp][j] *= beta; + rV[iDIM_V][icomp][j] += rTmp; + } } - - // No need to read V ( required only in transposed grad ) - const CeedScalar beta = 0.0; - - /* read U (idim = 0 for dU, iDIM = 0 for rU) -- - there is a sync at the end of this function */ - readU_2d - (dU + (0*dstrdU), cstrdU, rU, sTmp, tx); - - /* first call (iDIM = 0, iDIMU = 0, iDIMV = 0) -- - output from rV[0][][] into dV (idim = 0) */ - magma_grad_2d_device - (sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); - /* there is a sync at the end of magma_grad_2d_device */ - writeV_2d - (dV+(0*dstrdV), cstrdV, rV, tx); - - /* second call (iDIM = 1, iDIMU = 0, iDIMV = 0) -- - output from rV[0][][] into dV (idim = 1) */ - magma_grad_2d_device - (sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); - /* there is a sync at the end of magma_grad_2d_device */ - writeV_2d - (dV+(1*dstrdV), cstrdV, rV, tx); + __syncthreads(); + } // loop over NCOMP_ } ////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ, MAGMA_MAXTHREADS_2D)) __global__ void -magma_gradt_2d_kernel( - const CeedScalar *dinterp1d, const CeedScalar *dgrad1d, - const CeedScalar *dU, const int estrdU, const int cstrdU, const int dstrdU, - CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) -{ - MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaTrans; - - if (elem_id >= nelem) return; - - CeedScalar 
rU[1][NCOMP][Q] = { 0.0 }; // here DIMU = 1, but might be different for a fused operator - CeedScalar rV[1][NCOMP][P] = { 0.0 }; // here DIMV = 1, but might be different for a fused operator - CeedScalar rTmp = 0.0; - - // shift global memory pointers by elem stride - dU += elem_id * estrdU; - dV += elem_id * estrdV; - - // assign shared memory pointers - CeedScalar* sTinterp = (CeedScalar*)(shared_data); - CeedScalar* sTgrad = sTinterp + Q*P; - CeedScalar* sTmp = sTgrad + Q*P; - sTmp += ty * (Q*MAXPQ); - - // read T - if (ty == 0) { - dread_T_gm2sm(tx, transT, dinterp1d, sTinterp); - dread_T_gm2sm(tx, transT, dgrad1d, sTgrad); - } - __syncthreads(); +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ, MAGMA_MAXTHREADS_2D)) __global__ + void magma_gradn_2d_kernel(const CeedScalar *dinterp1d, const CeedScalar *dgrad1d, const CeedScalar *dU, const int estrdU, const int cstrdU, + const int dstrdU, CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + magma_trans_t transT = MagmaNoTrans; + + if (elem_id >= nelem) return; + + CeedScalar rU[1][NCOMP][P] = {0.0}; // here DIMU = 1, but might be different for a fused operator + CeedScalar rV[1][NCOMP][Q] = {0.0}; // here DIMV = 1, but might be different for a fused operator + CeedScalar rTmp = 0.0; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar *sTinterp = (CeedScalar *)(shared_data); + CeedScalar *sTgrad = sTinterp + P * Q; + CeedScalar *sTmp = sTgrad + P * Q; + sTmp += ty * (P * MAXPQ); + + // read T + if (ty == 0) { + dread_T_gm2sm(tx, transT, dinterp1d, sTinterp); + dread_T_gm2sm(tx, transT, dgrad1d, sTgrad); + } + + // No need to read V ( required only in transposed grad ) + const CeedScalar beta = 0.0; + + /* read 
U (idim = 0 for dU, iDIM = 0 for rU) -- + there is a sync at the end of this function */ + readU_2d(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx); + + /* first call (iDIM = 0, iDIMU = 0, iDIMV = 0) -- + output from rV[0][][] into dV (idim = 0) */ + magma_grad_2d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_2d_device */ + writeV_2d(dV + (0 * dstrdV), cstrdV, rV, tx); + + /* second call (iDIM = 1, iDIMU = 0, iDIMV = 0) -- + output from rV[0][][] into dV (idim = 1) */ + magma_grad_2d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_2d_device */ + writeV_2d(dV + (1 * dstrdV), cstrdV, rV, tx); +} - /* read V (since this is transposed mode -- - idim = 0 for dV, iDIM = 0 for rV) */ - const CeedScalar beta = 1.0; - readV_2d - (dV + (0*dstrdV), cstrdV, rV, tx); - - /* read U (idim = 0 for dU, iDIM = 0 for rU) -- - there is a sync at the end of this function */ - readU_2d - (dU + (0 * dstrdU), cstrdU, rU, sTmp, tx); - /* first call (iDIM = 0, iDIMU = 0, iDIMV = 0) */ - magma_grad_2d_device - (sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); - /* there is a sync at the end of magma_grad_2d_device */ - - /* read U (idim = 1 for dU, iDIM = 0 for rU) -- - there is a sync at the end of this function */ - readU_2d - (dU + (1*dstrdU), cstrdU, rU, sTmp, tx); - /* second call (iDIM = 1, iDIMU = 0, iDIMV = 0) */ - magma_grad_2d_device - (sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); - /* there is a sync at the end of magma_grad_2d_device */ - - // write V - writeV_2d - (dV + (0*dstrdV), cstrdV, rV, tx); +////////////////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ, MAGMA_MAXTHREADS_2D)) __global__ + void magma_gradt_2d_kernel(const CeedScalar *dinterp1d, const CeedScalar *dgrad1d, const CeedScalar *dU, const int estrdU, const int cstrdU, + const int dstrdU, CeedScalar *dV, const int estrdV, 
const int cstrdV, const int dstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + magma_trans_t transT = MagmaTrans; + + if (elem_id >= nelem) return; + + CeedScalar rU[1][NCOMP][Q] = {0.0}; // here DIMU = 1, but might be different for a fused operator + CeedScalar rV[1][NCOMP][P] = {0.0}; // here DIMV = 1, but might be different for a fused operator + CeedScalar rTmp = 0.0; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar *sTinterp = (CeedScalar *)(shared_data); + CeedScalar *sTgrad = sTinterp + Q * P; + CeedScalar *sTmp = sTgrad + Q * P; + sTmp += ty * (Q * MAXPQ); + + // read T + if (ty == 0) { + dread_T_gm2sm(tx, transT, dinterp1d, sTinterp); + dread_T_gm2sm(tx, transT, dgrad1d, sTgrad); + } + __syncthreads(); + + /* read V (since this is transposed mode -- + idim = 0 for dV, iDIM = 0 for rV) */ + const CeedScalar beta = 1.0; + readV_2d(dV + (0 * dstrdV), cstrdV, rV, tx); + + /* read U (idim = 0 for dU, iDIM = 0 for rU) -- + there is a sync at the end of this function */ + readU_2d(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx); + /* first call (iDIM = 0, iDIMU = 0, iDIMV = 0) */ + magma_grad_2d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_2d_device */ + + /* read U (idim = 1 for dU, iDIM = 0 for rU) -- + there is a sync at the end of this function */ + readU_2d(dU + (1 * dstrdU), cstrdU, rU, sTmp, tx); + /* second call (iDIM = 1, iDIMU = 0, iDIMV = 0) */ + magma_grad_2d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_2d_device */ + + // write V + writeV_2d(dV + (0 * dstrdV), cstrdV, rV, tx); } diff --git a/include/ceed/jit-source/magma/grad-3d.h b/include/ceed/jit-source/magma/grad-3d.h index 7d3de8cc05..5198a27acf 
100644 --- a/include/ceed/jit-source/magma/grad-3d.h +++ b/include/ceed/jit-source/magma/grad-3d.h @@ -6,235 +6,210 @@ // This file is part of CEED: http://github.com/ceed // macros to abstract access of shared memory and reg. file -#define sT(i,j) sT[(j) * P_ + (i)] -#define sTmp(i,j,ldw) sTmp[(j)*(ldw) + (i)] -#define sTmp2(i,j,ldw) sTmp2[(j)*(ldw) + (i)] +#define sT(i, j) sT[(j)*P_ + (i)] +#define sTmp(i, j, ldw) sTmp[(j) * (ldw) + (i)] +#define sTmp2(i, j, ldw) sTmp2[(j) * (ldw) + (i)] ////////////////////////////////////////////////////////////////////////////////////////// // grad basis action (3D) // This function is called three times at a higher level for 3D // DIM_U -- for the size of rU[DIM_U * NCOMP_ * MAXP_Q_] // DIM_V -- for the size of rV[DIM_V * NCOMP_ * MAXP_Q_] -// iDIM_ -- the index of the outermost loop over dimensions in grad +// iDIM_ -- the index of the outermost loop over dimensions in grad // iDIM_U -- which dim index of rU is accessed (always 0 for notrans, 0, 1, or 2 for trans) // iDIM_V -- which dim index of rV is accessed (0, 1, or 2 for notrans, always 0 for trans) // the scalar beta is used to specify whether to accumulate to rV, or overwrite it -template -static __device__ __inline__ void -magma_grad_3d_device( - const T *sTinterp, const T *sTgrad, - T rU[DIM_U][NCOMP_][rUsize] , T rV[DIM_V][NCOMP_][rVsize], - T beta, const int tx, T rTmp, T* swork) -{ - // Assumptions - // 0. This device routine applies grad for one dim only (iDIM_), so it should be thrice for 3D - // 1. 1D threads of size max(P_,Q_)^2 - // 2. input: rU[DIM_U x NCOMP_ x rUsize] in registers (per thread) - // 3. output: rV[DIM_V x NCOMP_ x rVsize] in registers (per thread) - // 4. 
Three products per each (dim,component) pair - // 4.1 Batch P_^2 of (1xP_) matrices times (P_xQ_) matrix => Batch P_^2 of (1xQ_) matrices - // 4.2 Batch P_ of (Q_xP_) matrices times (P_xQ_) matrix => Batch P_ of (Q_xQ_) matrices - // 4.3 Batch 1 of (Q_^2xP_) matrix times (P_xQ_) matrix => (Q_^2xQ_) matrix - // 6. Each thread computes one row of the output of each product - // 7. Sync is recommended before and after the call - - T* sW1 = swork; - T* sW2 = sW1 + P_*P_*Q_; - for(int icomp = 0; icomp < NCOMP_; icomp++){ - // Batch P_^2 of (1xP_) matrices [reg] times (P_xQ_) matrix [shmem] => Batch P_^2 of (1xQ_) matrices [shmem] - if (tx < (P_*P_)) { - const int batchid = tx; - const int sld = 1; - const T *sT = (iDIM_ == 0) ? sTgrad : sTinterp; - T* sTmp = sW1 + batchid * (1*Q_); - for(int j = 0; j < Q_; j++){ - rTmp = 0.0; - for(int i = 0; i < P_; i++){ - rTmp += rU[iDIM_U][icomp][i] * sT(i,j); - } - sTmp(0,j,sld) = rTmp; - } - } // end of: if (tx < P_*P_) - __syncthreads(); - - // Batch P_ of (Q_xP_) matrices [shmem] times (P_xQ_) matrix [shmem] => Batch P_ of (Q_xQ_) matrices [reg] - if (tx < (P_*Q_)) { - const int batchid = tx / Q_; - const int tx_ = tx % Q_; - const int sld = Q_; - const T *sT = (iDIM_ == 1) ? sTgrad : sTinterp; - T* sTmp = sW1 + batchid * (Q_*P_); // sTmp is input - T* sTmp2 = sW2 + batchid * (Q_*Q_); // sTmp2 is output - for(int j = 0; j < Q_; j++){ - rTmp = 0.0; - for(int i = 0; i < P_; i++){ - rTmp += sTmp(tx_,i,sld) * sT(i,j); - } - sTmp2(tx_,j,sld) = rTmp; - } +template +static __device__ __inline__ void magma_grad_3d_device(const T* sTinterp, const T* sTgrad, T rU[DIM_U][NCOMP_][rUsize], T rV[DIM_V][NCOMP_][rVsize], + T beta, const int tx, T rTmp, T* swork) { + // Assumptions + // 0. This device routine applies grad for one dim only (iDIM_), so it should be thrice for 3D + // 1. 1D threads of size max(P_,Q_)^2 + // 2. input: rU[DIM_U x NCOMP_ x rUsize] in registers (per thread) + // 3. 
output: rV[DIM_V x NCOMP_ x rVsize] in registers (per thread) + // 4. Three products per each (dim,component) pair + // 4.1 Batch P_^2 of (1xP_) matrices times (P_xQ_) matrix => Batch P_^2 of (1xQ_) matrices + // 4.2 Batch P_ of (Q_xP_) matrices times (P_xQ_) matrix => Batch P_ of (Q_xQ_) matrices + // 4.3 Batch 1 of (Q_^2xP_) matrix times (P_xQ_) matrix => (Q_^2xQ_) matrix + // 6. Each thread computes one row of the output of each product + // 7. Sync is recommended before and after the call + + T* sW1 = swork; + T* sW2 = sW1 + P_ * P_ * Q_; + for (int icomp = 0; icomp < NCOMP_; icomp++) { + // Batch P_^2 of (1xP_) matrices [reg] times (P_xQ_) matrix [shmem] => Batch P_^2 of (1xQ_) matrices [shmem] + if (tx < (P_ * P_)) { + const int batchid = tx; + const int sld = 1; + const T* sT = (iDIM_ == 0) ? sTgrad : sTinterp; + T* sTmp = sW1 + batchid * (1 * Q_); + for (int j = 0; j < Q_; j++) { + rTmp = 0.0; + for (int i = 0; i < P_; i++) { + rTmp += rU[iDIM_U][icomp][i] * sT(i, j); } - __syncthreads(); - - // Batch 1 of (Q_^2xP_) matrices [shmem] times (P_xQ_) matrix [shmem] => Batch 1 of (Q_^2xQ_) matrices [reg] - if (tx < (Q_*Q_)) { - // No need to declare batchid = (tx / Q_^2) = always zero - // No need to declare tx_ = (tx_ % Q_^2) = always tx - const int sld = Q_*Q_; - const T *sT = (iDIM_ == 2) ? 
sTgrad : sTinterp; - T* sTmp = sW2; // sTmp is input - for(int j = 0; j < Q_; j++) { - rTmp = 0.0; - for(int i = 0; i < P_; i++) { - rTmp += sTmp(tx,i,sld) * sT(i,j); - } - rV[iDIM_V][icomp][j] *= beta; - rV[iDIM_V][icomp][j] += rTmp; - } - } - __syncthreads(); - } // loop over NCOMP_ -} + sTmp(0, j, sld) = rTmp; + } + } // end of: if (tx < P_*P_) + __syncthreads(); -////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ*MAXPQ, MAGMA_MAXTHREADS_3D)) __global__ void -magma_gradn_3d_kernel( - const CeedScalar* dinterp1d, const CeedScalar* dgrad1d, - const CeedScalar *dU, const int estrdU, const int cstrdU, const int dstrdU, - CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) -{ - MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaNoTrans; - - if (elem_id >= nelem) return; - - CeedScalar rU[1][NCOMP][P] = { 0.0 }; // here DIMU = 1, but might be different for a fused operator - CeedScalar rV[1][NCOMP][Q] = { 0.0 }; // here DIMV = 1, but might be different for a fused operator - CeedScalar rTmp = 0.0; - - // shift global memory pointers by elem stride - dU += elem_id * estrdU; - dV += elem_id * estrdV; - - // assign shared memory pointers - CeedScalar* sTinterp = (CeedScalar*)(shared_data); - CeedScalar* sTgrad = sTinterp + P*Q; - CeedScalar* sTmp = sTgrad + P*Q; - sTmp += ty * (max(P*P*P, (P*P*Q) + (P*Q*Q))); - - // read T - if (ty == 0) { - dread_T_gm2sm(tx, transT, dinterp1d, sTinterp); - dread_T_gm2sm(tx, transT, dgrad1d, sTgrad); + // Batch P_ of (Q_xP_) matrices [shmem] times (P_xQ_) matrix [shmem] => Batch P_ of (Q_xQ_) matrices [reg] + if (tx < (P_ * Q_)) { + const int batchid = tx / Q_; + const int tx_ = tx % Q_; + const int sld = Q_; + const T* sT = (iDIM_ == 1) ? 
sTgrad : sTinterp; + T* sTmp = sW1 + batchid * (Q_ * P_); // sTmp is input + T* sTmp2 = sW2 + batchid * (Q_ * Q_); // sTmp2 is output + for (int j = 0; j < Q_; j++) { + rTmp = 0.0; + for (int i = 0; i < P_; i++) { + rTmp += sTmp(tx_, i, sld) * sT(i, j); + } + sTmp2(tx_, j, sld) = rTmp; + } } __syncthreads(); - // No need to read V ( required only in transposed grad ) - const CeedScalar beta = 0.0; - - /* read U (idim = 0 for dU, iDIM = 0 for rU) -- - there is a sync at the end of this function */ - readU_3d - (dU + (0*dstrdU), cstrdU, rU, sTmp, tx); - - /* first call (iDIM = 0, iDIMU = 0, iDIMV = 0) -- - output from rV[0][][] into dV (idim = 0) */ - magma_grad_3d_device - (sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); - /* there is a sync at the end of magma_grad_3d_device */ - writeV_3d - (dV+ (0*dstrdV), cstrdV, rV, tx); - - /* second call (iDIM = 1, iDIMU = 0, iDIMV = 0) -- - output from rV[0][][] into dV (idim = 1) */ - magma_grad_3d_device - (sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); - /* there is a sync at the end of magma_grad_3d_device */ - writeV_3d - (dV+ (1*dstrdV), cstrdV, rV, tx); - - /* third call (iDIM = 2, iDIMU = 0, iDIMV = 0) -- - output from rV[0][][] into dV (idim = 2) */ - magma_grad_3d_device - (sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); - /* there is a sync at the end of magma_grad_3d_device */ - writeV_3d - (dV+ (2*dstrdV), cstrdV, rV, tx); + // Batch 1 of (Q_^2xP_) matrices [shmem] times (P_xQ_) matrix [shmem] => Batch 1 of (Q_^2xQ_) matrices [reg] + if (tx < (Q_ * Q_)) { + // No need to declare batchid = (tx / Q_^2) = always zero + // No need to declare tx_ = (tx_ % Q_^2) = always tx + const int sld = Q_ * Q_; + const T* sT = (iDIM_ == 2) ? 
sTgrad : sTinterp; + T* sTmp = sW2; // sTmp is input + for (int j = 0; j < Q_; j++) { + rTmp = 0.0; + for (int i = 0; i < P_; i++) { + rTmp += sTmp(tx, i, sld) * sT(i, j); + } + rV[iDIM_V][icomp][j] *= beta; + rV[iDIM_V][icomp][j] += rTmp; + } + } + __syncthreads(); + } // loop over NCOMP_ } ////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ*MAXPQ, MAGMA_MAXTHREADS_3D)) __global__ void -magma_gradt_3d_kernel( - const CeedScalar *dinterp1d, const CeedScalar *dgrad1d, - const CeedScalar *dU, const int estrdU, const int cstrdU, const int dstrdU, - CeedScalar *dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) -{ - MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaTrans; - - if (elem_id >= nelem) return; - - CeedScalar rU[1][NCOMP][Q] = { 0.0 }; // here DIMU = 1, but might be different for a fused operator - CeedScalar rV[1][NCOMP][P] = { 0.0 }; // here DIMV = 1, but might be different for a fused operator - CeedScalar rTmp = 0.0; - - // shift global memory pointers by elem stride - dU += elem_id * estrdU; - dV += elem_id * estrdV; - - // assign shared memory pointers - CeedScalar* sTinterp = (CeedScalar*)(shared_data); - CeedScalar* sTgrad = sTinterp + Q*P; - CeedScalar* sTmp = sTgrad + Q*P; - sTmp += ty * (max(Q*Q*Q, (Q*Q*P) + (Q*P*P))); - - // read T - if (ty == 0) { - dread_T_gm2sm(tx, transT, dinterp1d, sTinterp); - dread_T_gm2sm(tx, transT, dgrad1d, sTgrad); - } - __syncthreads(); +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ* MAXPQ, MAGMA_MAXTHREADS_3D)) __global__ + void magma_gradn_3d_kernel(const CeedScalar* dinterp1d, const CeedScalar* dgrad1d, const CeedScalar* dU, const int estrdU, const int cstrdU, + const int dstrdU, CeedScalar* dV, const int estrdV, const int cstrdV, const int dstrdV, 
const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + magma_trans_t transT = MagmaNoTrans; + + if (elem_id >= nelem) return; + + CeedScalar rU[1][NCOMP][P] = {0.0}; // here DIMU = 1, but might be different for a fused operator + CeedScalar rV[1][NCOMP][Q] = {0.0}; // here DIMV = 1, but might be different for a fused operator + CeedScalar rTmp = 0.0; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar* sTinterp = (CeedScalar*)(shared_data); + CeedScalar* sTgrad = sTinterp + P * Q; + CeedScalar* sTmp = sTgrad + P * Q; + sTmp += ty * (max(P * P * P, (P * P * Q) + (P * Q * Q))); + + // read T + if (ty == 0) { + dread_T_gm2sm(tx, transT, dinterp1d, sTinterp); + dread_T_gm2sm(tx, transT, dgrad1d, sTgrad); + } + __syncthreads(); + + // No need to read V ( required only in transposed grad ) + const CeedScalar beta = 0.0; + + /* read U (idim = 0 for dU, iDIM = 0 for rU) -- + there is a sync at the end of this function */ + readU_3d(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx); + + /* first call (iDIM = 0, iDIMU = 0, iDIMV = 0) -- + output from rV[0][][] into dV (idim = 0) */ + magma_grad_3d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_3d_device */ + writeV_3d(dV + (0 * dstrdV), cstrdV, rV, tx); + + /* second call (iDIM = 1, iDIMU = 0, iDIMV = 0) -- + output from rV[0][][] into dV (idim = 1) */ + magma_grad_3d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_3d_device */ + writeV_3d(dV + (1 * dstrdV), cstrdV, rV, tx); + + /* third call (iDIM = 2, iDIMU = 0, iDIMV = 0) -- + output from rV[0][][] into dV (idim = 2) */ + magma_grad_3d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_3d_device */ 
+ writeV_3d(dV + (2 * dstrdV), cstrdV, rV, tx); +} - // read V (since this is transposed mode) - const CeedScalar beta = 1.0; - readV_3d - (dV + (0*dstrdV), cstrdV, rV, tx); - - /* read U (idim = 0 for dU, iDIM = 0 for rU) -- - there is a sync at the end of this function */ - readU_3d - (dU + (0 * dstrdU), cstrdU, rU, sTmp, tx); - /* then first call (iDIM = 0, iDIMU = 0, iDIMV = 0) */ - magma_grad_3d_device - (sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); - /* there is a sync at the end of magma_grad_3d_device */ - - /* read U (idim = 1 for dU, iDIM = 0 for rU) -- - there is a sync at the end of this function */ - readU_3d - (dU + (1 * dstrdU), cstrdU, rU, sTmp, tx); - /* then second call (iDIM = 1, iDIMU = 0, iDIMV = 0) */ - magma_grad_3d_device - (sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); - /* there is a sync at the end of magma_grad_3d_device */ - - /* read U (idim = 2 for dU, iDIM = 0 for rU) -- - there is a sync at the end of this function */ - readU_3d - (dU + (2 * dstrdU), cstrdU, rU, sTmp, tx); - /* then third call (iDIM = 2, iDIMU = 0, iDIMV = 0) */ - magma_grad_3d_device - (sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); - /* there is a sync at the end of magma_grad_3d_device */ - - // write V - writeV_3d - (dV + (0 * dstrdV), cstrdV, rV, tx); +////////////////////////////////////////////////////////////////////////////////////////// +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ* MAXPQ, MAGMA_MAXTHREADS_3D)) __global__ + void magma_gradt_3d_kernel(const CeedScalar* dinterp1d, const CeedScalar* dgrad1d, const CeedScalar* dU, const int estrdU, const int cstrdU, + const int dstrdU, CeedScalar* dV, const int estrdV, const int cstrdV, const int dstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + magma_trans_t transT = MagmaTrans; + + if (elem_id >= nelem) return; + + CeedScalar rU[1][NCOMP][Q] = {0.0}; 
// here DIMU = 1, but might be different for a fused operator + CeedScalar rV[1][NCOMP][P] = {0.0}; // here DIMV = 1, but might be different for a fused operator + CeedScalar rTmp = 0.0; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar* sTinterp = (CeedScalar*)(shared_data); + CeedScalar* sTgrad = sTinterp + Q * P; + CeedScalar* sTmp = sTgrad + Q * P; + sTmp += ty * (max(Q * Q * Q, (Q * Q * P) + (Q * P * P))); + + // read T + if (ty == 0) { + dread_T_gm2sm(tx, transT, dinterp1d, sTinterp); + dread_T_gm2sm(tx, transT, dgrad1d, sTgrad); + } + __syncthreads(); + + // read V (since this is transposed mode) + const CeedScalar beta = 1.0; + readV_3d(dV + (0 * dstrdV), cstrdV, rV, tx); + + /* read U (idim = 0 for dU, iDIM = 0 for rU) -- + there is a sync at the end of this function */ + readU_3d(dU + (0 * dstrdU), cstrdU, rU, sTmp, tx); + /* then first call (iDIM = 0, iDIMU = 0, iDIMV = 0) */ + magma_grad_3d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_3d_device */ + + /* read U (idim = 1 for dU, iDIM = 0 for rU) -- + there is a sync at the end of this function */ + readU_3d(dU + (1 * dstrdU), cstrdU, rU, sTmp, tx); + /* then second call (iDIM = 1, iDIMU = 0, iDIMV = 0) */ + magma_grad_3d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_3d_device */ + + /* read U (idim = 2 for dU, iDIM = 0 for rU) -- + there is a sync at the end of this function */ + readU_3d(dU + (2 * dstrdU), cstrdU, rU, sTmp, tx); + /* then third call (iDIM = 2, iDIMU = 0, iDIMV = 0) */ + magma_grad_3d_device(sTinterp, sTgrad, rU, rV, beta, tx, rTmp, sTmp); + /* there is a sync at the end of magma_grad_3d_device */ + + // write V + writeV_3d(dV + (0 * dstrdV), cstrdV, rV, tx); } diff --git a/include/ceed/jit-source/magma/interp-1d.h b/include/ceed/jit-source/magma/interp-1d.h index 
7fd329b704..218b145962 100644 --- a/include/ceed/jit-source/magma/interp-1d.h +++ b/include/ceed/jit-source/magma/interp-1d.h @@ -6,136 +6,124 @@ // This file is part of CEED: http://github.com/ceed // macros to abstract access of shared memory and reg. file -#define sT(i,j) sT[(j) * P_ + (i)] +#define sT(i, j) sT[(j)*P_ + (i)] ////////////////////////////////////////////////////////////////////////////////////////// // interp basis action (1D) -template -static __device__ __inline__ void -magma_interp_1d_device( - const T *sT, magma_trans_t transT, - T* sU[NCOMP_], T* sV[NCOMP_], const int tx) -{ - // Assumptions - // 1. 1D threads of size max(P_,Q_) - // 2. sU[i] is 1xP_: in shared memory - // 3. sV[i] is 1xQ_: in shared memory - // 4. P_roduct per component is one row (1xP_) times T matrix (P_xQ_) => one row (1xQ_) - // 5. Each thread computes one entry in sV[i] - // 6. Must sync before and after call - // 7. Note that the layout for U and V is different from 2D/3D problem - - T rv; - if (tx < Q_) { - for(int icomp = 0; icomp < NCOMP_; icomp++) { - rv = (transT == MagmaTrans) ? sV[icomp][tx] : 0.0; - for(int i = 0; i < P_; i++) { - rv += sU[icomp][i] * sT(i,tx); //sT[tx * P_ + i]; - } - sV[icomp][tx] = rv; - } +template +static __device__ __inline__ void magma_interp_1d_device(const T* sT, magma_trans_t transT, T* sU[NCOMP_], T* sV[NCOMP_], const int tx) { + // Assumptions + // 1. 1D threads of size max(P_,Q_) + // 2. sU[i] is 1xP_: in shared memory + // 3. sV[i] is 1xQ_: in shared memory + // 4. P_roduct per component is one row (1xP_) times T matrix (P_xQ_) => one row (1xQ_) + // 5. Each thread computes one entry in sV[i] + // 6. Must sync before and after call + // 7. Note that the layout for U and V is different from 2D/3D problem + + T rv; + if (tx < Q_) { + for (int icomp = 0; icomp < NCOMP_; icomp++) { + rv = (transT == MagmaTrans) ? 
sV[icomp][tx] : 0.0; + for (int i = 0; i < P_; i++) { + rv += sU[icomp][i] * sT(i, tx); // sT[tx * P_ + i]; + } + sV[icomp][tx] = rv; } + } } ////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ, MAGMA_MAXTHREADS_1D)) __global__ void -magma_interpn_1d_kernel( - const CeedScalar *dT, - const CeedScalar *dU, const int estrdU, const int cstrdU, - CeedScalar *dV, const int estrdV, const int cstrdV, const int nelem) -{ - - MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaNoTrans; - - if (elem_id >= nelem) return; - - CeedScalar* sU[NCOMP]; - CeedScalar* sV[NCOMP]; - - // shift global memory pointers by elem stride - dU += elem_id * estrdU; - dV += elem_id * estrdV; - - // assign shared memory pointers - CeedScalar* sT = (CeedScalar*)(shared_data); - CeedScalar* sW = sT + P*Q; - sU[0] = sW + ty * NCOMP * (P + Q); - sV[0] = sU[0] + (NCOMP * 1 * P); - for(int icomp = 1; icomp < NCOMP; icomp++) { - sU[icomp] = sU[icomp-1] + (1 * P); - sV[icomp] = sV[icomp-1] + (1 * Q); - } - - // read T - if (ty == 0) { - dread_T_gm2sm(tx, transT, dT, sT); - } - - // read U - read_1d(dU, cstrdU, sU, tx); - - __syncthreads(); - magma_interp_1d_device(sT, transT, sU, sV, tx); - __syncthreads(); - - // write V - write_1d(sV, dV, cstrdV, tx); +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ, MAGMA_MAXTHREADS_1D)) __global__ + void magma_interpn_1d_kernel(const CeedScalar* dT, const CeedScalar* dU, const int estrdU, const int cstrdU, CeedScalar* dV, const int estrdV, + const int cstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + magma_trans_t transT = MagmaNoTrans; + + if (elem_id >= nelem) return; + + CeedScalar* 
sU[NCOMP]; + CeedScalar* sV[NCOMP]; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar* sT = (CeedScalar*)(shared_data); + CeedScalar* sW = sT + P * Q; + sU[0] = sW + ty * NCOMP * (P + Q); + sV[0] = sU[0] + (NCOMP * 1 * P); + for (int icomp = 1; icomp < NCOMP; icomp++) { + sU[icomp] = sU[icomp - 1] + (1 * P); + sV[icomp] = sV[icomp - 1] + (1 * Q); + } + + // read T + if (ty == 0) { + dread_T_gm2sm(tx, transT, dT, sT); + } + + // read U + read_1d(dU, cstrdU, sU, tx); + + __syncthreads(); + magma_interp_1d_device(sT, transT, sU, sV, tx); + __syncthreads(); + + // write V + write_1d(sV, dV, cstrdV, tx); } ////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ, MAGMA_MAXTHREADS_1D)) __global__ void -magma_interpt_1d_kernel( - const CeedScalar *dT, - const CeedScalar *dU, const int estrdU, const int cstrdU, - CeedScalar *dV, const int estrdV, const int cstrdV, const int nelem) -{ - - MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaTrans; - - if (elem_id >= nelem) return; - - CeedScalar* sU[NCOMP]; - CeedScalar* sV[NCOMP]; - - // shift global memory pointers by elem stride - dU += elem_id * estrdU; - dV += elem_id * estrdV; - - // assign shared memory pointers - CeedScalar* sT = (CeedScalar*)(shared_data); - CeedScalar* sW = sT + Q*P; - sU[0] = sW + ty * NCOMP * (Q + P); - sV[0] = sU[0] + (NCOMP * 1 * Q); - for(int icomp = 1; icomp < NCOMP; icomp++) { - sU[icomp] = sU[icomp-1] + (1 * Q); - sV[icomp] = sV[icomp-1] + (1 * P); - } - - // read T - if (ty == 0) { - dread_T_gm2sm(tx, transT, dT, sT); - } - - // read U - read_1d(dU, cstrdU, sU, tx); - - // read V - read_1d(dV, cstrdV, sV, tx); - - __syncthreads(); - magma_interp_1d_device(sT, 
transT, sU, sV, tx); - __syncthreads(); - - // write V - write_1d(sV, dV, cstrdV, tx); +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ, MAGMA_MAXTHREADS_1D)) __global__ + void magma_interpt_1d_kernel(const CeedScalar* dT, const CeedScalar* dU, const int estrdU, const int cstrdU, CeedScalar* dV, const int estrdV, + const int cstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + magma_trans_t transT = MagmaTrans; + + if (elem_id >= nelem) return; + + CeedScalar* sU[NCOMP]; + CeedScalar* sV[NCOMP]; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar* sT = (CeedScalar*)(shared_data); + CeedScalar* sW = sT + Q * P; + sU[0] = sW + ty * NCOMP * (Q + P); + sV[0] = sU[0] + (NCOMP * 1 * Q); + for (int icomp = 1; icomp < NCOMP; icomp++) { + sU[icomp] = sU[icomp - 1] + (1 * Q); + sV[icomp] = sV[icomp - 1] + (1 * P); + } + + // read T + if (ty == 0) { + dread_T_gm2sm(tx, transT, dT, sT); + } + + // read U + read_1d(dU, cstrdU, sU, tx); + + // read V + read_1d(dV, cstrdV, sV, tx); + + __syncthreads(); + magma_interp_1d_device(sT, transT, sU, sV, tx); + __syncthreads(); + + // write V + write_1d(sV, dV, cstrdV, tx); } diff --git a/include/ceed/jit-source/magma/interp-2d.h b/include/ceed/jit-source/magma/interp-2d.h index 35a5f730d2..07947793eb 100644 --- a/include/ceed/jit-source/magma/interp-2d.h +++ b/include/ceed/jit-source/magma/interp-2d.h @@ -6,152 +6,141 @@ // This file is part of CEED: http://github.com/ceed // macros to abstract access of shared memory and reg. 
file -#define sT(i,j) sT[(j) * P_ + (i)] -#define sTmp(i,j,ldw) sTmp[(j)*(ldw) + (i)] - +#define sT(i, j) sT[(j)*P_ + (i)] +#define sTmp(i, j, ldw) sTmp[(j) * (ldw) + (i)] ////////////////////////////////////////////////////////////////////////////////////////// // interp basis action (2D) -template -static __device__ __inline__ void -magma_interp_2d_device( - const T *sT, magma_trans_t transT, - T rU[DIM_U][NCOMP_][rUsize] , T rV[DIM_V][NCOMP_][rVsize], - const int tx, T rTmp, T* swork) -{ - // Assumptions - // 1. 1D threads of size max(P_,Q_) - // 2. input: rU[DIM_U x NCOMP_ x rUsize] in registers (per thread) - // 3. output: rV[DIM_V x NCOMP_ x rVsize] in registers (per thread) - // 4. Two products per component - // 4.1 Batch P_ of (1xP_) matrices times (P_xQ_) matrix => Batch P_ of (1xQ_) matrices - // 4.2 Batch 1 of (Q_xP_) matrix times (P_xQ_) matrix => (Q_xQ_) matrix - // 5. Each thread computes one row of the output of each product - // 6. Sync is recommended before and after the call - - for(int icomp = 0; icomp < NCOMP_; icomp++){ - // 1st product -- Batch P_ of (1xP_) matrices [reg] x (P_xQ_) [shmem] => Batch P_ of (1xQ_) matrices - // the batch output P_ x (1xQ_) is written on the fly to shmem - if (tx < P_) { - const int batchid = tx; - const int sld = 1; - T* sTmp = swork + batchid * (1 * Q_); - for(int j = 0; j < Q_; j++){ - rTmp = 0.0; - for(int i = 0; i < P_; i++){ - rTmp += rU[0][icomp][i] * sT(i,j); - } - sTmp(0,j,sld) = rTmp; - } - } // end of: if (tx < P_) - __syncthreads(); - - // 2nd product -- Batch 1 of a (Q_xP_) matrix [shmem] x (P_xQ_) [shmem] => (Q_xQ_) matrix [reg] - if (tx < Q_) { - const int batchid = 0; - const int sld = Q_; - T* sTmp = swork + batchid * (Q_*P_); - for(int j = 0; j < Q_; j++){ - rTmp = 0.0; - for(int i = 0; i < P_; i++){ - rTmp += sTmp(tx,i,sld) * sT(i,j); - } - rV[0][icomp][j] += rTmp; - } +template +static __device__ __inline__ void magma_interp_2d_device(const T* sT, magma_trans_t transT, T 
rU[DIM_U][NCOMP_][rUsize], T rV[DIM_V][NCOMP_][rVsize], + const int tx, T rTmp, T* swork) { + // Assumptions + // 1. 1D threads of size max(P_,Q_) + // 2. input: rU[DIM_U x NCOMP_ x rUsize] in registers (per thread) + // 3. output: rV[DIM_V x NCOMP_ x rVsize] in registers (per thread) + // 4. Two products per component + // 4.1 Batch P_ of (1xP_) matrices times (P_xQ_) matrix => Batch P_ of (1xQ_) matrices + // 4.2 Batch 1 of (Q_xP_) matrix times (P_xQ_) matrix => (Q_xQ_) matrix + // 5. Each thread computes one row of the output of each product + // 6. Sync is recommended before and after the call + + for (int icomp = 0; icomp < NCOMP_; icomp++) { + // 1st product -- Batch P_ of (1xP_) matrices [reg] x (P_xQ_) [shmem] => Batch P_ of (1xQ_) matrices + // the batch output P_ x (1xQ_) is written on the fly to shmem + if (tx < P_) { + const int batchid = tx; + const int sld = 1; + T* sTmp = swork + batchid * (1 * Q_); + for (int j = 0; j < Q_; j++) { + rTmp = 0.0; + for (int i = 0; i < P_; i++) { + rTmp += rU[0][icomp][i] * sT(i, j); + } + sTmp(0, j, sld) = rTmp; + } + } // end of: if (tx < P_) + __syncthreads(); + + // 2nd product -- Batch 1 of a (Q_xP_) matrix [shmem] x (P_xQ_) [shmem] => (Q_xQ_) matrix [reg] + if (tx < Q_) { + const int batchid = 0; + const int sld = Q_; + T* sTmp = swork + batchid * (Q_ * P_); + for (int j = 0; j < Q_; j++) { + rTmp = 0.0; + for (int i = 0; i < P_; i++) { + rTmp += sTmp(tx, i, sld) * sT(i, j); } - __syncthreads(); + rV[0][icomp][j] += rTmp; + } } + __syncthreads(); + } } ////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ, MAGMA_MAXTHREADS_2D)) __global__ void -magma_interpn_2d_kernel( - const CeedScalar *dT, - const CeedScalar *dU, const int estrdU, const int cstrdU, - CeedScalar *dV, const int estrdV, const int cstrdV, const int nelem) -{ - MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - - const int tx = threadIdx.x; - const int ty = 
threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaNoTrans; - - if (elem_id >= nelem) return; - - CeedScalar rU[1][NCOMP][P] = { 0.0 }; // for a non fused operator DIM is always 1 - CeedScalar rV[1][NCOMP][Q] = { 0.0 }; // for a non fused operator DIM is always 1 - CeedScalar rTmp = 0.0; - - // shift global memory pointers by elem stride - dU += elem_id * estrdU; - dV += elem_id * estrdV; - - // assign shared memory pointers - CeedScalar* sT = (CeedScalar*)(shared_data); - CeedScalar* sTmp = sT + P*Q; - sTmp += ty * (P * MAXPQ); - - // read T - if (ty == 0) { - dread_T_gm2sm(tx, transT, dT, sT); - } +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ, MAGMA_MAXTHREADS_2D)) __global__ + void magma_interpn_2d_kernel(const CeedScalar* dT, const CeedScalar* dU, const int estrdU, const int cstrdU, CeedScalar* dV, const int estrdV, + const int cstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - // read U -- there is a sync at the end of this function - readU_2d(dU, cstrdU, rU, sTmp, tx); + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + magma_trans_t transT = MagmaNoTrans; - // no sync needed here -- readU_2d already syncs at the end - magma_interp_2d_device(sT, transT, rU, rV, tx, rTmp, sTmp); - __syncthreads(); + if (elem_id >= nelem) return; + + CeedScalar rU[1][NCOMP][P] = {0.0}; // for a non fused operator DIM is always 1 + CeedScalar rV[1][NCOMP][Q] = {0.0}; // for a non fused operator DIM is always 1 + CeedScalar rTmp = 0.0; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar* sT = (CeedScalar*)(shared_data); + CeedScalar* sTmp = sT + P * Q; + sTmp += ty * (P * MAXPQ); + + // read T + if (ty == 0) { + dread_T_gm2sm(tx, transT, dT, sT); + } - // write V - writeV_2d(dV, cstrdV, rV, tx); + // read U -- there is a sync at the end of 
this function + readU_2d(dU, cstrdU, rU, sTmp, tx); + + // no sync needed here -- readU_2d already syncs at the end + magma_interp_2d_device(sT, transT, rU, rV, tx, rTmp, sTmp); + __syncthreads(); + + // write V + writeV_2d(dV, cstrdV, rV, tx); } ////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ, MAGMA_MAXTHREADS_2D)) __global__ void -magma_interpt_2d_kernel( - const CeedScalar *dT, - const CeedScalar *dU, const int estrdU, const int cstrdU, - CeedScalar *dV, const int estrdV, const int cstrdV, const int nelem) -{ - MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaTrans; - - if (elem_id >= nelem) return; - - CeedScalar rU[1][NCOMP][Q] = { 0.0 }; // for a non fused operator DIM is always 1 - CeedScalar rV[1][NCOMP][P] = { 0.0 }; // for a non fused operator DIM is always 1 - CeedScalar rTmp = 0.0; - - // shift global memory pointers by elem stride - dU += elem_id * estrdU; - dV += elem_id * estrdV; - - // assign shared memory pointers - CeedScalar* sT = (CeedScalar*)(shared_data); - CeedScalar* sTmp = sT + Q*P; - sTmp += ty * (Q * MAXPQ); - - // read T - if (ty == 0) { - dread_T_gm2sm(tx, transT, dT, sT); - } +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ, MAGMA_MAXTHREADS_2D)) __global__ + void magma_interpt_2d_kernel(const CeedScalar* dT, const CeedScalar* dU, const int estrdU, const int cstrdU, CeedScalar* dV, const int estrdV, + const int cstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - // read V - readV_2d(dV, cstrdV, rV, tx); + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + magma_trans_t transT = MagmaTrans; - // read U -- there is a sync at the end of this function - readU_2d(dU, cstrdU, rU, sTmp, tx); + if (elem_id >= nelem) 
return; - // no sync needed here -- readU_2d already syncs at the end - magma_interp_2d_device(sT, transT, rU, rV, tx, rTmp, sTmp); - __syncthreads(); + CeedScalar rU[1][NCOMP][Q] = {0.0}; // for a non fused operator DIM is always 1 + CeedScalar rV[1][NCOMP][P] = {0.0}; // for a non fused operator DIM is always 1 + CeedScalar rTmp = 0.0; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar* sT = (CeedScalar*)(shared_data); + CeedScalar* sTmp = sT + Q * P; + sTmp += ty * (Q * MAXPQ); + + // read T + if (ty == 0) { + dread_T_gm2sm(tx, transT, dT, sT); + } + + // read V + readV_2d(dV, cstrdV, rV, tx); + + // read U -- there is a sync at the end of this function + readU_2d(dU, cstrdU, rU, sTmp, tx); + + // no sync needed here -- readU_2d already syncs at the end + magma_interp_2d_device(sT, transT, rU, rV, tx, rTmp, sTmp); + __syncthreads(); - // write V - writeV_2d(dV, cstrdV, rV, tx); + // write V + writeV_2d(dV, cstrdV, rV, tx); } diff --git a/include/ceed/jit-source/magma/interp-3d.h b/include/ceed/jit-source/magma/interp-3d.h index 5d1302d7c0..5fceaf8a0f 100644 --- a/include/ceed/jit-source/magma/interp-3d.h +++ b/include/ceed/jit-source/magma/interp-3d.h @@ -6,180 +6,169 @@ // This file is part of CEED: http://github.com/ceed // macros to abstract access of shared memory and reg. file -#define sT(i,j) sT[(j) * P_ + (i)] -#define sTmp(i,j,ldw) sTmp[(j)*(ldw) + (i)] - +#define sT(i, j) sT[(j)*P_ + (i)] +#define sTmp(i, j, ldw) sTmp[(j) * (ldw) + (i)] ////////////////////////////////////////////////////////////////////////////////////////// // interp basis action (3D) -template -static __device__ __inline__ void -magma_interp_3d_device( - const T *sT, magma_trans_t transT, - T rU[DIM_U][NCOMP_][rUsize] , T rV[DIM_V][NCOMP_][rVsize], - const int tx, T rTmp[Q_], T* swork) -{ - // Assumptions - // 1. 1D threads of size max(P_,Q_)^2 - // 2. 
input: rU[DIM_U x NCOMP_ x rUsize] in registers (per thread) - // 3. output: rV[DIM_V x NCOMP_ x rVsize] in registers (per thread) - // 4. Three products per component - // 4.1 Batch P_^2 of (1xP_) matrices times (P_xQ_) matrix => Batch P_^2 of (1xQ_) matrices - // 4.2 Batch P_ of (Q_xP_) matrices times (P_xQ_) matrix => Batch P_ of (Q_xQ_) matrices - // 4.3 Batch 1 of (Q_^2xP_) matrix times (P_xQ_) matrix => (Q_^2xQ_) matrix - // 5. Each thread computes one row of the output of each product - // 6. Sync is recommended before and after the call - - for(int icomp = 0; icomp < NCOMP_; icomp++){ - // Batch P_^2 of (1xP_) matrices [reg] times (P_xQ_) matrix [shmem] => Batch P_^2 of (1xQ_) matrices [shmem] - if (tx < (P_*P_)) { - const int batchid = tx; - const int sld = 1; - T* sTmp = swork + batchid * (1*Q_); - for(int j = 0; j < Q_; j++){ - rTmp[0] = 0.0; - for(int i = 0; i < P_; i++){ - rTmp[0] += rU[0][icomp][i] * sT(i,j); - } - sTmp(0,j,sld) = rTmp[0]; - } - } // end of: if (tx < P_*P_) - __syncthreads(); - - // Batch P_ of (Q_xP_) matrices [shmem] times (P_xQ_) matrix [shmem] => Batch P_ of (Q_xQ_) matrices [reg] - if (tx < (P_*Q_)) { - const int batchid = tx / Q_; - const int tx_ = tx % Q_; - const int sld = Q_; - T* sTmp = swork + batchid * (Q_*P_); // sTmp is input - for(int j = 0; j < Q_; j++){ - rTmp[j] = 0.0; - for(int i = 0; i < P_; i++){ - rTmp[j] += sTmp(tx_,i,sld) * sT(i,j); - } - } +template +static __device__ __inline__ void magma_interp_3d_device(const T* sT, magma_trans_t transT, T rU[DIM_U][NCOMP_][rUsize], T rV[DIM_V][NCOMP_][rVsize], + const int tx, T rTmp[Q_], T* swork) { + // Assumptions + // 1. 1D threads of size max(P_,Q_)^2 + // 2. input: rU[DIM_U x NCOMP_ x rUsize] in registers (per thread) + // 3. output: rV[DIM_V x NCOMP_ x rVsize] in registers (per thread) + // 4. 
Three products per component + // 4.1 Batch P_^2 of (1xP_) matrices times (P_xQ_) matrix => Batch P_^2 of (1xQ_) matrices + // 4.2 Batch P_ of (Q_xP_) matrices times (P_xQ_) matrix => Batch P_ of (Q_xQ_) matrices + // 4.3 Batch 1 of (Q_^2xP_) matrix times (P_xQ_) matrix => (Q_^2xQ_) matrix + // 5. Each thread computes one row of the output of each product + // 6. Sync is recommended before and after the call + + for (int icomp = 0; icomp < NCOMP_; icomp++) { + // Batch P_^2 of (1xP_) matrices [reg] times (P_xQ_) matrix [shmem] => Batch P_^2 of (1xQ_) matrices [shmem] + if (tx < (P_ * P_)) { + const int batchid = tx; + const int sld = 1; + T* sTmp = swork + batchid * (1 * Q_); + for (int j = 0; j < Q_; j++) { + rTmp[0] = 0.0; + for (int i = 0; i < P_; i++) { + rTmp[0] += rU[0][icomp][i] * sT(i, j); + } + sTmp(0, j, sld) = rTmp[0]; + } + } // end of: if (tx < P_*P_) + __syncthreads(); + + // Batch P_ of (Q_xP_) matrices [shmem] times (P_xQ_) matrix [shmem] => Batch P_ of (Q_xQ_) matrices [reg] + if (tx < (P_ * Q_)) { + const int batchid = tx / Q_; + const int tx_ = tx % Q_; + const int sld = Q_; + T* sTmp = swork + batchid * (Q_ * P_); // sTmp is input + for (int j = 0; j < Q_; j++) { + rTmp[j] = 0.0; + for (int i = 0; i < P_; i++) { + rTmp[j] += sTmp(tx_, i, sld) * sT(i, j); } - __syncthreads(); - - // write rTmp[] into shmem as batch P_ of Q_xQ_ matrices - if (tx < (P_*Q_)){ - const int batchid = tx / Q_; - const int tx_ = tx % Q_; - const int sld = Q_; - T* sTmp = swork + batchid * (Q_*Q_); - for(int j = 0; j < Q_; j++){ - sTmp(tx_, j, sld) = rTmp[j]; - } + } + } + __syncthreads(); + + // write rTmp[] into shmem as batch P_ of Q_xQ_ matrices + if (tx < (P_ * Q_)) { + const int batchid = tx / Q_; + const int tx_ = tx % Q_; + const int sld = Q_; + T* sTmp = swork + batchid * (Q_ * Q_); + for (int j = 0; j < Q_; j++) { + sTmp(tx_, j, sld) = rTmp[j]; + } + } + __syncthreads(); + + // Batch 1 of (Q_^2xP_) matrices [shmem] times (P_xQ_) matrix [shmem] => Batch 1 of 
(Q_^2xQ_) matrices [reg] + if (tx < (Q_ * Q_)) { + // No need to declare batchid = (tx / Q_^2) = always zero + // No need to declare tx_ = (tx_ % Q_^2) = always tx + const int sld = Q_ * Q_; + T* sTmp = swork; + for (int j = 0; j < Q_; j++) { + rTmp[0] = 0.0; + for (int i = 0; i < P_; i++) { + rTmp[0] += sTmp(tx, i, sld) * sT(i, j); } - __syncthreads(); - - // Batch 1 of (Q_^2xP_) matrices [shmem] times (P_xQ_) matrix [shmem] => Batch 1 of (Q_^2xQ_) matrices [reg] - if (tx < (Q_*Q_)) { - // No need to declare batchid = (tx / Q_^2) = always zero - // No need to declare tx_ = (tx_ % Q_^2) = always tx - const int sld = Q_*Q_; - T* sTmp = swork; - for(int j = 0; j < Q_; j++) { - rTmp[0] = 0.0; - for(int i = 0; i < P_; i++) { - rTmp[0] += sTmp(tx,i,sld) * sT(i,j); - } - rV[0][icomp][j] += rTmp[0]; - } - } - __syncthreads(); + rV[0][icomp][j] += rTmp[0]; + } } + __syncthreads(); + } } ////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ*MAXPQ, MAGMA_MAXTHREADS_3D)) __global__ void -magma_interpn_3d_kernel( - const CeedScalar *dT, - const CeedScalar *dU, const int estrdU, const int cstrdU, - CeedScalar *dV, const int estrdV, const int cstrdV, const int nelem) -{ - MAGMA_DEVICE_SHARED( CeedScalar, shared_data) - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaNoTrans; - - if (elem_id >= nelem) return; - - CeedScalar rU[1][NCOMP][P] = { 0.0 }; // for a non fused operator DIM is always 1 - CeedScalar rV[1][NCOMP][Q] = { 0.0 }; // for a non fused operator DIM is always 1 - CeedScalar rTmp[Q] = { 0.0 }; - - // shift global memory pointers by elem stride - dU += elem_id * estrdU; - dV += elem_id * estrdV; - - // assign shared memory pointers - CeedScalar* sT = (CeedScalar*)(shared_data); - CeedScalar* sTmp = sT + P*Q; - sTmp += ty * (max(P*P*MAXPQ, P*Q*Q)); - - // read T - if (ty == 0) { - 
dread_T_gm2sm(tx, transT, dT, sT); - } +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ* MAXPQ, MAGMA_MAXTHREADS_3D)) __global__ + void magma_interpn_3d_kernel(const CeedScalar* dT, const CeedScalar* dU, const int estrdU, const int cstrdU, CeedScalar* dV, const int estrdV, + const int cstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - // read U (idim = 0 for dU, iDIM = 0 for rU, u_dimstride is always 0) - readU_3d(dU, cstrdU, rU, sTmp, tx); - // there is a sync at the end of this function + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + magma_trans_t transT = MagmaNoTrans; - magma_interp_3d_device(sT, transT, rU , rV, tx, rTmp, sTmp); - __syncthreads(); + if (elem_id >= nelem) return; + + CeedScalar rU[1][NCOMP][P] = {0.0}; // for a non fused operator DIM is always 1 + CeedScalar rV[1][NCOMP][Q] = {0.0}; // for a non fused operator DIM is always 1 + CeedScalar rTmp[Q] = {0.0}; - // write V - writeV_3d(dV, cstrdV, rV, tx); + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar* sT = (CeedScalar*)(shared_data); + CeedScalar* sTmp = sT + P * Q; + sTmp += ty * (max(P * P * MAXPQ, P * Q * Q)); + + // read T + if (ty == 0) { + dread_T_gm2sm(tx, transT, dT, sT); + } + + // read U (idim = 0 for dU, iDIM = 0 for rU, u_dimstride is always 0) + readU_3d(dU, cstrdU, rU, sTmp, tx); + // there is a sync at the end of this function + + magma_interp_3d_device(sT, transT, rU, rV, tx, rTmp, sTmp); + __syncthreads(); + + // write V + writeV_3d(dV, cstrdV, rV, tx); } ////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ*MAXPQ, MAGMA_MAXTHREADS_3D)) __global__ void -magma_interpt_3d_kernel( - const CeedScalar *dT, - const CeedScalar *dU, const int estrdU, const int cstrdU, - CeedScalar *dV, const int 
estrdV, const int cstrdV, const int nelem) -{ - MAGMA_DEVICE_SHARED( CeedScalar, shared_data) - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; - magma_trans_t transT = MagmaTrans; - - if (elem_id >= nelem) return; - - CeedScalar rU[1][NCOMP][Q] = { 0.0 }; // for a non fused operator DIM is always 1 - CeedScalar rV[1][NCOMP][P] = { 0.0 }; // for a non fused operator DIM is always 1 - CeedScalar rTmp[P] = { 0.0 }; - - // shift global memory pointers by elem stride - dU += elem_id * estrdU; - dV += elem_id * estrdV; - - // assign shared memory pointers - CeedScalar* sT = (CeedScalar*)(shared_data); - CeedScalar* sTmp = sT + Q*P; - sTmp += ty * (max(Q*Q*MAXPQ, Q*P*P)); - - // read T - if (ty == 0) { - dread_T_gm2sm(tx, transT, dT, sT); - } +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(MAXPQ* MAXPQ, MAGMA_MAXTHREADS_3D)) __global__ + void magma_interpt_3d_kernel(const CeedScalar* dT, const CeedScalar* dU, const int estrdU, const int cstrdU, CeedScalar* dV, const int estrdV, + const int cstrdV, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - // read V - readV_3d(dV, cstrdV, rV, tx); + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; + magma_trans_t transT = MagmaTrans; - // read U (idim = 0 for dU, iDIM = 0 for rU, u_dimstride is always 0) - readU_3d(dU, cstrdU, rU, sTmp, tx); - // there is a sync at the end of this function + if (elem_id >= nelem) return; - magma_interp_3d_device(sT, transT, rU , rV, tx, rTmp, sTmp); - __syncthreads(); + CeedScalar rU[1][NCOMP][Q] = {0.0}; // for a non fused operator DIM is always 1 + CeedScalar rV[1][NCOMP][P] = {0.0}; // for a non fused operator DIM is always 1 + CeedScalar rTmp[P] = {0.0}; + + // shift global memory pointers by elem stride + dU += elem_id * estrdU; + dV += elem_id * estrdV; + + // assign shared memory pointers + CeedScalar* sT = (CeedScalar*)(shared_data); + 
CeedScalar* sTmp = sT + Q * P; + sTmp += ty * (max(Q * Q * MAXPQ, Q * P * P)); + + // read T + if (ty == 0) { + dread_T_gm2sm(tx, transT, dT, sT); + } + + // read V + readV_3d(dV, cstrdV, rV, tx); + + // read U (idim = 0 for dU, iDIM = 0 for rU, u_dimstride is always 0) + readU_3d(dU, cstrdU, rU, sTmp, tx); + // there is a sync at the end of this function + + magma_interp_3d_device(sT, transT, rU, rV, tx, rTmp, sTmp); + __syncthreads(); - // write V - writeV_3d(dV, cstrdV, rV, tx); + // write V + writeV_3d(dV, cstrdV, rV, tx); } diff --git a/include/ceed/jit-source/magma/magma_common_device.h b/include/ceed/jit-source/magma/magma_common_device.h index 69ac89a059..c9e8f7dd8d 100644 --- a/include/ceed/jit-source/magma/magma_common_device.h +++ b/include/ceed/jit-source/magma/magma_common_device.h @@ -10,16 +10,11 @@ #ifdef CEED_MAGMA_USE_HIP #define MAGMA_DEVICE_SHARED(type, name) HIP_DYNAMIC_SHARED(type, name) -#else +#else #define MAGMA_DEVICE_SHARED(type, name) extern __shared__ type name[]; #endif -typedef enum { - MagmaNoTrans = 111, - MagmaTrans = 112, - MagmaConjTrans = 113, - Magma_ConjTrans = MagmaConjTrans -} magma_trans_t; +typedef enum { MagmaNoTrans = 111, MagmaTrans = 112, MagmaConjTrans = 113, Magma_ConjTrans = MagmaConjTrans } magma_trans_t; #define MAGMA_MAXTHREADS_1D 128 #define MAGMA_MAXTHREADS_2D 128 @@ -31,36 +26,32 @@ typedef enum { // for use with __launch_bounds__() #define MAGMA_BASIS_BOUNDS(x, maxt) (x * MAGMA_BASIS_NTCOL(x, maxt)) -#define MAGMA_D_ZERO 0.0 -#define MAGMA_D_ONE 1.0 +#define MAGMA_D_ZERO 0.0 +#define MAGMA_D_ONE 1.0 ////////////////////////////////////////////////////////////////////////////////////////// // read U or V of a 1D element into shared memory sU[][] or sV[][] -- for all components // the devptr is assumed to point directly to the element // must sync after call -template -__device__ __inline__ void -read_1d(const T* devptr, const int compstride, T* sBuffer[NCOMP_], const int tx) -{ - if (tx < LENGTH) { - for(int 
icomp = 0; icomp < NCOMP_; icomp++) { - sBuffer[icomp][tx] = devptr[icomp * compstride + tx]; - } +template +__device__ __inline__ void read_1d(const T* devptr, const int compstride, T* sBuffer[NCOMP_], const int tx) { + if (tx < LENGTH) { + for (int icomp = 0; icomp < NCOMP_; icomp++) { + sBuffer[icomp][tx] = devptr[icomp * compstride + tx]; } + } } ////////////////////////////////////////////////////////////////////////////////////////// // write V of a 1D element into global memory from sV[][] -- for all components // the devptr is assumed to point directly to the element -template -__device__ __inline__ void -write_1d(T* sBuffer[NCOMP_], T* devptr, const int compstride, const int tx) -{ - if (tx < LENGTH) { - for(int icomp = 0; icomp < NCOMP_; icomp++) { - devptr[icomp * compstride + tx] = sBuffer[icomp][tx]; - } +template +__device__ __inline__ void write_1d(T* sBuffer[NCOMP_], T* devptr, const int compstride, const int tx) { + if (tx < LENGTH) { + for (int icomp = 0; icomp < NCOMP_; icomp++) { + devptr[icomp * compstride + tx] = sBuffer[icomp][tx]; } + } } ////////////////////////////////////////////////////////////////////////////////////////// @@ -70,36 +61,34 @@ write_1d(T* sBuffer[NCOMP_], T* devptr, const int compstride, const int tx) // iDIM specifies which dimension is being read into in rU // rUsize can be different from P_ (e.g. MAXP_Q) // sTmp is a shared memory workspace of size P_^2 -template -__device__ __inline__ void -readU_2d(const T* dU, const int compstride, T rU[DIMU][NCOMP_][rUsize], T* sTmp, const int tx) -{ - // read U as a batch P_ of (1xP_) vectors - // vec 0 : [u0, u1, u2, ... u_(P_-1)] -- contiguous in memory - // vec 1 : [u0, u1, u2, ... u_(P_-1)] -- contiguous in memory - // ... - // vec P_-1: [u0, u1, u2, ... 
u_(P_-1)] -- contiguous in memory - // threads collaboratively read vec0 and then vec1 and so on - // but for the kernel, we want - // thread 0 to hold all of vec0 in registers, and - // thread 1 to hold all of vec1 in registers, and and so on - // so we need to transpose - for(int icomp = 0; icomp < NCOMP_; icomp++) { - // read from global memory into shared memory - if (tx < P_) { - for(int i = 0; i < P_; i++) { - sTmp[i*P_ + tx] = dU[icomp * compstride + i*P_ + tx]; - } - } - __syncthreads(); +template +__device__ __inline__ void readU_2d(const T* dU, const int compstride, T rU[DIMU][NCOMP_][rUsize], T* sTmp, const int tx) { + // read U as a batch P_ of (1xP_) vectors + // vec 0 : [u0, u1, u2, ... u_(P_-1)] -- contiguous in memory + // vec 1 : [u0, u1, u2, ... u_(P_-1)] -- contiguous in memory + // ... + // vec P_-1: [u0, u1, u2, ... u_(P_-1)] -- contiguous in memory + // threads collaboratively read vec0 and then vec1 and so on + // but for the kernel, we want + // thread 0 to hold all of vec0 in registers, and + // thread 1 to hold all of vec1 in registers, and and so on + // so we need to transpose + for (int icomp = 0; icomp < NCOMP_; icomp++) { + // read from global memory into shared memory + if (tx < P_) { + for (int i = 0; i < P_; i++) { + sTmp[i * P_ + tx] = dU[icomp * compstride + i * P_ + tx]; + } + } + __syncthreads(); - if (tx < P_) { - for(int i = 0; i < P_; i++) { - rU[iDIM][icomp][i] = sTmp[tx*P_ + i]; - } - } - __syncthreads(); + if (tx < P_) { + for (int i = 0; i < P_; i++) { + rU[iDIM][icomp][i] = sTmp[tx * P_ + i]; + } } + __syncthreads(); + } } ////////////////////////////////////////////////////////////////////////////////////////// @@ -108,17 +97,15 @@ readU_2d(const T* dU, const int compstride, T rU[DIMU][NCOMP_][rUsize], T* sTmp, // register is assumed to be rV[DIMV][NCOMP_][rVsize] // iDIM specifies which dimension is being read into in rV // rVsize can be different from P_ (e.g. 
MAXP_Q) -template -__device__ __inline__ void -readV_2d(const T* dV, const int compstride, T rV[DIMV][NCOMP_][rVsize], const int tx) -{ - if (tx < Q_) { - for(int icomp = 0; icomp < NCOMP_; icomp++) { - for(int j = 0; j < Q_; j++) { - rV[iDIM][icomp][j] = dV[icomp * compstride + j*Q_ + tx]; - } - } +template +__device__ __inline__ void readV_2d(const T* dV, const int compstride, T rV[DIMV][NCOMP_][rVsize], const int tx) { + if (tx < Q_) { + for (int icomp = 0; icomp < NCOMP_; icomp++) { + for (int j = 0; j < Q_; j++) { + rV[iDIM][icomp][j] = dV[icomp * compstride + j * Q_ + tx]; + } } + } } ////////////////////////////////////////////////////////////////////////////////////////// @@ -128,17 +115,15 @@ readV_2d(const T* dV, const int compstride, T rV[DIMV][NCOMP_][rVsize], const in // iDIM specifies which dimension is being read from in rV // idim specifies which dimension is being written to in dV // rVsize can be different from P_ (e.g. MAXP_Q) -template -__device__ __inline__ void -writeV_2d(T* dV, const int compstride, T rV[DIMV][NCOMP_][rVsize], const int tx) -{ - if (tx < Q_) { - for(int icomp = 0; icomp < NCOMP_; icomp++) { - for(int j = 0; j < Q_; j++) { - dV[icomp * compstride + j*Q_ + tx] = rV[iDIM][icomp][j]; - } - } +template +__device__ __inline__ void writeV_2d(T* dV, const int compstride, T rV[DIMV][NCOMP_][rVsize], const int tx) { + if (tx < Q_) { + for (int icomp = 0; icomp < NCOMP_; icomp++) { + for (int j = 0; j < Q_; j++) { + dV[icomp * compstride + j * Q_ + tx] = rV[iDIM][icomp][j]; + } } + } } ////////////////////////////////////////////////////////////////////////////////////////// @@ -148,36 +133,34 @@ writeV_2d(T* dV, const int compstride, T rV[DIMV][NCOMP_][rVsize], const int tx) // iDIM specifies which dimension is being read into in rU // rUsize can be different from P_ (e.g. 
MAXP_Q) // sTmp is a shared memory workspace of size P_^3 -template -__device__ __inline__ void -readU_3d(const T* dU, const int compstride, T rU[DIMU][NCOMP_][rUsize], T* sTmp, const int tx) -{ - // read U as a batch P_^2 of (1xP_) vectors - // vec 0 : [u0, u1, u2, ... u_(P_-1)] -- contiguous in memory - // vec 1 : [u0, u1, u2, ... u_(P_-1)] -- contiguous in memory - // ... - // vec P_^2-1: [u0, u1, u2, ... u_(P_-1)] -- contiguous in memory - // threads collaboratively read vec0 and then vec1 and so on - // but for the kernel, we want - // thread 0 to hold all of vec0 in registers, and - // thread 1 to hold all of vec1 in registers, and and so on - // so we need to transpose - for(int icomp = 0; icomp < NCOMP_; icomp++) { - // read from global memory into shared memory - if (tx < P_*P_) { - for(int i = 0; i < P_; i++) { - sTmp[i*P_*P_ + tx] = dU[icomp * compstride + i*P_*P_ + tx]; - } - } - __syncthreads(); +template +__device__ __inline__ void readU_3d(const T* dU, const int compstride, T rU[DIMU][NCOMP_][rUsize], T* sTmp, const int tx) { + // read U as a batch P_^2 of (1xP_) vectors + // vec 0 : [u0, u1, u2, ... u_(P_-1)] -- contiguous in memory + // vec 1 : [u0, u1, u2, ... u_(P_-1)] -- contiguous in memory + // ... + // vec P_^2-1: [u0, u1, u2, ... 
u_(P_-1)] -- contiguous in memory + // threads collaboratively read vec0 and then vec1 and so on + // but for the kernel, we want + // thread 0 to hold all of vec0 in registers, and + // thread 1 to hold all of vec1 in registers, and and so on + // so we need to transpose + for (int icomp = 0; icomp < NCOMP_; icomp++) { + // read from global memory into shared memory + if (tx < P_ * P_) { + for (int i = 0; i < P_; i++) { + sTmp[i * P_ * P_ + tx] = dU[icomp * compstride + i * P_ * P_ + tx]; + } + } + __syncthreads(); - if (tx < P_*P_) { - for(int i = 0; i < P_; i++) { - rU[iDIM][icomp][i] = sTmp[tx*P_ + i]; - } - } - __syncthreads(); + if (tx < P_ * P_) { + for (int i = 0; i < P_; i++) { + rU[iDIM][icomp][i] = sTmp[tx * P_ + i]; + } } + __syncthreads(); + } } ////////////////////////////////////////////////////////////////////////////////////////// @@ -186,17 +169,15 @@ readU_3d(const T* dU, const int compstride, T rU[DIMU][NCOMP_][rUsize], T* sTmp, // register is assumed to be rV[DIMV][NCOMP_][rVsize] // iDIM specifies which dimension is being read into in rV // rVsize can be different from P_ (e.g. 
MAXP_Q) -template -__device__ __inline__ void -readV_3d(const T* dV, const int compstride, T rV[DIMV][NCOMP_][rVsize], const int tx) -{ - if (tx < Q_*Q_) { - for(int icomp = 0; icomp < NCOMP_; icomp++) { - for(int j = 0; j < Q_; j++) { - rV[iDIM][icomp][j] = dV[icomp * compstride + j*(Q_*Q_) + tx]; - } - } +template +__device__ __inline__ void readV_3d(const T* dV, const int compstride, T rV[DIMV][NCOMP_][rVsize], const int tx) { + if (tx < Q_ * Q_) { + for (int icomp = 0; icomp < NCOMP_; icomp++) { + for (int j = 0; j < Q_; j++) { + rV[iDIM][icomp][j] = dV[icomp * compstride + j * (Q_ * Q_) + tx]; + } } + } } ////////////////////////////////////////////////////////////////////////////////////////// @@ -206,135 +187,109 @@ readV_3d(const T* dV, const int compstride, T rV[DIMV][NCOMP_][rVsize], const in // iDIM specifies which dimension is being read from in rV // idim specifies which dimension is being written to in dV // rVsize can be different from P_ (e.g. MAXP_Q) -template -__device__ __inline__ void -writeV_3d(T* dV, const int compstride, T rV[DIMV][NCOMP_][rVsize], const int tx) -{ - if (tx < (Q_*Q_)) { - for(int icomp = 0; icomp < NCOMP_; icomp++) { - for(int j = 0; j < Q_; j++) { - dV[icomp * compstride + j*(Q_*Q_) + tx] = rV[iDIM][icomp][j]; - } - } +template +__device__ __inline__ void writeV_3d(T* dV, const int compstride, T rV[DIMV][NCOMP_][rVsize], const int tx) { + if (tx < (Q_ * Q_)) { + for (int icomp = 0; icomp < NCOMP_; icomp++) { + for (int j = 0; j < Q_; j++) { + dV[icomp * compstride + j * (Q_ * Q_) + tx] = rV[iDIM][icomp][j]; + } } + } } ////////////////////////////////////////////////////////////////////////////////////////// // reads T into shared memory // must sync after call -template -__device__ __inline__ void -dread_T_gm2sm( - const int tx, const magma_trans_t transT, - const CeedScalar* dT, CeedScalar *sT ) -{ - if ( transT == MagmaNoTrans ) { - // T is B x J - if (tx < B) { - for(int i = 0; i < J; i++) { - sT[i * B + tx] = dT[i * B + 
tx]; - } - } +template +__device__ __inline__ void dread_T_gm2sm(const int tx, const magma_trans_t transT, const CeedScalar* dT, CeedScalar* sT) { + if (transT == MagmaNoTrans) { + // T is B x J + if (tx < B) { + for (int i = 0; i < J; i++) { + sT[i * B + tx] = dT[i * B + tx]; + } } - else { - // T is J x B - if (tx < J) { - for(int i = 0; i < B; i++) { - sT[tx * B + i] = dT[i * J + tx]; - } - } + } else { + // T is J x B + if (tx < J) { + for (int i = 0; i < B; i++) { + sT[tx * B + i] = dT[i * J + tx]; + } } - // must sync after call + } + // must sync after call } ////////////////////////////////////////////////////////////////////////////////////////// // reads a slice of U from shared/global memory into registers // the correct pointer U must be precomputed -template -__device__ __inline__ void -dread_U_gsm2reg( - const int C, const int tx_, - const CeedScalar* U, CeedScalar rU[B] ) -{ - for(int i = 0; i < B; i++){ - rU[i] = U[i * C + tx_]; - } +template +__device__ __inline__ void dread_U_gsm2reg(const int C, const int tx_, const CeedScalar* U, CeedScalar rU[B]) { + for (int i = 0; i < B; i++) { + rU[i] = U[i * C + tx_]; + } } ////////////////////////////////////////////////////////////////////////////////////////// // reads a slice of V from shared/global memory into registers with scaling // the correct pointer V must be precomputed -template -__device__ __inline__ void -dread_V_gsm2reg( - const int C, const int tx_, const CeedScalar* V, CeedScalar rV[J] ) -{ - for(int i = 0; i < J; i++){ - rV[i] = V[i * C + tx_]; - } +template +__device__ __inline__ void dread_V_gsm2reg(const int C, const int tx_, const CeedScalar* V, CeedScalar rV[J]) { + for (int i = 0; i < J; i++) { + rV[i] = V[i * C + tx_]; + } } ////////////////////////////////////////////////////////////////////////////////////////// // writes a slice of V from reg to shared/global memory // the correct pointer V must be precomputed -template -__device__ __inline__ void -dwrite_V_reg2gsm( - const int 
C, const int tx_, - CeedScalar rV[J], CeedScalar* V ) -{ - for(int i = 0; i < J; i++){ - V[i * C + tx_] = rV[i]; - } +template +__device__ __inline__ void dwrite_V_reg2gsm(const int C, const int tx_, CeedScalar rV[J], CeedScalar* V) { + for (int i = 0; i < J; i++) { + V[i * C + tx_] = rV[i]; + } } ////////////////////////////////////////////////////////////////////////////////////////// // multiply a slice of U times T to produce a slice of V -template -__device__ __inline__ void -dgemm_slice( - CeedScalar alpha, CeedScalar *sT, - CeedScalar rU[B], CeedScalar beta, CeedScalar rV[J] ) -{ - CeedScalar rTmp; - for(int j = 0; j < J; j++) { - rTmp = 0.0; - for(int b = 0; b < B; b++){ - rTmp += rU[ b ] * sT[ j * B + b ]; - } - rV[ j ] *= beta; - rV[ j ] += alpha * rTmp; +template +__device__ __inline__ void dgemm_slice(CeedScalar alpha, CeedScalar* sT, CeedScalar rU[B], CeedScalar beta, CeedScalar rV[J]) { + CeedScalar rTmp; + for (int j = 0; j < J; j++) { + rTmp = 0.0; + for (int b = 0; b < B; b++) { + rTmp += rU[b] * sT[j * B + b]; } + rV[j] *= beta; + rV[j] += alpha * rTmp; + } } ////////////////////////////////////////////////////////////////////////////////////////// -template -__device__ __inline__ void -dgemm_ceed_device( const int tx, const int A, const int C, magma_trans_t transT, CeedScalar *sT, - const CeedScalar alpha, const CeedScalar beta, - const CeedScalar *dU, CeedScalar *dV, - CeedScalar rU[B], CeedScalar rV[J]) -{ - const int tx_ = tx % C; - const int slice_id = tx / C; +template +__device__ __inline__ void dgemm_ceed_device(const int tx, const int A, const int C, magma_trans_t transT, CeedScalar* sT, const CeedScalar alpha, + const CeedScalar beta, const CeedScalar* dU, CeedScalar* dV, CeedScalar rU[B], CeedScalar rV[J]) { + const int tx_ = tx % C; + const int slice_id = tx / C; - // advance pointers for U and V - dU += slice_id * C * B; - dV += slice_id * C * J; + // advance pointers for U and V + dU += slice_id * C * B; + dV += slice_id * C * J; - 
// read V if beta is non-zero - if ( beta != 0.0 ) { - dread_V_gsm2reg(C, tx_, (const CeedScalar*)dV, rV); - } + // read V if beta is non-zero + if (beta != 0.0) { + dread_V_gsm2reg(C, tx_, (const CeedScalar*)dV, rV); + } - // read U - dread_U_gsm2reg(C, tx_, dU, rU); + // read U + dread_U_gsm2reg(C, tx_, dU, rU); - // multiply - dgemm_slice(alpha, sT, rU, beta, rV); + // multiply + dgemm_slice(alpha, sT, rU, beta, rV); - // write V back - dwrite_V_reg2gsm(C, tx_, rV, dV ); -} + // write V back + dwrite_V_reg2gsm(C, tx_, rV, dV); +} -#endif // CEED_MAGMA_COMMON_DEVICE_H +#endif // CEED_MAGMA_COMMON_DEVICE_H diff --git a/include/ceed/jit-source/magma/weight-1d.h b/include/ceed/jit-source/magma/weight-1d.h index c70ce3cb5e..93a671edec 100644 --- a/include/ceed/jit-source/magma/weight-1d.h +++ b/include/ceed/jit-source/magma/weight-1d.h @@ -7,48 +7,44 @@ ////////////////////////////////////////////////////////////////////////////////////////// // weight basis action -- 1D -template -__device__ __inline__ void -magma_weight_1d_device(const T* sTweight, T* sV, const int tx) -{ - // Assumptions - // 1. 1D thread configuration of size Q_ - // 2. The output sV is in shared memory -- size 1xQ_ - if (tx < Q_){ - sV[tx] = sTweight[tx]; - } +template +__device__ __inline__ void magma_weight_1d_device(const T* sTweight, T* sV, const int tx) { + // Assumptions + // 1. 1D thread configuration of size Q_ + // 2. 
The output sV is in shared memory -- size 1xQ_ + if (tx < Q_) { + sV[tx] = sTweight[tx]; + } } ////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(Q, MAGMA_MAXTHREADS_1D)) __global__ void -magma_weight_1d_kernel(const CeedScalar *dqweight1d, CeedScalar *dV, const int v_stride, const int nelem) -{ - MAGMA_DEVICE_SHARED(CeedScalar, shared_data) +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(Q, MAGMA_MAXTHREADS_1D)) __global__ + void magma_weight_1d_kernel(const CeedScalar* dqweight1d, CeedScalar* dV, const int v_stride, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; - if (elem_id >= nelem) return; + if (elem_id >= nelem) return; - // global memory pointers - dV += elem_id * v_stride; + // global memory pointers + dV += elem_id * v_stride; - // shared memory pointers - CeedScalar* sTweight = (CeedScalar*)shared_data; - CeedScalar* sV = sTweight + Q; - sV += ty * Q; + // shared memory pointers + CeedScalar* sTweight = (CeedScalar*)shared_data; + CeedScalar* sV = sTweight + Q; + sV += ty * Q; - // read dqweight_1d - if (ty == 0 && tx < Q) { - sTweight[tx] = dqweight1d[tx]; - } + // read dqweight_1d + if (ty == 0 && tx < Q) { + sTweight[tx] = dqweight1d[tx]; + } - __syncthreads(); - magma_weight_1d_device(sTweight, sV, tx); - __syncthreads(); + __syncthreads(); + magma_weight_1d_device(sTweight, sV, tx); + __syncthreads(); - // write V - dV[ tx ] = sV[ tx ]; + // write V + dV[tx] = sV[tx]; } - diff --git a/include/ceed/jit-source/magma/weight-2d.h b/include/ceed/jit-source/magma/weight-2d.h index 8ae8530967..23c56d1574 100644 --- a/include/ceed/jit-source/magma/weight-2d.h +++ b/include/ceed/jit-source/magma/weight-2d.h @@ -7,57 +7,54 @@ 
////////////////////////////////////////////////////////////////////////////////////////// // weight basis action -- 2D -template -__device__ __inline__ void -magma_weight_2d_device(const T* sTweight, T rV[DIM_][NCOMP_][Q_], const int tx) -{ - // Assumptions - // 1. 1D thread configuration of size Q_ - // 2. rV[][][] matches the storage used in other actions (interp, grad, ... etc) - // 3. iDIM and iCOMP specify which indexes to use in rV, - // since the output per thread is a register array of size Q_ - // 4. Sync is recommended after the call (to make sure sTweight can be overwritten) - - if (tx < Q_) { - // x sTweight[j] for first update - // x sTweight[tx] for second update - for(int j = 0; j < Q_; j++) { - rV[iDIM][iCOMP][j] = sTweight[j] * sTweight[tx]; - } +template +__device__ __inline__ void magma_weight_2d_device(const T* sTweight, T rV[DIM_][NCOMP_][Q_], const int tx) { + // Assumptions + // 1. 1D thread configuration of size Q_ + // 2. rV[][][] matches the storage used in other actions (interp, grad, ... etc) + // 3. iDIM and iCOMP specify which indexes to use in rV, + // since the output per thread is a register array of size Q_ + // 4. 
Sync is recommended after the call (to make sure sTweight can be overwritten) + + if (tx < Q_) { + // x sTweight[j] for first update + // x sTweight[tx] for second update + for (int j = 0; j < Q_; j++) { + rV[iDIM][iCOMP][j] = sTweight[j] * sTweight[tx]; } + } } ////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(Q, MAGMA_MAXTHREADS_2D)) __global__ void -magma_weight_2d_kernel(const CeedScalar *dqweight1d, CeedScalar *dV, const int v_stride, const int nelem) -{ - MAGMA_DEVICE_SHARED(CeedScalar, shared_data) +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(Q, MAGMA_MAXTHREADS_2D)) __global__ + void magma_weight_2d_kernel(const CeedScalar* dqweight1d, CeedScalar* dV, const int v_stride, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; - if (elem_id >= nelem) return; + if (elem_id >= nelem) return; - CeedScalar rV[1][1][Q]; // allocate with DIM=NCOMP=1, but sizes may differ for a fused operator - // global memory pointers - dV += elem_id * v_stride; + CeedScalar rV[1][1][Q]; // allocate with DIM=NCOMP=1, but sizes may differ for a fused operator + // global memory pointers + dV += elem_id * v_stride; - // shared memory pointers - CeedScalar* sTweight = (CeedScalar*)shared_data; + // shared memory pointers + CeedScalar* sTweight = (CeedScalar*)shared_data; - // read dqweight_1d - if (ty == 0 && tx < Q) { - sTweight[tx] = dqweight1d[tx]; - } + // read dqweight_1d + if (ty == 0 && tx < Q) { + sTweight[tx] = dqweight1d[tx]; + } - __syncthreads(); - magma_weight_2d_device(sTweight, rV, tx); + __syncthreads(); + magma_weight_2d_device(sTweight, rV, tx); - // write V - if (tx < Q) { - for(int j = 0; j < Q; j++) { - dV[ j*Q + tx ] = rV[0][0][j]; - } 
+ // write V + if (tx < Q) { + for (int j = 0; j < Q; j++) { + dV[j * Q + tx] = rV[0][0][j]; } + } } diff --git a/include/ceed/jit-source/magma/weight-3d.h b/include/ceed/jit-source/magma/weight-3d.h index de5a253d90..98ac14d2fc 100644 --- a/include/ceed/jit-source/magma/weight-3d.h +++ b/include/ceed/jit-source/magma/weight-3d.h @@ -7,58 +7,55 @@ ////////////////////////////////////////////////////////////////////////////////////////// // weight basis action -- 3D -template -__device__ __inline__ void -magma_weight_3d_device(const T* sTweight, T rV[DIM_][NCOMP_][Q_], const int tx) -{ - // Assumptions - // 1. 1D thread configuration of size Q_^2 - // 2. rV[][][] matches the storage used in other actions (interp, grad, ... etc) - // 3. iDIM and iCOMP specify which indexes to use in rV, - // since the output per thread is a register array of size Q_ - // 4. Sync is recommended after the call (to make sure sTweight can be overwritten) - - if (tx < (Q_*Q_)) { - // x sTweight[j] for first update - // x sTweight[tx%Q_] for second update - // x sTweight[tx/Q_] for third update - for(int j = 0; j < Q_; j++) { - rV[iDIM][iCOMP][j] = sTweight[j] * sTweight[tx%Q_] * sTweight[tx/Q_]; - } +template +__device__ __inline__ void magma_weight_3d_device(const T* sTweight, T rV[DIM_][NCOMP_][Q_], const int tx) { + // Assumptions + // 1. 1D thread configuration of size Q_^2 + // 2. rV[][][] matches the storage used in other actions (interp, grad, ... etc) + // 3. iDIM and iCOMP specify which indexes to use in rV, + // since the output per thread is a register array of size Q_ + // 4. 
Sync is recommended after the call (to make sure sTweight can be overwritten) + + if (tx < (Q_ * Q_)) { + // x sTweight[j] for first update + // x sTweight[tx%Q_] for second update + // x sTweight[tx/Q_] for third update + for (int j = 0; j < Q_; j++) { + rV[iDIM][iCOMP][j] = sTweight[j] * sTweight[tx % Q_] * sTweight[tx / Q_]; } + } } ////////////////////////////////////////////////////////////////////////////////////////// -extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(Q*Q, MAGMA_MAXTHREADS_3D)) __global__ void -magma_weight_3d_kernel(const CeedScalar *dqweight1d, CeedScalar *dV, const int v_stride, const int nelem) -{ - MAGMA_DEVICE_SHARED(CeedScalar, shared_data) +extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(Q* Q, MAGMA_MAXTHREADS_3D)) __global__ + void magma_weight_3d_kernel(const CeedScalar* dqweight1d, CeedScalar* dV, const int v_stride, const int nelem) { + MAGMA_DEVICE_SHARED(CeedScalar, shared_data) - const int tx = threadIdx.x; - const int ty = threadIdx.y; - const int elem_id = (blockIdx.x * blockDim.y) + ty; + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int elem_id = (blockIdx.x * blockDim.y) + ty; - if (elem_id >= nelem) return; + if (elem_id >= nelem) return; - CeedScalar rV[1][1][Q]; // allocate with DIM=NCOMP=1, but sizes may differ for a fused operator - // global memory pointers - dV += elem_id * v_stride; + CeedScalar rV[1][1][Q]; // allocate with DIM=NCOMP=1, but sizes may differ for a fused operator + // global memory pointers + dV += elem_id * v_stride; - // shared memory pointers - CeedScalar* sTweight = (CeedScalar*)shared_data; + // shared memory pointers + CeedScalar* sTweight = (CeedScalar*)shared_data; - // read dqweight_1d - if (tx < Q) { - sTweight[tx] = dqweight1d[tx]; - } - __syncthreads(); + // read dqweight_1d + if (tx < Q) { + sTweight[tx] = dqweight1d[tx]; + } + __syncthreads(); - magma_weight_3d_device(sTweight, rV, tx); + magma_weight_3d_device(sTweight, rV, tx); - // write V - if (tx < (Q*Q)) { - 
for(int j = 0; j < Q; j++) { - dV[ j*(Q*Q) + tx ] = rV[0][0][j]; - } + // write V + if (tx < (Q * Q)) { + for (int j = 0; j < Q; j++) { + dV[j * (Q * Q) + tx] = rV[0][0][j]; } + } } diff --git a/include/ceed/jit-tools.h b/include/ceed/jit-tools.h index f951afc440..1d53a8125e 100644 --- a/include/ceed/jit-tools.h +++ b/include/ceed/jit-tools.h @@ -22,11 +22,8 @@ CEED_EXTERN int CeedCheckFilePath(Ceed ceed, const char *source_file_path, bool *is_valid); CEED_EXTERN int CeedLoadSourceToBuffer(Ceed ceed, const char *source_file_path, char **buffer); CEED_EXTERN int CeedLoadSourceToInitializedBuffer(Ceed ceed, const char *source_file_path, char **buffer); -CEED_EXTERN int CeedPathConcatenate(Ceed ceed, const char *base_file_path, - const char *relative_file_path, char **new_file_path); -CEED_EXTERN int CeedGetJitRelativePath(const char *absolute_file_path, - const char **relative_file_path); -CEED_EXTERN int CeedGetJitAbsolutePath(Ceed ceed, const char *relative_file_path, - char **absolute_file_path); +CEED_EXTERN int CeedPathConcatenate(Ceed ceed, const char *base_file_path, const char *relative_file_path, char **new_file_path); +CEED_EXTERN int CeedGetJitRelativePath(const char *absolute_file_path, const char **relative_file_path); +CEED_EXTERN int CeedGetJitAbsolutePath(Ceed ceed, const char *relative_file_path, char **absolute_file_path); #endif diff --git a/include/ceed/khash.h b/include/ceed/khash.h index 3a3dd4d91d..6c525716a5 100644 --- a/include/ceed/khash.h +++ b/include/ceed/khash.h @@ -24,62 +24,62 @@ #include "khash.h" KHASH_MAP_INIT_INT(32, char) int main() { - int ret, is_missing; - khiter_t k; - khash_t(32) *h = kh_init(32); - k = kh_put(32, h, 5, &ret); - kh_value(h, k) = 10; - k = kh_get(32, h, 10); - is_missing = (k == kh_end(h)); - k = kh_get(32, h, 5); - kh_del(32, h, k); - for (k = kh_begin(h); k != kh_end(h); ++k) - if (kh_exist(h, k)) kh_value(h, k) = 1; - kh_destroy(32, h); - return 0; + int ret, is_missing; + khiter_t k; + khash_t(32) *h = 
kh_init(32); + k = kh_put(32, h, 5, &ret); + kh_value(h, k) = 10; + k = kh_get(32, h, 10); + is_missing = (k == kh_end(h)); + k = kh_get(32, h, 5); + kh_del(32, h, k); + for (k = kh_begin(h); k != kh_end(h); ++k) + if (kh_exist(h, k)) kh_value(h, k) = 1; + kh_destroy(32, h); + return 0; } */ /* 2013-05-02 (0.2.8): - * Use quadratic probing. When the capacity is power of 2, stepping function - i*(i+1)/2 guarantees to traverse each bucket. It is better than double - hashing on cache performance and is more robust than linear probing. - In theory, double hashing should be more robust than quadratic probing. - However, my implementation is probably not for large hash tables, because - the second hash function is closely tied to the first hash function, - which reduce the effectiveness of double hashing. - Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php + * Use quadratic probing. When the capacity is power of 2, stepping function + i*(i+1)/2 guarantees to traverse each bucket. It is better than double + hashing on cache performance and is more robust than linear probing. + In theory, double hashing should be more robust than quadratic probing. + However, my implementation is probably not for large hash tables, because + the second hash function is closely tied to the first hash function, + which reduce the effectiveness of double hashing. + Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php 2011-12-29 (0.2.7): * Minor code clean up; no actual effect. 2011-09-16 (0.2.6): - * The capacity is a power of 2. This seems to dramatically improve the - speed for simple keys. Thank Zilong Tan for the suggestion. Reference: - - http://code.google.com/p/ulib/ - - http://nothings.org/computer/judy/ - * Allow to optionally use linear probing which usually has better - performance for random input. Double hashing is still the default as it - is more robust to certain non-random input. 
- * Added Wang's integer hash function (not used by default). This hash - function is more robust to certain non-random input. + * The capacity is a power of 2. This seems to dramatically improve the + speed for simple keys. Thank Zilong Tan for the suggestion. Reference: + - http://code.google.com/p/ulib/ + - http://nothings.org/computer/judy/ + * Allow to optionally use linear probing which usually has better + performance for random input. Double hashing is still the default as it + is more robust to certain non-random input. + * Added Wang's integer hash function (not used by default). This hash + function is more robust to certain non-random input. 2011-02-14 (0.2.5): * Allow to declare global functions. 2009-09-26 (0.2.4): * Improve portability 2008-09-19 (0.2.3): - * Corrected the example - * Improved interfaces + * Corrected the example + * Improved interfaces 2008-09-11 (0.2.2): - * Improved speed a little in kh_put() + * Improved speed a little in kh_put() 2008-09-10 (0.2.1): - * Added kh_clear() - * Fixed a compiling error + * Added kh_clear() + * Fixed a compiling error 2008-09-02 (0.2.0): - * Changed to token concatenation which increases flexibility. + * Changed to token concatenation which increases flexibility. 2008-08-31 (0.1.2): - * Fixed a bug in kh_get(), which has not been tested previously. + * Fixed a bug in kh_get(), which has not been tested previously. 
2008-08-31 (0.1.1): - * Added destructor + * Added destructor */ #ifndef __AC_KHASH_H @@ -92,16 +92,16 @@ int main() { #define AC_VERSION_KHASH_H "0.2.8" +#include #include #include -#include /* compiler specific configuration */ #if UINT_MAX == 0xffffffffu typedef unsigned int khint32_t; #elif ULONG_MAX == 0xffffffffu -typedef unsigned long khint32_t; +typedef unsigned long khint32_t; #endif #if ULONG_MAX == ULLONG_MAX @@ -120,37 +120,37 @@ typedef unsigned long long khint64_t; #ifndef klib_unused #if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3) -#define klib_unused __attribute__ ((__unused__)) +#define klib_unused __attribute__((__unused__)) #else #define klib_unused #endif #endif /* klib_unused */ typedef khint32_t khint_t; -typedef khint_t khiter_t; +typedef khint_t khiter_t; -#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) -#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) -#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) -#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) -#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) -#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) -#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) +#define __ac_isempty(flag, i) ((flag[i >> 4] >> ((i & 0xfU) << 1)) & 2) +#define __ac_isdel(flag, i) ((flag[i >> 4] >> ((i & 0xfU) << 1)) & 1) +#define __ac_iseither(flag, i) ((flag[i >> 4] >> ((i & 0xfU) << 1)) & 3) +#define __ac_set_isdel_false(flag, i) (flag[i >> 4] &= ~(1ul << ((i & 0xfU) << 1))) +#define __ac_set_isempty_false(flag, i) (flag[i >> 4] &= ~(2ul << ((i & 0xfU) << 1))) +#define __ac_set_isboth_false(flag, i) (flag[i >> 4] &= ~(3ul << ((i & 0xfU) << 1))) +#define __ac_set_isdel_true(flag, i) (flag[i >> 4] |= 1ul << ((i & 0xfU) << 1)) -#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4) +#define __ac_fsize(m) ((m) < 16 ? 
1 : (m) >> 4) #ifndef kroundup32 -#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#define kroundup32(x) (--(x), (x) |= (x) >> 1, (x) |= (x) >> 2, (x) |= (x) >> 4, (x) |= (x) >> 8, (x) |= (x) >> 16, ++(x)) #endif #ifndef kcalloc -#define kcalloc(N,Z) calloc(N,Z) +#define kcalloc(N, Z) calloc(N, Z) #endif #ifndef kmalloc #define kmalloc(Z) malloc(Z) #endif #ifndef krealloc -#define krealloc(P,Z) realloc(P,Z) +#define krealloc(P, Z) realloc(P, Z) #endif #ifndef kfree #define kfree(P) free(P) @@ -158,179 +158,197 @@ typedef khint_t khiter_t; static const double __ac_HASH_UPPER = 0.77; -#define __KHASH_TYPE(name, khkey_t, khval_t) \ - typedef struct kh_##name##_s { \ - khint_t n_buckets, size, n_occupied, upper_bound; \ - khint32_t *flags; \ - khkey_t *keys; \ - khval_t *vals; \ - } kh_##name##_t; - -#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \ - extern kh_##name##_t *kh_init_##name(void); \ - extern void kh_destroy_##name(kh_##name##_t *h); \ - extern void kh_clear_##name(kh_##name##_t *h); \ - extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ - extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ - extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ - extern void kh_del_##name(kh_##name##_t *h, khint_t x); - -#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ - SCOPE kh_##name##_t *kh_init_##name(void) { \ - return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \ - } \ - SCOPE void kh_destroy_##name(kh_##name##_t *h) \ - { \ - if (h) { \ - kfree((void *)h->keys); kfree(h->flags); \ - kfree((void *)h->vals); \ - kfree(h); \ - } \ - } \ - SCOPE void kh_clear_##name(kh_##name##_t *h) \ - { \ - if (h && h->flags) { \ - memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \ - h->size = h->n_occupied = 0; \ - } \ - } \ - SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ - { \ 
- if (h->n_buckets) { \ - khint_t k, i, last, mask, step = 0; \ - mask = h->n_buckets - 1; \ - k = __hash_func(key); i = k & mask; \ - last = i; \ - while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ - i = (i + (++step)) & mask; \ - if (i == last) return h->n_buckets; \ - } \ - return __ac_iseither(h->flags, i)? h->n_buckets : i; \ - } else return 0; \ - } \ - SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ - { /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \ - khint32_t *new_flags = 0; \ - khint_t j = 1; \ - { \ - kroundup32(new_n_buckets); \ - if (new_n_buckets < 4) new_n_buckets = 4; \ - if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \ - else { /* hash table size to be changed (shrink or expand); rehash */ \ - new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ - if (!new_flags) return -1; \ - memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ - if (h->n_buckets < new_n_buckets) { /* expand */ \ - khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ - if (!new_keys) { kfree(new_flags); return -1; } \ - h->keys = new_keys; \ - if (kh_is_map) { \ - khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ - if (!new_vals) { kfree(new_flags); return -1; } \ - h->vals = new_vals; \ - } \ - } /* otherwise shrink */ \ - } \ - } \ - if (j) { /* rehashing is needed */ \ - for (j = 0; j != h->n_buckets; ++j) { \ - if (__ac_iseither(h->flags, j) == 0) { \ - khkey_t key = h->keys[j]; \ - khval_t val; \ - khint_t new_mask; \ - new_mask = new_n_buckets - 1; \ - if (kh_is_map) val = h->vals[j]; \ - __ac_set_isdel_true(h->flags, j); \ - while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ - khint_t k, i, step = 0; \ - k = __hash_func(key); \ - i 
= k & new_mask; \ - while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \ - __ac_set_isempty_false(new_flags, i); \ - if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \ - { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ - if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ - __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \ - } else { /* write the element and jump out of the loop */ \ - h->keys[i] = key; \ - if (kh_is_map) h->vals[i] = val; \ - break; \ - } \ - } \ - } \ - } \ - if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \ - h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ - if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ - } \ - kfree(h->flags); /* free the working space */ \ - h->flags = new_flags; \ - h->n_buckets = new_n_buckets; \ - h->n_occupied = h->size; \ - h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ - } \ - return 0; \ - } \ - SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ - { \ - khint_t x; \ - if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \ - if (h->n_buckets > (h->size<<1)) { \ - if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \ - *ret = -1; return h->n_buckets; \ - } \ - } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \ - *ret = -1; return h->n_buckets; \ - } \ - } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ - { \ - khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \ - x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \ - if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \ - else { \ - last = i; \ - while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ - if 
(__ac_isdel(h->flags, i)) site = i; \ - i = (i + (++step)) & mask; \ - if (i == last) { x = site; break; } \ - } \ - if (x == h->n_buckets) { \ - if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \ - else x = i; \ - } \ - } \ - } \ - if (__ac_isempty(h->flags, x)) { /* not present at all */ \ - h->keys[x] = key; \ - __ac_set_isboth_false(h->flags, x); \ - ++h->size; ++h->n_occupied; \ - *ret = 1; \ - } else if (__ac_isdel(h->flags, x)) { /* deleted */ \ - h->keys[x] = key; \ - __ac_set_isboth_false(h->flags, x); \ - ++h->size; \ - *ret = 2; \ - } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \ - return x; \ - } \ - SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \ - { \ - if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ - __ac_set_isdel_true(h->flags, x); \ - --h->size; \ - } \ - } - -#define KHASH_DECLARE(name, khkey_t, khval_t) \ - __KHASH_TYPE(name, khkey_t, khval_t) \ - __KHASH_PROTOTYPES(name, khkey_t, khval_t) +#define __KHASH_TYPE(name, khkey_t, khval_t) \ + typedef struct kh_##name##_s { \ + khint_t n_buckets, size, n_occupied, upper_bound; \ + khint32_t *flags; \ + khkey_t *keys; \ + khval_t *vals; \ + } kh_##name##_t; + +#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \ + extern kh_##name##_t *kh_init_##name(void); \ + extern void kh_destroy_##name(kh_##name##_t *h); \ + extern void kh_clear_##name(kh_##name##_t *h); \ + extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ + extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ + extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ + extern void kh_del_##name(kh_##name##_t *h, khint_t x); + +#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + SCOPE kh_##name##_t *kh_init_##name(void) { return (kh_##name##_t *)kcalloc(1, sizeof(kh_##name##_t)); } \ + SCOPE void kh_destroy_##name(kh_##name##_t *h) { \ + if (h) { \ + kfree((void *)h->keys); \ + 
kfree(h->flags); \ + kfree((void *)h->vals); \ + kfree(h); \ + } \ + } \ + SCOPE void kh_clear_##name(kh_##name##_t *h) { \ + if (h && h->flags) { \ + memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \ + h->size = h->n_occupied = 0; \ + } \ + } \ + SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) { \ + if (h->n_buckets) { \ + khint_t k, i, last, mask, step = 0; \ + mask = h->n_buckets - 1; \ + k = __hash_func(key); \ + i = k & mask; \ + last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + i = (i + (++step)) & mask; \ + if (i == last) return h->n_buckets; \ + } \ + return __ac_iseither(h->flags, i) ? h->n_buckets : i; \ + } else return 0; \ + } \ + SCOPE int kh_resize_##name( \ + kh_##name##_t *h, \ + khint_t new_n_buckets) { /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \ + khint32_t *new_flags = 0; \ + khint_t j = 1; \ + { \ + kroundup32(new_n_buckets); \ + if (new_n_buckets < 4) new_n_buckets = 4; \ + if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \ + else { /* hash table size to be changed (shrink or expand); rehash */ \ + new_flags = (khint32_t *)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + if (!new_flags) return -1; \ + memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + if (h->n_buckets < new_n_buckets) { /* expand */ \ + khkey_t *new_keys = (khkey_t *)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (!new_keys) { \ + kfree(new_flags); \ + return -1; \ + } \ + h->keys = new_keys; \ + if (kh_is_map) { \ + khval_t *new_vals = (khval_t *)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ + if (!new_vals) { \ + kfree(new_flags); \ + return -1; \ + } \ + h->vals = new_vals; \ + } \ + } /* otherwise shrink */ \ + } \ + } \ + if (j) { /* rehashing is needed */ \ + for (j = 0; j 
!= h->n_buckets; ++j) { \ + if (__ac_iseither(h->flags, j) == 0) { \ + khkey_t key = h->keys[j]; \ + khval_t val; \ + khint_t new_mask; \ + new_mask = new_n_buckets - 1; \ + if (kh_is_map) val = h->vals[j]; \ + __ac_set_isdel_true(h->flags, j); \ + while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ + khint_t k, i, step = 0; \ + k = __hash_func(key); \ + i = k & new_mask; \ + while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \ + __ac_set_isempty_false(new_flags, i); \ + if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \ + { \ + khkey_t tmp = h->keys[i]; \ + h->keys[i] = key; \ + key = tmp; \ + } \ + if (kh_is_map) { \ + khval_t tmp = h->vals[i]; \ + h->vals[i] = val; \ + val = tmp; \ + } \ + __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \ + } else { /* write the element and jump out of the loop */ \ + h->keys[i] = key; \ + if (kh_is_map) h->vals[i] = val; \ + break; \ + } \ + } \ + } \ + } \ + if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \ + h->keys = (khkey_t *)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) h->vals = (khval_t *)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ + } \ + kfree(h->flags); /* free the working space */ \ + h->flags = new_flags; \ + h->n_buckets = new_n_buckets; \ + h->n_occupied = h->size; \ + h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ + } \ + return 0; \ + } \ + SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) { \ + khint_t x; \ + if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \ + if (h->n_buckets > (h->size << 1)) { \ + if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \ + *ret = -1; \ + return h->n_buckets; \ + } \ + } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \ + *ret = -1; \ + return h->n_buckets; \ + } \ + } /* TODO: to 
implement automatically shrinking; resize() already support shrinking */ \ + { \ + khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \ + x = site = h->n_buckets; \ + k = __hash_func(key); \ + i = k & mask; \ + if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \ + else { \ + last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + if (__ac_isdel(h->flags, i)) site = i; \ + i = (i + (++step)) & mask; \ + if (i == last) { \ + x = site; \ + break; \ + } \ + } \ + if (x == h->n_buckets) { \ + if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \ + else x = i; \ + } \ + } \ + } \ + if (__ac_isempty(h->flags, x)) { /* not present at all */ \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; \ + ++h->n_occupied; \ + *ret = 1; \ + } else if (__ac_isdel(h->flags, x)) { /* deleted */ \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; \ + *ret = 2; \ + } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \ + return x; \ + } \ + SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) { \ + if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ + __ac_set_isdel_true(h->flags, x); \ + --h->size; \ + } \ + } + +#define KHASH_DECLARE(name, khkey_t, khval_t) \ + __KHASH_TYPE(name, khkey_t, khval_t) \ + __KHASH_PROTOTYPES(name, khkey_t, khval_t) #define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ - __KHASH_TYPE(name, khkey_t, khval_t) \ - __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + __KHASH_TYPE(name, khkey_t, khval_t) \ + __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) #define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ - KHASH_INIT2(name, static kh_inline klib_unused, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + KHASH_INIT2(name, static kh_inline klib_unused, khkey_t, 
khval_t, kh_is_map, __hash_func, __hash_equal) /* --- BEGIN OF HASH FUNCTIONS --- */ @@ -349,7 +367,7 @@ static const double __ac_HASH_UPPER = 0.77; @param key The integer [khint64_t] @return The hash value [khint_t] */ -#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11) +#define kh_int64_hash_func(key) (khint32_t)((key) >> 33 ^ (key) ^ (key) << 11) /*! @function @abstract 64-bit integer comparison function */ @@ -361,7 +379,8 @@ static const double __ac_HASH_UPPER = 0.77; */ static kh_inline khint_t __ac_X31_hash_string(const char *s) { khint_t h = (khint_t)*s; - if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s; + if (h) + for (++s; *s; ++s) h = (h << 5) - h + (khint_t)*s; return h; } /*! @function @@ -377,11 +396,11 @@ static kh_inline khint_t __ac_X31_hash_string(const char *s) { static kh_inline khint_t __ac_Wang_hash(khint_t key) { key += ~(key << 15); - key ^= (key >> 10); - key += (key << 3); - key ^= (key >> 6); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); key += ~(key << 11); - key ^= (key >> 16); + key ^= (key >> 16); return key; } #define kh_int_hash_func2(key) __ac_Wang_hash((khint_t)key) @@ -433,7 +452,7 @@ static kh_inline khint_t __ac_Wang_hash(khint_t key) { @param r Extra return code: -1 if the operation failed; 0 if the key is present in the hash table; 1 if the bucket is empty (never used); 2 if the element in - the bucket has been deleted [int*] + the bucket has been deleted [int*] @return Iterator to the inserted element [khint_t] */ #define kh_put(name, h, k, r) kh_put_##name(h, k, r) @@ -520,13 +539,16 @@ static kh_inline khint_t __ac_Wang_hash(khint_t key) { @param vvar Variable to which value will be assigned @param code Block of code to execute */ -#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \ - for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ - if (!kh_exist(h,__i)) continue; \ - (kvar) = kh_key(h,__i); \ - (vvar) = kh_val(h,__i); \ - code; \ - } } +#define kh_foreach(h, kvar, 
vvar, code) \ + { \ + khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h, __i)) continue; \ + (kvar) = kh_key(h, __i); \ + (vvar) = kh_val(h, __i); \ + code; \ + } \ + } /*! @function @abstract Iterate over the values in the hash table @@ -534,12 +556,15 @@ static kh_inline khint_t __ac_Wang_hash(khint_t key) { @param vvar Variable to which value will be assigned @param code Block of code to execute */ -#define kh_foreach_value(h, vvar, code) { khint_t __i; \ - for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ - if (!kh_exist(h,__i)) continue; \ - (vvar) = kh_val(h,__i); \ - code; \ - } } +#define kh_foreach_value(h, vvar, code) \ + { \ + khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h, __i)) continue; \ + (vvar) = kh_val(h, __i); \ + code; \ + } \ + } /* More convenient interfaces */ @@ -547,46 +572,40 @@ static kh_inline khint_t __ac_Wang_hash(khint_t key) { @abstract Instantiate a hash set containing integer keys @param name Name of the hash table [symbol] */ -#define KHASH_SET_INIT_INT(name) \ - KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) +#define KHASH_SET_INIT_INT(name) KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) /*! @function @abstract Instantiate a hash map containing integer keys @param name Name of the hash table [symbol] @param khval_t Type of values [type] */ -#define KHASH_MAP_INIT_INT(name, khval_t) \ - KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) +#define KHASH_MAP_INIT_INT(name, khval_t) KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) /*! 
@function @abstract Instantiate a hash set containing 64-bit integer keys @param name Name of the hash table [symbol] */ -#define KHASH_SET_INIT_INT64(name) \ - KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) +#define KHASH_SET_INIT_INT64(name) KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) /*! @function @abstract Instantiate a hash map containing 64-bit integer keys @param name Name of the hash table [symbol] @param khval_t Type of values [type] */ -#define KHASH_MAP_INIT_INT64(name, khval_t) \ - KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) +#define KHASH_MAP_INIT_INT64(name, khval_t) KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) typedef const char *kh_cstr_t; /*! @function @abstract Instantiate a hash map containing const char* keys @param name Name of the hash table [symbol] */ -#define KHASH_SET_INIT_STR(name) \ - KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) +#define KHASH_SET_INIT_STR(name) KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) /*! 
@function @abstract Instantiate a hash map containing const char* keys @param name Name of the hash table [symbol] @param khval_t Type of values [type] */ -#define KHASH_MAP_INIT_STR(name, khval_t) \ - KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) +#define KHASH_MAP_INIT_STR(name, khval_t) KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) #endif /* __AC_KHASH_H */ diff --git a/include/ceed/types.h b/include/ceed/types.h index f9e74e7afa..10677c670e 100644 --- a/include/ceed/types.h +++ b/include/ceed/types.h @@ -23,15 +23,15 @@ **/ #ifndef CEED_QFUNCTION_ATTR #ifndef __NO_INLINE__ -# if defined(__GNUC__) || defined(__clang__) -# define CEED_QFUNCTION_ATTR __attribute__((flatten)) -# elif defined(__INTEL_COMPILER) -# define CEED_QFUNCTION_ATTR _Pragma("forceinline") -# else -# define CEED_QFUNCTION_ATTR -# endif +#if defined(__GNUC__) || defined(__clang__) +#define CEED_QFUNCTION_ATTR __attribute__((flatten)) +#elif defined(__INTEL_COMPILER) +#define CEED_QFUNCTION_ATTR _Pragma("forceinline") #else -# define CEED_QFUNCTION_ATTR +#define CEED_QFUNCTION_ATTR +#endif +#else +#define CEED_QFUNCTION_ATTR #endif #endif @@ -43,8 +43,8 @@ source path for creating the respective User QFunction. **/ #ifndef CEED_QFUNCTION -#define CEED_QFUNCTION(name) \ - static const char name ## _loc[] = __FILE__ ":" #name; \ +#define CEED_QFUNCTION(name) \ + static const char name##_loc[] = __FILE__ ":" #name; \ CEED_QFUNCTION_ATTR static int name #endif @@ -66,7 +66,7 @@ syntax with the CUDA backends. **/ #ifndef CEED_Q_VLA -# define CEED_Q_VLA Q +#define CEED_Q_VLA Q #endif /** @@ -75,8 +75,8 @@ environment. Code generation backends may redefine this macro, as needed. 
**/ #ifndef CeedPragmaSIMD -# if defined(__INTEL_COMPILER) -# define CeedPragmaSIMD _Pragma("vector") +#if defined(__INTEL_COMPILER) +#define CeedPragmaSIMD _Pragma("vector") /// Cannot use Intel pragma ivdep because it miscompiles unpacking symmetric tensors, as in /// Poisson2DApply, where the SIMD loop body contains temporaries such as the following. /// @@ -87,13 +87,13 @@ /// /// Miscompilation with pragma ivdep observed with icc (ICC) 19.0.5.281 20190815 /// at -O2 and above. -# elif defined(__GNUC__) && __GNUC__ >= 5 -# define CeedPragmaSIMD _Pragma("GCC ivdep") -# elif defined(_OPENMP) && _OPENMP >= 201307 // OpenMP-4.0 (July, 2013) -# define CeedPragmaSIMD _Pragma("omp simd") -# else -# define CeedPragmaSIMD -# endif +#elif defined(__GNUC__) && __GNUC__ >= 5 +#define CeedPragmaSIMD _Pragma("GCC ivdep") +#elif defined(_OPENMP) && _OPENMP >= 201307 // OpenMP-4.0 (July, 2013) +#define CeedPragmaSIMD _Pragma("omp simd") +#else +#define CeedPragmaSIMD +#endif #endif /// Integer type, used for indexing @@ -114,7 +114,7 @@ typedef enum { /// Double precision CEED_SCALAR_FP64 } CeedScalarType; -/// Base scalar type for the library to use: change which header is +/// Base scalar type for the library to use: change which header is /// included to change the precision. 
#include "ceed-f64.h" @@ -128,23 +128,23 @@ typedef enum { /// @ingroup Ceed typedef enum { /// Success error code - CEED_ERROR_SUCCESS = 0, + CEED_ERROR_SUCCESS = 0, /// Minor error, generic - CEED_ERROR_MINOR = 1, + CEED_ERROR_MINOR = 1, /// Minor error, dimension mismatch in inputs - CEED_ERROR_DIMENSION = 2, + CEED_ERROR_DIMENSION = 2, /// Minor error, incomplete object setup - CEED_ERROR_INCOMPLETE = 3, + CEED_ERROR_INCOMPLETE = 3, /// Minor error, incompatible arguments/configuration CEED_ERROR_INCOMPATIBLE = 4, /// Minor error, access lock problem - CEED_ERROR_ACCESS = 5, + CEED_ERROR_ACCESS = 5, /// Major error, generic - CEED_ERROR_MAJOR = -1, + CEED_ERROR_MAJOR = -1, /// Major error, internal backend error - CEED_ERROR_BACKEND = -2, + CEED_ERROR_BACKEND = -2, /// Major error, operation unsupported by current backend - CEED_ERROR_UNSUPPORTED = -3, + CEED_ERROR_UNSUPPORTED = -3, } CeedErrorType; #endif diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c index 1e4e179554..ecabec4689 100644 --- a/interface/ceed-basis.c +++ b/interface/ceed-basis.c @@ -5,9 +5,9 @@ // // This file is part of CEED: http://github.com/ceed -#include -#include #include +#include +#include #include #include #include @@ -52,16 +52,12 @@ const CeedBasis CEED_BASIS_COLLOCATED = &ceed_basis_collocated; @ref Developer **/ -static int CeedHouseholderReflect(CeedScalar *A, const CeedScalar *v, - CeedScalar b, CeedInt m, CeedInt n, - CeedInt row, CeedInt col) { - for (CeedInt j=0; j 1) - fprintf(stream, "%12s[%" CeedInt_FMT "]:", name, i); - else - fprintf(stream, "%12s:", name); - for (CeedInt j=0; j 1E-14 ? a[i*n+j] : 0); +static int CeedScalarView(const char *name, const char *fp_fmt, CeedInt m, CeedInt n, const CeedScalar *a, FILE *stream) { + for (CeedInt i = 0; i < m; i++) { + if (m > 1) fprintf(stream, "%12s[%" CeedInt_FMT "]:", name, i); + else fprintf(stream, "%12s:", name); + for (CeedInt j = 0; j < n; j++) fprintf(stream, fp_fmt, fabs(a[i * n + j]) > 1E-14 ? 
a[i * n + j] : 0); fputs("\n", stream); } return CEED_ERROR_SUCCESS; @@ -190,91 +177,83 @@ static int CeedScalarView(const char *name, const char *fp_fmt, CeedInt m, @ref Developer **/ -static int CeedBasisCreateProjectionMatrices(CeedBasis basis_from, - CeedBasis basis_to, CeedScalar **interp_project, CeedScalar **grad_project) { - int ierr; +static int CeedBasisCreateProjectionMatrices(CeedBasis basis_from, CeedBasis basis_to, CeedScalar **interp_project, CeedScalar **grad_project) { Ceed ceed; - ierr = CeedBasisGetCeed(basis_to, &ceed); CeedChk(ierr); + CeedCall(CeedBasisGetCeed(basis_to, &ceed)); // Check for compatible quadrature spaces CeedInt Q_to, Q_from; - ierr = CeedBasisGetNumQuadraturePoints(basis_to, &Q_to); CeedChk(ierr); - ierr = CeedBasisGetNumQuadraturePoints(basis_from, &Q_from); CeedChk(ierr); - if (Q_to != Q_from) + CeedCall(CeedBasisGetNumQuadraturePoints(basis_to, &Q_to)); + CeedCall(CeedBasisGetNumQuadraturePoints(basis_from, &Q_from)); + if (Q_to != Q_from) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_DIMENSION, - "Bases must have compatible quadrature spaces"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_DIMENSION, "Bases must have compatible quadrature spaces"); + // LCOV_EXCL_STOP + } // Check for matching tensor or non-tensor CeedInt P_to, P_from, Q = Q_to; - bool is_tensor_to, is_tensor_from; - ierr = CeedBasisIsTensor(basis_to, &is_tensor_to); CeedChk(ierr); - ierr = CeedBasisIsTensor(basis_from, &is_tensor_from); CeedChk(ierr); + bool is_tensor_to, is_tensor_from; + CeedCall(CeedBasisIsTensor(basis_to, &is_tensor_to)); + CeedCall(CeedBasisIsTensor(basis_from, &is_tensor_from)); if (is_tensor_to && is_tensor_from) { - ierr = CeedBasisGetNumNodes1D(basis_to, &P_to); CeedChk(ierr); - ierr = CeedBasisGetNumNodes1D(basis_from, &P_from); CeedChk(ierr); - ierr = CeedBasisGetNumQuadraturePoints1D(basis_from, &Q); CeedChk(ierr); + CeedCall(CeedBasisGetNumNodes1D(basis_to, &P_to)); + 
CeedCall(CeedBasisGetNumNodes1D(basis_from, &P_from)); + CeedCall(CeedBasisGetNumQuadraturePoints1D(basis_from, &Q)); } else if (!is_tensor_to && !is_tensor_from) { - ierr = CeedBasisGetNumNodes(basis_to, &P_to); CeedChk(ierr); - ierr = CeedBasisGetNumNodes(basis_from, &P_from); CeedChk(ierr); + CeedCall(CeedBasisGetNumNodes(basis_to, &P_to)); + CeedCall(CeedBasisGetNumNodes(basis_from, &P_from)); } else { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_MINOR, - "Bases must both be tensor or non-tensor"); + return CeedError(ceed, CEED_ERROR_MINOR, "Bases must both be tensor or non-tensor"); // LCOV_EXCL_STOP } // Get source matrices - CeedInt dim; + CeedInt dim; CeedScalar *interp_to, *interp_from, *tau; - ierr = CeedBasisGetDimension(basis_to, &dim); CeedChk(ierr); - ierr = CeedMalloc(Q * P_from, &interp_from); CeedChk(ierr); - ierr = CeedMalloc(Q * P_to, &interp_to); CeedChk(ierr); - ierr = CeedCalloc(P_to * P_from, interp_project); CeedChk(ierr); - ierr = CeedCalloc(P_to * P_from * (is_tensor_to ? 1 : dim), grad_project); - CeedChk(ierr); - ierr = CeedMalloc(Q, &tau); CeedChk(ierr); - const CeedScalar *interp_to_source = NULL, *interp_from_source = NULL, - *grad_from_source; + CeedCall(CeedBasisGetDimension(basis_to, &dim)); + CeedCall(CeedMalloc(Q * P_from, &interp_from)); + CeedCall(CeedMalloc(Q * P_to, &interp_to)); + CeedCall(CeedCalloc(P_to * P_from, interp_project)); + CeedCall(CeedCalloc(P_to * P_from * (is_tensor_to ? 
1 : dim), grad_project)); + CeedCall(CeedMalloc(Q, &tau)); + const CeedScalar *interp_to_source = NULL, *interp_from_source = NULL, *grad_from_source; if (is_tensor_to) { - ierr = CeedBasisGetInterp1D(basis_to, &interp_to_source); CeedChk(ierr); - ierr = CeedBasisGetInterp1D(basis_from, &interp_from_source); CeedChk(ierr); - ierr = CeedBasisGetGrad1D(basis_from, &grad_from_source); CeedChk(ierr); + CeedCall(CeedBasisGetInterp1D(basis_to, &interp_to_source)); + CeedCall(CeedBasisGetInterp1D(basis_from, &interp_from_source)); + CeedCall(CeedBasisGetGrad1D(basis_from, &grad_from_source)); } else { - ierr = CeedBasisGetInterp(basis_to, &interp_to_source); CeedChk(ierr); - ierr = CeedBasisGetInterp(basis_from, &interp_from_source); CeedChk(ierr); - ierr = CeedBasisGetGrad(basis_from, &grad_from_source); CeedChk(ierr); + CeedCall(CeedBasisGetInterp(basis_to, &interp_to_source)); + CeedCall(CeedBasisGetInterp(basis_from, &interp_from_source)); + CeedCall(CeedBasisGetGrad(basis_from, &grad_from_source)); } // Build matrices - CeedInt num_matrices = 1 + (is_tensor_to ? 1 : dim); + CeedInt num_matrices = 1 + (is_tensor_to ? 
1 : dim); CeedScalar *input_from[num_matrices], *output_project[num_matrices]; - input_from[0] = (CeedScalar *)interp_from_source; + input_from[0] = (CeedScalar *)interp_from_source; output_project[0] = *interp_project; for (CeedInt m = 1; m < num_matrices; m++) { - input_from[m] = (CeedScalar *)&grad_from_source[(m - 1) * Q * P_from]; + input_from[m] = (CeedScalar *)&grad_from_source[(m - 1) * Q * P_from]; output_project[m] = &((*grad_project)[(m - 1) * P_to * P_from]); } for (CeedInt m = 0; m < num_matrices; m++) { // -- QR Factorization, interp_to = Q R memcpy(interp_to, interp_to_source, Q * P_to * sizeof(interp_to_source[0])); - ierr = CeedQRFactorization(ceed, interp_to, tau, Q, P_to); CeedChk(ierr); + CeedCall(CeedQRFactorization(ceed, interp_to, tau, Q, P_to)); // -- Apply Qtranspose, interp_to = Qtranspose interp_from memcpy(interp_from, input_from[m], Q * P_from * sizeof(input_from[m][0])); - ierr = CeedHouseholderApplyQ(interp_from, interp_to, tau, CEED_TRANSPOSE, - Q, P_from, P_to, P_from, 1); CeedChk(ierr); + CeedCall(CeedHouseholderApplyQ(interp_from, interp_to, tau, CEED_TRANSPOSE, Q, P_from, P_to, P_from, 1)); // -- Apply Rinv, interp_project = Rinv interp_c - for (CeedInt j = 0; j < P_from; j++) { // Column j - output_project[m][j + P_from * (P_to - 1)] = interp_from[j + P_from * - (P_to - 1)] / interp_to[P_to * P_to - 1]; - for (CeedInt i = P_to - 2; i >= 0; i--) { // Row i + for (CeedInt j = 0; j < P_from; j++) { // Column j + output_project[m][j + P_from * (P_to - 1)] = interp_from[j + P_from * (P_to - 1)] / interp_to[P_to * P_to - 1]; + for (CeedInt i = P_to - 2; i >= 0; i--) { // Row i output_project[m][j + P_from * i] = interp_from[j + P_from * i]; - for (CeedInt k = i+1; k < P_to; k++) { - output_project[m][j + P_from * i] -= interp_to[k + P_to * i]* - output_project[m][j + P_from * k]; + for (CeedInt k = i + 1; k < P_to; k++) { + output_project[m][j + P_from * i] -= interp_to[k + P_to * i] * output_project[m][j + P_from * k]; } 
output_project[m][j + P_from * i] /= interp_to[i + P_to * i]; } @@ -282,9 +261,9 @@ static int CeedBasisCreateProjectionMatrices(CeedBasis basis_from, } // Cleanup - ierr = CeedFree(&tau); CeedChk(ierr); - ierr = CeedFree(&interp_to); CeedChk(ierr); - ierr = CeedFree(&interp_from); CeedChk(ierr); + CeedCall(CeedFree(&tau)); + CeedCall(CeedFree(&interp_to)); + CeedCall(CeedFree(&interp_from)); return CEED_ERROR_SUCCESS; } @@ -309,43 +288,40 @@ static int CeedBasisCreateProjectionMatrices(CeedBasis basis_from, @ref Backend **/ int CeedBasisGetCollocatedGrad(CeedBasis basis, CeedScalar *collo_grad_1d) { - int i, j, k; - Ceed ceed; - CeedInt ierr, P_1d=(basis)->P_1d, Q_1d=(basis)->Q_1d; + int i, j, k; + Ceed ceed; + CeedInt P_1d = (basis)->P_1d, Q_1d = (basis)->Q_1d; CeedScalar *interp_1d, *grad_1d, *tau; - ierr = CeedMalloc(Q_1d*P_1d, &interp_1d); CeedChk(ierr); - ierr = CeedMalloc(Q_1d*P_1d, &grad_1d); CeedChk(ierr); - ierr = CeedMalloc(Q_1d, &tau); CeedChk(ierr); - memcpy(interp_1d, (basis)->interp_1d, Q_1d*P_1d*sizeof(basis)->interp_1d[0]); - memcpy(grad_1d, (basis)->grad_1d, Q_1d*P_1d*sizeof(basis)->interp_1d[0]); + CeedCall(CeedMalloc(Q_1d * P_1d, &interp_1d)); + CeedCall(CeedMalloc(Q_1d * P_1d, &grad_1d)); + CeedCall(CeedMalloc(Q_1d, &tau)); + memcpy(interp_1d, (basis)->interp_1d, Q_1d * P_1d * sizeof(basis)->interp_1d[0]); + memcpy(grad_1d, (basis)->grad_1d, Q_1d * P_1d * sizeof(basis)->interp_1d[0]); // QR Factorization, interp_1d = Q R - ierr = CeedBasisGetCeed(basis, &ceed); CeedChk(ierr); - ierr = CeedQRFactorization(ceed, interp_1d, tau, Q_1d, P_1d); CeedChk(ierr); + CeedCall(CeedBasisGetCeed(basis, &ceed)); + CeedCall(CeedQRFactorization(ceed, interp_1d, tau, Q_1d, P_1d)); // Note: This function is for backend use, so all errors are terminal // and we do not need to clean up memory on failure. 
// Apply Rinv, collo_grad_1d = grad_1d Rinv - for (i=0; iceed, CEED_ERROR_INCOMPATIBLE, - "Tensor CEED_EVAL_DIV not supported"); break; - case CEED_EVAL_CURL: - return CeedError(basis->ceed, CEED_ERROR_INCOMPATIBLE, - "Tensor CEED_EVAL_CURL not supported"); break; - // LCOV_EXCL_STOP - case CEED_EVAL_WEIGHT: *flops = dim * CeedIntPow(Q_1d, dim); break; + case CEED_EVAL_NONE: + *flops = 0; + break; + case CEED_EVAL_INTERP: + *flops = tensor_flops; + break; + case CEED_EVAL_GRAD: + *flops = tensor_flops * 2; + break; + case CEED_EVAL_DIV: + // LCOV_EXCL_START + return CeedError(basis->ceed, CEED_ERROR_INCOMPATIBLE, "Tensor CEED_EVAL_DIV not supported"); + break; + case CEED_EVAL_CURL: + return CeedError(basis->ceed, CEED_ERROR_INCOMPATIBLE, "Tensor CEED_EVAL_CURL not supported"); + break; + // LCOV_EXCL_STOP + case CEED_EVAL_WEIGHT: + *flops = dim * CeedIntPow(Q_1d, dim); + break; } } else { CeedInt dim, num_comp, num_nodes, num_qpts, Q_comp; - ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); - ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChk(ierr); - ierr = CeedBasisGetNumNodes(basis, &num_nodes); CeedChk(ierr); - ierr = CeedBasisGetNumQuadraturePoints(basis, &num_qpts); CeedChk(ierr); - ierr = CeedBasisGetNumQuadratureComponents(basis, &Q_comp); CeedChk(ierr); + CeedCall(CeedBasisGetDimension(basis, &dim)); + CeedCall(CeedBasisGetNumComponents(basis, &num_comp)); + CeedCall(CeedBasisGetNumNodes(basis, &num_nodes)); + CeedCall(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); + CeedCall(CeedBasisGetNumQuadratureComponents(basis, &Q_comp)); switch (eval_mode) { - case CEED_EVAL_NONE: *flops = 0; break; - case CEED_EVAL_INTERP: *flops = num_nodes * num_qpts * num_comp; break; - case CEED_EVAL_GRAD: *flops = num_nodes * num_qpts * num_comp * dim; break; - case CEED_EVAL_DIV: *flops = num_nodes * num_qpts; break; - case CEED_EVAL_CURL: *flops = num_nodes * num_qpts * dim; break; - case CEED_EVAL_WEIGHT: *flops = 0; break; + case CEED_EVAL_NONE: + 
*flops = 0; + break; + case CEED_EVAL_INTERP: + *flops = num_nodes * num_qpts * num_comp; + break; + case CEED_EVAL_GRAD: + *flops = num_nodes * num_qpts * num_comp * dim; + break; + case CEED_EVAL_DIV: + *flops = num_nodes * num_qpts; + break; + case CEED_EVAL_CURL: + *flops = num_nodes * num_qpts * dim; + break; + case CEED_EVAL_WEIGHT: + *flops = 0; + break; } } @@ -484,7 +479,7 @@ int CeedBasisGetFlopsEstimate(CeedBasis basis, CeedTransposeMode t_mode, @ref Backend **/ int CeedBasisGetTopologyDimension(CeedElemTopology topo, CeedInt *dim) { - *dim = (CeedInt) topo >> 16; + *dim = (CeedInt)topo >> 16; return CEED_ERROR_SUCCESS; } @@ -514,9 +509,8 @@ int CeedBasisGetTensorContract(CeedBasis basis, CeedTensorContract *contract) { @ref Backend **/ int CeedBasisSetTensorContract(CeedBasis basis, CeedTensorContract contract) { - int ierr; basis->contract = contract; - ierr = CeedTensorContractReference(contract); CeedChk(ierr); + CeedCall(CeedTensorContractReference(contract)); return CEED_ERROR_SUCCESS; } @@ -537,16 +531,14 @@ int CeedBasisSetTensorContract(CeedBasis basis, CeedTensorContract contract) { @ref Utility **/ -int CeedMatrixMatrixMultiply(Ceed ceed, const CeedScalar *mat_A, - const CeedScalar *mat_B, CeedScalar *mat_C, - CeedInt m, CeedInt n, CeedInt kk) { - for (CeedInt i=0; iBasisCreateTensorH1) { Ceed delegate; - ierr = CeedGetObjectDelegate(ceed, &delegate, "Basis"); CeedChk(ierr); + CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Basis")); - if (!delegate) + if (!delegate) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, - "Backend does not support BasisCreateTensorH1"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support BasisCreateTensorH1"); + // LCOV_EXCL_STOP + } - ierr = CeedBasisCreateTensorH1(delegate, dim, num_comp, P_1d, - Q_1d, interp_1d, grad_1d, q_ref_1d, - q_weight_1d, basis); CeedChk(ierr); + CeedCall(CeedBasisCreateTensorH1(delegate, dim, num_comp, P_1d, Q_1d, 
interp_1d, grad_1d, q_ref_1d, q_weight_1d, basis)); return CEED_ERROR_SUCCESS; } - if (dim < 1) + if (dim < 1) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_DIMENSION, - "Basis dimension must be a positive value"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_DIMENSION, "Basis dimension must be a positive value"); + // LCOV_EXCL_STOP + } - if (num_comp < 1) + if (num_comp < 1) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_DIMENSION, - "Basis must have at least 1 component"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_DIMENSION, "Basis must have at least 1 component"); + // LCOV_EXCL_STOP + } - if (P_1d < 1) + if (P_1d < 1) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_DIMENSION, - "Basis must have at least 1 node"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_DIMENSION, "Basis must have at least 1 node"); + // LCOV_EXCL_STOP + } - if (Q_1d < 1) + if (Q_1d < 1) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_DIMENSION, - "Basis must have at least 1 quadrature point"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_DIMENSION, "Basis must have at least 1 quadrature point"); + // LCOV_EXCL_STOP + } - CeedElemTopology topo = dim == 1 ? CEED_TOPOLOGY_LINE - : dim == 2 ? CEED_TOPOLOGY_QUAD - : CEED_TOPOLOGY_HEX; + CeedElemTopology topo = dim == 1 ? CEED_TOPOLOGY_LINE : dim == 2 ? 
CEED_TOPOLOGY_QUAD : CEED_TOPOLOGY_HEX; - ierr = CeedCalloc(1, basis); CeedChk(ierr); + CeedCall(CeedCalloc(1, basis)); (*basis)->ceed = ceed; - ierr = CeedReference(ceed); CeedChk(ierr); - (*basis)->ref_count = 1; + CeedCall(CeedReference(ceed)); + (*basis)->ref_count = 1; (*basis)->tensor_basis = 1; - (*basis)->dim = dim; - (*basis)->topo = topo; - (*basis)->num_comp = num_comp; - (*basis)->P_1d = P_1d; - (*basis)->Q_1d = Q_1d; - (*basis)->P = CeedIntPow(P_1d, dim); - (*basis)->Q = CeedIntPow(Q_1d, dim); - (*basis)->Q_comp = 1; - (*basis)->basis_space = 1; // 1 for H^1 space - ierr = CeedCalloc(Q_1d, &(*basis)->q_ref_1d); CeedChk(ierr); - ierr = CeedCalloc(Q_1d, &(*basis)->q_weight_1d); CeedChk(ierr); - if (q_ref_1d) memcpy((*basis)->q_ref_1d, q_ref_1d, Q_1d*sizeof(q_ref_1d[0])); - if (q_weight_1d) memcpy((*basis)->q_weight_1d, q_weight_1d, - Q_1d*sizeof(q_weight_1d[0])); - ierr = CeedCalloc(Q_1d*P_1d, &(*basis)->interp_1d); CeedChk(ierr); - ierr = CeedCalloc(Q_1d*P_1d, &(*basis)->grad_1d); CeedChk(ierr); - if (interp_1d) memcpy((*basis)->interp_1d, interp_1d, - Q_1d*P_1d*sizeof(interp_1d[0])); - if (grad_1d) memcpy((*basis)->grad_1d, grad_1d, Q_1d*P_1d*sizeof(grad_1d[0])); - ierr = ceed->BasisCreateTensorH1(dim, P_1d, Q_1d, interp_1d, grad_1d, q_ref_1d, - q_weight_1d, *basis); CeedChk(ierr); + (*basis)->dim = dim; + (*basis)->topo = topo; + (*basis)->num_comp = num_comp; + (*basis)->P_1d = P_1d; + (*basis)->Q_1d = Q_1d; + (*basis)->P = CeedIntPow(P_1d, dim); + (*basis)->Q = CeedIntPow(Q_1d, dim); + (*basis)->Q_comp = 1; + (*basis)->basis_space = 1; // 1 for H^1 space + CeedCall(CeedCalloc(Q_1d, &(*basis)->q_ref_1d)); + CeedCall(CeedCalloc(Q_1d, &(*basis)->q_weight_1d)); + if (q_ref_1d) memcpy((*basis)->q_ref_1d, q_ref_1d, Q_1d * sizeof(q_ref_1d[0])); + if (q_weight_1d) memcpy((*basis)->q_weight_1d, q_weight_1d, Q_1d * sizeof(q_weight_1d[0])); + CeedCall(CeedCalloc(Q_1d * P_1d, &(*basis)->interp_1d)); + CeedCall(CeedCalloc(Q_1d * P_1d, &(*basis)->grad_1d)); + if 
(interp_1d) memcpy((*basis)->interp_1d, interp_1d, Q_1d * P_1d * sizeof(interp_1d[0])); + if (grad_1d) memcpy((*basis)->grad_1d, grad_1d, Q_1d * P_1d * sizeof(grad_1d[0])); + CeedCall(ceed->BasisCreateTensorH1(dim, P_1d, Q_1d, interp_1d, grad_1d, q_ref_1d, q_weight_1d, *basis)); return CEED_ERROR_SUCCESS; } @@ -679,62 +659,58 @@ int CeedBasisCreateTensorH1(Ceed ceed, CeedInt dim, CeedInt num_comp, @ref User **/ -int CeedBasisCreateTensorH1Lagrange(Ceed ceed, CeedInt dim, CeedInt num_comp, - CeedInt P, CeedInt Q, CeedQuadMode quad_mode, - CeedBasis *basis) { +int CeedBasisCreateTensorH1Lagrange(Ceed ceed, CeedInt dim, CeedInt num_comp, CeedInt P, CeedInt Q, CeedQuadMode quad_mode, CeedBasis *basis) { // Allocate - int ierr, ierr2, i, j, k; - CeedScalar c1, c2, c3, c4, dx, *nodes, *interp_1d, *grad_1d, *q_ref_1d, - *q_weight_1d; + int ierr = CEED_ERROR_SUCCESS, i, j, k; + CeedScalar c1, c2, c3, c4, dx, *nodes, *interp_1d, *grad_1d, *q_ref_1d, *q_weight_1d; - if (dim < 1) + if (dim < 1) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_DIMENSION, - "Basis dimension must be a positive value"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_DIMENSION, "Basis dimension must be a positive value"); + // LCOV_EXCL_STOP + } - if (num_comp < 1) + if (num_comp < 1) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_DIMENSION, - "Basis must have at least 1 component"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_DIMENSION, "Basis must have at least 1 component"); + // LCOV_EXCL_STOP + } - if (P < 1) + if (P < 1) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_DIMENSION, - "Basis must have at least 1 node"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_DIMENSION, "Basis must have at least 1 node"); + // LCOV_EXCL_STOP + } - if (Q < 1) + if (Q < 1) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_DIMENSION, - "Basis must have at least 1 quadrature point"); - // LCOV_EXCL_STOP + return CeedError(ceed, 
CEED_ERROR_DIMENSION, "Basis must have at least 1 quadrature point"); + // LCOV_EXCL_STOP + } // Get Nodes and Weights - ierr = CeedCalloc(P*Q, &interp_1d); CeedChk(ierr); - ierr = CeedCalloc(P*Q, &grad_1d); CeedChk(ierr); - ierr = CeedCalloc(P, &nodes); CeedChk(ierr); - ierr = CeedCalloc(Q, &q_ref_1d); CeedChk(ierr); - ierr = CeedCalloc(Q, &q_weight_1d); CeedChk(ierr); - ierr = CeedLobattoQuadrature(P, nodes, NULL); - if (ierr) { goto cleanup; } CeedChk(ierr); + CeedCall(CeedCalloc(P * Q, &interp_1d)); + CeedCall(CeedCalloc(P * Q, &grad_1d)); + CeedCall(CeedCalloc(P, &nodes)); + CeedCall(CeedCalloc(Q, &q_ref_1d)); + CeedCall(CeedCalloc(Q, &q_weight_1d)); + if (CeedLobattoQuadrature(P, nodes, NULL) != CEED_ERROR_SUCCESS) goto cleanup; switch (quad_mode) { - case CEED_GAUSS: - ierr = CeedGaussQuadrature(Q, q_ref_1d, q_weight_1d); - break; - case CEED_GAUSS_LOBATTO: - ierr = CeedLobattoQuadrature(Q, q_ref_1d, q_weight_1d); - break; + case CEED_GAUSS: + ierr = CeedGaussQuadrature(Q, q_ref_1d, q_weight_1d); + break; + case CEED_GAUSS_LOBATTO: + ierr = CeedLobattoQuadrature(Q, q_ref_1d, q_weight_1d); + break; } - if (ierr) { goto cleanup; } CeedChk(ierr); + if (ierr != CEED_ERROR_SUCCESS) goto cleanup; // Build B, D matrix // Fornberg, 1998 - for (i = 0; i < Q; i++) { - c1 = 1.0; - c3 = nodes[0] - q_ref_1d[i]; - interp_1d[i*P+0] = 1.0; + for (i = 0; i < Q; i++) { + c1 = 1.0; + c3 = nodes[0] - q_ref_1d[i]; + interp_1d[i * P + 0] = 1.0; for (j = 1; j < P; j++) { c2 = 1.0; c4 = c3; @@ -743,25 +719,23 @@ int CeedBasisCreateTensorH1Lagrange(Ceed ceed, CeedInt dim, CeedInt num_comp, dx = nodes[j] - nodes[k]; c2 *= dx; if (k == j - 1) { - grad_1d[i*P + j] = c1*(interp_1d[i*P + k] - c4*grad_1d[i*P + k]) / c2; - interp_1d[i*P + j] = - c1*c4*interp_1d[i*P + k] / c2; + grad_1d[i * P + j] = c1 * (interp_1d[i * P + k] - c4 * grad_1d[i * P + k]) / c2; + interp_1d[i * P + j] = -c1 * c4 * interp_1d[i * P + k] / c2; } - grad_1d[i*P + k] = (c3*grad_1d[i*P + k] - interp_1d[i*P + k]) / dx; 
- interp_1d[i*P + k] = c3*interp_1d[i*P + k] / dx; + grad_1d[i * P + k] = (c3 * grad_1d[i * P + k] - interp_1d[i * P + k]) / dx; + interp_1d[i * P + k] = c3 * interp_1d[i * P + k] / dx; } c1 = c2; } } // Pass to CeedBasisCreateTensorH1 - ierr = CeedBasisCreateTensorH1(ceed, dim, num_comp, P, Q, interp_1d, grad_1d, - q_ref_1d, q_weight_1d, basis); CeedChk(ierr); + CeedCall(CeedBasisCreateTensorH1(ceed, dim, num_comp, P, Q, interp_1d, grad_1d, q_ref_1d, q_weight_1d, basis)); cleanup: - ierr2 = CeedFree(&interp_1d); CeedChk(ierr2); - ierr2 = CeedFree(&grad_1d); CeedChk(ierr2); - ierr2 = CeedFree(&nodes); CeedChk(ierr2); - ierr2 = CeedFree(&q_ref_1d); CeedChk(ierr2); - ierr2 = CeedFree(&q_weight_1d); CeedChk(ierr2); - CeedChk(ierr); + CeedCall(CeedFree(&interp_1d)); + CeedCall(CeedFree(&grad_1d)); + CeedCall(CeedFree(&nodes)); + CeedCall(CeedFree(&q_ref_1d)); + CeedCall(CeedFree(&q_weight_1d)); return CEED_ERROR_SUCCESS; } @@ -788,72 +762,66 @@ int CeedBasisCreateTensorH1Lagrange(Ceed ceed, CeedInt dim, CeedInt num_comp, @ref User **/ -int CeedBasisCreateH1(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, - CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, - const CeedScalar *grad, const CeedScalar *q_ref, - const CeedScalar *q_weight, CeedBasis *basis) { - int ierr; +int CeedBasisCreateH1(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, + const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis *basis) { CeedInt P = num_nodes, Q = num_qpts, dim = 0; if (!ceed->BasisCreateH1) { Ceed delegate; - ierr = CeedGetObjectDelegate(ceed, &delegate, "Basis"); CeedChk(ierr); + CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Basis")); - if (!delegate) + if (!delegate) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, - "Backend does not support BasisCreateH1"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not 
support BasisCreateH1"); + // LCOV_EXCL_STOP + } - ierr = CeedBasisCreateH1(delegate, topo, num_comp, num_nodes, - num_qpts, interp, grad, q_ref, - q_weight, basis); CeedChk(ierr); + CeedCall(CeedBasisCreateH1(delegate, topo, num_comp, num_nodes, num_qpts, interp, grad, q_ref, q_weight, basis)); return CEED_ERROR_SUCCESS; } - if (num_comp < 1) + if (num_comp < 1) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_DIMENSION, - "Basis must have at least 1 component"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_DIMENSION, "Basis must have at least 1 component"); + // LCOV_EXCL_STOP + } - if (num_nodes < 1) + if (num_nodes < 1) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_DIMENSION, - "Basis must have at least 1 node"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_DIMENSION, "Basis must have at least 1 node"); + // LCOV_EXCL_STOP + } - if (num_qpts < 1) + if (num_qpts < 1) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_DIMENSION, - "Basis must have at least 1 quadrature point"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_DIMENSION, "Basis must have at least 1 quadrature point"); + // LCOV_EXCL_STOP + } - ierr = CeedCalloc(1, basis); CeedChk(ierr); + CeedCall(CeedCalloc(1, basis)); - ierr = CeedBasisGetTopologyDimension(topo, &dim); CeedChk(ierr); + CeedCall(CeedBasisGetTopologyDimension(topo, &dim)); (*basis)->ceed = ceed; - ierr = CeedReference(ceed); CeedChk(ierr); - (*basis)->ref_count = 1; + CeedCall(CeedReference(ceed)); + (*basis)->ref_count = 1; (*basis)->tensor_basis = 0; - (*basis)->dim = dim; - (*basis)->topo = topo; - (*basis)->num_comp = num_comp; - (*basis)->P = P; - (*basis)->Q = Q; - (*basis)->Q_comp = 1; - (*basis)->basis_space = 1; // 1 for H^1 space - ierr = CeedCalloc(Q*dim, &(*basis)->q_ref_1d); CeedChk(ierr); - ierr = CeedCalloc(Q, &(*basis)->q_weight_1d); CeedChk(ierr); - if (q_ref) memcpy((*basis)->q_ref_1d, q_ref, Q*dim*sizeof(q_ref[0])); - if(q_weight) 
memcpy((*basis)->q_weight_1d, q_weight, Q*sizeof(q_weight[0])); - ierr = CeedCalloc(Q*P, &(*basis)->interp); CeedChk(ierr); - ierr = CeedCalloc(dim*Q*P, &(*basis)->grad); CeedChk(ierr); - if(interp) memcpy((*basis)->interp, interp, Q*P*sizeof(interp[0])); - if(grad) memcpy((*basis)->grad, grad, dim*Q*P*sizeof(grad[0])); - ierr = ceed->BasisCreateH1(topo, dim, P, Q, interp, grad, q_ref, - q_weight, *basis); CeedChk(ierr); + (*basis)->dim = dim; + (*basis)->topo = topo; + (*basis)->num_comp = num_comp; + (*basis)->P = P; + (*basis)->Q = Q; + (*basis)->Q_comp = 1; + (*basis)->basis_space = 1; // 1 for H^1 space + CeedCall(CeedCalloc(Q * dim, &(*basis)->q_ref_1d)); + CeedCall(CeedCalloc(Q, &(*basis)->q_weight_1d)); + if (q_ref) memcpy((*basis)->q_ref_1d, q_ref, Q * dim * sizeof(q_ref[0])); + if (q_weight) memcpy((*basis)->q_weight_1d, q_weight, Q * sizeof(q_weight[0])); + CeedCall(CeedCalloc(Q * P, &(*basis)->interp)); + CeedCall(CeedCalloc(dim * Q * P, &(*basis)->grad)); + if (interp) memcpy((*basis)->interp, interp, Q * P * sizeof(interp[0])); + if (grad) memcpy((*basis)->grad, grad, dim * Q * P * sizeof(grad[0])); + CeedCall(ceed->BasisCreateH1(topo, dim, P, Q, interp, grad, q_ref, q_weight, *basis)); return CEED_ERROR_SUCCESS; } @@ -881,70 +849,64 @@ int CeedBasisCreateH1(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, @ref User **/ -int CeedBasisCreateHdiv(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, - CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, - const CeedScalar *div, const CeedScalar *q_ref, - const CeedScalar *q_weight, CeedBasis *basis) { - int ierr; +int CeedBasisCreateHdiv(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, + const CeedScalar *div, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis *basis) { CeedInt Q = num_qpts, P = num_nodes, dim = 0; - ierr = CeedBasisGetTopologyDimension(topo, &dim); CeedChk(ierr); + 
CeedCall(CeedBasisGetTopologyDimension(topo, &dim)); if (!ceed->BasisCreateHdiv) { Ceed delegate; - ierr = CeedGetObjectDelegate(ceed, &delegate, "Basis"); CeedChk(ierr); + CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Basis")); - if (!delegate) + if (!delegate) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, - "Backend does not implement BasisCreateHdiv"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement BasisCreateHdiv"); + // LCOV_EXCL_STOP + } - ierr = CeedBasisCreateHdiv(delegate, topo, num_comp, num_nodes, - num_qpts, interp, div, q_ref, - q_weight, basis); CeedChk(ierr); + CeedCall(CeedBasisCreateHdiv(delegate, topo, num_comp, num_nodes, num_qpts, interp, div, q_ref, q_weight, basis)); return CEED_ERROR_SUCCESS; } - if (num_comp < 1) + if (num_comp < 1) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_DIMENSION, - "Basis must have at least 1 component"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_DIMENSION, "Basis must have at least 1 component"); + // LCOV_EXCL_STOP + } - if (num_nodes < 1) + if (num_nodes < 1) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_DIMENSION, - "Basis must have at least 1 node"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_DIMENSION, "Basis must have at least 1 node"); + // LCOV_EXCL_STOP + } - if (num_qpts < 1) + if (num_qpts < 1) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_DIMENSION, - "Basis must have at least 1 quadrature point"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_DIMENSION, "Basis must have at least 1 quadrature point"); + // LCOV_EXCL_STOP + } - ierr = CeedCalloc(1, basis); CeedChk(ierr); + CeedCall(CeedCalloc(1, basis)); (*basis)->ceed = ceed; - ierr = CeedReference(ceed); CeedChk(ierr); - (*basis)->ref_count = 1; + CeedCall(CeedReference(ceed)); + (*basis)->ref_count = 1; (*basis)->tensor_basis = 0; - (*basis)->dim = dim; - (*basis)->topo = topo; - (*basis)->num_comp = 
num_comp; - (*basis)->P = P; - (*basis)->Q = Q; - (*basis)->Q_comp = dim; - (*basis)->basis_space = 2; // 2 for H(div) space - ierr = CeedMalloc(Q*dim, &(*basis)->q_ref_1d); CeedChk(ierr); - ierr = CeedMalloc(Q, &(*basis)->q_weight_1d); CeedChk(ierr); - if (q_ref) memcpy((*basis)->q_ref_1d, q_ref, Q*dim*sizeof(q_ref[0])); - if (q_weight) memcpy((*basis)->q_weight_1d, q_weight, Q*sizeof(q_weight[0])); - ierr = CeedMalloc(dim*Q*P, &(*basis)->interp); CeedChk(ierr); - ierr = CeedMalloc(Q*P, &(*basis)->div); CeedChk(ierr); - if (interp) memcpy((*basis)->interp, interp, dim*Q*P*sizeof(interp[0])); - if (div) memcpy((*basis)->div, div, Q*P*sizeof(div[0])); - ierr = ceed->BasisCreateHdiv(topo, dim, P, Q, interp, div, q_ref, - q_weight, *basis); CeedChk(ierr); + (*basis)->dim = dim; + (*basis)->topo = topo; + (*basis)->num_comp = num_comp; + (*basis)->P = P; + (*basis)->Q = Q; + (*basis)->Q_comp = dim; + (*basis)->basis_space = 2; // 2 for H(div) space + CeedCall(CeedMalloc(Q * dim, &(*basis)->q_ref_1d)); + CeedCall(CeedMalloc(Q, &(*basis)->q_weight_1d)); + if (q_ref) memcpy((*basis)->q_ref_1d, q_ref, Q * dim * sizeof(q_ref[0])); + if (q_weight) memcpy((*basis)->q_weight_1d, q_weight, Q * sizeof(q_weight[0])); + CeedCall(CeedMalloc(dim * Q * P, &(*basis)->interp)); + CeedCall(CeedMalloc(Q * P, &(*basis)->div)); + if (interp) memcpy((*basis)->interp, interp, dim * Q * P * sizeof(interp[0])); + if (div) memcpy((*basis)->div, div, Q * P * sizeof(div[0])); + CeedCall(ceed->BasisCreateHdiv(topo, dim, P, Q, interp, div, q_ref, q_weight, *basis)); return CEED_ERROR_SUCCESS; } @@ -971,52 +933,44 @@ int CeedBasisCreateHdiv(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, @ref User **/ -int CeedBasisCreateProjection(CeedBasis basis_from, CeedBasis basis_to, - CeedBasis *basis_project) { - int ierr; +int CeedBasisCreateProjection(CeedBasis basis_from, CeedBasis basis_to, CeedBasis *basis_project) { Ceed ceed; - ierr = CeedBasisGetCeed(basis_to, &ceed); CeedChk(ierr); + 
CeedCall(CeedBasisGetCeed(basis_to, &ceed)); // Create projectior matrix CeedScalar *interp_project, *grad_project; - ierr = CeedBasisCreateProjectionMatrices(basis_from, basis_to, - &interp_project, &grad_project); - CeedChk(ierr); + CeedCall(CeedBasisCreateProjectionMatrices(basis_from, basis_to, &interp_project, &grad_project)); // Build basis - bool is_tensor; - CeedInt dim, num_comp; + bool is_tensor; + CeedInt dim, num_comp; CeedScalar *q_ref, *q_weight; - ierr = CeedBasisIsTensor(basis_to, &is_tensor); CeedChk(ierr); - ierr = CeedBasisGetDimension(basis_to, &dim); CeedChk(ierr); - ierr = CeedBasisGetNumComponents(basis_from, &num_comp); CeedChk(ierr); + CeedCall(CeedBasisIsTensor(basis_to, &is_tensor)); + CeedCall(CeedBasisGetDimension(basis_to, &dim)); + CeedCall(CeedBasisGetNumComponents(basis_from, &num_comp)); if (is_tensor) { CeedInt P_1d_to, P_1d_from; - ierr = CeedBasisGetNumNodes1D(basis_from, &P_1d_from); CeedChk(ierr); - ierr = CeedBasisGetNumNodes1D(basis_to, &P_1d_to); CeedChk(ierr); - ierr = CeedCalloc(P_1d_to, &q_ref); CeedChk(ierr); - ierr = CeedCalloc(P_1d_to, &q_weight); CeedChk(ierr); - ierr = CeedBasisCreateTensorH1(ceed, dim, num_comp, P_1d_from, P_1d_to, - interp_project, grad_project, q_ref, q_weight, basis_project); - CeedChk(ierr); + CeedCall(CeedBasisGetNumNodes1D(basis_from, &P_1d_from)); + CeedCall(CeedBasisGetNumNodes1D(basis_to, &P_1d_to)); + CeedCall(CeedCalloc(P_1d_to, &q_ref)); + CeedCall(CeedCalloc(P_1d_to, &q_weight)); + CeedCall(CeedBasisCreateTensorH1(ceed, dim, num_comp, P_1d_from, P_1d_to, interp_project, grad_project, q_ref, q_weight, basis_project)); } else { CeedElemTopology topo; - ierr = CeedBasisGetTopology(basis_to, &topo); CeedChk(ierr); + CeedCall(CeedBasisGetTopology(basis_to, &topo)); CeedInt num_nodes_to, num_nodes_from; - ierr = CeedBasisGetNumNodes(basis_from, &num_nodes_from); CeedChk(ierr); - ierr = CeedBasisGetNumNodes(basis_to, &num_nodes_to); CeedChk(ierr); - ierr = CeedCalloc(num_nodes_to * dim, 
&q_ref); CeedChk(ierr); - ierr = CeedCalloc(num_nodes_to, &q_weight); CeedChk(ierr); - ierr = CeedBasisCreateH1(ceed, topo, num_comp, num_nodes_from, num_nodes_to, - interp_project, grad_project, q_ref, q_weight, basis_project); - CeedChk(ierr); + CeedCall(CeedBasisGetNumNodes(basis_from, &num_nodes_from)); + CeedCall(CeedBasisGetNumNodes(basis_to, &num_nodes_to)); + CeedCall(CeedCalloc(num_nodes_to * dim, &q_ref)); + CeedCall(CeedCalloc(num_nodes_to, &q_weight)); + CeedCall(CeedBasisCreateH1(ceed, topo, num_comp, num_nodes_from, num_nodes_to, interp_project, grad_project, q_ref, q_weight, basis_project)); } // Cleanup - ierr = CeedFree(&interp_project); CeedChk(ierr); - ierr = CeedFree(&grad_project); CeedChk(ierr); - ierr = CeedFree(&q_ref); CeedChk(ierr); - ierr = CeedFree(&q_weight); CeedChk(ierr); + CeedCall(CeedFree(&interp_project)); + CeedCall(CeedFree(&grad_project)); + CeedCall(CeedFree(&q_ref)); + CeedCall(CeedFree(&q_weight)); return CEED_ERROR_SUCCESS; } @@ -1037,10 +991,8 @@ int CeedBasisCreateProjection(CeedBasis basis_from, CeedBasis basis_to, @ref User **/ int CeedBasisReferenceCopy(CeedBasis basis, CeedBasis *basis_copy) { - int ierr; - - ierr = CeedBasisReference(basis); CeedChk(ierr); - ierr = CeedBasisDestroy(basis_copy); CeedChk(ierr); + CeedCall(CeedBasisReference(basis)); + CeedCall(CeedBasisDestroy(basis_copy)); *basis_copy = basis; return CEED_ERROR_SUCCESS; } @@ -1056,46 +1008,32 @@ int CeedBasisReferenceCopy(CeedBasis basis, CeedBasis *basis_copy) { @ref User **/ int CeedBasisView(CeedBasis basis, FILE *stream) { - int ierr; - CeedFESpace FE_space = basis->basis_space; - CeedElemTopology topo = basis->topo; + CeedFESpace FE_space = basis->basis_space; + CeedElemTopology topo = basis->topo; + // Print FE space and element topology of the basis if (basis->tensor_basis) { - fprintf(stream, "CeedBasis (%s on a %s element): dim=%" CeedInt_FMT " P=%" - CeedInt_FMT " Q=%" CeedInt_FMT "\n", - CeedFESpaces[FE_space], CeedElemTopologies[topo], - 
basis->dim, basis->P_1d, basis->Q_1d); + fprintf(stream, "CeedBasis (%s on a %s element): dim=%" CeedInt_FMT " P=%" CeedInt_FMT " Q=%" CeedInt_FMT "\n", CeedFESpaces[FE_space], + CeedElemTopologies[topo], basis->dim, basis->P_1d, basis->Q_1d); } else { - fprintf(stream, "CeedBasis (%s on a %s element): dim=%" CeedInt_FMT " P=%" - CeedInt_FMT " Q=%" CeedInt_FMT "\n", - CeedFESpaces[FE_space], CeedElemTopologies[topo], - basis->dim, basis->P, basis->Q); + fprintf(stream, "CeedBasis (%s on a %s element): dim=%" CeedInt_FMT " P=%" CeedInt_FMT " Q=%" CeedInt_FMT "\n", CeedFESpaces[FE_space], + CeedElemTopologies[topo], basis->dim, basis->P, basis->Q); } // Print quadrature data, interpolation/gradient/divergene/curl of the basis - if (basis->tensor_basis) { // tensor basis - ierr = CeedScalarView("qref1d", "\t% 12.8f", 1, basis->Q_1d, basis->q_ref_1d, - stream); CeedChk(ierr); - ierr = CeedScalarView("qweight1d", "\t% 12.8f", 1, basis->Q_1d, - basis->q_weight_1d, stream); CeedChk(ierr); - ierr = CeedScalarView("interp1d", "\t% 12.8f", basis->Q_1d, basis->P_1d, - basis->interp_1d, stream); CeedChk(ierr); - ierr = CeedScalarView("grad1d", "\t% 12.8f", basis->Q_1d, basis->P_1d, - basis->grad_1d, stream); CeedChk(ierr); - } else { // non-tensor basis - ierr = CeedScalarView("qref", "\t% 12.8f", 1, basis->Q*basis->dim, - basis->q_ref_1d, - stream); CeedChk(ierr); - ierr = CeedScalarView("qweight", "\t% 12.8f", 1, basis->Q, basis->q_weight_1d, - stream); CeedChk(ierr); - ierr = CeedScalarView("interp", "\t% 12.8f", basis->Q_comp*basis->Q, basis->P, - basis->interp, stream); CeedChk(ierr); + if (basis->tensor_basis) { // tensor basis + CeedCall(CeedScalarView("qref1d", "\t% 12.8f", 1, basis->Q_1d, basis->q_ref_1d, stream)); + CeedCall(CeedScalarView("qweight1d", "\t% 12.8f", 1, basis->Q_1d, basis->q_weight_1d, stream)); + CeedCall(CeedScalarView("interp1d", "\t% 12.8f", basis->Q_1d, basis->P_1d, basis->interp_1d, stream)); + CeedCall(CeedScalarView("grad1d", "\t% 12.8f", 
basis->Q_1d, basis->P_1d, basis->grad_1d, stream)); + } else { // non-tensor basis + CeedCall(CeedScalarView("qref", "\t% 12.8f", 1, basis->Q * basis->dim, basis->q_ref_1d, stream)); + CeedCall(CeedScalarView("qweight", "\t% 12.8f", 1, basis->Q, basis->q_weight_1d, stream)); + CeedCall(CeedScalarView("interp", "\t% 12.8f", basis->Q_comp * basis->Q, basis->P, basis->interp, stream)); if (basis->grad) { - ierr = CeedScalarView("grad", "\t% 12.8f", basis->dim*basis->Q, basis->P, - basis->grad, stream); CeedChk(ierr); + CeedCall(CeedScalarView("grad", "\t% 12.8f", basis->dim * basis->Q, basis->P, basis->grad, stream)); } if (basis->div) { - ierr = CeedScalarView("div", "\t% 12.8f", basis->Q, basis->P, - basis->div, stream); CeedChk(ierr); + CeedCall(CeedScalarView("div", "\t% 12.8f", basis->Q, basis->P, basis->div, stream)); } } return CEED_ERROR_SUCCESS; @@ -1122,78 +1060,65 @@ int CeedBasisView(CeedBasis basis, FILE *stream) { @ref User **/ -int CeedBasisApply(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, - CeedEvalMode eval_mode, CeedVector u, CeedVector v) { - int ierr; +int CeedBasisApply(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { CeedSize u_length = 0, v_length; - CeedInt dim, num_comp, num_nodes, num_qpts; - ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); - ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChk(ierr); - ierr = CeedBasisGetNumNodes(basis, &num_nodes); CeedChk(ierr); - ierr = CeedBasisGetNumQuadraturePoints(basis, &num_qpts); CeedChk(ierr); - ierr = CeedVectorGetLength(v, &v_length); CeedChk(ierr); + CeedInt dim, num_comp, num_nodes, num_qpts; + CeedCall(CeedBasisGetDimension(basis, &dim)); + CeedCall(CeedBasisGetNumComponents(basis, &num_comp)); + CeedCall(CeedBasisGetNumNodes(basis, &num_nodes)); + CeedCall(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); + CeedCall(CeedVectorGetLength(v, &v_length)); if (u) { - ierr = CeedVectorGetLength(u, 
&u_length); CeedChk(ierr); + CeedCall(CeedVectorGetLength(u, &u_length)); } - if (!basis->Apply) + if (!basis->Apply) { // LCOV_EXCL_START - return CeedError(basis->ceed, CEED_ERROR_UNSUPPORTED, - "Backend does not support BasisApply"); - // LCOV_EXCL_STOP + return CeedError(basis->ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support BasisApply"); + // LCOV_EXCL_STOP + } // Check compatibility of topological and geometrical dimensions - if ((t_mode == CEED_TRANSPOSE && (v_length%num_nodes != 0 || - u_length%num_qpts != 0)) || - (t_mode == CEED_NOTRANSPOSE && (u_length%num_nodes != 0 || - v_length%num_qpts != 0))) + if ((t_mode == CEED_TRANSPOSE && (v_length % num_nodes != 0 || u_length % num_qpts != 0)) || + (t_mode == CEED_NOTRANSPOSE && (u_length % num_nodes != 0 || v_length % num_qpts != 0))) { // LCOV_EXCL_START - return CeedError(basis->ceed, CEED_ERROR_DIMENSION, - "Length of input/output vectors " - "incompatible with basis dimensions"); - // LCOV_EXCL_STOP + return CeedError(basis->ceed, CEED_ERROR_DIMENSION, "Length of input/output vectors incompatible with basis dimensions"); + // LCOV_EXCL_STOP + } // Check vector lengths to prevent out of bounds issues bool bad_dims = false; switch (eval_mode) { - case CEED_EVAL_NONE: - case CEED_EVAL_INTERP: bad_dims = - ((t_mode == CEED_TRANSPOSE && (u_length < num_elem*num_comp*num_qpts || - v_length < num_elem*num_comp*num_nodes)) || - (t_mode == CEED_NOTRANSPOSE && (v_length < num_elem*num_qpts*num_comp || - u_length < num_elem*num_comp*num_nodes))); - break; - case CEED_EVAL_GRAD: bad_dims = - ((t_mode == CEED_TRANSPOSE && (u_length < num_elem*num_comp*num_qpts*dim || - v_length < num_elem*num_comp*num_nodes)) || - (t_mode == CEED_NOTRANSPOSE && (v_length < num_elem*num_qpts*num_comp*dim || - u_length < num_elem*num_comp*num_nodes))); - break; - case CEED_EVAL_WEIGHT: - bad_dims = v_length < num_elem*num_qpts; - break; - // LCOV_EXCL_START - case CEED_EVAL_DIV: bad_dims = - ((t_mode == CEED_TRANSPOSE && 
(u_length < num_elem*num_comp*num_qpts || - v_length < num_elem*num_comp*num_nodes)) || - (t_mode == CEED_NOTRANSPOSE && (v_length < num_elem*num_qpts*num_comp || - u_length < num_elem*num_comp*num_nodes))); - break; - case CEED_EVAL_CURL: bad_dims = - ((t_mode == CEED_TRANSPOSE && (u_length < num_elem*num_comp*num_qpts || - v_length < num_elem*num_comp*num_nodes)) || - (t_mode == CEED_NOTRANSPOSE && (v_length < num_elem*num_qpts*num_comp || - u_length < num_elem*num_comp*num_nodes))); - break; - // LCOV_EXCL_STOP + case CEED_EVAL_NONE: + case CEED_EVAL_INTERP: + bad_dims = ((t_mode == CEED_TRANSPOSE && (u_length < num_elem * num_comp * num_qpts || v_length < num_elem * num_comp * num_nodes)) || + (t_mode == CEED_NOTRANSPOSE && (v_length < num_elem * num_qpts * num_comp || u_length < num_elem * num_comp * num_nodes))); + break; + case CEED_EVAL_GRAD: + bad_dims = ((t_mode == CEED_TRANSPOSE && (u_length < num_elem * num_comp * num_qpts * dim || v_length < num_elem * num_comp * num_nodes)) || + (t_mode == CEED_NOTRANSPOSE && (v_length < num_elem * num_qpts * num_comp * dim || u_length < num_elem * num_comp * num_nodes))); + break; + case CEED_EVAL_WEIGHT: + bad_dims = v_length < num_elem * num_qpts; + break; + // LCOV_EXCL_START + case CEED_EVAL_DIV: + bad_dims = ((t_mode == CEED_TRANSPOSE && (u_length < num_elem * num_comp * num_qpts || v_length < num_elem * num_comp * num_nodes)) || + (t_mode == CEED_NOTRANSPOSE && (v_length < num_elem * num_qpts * num_comp || u_length < num_elem * num_comp * num_nodes))); + break; + case CEED_EVAL_CURL: + bad_dims = ((t_mode == CEED_TRANSPOSE && (u_length < num_elem * num_comp * num_qpts || v_length < num_elem * num_comp * num_nodes)) || + (t_mode == CEED_NOTRANSPOSE && (v_length < num_elem * num_qpts * num_comp || u_length < num_elem * num_comp * num_nodes))); + break; + // LCOV_EXCL_STOP } - if (bad_dims) + if (bad_dims) { // LCOV_EXCL_START - return CeedError(basis->ceed, CEED_ERROR_DIMENSION, - "Input/output vectors too short 
for basis and evaluation mode"); - // LCOV_EXCL_STOP + return CeedError(basis->ceed, CEED_ERROR_DIMENSION, "Input/output vectors too short for basis and evaluation mode"); + // LCOV_EXCL_STOP + } - ierr = basis->Apply(basis, num_elem, t_mode, eval_mode, u, v); CeedChk(ierr); + CeedCall(basis->Apply(basis, num_elem, t_mode, eval_mode, u, v)); return CEED_ERROR_SUCCESS; } @@ -1298,11 +1223,11 @@ int CeedBasisGetNumNodes(CeedBasis basis, CeedInt *P) { @ref Advanced **/ int CeedBasisGetNumNodes1D(CeedBasis basis, CeedInt *P_1d) { - if (!basis->tensor_basis) + if (!basis->tensor_basis) { // LCOV_EXCL_START - return CeedError(basis->ceed, CEED_ERROR_MINOR, - "Cannot supply P_1d for non-tensor basis"); - // LCOV_EXCL_STOP + return CeedError(basis->ceed, CEED_ERROR_MINOR, "Cannot supply P_1d for non-tensor basis"); + // LCOV_EXCL_STOP + } *P_1d = basis->P_1d; return CEED_ERROR_SUCCESS; @@ -1334,11 +1259,11 @@ int CeedBasisGetNumQuadraturePoints(CeedBasis basis, CeedInt *Q) { @ref Advanced **/ int CeedBasisGetNumQuadraturePoints1D(CeedBasis basis, CeedInt *Q_1d) { - if (!basis->tensor_basis) + if (!basis->tensor_basis) { // LCOV_EXCL_START - return CeedError(basis->ceed, CEED_ERROR_MINOR, - "Cannot supply Q_1d for non-tensor basis"); - // LCOV_EXCL_STOP + return CeedError(basis->ceed, CEED_ERROR_MINOR, "Cannot supply Q_1d for non-tensor basis"); + // LCOV_EXCL_STOP + } *Q_1d = basis->Q_1d; return CEED_ERROR_SUCCESS; @@ -1389,21 +1314,21 @@ int CeedBasisGetQWeights(CeedBasis basis, const CeedScalar **q_weight) { int CeedBasisGetInterp(CeedBasis basis, const CeedScalar **interp) { if (!basis->interp && basis->tensor_basis) { // Allocate - int ierr; - ierr = CeedMalloc(basis->Q*basis->P, &basis->interp); CeedChk(ierr); + CeedCall(CeedMalloc(basis->Q * basis->P, &basis->interp)); // Initialize - for (CeedInt i=0; iQ*basis->P; i++) - basis->interp[i] = 1.0; + for (CeedInt i = 0; i < basis->Q * basis->P; i++) basis->interp[i] = 1.0; // Calculate - for (CeedInt d=0; ddim; d++) - 
for (CeedInt qpt=0; qptQ; qpt++) - for (CeedInt node=0; nodeP; node++) { + for (CeedInt d = 0; d < basis->dim; d++) { + for (CeedInt qpt = 0; qpt < basis->Q; qpt++) { + for (CeedInt node = 0; node < basis->P; node++) { CeedInt p = (node / CeedIntPow(basis->P_1d, d)) % basis->P_1d; CeedInt q = (qpt / CeedIntPow(basis->Q_1d, d)) % basis->Q_1d; - basis->interp[qpt*(basis->P)+node] *= basis->interp_1d[q*basis->P_1d+p]; + basis->interp[qpt * (basis->P) + node] *= basis->interp_1d[q * basis->P_1d + p]; } + } + } } *interp = basis->interp; return CEED_ERROR_SUCCESS; @@ -1420,11 +1345,11 @@ int CeedBasisGetInterp(CeedBasis basis, const CeedScalar **interp) { @ref Backend **/ int CeedBasisGetInterp1D(CeedBasis basis, const CeedScalar **interp_1d) { - if (!basis->tensor_basis) + if (!basis->tensor_basis) { // LCOV_EXCL_START - return CeedError(basis->ceed, CEED_ERROR_MINOR, - "CeedBasis is not a tensor product basis."); - // LCOV_EXCL_STOP + return CeedError(basis->ceed, CEED_ERROR_MINOR, "CeedBasis is not a tensor product basis."); + // LCOV_EXCL_STOP + } *interp_1d = basis->interp_1d; return CEED_ERROR_SUCCESS; @@ -1443,28 +1368,24 @@ int CeedBasisGetInterp1D(CeedBasis basis, const CeedScalar **interp_1d) { int CeedBasisGetGrad(CeedBasis basis, const CeedScalar **grad) { if (!basis->grad && basis->tensor_basis) { // Allocate - int ierr; - ierr = CeedMalloc(basis->dim*basis->Q*basis->P, &basis->grad); - CeedChk(ierr); + CeedCall(CeedMalloc(basis->dim * basis->Q * basis->P, &basis->grad)); // Initialize - for (CeedInt i=0; idim*basis->Q*basis->P; i++) - basis->grad[i] = 1.0; + for (CeedInt i = 0; i < basis->dim * basis->Q * basis->P; i++) basis->grad[i] = 1.0; // Calculate - for (CeedInt d=0; ddim; d++) - for (CeedInt i=0; idim; i++) - for (CeedInt qpt=0; qptQ; qpt++) - for (CeedInt node=0; nodeP; node++) { + for (CeedInt d = 0; d < basis->dim; d++) { + for (CeedInt i = 0; i < basis->dim; i++) { + for (CeedInt qpt = 0; qpt < basis->Q; qpt++) { + for (CeedInt node = 0; node < 
basis->P; node++) { CeedInt p = (node / CeedIntPow(basis->P_1d, d)) % basis->P_1d; CeedInt q = (qpt / CeedIntPow(basis->Q_1d, d)) % basis->Q_1d; - if (i == d) - basis->grad[(i*basis->Q+qpt)*(basis->P)+node] *= - basis->grad_1d[q*basis->P_1d+p]; - else - basis->grad[(i*basis->Q+qpt)*(basis->P)+node] *= - basis->interp_1d[q*basis->P_1d+p]; + if (i == d) basis->grad[(i * basis->Q + qpt) * (basis->P) + node] *= basis->grad_1d[q * basis->P_1d + p]; + else basis->grad[(i * basis->Q + qpt) * (basis->P) + node] *= basis->interp_1d[q * basis->P_1d + p]; } + } + } + } } *grad = basis->grad; return CEED_ERROR_SUCCESS; @@ -1481,11 +1402,11 @@ int CeedBasisGetGrad(CeedBasis basis, const CeedScalar **grad) { @ref Advanced **/ int CeedBasisGetGrad1D(CeedBasis basis, const CeedScalar **grad_1d) { - if (!basis->tensor_basis) + if (!basis->tensor_basis) { // LCOV_EXCL_START - return CeedError(basis->ceed, CEED_ERROR_MINOR, - "CeedBasis is not a tensor product basis."); - // LCOV_EXCL_STOP + return CeedError(basis->ceed, CEED_ERROR_MINOR, "CeedBasis is not a tensor product basis."); + // LCOV_EXCL_STOP + } *grad_1d = basis->grad_1d; return CEED_ERROR_SUCCESS; @@ -1502,11 +1423,11 @@ int CeedBasisGetGrad1D(CeedBasis basis, const CeedScalar **grad_1d) { @ref Advanced **/ int CeedBasisGetDiv(CeedBasis basis, const CeedScalar **div) { - if (!basis->div) + if (!basis->div) { // LCOV_EXCL_START - return CeedError(basis->ceed, CEED_ERROR_MINOR, - "CeedBasis does not have divergence matrix."); - // LCOV_EXCL_STOP + return CeedError(basis->ceed, CEED_ERROR_MINOR, "CeedBasis does not have divergence matrix."); + // LCOV_EXCL_STOP + } *div = basis->div; return CEED_ERROR_SUCCESS; @@ -1522,24 +1443,18 @@ int CeedBasisGetDiv(CeedBasis basis, const CeedScalar **div) { @ref User **/ int CeedBasisDestroy(CeedBasis *basis) { - int ierr; - if (!*basis || --(*basis)->ref_count > 0) return CEED_ERROR_SUCCESS; - if ((*basis)->Destroy) { - ierr = (*basis)->Destroy(*basis); CeedChk(ierr); - } - if 
((*basis)->contract) { - ierr = CeedTensorContractDestroy(&(*basis)->contract); CeedChk(ierr); - } - ierr = CeedFree(&(*basis)->interp); CeedChk(ierr); - ierr = CeedFree(&(*basis)->interp_1d); CeedChk(ierr); - ierr = CeedFree(&(*basis)->grad); CeedChk(ierr); - ierr = CeedFree(&(*basis)->div); CeedChk(ierr); - ierr = CeedFree(&(*basis)->grad_1d); CeedChk(ierr); - ierr = CeedFree(&(*basis)->q_ref_1d); CeedChk(ierr); - ierr = CeedFree(&(*basis)->q_weight_1d); CeedChk(ierr); - ierr = CeedDestroy(&(*basis)->ceed); CeedChk(ierr); - ierr = CeedFree(basis); CeedChk(ierr); + if ((*basis)->Destroy) CeedCall((*basis)->Destroy(*basis)); + if ((*basis)->contract) CeedCall(CeedTensorContractDestroy(&(*basis)->contract)); + CeedCall(CeedFree(&(*basis)->interp)); + CeedCall(CeedFree(&(*basis)->interp_1d)); + CeedCall(CeedFree(&(*basis)->grad)); + CeedCall(CeedFree(&(*basis)->div)); + CeedCall(CeedFree(&(*basis)->grad_1d)); + CeedCall(CeedFree(&(*basis)->q_ref_1d)); + CeedCall(CeedFree(&(*basis)->q_weight_1d)); + CeedCall(CeedDestroy(&(*basis)->ceed)); + CeedCall(CeedFree(basis)); return CEED_ERROR_SUCCESS; } @@ -1555,44 +1470,43 @@ int CeedBasisDestroy(CeedBasis *basis) { @ref Utility **/ -int CeedGaussQuadrature(CeedInt Q, CeedScalar *q_ref_1d, - CeedScalar *q_weight_1d) { +int CeedGaussQuadrature(CeedInt Q, CeedScalar *q_ref_1d, CeedScalar *q_weight_1d) { // Allocate - CeedScalar P0, P1, P2, dP2, xi, wi, PI = 4.0*atan(1.0); + CeedScalar P0, P1, P2, dP2, xi, wi, PI = 4.0 * atan(1.0); // Build q_ref_1d, q_weight_1d - for (CeedInt i = 0; i <= Q/2; i++) { + for (CeedInt i = 0; i <= Q / 2; i++) { // Guess - xi = cos(PI*(CeedScalar)(2*i+1)/((CeedScalar)(2*Q))); + xi = cos(PI * (CeedScalar)(2 * i + 1) / ((CeedScalar)(2 * Q))); // Pn(xi) P0 = 1.0; P1 = xi; P2 = 0.0; for (CeedInt j = 2; j <= Q; j++) { - P2 = (((CeedScalar)(2*j-1))*xi*P1-((CeedScalar)(j-1))*P0)/((CeedScalar)(j)); + P2 = (((CeedScalar)(2 * j - 1)) * xi * P1 - ((CeedScalar)(j - 1)) * P0) / ((CeedScalar)(j)); P0 = P1; P1 = 
P2; } // First Newton Step - dP2 = (xi*P2 - P0)*(CeedScalar)Q/(xi*xi-1.0); - xi = xi-P2/dP2; + dP2 = (xi * P2 - P0) * (CeedScalar)Q / (xi * xi - 1.0); + xi = xi - P2 / dP2; // Newton to convergence - for (CeedInt k=0; k<100 && fabs(P2)>10*CEED_EPSILON; k++) { + for (CeedInt k = 0; k < 100 && fabs(P2) > 10 * CEED_EPSILON; k++) { P0 = 1.0; P1 = xi; for (CeedInt j = 2; j <= Q; j++) { - P2 = (((CeedScalar)(2*j-1))*xi*P1-((CeedScalar)(j-1))*P0)/((CeedScalar)(j)); + P2 = (((CeedScalar)(2 * j - 1)) * xi * P1 - ((CeedScalar)(j - 1)) * P0) / ((CeedScalar)(j)); P0 = P1; P1 = P2; } - dP2 = (xi*P2 - P0)*(CeedScalar)Q/(xi*xi-1.0); - xi = xi-P2/dP2; + dP2 = (xi * P2 - P0) * (CeedScalar)Q / (xi * xi - 1.0); + xi = xi - P2 / dP2; } // Save xi, wi - wi = 2.0/((1.0-xi*xi)*dP2*dP2); - q_weight_1d[i] = wi; - q_weight_1d[Q-1-i] = wi; - q_ref_1d[i] = -xi; - q_ref_1d[Q-1-i]= xi; + wi = 2.0 / ((1.0 - xi * xi) * dP2 * dP2); + q_weight_1d[i] = wi; + q_weight_1d[Q - 1 - i] = wi; + q_ref_1d[i] = -xi; + q_ref_1d[Q - 1 - i] = xi; } return CEED_ERROR_SUCCESS; } @@ -1609,62 +1523,61 @@ int CeedGaussQuadrature(CeedInt Q, CeedScalar *q_ref_1d, @ref Utility **/ -int CeedLobattoQuadrature(CeedInt Q, CeedScalar *q_ref_1d, - CeedScalar *q_weight_1d) { +int CeedLobattoQuadrature(CeedInt Q, CeedScalar *q_ref_1d, CeedScalar *q_weight_1d) { // Allocate - CeedScalar P0, P1, P2, dP2, d2P2, xi, wi, PI = 4.0*atan(1.0); + CeedScalar P0, P1, P2, dP2, d2P2, xi, wi, PI = 4.0 * atan(1.0); // Build q_ref_1d, q_weight_1d // Set endpoints - if (Q < 2) + if (Q < 2) { // LCOV_EXCL_START - return CeedError(NULL, CEED_ERROR_DIMENSION, - "Cannot create Lobatto quadrature with Q=%" CeedInt_FMT " < 2 points", Q); - // LCOV_EXCL_STOP - wi = 2.0/((CeedScalar)(Q*(Q-1))); + return CeedError(NULL, CEED_ERROR_DIMENSION, "Cannot create Lobatto quadrature with Q=%" CeedInt_FMT " < 2 points", Q); + // LCOV_EXCL_STOP + } + wi = 2.0 / ((CeedScalar)(Q * (Q - 1))); if (q_weight_1d) { - q_weight_1d[0] = wi; - q_weight_1d[Q-1] = wi; + 
q_weight_1d[0] = wi; + q_weight_1d[Q - 1] = wi; } - q_ref_1d[0] = -1.0; - q_ref_1d[Q-1] = 1.0; + q_ref_1d[0] = -1.0; + q_ref_1d[Q - 1] = 1.0; // Interior - for (CeedInt i = 1; i <= (Q-1)/2; i++) { + for (CeedInt i = 1; i <= (Q - 1) / 2; i++) { // Guess - xi = cos(PI*(CeedScalar)(i)/(CeedScalar)(Q-1)); + xi = cos(PI * (CeedScalar)(i) / (CeedScalar)(Q - 1)); // Pn(xi) P0 = 1.0; P1 = xi; P2 = 0.0; for (CeedInt j = 2; j < Q; j++) { - P2 = (((CeedScalar)(2*j-1))*xi*P1-((CeedScalar)(j-1))*P0)/((CeedScalar)(j)); + P2 = (((CeedScalar)(2 * j - 1)) * xi * P1 - ((CeedScalar)(j - 1)) * P0) / ((CeedScalar)(j)); P0 = P1; P1 = P2; } // First Newton step - dP2 = (xi*P2 - P0)*(CeedScalar)Q/(xi*xi-1.0); - d2P2 = (2*xi*dP2 - (CeedScalar)(Q*(Q-1))*P2)/(1.0-xi*xi); - xi = xi-dP2/d2P2; + dP2 = (xi * P2 - P0) * (CeedScalar)Q / (xi * xi - 1.0); + d2P2 = (2 * xi * dP2 - (CeedScalar)(Q * (Q - 1)) * P2) / (1.0 - xi * xi); + xi = xi - dP2 / d2P2; // Newton to convergence - for (CeedInt k=0; k<100 && fabs(dP2)>10*CEED_EPSILON; k++) { + for (CeedInt k = 0; k < 100 && fabs(dP2) > 10 * CEED_EPSILON; k++) { P0 = 1.0; P1 = xi; for (CeedInt j = 2; j < Q; j++) { - P2 = (((CeedScalar)(2*j-1))*xi*P1-((CeedScalar)(j-1))*P0)/((CeedScalar)(j)); + P2 = (((CeedScalar)(2 * j - 1)) * xi * P1 - ((CeedScalar)(j - 1)) * P0) / ((CeedScalar)(j)); P0 = P1; P1 = P2; } - dP2 = (xi*P2 - P0)*(CeedScalar)Q/(xi*xi-1.0); - d2P2 = (2*xi*dP2 - (CeedScalar)(Q*(Q-1))*P2)/(1.0-xi*xi); - xi = xi-dP2/d2P2; + dP2 = (xi * P2 - P0) * (CeedScalar)Q / (xi * xi - 1.0); + d2P2 = (2 * xi * dP2 - (CeedScalar)(Q * (Q - 1)) * P2) / (1.0 - xi * xi); + xi = xi - dP2 / d2P2; } // Save xi, wi - wi = 2.0/(((CeedScalar)(Q*(Q-1)))*P2*P2); + wi = 2.0 / (((CeedScalar)(Q * (Q - 1))) * P2 * P2); if (q_weight_1d) { - q_weight_1d[i] = wi; - q_weight_1d[Q-1-i] = wi; + q_weight_1d[i] = wi; + q_weight_1d[Q - 1 - i] = wi; } - q_ref_1d[i] = -xi; - q_ref_1d[Q-1-i]= xi; + q_ref_1d[i] = -xi; + q_ref_1d[Q - 1 - i] = xi; } return CEED_ERROR_SUCCESS; } @@ 
-1682,45 +1595,42 @@ int CeedLobattoQuadrature(CeedInt Q, CeedScalar *q_ref_1d, @ref Utility **/ -int CeedQRFactorization(Ceed ceed, CeedScalar *mat, CeedScalar *tau, - CeedInt m, CeedInt n) { +int CeedQRFactorization(Ceed ceed, CeedScalar *mat, CeedScalar *tau, CeedInt m, CeedInt n) { CeedScalar v[m]; - // Check m >= n - if (n > m) + // Check matrix shape + if (n > m) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, - "Cannot compute QR factorization with n > m"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Cannot compute QR factorization with n > m"); + // LCOV_EXCL_STOP + } - for (CeedInt i=0; i= m-1) { // last row of matrix, no reflection needed + for (CeedInt i = 0; i < n; i++) { + if (i >= m - 1) { // last row of matrix, no reflection needed tau[i] = 0.; break; } // Calculate Householder vector, magnitude CeedScalar sigma = 0.0; - v[i] = mat[i+n*i]; - for (CeedInt j=i+1; j=0; i--) { + for (CeedInt i = n - 2; i >= 0; i--) { if (tau[i] > 0.0) { v[i] = 1; - for (CeedInt j=i+1; j=0; i--) { - if (fabs(mat_T[i+n*(i+1)]) < tol) - q += 1; - else - break; + p = 0; + q = 0; + for (CeedInt i = n - 2; i >= 0; i--) { + if (fabs(mat_T[i + n * (i + 1)]) < tol) q += 1; + else break; } - for (CeedInt i=0; i tol) { if (fabs(z) > fabs(x)) { - CeedScalar tau = -x/z; - s = 1/sqrt(1+tau*tau), c = s*tau; + CeedScalar tau = -x / z; + s = 1 / sqrt(1 + tau * tau), c = s * tau; } else { - CeedScalar tau = -z/x; - c = 1/sqrt(1+tau*tau), s = c*tau; + CeedScalar tau = -z / x; + c = 1 / sqrt(1 + tau * tau), s = c * tau; } } // Apply Givens rotation to T - CeedGivensRotation(mat_T, c, s, CEED_NOTRANSPOSE, k, k+1, n, n); - CeedGivensRotation(mat_T, c, s, CEED_TRANSPOSE, k, k+1, n, n); + CeedGivensRotation(mat_T, c, s, CEED_NOTRANSPOSE, k, k + 1, n, n); + CeedGivensRotation(mat_T, c, s, CEED_TRANSPOSE, k, k + 1, n, n); // Apply Givens rotation to Q - CeedGivensRotation(mat, c, s, CEED_NOTRANSPOSE, k, k+1, n, n); + CeedGivensRotation(mat, c, s, 
CEED_NOTRANSPOSE, k, k + 1, n, n); // Update x, z - if (k < n-q-2) { - x = mat_T[k+n*(k+1)]; - z = mat_T[k+n*(k+2)]; + if (k < n - q - 2) { + x = mat_T[k + n * (k + 1)]; + z = mat_T[k + n * (k + 2)]; } } itr++; } // Save eigenvalues - for (CeedInt i=0; i=0; i--) - for (CeedInt j=0; j fabs(vec_D[j+1])) { + for (CeedInt i = n - 1; i >= 0; i--) { + for (CeedInt j = 0; j < i; j++) { + if (fabs(vec_D[j]) > fabs(vec_D[j + 1])) { CeedScalar temp; - temp = vec_D[j]; vec_D[j] = vec_D[j+1]; vec_D[j+1] = temp; - for (CeedInt k=0; k=0; i--) - for (CeedInt j=0; j fabs(lambda[j+1])) { + for (CeedInt i = n - 1; i >= 0; i--) { + for (CeedInt j = 0; j < i; j++) { + if (fabs(lambda[j]) > fabs(lambda[j + 1])) { CeedScalar temp; - temp = lambda[j]; lambda[j] = lambda[j+1]; lambda[j+1] = temp; - for (CeedInt k=0; k +#include #include +#include #include -#include /** @brief Set CUDA function pointer to evaluate action at quadrature points @@ -21,13 +21,12 @@ @ref User **/ int CeedQFunctionSetCUDAUserFunction(CeedQFunction qf, CUfunction f) { - int ierr; if (!qf->SetCUDAUserFunction) { Ceed ceed; - ierr = CeedQFunctionGetCeed(qf, &ceed); CeedChk(ierr); + CeedCall(CeedQFunctionGetCeed(qf, &ceed)); CeedDebug(ceed, "Backend does not support CUfunction pointers for QFunctions."); } else { - ierr = qf->SetCUDAUserFunction(qf, f); CeedChk(ierr); + CeedCall(qf->SetCUDAUserFunction(qf, f)); } return CEED_ERROR_SUCCESS; } diff --git a/interface/ceed-elemrestriction.c b/interface/ceed-elemrestriction.c index 3fa2643b6f..4617d653ff 100644 --- a/interface/ceed-elemrestriction.c +++ b/interface/ceed-elemrestriction.c @@ -5,9 +5,9 @@ // // This file is part of CEED: http://github.com/ceed -#include -#include #include +#include +#include #include #include @@ -39,14 +39,14 @@ @ref Utility **/ -int CeedPermutePadOffsets(const CeedInt *offsets, CeedInt *blk_offsets, - CeedInt num_blk, CeedInt num_elem, CeedInt blk_size, - CeedInt elem_size) { - for (CeedInt e=0; estrides) +int 
CeedElemRestrictionGetStrides(CeedElemRestriction rstr, CeedInt (*strides)[3]) { + if (!rstr->strides) { // LCOV_EXCL_START - return CeedError(rstr->ceed, CEED_ERROR_MINOR, - "ElemRestriction has no stride data"); - // LCOV_EXCL_STOP + return CeedError(rstr->ceed, CEED_ERROR_MINOR, "ElemRestriction has no stride data"); + // LCOV_EXCL_STOP + } - for (CeedInt i=0; i<3; i++) - (*strides)[i] = rstr->strides[i]; + for (CeedInt i = 0; i < 3; i++) (*strides)[i] = rstr->strides[i]; return CEED_ERROR_SUCCESS; } @@ -95,18 +93,14 @@ int CeedElemRestrictionGetStrides(CeedElemRestriction rstr, @ref User **/ -int CeedElemRestrictionGetOffsets(CeedElemRestriction rstr, - CeedMemType mem_type, - const CeedInt **offsets) { - int ierr; - - if (!rstr->GetOffsets) +int CeedElemRestrictionGetOffsets(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt **offsets) { + if (!rstr->GetOffsets) { // LCOV_EXCL_START - return CeedError(rstr->ceed, CEED_ERROR_UNSUPPORTED, - "Backend does not support GetOffsets"); - // LCOV_EXCL_STOP + return CeedError(rstr->ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support GetOffsets"); + // LCOV_EXCL_STOP + } - ierr = rstr->GetOffsets(rstr, mem_type, offsets); CeedChk(ierr); + CeedCall(rstr->GetOffsets(rstr, mem_type, offsets)); rstr->num_readers++; return CEED_ERROR_SUCCESS; } @@ -121,8 +115,7 @@ int CeedElemRestrictionGetOffsets(CeedElemRestriction rstr, @ref User **/ -int CeedElemRestrictionRestoreOffsets(CeedElemRestriction rstr, - const CeedInt **offsets) { +int CeedElemRestrictionRestoreOffsets(CeedElemRestriction rstr, const CeedInt **offsets) { *offsets = NULL; rstr->num_readers--; return CEED_ERROR_SUCCESS; @@ -168,16 +161,14 @@ int CeedElemRestrictionIsOriented(CeedElemRestriction rstr, bool *is_oriented) { @ref Backend **/ -int CeedElemRestrictionHasBackendStrides(CeedElemRestriction rstr, - bool *has_backend_strides) { - if (!rstr->strides) +int CeedElemRestrictionHasBackendStrides(CeedElemRestriction rstr, bool 
*has_backend_strides) { + if (!rstr->strides) { // LCOV_EXCL_START - return CeedError(rstr->ceed, CEED_ERROR_MINOR, - "ElemRestriction has no stride data"); - // LCOV_EXCL_STOP + return CeedError(rstr->ceed, CEED_ERROR_MINOR, "ElemRestriction has no stride data"); + // LCOV_EXCL_STOP + } - *has_backend_strides = ((rstr->strides[0] == CEED_STRIDES_BACKEND[0]) && - (rstr->strides[1] == CEED_STRIDES_BACKEND[1]) && + *has_backend_strides = ((rstr->strides[0] == CEED_STRIDES_BACKEND[0]) && (rstr->strides[1] == CEED_STRIDES_BACKEND[1]) && (rstr->strides[2] == CEED_STRIDES_BACKEND[2])); return CEED_ERROR_SUCCESS; } @@ -197,16 +188,14 @@ int CeedElemRestrictionHasBackendStrides(CeedElemRestriction rstr, @ref Backend **/ -int CeedElemRestrictionGetELayout(CeedElemRestriction rstr, - CeedInt (*layout)[3]) { - if (!rstr->layout[0]) +int CeedElemRestrictionGetELayout(CeedElemRestriction rstr, CeedInt (*layout)[3]) { + if (!rstr->layout[0]) { // LCOV_EXCL_START - return CeedError(rstr->ceed, CEED_ERROR_MINOR, - "ElemRestriction has no layout data"); - // LCOV_EXCL_STOP + return CeedError(rstr->ceed, CEED_ERROR_MINOR, "ElemRestriction has no layout data"); + // LCOV_EXCL_STOP + } - for (CeedInt i=0; i<3; i++) - (*layout)[i] = rstr->layout[i]; + for (CeedInt i = 0; i < 3; i++) (*layout)[i] = rstr->layout[i]; return CEED_ERROR_SUCCESS; } @@ -225,10 +214,8 @@ int CeedElemRestrictionGetELayout(CeedElemRestriction rstr, @ref Backend **/ -int CeedElemRestrictionSetELayout(CeedElemRestriction rstr, - CeedInt layout[3]) { - for (CeedInt i = 0; i<3; i++) - rstr->layout[i] = layout[i]; +int CeedElemRestrictionSetELayout(CeedElemRestriction rstr, CeedInt layout[3]) { + for (CeedInt i = 0; i < 3; i++) rstr->layout[i] = layout[i]; return CEED_ERROR_SUCCESS; } @@ -285,18 +272,18 @@ int CeedElemRestrictionReference(CeedElemRestriction rstr) { @ref Backend **/ -int CeedElemRestrictionGetFlopsEstimate(CeedElemRestriction rstr, - CeedTransposeMode t_mode, CeedSize *flops) { - int ierr; - bool 
is_oriented; - CeedInt e_size = rstr->num_blk * rstr->blk_size * rstr->elem_size * - rstr->num_comp, - scale = 0; - - ierr = CeedElemRestrictionIsOriented(rstr, &is_oriented); CeedChk(ierr); +int CeedElemRestrictionGetFlopsEstimate(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedSize *flops) { + bool is_oriented; + CeedInt e_size = rstr->num_blk * rstr->blk_size * rstr->elem_size * rstr->num_comp, scale = 0; + + CeedCall(CeedElemRestrictionIsOriented(rstr, &is_oriented)); switch (t_mode) { - case CEED_NOTRANSPOSE: scale = is_oriented ? 1 : 0; break; - case CEED_TRANSPOSE: scale = is_oriented ? 2 : 1; break; + case CEED_NOTRANSPOSE: + scale = is_oriented ? 1 : 0; + break; + case CEED_TRANSPOSE: + scale = is_oriented ? 2 : 1; + break; } *flops = e_size * scale; @@ -319,8 +306,7 @@ static struct CeedElemRestriction_private ceed_elemrestriction_none; const CeedInt CEED_STRIDES_BACKEND[3] = {0}; /// Indicate that no CeedElemRestriction is provided by the user -const CeedElemRestriction CEED_ELEMRESTRICTION_NONE = - &ceed_elemrestriction_none; +const CeedElemRestriction CEED_ELEMRESTRICTION_NONE = &ceed_elemrestriction_none; /** @brief Create a CeedElemRestriction @@ -350,62 +336,53 @@ const CeedElemRestriction CEED_ELEMRESTRICTION_NONE = @ref User **/ -int CeedElemRestrictionCreate(Ceed ceed, CeedInt num_elem, CeedInt elem_size, - CeedInt num_comp, CeedInt comp_stride, - CeedSize l_size, CeedMemType mem_type, - CeedCopyMode copy_mode, const CeedInt *offsets, - CeedElemRestriction *rstr) { - int ierr; - +int CeedElemRestrictionCreate(Ceed ceed, CeedInt num_elem, CeedInt elem_size, CeedInt num_comp, CeedInt comp_stride, CeedSize l_size, + CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *offsets, CeedElemRestriction *rstr) { if (!ceed->ElemRestrictionCreate) { Ceed delegate; - ierr = CeedGetObjectDelegate(ceed, &delegate, "ElemRestriction"); - CeedChk(ierr); + CeedCall(CeedGetObjectDelegate(ceed, &delegate, "ElemRestriction")); - if (!delegate) + if 
(!delegate) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, - "Backend does not support ElemRestrictionCreate"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support ElemRestrictionCreate"); + // LCOV_EXCL_STOP + } - ierr = CeedElemRestrictionCreate(delegate, num_elem, elem_size, num_comp, - comp_stride, l_size, mem_type, copy_mode, - offsets, rstr); CeedChk(ierr); + CeedCall(CeedElemRestrictionCreate(delegate, num_elem, elem_size, num_comp, comp_stride, l_size, mem_type, copy_mode, offsets, rstr)); return CEED_ERROR_SUCCESS; } - if (elem_size < 1) + if (elem_size < 1) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_DIMENSION, - "Element size must be at least 1"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_DIMENSION, "Element size must be at least 1"); + // LCOV_EXCL_STOP + } - if (num_comp < 1) + if (num_comp < 1) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_DIMENSION, - "ElemRestriction must have at least 1 component"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_DIMENSION, "ElemRestriction must have at least 1 component"); + // LCOV_EXCL_STOP + } - if (num_comp > 1 && comp_stride < 1) + if (num_comp > 1 && comp_stride < 1) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_DIMENSION, - "ElemRestriction component stride must be at least 1"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_DIMENSION, "ElemRestriction component stride must be at least 1"); + // LCOV_EXCL_STOP + } - ierr = CeedCalloc(1, rstr); CeedChk(ierr); + CeedCall(CeedCalloc(1, rstr)); (*rstr)->ceed = ceed; - ierr = CeedReference(ceed); CeedChk(ierr); - (*rstr)->ref_count = 1; - (*rstr)->num_elem = num_elem; - (*rstr)->elem_size = elem_size; - (*rstr)->num_comp = num_comp; + CeedCall(CeedReference(ceed)); + (*rstr)->ref_count = 1; + (*rstr)->num_elem = num_elem; + (*rstr)->elem_size = elem_size; + (*rstr)->num_comp = num_comp; (*rstr)->comp_stride = comp_stride; - 
(*rstr)->l_size = l_size; - (*rstr)->num_blk = num_elem; - (*rstr)->blk_size = 1; + (*rstr)->l_size = l_size; + (*rstr)->num_blk = num_elem; + (*rstr)->blk_size = 1; (*rstr)->is_oriented = 0; - ierr = ceed->ElemRestrictionCreate(mem_type, copy_mode, offsets, *rstr); - CeedChk(ierr); + CeedCall(ceed->ElemRestrictionCreate(mem_type, copy_mode, offsets, *rstr)); return CEED_ERROR_SUCCESS; } @@ -439,63 +416,55 @@ int CeedElemRestrictionCreate(Ceed ceed, CeedInt num_elem, CeedInt elem_size, @ref User **/ -int CeedElemRestrictionCreateOriented(Ceed ceed, CeedInt num_elem, - CeedInt elem_size, CeedInt num_comp, - CeedInt comp_stride, CeedSize l_size, - CeedMemType mem_type, CeedCopyMode copy_mode, - const CeedInt *offsets, const bool *orient, +int CeedElemRestrictionCreateOriented(Ceed ceed, CeedInt num_elem, CeedInt elem_size, CeedInt num_comp, CeedInt comp_stride, CeedSize l_size, + CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *offsets, const bool *orient, CeedElemRestriction *rstr) { - int ierr; - if (!ceed->ElemRestrictionCreateOriented) { Ceed delegate; - ierr = CeedGetObjectDelegate(ceed, &delegate, "ElemRestriction"); - CeedChk(ierr); + CeedCall(CeedGetObjectDelegate(ceed, &delegate, "ElemRestriction")); - if (!delegate) + if (!delegate) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, - "Backend does not implement ElemRestrictionCreateOriented"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement ElemRestrictionCreateOriented"); + // LCOV_EXCL_STOP + } - ierr = CeedElemRestrictionCreateOriented(delegate, num_elem, elem_size, - num_comp, comp_stride, l_size, mem_type, copy_mode, offsets, - orient, rstr); CeedChk(ierr); + CeedCall( + CeedElemRestrictionCreateOriented(delegate, num_elem, elem_size, num_comp, comp_stride, l_size, mem_type, copy_mode, offsets, orient, rstr)); return CEED_ERROR_SUCCESS; } - if (elem_size < 1) + if (elem_size < 1) { // LCOV_EXCL_START - return 
CeedError(ceed, CEED_ERROR_DIMENSION, - "Element size must be at least 1"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_DIMENSION, "Element size must be at least 1"); + // LCOV_EXCL_STOP + } - if (num_comp < 1) + if (num_comp < 1) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_DIMENSION, - "ElemRestriction must have at least 1 component"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_DIMENSION, "ElemRestriction must have at least 1 component"); + // LCOV_EXCL_STOP + } - if (num_comp > 1 && comp_stride < 1) + if (num_comp > 1 && comp_stride < 1) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_DIMENSION, - "ElemRestriction component stride must be at least 1"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_DIMENSION, "ElemRestriction component stride must be at least 1"); + // LCOV_EXCL_STOP + } - ierr = CeedCalloc(1, rstr); CeedChk(ierr); + CeedCall(CeedCalloc(1, rstr)); (*rstr)->ceed = ceed; - ierr = CeedReference(ceed); CeedChk(ierr); - (*rstr)->ref_count = 1; - (*rstr)->num_elem = num_elem; - (*rstr)->elem_size = elem_size; - (*rstr)->num_comp = num_comp; + CeedCall(CeedReference(ceed)); + (*rstr)->ref_count = 1; + (*rstr)->num_elem = num_elem; + (*rstr)->elem_size = elem_size; + (*rstr)->num_comp = num_comp; (*rstr)->comp_stride = comp_stride; - (*rstr)->l_size = l_size; - (*rstr)->num_blk = num_elem; - (*rstr)->blk_size = 1; + (*rstr)->l_size = l_size; + (*rstr)->num_blk = num_elem; + (*rstr)->blk_size = 1; (*rstr)->is_oriented = 1; - ierr = ceed->ElemRestrictionCreateOriented(mem_type, copy_mode, - offsets, orient, *rstr); CeedChk(ierr); + CeedCall(ceed->ElemRestrictionCreateOriented(mem_type, copy_mode, offsets, orient, *rstr)); return CEED_ERROR_SUCCESS; } @@ -522,59 +491,48 @@ int CeedElemRestrictionCreateOriented(Ceed ceed, CeedInt num_elem, @ref User **/ -int CeedElemRestrictionCreateStrided(Ceed ceed, CeedInt num_elem, - CeedInt elem_size, - CeedInt num_comp, CeedSize l_size, - const CeedInt strides[3], 
+int CeedElemRestrictionCreateStrided(Ceed ceed, CeedInt num_elem, CeedInt elem_size, CeedInt num_comp, CeedSize l_size, const CeedInt strides[3], CeedElemRestriction *rstr) { - int ierr; - if (!ceed->ElemRestrictionCreate) { Ceed delegate; - ierr = CeedGetObjectDelegate(ceed, &delegate, "ElemRestriction"); - CeedChk(ierr); + CeedCall(CeedGetObjectDelegate(ceed, &delegate, "ElemRestriction")); - if (!delegate) + if (!delegate) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, - "Backend does not support ElemRestrictionCreate"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support ElemRestrictionCreate"); + // LCOV_EXCL_STOP + } - ierr = CeedElemRestrictionCreateStrided(delegate, num_elem, elem_size, num_comp, - l_size, strides, rstr); - CeedChk(ierr); + CeedCall(CeedElemRestrictionCreateStrided(delegate, num_elem, elem_size, num_comp, l_size, strides, rstr)); return CEED_ERROR_SUCCESS; } - if (elem_size < 1) + if (elem_size < 1) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_DIMENSION, - "Element size must be at least 1"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_DIMENSION, "Element size must be at least 1"); + // LCOV_EXCL_STOP + } - if (num_comp < 1) + if (num_comp < 1) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_DIMENSION, - "ElemRestriction must have at least 1 component"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_DIMENSION, "ElemRestriction must have at least 1 component"); + // LCOV_EXCL_STOP + } - ierr = CeedCalloc(1, rstr); CeedChk(ierr); + CeedCall(CeedCalloc(1, rstr)); (*rstr)->ceed = ceed; - ierr = CeedReference(ceed); CeedChk(ierr); - (*rstr)->ref_count = 1; - (*rstr)->num_elem = num_elem; - (*rstr)->elem_size = elem_size; - (*rstr)->num_comp = num_comp; - (*rstr)->l_size = l_size; - (*rstr)->num_blk = num_elem; - (*rstr)->blk_size = 1; + CeedCall(CeedReference(ceed)); + (*rstr)->ref_count = 1; + (*rstr)->num_elem = num_elem; + 
(*rstr)->elem_size = elem_size; + (*rstr)->num_comp = num_comp; + (*rstr)->l_size = l_size; + (*rstr)->num_blk = num_elem; + (*rstr)->blk_size = 1; (*rstr)->is_oriented = 0; - ierr = CeedMalloc(3, &(*rstr)->strides); CeedChk(ierr); - for (CeedInt i=0; i<3; i++) - (*rstr)->strides[i] = strides[i]; - ierr = ceed->ElemRestrictionCreate(CEED_MEM_HOST, CEED_OWN_POINTER, NULL, - *rstr); - CeedChk(ierr); + CeedCall(CeedMalloc(3, &(*rstr)->strides)); + for (CeedInt i = 0; i < 3; i++) (*rstr)->strides[i] = strides[i]; + CeedCall(ceed->ElemRestrictionCreate(CEED_MEM_HOST, CEED_OWN_POINTER, NULL, *rstr)); return CEED_ERROR_SUCCESS; } @@ -610,80 +568,70 @@ int CeedElemRestrictionCreateStrided(Ceed ceed, CeedInt num_elem, @ref Backend **/ -int CeedElemRestrictionCreateBlocked(Ceed ceed, CeedInt num_elem, - CeedInt elem_size, - CeedInt blk_size, CeedInt num_comp, - CeedInt comp_stride, CeedSize l_size, - CeedMemType mem_type, CeedCopyMode copy_mode, - const CeedInt *offsets, +int CeedElemRestrictionCreateBlocked(Ceed ceed, CeedInt num_elem, CeedInt elem_size, CeedInt blk_size, CeedInt num_comp, CeedInt comp_stride, + CeedSize l_size, CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *offsets, CeedElemRestriction *rstr) { - int ierr; CeedInt *blk_offsets; - CeedInt num_blk = (num_elem / blk_size) + !!(num_elem % blk_size); + CeedInt num_blk = (num_elem / blk_size) + !!(num_elem % blk_size); if (!ceed->ElemRestrictionCreateBlocked) { Ceed delegate; - ierr = CeedGetObjectDelegate(ceed, &delegate, "ElemRestriction"); - CeedChk(ierr); + CeedCall(CeedGetObjectDelegate(ceed, &delegate, "ElemRestriction")); - if (!delegate) + if (!delegate) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support " - "ElemRestrictionCreateBlocked"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support ElemRestrictionCreateBlocked"); + // LCOV_EXCL_STOP + } - ierr = CeedElemRestrictionCreateBlocked(delegate, 
num_elem, elem_size, blk_size, - num_comp, comp_stride, l_size, mem_type, - copy_mode, offsets, rstr); - CeedChk(ierr); + CeedCall( + CeedElemRestrictionCreateBlocked(delegate, num_elem, elem_size, blk_size, num_comp, comp_stride, l_size, mem_type, copy_mode, offsets, rstr)); return CEED_ERROR_SUCCESS; } - if (elem_size < 1) + if (elem_size < 1) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_DIMENSION, - "Element size must be at least 1"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_DIMENSION, "Element size must be at least 1"); + // LCOV_EXCL_STOP + } - if (blk_size < 1) + if (blk_size < 1) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_DIMENSION, - "Block size must be at least 1"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_DIMENSION, "Block size must be at least 1"); + // LCOV_EXCL_STOP + } - if (num_comp < 1) + if (num_comp < 1) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_DIMENSION, - "ElemRestriction must have at least 1 component"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_DIMENSION, "ElemRestriction must have at least 1 component"); + // LCOV_EXCL_STOP + } - if (num_comp > 1 && comp_stride < 1) + if (num_comp > 1 && comp_stride < 1) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_DIMENSION, - "ElemRestriction component stride must be at least 1"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_DIMENSION, "ElemRestriction component stride must be at least 1"); + // LCOV_EXCL_STOP + } - ierr = CeedCalloc(1, rstr); CeedChk(ierr); + CeedCall(CeedCalloc(1, rstr)); - ierr = CeedCalloc(num_blk*blk_size*elem_size, &blk_offsets); CeedChk(ierr); - ierr = CeedPermutePadOffsets(offsets, blk_offsets, num_blk, num_elem, blk_size, - elem_size); CeedChk(ierr); + CeedCall(CeedCalloc(num_blk * blk_size * elem_size, &blk_offsets)); + CeedCall(CeedPermutePadOffsets(offsets, blk_offsets, num_blk, num_elem, blk_size, elem_size)); (*rstr)->ceed = ceed; - ierr = CeedReference(ceed); 
CeedChk(ierr); - (*rstr)->ref_count = 1; - (*rstr)->num_elem = num_elem; - (*rstr)->elem_size = elem_size; - (*rstr)->num_comp = num_comp; + CeedCall(CeedReference(ceed)); + (*rstr)->ref_count = 1; + (*rstr)->num_elem = num_elem; + (*rstr)->elem_size = elem_size; + (*rstr)->num_comp = num_comp; (*rstr)->comp_stride = comp_stride; - (*rstr)->l_size = l_size; - (*rstr)->num_blk = num_blk; - (*rstr)->blk_size = blk_size; + (*rstr)->l_size = l_size; + (*rstr)->num_blk = num_blk; + (*rstr)->blk_size = blk_size; (*rstr)->is_oriented = 0; - ierr = ceed->ElemRestrictionCreateBlocked(CEED_MEM_HOST, CEED_OWN_POINTER, - (const CeedInt *) blk_offsets, *rstr); CeedChk(ierr); + CeedCall(ceed->ElemRestrictionCreateBlocked(CEED_MEM_HOST, CEED_OWN_POINTER, (const CeedInt *)blk_offsets, *rstr)); if (copy_mode == CEED_OWN_POINTER) { - ierr = CeedFree(&offsets); CeedChk(ierr); + CeedCall(CeedFree(&offsets)); } return CEED_ERROR_SUCCESS; } @@ -712,63 +660,57 @@ int CeedElemRestrictionCreateBlocked(Ceed ceed, CeedInt num_elem, @ref User **/ -int CeedElemRestrictionCreateBlockedStrided(Ceed ceed, CeedInt num_elem, - CeedInt elem_size, CeedInt blk_size, CeedInt num_comp, CeedSize l_size, - const CeedInt strides[3], CeedElemRestriction *rstr) { - int ierr; +int CeedElemRestrictionCreateBlockedStrided(Ceed ceed, CeedInt num_elem, CeedInt elem_size, CeedInt blk_size, CeedInt num_comp, CeedSize l_size, + const CeedInt strides[3], CeedElemRestriction *rstr) { CeedInt num_blk = (num_elem / blk_size) + !!(num_elem % blk_size); if (!ceed->ElemRestrictionCreateBlocked) { Ceed delegate; - ierr = CeedGetObjectDelegate(ceed, &delegate, "ElemRestriction"); - CeedChk(ierr); + CeedCall(CeedGetObjectDelegate(ceed, &delegate, "ElemRestriction")); - if (!delegate) + if (!delegate) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support " - "ElemRestrictionCreateBlocked"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support 
ElemRestrictionCreateBlocked"); + // LCOV_EXCL_STOP + } - ierr = CeedElemRestrictionCreateBlockedStrided(delegate, num_elem, elem_size, - blk_size, num_comp, l_size, strides, rstr); CeedChk(ierr); + CeedCall(CeedElemRestrictionCreateBlockedStrided(delegate, num_elem, elem_size, blk_size, num_comp, l_size, strides, rstr)); return CEED_ERROR_SUCCESS; } - if (elem_size < 1) + if (elem_size < 1) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_DIMENSION, - "Element size must be at least 1"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_DIMENSION, "Element size must be at least 1"); + // LCOV_EXCL_STOP + } - if (blk_size < 1) + if (blk_size < 1) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_DIMENSION, - "Block size must be at least 1"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_DIMENSION, "Block size must be at least 1"); + // LCOV_EXCL_STOP + } - if (num_comp < 1) + if (num_comp < 1) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_DIMENSION, - "ElemRestriction must have at least 1 component"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_DIMENSION, "ElemRestriction must have at least 1 component"); + // LCOV_EXCL_STOP + } - ierr = CeedCalloc(1, rstr); CeedChk(ierr); + CeedCall(CeedCalloc(1, rstr)); (*rstr)->ceed = ceed; - ierr = CeedReference(ceed); CeedChk(ierr); - (*rstr)->ref_count = 1; - (*rstr)->num_elem = num_elem; - (*rstr)->elem_size = elem_size; - (*rstr)->num_comp = num_comp; - (*rstr)->l_size = l_size; - (*rstr)->num_blk = num_blk; - (*rstr)->blk_size = blk_size; + CeedCall(CeedReference(ceed)); + (*rstr)->ref_count = 1; + (*rstr)->num_elem = num_elem; + (*rstr)->elem_size = elem_size; + (*rstr)->num_comp = num_comp; + (*rstr)->l_size = l_size; + (*rstr)->num_blk = num_blk; + (*rstr)->blk_size = blk_size; (*rstr)->is_oriented = 0; - ierr = CeedMalloc(3, &(*rstr)->strides); CeedChk(ierr); - for (CeedInt i=0; i<3; i++) - (*rstr)->strides[i] = strides[i]; - ierr = 
ceed->ElemRestrictionCreateBlocked(CEED_MEM_HOST, CEED_OWN_POINTER, - NULL, *rstr); CeedChk(ierr); + CeedCall(CeedMalloc(3, &(*rstr)->strides)); + for (CeedInt i = 0; i < 3; i++) (*rstr)->strides[i] = strides[i]; + CeedCall(ceed->ElemRestrictionCreateBlocked(CEED_MEM_HOST, CEED_OWN_POINTER, NULL, *rstr)); return CEED_ERROR_SUCCESS; } @@ -787,12 +729,9 @@ int CeedElemRestrictionCreateBlockedStrided(Ceed ceed, CeedInt num_elem, @ref User **/ -int CeedElemRestrictionReferenceCopy(CeedElemRestriction rstr, - CeedElemRestriction *rstr_copy) { - int ierr; - - ierr = CeedElemRestrictionReference(rstr); CeedChk(ierr); - ierr = CeedElemRestrictionDestroy(rstr_copy); CeedChk(ierr); +int CeedElemRestrictionReferenceCopy(CeedElemRestriction rstr, CeedElemRestriction *rstr_copy) { + CeedCall(CeedElemRestrictionReference(rstr)); + CeedCall(CeedElemRestrictionDestroy(rstr_copy)); *rstr_copy = rstr; return CEED_ERROR_SUCCESS; } @@ -808,18 +747,12 @@ int CeedElemRestrictionReferenceCopy(CeedElemRestriction rstr, @ref User **/ -int CeedElemRestrictionCreateVector(CeedElemRestriction rstr, CeedVector *l_vec, - CeedVector *e_vec) { - int ierr; +int CeedElemRestrictionCreateVector(CeedElemRestriction rstr, CeedVector *l_vec, CeedVector *e_vec) { CeedSize e_size, l_size; l_size = rstr->l_size; e_size = rstr->num_blk * rstr->blk_size * rstr->elem_size * rstr->num_comp; - if (l_vec) { - ierr = CeedVectorCreate(rstr->ceed, l_size, l_vec); CeedChk(ierr); - } - if (e_vec) { - ierr = CeedVectorCreate(rstr->ceed, e_size, e_vec); CeedChk(ierr); - } + if (l_vec) CeedCall(CeedVectorCreate(rstr->ceed, l_size, l_vec)); + if (e_vec) CeedCall(CeedVectorCreate(rstr->ceed, e_size, e_vec)); return CEED_ERROR_SUCCESS; } @@ -838,11 +771,8 @@ int CeedElemRestrictionCreateVector(CeedElemRestriction rstr, CeedVector *l_vec, @ref User **/ -int CeedElemRestrictionApply(CeedElemRestriction rstr, CeedTransposeMode t_mode, - CeedVector u, CeedVector ru, - CeedRequest *request) { +int 
CeedElemRestrictionApply(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector ru, CeedRequest *request) { CeedInt m, n; - int ierr; if (t_mode == CEED_NOTRANSPOSE) { m = rstr->num_blk * rstr->blk_size * rstr->elem_size * rstr->num_comp; @@ -851,21 +781,21 @@ int CeedElemRestrictionApply(CeedElemRestriction rstr, CeedTransposeMode t_mode, m = rstr->l_size; n = rstr->num_blk * rstr->blk_size * rstr->elem_size * rstr->num_comp; } - if (n != u->length) + if (n != u->length) { // LCOV_EXCL_START return CeedError(rstr->ceed, CEED_ERROR_DIMENSION, - "Input vector size %" CeedInt_FMT " not compatible with " - "element restriction (%" CeedInt_FMT ", %" CeedInt_FMT ")", u->length, m, n); - // LCOV_EXCL_STOP - if (m != ru->length) + "Input vector size %" CeedInt_FMT " not compatible with element restriction (%" CeedInt_FMT ", %" CeedInt_FMT ")", u->length, m, + n); + // LCOV_EXCL_STOP + } + if (m != ru->length) { // LCOV_EXCL_START return CeedError(rstr->ceed, CEED_ERROR_DIMENSION, - "Output vector size %" CeedInt_FMT " not compatible with " - "element restriction (%" CeedInt_FMT ", %" CeedInt_FMT ")", ru->length, m, n); - // LCOV_EXCL_STOP - if (rstr->num_elem > 0) { - ierr = rstr->Apply(rstr, t_mode, u, ru, request); CeedChk(ierr); + "Output vector size %" CeedInt_FMT " not compatible with element restriction (%" CeedInt_FMT ", %" CeedInt_FMT ")", ru->length, + m, n); + // LCOV_EXCL_STOP } + if (rstr->num_elem > 0) CeedCall(rstr->Apply(rstr, t_mode, u, ru, request)); return CEED_ERROR_SUCCESS; } @@ -887,11 +817,9 @@ int CeedElemRestrictionApply(CeedElemRestriction rstr, CeedTransposeMode t_mode, @ref Backend **/ -int CeedElemRestrictionApplyBlock(CeedElemRestriction rstr, CeedInt block, - CeedTransposeMode t_mode, CeedVector u, - CeedVector ru, CeedRequest *request) { +int CeedElemRestrictionApplyBlock(CeedElemRestriction rstr, CeedInt block, CeedTransposeMode t_mode, CeedVector u, CeedVector ru, + CeedRequest *request) { CeedInt m, n; - int ierr; if 
(t_mode == CEED_NOTRANSPOSE) { m = rstr->blk_size * rstr->elem_size * rstr->num_comp; @@ -900,27 +828,28 @@ int CeedElemRestrictionApplyBlock(CeedElemRestriction rstr, CeedInt block, m = rstr->l_size; n = rstr->blk_size * rstr->elem_size * rstr->num_comp; } - if (n != u->length) + if (n != u->length) { // LCOV_EXCL_START return CeedError(rstr->ceed, CEED_ERROR_DIMENSION, - "Input vector size %" CeedInt_FMT " not compatible with " - "element restriction (%" CeedInt_FMT ", %" CeedInt_FMT ")", u->length, m, n); - // LCOV_EXCL_STOP - if (m != ru->length) + "Input vector size %" CeedInt_FMT " not compatible with element restriction (%" CeedInt_FMT ", %" CeedInt_FMT ")", u->length, m, + n); + // LCOV_EXCL_STOP + } + if (m != ru->length) { // LCOV_EXCL_START return CeedError(rstr->ceed, CEED_ERROR_DIMENSION, - "Output vector size %" CeedInt_FMT " not compatible with " - "element restriction (%" CeedInt_FMT ", %" CeedInt_FMT ")", ru->length, m, n); - // LCOV_EXCL_STOP - if (rstr->blk_size*block > rstr->num_elem) + "Output vector size %" CeedInt_FMT " not compatible with element restriction (%" CeedInt_FMT ", %" CeedInt_FMT ")", ru->length, + m, n); + // LCOV_EXCL_STOP + } + if (rstr->blk_size * block > rstr->num_elem) { // LCOV_EXCL_START return CeedError(rstr->ceed, CEED_ERROR_DIMENSION, - "Cannot retrieve block %" CeedInt_FMT ", element %" CeedInt_FMT " > " - "total elements %" CeedInt_FMT "", block, rstr->blk_size*block, - rstr->num_elem); - // LCOV_EXCL_STOP - ierr = rstr->ApplyBlock(rstr, block, t_mode, u, ru, request); - CeedChk(ierr); + "Cannot retrieve block %" CeedInt_FMT ", element %" CeedInt_FMT " > total elements %" CeedInt_FMT "", block, + rstr->blk_size * block, rstr->num_elem); + // LCOV_EXCL_STOP + } + CeedCall(rstr->ApplyBlock(rstr, block, t_mode, u, ru, request)); return CEED_ERROR_SUCCESS; } @@ -949,8 +878,7 @@ int CeedElemRestrictionGetCeed(CeedElemRestriction rstr, Ceed *ceed) { @ref Advanced **/ -int 
CeedElemRestrictionGetCompStride(CeedElemRestriction rstr, - CeedInt *comp_stride) { +int CeedElemRestrictionGetCompStride(CeedElemRestriction rstr, CeedInt *comp_stride) { *comp_stride = rstr->comp_stride; return CEED_ERROR_SUCCESS; } @@ -965,8 +893,7 @@ int CeedElemRestrictionGetCompStride(CeedElemRestriction rstr, @ref Advanced **/ -int CeedElemRestrictionGetNumElements(CeedElemRestriction rstr, - CeedInt *num_elem) { +int CeedElemRestrictionGetNumElements(CeedElemRestriction rstr, CeedInt *num_elem) { *num_elem = rstr->num_elem; return CEED_ERROR_SUCCESS; } @@ -981,8 +908,7 @@ int CeedElemRestrictionGetNumElements(CeedElemRestriction rstr, @ref Advanced **/ -int CeedElemRestrictionGetElementSize(CeedElemRestriction rstr, - CeedInt *elem_size) { +int CeedElemRestrictionGetElementSize(CeedElemRestriction rstr, CeedInt *elem_size) { *elem_size = rstr->elem_size; return CEED_ERROR_SUCCESS; } @@ -997,8 +923,7 @@ int CeedElemRestrictionGetElementSize(CeedElemRestriction rstr, @ref Advanced **/ -int CeedElemRestrictionGetLVectorSize(CeedElemRestriction rstr, - CeedSize *l_size) { +int CeedElemRestrictionGetLVectorSize(CeedElemRestriction rstr, CeedSize *l_size) { *l_size = rstr->l_size; return CEED_ERROR_SUCCESS; } @@ -1014,8 +939,7 @@ int CeedElemRestrictionGetLVectorSize(CeedElemRestriction rstr, @ref Advanced **/ -int CeedElemRestrictionGetNumComponents(CeedElemRestriction rstr, - CeedInt *num_comp) { +int CeedElemRestrictionGetNumComponents(CeedElemRestriction rstr, CeedInt *num_comp) { *num_comp = rstr->num_comp; return CEED_ERROR_SUCCESS; } @@ -1030,8 +954,7 @@ int CeedElemRestrictionGetNumComponents(CeedElemRestriction rstr, @ref Advanced **/ -int CeedElemRestrictionGetNumBlocks(CeedElemRestriction rstr, - CeedInt *num_block) { +int CeedElemRestrictionGetNumBlocks(CeedElemRestriction rstr, CeedInt *num_block) { *num_block = rstr->num_blk; return CEED_ERROR_SUCCESS; } @@ -1046,8 +969,7 @@ int CeedElemRestrictionGetNumBlocks(CeedElemRestriction rstr, @ref 
Advanced **/ -int CeedElemRestrictionGetBlockSize(CeedElemRestriction rstr, - CeedInt *blk_size) { +int CeedElemRestrictionGetBlockSize(CeedElemRestriction rstr, CeedInt *blk_size) { *blk_size = rstr->blk_size; return CEED_ERROR_SUCCESS; } @@ -1062,24 +984,20 @@ int CeedElemRestrictionGetBlockSize(CeedElemRestriction rstr, @ref User **/ -int CeedElemRestrictionGetMultiplicity(CeedElemRestriction rstr, - CeedVector mult) { - int ierr; +int CeedElemRestrictionGetMultiplicity(CeedElemRestriction rstr, CeedVector mult) { CeedVector e_vec; // Create e_vec to hold intermediate computation in E^T (E 1) - ierr = CeedElemRestrictionCreateVector(rstr, NULL, &e_vec); CeedChk(ierr); + CeedCall(CeedElemRestrictionCreateVector(rstr, NULL, &e_vec)); // Compute e_vec = E * 1 - ierr = CeedVectorSetValue(mult, 1.0); CeedChk(ierr); - ierr = CeedElemRestrictionApply(rstr, CEED_NOTRANSPOSE, mult, e_vec, - CEED_REQUEST_IMMEDIATE); CeedChk(ierr); + CeedCall(CeedVectorSetValue(mult, 1.0)); + CeedCall(CeedElemRestrictionApply(rstr, CEED_NOTRANSPOSE, mult, e_vec, CEED_REQUEST_IMMEDIATE)); // Compute multiplicity, mult = E^T * e_vec = E^T (E 1) - ierr = CeedVectorSetValue(mult, 0.0); CeedChk(ierr); - ierr = CeedElemRestrictionApply(rstr, CEED_TRANSPOSE, e_vec, mult, - CEED_REQUEST_IMMEDIATE); CeedChk(ierr); + CeedCall(CeedVectorSetValue(mult, 0.0)); + CeedCall(CeedElemRestrictionApply(rstr, CEED_TRANSPOSE, e_vec, mult, CEED_REQUEST_IMMEDIATE)); // Cleanup - ierr = CeedVectorDestroy(&e_vec); CeedChk(ierr); + CeedCall(CeedVectorDestroy(&e_vec)); return CEED_ERROR_SUCCESS; } @@ -1095,17 +1013,14 @@ int CeedElemRestrictionGetMultiplicity(CeedElemRestriction rstr, **/ int CeedElemRestrictionView(CeedElemRestriction rstr, FILE *stream) { char stridesstr[500]; - if (rstr->strides) - sprintf(stridesstr, "[%" CeedInt_FMT ", %" CeedInt_FMT ", %" CeedInt_FMT "]", - rstr->strides[0], rstr->strides[1], - rstr->strides[2]); - else + if (rstr->strides) { + sprintf(stridesstr, "[%" CeedInt_FMT ", %" 
CeedInt_FMT ", %" CeedInt_FMT "]", rstr->strides[0], rstr->strides[1], rstr->strides[2]); + } else { sprintf(stridesstr, "%" CeedInt_FMT, rstr->comp_stride); + } - fprintf(stream, "%sCeedElemRestriction from (%td, %" CeedInt_FMT ") to %" - CeedInt_FMT " elements with %" CeedInt_FMT " " - "nodes each and %s %s\n", rstr->blk_size > 1 ? "Blocked " : "", - rstr->l_size, rstr->num_comp, rstr->num_elem, rstr->elem_size, + fprintf(stream, "%sCeedElemRestriction from (%td, %" CeedInt_FMT ") to %" CeedInt_FMT " elements with %" CeedInt_FMT " nodes each and %s %s\n", + rstr->blk_size > 1 ? "Blocked " : "", rstr->l_size, rstr->num_comp, rstr->num_elem, rstr->elem_size, rstr->strides ? "strides" : "component stride", stridesstr); return CEED_ERROR_SUCCESS; } @@ -1120,21 +1035,16 @@ int CeedElemRestrictionView(CeedElemRestriction rstr, FILE *stream) { @ref User **/ int CeedElemRestrictionDestroy(CeedElemRestriction *rstr) { - int ierr; - if (!*rstr || --(*rstr)->ref_count > 0) return CEED_ERROR_SUCCESS; - if ((*rstr)->num_readers) + if ((*rstr)->num_readers) { // LCOV_EXCL_START - return CeedError((*rstr)->ceed, CEED_ERROR_ACCESS, - "Cannot destroy CeedElemRestriction, " - "a process has read access to the offset data"); - // LCOV_EXCL_STOP - if ((*rstr)->Destroy) { - ierr = (*rstr)->Destroy(*rstr); CeedChk(ierr); + return CeedError((*rstr)->ceed, CEED_ERROR_ACCESS, "Cannot destroy CeedElemRestriction, a process has read access to the offset data"); + // LCOV_EXCL_STOP } - ierr = CeedFree(&(*rstr)->strides); CeedChk(ierr); - ierr = CeedDestroy(&(*rstr)->ceed); CeedChk(ierr); - ierr = CeedFree(rstr); CeedChk(ierr); + if ((*rstr)->Destroy) CeedCall((*rstr)->Destroy(*rstr)); + CeedCall(CeedFree(&(*rstr)->strides)); + CeedCall(CeedDestroy(&(*rstr)->ceed)); + CeedCall(CeedFree(rstr)); return CEED_ERROR_SUCCESS; } diff --git a/interface/ceed-fortran.c b/interface/ceed-fortran.c index 2153518784..3ef5acff35 100644 --- a/interface/ceed-fortran.c +++ b/interface/ceed-fortran.c @@ -6,10 
+6,10 @@ // This file is part of CEED: http://github.com/ceed // Fortran interface -#include -#include -#include #include +#include +#include +#include #include #include #include @@ -25,10 +25,10 @@ #define FORTRAN_BASIS_COLLOCATED -8 #define FORTRAN_QFUNCTION_NONE -9 -static Ceed *Ceed_dict = NULL; -static int Ceed_count = 0; -static int Ceed_n = 0; -static int Ceed_count_max = 0; +static Ceed *Ceed_dict = NULL; +static int Ceed_count = 0; +static int Ceed_n = 0; +static int Ceed_count_max = 0; // This test should actually be for the gfortran version, but we don't currently // have a configure system to determine that (TODO). At present, this will use @@ -41,7 +41,7 @@ typedef size_t fortran_charlen_t; typedef int fortran_charlen_t; #endif -#define Splice(a, b) a ## b +#define Splice(a, b) a##b // Fortran strings are generally unterminated and the length is passed as an // extra argument after all the normal arguments. Some compilers (I only know @@ -51,27 +51,25 @@ typedef int fortran_charlen_t; // We can't just NULL-terminate the string in-place because that could overwrite // other strings or attempt to write to read-only memory. This macro allocates // a string to hold the null-terminated version of the string that C expects. 
-#define FIX_STRING(stringname) \ - char Splice(stringname, _c)[1024]; \ - if (Splice(stringname, _len) > 1023) \ - *err = CeedError(NULL, 1, "Fortran string length too long %zd", (size_t)Splice(stringname, _len)); \ - strncpy(Splice(stringname, _c), stringname, Splice(stringname, _len)); \ - Splice(stringname, _c)[Splice(stringname, _len)] = 0; \ +#define FIX_STRING(stringname) \ + char Splice(stringname, _c)[1024]; \ + if (Splice(stringname, _len) > 1023) *err = CeedError(NULL, 1, "Fortran string length too long %zd", (size_t)Splice(stringname, _len)); \ + strncpy(Splice(stringname, _c), stringname, Splice(stringname, _len)); \ + Splice(stringname, _c)[Splice(stringname, _len)] = 0; // ----------------------------------------------------------------------------- // Ceed // ----------------------------------------------------------------------------- -#define fCeedInit FORTRAN_NAME(ceedinit,CEEDINIT) -CEED_EXTERN void fCeedInit(const char *resource, int *ceed, int *err, - fortran_charlen_t resource_len) { +#define fCeedInit FORTRAN_NAME(ceedinit, CEEDINIT) +CEED_EXTERN void fCeedInit(const char *resource, int *ceed, int *err, fortran_charlen_t resource_len) { FIX_STRING(resource); if (Ceed_count == Ceed_count_max) { - Ceed_count_max += Ceed_count_max/2 + 1; + Ceed_count_max += Ceed_count_max / 2 + 1; CeedRealloc(Ceed_count_max, &Ceed_dict); } Ceed *ceed_ = &Ceed_dict[Ceed_count]; - *err = CeedInit(resource_c, ceed_); + *err = CeedInit(resource_c, ceed_); if (*err == 0) { *ceed = Ceed_count++; @@ -79,25 +77,18 @@ CEED_EXTERN void fCeedInit(const char *resource, int *ceed, int *err, } } -#define fCeedIsDeterministic \ - FORTRAN_NAME(ceedisdeterministic,CEEDISDETERMINISTIC) -CEED_EXTERN void fCeedIsDeterministic(int *ceed, int *is_deterministic, - int *err) { +#define fCeedIsDeterministic FORTRAN_NAME(ceedisdeterministic, CEEDISDETERMINISTIC) +CEED_EXTERN void fCeedIsDeterministic(int *ceed, int *is_deterministic, int *err) { *err = 
CeedIsDeterministic(Ceed_dict[*ceed], (bool *)is_deterministic); } -#define fCeedGetPreferredMemType \ - FORTRAN_NAME(ceedgetpreferredmemtype,CEEDGETPREFERREDMEMTYPE) -CEED_EXTERN void fCeedGetPreferredMemType(int *ceed, int *type, int *err) { - *err = CeedGetPreferredMemType(Ceed_dict[*ceed], (CeedMemType *)type); -} +#define fCeedGetPreferredMemType FORTRAN_NAME(ceedgetpreferredmemtype, CEEDGETPREFERREDMEMTYPE) +CEED_EXTERN void fCeedGetPreferredMemType(int *ceed, int *type, int *err) { *err = CeedGetPreferredMemType(Ceed_dict[*ceed], (CeedMemType *)type); } -#define fCeedView FORTRAN_NAME(ceedview,CEEDVIEW) -CEED_EXTERN void fCeedView(int *ceed, int *err) { - *err = CeedView(Ceed_dict[*ceed], stdout); -} +#define fCeedView FORTRAN_NAME(ceedview, CEEDVIEW) +CEED_EXTERN void fCeedView(int *ceed, int *err) { *err = CeedView(Ceed_dict[*ceed], stdout); } -#define fCeedDestroy FORTRAN_NAME(ceeddestroy,CEEDDESTROY) +#define fCeedDestroy FORTRAN_NAME(ceeddestroy, CEEDDESTROY) CEED_EXTERN void fCeedDestroy(int *ceed, int *err) { if (*ceed == FORTRAN_NULL) return; *err = CeedDestroy(&Ceed_dict[*ceed]); @@ -107,7 +98,7 @@ CEED_EXTERN void fCeedDestroy(int *ceed, int *err) { Ceed_n--; if (Ceed_n == 0) { CeedFree(&Ceed_dict); - Ceed_count = 0; + Ceed_count = 0; Ceed_count_max = 0; } } @@ -116,20 +107,20 @@ CEED_EXTERN void fCeedDestroy(int *ceed, int *err) { // ----------------------------------------------------------------------------- // CeedVector // ----------------------------------------------------------------------------- -static CeedVector *CeedVector_dict = NULL; -static int CeedVector_count = 0; -static int CeedVector_n = 0; -static int CeedVector_count_max = 0; +static CeedVector *CeedVector_dict = NULL; +static int CeedVector_count = 0; +static int CeedVector_n = 0; +static int CeedVector_count_max = 0; -#define fCeedVectorCreate FORTRAN_NAME(ceedvectorcreate,CEEDVECTORCREATE) +#define fCeedVectorCreate FORTRAN_NAME(ceedvectorcreate, CEEDVECTORCREATE) 
CEED_EXTERN void fCeedVectorCreate(int *ceed, int *length, int *vec, int *err) { if (CeedVector_count == CeedVector_count_max) { - CeedVector_count_max += CeedVector_count_max/2 + 1; + CeedVector_count_max += CeedVector_count_max / 2 + 1; CeedRealloc(CeedVector_count_max, &CeedVector_dict); } CeedVector *vec_ = &CeedVector_dict[CeedVector_count]; - *err = CeedVectorCreate(Ceed_dict[*ceed], *length, vec_); + *err = CeedVectorCreate(Ceed_dict[*ceed], *length, vec_); if (*err == 0) { *vec = CeedVector_count++; @@ -137,100 +128,74 @@ CEED_EXTERN void fCeedVectorCreate(int *ceed, int *length, int *vec, int *err) { } } -#define fCeedVectorSetArray FORTRAN_NAME(ceedvectorsetarray,CEEDVECTORSETARRAY) -CEED_EXTERN void fCeedVectorSetArray(int *vec, int *memtype, int *copymode, - CeedScalar *array, int64_t *offset, int *err) { - *err = CeedVectorSetArray(CeedVector_dict[*vec], (CeedMemType)*memtype, - (CeedCopyMode)*copymode, - (CeedScalar *)(array + *offset)); +#define fCeedVectorSetArray FORTRAN_NAME(ceedvectorsetarray, CEEDVECTORSETARRAY) +CEED_EXTERN void fCeedVectorSetArray(int *vec, int *memtype, int *copymode, CeedScalar *array, int64_t *offset, int *err) { + *err = CeedVectorSetArray(CeedVector_dict[*vec], (CeedMemType)*memtype, (CeedCopyMode)*copymode, (CeedScalar *)(array + *offset)); } -#define fCeedVectorTakeArray FORTRAN_NAME(ceedvectortakearray,CEEDVECTORTAKEARRAY) -CEED_EXTERN void fCeedVectorTakeArray(int *vec, int *memtype, CeedScalar *array, - int64_t *offset, int *err) { +#define fCeedVectorTakeArray FORTRAN_NAME(ceedvectortakearray, CEEDVECTORTAKEARRAY) +CEED_EXTERN void fCeedVectorTakeArray(int *vec, int *memtype, CeedScalar *array, int64_t *offset, int *err) { CeedScalar *b; - CeedVector vec_ = CeedVector_dict[*vec]; - *err = CeedVectorTakeArray(vec_, (CeedMemType)*memtype, &b); - *offset = b - array; + CeedVector vec_ = CeedVector_dict[*vec]; + *err = CeedVectorTakeArray(vec_, (CeedMemType)*memtype, &b); + *offset = b - array; } -#define 
fCeedVectorSyncArray FORTRAN_NAME(ceedvectorsyncarray,CEEDVECTORSYNCARRAY) -CEED_EXTERN void fCeedVectorSyncArray(int *vec, int *memtype, int *err) { - *err = CeedVectorSyncArray(CeedVector_dict[*vec], (CeedMemType)*memtype); -} +#define fCeedVectorSyncArray FORTRAN_NAME(ceedvectorsyncarray, CEEDVECTORSYNCARRAY) +CEED_EXTERN void fCeedVectorSyncArray(int *vec, int *memtype, int *err) { *err = CeedVectorSyncArray(CeedVector_dict[*vec], (CeedMemType)*memtype); } -#define fCeedVectorSetValue FORTRAN_NAME(ceedvectorsetvalue,CEEDVECTORSETVALUE) -CEED_EXTERN void fCeedVectorSetValue(int *vec, CeedScalar *value, int *err) { - *err = CeedVectorSetValue(CeedVector_dict[*vec], *value); -} +#define fCeedVectorSetValue FORTRAN_NAME(ceedvectorsetvalue, CEEDVECTORSETVALUE) +CEED_EXTERN void fCeedVectorSetValue(int *vec, CeedScalar *value, int *err) { *err = CeedVectorSetValue(CeedVector_dict[*vec], *value); } -#define fCeedVectorGetArray FORTRAN_NAME(ceedvectorgetarray,CEEDVECTORGETARRAY) -CEED_EXTERN void fCeedVectorGetArray(int *vec, int *memtype, CeedScalar *array, - int64_t *offset, int *err) { +#define fCeedVectorGetArray FORTRAN_NAME(ceedvectorgetarray, CEEDVECTORGETARRAY) +CEED_EXTERN void fCeedVectorGetArray(int *vec, int *memtype, CeedScalar *array, int64_t *offset, int *err) { CeedScalar *b; - CeedVector vec_ = CeedVector_dict[*vec]; - *err = CeedVectorGetArray(vec_, (CeedMemType)*memtype, &b); - *offset = b - array; + CeedVector vec_ = CeedVector_dict[*vec]; + *err = CeedVectorGetArray(vec_, (CeedMemType)*memtype, &b); + *offset = b - array; } -#define fCeedVectorGetArrayRead \ - FORTRAN_NAME(ceedvectorgetarrayread,CEEDVECTORGETARRAYREAD) -CEED_EXTERN void fCeedVectorGetArrayRead(int *vec, int *memtype, - CeedScalar *array, - int64_t *offset, int *err) { +#define fCeedVectorGetArrayRead FORTRAN_NAME(ceedvectorgetarrayread, CEEDVECTORGETARRAYREAD) +CEED_EXTERN void fCeedVectorGetArrayRead(int *vec, int *memtype, CeedScalar *array, int64_t *offset, int *err) { const 
CeedScalar *b; - CeedVector vec_ = CeedVector_dict[*vec]; - *err = CeedVectorGetArrayRead(vec_, (CeedMemType)*memtype, &b); - *offset = b - array; + CeedVector vec_ = CeedVector_dict[*vec]; + *err = CeedVectorGetArrayRead(vec_, (CeedMemType)*memtype, &b); + *offset = b - array; } -#define fCeedVectorGetArrayWrite \ - FORTRAN_NAME(ceedvectorgetarraywrite,CEEDVECTORGETARRAYWRITE) -CEED_EXTERN void fCeedVectorGetArrayWrite(int *vec, int *memtype, - CeedScalar *array, - int64_t *offset, int *err) { +#define fCeedVectorGetArrayWrite FORTRAN_NAME(ceedvectorgetarraywrite, CEEDVECTORGETARRAYWRITE) +CEED_EXTERN void fCeedVectorGetArrayWrite(int *vec, int *memtype, CeedScalar *array, int64_t *offset, int *err) { CeedScalar *b; - CeedVector vec_ = CeedVector_dict[*vec]; - *err = CeedVectorGetArrayWrite(vec_, (CeedMemType)*memtype, &b); - *offset = b - array; + CeedVector vec_ = CeedVector_dict[*vec]; + *err = CeedVectorGetArrayWrite(vec_, (CeedMemType)*memtype, &b); + *offset = b - array; } -#define fCeedVectorRestoreArray \ - FORTRAN_NAME(ceedvectorrestorearray,CEEDVECTORRESTOREARRAY) -CEED_EXTERN void fCeedVectorRestoreArray(int *vec, CeedScalar *array, - int64_t *offset, int *err) { +#define fCeedVectorRestoreArray FORTRAN_NAME(ceedvectorrestorearray, CEEDVECTORRESTOREARRAY) +CEED_EXTERN void fCeedVectorRestoreArray(int *vec, CeedScalar *array, int64_t *offset, int *err) { CeedScalar *offsetArray = array + *offset; - *err = CeedVectorRestoreArray(CeedVector_dict[*vec], &offsetArray); - *offset = 0; + *err = CeedVectorRestoreArray(CeedVector_dict[*vec], &offsetArray); + *offset = 0; } -#define fCeedVectorRestoreArrayRead \ - FORTRAN_NAME(ceedvectorrestorearrayread,CEEDVECTORRESTOREARRAYREAD) -CEED_EXTERN void fCeedVectorRestoreArrayRead(int *vec, const CeedScalar *array, - int64_t *offset, int *err) { - *err = CeedVectorRestoreArrayRead(CeedVector_dict[*vec], &array); +#define fCeedVectorRestoreArrayRead FORTRAN_NAME(ceedvectorrestorearrayread, CEEDVECTORRESTOREARRAYREAD) 
+CEED_EXTERN void fCeedVectorRestoreArrayRead(int *vec, const CeedScalar *array, int64_t *offset, int *err) { + *err = CeedVectorRestoreArrayRead(CeedVector_dict[*vec], &array); *offset = 0; } -#define fCeedVectorNorm \ - FORTRAN_NAME(ceedvectornorm,CEEDVECTORNORM) -CEED_EXTERN void fCeedVectorNorm(int *vec, int *type, CeedScalar *norm, - int *err) { +#define fCeedVectorNorm FORTRAN_NAME(ceedvectornorm, CEEDVECTORNORM) +CEED_EXTERN void fCeedVectorNorm(int *vec, int *type, CeedScalar *norm, int *err) { *err = CeedVectorNorm(CeedVector_dict[*vec], (CeedNormType)*type, norm); } -#define fCeedVectorReciprocal \ - FORTRAN_NAME(ceedvectorreciprocal,CEEDVECTORRECIPROCAL) -CEED_EXTERN void fCeedVectorReciprocal(int *vec, int *err) { - *err = CeedVectorReciprocal(CeedVector_dict[*vec]); -} +#define fCeedVectorReciprocal FORTRAN_NAME(ceedvectorreciprocal, CEEDVECTORRECIPROCAL) +CEED_EXTERN void fCeedVectorReciprocal(int *vec, int *err) { *err = CeedVectorReciprocal(CeedVector_dict[*vec]); } -#define fCeedVectorView FORTRAN_NAME(ceedvectorview,CEEDVECTORVIEW) -CEED_EXTERN void fCeedVectorView(int *vec, int *err) { - *err = CeedVectorView(CeedVector_dict[*vec], "%12.8f", stdout); -} +#define fCeedVectorView FORTRAN_NAME(ceedvectorview, CEEDVECTORVIEW) +CEED_EXTERN void fCeedVectorView(int *vec, int *err) { *err = CeedVectorView(CeedVector_dict[*vec], "%12.8f", stdout); } -#define fCeedVectorDestroy FORTRAN_NAME(ceedvectordestroy,CEEDVECTORDESTROY) +#define fCeedVectorDestroy FORTRAN_NAME(ceedvectordestroy, CEEDVECTORDESTROY) CEED_EXTERN void fCeedVectorDestroy(int *vec, int *err) { if (*vec == FORTRAN_NULL) return; *err = CeedVectorDestroy(&CeedVector_dict[*vec]); @@ -240,7 +205,7 @@ CEED_EXTERN void fCeedVectorDestroy(int *vec, int *err) { CeedVector_n--; if (CeedVector_n == 0) { CeedFree(&CeedVector_dict); - CeedVector_count = 0; + CeedVector_count = 0; CeedVector_count_max = 0; } } @@ -249,32 +214,24 @@ CEED_EXTERN void fCeedVectorDestroy(int *vec, int *err) { // 
----------------------------------------------------------------------------- // CeedElemRestriction // ----------------------------------------------------------------------------- -static CeedElemRestriction *CeedElemRestriction_dict = NULL; -static int CeedElemRestriction_count = 0; -static int CeedElemRestriction_n = 0; -static int CeedElemRestriction_count_max = 0; - -#define fCeedElemRestrictionCreate \ - FORTRAN_NAME(ceedelemrestrictioncreate, CEEDELEMRESTRICTIONCREATE) -CEED_EXTERN void fCeedElemRestrictionCreate(int *ceed, int *nelements, - int *esize, - int *num_comp, int *comp_stride, int *lsize, - int *memtype, int *copymode, const int *offsets, - int *elemrestriction, int *err) { +static CeedElemRestriction *CeedElemRestriction_dict = NULL; +static int CeedElemRestriction_count = 0; +static int CeedElemRestriction_n = 0; +static int CeedElemRestriction_count_max = 0; + +#define fCeedElemRestrictionCreate FORTRAN_NAME(ceedelemrestrictioncreate, CEEDELEMRESTRICTIONCREATE) +CEED_EXTERN void fCeedElemRestrictionCreate(int *ceed, int *nelements, int *esize, int *num_comp, int *comp_stride, int *lsize, int *memtype, + int *copymode, const int *offsets, int *elemrestriction, int *err) { if (CeedElemRestriction_count == CeedElemRestriction_count_max) { - CeedElemRestriction_count_max += CeedElemRestriction_count_max/2 + 1; + CeedElemRestriction_count_max += CeedElemRestriction_count_max / 2 + 1; CeedRealloc(CeedElemRestriction_count_max, &CeedElemRestriction_dict); } const int *offsets_ = offsets; - CeedElemRestriction *elemrestriction_ = - &CeedElemRestriction_dict[CeedElemRestriction_count]; - *err = CeedElemRestrictionCreate(Ceed_dict[*ceed], *nelements, *esize, - *num_comp, *comp_stride, *lsize, - (CeedMemType)*memtype, - (CeedCopyMode)*copymode, offsets_, - elemrestriction_); + CeedElemRestriction *elemrestriction_ = &CeedElemRestriction_dict[CeedElemRestriction_count]; + *err = CeedElemRestrictionCreate(Ceed_dict[*ceed], *nelements, *esize, *num_comp, 
*comp_stride, *lsize, (CeedMemType)*memtype, + (CeedCopyMode)*copymode, offsets_, elemrestriction_); if (*err == 0) { *elemrestriction = CeedElemRestriction_count++; @@ -282,53 +239,34 @@ CEED_EXTERN void fCeedElemRestrictionCreate(int *ceed, int *nelements, } } -#define fCeedElemRestrictionCreateStrided \ - FORTRAN_NAME(ceedelemrestrictioncreatestrided, CEEDELEMRESTRICTIONCREATESTRIDED) -CEED_EXTERN void fCeedElemRestrictionCreateStrided(int *ceed, int *nelements, - int *esize, - int *num_comp, int *lsize, int *strides, - int *elemrestriction, int *err) { +#define fCeedElemRestrictionCreateStrided FORTRAN_NAME(ceedelemrestrictioncreatestrided, CEEDELEMRESTRICTIONCREATESTRIDED) +CEED_EXTERN void fCeedElemRestrictionCreateStrided(int *ceed, int *nelements, int *esize, int *num_comp, int *lsize, int *strides, + int *elemrestriction, int *err) { if (CeedElemRestriction_count == CeedElemRestriction_count_max) { - CeedElemRestriction_count_max += CeedElemRestriction_count_max/2 + 1; + CeedElemRestriction_count_max += CeedElemRestriction_count_max / 2 + 1; CeedRealloc(CeedElemRestriction_count_max, &CeedElemRestriction_dict); } - CeedElemRestriction *elemrestriction_ = - &CeedElemRestriction_dict[CeedElemRestriction_count]; - *err = CeedElemRestrictionCreateStrided(Ceed_dict[*ceed], *nelements, *esize, - *num_comp, *lsize, - *strides == FORTRAN_STRIDES_BACKEND ? - CEED_STRIDES_BACKEND : strides, - elemrestriction_); + CeedElemRestriction *elemrestriction_ = &CeedElemRestriction_dict[CeedElemRestriction_count]; + *err = CeedElemRestrictionCreateStrided(Ceed_dict[*ceed], *nelements, *esize, *num_comp, *lsize, + *strides == FORTRAN_STRIDES_BACKEND ? 
CEED_STRIDES_BACKEND : strides, elemrestriction_); if (*err == 0) { *elemrestriction = CeedElemRestriction_count++; CeedElemRestriction_n++; } } -#define fCeedElemRestrictionCreateBlocked \ - FORTRAN_NAME(ceedelemrestrictioncreateblocked,CEEDELEMRESTRICTIONCREATEBLOCKED) -CEED_EXTERN void fCeedElemRestrictionCreateBlocked(int *ceed, int *nelements, - int *esize, - int *blocksize, int *num_comp, - int *comp_stride, int *lsize, - int *mtype, int *cmode, - int *blkindices, int *elemrestriction, - int *err) { - +#define fCeedElemRestrictionCreateBlocked FORTRAN_NAME(ceedelemrestrictioncreateblocked, CEEDELEMRESTRICTIONCREATEBLOCKED) +CEED_EXTERN void fCeedElemRestrictionCreateBlocked(int *ceed, int *nelements, int *esize, int *blocksize, int *num_comp, int *comp_stride, int *lsize, + int *mtype, int *cmode, int *blkindices, int *elemrestriction, int *err) { if (CeedElemRestriction_count == CeedElemRestriction_count_max) { - CeedElemRestriction_count_max += CeedElemRestriction_count_max/2 + 1; + CeedElemRestriction_count_max += CeedElemRestriction_count_max / 2 + 1; CeedRealloc(CeedElemRestriction_count_max, &CeedElemRestriction_dict); } - CeedElemRestriction *elemrestriction_ = - &CeedElemRestriction_dict[CeedElemRestriction_count]; - *err = CeedElemRestrictionCreateBlocked(Ceed_dict[*ceed], - *nelements, *esize, *blocksize, - *num_comp, *comp_stride, *lsize, - (CeedMemType)*mtype, - (CeedCopyMode)*cmode, blkindices, - elemrestriction_); + CeedElemRestriction *elemrestriction_ = &CeedElemRestriction_dict[CeedElemRestriction_count]; + *err = CeedElemRestrictionCreateBlocked(Ceed_dict[*ceed], *nelements, *esize, *blocksize, *num_comp, *comp_stride, *lsize, (CeedMemType)*mtype, + (CeedCopyMode)*cmode, blkindices, elemrestriction_); if (*err == 0) { *elemrestriction = CeedElemRestriction_count++; @@ -336,56 +274,45 @@ CEED_EXTERN void fCeedElemRestrictionCreateBlocked(int *ceed, int *nelements, } } -#define fCeedElemRestrictionCreateBlockedStrided \ - 
FORTRAN_NAME(ceedelemrestrictioncreateblockedstrided, CEEDELEMRESTRICTIONCREATEBLOCKEDSTRIDED) -CEED_EXTERN void fCeedElemRestrictionCreateBlockedStrided(int *ceed, - int *nelements, - int *esize, int *blk_size, int *num_comp, int *lsize, int *strides, - int *elemrestriction, int *err) { +#define fCeedElemRestrictionCreateBlockedStrided FORTRAN_NAME(ceedelemrestrictioncreateblockedstrided, CEEDELEMRESTRICTIONCREATEBLOCKEDSTRIDED) +CEED_EXTERN void fCeedElemRestrictionCreateBlockedStrided(int *ceed, int *nelements, int *esize, int *blk_size, int *num_comp, int *lsize, + int *strides, int *elemrestriction, int *err) { if (CeedElemRestriction_count == CeedElemRestriction_count_max) { - CeedElemRestriction_count_max += CeedElemRestriction_count_max/2 + 1; + CeedElemRestriction_count_max += CeedElemRestriction_count_max / 2 + 1; CeedRealloc(CeedElemRestriction_count_max, &CeedElemRestriction_dict); } - CeedElemRestriction *elemrestriction_ = - &CeedElemRestriction_dict[CeedElemRestriction_count]; - *err = CeedElemRestrictionCreateBlockedStrided(Ceed_dict[*ceed], *nelements, - *esize, *blk_size, *num_comp, *lsize, strides, elemrestriction_); + CeedElemRestriction *elemrestriction_ = &CeedElemRestriction_dict[CeedElemRestriction_count]; + *err = CeedElemRestrictionCreateBlockedStrided(Ceed_dict[*ceed], *nelements, *esize, *blk_size, *num_comp, *lsize, strides, elemrestriction_); if (*err == 0) { *elemrestriction = CeedElemRestriction_count++; CeedElemRestriction_n++; } } -static CeedRequest *CeedRequest_dict = NULL; -static int CeedRequest_count = 0; -static int CeedRequest_n = 0; -static int CeedRequest_count_max = 0; +static CeedRequest *CeedRequest_dict = NULL; +static int CeedRequest_count = 0; +static int CeedRequest_n = 0; +static int CeedRequest_count_max = 0; -#define fCeedElemRestrictionApply \ - FORTRAN_NAME(ceedelemrestrictionapply,CEEDELEMRESTRICTIONAPPLY) -CEED_EXTERN void fCeedElemRestrictionApply(int *elemr, int *tmode, int *uvec, - int *ruvec, - int *rqst, 
int *err) { +#define fCeedElemRestrictionApply FORTRAN_NAME(ceedelemrestrictionapply, CEEDELEMRESTRICTIONAPPLY) +CEED_EXTERN void fCeedElemRestrictionApply(int *elemr, int *tmode, int *uvec, int *ruvec, int *rqst, int *err) { int createRequest = 1; // Check if input is CEED_REQUEST_ORDERED(-2) or CEED_REQUEST_IMMEDIATE(-1) - if (*rqst == FORTRAN_REQUEST_IMMEDIATE || *rqst == FORTRAN_REQUEST_ORDERED) - createRequest = 0; + if (*rqst == FORTRAN_REQUEST_IMMEDIATE || *rqst == FORTRAN_REQUEST_ORDERED) createRequest = 0; if (createRequest && CeedRequest_count == CeedRequest_count_max) { - CeedRequest_count_max += CeedRequest_count_max/2 + 1; + CeedRequest_count_max += CeedRequest_count_max / 2 + 1; CeedRealloc(CeedRequest_count_max, &CeedRequest_dict); } CeedRequest *rqst_; - if (*rqst == FORTRAN_REQUEST_IMMEDIATE) rqst_ = CEED_REQUEST_IMMEDIATE; - else if (*rqst == FORTRAN_REQUEST_ORDERED ) rqst_ = CEED_REQUEST_ORDERED; + if (*rqst == FORTRAN_REQUEST_IMMEDIATE) rqst_ = CEED_REQUEST_IMMEDIATE; + else if (*rqst == FORTRAN_REQUEST_ORDERED) rqst_ = CEED_REQUEST_ORDERED; else rqst_ = &CeedRequest_dict[CeedRequest_count]; - *err = CeedElemRestrictionApply(CeedElemRestriction_dict[*elemr], - (CeedTransposeMode)*tmode, - CeedVector_dict[*uvec], - CeedVector_dict[*ruvec], rqst_); + *err = + CeedElemRestrictionApply(CeedElemRestriction_dict[*elemr], (CeedTransposeMode)*tmode, CeedVector_dict[*uvec], CeedVector_dict[*ruvec], rqst_); if (*err == 0 && createRequest) { *rqst = CeedRequest_count++; @@ -393,28 +320,23 @@ CEED_EXTERN void fCeedElemRestrictionApply(int *elemr, int *tmode, int *uvec, } } -#define fCeedElemRestrictionApplyBlock \ - FORTRAN_NAME(ceedelemrestrictionapplyblock,CEEDELEMRESTRICTIONAPPLYBLOCK) -CEED_EXTERN void fCeedElemRestrictionApplyBlock(int *elemr, int *block, - int *tmode, - int *uvec, int *ruvec, int *rqst, int *err) { +#define fCeedElemRestrictionApplyBlock FORTRAN_NAME(ceedelemrestrictionapplyblock, CEEDELEMRESTRICTIONAPPLYBLOCK) +CEED_EXTERN void 
fCeedElemRestrictionApplyBlock(int *elemr, int *block, int *tmode, int *uvec, int *ruvec, int *rqst, int *err) { int createRequest = 1; // Check if input is CEED_REQUEST_ORDERED(-2) or CEED_REQUEST_IMMEDIATE(-1) - if (*rqst == FORTRAN_REQUEST_IMMEDIATE || *rqst == FORTRAN_REQUEST_ORDERED) - createRequest = 0; + if (*rqst == FORTRAN_REQUEST_IMMEDIATE || *rqst == FORTRAN_REQUEST_ORDERED) createRequest = 0; if (createRequest && CeedRequest_count == CeedRequest_count_max) { - CeedRequest_count_max += CeedRequest_count_max/2 + 1; + CeedRequest_count_max += CeedRequest_count_max / 2 + 1; CeedRealloc(CeedRequest_count_max, &CeedRequest_dict); } CeedRequest *rqst_; - if (*rqst == FORTRAN_REQUEST_IMMEDIATE) rqst_ = CEED_REQUEST_IMMEDIATE; - else if (*rqst == FORTRAN_REQUEST_ORDERED ) rqst_ = CEED_REQUEST_ORDERED; + if (*rqst == FORTRAN_REQUEST_IMMEDIATE) rqst_ = CEED_REQUEST_IMMEDIATE; + else if (*rqst == FORTRAN_REQUEST_ORDERED) rqst_ = CEED_REQUEST_ORDERED; else rqst_ = &CeedRequest_dict[CeedRequest_count]; - *err = CeedElemRestrictionApplyBlock(CeedElemRestriction_dict[*elemr], *block, - (CeedTransposeMode)*tmode, CeedVector_dict[*uvec], + *err = CeedElemRestrictionApplyBlock(CeedElemRestriction_dict[*elemr], *block, (CeedTransposeMode)*tmode, CeedVector_dict[*uvec], CeedVector_dict[*ruvec], rqst_); if (*err == 0 && createRequest) { @@ -423,30 +345,20 @@ CEED_EXTERN void fCeedElemRestrictionApplyBlock(int *elemr, int *block, } } -#define fCeedElemRestrictionGetMultiplicity \ - FORTRAN_NAME(ceedelemrestrictiongetmultiplicity,CEEDELEMRESTRICTIONGETMULTIPLICITY) -CEED_EXTERN void fCeedElemRestrictionGetMultiplicity(int *elemr, int *mult, - int *err) { - *err = CeedElemRestrictionGetMultiplicity(CeedElemRestriction_dict[*elemr], - CeedVector_dict[*mult]); +#define fCeedElemRestrictionGetMultiplicity FORTRAN_NAME(ceedelemrestrictiongetmultiplicity, CEEDELEMRESTRICTIONGETMULTIPLICITY) +CEED_EXTERN void fCeedElemRestrictionGetMultiplicity(int *elemr, int *mult, int *err) { + 
*err = CeedElemRestrictionGetMultiplicity(CeedElemRestriction_dict[*elemr], CeedVector_dict[*mult]); } -#define fCeedElemRestrictionGetELayout \ - FORTRAN_NAME(ceedelemrestrictiongetelayout,CEEDELEMRESTRICTIONGETELAYOUT) -CEED_EXTERN void fCeedElemRestrictionGetELayout(int *elemr, int *layout, - int *err) { +#define fCeedElemRestrictionGetELayout FORTRAN_NAME(ceedelemrestrictiongetelayout, CEEDELEMRESTRICTIONGETELAYOUT) +CEED_EXTERN void fCeedElemRestrictionGetELayout(int *elemr, int *layout, int *err) { CeedInt layout_c[3]; - *err = CeedElemRestrictionGetELayout(CeedElemRestriction_dict[*elemr], - &layout_c); - for (int i=0; i<3; i++) - layout[i] = layout_c[i]; + *err = CeedElemRestrictionGetELayout(CeedElemRestriction_dict[*elemr], &layout_c); + for (int i = 0; i < 3; i++) layout[i] = layout_c[i]; } -#define fCeedElemRestrictionView \ - FORTRAN_NAME(ceedelemrestrictionview,CEEDELEMRESTRICTIONVIEW) -CEED_EXTERN void fCeedElemRestrictionView(int *elemr, int *err) { - *err = CeedElemRestrictionView(CeedElemRestriction_dict[*elemr], stdout); -} +#define fCeedElemRestrictionView FORTRAN_NAME(ceedelemrestrictionview, CEEDELEMRESTRICTIONVIEW) +CEED_EXTERN void fCeedElemRestrictionView(int *elemr, int *err) { *err = CeedElemRestrictionView(CeedElemRestriction_dict[*elemr], stdout); } #define fCeedRequestWait FORTRAN_NAME(ceedrequestwait, CEEDREQUESTWAIT) CEED_EXTERN void fCeedRequestWait(int *rqst, int *err) { @@ -457,14 +369,13 @@ CEED_EXTERN void fCeedRequestWait(int *rqst, int *err) { CeedRequest_n--; if (CeedRequest_n == 0) { CeedFree(&CeedRequest_dict); - CeedRequest_count = 0; + CeedRequest_count = 0; CeedRequest_count_max = 0; } } } -#define fCeedElemRestrictionDestroy \ - FORTRAN_NAME(ceedelemrestrictiondestroy,CEEDELEMRESTRICTIONDESTROY) +#define fCeedElemRestrictionDestroy FORTRAN_NAME(ceedelemrestrictiondestroy, CEEDELEMRESTRICTIONDESTROY) CEED_EXTERN void fCeedElemRestrictionDestroy(int *elem, int *err) { if (*elem == FORTRAN_NULL) return; *err = 
CeedElemRestrictionDestroy(&CeedElemRestriction_dict[*elem]); @@ -474,7 +385,7 @@ CEED_EXTERN void fCeedElemRestrictionDestroy(int *elem, int *err) { CeedElemRestriction_n--; if (CeedElemRestriction_n == 0) { CeedFree(&CeedElemRestriction_dict); - CeedElemRestriction_count = 0; + CeedElemRestriction_count = 0; CeedElemRestriction_count_max = 0; } } @@ -483,25 +394,19 @@ CEED_EXTERN void fCeedElemRestrictionDestroy(int *elem, int *err) { // ----------------------------------------------------------------------------- // CeedBasis // ----------------------------------------------------------------------------- -static CeedBasis *CeedBasis_dict = NULL; -static int CeedBasis_count = 0; -static int CeedBasis_n = 0; -static int CeedBasis_count_max = 0; - -#define fCeedBasisCreateTensorH1Lagrange \ - FORTRAN_NAME(ceedbasiscreatetensorh1lagrange, CEEDBASISCREATETENSORH1LAGRANGE) -CEED_EXTERN void fCeedBasisCreateTensorH1Lagrange(int *ceed, int *dim, - int *num_comp, int *P, int *Q, int *quadmode, - int *basis, int *err) { +static CeedBasis *CeedBasis_dict = NULL; +static int CeedBasis_count = 0; +static int CeedBasis_n = 0; +static int CeedBasis_count_max = 0; + +#define fCeedBasisCreateTensorH1Lagrange FORTRAN_NAME(ceedbasiscreatetensorh1lagrange, CEEDBASISCREATETENSORH1LAGRANGE) +CEED_EXTERN void fCeedBasisCreateTensorH1Lagrange(int *ceed, int *dim, int *num_comp, int *P, int *Q, int *quadmode, int *basis, int *err) { if (CeedBasis_count == CeedBasis_count_max) { - CeedBasis_count_max += CeedBasis_count_max/2 + 1; + CeedBasis_count_max += CeedBasis_count_max / 2 + 1; CeedRealloc(CeedBasis_count_max, &CeedBasis_dict); } - *err = CeedBasisCreateTensorH1Lagrange(Ceed_dict[*ceed], *dim, *num_comp, *P, - *Q, - (CeedQuadMode)*quadmode, - &CeedBasis_dict[CeedBasis_count]); + *err = CeedBasisCreateTensorH1Lagrange(Ceed_dict[*ceed], *dim, *num_comp, *P, *Q, (CeedQuadMode)*quadmode, &CeedBasis_dict[CeedBasis_count]); if (*err == 0) { *basis = CeedBasis_count++; @@ -509,22 +414,16 
@@ CEED_EXTERN void fCeedBasisCreateTensorH1Lagrange(int *ceed, int *dim, } } -#define fCeedBasisCreateTensorH1 \ - FORTRAN_NAME(ceedbasiscreatetensorh1, CEEDBASISCREATETENSORH1) -CEED_EXTERN void fCeedBasisCreateTensorH1(int *ceed, int *dim, int *num_comp, - int *P_1d, - int *Q_1d, const CeedScalar *interp_1d, - const CeedScalar *grad_1d, - const CeedScalar *q_ref_1d, - const CeedScalar *q_weight_1d, int *basis, - int *err) { +#define fCeedBasisCreateTensorH1 FORTRAN_NAME(ceedbasiscreatetensorh1, CEEDBASISCREATETENSORH1) +CEED_EXTERN void fCeedBasisCreateTensorH1(int *ceed, int *dim, int *num_comp, int *P_1d, int *Q_1d, const CeedScalar *interp_1d, + const CeedScalar *grad_1d, const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, int *basis, + int *err) { if (CeedBasis_count == CeedBasis_count_max) { - CeedBasis_count_max += CeedBasis_count_max/2 + 1; + CeedBasis_count_max += CeedBasis_count_max / 2 + 1; CeedRealloc(CeedBasis_count_max, &CeedBasis_dict); } - *err = CeedBasisCreateTensorH1(Ceed_dict[*ceed], *dim, *num_comp, *P_1d, *Q_1d, - interp_1d, grad_1d, q_ref_1d, q_weight_1d, + *err = CeedBasisCreateTensorH1(Ceed_dict[*ceed], *dim, *num_comp, *P_1d, *Q_1d, interp_1d, grad_1d, q_ref_1d, q_weight_1d, &CeedBasis_dict[CeedBasis_count]); if (*err == 0) { @@ -533,20 +432,15 @@ CEED_EXTERN void fCeedBasisCreateTensorH1(int *ceed, int *dim, int *num_comp, } } -#define fCeedBasisCreateH1 \ - FORTRAN_NAME(ceedbasiscreateh1, CEEDBASISCREATEH1) -CEED_EXTERN void fCeedBasisCreateH1(int *ceed, int *topo, int *num_comp, - int *nnodes, - int *nqpts, const CeedScalar *interp, - const CeedScalar *grad, const CeedScalar *qref, - const CeedScalar *qweight, int *basis, int *err) { +#define fCeedBasisCreateH1 FORTRAN_NAME(ceedbasiscreateh1, CEEDBASISCREATEH1) +CEED_EXTERN void fCeedBasisCreateH1(int *ceed, int *topo, int *num_comp, int *nnodes, int *nqpts, const CeedScalar *interp, const CeedScalar *grad, + const CeedScalar *qref, const CeedScalar *qweight, int *basis, int 
*err) { if (CeedBasis_count == CeedBasis_count_max) { - CeedBasis_count_max += CeedBasis_count_max/2 + 1; + CeedBasis_count_max += CeedBasis_count_max / 2 + 1; CeedRealloc(CeedBasis_count_max, &CeedBasis_dict); } - *err = CeedBasisCreateH1(Ceed_dict[*ceed], (CeedElemTopology)*topo, *num_comp, - *nnodes, *nqpts, interp, grad, qref, qweight, + *err = CeedBasisCreateH1(Ceed_dict[*ceed], (CeedElemTopology)*topo, *num_comp, *nnodes, *nqpts, interp, grad, qref, qweight, &CeedBasis_dict[CeedBasis_count]); if (*err == 0) { @@ -556,110 +450,72 @@ CEED_EXTERN void fCeedBasisCreateH1(int *ceed, int *topo, int *num_comp, } #define fCeedBasisView FORTRAN_NAME(ceedbasisview, CEEDBASISVIEW) -CEED_EXTERN void fCeedBasisView(int *basis, int *err) { - *err = CeedBasisView(CeedBasis_dict[*basis], stdout); -} +CEED_EXTERN void fCeedBasisView(int *basis, int *err) { *err = CeedBasisView(CeedBasis_dict[*basis], stdout); } -#define fCeedQRFactorization \ - FORTRAN_NAME(ceedqrfactorization, CEEDQRFACTORIZATION) -CEED_EXTERN void fCeedQRFactorization(int *ceed, CeedScalar *mat, - CeedScalar *tau, int *m, - int *n, int *err) { +#define fCeedQRFactorization FORTRAN_NAME(ceedqrfactorization, CEEDQRFACTORIZATION) +CEED_EXTERN void fCeedQRFactorization(int *ceed, CeedScalar *mat, CeedScalar *tau, int *m, int *n, int *err) { *err = CeedQRFactorization(Ceed_dict[*ceed], mat, tau, *m, *n); } -#define fCeedHouseholderApplyQ \ - FORTRAN_NAME(ceedhouseholderapplyq, CEEDHOUSEHOLDERAPPLYQ) -CEED_EXTERN void fCeedHouseholderApplyQ(CeedScalar *A, CeedScalar *Q, - CeedScalar *tau, - int *t_mode, - int *m, int *n, int *k, int *row, int *col, int *err) { - *err = CeedHouseholderApplyQ(A, Q, tau, (CeedTransposeMode)*t_mode, *m, *n, *k, - *row, *col); +#define fCeedHouseholderApplyQ FORTRAN_NAME(ceedhouseholderapplyq, CEEDHOUSEHOLDERAPPLYQ) +CEED_EXTERN void fCeedHouseholderApplyQ(CeedScalar *A, CeedScalar *Q, CeedScalar *tau, int *t_mode, int *m, int *n, int *k, int *row, int *col, + int *err) { + *err = 
CeedHouseholderApplyQ(A, Q, tau, (CeedTransposeMode)*t_mode, *m, *n, *k, *row, *col); } -#define fCeedSymmetricSchurDecomposition \ - FORTRAN_NAME(ceedsymmetricschurdecomposition, CEEDSYMMETRICSCHURDECOMPOSITION) -CEED_EXTERN void fCeedSymmetricSchurDecomposition(int *ceed, CeedScalar *mat, - CeedScalar *lambda, int *n, int *err) { +#define fCeedSymmetricSchurDecomposition FORTRAN_NAME(ceedsymmetricschurdecomposition, CEEDSYMMETRICSCHURDECOMPOSITION) +CEED_EXTERN void fCeedSymmetricSchurDecomposition(int *ceed, CeedScalar *mat, CeedScalar *lambda, int *n, int *err) { *err = CeedSymmetricSchurDecomposition(Ceed_dict[*ceed], mat, lambda, *n); } -#define fCeedSimultaneousDiagonalization \ - FORTRAN_NAME(ceedsimultaneousdiagonalization, CEEDSIMULTANEOUSDIAGONALIZATION) -CEED_EXTERN void fCeedSimultaneousDiagonalization(int *ceed, CeedScalar *matA, - CeedScalar *matB, CeedScalar *x, - CeedScalar *lambda, int *n, int *err) { - *err = CeedSimultaneousDiagonalization(Ceed_dict[*ceed], matA, matB, x, - lambda, *n); +#define fCeedSimultaneousDiagonalization FORTRAN_NAME(ceedsimultaneousdiagonalization, CEEDSIMULTANEOUSDIAGONALIZATION) +CEED_EXTERN void fCeedSimultaneousDiagonalization(int *ceed, CeedScalar *matA, CeedScalar *matB, CeedScalar *x, CeedScalar *lambda, int *n, + int *err) { + *err = CeedSimultaneousDiagonalization(Ceed_dict[*ceed], matA, matB, x, lambda, *n); } -#define fCeedBasisGetCollocatedGrad \ - FORTRAN_NAME(ceedbasisgetcollocatedgrad, CEEDBASISGETCOLLOCATEDGRAD) -CEED_EXTERN void fCeedBasisGetCollocatedGrad(int *basis, - CeedScalar *colo_grad_1d, - int *err) { +#define fCeedBasisGetCollocatedGrad FORTRAN_NAME(ceedbasisgetcollocatedgrad, CEEDBASISGETCOLLOCATEDGRAD) +CEED_EXTERN void fCeedBasisGetCollocatedGrad(int *basis, CeedScalar *colo_grad_1d, int *err) { *err = CeedBasisGetCollocatedGrad(CeedBasis_dict[*basis], colo_grad_1d); } #define fCeedBasisApply FORTRAN_NAME(ceedbasisapply, CEEDBASISAPPLY) -CEED_EXTERN void fCeedBasisApply(int *basis, int 
*num_elem, int *tmode, - int *eval_mode, - int *u, int *v, int *err) { - *err = CeedBasisApply(CeedBasis_dict[*basis], *num_elem, - (CeedTransposeMode)*tmode, - (CeedEvalMode)*eval_mode, - *u == FORTRAN_VECTOR_NONE ? CEED_VECTOR_NONE : CeedVector_dict[*u], - CeedVector_dict[*v]); -} - -#define fCeedBasisGetNumNodes \ - FORTRAN_NAME(ceedbasisgetnumnodes, CEEDBASISGETNUMNODES) -CEED_EXTERN void fCeedBasisGetNumNodes(int *basis, int *P, int *err) { - *err = CeedBasisGetNumNodes(CeedBasis_dict[*basis], P); -} - -#define fCeedBasisGetNumQuadraturePoints \ - FORTRAN_NAME(ceedbasisgetnumquadraturepoints, CEEDBASISGETNUMQUADRATUREPOINTS) -CEED_EXTERN void fCeedBasisGetNumQuadraturePoints(int *basis, int *Q, - int *err) { - *err = CeedBasisGetNumQuadraturePoints(CeedBasis_dict[*basis], Q); -} - -#define fCeedBasisGetInterp1D \ - FORTRAN_NAME(ceedbasisgetinterp1d, CEEDBASISGETINTERP1D) -CEED_EXTERN void fCeedBasisGetInterp1D(int *basis, CeedScalar *interp_1d, - int64_t *offset, - int *err) { +CEED_EXTERN void fCeedBasisApply(int *basis, int *num_elem, int *tmode, int *eval_mode, int *u, int *v, int *err) { + *err = CeedBasisApply(CeedBasis_dict[*basis], *num_elem, (CeedTransposeMode)*tmode, (CeedEvalMode)*eval_mode, + *u == FORTRAN_VECTOR_NONE ? 
CEED_VECTOR_NONE : CeedVector_dict[*u], CeedVector_dict[*v]); +} + +#define fCeedBasisGetNumNodes FORTRAN_NAME(ceedbasisgetnumnodes, CEEDBASISGETNUMNODES) +CEED_EXTERN void fCeedBasisGetNumNodes(int *basis, int *P, int *err) { *err = CeedBasisGetNumNodes(CeedBasis_dict[*basis], P); } + +#define fCeedBasisGetNumQuadraturePoints FORTRAN_NAME(ceedbasisgetnumquadraturepoints, CEEDBASISGETNUMQUADRATUREPOINTS) +CEED_EXTERN void fCeedBasisGetNumQuadraturePoints(int *basis, int *Q, int *err) { *err = CeedBasisGetNumQuadraturePoints(CeedBasis_dict[*basis], Q); } + +#define fCeedBasisGetInterp1D FORTRAN_NAME(ceedbasisgetinterp1d, CEEDBASISGETINTERP1D) +CEED_EXTERN void fCeedBasisGetInterp1D(int *basis, CeedScalar *interp_1d, int64_t *offset, int *err) { const CeedScalar *interp1d_; - CeedBasis basis_ = CeedBasis_dict[*basis]; - *err = CeedBasisGetInterp1D(basis_, &interp1d_); - *offset = interp1d_ - interp_1d; + CeedBasis basis_ = CeedBasis_dict[*basis]; + *err = CeedBasisGetInterp1D(basis_, &interp1d_); + *offset = interp1d_ - interp_1d; } -#define fCeedBasisGetGrad1D \ - FORTRAN_NAME(ceedbasisgetgrad1d, CEEDBASISGETGRAD1D) -CEED_EXTERN void fCeedBasisGetGrad1D(int *basis, CeedScalar *grad_1d, - int64_t *offset, - int *err) { +#define fCeedBasisGetGrad1D FORTRAN_NAME(ceedbasisgetgrad1d, CEEDBASISGETGRAD1D) +CEED_EXTERN void fCeedBasisGetGrad1D(int *basis, CeedScalar *grad_1d, int64_t *offset, int *err) { const CeedScalar *grad1d_; - CeedBasis basis_ = CeedBasis_dict[*basis]; - *err = CeedBasisGetGrad1D(basis_, &grad1d_); - *offset = grad1d_ - grad_1d; + CeedBasis basis_ = CeedBasis_dict[*basis]; + *err = CeedBasisGetGrad1D(basis_, &grad1d_); + *offset = grad1d_ - grad_1d; } -#define fCeedBasisGetQRef \ - FORTRAN_NAME(ceedbasisgetqref, CEEDBASISGETQREF) -CEED_EXTERN void fCeedBasisGetQRef(int *basis, CeedScalar *q_ref, - int64_t *offset, - int *err) { +#define fCeedBasisGetQRef FORTRAN_NAME(ceedbasisgetqref, CEEDBASISGETQREF) +CEED_EXTERN void fCeedBasisGetQRef(int *basis, 
CeedScalar *q_ref, int64_t *offset, int *err) { const CeedScalar *qref_; - CeedBasis basis_ = CeedBasis_dict[*basis]; - *err = CeedBasisGetQRef(basis_, &qref_); - *offset = qref_ - q_ref; + CeedBasis basis_ = CeedBasis_dict[*basis]; + *err = CeedBasisGetQRef(basis_, &qref_); + *offset = qref_ - q_ref; } -#define fCeedBasisDestroy FORTRAN_NAME(ceedbasisdestroy,CEEDBASISDESTROY) +#define fCeedBasisDestroy FORTRAN_NAME(ceedbasisdestroy, CEEDBASISDESTROY) CEED_EXTERN void fCeedBasisDestroy(int *basis, int *err) { if (*basis == FORTRAN_NULL) return; *err = CeedBasisDestroy(&CeedBasis_dict[*basis]); @@ -669,45 +525,38 @@ CEED_EXTERN void fCeedBasisDestroy(int *basis, int *err) { CeedBasis_n--; if (CeedBasis_n == 0) { CeedFree(&CeedBasis_dict); - CeedBasis_count = 0; + CeedBasis_count = 0; CeedBasis_count_max = 0; } } } #define fCeedGaussQuadrature FORTRAN_NAME(ceedgaussquadrature, CEEDGAUSSQUADRATURE) -CEED_EXTERN void fCeedGaussQuadrature(int *Q, CeedScalar *q_ref_1d, - CeedScalar *q_weight_1d, - int *err) { +CEED_EXTERN void fCeedGaussQuadrature(int *Q, CeedScalar *q_ref_1d, CeedScalar *q_weight_1d, int *err) { *err = CeedGaussQuadrature(*Q, q_ref_1d, q_weight_1d); } -#define fCeedLobattoQuadrature \ - FORTRAN_NAME(ceedlobattoquadrature, CEEDLOBATTOQUADRATURE) -CEED_EXTERN void fCeedLobattoQuadrature(int *Q, CeedScalar *q_ref_1d, - CeedScalar *q_weight_1d, - int *err) { +#define fCeedLobattoQuadrature FORTRAN_NAME(ceedlobattoquadrature, CEEDLOBATTOQUADRATURE) +CEED_EXTERN void fCeedLobattoQuadrature(int *Q, CeedScalar *q_ref_1d, CeedScalar *q_weight_1d, int *err) { *err = CeedLobattoQuadrature(*Q, q_ref_1d, q_weight_1d); } // ----------------------------------------------------------------------------- // CeedQFunctionContext // ----------------------------------------------------------------------------- -static CeedQFunctionContext *CeedQFunctionContext_dict = NULL; -static int CeedQFunctionContext_count = 0; -static int CeedQFunctionContext_n = 0; -static int 
CeedQFunctionContext_count_max = 0; +static CeedQFunctionContext *CeedQFunctionContext_dict = NULL; +static int CeedQFunctionContext_count = 0; +static int CeedQFunctionContext_n = 0; +static int CeedQFunctionContext_count_max = 0; -#define fCeedQFunctionContextCreate \ - FORTRAN_NAME(ceedqfunctioncontextcreate,CEEDQFUNCTIONCONTEXTCREATE) +#define fCeedQFunctionContextCreate FORTRAN_NAME(ceedqfunctioncontextcreate, CEEDQFUNCTIONCONTEXTCREATE) CEED_EXTERN void fCeedQFunctionContextCreate(int *ceed, int *ctx, int *err) { if (CeedQFunctionContext_count == CeedQFunctionContext_count_max) { - CeedQFunctionContext_count_max += CeedQFunctionContext_count_max/2 + 1; + CeedQFunctionContext_count_max += CeedQFunctionContext_count_max / 2 + 1; CeedRealloc(CeedQFunctionContext_count_max, &CeedQFunctionContext_dict); } - CeedQFunctionContext *ctx_ = - &CeedQFunctionContext_dict[CeedQFunctionContext_count]; + CeedQFunctionContext *ctx_ = &CeedQFunctionContext_dict[CeedQFunctionContext_count]; *err = CeedQFunctionContextCreate(Ceed_dict[*ceed], ctx_); if (*err) return; @@ -715,47 +564,30 @@ CEED_EXTERN void fCeedQFunctionContextCreate(int *ceed, int *ctx, int *err) { CeedQFunctionContext_n++; } -#define fCeedQFunctionContextSetData \ - FORTRAN_NAME(ceedqfunctioncontextsetdata,CEEDQFUNCTIONCONTEXTSETDATA) -CEED_EXTERN void fCeedQFunctionContextSetData(int *ctx, int *memtype, - int *copymode, - CeedInt *n, - CeedScalar *data, int64_t *offset, int *err) { - size_t ctx_size = ((size_t) *n)*sizeof(CeedScalar); - *err = CeedQFunctionContextSetData(CeedQFunctionContext_dict[*ctx], - (CeedMemType)*memtype, - (CeedCopyMode)*copymode, ctx_size, - data + *offset); -} - -#define fCeedQFunctionContextGetData \ - FORTRAN_NAME(ceedqfunctioncontextgetdata,CEEDQFUNCTIONCONTEXTGETDATA) -CEED_EXTERN void fCeedQFunctionContextGetData(int *ctx, int *memtype, - CeedScalar *data, - int64_t *offset, int *err) { - CeedScalar *b; +#define fCeedQFunctionContextSetData 
FORTRAN_NAME(ceedqfunctioncontextsetdata, CEEDQFUNCTIONCONTEXTSETDATA) +CEED_EXTERN void fCeedQFunctionContextSetData(int *ctx, int *memtype, int *copymode, CeedInt *n, CeedScalar *data, int64_t *offset, int *err) { + size_t ctx_size = ((size_t)*n) * sizeof(CeedScalar); + *err = CeedQFunctionContextSetData(CeedQFunctionContext_dict[*ctx], (CeedMemType)*memtype, (CeedCopyMode)*copymode, ctx_size, data + *offset); +} + +#define fCeedQFunctionContextGetData FORTRAN_NAME(ceedqfunctioncontextgetdata, CEEDQFUNCTIONCONTEXTGETDATA) +CEED_EXTERN void fCeedQFunctionContextGetData(int *ctx, int *memtype, CeedScalar *data, int64_t *offset, int *err) { + CeedScalar *b; CeedQFunctionContext ctx_ = CeedQFunctionContext_dict[*ctx]; - *err = CeedQFunctionContextGetData(ctx_, (CeedMemType)*memtype, &b); - *offset = b - data; + *err = CeedQFunctionContextGetData(ctx_, (CeedMemType)*memtype, &b); + *offset = b - data; } -#define fCeedQFunctionContextRestoreData \ - FORTRAN_NAME(ceedqfunctioncontextrestoredata,CEEDQFUNCTIONCONTEXTRESTOREDATA) -CEED_EXTERN void fCeedQFunctionContextRestoreData(int *ctx, CeedScalar *data, - int64_t *offset, int *err) { - *err = CeedQFunctionContextRestoreData(CeedQFunctionContext_dict[*ctx], - (void **)&data); +#define fCeedQFunctionContextRestoreData FORTRAN_NAME(ceedqfunctioncontextrestoredata, CEEDQFUNCTIONCONTEXTRESTOREDATA) +CEED_EXTERN void fCeedQFunctionContextRestoreData(int *ctx, CeedScalar *data, int64_t *offset, int *err) { + *err = CeedQFunctionContextRestoreData(CeedQFunctionContext_dict[*ctx], (void **)&data); *offset = 0; } -#define fCeedQFunctionContextView \ - FORTRAN_NAME(ceedqfunctioncontextview,CEEDQFUNCTIONCONTEXTVIEW) -CEED_EXTERN void fCeedQFunctionContextView(int *ctx, int *err) { - *err = CeedQFunctionContextView(CeedQFunctionContext_dict[*ctx], stdout); -} +#define fCeedQFunctionContextView FORTRAN_NAME(ceedqfunctioncontextview, CEEDQFUNCTIONCONTEXTVIEW) +CEED_EXTERN void fCeedQFunctionContextView(int *ctx, int *err) { *err = 
CeedQFunctionContextView(CeedQFunctionContext_dict[*ctx], stdout); } -#define fCeedQFunctionContextDestroy \ - FORTRAN_NAME(ceedqfunctioncontextdestroy,CEEDQFUNCTIONCONTEXTDESTROY) +#define fCeedQFunctionContextDestroy FORTRAN_NAME(ceedqfunctioncontextdestroy, CEEDQFUNCTIONCONTEXTDESTROY) CEED_EXTERN void fCeedQFunctionContextDestroy(int *ctx, int *err) { if (*ctx == FORTRAN_NULL) return; *err = CeedQFunctionContextDestroy(&CeedQFunctionContext_dict[*ctx]); @@ -765,7 +597,7 @@ CEED_EXTERN void fCeedQFunctionContextDestroy(int *ctx, int *err) { CeedQFunctionContext_n--; if (CeedQFunctionContext_n == 0) { CeedFree(&CeedQFunctionContext_dict); - CeedQFunctionContext_count = 0; + CeedQFunctionContext_count = 0; CeedQFunctionContext_count_max = 0; } } @@ -774,17 +606,15 @@ CEED_EXTERN void fCeedQFunctionContextDestroy(int *ctx, int *err) { // ----------------------------------------------------------------------------- // CeedQFunction // ----------------------------------------------------------------------------- -static CeedQFunction *CeedQFunction_dict = NULL; -static int CeedQFunction_count = 0; -static int CeedQFunction_n = 0; -static int CeedQFunction_count_max = 0; - -static int CeedQFunctionFortranStub(void *ctx, int nq, - const CeedScalar *const *u, - CeedScalar *const *v) { - CeedFortranContext fctx = ctx; +static CeedQFunction *CeedQFunction_dict = NULL; +static int CeedQFunction_count = 0; +static int CeedQFunction_n = 0; +static int CeedQFunction_count_max = 0; + +static int CeedQFunctionFortranStub(void *ctx, int nq, const CeedScalar *const *u, CeedScalar *const *v) { + CeedFortranContext fctx = ctx; CeedQFunctionContext inner_ctx = fctx->inner_ctx; - int ierr; + int ierr; CeedScalar *ctx_ = NULL; // Note: Device backends are generating their own kernels from @@ -795,10 +625,8 @@ static int CeedQFunctionFortranStub(void *ctx, int nq, CeedChk(ierr); } - fctx->f((void *)ctx_,&nq,u[0],u[1],u[2],u[3],u[4],u[5],u[6], - 
u[7],u[8],u[9],u[10],u[11],u[12],u[13],u[14],u[15], - v[0],v[1],v[2],v[3],v[4],v[5],v[6],v[7],v[8],v[9], - v[10],v[11],v[12],v[13],v[14],v[15],&ierr); + fctx->f((void *)ctx_, &nq, u[0], u[1], u[2], u[3], u[4], u[5], u[6], u[7], u[8], u[9], u[10], u[11], u[12], u[13], u[14], u[15], v[0], v[1], v[2], + v[3], v[4], v[5], v[6], v[7], v[8], v[9], v[10], v[11], v[12], v[13], v[14], v[15], &ierr); if (inner_ctx) { ierr = CeedQFunctionContextRestoreData(inner_ctx, (void *)&ctx_); @@ -808,37 +636,23 @@ static int CeedQFunctionFortranStub(void *ctx, int nq, return ierr; } -#define fCeedQFunctionCreateInterior \ - FORTRAN_NAME(ceedqfunctioncreateinterior, CEEDQFUNCTIONCREATEINTERIOR) -CEED_EXTERN void fCeedQFunctionCreateInterior(int *ceed, int *vec_length, - void (*f)(void *ctx, int *nq, - const CeedScalar *u,const CeedScalar *u1, - const CeedScalar *u2,const CeedScalar *u3, - const CeedScalar *u4,const CeedScalar *u5, - const CeedScalar *u6,const CeedScalar *u7, - const CeedScalar *u8,const CeedScalar *u9, - const CeedScalar *u10,const CeedScalar *u11, - const CeedScalar *u12,const CeedScalar *u13, - const CeedScalar *u14,const CeedScalar *u15, - CeedScalar *v,CeedScalar *v1,CeedScalar *v2, - CeedScalar *v3,CeedScalar *v4, - CeedScalar *v5,CeedScalar *v6, - CeedScalar *v7,CeedScalar *v8, - CeedScalar *v9,CeedScalar *v10, - CeedScalar *v11,CeedScalar *v12, - CeedScalar *v13,CeedScalar *v14, - CeedScalar *v15,int *err), - const char *source, int *qf, int *err, - fortran_charlen_t source_len) { +#define fCeedQFunctionCreateInterior FORTRAN_NAME(ceedqfunctioncreateinterior, CEEDQFUNCTIONCREATEINTERIOR) +CEED_EXTERN void fCeedQFunctionCreateInterior( + int *ceed, int *vec_length, + void (*f)(void *ctx, int *nq, const CeedScalar *u, const CeedScalar *u1, const CeedScalar *u2, const CeedScalar *u3, const CeedScalar *u4, + const CeedScalar *u5, const CeedScalar *u6, const CeedScalar *u7, const CeedScalar *u8, const CeedScalar *u9, const CeedScalar *u10, + const CeedScalar *u11, 
const CeedScalar *u12, const CeedScalar *u13, const CeedScalar *u14, const CeedScalar *u15, CeedScalar *v, + CeedScalar *v1, CeedScalar *v2, CeedScalar *v3, CeedScalar *v4, CeedScalar *v5, CeedScalar *v6, CeedScalar *v7, CeedScalar *v8, + CeedScalar *v9, CeedScalar *v10, CeedScalar *v11, CeedScalar *v12, CeedScalar *v13, CeedScalar *v14, CeedScalar *v15, int *err), + const char *source, int *qf, int *err, fortran_charlen_t source_len) { FIX_STRING(source); if (CeedQFunction_count == CeedQFunction_count_max) { - CeedQFunction_count_max += CeedQFunction_count_max/2 + 1; + CeedQFunction_count_max += CeedQFunction_count_max / 2 + 1; CeedRealloc(CeedQFunction_count_max, &CeedQFunction_dict); } CeedQFunction *qf_ = &CeedQFunction_dict[CeedQFunction_count]; - *err = CeedQFunctionCreateInterior(Ceed_dict[*ceed], *vec_length, - CeedQFunctionFortranStub, source_c, qf_); + *err = CeedQFunctionCreateInterior(Ceed_dict[*ceed], *vec_length, CeedQFunctionFortranStub, source_c, qf_); if (*err == 0) { *qf = CeedQFunction_count++; @@ -848,12 +662,12 @@ CEED_EXTERN void fCeedQFunctionCreateInterior(int *ceed, int *vec_length, CeedFortranContext fctxdata; *err = CeedCalloc(1, &fctxdata); if (*err) return; - fctxdata->f = f; fctxdata->inner_ctx = NULL; + fctxdata->f = f; + fctxdata->inner_ctx = NULL; CeedQFunctionContext fctx; *err = CeedQFunctionContextCreate(Ceed_dict[*ceed], &fctx); if (*err) return; - *err = CeedQFunctionContextSetData(fctx, CEED_MEM_HOST, CEED_OWN_POINTER, - sizeof(*fctxdata), fctxdata); + *err = CeedQFunctionContextSetData(fctx, CEED_MEM_HOST, CEED_OWN_POINTER, sizeof(*fctxdata), fctxdata); if (*err) return; *err = CeedQFunctionSetContext(*qf_, fctx); if (*err) return; @@ -863,19 +677,16 @@ CEED_EXTERN void fCeedQFunctionCreateInterior(int *ceed, int *vec_length, *err = CeedQFunctionSetFortranStatus(*qf_, true); } -#define fCeedQFunctionCreateInteriorByName \ - FORTRAN_NAME(ceedqfunctioncreateinteriorbyname, CEEDQFUNCTIONCREATEINTERIORBYNAME) -CEED_EXTERN void 
fCeedQFunctionCreateInteriorByName(int *ceed, const char *name, - int *qf, - int *err, fortran_charlen_t name_len) { +#define fCeedQFunctionCreateInteriorByName FORTRAN_NAME(ceedqfunctioncreateinteriorbyname, CEEDQFUNCTIONCREATEINTERIORBYNAME) +CEED_EXTERN void fCeedQFunctionCreateInteriorByName(int *ceed, const char *name, int *qf, int *err, fortran_charlen_t name_len) { FIX_STRING(name); if (CeedQFunction_count == CeedQFunction_count_max) { - CeedQFunction_count_max += CeedQFunction_count_max/2 + 1; + CeedQFunction_count_max += CeedQFunction_count_max / 2 + 1; CeedRealloc(CeedQFunction_count_max, &CeedQFunction_dict); } CeedQFunction *qf_ = &CeedQFunction_dict[CeedQFunction_count]; - *err = CeedQFunctionCreateInteriorByName(Ceed_dict[*ceed], name_c, qf_); + *err = CeedQFunctionCreateInteriorByName(Ceed_dict[*ceed], name_c, qf_); if (*err == 0) { *qf = CeedQFunction_count++; @@ -883,19 +694,15 @@ CEED_EXTERN void fCeedQFunctionCreateInteriorByName(int *ceed, const char *name, } } -#define fCeedQFunctionCreateIdentity \ - FORTRAN_NAME(ceedqfunctioncreateidentity, CEEDQFUNCTIONCREATEIDENTITY) -CEED_EXTERN void fCeedQFunctionCreateIdentity(int *ceed, int *size, int *inmode, - int *outmode, int *qf, int *err) { +#define fCeedQFunctionCreateIdentity FORTRAN_NAME(ceedqfunctioncreateidentity, CEEDQFUNCTIONCREATEIDENTITY) +CEED_EXTERN void fCeedQFunctionCreateIdentity(int *ceed, int *size, int *inmode, int *outmode, int *qf, int *err) { if (CeedQFunction_count == CeedQFunction_count_max) { - CeedQFunction_count_max += CeedQFunction_count_max/2 + 1; + CeedQFunction_count_max += CeedQFunction_count_max / 2 + 1; CeedRealloc(CeedQFunction_count_max, &CeedQFunction_dict); } CeedQFunction *qf_ = &CeedQFunction_dict[CeedQFunction_count]; - *err = CeedQFunctionCreateIdentity(Ceed_dict[*ceed], *size, - (CeedEvalMode)*inmode, - (CeedEvalMode)*outmode, qf_); + *err = CeedQFunctionCreateIdentity(Ceed_dict[*ceed], *size, (CeedEvalMode)*inmode, (CeedEvalMode)*outmode, qf_); if (*err == 
0) { *qf = CeedQFunction_count++; @@ -903,10 +710,8 @@ CEED_EXTERN void fCeedQFunctionCreateIdentity(int *ceed, int *size, int *inmode, } } -#define fCeedQFunctionAddInput \ - FORTRAN_NAME(ceedqfunctionaddinput,CEEDQFUNCTIONADDINPUT) -CEED_EXTERN void fCeedQFunctionAddInput(int *qf, const char *field_name, - CeedInt *num_comp, CeedEvalMode *eval_mode, int *err, +#define fCeedQFunctionAddInput FORTRAN_NAME(ceedqfunctionaddinput, CEEDQFUNCTIONADDINPUT) +CEED_EXTERN void fCeedQFunctionAddInput(int *qf, const char *field_name, CeedInt *num_comp, CeedEvalMode *eval_mode, int *err, fortran_charlen_t field_name_len) { FIX_STRING(field_name); CeedQFunction qf_ = CeedQFunction_dict[*qf]; @@ -914,21 +719,18 @@ CEED_EXTERN void fCeedQFunctionAddInput(int *qf, const char *field_name, *err = CeedQFunctionAddInput(qf_, field_name_c, *num_comp, *eval_mode); } -#define fCeedQFunctionAddOutput \ - FORTRAN_NAME(ceedqfunctionaddoutput,CEEDQFUNCTIONADDOUTPUT) -CEED_EXTERN void fCeedQFunctionAddOutput(int *qf, const char *field_name, - CeedInt *num_comp, CeedEvalMode *eval_mode, int *err, - fortran_charlen_t field_name_len) { +#define fCeedQFunctionAddOutput FORTRAN_NAME(ceedqfunctionaddoutput, CEEDQFUNCTIONADDOUTPUT) +CEED_EXTERN void fCeedQFunctionAddOutput(int *qf, const char *field_name, CeedInt *num_comp, CeedEvalMode *eval_mode, int *err, + fortran_charlen_t field_name_len) { FIX_STRING(field_name); CeedQFunction qf_ = CeedQFunction_dict[*qf]; *err = CeedQFunctionAddOutput(qf_, field_name_c, *num_comp, *eval_mode); } -#define fCeedQFunctionSetContext \ - FORTRAN_NAME(ceedqfunctionsetcontext,CEEDQFUNCTIONSETCONTEXT) +#define fCeedQFunctionSetContext FORTRAN_NAME(ceedqfunctionsetcontext, CEEDQFUNCTIONSETCONTEXT) CEED_EXTERN void fCeedQFunctionSetContext(int *qf, int *ctx, int *err) { - CeedQFunction qf_ = CeedQFunction_dict[*qf]; + CeedQFunction qf_ = CeedQFunction_dict[*qf]; CeedQFunctionContext ctx_ = CeedQFunctionContext_dict[*ctx]; CeedQFunctionContext fctx; @@ -938,69 +740,61 
@@ CEED_EXTERN void fCeedQFunctionSetContext(int *qf, int *ctx, int *err) { *err = CeedQFunctionContextGetData(fctx, CEED_MEM_HOST, &fctxdata); if (*err) return; fctxdata->inner_ctx = ctx_; - *err = CeedQFunctionContextRestoreData(fctx, (void **)&fctxdata); + *err = CeedQFunctionContextRestoreData(fctx, (void **)&fctxdata); } -#define fCeedQFunctionView \ - FORTRAN_NAME(ceedqfunctionview,CEEDQFUNCTIONVIEW) +#define fCeedQFunctionView FORTRAN_NAME(ceedqfunctionview, CEEDQFUNCTIONVIEW) CEED_EXTERN void fCeedQFunctionView(int *qf, int *err) { CeedQFunction qf_ = CeedQFunction_dict[*qf]; *err = CeedQFunctionView(qf_, stdout); } -#define fCeedQFunctionApply \ - FORTRAN_NAME(ceedqfunctionapply,CEEDQFUNCTIONAPPLY) -//TODO Need Fixing, double pointer -CEED_EXTERN void fCeedQFunctionApply(int *qf, int *Q, - int *u, int *u1, int *u2, int *u3, - int *u4, int *u5, int *u6, int *u7, - int *u8, int *u9, int *u10, int *u11, - int *u12, int *u13, int *u14, int *u15, - int *v, int *v1, int *v2, int *v3, - int *v4, int *v5, int *v6, int *v7, - int *v8, int *v9, int *v10, int *v11, - int *v12, int *v13, int *v14, int *v15, int *err) { +#define fCeedQFunctionApply FORTRAN_NAME(ceedqfunctionapply, CEEDQFUNCTIONAPPLY) +// TODO Need Fixing, double pointer +CEED_EXTERN void fCeedQFunctionApply(int *qf, int *Q, int *u, int *u1, int *u2, int *u3, int *u4, int *u5, int *u6, int *u7, int *u8, int *u9, + int *u10, int *u11, int *u12, int *u13, int *u14, int *u15, int *v, int *v1, int *v2, int *v3, int *v4, int *v5, + int *v6, int *v7, int *v8, int *v9, int *v10, int *v11, int *v12, int *v13, int *v14, int *v15, int *err) { CeedQFunction qf_ = CeedQFunction_dict[*qf]; - CeedVector *in; + CeedVector *in; *err = CeedCalloc(CEED_FIELD_MAX, &in); if (*err) return; - in[0] = *u==FORTRAN_NULL?NULL:CeedVector_dict[*u]; - in[1] = *u1==FORTRAN_NULL?NULL:CeedVector_dict[*u1]; - in[2] = *u2==FORTRAN_NULL?NULL:CeedVector_dict[*u2]; - in[3] = *u3==FORTRAN_NULL?NULL:CeedVector_dict[*u3]; - in[4] = 
*u4==FORTRAN_NULL?NULL:CeedVector_dict[*u4]; - in[5] = *u5==FORTRAN_NULL?NULL:CeedVector_dict[*u5]; - in[6] = *u6==FORTRAN_NULL?NULL:CeedVector_dict[*u6]; - in[7] = *u7==FORTRAN_NULL?NULL:CeedVector_dict[*u7]; - in[8] = *u8==FORTRAN_NULL?NULL:CeedVector_dict[*u8]; - in[9] = *u9==FORTRAN_NULL?NULL:CeedVector_dict[*u9]; - in[10] = *u10==FORTRAN_NULL?NULL:CeedVector_dict[*u10]; - in[11] = *u11==FORTRAN_NULL?NULL:CeedVector_dict[*u11]; - in[12] = *u12==FORTRAN_NULL?NULL:CeedVector_dict[*u12]; - in[13] = *u13==FORTRAN_NULL?NULL:CeedVector_dict[*u13]; - in[14] = *u14==FORTRAN_NULL?NULL:CeedVector_dict[*u14]; - in[15] = *u15==FORTRAN_NULL?NULL:CeedVector_dict[*u15]; + in[0] = *u == FORTRAN_NULL ? NULL : CeedVector_dict[*u]; + in[1] = *u1 == FORTRAN_NULL ? NULL : CeedVector_dict[*u1]; + in[2] = *u2 == FORTRAN_NULL ? NULL : CeedVector_dict[*u2]; + in[3] = *u3 == FORTRAN_NULL ? NULL : CeedVector_dict[*u3]; + in[4] = *u4 == FORTRAN_NULL ? NULL : CeedVector_dict[*u4]; + in[5] = *u5 == FORTRAN_NULL ? NULL : CeedVector_dict[*u5]; + in[6] = *u6 == FORTRAN_NULL ? NULL : CeedVector_dict[*u6]; + in[7] = *u7 == FORTRAN_NULL ? NULL : CeedVector_dict[*u7]; + in[8] = *u8 == FORTRAN_NULL ? NULL : CeedVector_dict[*u8]; + in[9] = *u9 == FORTRAN_NULL ? NULL : CeedVector_dict[*u9]; + in[10] = *u10 == FORTRAN_NULL ? NULL : CeedVector_dict[*u10]; + in[11] = *u11 == FORTRAN_NULL ? NULL : CeedVector_dict[*u11]; + in[12] = *u12 == FORTRAN_NULL ? NULL : CeedVector_dict[*u12]; + in[13] = *u13 == FORTRAN_NULL ? NULL : CeedVector_dict[*u13]; + in[14] = *u14 == FORTRAN_NULL ? NULL : CeedVector_dict[*u14]; + in[15] = *u15 == FORTRAN_NULL ? 
NULL : CeedVector_dict[*u15]; CeedVector *out; *err = CeedCalloc(CEED_FIELD_MAX, &out); if (*err) return; - out[0] = *v==FORTRAN_NULL?NULL:CeedVector_dict[*v]; - out[1] = *v1==FORTRAN_NULL?NULL:CeedVector_dict[*v1]; - out[2] = *v2==FORTRAN_NULL?NULL:CeedVector_dict[*v2]; - out[3] = *v3==FORTRAN_NULL?NULL:CeedVector_dict[*v3]; - out[4] = *v4==FORTRAN_NULL?NULL:CeedVector_dict[*v4]; - out[5] = *v5==FORTRAN_NULL?NULL:CeedVector_dict[*v5]; - out[6] = *v6==FORTRAN_NULL?NULL:CeedVector_dict[*v6]; - out[7] = *v7==FORTRAN_NULL?NULL:CeedVector_dict[*v7]; - out[8] = *v8==FORTRAN_NULL?NULL:CeedVector_dict[*v8]; - out[9] = *v9==FORTRAN_NULL?NULL:CeedVector_dict[*v9]; - out[10] = *v10==FORTRAN_NULL?NULL:CeedVector_dict[*v10]; - out[11] = *v11==FORTRAN_NULL?NULL:CeedVector_dict[*v11]; - out[12] = *v12==FORTRAN_NULL?NULL:CeedVector_dict[*v12]; - out[13] = *v13==FORTRAN_NULL?NULL:CeedVector_dict[*v13]; - out[14] = *v14==FORTRAN_NULL?NULL:CeedVector_dict[*v14]; - out[15] = *v15==FORTRAN_NULL?NULL:CeedVector_dict[*v15]; - *err = CeedQFunctionApply(qf_, *Q, in, out); + out[0] = *v == FORTRAN_NULL ? NULL : CeedVector_dict[*v]; + out[1] = *v1 == FORTRAN_NULL ? NULL : CeedVector_dict[*v1]; + out[2] = *v2 == FORTRAN_NULL ? NULL : CeedVector_dict[*v2]; + out[3] = *v3 == FORTRAN_NULL ? NULL : CeedVector_dict[*v3]; + out[4] = *v4 == FORTRAN_NULL ? NULL : CeedVector_dict[*v4]; + out[5] = *v5 == FORTRAN_NULL ? NULL : CeedVector_dict[*v5]; + out[6] = *v6 == FORTRAN_NULL ? NULL : CeedVector_dict[*v6]; + out[7] = *v7 == FORTRAN_NULL ? NULL : CeedVector_dict[*v7]; + out[8] = *v8 == FORTRAN_NULL ? NULL : CeedVector_dict[*v8]; + out[9] = *v9 == FORTRAN_NULL ? NULL : CeedVector_dict[*v9]; + out[10] = *v10 == FORTRAN_NULL ? NULL : CeedVector_dict[*v10]; + out[11] = *v11 == FORTRAN_NULL ? NULL : CeedVector_dict[*v11]; + out[12] = *v12 == FORTRAN_NULL ? NULL : CeedVector_dict[*v12]; + out[13] = *v13 == FORTRAN_NULL ? NULL : CeedVector_dict[*v13]; + out[14] = *v14 == FORTRAN_NULL ? 
NULL : CeedVector_dict[*v14]; + out[15] = *v15 == FORTRAN_NULL ? NULL : CeedVector_dict[*v15]; + *err = CeedQFunctionApply(qf_, *Q, in, out); if (*err) return; *err = CeedFree(&in); @@ -1008,8 +802,7 @@ CEED_EXTERN void fCeedQFunctionApply(int *qf, int *Q, *err = CeedFree(&out); } -#define fCeedQFunctionDestroy \ - FORTRAN_NAME(ceedqfunctiondestroy,CEEDQFUNCTIONDESTROY) +#define fCeedQFunctionDestroy FORTRAN_NAME(ceedqfunctiondestroy, CEEDQFUNCTIONDESTROY) CEED_EXTERN void fCeedQFunctionDestroy(int *qf, int *err) { if (*qf == FORTRAN_NULL) return; @@ -1018,8 +811,8 @@ CEED_EXTERN void fCeedQFunctionDestroy(int *qf, int *err) { *qf = FORTRAN_NULL; CeedQFunction_n--; if (CeedQFunction_n == 0) { - *err = CeedFree(&CeedQFunction_dict); - CeedQFunction_count = 0; + *err = CeedFree(&CeedQFunction_dict); + CeedQFunction_count = 0; CeedQFunction_count_max = 0; } } @@ -1028,38 +821,34 @@ CEED_EXTERN void fCeedQFunctionDestroy(int *qf, int *err) { // ----------------------------------------------------------------------------- // CeedOperator // ----------------------------------------------------------------------------- -static CeedOperator *CeedOperator_dict = NULL; -static int CeedOperator_count = 0; -static int CeedOperator_n = 0; -static int CeedOperator_count_max = 0; - -#define fCeedOperatorCreate \ - FORTRAN_NAME(ceedoperatorcreate, CEEDOPERATORCREATE) -CEED_EXTERN void fCeedOperatorCreate(int *ceed, - int *qf, int *dqf, int *dqfT, int *op, int *err) { +static CeedOperator *CeedOperator_dict = NULL; +static int CeedOperator_count = 0; +static int CeedOperator_n = 0; +static int CeedOperator_count_max = 0; + +#define fCeedOperatorCreate FORTRAN_NAME(ceedoperatorcreate, CEEDOPERATORCREATE) +CEED_EXTERN void fCeedOperatorCreate(int *ceed, int *qf, int *dqf, int *dqfT, int *op, int *err) { if (CeedOperator_count == CeedOperator_count_max) { - CeedOperator_count_max += CeedOperator_count_max/2 + 1; + CeedOperator_count_max += CeedOperator_count_max / 2 + 1; 
CeedRealloc(CeedOperator_count_max, &CeedOperator_dict); } CeedOperator *op_ = &CeedOperator_dict[CeedOperator_count]; - CeedQFunction dqf_ = CEED_QFUNCTION_NONE, dqfT_ = CEED_QFUNCTION_NONE; - if (*dqf != FORTRAN_QFUNCTION_NONE) dqf_ = CeedQFunction_dict[*dqf ]; + CeedQFunction dqf_ = CEED_QFUNCTION_NONE, dqfT_ = CEED_QFUNCTION_NONE; + if (*dqf != FORTRAN_QFUNCTION_NONE) dqf_ = CeedQFunction_dict[*dqf]; if (*dqfT != FORTRAN_QFUNCTION_NONE) dqfT_ = CeedQFunction_dict[*dqfT]; - *err = CeedOperatorCreate(Ceed_dict[*ceed], CeedQFunction_dict[*qf], dqf_, - dqfT_, op_); + *err = CeedOperatorCreate(Ceed_dict[*ceed], CeedQFunction_dict[*qf], dqf_, dqfT_, op_); if (*err) return; *op = CeedOperator_count++; CeedOperator_n++; } -#define fCeedCompositeOperatorCreate \ - FORTRAN_NAME(ceedcompositeoperatorcreate, CEEDCOMPOSITEOPERATORCREATE) +#define fCeedCompositeOperatorCreate FORTRAN_NAME(ceedcompositeoperatorcreate, CEEDCOMPOSITEOPERATORCREATE) CEED_EXTERN void fCeedCompositeOperatorCreate(int *ceed, int *op, int *err) { if (CeedOperator_count == CeedOperator_count_max) { - CeedOperator_count_max += CeedOperator_count_max/2 + 1; + CeedOperator_count_max += CeedOperator_count_max / 2 + 1; CeedRealloc(CeedOperator_count_max, &CeedOperator_dict); } @@ -1071,15 +860,12 @@ CEED_EXTERN void fCeedCompositeOperatorCreate(int *ceed, int *op, int *err) { CeedOperator_n++; } -#define fCeedOperatorSetField \ - FORTRAN_NAME(ceedoperatorsetfield,CEEDOPERATORSETFIELD) -CEED_EXTERN void fCeedOperatorSetField(int *op, const char *field_name, int *r, - int *b, int *v, int *err, - fortran_charlen_t field_name_len) { +#define fCeedOperatorSetField FORTRAN_NAME(ceedoperatorsetfield, CEEDOPERATORSETFIELD) +CEED_EXTERN void fCeedOperatorSetField(int *op, const char *field_name, int *r, int *b, int *v, int *err, fortran_charlen_t field_name_len) { FIX_STRING(field_name); CeedElemRestriction r_; - CeedBasis b_; - CeedVector v_; + CeedBasis b_; + CeedVector v_; CeedOperator op_ = 
CeedOperator_dict[*op]; @@ -1111,45 +897,37 @@ CEED_EXTERN void fCeedOperatorSetField(int *op, const char *field_name, int *r, *err = CeedOperatorSetField(op_, field_name_c, r_, b_, v_); } -#define fCeedCompositeOperatorAddSub \ - FORTRAN_NAME(ceedcompositeoperatoraddsub, CEEDCOMPOSITEOPERATORADDSUB) -CEED_EXTERN void fCeedCompositeOperatorAddSub(int *compositeop, int *subop, - int *err) { +#define fCeedCompositeOperatorAddSub FORTRAN_NAME(ceedcompositeoperatoraddsub, CEEDCOMPOSITEOPERATORADDSUB) +CEED_EXTERN void fCeedCompositeOperatorAddSub(int *compositeop, int *subop, int *err) { CeedOperator compositeop_ = CeedOperator_dict[*compositeop]; - CeedOperator subop_ = CeedOperator_dict[*subop]; + CeedOperator subop_ = CeedOperator_dict[*subop]; *err = CeedCompositeOperatorAddSub(compositeop_, subop_); } -#define fCeedOperatorSetName \ - FORTRAN_NAME(ceedoperatorsetname, CEEDOPERATORSETNAME) -CEED_EXTERN void fCeedOperatorSetName(int *op, const char *name, int *err, - fortran_charlen_t name_len) { +#define fCeedOperatorSetName FORTRAN_NAME(ceedoperatorsetname, CEEDOPERATORSETNAME) +CEED_EXTERN void fCeedOperatorSetName(int *op, const char *name, int *err, fortran_charlen_t name_len) { FIX_STRING(name); CeedOperator op_ = CeedOperator_dict[*op]; *err = CeedOperatorSetName(op_, name_c); } -#define fCeedOperatorLinearAssembleQFunction \ - FORTRAN_NAME(ceedoperatorlinearassembleqfunction, CEEDOPERATORLINEARASSEMBLEQFUNCTION) -CEED_EXTERN void fCeedOperatorLinearAssembleQFunction(int *op, - int *assembledvec, - int *assembledrstr, int *rqst, int *err) { +#define fCeedOperatorLinearAssembleQFunction FORTRAN_NAME(ceedoperatorlinearassembleqfunction, CEEDOPERATORLINEARASSEMBLEQFUNCTION) +CEED_EXTERN void fCeedOperatorLinearAssembleQFunction(int *op, int *assembledvec, int *assembledrstr, int *rqst, int *err) { // Vector if (CeedVector_count == CeedVector_count_max) { - CeedVector_count_max += CeedVector_count_max/2 + 1; + CeedVector_count_max += CeedVector_count_max / 2 + 1; 
CeedRealloc(CeedVector_count_max, &CeedVector_dict); } CeedVector *assembledvec_ = &CeedVector_dict[CeedVector_count]; // Restriction if (CeedElemRestriction_count == CeedElemRestriction_count_max) { - CeedElemRestriction_count_max += CeedElemRestriction_count_max/2 + 1; + CeedElemRestriction_count_max += CeedElemRestriction_count_max / 2 + 1; CeedRealloc(CeedElemRestriction_count_max, &CeedElemRestriction_dict); } - CeedElemRestriction *rstr_ = - &CeedElemRestriction_dict[CeedElemRestriction_count]; + CeedElemRestriction *rstr_ = &CeedElemRestriction_dict[CeedElemRestriction_count]; int createRequest = 1; // Check if input is CEED_REQUEST_ORDERED(-2) or CEED_REQUEST_IMMEDIATE(-1) @@ -1158,7 +936,7 @@ CEED_EXTERN void fCeedOperatorLinearAssembleQFunction(int *op, } if (createRequest && CeedRequest_count == CeedRequest_count_max) { - CeedRequest_count_max += CeedRequest_count_max/2 + 1; + CeedRequest_count_max += CeedRequest_count_max / 2 + 1; CeedRealloc(CeedRequest_count_max, &CeedRequest_dict); } @@ -1167,8 +945,7 @@ CEED_EXTERN void fCeedOperatorLinearAssembleQFunction(int *op, else if (*rqst == -2) rqst_ = CEED_REQUEST_ORDERED; else rqst_ = &CeedRequest_dict[CeedRequest_count]; - *err = CeedOperatorLinearAssembleQFunction(CeedOperator_dict[*op], - assembledvec_, rstr_, rqst_); + *err = CeedOperatorLinearAssembleQFunction(CeedOperator_dict[*op], assembledvec_, rstr_, rqst_); if (*err) return; if (createRequest) { *rqst = CeedRequest_count++; @@ -1183,10 +960,8 @@ CEED_EXTERN void fCeedOperatorLinearAssembleQFunction(int *op, } } -#define fCeedOperatorLinearAssembleDiagonal \ - FORTRAN_NAME(ceedoperatorlinearassemblediagonal, CEEDOPERATORLINEARASSEMBLEDIAGONAL) -CEED_EXTERN void fCeedOperatorLinearAssembleDiagonal(int *op, int *assembledvec, - int *rqst, int *err) { +#define fCeedOperatorLinearAssembleDiagonal FORTRAN_NAME(ceedoperatorlinearassemblediagonal, CEEDOPERATORLINEARASSEMBLEDIAGONAL) +CEED_EXTERN void fCeedOperatorLinearAssembleDiagonal(int *op, int 
*assembledvec, int *rqst, int *err) { int createRequest = 1; // Check if input is CEED_REQUEST_ORDERED(-2) or CEED_REQUEST_IMMEDIATE(-1) if (*rqst == -1 || *rqst == -2) { @@ -1194,7 +969,7 @@ CEED_EXTERN void fCeedOperatorLinearAssembleDiagonal(int *op, int *assembledvec, } if (createRequest && CeedRequest_count == CeedRequest_count_max) { - CeedRequest_count_max += CeedRequest_count_max/2 + 1; + CeedRequest_count_max += CeedRequest_count_max / 2 + 1; CeedRealloc(CeedRequest_count_max, &CeedRequest_dict); } @@ -1203,8 +978,7 @@ CEED_EXTERN void fCeedOperatorLinearAssembleDiagonal(int *op, int *assembledvec, else if (*rqst == -2) rqst_ = CEED_REQUEST_ORDERED; else rqst_ = &CeedRequest_dict[CeedRequest_count]; - *err = CeedOperatorLinearAssembleDiagonal(CeedOperator_dict[*op], - CeedVector_dict[*assembledvec], rqst_); + *err = CeedOperatorLinearAssembleDiagonal(CeedOperator_dict[*op], CeedVector_dict[*assembledvec], rqst_); if (*err) return; if (createRequest) { *rqst = CeedRequest_count++; @@ -1212,112 +986,93 @@ CEED_EXTERN void fCeedOperatorLinearAssembleDiagonal(int *op, int *assembledvec, } } -#define fCeedOperatorMultigridLevelCreate \ - FORTRAN_NAME(ceedoperatormultigridlevelcreate, CEEDOPERATORMULTIGRIDLEVELCREATE) -CEED_EXTERN void fCeedOperatorMultigridLevelCreate(int *opFine, int *pMultFine, - int *rstrCoarse, int *basisCoarse, int *opCoarse, - int *opProlong, int *opRestrict, int *err) { +#define fCeedOperatorMultigridLevelCreate FORTRAN_NAME(ceedoperatormultigridlevelcreate, CEEDOPERATORMULTIGRIDLEVELCREATE) +CEED_EXTERN void fCeedOperatorMultigridLevelCreate(int *opFine, int *pMultFine, int *rstrCoarse, int *basisCoarse, int *opCoarse, int *opProlong, + int *opRestrict, int *err) { // Operators CeedOperator opCoarse_, opProlong_, opRestrict_; // C interface call - *err = CeedOperatorMultigridLevelCreate( - CeedOperator_dict[*opFine], CeedVector_dict[*pMultFine], - CeedElemRestriction_dict[*rstrCoarse], - CeedBasis_dict[*basisCoarse], - &opCoarse_, 
&opProlong_, &opRestrict_); + *err = CeedOperatorMultigridLevelCreate(CeedOperator_dict[*opFine], CeedVector_dict[*pMultFine], CeedElemRestriction_dict[*rstrCoarse], + CeedBasis_dict[*basisCoarse], &opCoarse_, &opProlong_, &opRestrict_); if (*err) return; while (CeedOperator_count + 2 >= CeedOperator_count_max) { - CeedOperator_count_max += CeedOperator_count_max/2 + 1; + CeedOperator_count_max += CeedOperator_count_max / 2 + 1; } CeedRealloc(CeedOperator_count_max, &CeedOperator_dict); CeedOperator_dict[CeedOperator_count] = opCoarse_; - *opCoarse = CeedOperator_count++; + *opCoarse = CeedOperator_count++; CeedOperator_dict[CeedOperator_count] = opProlong_; - *opProlong = CeedOperator_count++; + *opProlong = CeedOperator_count++; CeedOperator_dict[CeedOperator_count] = opRestrict_; - *opRestrict = CeedOperator_count++; + *opRestrict = CeedOperator_count++; CeedOperator_n += 3; } -#define fCeedOperatorMultigridLevelCreateTensorH1 \ - FORTRAN_NAME(ceedoperatormultigridlevelcreatetensorh1, CEEDOPERATORMULTIGRIDLEVELCREATETENSORH1) -CEED_EXTERN void fCeedOperatorMultigridLevelCreateTensorH1(int *opFine, - int *pMultFine, - int *rstrCoarse, int *basisCoarse, const CeedScalar *interpCtoF, - int *opCoarse, int *opProlong, int *opRestrict, int *err) { +#define fCeedOperatorMultigridLevelCreateTensorH1 FORTRAN_NAME(ceedoperatormultigridlevelcreatetensorh1, CEEDOPERATORMULTIGRIDLEVELCREATETENSORH1) +CEED_EXTERN void fCeedOperatorMultigridLevelCreateTensorH1(int *opFine, int *pMultFine, int *rstrCoarse, int *basisCoarse, + const CeedScalar *interpCtoF, int *opCoarse, int *opProlong, int *opRestrict, int *err) { // Operators CeedOperator opCoarse_, opProlong_, opRestrict_; // C interface call - *err = CeedOperatorMultigridLevelCreateTensorH1( - CeedOperator_dict[*opFine], CeedVector_dict[*pMultFine], - CeedElemRestriction_dict[*rstrCoarse], CeedBasis_dict[*basisCoarse], - interpCtoF, &opCoarse_, &opProlong_, &opRestrict_); + *err = 
CeedOperatorMultigridLevelCreateTensorH1(CeedOperator_dict[*opFine], CeedVector_dict[*pMultFine], CeedElemRestriction_dict[*rstrCoarse], + CeedBasis_dict[*basisCoarse], interpCtoF, &opCoarse_, &opProlong_, &opRestrict_); if (*err) return; while (CeedOperator_count + 2 >= CeedOperator_count_max) { - CeedOperator_count_max += CeedOperator_count_max/2 + 1; + CeedOperator_count_max += CeedOperator_count_max / 2 + 1; } CeedRealloc(CeedOperator_count_max, &CeedOperator_dict); CeedOperator_dict[CeedOperator_count] = opCoarse_; - *opCoarse = CeedOperator_count++; + *opCoarse = CeedOperator_count++; CeedOperator_dict[CeedOperator_count] = opProlong_; - *opProlong = CeedOperator_count++; + *opProlong = CeedOperator_count++; CeedOperator_dict[CeedOperator_count] = opRestrict_; - *opRestrict = CeedOperator_count++; + *opRestrict = CeedOperator_count++; CeedOperator_n += 3; } -#define fCeedOperatorMultigridLevelCreateH1 \ - FORTRAN_NAME(ceedoperatormultigridlevelcreateh1, CEEDOPERATORMULTIGRIDLEVELCREATEH1) -CEED_EXTERN void fCeedOperatorMultigridLevelCreateH1(int *opFine, - int *pMultFine, - int *rstrCoarse, int *basisCoarse, const CeedScalar *interpCtoF, - int *opCoarse, int *opProlong, int *opRestrict, int *err) { +#define fCeedOperatorMultigridLevelCreateH1 FORTRAN_NAME(ceedoperatormultigridlevelcreateh1, CEEDOPERATORMULTIGRIDLEVELCREATEH1) +CEED_EXTERN void fCeedOperatorMultigridLevelCreateH1(int *opFine, int *pMultFine, int *rstrCoarse, int *basisCoarse, const CeedScalar *interpCtoF, + int *opCoarse, int *opProlong, int *opRestrict, int *err) { // Operators CeedOperator opCoarse_, opProlong_, opRestrict_; // C interface call - *err = CeedOperatorMultigridLevelCreateH1( - CeedOperator_dict[*opFine], CeedVector_dict[*pMultFine], - CeedElemRestriction_dict[*rstrCoarse], CeedBasis_dict[*basisCoarse], - interpCtoF, &opCoarse_, &opProlong_, &opRestrict_); + *err = CeedOperatorMultigridLevelCreateH1(CeedOperator_dict[*opFine], CeedVector_dict[*pMultFine], 
CeedElemRestriction_dict[*rstrCoarse], + CeedBasis_dict[*basisCoarse], interpCtoF, &opCoarse_, &opProlong_, &opRestrict_); if (*err) return; while (CeedOperator_count + 2 >= CeedOperator_count_max) { - CeedOperator_count_max += CeedOperator_count_max/2 + 1; + CeedOperator_count_max += CeedOperator_count_max / 2 + 1; } CeedRealloc(CeedOperator_count_max, &CeedOperator_dict); CeedOperator_dict[CeedOperator_count] = opCoarse_; - *opCoarse = CeedOperator_count++; + *opCoarse = CeedOperator_count++; CeedOperator_dict[CeedOperator_count] = opProlong_; - *opProlong = CeedOperator_count++; + *opProlong = CeedOperator_count++; CeedOperator_dict[CeedOperator_count] = opRestrict_; - *opRestrict = CeedOperator_count++; + *opRestrict = CeedOperator_count++; CeedOperator_n += 3; } -#define fCeedOperatorView \ - FORTRAN_NAME(ceedoperatorview,CEEDOPERATORVIEW) +#define fCeedOperatorView FORTRAN_NAME(ceedoperatorview, CEEDOPERATORVIEW) CEED_EXTERN void fCeedOperatorView(int *op, int *err) { CeedOperator op_ = CeedOperator_dict[*op]; *err = CeedOperatorView(op_, stdout); } -#define fCeedOperatorCreateFDMElementInverse \ - FORTRAN_NAME(ceedoperatorcreatefdmelementinverse, CEEDOPERATORCREATEFDMELEMENTINVERSE) -CEED_EXTERN void fCeedOperatorCreateFDMElementInverse(int *op, int *fdminv, - int *rqst, int *err) { +#define fCeedOperatorCreateFDMElementInverse FORTRAN_NAME(ceedoperatorcreatefdmelementinverse, CEEDOPERATORCREATEFDMELEMENTINVERSE) +CEED_EXTERN void fCeedOperatorCreateFDMElementInverse(int *op, int *fdminv, int *rqst, int *err) { // Operator if (CeedOperator_count == CeedOperator_count_max) { - CeedOperator_count_max += CeedOperator_count_max/2 + 1; + CeedOperator_count_max += CeedOperator_count_max / 2 + 1; CeedRealloc(CeedOperator_count_max, &CeedOperator_dict); } - CeedOperator *fdminv_ = - &CeedOperator_dict[CeedOperator_count]; + CeedOperator *fdminv_ = &CeedOperator_dict[CeedOperator_count]; int createRequest = 1; // Check if input is CEED_REQUEST_ORDERED(-2) or 
CEED_REQUEST_IMMEDIATE(-1) @@ -1326,7 +1081,7 @@ CEED_EXTERN void fCeedOperatorCreateFDMElementInverse(int *op, int *fdminv, } if (createRequest && CeedRequest_count == CeedRequest_count_max) { - CeedRequest_count_max += CeedRequest_count_max/2 + 1; + CeedRequest_count_max += CeedRequest_count_max / 2 + 1; CeedRealloc(CeedRequest_count_max, &CeedRequest_dict); } @@ -1335,8 +1090,7 @@ CEED_EXTERN void fCeedOperatorCreateFDMElementInverse(int *op, int *fdminv, else if (*rqst == -2) rqst_ = CEED_REQUEST_ORDERED; else rqst_ = &CeedRequest_dict[CeedRequest_count]; - *err = CeedOperatorCreateFDMElementInverse(CeedOperator_dict[*op], - fdminv_, rqst_); + *err = CeedOperatorCreateFDMElementInverse(CeedOperator_dict[*op], fdminv_, rqst_); if (*err) return; if (createRequest) { *rqst = CeedRequest_count++; @@ -1350,14 +1104,9 @@ CEED_EXTERN void fCeedOperatorCreateFDMElementInverse(int *op, int *fdminv, } #define fCeedOperatorApply FORTRAN_NAME(ceedoperatorapply, CEEDOPERATORAPPLY) -CEED_EXTERN void fCeedOperatorApply(int *op, int *ustatevec, - int *resvec, int *rqst, int *err) { - CeedVector ustatevec_ = (*ustatevec == FORTRAN_NULL) ? - NULL : (*ustatevec == FORTRAN_VECTOR_NONE ? - CEED_VECTOR_NONE : CeedVector_dict[*ustatevec]); - CeedVector resvec_ = (*resvec == FORTRAN_NULL) ? - NULL : (*resvec == FORTRAN_VECTOR_NONE ? - CEED_VECTOR_NONE : CeedVector_dict[*resvec]); +CEED_EXTERN void fCeedOperatorApply(int *op, int *ustatevec, int *resvec, int *rqst, int *err) { + CeedVector ustatevec_ = (*ustatevec == FORTRAN_NULL) ? NULL : (*ustatevec == FORTRAN_VECTOR_NONE ? CEED_VECTOR_NONE : CeedVector_dict[*ustatevec]); + CeedVector resvec_ = (*resvec == FORTRAN_NULL) ? NULL : (*resvec == FORTRAN_VECTOR_NONE ? 
CEED_VECTOR_NONE : CeedVector_dict[*resvec]); int createRequest = 1; // Check if input is CEED_REQUEST_ORDERED(-2) or CEED_REQUEST_IMMEDIATE(-1) @@ -1366,7 +1115,7 @@ CEED_EXTERN void fCeedOperatorApply(int *op, int *ustatevec, } if (createRequest && CeedRequest_count == CeedRequest_count_max) { - CeedRequest_count_max += CeedRequest_count_max/2 + 1; + CeedRequest_count_max += CeedRequest_count_max / 2 + 1; CeedRealloc(CeedRequest_count_max, &CeedRequest_dict); } @@ -1375,8 +1124,7 @@ CEED_EXTERN void fCeedOperatorApply(int *op, int *ustatevec, else if (*rqst == -2) rqst_ = CEED_REQUEST_ORDERED; else rqst_ = &CeedRequest_dict[CeedRequest_count]; - *err = CeedOperatorApply(CeedOperator_dict[*op], - ustatevec_, resvec_, rqst_); + *err = CeedOperatorApply(CeedOperator_dict[*op], ustatevec_, resvec_, rqst_); if (*err) return; if (createRequest) { *rqst = CeedRequest_count++; @@ -1385,12 +1133,9 @@ CEED_EXTERN void fCeedOperatorApply(int *op, int *ustatevec, } #define fCeedOperatorApplyAdd FORTRAN_NAME(ceedoperatorapplyadd, CEEDOPERATORAPPLYADD) -CEED_EXTERN void fCeedOperatorApplyAdd(int *op, int *ustatevec, - int *resvec, int *rqst, int *err) { - CeedVector ustatevec_ = *ustatevec == FORTRAN_NULL - ? NULL : CeedVector_dict[*ustatevec]; - CeedVector resvec_ = *resvec == FORTRAN_NULL - ? NULL : CeedVector_dict[*resvec]; +CEED_EXTERN void fCeedOperatorApplyAdd(int *op, int *ustatevec, int *resvec, int *rqst, int *err) { + CeedVector ustatevec_ = *ustatevec == FORTRAN_NULL ? NULL : CeedVector_dict[*ustatevec]; + CeedVector resvec_ = *resvec == FORTRAN_NULL ? 
NULL : CeedVector_dict[*resvec]; int createRequest = 1; // Check if input is CEED_REQUEST_ORDERED(-2) or CEED_REQUEST_IMMEDIATE(-1) @@ -1399,7 +1144,7 @@ CEED_EXTERN void fCeedOperatorApplyAdd(int *op, int *ustatevec, } if (createRequest && CeedRequest_count == CeedRequest_count_max) { - CeedRequest_count_max += CeedRequest_count_max/2 + 1; + CeedRequest_count_max += CeedRequest_count_max / 2 + 1; CeedRealloc(CeedRequest_count_max, &CeedRequest_dict); } @@ -1408,8 +1153,7 @@ CEED_EXTERN void fCeedOperatorApplyAdd(int *op, int *ustatevec, else if (*rqst == -2) rqst_ = CEED_REQUEST_ORDERED; else rqst_ = &CeedRequest_dict[CeedRequest_count]; - *err = CeedOperatorApplyAdd(CeedOperator_dict[*op], - ustatevec_, resvec_, rqst_); + *err = CeedOperatorApplyAdd(CeedOperator_dict[*op], ustatevec_, resvec_, rqst_); if (*err) return; if (createRequest) { *rqst = CeedRequest_count++; @@ -1417,20 +1161,15 @@ CEED_EXTERN void fCeedOperatorApplyAdd(int *op, int *ustatevec, } } -#define fCeedOperatorApplyJacobian \ - FORTRAN_NAME(ceedoperatorapplyjacobian, CEEDOPERATORAPPLYJACOBIAN) -CEED_EXTERN void fCeedOperatorApplyJacobian(int *op, int *qdatavec, - int *ustatevec, - int *dustatevec, int *dresvec, int *rqst, - int *err) { -// TODO Uncomment this when CeedOperatorApplyJacobian is implemented -// *err = CeedOperatorApplyJacobian(CeedOperator_dict[*op], CeedVector_dict[*qdatavec], -// CeedVector_dict[*ustatevec], CeedVector_dict[*dustatevec], -// CeedVector_dict[*dresvec], &CeedRequest_dict[*rqst]); +#define fCeedOperatorApplyJacobian FORTRAN_NAME(ceedoperatorapplyjacobian, CEEDOPERATORAPPLYJACOBIAN) +CEED_EXTERN void fCeedOperatorApplyJacobian(int *op, int *qdatavec, int *ustatevec, int *dustatevec, int *dresvec, int *rqst, int *err) { + // TODO Uncomment this when CeedOperatorApplyJacobian is implemented + // *err = CeedOperatorApplyJacobian(CeedOperator_dict[*op], CeedVector_dict[*qdatavec], + // CeedVector_dict[*ustatevec], CeedVector_dict[*dustatevec], + // 
CeedVector_dict[*dresvec], &CeedRequest_dict[*rqst]); } -#define fCeedOperatorDestroy \ - FORTRAN_NAME(ceedoperatordestroy, CEEDOPERATORDESTROY) +#define fCeedOperatorDestroy FORTRAN_NAME(ceedoperatordestroy, CEEDOPERATORDESTROY) CEED_EXTERN void fCeedOperatorDestroy(int *op, int *err) { if (*op == FORTRAN_NULL) return; *err = CeedOperatorDestroy(&CeedOperator_dict[*op]); @@ -1438,8 +1177,8 @@ CEED_EXTERN void fCeedOperatorDestroy(int *op, int *err) { *op = FORTRAN_NULL; CeedOperator_n--; if (CeedOperator_n == 0) { - *err = CeedFree(&CeedOperator_dict); - CeedOperator_count = 0; + *err = CeedFree(&CeedOperator_dict); + CeedOperator_count = 0; CeedOperator_count_max = 0; } } diff --git a/interface/ceed-hip.c b/interface/ceed-hip.c index 77f7ffd75f..a420453b25 100644 --- a/interface/ceed-hip.c +++ b/interface/ceed-hip.c @@ -5,10 +5,10 @@ // // This file is part of CEED: http://github.com/ceed -#include +#include #include +#include #include -#include /** @brief Set HIP function pointer to evaluate action at quadrature points @@ -21,14 +21,12 @@ @ref User **/ int CeedQFunctionSetHIPUserFunction(CeedQFunction qf, hipFunction_t f) { - int ierr; if (!qf->SetHIPUserFunction) { Ceed ceed; - ierr = CeedQFunctionGetCeed(qf, &ceed); CeedChk(ierr); - CeedDebug(ceed, - "Backend does not support hipFunction_t pointers for QFunctions."); + CeedCall(CeedQFunctionGetCeed(qf, &ceed)); + CeedDebug(ceed, "Backend does not support hipFunction_t pointers for QFunctions."); } else { - ierr = qf->SetHIPUserFunction(qf, f); CeedChk(ierr); + CeedCall(qf->SetHIPUserFunction(qf, f)); } return CEED_ERROR_SUCCESS; } diff --git a/interface/ceed-jit-source-root-default.c b/interface/ceed-jit-source-root-default.c index 26a3348405..9a97d1d770 100644 --- a/interface/ceed-jit-source-root-default.c +++ b/interface/ceed-jit-source-root-default.c @@ -9,4 +9,4 @@ // This file and definition is used for in-source builds. // The definition for installs is in ceed-jit-source-root-install.c. 
-const char CeedJitSourceRootDefault[] = CEED_JIT_SOUCE_ROOT_DEFAULT; +const char *CeedJitSourceRootDefault = CEED_JIT_SOUCE_ROOT_DEFAULT; diff --git a/interface/ceed-jit-source-root-install.c b/interface/ceed-jit-source-root-install.c index e25679e7e9..f6fa80f78f 100644 --- a/interface/ceed-jit-source-root-install.c +++ b/interface/ceed-jit-source-root-install.c @@ -9,4 +9,4 @@ // This file and definition is used for installs. // The definition for in-source is in ceed-jit-source-root-default.c. -const char CeedJitSourceRootDefault[] = CEED_JIT_SOUCE_ROOT_DEFAULT; +const char *CeedJitSourceRootDefault = CEED_JIT_SOUCE_ROOT_DEFAULT; diff --git a/interface/ceed-jit-tools.c b/interface/ceed-jit-tools.c index 46664dd90d..c93d6265bb 100644 --- a/interface/ceed-jit-tools.c +++ b/interface/ceed-jit-tools.c @@ -5,10 +5,10 @@ // // This file is part of CEED: http://github.com/ceed -#include +#include #include +#include #include -#include #include #include #include @@ -25,8 +25,6 @@ @ref Backend **/ int CeedCheckFilePath(Ceed ceed, const char *source_file_path, bool *is_valid) { - int ierr; - // Sometimes we have path/to/file.h:function_name // Create tempory file path without name, if needed char *source_file_path_only; @@ -34,8 +32,7 @@ int CeedCheckFilePath(Ceed ceed, const char *source_file_path, bool *is_valid) { if (last_colon) { size_t source_file_path_length = (last_colon - source_file_path + 1); - ierr = CeedCalloc(source_file_path_length, &source_file_path_only); - CeedChk(ierr); + CeedCall(CeedCalloc(source_file_path_length, &source_file_path_only)); memcpy(source_file_path_only, source_file_path, source_file_path_length - 1); } else { source_file_path_only = (char *)source_file_path; @@ -48,7 +45,7 @@ int CeedCheckFilePath(Ceed ceed, const char *source_file_path, bool *is_valid) { // Check for valid file path FILE *source_file; source_file = fopen(source_file_path_only, "rb"); - *is_valid = !!source_file; + *is_valid = !!source_file; if (*is_valid) { // Debug @@ 
-59,9 +56,7 @@ int CeedCheckFilePath(Ceed ceed, const char *source_file_path, bool *is_valid) { } // Free temp file path, if used - if (last_colon) { - ierr = CeedFree(&source_file_path_only); CeedChk(ierr); - } + if (last_colon) CeedCall(CeedFree(&source_file_path_only)); return CEED_ERROR_SUCCESS; } @@ -78,11 +73,9 @@ int CeedCheckFilePath(Ceed ceed, const char *source_file_path, bool *is_valid) { @ref Backend **/ -int CeedLoadSourceToInitializedBuffer(Ceed ceed, - const char *source_file_path, char **buffer) { - int ierr; +int CeedLoadSourceToInitializedBuffer(Ceed ceed, const char *source_file_path, char **buffer) { FILE *source_file; - long file_size, file_offset = 0; + long file_size, file_offset = 0; char *temp_buffer; // Debug @@ -94,24 +87,23 @@ int CeedLoadSourceToInitializedBuffer(Ceed ceed, // Read file to temporary buffer source_file = fopen(source_file_path, "rb"); - if (!source_file) + if (!source_file) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_MAJOR, "Couldn't open source file: %s", - source_file_path); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_MAJOR, "Couldn't open source file: %s", source_file_path); + // LCOV_EXCL_STOP + } // -- Compute size of source fseek(source_file, 0L, SEEK_END); file_size = ftell(source_file); rewind(source_file); // -- Allocate memory for entire source file - ierr = CeedCalloc(file_size + 1, &temp_buffer); CeedChk(ierr); + CeedCall(CeedCalloc(file_size + 1, &temp_buffer)); // -- Copy the file into the buffer if (1 != fread(temp_buffer, file_size, 1, source_file)) { // LCOV_EXCL_START fclose(source_file); - ierr = CeedFree(&temp_buffer); CeedChk(ierr); - return CeedError(ceed, CEED_ERROR_MAJOR, "Couldn't read source file: %s", - source_file_path); + CeedCall(CeedFree(&temp_buffer)); + return CeedError(ceed, CEED_ERROR_MAJOR, "Couldn't read source file: %s", source_file_path); // LCOV_EXCL_STOP } fclose(source_file); @@ -120,8 +112,8 @@ int CeedLoadSourceToInitializedBuffer(Ceed ceed, const char 
*first_hash = strchr(temp_buffer, '#'); while (first_hash) { // -- Check for 'include' keyword - const char *next_e = strchr(first_hash, 'e'); - char keyword[8] = ""; + const char *next_e = strchr(first_hash, 'e'); + char keyword[8] = ""; if (next_e && next_e - first_hash >= 7) memcpy(keyword, &next_e[-6], 7); bool is_hash_include = !strcmp(keyword, "include"); // ---- Spaces allowed in '# include ' @@ -133,52 +125,42 @@ int CeedLoadSourceToInitializedBuffer(Ceed ceed, if (is_hash_include) { // -- Copy into buffer all preceding # long current_size = strlen(*buffer); - long copy_size = first_hash - &temp_buffer[file_offset]; - ierr = CeedRealloc(current_size + copy_size + 2, buffer); CeedChk(ierr); + long copy_size = first_hash - &temp_buffer[file_offset]; + CeedCall(CeedRealloc(current_size + copy_size + 2, buffer)); memcpy(&(*buffer)[current_size], "\n", 2); memcpy(&(*buffer)[current_size + 1], &temp_buffer[file_offset], copy_size); memcpy(&(*buffer)[current_size + copy_size], "", 1); // -- Load local "header.h" - char *next_quote = strchr(first_hash, '"'); - char *next_new_line = strchr(first_hash, '\n'); - bool is_local_header = is_hash_include && next_quote && - (next_new_line - next_quote > 0); + char *next_quote = strchr(first_hash, '"'); + char *next_new_line = strchr(first_hash, '\n'); + bool is_local_header = is_hash_include && next_quote && (next_new_line - next_quote > 0); char *next_left_chevron = strchr(first_hash, '<'); - bool is_ceed_header = is_hash_include && next_left_chevron && - (next_new_line - next_left_chevron > 0) && - (!strncmp(next_left_chevron, "", 14) || - !strncmp(next_left_chevron, "", 17) || - !strncmp(next_left_chevron, "", 17)); + bool is_ceed_header = is_hash_include && next_left_chevron && (next_new_line - next_left_chevron > 0) && + (!strncmp(next_left_chevron, "", 14) || + !strncmp(next_left_chevron, "", 17) || !strncmp(next_left_chevron, "", 17)); if (is_local_header || is_ceed_header) { // ---- Build source path char 
*include_source_path; if (is_local_header) { - long root_length = strrchr(source_file_path, '/') - source_file_path; + long root_length = strrchr(source_file_path, '/') - source_file_path; long include_file_name_len = strchr(&next_quote[1], '"') - next_quote - 1; - ierr = CeedCalloc(root_length + include_file_name_len + 2, - &include_source_path); CeedChk(ierr); + CeedCall(CeedCalloc(root_length + include_file_name_len + 2, &include_source_path)); memcpy(include_source_path, source_file_path, root_length + 1); - memcpy(&include_source_path[root_length + 1], &next_quote[1], - include_file_name_len); + memcpy(&include_source_path[root_length + 1], &next_quote[1], include_file_name_len); memcpy(&include_source_path[root_length + include_file_name_len + 1], "", 1); } else { char *next_right_chevron = strchr(first_hash, '>'); char *ceed_relative_path; - long ceed_relative_path_length = next_right_chevron - next_left_chevron - 1; - ierr = CeedCalloc(ceed_relative_path_length + 1, &ceed_relative_path); - CeedChk(ierr); + long ceed_relative_path_length = next_right_chevron - next_left_chevron - 1; + CeedCall(CeedCalloc(ceed_relative_path_length + 1, &ceed_relative_path)); memcpy(ceed_relative_path, &next_left_chevron[1], ceed_relative_path_length); - ierr = CeedGetJitAbsolutePath(ceed, ceed_relative_path, &include_source_path); - CeedChk(ierr); - ierr = CeedFree(&ceed_relative_path); CeedChk(ierr); + CeedCall(CeedGetJitAbsolutePath(ceed, ceed_relative_path, &include_source_path)); + CeedCall(CeedFree(&ceed_relative_path)); } // ---- Recursive call to load source to buffer CeedDebug256(ceed, 2, "JiT Including: %s\n", include_source_path); - CeedChk(ierr); - ierr = CeedLoadSourceToInitializedBuffer(ceed, include_source_path, buffer); - CeedChk(ierr); - ierr = CeedFree(&include_source_path); CeedChk(ierr); + CeedCall(CeedLoadSourceToInitializedBuffer(ceed, include_source_path, buffer)); + CeedCall(CeedFree(&include_source_path)); } file_offset = strchr(first_hash, '\n') - 
temp_buffer + 1; } @@ -187,14 +169,14 @@ int CeedLoadSourceToInitializedBuffer(Ceed ceed, } // Copy rest of source file into buffer long current_size = strlen(*buffer); - long copy_size = strlen(&temp_buffer[file_offset]); - ierr = CeedRealloc(current_size + copy_size + 2, buffer); CeedChk(ierr); + long copy_size = strlen(&temp_buffer[file_offset]); + CeedCall(CeedRealloc(current_size + copy_size + 2, buffer)); memcpy(&(*buffer)[current_size], "\n", 2); memcpy(&(*buffer)[current_size + 1], &temp_buffer[file_offset], copy_size); memcpy(&(*buffer)[current_size + copy_size + 1], "", 1); // Cleanup - ierr = CeedFree(&temp_buffer); CeedChk(ierr); + CeedCall(CeedFree(&temp_buffer)); // Debug CeedDebug256(ceed, 1, "---------- Ceed JiT ----------\n"); @@ -219,16 +201,12 @@ int CeedLoadSourceToInitializedBuffer(Ceed ceed, @ref Backend **/ -int CeedLoadSourceToBuffer(Ceed ceed, const char *source_file_path, - char **buffer) { - int ierr; - +int CeedLoadSourceToBuffer(Ceed ceed, const char *source_file_path, char **buffer) { // Initalize buffer - ierr = CeedCalloc(1, buffer); CeedChk(ierr); + CeedCall(CeedCalloc(1, buffer)); // Load to initalized buffer - ierr = CeedLoadSourceToInitializedBuffer(ceed, source_file_path, buffer); - CeedChk(ierr); + CeedCall(CeedLoadSourceToInitializedBuffer(ceed, source_file_path, buffer)); return CEED_ERROR_SUCCESS; } @@ -247,15 +225,12 @@ int CeedLoadSourceToBuffer(Ceed ceed, const char *source_file_path, @ref Backend **/ -int CeedPathConcatenate(Ceed ceed, const char *base_file_path, - const char *relative_file_path, char **new_file_path) { - int ierr; - char *last_slash = strrchr(base_file_path, '/'); - size_t base_length = (last_slash - base_file_path + 1), - relative_length = strlen(relative_file_path), +int CeedPathConcatenate(Ceed ceed, const char *base_file_path, const char *relative_file_path, char **new_file_path) { + char *last_slash = strrchr(base_file_path, '/'); + size_t base_length = (last_slash - base_file_path + 1), 
relative_length = strlen(relative_file_path), new_file_path_length = base_length + relative_length + 1; - ierr = CeedCalloc(new_file_path_length, new_file_path); CeedChk(ierr); + CeedCall(CeedCalloc(new_file_path_length, new_file_path)); memcpy(*new_file_path, base_file_path, base_length); memcpy(&((*new_file_path)[base_length]), relative_file_path, relative_length); @@ -272,17 +247,14 @@ int CeedPathConcatenate(Ceed ceed, const char *base_file_path, @ref Backend **/ -int CeedGetJitRelativePath(const char *absolute_file_path, - const char **relative_file_path) { +int CeedGetJitRelativePath(const char *absolute_file_path, const char **relative_file_path) { *(relative_file_path) = strstr(absolute_file_path, "ceed/jit-source"); - if (!*relative_file_path) + if (!*relative_file_path) { // LCOV_EXCL_START - return CeedError(NULL, CEED_ERROR_MAJOR, - "Couldn't find relative path including " - "'ceed/jit-source' for: %s", absolute_file_path); - // LCOV_EXCL_STOP - + return CeedError(NULL, CEED_ERROR_MAJOR, "Couldn't find relative path including 'ceed/jit-source' for: %s", absolute_file_path); + // LCOV_EXCL_STOP + } return CEED_ERROR_SUCCESS; } @@ -297,9 +269,7 @@ int CeedGetJitRelativePath(const char *absolute_file_path, @ref Backend **/ -int CeedGetJitAbsolutePath(Ceed ceed, const char *relative_file_path, - char **absolute_file_path) { - int ierr; +int CeedGetJitAbsolutePath(Ceed ceed, const char *relative_file_path, char **absolute_file_path) { Ceed ceed_parent; // Debug @@ -307,8 +277,7 @@ int CeedGetJitAbsolutePath(Ceed ceed, const char *relative_file_path, CeedDebug256(ceed, 1, "Relative JiT source file: "); CeedDebug(ceed, "%s\n", relative_file_path); - - ierr = CeedGetParent(ceed, &ceed_parent); CeedChk(ierr); + CeedCall(CeedGetParent(ceed, &ceed_parent)); for (CeedInt i = 0; i < ceed_parent->num_jit_source_roots; i++) { bool is_valid; @@ -317,21 +286,14 @@ int CeedGetJitAbsolutePath(Ceed ceed, const char *relative_file_path, CeedDebug(ceed, "%s\n", 
ceed_parent->jit_source_roots[i]); // Build and check absolute path with current root - ierr = CeedPathConcatenate(ceed, ceed_parent->jit_source_roots[i], - relative_file_path, absolute_file_path); - CeedChk(ierr); - ierr = CeedCheckFilePath(ceed, *absolute_file_path, &is_valid); CeedChk(ierr); - - if (is_valid) { - return CEED_ERROR_SUCCESS; - } else { - ierr = CeedFree(absolute_file_path); CeedChk(ierr); - } + CeedCall(CeedPathConcatenate(ceed, ceed_parent->jit_source_roots[i], relative_file_path, absolute_file_path)); + CeedCall(CeedCheckFilePath(ceed, *absolute_file_path, &is_valid)); + + if (is_valid) return CEED_ERROR_SUCCESS; + else CeedCall(CeedFree(absolute_file_path)); } // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_MAJOR, - "Couldn't find matching JiT source file: %s", - relative_file_path); + return CeedError(ceed, CEED_ERROR_MAJOR, "Couldn't find matching JiT source file: %s", relative_file_path); // LCOV_EXCL_STOP } diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c index 98238f36ab..0a9253d69d 100644 --- a/interface/ceed-operator.c +++ b/interface/ceed-operator.c @@ -5,9 +5,9 @@ // // This file is part of CEED: http://github.com/ceed -#include -#include #include +#include +#include #include #include #include @@ -33,112 +33,94 @@ @ref Developer **/ -static int CeedOperatorCheckField(Ceed ceed, CeedQFunctionField qf_field, - CeedElemRestriction r, CeedBasis b) { - int ierr; +static int CeedOperatorCheckField(Ceed ceed, CeedQFunctionField qf_field, CeedElemRestriction r, CeedBasis b) { CeedEvalMode eval_mode = qf_field->eval_mode; - CeedInt dim = 1, num_comp = 1, Q_comp = 1, restr_num_comp = 1, - size = qf_field->size; + CeedInt dim = 1, num_comp = 1, Q_comp = 1, restr_num_comp = 1, size = qf_field->size; + // Restriction if (r != CEED_ELEMRESTRICTION_NONE) { if (eval_mode == CEED_EVAL_WEIGHT) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_INCOMPATIBLE, - "CEED_ELEMRESTRICTION_NONE should be used " - "for a field with 
eval mode CEED_EVAL_WEIGHT"); + return CeedError(ceed, CEED_ERROR_INCOMPATIBLE, "CEED_ELEMRESTRICTION_NONE should be used for a field with eval mode CEED_EVAL_WEIGHT"); // LCOV_EXCL_STOP } - ierr = CeedElemRestrictionGetNumComponents(r, &restr_num_comp); - CeedChk(ierr); + CeedCall(CeedElemRestrictionGetNumComponents(r, &restr_num_comp)); } if ((r == CEED_ELEMRESTRICTION_NONE) != (eval_mode == CEED_EVAL_WEIGHT)) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_INCOMPATIBLE, - "CEED_ELEMRESTRICTION_NONE and CEED_EVAL_WEIGHT " - "must be used together."); + return CeedError(ceed, CEED_ERROR_INCOMPATIBLE, "CEED_ELEMRESTRICTION_NONE and CEED_EVAL_WEIGHT must be used together."); // LCOV_EXCL_STOP } // Basis if (b != CEED_BASIS_COLLOCATED) { - if (eval_mode == CEED_EVAL_NONE) + if (eval_mode == CEED_EVAL_NONE) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_INCOMPATIBLE, - "Field '%s' configured with CEED_EVAL_NONE must " - "be used with CEED_BASIS_COLLOCATED", + return CeedError(ceed, CEED_ERROR_INCOMPATIBLE, "Field '%s' configured with CEED_EVAL_NONE must be used with CEED_BASIS_COLLOCATED", qf_field->field_name); - // LCOV_EXCL_STOP - ierr = CeedBasisGetDimension(b, &dim); CeedChk(ierr); - ierr = CeedBasisGetNumComponents(b, &num_comp); CeedChk(ierr); - ierr = CeedBasisGetNumQuadratureComponents(b, &Q_comp); CeedChk(ierr); + // LCOV_EXCL_STOP + } + CeedCall(CeedBasisGetDimension(b, &dim)); + CeedCall(CeedBasisGetNumComponents(b, &num_comp)); + CeedCall(CeedBasisGetNumQuadratureComponents(b, &Q_comp)); if (r != CEED_ELEMRESTRICTION_NONE && restr_num_comp != num_comp) { // LCOV_EXCL_START return CeedError(ceed, CEED_ERROR_DIMENSION, - "Field '%s' of size %" CeedInt_FMT " and EvalMode %s: ElemRestriction " - "has %" CeedInt_FMT " components, but Basis has %" CeedInt_FMT " components", - qf_field->field_name, qf_field->size, CeedEvalModes[qf_field->eval_mode], - restr_num_comp, num_comp); + "Field '%s' of size %" CeedInt_FMT " and EvalMode %s: 
ElemRestriction has %" CeedInt_FMT + " components, but Basis has %" CeedInt_FMT " components", + qf_field->field_name, qf_field->size, CeedEvalModes[qf_field->eval_mode], restr_num_comp, num_comp); // LCOV_EXCL_STOP } } else if (eval_mode != CEED_EVAL_NONE) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_INCOMPATIBLE, - "Field '%s' configured with %s cannot " - "be used with CEED_BASIS_COLLOCATED", - qf_field->field_name, CeedEvalModes[eval_mode]); + return CeedError(ceed, CEED_ERROR_INCOMPATIBLE, "Field '%s' configured with %s cannot be used with CEED_BASIS_COLLOCATED", qf_field->field_name, + CeedEvalModes[eval_mode]); // LCOV_EXCL_STOP - } // Field size - switch(eval_mode) { - case CEED_EVAL_NONE: - if (size != restr_num_comp) - // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_DIMENSION, - "Field '%s' of size %" CeedInt_FMT " and EvalMode %s: ElemRestriction has " - CeedInt_FMT " components", - qf_field->field_name, qf_field->size, CeedEvalModes[qf_field->eval_mode], - restr_num_comp); - // LCOV_EXCL_STOP - break; - case CEED_EVAL_INTERP: - if (size != num_comp*Q_comp) - // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_DIMENSION, - "Field '%s' of size %" CeedInt_FMT - " and EvalMode %s: ElemRestriction/Basis has " - CeedInt_FMT " components", - qf_field->field_name, qf_field->size, CeedEvalModes[qf_field->eval_mode], - num_comp*Q_comp); - // LCOV_EXCL_STOP - break; - case CEED_EVAL_GRAD: - if (size != num_comp * dim) - // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_DIMENSION, - "Field '%s' of size %" CeedInt_FMT " and EvalMode %s in %" CeedInt_FMT - " dimensions: " - "ElemRestriction/Basis has %" CeedInt_FMT " components", - qf_field->field_name, qf_field->size, CeedEvalModes[qf_field->eval_mode], dim, - num_comp); - // LCOV_EXCL_STOP - break; - case CEED_EVAL_WEIGHT: - // No additional checks required - break; - case CEED_EVAL_DIV: - if (size != num_comp) - // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_DIMENSION, - "Field 
'%s' of size %" CeedInt_FMT - " and EvalMode %s: ElemRestriction/Basis has " - CeedInt_FMT " components", - qf_field->field_name, qf_field->size, CeedEvalModes[qf_field->eval_mode], - num_comp); - // LCOV_EXCL_STOP - break; - case CEED_EVAL_CURL: - // Not implemented - break; + switch (eval_mode) { + case CEED_EVAL_NONE: + if (size != restr_num_comp) { + // LCOV_EXCL_START + return CeedError(ceed, CEED_ERROR_DIMENSION, + "Field '%s' of size %" CeedInt_FMT " and EvalMode %s: ElemRestriction has " CeedInt_FMT " components", qf_field->field_name, + qf_field->size, CeedEvalModes[qf_field->eval_mode], restr_num_comp); + // LCOV_EXCL_STOP + } + break; + case CEED_EVAL_INTERP: + if (size != num_comp * Q_comp) { + // LCOV_EXCL_START + return CeedError(ceed, CEED_ERROR_DIMENSION, + "Field '%s' of size %" CeedInt_FMT " and EvalMode %s: ElemRestriction/Basis has " CeedInt_FMT " components", + qf_field->field_name, qf_field->size, CeedEvalModes[qf_field->eval_mode], num_comp * Q_comp); + // LCOV_EXCL_STOP + } + break; + case CEED_EVAL_GRAD: + if (size != num_comp * dim) { + // LCOV_EXCL_START + return CeedError(ceed, CEED_ERROR_DIMENSION, + "Field '%s' of size %" CeedInt_FMT " and EvalMode %s in %" CeedInt_FMT " dimensions: ElemRestriction/Basis has %" CeedInt_FMT + " components", + qf_field->field_name, qf_field->size, CeedEvalModes[qf_field->eval_mode], dim, num_comp); + // LCOV_EXCL_STOP + } + break; + case CEED_EVAL_WEIGHT: + // No additional checks required + break; + case CEED_EVAL_DIV: + if (size != num_comp) { + // LCOV_EXCL_START + return CeedError(ceed, CEED_ERROR_DIMENSION, + "Field '%s' of size %" CeedInt_FMT " and EvalMode %s: ElemRestriction/Basis has " CeedInt_FMT " components", + qf_field->field_name, qf_field->size, CeedEvalModes[qf_field->eval_mode], num_comp); + // LCOV_EXCL_STOP + } + break; + case CEED_EVAL_CURL: + // Not implemented + break; } return CEED_ERROR_SUCCESS; } @@ -157,29 +139,24 @@ static int CeedOperatorCheckField(Ceed ceed, 
CeedQFunctionField qf_field, @ref Utility **/ -static int CeedOperatorFieldView(CeedOperatorField field, - CeedQFunctionField qf_field, - CeedInt field_number, bool sub, bool input, - FILE *stream) { - const char *pre = sub ? " " : ""; +static int CeedOperatorFieldView(CeedOperatorField field, CeedQFunctionField qf_field, CeedInt field_number, bool sub, bool input, FILE *stream) { + const char *pre = sub ? " " : ""; const char *in_out = input ? "Input" : "Output"; - fprintf(stream, "%s %s field %" CeedInt_FMT ":\n" + fprintf(stream, + "%s %s field %" CeedInt_FMT + ":\n" "%s Name: \"%s\"\n", pre, in_out, field_number, pre, qf_field->field_name); fprintf(stream, "%s Size: %" CeedInt_FMT "\n", pre, qf_field->size); - fprintf(stream, "%s EvalMode: %s\n", pre, - CeedEvalModes[qf_field->eval_mode]); + fprintf(stream, "%s EvalMode: %s\n", pre, CeedEvalModes[qf_field->eval_mode]); - if (field->basis == CEED_BASIS_COLLOCATED) - fprintf(stream, "%s Collocated basis\n", pre); + if (field->basis == CEED_BASIS_COLLOCATED) fprintf(stream, "%s Collocated basis\n", pre); - if (field->vec == CEED_VECTOR_ACTIVE) - fprintf(stream, "%s Active vector\n", pre); - else if (field->vec == CEED_VECTOR_NONE) - fprintf(stream, "%s No vector\n", pre); + if (field->vec == CEED_VECTOR_ACTIVE) fprintf(stream, "%s Active vector\n", pre); + else if (field->vec == CEED_VECTOR_NONE) fprintf(stream, "%s No vector\n", pre); return CEED_ERROR_SUCCESS; } @@ -196,36 +173,26 @@ static int CeedOperatorFieldView(CeedOperatorField field, @ref Utility **/ int CeedOperatorSingleView(CeedOperator op, bool sub, FILE *stream) { - int ierr; const char *pre = sub ? 
" " : ""; CeedInt num_elem, num_qpts; - ierr = CeedOperatorGetNumElements(op, &num_elem); CeedChk(ierr); - ierr = CeedOperatorGetNumQuadraturePoints(op, &num_qpts); CeedChk(ierr); + CeedCall(CeedOperatorGetNumElements(op, &num_elem)); + CeedCall(CeedOperatorGetNumQuadraturePoints(op, &num_qpts)); CeedInt total_fields = 0; - ierr = CeedOperatorGetNumArgs(op, &total_fields); CeedChk(ierr); - fprintf(stream, "%s %" CeedInt_FMT " elements with %" CeedInt_FMT - " quadrature points each\n", - pre, num_elem, num_qpts); - - fprintf(stream, "%s %" CeedInt_FMT " field%s\n", pre, total_fields, - total_fields>1 ? "s" : ""); - - fprintf(stream, "%s %" CeedInt_FMT " input field%s:\n", pre, - op->qf->num_input_fields, - op->qf->num_input_fields>1 ? "s" : ""); - for (CeedInt i=0; iqf->num_input_fields; i++) { - ierr = CeedOperatorFieldView(op->input_fields[i], op->qf->input_fields[i], - i, sub, 1, stream); CeedChk(ierr); + CeedCall(CeedOperatorGetNumArgs(op, &total_fields)); + fprintf(stream, "%s %" CeedInt_FMT " elements with %" CeedInt_FMT " quadrature points each\n", pre, num_elem, num_qpts); + + fprintf(stream, "%s %" CeedInt_FMT " field%s\n", pre, total_fields, total_fields > 1 ? "s" : ""); + + fprintf(stream, "%s %" CeedInt_FMT " input field%s:\n", pre, op->qf->num_input_fields, op->qf->num_input_fields > 1 ? "s" : ""); + for (CeedInt i = 0; i < op->qf->num_input_fields; i++) { + CeedCall(CeedOperatorFieldView(op->input_fields[i], op->qf->input_fields[i], i, sub, 1, stream)); } - fprintf(stream, "%s %" CeedInt_FMT " output field%s:\n", pre, - op->qf->num_output_fields, - op->qf->num_output_fields>1 ? "s" : ""); - for (CeedInt i=0; iqf->num_output_fields; i++) { - ierr = CeedOperatorFieldView(op->output_fields[i], op->qf->output_fields[i], - i, sub, 0, stream); CeedChk(ierr); + fprintf(stream, "%s %" CeedInt_FMT " output field%s:\n", pre, op->qf->num_output_fields, op->qf->num_output_fields > 1 ? 
"s" : ""); + for (CeedInt i = 0; i < op->qf->num_output_fields; i++) { + CeedCall(CeedOperatorFieldView(op->output_fields[i], op->qf->output_fields[i], i, sub, 0, stream)); } return CEED_ERROR_SUCCESS; } @@ -243,19 +210,19 @@ int CeedOperatorSingleView(CeedOperator op, bool sub, FILE *stream) { int CeedOperatorGetActiveBasis(CeedOperator op, CeedBasis *active_basis) { *active_basis = NULL; if (op->is_composite) return CEED_ERROR_SUCCESS; - for (CeedInt i = 0; i < op->qf->num_input_fields; i++) + for (CeedInt i = 0; i < op->qf->num_input_fields; i++) { if (op->input_fields[i]->vec == CEED_VECTOR_ACTIVE) { *active_basis = op->input_fields[i]->basis; break; } + } if (!*active_basis) { // LCOV_EXCL_START - int ierr; Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); - return CeedError(ceed, CEED_ERROR_MINOR, - "No active CeedBasis found"); + + CeedCall(CeedOperatorGetCeed(op, &ceed)); + return CeedError(ceed, CEED_ERROR_MINOR, "No active CeedBasis found"); // LCOV_EXCL_STOP } return CEED_ERROR_SUCCESS; @@ -271,23 +238,22 @@ int CeedOperatorGetActiveBasis(CeedOperator op, CeedBasis *active_basis) { @ref Utility **/ -int CeedOperatorGetActiveElemRestriction(CeedOperator op, - CeedElemRestriction *active_rstr) { +int CeedOperatorGetActiveElemRestriction(CeedOperator op, CeedElemRestriction *active_rstr) { *active_rstr = NULL; if (op->is_composite) return CEED_ERROR_SUCCESS; - for (CeedInt i = 0; i < op->qf->num_input_fields; i++) + for (CeedInt i = 0; i < op->qf->num_input_fields; i++) { if (op->input_fields[i]->vec == CEED_VECTOR_ACTIVE) { *active_rstr = op->input_fields[i]->elem_restr; break; } + } if (!*active_rstr) { // LCOV_EXCL_START - int ierr; Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); - return CeedError(ceed, CEED_ERROR_INCOMPLETE, - "No active CeedElemRestriction found"); + + CeedCall(CeedOperatorGetCeed(op, &ceed)); + return CeedError(ceed, CEED_ERROR_INCOMPLETE, "No active CeedElemRestriction found"); // LCOV_EXCL_STOP } 
return CEED_ERROR_SUCCESS; @@ -310,49 +276,42 @@ int CeedOperatorGetActiveElemRestriction(CeedOperator op, @ref User **/ -static int CeedOperatorContextSetGeneric(CeedOperator op, - CeedContextFieldLabel field_label, CeedContextFieldType field_type, - void *value) { - int ierr; - - if (!field_label) +static int CeedOperatorContextSetGeneric(CeedOperator op, CeedContextFieldLabel field_label, CeedContextFieldType field_type, void *value) { + if (!field_label) { // LCOV_EXCL_START - return CeedError(op->ceed, CEED_ERROR_UNSUPPORTED, - "Invalid field label"); - // LCOV_EXCL_STOP + return CeedError(op->ceed, CEED_ERROR_UNSUPPORTED, "Invalid field label"); + // LCOV_EXCL_STOP + } bool is_composite = false; - ierr = CeedOperatorIsComposite(op, &is_composite); CeedChk(ierr); + CeedCall(CeedOperatorIsComposite(op, &is_composite)); if (is_composite) { - CeedInt num_sub; + CeedInt num_sub; CeedOperator *sub_operators; - ierr = CeedOperatorGetNumSub(op, &num_sub); CeedChk(ierr); - ierr = CeedOperatorGetSubList(op, &sub_operators); CeedChk(ierr); - if (num_sub != field_label->num_sub_labels) + CeedCall(CeedOperatorGetNumSub(op, &num_sub)); + CeedCall(CeedOperatorGetSubList(op, &sub_operators)); + if (num_sub != field_label->num_sub_labels) { // LCOV_EXCL_START return CeedError(op->ceed, CEED_ERROR_UNSUPPORTED, - "ContextLabel does not correspond to composite operator.\n" - "Use CeedOperatorGetContextFieldLabel()."); - // LCOV_EXCL_STOP + "ContextLabel does not correspond to composite operator. 
Use CeedOperatorGetContextFieldLabel()."); + // LCOV_EXCL_STOP + } for (CeedInt i = 0; i < num_sub; i++) { // Try every sub-operator, ok if some sub-operators do not have field if (field_label->sub_labels[i] && sub_operators[i]->qf->ctx) { - ierr = CeedQFunctionContextSetGeneric(sub_operators[i]->qf->ctx, - field_label->sub_labels[i], - field_type, value); CeedChk(ierr); + CeedCall(CeedQFunctionContextSetGeneric(sub_operators[i]->qf->ctx, field_label->sub_labels[i], field_type, value)); } } } else { - if (!op->qf->ctx) + if (!op->qf->ctx) { // LCOV_EXCL_START - return CeedError(op->ceed, CEED_ERROR_UNSUPPORTED, - "QFunction does not have context data"); - // LCOV_EXCL_STOP + return CeedError(op->ceed, CEED_ERROR_UNSUPPORTED, "QFunction does not have context data"); + // LCOV_EXCL_STOP + } - ierr = CeedQFunctionContextSetGeneric(op->qf->ctx, field_label, - field_type, value); CeedChk(ierr); + CeedCall(CeedQFunctionContextSetGeneric(op->qf->ctx, field_label, field_type, value)); } return CEED_ERROR_SUCCESS; @@ -378,12 +337,11 @@ static int CeedOperatorContextSetGeneric(CeedOperator op, **/ int CeedOperatorGetNumArgs(CeedOperator op, CeedInt *num_args) { - if (op->is_composite) + if (op->is_composite) { // LCOV_EXCL_START - return CeedError(op->ceed, CEED_ERROR_MINOR, - "Not defined for composite operators"); - // LCOV_EXCL_STOP - + return CeedError(op->ceed, CEED_ERROR_MINOR, "Not defined for composite operators"); + // LCOV_EXCL_STOP + } *num_args = op->num_fields; return CEED_ERROR_SUCCESS; } @@ -416,12 +374,11 @@ int CeedOperatorIsSetupDone(CeedOperator op, bool *is_setup_done) { **/ int CeedOperatorGetQFunction(CeedOperator op, CeedQFunction *qf) { - if (op->is_composite) + if (op->is_composite) { // LCOV_EXCL_START - return CeedError(op->ceed, CEED_ERROR_MINOR, - "Not defined for composite operator"); - // LCOV_EXCL_STOP - + return CeedError(op->ceed, CEED_ERROR_MINOR, "Not defined for composite operator"); + // LCOV_EXCL_STOP + } *qf = op->qf; return 
CEED_ERROR_SUCCESS; } @@ -454,11 +411,11 @@ int CeedOperatorIsComposite(CeedOperator op, bool *is_composite) { **/ int CeedOperatorGetNumSub(CeedOperator op, CeedInt *num_suboperators) { - if (!op->is_composite) + if (!op->is_composite) { // LCOV_EXCL_START return CeedError(op->ceed, CEED_ERROR_MINOR, "Not a composite operator"); - // LCOV_EXCL_STOP - + // LCOV_EXCL_STOP + } *num_suboperators = op->num_suboperators; return CEED_ERROR_SUCCESS; } @@ -475,11 +432,11 @@ int CeedOperatorGetNumSub(CeedOperator op, CeedInt *num_suboperators) { **/ int CeedOperatorGetSubList(CeedOperator op, CeedOperator **sub_operators) { - if (!op->is_composite) + if (!op->is_composite) { // LCOV_EXCL_START return CeedError(op->ceed, CEED_ERROR_MINOR, "Not a composite operator"); - // LCOV_EXCL_STOP - + // LCOV_EXCL_STOP + } *sub_operators = op->sub_operators; return CEED_ERROR_SUCCESS; } @@ -571,50 +528,46 @@ int CeedOperatorSetSetupDone(CeedOperator op) { @ref User */ -int CeedOperatorCreate(Ceed ceed, CeedQFunction qf, CeedQFunction dqf, - CeedQFunction dqfT, CeedOperator *op) { - int ierr; - +int CeedOperatorCreate(Ceed ceed, CeedQFunction qf, CeedQFunction dqf, CeedQFunction dqfT, CeedOperator *op) { if (!ceed->OperatorCreate) { Ceed delegate; - ierr = CeedGetObjectDelegate(ceed, &delegate, "Operator"); CeedChk(ierr); + CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Operator")); - if (!delegate) + if (!delegate) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, - "Backend does not support OperatorCreate"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support OperatorCreate"); + // LCOV_EXCL_STOP + } - ierr = CeedOperatorCreate(delegate, qf, dqf, dqfT, op); CeedChk(ierr); + CeedCall(CeedOperatorCreate(delegate, qf, dqf, dqfT, op)); return CEED_ERROR_SUCCESS; } - if (!qf || qf == CEED_QFUNCTION_NONE) + if (!qf || qf == CEED_QFUNCTION_NONE) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_MINOR, - "Operator must 
have a valid QFunction."); - // LCOV_EXCL_STOP - ierr = CeedCalloc(1, op); CeedChk(ierr); + return CeedError(ceed, CEED_ERROR_MINOR, "Operator must have a valid QFunction."); + // LCOV_EXCL_STOP + } + CeedCall(CeedCalloc(1, op)); (*op)->ceed = ceed; - ierr = CeedReference(ceed); CeedChk(ierr); - (*op)->ref_count = 1; - (*op)->qf = qf; - (*op)->input_size = -1; + CeedCall(CeedReference(ceed)); + (*op)->ref_count = 1; + (*op)->qf = qf; + (*op)->input_size = -1; (*op)->output_size = -1; - ierr = CeedQFunctionReference(qf); CeedChk(ierr); + CeedCall(CeedQFunctionReference(qf)); if (dqf && dqf != CEED_QFUNCTION_NONE) { (*op)->dqf = dqf; - ierr = CeedQFunctionReference(dqf); CeedChk(ierr); + CeedCall(CeedQFunctionReference(dqf)); } if (dqfT && dqfT != CEED_QFUNCTION_NONE) { (*op)->dqfT = dqfT; - ierr = CeedQFunctionReference(dqfT); CeedChk(ierr); + CeedCall(CeedQFunctionReference(dqfT)); } - ierr = CeedQFunctionAssemblyDataCreate(ceed, &(*op)->qf_assembled); - CeedChk(ierr); - ierr = CeedCalloc(CEED_FIELD_MAX, &(*op)->input_fields); CeedChk(ierr); - ierr = CeedCalloc(CEED_FIELD_MAX, &(*op)->output_fields); CeedChk(ierr); - ierr = ceed->OperatorCreate(*op); CeedChk(ierr); + CeedCall(CeedQFunctionAssemblyDataCreate(ceed, &(*op)->qf_assembled)); + CeedCall(CeedCalloc(CEED_FIELD_MAX, &(*op)->input_fields)); + CeedCall(CeedCalloc(CEED_FIELD_MAX, &(*op)->output_fields)); + CeedCall(ceed->OperatorCreate(*op)); return CEED_ERROR_SUCCESS; } @@ -630,29 +583,27 @@ int CeedOperatorCreate(Ceed ceed, CeedQFunction qf, CeedQFunction dqf, @ref User */ int CeedCompositeOperatorCreate(Ceed ceed, CeedOperator *op) { - int ierr; - if (!ceed->CompositeOperatorCreate) { Ceed delegate; - ierr = CeedGetObjectDelegate(ceed, &delegate, "Operator"); CeedChk(ierr); + CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Operator")); if (delegate) { - ierr = CeedCompositeOperatorCreate(delegate, op); CeedChk(ierr); + CeedCall(CeedCompositeOperatorCreate(delegate, op)); return CEED_ERROR_SUCCESS; } } - 
ierr = CeedCalloc(1, op); CeedChk(ierr); + CeedCall(CeedCalloc(1, op)); (*op)->ceed = ceed; - ierr = CeedReference(ceed); CeedChk(ierr); - (*op)->ref_count = 1; + CeedCall(CeedReference(ceed)); + (*op)->ref_count = 1; (*op)->is_composite = true; - ierr = CeedCalloc(CEED_COMPOSITE_MAX, &(*op)->sub_operators); CeedChk(ierr); - (*op)->input_size = -1; + CeedCall(CeedCalloc(CEED_COMPOSITE_MAX, &(*op)->sub_operators)); + (*op)->input_size = -1; (*op)->output_size = -1; if (ceed->CompositeOperatorCreate) { - ierr = ceed->CompositeOperatorCreate(*op); CeedChk(ierr); + CeedCall(ceed->CompositeOperatorCreate(*op)); } return CEED_ERROR_SUCCESS; } @@ -673,10 +624,8 @@ int CeedCompositeOperatorCreate(Ceed ceed, CeedOperator *op) { @ref User **/ int CeedOperatorReferenceCopy(CeedOperator op, CeedOperator *op_copy) { - int ierr; - - ierr = CeedOperatorReference(op); CeedChk(ierr); - ierr = CeedOperatorDestroy(op_copy); CeedChk(ierr); + CeedCall(CeedOperatorReference(op)); + CeedCall(CeedOperatorDestroy(op_copy)); *op_copy = op; return CEED_ERROR_SUCCESS; } @@ -707,63 +656,56 @@ int CeedOperatorReferenceCopy(CeedOperator op, CeedOperator *op_copy) { @ref User **/ -int CeedOperatorSetField(CeedOperator op, const char *field_name, - CeedElemRestriction r, CeedBasis b, CeedVector v) { - int ierr; - if (op->is_composite) +int CeedOperatorSetField(CeedOperator op, const char *field_name, CeedElemRestriction r, CeedBasis b, CeedVector v) { + if (op->is_composite) { // LCOV_EXCL_START - return CeedError(op->ceed, CEED_ERROR_INCOMPATIBLE, - "Cannot add field to composite operator."); - // LCOV_EXCL_STOP - if (op->is_immutable) + return CeedError(op->ceed, CEED_ERROR_INCOMPATIBLE, "Cannot add field to composite operator."); + // LCOV_EXCL_STOP + } + if (op->is_immutable) { // LCOV_EXCL_START - return CeedError(op->ceed, CEED_ERROR_MAJOR, - "Operator cannot be changed after set as immutable"); - // LCOV_EXCL_STOP - if (!r) + return CeedError(op->ceed, CEED_ERROR_MAJOR, "Operator cannot be 
changed after set as immutable"); + // LCOV_EXCL_STOP + } + if (!r) { // LCOV_EXCL_START - return CeedError(op->ceed, CEED_ERROR_INCOMPATIBLE, - "ElemRestriction r for field \"%s\" must be non-NULL.", - field_name); - // LCOV_EXCL_STOP - if (!b) + return CeedError(op->ceed, CEED_ERROR_INCOMPATIBLE, "ElemRestriction r for field \"%s\" must be non-NULL.", field_name); + // LCOV_EXCL_STOP + } + if (!b) { // LCOV_EXCL_START - return CeedError(op->ceed, CEED_ERROR_INCOMPATIBLE, - "Basis b for field \"%s\" must be non-NULL.", - field_name); - // LCOV_EXCL_STOP - if (!v) + return CeedError(op->ceed, CEED_ERROR_INCOMPATIBLE, "Basis b for field \"%s\" must be non-NULL.", field_name); + // LCOV_EXCL_STOP + } + if (!v) { // LCOV_EXCL_START - return CeedError(op->ceed, CEED_ERROR_INCOMPATIBLE, - "Vector v for field \"%s\" must be non-NULL.", - field_name); - // LCOV_EXCL_STOP + return CeedError(op->ceed, CEED_ERROR_INCOMPATIBLE, "Vector v for field \"%s\" must be non-NULL.", field_name); + // LCOV_EXCL_STOP + } CeedInt num_elem; - ierr = CeedElemRestrictionGetNumElements(r, &num_elem); CeedChk(ierr); - if (r != CEED_ELEMRESTRICTION_NONE && op->has_restriction && - op->num_elem != num_elem) + CeedCall(CeedElemRestrictionGetNumElements(r, &num_elem)); + if (r != CEED_ELEMRESTRICTION_NONE && op->has_restriction && op->num_elem != num_elem) { // LCOV_EXCL_START return CeedError(op->ceed, CEED_ERROR_DIMENSION, - "ElemRestriction with %" CeedInt_FMT " elements incompatible with prior %" - CeedInt_FMT " elements", num_elem, op->num_elem); - // LCOV_EXCL_STOP + "ElemRestriction with %" CeedInt_FMT " elements incompatible with prior %" CeedInt_FMT " elements", num_elem, op->num_elem); + // LCOV_EXCL_STOP + } CeedInt num_qpts = 0; if (b != CEED_BASIS_COLLOCATED) { - ierr = CeedBasisGetNumQuadraturePoints(b, &num_qpts); CeedChk(ierr); - if (op->num_qpts && op->num_qpts != num_qpts) + CeedCall(CeedBasisGetNumQuadraturePoints(b, &num_qpts)); + if (op->num_qpts && op->num_qpts != num_qpts) 
{ // LCOV_EXCL_START return CeedError(op->ceed, CEED_ERROR_DIMENSION, - "Basis with %" CeedInt_FMT " quadrature points " - "incompatible with prior %" CeedInt_FMT " points", num_qpts, - op->num_qpts); - // LCOV_EXCL_STOP + "Basis with %" CeedInt_FMT " quadrature points incompatible with prior %" CeedInt_FMT " points", num_qpts, op->num_qpts); + // LCOV_EXCL_STOP + } } CeedQFunctionField qf_field; CeedOperatorField *op_field; - bool is_input = true; - for (CeedInt i=0; i<op->qf->num_input_fields; i++) { + bool is_input = true; + for (CeedInt i = 0; i < op->qf->num_input_fields; i++) { if (!strcmp(field_name, (*op->qf->input_fields[i]).field_name)) { qf_field = op->qf->input_fields[i]; op_field = &op->input_fields[i]; @@ -771,7 +713,7 @@ int CeedOperatorSetField(CeedOperator op, const char *field_name, } } is_input = false; - for (CeedInt i=0; i<op->qf->num_output_fields; i++) { + for (CeedInt i = 0; i < op->qf->num_output_fields; i++) { if (!strcmp(field_name, (*op->qf->output_fields[i]).field_name)) { qf_field = op->qf->output_fields[i]; op_field = &op->output_fields[i]; @@ -779,59 +721,54 @@ int CeedOperatorSetField(CeedOperator op, const char *field_name, } } // LCOV_EXCL_START - return CeedError(op->ceed, CEED_ERROR_INCOMPLETE, - "QFunction has no knowledge of field '%s'", - field_name); + return CeedError(op->ceed, CEED_ERROR_INCOMPLETE, "QFunction has no knowledge of field '%s'", field_name); // LCOV_EXCL_STOP found: - ierr = CeedOperatorCheckField(op->ceed, qf_field, r, b); CeedChk(ierr); - ierr = CeedCalloc(1, op_field); CeedChk(ierr); + CeedCall(CeedOperatorCheckField(op->ceed, qf_field, r, b)); + CeedCall(CeedCalloc(1, op_field)); if (v == CEED_VECTOR_ACTIVE) { CeedSize l_size; - ierr = CeedElemRestrictionGetLVectorSize(r, &l_size); CeedChk(ierr); + CeedCall(CeedElemRestrictionGetLVectorSize(r, &l_size)); if (is_input) { if (op->input_size == -1) op->input_size = l_size; - if (l_size != op->input_size) + if (l_size != op->input_size) { // LCOV_EXCL_START - return
CeedError(op->ceed, CEED_ERROR_INCOMPATIBLE, - "LVector size %td does not match previous size %td", - l_size, op->input_size); - // LCOV_EXCL_STOP + return CeedError(op->ceed, CEED_ERROR_INCOMPATIBLE, "LVector size %td does not match previous size %td", l_size, op->input_size); + // LCOV_EXCL_STOP + } } else { if (op->output_size == -1) op->output_size = l_size; - if (l_size != op->output_size) + if (l_size != op->output_size) { // LCOV_EXCL_START - return CeedError(op->ceed, CEED_ERROR_INCOMPATIBLE, - "LVector size %td does not match previous size %td", - l_size, op->output_size); - // LCOV_EXCL_STOP + return CeedError(op->ceed, CEED_ERROR_INCOMPATIBLE, "LVector size %td does not match previous size %td", l_size, op->output_size); + // LCOV_EXCL_STOP + } } } (*op_field)->vec = v; if (v != CEED_VECTOR_ACTIVE && v != CEED_VECTOR_NONE) { - ierr = CeedVectorReference(v); CeedChk(ierr); + CeedCall(CeedVectorReference(v)); } (*op_field)->elem_restr = r; - ierr = CeedElemRestrictionReference(r); CeedChk(ierr); + CeedCall(CeedElemRestrictionReference(r)); if (r != CEED_ELEMRESTRICTION_NONE) { - op->num_elem = num_elem; - op->has_restriction = true; // Restriction set, but num_elem may be 0 + op->num_elem = num_elem; + op->has_restriction = true; // Restriction set, but num_elem may be 0 } (*op_field)->basis = b; if (b != CEED_BASIS_COLLOCATED) { if (!op->num_qpts) { - ierr = CeedOperatorSetNumQuadraturePoints(op, num_qpts); CeedChk(ierr); + CeedCall(CeedOperatorSetNumQuadraturePoints(op, num_qpts)); } - ierr = CeedBasisReference(b); CeedChk(ierr); + CeedCall(CeedBasisReference(b)); } op->num_fields += 1; - ierr = CeedStringAllocCopy(field_name, (char **)&(*op_field)->field_name); - CeedChk(ierr); + CeedCall(CeedStringAllocCopy(field_name, (char **)&(*op_field)->field_name)); return CEED_ERROR_SUCCESS; } @@ -851,18 +788,14 @@ int CeedOperatorSetField(CeedOperator op, const char *field_name, @ref Advanced **/ -int CeedOperatorGetFields(CeedOperator op, CeedInt 
*num_input_fields, - CeedOperatorField **input_fields, - CeedInt *num_output_fields, +int CeedOperatorGetFields(CeedOperator op, CeedInt *num_input_fields, CeedOperatorField **input_fields, CeedInt *num_output_fields, CeedOperatorField **output_fields) { - int ierr; - - if (op->is_composite) + if (op->is_composite) { // LCOV_EXCL_START - return CeedError(op->ceed, CEED_ERROR_MINOR, - "Not defined for composite operator"); - // LCOV_EXCL_STOP - ierr = CeedOperatorCheckReady(op); CeedChk(ierr); + return CeedError(op->ceed, CEED_ERROR_MINOR, "Not defined for composite operator"); + // LCOV_EXCL_STOP + } + CeedCall(CeedOperatorCheckReady(op)); if (num_input_fields) *num_input_fields = op->qf->num_input_fields; if (input_fields) *input_fields = op->input_fields; @@ -896,8 +829,7 @@ int CeedOperatorFieldGetName(CeedOperatorField op_field, char **field_name) { @ref Advanced **/ -int CeedOperatorFieldGetElemRestriction(CeedOperatorField op_field, - CeedElemRestriction *rstr) { +int CeedOperatorFieldGetElemRestriction(CeedOperatorField op_field, CeedElemRestriction *rstr) { *rstr = op_field->elem_restr; return CEED_ERROR_SUCCESS; } @@ -942,47 +874,42 @@ int CeedOperatorFieldGetVector(CeedOperatorField op_field, CeedVector *vec) { @ref User */ -int CeedCompositeOperatorAddSub(CeedOperator composite_op, - CeedOperator sub_op) { - int ierr; - - if (!composite_op->is_composite) +int CeedCompositeOperatorAddSub(CeedOperator composite_op, CeedOperator sub_op) { + if (!composite_op->is_composite) { // LCOV_EXCL_START - return CeedError(composite_op->ceed, CEED_ERROR_MINOR, - "CeedOperator is not a composite operator"); - // LCOV_EXCL_STOP + return CeedError(composite_op->ceed, CEED_ERROR_MINOR, "CeedOperator is not a composite operator"); + // LCOV_EXCL_STOP + } - if (composite_op->num_suboperators == CEED_COMPOSITE_MAX) + if (composite_op->num_suboperators == CEED_COMPOSITE_MAX) { // LCOV_EXCL_START - return CeedError(composite_op->ceed, CEED_ERROR_UNSUPPORTED, - "Cannot add 
additional sub-operators"); - // LCOV_EXCL_STOP - if (composite_op->is_immutable) + return CeedError(composite_op->ceed, CEED_ERROR_UNSUPPORTED, "Cannot add additional sub-operators"); + // LCOV_EXCL_STOP + } + if (composite_op->is_immutable) { // LCOV_EXCL_START - return CeedError(composite_op->ceed, CEED_ERROR_MAJOR, - "Operator cannot be changed after set as immutable"); - // LCOV_EXCL_STOP + return CeedError(composite_op->ceed, CEED_ERROR_MAJOR, "Operator cannot be changed after set as immutable"); + // LCOV_EXCL_STOP + } { CeedSize input_size, output_size; - ierr = CeedOperatorGetActiveVectorLengths(sub_op, &input_size, &output_size); - CeedChk(ierr); + CeedCall(CeedOperatorGetActiveVectorLengths(sub_op, &input_size, &output_size)); if (composite_op->input_size == -1) composite_op->input_size = input_size; if (composite_op->output_size == -1) composite_op->output_size = output_size; // Note, a size of -1 means no active vector restriction set, so no incompatibility - if ((input_size != -1 && input_size != composite_op->input_size) || - (output_size != -1 && output_size != composite_op->output_size)) + if ((input_size != -1 && input_size != composite_op->input_size) || (output_size != -1 && output_size != composite_op->output_size)) { // LCOV_EXCL_START return CeedError(composite_op->ceed, CEED_ERROR_MAJOR, - "Sub-operators must have compatible dimensions; " - "composite operator of shape (%td, %td) not compatible with " - "sub-operator of shape (%td, %td)", + "Sub-operators must have compatible dimensions; composite operator of shape (%td, %td) not compatible with sub-operator of " + "shape (%td, %td)", composite_op->input_size, composite_op->output_size, input_size, output_size); - // LCOV_EXCL_STOP + // LCOV_EXCL_STOP + } } composite_op->sub_operators[composite_op->num_suboperators] = sub_op; - ierr = CeedOperatorReference(sub_op); CeedChk(ierr); + CeedCall(CeedOperatorReference(sub_op)); composite_op->num_suboperators++; return CEED_ERROR_SUCCESS; } @@ 
-997,65 +924,54 @@ int CeedCompositeOperatorAddSub(CeedOperator composite_op, @ref User **/ int CeedOperatorCheckReady(CeedOperator op) { - int ierr; Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); + CeedCall(CeedOperatorGetCeed(op, &ceed)); - if (op->is_interface_setup) - return CEED_ERROR_SUCCESS; + if (op->is_interface_setup) return CEED_ERROR_SUCCESS; CeedQFunction qf = op->qf; if (op->is_composite) { if (!op->num_suboperators) { // Empty operator setup - op->input_size = 0; + op->input_size = 0; op->output_size = 0; } else { for (CeedInt i = 0; i < op->num_suboperators; i++) { - ierr = CeedOperatorCheckReady(op->sub_operators[i]); CeedChk(ierr); + CeedCall(CeedOperatorCheckReady(op->sub_operators[i])); } // Sub-operators could be modified after adding to composite operator // Need to verify no lvec incompatibility from any changes CeedSize input_size, output_size; - ierr = CeedOperatorGetActiveVectorLengths(op, &input_size, &output_size); - CeedChk(ierr); + CeedCall(CeedOperatorGetActiveVectorLengths(op, &input_size, &output_size)); } } else { - if (op->num_fields == 0) + if (op->num_fields == 0) { // LCOV_EXCL_START return CeedError(ceed, CEED_ERROR_INCOMPLETE, "No operator fields set"); - // LCOV_EXCL_STOP - if (op->num_fields < qf->num_input_fields + qf->num_output_fields) + // LCOV_EXCL_STOP + } + if (op->num_fields < qf->num_input_fields + qf->num_output_fields) { // LCOV_EXCL_START return CeedError(ceed, CEED_ERROR_INCOMPLETE, "Not all operator fields set"); - // LCOV_EXCL_STOP - if (!op->has_restriction) + // LCOV_EXCL_STOP + } + if (!op->has_restriction) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_INCOMPLETE, - "At least one restriction required"); - // LCOV_EXCL_STOP - if (op->num_qpts == 0) + return CeedError(ceed, CEED_ERROR_INCOMPLETE, "At least one restriction required"); + // LCOV_EXCL_STOP + } + if (op->num_qpts == 0) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_INCOMPLETE, - "At least one 
non-collocated basis is required " - "or the number of quadrature points must be set"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_INCOMPLETE, "At least one non-collocated basis is required or the number of quadrature points must be set"); + // LCOV_EXCL_STOP + } } // Flag as immutable and ready op->is_interface_setup = true; - if (op->qf && op->qf != CEED_QFUNCTION_NONE) - // LCOV_EXCL_START - op->qf->is_immutable = true; - // LCOV_EXCL_STOP - if (op->dqf && op->dqf != CEED_QFUNCTION_NONE) - // LCOV_EXCL_START - op->dqf->is_immutable = true; - // LCOV_EXCL_STOP - if (op->dqfT && op->dqfT != CEED_QFUNCTION_NONE) - // LCOV_EXCL_START - op->dqfT->is_immutable = true; - // LCOV_EXCL_STOP + if (op->qf && op->qf != CEED_QFUNCTION_NONE) op->qf->is_immutable = true; + if (op->dqf && op->dqf != CEED_QFUNCTION_NONE) op->dqf->is_immutable = true; + if (op->dqfT && op->dqfT != CEED_QFUNCTION_NONE) op->dqfT->is_immutable = true; return CEED_ERROR_SUCCESS; } @@ -1072,32 +988,28 @@ int CeedOperatorCheckReady(CeedOperator op) { @ref User **/ -int CeedOperatorGetActiveVectorLengths(CeedOperator op, CeedSize *input_size, - CeedSize *output_size) { - int ierr; +int CeedOperatorGetActiveVectorLengths(CeedOperator op, CeedSize *input_size, CeedSize *output_size) { bool is_composite; if (input_size) *input_size = op->input_size; if (output_size) *output_size = op->output_size; - ierr = CeedOperatorIsComposite(op, &is_composite); CeedChk(ierr); + CeedCall(CeedOperatorIsComposite(op, &is_composite)); if (is_composite && (op->input_size == -1 || op->output_size == -1)) { for (CeedInt i = 0; i < op->num_suboperators; i++) { CeedSize sub_input_size, sub_output_size; - ierr = CeedOperatorGetActiveVectorLengths(op->sub_operators[i], - &sub_input_size, &sub_output_size); CeedChk(ierr); + CeedCall(CeedOperatorGetActiveVectorLengths(op->sub_operators[i], &sub_input_size, &sub_output_size)); if (op->input_size == -1) op->input_size = sub_input_size; if (op->output_size == -1) 
op->output_size = sub_output_size; // Note, a size of -1 means no active vector restriction set, so no incompatibility - if ((sub_input_size != -1 && sub_input_size != op->input_size) || - (sub_output_size != -1 && sub_output_size != op->output_size)) + if ((sub_input_size != -1 && sub_input_size != op->input_size) || (sub_output_size != -1 && sub_output_size != op->output_size)) { // LCOV_EXCL_START return CeedError(op->ceed, CEED_ERROR_MAJOR, - "Sub-operators must have compatible dimensions; " - "composite operator of shape (%td, %td) not compatible with " - "sub-operator of shape (%td, %td)", + "Sub-operators must have compatible dimensions; composite operator of shape (%td, %td) not compatible with sub-operator of " + "shape (%td, %td)", op->input_size, op->output_size, input_size, output_size); - // LCOV_EXCL_STOP + // LCOV_EXCL_STOP + } } } @@ -1120,20 +1032,16 @@ int CeedOperatorGetActiveVectorLengths(CeedOperator op, CeedSize *input_size, @ref Advanced **/ -int CeedOperatorSetQFunctionAssemblyReuse(CeedOperator op, - bool reuse_assembly_data) { - int ierr; +int CeedOperatorSetQFunctionAssemblyReuse(CeedOperator op, bool reuse_assembly_data) { bool is_composite; - ierr = CeedOperatorIsComposite(op, &is_composite); CeedChk(ierr); + CeedCall(CeedOperatorIsComposite(op, &is_composite)); if (is_composite) { for (CeedInt i = 0; i < op->num_suboperators; i++) { - ierr = CeedOperatorSetQFunctionAssemblyReuse(op->sub_operators[i], - reuse_assembly_data); CeedChk(ierr); + CeedCall(CeedOperatorSetQFunctionAssemblyReuse(op->sub_operators[i], reuse_assembly_data)); } } else { - ierr = CeedQFunctionAssemblyDataSetReuse(op->qf_assembled, reuse_assembly_data); - CeedChk(ierr); + CeedCall(CeedQFunctionAssemblyDataSetReuse(op->qf_assembled, reuse_assembly_data)); } return CEED_ERROR_SUCCESS; @@ -1149,21 +1057,16 @@ int CeedOperatorSetQFunctionAssemblyReuse(CeedOperator op, @ref Advanced **/ -int CeedOperatorSetQFunctionAssemblyDataUpdateNeeded(CeedOperator op, - bool 
needs_data_update) { - int ierr; +int CeedOperatorSetQFunctionAssemblyDataUpdateNeeded(CeedOperator op, bool needs_data_update) { bool is_composite; - ierr = CeedOperatorIsComposite(op, &is_composite); CeedChk(ierr); + CeedCall(CeedOperatorIsComposite(op, &is_composite)); if (is_composite) { for (CeedInt i = 0; i < op->num_suboperators; i++) { - ierr = CeedOperatorSetQFunctionAssemblyDataUpdateNeeded(op->sub_operators[i], - needs_data_update); CeedChk(ierr); + CeedCall(CeedOperatorSetQFunctionAssemblyDataUpdateNeeded(op->sub_operators[i], needs_data_update)); } } else { - ierr = CeedQFunctionAssemblyDataSetUpdateNeeded(op->qf_assembled, - needs_data_update); - CeedChk(ierr); + CeedCall(CeedQFunctionAssemblyDataSetUpdateNeeded(op->qf_assembled, needs_data_update)); } return CEED_ERROR_SUCCESS; @@ -1183,22 +1086,21 @@ int CeedOperatorSetQFunctionAssemblyDataUpdateNeeded(CeedOperator op, @ref Advanced **/ int CeedOperatorSetNumQuadraturePoints(CeedOperator op, CeedInt num_qpts) { - if (op->is_composite) + if (op->is_composite) { // LCOV_EXCL_START - return CeedError(op->ceed, CEED_ERROR_MINOR, - "Not defined for composite operator"); - // LCOV_EXCL_STOP - if (op->num_qpts) + return CeedError(op->ceed, CEED_ERROR_MINOR, "Not defined for composite operator"); + // LCOV_EXCL_STOP + } + if (op->num_qpts) { // LCOV_EXCL_START - return CeedError(op->ceed, CEED_ERROR_MINOR, - "Number of quadrature points already defined"); - // LCOV_EXCL_STOP - if (op->is_immutable) + return CeedError(op->ceed, CEED_ERROR_MINOR, "Number of quadrature points already defined"); + // LCOV_EXCL_STOP + } + if (op->is_immutable) { // LCOV_EXCL_START - return CeedError(op->ceed, CEED_ERROR_MAJOR, - "Operator cannot be changed after set as immutable"); - // LCOV_EXCL_STOP - + return CeedError(op->ceed, CEED_ERROR_MAJOR, "Operator cannot be changed after set as immutable"); + // LCOV_EXCL_STOP + } op->num_qpts = num_qpts; return CEED_ERROR_SUCCESS; } @@ -1214,13 +1116,12 @@ int 
CeedOperatorSetNumQuadraturePoints(CeedOperator op, CeedInt num_qpts) { @ref User **/ int CeedOperatorSetName(CeedOperator op, const char *name) { - int ierr; - char *name_copy; + char *name_copy; size_t name_len = name ? strlen(name) : 0; - ierr = CeedFree(&op->name); CeedChk(ierr); + CeedCall(CeedFree(&op->name)); if (name_len > 0) { - ierr = CeedCalloc(name_len + 1, &name_copy); CeedChk(ierr); + CeedCall(CeedCalloc(name_len + 1, &name_copy)); memcpy(name_copy, name, name_len); op->name = name_copy; } @@ -1239,25 +1140,19 @@ int CeedOperatorSetName(CeedOperator op, const char *name) { @ref User **/ int CeedOperatorView(CeedOperator op, FILE *stream) { - int ierr; bool has_name = op->name; if (op->is_composite) { - fprintf(stream, "Composite CeedOperator%s%s\n", - has_name ? " - " : "", has_name ? op->name : ""); + fprintf(stream, "Composite CeedOperator%s%s\n", has_name ? " - " : "", has_name ? op->name : ""); - for (CeedInt i=0; i<op->num_suboperators; i++) { + for (CeedInt i = 0; i < op->num_suboperators; i++) { has_name = op->sub_operators[i]->name; - fprintf(stream, " SubOperator %" CeedInt_FMT "%s%s:\n", i, - has_name ? " - " : "", - has_name ? op->sub_operators[i]->name : ""); - ierr = CeedOperatorSingleView(op->sub_operators[i], 1, stream); - CeedChk(ierr); + fprintf(stream, " SubOperator %" CeedInt_FMT "%s%s:\n", i, has_name ? " - " : "", has_name ? op->sub_operators[i]->name : ""); + CeedCall(CeedOperatorSingleView(op->sub_operators[i], 1, stream)); } } else { - fprintf(stream, "CeedOperator%s%s\n", - has_name ? " - " : "", has_name ? op->name : ""); - ierr = CeedOperatorSingleView(op, 0, stream); CeedChk(ierr); + fprintf(stream, "CeedOperator%s%s\n", has_name ? " - " : "", has_name ?
op->name : ""); + CeedCall(CeedOperatorSingleView(op, 0, stream)); } return CEED_ERROR_SUCCESS; } @@ -1288,12 +1183,11 @@ int CeedOperatorGetCeed(CeedOperator op, Ceed *ceed) { @ref Advanced **/ int CeedOperatorGetNumElements(CeedOperator op, CeedInt *num_elem) { - if (op->is_composite) + if (op->is_composite) { // LCOV_EXCL_START - return CeedError(op->ceed, CEED_ERROR_MINOR, - "Not defined for composite operator"); - // LCOV_EXCL_STOP - + return CeedError(op->ceed, CEED_ERROR_MINOR, "Not defined for composite operator"); + // LCOV_EXCL_STOP + } *num_elem = op->num_elem; return CEED_ERROR_SUCCESS; } @@ -1309,12 +1203,11 @@ int CeedOperatorGetNumElements(CeedOperator op, CeedInt *num_elem) { @ref Advanced **/ int CeedOperatorGetNumQuadraturePoints(CeedOperator op, CeedInt *num_qpts) { - if (op->is_composite) + if (op->is_composite) { // LCOV_EXCL_START - return CeedError(op->ceed, CEED_ERROR_MINOR, - "Not defined for composite operator"); - // LCOV_EXCL_STOP - + return CeedError(op->ceed, CEED_ERROR_MINOR, "Not defined for composite operator"); + // LCOV_EXCL_STOP + } *num_qpts = op->num_qpts; return CEED_ERROR_SUCCESS; } @@ -1328,63 +1221,56 @@ int CeedOperatorGetNumQuadraturePoints(CeedOperator op, CeedInt *num_qpts) { @ref Backend **/ int CeedOperatorGetFlopsEstimate(CeedOperator op, CeedSize *flops) { - int ierr; bool is_composite; - ierr = CeedOperatorCheckReady(op); CeedChk(ierr); + CeedCall(CeedOperatorCheckReady(op)); *flops = 0; - ierr = CeedOperatorIsComposite(op, &is_composite); CeedChk(ierr); + CeedCall(CeedOperatorIsComposite(op, &is_composite)); if (is_composite) { CeedInt num_suboperators; - ierr = CeedOperatorGetNumSub(op, &num_suboperators); CeedChk(ierr); + CeedCall(CeedOperatorGetNumSub(op, &num_suboperators)); CeedOperator *sub_operators; - ierr = CeedOperatorGetSubList(op, &sub_operators); CeedChk(ierr); + CeedCall(CeedOperatorGetSubList(op, &sub_operators)); // FLOPs for each suboperator for (CeedInt i = 0; i < num_suboperators; i++) { 
CeedSize suboperator_flops; - ierr = CeedOperatorGetFlopsEstimate(sub_operators[i], &suboperator_flops); - CeedChk(ierr); + CeedCall(CeedOperatorGetFlopsEstimate(sub_operators[i], &suboperator_flops)); *flops += suboperator_flops; } } else { - CeedInt num_input_fields, num_output_fields; + CeedInt num_input_fields, num_output_fields; CeedOperatorField *input_fields, *output_fields; - ierr = CeedOperatorGetFields(op, &num_input_fields, &input_fields, - &num_output_fields, &output_fields); CeedChk(ierr); + CeedCall(CeedOperatorGetFields(op, &num_input_fields, &input_fields, &num_output_fields, &output_fields)); CeedInt num_elem = 0; - ierr = CeedOperatorGetNumElements(op, &num_elem); CeedChk(ierr); + CeedCall(CeedOperatorGetNumElements(op, &num_elem)); // Input FLOPs for (CeedInt i = 0; i < num_input_fields; i++) { if (input_fields[i]->vec == CEED_VECTOR_ACTIVE) { CeedSize restr_flops, basis_flops; - ierr = CeedElemRestrictionGetFlopsEstimate(input_fields[i]->elem_restr, - CEED_NOTRANSPOSE, &restr_flops); CeedChk(ierr); + CeedCall(CeedElemRestrictionGetFlopsEstimate(input_fields[i]->elem_restr, CEED_NOTRANSPOSE, &restr_flops)); *flops += restr_flops; - ierr = CeedBasisGetFlopsEstimate(input_fields[i]->basis, CEED_NOTRANSPOSE, - op->qf->input_fields[i]->eval_mode, &basis_flops); CeedChk(ierr); + CeedCall(CeedBasisGetFlopsEstimate(input_fields[i]->basis, CEED_NOTRANSPOSE, op->qf->input_fields[i]->eval_mode, &basis_flops)); *flops += basis_flops * num_elem; } } // QF FLOPs - CeedInt num_qpts; + CeedInt num_qpts; CeedSize qf_flops; - ierr = CeedOperatorGetNumQuadraturePoints(op, &num_qpts); CeedChk(ierr); - ierr = CeedQFunctionGetFlopsEstimate(op->qf, &qf_flops); CeedChk(ierr); + CeedCall(CeedOperatorGetNumQuadraturePoints(op, &num_qpts)); + CeedCall(CeedQFunctionGetFlopsEstimate(op->qf, &qf_flops)); *flops += num_elem * num_qpts * qf_flops; // Output FLOPs for (CeedInt i = 0; i < num_output_fields; i++) { if (output_fields[i]->vec == CEED_VECTOR_ACTIVE) { CeedSize 
restr_flops, basis_flops; - ierr = CeedElemRestrictionGetFlopsEstimate(output_fields[i]->elem_restr, - CEED_TRANSPOSE, &restr_flops); CeedChk(ierr); + CeedCall(CeedElemRestrictionGetFlopsEstimate(output_fields[i]->elem_restr, CEED_TRANSPOSE, &restr_flops)); *flops += restr_flops; - ierr = CeedBasisGetFlopsEstimate(output_fields[i]->basis, CEED_TRANSPOSE, - op->qf->output_fields[i]->eval_mode, &basis_flops); CeedChk(ierr); + CeedCall(CeedBasisGetFlopsEstimate(output_fields[i]->basis, CEED_TRANSPOSE, op->qf->output_fields[i]->eval_mode, &basis_flops)); *flops += basis_flops * num_elem; } } @@ -1405,16 +1291,13 @@ int CeedOperatorGetFlopsEstimate(CeedOperator op, CeedSize *flops) { @ref User **/ -int CeedOperatorContextGetFieldLabel(CeedOperator op, - const char *field_name, - CeedContextFieldLabel *field_label) { - int ierr; - +int CeedOperatorContextGetFieldLabel(CeedOperator op, const char *field_name, CeedContextFieldLabel *field_label) { bool is_composite; - ierr = CeedOperatorIsComposite(op, &is_composite); CeedChk(ierr); + CeedCall(CeedOperatorIsComposite(op, &is_composite)); + if (is_composite) { // Check if composite label already created - for (CeedInt i=0; i<op->num_context_labels; i++) { + for (CeedInt i = 0; i < op->num_context_labels; i++) { if (!strcmp(op->context_labels[i]->name, field_name)) { *field_label = op->context_labels[i]; return CEED_ERROR_SUCCESS; @@ -1422,47 +1305,39 @@ int CeedOperatorContextGetFieldLabel(CeedOperator op, } // Create composite label if needed - CeedInt num_sub; - CeedOperator *sub_operators; + CeedInt num_sub; + CeedOperator *sub_operators; CeedContextFieldLabel new_field_label; - ierr = CeedCalloc(1, &new_field_label); CeedChk(ierr); - ierr = CeedOperatorGetNumSub(op, &num_sub); CeedChk(ierr); - ierr = CeedOperatorGetSubList(op, &sub_operators); CeedChk(ierr); - ierr = CeedCalloc(num_sub, &new_field_label->sub_labels); CeedChk(ierr); + CeedCall(CeedCalloc(1, &new_field_label)); + CeedCall(CeedOperatorGetNumSub(op, &num_sub));
+ CeedCall(CeedOperatorGetSubList(op, &sub_operators)); + CeedCall(CeedCalloc(num_sub, &new_field_label->sub_labels)); new_field_label->num_sub_labels = num_sub; bool label_found = false; - for (CeedInt i=0; i<num_sub; i++) { + for (CeedInt i = 0; i < num_sub; i++) { if (sub_operators[i]->qf->ctx) { CeedContextFieldLabel new_field_label_i; - ierr = CeedQFunctionContextGetFieldLabel(sub_operators[i]->qf->ctx, field_name, - &new_field_label_i); CeedChk(ierr); + CeedCall(CeedQFunctionContextGetFieldLabel(sub_operators[i]->qf->ctx, field_name, &new_field_label_i)); if (new_field_label_i) { - label_found = true; + label_found = true; new_field_label->sub_labels[i] = new_field_label_i; - new_field_label->name = new_field_label_i->name; - new_field_label->description = new_field_label_i->description; - if (new_field_label->type && - new_field_label->type != new_field_label_i->type) { + new_field_label->name = new_field_label_i->name; + new_field_label->description = new_field_label_i->description; + if (new_field_label->type && new_field_label->type != new_field_label_i->type) { // LCOV_EXCL_START - ierr = CeedFree(&new_field_label); CeedChk(ierr); - return CeedError(op->ceed, CEED_ERROR_INCOMPATIBLE, - "Incompatible field types on sub-operator contexts. " - "%s != %s", - CeedContextFieldTypes[new_field_label->type], - CeedContextFieldTypes[new_field_label_i->type]); + CeedCall(CeedFree(&new_field_label)); + return CeedError(op->ceed, CEED_ERROR_INCOMPATIBLE, "Incompatible field types on sub-operator contexts.
%s != %s", + CeedContextFieldTypes[new_field_label->type], CeedContextFieldTypes[new_field_label_i->type]); // LCOV_EXCL_STOP } else { new_field_label->type = new_field_label_i->type; } - if (new_field_label->num_values != 0 && - new_field_label->num_values != new_field_label_i->num_values) { + if (new_field_label->num_values != 0 && new_field_label->num_values != new_field_label_i->num_values) { // LCOV_EXCL_START - ierr = CeedFree(&new_field_label); CeedChk(ierr); - return CeedError(op->ceed, CEED_ERROR_INCOMPATIBLE, - "Incompatible field number of values on sub-operator" - " contexts. %ld != %ld", + CeedCall(CeedFree(&new_field_label)); + return CeedError(op->ceed, CEED_ERROR_INCOMPATIBLE, "Incompatible field number of values on sub-operator contexts. %ld != %ld", new_field_label->num_values, new_field_label_i->num_values); // LCOV_EXCL_STOP } else { @@ -1473,22 +1348,21 @@ int CeedOperatorContextGetFieldLabel(CeedOperator op, } if (!label_found) { // LCOV_EXCL_START - ierr = CeedFree(&new_field_label->sub_labels); CeedChk(ierr); - ierr = CeedFree(&new_field_label); CeedChk(ierr); + CeedCall(CeedFree(&new_field_label->sub_labels)); + CeedCall(CeedFree(&new_field_label)); *field_label = NULL; // LCOV_EXCL_STOP } else { // Move new composite label to operator if (op->num_context_labels == 0) { - ierr = CeedCalloc(1, &op->context_labels); CeedChk(ierr); + CeedCall(CeedCalloc(1, &op->context_labels)); op->max_context_labels = 1; } else if (op->num_context_labels == op->max_context_labels) { - ierr = CeedRealloc(2*op->num_context_labels, &op->context_labels); - CeedChk(ierr); + CeedCall(CeedRealloc(2 * op->num_context_labels, &op->context_labels)); op->max_context_labels *= 2; } op->context_labels[op->num_context_labels] = new_field_label; - *field_label = new_field_label; + *field_label = new_field_label; op->num_context_labels++; } @@ -1511,11 +1385,8 @@ int CeedOperatorContextGetFieldLabel(CeedOperator op, @ref User **/ -int 
CeedOperatorContextSetDouble(CeedOperator op, - CeedContextFieldLabel field_label, - double *values) { - return CeedOperatorContextSetGeneric(op, field_label, CEED_CONTEXT_FIELD_DOUBLE, - values); +int CeedOperatorContextSetDouble(CeedOperator op, CeedContextFieldLabel field_label, double *values) { + return CeedOperatorContextSetGeneric(op, field_label, CEED_CONTEXT_FIELD_DOUBLE, values); } /** @@ -1531,11 +1402,8 @@ int CeedOperatorContextSetDouble(CeedOperator op, @ref User **/ -int CeedOperatorContextSetInt32(CeedOperator op, - CeedContextFieldLabel field_label, - int *values) { - return CeedOperatorContextSetGeneric(op, field_label, CEED_CONTEXT_FIELD_INT32, - values); +int CeedOperatorContextSetInt32(CeedOperator op, CeedContextFieldLabel field_label, int *values) { + return CeedOperatorContextSetGeneric(op, field_label, CEED_CONTEXT_FIELD_INT32, values); } /** @@ -1561,55 +1429,51 @@ int CeedOperatorContextSetInt32(CeedOperator op, @ref User **/ -int CeedOperatorApply(CeedOperator op, CeedVector in, CeedVector out, - CeedRequest *request) { - int ierr; - ierr = CeedOperatorCheckReady(op); CeedChk(ierr); +int CeedOperatorApply(CeedOperator op, CeedVector in, CeedVector out, CeedRequest *request) { + CeedCall(CeedOperatorCheckReady(op)); - if (op->num_elem) { + if (op->num_elem) { // Standard Operator if (op->Apply) { - ierr = op->Apply(op, in, out, request); CeedChk(ierr); + CeedCall(op->Apply(op, in, out, request)); } else { // Zero all output vectors CeedQFunction qf = op->qf; - for (CeedInt i=0; inum_output_fields; i++) { + for (CeedInt i = 0; i < qf->num_output_fields; i++) { CeedVector vec = op->output_fields[i]->vec; - if (vec == CEED_VECTOR_ACTIVE) - vec = out; + if (vec == CEED_VECTOR_ACTIVE) vec = out; if (vec != CEED_VECTOR_NONE) { - ierr = CeedVectorSetValue(vec, 0.0); CeedChk(ierr); + CeedCall(CeedVectorSetValue(vec, 0.0)); } } // Apply - ierr = op->ApplyAdd(op, in, out, request); CeedChk(ierr); + CeedCall(op->ApplyAdd(op, in, out, request)); } } 
else if (op->is_composite) { // Composite Operator if (op->ApplyComposite) { - ierr = op->ApplyComposite(op, in, out, request); CeedChk(ierr); + CeedCall(op->ApplyComposite(op, in, out, request)); } else { CeedInt num_suboperators; - ierr = CeedOperatorGetNumSub(op, &num_suboperators); CeedChk(ierr); + CeedCall(CeedOperatorGetNumSub(op, &num_suboperators)); CeedOperator *sub_operators; - ierr = CeedOperatorGetSubList(op, &sub_operators); CeedChk(ierr); + CeedCall(CeedOperatorGetSubList(op, &sub_operators)); // Zero all output vectors if (out != CEED_VECTOR_NONE) { - ierr = CeedVectorSetValue(out, 0.0); CeedChk(ierr); + CeedCall(CeedVectorSetValue(out, 0.0)); } - for (CeedInt i=0; iqf->num_output_fields; j++) { + for (CeedInt i = 0; i < num_suboperators; i++) { + for (CeedInt j = 0; j < sub_operators[i]->qf->num_output_fields; j++) { CeedVector vec = sub_operators[i]->output_fields[j]->vec; if (vec != CEED_VECTOR_ACTIVE && vec != CEED_VECTOR_NONE) { - ierr = CeedVectorSetValue(vec, 0.0); CeedChk(ierr); + CeedCall(CeedVectorSetValue(vec, 0.0)); } } } // Apply - for (CeedInt i=0; inum_suboperators; i++) { - ierr = CeedOperatorApplyAdd(op->sub_operators[i], in, out, request); - CeedChk(ierr); + for (CeedInt i = 0; i < op->num_suboperators; i++) { + CeedCall(CeedOperatorApplyAdd(op->sub_operators[i], in, out, request)); } } } @@ -1635,27 +1499,24 @@ int CeedOperatorApply(CeedOperator op, CeedVector in, CeedVector out, @ref User **/ -int CeedOperatorApplyAdd(CeedOperator op, CeedVector in, CeedVector out, - CeedRequest *request) { - int ierr; - ierr = CeedOperatorCheckReady(op); CeedChk(ierr); +int CeedOperatorApplyAdd(CeedOperator op, CeedVector in, CeedVector out, CeedRequest *request) { + CeedCall(CeedOperatorCheckReady(op)); - if (op->num_elem) { + if (op->num_elem) { // Standard Operator - ierr = op->ApplyAdd(op, in, out, request); CeedChk(ierr); + CeedCall(op->ApplyAdd(op, in, out, request)); } else if (op->is_composite) { // Composite Operator if 
(op->ApplyAddComposite) { - ierr = op->ApplyAddComposite(op, in, out, request); CeedChk(ierr); + CeedCall(op->ApplyAddComposite(op, in, out, request)); } else { CeedInt num_suboperators; - ierr = CeedOperatorGetNumSub(op, &num_suboperators); CeedChk(ierr); + CeedCall(CeedOperatorGetNumSub(op, &num_suboperators)); CeedOperator *sub_operators; - ierr = CeedOperatorGetSubList(op, &sub_operators); CeedChk(ierr); + CeedCall(CeedOperatorGetSubList(op, &sub_operators)); - for (CeedInt i=0; iref_count > 0) return CEED_ERROR_SUCCESS; - if ((*op)->Destroy) { - ierr = (*op)->Destroy(*op); CeedChk(ierr); - } - ierr = CeedDestroy(&(*op)->ceed); CeedChk(ierr); + if ((*op)->Destroy) CeedCall((*op)->Destroy(*op)); + CeedCall(CeedDestroy(&(*op)->ceed)); // Free fields - for (CeedInt i=0; i<(*op)->num_fields; i++) + for (CeedInt i = 0; i < (*op)->num_fields; i++) { if ((*op)->input_fields[i]) { if ((*op)->input_fields[i]->elem_restr != CEED_ELEMRESTRICTION_NONE) { - ierr = CeedElemRestrictionDestroy(&(*op)->input_fields[i]->elem_restr); - CeedChk(ierr); + CeedCall(CeedElemRestrictionDestroy(&(*op)->input_fields[i]->elem_restr)); } if ((*op)->input_fields[i]->basis != CEED_BASIS_COLLOCATED) { - ierr = CeedBasisDestroy(&(*op)->input_fields[i]->basis); CeedChk(ierr); + CeedCall(CeedBasisDestroy(&(*op)->input_fields[i]->basis)); } - if ((*op)->input_fields[i]->vec != CEED_VECTOR_ACTIVE && - (*op)->input_fields[i]->vec != CEED_VECTOR_NONE ) { - ierr = CeedVectorDestroy(&(*op)->input_fields[i]->vec); CeedChk(ierr); + if ((*op)->input_fields[i]->vec != CEED_VECTOR_ACTIVE && (*op)->input_fields[i]->vec != CEED_VECTOR_NONE) { + CeedCall(CeedVectorDestroy(&(*op)->input_fields[i]->vec)); } - ierr = CeedFree(&(*op)->input_fields[i]->field_name); CeedChk(ierr); - ierr = CeedFree(&(*op)->input_fields[i]); CeedChk(ierr); + CeedCall(CeedFree(&(*op)->input_fields[i]->field_name)); + CeedCall(CeedFree(&(*op)->input_fields[i])); } - for (CeedInt i=0; i<(*op)->num_fields; i++) + } + for (CeedInt i = 0; 
i < (*op)->num_fields; i++) { if ((*op)->output_fields[i]) { - ierr = CeedElemRestrictionDestroy(&(*op)->output_fields[i]->elem_restr); - CeedChk(ierr); + CeedCall(CeedElemRestrictionDestroy(&(*op)->output_fields[i]->elem_restr)); if ((*op)->output_fields[i]->basis != CEED_BASIS_COLLOCATED) { - ierr = CeedBasisDestroy(&(*op)->output_fields[i]->basis); CeedChk(ierr); + CeedCall(CeedBasisDestroy(&(*op)->output_fields[i]->basis)); } - if ((*op)->output_fields[i]->vec != CEED_VECTOR_ACTIVE && - (*op)->output_fields[i]->vec != CEED_VECTOR_NONE ) { - ierr = CeedVectorDestroy(&(*op)->output_fields[i]->vec); CeedChk(ierr); + if ((*op)->output_fields[i]->vec != CEED_VECTOR_ACTIVE && (*op)->output_fields[i]->vec != CEED_VECTOR_NONE) { + CeedCall(CeedVectorDestroy(&(*op)->output_fields[i]->vec)); } - ierr = CeedFree(&(*op)->output_fields[i]->field_name); CeedChk(ierr); - ierr = CeedFree(&(*op)->output_fields[i]); CeedChk(ierr); + CeedCall(CeedFree(&(*op)->output_fields[i]->field_name)); + CeedCall(CeedFree(&(*op)->output_fields[i])); } + } // Destroy sub_operators - for (CeedInt i=0; i<(*op)->num_suboperators; i++) + for (CeedInt i = 0; i < (*op)->num_suboperators; i++) { if ((*op)->sub_operators[i]) { - ierr = CeedOperatorDestroy(&(*op)->sub_operators[i]); CeedChk(ierr); + CeedCall(CeedOperatorDestroy(&(*op)->sub_operators[i])); } - ierr = CeedQFunctionDestroy(&(*op)->qf); CeedChk(ierr); - ierr = CeedQFunctionDestroy(&(*op)->dqf); CeedChk(ierr); - ierr = CeedQFunctionDestroy(&(*op)->dqfT); CeedChk(ierr); + } + CeedCall(CeedQFunctionDestroy(&(*op)->qf)); + CeedCall(CeedQFunctionDestroy(&(*op)->dqf)); + CeedCall(CeedQFunctionDestroy(&(*op)->dqfT)); // Destroy any composite labels - for (CeedInt i=0; i<(*op)->num_context_labels; i++) { - ierr = CeedFree(&(*op)->context_labels[i]->sub_labels); CeedChk(ierr); - ierr = CeedFree(&(*op)->context_labels[i]); CeedChk(ierr); + for (CeedInt i = 0; i < (*op)->num_context_labels; i++) { + 
CeedCall(CeedFree(&(*op)->context_labels[i]->sub_labels)); + CeedCall(CeedFree(&(*op)->context_labels[i])); } - ierr = CeedFree(&(*op)->context_labels); CeedChk(ierr); + CeedCall(CeedFree(&(*op)->context_labels)); // Destroy fallback - ierr = CeedOperatorDestroy(&(*op)->op_fallback); CeedChk(ierr); + CeedCall(CeedOperatorDestroy(&(*op)->op_fallback)); // Destroy assembly data - ierr = CeedQFunctionAssemblyDataDestroy(&(*op)->qf_assembled); CeedChk(ierr); - ierr = CeedOperatorAssemblyDataDestroy(&(*op)->op_assembled); CeedChk(ierr); - - ierr = CeedFree(&(*op)->input_fields); CeedChk(ierr); - ierr = CeedFree(&(*op)->output_fields); CeedChk(ierr); - ierr = CeedFree(&(*op)->sub_operators); CeedChk(ierr); - ierr = CeedFree(&(*op)->name); CeedChk(ierr); - ierr = CeedFree(op); CeedChk(ierr); + CeedCall(CeedQFunctionAssemblyDataDestroy(&(*op)->qf_assembled)); + CeedCall(CeedOperatorAssemblyDataDestroy(&(*op)->op_assembled)); + + CeedCall(CeedFree(&(*op)->input_fields)); + CeedCall(CeedFree(&(*op)->output_fields)); + CeedCall(CeedFree(&(*op)->sub_operators)); + CeedCall(CeedFree(&(*op)->name)); + CeedCall(CeedFree(op)); return CEED_ERROR_SUCCESS; } diff --git a/interface/ceed-preconditioning.c b/interface/ceed-preconditioning.c index 0d0e5be8c3..07ec9790cd 100644 --- a/interface/ceed-preconditioning.c +++ b/interface/ceed-preconditioning.c @@ -5,11 +5,11 @@ // // This file is part of CEED: http://github.com/ceed -#include -#include +#include #include +#include +#include #include -#include #include #include #include @@ -35,10 +35,7 @@ @ref Developer **/ -static int CeedQFunctionCreateFallback(Ceed fallback_ceed, CeedQFunction qf, - CeedQFunction *qf_fallback) { - int ierr; - +static int CeedQFunctionCreateFallback(Ceed fallback_ceed, CeedQFunction qf, CeedQFunction *qf_fallback) { // Check if NULL qf passed in if (!qf) return CEED_ERROR_SUCCESS; @@ -47,37 +44,29 @@ static int CeedQFunctionCreateFallback(Ceed fallback_ceed, CeedQFunction qf, char *source_path_with_name = ""; 
if (qf->source_path) { - size_t path_len = strlen(qf->source_path), - name_len = strlen(qf->kernel_name); - ierr = CeedCalloc(path_len + name_len + 2, &source_path_with_name); - CeedChk(ierr); + size_t path_len = strlen(qf->source_path), name_len = strlen(qf->kernel_name); + CeedCall(CeedCalloc(path_len + name_len + 2, &source_path_with_name)); memcpy(source_path_with_name, qf->source_path, path_len); memcpy(&source_path_with_name[path_len], ":", 1); memcpy(&source_path_with_name[path_len + 1], qf->kernel_name, name_len); } else { - ierr = CeedCalloc(1, &source_path_with_name); CeedChk(ierr); + CeedCall(CeedCalloc(1, &source_path_with_name)); } - ierr = CeedQFunctionCreateInterior(fallback_ceed, qf->vec_length, - qf->function, source_path_with_name, - qf_fallback); CeedChk(ierr); + CeedCall(CeedQFunctionCreateInterior(fallback_ceed, qf->vec_length, qf->function, source_path_with_name, qf_fallback)); { CeedQFunctionContext ctx; - ierr = CeedQFunctionGetContext(qf, &ctx); CeedChk(ierr); - ierr = CeedQFunctionSetContext(*qf_fallback, ctx); CeedChk(ierr); + CeedCall(CeedQFunctionGetContext(qf, &ctx)); + CeedCall(CeedQFunctionSetContext(*qf_fallback, ctx)); } for (CeedInt i = 0; i < qf->num_input_fields; i++) { - ierr = CeedQFunctionAddInput(*qf_fallback, qf->input_fields[i]->field_name, - qf->input_fields[i]->size, - qf->input_fields[i]->eval_mode); CeedChk(ierr); + CeedCall(CeedQFunctionAddInput(*qf_fallback, qf->input_fields[i]->field_name, qf->input_fields[i]->size, qf->input_fields[i]->eval_mode)); } for (CeedInt i = 0; i < qf->num_output_fields; i++) { - ierr = CeedQFunctionAddOutput(*qf_fallback, qf->output_fields[i]->field_name, - qf->output_fields[i]->size, - qf->output_fields[i]->eval_mode); CeedChk(ierr); + CeedCall(CeedQFunctionAddOutput(*qf_fallback, qf->output_fields[i]->field_name, qf->output_fields[i]->size, qf->output_fields[i]->eval_mode)); } - ierr = CeedFree(&source_path_with_name); CeedChk(ierr); + CeedCall(CeedFree(&source_path_with_name)); return 
CEED_ERROR_SUCCESS; } @@ -93,14 +82,13 @@ static int CeedQFunctionCreateFallback(Ceed fallback_ceed, CeedQFunction qf, @ref Developer **/ static int CeedOperatorCreateFallback(CeedOperator op) { - int ierr; Ceed ceed_fallback; // Check not already created if (op->op_fallback) return CEED_ERROR_SUCCESS; // Fallback Ceed - ierr = CeedGetOperatorFallbackCeed(op->ceed, &ceed_fallback); CeedChk(ierr); + CeedCall(CeedGetOperatorFallbackCeed(op->ceed, &ceed_fallback)); if (!ceed_fallback) return CEED_ERROR_SUCCESS; CeedDebug256(op->ceed, 1, "---------- CeedOperator Fallback ----------\n"); @@ -109,50 +97,38 @@ static int CeedOperatorCreateFallback(CeedOperator op) { // Clone Op CeedOperator op_fallback; if (op->is_composite) { - ierr = CeedCompositeOperatorCreate(ceed_fallback, &op_fallback); - CeedChk(ierr); + CeedCall(CeedCompositeOperatorCreate(ceed_fallback, &op_fallback)); for (CeedInt i = 0; i < op->num_suboperators; i++) { CeedOperator op_sub_fallback; - ierr = CeedOperatorGetFallback(op->sub_operators[i], &op_sub_fallback); - CeedChk(ierr); - ierr = CeedCompositeOperatorAddSub(op_fallback, op_sub_fallback); CeedChk(ierr); + CeedCall(CeedOperatorGetFallback(op->sub_operators[i], &op_sub_fallback)); + CeedCall(CeedCompositeOperatorAddSub(op_fallback, op_sub_fallback)); } } else { CeedQFunction qf_fallback = NULL, dqf_fallback = NULL, dqfT_fallback = NULL; - ierr = CeedQFunctionCreateFallback(ceed_fallback, op->qf, &qf_fallback); - CeedChk(ierr); - ierr = CeedQFunctionCreateFallback(ceed_fallback, op->dqf, &dqf_fallback); - CeedChk(ierr); - ierr = CeedQFunctionCreateFallback(ceed_fallback, op->dqfT, &dqfT_fallback); - CeedChk(ierr); - ierr = CeedOperatorCreate(ceed_fallback, qf_fallback, dqf_fallback, - dqfT_fallback, &op_fallback); CeedChk(ierr); + CeedCall(CeedQFunctionCreateFallback(ceed_fallback, op->qf, &qf_fallback)); + CeedCall(CeedQFunctionCreateFallback(ceed_fallback, op->dqf, &dqf_fallback)); + CeedCall(CeedQFunctionCreateFallback(ceed_fallback, op->dqfT, 
&dqfT_fallback)); + CeedCall(CeedOperatorCreate(ceed_fallback, qf_fallback, dqf_fallback, dqfT_fallback, &op_fallback)); for (CeedInt i = 0; i < op->qf->num_input_fields; i++) { - ierr = CeedOperatorSetField(op_fallback, op->input_fields[i]->field_name, - op->input_fields[i]->elem_restr, - op->input_fields[i]->basis, - op->input_fields[i]->vec); CeedChk(ierr); + CeedCall(CeedOperatorSetField(op_fallback, op->input_fields[i]->field_name, op->input_fields[i]->elem_restr, op->input_fields[i]->basis, + op->input_fields[i]->vec)); } for (CeedInt i = 0; i < op->qf->num_output_fields; i++) { - ierr = CeedOperatorSetField(op_fallback, op->output_fields[i]->field_name, - op->output_fields[i]->elem_restr, - op->output_fields[i]->basis, - op->output_fields[i]->vec); CeedChk(ierr); + CeedCall(CeedOperatorSetField(op_fallback, op->output_fields[i]->field_name, op->output_fields[i]->elem_restr, op->output_fields[i]->basis, + op->output_fields[i]->vec)); } - ierr = CeedQFunctionAssemblyDataReferenceCopy(op->qf_assembled, - &op_fallback->qf_assembled); CeedChk(ierr); + CeedCall(CeedQFunctionAssemblyDataReferenceCopy(op->qf_assembled, &op_fallback->qf_assembled)); if (op_fallback->num_qpts == 0) { - ierr = CeedOperatorSetNumQuadraturePoints(op_fallback, op->num_qpts); - CeedChk(ierr); + CeedCall(CeedOperatorSetNumQuadraturePoints(op_fallback, op->num_qpts)); } // Cleanup - ierr = CeedQFunctionDestroy(&qf_fallback); CeedChk(ierr); - ierr = CeedQFunctionDestroy(&dqf_fallback); CeedChk(ierr); - ierr = CeedQFunctionDestroy(&dqfT_fallback); CeedChk(ierr); + CeedCall(CeedQFunctionDestroy(&qf_fallback)); + CeedCall(CeedQFunctionDestroy(&dqf_fallback)); + CeedCall(CeedQFunctionDestroy(&dqfT_fallback)); } - ierr = CeedOperatorSetName(op_fallback, op->name); CeedChk(ierr); - ierr = CeedOperatorCheckReady(op_fallback); CeedChk(ierr); + CeedCall(CeedOperatorSetName(op_fallback, op->name)); + CeedCall(CeedOperatorCheckReady(op_fallback)); op->op_fallback = op_fallback; return 
CEED_ERROR_SUCCESS; @@ -169,28 +145,25 @@ static int CeedOperatorCreateFallback(CeedOperator op) { @ref Developer **/ int CeedOperatorGetFallback(CeedOperator op, CeedOperator *op_fallback) { - int ierr; - // Create if needed if (!op->op_fallback) { - ierr = CeedOperatorCreateFallback(op); CeedChk(ierr); + CeedCall(CeedOperatorCreateFallback(op)); } if (op->op_fallback) { bool is_debug; - ierr = CeedIsDebug(op->ceed, &is_debug); CeedChk(ierr); + CeedCall(CeedIsDebug(op->ceed, &is_debug)); if (is_debug) { - Ceed ceed_fallback; + Ceed ceed_fallback; const char *resource, *resource_fallback; - ierr = CeedGetOperatorFallbackCeed(op->ceed, &ceed_fallback); CeedChk(ierr); - ierr = CeedGetResource(op->ceed, &resource); CeedChk(ierr); - ierr = CeedGetResource(ceed_fallback, &resource_fallback); CeedChk(ierr); + CeedCall(CeedGetOperatorFallbackCeed(op->ceed, &ceed_fallback)); + CeedCall(CeedGetResource(op->ceed, &resource)); + CeedCall(CeedGetResource(ceed_fallback, &resource_fallback)); CeedDebug256(op->ceed, 1, "---------- CeedOperator Fallback ----------\n"); - CeedDebug(op->ceed, - "Falling back from %s operator at address %ld to %s operator at address %ld\n", - resource, op, resource_fallback, op->op_fallback); + CeedDebug(op->ceed, "Falling back from %s operator at address %ld to %s operator at address %ld\n", resource, op, resource_fallback, + op->op_fallback); } } *op_fallback = op->op_fallback; @@ -209,23 +182,22 @@ int CeedOperatorGetFallback(CeedOperator op, CeedOperator *op_fallback) { @ref Developer **/ -static inline void CeedOperatorGetBasisPointer(CeedEvalMode eval_mode, - const CeedScalar *identity, const CeedScalar *interp, - const CeedScalar *grad, const CeedScalar **basis_ptr) { +static inline void CeedOperatorGetBasisPointer(CeedEvalMode eval_mode, const CeedScalar *identity, const CeedScalar *interp, const CeedScalar *grad, + const CeedScalar **basis_ptr) { switch (eval_mode) { - case CEED_EVAL_NONE: - *basis_ptr = identity; - break; - case 
CEED_EVAL_INTERP: - *basis_ptr = interp; - break; - case CEED_EVAL_GRAD: - *basis_ptr = grad; - break; - case CEED_EVAL_WEIGHT: - case CEED_EVAL_DIV: - case CEED_EVAL_CURL: - break; // Caught by QF Assembly + case CEED_EVAL_NONE: + *basis_ptr = identity; + break; + case CEED_EVAL_INTERP: + *basis_ptr = interp; + break; + case CEED_EVAL_GRAD: + *basis_ptr = grad; + break; + case CEED_EVAL_WEIGHT: + case CEED_EVAL_DIV: + case CEED_EVAL_CURL: + break; // Caught by QF Assembly } assert(*basis_ptr != NULL); } @@ -241,41 +213,33 @@ static inline void CeedOperatorGetBasisPointer(CeedEvalMode eval_mode, @ref Developer **/ -static int CeedOperatorCreateActivePointBlockRestriction( - CeedElemRestriction rstr, - CeedElemRestriction *pointblock_rstr) { - int ierr; +static int CeedOperatorCreateActivePointBlockRestriction(CeedElemRestriction rstr, CeedElemRestriction *pointblock_rstr) { Ceed ceed; - ierr = CeedElemRestrictionGetCeed(rstr, &ceed); CeedChk(ierr); + CeedCall(CeedElemRestrictionGetCeed(rstr, &ceed)); const CeedInt *offsets; - ierr = CeedElemRestrictionGetOffsets(rstr, CEED_MEM_HOST, &offsets); - CeedChk(ierr); + CeedCall(CeedElemRestrictionGetOffsets(rstr, CEED_MEM_HOST, &offsets)); // Expand offsets - CeedInt num_elem, num_comp, elem_size, comp_stride, *pointblock_offsets; + CeedInt num_elem, num_comp, elem_size, comp_stride, *pointblock_offsets; CeedSize l_size; - ierr = CeedElemRestrictionGetNumElements(rstr, &num_elem); CeedChk(ierr); - ierr = CeedElemRestrictionGetNumComponents(rstr, &num_comp); CeedChk(ierr); - ierr = CeedElemRestrictionGetElementSize(rstr, &elem_size); CeedChk(ierr); - ierr = CeedElemRestrictionGetCompStride(rstr, &comp_stride); CeedChk(ierr); - ierr = CeedElemRestrictionGetLVectorSize(rstr, &l_size); CeedChk(ierr); + CeedCall(CeedElemRestrictionGetNumElements(rstr, &num_elem)); + CeedCall(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); + CeedCall(CeedElemRestrictionGetElementSize(rstr, &elem_size)); + 
CeedCall(CeedElemRestrictionGetCompStride(rstr, &comp_stride)); + CeedCall(CeedElemRestrictionGetLVectorSize(rstr, &l_size)); CeedInt shift = num_comp; - if (comp_stride != 1) - shift *= num_comp; - ierr = CeedCalloc(num_elem*elem_size, &pointblock_offsets); - CeedChk(ierr); - for (CeedInt i = 0; i < num_elem*elem_size; i++) { - pointblock_offsets[i] = offsets[i]*shift; + if (comp_stride != 1) shift *= num_comp; + CeedCall(CeedCalloc(num_elem * elem_size, &pointblock_offsets)); + for (CeedInt i = 0; i < num_elem * elem_size; i++) { + pointblock_offsets[i] = offsets[i] * shift; } // Create new restriction - ierr = CeedElemRestrictionCreate(ceed, num_elem, elem_size, num_comp*num_comp, - 1, l_size * num_comp, CEED_MEM_HOST, - CEED_OWN_POINTER, pointblock_offsets, pointblock_rstr); - CeedChk(ierr); + CeedCall(CeedElemRestrictionCreate(ceed, num_elem, elem_size, num_comp * num_comp, 1, l_size * num_comp, CEED_MEM_HOST, CEED_OWN_POINTER, + pointblock_offsets, pointblock_rstr)); // Cleanup - ierr = CeedElemRestrictionRestoreOffsets(rstr, &offsets); CeedChk(ierr); + CeedCall(CeedElemRestrictionRestoreOffsets(rstr, &offsets)); return CEED_ERROR_SUCCESS; } @@ -293,146 +257,131 @@ static int CeedOperatorCreateActivePointBlockRestriction( @ref Developer **/ -static inline int CeedSingleOperatorAssembleAddDiagonal_Core(CeedOperator op, - CeedRequest *request, const bool is_pointblock, CeedVector assembled) { - int ierr; +static inline int CeedSingleOperatorAssembleAddDiagonal_Core(CeedOperator op, CeedRequest *request, const bool is_pointblock, CeedVector assembled) { Ceed ceed; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); + CeedCall(CeedOperatorGetCeed(op, &ceed)); // Assemble QFunction CeedQFunction qf; - ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr); + CeedCall(CeedOperatorGetQFunction(op, &qf)); CeedInt num_input_fields, num_output_fields; - ierr= CeedQFunctionGetNumArgs(qf, &num_input_fields, &num_output_fields); - CeedChk(ierr); - CeedVector 
assembled_qf; + CeedCall(CeedQFunctionGetNumArgs(qf, &num_input_fields, &num_output_fields)); + CeedVector assembled_qf; CeedElemRestriction rstr; - ierr = CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, - &rstr, request); CeedChk(ierr); + CeedCall(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &rstr, request)); CeedInt layout[3]; - ierr = CeedElemRestrictionGetELayout(rstr, &layout); CeedChk(ierr); - ierr = CeedElemRestrictionDestroy(&rstr); CeedChk(ierr); + CeedCall(CeedElemRestrictionGetELayout(rstr, &layout)); + CeedCall(CeedElemRestrictionDestroy(&rstr)); // Get assembly data CeedOperatorAssemblyData data; - ierr = CeedOperatorGetOperatorAssemblyData(op, &data); CeedChk(ierr); + CeedCall(CeedOperatorGetOperatorAssemblyData(op, &data)); const CeedEvalMode *eval_mode_in, *eval_mode_out; - CeedInt num_eval_mode_in, num_eval_mode_out; - ierr = CeedOperatorAssemblyDataGetEvalModes(data, &num_eval_mode_in, - &eval_mode_in, &num_eval_mode_out, &eval_mode_out); CeedChk(ierr); + CeedInt num_eval_mode_in, num_eval_mode_out; + CeedCall(CeedOperatorAssemblyDataGetEvalModes(data, &num_eval_mode_in, &eval_mode_in, &num_eval_mode_out, &eval_mode_out)); CeedBasis basis_in, basis_out; - ierr = CeedOperatorAssemblyDataGetBases(data, &basis_in, NULL, &basis_out, - NULL); CeedChk(ierr); + CeedCall(CeedOperatorAssemblyDataGetBases(data, &basis_in, NULL, &basis_out, NULL)); CeedInt num_comp; - ierr = CeedBasisGetNumComponents(basis_in, &num_comp); CeedChk(ierr); + CeedCall(CeedBasisGetNumComponents(basis_in, &num_comp)); // Assemble point block diagonal restriction, if needed CeedElemRestriction diag_rstr; - ierr = CeedOperatorGetActiveElemRestriction(op, &diag_rstr); CeedChk(ierr); + CeedCall(CeedOperatorGetActiveElemRestriction(op, &diag_rstr)); if (is_pointblock) { CeedElemRestriction point_block_rstr; - ierr = CeedOperatorCreateActivePointBlockRestriction(diag_rstr, - &point_block_rstr); - CeedChk(ierr); + 
CeedCall(CeedOperatorCreateActivePointBlockRestriction(diag_rstr, &point_block_rstr)); diag_rstr = point_block_rstr; } // Create diagonal vector CeedVector elem_diag; - ierr = CeedElemRestrictionCreateVector(diag_rstr, NULL, &elem_diag); - CeedChk(ierr); + CeedCall(CeedElemRestrictionCreateVector(diag_rstr, NULL, &elem_diag)); // Assemble element operator diagonals - CeedScalar *elem_diag_array; + CeedScalar *elem_diag_array; const CeedScalar *assembled_qf_array; - ierr = CeedVectorSetValue(elem_diag, 0.0); CeedChk(ierr); - ierr = CeedVectorGetArray(elem_diag, CEED_MEM_HOST, &elem_diag_array); - CeedChk(ierr); - ierr = CeedVectorGetArrayRead(assembled_qf, CEED_MEM_HOST, &assembled_qf_array); - CeedChk(ierr); + CeedCall(CeedVectorSetValue(elem_diag, 0.0)); + CeedCall(CeedVectorGetArray(elem_diag, CEED_MEM_HOST, &elem_diag_array)); + CeedCall(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_HOST, &assembled_qf_array)); CeedInt num_elem, num_nodes, num_qpts; - ierr = CeedElemRestrictionGetNumElements(diag_rstr, &num_elem); CeedChk(ierr); - ierr = CeedBasisGetNumNodes(basis_in, &num_nodes); CeedChk(ierr); - ierr = CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts); CeedChk(ierr); + CeedCall(CeedElemRestrictionGetNumElements(diag_rstr, &num_elem)); + CeedCall(CeedBasisGetNumNodes(basis_in, &num_nodes)); + CeedCall(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts)); // Basis matrices const CeedScalar *interp_in, *interp_out, *grad_in, *grad_out; - CeedScalar *identity = NULL; - bool has_eval_none = false; + CeedScalar *identity = NULL; + bool has_eval_none = false; for (CeedInt i = 0; i < num_eval_mode_in; i++) { has_eval_none = has_eval_none || (eval_mode_in[i] == CEED_EVAL_NONE); } - for (CeedInt i = 0; iceed; - if (op->is_composite) + if (op->is_composite) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, - "Composite operator not supported"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Composite operator not 
supported"); + // LCOV_EXCL_STOP + } CeedSize num_nodes; - ierr = CeedOperatorGetActiveVectorLengths(op, &num_nodes, NULL); CeedChk(ierr); + CeedCall(CeedOperatorGetActiveVectorLengths(op, &num_nodes, NULL)); CeedElemRestriction rstr_in; - ierr = CeedOperatorGetActiveElemRestriction(op, &rstr_in); CeedChk(ierr); + CeedCall(CeedOperatorGetActiveElemRestriction(op, &rstr_in)); CeedInt num_elem, elem_size, num_comp; - ierr = CeedElemRestrictionGetNumElements(rstr_in, &num_elem); CeedChk(ierr); - ierr = CeedElemRestrictionGetElementSize(rstr_in, &elem_size); CeedChk(ierr); - ierr = CeedElemRestrictionGetNumComponents(rstr_in, &num_comp); CeedChk(ierr); + CeedCall(CeedElemRestrictionGetNumElements(rstr_in, &num_elem)); + CeedCall(CeedElemRestrictionGetElementSize(rstr_in, &elem_size)); + CeedCall(CeedElemRestrictionGetNumComponents(rstr_in, &num_comp)); CeedInt layout_er[3]; - ierr = CeedElemRestrictionGetELayout(rstr_in, &layout_er); CeedChk(ierr); + CeedCall(CeedElemRestrictionGetELayout(rstr_in, &layout_er)); - CeedInt local_num_entries = elem_size*num_comp * elem_size*num_comp * num_elem; + CeedInt local_num_entries = elem_size * num_comp * elem_size * num_comp * num_elem; // Determine elem_dof relation CeedVector index_vec; - ierr = CeedVectorCreate(ceed, num_nodes, &index_vec); CeedChk(ierr); + CeedCall(CeedVectorCreate(ceed, num_nodes, &index_vec)); CeedScalar *array; - ierr = CeedVectorGetArrayWrite(index_vec, CEED_MEM_HOST, &array); CeedChk(ierr); + CeedCall(CeedVectorGetArrayWrite(index_vec, CEED_MEM_HOST, &array)); for (CeedInt i = 0; i < num_nodes; i++) array[i] = i; - ierr = CeedVectorRestoreArray(index_vec, &array); CeedChk(ierr); + CeedCall(CeedVectorRestoreArray(index_vec, &array)); CeedVector elem_dof; - ierr = CeedVectorCreate(ceed, num_elem * elem_size * num_comp, &elem_dof); - CeedChk(ierr); - ierr = CeedVectorSetValue(elem_dof, 0.0); CeedChk(ierr); - CeedElemRestrictionApply(rstr_in, CEED_NOTRANSPOSE, index_vec, - elem_dof, CEED_REQUEST_IMMEDIATE); 
CeedChk(ierr); + CeedCall(CeedVectorCreate(ceed, num_elem * elem_size * num_comp, &elem_dof)); + CeedCall(CeedVectorSetValue(elem_dof, 0.0)); + CeedCall(CeedElemRestrictionApply(rstr_in, CEED_NOTRANSPOSE, index_vec, elem_dof, CEED_REQUEST_IMMEDIATE)); const CeedScalar *elem_dof_a; - ierr = CeedVectorGetArrayRead(elem_dof, CEED_MEM_HOST, &elem_dof_a); - CeedChk(ierr); - ierr = CeedVectorDestroy(&index_vec); CeedChk(ierr); + CeedCall(CeedVectorGetArrayRead(elem_dof, CEED_MEM_HOST, &elem_dof_a)); + CeedCall(CeedVectorDestroy(&index_vec)); // Determine i, j locations for element matrices CeedInt count = 0; @@ -532,10 +472,8 @@ static int CeedSingleOperatorAssembleSymbolic(CeedOperator op, CeedInt offset, for (CeedInt comp_out = 0; comp_out < num_comp; comp_out++) { for (CeedInt i = 0; i < elem_size; i++) { for (CeedInt j = 0; j < elem_size; j++) { - const CeedInt elem_dof_index_row = i*layout_er[0] + - (comp_out)*layout_er[1] + e*layout_er[2]; - const CeedInt elem_dof_index_col = j*layout_er[0] + - comp_in*layout_er[1] + e*layout_er[2]; + const CeedInt elem_dof_index_row = i * layout_er[0] + (comp_out)*layout_er[1] + e * layout_er[2]; + const CeedInt elem_dof_index_col = j * layout_er[0] + comp_in * layout_er[1] + e * layout_er[2]; const CeedInt row = elem_dof_a[elem_dof_index_row]; const CeedInt col = elem_dof_a[elem_dof_index_col]; @@ -548,12 +486,13 @@ static int CeedSingleOperatorAssembleSymbolic(CeedOperator op, CeedInt offset, } } } - if (count != local_num_entries) + if (count != local_num_entries) { // LCOV_EXCL_START return CeedError(ceed, CEED_ERROR_MAJOR, "Error computing assembled entries"); - // LCOV_EXCL_STOP - ierr = CeedVectorRestoreArrayRead(elem_dof, &elem_dof_a); CeedChk(ierr); - ierr = CeedVectorDestroy(&elem_dof); CeedChk(ierr); + // LCOV_EXCL_STOP + } + CeedCall(CeedVectorRestoreArrayRead(elem_dof, &elem_dof_a)); + CeedCall(CeedVectorDestroy(&elem_dof)); return CEED_ERROR_SUCCESS; } @@ -571,100 +510,89 @@ static int 
CeedSingleOperatorAssembleSymbolic(CeedOperator op, CeedInt offset, @ref Developer **/ -static int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, - CeedVector values) { - int ierr; +static int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, CeedVector values) { Ceed ceed = op->ceed; - if (op->is_composite) + if (op->is_composite) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, - "Composite operator not supported"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Composite operator not supported"); + // LCOV_EXCL_STOP + } if (op->num_elem == 0) return CEED_ERROR_SUCCESS; if (op->LinearAssembleSingle) { // Backend version - ierr = op->LinearAssembleSingle(op, offset, values); CeedChk(ierr); + CeedCall(op->LinearAssembleSingle(op, offset, values)); return CEED_ERROR_SUCCESS; } else { // Operator fallback CeedOperator op_fallback; - ierr = CeedOperatorGetFallback(op, &op_fallback); CeedChk(ierr); + CeedCall(CeedOperatorGetFallback(op, &op_fallback)); if (op_fallback) { - ierr = CeedSingleOperatorAssemble(op_fallback, offset, values); - CeedChk(ierr); + CeedCall(CeedSingleOperatorAssemble(op_fallback, offset, values)); return CEED_ERROR_SUCCESS; } } // Assemble QFunction CeedQFunction qf; - ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr); - CeedVector assembled_qf; + CeedCall(CeedOperatorGetQFunction(op, &qf)); + CeedVector assembled_qf; CeedElemRestriction rstr_q; - ierr = CeedOperatorLinearAssembleQFunctionBuildOrUpdate( - op, &assembled_qf, &rstr_q, CEED_REQUEST_IMMEDIATE); CeedChk(ierr); + CeedCall(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &rstr_q, CEED_REQUEST_IMMEDIATE)); CeedSize qf_length; - ierr = CeedVectorGetLength(assembled_qf, &qf_length); CeedChk(ierr); + CeedCall(CeedVectorGetLength(assembled_qf, &qf_length)); - CeedInt num_input_fields, num_output_fields; + CeedInt num_input_fields, num_output_fields; CeedOperatorField *input_fields; CeedOperatorField 
*output_fields; - ierr = CeedOperatorGetFields(op, &num_input_fields, &input_fields, - &num_output_fields, &output_fields); CeedChk(ierr); + CeedCall(CeedOperatorGetFields(op, &num_input_fields, &input_fields, &num_output_fields, &output_fields)); // Get assembly data CeedOperatorAssemblyData data; - ierr = CeedOperatorGetOperatorAssemblyData(op, &data); CeedChk(ierr); + CeedCall(CeedOperatorGetOperatorAssemblyData(op, &data)); const CeedEvalMode *eval_mode_in, *eval_mode_out; - CeedInt num_eval_mode_in, num_eval_mode_out; - ierr = CeedOperatorAssemblyDataGetEvalModes(data, &num_eval_mode_in, - &eval_mode_in, &num_eval_mode_out, &eval_mode_out); CeedChk(ierr); + CeedInt num_eval_mode_in, num_eval_mode_out; + CeedCall(CeedOperatorAssemblyDataGetEvalModes(data, &num_eval_mode_in, &eval_mode_in, &num_eval_mode_out, &eval_mode_out)); CeedBasis basis_in, basis_out; - ierr = CeedOperatorAssemblyDataGetBases(data, &basis_in, NULL, &basis_out, - NULL); CeedChk(ierr); + CeedCall(CeedOperatorAssemblyDataGetBases(data, &basis_in, NULL, &basis_out, NULL)); - if (num_eval_mode_in == 0 || num_eval_mode_out == 0) + if (num_eval_mode_in == 0 || num_eval_mode_out == 0) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, - "Cannot assemble operator with out inputs/outputs"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Cannot assemble operator with out inputs/outputs"); + // LCOV_EXCL_STOP + } CeedElemRestriction active_rstr; - CeedInt num_elem, elem_size, num_qpts, num_comp; - ierr = CeedOperatorGetActiveElemRestriction(op, &active_rstr); CeedChk(ierr); - ierr = CeedElemRestrictionGetNumElements(active_rstr, &num_elem); CeedChk(ierr); - ierr = CeedElemRestrictionGetElementSize(active_rstr, &elem_size); - CeedChk(ierr); - ierr = CeedElemRestrictionGetNumComponents(active_rstr, &num_comp); - CeedChk(ierr); - ierr = CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts); CeedChk(ierr); + CeedInt num_elem, elem_size, num_qpts, num_comp; + 
CeedCall(CeedOperatorGetActiveElemRestriction(op, &active_rstr)); + CeedCall(CeedElemRestrictionGetNumElements(active_rstr, &num_elem)); + CeedCall(CeedElemRestrictionGetElementSize(active_rstr, &elem_size)); + CeedCall(CeedElemRestrictionGetNumComponents(active_rstr, &num_comp)); + CeedCall(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts)); - CeedInt local_num_entries = elem_size*num_comp * elem_size*num_comp * num_elem; + CeedInt local_num_entries = elem_size * num_comp * elem_size * num_comp * num_elem; // loop over elements and put in data structure const CeedScalar *interp_in, *grad_in; - ierr = CeedBasisGetInterp(basis_in, &interp_in); CeedChk(ierr); - ierr = CeedBasisGetGrad(basis_in, &grad_in); CeedChk(ierr); + CeedCall(CeedBasisGetInterp(basis_in, &interp_in)); + CeedCall(CeedBasisGetGrad(basis_in, &grad_in)); const CeedScalar *assembled_qf_array; - ierr = CeedVectorGetArrayRead(assembled_qf, CEED_MEM_HOST, &assembled_qf_array); - CeedChk(ierr); + CeedCall(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_HOST, &assembled_qf_array)); CeedInt layout_qf[3]; - ierr = CeedElemRestrictionGetELayout(rstr_q, &layout_qf); CeedChk(ierr); - ierr = CeedElemRestrictionDestroy(&rstr_q); CeedChk(ierr); + CeedCall(CeedElemRestrictionGetELayout(rstr_q, &layout_qf)); + CeedCall(CeedElemRestrictionDestroy(&rstr_q)); // we store B_mat_in, B_mat_out, BTD, elem_mat in row-major order const CeedScalar *B_mat_in, *B_mat_out; - ierr = CeedOperatorAssemblyDataGetBases(data, NULL, &B_mat_in, NULL, - &B_mat_out); CeedChk(ierr); - CeedScalar BTD_mat[elem_size * num_qpts*num_eval_mode_in]; - CeedScalar elem_mat[elem_size * elem_size]; - CeedInt count = 0; + CeedCall(CeedOperatorAssemblyDataGetBases(data, NULL, &B_mat_in, NULL, &B_mat_out)); + CeedScalar BTD_mat[elem_size * num_qpts * num_eval_mode_in]; + CeedScalar elem_mat[elem_size * elem_size]; + CeedInt count = 0; CeedScalar *vals; - ierr = CeedVectorGetArrayWrite(values, CEED_MEM_HOST, &vals); CeedChk(ierr); + 
CeedCall(CeedVectorGetArrayWrite(values, CEED_MEM_HOST, &vals)); for (CeedInt e = 0; e < num_elem; e++) { for (CeedInt comp_in = 0; comp_in < num_comp; comp_in++) { for (CeedInt comp_out = 0; comp_out < num_comp; comp_out++) { @@ -672,15 +600,12 @@ static int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, for (CeedInt n = 0; n < elem_size; n++) { for (CeedInt q = 0; q < num_qpts; q++) { for (CeedInt e_in = 0; e_in < num_eval_mode_in; e_in++) { - const CeedInt btd_index = n*(num_qpts*num_eval_mode_in) + - (num_eval_mode_in*q + e_in); - CeedScalar sum = 0.0; + const CeedInt btd_index = n * (num_qpts * num_eval_mode_in) + (num_eval_mode_in * q + e_in); + CeedScalar sum = 0.0; for (CeedInt e_out = 0; e_out < num_eval_mode_out; e_out++) { - const CeedInt b_out_index = (num_eval_mode_out*q + e_out)*elem_size + n; - const CeedInt eval_mode_index = ((e_in*num_comp+comp_in)*num_eval_mode_out - +e_out)*num_comp + comp_out; - const CeedInt qf_index = q*layout_qf[0] + eval_mode_index*layout_qf[1] + - e*layout_qf[2]; + const CeedInt b_out_index = (num_eval_mode_out * q + e_out) * elem_size + n; + const CeedInt eval_mode_index = ((e_in * num_comp + comp_in) * num_eval_mode_out + e_out) * num_comp + comp_out; + const CeedInt qf_index = q * layout_qf[0] + eval_mode_index * layout_qf[1] + e * layout_qf[2]; sum += B_mat_out[b_out_index] * assembled_qf_array[qf_index]; } BTD_mat[btd_index] = sum; @@ -688,28 +613,27 @@ static int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, } } // form element matrix itself (for each block component) - ierr = CeedMatrixMatrixMultiply(ceed, BTD_mat, B_mat_in, elem_mat, elem_size, - elem_size, num_qpts*num_eval_mode_in); CeedChk(ierr); + CeedCall(CeedMatrixMatrixMultiply(ceed, BTD_mat, B_mat_in, elem_mat, elem_size, elem_size, num_qpts * num_eval_mode_in)); // put element matrix in coordinate data structure for (CeedInt i = 0; i < elem_size; i++) { for (CeedInt j = 0; j < elem_size; j++) { - vals[offset + count] = 
elem_mat[i*elem_size + j]; + vals[offset + count] = elem_mat[i * elem_size + j]; count++; } } } } } - if (count != local_num_entries) + if (count != local_num_entries) { // LCOV_EXCL_START return CeedError(ceed, CEED_ERROR_MAJOR, "Error computing entries"); - // LCOV_EXCL_STOP - ierr = CeedVectorRestoreArray(values, &vals); CeedChk(ierr); + // LCOV_EXCL_STOP + } + CeedCall(CeedVectorRestoreArray(values, &vals)); - ierr = CeedVectorRestoreArrayRead(assembled_qf, &assembled_qf_array); - CeedChk(ierr); - ierr = CeedVectorDestroy(&assembled_qf); CeedChk(ierr); + CeedCall(CeedVectorRestoreArrayRead(assembled_qf, &assembled_qf_array)); + CeedCall(CeedVectorDestroy(&assembled_qf)); return CEED_ERROR_SUCCESS; } @@ -724,22 +648,20 @@ static int CeedSingleOperatorAssemble(CeedOperator op, CeedInt offset, @ref Utility **/ -static int CeedSingleOperatorAssemblyCountEntries(CeedOperator op, - CeedInt *num_entries) { - int ierr; +static int CeedSingleOperatorAssemblyCountEntries(CeedOperator op, CeedInt *num_entries) { CeedElemRestriction rstr; - CeedInt num_elem, elem_size, num_comp; + CeedInt num_elem, elem_size, num_comp; - if (op->is_composite) + if (op->is_composite) { // LCOV_EXCL_START - return CeedError(op->ceed, CEED_ERROR_UNSUPPORTED, - "Composite operator not supported"); - // LCOV_EXCL_STOP - ierr = CeedOperatorGetActiveElemRestriction(op, &rstr); CeedChk(ierr); - ierr = CeedElemRestrictionGetNumElements(rstr, &num_elem); CeedChk(ierr); - ierr = CeedElemRestrictionGetElementSize(rstr, &elem_size); CeedChk(ierr); - ierr = CeedElemRestrictionGetNumComponents(rstr, &num_comp); CeedChk(ierr); - *num_entries = elem_size*num_comp * elem_size*num_comp * num_elem; + return CeedError(op->ceed, CEED_ERROR_UNSUPPORTED, "Composite operator not supported"); + // LCOV_EXCL_STOP + } + CeedCall(CeedOperatorGetActiveElemRestriction(op, &rstr)); + CeedCall(CeedElemRestrictionGetNumElements(rstr, &num_elem)); + CeedCall(CeedElemRestrictionGetElementSize(rstr, &elem_size)); + 
CeedCall(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); + *num_entries = elem_size * num_comp * elem_size * num_comp * num_elem; return CEED_ERROR_SUCCESS; } @@ -761,167 +683,123 @@ static int CeedSingleOperatorAssemblyCountEntries(CeedOperator op, @ref Developer **/ -static int CeedSingleOperatorMultigridLevel(CeedOperator op_fine, - CeedVector p_mult_fine, CeedElemRestriction rstr_coarse, CeedBasis basis_coarse, - CeedBasis basis_c_to_f, CeedOperator *op_coarse, CeedOperator *op_prolong, - CeedOperator *op_restrict) { - int ierr; +static int CeedSingleOperatorMultigridLevel(CeedOperator op_fine, CeedVector p_mult_fine, CeedElemRestriction rstr_coarse, CeedBasis basis_coarse, + CeedBasis basis_c_to_f, CeedOperator *op_coarse, CeedOperator *op_prolong, CeedOperator *op_restrict) { Ceed ceed; - ierr = CeedOperatorGetCeed(op_fine, &ceed); CeedChk(ierr); + CeedCall(CeedOperatorGetCeed(op_fine, &ceed)); // Check for composite operator bool is_composite; - ierr = CeedOperatorIsComposite(op_fine, &is_composite); CeedChk(ierr); - if (is_composite) + CeedCall(CeedOperatorIsComposite(op_fine, &is_composite)); + if (is_composite) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, - "Automatic multigrid setup for composite operators not supported"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Automatic multigrid setup for composite operators not supported"); + // LCOV_EXCL_STOP + } // Coarse Grid - ierr = CeedOperatorCreate(ceed, op_fine->qf, op_fine->dqf, op_fine->dqfT, - op_coarse); CeedChk(ierr); + CeedCall(CeedOperatorCreate(ceed, op_fine->qf, op_fine->dqf, op_fine->dqfT, op_coarse)); CeedElemRestriction rstr_fine = NULL; // -- Clone input fields for (CeedInt i = 0; i < op_fine->qf->num_input_fields; i++) { if (op_fine->input_fields[i]->vec == CEED_VECTOR_ACTIVE) { rstr_fine = op_fine->input_fields[i]->elem_restr; - ierr = CeedOperatorSetField(*op_coarse, op_fine->input_fields[i]->field_name, - rstr_coarse, 
basis_coarse, CEED_VECTOR_ACTIVE); - CeedChk(ierr); + CeedCall(CeedOperatorSetField(*op_coarse, op_fine->input_fields[i]->field_name, rstr_coarse, basis_coarse, CEED_VECTOR_ACTIVE)); } else { - ierr = CeedOperatorSetField(*op_coarse, op_fine->input_fields[i]->field_name, - op_fine->input_fields[i]->elem_restr, - op_fine->input_fields[i]->basis, - op_fine->input_fields[i]->vec); CeedChk(ierr); + CeedCall(CeedOperatorSetField(*op_coarse, op_fine->input_fields[i]->field_name, op_fine->input_fields[i]->elem_restr, + op_fine->input_fields[i]->basis, op_fine->input_fields[i]->vec)); } } // -- Clone output fields for (CeedInt i = 0; i < op_fine->qf->num_output_fields; i++) { if (op_fine->output_fields[i]->vec == CEED_VECTOR_ACTIVE) { - ierr = CeedOperatorSetField(*op_coarse, op_fine->output_fields[i]->field_name, - rstr_coarse, basis_coarse, CEED_VECTOR_ACTIVE); - CeedChk(ierr); + CeedCall(CeedOperatorSetField(*op_coarse, op_fine->output_fields[i]->field_name, rstr_coarse, basis_coarse, CEED_VECTOR_ACTIVE)); } else { - ierr = CeedOperatorSetField(*op_coarse, op_fine->output_fields[i]->field_name, - op_fine->output_fields[i]->elem_restr, - op_fine->output_fields[i]->basis, - op_fine->output_fields[i]->vec); CeedChk(ierr); + CeedCall(CeedOperatorSetField(*op_coarse, op_fine->output_fields[i]->field_name, op_fine->output_fields[i]->elem_restr, + op_fine->output_fields[i]->basis, op_fine->output_fields[i]->vec)); } } // -- Clone QFunctionAssemblyData - ierr = CeedQFunctionAssemblyDataReferenceCopy(op_fine->qf_assembled, - &(*op_coarse)->qf_assembled); CeedChk(ierr); + CeedCall(CeedQFunctionAssemblyDataReferenceCopy(op_fine->qf_assembled, &(*op_coarse)->qf_assembled)); // Multiplicity vector CeedVector mult_vec, mult_e_vec; - ierr = CeedElemRestrictionCreateVector(rstr_fine, &mult_vec, &mult_e_vec); - CeedChk(ierr); - ierr = CeedVectorSetValue(mult_e_vec, 0.0); CeedChk(ierr); - ierr = CeedElemRestrictionApply(rstr_fine, CEED_NOTRANSPOSE, p_mult_fine, - mult_e_vec, 
CEED_REQUEST_IMMEDIATE); CeedChk(ierr); - ierr = CeedVectorSetValue(mult_vec, 0.0); CeedChk(ierr); - ierr = CeedElemRestrictionApply(rstr_fine, CEED_TRANSPOSE, mult_e_vec, mult_vec, - CEED_REQUEST_IMMEDIATE); CeedChk(ierr); - ierr = CeedVectorDestroy(&mult_e_vec); CeedChk(ierr); - ierr = CeedVectorReciprocal(mult_vec); CeedChk(ierr); + CeedCall(CeedElemRestrictionCreateVector(rstr_fine, &mult_vec, &mult_e_vec)); + CeedCall(CeedVectorSetValue(mult_e_vec, 0.0)); + CeedCall(CeedElemRestrictionApply(rstr_fine, CEED_NOTRANSPOSE, p_mult_fine, mult_e_vec, CEED_REQUEST_IMMEDIATE)); + CeedCall(CeedVectorSetValue(mult_vec, 0.0)); + CeedCall(CeedElemRestrictionApply(rstr_fine, CEED_TRANSPOSE, mult_e_vec, mult_vec, CEED_REQUEST_IMMEDIATE)); + CeedCall(CeedVectorDestroy(&mult_e_vec)); + CeedCall(CeedVectorReciprocal(mult_vec)); // Restriction CeedInt num_comp; - ierr = CeedBasisGetNumComponents(basis_coarse, &num_comp); CeedChk(ierr); + CeedCall(CeedBasisGetNumComponents(basis_coarse, &num_comp)); CeedQFunction qf_restrict; - ierr = CeedQFunctionCreateInteriorByName(ceed, "Scale", &qf_restrict); - CeedChk(ierr); + CeedCall(CeedQFunctionCreateInteriorByName(ceed, "Scale", &qf_restrict)); CeedInt *num_comp_r_data; - ierr = CeedCalloc(1, &num_comp_r_data); CeedChk(ierr); + CeedCall(CeedCalloc(1, &num_comp_r_data)); num_comp_r_data[0] = num_comp; CeedQFunctionContext ctx_r; - ierr = CeedQFunctionContextCreate(ceed, &ctx_r); CeedChk(ierr); - ierr = CeedQFunctionContextSetData(ctx_r, CEED_MEM_HOST, CEED_OWN_POINTER, - sizeof(*num_comp_r_data), num_comp_r_data); - CeedChk(ierr); - ierr = CeedQFunctionSetContext(qf_restrict, ctx_r); CeedChk(ierr); - ierr = CeedQFunctionContextDestroy(&ctx_r); CeedChk(ierr); - ierr = CeedQFunctionAddInput(qf_restrict, "input", num_comp, CEED_EVAL_NONE); - CeedChk(ierr); - ierr = CeedQFunctionAddInput(qf_restrict, "scale", num_comp, CEED_EVAL_NONE); - CeedChk(ierr); - ierr = CeedQFunctionAddOutput(qf_restrict, "output", num_comp, - CEED_EVAL_INTERP); 
CeedChk(ierr); - ierr = CeedQFunctionSetUserFlopsEstimate(qf_restrict, num_comp); CeedChk(ierr); - - ierr = CeedOperatorCreate(ceed, qf_restrict, CEED_QFUNCTION_NONE, - CEED_QFUNCTION_NONE, op_restrict); - CeedChk(ierr); - ierr = CeedOperatorSetField(*op_restrict, "input", rstr_fine, - CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE); - CeedChk(ierr); - ierr = CeedOperatorSetField(*op_restrict, "scale", rstr_fine, - CEED_BASIS_COLLOCATED, mult_vec); - CeedChk(ierr); - ierr = CeedOperatorSetField(*op_restrict, "output", rstr_coarse, basis_c_to_f, - CEED_VECTOR_ACTIVE); CeedChk(ierr); + CeedCall(CeedQFunctionContextCreate(ceed, &ctx_r)); + CeedCall(CeedQFunctionContextSetData(ctx_r, CEED_MEM_HOST, CEED_OWN_POINTER, sizeof(*num_comp_r_data), num_comp_r_data)); + CeedCall(CeedQFunctionSetContext(qf_restrict, ctx_r)); + CeedCall(CeedQFunctionContextDestroy(&ctx_r)); + CeedCall(CeedQFunctionAddInput(qf_restrict, "input", num_comp, CEED_EVAL_NONE)); + CeedCall(CeedQFunctionAddInput(qf_restrict, "scale", num_comp, CEED_EVAL_NONE)); + CeedCall(CeedQFunctionAddOutput(qf_restrict, "output", num_comp, CEED_EVAL_INTERP)); + CeedCall(CeedQFunctionSetUserFlopsEstimate(qf_restrict, num_comp)); + + CeedCall(CeedOperatorCreate(ceed, qf_restrict, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, op_restrict)); + CeedCall(CeedOperatorSetField(*op_restrict, "input", rstr_fine, CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE)); + CeedCall(CeedOperatorSetField(*op_restrict, "scale", rstr_fine, CEED_BASIS_COLLOCATED, mult_vec)); + CeedCall(CeedOperatorSetField(*op_restrict, "output", rstr_coarse, basis_c_to_f, CEED_VECTOR_ACTIVE)); // Prolongation CeedQFunction qf_prolong; - ierr = CeedQFunctionCreateInteriorByName(ceed, "Scale", &qf_prolong); - CeedChk(ierr); + CeedCall(CeedQFunctionCreateInteriorByName(ceed, "Scale", &qf_prolong)); CeedInt *num_comp_p_data; - ierr = CeedCalloc(1, &num_comp_p_data); CeedChk(ierr); + CeedCall(CeedCalloc(1, &num_comp_p_data)); num_comp_p_data[0] = num_comp; 
CeedQFunctionContext ctx_p; - ierr = CeedQFunctionContextCreate(ceed, &ctx_p); CeedChk(ierr); - ierr = CeedQFunctionContextSetData(ctx_p, CEED_MEM_HOST, CEED_OWN_POINTER, - sizeof(*num_comp_p_data), num_comp_p_data); - CeedChk(ierr); - ierr = CeedQFunctionSetContext(qf_prolong, ctx_p); CeedChk(ierr); - ierr = CeedQFunctionContextDestroy(&ctx_p); CeedChk(ierr); - ierr = CeedQFunctionAddInput(qf_prolong, "input", num_comp, CEED_EVAL_INTERP); - CeedChk(ierr); - ierr = CeedQFunctionAddInput(qf_prolong, "scale", num_comp, CEED_EVAL_NONE); - CeedChk(ierr); - ierr = CeedQFunctionAddOutput(qf_prolong, "output", num_comp, CEED_EVAL_NONE); - CeedChk(ierr); - ierr = CeedQFunctionSetUserFlopsEstimate(qf_prolong, num_comp); CeedChk(ierr); - - ierr = CeedOperatorCreate(ceed, qf_prolong, CEED_QFUNCTION_NONE, - CEED_QFUNCTION_NONE, op_prolong); - CeedChk(ierr); - ierr = CeedOperatorSetField(*op_prolong, "input", rstr_coarse, basis_c_to_f, - CEED_VECTOR_ACTIVE); CeedChk(ierr); - ierr = CeedOperatorSetField(*op_prolong, "scale", rstr_fine, - CEED_BASIS_COLLOCATED, mult_vec); - CeedChk(ierr); - ierr = CeedOperatorSetField(*op_prolong, "output", rstr_fine, - CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE); - CeedChk(ierr); + CeedCall(CeedQFunctionContextCreate(ceed, &ctx_p)); + CeedCall(CeedQFunctionContextSetData(ctx_p, CEED_MEM_HOST, CEED_OWN_POINTER, sizeof(*num_comp_p_data), num_comp_p_data)); + CeedCall(CeedQFunctionSetContext(qf_prolong, ctx_p)); + CeedCall(CeedQFunctionContextDestroy(&ctx_p)); + CeedCall(CeedQFunctionAddInput(qf_prolong, "input", num_comp, CEED_EVAL_INTERP)); + CeedCall(CeedQFunctionAddInput(qf_prolong, "scale", num_comp, CEED_EVAL_NONE)); + CeedCall(CeedQFunctionAddOutput(qf_prolong, "output", num_comp, CEED_EVAL_NONE)); + CeedCall(CeedQFunctionSetUserFlopsEstimate(qf_prolong, num_comp)); + + CeedCall(CeedOperatorCreate(ceed, qf_prolong, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, op_prolong)); + CeedCall(CeedOperatorSetField(*op_prolong, "input", rstr_coarse, 
basis_c_to_f, CEED_VECTOR_ACTIVE)); + CeedCall(CeedOperatorSetField(*op_prolong, "scale", rstr_fine, CEED_BASIS_COLLOCATED, mult_vec)); + CeedCall(CeedOperatorSetField(*op_prolong, "output", rstr_fine, CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE)); // Clone name - bool has_name = op_fine->name; + bool has_name = op_fine->name; size_t name_len = op_fine->name ? strlen(op_fine->name) : 0; - ierr = CeedOperatorSetName(*op_coarse, op_fine->name); CeedChk(ierr); + CeedCall(CeedOperatorSetName(*op_coarse, op_fine->name)); { char *prolongation_name; - ierr = CeedCalloc(18 + name_len, &prolongation_name); CeedChk(ierr); - sprintf(prolongation_name, "prolongation%s%s", has_name ? " for " : "", - has_name ? op_fine->name : ""); - ierr = CeedOperatorSetName(*op_prolong, prolongation_name); CeedChk(ierr); - ierr = CeedFree(&prolongation_name); CeedChk(ierr); + CeedCall(CeedCalloc(18 + name_len, &prolongation_name)); + sprintf(prolongation_name, "prolongation%s%s", has_name ? " for " : "", has_name ? op_fine->name : ""); + CeedCall(CeedOperatorSetName(*op_prolong, prolongation_name)); + CeedCall(CeedFree(&prolongation_name)); } { char *restriction_name; - ierr = CeedCalloc(17 + name_len, &restriction_name); CeedChk(ierr); - sprintf(restriction_name, "restriction%s%s", has_name ? " for " : "", - has_name ? op_fine->name : ""); - ierr = CeedOperatorSetName(*op_restrict, restriction_name); CeedChk(ierr); - ierr = CeedFree(&restriction_name); CeedChk(ierr); + CeedCall(CeedCalloc(17 + name_len, &restriction_name)); + sprintf(restriction_name, "restriction%s%s", has_name ? " for " : "", has_name ? 
op_fine->name : ""); + CeedCall(CeedOperatorSetName(*op_restrict, restriction_name)); + CeedCall(CeedFree(&restriction_name)); } // Cleanup - ierr = CeedVectorDestroy(&mult_vec); CeedChk(ierr); - ierr = CeedBasisDestroy(&basis_c_to_f); CeedChk(ierr); - ierr = CeedQFunctionDestroy(&qf_restrict); CeedChk(ierr); - ierr = CeedQFunctionDestroy(&qf_prolong); CeedChk(ierr); + CeedCall(CeedVectorDestroy(&mult_vec)); + CeedCall(CeedBasisDestroy(&basis_c_to_f)); + CeedCall(CeedQFunctionDestroy(&qf_restrict)); + CeedCall(CeedQFunctionDestroy(&qf_prolong)); return CEED_ERROR_SUCCESS; } @@ -942,62 +820,54 @@ static int CeedSingleOperatorMultigridLevel(CeedOperator op_fine, @ref Developer **/ -CeedPragmaOptimizeOff -static int CeedBuildMassLaplace(const CeedScalar *interp_1d, - const CeedScalar *grad_1d, - const CeedScalar *q_weight_1d, CeedInt P_1d, - CeedInt Q_1d, CeedInt dim, - CeedScalar *mass, CeedScalar *laplace) { - - for (CeedInt i=0; i2 ? 1e-6 : 1e-4; - for (CeedInt i=0; i 2 ? 1e-6 : 1e-4; + for (CeedInt i = 0; i < P_1d; i++) laplace[i + P_1d * i] += perturbation; return CEED_ERROR_SUCCESS; } CeedPragmaOptimizeOn -/// @} + /// @} -/// ---------------------------------------------------------------------------- -/// CeedOperator Backend API -/// ---------------------------------------------------------------------------- -/// @addtogroup CeedOperatorBackend -/// @{ + /// ---------------------------------------------------------------------------- + /// CeedOperator Backend API + /// ---------------------------------------------------------------------------- + /// @addtogroup CeedOperatorBackend + /// @{ -/** - @brief Create object holding CeedQFunction assembly data for CeedOperator + /** + @brief Create object holding CeedQFunction assembly data for CeedOperator - @param[in] ceed A Ceed object where the CeedQFunctionAssemblyData will be created - @param[out] data Address of the variable where the newly created - CeedQFunctionAssemblyData will be stored + @param[in] 
ceed A Ceed object where the CeedQFunctionAssemblyData will be created + @param[out] data Address of the variable where the newly created + CeedQFunctionAssemblyData will be stored - @return An error code: 0 - success, otherwise - failure + @return An error code: 0 - success, otherwise - failure - @ref Backend -**/ -int CeedQFunctionAssemblyDataCreate(Ceed ceed, - CeedQFunctionAssemblyData *data) { - int ierr; - - ierr = CeedCalloc(1, data); CeedChk(ierr); + @ref Backend + **/ + int + CeedQFunctionAssemblyDataCreate(Ceed ceed, CeedQFunctionAssemblyData *data) { + CeedCall(CeedCalloc(1, data)); (*data)->ref_count = 1; - (*data)->ceed = ceed; - ierr = CeedReference(ceed); CeedChk(ierr); + (*data)->ceed = ceed; + CeedCall(CeedReference(ceed)); return CEED_ERROR_SUCCESS; } @@ -1026,9 +896,8 @@ int CeedQFunctionAssemblyDataReference(CeedQFunctionAssemblyData data) { @ref Backend **/ -int CeedQFunctionAssemblyDataSetReuse(CeedQFunctionAssemblyData data, - bool reuse_data) { - data->reuse_data = reuse_data; +int CeedQFunctionAssemblyDataSetReuse(CeedQFunctionAssemblyData data, bool reuse_data) { + data->reuse_data = reuse_data; data->needs_data_update = true; return CEED_ERROR_SUCCESS; } @@ -1043,8 +912,7 @@ int CeedQFunctionAssemblyDataSetReuse(CeedQFunctionAssemblyData data, @ref Backend **/ -int CeedQFunctionAssemblyDataSetUpdateNeeded(CeedQFunctionAssemblyData data, - bool needs_data_update) { +int CeedQFunctionAssemblyDataSetUpdateNeeded(CeedQFunctionAssemblyData data, bool needs_data_update) { data->needs_data_update = needs_data_update; return CEED_ERROR_SUCCESS; } @@ -1059,8 +927,7 @@ int CeedQFunctionAssemblyDataSetUpdateNeeded(CeedQFunctionAssemblyData data, @ref Backend **/ -int CeedQFunctionAssemblyDataIsUpdateNeeded(CeedQFunctionAssemblyData data, - bool *is_update_needed) { +int CeedQFunctionAssemblyDataIsUpdateNeeded(CeedQFunctionAssemblyData data, bool *is_update_needed) { *is_update_needed = !data->reuse_data || data->needs_data_update; return 
CEED_ERROR_SUCCESS; } @@ -1080,12 +947,9 @@ int CeedQFunctionAssemblyDataIsUpdateNeeded(CeedQFunctionAssemblyData data, @ref Backend **/ -int CeedQFunctionAssemblyDataReferenceCopy(CeedQFunctionAssemblyData data, - CeedQFunctionAssemblyData *data_copy) { - int ierr; - - ierr = CeedQFunctionAssemblyDataReference(data); CeedChk(ierr); - ierr = CeedQFunctionAssemblyDataDestroy(data_copy); CeedChk(ierr); +int CeedQFunctionAssemblyDataReferenceCopy(CeedQFunctionAssemblyData data, CeedQFunctionAssemblyData *data_copy) { + CeedCall(CeedQFunctionAssemblyDataReference(data)); + CeedCall(CeedQFunctionAssemblyDataDestroy(data_copy)); *data_copy = data; return CEED_ERROR_SUCCESS; } @@ -1100,8 +964,7 @@ int CeedQFunctionAssemblyDataReferenceCopy(CeedQFunctionAssemblyData data, @ref Backend **/ -int CeedQFunctionAssemblyDataIsSetup(CeedQFunctionAssemblyData data, - bool *is_setup) { +int CeedQFunctionAssemblyDataIsSetup(CeedQFunctionAssemblyData data, bool *is_setup) { *is_setup = data->is_setup; return CEED_ERROR_SUCCESS; } @@ -1117,29 +980,23 @@ int CeedQFunctionAssemblyDataIsSetup(CeedQFunctionAssemblyData data, @ref Backend **/ -int CeedQFunctionAssemblyDataSetObjects(CeedQFunctionAssemblyData data, - CeedVector vec, CeedElemRestriction rstr) { - int ierr; - - ierr = CeedVectorReferenceCopy(vec, &data->vec); CeedChk(ierr); - ierr = CeedElemRestrictionReferenceCopy(rstr, &data->rstr); CeedChk(ierr); +int CeedQFunctionAssemblyDataSetObjects(CeedQFunctionAssemblyData data, CeedVector vec, CeedElemRestriction rstr) { + CeedCall(CeedVectorReferenceCopy(vec, &data->vec)); + CeedCall(CeedElemRestrictionReferenceCopy(rstr, &data->rstr)); data->is_setup = true; return CEED_ERROR_SUCCESS; } -int CeedQFunctionAssemblyDataGetObjects(CeedQFunctionAssemblyData data, - CeedVector *vec, CeedElemRestriction *rstr) { - int ierr; - - if (!data->is_setup) +int CeedQFunctionAssemblyDataGetObjects(CeedQFunctionAssemblyData data, CeedVector *vec, CeedElemRestriction *rstr) { + if (!data->is_setup) 
{ // LCOV_EXCL_START - return CeedError(data->ceed, CEED_ERROR_INCOMPLETE, - "Internal objects not set; must call CeedQFunctionAssemblyDataSetObjects first."); - // LCOV_EXCL_STOP + return CeedError(data->ceed, CEED_ERROR_INCOMPLETE, "Internal objects not set; must call CeedQFunctionAssemblyDataSetObjects first."); + // LCOV_EXCL_STOP + } - ierr = CeedVectorReferenceCopy(data->vec, vec); CeedChk(ierr); - ierr = CeedElemRestrictionReferenceCopy(data->rstr, rstr); CeedChk(ierr); + CeedCall(CeedVectorReferenceCopy(data->vec, vec)); + CeedCall(CeedElemRestrictionReferenceCopy(data->rstr, rstr)); return CEED_ERROR_SUCCESS; } @@ -1154,15 +1011,13 @@ int CeedQFunctionAssemblyDataGetObjects(CeedQFunctionAssemblyData data, @ref Backend **/ int CeedQFunctionAssemblyDataDestroy(CeedQFunctionAssemblyData *data) { - int ierr; - if (!*data || --(*data)->ref_count > 0) return CEED_ERROR_SUCCESS; - ierr = CeedDestroy(&(*data)->ceed); CeedChk(ierr); - ierr = CeedVectorDestroy(&(*data)->vec); CeedChk(ierr); - ierr = CeedElemRestrictionDestroy(&(*data)->rstr); CeedChk(ierr); + CeedCall(CeedDestroy(&(*data)->ceed)); + CeedCall(CeedVectorDestroy(&(*data)->vec)); + CeedCall(CeedElemRestrictionDestroy(&(*data)->rstr)); - ierr = CeedFree(data); CeedChk(ierr); + CeedCall(CeedFree(data)); return CEED_ERROR_SUCCESS; } @@ -1176,14 +1031,11 @@ int CeedQFunctionAssemblyDataDestroy(CeedQFunctionAssemblyData *data) { @ref Backend **/ -int CeedOperatorGetOperatorAssemblyData(CeedOperator op, - CeedOperatorAssemblyData *data) { - int ierr; - +int CeedOperatorGetOperatorAssemblyData(CeedOperator op, CeedOperatorAssemblyData *data) { if (!op->op_assembled) { CeedOperatorAssemblyData data; - ierr = CeedOperatorAssemblyDataCreate(op->ceed, op, &data); CeedChk(ierr); + CeedCall(CeedOperatorAssemblyDataCreate(op->ceed, op, &data)); op->op_assembled = data; } *data = op->op_assembled; @@ -1203,104 +1055,95 @@ int CeedOperatorGetOperatorAssemblyData(CeedOperator op, @ref Backend **/ -int 
CeedOperatorAssemblyDataCreate(Ceed ceed, CeedOperator op, - CeedOperatorAssemblyData *data) { - int ierr; - - ierr = CeedCalloc(1, data); CeedChk(ierr); +int CeedOperatorAssemblyDataCreate(Ceed ceed, CeedOperator op, CeedOperatorAssemblyData *data) { + CeedCall(CeedCalloc(1, data)); (*data)->ceed = ceed; - ierr = CeedReference(ceed); CeedChk(ierr); + CeedCall(CeedReference(ceed)); // Build OperatorAssembly data - CeedQFunction qf; + CeedQFunction qf; CeedQFunctionField *qf_fields; - CeedOperatorField *op_fields; - CeedInt num_input_fields; - ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr); - ierr = CeedQFunctionGetFields(qf, &num_input_fields, &qf_fields, NULL, NULL); - CeedChk(ierr); - ierr = CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL); CeedChk(ierr); + CeedOperatorField *op_fields; + CeedInt num_input_fields; + CeedCall(CeedOperatorGetQFunction(op, &qf)); + CeedCall(CeedQFunctionGetFields(qf, &num_input_fields, &qf_fields, NULL, NULL)); + CeedCall(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL)); // Determine active input basis - CeedInt num_eval_mode_in = 0, dim = 1; + CeedInt num_eval_mode_in = 0, dim = 1; CeedEvalMode *eval_mode_in = NULL; - CeedBasis basis_in = NULL; - for (CeedInt i=0; inum_eval_mode_in = num_eval_mode_in; - (*data)->eval_mode_in = eval_mode_in; - ierr = CeedBasisReferenceCopy(basis_in, &(*data)->basis_in); CeedChk(ierr); + (*data)->eval_mode_in = eval_mode_in; + CeedCall(CeedBasisReferenceCopy(basis_in, &(*data)->basis_in)); // Determine active output basis CeedInt num_output_fields; - ierr = CeedQFunctionGetFields(qf, NULL, NULL, &num_output_fields, &qf_fields); - CeedChk(ierr); - ierr = CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields); CeedChk(ierr); - CeedInt num_eval_mode_out = 0; - CeedEvalMode *eval_mode_out = NULL; - CeedBasis basis_out = NULL; - for (CeedInt i=0; inum_eval_mode_out = num_eval_mode_out; - (*data)->eval_mode_out = eval_mode_out; - ierr = CeedBasisReferenceCopy(basis_out, 
&(*data)->basis_out); CeedChk(ierr); + (*data)->eval_mode_out = eval_mode_out; + CeedCall(CeedBasisReferenceCopy(basis_out, &(*data)->basis_out)); return CEED_ERROR_SUCCESS; } @@ -1318,13 +1161,12 @@ int CeedOperatorAssemblyDataCreate(Ceed ceed, CeedOperator op, @ref Backend **/ -int CeedOperatorAssemblyDataGetEvalModes(CeedOperatorAssemblyData data, - CeedInt *num_eval_mode_in, const CeedEvalMode **eval_mode_in, - CeedInt *num_eval_mode_out, const CeedEvalMode **eval_mode_out) { - if (num_eval_mode_in) *num_eval_mode_in = data->num_eval_mode_in; - if (eval_mode_in) *eval_mode_in = data->eval_mode_in; +int CeedOperatorAssemblyDataGetEvalModes(CeedOperatorAssemblyData data, CeedInt *num_eval_mode_in, const CeedEvalMode **eval_mode_in, + CeedInt *num_eval_mode_out, const CeedEvalMode **eval_mode_out) { + if (num_eval_mode_in) *num_eval_mode_in = data->num_eval_mode_in; + if (eval_mode_in) *eval_mode_in = data->eval_mode_in; if (num_eval_mode_out) *num_eval_mode_out = data->num_eval_mode_out; - if (eval_mode_out) *eval_mode_out = data->eval_mode_out; + if (eval_mode_out) *eval_mode_out = data->eval_mode_out; return CEED_ERROR_SUCCESS; } @@ -1342,47 +1184,41 @@ int CeedOperatorAssemblyDataGetEvalModes(CeedOperatorAssemblyData data, @ref Backend **/ -int CeedOperatorAssemblyDataGetBases(CeedOperatorAssemblyData data, - CeedBasis *basis_in, const CeedScalar **B_in, CeedBasis *basis_out, +int CeedOperatorAssemblyDataGetBases(CeedOperatorAssemblyData data, CeedBasis *basis_in, const CeedScalar **B_in, CeedBasis *basis_out, const CeedScalar **B_out) { - int ierr; - // Assemble B_in, B_out if needed if (B_in && !data->B_in) { - CeedInt num_qpts, elem_size; - CeedScalar *B_in, *identity = NULL; + CeedInt num_qpts, elem_size; + CeedScalar *B_in, *identity = NULL; const CeedScalar *interp_in, *grad_in; - bool has_eval_none = false; + bool has_eval_none = false; - ierr = CeedBasisGetNumQuadraturePoints(data->basis_in, &num_qpts); - CeedChk(ierr); - ierr = 
CeedBasisGetNumNodes(data->basis_in, &elem_size); CeedChk(ierr); - ierr = CeedCalloc(num_qpts * elem_size * data->num_eval_mode_in, &B_in); - CeedChk(ierr); + CeedCall(CeedBasisGetNumQuadraturePoints(data->basis_in, &num_qpts)); + CeedCall(CeedBasisGetNumNodes(data->basis_in, &elem_size)); + CeedCall(CeedCalloc(num_qpts * elem_size * data->num_eval_mode_in, &B_in)); for (CeedInt i = 0; i < data->num_eval_mode_in; i++) { has_eval_none = has_eval_none || (data->eval_mode_in[i] == CEED_EVAL_NONE); } if (has_eval_none) { - ierr = CeedCalloc(num_qpts * elem_size, &identity); CeedChk(ierr); + CeedCall(CeedCalloc(num_qpts * elem_size, &identity)); for (CeedInt i = 0; i < (elem_size < num_qpts ? elem_size : num_qpts); i++) { identity[i * elem_size + i] = 1.0; } } - ierr = CeedBasisGetInterp(data->basis_in, &interp_in); CeedChk(ierr); - ierr = CeedBasisGetGrad(data->basis_in, &grad_in); CeedChk(ierr); + CeedCall(CeedBasisGetInterp(data->basis_in, &interp_in)); + CeedCall(CeedBasisGetGrad(data->basis_in, &grad_in)); for (CeedInt q = 0; q < num_qpts; q++) { for (CeedInt n = 0; n < elem_size; n++) { CeedInt d_in = -1; for (CeedInt e_in = 0; e_in < data->num_eval_mode_in; e_in++) { - const CeedInt qq = data->num_eval_mode_in * q; - const CeedScalar *b = NULL; + const CeedInt qq = data->num_eval_mode_in * q; + const CeedScalar *b = NULL; if (data->eval_mode_in[e_in] == CEED_EVAL_GRAD) d_in++; - CeedOperatorGetBasisPointer(data->eval_mode_in[e_in], identity, - interp_in, &grad_in[d_in * num_qpts * elem_size], &b); CeedChk(ierr); - B_in[(qq + e_in)*elem_size + n] = b[q * elem_size + n]; + CeedOperatorGetBasisPointer(data->eval_mode_in[e_in], identity, interp_in, &grad_in[d_in * num_qpts * elem_size], &b); + B_in[(qq + e_in) * elem_size + n] = b[q * elem_size + n]; } } } @@ -1390,50 +1226,47 @@ int CeedOperatorAssemblyDataGetBases(CeedOperatorAssemblyData data, } if (B_out && !data->B_out) { - CeedInt num_qpts, elem_size; - CeedScalar *B_out, *identity = NULL; + CeedInt num_qpts, 
elem_size; + CeedScalar *B_out, *identity = NULL; const CeedScalar *interp_out, *grad_out; - bool has_eval_none = false; + bool has_eval_none = false; - ierr = CeedBasisGetNumQuadraturePoints(data->basis_out, &num_qpts); - CeedChk(ierr); - ierr = CeedBasisGetNumNodes(data->basis_out, &elem_size); CeedChk(ierr); - ierr = CeedCalloc(num_qpts * elem_size * data->num_eval_mode_out, &B_out); - CeedChk(ierr); + CeedCall(CeedBasisGetNumQuadraturePoints(data->basis_out, &num_qpts)); + CeedCall(CeedBasisGetNumNodes(data->basis_out, &elem_size)); + CeedCall(CeedCalloc(num_qpts * elem_size * data->num_eval_mode_out, &B_out)); for (CeedInt i = 0; i < data->num_eval_mode_out; i++) { has_eval_none = has_eval_none || (data->eval_mode_out[i] == CEED_EVAL_NONE); } if (has_eval_none) { - ierr = CeedCalloc(num_qpts * elem_size, &identity); CeedChk(ierr); + CeedCall(CeedCalloc(num_qpts * elem_size, &identity)); for (CeedInt i = 0; i < (elem_size < num_qpts ? elem_size : num_qpts); i++) { identity[i * elem_size + i] = 1.0; } } - ierr = CeedBasisGetInterp(data->basis_out, &interp_out); CeedChk(ierr); - ierr = CeedBasisGetGrad(data->basis_out, &grad_out); CeedChk(ierr); + CeedCall(CeedBasisGetInterp(data->basis_out, &interp_out)); + CeedCall(CeedBasisGetGrad(data->basis_out, &grad_out)); for (CeedInt q = 0; q < num_qpts; q++) { for (CeedInt n = 0; n < elem_size; n++) { CeedInt d_out = -1; for (CeedInt e_out = 0; e_out < data->num_eval_mode_out; e_out++) { - const CeedInt qq = data->num_eval_mode_out * q; - const CeedScalar *b = NULL; + const CeedInt qq = data->num_eval_mode_out * q; + const CeedScalar *b = NULL; if (data->eval_mode_out[e_out] == CEED_EVAL_GRAD) d_out++; - CeedOperatorGetBasisPointer(data->eval_mode_out[e_out], identity, - interp_out, &grad_out[d_out * num_qpts * elem_size], &b); CeedChk(ierr); - B_out[(qq + e_out)*elem_size + n] = b[q * elem_size + n]; + CeedOperatorGetBasisPointer(data->eval_mode_out[e_out], identity, interp_out, &grad_out[d_out * num_qpts * elem_size], 
&b); + B_out[(qq + e_out) * elem_size + n] = b[q * elem_size + n]; } } } data->B_out = B_out; } - if (basis_in) *basis_in = data->basis_in; - if (B_in) *B_in = data->B_in; + if (basis_in) *basis_in = data->basis_in; + if (B_in) *B_in = data->B_in; if (basis_out) *basis_out = data->basis_out; - if (B_out) *B_out = data->B_out; + if (B_out) *B_out = data->B_out; return CEED_ERROR_SUCCESS; } @@ -1448,19 +1281,17 @@ int CeedOperatorAssemblyDataGetBases(CeedOperatorAssemblyData data, @ref Backend **/ int CeedOperatorAssemblyDataDestroy(CeedOperatorAssemblyData *data) { - int ierr; - if (!*data) return CEED_ERROR_SUCCESS; - ierr = CeedDestroy(&(*data)->ceed); CeedChk(ierr); - ierr = CeedBasisDestroy(&(*data)->basis_in); CeedChk(ierr); - ierr = CeedBasisDestroy(&(*data)->basis_out); CeedChk(ierr); - ierr = CeedFree(&(*data)->eval_mode_in); CeedChk(ierr); - ierr = CeedFree(&(*data)->eval_mode_out); CeedChk(ierr); - ierr = CeedFree(&(*data)->B_in); CeedChk(ierr); - ierr = CeedFree(&(*data)->B_out); CeedChk(ierr); + CeedCall(CeedDestroy(&(*data)->ceed)); + CeedCall(CeedBasisDestroy(&(*data)->basis_in)); + CeedCall(CeedBasisDestroy(&(*data)->basis_out)); + CeedCall(CeedFree(&(*data)->eval_mode_in)); + CeedCall(CeedFree(&(*data)->eval_mode_out)); + CeedCall(CeedFree(&(*data)->B_in)); + CeedCall(CeedFree(&(*data)->B_out)); - ierr = CeedFree(data); CeedChk(ierr); + CeedCall(CeedFree(data)); return CEED_ERROR_SUCCESS; } @@ -1502,28 +1333,22 @@ int CeedOperatorAssemblyDataDestroy(CeedOperatorAssemblyData *data) { @ref User **/ -int CeedOperatorLinearAssembleQFunction(CeedOperator op, CeedVector *assembled, - CeedElemRestriction *rstr, - CeedRequest *request) { - int ierr; - ierr = CeedOperatorCheckReady(op); CeedChk(ierr); +int CeedOperatorLinearAssembleQFunction(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) { + CeedCall(CeedOperatorCheckReady(op)); if (op->LinearAssembleQFunction) { // Backend version - ierr = 
op->LinearAssembleQFunction(op, assembled, rstr, request); - CeedChk(ierr); + CeedCall(op->LinearAssembleQFunction(op, assembled, rstr, request)); } else { // Operator fallback CeedOperator op_fallback; - ierr = CeedOperatorGetFallback(op, &op_fallback); CeedChk(ierr); + CeedCall(CeedOperatorGetFallback(op, &op_fallback)); if (op_fallback) { - ierr = CeedOperatorLinearAssembleQFunction(op_fallback, assembled, - rstr, request); CeedChk(ierr); + CeedCall(CeedOperatorLinearAssembleQFunction(op_fallback, assembled, rstr, request)); } else { // LCOV_EXCL_START - return CeedError(op->ceed, CEED_ERROR_UNSUPPORTED, - "Backend does not support CeedOperatorLinearAssembleQFunction"); + return CeedError(op->ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedOperatorLinearAssembleQFunction"); // LCOV_EXCL_STOP } } @@ -1548,58 +1373,47 @@ int CeedOperatorLinearAssembleQFunction(CeedOperator op, CeedVector *assembled, @ref User **/ -int CeedOperatorLinearAssembleQFunctionBuildOrUpdate(CeedOperator op, - CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) { - int ierr; - ierr = CeedOperatorCheckReady(op); CeedChk(ierr); +int CeedOperatorLinearAssembleQFunctionBuildOrUpdate(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) { + CeedCall(CeedOperatorCheckReady(op)); if (op->LinearAssembleQFunctionUpdate) { // Backend version - bool qf_assembled_is_setup; - CeedVector assembled_vec = NULL; + bool qf_assembled_is_setup; + CeedVector assembled_vec = NULL; CeedElemRestriction assembled_rstr = NULL; - ierr = CeedQFunctionAssemblyDataIsSetup(op->qf_assembled, - &qf_assembled_is_setup); CeedChk(ierr); + CeedCall(CeedQFunctionAssemblyDataIsSetup(op->qf_assembled, &qf_assembled_is_setup)); if (qf_assembled_is_setup) { bool update_needed; - ierr = CeedQFunctionAssemblyDataGetObjects(op->qf_assembled, &assembled_vec, - &assembled_rstr); CeedChk(ierr); - ierr = CeedQFunctionAssemblyDataIsUpdateNeeded(op->qf_assembled, - 
&update_needed); CeedChk(ierr); + CeedCall(CeedQFunctionAssemblyDataGetObjects(op->qf_assembled, &assembled_vec, &assembled_rstr)); + CeedCall(CeedQFunctionAssemblyDataIsUpdateNeeded(op->qf_assembled, &update_needed)); if (update_needed) { - ierr = op->LinearAssembleQFunctionUpdate(op, assembled_vec, assembled_rstr, - request); CeedChk(ierr); + CeedCall(op->LinearAssembleQFunctionUpdate(op, assembled_vec, assembled_rstr, request)); } } else { - ierr = op->LinearAssembleQFunction(op, &assembled_vec, &assembled_rstr, - request); CeedChk(ierr); - ierr = CeedQFunctionAssemblyDataSetObjects(op->qf_assembled, assembled_vec, - assembled_rstr); CeedChk(ierr); + CeedCall(op->LinearAssembleQFunction(op, &assembled_vec, &assembled_rstr, request)); + CeedCall(CeedQFunctionAssemblyDataSetObjects(op->qf_assembled, assembled_vec, assembled_rstr)); } - ierr = CeedQFunctionAssemblyDataSetUpdateNeeded(op->qf_assembled, false); - CeedChk(ierr); + CeedCall(CeedQFunctionAssemblyDataSetUpdateNeeded(op->qf_assembled, false)); // Copy reference from internally held copy *assembled = NULL; - *rstr = NULL; - ierr = CeedVectorReferenceCopy(assembled_vec, assembled); CeedChk(ierr); - ierr = CeedVectorDestroy(&assembled_vec); CeedChk(ierr); - ierr = CeedElemRestrictionReferenceCopy(assembled_rstr, rstr); CeedChk(ierr); - ierr = CeedElemRestrictionDestroy(&assembled_rstr); CeedChk(ierr); + *rstr = NULL; + CeedCall(CeedVectorReferenceCopy(assembled_vec, assembled)); + CeedCall(CeedVectorDestroy(&assembled_vec)); + CeedCall(CeedElemRestrictionReferenceCopy(assembled_rstr, rstr)); + CeedCall(CeedElemRestrictionDestroy(&assembled_rstr)); } else { // Operator fallback CeedOperator op_fallback; - ierr = CeedOperatorGetFallback(op, &op_fallback); CeedChk(ierr); + CeedCall(CeedOperatorGetFallback(op, &op_fallback)); if (op_fallback) { - ierr = CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op_fallback, assembled, - rstr, request); CeedChk(ierr); + 
CeedCall(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op_fallback, assembled, rstr, request)); } else { // LCOV_EXCL_START - return CeedError(op->ceed, CEED_ERROR_UNSUPPORTED, - "Backend does not support CeedOperatorLinearAssembleQFunctionUpdate"); + return CeedError(op->ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support CeedOperatorLinearAssembleQFunctionUpdate"); // LCOV_EXCL_STOP } } @@ -1627,43 +1441,39 @@ int CeedOperatorLinearAssembleQFunctionBuildOrUpdate(CeedOperator op, @ref User **/ -int CeedOperatorLinearAssembleDiagonal(CeedOperator op, CeedVector assembled, - CeedRequest *request) { - int ierr; - ierr = CeedOperatorCheckReady(op); CeedChk(ierr); +int CeedOperatorLinearAssembleDiagonal(CeedOperator op, CeedVector assembled, CeedRequest *request) { + CeedCall(CeedOperatorCheckReady(op)); CeedSize input_size = 0, output_size = 0; - ierr = CeedOperatorGetActiveVectorLengths(op, &input_size, &output_size); - CeedChk(ierr); - if (input_size != output_size) + CeedCall(CeedOperatorGetActiveVectorLengths(op, &input_size, &output_size)); + if (input_size != output_size) { // LCOV_EXCL_START return CeedError(op->ceed, CEED_ERROR_DIMENSION, "Operator must be square"); - // LCOV_EXCL_STOP + // LCOV_EXCL_STOP + } if (op->LinearAssembleDiagonal) { // Backend version - ierr = op->LinearAssembleDiagonal(op, assembled, request); CeedChk(ierr); + CeedCall(op->LinearAssembleDiagonal(op, assembled, request)); return CEED_ERROR_SUCCESS; } else if (op->LinearAssembleAddDiagonal) { // Backend version with zeroing first - ierr = CeedVectorSetValue(assembled, 0.0); CeedChk(ierr); - ierr = op->LinearAssembleAddDiagonal(op, assembled, request); CeedChk(ierr); + CeedCall(CeedVectorSetValue(assembled, 0.0)); + CeedCall(op->LinearAssembleAddDiagonal(op, assembled, request)); return CEED_ERROR_SUCCESS; } else { // Operator fallback CeedOperator op_fallback; - ierr = CeedOperatorGetFallback(op, &op_fallback); CeedChk(ierr); + CeedCall(CeedOperatorGetFallback(op, 
&op_fallback)); if (op_fallback) { - ierr = CeedOperatorLinearAssembleDiagonal(op_fallback, assembled, request); - CeedChk(ierr); + CeedCall(CeedOperatorLinearAssembleDiagonal(op_fallback, assembled, request)); return CEED_ERROR_SUCCESS; } } // Default interface implementation - ierr = CeedVectorSetValue(assembled, 0.0); CeedChk(ierr); - ierr = CeedOperatorLinearAssembleAddDiagonal(op, assembled, request); - CeedChk(ierr); + CeedCall(CeedVectorSetValue(assembled, 0.0)); + CeedCall(CeedOperatorLinearAssembleAddDiagonal(op, assembled, request)); return CEED_ERROR_SUCCESS; } @@ -1688,43 +1498,38 @@ int CeedOperatorLinearAssembleDiagonal(CeedOperator op, CeedVector assembled, @ref User **/ -int CeedOperatorLinearAssembleAddDiagonal(CeedOperator op, CeedVector assembled, - CeedRequest *request) { - int ierr; - ierr = CeedOperatorCheckReady(op); CeedChk(ierr); +int CeedOperatorLinearAssembleAddDiagonal(CeedOperator op, CeedVector assembled, CeedRequest *request) { + CeedCall(CeedOperatorCheckReady(op)); CeedSize input_size = 0, output_size = 0; - ierr = CeedOperatorGetActiveVectorLengths(op, &input_size, &output_size); - CeedChk(ierr); - if (input_size != output_size) + CeedCall(CeedOperatorGetActiveVectorLengths(op, &input_size, &output_size)); + if (input_size != output_size) { // LCOV_EXCL_START return CeedError(op->ceed, CEED_ERROR_DIMENSION, "Operator must be square"); - // LCOV_EXCL_STOP + // LCOV_EXCL_STOP + } if (op->LinearAssembleAddDiagonal) { // Backend version - ierr = op->LinearAssembleAddDiagonal(op, assembled, request); CeedChk(ierr); + CeedCall(op->LinearAssembleAddDiagonal(op, assembled, request)); return CEED_ERROR_SUCCESS; } else { // Operator fallback CeedOperator op_fallback; - ierr = CeedOperatorGetFallback(op, &op_fallback); CeedChk(ierr); + CeedCall(CeedOperatorGetFallback(op, &op_fallback)); if (op_fallback) { - ierr = CeedOperatorLinearAssembleAddDiagonal(op_fallback, assembled, request); - CeedChk(ierr); + 
CeedCall(CeedOperatorLinearAssembleAddDiagonal(op_fallback, assembled, request)); return CEED_ERROR_SUCCESS; } } // Default interface implementation bool is_composite; - ierr = CeedOperatorIsComposite(op, &is_composite); CeedChk(ierr); + CeedCall(CeedOperatorIsComposite(op, &is_composite)); if (is_composite) { - ierr = CeedCompositeOperatorLinearAssembleAddDiagonal(op, request, - false, assembled); CeedChk(ierr); + CeedCall(CeedCompositeOperatorLinearAssembleAddDiagonal(op, request, false, assembled)); } else { - ierr = CeedSingleOperatorAssembleAddDiagonal_Core(op, request, false, - assembled); CeedChk(ierr); + CeedCall(CeedSingleOperatorAssembleAddDiagonal_Core(op, request, false, assembled)); } return CEED_ERROR_SUCCESS; @@ -1756,45 +1561,39 @@ int CeedOperatorLinearAssembleAddDiagonal(CeedOperator op, CeedVector assembled, @ref User **/ -int CeedOperatorLinearAssemblePointBlockDiagonal(CeedOperator op, - CeedVector assembled, CeedRequest *request) { - int ierr; - ierr = CeedOperatorCheckReady(op); CeedChk(ierr); +int CeedOperatorLinearAssemblePointBlockDiagonal(CeedOperator op, CeedVector assembled, CeedRequest *request) { + CeedCall(CeedOperatorCheckReady(op)); CeedSize input_size = 0, output_size = 0; - ierr = CeedOperatorGetActiveVectorLengths(op, &input_size, &output_size); - CeedChk(ierr); - if (input_size != output_size) + CeedCall(CeedOperatorGetActiveVectorLengths(op, &input_size, &output_size)); + if (input_size != output_size) { // LCOV_EXCL_START return CeedError(op->ceed, CEED_ERROR_DIMENSION, "Operator must be square"); - // LCOV_EXCL_STOP + // LCOV_EXCL_STOP + } if (op->LinearAssemblePointBlockDiagonal) { // Backend version - ierr = op->LinearAssemblePointBlockDiagonal(op, assembled, request); - CeedChk(ierr); + CeedCall(op->LinearAssemblePointBlockDiagonal(op, assembled, request)); return CEED_ERROR_SUCCESS; } else if (op->LinearAssembleAddPointBlockDiagonal) { // Backend version with zeroing first - ierr = CeedVectorSetValue(assembled, 0.0); 
CeedChk(ierr); - ierr = CeedOperatorLinearAssembleAddPointBlockDiagonal(op, assembled, - request); CeedChk(ierr); + CeedCall(CeedVectorSetValue(assembled, 0.0)); + CeedCall(CeedOperatorLinearAssembleAddPointBlockDiagonal(op, assembled, request)); return CEED_ERROR_SUCCESS; } else { // Operator fallback CeedOperator op_fallback; - ierr = CeedOperatorGetFallback(op, &op_fallback); CeedChk(ierr); + CeedCall(CeedOperatorGetFallback(op, &op_fallback)); if (op_fallback) { - ierr = CeedOperatorLinearAssemblePointBlockDiagonal(op_fallback, assembled, - request); CeedChk(ierr); + CeedCall(CeedOperatorLinearAssemblePointBlockDiagonal(op_fallback, assembled, request)); return CEED_ERROR_SUCCESS; } } // Default interface implementation - ierr = CeedVectorSetValue(assembled, 0.0); CeedChk(ierr); - ierr = CeedOperatorLinearAssembleAddPointBlockDiagonal(op, assembled, request); - CeedChk(ierr); + CeedCall(CeedVectorSetValue(assembled, 0.0)); + CeedCall(CeedOperatorLinearAssembleAddPointBlockDiagonal(op, assembled, request)); return CEED_ERROR_SUCCESS; } @@ -1825,44 +1624,38 @@ int CeedOperatorLinearAssemblePointBlockDiagonal(CeedOperator op, @ref User **/ -int CeedOperatorLinearAssembleAddPointBlockDiagonal(CeedOperator op, - CeedVector assembled, CeedRequest *request) { - int ierr; - ierr = CeedOperatorCheckReady(op); CeedChk(ierr); +int CeedOperatorLinearAssembleAddPointBlockDiagonal(CeedOperator op, CeedVector assembled, CeedRequest *request) { + CeedCall(CeedOperatorCheckReady(op)); CeedSize input_size = 0, output_size = 0; - ierr = CeedOperatorGetActiveVectorLengths(op, &input_size, &output_size); - CeedChk(ierr); - if (input_size != output_size) + CeedCall(CeedOperatorGetActiveVectorLengths(op, &input_size, &output_size)); + if (input_size != output_size) { // LCOV_EXCL_START return CeedError(op->ceed, CEED_ERROR_DIMENSION, "Operator must be square"); - // LCOV_EXCL_STOP + // LCOV_EXCL_STOP + } if (op->LinearAssembleAddPointBlockDiagonal) { // Backend version - ierr = 
op->LinearAssembleAddPointBlockDiagonal(op, assembled, request); - CeedChk(ierr); + CeedCall(op->LinearAssembleAddPointBlockDiagonal(op, assembled, request)); return CEED_ERROR_SUCCESS; } else { // Operator fallback CeedOperator op_fallback; - ierr = CeedOperatorGetFallback(op, &op_fallback); CeedChk(ierr); + CeedCall(CeedOperatorGetFallback(op, &op_fallback)); if (op_fallback) { - ierr = CeedOperatorLinearAssembleAddPointBlockDiagonal(op_fallback, assembled, - request); CeedChk(ierr); + CeedCall(CeedOperatorLinearAssembleAddPointBlockDiagonal(op_fallback, assembled, request)); return CEED_ERROR_SUCCESS; } } // Default interface implemenation bool is_composite; - ierr = CeedOperatorIsComposite(op, &is_composite); CeedChk(ierr); + CeedCall(CeedOperatorIsComposite(op, &is_composite)); if (is_composite) { - ierr = CeedCompositeOperatorLinearAssembleAddDiagonal(op, request, - true, assembled); CeedChk(ierr); + CeedCall(CeedCompositeOperatorLinearAssembleAddDiagonal(op, request, true, assembled)); } else { - ierr = CeedSingleOperatorAssembleAddDiagonal_Core(op, request, true, assembled); - CeedChk(ierr); + CeedCall(CeedSingleOperatorAssembleAddDiagonal_Core(op, request, true, assembled)); } return CEED_ERROR_SUCCESS; @@ -1892,26 +1685,23 @@ int CeedOperatorLinearAssembleAddPointBlockDiagonal(CeedOperator op, @ref User **/ -int CeedOperatorLinearAssembleSymbolic(CeedOperator op, CeedSize *num_entries, - CeedInt **rows, CeedInt **cols) { - int ierr; - CeedInt num_suboperators, single_entries; +int CeedOperatorLinearAssembleSymbolic(CeedOperator op, CeedSize *num_entries, CeedInt **rows, CeedInt **cols) { + CeedInt num_suboperators, single_entries; CeedOperator *sub_operators; - bool is_composite; - ierr = CeedOperatorCheckReady(op); CeedChk(ierr); + bool is_composite; + CeedCall(CeedOperatorCheckReady(op)); if (op->LinearAssembleSymbolic) { // Backend version - ierr = op->LinearAssembleSymbolic(op, num_entries, rows, cols); CeedChk(ierr); + 
CeedCall(op->LinearAssembleSymbolic(op, num_entries, rows, cols)); return CEED_ERROR_SUCCESS; } else { // Operator fallback CeedOperator op_fallback; - ierr = CeedOperatorGetFallback(op, &op_fallback); CeedChk(ierr); + CeedCall(CeedOperatorGetFallback(op, &op_fallback)); if (op_fallback) { - ierr = CeedOperatorLinearAssembleSymbolic(op_fallback, num_entries, rows, cols); - CeedChk(ierr); + CeedCall(CeedOperatorLinearAssembleSymbolic(op_fallback, num_entries, rows, cols)); return CEED_ERROR_SUCCESS; } } @@ -1919,40 +1709,34 @@ int CeedOperatorLinearAssembleSymbolic(CeedOperator op, CeedSize *num_entries, // Default interface implementation // count entries and allocate rows, cols arrays - ierr = CeedOperatorIsComposite(op, &is_composite); CeedChk(ierr); + CeedCall(CeedOperatorIsComposite(op, &is_composite)); *num_entries = 0; if (is_composite) { - ierr = CeedOperatorGetNumSub(op, &num_suboperators); CeedChk(ierr); - ierr = CeedOperatorGetSubList(op, &sub_operators); CeedChk(ierr); + CeedCall(CeedOperatorGetNumSub(op, &num_suboperators)); + CeedCall(CeedOperatorGetSubList(op, &sub_operators)); for (CeedInt k = 0; k < num_suboperators; ++k) { - ierr = CeedSingleOperatorAssemblyCountEntries(sub_operators[k], - &single_entries); CeedChk(ierr); + CeedCall(CeedSingleOperatorAssemblyCountEntries(sub_operators[k], &single_entries)); *num_entries += single_entries; } } else { - ierr = CeedSingleOperatorAssemblyCountEntries(op, - &single_entries); CeedChk(ierr); + CeedCall(CeedSingleOperatorAssemblyCountEntries(op, &single_entries)); *num_entries += single_entries; } - ierr = CeedCalloc(*num_entries, rows); CeedChk(ierr); - ierr = CeedCalloc(*num_entries, cols); CeedChk(ierr); + CeedCall(CeedCalloc(*num_entries, rows)); + CeedCall(CeedCalloc(*num_entries, cols)); // assemble nonzero locations CeedInt offset = 0; if (is_composite) { - ierr = CeedOperatorGetNumSub(op, &num_suboperators); CeedChk(ierr); - ierr = CeedOperatorGetSubList(op, &sub_operators); CeedChk(ierr); + 
CeedCall(CeedOperatorGetNumSub(op, &num_suboperators)); + CeedCall(CeedOperatorGetSubList(op, &sub_operators)); for (CeedInt k = 0; k < num_suboperators; ++k) { - ierr = CeedSingleOperatorAssembleSymbolic(sub_operators[k], offset, *rows, - *cols); CeedChk(ierr); - ierr = CeedSingleOperatorAssemblyCountEntries(sub_operators[k], - &single_entries); - CeedChk(ierr); + CeedCall(CeedSingleOperatorAssembleSymbolic(sub_operators[k], offset, *rows, *cols)); + CeedCall(CeedSingleOperatorAssemblyCountEntries(sub_operators[k], &single_entries)); offset += single_entries; } } else { - ierr = CeedSingleOperatorAssembleSymbolic(op, offset, *rows, *cols); - CeedChk(ierr); + CeedCall(CeedSingleOperatorAssembleSymbolic(op, offset, *rows, *cols)); } return CEED_ERROR_SUCCESS; @@ -1980,44 +1764,40 @@ int CeedOperatorLinearAssembleSymbolic(CeedOperator op, CeedSize *num_entries, @ref User **/ int CeedOperatorLinearAssemble(CeedOperator op, CeedVector values) { - int ierr; - CeedInt num_suboperators, single_entries = 0; + CeedInt num_suboperators, single_entries = 0; CeedOperator *sub_operators; - ierr = CeedOperatorCheckReady(op); CeedChk(ierr); + CeedCall(CeedOperatorCheckReady(op)); if (op->LinearAssemble) { // Backend version - ierr = op->LinearAssemble(op, values); CeedChk(ierr); + CeedCall(op->LinearAssemble(op, values)); return CEED_ERROR_SUCCESS; } else { // Operator fallback CeedOperator op_fallback; - ierr = CeedOperatorGetFallback(op, &op_fallback); CeedChk(ierr); + CeedCall(CeedOperatorGetFallback(op, &op_fallback)); if (op_fallback) { - ierr = CeedOperatorLinearAssemble(op_fallback, values); CeedChk(ierr); + CeedCall(CeedOperatorLinearAssemble(op_fallback, values)); return CEED_ERROR_SUCCESS; } } // Default interface implementation bool is_composite; - ierr = CeedOperatorIsComposite(op, &is_composite); CeedChk(ierr); + CeedCall(CeedOperatorIsComposite(op, &is_composite)); CeedInt offset = 0; if (is_composite) { - ierr = CeedOperatorGetNumSub(op, &num_suboperators); 
CeedChk(ierr); - ierr = CeedOperatorGetSubList(op, &sub_operators); CeedChk(ierr); + CeedCall(CeedOperatorGetNumSub(op, &num_suboperators)); + CeedCall(CeedOperatorGetSubList(op, &sub_operators)); for (CeedInt k = 0; k < num_suboperators; k++) { - ierr = CeedSingleOperatorAssemble(sub_operators[k], offset, values); - CeedChk(ierr); - ierr = CeedSingleOperatorAssemblyCountEntries(sub_operators[k], - &single_entries); - CeedChk(ierr); + CeedCall(CeedSingleOperatorAssemble(sub_operators[k], offset, values)); + CeedCall(CeedSingleOperatorAssemblyCountEntries(sub_operators[k], &single_entries)); offset += single_entries; } } else { - ierr = CeedSingleOperatorAssemble(op, offset, values); CeedChk(ierr); + CeedCall(CeedSingleOperatorAssemble(op, offset, values)); } return CEED_ERROR_SUCCESS; @@ -2043,25 +1823,17 @@ int CeedOperatorLinearAssemble(CeedOperator op, CeedVector values) { @ref User **/ -int CeedOperatorMultigridLevelCreate(CeedOperator op_fine, - CeedVector p_mult_fine, - CeedElemRestriction rstr_coarse, CeedBasis basis_coarse, - CeedOperator *op_coarse, CeedOperator *op_prolong, - CeedOperator *op_restrict) { - int ierr; - ierr = CeedOperatorCheckReady(op_fine); CeedChk(ierr); +int CeedOperatorMultigridLevelCreate(CeedOperator op_fine, CeedVector p_mult_fine, CeedElemRestriction rstr_coarse, CeedBasis basis_coarse, + CeedOperator *op_coarse, CeedOperator *op_prolong, CeedOperator *op_restrict) { + CeedCall(CeedOperatorCheckReady(op_fine)); // Build prolongation matrix CeedBasis basis_fine, basis_c_to_f; - ierr = CeedOperatorGetActiveBasis(op_fine, &basis_fine); CeedChk(ierr); - ierr = CeedBasisCreateProjection(basis_coarse, basis_fine, &basis_c_to_f); - CeedChk(ierr); + CeedCall(CeedOperatorGetActiveBasis(op_fine, &basis_fine)); + CeedCall(CeedBasisCreateProjection(basis_coarse, basis_fine, &basis_c_to_f)); // Core code - ierr = CeedSingleOperatorMultigridLevel(op_fine, p_mult_fine, rstr_coarse, - basis_coarse, basis_c_to_f, op_coarse, - op_prolong, 
op_restrict); - CeedChk(ierr); + CeedCall(CeedSingleOperatorMultigridLevel(op_fine, p_mult_fine, rstr_coarse, basis_coarse, basis_c_to_f, op_coarse, op_prolong, op_restrict)); return CEED_ERROR_SUCCESS; } @@ -2086,54 +1858,44 @@ int CeedOperatorMultigridLevelCreate(CeedOperator op_fine, @ref User **/ -int CeedOperatorMultigridLevelCreateTensorH1(CeedOperator op_fine, - CeedVector p_mult_fine, CeedElemRestriction rstr_coarse, CeedBasis basis_coarse, - const CeedScalar *interp_c_to_f, CeedOperator *op_coarse, - CeedOperator *op_prolong, CeedOperator *op_restrict) { - int ierr; - ierr = CeedOperatorCheckReady(op_fine); CeedChk(ierr); +int CeedOperatorMultigridLevelCreateTensorH1(CeedOperator op_fine, CeedVector p_mult_fine, CeedElemRestriction rstr_coarse, CeedBasis basis_coarse, + const CeedScalar *interp_c_to_f, CeedOperator *op_coarse, CeedOperator *op_prolong, + CeedOperator *op_restrict) { + CeedCall(CeedOperatorCheckReady(op_fine)); Ceed ceed; - ierr = CeedOperatorGetCeed(op_fine, &ceed); CeedChk(ierr); + CeedCall(CeedOperatorGetCeed(op_fine, &ceed)); // Check for compatible quadrature spaces CeedBasis basis_fine; - ierr = CeedOperatorGetActiveBasis(op_fine, &basis_fine); CeedChk(ierr); + CeedCall(CeedOperatorGetActiveBasis(op_fine, &basis_fine)); CeedInt Q_f, Q_c; - ierr = CeedBasisGetNumQuadraturePoints(basis_fine, &Q_f); CeedChk(ierr); - ierr = CeedBasisGetNumQuadraturePoints(basis_coarse, &Q_c); CeedChk(ierr); - if (Q_f != Q_c) + CeedCall(CeedBasisGetNumQuadraturePoints(basis_fine, &Q_f)); + CeedCall(CeedBasisGetNumQuadraturePoints(basis_coarse, &Q_c)); + if (Q_f != Q_c) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_DIMENSION, - "Bases must have compatible quadrature spaces"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_DIMENSION, "Bases must have compatible quadrature spaces"); + // LCOV_EXCL_STOP + } // Coarse to fine basis CeedInt dim, num_comp, num_nodes_c, P_1d_f, P_1d_c; - ierr = CeedBasisGetDimension(basis_fine, &dim); 
CeedChk(ierr); - ierr = CeedBasisGetNumComponents(basis_fine, &num_comp); CeedChk(ierr); - ierr = CeedBasisGetNumNodes1D(basis_fine, &P_1d_f); CeedChk(ierr); - ierr = CeedElemRestrictionGetElementSize(rstr_coarse, &num_nodes_c); - CeedChk(ierr); - P_1d_c = dim == 1 ? num_nodes_c : - dim == 2 ? sqrt(num_nodes_c) : - cbrt(num_nodes_c); + CeedCall(CeedBasisGetDimension(basis_fine, &dim)); + CeedCall(CeedBasisGetNumComponents(basis_fine, &num_comp)); + CeedCall(CeedBasisGetNumNodes1D(basis_fine, &P_1d_f)); + CeedCall(CeedElemRestrictionGetElementSize(rstr_coarse, &num_nodes_c)); + P_1d_c = dim == 1 ? num_nodes_c : dim == 2 ? sqrt(num_nodes_c) : cbrt(num_nodes_c); CeedScalar *q_ref, *q_weight, *grad; - ierr = CeedCalloc(P_1d_f, &q_ref); CeedChk(ierr); - ierr = CeedCalloc(P_1d_f, &q_weight); CeedChk(ierr); - ierr = CeedCalloc(P_1d_f*P_1d_c*dim, &grad); CeedChk(ierr); + CeedCall(CeedCalloc(P_1d_f, &q_ref)); + CeedCall(CeedCalloc(P_1d_f, &q_weight)); + CeedCall(CeedCalloc(P_1d_f * P_1d_c * dim, &grad)); CeedBasis basis_c_to_f; - ierr = CeedBasisCreateTensorH1(ceed, dim, num_comp, P_1d_c, P_1d_f, - interp_c_to_f, grad, q_ref, q_weight, &basis_c_to_f); - CeedChk(ierr); - ierr = CeedFree(&q_ref); CeedChk(ierr); - ierr = CeedFree(&q_weight); CeedChk(ierr); - ierr = CeedFree(&grad); CeedChk(ierr); + CeedCall(CeedBasisCreateTensorH1(ceed, dim, num_comp, P_1d_c, P_1d_f, interp_c_to_f, grad, q_ref, q_weight, &basis_c_to_f)); + CeedCall(CeedFree(&q_ref)); + CeedCall(CeedFree(&q_weight)); + CeedCall(CeedFree(&grad)); // Core code - ierr = CeedSingleOperatorMultigridLevel(op_fine, p_mult_fine, rstr_coarse, - basis_coarse, basis_c_to_f, op_coarse, - op_prolong, op_restrict); - CeedChk(ierr); + CeedCall(CeedSingleOperatorMultigridLevel(op_fine, p_mult_fine, rstr_coarse, basis_coarse, basis_c_to_f, op_coarse, op_prolong, op_restrict)); return CEED_ERROR_SUCCESS; } @@ -2157,57 +1919,45 @@ int CeedOperatorMultigridLevelCreateTensorH1(CeedOperator op_fine, @ref User **/ -int 
CeedOperatorMultigridLevelCreateH1(CeedOperator op_fine, - CeedVector p_mult_fine, - CeedElemRestriction rstr_coarse, - CeedBasis basis_coarse, - const CeedScalar *interp_c_to_f, - CeedOperator *op_coarse, - CeedOperator *op_prolong, +int CeedOperatorMultigridLevelCreateH1(CeedOperator op_fine, CeedVector p_mult_fine, CeedElemRestriction rstr_coarse, CeedBasis basis_coarse, + const CeedScalar *interp_c_to_f, CeedOperator *op_coarse, CeedOperator *op_prolong, CeedOperator *op_restrict) { - int ierr; - ierr = CeedOperatorCheckReady(op_fine); CeedChk(ierr); + CeedCall(CeedOperatorCheckReady(op_fine)); Ceed ceed; - ierr = CeedOperatorGetCeed(op_fine, &ceed); CeedChk(ierr); + CeedCall(CeedOperatorGetCeed(op_fine, &ceed)); // Check for compatible quadrature spaces CeedBasis basis_fine; - ierr = CeedOperatorGetActiveBasis(op_fine, &basis_fine); CeedChk(ierr); + CeedCall(CeedOperatorGetActiveBasis(op_fine, &basis_fine)); CeedInt Q_f, Q_c; - ierr = CeedBasisGetNumQuadraturePoints(basis_fine, &Q_f); CeedChk(ierr); - ierr = CeedBasisGetNumQuadraturePoints(basis_coarse, &Q_c); CeedChk(ierr); - if (Q_f != Q_c) + CeedCall(CeedBasisGetNumQuadraturePoints(basis_fine, &Q_f)); + CeedCall(CeedBasisGetNumQuadraturePoints(basis_coarse, &Q_c)); + if (Q_f != Q_c) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_DIMENSION, - "Bases must have compatible quadrature spaces"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_DIMENSION, "Bases must have compatible quadrature spaces"); + // LCOV_EXCL_STOP + } // Coarse to fine basis CeedElemTopology topo; - ierr = CeedBasisGetTopology(basis_fine, &topo); CeedChk(ierr); + CeedCall(CeedBasisGetTopology(basis_fine, &topo)); CeedInt dim, num_comp, num_nodes_c, num_nodes_f; - ierr = CeedBasisGetDimension(basis_fine, &dim); CeedChk(ierr); - ierr = CeedBasisGetNumComponents(basis_fine, &num_comp); CeedChk(ierr); - ierr = CeedBasisGetNumNodes(basis_fine, &num_nodes_f); CeedChk(ierr); - ierr = 
CeedElemRestrictionGetElementSize(rstr_coarse, &num_nodes_c); - CeedChk(ierr); + CeedCall(CeedBasisGetDimension(basis_fine, &dim)); + CeedCall(CeedBasisGetNumComponents(basis_fine, &num_comp)); + CeedCall(CeedBasisGetNumNodes(basis_fine, &num_nodes_f)); + CeedCall(CeedElemRestrictionGetElementSize(rstr_coarse, &num_nodes_c)); CeedScalar *q_ref, *q_weight, *grad; - ierr = CeedCalloc(num_nodes_f*dim, &q_ref); CeedChk(ierr); - ierr = CeedCalloc(num_nodes_f, &q_weight); CeedChk(ierr); - ierr = CeedCalloc(num_nodes_f*num_nodes_c*dim, &grad); CeedChk(ierr); + CeedCall(CeedCalloc(num_nodes_f * dim, &q_ref)); + CeedCall(CeedCalloc(num_nodes_f, &q_weight)); + CeedCall(CeedCalloc(num_nodes_f * num_nodes_c * dim, &grad)); CeedBasis basis_c_to_f; - ierr = CeedBasisCreateH1(ceed, topo, num_comp, num_nodes_c, num_nodes_f, - interp_c_to_f, grad, q_ref, q_weight, &basis_c_to_f); - CeedChk(ierr); - ierr = CeedFree(&q_ref); CeedChk(ierr); - ierr = CeedFree(&q_weight); CeedChk(ierr); - ierr = CeedFree(&grad); CeedChk(ierr); + CeedCall(CeedBasisCreateH1(ceed, topo, num_comp, num_nodes_c, num_nodes_f, interp_c_to_f, grad, q_ref, q_weight, &basis_c_to_f)); + CeedCall(CeedFree(&q_ref)); + CeedCall(CeedFree(&q_weight)); + CeedCall(CeedFree(&grad)); // Core code - ierr = CeedSingleOperatorMultigridLevel(op_fine, p_mult_fine, rstr_coarse, - basis_coarse, basis_c_to_f, op_coarse, - op_prolong, op_restrict); - CeedChk(ierr); + CeedCall(CeedSingleOperatorMultigridLevel(op_fine, p_mult_fine, rstr_coarse, basis_coarse, basis_c_to_f, op_coarse, op_prolong, op_restrict)); return CEED_ERROR_SUCCESS; } @@ -2237,241 +1987,215 @@ int CeedOperatorMultigridLevelCreateH1(CeedOperator op_fine, @ref User **/ -int CeedOperatorCreateFDMElementInverse(CeedOperator op, CeedOperator *fdm_inv, - CeedRequest *request) { - int ierr; - ierr = CeedOperatorCheckReady(op); CeedChk(ierr); +int CeedOperatorCreateFDMElementInverse(CeedOperator op, CeedOperator *fdm_inv, CeedRequest *request) { + 
CeedCall(CeedOperatorCheckReady(op)); if (op->CreateFDMElementInverse) { // Backend version - ierr = op->CreateFDMElementInverse(op, fdm_inv, request); CeedChk(ierr); + CeedCall(op->CreateFDMElementInverse(op, fdm_inv, request)); return CEED_ERROR_SUCCESS; } else { // Operator fallback CeedOperator op_fallback; - ierr = CeedOperatorGetFallback(op, &op_fallback); CeedChk(ierr); + CeedCall(CeedOperatorGetFallback(op, &op_fallback)); if (op_fallback) { - ierr = CeedOperatorCreateFDMElementInverse(op_fallback, fdm_inv, request); - CeedChk(ierr); + CeedCall(CeedOperatorCreateFDMElementInverse(op_fallback, fdm_inv, request)); return CEED_ERROR_SUCCESS; } } // Default interface implementation Ceed ceed, ceed_parent; - ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); - ierr = CeedGetOperatorFallbackParentCeed(ceed, &ceed_parent); CeedChk(ierr); + CeedCall(CeedOperatorGetCeed(op, &ceed)); + CeedCall(CeedGetOperatorFallbackParentCeed(ceed, &ceed_parent)); ceed_parent = ceed_parent ? ceed_parent : ceed; CeedQFunction qf; - ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr); + CeedCall(CeedOperatorGetQFunction(op, &qf)); // Determine active input basis - bool interp = false, grad = false; - CeedBasis basis = NULL; - CeedElemRestriction rstr = NULL; - CeedOperatorField *op_fields; + bool interp = false, grad = false; + CeedBasis basis = NULL; + CeedElemRestriction rstr = NULL; + CeedOperatorField *op_fields; CeedQFunctionField *qf_fields; - CeedInt num_input_fields; - ierr = CeedOperatorGetFields(op, &num_input_fields, &op_fields, NULL, NULL); - CeedChk(ierr); - ierr = CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL); CeedChk(ierr); - for (CeedInt i=0; i - qf_value_bound) { - elem_avg[e] += assembled_array[q*layout[0] + i*layout[1] + e*layout[2]] / - q_weight_array[q]; + for (CeedInt q = 0; q < num_qpts; q++) { + for (CeedInt i = 0; i < num_comp * num_comp * num_modes * num_modes; i++) { + if (fabs(assembled_array[q * layout[0] + i * layout[1] + e * 
layout[2]]) > qf_value_bound) { + elem_avg[e] += assembled_array[q * layout[0] + i * layout[1] + e * layout[2]] / q_weight_array[q]; count++; } + } + } if (count) { elem_avg[e] /= count; } else { elem_avg[e] = 1.0; } } - ierr = CeedVectorRestoreArrayRead(assembled, &assembled_array); CeedChk(ierr); - ierr = CeedVectorDestroy(&assembled); CeedChk(ierr); - ierr = CeedVectorRestoreArrayRead(q_weight, &q_weight_array); CeedChk(ierr); - ierr = CeedVectorDestroy(&q_weight); CeedChk(ierr); + CeedCall(CeedVectorRestoreArrayRead(assembled, &assembled_array)); + CeedCall(CeedVectorDestroy(&assembled)); + CeedCall(CeedVectorRestoreArrayRead(q_weight, &q_weight_array)); + CeedCall(CeedVectorDestroy(&q_weight)); // Build FDM diagonal - CeedVector q_data; + CeedVector q_data; CeedScalar *q_data_array, *fdm_diagonal; - ierr = CeedCalloc(num_comp*elem_size, &fdm_diagonal); CeedChk(ierr); - const CeedScalar fdm_diagonal_bound = elem_size*CEED_EPSILON; - for (CeedInt c=0; c #include +#include static bool register_all_called; diff --git a/interface/ceed-qfunction.c b/interface/ceed-qfunction.c index 719e85e386..6b15994ece 100644 --- a/interface/ceed-qfunction.c +++ b/interface/ceed-qfunction.c @@ -5,10 +5,10 @@ // // This file is part of CEED: http://github.com/ceed -#include +#include #include +#include #include -#include #include #include #include @@ -32,9 +32,9 @@ const CeedQFunction CEED_QFUNCTION_NONE = &ceed_qfunction_none; /// @cond DOXYGEN_SKIP static struct { - char name[CEED_MAX_RESOURCE_LEN]; - char source[CEED_MAX_RESOURCE_LEN]; - CeedInt vec_length; + char name[CEED_MAX_RESOURCE_LEN]; + char source[CEED_MAX_RESOURCE_LEN]; + CeedInt vec_length; CeedQFunctionUser f; int (*init)(Ceed ceed, const char *name, CeedQFunction qf); } gallery_qfunctions[1024]; @@ -64,30 +64,26 @@ static size_t num_qfunctions; @ref Developer **/ -int CeedQFunctionRegister(const char *name, const char *source, - CeedInt vec_length, CeedQFunctionUser f, +int CeedQFunctionRegister(const char *name, 
const char *source, CeedInt vec_length, CeedQFunctionUser f, int (*init)(Ceed, const char *, CeedQFunction)) { - int ierr; - - if (num_qfunctions >= sizeof(gallery_qfunctions) / sizeof( - gallery_qfunctions[0])) + if (num_qfunctions >= sizeof(gallery_qfunctions) / sizeof(gallery_qfunctions[0])) { // LCOV_EXCL_START return CeedError(NULL, CEED_ERROR_MAJOR, "Too many gallery QFunctions"); - // LCOV_EXCL_STOP + // LCOV_EXCL_STOP + } CeedDebugEnv("Gallery Register: %s", name); const char *relative_file_path; - ierr = CeedGetJitRelativePath(source, &relative_file_path); CeedChk(ierr); + CeedCall(CeedGetJitRelativePath(source, &relative_file_path)); strncpy(gallery_qfunctions[num_qfunctions].name, name, CEED_MAX_RESOURCE_LEN); - gallery_qfunctions[num_qfunctions].name[CEED_MAX_RESOURCE_LEN-1] = 0; - strncpy(gallery_qfunctions[num_qfunctions].source, relative_file_path, - CEED_MAX_RESOURCE_LEN); - gallery_qfunctions[num_qfunctions].source[CEED_MAX_RESOURCE_LEN-1] = 0; - gallery_qfunctions[num_qfunctions].vec_length = vec_length; - gallery_qfunctions[num_qfunctions].f = f; - gallery_qfunctions[num_qfunctions].init = init; + gallery_qfunctions[num_qfunctions].name[CEED_MAX_RESOURCE_LEN - 1] = 0; + strncpy(gallery_qfunctions[num_qfunctions].source, relative_file_path, CEED_MAX_RESOURCE_LEN); + gallery_qfunctions[num_qfunctions].source[CEED_MAX_RESOURCE_LEN - 1] = 0; + gallery_qfunctions[num_qfunctions].vec_length = vec_length; + gallery_qfunctions[num_qfunctions].f = f; + gallery_qfunctions[num_qfunctions].init = init; num_qfunctions++; return CEED_ERROR_SUCCESS; } @@ -108,14 +104,10 @@ int CeedQFunctionRegister(const char *name, const char *source, @ref Developer **/ -static int CeedQFunctionFieldSet(CeedQFunctionField *f, const char *field_name, - CeedInt size, CeedEvalMode eval_mode) { - int ierr; - - ierr = CeedCalloc(1, f); CeedChk(ierr); - ierr = CeedStringAllocCopy(field_name, (char **)&(*f)->field_name); - CeedChk(ierr); - (*f)->size = size; +static int 
CeedQFunctionFieldSet(CeedQFunctionField *f, const char *field_name, CeedInt size, CeedEvalMode eval_mode) { + CeedCall(CeedCalloc(1, f)); + CeedCall(CeedStringAllocCopy(field_name, (char **)&(*f)->field_name)); + (*f)->size = size; (*f)->eval_mode = eval_mode; return CEED_ERROR_SUCCESS; } @@ -132,20 +124,20 @@ static int CeedQFunctionFieldSet(CeedQFunctionField *f, const char *field_name, @ref Utility **/ -static int CeedQFunctionFieldView(CeedQFunctionField field, - CeedInt field_number, - bool in, FILE *stream) { - int ierr; +static int CeedQFunctionFieldView(CeedQFunctionField field, CeedInt field_number, bool in, FILE *stream) { const char *inout = in ? "Input" : "Output"; - char *field_name; - ierr = CeedQFunctionFieldGetName(field, &field_name); CeedChk(ierr); + char *field_name; + CeedCall(CeedQFunctionFieldGetName(field, &field_name)); CeedInt size; - ierr = CeedQFunctionFieldGetSize(field, &size); CeedChk(ierr); + CeedCall(CeedQFunctionFieldGetSize(field, &size)); CeedEvalMode eval_mode; - ierr = CeedQFunctionFieldGetEvalMode(field, &eval_mode); CeedChk(ierr); - fprintf(stream, " %s field %" CeedInt_FMT ":\n" + CeedCall(CeedQFunctionFieldGetEvalMode(field, &eval_mode)); + fprintf(stream, + " %s field %" CeedInt_FMT + ":\n" " Name: \"%s\"\n" - " Size: %" CeedInt_FMT "\n" + " Size: %" CeedInt_FMT + "\n" " EvalMode: \"%s\"\n", inout, field_number, field_name, size, CeedEvalModes[eval_mode]); return CEED_ERROR_SUCCESS; @@ -200,8 +192,7 @@ int CeedQFunctionGetVectorLength(CeedQFunction qf, CeedInt *vec_length) { @ref Backend **/ -int CeedQFunctionGetNumArgs(CeedQFunction qf, CeedInt *num_input, - CeedInt *num_output) { +int CeedQFunctionGetNumArgs(CeedQFunction qf, CeedInt *num_input, CeedInt *num_output) { if (num_input) *num_input = qf->num_input_fields; if (num_output) *num_output = qf->num_output_fields; return CEED_ERROR_SUCCESS; @@ -218,26 +209,24 @@ int CeedQFunctionGetNumArgs(CeedQFunction qf, CeedInt *num_input, @ref Backend **/ int 
CeedQFunctionGetKernelName(CeedQFunction qf, char **kernel_name) { - int ierr; - - if (!qf->kernel_name ) { - Ceed ceed; + if (!qf->kernel_name) { + Ceed ceed; char *kernel_name_copy; - ierr = CeedQFunctionGetCeed(qf, &ceed); CeedChk(ierr); + CeedCall(CeedQFunctionGetCeed(qf, &ceed)); if (qf->user_source) { - const char *kernel_name = strrchr(qf->user_source, ':') + 1; - size_t kernel_name_len = strlen(kernel_name); + const char *kernel_name = strrchr(qf->user_source, ':') + 1; + size_t kernel_name_len = strlen(kernel_name); - ierr = CeedCalloc(kernel_name_len + 1, &kernel_name_copy); CeedChk(ierr); + CeedCall(CeedCalloc(kernel_name_len + 1, &kernel_name_copy)); memcpy(kernel_name_copy, kernel_name, kernel_name_len); } else { - ierr = CeedCalloc(1, &kernel_name_copy); CeedChk(ierr); + CeedCall(CeedCalloc(1, &kernel_name_copy)); } qf->kernel_name = kernel_name_copy; } - *kernel_name = (char *) qf->kernel_name; + *kernel_name = (char *)qf->kernel_name; return CEED_ERROR_SUCCESS; } @@ -252,37 +241,31 @@ int CeedQFunctionGetKernelName(CeedQFunction qf, char **kernel_name) { @ref Backend **/ int CeedQFunctionGetSourcePath(CeedQFunction qf, char **source_path) { - int ierr; - if (!qf->source_path && qf->user_source) { - Ceed ceed; - bool is_absolute_path; - char *absolute_path, *source_path_copy; - const char *kernel_name = strrchr(qf->user_source, ':') + 1; - size_t kernel_name_len = strlen(kernel_name); + Ceed ceed; + bool is_absolute_path; + char *absolute_path, *source_path_copy; + const char *kernel_name = strrchr(qf->user_source, ':') + 1; + size_t kernel_name_len = strlen(kernel_name); - ierr = CeedQFunctionGetCeed(qf, &ceed); CeedChk(ierr); + CeedCall(CeedQFunctionGetCeed(qf, &ceed)); - ierr = CeedCheckFilePath(ceed, qf->user_source, &is_absolute_path); - CeedChk(ierr); + CeedCall(CeedCheckFilePath(ceed, qf->user_source, &is_absolute_path)); if (is_absolute_path) { absolute_path = (char *)qf->user_source; } else { - ierr = CeedGetJitAbsolutePath(ceed, 
qf->user_source, &absolute_path); - CeedChk(ierr); + CeedCall(CeedGetJitAbsolutePath(ceed, qf->user_source, &absolute_path)); } size_t source_len = strlen(absolute_path) - kernel_name_len - 1; - ierr = CeedCalloc(source_len + 1, &source_path_copy); CeedChk(ierr); + CeedCall(CeedCalloc(source_len + 1, &source_path_copy)); memcpy(source_path_copy, absolute_path, source_len); qf->source_path = source_path_copy; - if (!is_absolute_path) { - ierr = CeedFree(&absolute_path); CeedChk(ierr); - } + if (!is_absolute_path) CeedCall(CeedFree(&absolute_path)); } - *source_path = (char *) qf->source_path; + *source_path = (char *)qf->source_path; return CEED_ERROR_SUCCESS; } @@ -300,14 +283,12 @@ int CeedQFunctionGetSourcePath(CeedQFunction qf, char **source_path) { @ref Backend **/ int CeedQFunctionLoadSourceToBuffer(CeedQFunction qf, char **source_buffer) { - int ierr; char *source_path; - ierr = CeedQFunctionGetSourcePath(qf, &source_path); CeedChk(ierr); + CeedCall(CeedQFunctionGetSourcePath(qf, &source_path)); *source_buffer = NULL; if (source_path) { - ierr = CeedLoadSourceToBuffer(qf->ceed, source_path, source_buffer); - CeedChk(ierr); + CeedCall(CeedLoadSourceToBuffer(qf->ceed, source_path, source_buffer)); } return CEED_ERROR_SUCCESS; @@ -358,19 +339,17 @@ int CeedQFunctionGetContext(CeedQFunction qf, CeedQFunctionContext *ctx) { @ref Backend **/ -int CeedQFunctionGetContextData(CeedQFunction qf, CeedMemType mem_type, - void *data) { - int ierr; - bool is_writable; +int CeedQFunctionGetContextData(CeedQFunction qf, CeedMemType mem_type, void *data) { + bool is_writable; CeedQFunctionContext ctx; - ierr = CeedQFunctionGetContext(qf, &ctx); CeedChk(ierr); + CeedCall(CeedQFunctionGetContext(qf, &ctx)); if (ctx) { - ierr = CeedQFunctionIsContextWritable(qf, &is_writable); CeedChk(ierr); + CeedCall(CeedQFunctionIsContextWritable(qf, &is_writable)); if (is_writable) { - ierr = CeedQFunctionContextGetData(ctx, mem_type, data); CeedChk(ierr); + 
CeedCall(CeedQFunctionContextGetData(ctx, mem_type, data)); } else { - ierr = CeedQFunctionContextGetDataRead(ctx, mem_type, data); CeedChk(ierr); + CeedCall(CeedQFunctionContextGetDataRead(ctx, mem_type, data)); } } else { *(void **)data = NULL; @@ -389,17 +368,16 @@ int CeedQFunctionGetContextData(CeedQFunction qf, CeedMemType mem_type, @ref Backend **/ int CeedQFunctionRestoreContextData(CeedQFunction qf, void *data) { - int ierr; - bool is_writable; + bool is_writable; CeedQFunctionContext ctx; - ierr = CeedQFunctionGetContext(qf, &ctx); CeedChk(ierr); + CeedCall(CeedQFunctionGetContext(qf, &ctx)); if (ctx) { - ierr = CeedQFunctionIsContextWritable(qf, &is_writable); CeedChk(ierr); + CeedCall(CeedQFunctionIsContextWritable(qf, &is_writable)); if (is_writable) { - ierr = CeedQFunctionContextRestoreData(ctx, data); CeedChk(ierr); + CeedCall(CeedQFunctionContextRestoreData(ctx, data)); } else { - ierr = CeedQFunctionContextRestoreDataRead(ctx, data); CeedChk(ierr); + CeedCall(CeedQFunctionContextRestoreDataRead(ctx, data)); } } return CEED_ERROR_SUCCESS; @@ -418,14 +396,11 @@ int CeedQFunctionRestoreContextData(CeedQFunction qf, void *data) { @ref Backend **/ int CeedQFunctionGetInnerContext(CeedQFunction qf, CeedQFunctionContext *ctx) { - int ierr; if (qf->is_fortran) { CeedFortranContext fortran_ctx = NULL; - ierr = CeedQFunctionContextGetData(qf->ctx, CEED_MEM_HOST, &fortran_ctx); - CeedChk(ierr); + CeedCall(CeedQFunctionContextGetData(qf->ctx, CEED_MEM_HOST, &fortran_ctx)); *ctx = fortran_ctx->inner_ctx; - ierr = CeedQFunctionContextRestoreData(qf->ctx, (void *)&fortran_ctx); - CeedChk(ierr); + CeedCall(CeedQFunctionContextRestoreData(qf->ctx, (void *)&fortran_ctx)); } else { *ctx = qf->ctx; } @@ -444,19 +419,17 @@ int CeedQFunctionGetInnerContext(CeedQFunction qf, CeedQFunctionContext *ctx) { @ref Backend **/ -int CeedQFunctionGetInnerContextData(CeedQFunction qf, CeedMemType mem_type, - void *data) { - int ierr; - bool is_writable; +int 
CeedQFunctionGetInnerContextData(CeedQFunction qf, CeedMemType mem_type, void *data) { + bool is_writable; CeedQFunctionContext ctx; - ierr = CeedQFunctionGetInnerContext(qf, &ctx); CeedChk(ierr); + CeedCall(CeedQFunctionGetInnerContext(qf, &ctx)); if (ctx) { - ierr = CeedQFunctionIsContextWritable(qf, &is_writable); CeedChk(ierr); + CeedCall(CeedQFunctionIsContextWritable(qf, &is_writable)); if (is_writable) { - ierr = CeedQFunctionContextGetData(ctx, mem_type, data); CeedChk(ierr); + CeedCall(CeedQFunctionContextGetData(ctx, mem_type, data)); } else { - ierr = CeedQFunctionContextGetDataRead(ctx, mem_type, data); CeedChk(ierr); + CeedCall(CeedQFunctionContextGetDataRead(ctx, mem_type, data)); } } else { *(void **)data = NULL; @@ -475,17 +448,16 @@ int CeedQFunctionGetInnerContextData(CeedQFunction qf, CeedMemType mem_type, @ref Backend **/ int CeedQFunctionRestoreInnerContextData(CeedQFunction qf, void *data) { - int ierr; - bool is_writable; + bool is_writable; CeedQFunctionContext ctx; - ierr = CeedQFunctionGetInnerContext(qf, &ctx); CeedChk(ierr); + CeedCall(CeedQFunctionGetInnerContext(qf, &ctx)); if (ctx) { - ierr = CeedQFunctionIsContextWritable(qf, &is_writable); CeedChk(ierr); + CeedCall(CeedQFunctionIsContextWritable(qf, &is_writable)); if (is_writable) { - ierr = CeedQFunctionContextRestoreData(ctx, data); CeedChk(ierr); + CeedCall(CeedQFunctionContextRestoreData(ctx, data)); } else { - ierr = CeedQFunctionContextRestoreDataRead(ctx, data); CeedChk(ierr); + CeedCall(CeedQFunctionContextRestoreDataRead(ctx, data)); } } return CEED_ERROR_SUCCESS; @@ -574,11 +546,11 @@ int CeedQFunctionReference(CeedQFunction qf) { @ref Backend **/ int CeedQFunctionGetFlopsEstimate(CeedQFunction qf, CeedSize *flops) { - if (qf->user_flop_estimate == -1) + if (qf->user_flop_estimate == -1) { // LCOV_EXCL_START - return CeedError(qf->ceed, CEED_ERROR_INCOMPLETE, - "Must set FLOPs estimate with CeedQFunctionSetUserFlopsEstimate"); - // LCOV_EXCL_STOP + return 
CeedError(qf->ceed, CEED_ERROR_INCOMPLETE, "Must set FLOPs estimate with CeedQFunctionSetUserFlopsEstimate"); + // LCOV_EXCL_STOP + } *flops = qf->user_flop_estimate; return CEED_ERROR_SUCCESS; } @@ -613,54 +585,50 @@ int CeedQFunctionGetFlopsEstimate(CeedQFunction qf, CeedSize *flops) { @ref User **/ -int CeedQFunctionCreateInterior(Ceed ceed, CeedInt vec_length, - CeedQFunctionUser f, - const char *source, CeedQFunction *qf) { - int ierr; +int CeedQFunctionCreateInterior(Ceed ceed, CeedInt vec_length, CeedQFunctionUser f, const char *source, CeedQFunction *qf) { char *user_source_copy; if (!ceed->QFunctionCreate) { Ceed delegate; - ierr = CeedGetObjectDelegate(ceed, &delegate, "QFunction"); CeedChk(ierr); + CeedCall(CeedGetObjectDelegate(ceed, &delegate, "QFunction")); - if (!delegate) + if (!delegate) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, - "Backend does not support QFunctionCreate"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support QFunctionCreate"); + // LCOV_EXCL_STOP + } - ierr = CeedQFunctionCreateInterior(delegate, vec_length, f, source, qf); - CeedChk(ierr); + CeedCall(CeedQFunctionCreateInterior(delegate, vec_length, f, source, qf)); return CEED_ERROR_SUCCESS; } - if (strlen(source) && !strrchr(source, ':')) + if (strlen(source) && !strrchr(source, ':')) { // LCOV_EXCL_START return CeedError(ceed, CEED_ERROR_INCOMPLETE, - "Provided path to source does not include function name. " - "Provided: \"%s\"\nRequired: \"\\abs_path\\file.h:function_name\"", + "Provided path to source does not include function name. 
Provided: \"%s\"\nRequired: \"\\abs_path\\file.h:function_name\"", source); - // LCOV_EXCL_STOP + // LCOV_EXCL_STOP + } - ierr = CeedCalloc(1, qf); CeedChk(ierr); + CeedCall(CeedCalloc(1, qf)); (*qf)->ceed = ceed; - ierr = CeedReference(ceed); CeedChk(ierr); - (*qf)->ref_count = 1; - (*qf)->vec_length = vec_length; - (*qf)->is_identity = false; + CeedCall(CeedReference(ceed)); + (*qf)->ref_count = 1; + (*qf)->vec_length = vec_length; + (*qf)->is_identity = false; (*qf)->is_context_writable = true; - (*qf)->function = f; - (*qf)->user_flop_estimate = -1; + (*qf)->function = f; + (*qf)->user_flop_estimate = -1; if (strlen(source)) { size_t user_source_len = strlen(source); - ierr = CeedCalloc(user_source_len + 1, &user_source_copy); CeedChk(ierr); + CeedCall(CeedCalloc(user_source_len + 1, &user_source_copy)); memcpy(user_source_copy, source, user_source_len); (*qf)->user_source = user_source_copy; } - ierr = CeedCalloc(CEED_FIELD_MAX, &(*qf)->input_fields); CeedChk(ierr); - ierr = CeedCalloc(CEED_FIELD_MAX, &(*qf)->output_fields); CeedChk(ierr); - ierr = ceed->QFunctionCreate(*qf); CeedChk(ierr); + CeedCall(CeedCalloc(CEED_FIELD_MAX, &(*qf)->input_fields)); + CeedCall(CeedCalloc(CEED_FIELD_MAX, &(*qf)->output_fields)); + CeedCall(ceed->QFunctionCreate(*qf)); return CEED_ERROR_SUCCESS; } @@ -676,41 +644,37 @@ int CeedQFunctionCreateInterior(Ceed ceed, CeedInt vec_length, @ref User **/ -int CeedQFunctionCreateInteriorByName(Ceed ceed, const char *name, - CeedQFunction *qf) { - int ierr; +int CeedQFunctionCreateInteriorByName(Ceed ceed, const char *name, CeedQFunction *qf) { size_t match_len = 0, match_index = UINT_MAX; - ierr = CeedQFunctionRegisterAll(); CeedChk(ierr); + CeedCall(CeedQFunctionRegisterAll()); // Find matching backend - if (!name) return CeedError(ceed, CEED_ERROR_INCOMPLETE, - "No QFunction name provided"); - for (size_t i=0; i match_len) { - match_len = n; + match_len = n; match_index = i; } } - if (!match_len) + if (!match_len) { // LCOV_EXCL_START 
return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "No suitable gallery QFunction"); - // LCOV_EXCL_STOP + // LCOV_EXCL_STOP + } // Create QFunction - ierr = CeedQFunctionCreateInterior(ceed, - gallery_qfunctions[match_index].vec_length, - gallery_qfunctions[match_index].f, - gallery_qfunctions[match_index].source, qf); - CeedChk(ierr); + CeedCall(CeedQFunctionCreateInterior(ceed, gallery_qfunctions[match_index].vec_length, gallery_qfunctions[match_index].f, + gallery_qfunctions[match_index].source, qf)); // QFunction specific setup - ierr = gallery_qfunctions[match_index].init(ceed, name, *qf); CeedChk(ierr); + CeedCall(gallery_qfunctions[match_index].init(ceed, name, *qf)); // Copy name - ierr = CeedStringAllocCopy(name, (char **)&(*qf)->gallery_name); CeedChk(ierr); + CeedCall(CeedStringAllocCopy(name, (char **)&(*qf)->gallery_name)); (*qf)->is_gallery = true; return CEED_ERROR_SUCCESS; } @@ -735,22 +699,18 @@ int CeedQFunctionCreateInteriorByName(Ceed ceed, const char *name, @ref User **/ -int CeedQFunctionCreateIdentity(Ceed ceed, CeedInt size, CeedEvalMode in_mode, - CeedEvalMode out_mode, CeedQFunction *qf) { - int ierr; - - ierr = CeedQFunctionCreateInteriorByName(ceed, "Identity", qf); CeedChk(ierr); - ierr = CeedQFunctionAddInput(*qf, "input", size, in_mode); CeedChk(ierr); - ierr = CeedQFunctionAddOutput(*qf, "output", size, out_mode); CeedChk(ierr); +int CeedQFunctionCreateIdentity(Ceed ceed, CeedInt size, CeedEvalMode in_mode, CeedEvalMode out_mode, CeedQFunction *qf) { + CeedCall(CeedQFunctionCreateInteriorByName(ceed, "Identity", qf)); + CeedCall(CeedQFunctionAddInput(*qf, "input", size, in_mode)); + CeedCall(CeedQFunctionAddOutput(*qf, "output", size, out_mode)); (*qf)->is_identity = true; - CeedQFunctionContext ctx; + CeedQFunctionContext ctx; CeedContextFieldLabel size_label; - ierr = CeedQFunctionGetContext(*qf, &ctx); CeedChk(ierr); - ierr = CeedQFunctionContextGetFieldLabel(ctx, "size", &size_label); - CeedChk(ierr); - ierr = 
CeedQFunctionContextSetInt32(ctx, size_label, &size); CeedChk(ierr); + CeedCall(CeedQFunctionGetContext(*qf, &ctx)); + CeedCall(CeedQFunctionContextGetFieldLabel(ctx, "size", &size_label)); + CeedCall(CeedQFunctionContextSetInt32(ctx, size_label, &size)); return CEED_ERROR_SUCCESS; } @@ -771,10 +731,8 @@ int CeedQFunctionCreateIdentity(Ceed ceed, CeedInt size, CeedEvalMode in_mode, @ref User **/ int CeedQFunctionReferenceCopy(CeedQFunction qf, CeedQFunction *qf_copy) { - int ierr; - - ierr = CeedQFunctionReference(qf); CeedChk(ierr); - ierr = CeedQFunctionDestroy(qf_copy); CeedChk(ierr); + CeedCall(CeedQFunctionReference(qf)); + CeedCall(CeedQFunctionDestroy(qf_copy)); *qf_copy = qf; return CEED_ERROR_SUCCESS; } @@ -794,22 +752,18 @@ int CeedQFunctionReferenceCopy(CeedQFunction qf, CeedQFunction *qf_copy) { @ref User **/ -int CeedQFunctionAddInput(CeedQFunction qf, const char *field_name, - CeedInt size, - CeedEvalMode eval_mode) { - if (qf->is_immutable) +int CeedQFunctionAddInput(CeedQFunction qf, const char *field_name, CeedInt size, CeedEvalMode eval_mode) { + if (qf->is_immutable) { // LCOV_EXCL_START - return CeedError(qf->ceed, CEED_ERROR_MAJOR, - "QFunction cannot be changed after set as immutable"); - // LCOV_EXCL_STOP - if ((eval_mode == CEED_EVAL_WEIGHT) && (size != 1)) + return CeedError(qf->ceed, CEED_ERROR_MAJOR, "QFunction cannot be changed after set as immutable"); + // LCOV_EXCL_STOP + } + if ((eval_mode == CEED_EVAL_WEIGHT) && (size != 1)) { // LCOV_EXCL_START - return CeedError(qf->ceed, CEED_ERROR_DIMENSION, - "CEED_EVAL_WEIGHT should have size 1"); - // LCOV_EXCL_STOP - int ierr = CeedQFunctionFieldSet(&qf->input_fields[qf->num_input_fields], - field_name, size, eval_mode); - CeedChk(ierr); + return CeedError(qf->ceed, CEED_ERROR_DIMENSION, "CEED_EVAL_WEIGHT should have size 1"); + // LCOV_EXCL_STOP + } + CeedCall(CeedQFunctionFieldSet(&qf->input_fields[qf->num_input_fields], field_name, size, eval_mode)); qf->num_input_fields++; return 
CEED_ERROR_SUCCESS; } @@ -829,22 +783,18 @@ int CeedQFunctionAddInput(CeedQFunction qf, const char *field_name, @ref User **/ -int CeedQFunctionAddOutput(CeedQFunction qf, const char *field_name, - CeedInt size, CeedEvalMode eval_mode) { - if (qf->is_immutable) +int CeedQFunctionAddOutput(CeedQFunction qf, const char *field_name, CeedInt size, CeedEvalMode eval_mode) { + if (qf->is_immutable) { // LCOV_EXCL_START - return CeedError(qf->ceed, CEED_ERROR_MAJOR, - "QFunction cannot be changed after set as immutable"); - // LCOV_EXCL_STOP - if (eval_mode == CEED_EVAL_WEIGHT) + return CeedError(qf->ceed, CEED_ERROR_MAJOR, "QFunction cannot be changed after set as immutable"); + // LCOV_EXCL_STOP + } + if (eval_mode == CEED_EVAL_WEIGHT) { // LCOV_EXCL_START - return CeedError(qf->ceed, CEED_ERROR_DIMENSION, - "Cannot create QFunction output with " - "CEED_EVAL_WEIGHT"); - // LCOV_EXCL_STOP - int ierr = CeedQFunctionFieldSet(&qf->output_fields[qf->num_output_fields], - field_name, size, eval_mode); - CeedChk(ierr); + return CeedError(qf->ceed, CEED_ERROR_DIMENSION, "Cannot create QFunction output with CEED_EVAL_WEIGHT"); + // LCOV_EXCL_STOP + } + CeedCall(CeedQFunctionFieldSet(&qf->output_fields[qf->num_output_fields], field_name, size, eval_mode)); qf->num_output_fields++; return CEED_ERROR_SUCCESS; } @@ -865,9 +815,7 @@ int CeedQFunctionAddOutput(CeedQFunction qf, const char *field_name, @ref Advanced **/ -int CeedQFunctionGetFields(CeedQFunction qf, CeedInt *num_input_fields, - CeedQFunctionField **input_fields, - CeedInt *num_output_fields, +int CeedQFunctionGetFields(CeedQFunction qf, CeedInt *num_input_fields, CeedQFunctionField **input_fields, CeedInt *num_output_fields, CeedQFunctionField **output_fields) { qf->is_immutable = true; if (num_input_fields) *num_input_fields = qf->num_input_fields; @@ -917,8 +865,7 @@ int CeedQFunctionFieldGetSize(CeedQFunctionField qf_field, CeedInt *size) { @ref Advanced **/ -int CeedQFunctionFieldGetEvalMode(CeedQFunctionField 
qf_field, - CeedEvalMode *eval_mode) { +int CeedQFunctionFieldGetEvalMode(CeedQFunctionField qf_field, CeedEvalMode *eval_mode) { *eval_mode = qf_field->eval_mode; return CEED_ERROR_SUCCESS; } @@ -934,11 +881,10 @@ int CeedQFunctionFieldGetEvalMode(CeedQFunctionField qf_field, @ref User **/ int CeedQFunctionSetContext(CeedQFunction qf, CeedQFunctionContext ctx) { - int ierr; - ierr = CeedQFunctionContextDestroy(&qf->ctx); CeedChk(ierr); + CeedCall(CeedQFunctionContextDestroy(&qf->ctx)); qf->ctx = ctx; if (ctx) { - ierr = CeedQFunctionContextReference(ctx); CeedChk(ierr); + CeedCall(CeedQFunctionContextReference(ctx)); } return CEED_ERROR_SUCCESS; } @@ -978,11 +924,11 @@ int CeedQFunctionSetContextWritable(CeedQFunction qf, bool is_writable) { @ref Backend **/ int CeedQFunctionSetUserFlopsEstimate(CeedQFunction qf, CeedSize flops) { - if (flops < 0) + if (flops < 0) { // LCOV_EXCL_START - return CeedError(qf->ceed, CEED_ERROR_INCOMPATIBLE, - "Must set non-negative FLOPs estimate"); - // LCOV_EXCL_STOP + return CeedError(qf->ceed, CEED_ERROR_INCOMPATIBLE, "Must set non-negative FLOPs estimate"); + // LCOV_EXCL_STOP + } qf->user_flop_estimate = flops; return CEED_ERROR_SUCCESS; } @@ -998,26 +944,19 @@ int CeedQFunctionSetUserFlopsEstimate(CeedQFunction qf, CeedSize flops) { @ref User **/ int CeedQFunctionView(CeedQFunction qf, FILE *stream) { - int ierr; char *kernel_name; - ierr = CeedQFunctionGetKernelName(qf, &kernel_name); CeedChk(ierr); - fprintf(stream, "%sCeedQFunction - %s\n", - qf->is_gallery ? "Gallery " : "User ", - qf->is_gallery ? qf->gallery_name : kernel_name); + CeedCall(CeedQFunctionGetKernelName(qf, &kernel_name)); + fprintf(stream, "%sCeedQFunction - %s\n", qf->is_gallery ? "Gallery " : "User ", qf->is_gallery ? qf->gallery_name : kernel_name); - fprintf(stream, " %" CeedInt_FMT " input field%s:\n", qf->num_input_fields, - qf->num_input_fields>1 ? 
"s" : ""); - for (CeedInt i=0; inum_input_fields; i++) { - ierr = CeedQFunctionFieldView(qf->input_fields[i], i, 1, stream); - CeedChk(ierr); + fprintf(stream, " %" CeedInt_FMT " input field%s:\n", qf->num_input_fields, qf->num_input_fields > 1 ? "s" : ""); + for (CeedInt i = 0; i < qf->num_input_fields; i++) { + CeedCall(CeedQFunctionFieldView(qf->input_fields[i], i, 1, stream)); } - fprintf(stream, " %" CeedInt_FMT " output field%s:\n", qf->num_output_fields, - qf->num_output_fields>1 ? "s" : ""); - for (CeedInt i=0; inum_output_fields; i++) { - ierr = CeedQFunctionFieldView(qf->output_fields[i], i, 0, stream); - CeedChk(ierr); + fprintf(stream, " %" CeedInt_FMT " output field%s:\n", qf->num_output_fields, qf->num_output_fields > 1 ? "s" : ""); + for (CeedInt i = 0; i < qf->num_output_fields; i++) { + CeedCall(CeedQFunctionFieldView(qf->output_fields[i], i, 0, stream)); } return CEED_ERROR_SUCCESS; } @@ -1052,22 +991,20 @@ int CeedQFunctionGetCeed(CeedQFunction qf, Ceed *ceed) { @ref User **/ -int CeedQFunctionApply(CeedQFunction qf, CeedInt Q, - CeedVector *u, CeedVector *v) { - int ierr; - if (!qf->Apply) +int CeedQFunctionApply(CeedQFunction qf, CeedInt Q, CeedVector *u, CeedVector *v) { + if (!qf->Apply) { // LCOV_EXCL_START - return CeedError(qf->ceed, CEED_ERROR_UNSUPPORTED, - "Backend does not support QFunctionApply"); - // LCOV_EXCL_STOP - if (Q % qf->vec_length) + return CeedError(qf->ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support QFunctionApply"); + // LCOV_EXCL_STOP + } + if (Q % qf->vec_length) { // LCOV_EXCL_START - return CeedError(qf->ceed, CEED_ERROR_DIMENSION, - "Number of quadrature points %" CeedInt_FMT " must be a " - "multiple of %" CeedInt_FMT, Q, qf->vec_length); - // LCOV_EXCL_STOP + return CeedError(qf->ceed, CEED_ERROR_DIMENSION, "Number of quadrature points %" CeedInt_FMT " must be a multiple of %" CeedInt_FMT, Q, + qf->vec_length); + // LCOV_EXCL_STOP + } qf->is_immutable = true; - ierr = qf->Apply(qf, Q, u, v); CeedChk(ierr); 
+ CeedCall(qf->Apply(qf, Q, u, v)); return CEED_ERROR_SUCCESS; } @@ -1081,34 +1018,32 @@ int CeedQFunctionApply(CeedQFunction qf, CeedInt Q, @ref User **/ int CeedQFunctionDestroy(CeedQFunction *qf) { - int ierr; - if (!*qf || --(*qf)->ref_count > 0) return CEED_ERROR_SUCCESS; // Backend destroy if ((*qf)->Destroy) { - ierr = (*qf)->Destroy(*qf); CeedChk(ierr); + CeedCall((*qf)->Destroy(*qf)); } // Free fields - for (CeedInt i=0; i<(*qf)->num_input_fields; i++) { - ierr = CeedFree(&(*(*qf)->input_fields[i]).field_name); CeedChk(ierr); - ierr = CeedFree(&(*qf)->input_fields[i]); CeedChk(ierr); + for (CeedInt i = 0; i < (*qf)->num_input_fields; i++) { + CeedCall(CeedFree(&(*(*qf)->input_fields[i]).field_name)); + CeedCall(CeedFree(&(*qf)->input_fields[i])); } - for (CeedInt i=0; i<(*qf)->num_output_fields; i++) { - ierr = CeedFree(&(*(*qf)->output_fields[i]).field_name); CeedChk(ierr); - ierr = CeedFree(&(*qf)->output_fields[i]); CeedChk(ierr); + for (CeedInt i = 0; i < (*qf)->num_output_fields; i++) { + CeedCall(CeedFree(&(*(*qf)->output_fields[i]).field_name)); + CeedCall(CeedFree(&(*qf)->output_fields[i])); } - ierr = CeedFree(&(*qf)->input_fields); CeedChk(ierr); - ierr = CeedFree(&(*qf)->output_fields); CeedChk(ierr); + CeedCall(CeedFree(&(*qf)->input_fields)); + CeedCall(CeedFree(&(*qf)->output_fields)); // User context data object - ierr = CeedQFunctionContextDestroy(&(*qf)->ctx); CeedChk(ierr); - - ierr = CeedFree(&(*qf)->user_source); CeedChk(ierr); - ierr = CeedFree(&(*qf)->source_path); CeedChk(ierr); - ierr = CeedFree(&(*qf)->gallery_name); CeedChk(ierr); - ierr = CeedFree(&(*qf)->kernel_name); CeedChk(ierr); - ierr = CeedDestroy(&(*qf)->ceed); CeedChk(ierr); - ierr = CeedFree(qf); CeedChk(ierr); + CeedCall(CeedQFunctionContextDestroy(&(*qf)->ctx)); + + CeedCall(CeedFree(&(*qf)->user_source)); + CeedCall(CeedFree(&(*qf)->source_path)); + CeedCall(CeedFree(&(*qf)->gallery_name)); + CeedCall(CeedFree(&(*qf)->kernel_name)); + 
CeedCall(CeedDestroy(&(*qf)->ceed)); + CeedCall(CeedFree(qf)); return CEED_ERROR_SUCCESS; } diff --git a/interface/ceed-qfunctioncontext.c b/interface/ceed-qfunctioncontext.c index 788c686d5d..2d50a9f1ed 100644 --- a/interface/ceed-qfunctioncontext.c +++ b/interface/ceed-qfunctioncontext.c @@ -5,9 +5,9 @@ // // This file is part of CEED: http://github.com/ceed -#include -#include #include +#include +#include #include #include #include @@ -32,12 +32,11 @@ @ref Developer **/ -int CeedQFunctionContextGetFieldIndex(CeedQFunctionContext ctx, - const char *field_name, CeedInt *field_index) { +int CeedQFunctionContextGetFieldIndex(CeedQFunctionContext ctx, const char *field_name, CeedInt *field_index) { *field_index = -1; - for (CeedInt i=0; inum_fields; i++) - if (!strcmp(ctx->field_labels[i]->name, field_name)) - *field_index = i; + for (CeedInt i = 0; i < ctx->num_fields; i++) { + if (!strcmp(ctx->field_labels[i]->name, field_name)) *field_index = i; + } return CEED_ERROR_SUCCESS; } @@ -56,45 +55,33 @@ int CeedQFunctionContextGetFieldIndex(CeedQFunctionContext ctx, @ref Developer **/ -int CeedQFunctionContextRegisterGeneric(CeedQFunctionContext ctx, - const char *field_name, size_t field_offset, - const char *field_description, - CeedContextFieldType field_type, - size_t field_size, size_t num_values) { - int ierr; - +int CeedQFunctionContextRegisterGeneric(CeedQFunctionContext ctx, const char *field_name, size_t field_offset, const char *field_description, + CeedContextFieldType field_type, size_t field_size, size_t num_values) { // Check for duplicate CeedInt field_index = -1; - ierr = CeedQFunctionContextGetFieldIndex(ctx, field_name, &field_index); - CeedChk(ierr); - if (field_index != -1) + CeedCall(CeedQFunctionContextGetFieldIndex(ctx, field_name, &field_index)); + if (field_index != -1) { // LCOV_EXCL_START - return CeedError(ctx->ceed, CEED_ERROR_UNSUPPORTED, - "QFunctionContext field with name \"%s\" already registered", - field_name); - // LCOV_EXCL_STOP + 
return CeedError(ctx->ceed, CEED_ERROR_UNSUPPORTED, "QFunctionContext field with name \"%s\" already registered", field_name); + // LCOV_EXCL_STOP + } // Allocate space for field data if (ctx->num_fields == 0) { - ierr = CeedCalloc(1, &ctx->field_labels); CeedChk(ierr); + CeedCall(CeedCalloc(1, &ctx->field_labels)); ctx->max_fields = 1; } else if (ctx->num_fields == ctx->max_fields) { - ierr = CeedRealloc(2*ctx->max_fields, &ctx->field_labels); - CeedChk(ierr); + CeedCall(CeedRealloc(2 * ctx->max_fields, &ctx->field_labels)); ctx->max_fields *= 2; } - ierr = CeedCalloc(1, &ctx->field_labels[ctx->num_fields]); CeedChk(ierr); + CeedCall(CeedCalloc(1, &ctx->field_labels[ctx->num_fields])); // Copy field data - ierr = CeedStringAllocCopy(field_name, - (char **)&ctx->field_labels[ctx->num_fields]->name); - CeedChk(ierr); - ierr = CeedStringAllocCopy(field_description, - (char **)&ctx->field_labels[ctx->num_fields]->description); - CeedChk(ierr); - ctx->field_labels[ctx->num_fields]->type = field_type; - ctx->field_labels[ctx->num_fields]->offset = field_offset; - ctx->field_labels[ctx->num_fields]->size = field_size * num_values; + CeedCall(CeedStringAllocCopy(field_name, (char **)&ctx->field_labels[ctx->num_fields]->name)); + CeedCall(CeedStringAllocCopy(field_description, (char **)&ctx->field_labels[ctx->num_fields]->description)); + ctx->field_labels[ctx->num_fields]->type = field_type; + ctx->field_labels[ctx->num_fields]->offset = field_offset; + ctx->field_labels[ctx->num_fields]->size = field_size * num_values; ctx->field_labels[ctx->num_fields]->num_values = num_values; ctx->num_fields++; return CEED_ERROR_SUCCESS; @@ -111,23 +98,19 @@ int CeedQFunctionContextRegisterGeneric(CeedQFunctionContext ctx, @ref Developer **/ static int CeedQFunctionContextDestroyData(CeedQFunctionContext ctx) { - int ierr; - if (ctx->DataDestroy) { - ierr = ctx->DataDestroy(ctx); CeedChk(ierr); + CeedCall(ctx->DataDestroy(ctx)); } else { CeedQFunctionContextDataDestroyUser 
data_destroy_function; - CeedMemType data_destroy_mem_type; + CeedMemType data_destroy_mem_type; - ierr = CeedQFunctionContextGetDataDestroy(ctx, &data_destroy_mem_type, - &data_destroy_function); CeedChk(ierr); + CeedCall(CeedQFunctionContextGetDataDestroy(ctx, &data_destroy_mem_type, &data_destroy_function)); if (data_destroy_function) { void *data; - ierr = CeedQFunctionContextGetData(ctx, data_destroy_mem_type, &data); - CeedChk(ierr); - ierr = data_destroy_function(data); CeedChk(ierr); - ierr = CeedQFunctionContextRestoreData(ctx, &data); CeedChk(ierr); + CeedCall(CeedQFunctionContextGetData(ctx, data_destroy_mem_type, &data)); + CeedCall(data_destroy_function(data)); + CeedCall(CeedQFunctionContextRestoreData(ctx, &data)); } } @@ -167,17 +150,14 @@ int CeedQFunctionContextGetCeed(CeedQFunctionContext ctx, Ceed *ceed) { @ref Backend **/ -int CeedQFunctionContextHasValidData(CeedQFunctionContext ctx, - bool *has_valid_data) { - int ierr; - - if (!ctx->HasValidData) +int CeedQFunctionContextHasValidData(CeedQFunctionContext ctx, bool *has_valid_data) { + if (!ctx->HasValidData) { // LCOV_EXCL_START - return CeedError(ctx->ceed, CEED_ERROR_UNSUPPORTED, - "Backend does not support HasValidData"); - // LCOV_EXCL_STOP + return CeedError(ctx->ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support HasValidData"); + // LCOV_EXCL_STOP + } - ierr = ctx->HasValidData(ctx, has_valid_data); CeedChk(ierr); + CeedCall(ctx->HasValidData(ctx, has_valid_data)); return CEED_ERROR_SUCCESS; } @@ -194,18 +174,14 @@ int CeedQFunctionContextHasValidData(CeedQFunctionContext ctx, @ref Backend **/ -int CeedQFunctionContextHasBorrowedDataOfType(CeedQFunctionContext ctx, - CeedMemType mem_type, bool *has_borrowed_data_of_type) { - int ierr; - - if (!ctx->HasBorrowedDataOfType) +int CeedQFunctionContextHasBorrowedDataOfType(CeedQFunctionContext ctx, CeedMemType mem_type, bool *has_borrowed_data_of_type) { + if (!ctx->HasBorrowedDataOfType) { // LCOV_EXCL_START - return 
CeedError(ctx->ceed, CEED_ERROR_UNSUPPORTED, - "Backend does not support HasBorrowedDataOfType"); - // LCOV_EXCL_STOP + return CeedError(ctx->ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support HasBorrowedDataOfType"); + // LCOV_EXCL_STOP + } - ierr = ctx->HasBorrowedDataOfType(ctx, mem_type, has_borrowed_data_of_type); - CeedChk(ierr); + CeedCall(ctx->HasBorrowedDataOfType(ctx, mem_type, has_borrowed_data_of_type)); return CEED_ERROR_SUCCESS; } @@ -267,14 +243,9 @@ int CeedQFunctionContextSetBackendData(CeedQFunctionContext ctx, void *data) { @ref Backend **/ -int CeedQFunctionContextGetFieldLabel(CeedQFunctionContext ctx, - const char *field_name, - CeedContextFieldLabel *field_label) { - int ierr; - +int CeedQFunctionContextGetFieldLabel(CeedQFunctionContext ctx, const char *field_name, CeedContextFieldLabel *field_label) { CeedInt field_index; - ierr = CeedQFunctionContextGetFieldIndex(ctx, field_name, &field_index); - CeedChk(ierr); + CeedCall(CeedQFunctionContextGetFieldIndex(ctx, field_name, &field_index)); if (field_index != -1) { *field_label = ctx->field_labels[field_index]; @@ -297,26 +268,19 @@ int CeedQFunctionContextGetFieldLabel(CeedQFunctionContext ctx, @ref Backend **/ -int CeedQFunctionContextSetGeneric(CeedQFunctionContext ctx, - CeedContextFieldLabel field_label, - CeedContextFieldType field_type, - void *value) { - int ierr; - +int CeedQFunctionContextSetGeneric(CeedQFunctionContext ctx, CeedContextFieldLabel field_label, CeedContextFieldType field_type, void *value) { // Check field type - if (field_label->type != field_type) + if (field_label->type != field_type) { // LCOV_EXCL_START - return CeedError(ctx->ceed, CEED_ERROR_UNSUPPORTED, - "QFunctionContext field with name \"%s\" registered as %s, " - "not registered as %s", field_label->name, - CeedContextFieldTypes[field_label->type], - CeedContextFieldTypes[field_type]); - // LCOV_EXCL_STOP + return CeedError(ctx->ceed, CEED_ERROR_UNSUPPORTED, "QFunctionContext field with name \"%s\" 
registered as %s, not registered as %s", + field_label->name, CeedContextFieldTypes[field_label->type], CeedContextFieldTypes[field_type]); + // LCOV_EXCL_STOP + } char *data; - ierr = CeedQFunctionContextGetData(ctx, CEED_MEM_HOST, &data); CeedChk(ierr); + CeedCall(CeedQFunctionContextGetData(ctx, CEED_MEM_HOST, &data)); memcpy(&data[field_label->offset], value, field_label->size); - ierr = CeedQFunctionContextRestoreData(ctx, &data); CeedChk(ierr); + CeedCall(CeedQFunctionContextRestoreData(ctx, &data)); return CEED_ERROR_SUCCESS; } @@ -332,19 +296,14 @@ int CeedQFunctionContextSetGeneric(CeedQFunctionContext ctx, @ref Backend **/ -int CeedQFunctionContextSetDouble(CeedQFunctionContext ctx, - CeedContextFieldLabel field_label, double *values) { - int ierr; - - if (!field_label) +int CeedQFunctionContextSetDouble(CeedQFunctionContext ctx, CeedContextFieldLabel field_label, double *values) { + if (!field_label) { // LCOV_EXCL_START - return CeedError(ctx->ceed, CEED_ERROR_UNSUPPORTED, - "Invalid field label"); - // LCOV_EXCL_STOP + return CeedError(ctx->ceed, CEED_ERROR_UNSUPPORTED, "Invalid field label"); + // LCOV_EXCL_STOP + } - ierr = CeedQFunctionContextSetGeneric(ctx, field_label, - CEED_CONTEXT_FIELD_DOUBLE, - values); CeedChk(ierr); + CeedCall(CeedQFunctionContextSetGeneric(ctx, field_label, CEED_CONTEXT_FIELD_DOUBLE, values)); return CEED_ERROR_SUCCESS; } @@ -360,19 +319,14 @@ int CeedQFunctionContextSetDouble(CeedQFunctionContext ctx, @ref Backend **/ -int CeedQFunctionContextSetInt32(CeedQFunctionContext ctx, - CeedContextFieldLabel field_label, int *values) { - int ierr; - - if (!field_label) +int CeedQFunctionContextSetInt32(CeedQFunctionContext ctx, CeedContextFieldLabel field_label, int *values) { + if (!field_label) { // LCOV_EXCL_START - return CeedError(ctx->ceed, CEED_ERROR_UNSUPPORTED, - "Invalid field label"); - // LCOV_EXCL_STOP + return CeedError(ctx->ceed, CEED_ERROR_UNSUPPORTED, "Invalid field label"); + // LCOV_EXCL_STOP + } - ierr = 
CeedQFunctionContextSetGeneric(ctx, field_label, - CEED_CONTEXT_FIELD_INT32, - values); CeedChk(ierr); + CeedCall(CeedQFunctionContextSetGeneric(ctx, field_label, CEED_CONTEXT_FIELD_INT32, values)); return CEED_ERROR_SUCCESS; } @@ -388,8 +342,7 @@ int CeedQFunctionContextSetInt32(CeedQFunctionContext ctx, @ref Backend **/ -int CeedQFunctionContextGetDataDestroy(CeedQFunctionContext ctx, - CeedMemType *f_mem_type, CeedQFunctionContextDataDestroyUser *f) { +int CeedQFunctionContextGetDataDestroy(CeedQFunctionContext ctx, CeedMemType *f_mem_type, CeedQFunctionContextDataDestroyUser *f) { if (f_mem_type) *f_mem_type = ctx->data_destroy_mem_type; if (f) *f = ctx->data_destroy_function; return CEED_ERROR_SUCCESS; @@ -429,27 +382,25 @@ int CeedQFunctionContextReference(CeedQFunctionContext ctx) { @ref User **/ int CeedQFunctionContextCreate(Ceed ceed, CeedQFunctionContext *ctx) { - int ierr; - if (!ceed->QFunctionContextCreate) { Ceed delegate; - ierr = CeedGetObjectDelegate(ceed, &delegate, "Context"); CeedChk(ierr); + CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Context")); - if (!delegate) + if (!delegate) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, - "Backend does not support ContextCreate"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support ContextCreate"); + // LCOV_EXCL_STOP + } - ierr = CeedQFunctionContextCreate(delegate, ctx); CeedChk(ierr); + CeedCall(CeedQFunctionContextCreate(delegate, ctx)); return CEED_ERROR_SUCCESS; } - ierr = CeedCalloc(1, ctx); CeedChk(ierr); + CeedCall(CeedCalloc(1, ctx)); (*ctx)->ceed = ceed; - ierr = CeedReference(ceed); CeedChk(ierr); + CeedCall(CeedReference(ceed)); (*ctx)->ref_count = 1; - ierr = ceed->QFunctionContextCreate(*ctx); CeedChk(ierr); + CeedCall(ceed->QFunctionContextCreate(*ctx)); return CEED_ERROR_SUCCESS; } @@ -468,12 +419,9 @@ int CeedQFunctionContextCreate(Ceed ceed, CeedQFunctionContext *ctx) { @ref User **/ -int 
CeedQFunctionContextReferenceCopy(CeedQFunctionContext ctx, - CeedQFunctionContext *ctx_copy) { - int ierr; - - ierr = CeedQFunctionContextReference(ctx); CeedChk(ierr); - ierr = CeedQFunctionContextDestroy(ctx_copy); CeedChk(ierr); +int CeedQFunctionContextReferenceCopy(CeedQFunctionContext ctx, CeedQFunctionContext *ctx_copy) { + CeedCall(CeedQFunctionContextReference(ctx)); + CeedCall(CeedQFunctionContextDestroy(ctx_copy)); *ctx_copy = ctx; return CEED_ERROR_SUCCESS; } @@ -494,27 +442,21 @@ int CeedQFunctionContextReferenceCopy(CeedQFunctionContext ctx, @ref User **/ -int CeedQFunctionContextSetData(CeedQFunctionContext ctx, CeedMemType mem_type, - CeedCopyMode copy_mode, - size_t size, void *data) { - int ierr; - - if (!ctx->SetData) +int CeedQFunctionContextSetData(CeedQFunctionContext ctx, CeedMemType mem_type, CeedCopyMode copy_mode, size_t size, void *data) { + if (!ctx->SetData) { // LCOV_EXCL_START - return CeedError(ctx->ceed, CEED_ERROR_UNSUPPORTED, - "Backend does not support ContextSetData"); - // LCOV_EXCL_STOP - - if (ctx->state % 2 == 1) + return CeedError(ctx->ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support ContextSetData"); + // LCOV_EXCL_STOP + } + if (ctx->state % 2 == 1) { // LCOV_EXCL_START - return CeedError(ctx->ceed, 1, - "Cannot grant CeedQFunctionContext data access, the " - "access lock is already in use"); - // LCOV_EXCL_STOP + return CeedError(ctx->ceed, 1, "Cannot grant CeedQFunctionContext data access, the access lock is already in use"); + // LCOV_EXCL_STOP + } - ierr = CeedQFunctionContextDestroyData(ctx); CeedChk(ierr); + CeedCall(CeedQFunctionContextDestroyData(ctx)); ctx->ctx_size = size; - ierr = ctx->SetData(ctx, mem_type, copy_mode, data); CeedChk(ierr); + CeedCall(ctx->SetData(ctx, mem_type, copy_mode, data)); ctx->state += 2; return CEED_ERROR_SUCCESS; } @@ -532,44 +474,37 @@ int CeedQFunctionContextSetData(CeedQFunctionContext ctx, CeedMemType mem_type, @ref User **/ -int 
CeedQFunctionContextTakeData(CeedQFunctionContext ctx, CeedMemType mem_type, - void *data) { - int ierr; - +int CeedQFunctionContextTakeData(CeedQFunctionContext ctx, CeedMemType mem_type, void *data) { bool has_valid_data = true; - ierr = CeedQFunctionContextHasValidData(ctx, &has_valid_data); CeedChk(ierr); - if (!has_valid_data) + CeedCall(CeedQFunctionContextHasValidData(ctx, &has_valid_data)); + if (!has_valid_data) { // LCOV_EXCL_START - return CeedError(ctx->ceed, CEED_ERROR_BACKEND, - "CeedQFunctionContext has no valid data to take, must set data"); - // LCOV_EXCL_STOP + return CeedError(ctx->ceed, CEED_ERROR_BACKEND, "CeedQFunctionContext has no valid data to take, must set data"); + // LCOV_EXCL_STOP + } - if (!ctx->TakeData) + if (!ctx->TakeData) { // LCOV_EXCL_START - return CeedError(ctx->ceed, CEED_ERROR_UNSUPPORTED, - "Backend does not support TakeData"); - // LCOV_EXCL_STOP - - if (ctx->state % 2 == 1) + return CeedError(ctx->ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support TakeData"); + // LCOV_EXCL_STOP + } + if (ctx->state % 2 == 1) { // LCOV_EXCL_START - return CeedError(ctx->ceed, 1, - "Cannot grant CeedQFunctionContext data access, the " - "access lock is already in use"); - // LCOV_EXCL_STOP + return CeedError(ctx->ceed, 1, "Cannot grant CeedQFunctionContext data access, the access lock is already in use"); + // LCOV_EXCL_STOP + } bool has_borrowed_data_of_type = true; - ierr = CeedQFunctionContextHasBorrowedDataOfType(ctx, mem_type, - &has_borrowed_data_of_type); CeedChk(ierr); - if (!has_borrowed_data_of_type) + CeedCall(CeedQFunctionContextHasBorrowedDataOfType(ctx, mem_type, &has_borrowed_data_of_type)); + if (!has_borrowed_data_of_type) { // LCOV_EXCL_START - return CeedError(ctx->ceed, CEED_ERROR_BACKEND, - "CeedQFunctionContext has no borowed %s data, " - "must set data with CeedQFunctionContextSetData", + return CeedError(ctx->ceed, CEED_ERROR_BACKEND, "CeedQFunctionContext has no borowed %s data, must set data with 
CeedQFunctionContextSetData", CeedMemTypes[mem_type]); - // LCOV_EXCL_STOP + // LCOV_EXCL_STOP + } void *temp_data = NULL; - ierr = ctx->TakeData(ctx, mem_type, &temp_data); CeedChk(ierr); + CeedCall(ctx->TakeData(ctx, mem_type, &temp_data)); if (data) (*(void **)data) = temp_data; return CEED_ERROR_SUCCESS; } @@ -591,39 +526,32 @@ int CeedQFunctionContextTakeData(CeedQFunctionContext ctx, CeedMemType mem_type, @ref User **/ -int CeedQFunctionContextGetData(CeedQFunctionContext ctx, CeedMemType mem_type, - void *data) { - int ierr; - - if (!ctx->GetData) +int CeedQFunctionContextGetData(CeedQFunctionContext ctx, CeedMemType mem_type, void *data) { + if (!ctx->GetData) { // LCOV_EXCL_START - return CeedError(ctx->ceed, CEED_ERROR_UNSUPPORTED, - "Backend does not support GetData"); - // LCOV_EXCL_STOP - - if (ctx->state % 2 == 1) + return CeedError(ctx->ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support GetData"); + // LCOV_EXCL_STOP + } + if (ctx->state % 2 == 1) { // LCOV_EXCL_START - return CeedError(ctx->ceed, 1, - "Cannot grant CeedQFunctionContext data access, the " - "access lock is already in use"); - // LCOV_EXCL_STOP - - if (ctx->num_readers > 0) + return CeedError(ctx->ceed, 1, "Cannot grant CeedQFunctionContext data access, the access lock is already in use"); + // LCOV_EXCL_STOP + } + if (ctx->num_readers > 0) { // LCOV_EXCL_START - return CeedError(ctx->ceed, 1, - "Cannot grant CeedQFunctionContext data access, a " - "process has read access"); - // LCOV_EXCL_STOP + return CeedError(ctx->ceed, 1, "Cannot grant CeedQFunctionContext data access, a process has read access"); + // LCOV_EXCL_STOP + } bool has_valid_data = true; - ierr = CeedQFunctionContextHasValidData(ctx, &has_valid_data); CeedChk(ierr); - if (!has_valid_data) + CeedCall(CeedQFunctionContextHasValidData(ctx, &has_valid_data)); + if (!has_valid_data) { // LCOV_EXCL_START - return CeedError(ctx->ceed, CEED_ERROR_BACKEND, - "CeedQFunctionContext has no valid data to get, must set data"); 
- // LCOV_EXCL_STOP + return CeedError(ctx->ceed, CEED_ERROR_BACKEND, "CeedQFunctionContext has no valid data to get, must set data"); + // LCOV_EXCL_STOP + } - ierr = ctx->GetData(ctx, mem_type, data); CeedChk(ierr); + CeedCall(ctx->GetData(ctx, mem_type, data)); ctx->state++; return CEED_ERROR_SUCCESS; } @@ -645,33 +573,27 @@ int CeedQFunctionContextGetData(CeedQFunctionContext ctx, CeedMemType mem_type, @ref User **/ -int CeedQFunctionContextGetDataRead(CeedQFunctionContext ctx, - CeedMemType mem_type, - void *data) { - int ierr; - - if (!ctx->GetDataRead) +int CeedQFunctionContextGetDataRead(CeedQFunctionContext ctx, CeedMemType mem_type, void *data) { + if (!ctx->GetDataRead) { // LCOV_EXCL_START - return CeedError(ctx->ceed, CEED_ERROR_UNSUPPORTED, - "Backend does not support GetDataRead"); - // LCOV_EXCL_STOP - - if (ctx->state % 2 == 1) + return CeedError(ctx->ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support GetDataRead"); + // LCOV_EXCL_STOP + } + if (ctx->state % 2 == 1) { // LCOV_EXCL_START - return CeedError(ctx->ceed, 1, - "Cannot grant CeedQFunctionContext data access, the " - "access lock is already in use"); - // LCOV_EXCL_STOP + return CeedError(ctx->ceed, 1, "Cannot grant CeedQFunctionContext data access, the access lock is already in use"); + // LCOV_EXCL_STOP + } bool has_valid_data = true; - ierr = CeedQFunctionContextHasValidData(ctx, &has_valid_data); CeedChk(ierr); - if (!has_valid_data) + CeedCall(CeedQFunctionContextHasValidData(ctx, &has_valid_data)); + if (!has_valid_data) { // LCOV_EXCL_START - return CeedError(ctx->ceed, CEED_ERROR_BACKEND, - "CeedQFunctionContext has no valid data to get, must set data"); - // LCOV_EXCL_STOP + return CeedError(ctx->ceed, CEED_ERROR_BACKEND, "CeedQFunctionContext has no valid data to get, must set data"); + // LCOV_EXCL_STOP + } - ierr = ctx->GetDataRead(ctx, mem_type, data); CeedChk(ierr); + CeedCall(ctx->GetDataRead(ctx, mem_type, data)); ctx->num_readers++; return CEED_ERROR_SUCCESS; } @@ 
-687,17 +609,14 @@ int CeedQFunctionContextGetDataRead(CeedQFunctionContext ctx, @ref User **/ int CeedQFunctionContextRestoreData(CeedQFunctionContext ctx, void *data) { - int ierr; - - if (ctx->state % 2 != 1) + if (ctx->state % 2 != 1) { // LCOV_EXCL_START - return CeedError(ctx->ceed, 1, - "Cannot restore CeedQFunctionContext array access, " - "access was not granted"); - // LCOV_EXCL_STOP + return CeedError(ctx->ceed, 1, "Cannot restore CeedQFunctionContext array access, access was not granted"); + // LCOV_EXCL_STOP + } if (ctx->RestoreData) { - ierr = ctx->RestoreData(ctx); CeedChk(ierr); + CeedCall(ctx->RestoreData(ctx)); } *(void **)data = NULL; ctx->state++; @@ -715,18 +634,15 @@ int CeedQFunctionContextRestoreData(CeedQFunctionContext ctx, void *data) { @ref User **/ int CeedQFunctionContextRestoreDataRead(CeedQFunctionContext ctx, void *data) { - int ierr; - - if (ctx->num_readers == 0) + if (ctx->num_readers == 0) { // LCOV_EXCL_START - return CeedError(ctx->ceed, 1, - "Cannot restore CeedQFunctionContext array access, " - "access was not granted"); - // LCOV_EXCL_STOP + return CeedError(ctx->ceed, 1, "Cannot restore CeedQFunctionContext array access, access was not granted"); + // LCOV_EXCL_STOP + } ctx->num_readers--; if (ctx->num_readers == 0 && ctx->RestoreDataRead) { - ierr = ctx->RestoreDataRead(ctx); CeedChk(ierr); + CeedCall(ctx->RestoreDataRead(ctx)); } *(void **)data = NULL; @@ -746,12 +662,9 @@ int CeedQFunctionContextRestoreDataRead(CeedQFunctionContext ctx, void *data) { @ref User **/ -int CeedQFunctionContextRegisterDouble(CeedQFunctionContext ctx, - const char *field_name, size_t field_offset, - size_t num_values, +int CeedQFunctionContextRegisterDouble(CeedQFunctionContext ctx, const char *field_name, size_t field_offset, size_t num_values, const char *field_description) { - return CeedQFunctionContextRegisterGeneric(ctx, field_name, field_offset, - field_description, CEED_CONTEXT_FIELD_DOUBLE, sizeof(double), num_values); + return 
CeedQFunctionContextRegisterGeneric(ctx, field_name, field_offset, field_description, CEED_CONTEXT_FIELD_DOUBLE, sizeof(double), num_values); } /** @@ -767,12 +680,9 @@ int CeedQFunctionContextRegisterDouble(CeedQFunctionContext ctx, @ref User **/ -int CeedQFunctionContextRegisterInt32(CeedQFunctionContext ctx, - const char *field_name, size_t field_offset, - size_t num_values, +int CeedQFunctionContextRegisterInt32(CeedQFunctionContext ctx, const char *field_name, size_t field_offset, size_t num_values, const char *field_description) { - return CeedQFunctionContextRegisterGeneric(ctx, field_name, field_offset, - field_description, CEED_CONTEXT_FIELD_INT32, sizeof(int), num_values); + return CeedQFunctionContextRegisterGeneric(ctx, field_name, field_offset, field_description, CEED_CONTEXT_FIELD_INT32, sizeof(int), num_values); } /** @@ -786,10 +696,9 @@ int CeedQFunctionContextRegisterInt32(CeedQFunctionContext ctx, @ref User **/ -int CeedQFunctionContextGetAllFieldLabels(CeedQFunctionContext ctx, - const CeedContextFieldLabel **field_labels, CeedInt *num_fields) { +int CeedQFunctionContextGetAllFieldLabels(CeedQFunctionContext ctx, const CeedContextFieldLabel **field_labels, CeedInt *num_fields) { *field_labels = ctx->field_labels; - *num_fields = ctx->num_fields; + *num_fields = ctx->num_fields; return CEED_ERROR_SUCCESS; } @@ -806,10 +715,7 @@ int CeedQFunctionContextGetAllFieldLabels(CeedQFunctionContext ctx, @ref User **/ -int CeedContextFieldLabelGetDescription(CeedContextFieldLabel label, - const char **field_name, - const char **field_description, - size_t *num_values, +int CeedContextFieldLabelGetDescription(CeedContextFieldLabel label, const char **field_name, const char **field_description, size_t *num_values, CeedContextFieldType *field_type) { if (field_name) *field_name = label->name; if (field_description) *field_description = label->description; @@ -828,13 +734,11 @@ int CeedContextFieldLabelGetDescription(CeedContextFieldLabel label, @ref User **/ 
-int CeedQFunctionContextGetContextSize(CeedQFunctionContext ctx, - size_t *ctx_size) { +int CeedQFunctionContextGetContextSize(CeedQFunctionContext ctx, size_t *ctx_size) { *ctx_size = ctx->ctx_size; return CEED_ERROR_SUCCESS; } - /** @brief View a CeedQFunctionContext @@ -850,9 +754,7 @@ int CeedQFunctionContextView(CeedQFunctionContext ctx, FILE *stream) { fprintf(stream, " Context Data Size: %ld\n", ctx->ctx_size); for (CeedInt i = 0; i < ctx->num_fields; i++) { // LCOV_EXCL_START - fprintf(stream, " Labeled %s field: %s\n", - CeedContextFieldTypes[ctx->field_labels[i]->type], - ctx->field_labels[i]->name); + fprintf(stream, " Labeled %s field: %s\n", CeedContextFieldTypes[ctx->field_labels[i]->type], ctx->field_labels[i]->name); // LCOV_EXCL_STOP } return CEED_ERROR_SUCCESS; @@ -869,13 +771,12 @@ int CeedQFunctionContextView(CeedQFunctionContext ctx, FILE *stream) { @ref User **/ -int CeedQFunctionContextSetDataDestroy(CeedQFunctionContext ctx, - CeedMemType f_mem_type, CeedQFunctionContextDataDestroyUser f) { - if (!f) +int CeedQFunctionContextSetDataDestroy(CeedQFunctionContext ctx, CeedMemType f_mem_type, CeedQFunctionContextDataDestroyUser f) { + if (!f) { // LCOV_EXCL_START - return CeedError(ctx->ceed, 1, - "Must provide valid callback function for destroying user data"); - // LCOV_EXCL_STOP + return CeedError(ctx->ceed, 1, "Must provide valid callback function for destroying user data"); + // LCOV_EXCL_STOP + } ctx->data_destroy_mem_type = f_mem_type; ctx->data_destroy_function = f; return CEED_ERROR_SUCCESS; @@ -891,30 +792,24 @@ int CeedQFunctionContextSetDataDestroy(CeedQFunctionContext ctx, @ref User **/ int CeedQFunctionContextDestroy(CeedQFunctionContext *ctx) { - int ierr; - - if (!*ctx || --(*ctx)->ref_count > 0) - return CEED_ERROR_SUCCESS; + if (!*ctx || --(*ctx)->ref_count > 0) return CEED_ERROR_SUCCESS; - if ((*ctx) && ((*ctx)->state % 2) == 1) + if ((*ctx) && ((*ctx)->state % 2) == 1) { // LCOV_EXCL_START - return CeedError((*ctx)->ceed, 1, 
- "Cannot destroy CeedQFunctionContext, the access " - "lock is in use"); - // LCOV_EXCL_STOP - - ierr = CeedQFunctionContextDestroyData(*ctx); CeedChk(ierr); - if ((*ctx)->Destroy) { - ierr = (*ctx)->Destroy(*ctx); CeedChk(ierr); - } - for (CeedInt i=0; i<(*ctx)->num_fields; i++) { - ierr = CeedFree(&(*ctx)->field_labels[i]->name); CeedChk(ierr); - ierr = CeedFree(&(*ctx)->field_labels[i]->description); CeedChk(ierr); - ierr = CeedFree(&(*ctx)->field_labels[i]); CeedChk(ierr); - } - ierr = CeedFree(&(*ctx)->field_labels); CeedChk(ierr); - ierr = CeedDestroy(&(*ctx)->ceed); CeedChk(ierr); - ierr = CeedFree(ctx); CeedChk(ierr); + return CeedError((*ctx)->ceed, 1, "Cannot destroy CeedQFunctionContext, the access lock is in use"); + // LCOV_EXCL_STOP + } + + CeedCall(CeedQFunctionContextDestroyData(*ctx)); + if ((*ctx)->Destroy) CeedCall((*ctx)->Destroy(*ctx)); + for (CeedInt i = 0; i < (*ctx)->num_fields; i++) { + CeedCall(CeedFree(&(*ctx)->field_labels[i]->name)); + CeedCall(CeedFree(&(*ctx)->field_labels[i]->description)); + CeedCall(CeedFree(&(*ctx)->field_labels[i])); + } + CeedCall(CeedFree(&(*ctx)->field_labels)); + CeedCall(CeedDestroy(&(*ctx)->ceed)); + CeedCall(CeedFree(ctx)); return CEED_ERROR_SUCCESS; } diff --git a/interface/ceed-register.c b/interface/ceed-register.c index c0018a160c..8553c47309 100644 --- a/interface/ceed-register.c +++ b/interface/ceed-register.c @@ -5,15 +5,15 @@ // // This file is part of CEED: http://github.com/ceed -#include -#include #include +#include +#include #include #include static bool register_all_called; -#define MACRO(name,...) CEED_INTERN int name(void); +#define MACRO(name, ...) CEED_INTERN int name(void); #include "../backends/ceed-backend-list.h" #undef MACRO @@ -33,7 +33,7 @@ int CeedRegisterAll() { if (register_all_called) return 0; register_all_called = true; -#define MACRO(name,...) CeedChk(name()); +#define MACRO(name, ...) 
CeedChk(name()); #include "../backends/ceed-backend-list.h" #undef MACRO return CEED_ERROR_SUCCESS; diff --git a/interface/ceed-tensor.c b/interface/ceed-tensor.c index 8422f5074b..9a6e9f82ba 100644 --- a/interface/ceed-tensor.c +++ b/interface/ceed-tensor.c @@ -5,9 +5,9 @@ // // This file is part of CEED: http://github.com/ceed -#include -#include #include +#include +#include /// @file /// Implementation of CeedTensorContract interfaces @@ -30,32 +30,26 @@ @ref Backend **/ -int CeedTensorContractCreate(Ceed ceed, CeedBasis basis, - CeedTensorContract *contract) { - int ierr; - +int CeedTensorContractCreate(Ceed ceed, CeedBasis basis, CeedTensorContract *contract) { if (!ceed->TensorContractCreate) { Ceed delegate; - ierr = CeedGetObjectDelegate(ceed, &delegate, "TensorContract"); - CeedChk(ierr); + CeedCall(CeedGetObjectDelegate(ceed, &delegate, "TensorContract")); - if (!delegate) + if (!delegate) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, - "Backend does not support TensorContractCreate"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support TensorContractCreate"); + // LCOV_EXCL_STOP + } - ierr = CeedTensorContractCreate(delegate, basis, contract); - CeedChk(ierr); + CeedCall(CeedTensorContractCreate(delegate, basis, contract)); return CEED_ERROR_SUCCESS; } - ierr = CeedCalloc(1, contract); CeedChk(ierr); + CeedCall(CeedCalloc(1, contract)); (*contract)->ceed = ceed; - ierr = CeedReference(ceed); CeedChk(ierr); - ierr = ceed->TensorContractCreate(basis, *contract); - CeedChk(ierr); + CeedCall(CeedReference(ceed)); + CeedCall(ceed->TensorContractCreate(basis, *contract)); return CEED_ERROR_SUCCESS; } @@ -83,15 +77,9 @@ int CeedTensorContractCreate(Ceed ceed, CeedBasis basis, @ref Backend **/ -int CeedTensorContractApply(CeedTensorContract contract, CeedInt A, CeedInt B, - CeedInt C, CeedInt J, const CeedScalar *restrict t, - CeedTransposeMode t_mode, const CeedInt add, - const CeedScalar 
*restrict u, - CeedScalar *restrict v) { - int ierr; - - ierr = contract->Apply(contract, A, B, C, J, t, t_mode, add, u, v); - CeedChk(ierr); +int CeedTensorContractApply(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const CeedScalar *restrict t, + CeedTransposeMode t_mode, const CeedInt add, const CeedScalar *restrict u, CeedScalar *restrict v) { + CeedCall(contract->Apply(contract, A, B, C, J, t, t_mode, add, u, v)); return CEED_ERROR_SUCCESS; } @@ -164,14 +152,12 @@ int CeedTensorContractReference(CeedTensorContract contract) { @ref Backend **/ int CeedTensorContractDestroy(CeedTensorContract *contract) { - int ierr; - if (!*contract || --(*contract)->ref_count > 0) return CEED_ERROR_SUCCESS; if ((*contract)->Destroy) { - ierr = (*contract)->Destroy(*contract); CeedChk(ierr); + CeedCall((*contract)->Destroy(*contract)); } - ierr = CeedDestroy(&(*contract)->ceed); CeedChk(ierr); - ierr = CeedFree(contract); CeedChk(ierr); + CeedCall(CeedDestroy(&(*contract)->ceed)); + CeedCall(CeedFree(contract)); return CEED_ERROR_SUCCESS; } diff --git a/interface/ceed-types.c b/interface/ceed-types.c index f0fef55624..4e7f2d8501 100644 --- a/interface/ceed-types.c +++ b/interface/ceed-types.c @@ -5,69 +5,60 @@ // // This file is part of CEED: http://github.com/ceed -#include #include +#include const char *const CeedErrorTypesShifted[] = { - [CEED_ERROR_SUCCESS - CEED_ERROR_UNSUPPORTED] = "success", - [CEED_ERROR_MINOR - CEED_ERROR_UNSUPPORTED] = "generic minor error", - [CEED_ERROR_DIMENSION - CEED_ERROR_UNSUPPORTED] = "dimension mismatch", - [CEED_ERROR_INCOMPLETE - CEED_ERROR_UNSUPPORTED] = "object setup incomplete", - [CEED_ERROR_INCOMPATIBLE - CEED_ERROR_UNSUPPORTED] = "incompatible arguments", - [CEED_ERROR_ACCESS - CEED_ERROR_UNSUPPORTED] = "access lock in use", - [CEED_ERROR_MAJOR - CEED_ERROR_UNSUPPORTED] = "generic major error", - [CEED_ERROR_BACKEND - CEED_ERROR_UNSUPPORTED] = "internal backend error", - [CEED_ERROR_UNSUPPORTED - 
CEED_ERROR_UNSUPPORTED] = "operation unsupported by backend", + [CEED_ERROR_SUCCESS - CEED_ERROR_UNSUPPORTED] = "success", + [CEED_ERROR_MINOR - CEED_ERROR_UNSUPPORTED] = "generic minor error", + [CEED_ERROR_DIMENSION - CEED_ERROR_UNSUPPORTED] = "dimension mismatch", + [CEED_ERROR_INCOMPLETE - CEED_ERROR_UNSUPPORTED] = "object setup incomplete", + [CEED_ERROR_INCOMPATIBLE - CEED_ERROR_UNSUPPORTED] = "incompatible arguments", + [CEED_ERROR_ACCESS - CEED_ERROR_UNSUPPORTED] = "access lock in use", + [CEED_ERROR_MAJOR - CEED_ERROR_UNSUPPORTED] = "generic major error", + [CEED_ERROR_BACKEND - CEED_ERROR_UNSUPPORTED] = "internal backend error", + [CEED_ERROR_UNSUPPORTED - CEED_ERROR_UNSUPPORTED] = "operation unsupported by backend", }; -const char *const *CeedErrorTypes = &CeedErrorTypesShifted[- - CEED_ERROR_UNSUPPORTED]; +const char *const *CeedErrorTypes = &CeedErrorTypesShifted[-CEED_ERROR_UNSUPPORTED]; const char *const CeedMemTypes[] = { - [CEED_MEM_HOST] = "host", - [CEED_MEM_DEVICE] = "device", + [CEED_MEM_HOST] = "host", + [CEED_MEM_DEVICE] = "device", }; const char *const CeedCopyModes[] = { - [CEED_COPY_VALUES] = "copy values", - [CEED_USE_POINTER] = "use pointer", - [CEED_OWN_POINTER] = "own pointer", + [CEED_COPY_VALUES] = "copy values", + [CEED_USE_POINTER] = "use pointer", + [CEED_OWN_POINTER] = "own pointer", }; const char *const CeedTransposeModes[] = { - [CEED_TRANSPOSE] = "transpose", - [CEED_NOTRANSPOSE] = "no transpose", + [CEED_TRANSPOSE] = "transpose", + [CEED_NOTRANSPOSE] = "no transpose", }; const char *const CeedEvalModes[] = { - [CEED_EVAL_NONE] = "none", - [CEED_EVAL_INTERP] = "interpolation", - [CEED_EVAL_GRAD] = "gradient", - [CEED_EVAL_DIV] = "divergence", - [CEED_EVAL_CURL] = "curl", - [CEED_EVAL_WEIGHT] = "quadrature weights", + [CEED_EVAL_NONE] = "none", [CEED_EVAL_INTERP] = "interpolation", [CEED_EVAL_GRAD] = "gradient", [CEED_EVAL_DIV] = "divergence", + [CEED_EVAL_CURL] = "curl", [CEED_EVAL_WEIGHT] = "quadrature weights", }; const char 
*const CeedQuadModes[] = { - [CEED_GAUSS] = "Gauss", - [CEED_GAUSS_LOBATTO] = "Gauss Lobatto", + [CEED_GAUSS] = "Gauss", + [CEED_GAUSS_LOBATTO] = "Gauss Lobatto", }; const char *const CeedElemTopologies[] = { - [CEED_TOPOLOGY_LINE] = "line", - [CEED_TOPOLOGY_TRIANGLE] = "triangle", - [CEED_TOPOLOGY_QUAD] = "quadrilateral", - [CEED_TOPOLOGY_TET] = "tetrahedron", - [CEED_TOPOLOGY_PYRAMID] = "pyramid", - [CEED_TOPOLOGY_PRISM] = "prism", - [CEED_TOPOLOGY_HEX] = "hexahedron", + [CEED_TOPOLOGY_LINE] = "line", [CEED_TOPOLOGY_TRIANGLE] = "triangle", [CEED_TOPOLOGY_QUAD] = "quadrilateral", + [CEED_TOPOLOGY_TET] = "tetrahedron", [CEED_TOPOLOGY_PYRAMID] = "pyramid", [CEED_TOPOLOGY_PRISM] = "prism", + [CEED_TOPOLOGY_HEX] = "hexahedron", }; const char *const CeedContextFieldTypes[] = { - [CEED_CONTEXT_FIELD_DOUBLE] = "double", - [CEED_CONTEXT_FIELD_INT32] = "int32", + [CEED_CONTEXT_FIELD_DOUBLE] = "double", + [CEED_CONTEXT_FIELD_INT32] = "int32", }; const char *const CeedFESpaces[] = { - [CEED_FE_SPACE_H1] = "H^1 space", - [CEED_FE_SPACE_HDIV] = "H(div) space", + [CEED_FE_SPACE_H1] = "H^1 space", + [CEED_FE_SPACE_HDIV] = "H(div) space", }; diff --git a/interface/ceed-vector.c b/interface/ceed-vector.c index 65a9ebd14d..e69a5adacb 100644 --- a/interface/ceed-vector.c +++ b/interface/ceed-vector.c @@ -5,10 +5,10 @@ // // This file is part of CEED: http://github.com/ceed -#include -#include -#include #include +#include +#include +#include #include #include #include @@ -50,15 +50,13 @@ const CeedVector CEED_VECTOR_NONE = &ceed_vector_none; @ref Backend **/ int CeedVectorHasValidArray(CeedVector vec, bool *has_valid_array) { - int ierr; - - if (!vec->HasValidArray) + if (!vec->HasValidArray) { // LCOV_EXCL_START - return CeedError(vec->ceed, CEED_ERROR_UNSUPPORTED, - "Backend does not support HasValidArray"); - // LCOV_EXCL_STOP + return CeedError(vec->ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support HasValidArray"); + // LCOV_EXCL_STOP + } - ierr = vec->HasValidArray(vec, 
has_valid_array); CeedChk(ierr); + CeedCall(vec->HasValidArray(vec, has_valid_array)); return CEED_ERROR_SUCCESS; } @@ -74,18 +72,14 @@ int CeedVectorHasValidArray(CeedVector vec, bool *has_valid_array) { @ref Backend **/ -int CeedVectorHasBorrowedArrayOfType(CeedVector vec, CeedMemType mem_type, - bool *has_borrowed_array_of_type) { - int ierr; - - if (!vec->HasBorrowedArrayOfType) +int CeedVectorHasBorrowedArrayOfType(CeedVector vec, CeedMemType mem_type, bool *has_borrowed_array_of_type) { + if (!vec->HasBorrowedArrayOfType) { // LCOV_EXCL_START - return CeedError(vec->ceed, CEED_ERROR_UNSUPPORTED, - "Backend does not support HasBorrowedArrayOfType"); - // LCOV_EXCL_STOP + return CeedError(vec->ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support HasBorrowedArrayOfType"); + // LCOV_EXCL_STOP + } - ierr = vec->HasBorrowedArrayOfType(vec, mem_type, has_borrowed_array_of_type); - CeedChk(ierr); + CeedCall(vec->HasBorrowedArrayOfType(vec, mem_type, has_borrowed_array_of_type)); return CEED_ERROR_SUCCESS; } @@ -184,29 +178,27 @@ int CeedVectorReference(CeedVector vec) { @ref User **/ int CeedVectorCreate(Ceed ceed, CeedSize length, CeedVector *vec) { - int ierr; - if (!ceed->VectorCreate) { Ceed delegate; - ierr = CeedGetObjectDelegate(ceed, &delegate, "Vector"); CeedChk(ierr); + CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Vector")); - if (!delegate) + if (!delegate) { // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, - "Backend does not support VectorCreate"); - // LCOV_EXCL_STOP + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support VectorCreate"); + // LCOV_EXCL_STOP + } - ierr = CeedVectorCreate(delegate, length, vec); CeedChk(ierr); + CeedCall(CeedVectorCreate(delegate, length, vec)); return CEED_ERROR_SUCCESS; } - ierr = CeedCalloc(1, vec); CeedChk(ierr); + CeedCall(CeedCalloc(1, vec)); (*vec)->ceed = ceed; - ierr = CeedReference(ceed); CeedChk(ierr); + CeedCall(CeedReference(ceed)); (*vec)->ref_count = 1; - 
(*vec)->length = length; - (*vec)->state = 0; - ierr = ceed->VectorCreate(length, *vec); CeedChk(ierr); + (*vec)->length = length; + (*vec)->state = 0; + CeedCall(ceed->VectorCreate(length, *vec)); return CEED_ERROR_SUCCESS; } @@ -226,10 +218,8 @@ int CeedVectorCreate(Ceed ceed, CeedSize length, CeedVector *vec) { @ref User **/ int CeedVectorReferenceCopy(CeedVector vec, CeedVector *vec_copy) { - int ierr; - - ierr = CeedVectorReference(vec); CeedChk(ierr); - ierr = CeedVectorDestroy(vec_copy); CeedChk(ierr); + CeedCall(CeedVectorReference(vec)); + CeedCall(CeedVectorDestroy(vec_copy)); *vec_copy = vec; return CEED_ERROR_SUCCESS; } @@ -250,28 +240,20 @@ int CeedVectorReferenceCopy(CeedVector vec, CeedVector *vec_copy) { @ref User **/ -int CeedVectorSetArray(CeedVector vec, CeedMemType mem_type, - CeedCopyMode copy_mode, - CeedScalar *array) { - int ierr; - - if (!vec->SetArray) +int CeedVectorSetArray(CeedVector vec, CeedMemType mem_type, CeedCopyMode copy_mode, CeedScalar *array) { + if (!vec->SetArray) { // LCOV_EXCL_START - return CeedError(vec->ceed, CEED_ERROR_UNSUPPORTED, - "Backend does not support VectorSetArray"); - // LCOV_EXCL_STOP - - if (vec->state % 2 == 1) - return CeedError(vec->ceed, CEED_ERROR_ACCESS, - "Cannot grant CeedVector array access, the " - "access lock is already in use"); - - if (vec->num_readers > 0) - return CeedError(vec->ceed, CEED_ERROR_ACCESS, - "Cannot grant CeedVector array access, a " - "process has read access"); + return CeedError(vec->ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support VectorSetArray"); + // LCOV_EXCL_STOP + } + if (vec->state % 2 == 1) { + return CeedError(vec->ceed, CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, the access lock is already in use"); + } + if (vec->num_readers > 0) { + return CeedError(vec->ceed, CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, a process has read access"); + } - ierr = vec->SetArray(vec, mem_type, copy_mode, array); CeedChk(ierr); + 
CeedCall(vec->SetArray(vec, mem_type, copy_mode, array)); vec->state += 2; return CEED_ERROR_SUCCESS; } @@ -287,29 +269,24 @@ int CeedVectorSetArray(CeedVector vec, CeedMemType mem_type, @ref User **/ int CeedVectorSetValue(CeedVector vec, CeedScalar value) { - int ierr; - - if (vec->state % 2 == 1) + if (vec->state % 2 == 1) { // LCOV_EXCL_START - return CeedError(vec->ceed, CEED_ERROR_ACCESS, - "Cannot grant CeedVector array access, the " - "access lock is already in use"); - // LCOV_EXCL_STOP - - if (vec->num_readers > 0) + return CeedError(vec->ceed, CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, the access lock is already in use"); + // LCOV_EXCL_STOP + } + if (vec->num_readers > 0) { // LCOV_EXCL_START - return CeedError(vec->ceed, CEED_ERROR_ACCESS, - "Cannot grant CeedVector array access, a " - "process has read access"); - // LCOV_EXCL_STOP + return CeedError(vec->ceed, CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, a process has read access"); + // LCOV_EXCL_STOP + } if (vec->SetValue) { - ierr = vec->SetValue(vec, value); CeedChk(ierr); + CeedCall(vec->SetValue(vec, value)); } else { CeedScalar *array; - ierr = CeedVectorGetArrayWrite(vec, CEED_MEM_HOST, &array); CeedChk(ierr); - for (CeedInt i=0; ilength; i++) array[i] = value; - ierr = CeedVectorRestoreArray(vec, &array); CeedChk(ierr); + CeedCall(CeedVectorGetArrayWrite(vec, CEED_MEM_HOST, &array)); + for (CeedInt i = 0; i < vec->length; i++) array[i] = value; + CeedCall(CeedVectorRestoreArray(vec, &array)); } vec->state += 2; return CEED_ERROR_SUCCESS; @@ -329,19 +306,16 @@ int CeedVectorSetValue(CeedVector vec, CeedScalar value) { @ref User **/ int CeedVectorSyncArray(CeedVector vec, CeedMemType mem_type) { - int ierr; - - if (vec->state % 2 == 1) - return CeedError(vec->ceed, CEED_ERROR_ACCESS, - "Cannot sync CeedVector, the access lock is " - "already in use"); + if (vec->state % 2 == 1) { + return CeedError(vec->ceed, CEED_ERROR_ACCESS, "Cannot sync CeedVector, the access 
lock is already in use"); + } if (vec->SyncArray) { - ierr = vec->SyncArray(vec, mem_type); CeedChk(ierr); + CeedCall(vec->SyncArray(vec, mem_type)); } else { const CeedScalar *array; - ierr = CeedVectorGetArrayRead(vec, mem_type, &array); CeedChk(ierr); - ierr = CeedVectorRestoreArrayRead(vec, &array); CeedChk(ierr); + CeedCall(CeedVectorGetArrayRead(vec, mem_type, &array)); + CeedCall(CeedVectorRestoreArrayRead(vec, &array)); } return CEED_ERROR_SUCCESS; } @@ -363,45 +337,39 @@ int CeedVectorSyncArray(CeedVector vec, CeedMemType mem_type) { @ref User **/ -int CeedVectorTakeArray(CeedVector vec, CeedMemType mem_type, - CeedScalar **array) { - int ierr; - - if (vec->state % 2 == 1) +int CeedVectorTakeArray(CeedVector vec, CeedMemType mem_type, CeedScalar **array) { + if (vec->state % 2 == 1) { // LCOV_EXCL_START - return CeedError(vec->ceed, CEED_ERROR_ACCESS, - "Cannot take CeedVector array, the access " - "lock is already in use"); - // LCOV_EXCL_STOP - if (vec->num_readers > 0) + return CeedError(vec->ceed, CEED_ERROR_ACCESS, "Cannot take CeedVector array, the access lock is already in use"); + // LCOV_EXCL_STOP + } + if (vec->num_readers > 0) { // LCOV_EXCL_START - return CeedError(vec->ceed, CEED_ERROR_ACCESS, - "Cannot take CeedVector array, a process " - "has read access"); - // LCOV_EXCL_STOP + return CeedError(vec->ceed, CEED_ERROR_ACCESS, "Cannot take CeedVector array, a process has read access"); + // LCOV_EXCL_STOP + } + CeedScalar *temp_array = NULL; if (vec->length > 0) { bool has_borrowed_array_of_type = true; - ierr = CeedVectorHasBorrowedArrayOfType(vec, mem_type, - &has_borrowed_array_of_type); - CeedChk(ierr); - if (!has_borrowed_array_of_type) + CeedCall(CeedVectorHasBorrowedArrayOfType(vec, mem_type, &has_borrowed_array_of_type)); + if (!has_borrowed_array_of_type) { // LCOV_EXCL_START - return CeedError(vec->ceed, CEED_ERROR_BACKEND, - "CeedVector has no borrowed %s array, " - "must set array with CeedVectorSetArray", CeedMemTypes[mem_type]); 
- // LCOV_EXCL_STOP + return CeedError(vec->ceed, CEED_ERROR_BACKEND, "CeedVector has no borrowed %s array, must set array with CeedVectorSetArray", + CeedMemTypes[mem_type]); + // LCOV_EXCL_STOP + } bool has_valid_array = true; - ierr = CeedVectorHasValidArray(vec, &has_valid_array); CeedChk(ierr); - if (!has_valid_array) + CeedCall(CeedVectorHasValidArray(vec, &has_valid_array)); + if (!has_valid_array) { // LCOV_EXCL_START return CeedError(vec->ceed, CEED_ERROR_BACKEND, - "CeedVector has no valid data to take, " - "must set data with CeedVectorSetValue or CeedVectorSetArray"); - // LCOV_EXCL_STOP + "CeedVector has no valid data to take, must set data with CeedVectorSetValue or CeedVectorSetArray"); + // LCOV_EXCL_STOP + } - ierr = vec->TakeArray(vec, mem_type, &temp_array); CeedChk(ierr); + CeedCall(vec->TakeArray(vec, mem_type, &temp_array)); } if (array) (*array) = temp_array; return CEED_ERROR_SUCCESS; @@ -425,36 +393,29 @@ int CeedVectorTakeArray(CeedVector vec, CeedMemType mem_type, @ref User **/ -int CeedVectorGetArray(CeedVector vec, CeedMemType mem_type, - CeedScalar **array) { - int ierr; - - if (!vec->GetArray) +int CeedVectorGetArray(CeedVector vec, CeedMemType mem_type, CeedScalar **array) { + if (!vec->GetArray) { // LCOV_EXCL_START - return CeedError(vec->ceed, CEED_ERROR_UNSUPPORTED, - "Backend does not support GetArray"); - // LCOV_EXCL_STOP - - if (vec->state % 2 == 1) - return CeedError(vec->ceed, CEED_ERROR_ACCESS, - "Cannot grant CeedVector array access, the " - "access lock is already in use"); - - if (vec->num_readers > 0) - return CeedError(vec->ceed, CEED_ERROR_ACCESS, - "Cannot grant CeedVector array access, a " - "process has read access"); + return CeedError(vec->ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support GetArray"); + // LCOV_EXCL_STOP + } + if (vec->state % 2 == 1) { + return CeedError(vec->ceed, CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, the access lock is already in use"); + } + if (vec->num_readers > 0) 
{ + return CeedError(vec->ceed, CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, a process has read access"); + } bool has_valid_array = true; - ierr = CeedVectorHasValidArray(vec, &has_valid_array); CeedChk(ierr); - if (!has_valid_array) + CeedCall(CeedVectorHasValidArray(vec, &has_valid_array)); + if (!has_valid_array) { // LCOV_EXCL_START return CeedError(vec->ceed, CEED_ERROR_BACKEND, - "CeedVector has no valid data to read, " - "must set data with CeedVectorSetValue or CeedVectorSetArray"); - // LCOV_EXCL_STOP + "CeedVector has no valid data to read, must set data with CeedVectorSetValue or CeedVectorSetArray"); + // LCOV_EXCL_STOP + } - ierr = vec->GetArray(vec, mem_type, array); CeedChk(ierr); + CeedCall(vec->GetArray(vec, mem_type, array)); vec->state++; return CEED_ERROR_SUCCESS; } @@ -473,32 +434,27 @@ int CeedVectorGetArray(CeedVector vec, CeedMemType mem_type, @ref User **/ -int CeedVectorGetArrayRead(CeedVector vec, CeedMemType mem_type, - const CeedScalar **array) { - int ierr; - - if (!vec->GetArrayRead) +int CeedVectorGetArrayRead(CeedVector vec, CeedMemType mem_type, const CeedScalar **array) { + if (!vec->GetArrayRead) { // LCOV_EXCL_START - return CeedError(vec->ceed, CEED_ERROR_UNSUPPORTED, - "Backend does not support GetArrayRead"); - // LCOV_EXCL_STOP - - if (vec->state % 2 == 1) - return CeedError(vec->ceed, CEED_ERROR_ACCESS, - "Cannot grant CeedVector read-only array " - "access, the access lock is already in use"); + return CeedError(vec->ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support GetArrayRead"); + // LCOV_EXCL_STOP + } + if (vec->state % 2 == 1) { + return CeedError(vec->ceed, CEED_ERROR_ACCESS, "Cannot grant CeedVector read-only array access, the access lock is already in use"); + } if (vec->length > 0) { bool has_valid_array = true; - ierr = CeedVectorHasValidArray(vec, &has_valid_array); CeedChk(ierr); - if (!has_valid_array) + CeedCall(CeedVectorHasValidArray(vec, &has_valid_array)); + if (!has_valid_array) { // 
LCOV_EXCL_START return CeedError(vec->ceed, CEED_ERROR_BACKEND, - "CeedVector has no valid data to read, " - "must set data with CeedVectorSetValue or CeedVectorSetArray"); - // LCOV_EXCL_STOP + "CeedVector has no valid data to read, must set data with CeedVectorSetValue or CeedVectorSetArray"); + // LCOV_EXCL_STOP + } - ierr = vec->GetArrayRead(vec, mem_type, array); CeedChk(ierr); + CeedCall(vec->GetArrayRead(vec, mem_type, array)); } else { *array = NULL; } @@ -519,31 +475,24 @@ int CeedVectorGetArrayRead(CeedVector vec, CeedMemType mem_type, @ref User **/ -int CeedVectorGetArrayWrite(CeedVector vec, CeedMemType mem_type, - CeedScalar **array) { - int ierr; - - if (!vec->GetArrayWrite) +int CeedVectorGetArrayWrite(CeedVector vec, CeedMemType mem_type, CeedScalar **array) { + if (!vec->GetArrayWrite) { // LCOV_EXCL_START - return CeedError(vec->ceed, CEED_ERROR_UNSUPPORTED, - "Backend does not support GetArrayWrite"); - // LCOV_EXCL_STOP - - if (vec->state % 2 == 1) + return CeedError(vec->ceed, CEED_ERROR_UNSUPPORTED, "Backend does not support GetArrayWrite"); + // LCOV_EXCL_STOP + } + if (vec->state % 2 == 1) { // LCOV_EXCL_START - return CeedError(vec->ceed, CEED_ERROR_ACCESS, - "Cannot grant CeedVector array access, the " - "access lock is already in use"); - // LCOV_EXCL_STOP - - if (vec->num_readers > 0) + return CeedError(vec->ceed, CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, the access lock is already in use"); + // LCOV_EXCL_STOP + } + if (vec->num_readers > 0) { // LCOV_EXCL_START - return CeedError(vec->ceed, CEED_ERROR_ACCESS, - "Cannot grant CeedVector array access, a " - "process has read access"); - // LCOV_EXCL_STOP + return CeedError(vec->ceed, CEED_ERROR_ACCESS, "Cannot grant CeedVector array access, a process has read access"); + // LCOV_EXCL_STOP + } - ierr = vec->GetArrayWrite(vec, mem_type, array); CeedChk(ierr); + CeedCall(vec->GetArrayWrite(vec, mem_type, array)); vec->state++; return CEED_ERROR_SUCCESS; } @@ -560,15 +509,10 
@@ int CeedVectorGetArrayWrite(CeedVector vec, CeedMemType mem_type, @ref User **/ int CeedVectorRestoreArray(CeedVector vec, CeedScalar **array) { - int ierr; - - if (vec->state % 2 != 1) - return CeedError(vec->ceed, CEED_ERROR_ACCESS, - "Cannot restore CeedVector array access, " - "access was not granted"); - if (vec->RestoreArray) { - ierr = vec->RestoreArray(vec); CeedChk(ierr); + if (vec->state % 2 != 1) { + return CeedError(vec->ceed, CEED_ERROR_ACCESS, "Cannot restore CeedVector array access, access was not granted"); } + if (vec->RestoreArray) CeedCall(vec->RestoreArray(vec)); *array = NULL; vec->state++; return CEED_ERROR_SUCCESS; @@ -585,19 +529,14 @@ int CeedVectorRestoreArray(CeedVector vec, CeedScalar **array) { @ref User **/ int CeedVectorRestoreArrayRead(CeedVector vec, const CeedScalar **array) { - int ierr; - - if (vec->num_readers == 0) + if (vec->num_readers == 0) { // LCOV_EXCL_START - return CeedError(vec->ceed, CEED_ERROR_ACCESS, - "Cannot restore CeedVector array read access, " - "access was not granted"); - // LCOV_EXCL_STOP + return CeedError(vec->ceed, CEED_ERROR_ACCESS, "Cannot restore CeedVector array read access, access was not granted"); + // LCOV_EXCL_STOP + } vec->num_readers--; - if (vec->num_readers == 0 && vec->RestoreArrayRead) { - ierr = vec->RestoreArrayRead(vec); CeedChk(ierr); - } + if (vec->num_readers == 0 && vec->RestoreArrayRead) CeedCall(vec->RestoreArrayRead(vec)); *array = NULL; return CEED_ERROR_SUCCESS; @@ -619,48 +558,45 @@ int CeedVectorRestoreArrayRead(CeedVector vec, const CeedScalar **array) { @ref User **/ int CeedVectorNorm(CeedVector vec, CeedNormType norm_type, CeedScalar *norm) { - int ierr; - bool has_valid_array = true; - ierr = CeedVectorHasValidArray(vec, &has_valid_array); CeedChk(ierr); - if (!has_valid_array) + CeedCall(CeedVectorHasValidArray(vec, &has_valid_array)); + if (!has_valid_array) { // LCOV_EXCL_START return CeedError(vec->ceed, CEED_ERROR_BACKEND, - "CeedVector has no valid data to 
compute norm, " - "must set data with CeedVectorSetValue or CeedVectorSetArray"); - // LCOV_EXCL_STOP + "CeedVector has no valid data to compute norm, must set data with CeedVectorSetValue or CeedVectorSetArray"); + // LCOV_EXCL_STOP + } // Backend impl for GPU, if added if (vec->Norm) { - ierr = vec->Norm(vec, norm_type, norm); CeedChk(ierr); + CeedCall(vec->Norm(vec, norm_type, norm)); return CEED_ERROR_SUCCESS; } const CeedScalar *array; - ierr = CeedVectorGetArrayRead(vec, CEED_MEM_HOST, &array); CeedChk(ierr); + CeedCall(CeedVectorGetArrayRead(vec, CEED_MEM_HOST, &array)); *norm = 0.; switch (norm_type) { - case CEED_NORM_1: - for (CeedInt i=0; ilength; i++) { - *norm += fabs(array[i]); - } - break; - case CEED_NORM_2: - for (CeedInt i=0; ilength; i++) { - *norm += fabs(array[i])*fabs(array[i]); - } - break; - case CEED_NORM_MAX: - for (CeedInt i=0; ilength; i++) { - const CeedScalar abs_v_i = fabs(array[i]); - *norm = *norm > abs_v_i ? *norm : abs_v_i; - } + case CEED_NORM_1: + for (CeedInt i = 0; i < vec->length; i++) { + *norm += fabs(array[i]); + } + break; + case CEED_NORM_2: + for (CeedInt i = 0; i < vec->length; i++) { + *norm += fabs(array[i]) * fabs(array[i]); + } + break; + case CEED_NORM_MAX: + for (CeedInt i = 0; i < vec->length; i++) { + const CeedScalar abs_v_i = fabs(array[i]); + *norm = *norm > abs_v_i ? 
*norm : abs_v_i; + } } - if (norm_type == CEED_NORM_2) - *norm = sqrt(*norm); + if (norm_type == CEED_NORM_2) *norm = sqrt(*norm); - ierr = CeedVectorRestoreArrayRead(vec, &array); CeedChk(ierr); + CeedCall(CeedVectorRestoreArrayRead(vec, &array)); return CEED_ERROR_SUCCESS; } @@ -675,30 +611,27 @@ int CeedVectorNorm(CeedVector vec, CeedNormType norm_type, CeedScalar *norm) { @ref User **/ int CeedVectorScale(CeedVector x, CeedScalar alpha) { - int ierr; CeedScalar *x_array = NULL; - CeedSize n_x; + CeedSize n_x; bool has_valid_array = true; - ierr = CeedVectorHasValidArray(x, &has_valid_array); CeedChk(ierr); - if (!has_valid_array) + CeedCall(CeedVectorHasValidArray(x, &has_valid_array)); + if (!has_valid_array) { // LCOV_EXCL_START return CeedError(x->ceed, CEED_ERROR_BACKEND, - "CeedVector has no valid data to scale, " - "must set data with CeedVectorSetValue or CeedVectorSetArray"); - // LCOV_EXCL_STOP + "CeedVector has no valid data to scale, must set data with CeedVectorSetValue or CeedVectorSetArray"); + // LCOV_EXCL_STOP + } - ierr = CeedVectorGetLength(x, &n_x); CeedChk(ierr); + CeedCall(CeedVectorGetLength(x, &n_x)); // Backend implementation - if (x->Scale) - return x->Scale(x, alpha); + if (x->Scale) return x->Scale(x, alpha); // Default implementation - ierr = CeedVectorGetArrayWrite(x, CEED_MEM_HOST, &x_array); CeedChk(ierr); - for (CeedInt i=0; iceed, CEED_ERROR_UNSUPPORTED, - "Cannot add vector of different lengths"); - // LCOV_EXCL_STOP - if (x == y) + return CeedError(y->ceed, CEED_ERROR_UNSUPPORTED, "Cannot add vector of different lengths"); + // LCOV_EXCL_STOP + } + if (x == y) { // LCOV_EXCL_START - return CeedError(y->ceed, CEED_ERROR_UNSUPPORTED, - "Cannot use same vector for x and y in CeedVectorAXPY"); - // LCOV_EXCL_STOP + return CeedError(y->ceed, CEED_ERROR_UNSUPPORTED, "Cannot use same vector for x and y in CeedVectorAXPY"); + // LCOV_EXCL_STOP + } bool has_valid_array_x = true, has_valid_array_y = true; - ierr = 
CeedVectorHasValidArray(x, &has_valid_array_x); CeedChk(ierr); - if (!has_valid_array_x) + CeedCall(CeedVectorHasValidArray(x, &has_valid_array_x)); + if (!has_valid_array_x) { // LCOV_EXCL_START - return CeedError(x->ceed, CEED_ERROR_BACKEND, - "CeedVector x has no valid data, " - "must set data with CeedVectorSetValue or CeedVectorSetArray"); - // LCOV_EXCL_STOP - ierr = CeedVectorHasValidArray(y, &has_valid_array_y); CeedChk(ierr); - if (!has_valid_array_y) + return CeedError(x->ceed, CEED_ERROR_BACKEND, "CeedVector x has no valid data, must set data with CeedVectorSetValue or CeedVectorSetArray"); + // LCOV_EXCL_STOP + } + CeedCall(CeedVectorHasValidArray(y, &has_valid_array_y)); + if (!has_valid_array_y) { // LCOV_EXCL_START - return CeedError(y->ceed, CEED_ERROR_BACKEND, - "CeedVector y has no valid data, " - "must set data with CeedVectorSetValue or CeedVectorSetArray"); - // LCOV_EXCL_STOP + return CeedError(y->ceed, CEED_ERROR_BACKEND, "CeedVector y has no valid data, must set data with CeedVectorSetValue or CeedVectorSetArray"); + // LCOV_EXCL_STOP + } Ceed ceed_parent_x, ceed_parent_y; - ierr = CeedGetParent(x->ceed, &ceed_parent_x); CeedChk(ierr); - ierr = CeedGetParent(y->ceed, &ceed_parent_y); CeedChk(ierr); - if (ceed_parent_x != ceed_parent_y) + CeedCall(CeedGetParent(x->ceed, &ceed_parent_x)); + CeedCall(CeedGetParent(y->ceed, &ceed_parent_y)); + if (ceed_parent_x != ceed_parent_y) { // LCOV_EXCL_START - return CeedError(y->ceed, CEED_ERROR_INCOMPATIBLE, - "Vectors x and y must be created by the same Ceed context"); - // LCOV_EXCL_STOP + return CeedError(y->ceed, CEED_ERROR_INCOMPATIBLE, "Vectors x and y must be created by the same Ceed context"); + // LCOV_EXCL_STOP + } // Backend implementation if (y->AXPY) { - ierr = y->AXPY(y, alpha, x); CeedChk(ierr); + CeedCall(y->AXPY(y, alpha, x)); return CEED_ERROR_SUCCESS; } // Default implementation - ierr = CeedVectorGetArrayWrite(y, CEED_MEM_HOST, &y_array); CeedChk(ierr); - ierr = 
CeedVectorGetArrayRead(x, CEED_MEM_HOST, &x_array); CeedChk(ierr); + CeedCall(CeedVectorGetArrayWrite(y, CEED_MEM_HOST, &y_array)); + CeedCall(CeedVectorGetArrayRead(x, CEED_MEM_HOST, &x_array)); - assert(x_array); assert(y_array); + assert(x_array); + assert(y_array); - for (CeedInt i=0; iceed, CEED_ERROR_UNSUPPORTED, - "Cannot multiply vectors of different lengths"); - // LCOV_EXCL_STOP + return CeedError(w->ceed, CEED_ERROR_UNSUPPORTED, "Cannot multiply vectors of different lengths"); + // LCOV_EXCL_STOP + } Ceed ceed_parent_w, ceed_parent_x, ceed_parent_y; - ierr = CeedGetParent(w->ceed, &ceed_parent_w); CeedChk(ierr); - ierr = CeedGetParent(x->ceed, &ceed_parent_x); CeedChk(ierr); - ierr = CeedGetParent(y->ceed, &ceed_parent_y); CeedChk(ierr); - if ((ceed_parent_w != ceed_parent_x) || - (ceed_parent_w != ceed_parent_y)) + CeedCall(CeedGetParent(w->ceed, &ceed_parent_w)); + CeedCall(CeedGetParent(x->ceed, &ceed_parent_x)); + CeedCall(CeedGetParent(y->ceed, &ceed_parent_y)); + if ((ceed_parent_w != ceed_parent_x) || (ceed_parent_w != ceed_parent_y)) { // LCOV_EXCL_START - return CeedError(w->ceed, CEED_ERROR_INCOMPATIBLE, - "Vectors w, x, and y must be created by the same Ceed context"); - // LCOV_EXCL_STOP + return CeedError(w->ceed, CEED_ERROR_INCOMPATIBLE, "Vectors w, x, and y must be created by the same Ceed context"); + // LCOV_EXCL_STOP + } bool has_valid_array_x = true, has_valid_array_y = true; - ierr = CeedVectorHasValidArray(x, &has_valid_array_x); CeedChk(ierr); - if (!has_valid_array_x) + CeedCall(CeedVectorHasValidArray(x, &has_valid_array_x)); + if (!has_valid_array_x) { // LCOV_EXCL_START - return CeedError(x->ceed, CEED_ERROR_BACKEND, - "CeedVector x has no valid data, " - "must set data with CeedVectorSetValue or CeedVectorSetArray"); - // LCOV_EXCL_STOP - ierr = CeedVectorHasValidArray(y, &has_valid_array_y); CeedChk(ierr); - if (!has_valid_array_y) + return CeedError(x->ceed, CEED_ERROR_BACKEND, "CeedVector x has no valid data, must set data 
with CeedVectorSetValue or CeedVectorSetArray"); + // LCOV_EXCL_STOP + } + CeedCall(CeedVectorHasValidArray(y, &has_valid_array_y)); + if (!has_valid_array_y) { // LCOV_EXCL_START - return CeedError(y->ceed, CEED_ERROR_BACKEND, - "CeedVector y has no valid data, " - "must set data with CeedVectorSetValue or CeedVectorSetArray"); - // LCOV_EXCL_STOP + return CeedError(y->ceed, CEED_ERROR_BACKEND, "CeedVector y has no valid data, must set data with CeedVectorSetValue or CeedVectorSetArray"); + // LCOV_EXCL_STOP + } // Backend implementation if (w->PointwiseMult) { - ierr = w->PointwiseMult(w, x, y); CeedChk(ierr); + CeedCall(w->PointwiseMult(w, x, y)); return CEED_ERROR_SUCCESS; } // Default implementation - ierr = CeedVectorGetArrayWrite(w, CEED_MEM_HOST, &w_array); CeedChk(ierr); + CeedCall(CeedVectorGetArrayWrite(w, CEED_MEM_HOST, &w_array)); if (x != w) { - ierr = CeedVectorGetArrayRead(x, CEED_MEM_HOST, &x_array); CeedChk(ierr); + CeedCall(CeedVectorGetArrayRead(x, CEED_MEM_HOST, &x_array)); } else { x_array = w_array; } + if (y != w && y != x) { - ierr = CeedVectorGetArrayRead(y, CEED_MEM_HOST, &y_array); CeedChk(ierr); + CeedCall(CeedVectorGetArrayRead(y, CEED_MEM_HOST, &y_array)); } else if (y != x) { y_array = w_array; } else { y_array = x_array; } - assert(w_array); assert(x_array); assert(y_array); + assert(w_array); + assert(x_array); + assert(y_array); - for (CeedInt i=0; iceed, CEED_ERROR_BACKEND, - "CeedVector has no valid data to compute reciprocal, " - "must set data with CeedVectorSetValue or CeedVectorSetArray"); - // LCOV_EXCL_STOP + "CeedVector has no valid data to compute reciprocal, must set data with CeedVectorSetValue or CeedVectorSetArray"); + // LCOV_EXCL_STOP + } // Check if vector data set - if (!vec->state) + if (!vec->state) { // LCOV_EXCL_START - return CeedError(vec->ceed, CEED_ERROR_INCOMPLETE, - "CeedVector must have data set to take reciprocal"); - // LCOV_EXCL_STOP + return CeedError(vec->ceed, CEED_ERROR_INCOMPLETE, "CeedVector 
must have data set to take reciprocal"); + // LCOV_EXCL_STOP + } // Backend impl for GPU, if added if (vec->Reciprocal) { - ierr = vec->Reciprocal(vec); CeedChk(ierr); + CeedCall(vec->Reciprocal(vec)); return CEED_ERROR_SUCCESS; } CeedSize len; - ierr = CeedVectorGetLength(vec, &len); CeedChk(ierr); + CeedCall(CeedVectorGetLength(vec, &len)); CeedScalar *array; - ierr = CeedVectorGetArrayWrite(vec, CEED_MEM_HOST, &array); CeedChk(ierr); - for (CeedInt i=0; i CEED_EPSILON) - array[i] = 1./array[i]; + CeedCall(CeedVectorGetArrayWrite(vec, CEED_MEM_HOST, &array)); + for (CeedInt i = 0; i < len; i++) { + if (fabs(array[i]) > CEED_EPSILON) array[i] = 1. / array[i]; + } - ierr = CeedVectorRestoreArray(vec, &array); CeedChk(ierr); + CeedCall(CeedVectorRestoreArray(vec, &array)); return CEED_ERROR_SUCCESS; } @@ -928,16 +851,14 @@ int CeedVectorReciprocal(CeedVector vec) { **/ int CeedVectorView(CeedVector vec, const char *fp_fmt, FILE *stream) { const CeedScalar *x; - - int ierr = CeedVectorGetArrayRead(vec, CEED_MEM_HOST, &x); CeedChk(ierr); + CeedCall(CeedVectorGetArrayRead(vec, CEED_MEM_HOST, &x)); char fmt[1024]; fprintf(stream, "CeedVector length %ld\n", (long)vec->length); snprintf(fmt, sizeof fmt, " %s\n", fp_fmt ? 
fp_fmt : "%g"); - for (CeedInt i=0; ilength; i++) - fprintf(stream, fmt, x[i]); + for (CeedInt i = 0; i < vec->length; i++) fprintf(stream, fmt, x[i]); - ierr = CeedVectorRestoreArrayRead(vec, &x); CeedChk(ierr); + CeedCall(CeedVectorRestoreArrayRead(vec, &x)); return CEED_ERROR_SUCCESS; } @@ -981,28 +902,21 @@ int CeedVectorGetLength(CeedVector vec, CeedSize *length) { @ref User **/ int CeedVectorDestroy(CeedVector *vec) { - int ierr; - if (!*vec || --(*vec)->ref_count > 0) return CEED_ERROR_SUCCESS; - if (((*vec)->state % 2) == 1) - return CeedError((*vec)->ceed, CEED_ERROR_ACCESS, - "Cannot destroy CeedVector, the writable access " - "lock is in use"); - - if ((*vec)->num_readers > 0) + if (((*vec)->state % 2) == 1) { + return CeedError((*vec)->ceed, CEED_ERROR_ACCESS, "Cannot destroy CeedVector, the writable access lock is in use"); + } + if ((*vec)->num_readers > 0) { // LCOV_EXCL_START - return CeedError((*vec)->ceed, CEED_ERROR_ACCESS, - "Cannot destroy CeedVector, a process has " - "read access"); - // LCOV_EXCL_STOP - - if ((*vec)->Destroy) { - ierr = (*vec)->Destroy(*vec); CeedChk(ierr); + return CeedError((*vec)->ceed, CEED_ERROR_ACCESS, "Cannot destroy CeedVector, a process has read access"); + // LCOV_EXCL_STOP } - ierr = CeedDestroy(&(*vec)->ceed); CeedChk(ierr); - ierr = CeedFree(vec); CeedChk(ierr); + if ((*vec)->Destroy) CeedCall((*vec)->Destroy(*vec)); + + CeedCall(CeedDestroy(&(*vec)->ceed)); + CeedCall(CeedFree(vec)); return CEED_ERROR_SUCCESS; } diff --git a/interface/ceed.c b/interface/ceed.c index 97721b96bc..2e8c2ad165 100644 --- a/interface/ceed.c +++ b/interface/ceed.c @@ -6,9 +6,9 @@ // This file is part of CEED: http://github.com/ceed #define _POSIX_C_SOURCE 200112 -#include -#include #include +#include +#include #include #include #include @@ -28,7 +28,7 @@ static struct { static size_t num_backends; #define CEED_FTABLE_ENTRY(class, method) \ - {#class #method, offsetof(struct class ##_private, method)} + { #class #method, 
offsetof(struct class##_private, method) } /// @endcond /// @file @@ -42,7 +42,7 @@ static size_t num_backends; This predefined constant is passed as the \ref CeedRequest argument to interfaces when the caller wishes for the operation to be performed - immediately. The code + immediately. The code @code CeedOperatorApply(op, ..., CEED_REQUEST_IMMEDIATE); @@ -65,7 +65,7 @@ CeedRequest *const CEED_REQUEST_IMMEDIATE = &ceed_request_immediate; This predefined constant is passed as the \ref CeedRequest argument to interfaces when the caller wishes for the operation to be completed in the - order that it is submitted to the device. It is typically used in a construct + order that it is submitted to the device. It is typically used in a construct such as @code @@ -98,10 +98,8 @@ CeedRequest *const CEED_REQUEST_ORDERED = &ceed_request_ordered; @ref User **/ int CeedRequestWait(CeedRequest *req) { - if (!*req) - return CEED_ERROR_SUCCESS; - return CeedError(NULL, CEED_ERROR_UNSUPPORTED, - "CeedRequestWait not implemented"); + if (!*req) return CEED_ERROR_SUCCESS; + return CeedError(NULL, CEED_ERROR_UNSUPPORTED, "CeedRequestWait not implemented"); } /// @} @@ -128,17 +126,16 @@ int CeedRequestWait(CeedRequest *req) { @ref Developer **/ -int CeedRegisterImpl(const char *prefix, int (*init)(const char *, Ceed), - unsigned int priority) { +int CeedRegisterImpl(const char *prefix, int (*init)(const char *, Ceed), unsigned int priority) { if (num_backends >= sizeof(backends) / sizeof(backends[0])) // LCOV_EXCL_START return CeedError(NULL, CEED_ERROR_MAJOR, "Too many backends"); // LCOV_EXCL_STOP strncpy(backends[num_backends].prefix, prefix, CEED_MAX_RESOURCE_LEN); - backends[num_backends].prefix[CEED_MAX_RESOURCE_LEN-1] = 0; - backends[num_backends].init = init; - backends[num_backends].priority = priority; + backends[num_backends].prefix[CEED_MAX_RESOURCE_LEN - 1] = 0; + backends[num_backends].init = init; + backends[num_backends].priority = priority; num_backends++; return 
CEED_ERROR_SUCCESS; } @@ -162,9 +159,7 @@ int CeedRegisterImpl(const char *prefix, int (*init)(const char *, Ceed), @ref Backend **/ // LCOV_EXCL_START -bool CeedDebugFlag(const Ceed ceed) { - return ceed->is_debug; -} +bool CeedDebugFlag(const Ceed ceed) { return ceed->is_debug; } // LCOV_EXCL_STOP /** @@ -176,9 +171,7 @@ bool CeedDebugFlag(const Ceed ceed) { @ref Backend **/ // LCOV_EXCL_START -bool CeedDebugFlagEnv(void) { - return !!getenv("CEED_DEBUG") || !!getenv("DEBUG") || !!getenv("DBG"); -} +bool CeedDebugFlagEnv(void) { return !!getenv("CEED_DEBUG") || !!getenv("DEBUG") || !!getenv("DBG"); } // LCOV_EXCL_STOP /** @@ -190,15 +183,13 @@ bool CeedDebugFlagEnv(void) { @ref Backend **/ // LCOV_EXCL_START -void CeedDebugImpl256(const unsigned char color, const char *format,...) { +void CeedDebugImpl256(const unsigned char color, const char *format, ...) { va_list args; va_start(args, format); fflush(stdout); - if (color != CEED_DEBUG_COLOR_NONE) - fprintf(stdout, "\033[38;5;%dm", color); + if (color != CEED_DEBUG_COLOR_NONE) fprintf(stdout, "\033[38;5;%dm", color); vfprintf(stdout, format, args); - if (color != CEED_DEBUG_COLOR_NONE) - fprintf(stdout, "\033[m"); + if (color != CEED_DEBUG_COLOR_NONE) fprintf(stdout, "\033[m"); fprintf(stdout, "\n"); fflush(stdout); va_end(args); @@ -222,13 +213,12 @@ void CeedDebugImpl256(const unsigned char color, const char *format,...) 
{ @ref Backend **/ int CeedMallocArray(size_t n, size_t unit, void *p) { - int ierr = posix_memalign((void **)p, CEED_ALIGN, n*unit); - if (ierr) + int ierr = posix_memalign((void **)p, CEED_ALIGN, n * unit); + if (ierr) { // LCOV_EXCL_START - return CeedError(NULL, CEED_ERROR_MAJOR, - "posix_memalign failed to allocate %zd " - "members of size %zd\n", n, unit); - // LCOV_EXCL_STOP + return CeedError(NULL, CEED_ERROR_MAJOR, "posix_memalign failed to allocate %zd members of size %zd\n", n, unit); + // LCOV_EXCL_STOP + } return CEED_ERROR_SUCCESS; } @@ -249,12 +239,11 @@ int CeedMallocArray(size_t n, size_t unit, void *p) { **/ int CeedCallocArray(size_t n, size_t unit, void *p) { *(void **)p = calloc(n, unit); - if (n && unit && !*(void **)p) + if (n && unit && !*(void **)p) { // LCOV_EXCL_START - return CeedError(NULL, CEED_ERROR_MAJOR, - "calloc failed to allocate %zd members of size " - "%zd\n", n, unit); - // LCOV_EXCL_STOP + return CeedError(NULL, CEED_ERROR_MAJOR, "calloc failed to allocate %zd members of size %zd\n", n, unit); + // LCOV_EXCL_STOP + } return CEED_ERROR_SUCCESS; } @@ -274,13 +263,12 @@ int CeedCallocArray(size_t n, size_t unit, void *p) { @ref Backend **/ int CeedReallocArray(size_t n, size_t unit, void *p) { - *(void **)p = realloc(*(void **)p, n*unit); - if (n && unit && !*(void **)p) + *(void **)p = realloc(*(void **)p, n * unit); + if (n && unit && !*(void **)p) { // LCOV_EXCL_START - return CeedError(NULL, CEED_ERROR_MAJOR, - "realloc failed to allocate %zd members of size " - "%zd\n", n, unit); - // LCOV_EXCL_STOP + return CeedError(NULL, CEED_ERROR_MAJOR, "realloc failed to allocate %zd members of size %zd\n", n, unit); + // LCOV_EXCL_STOP + } return CEED_ERROR_SUCCESS; } @@ -299,9 +287,8 @@ int CeedReallocArray(size_t n, size_t unit, void *p) { @ref Backend **/ int CeedStringAllocCopy(const char *source, char **copy) { - int ierr; size_t len = strlen(source); - ierr = CeedCalloc(len + 1, copy); CeedChk(ierr); + CeedCall(CeedCalloc(len + 
1, copy)); memcpy(*copy, source, len); return CEED_ERROR_SUCCESS; } @@ -333,8 +320,7 @@ int CeedFree(void *p) { @ref Backend **/ -int CeedRegister(const char *prefix, int (*init)(const char *, Ceed), - unsigned int priority) { +int CeedRegister(const char *prefix, int (*init)(const char *, Ceed), unsigned int priority) { CeedDebugEnv("Backend Register: %s", prefix); CeedRegisterImpl(prefix, init, priority); return CEED_ERROR_SUCCESS; @@ -366,9 +352,8 @@ int CeedIsDebug(Ceed ceed, bool *is_debug) { @ref Backend **/ int CeedGetParent(Ceed ceed, Ceed *parent) { - int ierr; if (ceed->parent) { - ierr = CeedGetParent(ceed->parent, parent); CeedChk(ierr); + CeedCall(CeedGetParent(ceed->parent, parent)); return CEED_ERROR_SUCCESS; } *parent = ceed; @@ -405,7 +390,7 @@ int CeedGetDelegate(Ceed ceed, Ceed *delegate) { @ref Backend **/ int CeedSetDelegate(Ceed ceed, Ceed delegate) { - ceed->delegate = delegate; + ceed->delegate = delegate; delegate->parent = ceed; return CEED_ERROR_SUCCESS; } @@ -422,17 +407,16 @@ int CeedSetDelegate(Ceed ceed, Ceed delegate) { @ref Backend **/ int CeedGetObjectDelegate(Ceed ceed, Ceed *delegate, const char *obj_name) { - CeedInt ierr; - // Check for object delegate - for (CeedInt i=0; iobj_delegate_count; i++) + for (CeedInt i = 0; i < ceed->obj_delegate_count; i++) { if (!strcmp(obj_name, ceed->obj_delegates->obj_name)) { *delegate = ceed->obj_delegates->delegate; return CEED_ERROR_SUCCESS; } + } // Use default delegate if no object delegate - ierr = CeedGetDelegate(ceed, delegate); CeedChk(ierr); + CeedCall(CeedGetDelegate(ceed, delegate)); return CEED_ERROR_SUCCESS; } @@ -454,21 +438,19 @@ int CeedGetObjectDelegate(Ceed ceed, Ceed *delegate, const char *obj_name) { @ref Backend **/ int CeedSetObjectDelegate(Ceed ceed, Ceed delegate, const char *obj_name) { - int ierr; CeedInt count = ceed->obj_delegate_count; // Malloc or Realloc if (count) { - ierr = CeedRealloc(count+1, &ceed->obj_delegates); CeedChk(ierr); + CeedCall(CeedRealloc(count 
+ 1, &ceed->obj_delegates)); } else { - ierr = CeedCalloc(1, &ceed->obj_delegates); CeedChk(ierr); + CeedCall(CeedCalloc(1, &ceed->obj_delegates)); } ceed->obj_delegate_count++; // Set object delegate ceed->obj_delegates[count].delegate = delegate; - ierr = CeedStringAllocCopy(obj_name, &ceed->obj_delegates[count].obj_name); - CeedChk(ierr); + CeedCall(CeedStringAllocCopy(obj_name, &ceed->obj_delegates[count].obj_name)); // Set delegate parent delegate->parent = ceed; @@ -503,26 +485,23 @@ int CeedGetOperatorFallbackResource(Ceed ceed, const char **resource) { **/ int CeedGetOperatorFallbackCeed(Ceed ceed, Ceed *fallback_ceed) { - int ierr; - if (ceed->has_valid_op_fallback_resource) { CeedDebug256(ceed, 1, "---------- CeedOperator Fallback ----------\n"); - CeedDebug(ceed, "Getting fallback from %s to %s\n", ceed->resource, - ceed->op_fallback_resource); + CeedDebug(ceed, "Getting fallback from %s to %s\n", ceed->resource, ceed->op_fallback_resource); } // Create fallback Ceed if uninitalized if (!ceed->op_fallback_ceed && ceed->has_valid_op_fallback_resource) { CeedDebug(ceed, "Creating fallback Ceed"); - Ceed fallback_ceed; + Ceed fallback_ceed; const char *fallback_resource; - ierr = CeedGetOperatorFallbackResource(ceed, &fallback_resource); CeedChk(ierr); - ierr = CeedInit(fallback_resource, &fallback_ceed); CeedChk(ierr); + CeedCall(CeedGetOperatorFallbackResource(ceed, &fallback_resource)); + CeedCall(CeedInit(fallback_resource, &fallback_ceed)); fallback_ceed->op_fallback_parent = ceed; - fallback_ceed->Error = ceed->Error; - ceed->op_fallback_ceed = fallback_ceed; + fallback_ceed->Error = ceed->Error; + ceed->op_fallback_ceed = fallback_ceed; } *fallback_ceed = ceed->op_fallback_ceed; @@ -543,19 +522,14 @@ int CeedGetOperatorFallbackCeed(Ceed ceed, Ceed *fallback_ceed) { **/ int CeedSetOperatorFallbackResource(Ceed ceed, const char *resource) { - int ierr; - // Free old - ierr = CeedFree(&ceed->op_fallback_resource); CeedChk(ierr); + 
CeedCall(CeedFree(&ceed->op_fallback_resource)); // Set new - ierr = CeedStringAllocCopy(resource, (char **)&ceed->op_fallback_resource); - CeedChk(ierr); + CeedCall(CeedStringAllocCopy(resource, (char **)&ceed->op_fallback_resource)); // Check validity - ceed->has_valid_op_fallback_resource = ceed->op_fallback_resource && - ceed->resource && - strcmp(ceed->op_fallback_resource, ceed->resource); + ceed->has_valid_op_fallback_resource = ceed->op_fallback_resource && ceed->resource && strcmp(ceed->op_fallback_resource, ceed->resource); return CEED_ERROR_SUCCESS; } @@ -614,29 +588,26 @@ int CeedSetDeterministic(Ceed ceed, bool is_deterministic) { @ref Backend **/ -int CeedSetBackendFunction(Ceed ceed, const char *type, void *object, - const char *func_name, int (*f)()) { - char lookup_name[CEED_MAX_RESOURCE_LEN+1] = ""; +int CeedSetBackendFunction(Ceed ceed, const char *type, void *object, const char *func_name, int (*f)()) { + char lookup_name[CEED_MAX_RESOURCE_LEN + 1] = ""; // Build lookup name - if (strcmp(type, "Ceed")) - strncat (lookup_name, "Ceed", CEED_MAX_RESOURCE_LEN); + if (strcmp(type, "Ceed")) strncat(lookup_name, "Ceed", CEED_MAX_RESOURCE_LEN); strncat(lookup_name, type, CEED_MAX_RESOURCE_LEN); strncat(lookup_name, func_name, CEED_MAX_RESOURCE_LEN); // Find and use offset - for (CeedInt i = 0; ceed->f_offsets[i].func_name; i++) + for (CeedInt i = 0; ceed->f_offsets[i].func_name; i++) { if (!strcmp(ceed->f_offsets[i].func_name, lookup_name)) { - size_t offset = ceed->f_offsets[i].offset; - int (**fpointer)(void) = (int (**)(void))((char *)object + offset); // *NOPAD* - *fpointer = f; + size_t offset = ceed->f_offsets[i].offset; + int (**fpointer)(void) = (int (**)(void))((char *)object + offset); // *NOPAD* + *fpointer = f; return CEED_ERROR_SUCCESS; } + } // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, - "Requested function '%s' was not found for CEED " - "object '%s'", func_name, type); + return CeedError(ceed, 
CEED_ERROR_UNSUPPORTED, "Requested function '%s' was not found for CEED object '%s'", func_name, type); // LCOV_EXCL_STOP } @@ -706,18 +677,15 @@ int CeedReference(Ceed ceed) { @ref User **/ // LCOV_EXCL_START -int CeedRegistryGetList(size_t *n, char ***const resources, - CeedInt **priorities) { - *n = 0; +int CeedRegistryGetList(size_t *n, char ***const resources, CeedInt **priorities) { + *n = 0; *resources = malloc(num_backends * sizeof(**resources)); - if (!resources) - return CeedError(NULL, CEED_ERROR_MAJOR, "malloc() failure"); + if (!resources) return CeedError(NULL, CEED_ERROR_MAJOR, "malloc() failure"); if (priorities) { *priorities = malloc(num_backends * sizeof(**priorities)); - if (!priorities) - return CeedError(NULL, CEED_ERROR_MAJOR, "malloc() failure"); + if (!priorities) return CeedError(NULL, CEED_ERROR_MAJOR, "malloc() failure"); } - for (size_t i=0; i match_len || (n == match_len && match_priority > priority)) { - match_len = n; + match_len = n; match_priority = priority; - match_index = i; + match_index = i; } } // Using Levenshtein distance to find closest match if (match_len <= 1 || match_len != stem_length) { // LCOV_EXCL_START - size_t lev_dis = UINT_MAX; + size_t lev_dis = UINT_MAX; size_t lev_index = UINT_MAX, lev_priority = CEED_MAX_BACKEND_PRIORITY; - for (size_t i=0; i priority)) { - lev_dis = n; + if (n < lev_dis || (n == lev_dis && lev_priority > priority)) { + lev_dis = n; lev_priority = priority; - lev_index = i; + lev_index = i; } } const char *prefix_lev = backends[lev_index].prefix; - size_t lev_length; - for (lev_length=0; prefix_lev[lev_length] - && prefix_lev[lev_length] != '\0'; lev_length++) {} + size_t lev_length = 0; + while (prefix_lev[lev_length] && prefix_lev[lev_length] != '\0') lev_length++; size_t m = (lev_length < stem_length) ? 
lev_length : stem_length; - if (lev_dis+1 >= m) { - return CeedError(NULL, CEED_ERROR_MAJOR, "No suitable backend: %s", - resource); + if (lev_dis + 1 >= m) { + return CeedError(NULL, CEED_ERROR_MAJOR, "No suitable backend: %s", resource); } else { - return CeedError(NULL, CEED_ERROR_MAJOR, "No suitable backend: %s\n" - "Closest match: %s", resource, backends[lev_index].prefix); + return CeedError(NULL, CEED_ERROR_MAJOR, + "No suitable backend: %s\n" + "Closest match: %s", + resource, backends[lev_index].prefix); } // LCOV_EXCL_STOP } // Setup Ceed - ierr = CeedCalloc(1, ceed); CeedChk(ierr); - ierr = CeedCalloc(1, &(*ceed)->jit_source_roots); CeedChk(ierr); + CeedCall(CeedCalloc(1, ceed)); + CeedCall(CeedCalloc(1, &(*ceed)->jit_source_roots)); const char *ceed_error_handler = getenv("CEED_ERROR_HANDLER"); - if (!ceed_error_handler) - ceed_error_handler = "abort"; - if (!strcmp(ceed_error_handler, "exit")) - (*ceed)->Error = CeedErrorExit; - else if (!strcmp(ceed_error_handler, "store")) - (*ceed)->Error = CeedErrorStore; - else - (*ceed)->Error = CeedErrorAbort; + if (!ceed_error_handler) ceed_error_handler = "abort"; + if (!strcmp(ceed_error_handler, "exit")) (*ceed)->Error = CeedErrorExit; + else if (!strcmp(ceed_error_handler, "store")) (*ceed)->Error = CeedErrorStore; + else (*ceed)->Error = CeedErrorAbort; memcpy((*ceed)->err_msg, "No error message stored", 24); (*ceed)->ref_count = 1; - (*ceed)->data = NULL; + (*ceed)->data = NULL; // Set lookup table FOffset f_offsets[] = { - CEED_FTABLE_ENTRY(Ceed, Error), - CEED_FTABLE_ENTRY(Ceed, GetPreferredMemType), - CEED_FTABLE_ENTRY(Ceed, Destroy), - CEED_FTABLE_ENTRY(Ceed, VectorCreate), - CEED_FTABLE_ENTRY(Ceed, ElemRestrictionCreate), - CEED_FTABLE_ENTRY(Ceed, ElemRestrictionCreateOriented), - CEED_FTABLE_ENTRY(Ceed, ElemRestrictionCreateBlocked), - CEED_FTABLE_ENTRY(Ceed, BasisCreateTensorH1), - CEED_FTABLE_ENTRY(Ceed, BasisCreateH1), - CEED_FTABLE_ENTRY(Ceed, BasisCreateHdiv), - CEED_FTABLE_ENTRY(Ceed, 
TensorContractCreate), - CEED_FTABLE_ENTRY(Ceed, QFunctionCreate), - CEED_FTABLE_ENTRY(Ceed, QFunctionContextCreate), - CEED_FTABLE_ENTRY(Ceed, OperatorCreate), - CEED_FTABLE_ENTRY(Ceed, CompositeOperatorCreate), - CEED_FTABLE_ENTRY(CeedVector, HasValidArray), - CEED_FTABLE_ENTRY(CeedVector, HasBorrowedArrayOfType), - CEED_FTABLE_ENTRY(CeedVector, SetArray), - CEED_FTABLE_ENTRY(CeedVector, TakeArray), - CEED_FTABLE_ENTRY(CeedVector, SetValue), - CEED_FTABLE_ENTRY(CeedVector, SyncArray), - CEED_FTABLE_ENTRY(CeedVector, GetArray), - CEED_FTABLE_ENTRY(CeedVector, GetArrayRead), - CEED_FTABLE_ENTRY(CeedVector, GetArrayWrite), - CEED_FTABLE_ENTRY(CeedVector, RestoreArray), - CEED_FTABLE_ENTRY(CeedVector, RestoreArrayRead), - CEED_FTABLE_ENTRY(CeedVector, Norm), - CEED_FTABLE_ENTRY(CeedVector, Scale), - CEED_FTABLE_ENTRY(CeedVector, AXPY), - CEED_FTABLE_ENTRY(CeedVector, PointwiseMult), - CEED_FTABLE_ENTRY(CeedVector, Reciprocal), - CEED_FTABLE_ENTRY(CeedVector, Destroy), - CEED_FTABLE_ENTRY(CeedElemRestriction, Apply), - CEED_FTABLE_ENTRY(CeedElemRestriction, ApplyBlock), - CEED_FTABLE_ENTRY(CeedElemRestriction, GetOffsets), - CEED_FTABLE_ENTRY(CeedElemRestriction, Destroy), - CEED_FTABLE_ENTRY(CeedBasis, Apply), - CEED_FTABLE_ENTRY(CeedBasis, Destroy), - CEED_FTABLE_ENTRY(CeedTensorContract, Apply), - CEED_FTABLE_ENTRY(CeedTensorContract, Destroy), - CEED_FTABLE_ENTRY(CeedQFunction, Apply), - CEED_FTABLE_ENTRY(CeedQFunction, SetCUDAUserFunction), - CEED_FTABLE_ENTRY(CeedQFunction, SetHIPUserFunction), - CEED_FTABLE_ENTRY(CeedQFunction, Destroy), - CEED_FTABLE_ENTRY(CeedQFunctionContext, HasValidData), - CEED_FTABLE_ENTRY(CeedQFunctionContext, HasBorrowedDataOfType), - CEED_FTABLE_ENTRY(CeedQFunctionContext, SetData), - CEED_FTABLE_ENTRY(CeedQFunctionContext, TakeData), - CEED_FTABLE_ENTRY(CeedQFunctionContext, GetData), - CEED_FTABLE_ENTRY(CeedQFunctionContext, GetDataRead), - CEED_FTABLE_ENTRY(CeedQFunctionContext, RestoreData), - 
CEED_FTABLE_ENTRY(CeedQFunctionContext, RestoreDataRead), - CEED_FTABLE_ENTRY(CeedQFunctionContext, DataDestroy), - CEED_FTABLE_ENTRY(CeedQFunctionContext, Destroy), - CEED_FTABLE_ENTRY(CeedOperator, LinearAssembleQFunction), - CEED_FTABLE_ENTRY(CeedOperator, LinearAssembleQFunctionUpdate), - CEED_FTABLE_ENTRY(CeedOperator, LinearAssembleDiagonal), - CEED_FTABLE_ENTRY(CeedOperator, LinearAssembleAddDiagonal), - CEED_FTABLE_ENTRY(CeedOperator, LinearAssemblePointBlockDiagonal), - CEED_FTABLE_ENTRY(CeedOperator, LinearAssembleAddPointBlockDiagonal), - CEED_FTABLE_ENTRY(CeedOperator, LinearAssembleSymbolic), - CEED_FTABLE_ENTRY(CeedOperator, LinearAssemble), - CEED_FTABLE_ENTRY(CeedOperator, LinearAssembleSingle), - CEED_FTABLE_ENTRY(CeedOperator, CreateFDMElementInverse), - CEED_FTABLE_ENTRY(CeedOperator, Apply), - CEED_FTABLE_ENTRY(CeedOperator, ApplyComposite), - CEED_FTABLE_ENTRY(CeedOperator, ApplyAdd), - CEED_FTABLE_ENTRY(CeedOperator, ApplyAddComposite), - CEED_FTABLE_ENTRY(CeedOperator, ApplyJacobian), - CEED_FTABLE_ENTRY(CeedOperator, Destroy), - {NULL, 0} // End of lookup table - used in SetBackendFunction loop + CEED_FTABLE_ENTRY(Ceed, Error), + CEED_FTABLE_ENTRY(Ceed, GetPreferredMemType), + CEED_FTABLE_ENTRY(Ceed, Destroy), + CEED_FTABLE_ENTRY(Ceed, VectorCreate), + CEED_FTABLE_ENTRY(Ceed, ElemRestrictionCreate), + CEED_FTABLE_ENTRY(Ceed, ElemRestrictionCreateOriented), + CEED_FTABLE_ENTRY(Ceed, ElemRestrictionCreateBlocked), + CEED_FTABLE_ENTRY(Ceed, BasisCreateTensorH1), + CEED_FTABLE_ENTRY(Ceed, BasisCreateH1), + CEED_FTABLE_ENTRY(Ceed, BasisCreateHdiv), + CEED_FTABLE_ENTRY(Ceed, TensorContractCreate), + CEED_FTABLE_ENTRY(Ceed, QFunctionCreate), + CEED_FTABLE_ENTRY(Ceed, QFunctionContextCreate), + CEED_FTABLE_ENTRY(Ceed, OperatorCreate), + CEED_FTABLE_ENTRY(Ceed, CompositeOperatorCreate), + CEED_FTABLE_ENTRY(CeedVector, HasValidArray), + CEED_FTABLE_ENTRY(CeedVector, HasBorrowedArrayOfType), + CEED_FTABLE_ENTRY(CeedVector, SetArray), + 
CEED_FTABLE_ENTRY(CeedVector, TakeArray), + CEED_FTABLE_ENTRY(CeedVector, SetValue), + CEED_FTABLE_ENTRY(CeedVector, SyncArray), + CEED_FTABLE_ENTRY(CeedVector, GetArray), + CEED_FTABLE_ENTRY(CeedVector, GetArrayRead), + CEED_FTABLE_ENTRY(CeedVector, GetArrayWrite), + CEED_FTABLE_ENTRY(CeedVector, RestoreArray), + CEED_FTABLE_ENTRY(CeedVector, RestoreArrayRead), + CEED_FTABLE_ENTRY(CeedVector, Norm), + CEED_FTABLE_ENTRY(CeedVector, Scale), + CEED_FTABLE_ENTRY(CeedVector, AXPY), + CEED_FTABLE_ENTRY(CeedVector, PointwiseMult), + CEED_FTABLE_ENTRY(CeedVector, Reciprocal), + CEED_FTABLE_ENTRY(CeedVector, Destroy), + CEED_FTABLE_ENTRY(CeedElemRestriction, Apply), + CEED_FTABLE_ENTRY(CeedElemRestriction, ApplyBlock), + CEED_FTABLE_ENTRY(CeedElemRestriction, GetOffsets), + CEED_FTABLE_ENTRY(CeedElemRestriction, Destroy), + CEED_FTABLE_ENTRY(CeedBasis, Apply), + CEED_FTABLE_ENTRY(CeedBasis, Destroy), + CEED_FTABLE_ENTRY(CeedTensorContract, Apply), + CEED_FTABLE_ENTRY(CeedTensorContract, Destroy), + CEED_FTABLE_ENTRY(CeedQFunction, Apply), + CEED_FTABLE_ENTRY(CeedQFunction, SetCUDAUserFunction), + CEED_FTABLE_ENTRY(CeedQFunction, SetHIPUserFunction), + CEED_FTABLE_ENTRY(CeedQFunction, Destroy), + CEED_FTABLE_ENTRY(CeedQFunctionContext, HasValidData), + CEED_FTABLE_ENTRY(CeedQFunctionContext, HasBorrowedDataOfType), + CEED_FTABLE_ENTRY(CeedQFunctionContext, SetData), + CEED_FTABLE_ENTRY(CeedQFunctionContext, TakeData), + CEED_FTABLE_ENTRY(CeedQFunctionContext, GetData), + CEED_FTABLE_ENTRY(CeedQFunctionContext, GetDataRead), + CEED_FTABLE_ENTRY(CeedQFunctionContext, RestoreData), + CEED_FTABLE_ENTRY(CeedQFunctionContext, RestoreDataRead), + CEED_FTABLE_ENTRY(CeedQFunctionContext, DataDestroy), + CEED_FTABLE_ENTRY(CeedQFunctionContext, Destroy), + CEED_FTABLE_ENTRY(CeedOperator, LinearAssembleQFunction), + CEED_FTABLE_ENTRY(CeedOperator, LinearAssembleQFunctionUpdate), + CEED_FTABLE_ENTRY(CeedOperator, LinearAssembleDiagonal), + CEED_FTABLE_ENTRY(CeedOperator, 
LinearAssembleAddDiagonal), + CEED_FTABLE_ENTRY(CeedOperator, LinearAssemblePointBlockDiagonal), + CEED_FTABLE_ENTRY(CeedOperator, LinearAssembleAddPointBlockDiagonal), + CEED_FTABLE_ENTRY(CeedOperator, LinearAssembleSymbolic), + CEED_FTABLE_ENTRY(CeedOperator, LinearAssemble), + CEED_FTABLE_ENTRY(CeedOperator, LinearAssembleSingle), + CEED_FTABLE_ENTRY(CeedOperator, CreateFDMElementInverse), + CEED_FTABLE_ENTRY(CeedOperator, Apply), + CEED_FTABLE_ENTRY(CeedOperator, ApplyComposite), + CEED_FTABLE_ENTRY(CeedOperator, ApplyAdd), + CEED_FTABLE_ENTRY(CeedOperator, ApplyAddComposite), + CEED_FTABLE_ENTRY(CeedOperator, ApplyJacobian), + CEED_FTABLE_ENTRY(CeedOperator, Destroy), + {NULL, 0} // End of lookup table - used in SetBackendFunction loop }; - ierr = CeedCalloc(sizeof(f_offsets), &(*ceed)->f_offsets); CeedChk(ierr); + CeedCall(CeedCalloc(sizeof(f_offsets), &(*ceed)->f_offsets)); memcpy((*ceed)->f_offsets, f_offsets, sizeof(f_offsets)); // Set fallback for advanced CeedOperator functions const char fallbackresource[] = ""; - ierr = CeedSetOperatorFallbackResource(*ceed, fallbackresource); - CeedChk(ierr); + CeedCall(CeedSetOperatorFallbackResource(*ceed, fallbackresource)); // Record env variables CEED_DEBUG or DBG - (*ceed)->is_debug = !!getenv("CEED_DEBUG") || !!getenv("DEBUG") || - !!getenv("DBG"); + (*ceed)->is_debug = !!getenv("CEED_DEBUG") || !!getenv("DEBUG") || !!getenv("DBG"); // Copy resource prefix, if backend setup successful - ierr = CeedStringAllocCopy(backends[match_index].prefix, - (char **)&(*ceed)->resource); - CeedChk(ierr); + CeedCall(CeedStringAllocCopy(backends[match_index].prefix, (char **)&(*ceed)->resource)); // Set default JiT source root // Note: there will always be the default root for every Ceed // but all additional paths are added to the top-most parent - ierr = CeedAddJitSourceRoot(*ceed, (char *)CeedJitSourceRootDefault); - CeedChk(ierr); + CeedCall(CeedAddJitSourceRoot(*ceed, (char *)CeedJitSourceRootDefault)); // Backend 
specific setup - ierr = backends[match_index].init(&resource[match_help], *ceed); CeedChk(ierr); + CeedCall(backends[match_index].init(&resource[match_help], *ceed)); return CEED_ERROR_SUCCESS; } @@ -985,10 +937,8 @@ int CeedInit(const char *resource, Ceed *ceed) { @ref User **/ int CeedReferenceCopy(Ceed ceed, Ceed *ceed_copy) { - int ierr; - - ierr = CeedReference(ceed); CeedChk(ierr); - ierr = CeedDestroy(ceed_copy); CeedChk(ierr); + CeedCall(CeedReference(ceed)); + CeedCall(CeedDestroy(ceed_copy)); *ceed_copy = ceed; return CEED_ERROR_SUCCESS; } @@ -1019,16 +969,14 @@ int CeedGetResource(Ceed ceed, const char **resource) { @ref User **/ int CeedGetPreferredMemType(Ceed ceed, CeedMemType *mem_type) { - int ierr; - if (ceed->GetPreferredMemType) { - ierr = ceed->GetPreferredMemType(mem_type); CeedChk(ierr); + CeedCall(ceed->GetPreferredMemType(mem_type)); } else { Ceed delegate; - ierr = CeedGetDelegate(ceed, &delegate); CeedChk(ierr); + CeedCall(CeedGetDelegate(ceed, &delegate)); if (delegate) { - ierr = CeedGetPreferredMemType(delegate, mem_type); CeedChk(ierr); + CeedCall(CeedGetPreferredMemType(delegate, mem_type)); } else { *mem_type = CEED_MEM_HOST; } @@ -1062,16 +1010,14 @@ int CeedIsDeterministic(Ceed ceed, bool *is_deterministic) { @ref User **/ int CeedAddJitSourceRoot(Ceed ceed, const char *jit_source_root) { - int ierr; Ceed ceed_parent; - ierr = CeedGetParent(ceed, &ceed_parent); CeedChk(ierr); + CeedCall(CeedGetParent(ceed, &ceed_parent)); - CeedInt index = ceed_parent->num_jit_source_roots; - size_t path_length = strlen(jit_source_root); - ierr = CeedRealloc(index + 1, &ceed_parent->jit_source_roots); CeedChk(ierr); - ierr = CeedCalloc(path_length + 1, &ceed_parent->jit_source_roots[index]); - CeedChk(ierr); + CeedInt index = ceed_parent->num_jit_source_roots; + size_t path_length = strlen(jit_source_root); + CeedCall(CeedRealloc(index + 1, &ceed_parent->jit_source_roots)); + CeedCall(CeedCalloc(path_length + 1, 
&ceed_parent->jit_source_roots[index])); memcpy(ceed_parent->jit_source_roots[index], jit_source_root, path_length); ceed_parent->num_jit_source_roots++; @@ -1089,12 +1035,12 @@ int CeedAddJitSourceRoot(Ceed ceed, const char *jit_source_root) { @ref User **/ int CeedView(Ceed ceed, FILE *stream) { - int ierr; CeedMemType mem_type; - ierr = CeedGetPreferredMemType(ceed, &mem_type); CeedChk(ierr); + CeedCall(CeedGetPreferredMemType(ceed, &mem_type)); - fprintf(stream, "Ceed\n" + fprintf(stream, + "Ceed\n" " Ceed Resource: %s\n" " Preferred MemType: %s\n", ceed->resource, CeedMemTypes[mem_type]); @@ -1111,45 +1057,38 @@ int CeedView(Ceed ceed, FILE *stream) { @ref User **/ int CeedDestroy(Ceed *ceed) { - int ierr; if (!*ceed || --(*ceed)->ref_count > 0) return CEED_ERROR_SUCCESS; - if ((*ceed)->delegate) { - ierr = CeedDestroy(&(*ceed)->delegate); CeedChk(ierr); - } + if ((*ceed)->delegate) CeedCall(CeedDestroy(&(*ceed)->delegate)); if ((*ceed)->obj_delegate_count > 0) { for (CeedInt i = 0; i < (*ceed)->obj_delegate_count; i++) { - ierr = CeedDestroy(&((*ceed)->obj_delegates[i].delegate)); CeedChk(ierr); - ierr = CeedFree(&(*ceed)->obj_delegates[i].obj_name); CeedChk(ierr); + CeedCall(CeedDestroy(&((*ceed)->obj_delegates[i].delegate))); + CeedCall(CeedFree(&(*ceed)->obj_delegates[i].obj_name)); } - ierr = CeedFree(&(*ceed)->obj_delegates); CeedChk(ierr); + CeedCall(CeedFree(&(*ceed)->obj_delegates)); } - if ((*ceed)->Destroy) { - ierr = (*ceed)->Destroy(*ceed); CeedChk(ierr); - } + if ((*ceed)->Destroy) CeedCall((*ceed)->Destroy(*ceed)); for (CeedInt i = 0; i < (*ceed)->num_jit_source_roots; i++) { - ierr = CeedFree(&(*ceed)->jit_source_roots[i]); CeedChk(ierr); + CeedCall(CeedFree(&(*ceed)->jit_source_roots[i])); } - ierr = CeedFree(&(*ceed)->jit_source_roots); CeedChk(ierr); + CeedCall(CeedFree(&(*ceed)->jit_source_roots)); - ierr = CeedFree(&(*ceed)->f_offsets); CeedChk(ierr); - ierr = CeedFree(&(*ceed)->resource); CeedChk(ierr); - ierr = 
CeedDestroy(&(*ceed)->op_fallback_ceed); CeedChk(ierr); - ierr = CeedFree(&(*ceed)->op_fallback_resource); CeedChk(ierr); - ierr = CeedFree(ceed); CeedChk(ierr); + CeedCall(CeedFree(&(*ceed)->f_offsets)); + CeedCall(CeedFree(&(*ceed)->resource)); + CeedCall(CeedDestroy(&(*ceed)->op_fallback_ceed)); + CeedCall(CeedFree(&(*ceed)->op_fallback_resource)); + CeedCall(CeedFree(ceed)); return CEED_ERROR_SUCCESS; } // LCOV_EXCL_START const char *CeedErrorFormat(Ceed ceed, const char *format, va_list *args) { - if (ceed->parent) - return CeedErrorFormat(ceed->parent, format, args); - if (ceed->op_fallback_parent) - return CeedErrorFormat(ceed->op_fallback_parent, format, args); + if (ceed->parent) return CeedErrorFormat(ceed->parent, format, args); + if (ceed->op_fallback_parent) return CeedErrorFormat(ceed->op_fallback_parent, format, args); // Using pointer to va_list for better FFI, but clang-tidy can't verify va_list is initalized - vsnprintf(ceed->err_msg, CEED_MAX_RESOURCE_LEN, format, *args); // NOLINT + vsnprintf(ceed->err_msg, CEED_MAX_RESOURCE_LEN, format, *args); // NOLINT return ceed->err_msg; } // LCOV_EXCL_STOP @@ -1159,20 +1098,17 @@ const char *CeedErrorFormat(Ceed ceed, const char *format, va_list *args) { @ref Developer **/ -int CeedErrorImpl(Ceed ceed, const char *filename, int lineno, const char *func, - int ecode, const char *format, ...) { +int CeedErrorImpl(Ceed ceed, const char *filename, int lineno, const char *func, int ecode, const char *format, ...) 
{ va_list args; - int ret_val; + int ret_val; va_start(args, format); if (ceed) { ret_val = ceed->Error(ceed, filename, lineno, func, ecode, format, &args); } else { // LCOV_EXCL_START const char *ceed_error_handler = getenv("CEED_ERROR_HANDLER"); - if (!ceed_error_handler) - ceed_error_handler = "abort"; - if (!strcmp(ceed_error_handler, "return")) - ret_val = CeedErrorReturn(ceed, filename, lineno, func, ecode, format, &args); + if (!ceed_error_handler) ceed_error_handler = "abort"; + if (!strcmp(ceed_error_handler, "return")) ret_val = CeedErrorReturn(ceed, filename, lineno, func, ecode, format, &args); else // This function will not return ret_val = CeedErrorAbort(ceed, filename, lineno, func, ecode, format, &args); @@ -1190,9 +1126,7 @@ int CeedErrorImpl(Ceed ceed, const char *filename, int lineno, const char *func, @ref Developer **/ // LCOV_EXCL_START -int CeedErrorReturn(Ceed ceed, const char *filename, int line_no, - const char *func, int err_code, const char *format, - va_list *args) { +int CeedErrorReturn(Ceed ceed, const char *filename, int line_no, const char *func, int err_code, const char *format, va_list *args) { return err_code; } // LCOV_EXCL_STOP @@ -1206,23 +1140,16 @@ int CeedErrorReturn(Ceed ceed, const char *filename, int line_no, @ref Developer **/ // LCOV_EXCL_START -int CeedErrorStore(Ceed ceed, const char *filename, int line_no, - const char *func, int err_code, const char *format, - va_list *args) { - if (ceed->parent) - return CeedErrorStore(ceed->parent, filename, line_no, func, err_code, format, - args); - if (ceed->op_fallback_parent) - return CeedErrorStore(ceed->op_fallback_parent, filename, line_no, func, - err_code, format, args); +int CeedErrorStore(Ceed ceed, const char *filename, int line_no, const char *func, int err_code, const char *format, va_list *args) { + if (ceed->parent) return CeedErrorStore(ceed->parent, filename, line_no, func, err_code, format, args); + if (ceed->op_fallback_parent) return 
CeedErrorStore(ceed->op_fallback_parent, filename, line_no, func, err_code, format, args); // Build message int len; - len = snprintf(ceed->err_msg, CEED_MAX_RESOURCE_LEN, "%s:%d in %s(): ", - filename, line_no, func); + len = snprintf(ceed->err_msg, CEED_MAX_RESOURCE_LEN, "%s:%d in %s(): ", filename, line_no, func); // Using pointer to va_list for better FFI, but clang-tidy can't verify va_list is initalized // *INDENT-OFF* - vsnprintf(ceed->err_msg + len, CEED_MAX_RESOURCE_LEN - len, format, *args); // NOLINT + vsnprintf(ceed->err_msg + len, CEED_MAX_RESOURCE_LEN - len, format, *args); // NOLINT // *INDENT-ON* return err_code; } @@ -1236,9 +1163,7 @@ int CeedErrorStore(Ceed ceed, const char *filename, int line_no, @ref Developer **/ // LCOV_EXCL_START -int CeedErrorAbort(Ceed ceed, const char *filename, int line_no, - const char *func, int err_code, const char *format, - va_list *args) { +int CeedErrorAbort(Ceed ceed, const char *filename, int line_no, const char *func, int err_code, const char *format, va_list *args) { fprintf(stderr, "%s:%d in %s(): ", filename, line_no, func); vfprintf(stderr, format, *args); fprintf(stderr, "\n"); @@ -1257,11 +1182,10 @@ int CeedErrorAbort(Ceed ceed, const char *filename, int line_no, @ref Developer **/ -int CeedErrorExit(Ceed ceed, const char *filename, int line_no, - const char *func, int err_code, const char *format, va_list *args) { +int CeedErrorExit(Ceed ceed, const char *filename, int line_no, const char *func, int err_code, const char *format, va_list *args) { fprintf(stderr, "%s:%d in %s(): ", filename, line_no, func); // Using pointer to va_list for better FFI, but clang-tidy can't verify va_list is initalized - vfprintf(stderr, format, *args); // NOLINT + vfprintf(stderr, format, *args); // NOLINT fprintf(stderr, "\n"); exit(err_code); return err_code; @@ -1279,8 +1203,7 @@ int CeedErrorExit(Ceed ceed, const char *filename, int line_no, int CeedSetErrorHandler(Ceed ceed, CeedErrorHandler handler) { ceed->Error = 
handler; if (ceed->delegate) CeedSetErrorHandler(ceed->delegate, handler); - for (CeedInt i=0; iobj_delegate_count; i++) - CeedSetErrorHandler(ceed->obj_delegates[i].delegate, handler); + for (CeedInt i = 0; i < ceed->obj_delegate_count; i++) CeedSetErrorHandler(ceed->obj_delegates[i].delegate, handler); return CEED_ERROR_SUCCESS; } @@ -1296,10 +1219,8 @@ int CeedSetErrorHandler(Ceed ceed, CeedErrorHandler handler) { @ref Developer **/ int CeedGetErrorMessage(Ceed ceed, const char **err_msg) { - if (ceed->parent) - return CeedGetErrorMessage(ceed->parent, err_msg); - if (ceed->op_fallback_parent) - return CeedGetErrorMessage(ceed->op_fallback_parent, err_msg); + if (ceed->parent) return CeedGetErrorMessage(ceed->parent, err_msg); + if (ceed->op_fallback_parent) return CeedGetErrorMessage(ceed->op_fallback_parent, err_msg); *err_msg = ceed->err_msg; return CEED_ERROR_SUCCESS; } @@ -1316,10 +1237,8 @@ int CeedGetErrorMessage(Ceed ceed, const char **err_msg) { @ref Developer **/ int CeedResetErrorMessage(Ceed ceed, const char **err_msg) { - if (ceed->parent) - return CeedResetErrorMessage(ceed->parent, err_msg); - if (ceed->op_fallback_parent) - return CeedResetErrorMessage(ceed->op_fallback_parent, err_msg); + if (ceed->parent) return CeedResetErrorMessage(ceed->parent, err_msg); + if (ceed->op_fallback_parent) return CeedResetErrorMessage(ceed->op_fallback_parent, err_msg); *err_msg = NULL; memcpy(ceed->err_msg, "No error message stored", 24); return CEED_ERROR_SUCCESS; @@ -1355,5 +1274,4 @@ int CeedGetScalarType(CeedScalarType *scalar_type) { return 0; } - /// @} diff --git a/python/tests/libceed-qfunctions.c b/python/tests/libceed-qfunctions.c index 8b4b7018c1..0bf032a73a 100644 --- a/python/tests/libceed-qfunctions.c +++ b/python/tests/libceed-qfunctions.c @@ -10,8 +10,7 @@ // Redefine QFunction Macro // ----------------------------------------------------------------------------- #undef CEED_QFUNCTION -#define CEED_QFUNCTION(name) \ - extern int name +#define 
CEED_QFUNCTION(name) extern int name // ----------------------------------------------------------------------------- // QFunction Sources diff --git a/python/tests/test-qfunctions.h b/python/tests/test-qfunctions.h index 8abd6a0700..af78f6eebf 100644 --- a/python/tests/test-qfunctions.h +++ b/python/tests/test-qfunctions.h @@ -8,9 +8,7 @@ //------------------------------------------------------------------------------ // Setup 1D mass matrix //------------------------------------------------------------------------------ -CEED_QFUNCTION(setup_mass)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) { +CEED_QFUNCTION(setup_mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // in[0] is quadrature weights, size (Q) // in[1] is Jacobians, size (Q) const CeedScalar *w = in[0], *J = in[1]; @@ -19,10 +17,7 @@ CEED_QFUNCTION(setup_mass)(void *ctx, const CeedInt Q, CeedScalar *qdata = out[0]; // Quadrature point loop - CeedPragmaSIMD - for (CeedInt i=0; i int main(int argc, char **argv) { - Ceed ceed; + Ceed ceed; CeedInt type = -1; CeedInit(argv[1], &ceed); CeedGetPreferredMemType(ceed, (CeedMemType *)&type); - if (type == -1) - // LCOV_EXCL_START - printf("Error getting preferred memory type. %" CeedInt_FMT - "\n", type); - // LCOV_EXCL_STOP + if (type == -1) printf("Error getting preferred memory type. 
%" CeedInt_FMT "\n", type); CeedDestroy(&ceed); return 0; diff --git a/tests/t002-ceed.c b/tests/t002-ceed.c index ef9f5b1239..292bc64949 100644 --- a/tests/t002-ceed.c +++ b/tests/t002-ceed.c @@ -1,11 +1,11 @@ /// @file /// Test return of a CEED object full resource name /// \test Test return of a CEED object full resource name -#include #include +#include int main(int argc, char **argv) { - Ceed ceed; + Ceed ceed; const char *backend = argv[1]; const char *resource; @@ -13,21 +13,11 @@ int main(int argc, char **argv) { CeedGetResource(ceed, &resource); - const size_t resource_length = strlen(resource); - const bool is_exact_match = strcmp(resource, backend) == 0; - const bool is_match_with_query_arguments = - // LCOV_EXCL_START - !is_exact_match - && memcmp(resource, backend, resource_length) == 0 - && backend[resource_length] == ':'; - // LCOV_EXCL_STOP + const size_t resource_length = strlen(resource); + const bool is_exact_match = strcmp(resource, backend) == 0; + const bool is_match_with_query_arguments = !is_exact_match && memcmp(resource, backend, resource_length) == 0 && backend[resource_length] == ':'; - if (!is_exact_match && !is_match_with_query_arguments) { - // LCOV_EXCL_START - return CeedError(ceed, 1, "Incorrect full resource name: %s != %s\n", - resource, backend); - // LCOV_EXCL_STOP - } + if (!is_exact_match && !is_match_with_query_arguments) return CeedError(ceed, 1, "Incorrect full resource name: %s != %s\n", resource, backend); CeedDestroy(&ceed); return 0; diff --git a/tests/t003-ceed.c b/tests/t003-ceed.c index 602074f3a6..c9accdedf8 100644 --- a/tests/t003-ceed.c +++ b/tests/t003-ceed.c @@ -1,8 +1,8 @@ /// @file /// Test viewing of a CEED object full /// \test Test viewing of a CEED object -#include #include +#include int main(int argc, char **argv) { Ceed ceed; diff --git a/tests/t005-ceed.c b/tests/t005-ceed.c index 770492cb58..1431183ed6 100644 --- a/tests/t005-ceed.c +++ b/tests/t005-ceed.c @@ -12,16 +12,13 @@ int main(int argc, char 
**argv) { // Check for standard message with default handler const char *err_msg = NULL; CeedGetErrorMessage(ceed, &err_msg); - if (strcmp(err_msg, "No error message stored")) - // LCOV_EXCL_START - printf("Unexpected error message received: \"%s\"\n", err_msg); - // LCOV_EXCL_STOP + if (strcmp(err_msg, "No error message stored")) printf("Unexpected error message received: \"%s\"\n", err_msg); // Set error handler to store error message CeedSetErrorHandler(ceed, CeedErrorStore); // Generate error - CeedVector vec; + CeedVector vec; CeedScalar *array; CeedVectorCreate(ceed, 10, &vec); CeedVectorGetArray(vec, CEED_MEM_HOST, &array); @@ -29,18 +26,12 @@ int main(int argc, char **argv) { // Check error message CeedGetErrorMessage(ceed, &err_msg); - if (!err_msg || !strcmp(err_msg, "No error message stored\n")) - // LCOV_EXCL_START - printf("Unexpected error message received: \"%s\"\n", err_msg); - // LCOV_EXCL_STOP + if (!err_msg || !strcmp(err_msg, "No error message stored\n")) printf("Unexpected error message received: \"%s\"\n", err_msg); CeedResetErrorMessage(ceed, &err_msg); // Check error message reset CeedGetErrorMessage(ceed, &err_msg); - if (strcmp(err_msg, "No error message stored")) - // LCOV_EXCL_START - printf("Unexpected error message received: \"%s\"\n", err_msg); - // LCOV_EXCL_STOP + if (strcmp(err_msg, "No error message stored")) printf("Unexpected error message received: \"%s\"\n", err_msg); // Cleanup CeedVectorRestoreArray(vec, &array); diff --git a/tests/t007-ceed.c b/tests/t007-ceed.c index 45b854d579..6094e7811e 100644 --- a/tests/t007-ceed.c +++ b/tests/t007-ceed.c @@ -7,7 +7,8 @@ int main(int argc, char **argv) { Ceed ceed; size_t end_index; - for (end_index = 0; argv[1][end_index]; end_index++) {} + for (end_index = 0; argv[1][end_index]; end_index++) { + } argv[1][end_index - 1] -= 1; CeedInit(argv[1], &ceed); diff --git a/tests/t009-ceed.c b/tests/t009-ceed.c index 6547087a3f..350b48018b 100644 --- a/tests/t009-ceed.c +++ 
b/tests/t009-ceed.c @@ -9,17 +9,14 @@ int main(int argc, char **argv) { CeedInit(argv[1], &ceed); CeedInit("/cpu/self/ref/serial", &ceed_2); - CeedReferenceCopy(ceed, &ceed_2); // This destroys the previous ceed_2 - if (ceed != ceed_2) - // LCOV_EXCL_START - printf("Error copying Ceed reference\n"); - // LCOV_EXCL_STOP + CeedReferenceCopy(ceed, &ceed_2); // This destroys the previous ceed_2 + if (ceed != ceed_2) printf("Error copying Ceed reference\n"); CeedDestroy(&ceed); CeedMemType type; - CeedGetPreferredMemType(ceed_2, &type); // Second reference still valid + CeedGetPreferredMemType(ceed_2, &type); // Second reference still valid - CeedDestroy(&ceed_2); // Both references should be destroyed + CeedDestroy(&ceed_2); // Both references should be destroyed return 0; } diff --git a/tests/t100-vector.c b/tests/t100-vector.c index 79d8c66996..531d3655ea 100644 --- a/tests/t100-vector.c +++ b/tests/t100-vector.c @@ -4,27 +4,23 @@ #include int main(int argc, char **argv) { - Ceed ceed; - CeedVector x; - CeedInt n; - CeedScalar a[10]; + Ceed ceed; + CeedVector x; + CeedInt n; + CeedScalar a[10]; const CeedScalar *b; CeedInit(argv[1], &ceed); n = 10; CeedVectorCreate(ceed, n, &x); - for (CeedInt i=0; i + static int CheckValues(Ceed ceed, CeedVector x, CeedScalar value) { const CeedScalar *b; - CeedSize n; + CeedSize n; CeedVectorGetLength(x, &n); CeedVectorGetArrayRead(x, CEED_MEM_HOST, &b); - for (CeedInt i=0; i int main(int argc, char **argv) { - Ceed ceed; - CeedVector x; - CeedInt n; + Ceed ceed; + CeedVector x; + CeedInt n; const CeedScalar *a, *b; CeedInit(argv[1], &ceed); diff --git a/tests/t103-vector.c b/tests/t103-vector.c index 2ee3dd0ee6..f501c4819d 100644 --- a/tests/t103-vector.c +++ b/tests/t103-vector.c @@ -4,11 +4,11 @@ #include int main(int argc, char **argv) { - Ceed ceed; - CeedVector X, Y; - CeedInt n; - CeedScalar a[10]; - CeedScalar *x; + Ceed ceed; + CeedVector X, Y; + CeedInt n; + CeedScalar a[10]; + CeedScalar *x; const CeedScalar *y; 
CeedInit(argv[1], &ceed); @@ -17,8 +17,7 @@ int main(int argc, char **argv) { CeedVectorCreate(ceed, n, &X); CeedVectorCreate(ceed, n, &Y); - for (CeedInt i=0; i int main(int argc, char **argv) { - Ceed ceed; - CeedVector x; + Ceed ceed; + CeedVector x; const CeedInt n = 10; - CeedScalar a[n]; - CeedScalar *b; + CeedScalar a[n]; + CeedScalar *b; CeedInit(argv[1], &ceed); CeedVectorCreate(ceed, n, &x); - for (CeedInt i=0; i int main(int argc, char **argv) { - Ceed ceed; - CeedVector x; - CeedVector y; - CeedInt n; - CeedScalar a[10]; + Ceed ceed; + CeedVector x; + CeedVector y; + CeedInt n; + CeedScalar a[10]; const CeedScalar *b, *c; CeedInit(argv[1], &ceed); @@ -16,8 +16,7 @@ int main(int argc, char **argv) { n = 10; CeedVectorCreate(ceed, n, &x); CeedVectorCreate(ceed, n, &y); - for (CeedInt i=0; i int main(int argc, char **argv) { - Ceed ceed; - CeedVector x; - CeedVector y; - CeedInt n; - CeedScalar a[10], b[10]; + Ceed ceed; + CeedVector x; + CeedVector y; + CeedInt n; + CeedScalar a[10], b[10]; const CeedScalar *c; CeedInit(argv[1], &ceed); @@ -16,12 +16,10 @@ int main(int argc, char **argv) { n = 10; CeedVectorCreate(ceed, n, &x); CeedVectorCreate(ceed, n, &y); - for (CeedInt i=0; i int main(int argc, char **argv) { - Ceed ceed; + Ceed ceed; CeedVector x; - CeedInt n; + CeedInt n; CeedScalar a[10]; CeedInit(argv[1], &ceed); n = 10; CeedVectorCreate(ceed, n, &x); - for (CeedInt i=0; i int main(int argc, char **argv) { - Ceed ceed; + Ceed ceed; CeedVector x; - CeedInt n = 10; + CeedInt n = 10; CeedScalar a[10]; CeedInit(argv[1], &ceed); CeedVectorCreate(ceed, n, &x); - for (CeedInt i=0; i 100.*CEED_EPSILON) - // LCOV_EXCL_START - printf("Error: L1 norm %f != 45.\n", norm); - // LCOV_EXCL_STOP + if (fabs(norm - 45.) > 100. 
* CEED_EPSILON) printf("Error: L1 norm %f != 45.\n", norm); CeedVectorNorm(x, CEED_NORM_2, &norm); - if (fabs(norm - sqrt(285.)) > 100.*CEED_EPSILON) - // LCOV_EXCL_START - printf("Error: L2 norm %f != sqrt(285.)\n", norm); - // LCOV_EXCL_STOP + if (fabs(norm - sqrt(285.)) > 100. * CEED_EPSILON) printf("Error: L2 norm %f != sqrt(285.)\n", norm); CeedVectorNorm(x, CEED_NORM_MAX, &norm); - if (fabs(norm - 9.) > 100.*CEED_EPSILON) - // LCOV_EXCL_START - printf("Error: Max norm %f != 9.\n", norm); - // LCOV_EXCL_STOP + if (fabs(norm - 9.) > 100. * CEED_EPSILON) printf("Error: Max norm %f != 9.\n", norm); CeedVectorDestroy(&x); CeedDestroy(&ceed); diff --git a/tests/t109-vector.c b/tests/t109-vector.c index 0a7adef0bb..14cf3c8f98 100644 --- a/tests/t109-vector.c +++ b/tests/t109-vector.c @@ -5,39 +5,32 @@ #include int main(int argc, char **argv) { - Ceed ceed; - CeedVector x; - const CeedInt n = 10; - CeedScalar a[n]; - CeedScalar *b, *c; + Ceed ceed; + CeedVector x; + const CeedInt n = 10; + CeedScalar a[n]; + CeedScalar *b, *c; const CeedScalar *d; CeedInit(argv[1], &ceed); CeedVectorCreate(ceed, n, &x); - for (CeedInt i=0; i 10.*CEED_EPSILON) - // LCOV_EXCL_START - printf("Error taking array c[3] = %f\n", (CeedScalar)c[3]); - // LCOV_EXCL_STOP + if (fabs(c[3] + 3.14) > 10. * CEED_EPSILON) printf("Error taking array c[3] = %f\n", (CeedScalar)c[3]); // Getting array should not modify a CeedVectorGetArrayWrite(x, CEED_MEM_HOST, &b); b[5] = -3.14; CeedVectorRestoreArray(x, &b); - if (fabs(a[5] + 3.14) < 10.*CEED_EPSILON) - // LCOV_EXCL_START - printf("Error protecting array a[3] = %f\n", (CeedScalar)a[3]); - // LCOV_EXCL_STOP + if (fabs(a[5] + 3.14) < 10. * CEED_EPSILON) printf("Error protecting array a[3] = %f\n", (CeedScalar)a[3]); -// Note: We do not need to free c because c == a was stack allocated. + // Note: We do not need to free c because c == a was stack allocated. 
CeedVectorDestroy(&x); // Test with a size zero vector diff --git a/tests/t110-vector.c b/tests/t110-vector.c index afa06f2744..11cf5ea403 100644 --- a/tests/t110-vector.c +++ b/tests/t110-vector.c @@ -4,9 +4,9 @@ #include int main(int argc, char **argv) { - Ceed ceed; - CeedVector x; - CeedInt n; + Ceed ceed; + CeedVector x; + CeedInt n; CeedScalar *a, *b; CeedInit(argv[1], &ceed); diff --git a/tests/t111-vector.c b/tests/t111-vector.c index f92fe7a4c0..2f931e4b9d 100644 --- a/tests/t111-vector.c +++ b/tests/t111-vector.c @@ -4,9 +4,9 @@ #include int main(int argc, char **argv) { - Ceed ceed; - CeedVector x; - CeedInt n; + Ceed ceed; + CeedVector x; + CeedInt n; CeedScalar *a, b[10]; CeedInit(argv[1], &ceed); diff --git a/tests/t112-vector.c b/tests/t112-vector.c index 361fd3083a..48ef55a2c2 100644 --- a/tests/t112-vector.c +++ b/tests/t112-vector.c @@ -4,9 +4,9 @@ #include int main(int argc, char **argv) { - Ceed ceed; - CeedVector x; - CeedInt n; + Ceed ceed; + CeedVector x; + CeedInt n; CeedScalar *a; CeedInit(argv[1], &ceed); diff --git a/tests/t113-vector.c b/tests/t113-vector.c index 399bc561c8..6b6e5a9844 100644 --- a/tests/t113-vector.c +++ b/tests/t113-vector.c @@ -4,11 +4,11 @@ #include int main(int argc, char **argv) { - Ceed ceed; - CeedVector x; - CeedInt n; + Ceed ceed; + CeedVector x; + CeedInt n; const CeedScalar *a; - CeedScalar *b; + CeedScalar *b; CeedInit(argv[1], &ceed); diff --git a/tests/t114-vector.c b/tests/t114-vector.c index 42cfee7b5a..e10ce6e4b3 100644 --- a/tests/t114-vector.c +++ b/tests/t114-vector.c @@ -4,30 +4,25 @@ #include int main(int argc, char **argv) { - Ceed ceed; - CeedVector x; - CeedInt n = 10; - CeedScalar a[10]; + Ceed ceed; + CeedVector x; + CeedInt n = 10; + CeedScalar a[10]; const CeedScalar *b; CeedInit(argv[1], &ceed); CeedVectorCreate(ceed, n, &x); - for (CeedInt i=0; i int main(int argc, char **argv) { - Ceed ceed; - CeedVector x; - CeedInt n; - CeedScalar *a; + Ceed ceed; + CeedVector x; + CeedInt n; + 
CeedScalar *a; const CeedScalar *b; CeedInit(argv[1], &ceed); diff --git a/tests/t116-vector.c b/tests/t116-vector.c index 8a7f39aa83..58a30d9250 100644 --- a/tests/t116-vector.c +++ b/tests/t116-vector.c @@ -4,9 +4,9 @@ #include int main(int argc, char **argv) { - Ceed ceed; - CeedVector x; - CeedInt n; + Ceed ceed; + CeedVector x; + CeedInt n; CeedScalar *a; CeedInit(argv[1], &ceed); diff --git a/tests/t117-vector.c b/tests/t117-vector.c index 3fc7ca8939..af761d36f0 100644 --- a/tests/t117-vector.c +++ b/tests/t117-vector.c @@ -4,9 +4,9 @@ #include int main(int argc, char **argv) { - Ceed ceed; - CeedVector x; - CeedInt n = 10; + Ceed ceed; + CeedVector x; + CeedInt n = 10; CeedScalar *a; CeedInit(argv[1], &ceed); diff --git a/tests/t118-vector.c b/tests/t118-vector.c index 275dca58ed..12f4d4313f 100644 --- a/tests/t118-vector.c +++ b/tests/t118-vector.c @@ -4,9 +4,9 @@ #include int main(int argc, char **argv) { - Ceed ceed; - CeedVector x; - CeedInt n; + Ceed ceed; + CeedVector x; + CeedInt n; CeedScalar *a; CeedInit(argv[1], &ceed); diff --git a/tests/t119-vector.c b/tests/t119-vector.c index fba3cb9d6a..372e91c13d 100644 --- a/tests/t119-vector.c +++ b/tests/t119-vector.c @@ -1,22 +1,21 @@ /// @file /// Test taking the reciprocal of a vector /// \test Test taking the reciprocal of a vector -#include #include +#include int main(int argc, char **argv) { - Ceed ceed; - CeedVector x; - CeedInt n; - CeedScalar a[10]; + Ceed ceed; + CeedVector x; + CeedInt n; + CeedScalar a[10]; const CeedScalar *b; CeedInit(argv[1], &ceed); n = 10; CeedVectorCreate(ceed, n, &x); - for (CeedInt i=0; i 10.*CEED_EPSILON) - // LCOV_EXCL_START - printf("Error reading array b[%" CeedInt_FMT - "] = %f\n",i,(CeedScalar)b[i]); - // LCOV_EXCL_STOP + for (CeedInt i = 0; i < n; i++) { + if (fabs(b[i] - 1. / (10 + i)) > 10. 
* CEED_EPSILON) printf("Error reading array b[%" CeedInt_FMT "] = %f\n", i, (CeedScalar)b[i]); + } CeedVectorRestoreArrayRead(x, &b); CeedVectorDestroy(&x); diff --git a/tests/t120-vector.c b/tests/t120-vector.c index 4c34cce11e..02075797a1 100644 --- a/tests/t120-vector.c +++ b/tests/t120-vector.c @@ -4,25 +4,22 @@ #include int main(int argc, char **argv) { - Ceed ceed; + Ceed ceed; CeedVector x, x_2; - CeedInt n; + CeedInt n; CeedInit(argv[1], &ceed); n = 10; CeedVectorCreate(ceed, n, &x); - CeedVectorCreate(ceed, n+1, &x_2); + CeedVectorCreate(ceed, n + 1, &x_2); - CeedVectorReferenceCopy(x, &x_2); // This destroys the previous x_2 + CeedVectorReferenceCopy(x, &x_2); // This destroys the previous x_2 CeedVectorDestroy(&x); CeedSize len; - CeedVectorGetLength(x_2, &len); // Second reference still valid - if (len != n) - // LCOV_EXCL_START - printf("Error copying CeedVector reference\n"); - // LCOV_EXCL_STOP + CeedVectorGetLength(x_2, &len); // Second reference still valid + if (len != n) printf("Error copying CeedVector reference\n"); CeedVectorDestroy(&x_2); CeedDestroy(&ceed); diff --git a/tests/t121-vector.c b/tests/t121-vector.c index adc64155e9..d81aec5af9 100644 --- a/tests/t121-vector.c +++ b/tests/t121-vector.c @@ -5,10 +5,10 @@ #include int main(int argc, char **argv) { - Ceed ceed; - CeedVector x, y; - CeedInt n; - CeedScalar a[10]; + Ceed ceed; + CeedVector x, y; + CeedInt n; + CeedScalar a[10]; const CeedScalar *b; CeedInit(argv[1], &ceed); @@ -16,8 +16,7 @@ int main(int argc, char **argv) { n = 10; CeedVectorCreate(ceed, n, &x); CeedVectorCreate(ceed, n, &y); - for (CeedInt i=0; i 1e-14) + for (CeedInt i = 0; i < n; i++) { + if (fabs(b[i] - (10.0 + i) / 2) > 1e-14) { // LCOV_EXCL_START - printf("Error in alpha x + y at index %" CeedInt_FMT - ", computed: %f actual: %f\n", i, b[i], (10.0 + i)/2); - // LCOV_EXCL_STOP + printf("Error in alpha x + y at index %" CeedInt_FMT ", computed: %f actual: %f\n", i, b[i], (10.0 + i) / 2); + // LCOV_EXCL_STOP + } + 
} CeedVectorRestoreArrayRead(y, &b); CeedVectorDestroy(&x); diff --git a/tests/t122-vector.c b/tests/t122-vector.c index 168846ee6e..9a3f3d878e 100644 --- a/tests/t122-vector.c +++ b/tests/t122-vector.c @@ -5,10 +5,10 @@ #include int main(int argc, char **argv) { - Ceed ceed; - CeedVector x, y, w; - CeedInt n; - CeedScalar a[10]; + Ceed ceed; + CeedVector x, y, w; + CeedInt n; + CeedScalar a[10]; const CeedScalar *b; CeedInit(argv[1], &ceed); @@ -17,42 +17,44 @@ int main(int argc, char **argv) { CeedVectorCreate(ceed, n, &x); CeedVectorCreate(ceed, n, &y); CeedVectorCreate(ceed, n, &w); - for (CeedInt i=0; i 1e-14) + for (CeedInt i = 0; i < n; i++) { + if (fabs(b[i] - i * i) > 1e-14) { // LCOV_EXCL_START - printf("Error in w = x .* y at index %" CeedInt_FMT - ", computed: %f actual: %f\n", i, b[i], 1.0*i*i); - // LCOV_EXCL_STOP + printf("Error in w = x .* y at index %" CeedInt_FMT ", computed: %f actual: %f\n", i, b[i], 1.0 * i * i); + // LCOV_EXCL_STOP + } + } CeedVectorRestoreArrayRead(w, &b); // Test multiplying two vectors into one of the two CeedVectorPointwiseMult(w, w, y); CeedVectorGetArrayRead(w, CEED_MEM_HOST, &b); - for (CeedInt i=0; i 1e-14) + for (CeedInt i = 0; i < n; i++) { + if (fabs(b[i] - i * i * i) > 1e-14) { // LCOV_EXCL_START - printf("Error in w = w .* y at index %" CeedInt_FMT - ", computed: %f actual: %f\n", i, b[i], 1.0*i*i*i); - // LCOV_EXCL_STOP + printf("Error in w = w .* y at index %" CeedInt_FMT ", computed: %f actual: %f\n", i, b[i], 1.0 * i * i * i); + // LCOV_EXCL_STOP + } + } CeedVectorRestoreArrayRead(w, &b); // Test multiplying two vectors into one of the two CeedVectorPointwiseMult(w, x, w); CeedVectorGetArrayRead(w, CEED_MEM_HOST, &b); - for (CeedInt i=0; i 1e-14) + for (CeedInt i = 0; i < n; i++) { + if (fabs(b[i] - i * i * i * i) > 1e-14) { // LCOV_EXCL_START - printf("Error in w = x .* w at index %" CeedInt_FMT - ", computed: %f actual: %f\n", i, b[i], 1.0*i*i*i*i); - // LCOV_EXCL_STOP + printf("Error in w = x .* w at index 
%" CeedInt_FMT ", computed: %f actual: %f\n", i, b[i], 1.0 * i * i * i * i); + // LCOV_EXCL_STOP + } + } CeedVectorRestoreArrayRead(w, &b); // Test multiplying vector by itself and putting product into self @@ -64,12 +66,13 @@ int main(int argc, char **argv) { } CeedVectorPointwiseMult(y, y, y); CeedVectorGetArrayRead(y, CEED_MEM_HOST, &b); - for (CeedInt i=0; i 1e-14) + for (CeedInt i = 0; i < n; i++) { + if (fabs(b[i] - i * i) > 1e-14) { // LCOV_EXCL_START - printf("Error in y = y .* y at index %" CeedInt_FMT - ", computed: %f actual: %f\n", i, b[i], 1.0*i*i); - // LCOV_EXCL_STOP + printf("Error in y = y .* y at index %" CeedInt_FMT ", computed: %f actual: %f\n", i, b[i], 1.0 * i * i); + // LCOV_EXCL_STOP + } + } CeedVectorRestoreArrayRead(y, &b); CeedVectorDestroy(&x); diff --git a/tests/t123-vector.c b/tests/t123-vector.c index 386c921c61..92fab12cad 100644 --- a/tests/t123-vector.c +++ b/tests/t123-vector.c @@ -5,18 +5,17 @@ #include int main(int argc, char **argv) { - Ceed ceed; - CeedVector x; - CeedInt n; - CeedScalar a[10]; + Ceed ceed; + CeedVector x; + CeedInt n; + CeedScalar a[10]; const CeedScalar *b; CeedInit(argv[1], &ceed); n = 10; CeedVectorCreate(ceed, n, &x); - for (CeedInt i=0; i 1e-14) + for (CeedInt i = 0; i < n; i++) { + if (fabs(b[i] + (10.0 + i) / 2) > 1e-14) { // LCOV_EXCL_START - printf("Error in alpha x at index %" CeedInt_FMT - ", computed: %f actual: %f\n", i, b[i], -(10.0 + i)/2); - // LCOV_EXCL_STOP + printf("Error in alpha x at index %" CeedInt_FMT ", computed: %f actual: %f\n", i, b[i], -(10.0 + i) / 2); + // LCOV_EXCL_STOP + } + } CeedVectorRestoreArrayRead(x, &b); CeedVectorDestroy(&x); diff --git a/tests/t124-vector.c b/tests/t124-vector.c index ad56135d7f..aaebef523e 100644 --- a/tests/t124-vector.c +++ b/tests/t124-vector.c @@ -4,26 +4,23 @@ #include int main(int argc, char **argv) { - Ceed ceed; - CeedVector x; + Ceed ceed; + CeedVector x; const CeedInt n = 10; - CeedScalar *a; + CeedScalar *a; CeedInit(argv[1], &ceed); 
CeedVectorCreate(ceed, n, &x); CeedVectorGetArrayWrite(x, CEED_MEM_HOST, &a); - for (CeedInt i = 0; i < n; i++) - a[i] = 3*i; + for (CeedInt i = 0; i < n; i++) a[i] = 3 * i; CeedVectorRestoreArray(x, &a); CeedVectorGetArrayRead(x, CEED_MEM_HOST, (const CeedScalar **)&a); - for (CeedInt i = 0; i < n; i++) - if (a[i] != (CeedScalar)(3*i)) - // LCOV_EXCL_START - printf("Error writing array a[%" CeedInt_FMT "] = %f\n", i, a[i]); - // LCOV_EXCL_STOP + for (CeedInt i = 0; i < n; i++) { + if (a[i] != (CeedScalar)(3 * i)) printf("Error writing array a[%" CeedInt_FMT "] = %f\n", i, a[i]); + } CeedVectorRestoreArrayRead(x, (const CeedScalar **)&a); CeedVectorDestroy(&x); diff --git a/tests/t200-elemrestriction.c b/tests/t200-elemrestriction.c index 9690bd6370..25849732ea 100644 --- a/tests/t200-elemrestriction.c +++ b/tests/t200-elemrestriction.c @@ -4,37 +4,32 @@ #include int main(int argc, char **argv) { - Ceed ceed; - CeedVector x, y; - CeedInt num_elem = 3; - CeedInt ind[2*num_elem]; - CeedScalar a[num_elem+1]; - const CeedScalar *yy; + Ceed ceed; + CeedVector x, y; + CeedInt num_elem = 3; + CeedInt ind[2 * num_elem]; + CeedScalar a[num_elem + 1]; + const CeedScalar *yy; CeedElemRestriction r; CeedInit(argv[1], &ceed); - CeedVectorCreate(ceed, num_elem+1, &x); - for (CeedInt i=0; i int main(int argc, char **argv) { - Ceed ceed; - CeedVector x, y; - CeedInt num_elem = 3; - CeedScalar a[num_elem*2]; - const CeedScalar *yy; - CeedInt strides[3] = {1, 2, 2}; - CeedInt layout[3]; + Ceed ceed; + CeedVector x, y; + CeedInt num_elem = 3; + CeedScalar a[num_elem * 2]; + const CeedScalar *yy; + CeedInt strides[3] = {1, 2, 2}; + CeedInt layout[3]; CeedElemRestriction r; CeedInit(argv[1], &ceed); - CeedVectorCreate(ceed, num_elem*2, &x); - for (CeedInt i=0; i int main(int argc, char **argv) { - Ceed ceed; - CeedVector x, y; - CeedInt num_elem = 8; - CeedInt elem_size = 2; - CeedInt num_blk = 2; - CeedInt blk_size = 5; - CeedInt ind[elem_size*num_elem]; - CeedScalar a[num_elem + 1]; 
- const CeedScalar *xx, *yy; - CeedInt layout[3]; + Ceed ceed; + CeedVector x, y; + CeedInt num_elem = 8; + CeedInt elem_size = 2; + CeedInt num_blk = 2; + CeedInt blk_size = 5; + CeedInt ind[elem_size * num_elem]; + CeedScalar a[num_elem + 1]; + const CeedScalar *xx, *yy; + CeedInt layout[3]; CeedElemRestriction r; CeedInit(argv[1], &ceed); - CeedVectorCreate(ceed, num_elem+1, &x); - for (CeedInt i=0; i 0 && i < num_elem ? 2.0 : 1.0)) - // LCOV_EXCL_START - printf("Error in restricted array x[%" CeedInt_FMT "] = %f\n", - i, (double)xx[i]); - // LCOV_EXCL_STOP + for (CeedInt i = 0; i < num_elem + 1; i++) { + if (xx[i] != (10 + i) * (i > 0 && i < num_elem ? 2.0 : 1.0)) printf("Error in restricted array x[%" CeedInt_FMT "] = %f\n", i, (double)xx[i]); + } CeedVectorRestoreArrayRead(x, &xx); CeedVectorDestroy(&x); diff --git a/tests/t203-elemrestriction.c b/tests/t203-elemrestriction.c index aa4d6153cf..43fa9e63ea 100644 --- a/tests/t203-elemrestriction.c +++ b/tests/t203-elemrestriction.c @@ -5,70 +5,68 @@ #include int main(int argc, char **argv) { - Ceed ceed; - CeedVector x, y; - CeedInt num_elem = 8; - CeedInt elem_size = 2; - CeedInt num_blk = 2; - CeedInt blk_size = 5; - CeedInt num_comp = 3; - CeedInt ind[elem_size*num_elem]; - CeedScalar a[num_comp*(num_elem + 1)]; - const CeedScalar *xx, *yy; - CeedInt layout[3]; + Ceed ceed; + CeedVector x, y; + CeedInt num_elem = 8; + CeedInt elem_size = 2; + CeedInt num_blk = 2; + CeedInt blk_size = 5; + CeedInt num_comp = 3; + CeedInt ind[elem_size * num_elem]; + CeedScalar a[num_comp * (num_elem + 1)]; + const CeedScalar *xx, *yy; + CeedInt layout[3]; CeedElemRestriction r; CeedInit(argv[1], &ceed); - CeedVectorCreate(ceed, num_comp*(num_elem+1), &x); - for (CeedInt i=0; i 0 && i < num_elem ? 2.0 : 1.0)) + for (CeedInt i = 0; i < num_elem + 1; i++) { + for (CeedInt j = 0; j < num_comp; j++) { + if (xx[i + j * (num_elem + 1)] != ((j + 1) * 10 + i) * (i > 0 && i < num_elem ? 
2.0 : 1.0)) { // LCOV_EXCL_START - printf("Error in restricted array x[%" CeedInt_FMT - "][%" CeedInt_FMT "] = %f\n", - j, i, (double)xx[i+j*(num_elem+1)]); - // LCOV_EXCL_STOP + printf("Error in restricted array x[%" CeedInt_FMT "][%" CeedInt_FMT "] = %f\n", j, i, (double)xx[i + j * (num_elem + 1)]); + // LCOV_EXCL_STOP + } } } CeedVectorRestoreArrayRead(x, &xx); diff --git a/tests/t204-elemrestriction.c b/tests/t204-elemrestriction.c index c22eaffde4..b929dd4472 100644 --- a/tests/t204-elemrestriction.c +++ b/tests/t204-elemrestriction.c @@ -5,33 +5,31 @@ #include int main(int argc, char **argv) { - Ceed ceed; - CeedVector x, y; - CeedInt num_elem = 3; - CeedInt ind[2*num_elem]; - CeedInt layout[3]; - CeedScalar a[2*(num_elem+1)]; - const CeedScalar *yy; + Ceed ceed; + CeedVector x, y; + CeedInt num_elem = 3; + CeedInt ind[2 * num_elem]; + CeedInt layout[3]; + CeedScalar a[2 * (num_elem + 1)]; + const CeedScalar *yy; CeedElemRestriction r; CeedInit(argv[1], &ceed); // Setup - CeedVectorCreate(ceed, 2*(num_elem+1), &x); - for (CeedInt i=0; i int main(int argc, char **argv) { - Ceed ceed; - CeedVector x, y; - CeedInt num_elem = 3; - CeedInt ind[2*num_elem]; - CeedInt layout[3]; - CeedScalar a[2*(num_elem+1)]; - const CeedScalar *yy; + Ceed ceed; + CeedVector x, y; + CeedInt num_elem = 3; + CeedInt ind[2 * num_elem]; + CeedInt layout[3]; + CeedScalar a[2 * (num_elem + 1)]; + const CeedScalar *yy; CeedElemRestriction r; CeedInit(argv[1], &ceed); // Setup - CeedVectorCreate(ceed, 2*(num_elem+1), &x); - for (CeedInt i=0; i int main(int argc, char **argv) { - Ceed ceed; - CeedVector x, y; - CeedInt num_elem = 5; - CeedInt ind[2*num_elem]; - CeedInt layout[3]; - CeedScalar mult; - CeedScalar a[2*(num_elem*2)]; - const CeedScalar *yy; + Ceed ceed; + CeedVector x, y; + CeedInt num_elem = 5; + CeedInt ind[2 * num_elem]; + CeedInt layout[3]; + CeedScalar mult; + CeedScalar a[2 * (num_elem * 2)]; + const CeedScalar *yy; CeedElemRestriction r; CeedInit(argv[1], &ceed); // 
Setup - CeedVectorCreate(ceed, 2*(num_elem*2), &x); + CeedVectorCreate(ceed, 2 * (num_elem * 2), &x); - for (CeedInt i=0; i0&&i 0 && i < num_elem ? 2 : 1; + if (yy[i] != i * mult) printf("Error in restricted array y[%" CeedInt_FMT "] = %f != %f\n", i, (CeedScalar)yy[i], i * mult); + if (yy[i + num_elem + 1] != (10 + i) * mult) { // LCOV_EXCL_START - printf("Error in restricted array y[%" CeedInt_FMT "] = %f != %f\n", - i+num_elem+1, (CeedScalar)yy[i+num_elem+1], (10.+i)*mult); - // LCOV_EXCL_STOP + printf("Error in restricted array y[%" CeedInt_FMT "] = %f != %f\n", i + num_elem + 1, (CeedScalar)yy[i + num_elem + 1], (10. + i) * mult); + // LCOV_EXCL_STOP + } } CeedVectorRestoreArrayRead(y, &yy); diff --git a/tests/t207-elemrestriction.c b/tests/t207-elemrestriction.c index 212ba08a58..5a2a14377b 100644 --- a/tests/t207-elemrestriction.c +++ b/tests/t207-elemrestriction.c @@ -5,37 +5,38 @@ #include int main(int argc, char **argv) { - Ceed ceed; - CeedVector x, y; - CeedInt num_elem = 5; - CeedInt ind[2*num_elem]; - CeedInt layout[3]; - CeedScalar mult; - CeedScalar a[2*(num_elem*2)]; - const CeedScalar *yy; + Ceed ceed; + CeedVector x, y; + CeedInt num_elem = 5; + CeedInt ind[2 * num_elem]; + CeedInt layout[3]; + CeedScalar mult; + CeedScalar a[2 * (num_elem * 2)]; + const CeedScalar *yy; CeedElemRestriction r; CeedInit(argv[1], &ceed); // Setup - CeedVectorCreate(ceed, 2*(num_elem*2), &x); + CeedVectorCreate(ceed, 2 * (num_elem * 2), &x); - for (CeedInt i=0; i0&&i 0 && i < num_elem ? 
2 : 1; + if (yy[2 * i] != i * mult) { // LCOV_EXCL_START - printf("Error in restricted array y[%" CeedInt_FMT "] = %f != %f\n", - 2*i, (CeedScalar)yy[2*i], i*mult); - // LCOV_EXCL_STOP - if (yy[2*i+1] != (10+i)*mult) + printf("Error in restricted array y[%" CeedInt_FMT "] = %f != %f\n", 2 * i, (CeedScalar)yy[2 * i], i * mult); + // LCOV_EXCL_STOP + } + if (yy[2 * i + 1] != (10 + i) * mult) { // LCOV_EXCL_START - printf("Error in restricted array y[%" CeedInt_FMT "] = %f != %f\n", - 2*i+1, (CeedScalar)yy[2*i+1], (10.+i)*mult); - // LCOV_EXCL_STOP + printf("Error in restricted array y[%" CeedInt_FMT "] = %f != %f\n", 2 * i + 1, (CeedScalar)yy[2 * i + 1], (10. + i) * mult); + // LCOV_EXCL_STOP + } } CeedVectorRestoreArrayRead(y, &yy); diff --git a/tests/t208-elemrestriction.c b/tests/t208-elemrestriction.c index 90b175c6e2..8d62f426da 100644 --- a/tests/t208-elemrestriction.c +++ b/tests/t208-elemrestriction.c @@ -5,65 +5,61 @@ #include int main(int argc, char **argv) { - Ceed ceed; - CeedVector x, y; - CeedInt num_elem = 8; - CeedInt elem_size = 2; - CeedInt blk_size = 5; - CeedInt ind[elem_size*num_elem]; - CeedScalar a[num_elem + 1]; - const CeedScalar *xx, *yy; - CeedInt layout[3]; + Ceed ceed; + CeedVector x, y; + CeedInt num_elem = 8; + CeedInt elem_size = 2; + CeedInt blk_size = 5; + CeedInt ind[elem_size * num_elem]; + CeedScalar a[num_elem + 1]; + const CeedScalar *xx, *yy; + CeedInt layout[3]; CeedElemRestriction r; CeedInit(argv[1], &ceed); - CeedVectorCreate(ceed, num_elem+1, &x); - for (CeedInt i=0; i blk_size && i < num_elem ? 2.0 : 1.0)) + for (CeedInt i = blk_size; i < num_elem + 1; i++) { + if (xx[i] != (10 + i) * (i > blk_size && i < num_elem ? 
2.0 : 1.0)) { // LCOV_EXCL_START - printf("Error in restricted array x[%" CeedInt_FMT "] = %f\n", - i, (double)xx[i]); - // LCOV_EXCL_STOP + printf("Error in restricted array x[%" CeedInt_FMT "] = %f\n", i, (double)xx[i]); + // LCOV_EXCL_STOP + } + } CeedVectorRestoreArrayRead(x, &xx); CeedVectorDestroy(&x); diff --git a/tests/t209-elemrestriction.c b/tests/t209-elemrestriction.c index 98dd980803..a53f14b073 100644 --- a/tests/t209-elemrestriction.c +++ b/tests/t209-elemrestriction.c @@ -4,36 +4,36 @@ #include int main(int argc, char **argv) { - Ceed ceed; - CeedVector mult; - CeedInt num_elem = 3; - CeedInt ind[4*num_elem]; - const CeedScalar *mm; + Ceed ceed; + CeedVector mult; + CeedInt num_elem = 3; + CeedInt ind[4 * num_elem]; + const CeedScalar *mm; CeedElemRestriction r; CeedInit(argv[1], &ceed); - CeedVectorCreate(ceed, 3*num_elem+1, &mult); - CeedVectorSetValue(mult, 0); // Allocates array + CeedVectorCreate(ceed, 3 * num_elem + 1, &mult); + CeedVectorSetValue(mult, 0); // Allocates array - for (CeedInt i=0; i 0 && i < 3*num_elem && (i%3==0) ? 1 : 0)) != mm[i]) + for (CeedInt i = 0; i < 3 * num_elem + 1; i++) { + if ((1 + (i > 0 && i < 3 * num_elem && (i % 3 == 0) ? 
1 : 0)) != mm[i]) { // LCOV_EXCL_START - printf("Error in multiplicity vector: mult[%" CeedInt_FMT - "] = %f\n", i, (CeedScalar)mm[i]); - // LCOV_EXCL_STOP + printf("Error in multiplicity vector: mult[%" CeedInt_FMT "] = %f\n", i, (CeedScalar)mm[i]); + // LCOV_EXCL_STOP + } + } CeedVectorRestoreArrayRead(mult, &mm); CeedVectorDestroy(&mult); diff --git a/tests/t210-elemrestriction.c b/tests/t210-elemrestriction.c index cb5f01a86d..9c23b7513d 100644 --- a/tests/t210-elemrestriction.c +++ b/tests/t210-elemrestriction.c @@ -7,18 +7,17 @@ int main(int argc, char **argv) { Ceed ceed; CeedInt num_elem = 3; - CeedInt ind[2*num_elem]; + CeedInt ind[2 * num_elem]; CeedElemRestriction r; CeedInit(argv[1], &ceed); - for (CeedInt i=0; i int main(int argc, char **argv) { - Ceed ceed; - CeedVector x, y; - CeedInt num_elem = 8; - CeedInt elem_size = 2; - CeedInt num_blk = 2; - CeedInt blk_size = 5; - CeedInt num_comp = 3; - CeedInt ind[elem_size*num_elem]; - CeedInt *ceed_ind = malloc(sizeof(CeedInt)*elem_size*num_elem); - CeedScalar a[num_comp*(num_elem + 1)]; - const CeedScalar *xx, *yy; - CeedInt layout[3]; + Ceed ceed; + CeedVector x, y; + CeedInt num_elem = 8; + CeedInt elem_size = 2; + CeedInt num_blk = 2; + CeedInt blk_size = 5; + CeedInt num_comp = 3; + CeedInt ind[elem_size * num_elem]; + CeedInt *ceed_ind = malloc(sizeof(CeedInt) * elem_size * num_elem); + CeedScalar a[num_comp * (num_elem + 1)]; + const CeedScalar *xx, *yy; + CeedInt layout[3]; CeedElemRestriction r; CeedInit(argv[1], &ceed); - CeedVectorCreate(ceed, num_comp*(num_elem+1), &x); - for (CeedInt i=0; i 0 && i < num_elem ? 2.0 : 1.0)) + for (CeedInt i = 0; i < num_elem + 1; i++) { + for (CeedInt j = 0; j < num_comp; j++) { + if (xx[i + j * (num_elem + 1)] != ((j + 1) * 10 + i) * (i > 0 && i < num_elem ? 
2.0 : 1.0)) { // LCOV_EXCL_START - printf("Error in restricted array x[%" CeedInt_FMT - "][%" CeedInt_FMT "] = %f\n", - j, i, (double)xx[i+j*(num_elem+1)]); - // LCOV_EXCL_STOP + printf("Error in restricted array x[%" CeedInt_FMT "][%" CeedInt_FMT "] = %f\n", j, i, (double)xx[i + j * (num_elem + 1)]); + // LCOV_EXCL_STOP + } } } CeedVectorRestoreArrayRead(x, &xx); diff --git a/tests/t214-elemrestriction.c b/tests/t214-elemrestriction.c index 332fd9f5ed..6037b8d69e 100644 --- a/tests/t214-elemrestriction.c +++ b/tests/t214-elemrestriction.c @@ -4,20 +4,19 @@ #include int main(int argc, char **argv) { - Ceed ceed; - CeedInt num_elem = 3; - CeedInt ind[2*num_elem]; - const CeedInt *offsets; + Ceed ceed; + CeedInt num_elem = 3; + CeedInt ind[2 * num_elem]; + const CeedInt *offsets; CeedElemRestriction r; CeedInit(argv[1], &ceed); - for (CeedInt i=0; i int main(int argc, char **argv) { - Ceed ceed; - CeedInt num_elem = 3; - CeedInt ind[2*num_elem]; - const CeedInt *offsets; + Ceed ceed; + CeedInt num_elem = 3; + CeedInt ind[2 * num_elem]; + const CeedInt *offsets; CeedElemRestriction r; CeedInit(argv[1], &ceed); - for (CeedInt i=0; i int main(int argc, char **argv) { - Ceed ceed; - CeedInt num_elem = 3, comp_stride = 1; - CeedInt ind[2*num_elem]; + Ceed ceed; + CeedInt num_elem = 3, comp_stride = 1; + CeedInt ind[2 * num_elem]; CeedElemRestriction r, r_2; CeedInit(argv[1], &ceed); - for (CeedInt i=0; i int main(int argc, char **argv) { - Ceed ceed; - CeedVector x, y; - CeedInt num_elem = 6, P = 2, dim = 1; - CeedInt ind[P*num_elem]; - bool orient[P*num_elem]; - CeedScalar a[num_elem+1]; - const CeedScalar *yy; + Ceed ceed; + CeedVector x, y; + CeedInt num_elem = 6, P = 2, dim = 1; + CeedInt ind[P * num_elem]; + bool orient[P * num_elem]; + CeedScalar a[num_elem + 1]; + const CeedScalar *yy; CeedElemRestriction r; CeedInit(argv[1], &ceed); - CeedVectorCreate(ceed, num_elem+1, &x); - for (CeedInt i=0; i int main(int argc, char **argv) { - Ceed ceed; + Ceed ceed; CeedBasis 
b; CeedInit(argv[1], &ceed); // Test skipped if using single precision if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, - "Test not implemented in single precision\n"); + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Test not implemented in single precision\n"); } CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, 4, 4, CEED_GAUSS_LOBATTO, &b); CeedBasisView(b, stdout); CeedBasisDestroy(&b); - CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, 4, 4, CEED_GAUSS, &b); + CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, 4, 4, CEED_GAUSS, &b); CeedBasisView(b, stdout); CeedBasisDestroy(&b); diff --git a/tests/t301-basis.c b/tests/t301-basis.c index a5c00e23f5..b25636c7bd 100644 --- a/tests/t301-basis.c +++ b/tests/t301-basis.c @@ -6,7 +6,7 @@ #include int main(int argc, char **argv) { - Ceed ceed; + Ceed ceed; CeedScalar A[12] = {1, -1, 4, 1, 4, -2, 1, 4, 2, 1, -1, 0}; CeedScalar qr[12] = {1, -1, 4, 1, 4, -2, 1, 4, 2, 1, -1, 0}; CeedScalar A_qr[12] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; @@ -15,17 +15,18 @@ int main(int argc, char **argv) { CeedInit(argv[1], &ceed); CeedQRFactorization(ceed, qr, tau, 4, 3); - for (CeedInt i=0; i<3; i++) - for (CeedInt j=i; j<3; j++) - A_qr[i*3+j] = qr[i*3+j]; + for (CeedInt i = 0; i < 3; i++) { + for (CeedInt j = i; j < 3; j++) A_qr[i * 3 + j] = qr[i * 3 + j]; + } CeedHouseholderApplyQ(A_qr, qr, tau, CEED_NOTRANSPOSE, 4, 3, 3, 3, 1); - for (CeedInt i=0; i<12; i++) - if (fabs(A_qr[i] - A[i]) > 100.*CEED_EPSILON) + for (CeedInt i = 0; i < 12; i++) { + if (fabs(A_qr[i] - A[i]) > 100. 
* CEED_EPSILON) { // LCOV_EXCL_START - printf("Error in QR factorization A_qr[%" CeedInt_FMT - "] = %f != A[%" CeedInt_FMT "] = %f\n", i, A_qr[i], i, A[i]); - // LCOV_EXCL_STOP + printf("Error in QR factorization A_qr[%" CeedInt_FMT "] = %f != A[%" CeedInt_FMT "] = %f\n", i, A_qr[i], i, A[i]); + // LCOV_EXCL_STOP + } + } CeedDestroy(&ceed); return 0; diff --git a/tests/t302-basis.c b/tests/t302-basis.c index 1a39ac2b0c..fa8bc6aa4a 100644 --- a/tests/t302-basis.c +++ b/tests/t302-basis.c @@ -6,12 +6,12 @@ #include int main(int argc, char **argv) { - Ceed ceed; - CeedInt P = 4; - CeedScalar collo_grad_1d[(P+2)*(P+2)], x_2[P+2]; + Ceed ceed; + CeedInt P = 4; + CeedScalar collo_grad_1d[(P + 2) * (P + 2)], x_2[P + 2]; const CeedScalar *grad_1d, *q_ref; - CeedScalar sum = 0.0; - CeedBasis b; + CeedScalar sum = 0.0; + CeedBasis b; CeedInit(argv[1], &ceed); @@ -20,52 +20,44 @@ int main(int argc, char **argv) { CeedBasisGetCollocatedGrad(b, collo_grad_1d); CeedBasisGetGrad(b, &grad_1d); - for (CeedInt i=0; i 100*CEED_EPSILON) + for (CeedInt i = 0; i < P; i++) { + for (CeedInt j = 0; j < P; j++) { + if (fabs(collo_grad_1d[j + P * i] - grad_1d[j + P * i]) > 100 * CEED_EPSILON) { // LCOV_EXCL_START - printf("Error in collocated gradient %f != %f\n", collo_grad_1d[j+P*i], - grad_1d[j+P*i]); - // LCOV_EXCL_START + printf("Error in collocated gradient %f != %f\n", collo_grad_1d[j + P * i], grad_1d[j + P * i]); + // LCOV_EXCL_START + } + } + } CeedBasisDestroy(&b); // Q = P, not already collocated - CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, P, P, CEED_GAUSS, &b); + CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, P, P, CEED_GAUSS, &b); CeedBasisGetCollocatedGrad(b, collo_grad_1d); CeedBasisGetQRef(b, &q_ref); - for (CeedInt i=0; i 100*CEED_EPSILON) - // LCOV_EXCL_START - printf("Error in collocated gradient %f != %f\n", sum, 2*q_ref[i]); - // LCOV_EXCL_STOP + for (CeedInt j = 0; j < P; j++) sum += collo_grad_1d[j + P * i] * x_2[j]; + if (fabs(sum - 2 * q_ref[i]) > 100 * CEED_EPSILON) 
printf("Error in collocated gradient %f != %f\n", sum, 2 * q_ref[i]); } CeedBasisDestroy(&b); // Q = P + 2, not already collocated - CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, P, P+2, CEED_GAUSS, &b); + CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, P, P + 2, CEED_GAUSS, &b); CeedBasisGetCollocatedGrad(b, collo_grad_1d); CeedBasisGetQRef(b, &q_ref); - for (CeedInt i=0; i 100*CEED_EPSILON) - // LCOV_EXCL_START - printf("Error in collocated gradient %f != %f\n", sum, 2*q_ref[i]); - // LCOV_EXCL_STOP + for (CeedInt j = 0; j < P + 2; j++) sum += collo_grad_1d[j + (P + 2) * i] * x_2[j]; + if (fabs(sum - 2 * q_ref[i]) > 100 * CEED_EPSILON) printf("Error in collocated gradient %f != %f\n", sum, 2 * q_ref[i]); } CeedBasisDestroy(&b); diff --git a/tests/t303-basis.c b/tests/t303-basis.c index 47751f4af9..8c5b86c858 100644 --- a/tests/t303-basis.c +++ b/tests/t303-basis.c @@ -5,16 +5,15 @@ #include int main(int argc, char **argv) { - Ceed ceed; - CeedBasis b; + Ceed ceed; + CeedBasis b; CeedVector U, V; - CeedInt Q = 8, P = 2, num_comp = 1, dim = 3, - len = pow((CeedScalar)(Q), dim); + CeedInt Q = 8, P = 2, num_comp = 1, dim = 3, len = pow((CeedScalar)(Q), dim); CeedInit(argv[1], &ceed); - CeedVectorCreate(ceed, len, &U); - CeedVectorCreate(ceed, len+1, &V); + CeedVectorCreate(ceed, len, &U); + CeedVectorCreate(ceed, len + 1, &V); CeedBasisCreateTensorH1Lagrange(ceed, dim, num_comp, P, Q, CEED_GAUSS, &b); diff --git a/tests/t304-basis.c b/tests/t304-basis.c index 7bd437b838..4e5e06ce27 100644 --- a/tests/t304-basis.c +++ b/tests/t304-basis.c @@ -6,10 +6,10 @@ #include int main(int argc, char **argv) { - Ceed ceed; - CeedInt P = 4; + Ceed ceed; + CeedInt P = 4; CeedScalar M[16], Q[16], lambda[4], Q_lambda_Qt[16]; - CeedBasis basis; + CeedBasis basis; CeedInit(argv[1], &ceed); @@ -18,33 +18,34 @@ int main(int argc, char **argv) { const CeedScalar *interp, *quad_weights; CeedBasisGetInterp(basis, &interp); CeedBasisGetQWeights(basis, &quad_weights); - for (int i=0; i 
100.*CEED_EPSILON) + } + for (int i = 0; i < P; i++) { + for (int j = 0; j < P; j++) { + if (fabs(M[P * i + j] - Q_lambda_Qt[P * i + j]) > 100. * CEED_EPSILON) { // LCOV_EXCL_START - printf("Error in diagonalization [%" CeedInt_FMT - ", %" CeedInt_FMT "]: %f != %f\n", - i, j, M[P*i+j], Q_lambda_Qt[P*i+j]); - // LCOV_EXCL_STOP + printf("Error in diagonalization [%" CeedInt_FMT ", %" CeedInt_FMT "]: %f != %f\n", i, j, M[P * i + j], Q_lambda_Qt[P * i + j]); + // LCOV_EXCL_STOP + } + } + } CeedBasisDestroy(&basis); CeedDestroy(&ceed); diff --git a/tests/t305-basis.c b/tests/t305-basis.c index 9c777f3595..4e4300473b 100644 --- a/tests/t305-basis.c +++ b/tests/t305-basis.c @@ -5,10 +5,10 @@ #include int main(int argc, char **argv) { - Ceed ceed; - CeedInt P = 4, Q = 4; - CeedScalar M[P*P], K[P*P], X[P*P], lambda[P]; - CeedBasis basis; + Ceed ceed; + CeedInt P = 4, Q = 4; + CeedScalar M[P * P], K[P * P], X[P * P], lambda[P]; + CeedBasis basis; CeedInit(argv[1], &ceed); @@ -18,67 +18,70 @@ int main(int argc, char **argv) { CeedBasisGetInterp(basis, &interp); CeedBasisGetGrad(basis, &grad); CeedBasisGetQWeights(basis, &quad_weights); - for (int i=0; i 100.*CEED_EPSILON) + } + for (int i = 0; i < P; i++) { + for (int j = 0; j < P; j++) { + if (fabs(M[P * i + j] - (i == j ? 1.0 : 0.0)) > 100. * CEED_EPSILON) { // LCOV_EXCL_START - printf("Error in diagonalization of M [%" CeedInt_FMT - ", %" CeedInt_FMT "]: %f != %f\n", - i, j, M[P*i+j], (i == j ? 1.0 : 0.0)); - // LCOV_EXCL_STOP + printf("Error in diagonalization of M [%" CeedInt_FMT ", %" CeedInt_FMT "]: %f != %f\n", i, j, M[P * i + j], (i == j ? 1.0 : 0.0)); + // LCOV_EXCL_STOP + } + } + } // Check X^T K X = Lamda - for (int i=0; i 100.*CEED_EPSILON) + } + for (int i = 0; i < P; i++) { + for (int j = 0; j < P; j++) { + if (fabs(K[P * i + j] - (i == j ? lambda[i] : 0.0)) > 100. 
* CEED_EPSILON) { // LCOV_EXCL_START - printf("Error in diagonalization of K [%" CeedInt_FMT - ", %" CeedInt_FMT "]: %f != %f\n", - i, j, K[P*i+j], (i == j ? lambda[i] : 0.0)); - // LCOV_EXCL_STOP + printf("Error in diagonalization of K [%" CeedInt_FMT ", %" CeedInt_FMT "]: %f != %f\n", i, j, K[P * i + j], (i == j ? lambda[i] : 0.0)); + // LCOV_EXCL_STOP + } + } + } CeedBasisDestroy(&basis); CeedDestroy(&ceed); diff --git a/tests/t306-basis.c b/tests/t306-basis.c index ba970b8a23..063ae36358 100644 --- a/tests/t306-basis.c +++ b/tests/t306-basis.c @@ -4,7 +4,7 @@ #include int main(int argc, char **argv) { - Ceed ceed; + Ceed ceed; CeedBasis b; CeedInit(argv[1], &ceed); @@ -15,14 +15,8 @@ int main(int argc, char **argv) { CeedBasisGetNumNodes(b, &P); CeedBasisGetNumQuadraturePoints(b, &Q); - if (P != 64) - // LCOV_EXCL_START - printf("%" CeedInt_FMT " != 64\n", P); - // LCOV_EXCL_STOP - if (Q != 125) - // LCOV_EXCL_START - printf("%" CeedInt_FMT " != 125\n", Q); - // LCOV_EXCL_STOP + if (P != 64) printf("%" CeedInt_FMT " != 64\n", P); + if (Q != 125) printf("%" CeedInt_FMT " != 125\n", Q); CeedBasisDestroy(&b); CeedDestroy(&ceed); diff --git a/tests/t307-basis.c b/tests/t307-basis.c index 202e083ca5..6117bf26d5 100644 --- a/tests/t307-basis.c +++ b/tests/t307-basis.c @@ -4,25 +4,21 @@ #include int main(int argc, char **argv) { - Ceed ceed; + Ceed ceed; CeedBasis b, b_2; - CeedInt P_1d = 4; + CeedInt P_1d = 4; CeedInit(argv[1], &ceed); CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, P_1d, 4, CEED_GAUSS_LOBATTO, &b); - CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, P_1d+1, 4, CEED_GAUSS_LOBATTO, - &b_2); + CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, P_1d + 1, 4, CEED_GAUSS_LOBATTO, &b_2); - CeedBasisReferenceCopy(b, &b_2); // This destroys the previous b_2 + CeedBasisReferenceCopy(b, &b_2); // This destroys the previous b_2 CeedBasisDestroy(&b); CeedInt P_1d_2; CeedBasisGetNumNodes1D(b_2, &P_1d_2); - if (P_1d != P_1d_2) - // LCOV_EXCL_START - printf("Error copying CeedBasis 
reference\n"); - // LCOV_EXCL_STOP + if (P_1d != P_1d_2) printf("Error copying CeedBasis reference\n"); CeedBasisDestroy(&b_2); CeedDestroy(&ceed); diff --git a/tests/t310-basis.c b/tests/t310-basis.c index 1f5bf2edd6..fcacdd6121 100644 --- a/tests/t310-basis.c +++ b/tests/t310-basis.c @@ -2,16 +2,15 @@ /// Test square Gauss Lobatto interp_1d is identity /// \test Test square Gauss Lobatto interp_1d is identity #include -#include #include +#include int main(int argc, char **argv) { - Ceed ceed; - CeedBasis b; - CeedVector U, V; - int i, dim = 2, P_1d = 4, Q_1d = 4, len = (int)(pow((CeedScalar)(Q_1d), - dim) + 0.4); - CeedScalar u[len]; + Ceed ceed; + CeedBasis b; + CeedVector U, V; + int i, dim = 2, P_1d = 4, Q_1d = 4, len = (int)(pow((CeedScalar)(Q_1d), dim) + 0.4); + CeedScalar u[len]; const CeedScalar *v; CeedInit(argv[1], &ceed); @@ -19,21 +18,17 @@ int main(int argc, char **argv) { CeedVectorCreate(ceed, len, &U); CeedVectorCreate(ceed, len, &V); - for (i = 0; i < len; i++) - u[i] = 1.0; + for (i = 0; i < len; i++) u[i] = 1.0; CeedVectorSetArray(U, CEED_MEM_HOST, CEED_USE_POINTER, u); - CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, P_1d, Q_1d, - CEED_GAUSS_LOBATTO, &b); + CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, P_1d, Q_1d, CEED_GAUSS_LOBATTO, &b); CeedBasisApply(b, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, U, V); CeedVectorGetArrayRead(V, CEED_MEM_HOST, &v); - for (i = 0; i < len; i++) - if (fabs(v[i] - 1.) > 10.*CEED_EPSILON) - // LCOV_EXCL_START - printf("v[%" CeedInt_FMT "] = %f != 1.\n", i, v[i]); - // LCOV_EXCL_STOP + for (i = 0; i < len; i++) { + if (fabs(v[i] - 1.) > 10. 
* CEED_EPSILON) printf("v[%" CeedInt_FMT "] = %f != 1.\n", i, v[i]); + } CeedVectorRestoreArrayRead(V, &v); CeedBasisDestroy(&b); diff --git a/tests/t311-basis.c b/tests/t311-basis.c index 0da4acd504..34ee2d0fc7 100644 --- a/tests/t311-basis.c +++ b/tests/t311-basis.c @@ -7,19 +7,19 @@ #define ALEN(a) (sizeof(a) / sizeof((a)[0])) static CeedScalar PolyEval(CeedScalar x, CeedInt n, const CeedScalar *p) { - CeedScalar y = p[n-1]; - for (CeedInt i=n-2; i>=0; i--) y = y*x + p[i]; + CeedScalar y = p[n - 1]; + for (CeedInt i = n - 2; i >= 0; i--) y = y * x + p[i]; return y; } int main(int argc, char **argv) { - Ceed ceed; - CeedVector X, X_q, U, U_q; - CeedBasis basis_x_lobatto, basis_u_lobatto, basis_x_gauss, basis_u_gauss; - CeedInt Q = 6; - const CeedScalar p[6] = {1, 2, 3, 4, 5, 6}; // 1 + 2x + 3x^2 + ... + Ceed ceed; + CeedVector X, X_q, U, U_q; + CeedBasis basis_x_lobatto, basis_u_lobatto, basis_x_gauss, basis_u_gauss; + CeedInt Q = 6; + const CeedScalar p[6] = {1, 2, 3, 4, 5, 6}; // 1 + 2x + 3x^2 + ... 
const CeedScalar *xq, *uuq; - CeedScalar x[2], uq[Q]; + CeedScalar x[2], uq[Q]; CeedInit(argv[1], &ceed); @@ -30,20 +30,16 @@ int main(int argc, char **argv) { CeedVectorSetValue(U, 0); CeedVectorCreate(ceed, Q, &U_q); - CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, 2, Q, CEED_GAUSS_LOBATTO, - &basis_x_lobatto); - CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, Q, Q, CEED_GAUSS_LOBATTO, - &basis_u_lobatto); + CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, 2, Q, CEED_GAUSS_LOBATTO, &basis_x_lobatto); + CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, Q, Q, CEED_GAUSS_LOBATTO, &basis_u_lobatto); - for (int i = 0; i < 2; i++) - x[i] = CeedIntPow(-1, i+1); + for (int i = 0; i < 2; i++) x[i] = CeedIntPow(-1, i + 1); CeedVectorSetArray(X, CEED_MEM_HOST, CEED_USE_POINTER, (CeedScalar *)&x); CeedBasisApply(basis_x_lobatto, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, X, X_q); CeedVectorGetArrayRead(X_q, CEED_MEM_HOST, &xq); - for (CeedInt i=0; i 100.*CEED_EPSILON) - // LCOV_EXCL_START - printf("%f != %f = p(%f)\n", uuq[i], px, xq[i]); - // LCOV_EXCL_STOP + if (fabs(uuq[i] - px) > 100. * CEED_EPSILON) printf("%f != %f = p(%f)\n", uuq[i], px, xq[i]); } CeedVectorRestoreArrayRead(X_q, &xq); CeedVectorRestoreArrayRead(U_q, &uuq); diff --git a/tests/t312-basis.c b/tests/t312-basis.c index a80798169a..310fe7870b 100644 --- a/tests/t312-basis.c +++ b/tests/t312-basis.c @@ -7,19 +7,19 @@ #define ALEN(a) (sizeof(a) / sizeof((a)[0])) static CeedScalar PolyEval(CeedScalar x, CeedInt n, const CeedScalar *p) { - CeedScalar y = p[n-1]; - for (CeedInt i=n-2; i>=0; i--) y = y*x + p[i]; + CeedScalar y = p[n - 1]; + for (CeedInt i = n - 2; i >= 0; i--) y = y * x + p[i]; return y; } int main(int argc, char **argv) { - Ceed ceed; - CeedVector X, X_q, U, U_q, W; - CeedBasis basis_x_lobatto, basis_x_gauss, basis_u_gauss; - CeedInt Q = 6; - const CeedScalar p[6] = {1, 2, 3, 4, 5, 6}; // 1 + 2x + 3x^2 + ... 
+ Ceed ceed; + CeedVector X, X_q, U, U_q, W; + CeedBasis basis_x_lobatto, basis_x_gauss, basis_u_gauss; + CeedInt Q = 6; + const CeedScalar p[6] = {1, 2, 3, 4, 5, 6}; // 1 + 2x + 3x^2 + ... const CeedScalar *xq, *uq, *w; - CeedScalar u[Q], x[2], sum, error, pint[ALEN(p)+1]; + CeedScalar u[Q], x[2], sum, error, pint[ALEN(p) + 1]; CeedInit(argv[1], &ceed); @@ -32,18 +32,15 @@ int main(int argc, char **argv) { CeedVectorCreate(ceed, Q, &W); CeedVectorSetValue(W, 0); - CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, 2, Q, CEED_GAUSS_LOBATTO, - &basis_x_lobatto); + CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, 2, Q, CEED_GAUSS_LOBATTO, &basis_x_lobatto); - for (int i = 0; i < 2; i++) - x[i] = CeedIntPow(-1, i+1); + for (int i = 0; i < 2; i++) x[i] = CeedIntPow(-1, i + 1); CeedVectorSetArray(X, CEED_MEM_HOST, CEED_USE_POINTER, x); CeedBasisApply(basis_x_lobatto, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, X, X_q); CeedVectorGetArrayRead(X_q, CEED_MEM_HOST, &xq); - for (CeedInt i=0; i 100.*CEED_EPSILON) + if (error > 100. 
* CEED_EPSILON) { // LCOV_EXCL_START - printf("Error %e sum %g exact %g\n", error, sum, - PolyEval(1, ALEN(pint), pint) - PolyEval(-1, ALEN(pint), pint)); - // LCOV_EXCL_STOP + printf("Error %e sum %g exact %g\n", error, sum, PolyEval(1, ALEN(pint), pint) - PolyEval(-1, ALEN(pint), pint)); + // LCOV_EXCL_STOP + } CeedVectorDestroy(&X); CeedVectorDestroy(&X_q); diff --git a/tests/t313-basis.c b/tests/t313-basis.c index b8d6572ef1..8cef0a7593 100644 --- a/tests/t313-basis.c +++ b/tests/t313-basis.c @@ -6,7 +6,7 @@ static CeedScalar Eval(CeedInt dim, const CeedScalar x[]) { CeedScalar result = 1, center = 0.1; - for (CeedInt d=0; d 1E-4) { // LCOV_EXCL_START printf("[%" CeedInt_FMT "] %f != %f=f(%f", dim, u[i], fx, xx[0]); - for (CeedInt d=1; d 1) result += atan(x[1] + 0.2); - if (dim > 2) result += exp(-(x[2] + 0.3)*(x[2] + 0.3)); + if (dim > 2) result += exp(-(x[2] + 0.3) * (x[2] + 0.3)); return result; } static CeedScalar GetTolerance(CeedScalarType scalar_type, int dim) { CeedScalar tol; if (scalar_type == CEED_SCALAR_FP32) { - if (dim == 3) - tol = 0.05; - else - tol = 1.e-3; + if (dim == 3) tol = 0.05; + else tol = 1.e-3; } else { tol = 1.e-11; } @@ -29,42 +27,38 @@ int main(int argc, char **argv) { CeedInit(argv[1], &ceed); - for (CeedInt dim=1; dim<=3; dim++) { - CeedVector X, X_q, U, U_q, ones, grad_T_ones; - CeedBasis basis_x_lobatto, basis_u_gauss; - CeedInt P = 8, Q = 10, P_dim = CeedIntPow(P, dim), Q_dim = CeedIntPow(Q, dim), - X_dim = CeedIntPow(2, dim); - CeedScalar x[X_dim*dim], u[P_dim]; + for (CeedInt dim = 1; dim <= 3; dim++) { + CeedVector X, X_q, U, U_q, ones, grad_T_ones; + CeedBasis basis_x_lobatto, basis_u_gauss; + CeedInt P = 8, Q = 10, P_dim = CeedIntPow(P, dim), Q_dim = CeedIntPow(Q, dim), X_dim = CeedIntPow(2, dim); + CeedScalar x[X_dim * dim], u[P_dim]; const CeedScalar *x_q, *u_q, *grad_t_ones_array; - CeedScalar sum_1 = 0, sum_2 = 0; + CeedScalar sum_1 = 0, sum_2 = 0; - for (CeedInt d=0; d tol) - // LCOV_EXCL_START - printf("[%" 
CeedInt_FMT "] %f != %f\n", dim, sum_1, sum_2); - // LCOV_EXCL_STOP + if (fabs(sum_1 - sum_2) > tol) printf("[%" CeedInt_FMT "] %f != %f\n", dim, sum_1, sum_2); CeedVectorDestroy(&X); CeedVectorDestroy(&X_q); diff --git a/tests/t315-basis.c b/tests/t315-basis.c index 481ed409f3..a8ef49b3cf 100644 --- a/tests/t315-basis.c +++ b/tests/t315-basis.c @@ -7,17 +7,15 @@ static CeedScalar Eval(CeedInt dim, const CeedScalar x[]) { CeedScalar result = tanh(x[0] + 0.1); if (dim > 1) result += atan(x[1] + 0.2); - if (dim > 2) result += exp(-(x[2] + 0.3)*(x[2] + 0.3)); + if (dim > 2) result += exp(-(x[2] + 0.3) * (x[2] + 0.3)); return result; } static CeedScalar GetTolerance(CeedScalarType scalar_type, int dim) { CeedScalar tol; if (scalar_type == CEED_SCALAR_FP32) { - if (dim == 3) - tol = 1.e-3; - else - tol = 1.e-4; + if (dim == 3) tol = 1.e-3; + else tol = 1.e-4; } else { tol = 1.e-11; } @@ -29,68 +27,57 @@ int main(int argc, char **argv) { CeedInit(argv[1], &ceed); - for (CeedInt dim=1; dim<=3; dim++) { - CeedVector X, X_q, U, U_q, ones, grad_T_ones; - CeedBasis basis_x_lobatto, basis_u_gauss; - CeedInt P = 8, Q = 8, P_dim = CeedIntPow(P, dim), Qdim_ = CeedIntPow(Q, dim), - X_dim = CeedIntPow(2, dim); - CeedScalar x[X_dim*dim], u[P_dim]; + for (CeedInt dim = 1; dim <= 3; dim++) { + CeedVector X, X_q, U, U_q, ones, grad_T_ones; + CeedBasis basis_x_lobatto, basis_u_gauss; + CeedInt P = 8, Q = 8, P_dim = CeedIntPow(P, dim), Qdim_ = CeedIntPow(Q, dim), X_dim = CeedIntPow(2, dim); + CeedScalar x[X_dim * dim], u[P_dim]; const CeedScalar *x_q, *u_q, *grad_t_ones_array; - CeedScalar sum_1 = 0, sum_2 = 0; + CeedScalar sum_1 = 0, sum_2 = 0; - for (CeedInt d=0; d tol) - // LCOV_EXCL_START - printf("[%" CeedInt_FMT "] %f != %f\n", dim, sum_1, sum_2); - // LCOV_EXCL_STOP + if (fabs(sum_1 - sum_2) > tol) printf("[%" CeedInt_FMT "] %f != %f\n", dim, sum_1, sum_2); CeedVectorDestroy(&X); CeedVectorDestroy(&X_q); diff --git a/tests/t316-basis.c b/tests/t316-basis.c index 
e96f69264a..d58645d07a 100644 --- a/tests/t316-basis.c +++ b/tests/t316-basis.c @@ -7,17 +7,15 @@ static CeedScalar Eval(CeedInt dim, const CeedScalar x[]) { CeedScalar result = tanh(x[0] + 0.1); if (dim > 1) result += atan(x[1] + 0.2); - if (dim > 2) result += exp(-(x[2] + 0.3)*(x[2] + 0.3)); + if (dim > 2) result += exp(-(x[2] + 0.3) * (x[2] + 0.3)); return result; } static CeedScalar GetTolerance(CeedScalarType scalar_type, int dim) { CeedScalar tol; if (scalar_type == CEED_SCALAR_FP32) { - if (dim == 3) - tol = 0.005; - else - tol = 1.e-4; + if (dim == 3) tol = 0.005; + else tol = 1.e-4; } else { tol = 1.e-11; } @@ -29,42 +27,38 @@ int main(int argc, char **argv) { CeedInit(argv[1], &ceed); - for (CeedInt dim=1; dim<=3; dim++) { - CeedVector X, X_q, U, U_q, ones, grad_T_ones; - CeedBasis basis_x_lobatto, basis_u_gauss; - CeedInt P = 8, Q = 7, P_dim = CeedIntPow(P, dim), Q_dim = CeedIntPow(Q, dim), - X_dim = CeedIntPow(2, dim); - CeedScalar x[X_dim*dim], u[P_dim]; + for (CeedInt dim = 1; dim <= 3; dim++) { + CeedVector X, X_q, U, U_q, ones, grad_T_ones; + CeedBasis basis_x_lobatto, basis_u_gauss; + CeedInt P = 8, Q = 7, P_dim = CeedIntPow(P, dim), Q_dim = CeedIntPow(Q, dim), X_dim = CeedIntPow(2, dim); + CeedScalar x[X_dim * dim], u[P_dim]; const CeedScalar *x_q, *u_q, *grad_t_ones_array; - CeedScalar sum_1 = 0, sum_2 = 0; + CeedScalar sum_1 = 0, sum_2 = 0; - for (CeedInt d=0; d tol) - // LCOV_EXCL_START - printf("[%" CeedInt_FMT "] %f != %f\n", dim, sum_1, sum_2); - // LCOV_EXCL_STOP + if (fabs(sum_1 - sum_2) > tol) printf("[%" CeedInt_FMT "] %f != %f\n", dim, sum_1, sum_2); CeedVectorDestroy(&X); CeedVectorDestroy(&X_q); diff --git a/tests/t317-basis.c b/tests/t317-basis.c index 4b7e8b762a..3202729c22 100644 --- a/tests/t317-basis.c +++ b/tests/t317-basis.c @@ -7,20 +7,20 @@ #define ALEN(a) (sizeof(a) / sizeof((a)[0])) static CeedScalar PolyEval(CeedScalar x, CeedInt n, const CeedScalar *p) { - CeedScalar y = p[n-1]; - for (CeedInt i=n-2; i>=0; i--) y = y*x + 
p[i]; + CeedScalar y = p[n - 1]; + for (CeedInt i = n - 2; i >= 0; i--) y = y * x + p[i]; return y; } int main(int argc, char **argv) { - Ceed ceed; - CeedVector X, X_q, U, U_q; - CeedBasis basis_x_lobatto, basis_u_lobatto, basis_x_gauss, basis_u_gauss; - CeedInt Q = 6; - const CeedScalar p[6] = {1, 2, 3, 4, 5, 6}; // 1 + 2x + 3x^2 + ... - const CeedScalar dp[5] = {2, 6, 12, 20, 30}; // 2 + 6x + 12x^2 + ... + Ceed ceed; + CeedVector X, X_q, U, U_q; + CeedBasis basis_x_lobatto, basis_u_lobatto, basis_x_gauss, basis_u_gauss; + CeedInt Q = 6; + const CeedScalar p[6] = {1, 2, 3, 4, 5, 6}; // 1 + 2x + 3x^2 + ... + const CeedScalar dp[5] = {2, 6, 12, 20, 30}; // 2 + 6x + 12x^2 + ... const CeedScalar *xq, *uuq; - CeedScalar x[2], uq[Q]; + CeedScalar x[2], uq[Q]; CeedInit(argv[1], &ceed); @@ -31,20 +31,16 @@ int main(int argc, char **argv) { CeedVectorSetValue(U, 0); CeedVectorCreate(ceed, Q, &U_q); - CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, 2, Q, CEED_GAUSS_LOBATTO, - &basis_x_lobatto); - CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, Q, Q, CEED_GAUSS_LOBATTO, - &basis_u_lobatto); + CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, 2, Q, CEED_GAUSS_LOBATTO, &basis_x_lobatto); + CeedBasisCreateTensorH1Lagrange(ceed, 1, 1, Q, Q, CEED_GAUSS_LOBATTO, &basis_u_lobatto); - for (int i = 0; i < 2; i++) - x[i] = CeedIntPow(-1, i+1); + for (int i = 0; i < 2; i++) x[i] = CeedIntPow(-1, i + 1); CeedVectorSetArray(X, CEED_MEM_HOST, CEED_USE_POINTER, (CeedScalar *)&x); CeedBasisApply(basis_x_lobatto, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, X, X_q); CeedVectorGetArrayRead(X_q, CEED_MEM_HOST, &xq); - for (CeedInt i=0; i 1000.*CEED_EPSILON) - // LCOV_EXCL_START - printf("%f != %f = p(%f)\n", uuq[i], px, xq[i]); - // LCOV_EXCL_STOP + if (fabs(uuq[i] - px) > 1000. 
* CEED_EPSILON) printf("%f != %f = p(%f)\n", uuq[i], px, xq[i]); } CeedVectorRestoreArrayRead(X_q, &xq); CeedVectorRestoreArrayRead(U_q, &uuq); diff --git a/tests/t318-basis.c b/tests/t318-basis.c index 5b0239438e..e0374bc876 100644 --- a/tests/t318-basis.c +++ b/tests/t318-basis.c @@ -6,7 +6,7 @@ static CeedScalar Eval(CeedInt dim, const CeedScalar x[]) { CeedScalar result = 1, center = 0.1; - for (CeedInt d=0; d 1E-4) { // LCOV_EXCL_START printf("[%" CeedInt_FMT "] %f != %f = f(%f", dim, u[i], fx, xx[0]); - for (CeedInt d=1; d tol) - // LCOV_EXCL_START - printf("[%" CeedInt_FMT ", %" CeedInt_FMT "] %f != %f\n", dim, i, u_to[i], u); - // LCOV_EXCL_STOP + if (fabs(u - u_to[i]) > tol) printf("[%" CeedInt_FMT ", %" CeedInt_FMT "] %f != %f\n", dim, i, u_to[i], u); } CeedVectorRestoreArrayRead(X_to, &x_to); CeedVectorRestoreArrayRead(U_to, &u_to); // Project and take gradient - CeedBasisApply(basis_project, 1, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, U_from, - dU_to); + CeedBasisApply(basis_project, 1, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, U_from, dU_to); // Check solution CeedVectorGetArrayRead(dU_to, CEED_MEM_HOST, &du_to); CeedVectorGetArrayRead(X_to, CEED_MEM_HOST, &x_to); - for (CeedInt i=0; i tol) + if (fabs(du - du_to[P_to_dim * (dim - 1 - d) + i]) > tol) { // LCOV_EXCL_START - printf("[%" CeedInt_FMT ", %" CeedInt_FMT ", %" CeedInt_FMT "] %f != %f\n", dim, - i, d, du_to[P_to_dim * (dim - 1 - d) + i], du); - // LCOV_EXCL_STOP + printf("[%" CeedInt_FMT ", %" CeedInt_FMT ", %" CeedInt_FMT "] %f != %f\n", dim, i, d, du_to[P_to_dim * (dim - 1 - d) + i], du); + // LCOV_EXCL_STOP + } } } CeedVectorRestoreArrayRead(X_to, &x_to); diff --git a/tests/t320-basis.c b/tests/t320-basis.c index cce99b6d5b..06f54ba728 100644 --- a/tests/t320-basis.c +++ b/tests/t320-basis.c @@ -1,28 +1,25 @@ /// @file /// Test creation and destruction of a 2D Simplex non-tensor H1 basis /// \test Test creation and distruction of a 2D Simplex non-tensor H1 basis -#include #include "t320-basis.h" +#include 
+ int main(int argc, char **argv) { - Ceed ceed; + Ceed ceed; const CeedInt P = 6, Q = 4, dim = 2; - CeedBasis b; - CeedScalar q_ref[dim*Q], q_weight[Q]; - CeedScalar interp[P*Q], grad[dim*P*Q]; + CeedBasis b; + CeedScalar q_ref[dim * Q], q_weight[Q]; + CeedScalar interp[P * Q], grad[dim * P * Q]; CeedInit(argv[1], &ceed); // Test skipped if using single precision - if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, - "Test not implemented in single precision"); - } + if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Test not implemented in single precision"); buildmats(q_ref, q_weight, interp, grad); - CeedBasisCreateH1(ceed, CEED_TOPOLOGY_TRIANGLE, 1, P, Q, interp, grad, q_ref, - q_weight, &b); + CeedBasisCreateH1(ceed, CEED_TOPOLOGY_TRIANGLE, 1, P, Q, interp, grad, q_ref, q_weight, &b); CeedBasisView(b, stdout); CeedBasisDestroy(&b); diff --git a/tests/t320-basis.h b/tests/t320-basis.h index 2df2ddd556..6e0f28b4d8 100644 --- a/tests/t320-basis.h +++ b/tests/t320-basis.h @@ -5,46 +5,46 @@ // // This file is part of CEED: http://github.com/ceed -static void buildmats(CeedScalar *q_ref, CeedScalar *q_weight, - CeedScalar *interp, - CeedScalar *grad) { +#include + +static void buildmats(CeedScalar *q_ref, CeedScalar *q_weight, CeedScalar *interp, CeedScalar *grad) { CeedInt P = 6, Q = 4; - q_ref[0] = 0.2; - q_ref[1] = 0.6; - q_ref[2] = 1./3.; - q_ref[3] = 0.2; - q_ref[4] = 0.2; - q_ref[5] = 0.2; - q_ref[6] = 1./3.; - q_ref[7] = 0.6; - q_weight[0] = 25./96.; - q_weight[1] = 25./96.; - q_weight[2] = -27./96.; - q_weight[3] = 25./96.; + q_ref[0] = 0.2; + q_ref[1] = 0.6; + q_ref[2] = 1. / 3.; + q_ref[3] = 0.2; + q_ref[4] = 0.2; + q_ref[5] = 0.2; + q_ref[6] = 1. / 3.; + q_ref[7] = 0.6; + q_weight[0] = 25. / 96.; + q_weight[1] = 25. / 96.; + q_weight[2] = -27. / 96.; + q_weight[3] = 25. 
/ 96.; // Loop over quadrature points - for (int i=0; i #include + #include "t320-basis.h" -CeedScalar feval(CeedScalar x1, CeedScalar x2) { - return x1*x1 + x2*x2 + x1*x2 + 1; -} +// polynomial eval helper +static CeedScalar feval(CeedScalar x1, CeedScalar x2) { return x1 * x1 + x2 * x2 + x1 * x2 + 1; } +// main test int main(int argc, char **argv) { - Ceed ceed; - CeedVector In, Out; - const CeedInt P = 6, Q = 4, dim = 2; - CeedBasis b; - CeedScalar q_ref[dim*Q], q_weight[Q]; - CeedScalar interp[P*Q], grad[dim*P*Q]; - CeedScalar xq[] = {0.2, 0.6, 1./3., 0.2, 0.2, 0.2, 1./3., 0.6}; - CeedScalar xr[] = {0., 0.5, 1., 0., 0.5, 0., 0., 0., 0., 0.5, 0.5, 1.}; + Ceed ceed; + CeedVector In, Out; + const CeedInt P = 6, Q = 4, dim = 2; + CeedBasis b; + CeedScalar q_ref[dim * Q], q_weight[Q]; + CeedScalar interp[P * Q], grad[dim * P * Q]; + CeedScalar xq[] = {0.2, 0.6, 1. / 3., 0.2, 0.2, 0.2, 1. / 3., 0.6}; + CeedScalar xr[] = {0., 0.5, 1., 0., 0.5, 0., 0., 0., 0., 0.5, 0.5, 1.}; const CeedScalar *out; - CeedScalar in[P], value; + CeedScalar in[P], value; buildmats(q_ref, q_weight, interp, grad); CeedInit(argv[1], &ceed); - CeedBasisCreateH1(ceed, CEED_TOPOLOGY_TRIANGLE, 1, P, Q, interp, grad, q_ref, - q_weight, &b); + CeedBasisCreateH1(ceed, CEED_TOPOLOGY_TRIANGLE, 1, P, Q, interp, grad, q_ref, q_weight, &b); // Interpolate function to quadrature points - for (int i=0; i 100.*CEED_EPSILON) - // LCOV_EXCL_START - printf("[%" CeedInt_FMT "] %f != %f\n", i, out[i], value); - // LCOV_EXCL_STOP + for (int i = 0; i < Q; i++) { + value = feval(xq[0 * Q + i], xq[1 * Q + i]); + if (fabs(out[i] - value) > 100. 
* CEED_EPSILON) printf("[%" CeedInt_FMT "] %f != %f\n", i, out[i], value); } CeedVectorRestoreArrayRead(Out, &out); diff --git a/tests/t322-basis.c b/tests/t322-basis.c index 60ac78fb1e..7c38bfea9a 100644 --- a/tests/t322-basis.c +++ b/tests/t322-basis.c @@ -3,33 +3,32 @@ /// \test Test integration with a 2D Simplex non-tensor H1 basis #include #include + #include "t320-basis.h" -CeedScalar feval(CeedScalar x1, CeedScalar x2) { - return x1*x1 + x2*x2 + x1*x2 + 1; -} +// polynomial eval helper +static CeedScalar feval(CeedScalar x1, CeedScalar x2) { return x1 * x1 + x2 * x2 + x1 * x2 + 1; } +// main test int main(int argc, char **argv) { - Ceed ceed; - CeedVector In, Out, Weights; - const CeedInt P = 6, Q = 4, dim = 2; - CeedBasis b; - CeedScalar q_ref[dim*Q], q_weight[Q]; - CeedScalar interp[P*Q], grad[dim*P*Q]; - CeedScalar xr[] = {0., 0.5, 1., 0., 0.5, 0., 0., 0., 0., 0.5, 0.5, 1.}; + Ceed ceed; + CeedVector In, Out, Weights; + const CeedInt P = 6, Q = 4, dim = 2; + CeedBasis b; + CeedScalar q_ref[dim * Q], q_weight[Q]; + CeedScalar interp[P * Q], grad[dim * P * Q]; + CeedScalar xr[] = {0., 0.5, 1., 0., 0.5, 0., 0., 0., 0., 0.5, 0.5, 1.}; const CeedScalar *out, *weights; - CeedScalar in[P], sum; + CeedScalar in[P], sum; buildmats(q_ref, q_weight, interp, grad); CeedInit(argv[1], &ceed); - CeedBasisCreateH1(ceed, CEED_TOPOLOGY_TRIANGLE, 1, P, Q, interp, grad, q_ref, - q_weight, &b); + CeedBasisCreateH1(ceed, CEED_TOPOLOGY_TRIANGLE, 1, P, Q, interp, grad, q_ref, q_weight, &b); // Interpolate function to quadrature points - for (int i=0; i 100.*CEED_EPSILON) - // LCOV_EXCL_START - printf("%f != %f\n", sum, 17./24.); - // LCOV_EXCL_STOP + for (int i = 0; i < Q; i++) sum += out[i] * weights[i]; + if (fabs(sum - 17. / 24.) > 100. * CEED_EPSILON) printf("%f != %f\n", sum, 17. 
/ 24.); CeedVectorRestoreArrayRead(Out, &out); CeedVectorRestoreArrayRead(Weights, &weights); diff --git a/tests/t323-basis.c b/tests/t323-basis.c index 11996aca7c..1dd7e9756e 100644 --- a/tests/t323-basis.c +++ b/tests/t323-basis.c @@ -3,59 +3,51 @@ /// \test Test grad with a 2D Simplex non-tensor H1 basis #include #include + #include "t320-basis.h" -CeedScalar feval(CeedScalar x1, CeedScalar x2) { - return x1*x1 + x2*x2 + x1*x2 + 1; -} +// polynomial eval helper +static CeedScalar feval(CeedScalar x1, CeedScalar x2) { return x1 * x1 + x2 * x2 + x1 * x2 + 1; } -CeedScalar dfeval(CeedScalar x1, CeedScalar x2) { - return 2*x1 + x2; -} +// polynomial derivative helper +CeedScalar dfeval(CeedScalar x1, CeedScalar x2) { return 2 * x1 + x2; } +// main test int main(int argc, char **argv) { - Ceed ceed; - CeedVector In, Out; - const CeedInt P = 6, Q = 4, dim = 2; - CeedBasis b; - CeedScalar q_ref[dim*Q], q_weight[Q]; - CeedScalar interp[P*Q], grad[dim*P*Q]; - CeedScalar xq[] = {0.2, 0.6, 1./3., 0.2, 0.2, 0.2, 1./3., 0.6}; - CeedScalar xr[] = {0., 0.5, 1., 0., 0.5, 0., 0., 0., 0., 0.5, 0.5, 1.}; + Ceed ceed; + CeedVector In, Out; + const CeedInt P = 6, Q = 4, dim = 2; + CeedBasis b; + CeedScalar q_ref[dim * Q], q_weight[Q]; + CeedScalar interp[P * Q], grad[dim * P * Q]; + CeedScalar xq[] = {0.2, 0.6, 1. / 3., 0.2, 0.2, 0.2, 1. 
/ 3., 0.6}; + CeedScalar xr[] = {0., 0.5, 1., 0., 0.5, 0., 0., 0., 0., 0.5, 0.5, 1.}; const CeedScalar *out; - CeedScalar in[P], value; + CeedScalar in[P], value; buildmats(q_ref, q_weight, interp, grad); CeedInit(argv[1], &ceed); - CeedBasisCreateH1(ceed, CEED_TOPOLOGY_TRIANGLE, 1, P, Q, interp, grad, q_ref, - q_weight, &b); + CeedBasisCreateH1(ceed, CEED_TOPOLOGY_TRIANGLE, 1, P, Q, interp, grad, q_ref, q_weight, &b); // Interpolate function to quadrature points - for (int i=0; i 100.*CEED_EPSILON) - // LCOV_EXCL_START - printf("[%" CeedInt_FMT "] %f != %f\n", i, out[0*Q+i], value); - // LCOV_EXCL_STOP - value = dfeval(xq[1*Q+i], xq[0*Q+i]); - if (fabs(out[1*Q+i] - value) > 100.*CEED_EPSILON) - // LCOV_EXCL_START - printf("[%" CeedInt_FMT "] %f != %f\n", i, out[1*Q+i], value); - // LCOV_EXCL_STOP + for (int i = 0; i < Q; i++) { + value = dfeval(xq[0 * Q + i], xq[1 * Q + i]); + if (fabs(out[0 * Q + i] - value) > 100. * CEED_EPSILON) printf("[%" CeedInt_FMT "] %f != %f\n", i, out[0 * Q + i], value); + value = dfeval(xq[1 * Q + i], xq[0 * Q + i]); + if (fabs(out[1 * Q + i] - value) > 100. 
* CEED_EPSILON) printf("[%" CeedInt_FMT "] %f != %f\n", i, out[1 * Q + i], value); } CeedVectorRestoreArrayRead(Out, &out); diff --git a/tests/t324-basis.c b/tests/t324-basis.c index 857664df60..4b034d0892 100644 --- a/tests/t324-basis.c +++ b/tests/t324-basis.c @@ -3,33 +3,33 @@ /// \test Test grad transposewith a 2D Simplex non-tensor H1 basis #include #include + #include "t320-basis.h" int main(int argc, char **argv) { - Ceed ceed; - CeedVector In, Out; - const CeedInt P = 6, Q = 4, dim = 2; - CeedBasis b; - CeedScalar q_ref[dim*Q], q_weight[Q]; - CeedScalar interp[P*Q], grad[dim*P*Q]; + Ceed ceed; + CeedVector In, Out; + const CeedInt P = 6, Q = 4, dim = 2; + CeedBasis b; + CeedScalar q_ref[dim * Q], q_weight[Q]; + CeedScalar interp[P * Q], grad[dim * P * Q]; const CeedScalar *out; - CeedScalar colsum[P]; + CeedScalar colsum[P]; buildmats(q_ref, q_weight, interp, grad); CeedInit(argv[1], &ceed); - for (int i=0; i 100.*CEED_EPSILON) - // LCOV_EXCL_START - printf("[%" CeedInt_FMT "] %f != %f\n", i, out[i], colsum[i]); - // LCOV_EXCL_STOP + for (int i = 0; i < P; i++) { + if (fabs(colsum[i] - out[i]) > 100. 
* CEED_EPSILON) printf("[%" CeedInt_FMT "] %f != %f\n", i, out[i], colsum[i]); + } CeedVectorRestoreArrayRead(Out, &out); CeedVectorDestroy(&In); diff --git a/tests/t325-basis.c b/tests/t325-basis.c index 3045c96893..3b5e4a21f8 100644 --- a/tests/t325-basis.c +++ b/tests/t325-basis.c @@ -3,52 +3,52 @@ /// \test Test grad transposewith a 2D Simplex non-tensor H1 basis #include #include + #include "t320-basis.h" int main(int argc, char **argv) { - Ceed ceed; - CeedVector In, Out; - const CeedInt P = 6, Q = 4, dim = 2, num_comp = 3; - CeedBasis b; - CeedScalar q_ref[dim*Q], q_weight[Q]; - CeedScalar interp[P*Q], grad[dim*P*Q]; + Ceed ceed; + CeedVector In, Out; + const CeedInt P = 6, Q = 4, dim = 2, num_comp = 3; + CeedBasis b; + CeedScalar q_ref[dim * Q], q_weight[Q]; + CeedScalar interp[P * Q], grad[dim * P * Q]; const CeedScalar *out; - CeedScalar colsum[P], *in; + CeedScalar colsum[P], *in; buildmats(q_ref, q_weight, interp, grad); CeedInit(argv[1], &ceed); - for (int i=0; i 100.*CEED_EPSILON) - // LCOV_EXCL_START - printf("[%" CeedInt_FMT "] %f != %f\n", p, out[p+n*P], n*colsum[p]); - // LCOV_EXCL_STOP + for (int p = 0; p < P; p++) { + for (int n = 0; n < num_comp; n++) { + if (fabs(n * colsum[p] - out[p + n * P]) > 100. * CEED_EPSILON) printf("[%" CeedInt_FMT "] %f != %f\n", p, out[p + n * P], n * colsum[p]); + } + } CeedVectorRestoreArrayRead(Out, &out); CeedVectorDestroy(&In); diff --git a/tests/t330-basis.c b/tests/t330-basis.c index df41a7c67b..32d20b5e5c 100644 --- a/tests/t330-basis.c +++ b/tests/t330-basis.c @@ -1,30 +1,29 @@ /// @file /// Test creation and destruction of a 2D Quad non-tensor Hdiv basis /// \test Test creation and distruction of a 2D Quad non-tensor Hdiv basis -#include #include "t330-basis.h" +#include + int main(int argc, char **argv) { - Ceed ceed; - const CeedInt Q = 3, dim = 2, num_qpts = Q*Q, elem_nodes = 4; - CeedInt num_comp = 1; - CeedInt P = dim*elem_nodes; // dof per element! 
dof is vector in H(div) - CeedBasis b; - CeedScalar q_ref[dim*num_qpts], q_weights[num_qpts]; - CeedScalar interp[dim*P*num_qpts], div[P*num_qpts]; + Ceed ceed; + const CeedInt Q = 3, dim = 2, num_qpts = Q * Q, elem_nodes = 4; + CeedInt num_comp = 1; + CeedInt P = dim * elem_nodes; // dof per element! dof is vector in H(div) + CeedBasis b; + CeedScalar q_ref[dim * num_qpts], q_weights[num_qpts]; + CeedScalar interp[dim * P * num_qpts], div[P * num_qpts]; CeedInit(argv[1], &ceed); // Test skipped if using single precision if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) // LCOV_EXCL_START - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, - "Test not implemented in single precision"); + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Test not implemented in single precision"); // LCOV_EXCL_STOP HdivBasisQuad(Q, q_ref, q_weights, interp, div, CEED_GAUSS); - CeedBasisCreateHdiv(ceed, CEED_TOPOLOGY_QUAD, num_comp, P, num_qpts, interp, - div, q_ref, q_weights, &b); + CeedBasisCreateHdiv(ceed, CEED_TOPOLOGY_QUAD, num_comp, P, num_qpts, interp, div, q_ref, q_weights, &b); // interp[0]--.interp[num_qpts-1] ==> basis in x-direction // interp[num_qpts]--.interp[dim*num_qpts-1] ==> basis in y-direction CeedBasisView(b, stdout); diff --git a/tests/t330-basis.h b/tests/t330-basis.h index 2c0e6484d6..95c9c0df63 100644 --- a/tests/t330-basis.h +++ b/tests/t330-basis.h @@ -5,6 +5,8 @@ // // This file is part of CEED: http://github.com/ceed +#include + // Hdiv basis for quadrilateral linear BDMelement in 2D // Local numbering is as follow (each edge has 2 vector dof) // b4 b5 @@ -20,59 +22,57 @@ int NodalHdivBasisQuad(CeedScalar *X, CeedScalar *Bx, CeedScalar *By) { CeedScalar x_hat = X[0]; CeedScalar y_hat = X[1]; - Bx[0] = -0.125 + 0.125*x_hat*x_hat; - By[0] = -0.25 + 0.25*x_hat + 0.25*y_hat + -0.25*x_hat*y_hat; - Bx[1] = 0.125 + -0.125*x_hat*x_hat; - By[1] = -0.25 + -0.25*x_hat + 0.25*y_hat + 0.25*x_hat*y_hat; - Bx[2] = 0.25 + 0.25*x_hat + -0.25*y_hat + -0.25*x_hat*y_hat; - By[2] = 
-0.125 + 0.125*y_hat*y_hat; - Bx[3] = 0.25 + 0.25*x_hat + 0.25*y_hat + 0.25*x_hat*y_hat; - By[3] = 0.125 + -0.125*y_hat*y_hat; - Bx[4] = -0.125 + 0.125*x_hat*x_hat; - By[4] = 0.25 + -0.25*x_hat + 0.25*y_hat + -0.25*x_hat*y_hat; - Bx[5] = 0.125 + -0.125*x_hat*x_hat; - By[5] = 0.25 + 0.25*x_hat + 0.25*y_hat + 0.25*x_hat*y_hat; - Bx[6] = -0.25 + 0.25*x_hat + 0.25*y_hat + -0.25*x_hat*y_hat; - By[6] = -0.125 + 0.125*y_hat*y_hat; - Bx[7] = -0.25 + 0.25*x_hat + -0.25*y_hat + 0.25*x_hat*y_hat; - By[7] = 0.125 + -0.125*y_hat*y_hat; + Bx[0] = -0.125 + 0.125 * x_hat * x_hat; + By[0] = -0.25 + 0.25 * x_hat + 0.25 * y_hat + -0.25 * x_hat * y_hat; + Bx[1] = 0.125 + -0.125 * x_hat * x_hat; + By[1] = -0.25 + -0.25 * x_hat + 0.25 * y_hat + 0.25 * x_hat * y_hat; + Bx[2] = 0.25 + 0.25 * x_hat + -0.25 * y_hat + -0.25 * x_hat * y_hat; + By[2] = -0.125 + 0.125 * y_hat * y_hat; + Bx[3] = 0.25 + 0.25 * x_hat + 0.25 * y_hat + 0.25 * x_hat * y_hat; + By[3] = 0.125 + -0.125 * y_hat * y_hat; + Bx[4] = -0.125 + 0.125 * x_hat * x_hat; + By[4] = 0.25 + -0.25 * x_hat + 0.25 * y_hat + -0.25 * x_hat * y_hat; + Bx[5] = 0.125 + -0.125 * x_hat * x_hat; + By[5] = 0.25 + 0.25 * x_hat + 0.25 * y_hat + 0.25 * x_hat * y_hat; + Bx[6] = -0.25 + 0.25 * x_hat + 0.25 * y_hat + -0.25 * x_hat * y_hat; + By[6] = -0.125 + 0.125 * y_hat * y_hat; + Bx[7] = -0.25 + 0.25 * x_hat + -0.25 * y_hat + 0.25 * x_hat * y_hat; + By[7] = 0.125 + -0.125 * y_hat * y_hat; return 0; } -static void HdivBasisQuad(CeedInt Q, CeedScalar *q_ref, CeedScalar *q_weights, - CeedScalar *interp, CeedScalar *div, CeedQuadMode quad_mode) { - +static void HdivBasisQuad(CeedInt Q, CeedScalar *q_ref, CeedScalar *q_weights, CeedScalar *interp, CeedScalar *div, CeedQuadMode quad_mode) { // Get 1D quadrature on [-1,1] CeedScalar q_ref_1d[Q], q_weight_1d[Q]; switch (quad_mode) { - case CEED_GAUSS: - CeedGaussQuadrature(Q, q_ref_1d, q_weight_1d); - break; - // LCOV_EXCL_START - case CEED_GAUSS_LOBATTO: - CeedLobattoQuadrature(Q, q_ref_1d, q_weight_1d); 
- break; + case CEED_GAUSS: + CeedGaussQuadrature(Q, q_ref_1d, q_weight_1d); + break; + // LCOV_EXCL_START + case CEED_GAUSS_LOBATTO: + CeedLobattoQuadrature(Q, q_ref_1d, q_weight_1d); + break; } // LCOV_EXCL_STOP // Divergence operator; Divergence of nodal basis for ref element - CeedScalar D[8] = {0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25}; + CeedScalar D[8] = {0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25}; // Loop over quadrature points CeedScalar Bx[8], By[8]; CeedScalar X[2]; - for (CeedInt i=0; i #include + #include "t330-basis.h" int main(int argc, char **argv) { - Ceed ceed; - const CeedInt num_nodes = 4, Q = 3, dim = 2, num_qpts = Q*Q; - CeedInt num_comp = 1; // one vector componenet - CeedInt P = dim*num_nodes; // dof per element! - CeedBasis b; - CeedScalar q_ref[dim*num_qpts], q_weights[num_qpts]; - CeedScalar div[P*num_qpts], interp[P*dim*num_qpts]; - CeedVector X, Y; + Ceed ceed; + const CeedInt num_nodes = 4, Q = 3, dim = 2, num_qpts = Q * Q; + CeedInt num_comp = 1; // one vector componenet + CeedInt P = dim * num_nodes; // dof per element! + CeedBasis b; + CeedScalar q_ref[dim * num_qpts], q_weights[num_qpts]; + CeedScalar div[P * num_qpts], interp[P * dim * num_qpts]; + CeedVector X, Y; const CeedScalar *y, *x; CeedInit(argv[1], &ceed); HdivBasisQuad(Q, q_ref, q_weights, interp, div, CEED_GAUSS); - CeedBasisCreateHdiv(ceed, CEED_TOPOLOGY_QUAD, num_comp, P, num_qpts, interp, - div, q_ref, q_weights, &b); + CeedBasisCreateHdiv(ceed, CEED_TOPOLOGY_QUAD, num_comp, P, num_qpts, interp, div, q_ref, q_weights, &b); // Test GetInterp for H(div) const CeedScalar *interp2; CeedBasisGetInterp(b, &interp2); - for (CeedInt i=0; i 100.*CEED_EPSILON) + for (CeedInt i = 0; i < P * dim * num_qpts; i++) { + if (fabs(interp[i] - interp2[i]) > 100. 
* CEED_EPSILON) { // LCOV_EXCL_START printf("%f != %f\n", interp[i], interp2[i]); - // LCOV_EXCL_STOP + // LCOV_EXCL_STOP + } } CeedVectorCreate(ceed, P, &X); CeedVectorSetValue(X, 1.0); - CeedVectorCreate(ceed, num_qpts*dim, &Y); + CeedVectorCreate(ceed, num_qpts * dim, &Y); CeedVectorSetValue(Y, 0.); // BasisApply for H(div): CEED_EVAL_INTERP, NOTRANSPOSE case CeedBasisApply(b, 1, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, X, Y); CeedVectorGetArrayRead(Y, CEED_MEM_HOST, &y); - for (CeedInt i=0; i 100.*CEED_EPSILON) + for (CeedInt i = 0; i < dim * num_qpts; i++) { + if (fabs(q_ref[i] - y[i]) > 100. * CEED_EPSILON) { // LCOV_EXCL_START printf("%f != %f\n", q_ref[i], y[i]); - // LCOV_EXCL_STOP + // LCOV_EXCL_STOP + } } CeedVectorRestoreArrayRead(Y, &y); @@ -55,13 +57,10 @@ int main(int argc, char **argv) { CeedVectorGetArrayRead(X, CEED_MEM_HOST, &x); CeedScalar sum = 0.; - for (CeedInt i = 0; i 100.*CEED_EPSILON) - // LCOV_EXCL_START - printf("sum of array %f != %f\n", sum, 0.0); - // LCOV_EXCL_STOP + if (fabs(sum) > 100. * CEED_EPSILON) printf("sum of array %f != %f\n", sum, 0.0); CeedVectorRestoreArrayRead(X, &x); CeedBasisDestroy(&b); diff --git a/tests/t332-basis.c b/tests/t332-basis.c index 525d39845b..daccafc0d5 100644 --- a/tests/t332-basis.c +++ b/tests/t332-basis.c @@ -3,32 +3,29 @@ /// \test Test GetDiv and BasisApply for a 2D Quad non-tensor H(div) basis #include #include + #include "t330-basis.h" int main(int argc, char **argv) { - Ceed ceed; - const CeedInt num_nodes = 4, Q = 3, dim = 2, num_qpts = Q*Q; - CeedInt num_comp = 1; // one vector componenet - CeedInt P = dim*num_nodes; // dof per element! - CeedBasis b; - CeedScalar q_ref[dim*num_qpts], q_weights[num_qpts]; - CeedScalar div[P*num_qpts], interp[P*dim*num_qpts]; - CeedVector X, Y; + Ceed ceed; + const CeedInt num_nodes = 4, Q = 3, dim = 2, num_qpts = Q * Q; + CeedInt num_comp = 1; // one vector componenet + CeedInt P = dim * num_nodes; // dof per element! 
+ CeedBasis b; + CeedScalar q_ref[dim * num_qpts], q_weights[num_qpts]; + CeedScalar div[P * num_qpts], interp[P * dim * num_qpts]; + CeedVector X, Y; const CeedScalar *y, *x; CeedInit(argv[1], &ceed); HdivBasisQuad(Q, q_ref, q_weights, interp, div, CEED_GAUSS); - CeedBasisCreateHdiv(ceed, CEED_TOPOLOGY_QUAD, num_comp, P, num_qpts, interp, - div, q_ref, q_weights, &b); + CeedBasisCreateHdiv(ceed, CEED_TOPOLOGY_QUAD, num_comp, P, num_qpts, interp, div, q_ref, q_weights, &b); // Test GetDiv const CeedScalar *div2; CeedBasisGetDiv(b, &div2); - for (CeedInt i=0; i 100.*CEED_EPSILON) - // LCOV_EXCL_START - printf("%f != %f\n", div[i], div2[i]); - // LCOV_EXCL_STOP + for (CeedInt i = 0; i < P * num_qpts; i++) { + if (fabs(div[i] - div2[i]) > 100. * CEED_EPSILON) printf("%f != %f\n", div[i], div2[i]); } CeedVectorCreate(ceed, P, &X); CeedVectorSetValue(X, 1); @@ -38,11 +35,8 @@ int main(int argc, char **argv) { CeedBasisApply(b, 1, CEED_NOTRANSPOSE, CEED_EVAL_DIV, X, Y); CeedVectorGetArrayRead(Y, CEED_MEM_HOST, &y); - for (CeedInt i=0; i 100.*CEED_EPSILON) - // LCOV_EXCL_START - printf("%f != %f\n", 2.0, y[i]); - // LCOV_EXCL_STOP + for (CeedInt i = 0; i < num_qpts; i++) { + if (fabs(P * 0.25 - y[i]) > 100. * CEED_EPSILON) printf("%f != %f\n", 2.0, y[i]); } CeedVectorRestoreArrayRead(Y, &y); @@ -52,11 +46,8 @@ int main(int argc, char **argv) { CeedBasisApply(b, 1, CEED_TRANSPOSE, CEED_EVAL_DIV, Y, X); CeedVectorGetArrayRead(X, CEED_MEM_HOST, &x); - for (CeedInt i=0; i 100.*CEED_EPSILON) - // LCOV_EXCL_START - printf("%f != %f\n", 2.0, x[i]); - // LCOV_EXCL_STOP + for (CeedInt i = 0; i < P; i++) { + if (fabs(num_qpts * 0.25 - x[i]) > 100. 
* CEED_EPSILON) printf("%f != %f\n", 2.0, x[i]); } CeedVectorRestoreArrayRead(X, &x); diff --git a/tests/t400-qfunction.c b/tests/t400-qfunction.c index c7d0b05605..d01fe678ac 100644 --- a/tests/t400-qfunction.c +++ b/tests/t400-qfunction.c @@ -1,17 +1,18 @@ /// @file /// Test creation, evaluation, and destruction for QFunction /// \test Test creation, evaluation, and destruction for QFunction -#include #include "t400-qfunction.h" +#include + int main(int argc, char **argv) { - Ceed ceed; - CeedVector in[16], out[16]; - CeedVector Q_data, W, U, V; - CeedQFunction qf_setup, qf_mass; - CeedInt Q = 8; + Ceed ceed; + CeedVector in[16], out[16]; + CeedVector Q_data, W, U, V; + CeedQFunction qf_setup, qf_mass; + CeedInt Q = 8; const CeedScalar *vv; - CeedScalar w[Q], u[Q], v[Q]; + CeedScalar w[Q], u[Q], v[Q]; CeedInit(argv[1], &ceed); @@ -24,11 +25,11 @@ int main(int argc, char **argv) { CeedQFunctionAddInput(qf_mass, "u", 1, CEED_EVAL_INTERP); CeedQFunctionAddOutput(qf_mass, "v", 1, CEED_EVAL_INTERP); - for (CeedInt i=0; i -CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) { - const CeedScalar *w = in[0]; - CeedScalar *q_data = out[0]; - for (CeedInt i=0; i #include -#include "t401-qfunction.h" - int main(int argc, char **argv) { - Ceed ceed; - CeedVector in[16], out[16]; - CeedVector Q_data, W, U, V; - CeedQFunction qf_setup, qf_mass; + Ceed ceed; + CeedVector in[16], out[16]; + CeedVector Q_data, W, U, V; + CeedQFunction qf_setup, qf_mass; CeedQFunctionContext ctx; - CeedInt Q = 8; - const CeedScalar *vv; - CeedScalar w[Q], u[Q], v[Q], ctx_data[5] = {1, 2, 3, 4, 5}; + CeedInt Q = 8; + const CeedScalar *vv; + CeedScalar w[Q], u[Q], v[Q], ctx_data[5] = {1, 2, 3, 4, 5}; CeedInit(argv[1], &ceed); @@ -28,15 +28,14 @@ int main(int argc, char **argv) { CeedQFunctionAddOutput(qf_mass, "v", 1, CEED_EVAL_INTERP); CeedQFunctionContextCreate(ceed, &ctx); - CeedQFunctionContextSetData(ctx, CEED_MEM_HOST, CEED_USE_POINTER, - 
sizeof(ctx_data), &ctx_data); + CeedQFunctionContextSetData(ctx, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(ctx_data), &ctx_data); CeedQFunctionSetContext(qf_mass, ctx); - for (CeedInt i=0; i 100.*CEED_EPSILON) - // LCOV_EXCL_START - printf("[%" CeedInt_FMT "] v %f != vv %f\n",i, v[i], vv[i]); - // LCOV_EXCL_STOP + for (CeedInt i = 0; i < Q; i++) { + if (fabs(ctx_data[4] * v[i] - vv[i]) > 100. * CEED_EPSILON) printf("[%" CeedInt_FMT "] v %f != vv %f\n", i, v[i], vv[i]); + } CeedVectorRestoreArrayRead(V, &vv); CeedVectorDestroy(&W); diff --git a/tests/t401-qfunction.h b/tests/t401-qfunction.h index b14b4a31d4..03d24706d2 100644 --- a/tests/t401-qfunction.h +++ b/tests/t401-qfunction.h @@ -7,23 +7,20 @@ #include -CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) { - const CeedScalar *w = in[0]; - CeedScalar *q_data = out[0]; - for (CeedInt i=0; i + #include "t400-qfunction.h" int main(int argc, char **argv) { - Ceed ceed; - CeedQFunction qf_setup, qf_mass; + Ceed ceed; + CeedQFunction qf_setup, qf_mass; CeedQFunctionContext ctx; CeedInit(argv[1], &ceed); @@ -26,12 +27,10 @@ int main(int argc, char **argv) { CeedQFunctionContextCreate(ceed, &ctx); if (CEED_SCALAR_TYPE == CEED_SCALAR_FP64) { CeedScalar ctxData[5] = {1, 2, 3, 4, 5}; - CeedQFunctionContextSetData(ctx, CEED_MEM_HOST, CEED_COPY_VALUES, - sizeof(ctxData), &ctxData); - } else { // Make context twice as long so the size is the same in output + CeedQFunctionContextSetData(ctx, CEED_MEM_HOST, CEED_COPY_VALUES, sizeof(ctxData), &ctxData); + } else { // Make context twice as long so the size is the same in output CeedScalar ctxData[10] = {1, 2, 3, 4, 5, 1, 2, 3, 4, 5}; - CeedQFunctionContextSetData(ctx, CEED_MEM_HOST, CEED_COPY_VALUES, - sizeof(ctxData), &ctxData); + CeedQFunctionContextSetData(ctx, CEED_MEM_HOST, CEED_COPY_VALUES, sizeof(ctxData), &ctxData); } CeedQFunctionContextView(ctx, stdout); diff --git a/tests/t403-qfunction.c b/tests/t403-qfunction.c 
index 6a92e20c0b..ca97ddf8d6 100644 --- a/tests/t403-qfunction.c +++ b/tests/t403-qfunction.c @@ -2,11 +2,12 @@ /// Test creation, copying, and destruction for QFunction and QFunctionContext /// \test Test creation, copying, and destruction for QFunction and QFunctionContext #include + #include "t400-qfunction.h" int main(int argc, char **argv) { - Ceed ceed; - CeedQFunction qf, qf_2; + Ceed ceed; + CeedQFunction qf, qf_2; CeedQFunctionContext ctx, ctx_2; CeedInit(argv[1], &ceed); @@ -14,20 +15,14 @@ int main(int argc, char **argv) { CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf); CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_2); - CeedQFunctionReferenceCopy(qf, &qf_2); // This destroys the previous qf_2 - if (qf != qf_2) - // LCOV_EXCL_START - printf("Error copying CeedQFunction reference\n"); - // LCOV_EXCL_STOP + CeedQFunctionReferenceCopy(qf, &qf_2); // This destroys the previous qf_2 + if (qf != qf_2) printf("Error copying CeedQFunction reference\n"); CeedQFunctionContextCreate(ceed, &ctx); CeedQFunctionContextCreate(ceed, &ctx_2); CeedQFunctionContextReferenceCopy(ctx, &ctx_2); - if (ctx != ctx_2) - // LCOV_EXCL_START - printf("Error copying CeedQFunctionContext reference\n"); - // LCOV_EXCL_STOP + if (ctx != ctx_2) printf("Error copying CeedQFunctionContext reference\n"); CeedQFunctionDestroy(&qf); CeedQFunctionDestroy(&qf_2); diff --git a/tests/t404-qfunction.c b/tests/t404-qfunction.c index 49978def0f..54d5d4a52a 100644 --- a/tests/t404-qfunction.c +++ b/tests/t404-qfunction.c @@ -5,44 +5,33 @@ #include int main(int argc, char **argv) { - Ceed ceed; + Ceed ceed; CeedQFunctionContext ctx; - CeedScalar ctxData[5] = {1, 2, 3, 4, 5}, *ctxDataCopy; + CeedScalar ctxData[5] = {1, 2, 3, 4, 5}, *ctxDataCopy; CeedInit(argv[1], &ceed); // Set borrowed pointer CeedQFunctionContextCreate(ceed, &ctx); - CeedQFunctionContextSetData(ctx, CEED_MEM_HOST, CEED_USE_POINTER, - sizeof(ctxData), &ctxData); + CeedQFunctionContextSetData(ctx, CEED_MEM_HOST, 
CEED_USE_POINTER, sizeof(ctxData), &ctxData); // Update borrowed pointer CeedQFunctionContextGetData(ctx, CEED_MEM_HOST, &ctxDataCopy); ctxDataCopy[4] = 6; CeedQFunctionContextRestoreData(ctx, &ctxDataCopy); - if (ctxData[4] != 6) - // LCOV_EXCL_START - printf("error modifying data: %f != 6.0\n", ctxData[4]); - // LCOV_EXCL_STOP + if (ctxData[4] != 6) printf("error modifying data: %f != 6.0\n", ctxData[4]); // Take back borrowed pointer CeedQFunctionContextTakeData(ctx, CEED_MEM_HOST, &ctxDataCopy); - if (ctxDataCopy[4] != 6) - // LCOV_EXCL_START - printf("error accessing borrowed data: %f != 6.0\n", ctxDataCopy[4]); - // LCOV_EXCL_STOP + if (ctxDataCopy[4] != 6) printf("error accessing borrowed data: %f != 6.0\n", ctxDataCopy[4]); // Set copied data ctxData[4] = 6; - CeedQFunctionContextSetData(ctx, CEED_MEM_HOST, CEED_COPY_VALUES, - sizeof(ctxData), &ctxData); + CeedQFunctionContextSetData(ctx, CEED_MEM_HOST, CEED_COPY_VALUES, sizeof(ctxData), &ctxData); // Check copied data CeedQFunctionContextGetData(ctx, CEED_MEM_HOST, &ctxDataCopy); - if (ctxDataCopy[4] != 6) - // LCOV_EXCL_START - printf("error accessing copied data: %f != 6.0\n", ctxDataCopy[4]); - // LCOV_EXCL_STOP + if (ctxDataCopy[4] != 6) printf("error accessing copied data: %f != 6.0\n", ctxDataCopy[4]); CeedQFunctionContextRestoreData(ctx, &ctxDataCopy); CeedQFunctionContextDestroy(&ctx); diff --git a/tests/t405-qfunction.c b/tests/t405-qfunction.c index 8bb419868a..525f37fd52 100644 --- a/tests/t405-qfunction.c +++ b/tests/t405-qfunction.c @@ -1,17 +1,18 @@ /// @file /// Test QFunction helper macro /// \test Test QFunction helper macro -#include #include "t405-qfunction.h" +#include + int main(int argc, char **argv) { - Ceed ceed; - CeedVector in[16], out[16]; - CeedVector Q_data, W, U, V; - CeedQFunction qf_setup, qf_mass; - CeedInt Q = 8; + Ceed ceed; + CeedVector in[16], out[16]; + CeedVector Q_data, W, U, V; + CeedQFunction qf_setup, qf_mass; + CeedInt Q = 8; const CeedScalar *vv; - CeedScalar 
w[Q], u[Q], v[Q]; + CeedScalar w[Q], u[Q], v[Q]; CeedInit(argv[1], &ceed); @@ -24,11 +25,11 @@ int main(int argc, char **argv) { CeedQFunctionAddInput(qf_mass, "u", 1, CEED_EVAL_INTERP); CeedQFunctionAddOutput(qf_mass, "v", 1, CEED_EVAL_INTERP); - for (CeedInt i=0; i -CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) { - const CeedScalar *w = in[0]; - CeedScalar *q_data = out[0]; - for (CeedInt i=0; i + #include "t406-qfunction-scales.h" -CEED_QFUNCTION_HELPER CeedScalar times_two(CeedScalar x) { - return SCALE_TWO * x; -} +CEED_QFUNCTION_HELPER CeedScalar times_two(CeedScalar x) { return SCALE_TWO * x; } -CEED_QFUNCTION_HELPER CeedScalar times_three(CeedScalar x) { - return SCALE_THREE * x; -} +CEED_QFUNCTION_HELPER CeedScalar times_three(CeedScalar x) { return SCALE_THREE * x; } #endif diff --git a/tests/t406-qfunction.c b/tests/t406-qfunction.c index b1402d6fe3..2d4b43ea1d 100644 --- a/tests/t406-qfunction.c +++ b/tests/t406-qfunction.c @@ -1,20 +1,21 @@ /// @file /// Test QFunction helper macro /// \test Test QFunction helper macro +#include "t406-qfunction.h" + #include #include #include #include -#include "t406-qfunction.h" int main(int argc, char **argv) { - Ceed ceed; - CeedVector in[16], out[16]; - CeedVector Q_data, W, U, V; - CeedQFunction qf_setup, qf_mass; - CeedInt Q = 8; + Ceed ceed; + CeedVector in[16], out[16]; + CeedVector Q_data, W, U, V; + CeedQFunction qf_setup, qf_mass; + CeedInt Q = 8; const CeedScalar *vv; - CeedScalar w[Q], u[Q], v[Q]; + CeedScalar w[Q], u[Q], v[Q]; CeedInit(argv[1], &ceed); @@ -27,11 +28,11 @@ int main(int argc, char **argv) { CeedQFunctionAddInput(qf_mass, "u", 1, CEED_EVAL_INTERP); CeedQFunctionAddOutput(qf_mass, "v", 1, CEED_EVAL_INTERP); - for (CeedInt i=0; i 1E3*CEED_EPSILON) - // LCOV_EXCL_START - printf("[%" CeedInt_FMT "] v %f != vv %f\n",i, 5*v[i]*sqrt(2.), vv[i]); - // LCOV_EXCL_STOP + for (CeedInt i = 0; i < Q; i++) { + if (fabs(5 * v[i] * sqrt(2.) 
- vv[i]) > 1E3 * CEED_EPSILON) printf("[%" CeedInt_FMT "] v %f != vv %f\n", i, 5 * v[i] * sqrt(2.), vv[i]); + } CeedVectorRestoreArrayRead(V, &vv); CeedVectorDestroy(&W); diff --git a/tests/t406-qfunction.h b/tests/t406-qfunction.h index da7dd540f7..c77a684beb 100644 --- a/tests/t406-qfunction.h +++ b/tests/t406-qfunction.h @@ -6,28 +6,26 @@ // This file is part of CEED: http://github.com/ceed // Note: intentionally testing strange spacing in '#include's -#include #include +#include + #include "./t406-qfunction-scales.h" -# include "t406-qfunction-helper.h" +#include "t406-qfunction-helper.h" -CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, - CeedScalar *const *out) { - const CeedScalar *w = in[0]; - CeedScalar *q_data = out[0]; - for (CeedInt i=0; i int main(int argc, char **argv) { - Ceed ceed; + Ceed ceed; CeedQFunctionContext ctx; - CeedScalar ctxData[5] = {1, 2, 3, 4, 5}, *ctxDataCopy; + CeedScalar ctxData[5] = {1, 2, 3, 4, 5}, *ctxDataCopy; CeedInit(argv[1], &ceed); CeedQFunctionContextCreate(ceed, &ctx); - CeedQFunctionContextSetData(ctx, CEED_MEM_HOST, CEED_COPY_VALUES, - sizeof(ctxData), &ctxData); + CeedQFunctionContextSetData(ctx, CEED_MEM_HOST, CEED_COPY_VALUES, sizeof(ctxData), &ctxData); // Get data access CeedQFunctionContextGetDataRead(ctx, CEED_MEM_HOST, &ctxDataCopy); diff --git a/tests/t409-qfunction.c b/tests/t409-qfunction.c index b94b27857e..3e8bbd284d 100644 --- a/tests/t409-qfunction.c +++ b/tests/t409-qfunction.c @@ -1,21 +1,21 @@ /// @file /// Test creation, evaluation, and destruction for QFunction /// \test Test creation, evaluation, and destruction for QFunction +#include "t409-qfunction.h" + #include #include -#include "t409-qfunction.h" - int main(int argc, char **argv) { - Ceed ceed; - CeedVector in[16], out[16]; - CeedVector U, V; - CeedQFunction qf; + Ceed ceed; + CeedVector in[16], out[16]; + CeedVector U, V; + CeedQFunction qf; CeedQFunctionContext ctx; - CeedInt Q = 8; - const CeedScalar *v; - bool 
is_writable = true; - CeedScalar ctx_data[5] = {1, 2, 3, 4, 5}; + CeedInt Q = 8; + const CeedScalar *v; + bool is_writable = true; + CeedScalar ctx_data[5] = {1, 2, 3, 4, 5}; CeedInit(argv[1], &ceed); @@ -24,8 +24,7 @@ int main(int argc, char **argv) { CeedQFunctionAddOutput(qf, "v", 1, CEED_EVAL_INTERP); CeedQFunctionContextCreate(ceed, &ctx); - CeedQFunctionContextSetData(ctx, CEED_MEM_HOST, CEED_COPY_VALUES, - sizeof(ctx_data), &ctx_data); + CeedQFunctionContextSetData(ctx, CEED_MEM_HOST, CEED_COPY_VALUES, sizeof(ctx_data), &ctx_data); CeedQFunctionSetContext(qf, ctx); CeedQFunctionSetContextWritable(qf, is_writable); @@ -35,14 +34,14 @@ int main(int argc, char **argv) { CeedVectorSetValue(V, 0.0); { - in[0] = U; + in[0] = U; out[0] = V; CeedQFunctionApply(qf, Q, in, out); } CeedVectorGetArrayRead(V, CEED_MEM_HOST, &v); - for (CeedInt i=0; i 100.*CEED_EPSILON) + for (CeedInt i = 0; i < Q; i++) + if (fabs(v[i] - ctx_data[1]) > 100. * CEED_EPSILON) // LCOV_EXCL_START printf("v[%" CeedInt_FMT "] %f != 2.0\n", i, v[i]); // LCOV_EXCL_STOP @@ -70,7 +69,7 @@ int main(int argc, char **argv) { is_writable = false; CeedQFunctionSetContextWritable(qf, is_writable); { - in[0] = U; + in[0] = U; out[0] = V; // Will only error in `/cpu/self/memcheck/*` backends CeedQFunctionApply(qf, Q, in, out); diff --git a/tests/t409-qfunction.h b/tests/t409-qfunction.h index 1e4291ff61..ff89ff2c74 100644 --- a/tests/t409-qfunction.h +++ b/tests/t409-qfunction.h @@ -7,13 +7,12 @@ #include -CEED_QFUNCTION(scale)(void *ctx, const CeedInt Q, const CeedScalar *const *in, - CeedScalar *const *out) { - CeedScalar *scale = (CeedScalar *)ctx; - const CeedScalar *u = in[0]; - CeedScalar *v = out[0]; +CEED_QFUNCTION(scale)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + CeedScalar *scale = (CeedScalar *)ctx; + const CeedScalar *u = in[0]; + CeedScalar *v = out[0]; - for (CeedInt i=0; i int main(int argc, char **argv) { - Ceed ceed; - CeedVector in[16], out[16]; - 
CeedVector Q_data, J, W, U, V; - CeedQFunction qf_setup, qf_mass; - CeedInt Q = 8; + Ceed ceed; + CeedVector in[16], out[16]; + CeedVector Q_data, J, W, U, V; + CeedQFunction qf_setup, qf_mass; + CeedInt Q = 8; const CeedScalar *vv; - CeedScalar j[Q], w[Q], u[Q], v[Q]; + CeedScalar j[Q], w[Q], u[Q], v[Q]; CeedInit(argv[1], &ceed); CeedQFunctionCreateInteriorByName(ceed, "Mass1DBuild", &qf_setup); CeedQFunctionCreateInteriorByName(ceed, "MassApply", &qf_mass); - for (CeedInt i=0; i int main(int argc, char **argv) { - Ceed ceed; - CeedVector in[16], out[16]; - CeedVector U, V; - CeedQFunction qf; - CeedInt Q = 8; + Ceed ceed; + CeedVector in[16], out[16]; + CeedVector U, V; + CeedQFunction qf; + CeedInt Q = 8; const CeedScalar *v; - CeedScalar u[Q]; + CeedScalar u[Q]; CeedInit(argv[1], &ceed); CeedQFunctionCreateIdentity(ceed, 1, CEED_EVAL_INTERP, CEED_EVAL_INTERP, &qf); - for (CeedInt i=0; i1e-14) - // LCOV_EXCL_START - printf("[%" CeedInt_FMT "] v %f != u %f\n",i, v[i], u[i]); - // LCOV_EXCL_STOP + for (CeedInt i = 0; i < Q; i++) { + if (fabs(v[i] - u[i]) > 1e-14) printf("[%" CeedInt_FMT "] v %f != u %f\n", i, v[i], u[i]); + } CeedVectorRestoreArrayRead(V, &v); CeedVectorDestroy(&U); diff --git a/tests/t412-qfunction.c b/tests/t412-qfunction.c index 2abaa137d5..898bfe0556 100644 --- a/tests/t412-qfunction.c +++ b/tests/t412-qfunction.c @@ -5,39 +5,35 @@ #include int main(int argc, char **argv) { - Ceed ceed; - CeedVector in[16], out[16]; - CeedVector U, V; - CeedQFunction qf; - CeedInt Q = 8, size = 3; + Ceed ceed; + CeedVector in[16], out[16]; + CeedVector U, V; + CeedQFunction qf; + CeedInt Q = 8, size = 3; const CeedScalar *v; - CeedScalar u[Q*size]; + CeedScalar u[Q * size]; CeedInit(argv[1], &ceed); - CeedQFunctionCreateIdentity(ceed, size, CEED_EVAL_INTERP, CEED_EVAL_NONE, - &qf); + CeedQFunctionCreateIdentity(ceed, size, CEED_EVAL_INTERP, CEED_EVAL_NONE, &qf); - for (CeedInt i=0; i1e-12) - // LCOV_EXCL_START - printf("[%" CeedInt_FMT "] v %f != u %f\n",i, 
v[i], u[i]); - // LCOV_EXCL_STOP + for (CeedInt i = 0; i < Q * size; i++) { + if (fabs(v[i] - u[i]) > 1e-12) printf("[%" CeedInt_FMT "] v %f != u %f\n", i, v[i], u[i]); + } CeedVectorRestoreArrayRead(V, &v); CeedVectorDestroy(&U); diff --git a/tests/t413-qfunction.c b/tests/t413-qfunction.c index c67adfe836..690502ae76 100644 --- a/tests/t413-qfunction.c +++ b/tests/t413-qfunction.c @@ -4,7 +4,7 @@ #include int main(int argc, char **argv) { - Ceed ceed; + Ceed ceed; CeedQFunction qf_setup, qf_mass; CeedInit(argv[1], &ceed); diff --git a/tests/t414-qfunction.c b/tests/t414-qfunction.c index e5a4f9d48d..df50012d39 100644 --- a/tests/t414-qfunction.c +++ b/tests/t414-qfunction.c @@ -6,18 +6,18 @@ #include int main(int argc, char **argv) { - Ceed ceed; - CeedVector in[16], out[16]; - CeedVector Q_data, J, W, U, V; - CeedQFunction qf_setup, qf_mass; - CeedInt Q = 8; - const CeedInt num_comp = 3; + Ceed ceed; + CeedVector in[16], out[16]; + CeedVector Q_data, J, W, U, V; + CeedQFunction qf_setup, qf_mass; + CeedInt Q = 8; + const CeedInt num_comp = 3; const CeedScalar *vv; CeedInit(argv[1], &ceed); for (CeedInt dim = 2; dim <= 3; dim++) { - CeedInt num_qpts = CeedIntPow(Q, dim); + CeedInt num_qpts = CeedIntPow(Q, dim); CeedScalar j[num_qpts * dim * dim], w[num_qpts], u[num_qpts * num_comp]; char name[13] = ""; @@ -25,18 +25,18 @@ int main(int argc, char **argv) { CeedQFunctionCreateInteriorByName(ceed, name, &qf_setup); CeedQFunctionCreateInteriorByName(ceed, "Vector3MassApply", &qf_mass); - for (CeedInt i=0; i 10*CEED_EPSILON) + for (CeedInt i = 0; i < num_qpts; i++) sum += vv[i + c * num_qpts]; + if (fabs(sum - (c + 1)) > 10 * CEED_EPSILON) { // LCOV_EXCL_START - printf("%" CeedInt_FMT "D volume error in component %" CeedInt_FMT - ": %f != %f\n", dim, c, sum, (c + 1.0)); - // LCOV_EXCL_STOP + printf("%" CeedInt_FMT "D volume error in component %" CeedInt_FMT ": %f != %f\n", dim, c, sum, (c + 1.0)); + // LCOV_EXCL_STOP + } } CeedVectorRestoreArrayRead(V, &vv); diff 
--git a/tests/t415-qfunction.c b/tests/t415-qfunction.c index ffec354639..c928880726 100644 --- a/tests/t415-qfunction.c +++ b/tests/t415-qfunction.c @@ -6,41 +6,39 @@ #include int main(int argc, char **argv) { - Ceed ceed; - CeedVector in[16], out[16]; - CeedVector Q_data, J, W, dU, dV; - CeedQFunction qf_setup, qf_diff; - CeedInt Q = 8; - const CeedInt num_comp = 3; + Ceed ceed; + CeedVector in[16], out[16]; + CeedVector Q_data, J, W, dU, dV; + CeedQFunction qf_setup, qf_diff; + CeedInt Q = 8; + const CeedInt num_comp = 3; const CeedScalar *vv; CeedInit(argv[1], &ceed); for (CeedInt dim = 1; dim <= 3; dim++) { - CeedInt num_qpts = CeedIntPow(Q, dim); - CeedScalar j[num_qpts * dim * dim], w[num_qpts], - du[num_qpts * dim * num_comp]; + CeedInt num_qpts = CeedIntPow(Q, dim); + CeedScalar j[num_qpts * dim * dim], w[num_qpts], du[num_qpts * dim * num_comp]; char name_setup[26] = "", name_apply[26] = ""; snprintf(name_setup, sizeof name_setup, "Poisson%" CeedInt_FMT "DBuild", dim); CeedQFunctionCreateInteriorByName(ceed, name_setup, &qf_setup); - snprintf(name_apply, sizeof name_apply, "Vector3Poisson%" CeedInt_FMT "DApply", - dim); + snprintf(name_apply, sizeof name_apply, "Vector3Poisson%" CeedInt_FMT "DApply", dim); CeedQFunctionCreateInteriorByName(ceed, name_apply, &qf_diff); - for (CeedInt i=0; i 10*CEED_EPSILON) + for (CeedInt i = 0; i < num_qpts; i++) { + for (CeedInt g = 0; g < dim; g++) sum += vv[i + (g * num_comp + c) * num_qpts]; + } + if (fabs(sum - dim * (c + 1)) > 10 * CEED_EPSILON) { // LCOV_EXCL_START - printf("%" CeedInt_FMT "D volume error in component %" CeedInt_FMT - ": %f != %f\n", dim, c, sum, dim * (c + 1.0)); - // LCOV_EXCL_STOP + printf("%" CeedInt_FMT "D volume error in component %" CeedInt_FMT ": %f != %f\n", dim, c, sum, dim * (c + 1.0)); + // LCOV_EXCL_STOP + } } CeedVectorRestoreArrayRead(dV, &vv); diff --git a/tests/t500-operator.c b/tests/t500-operator.c index d16c273369..93ccffb199 100644 --- a/tests/t500-operator.c +++ 
b/tests/t500-operator.c @@ -1,58 +1,54 @@ /// @file /// Test creation, action, and destruction for mass matrix operator /// \test Test creation, action, and destruction for mass matrix operator +#include "t500-operator.h" + #include -#include #include - -#include "t500-operator.h" +#include int main(int argc, char **argv) { - Ceed ceed; + Ceed ceed; CeedElemRestriction elem_restr_x, elem_restr_u, elem_restr_qd_i; - CeedBasis basis_x, basis_u; - CeedQFunction qf_setup, qf_mass; - CeedOperator op_setup, op_mass; - CeedVector q_data, X, U, V; - const CeedScalar *hv; - CeedInt num_elem = 15, P = 5, Q = 8; - CeedInt num_nodes_x = num_elem+1, num_nodes_u = num_elem*(P-1)+1; - CeedInt ind_x[num_elem*2], ind_u[num_elem*P]; - CeedScalar x[num_nodes_x]; - -//! [Ceed Init] + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass; + CeedVector q_data, X, U, V; + const CeedScalar *hv; + CeedInt num_elem = 15, P = 5, Q = 8; + CeedInt num_nodes_x = num_elem + 1, num_nodes_u = num_elem * (P - 1) + 1; + CeedInt ind_x[num_elem * 2], ind_u[num_elem * P]; + CeedScalar x[num_nodes_x]; + + //! [Ceed Init] CeedInit(argv[1], &ceed); -//! 
[Ceed Init] - for (CeedInt i=0; i 1e-14) printf("[%" CeedInt_FMT "] v %g != 0.0\n",i, hv[i]); + for (CeedInt i = 0; i < num_nodes_u; i++) { + if (fabs(hv[i]) > 1e-14) printf("[%" CeedInt_FMT "] v %g != 0.0\n", i, hv[i]); + } CeedVectorRestoreArrayRead(V, &hv); CeedQFunctionDestroy(&qf_setup); diff --git a/tests/t500-operator.h b/tests/t500-operator.h index 00c9d52bba..f4c93c6dc5 100644 --- a/tests/t500-operator.h +++ b/tests/t500-operator.h @@ -7,22 +7,19 @@ #include -CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) { +CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { const CeedScalar *weight = in[0], *dxdX = in[1]; - CeedScalar *rho = out[0]; - for (CeedInt i=0; i -#include #include +#include #include "t500-operator.h" int main(int argc, char **argv) { - Ceed ceed; + Ceed ceed; CeedElemRestriction elem_restr_x, elem_restr_u, elem_restr_qd_i; - CeedBasis basis_x, basis_u; - CeedQFunction qf_setup, qf_mass; - CeedOperator op_setup, op_mass; - CeedVector q_data, X, U, V; - const CeedScalar *hv; - CeedInt num_elem = 15, P = 5, Q = 8; - CeedInt num_nodes_x = num_elem+1, num_nodes_u = num_elem*(P-1)+1; - CeedInt ind_x[num_elem*2], ind_u[num_elem*P]; - CeedScalar x[num_nodes_x]; - CeedScalar sum; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass; + CeedVector q_data, X, U, V; + const CeedScalar *hv; + CeedInt num_elem = 15, P = 5, Q = 8; + CeedInt num_nodes_x = num_elem + 1, num_nodes_u = num_elem * (P - 1) + 1; + CeedInt ind_x[num_elem * 2], ind_u[num_elem * P]; + CeedScalar x[num_nodes_x]; + CeedScalar sum; CeedInit(argv[1], &ceed); - for (CeedInt i=0; i1000.*CEED_EPSILON) - // LCOV_EXCL_START - printf("Computed Area: %f != True Area: 1.0\n", sum); - // LCOV_EXCL_STOP + for (CeedInt i = 0; i < num_nodes_u; i++) sum += hv[i]; + if (fabs(sum - 1.) > 1000. 
* CEED_EPSILON) printf("Computed Area: %f != True Area: 1.0\n", sum); CeedVectorRestoreArrayRead(V, &hv); CeedQFunctionDestroy(&qf_setup); diff --git a/tests/t502-operator.c b/tests/t502-operator.c index 5cc4ff5e38..8d0d8602b3 100644 --- a/tests/t502-operator.c +++ b/tests/t502-operator.c @@ -1,49 +1,45 @@ /// @file /// Test creation, action, and destruction for mass matrix operator with multiple components /// \test Test creation, action, and destruction for mass matrix operator with multiple components +#include "t502-operator.h" + #include -#include #include - -#include "t502-operator.h" +#include int main(int argc, char **argv) { - Ceed ceed; + Ceed ceed; CeedElemRestriction elem_restr_x, elem_restr_u, elem_restr_qd_i; - CeedBasis basis_x, basis_u; - CeedQFunction qf_setup, qf_mass; - CeedOperator op_setup, op_mass; - CeedVector q_data, X, U, V; - CeedScalar *hu; - const CeedScalar *hv; - CeedInt num_elem = 15, P = 5, Q = 8; - CeedInt num_nodes_x = num_elem+1, num_nodes_u = num_elem*(P-1)+1; - CeedInt ind_x[num_elem*2], ind_u[num_elem*P]; - CeedScalar x[num_nodes_x]; - CeedScalar sum_1, sum_2; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass; + CeedVector q_data, X, U, V; + CeedScalar *hu; + const CeedScalar *hv; + CeedInt num_elem = 15, P = 5, Q = 8; + CeedInt num_nodes_x = num_elem + 1, num_nodes_u = num_elem * (P - 1) + 1; + CeedInt ind_x[num_elem * 2], ind_u[num_elem * P]; + CeedScalar x[num_nodes_x]; + CeedScalar sum_1, sum_2; CeedInit(argv[1], &ceed); - for (CeedInt i=0; i1000.*CEED_EPSILON) - // LCOV_EXCL_START - printf("Computed Area: %f != True Area: 1.0\n", sum_1); - // LCOV_EXCL_STOP - if (fabs(sum_2-2.)>1000.*CEED_EPSILON) - // LCOV_EXCL_START - printf("Computed Area: %f != True Area: 2.0\n", sum_2); - // LCOV_EXCL_STOP + if (fabs(sum_1 - 1.) > 1000. * CEED_EPSILON) printf("Computed Area: %f != True Area: 1.0\n", sum_1); + if (fabs(sum_2 - 2.) > 1000. 
* CEED_EPSILON) printf("Computed Area: %f != True Area: 2.0\n", sum_2); CeedVectorRestoreArrayRead(V, &hv); CeedQFunctionDestroy(&qf_setup); diff --git a/tests/t502-operator.h b/tests/t502-operator.h index fac104dd2a..ba061b9a2c 100644 --- a/tests/t502-operator.h +++ b/tests/t502-operator.h @@ -7,24 +7,21 @@ #include -CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) { +CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { const CeedScalar *weight = in[0], *dxdX = in[1]; - CeedScalar *rho = out[0]; - for (CeedInt i=0; i -#include #include +#include #include "t500-operator.h" int main(int argc, char **argv) { - Ceed ceed; + Ceed ceed; CeedElemRestriction elem_restr_x, elem_restr_u, elem_restr_qd_i; - CeedBasis basis_x, basis_u; - CeedQFunction qf_setup, qf_mass; - CeedOperator op_setup, op_mass; - CeedVector q_data, X, U, V; - const CeedScalar *hv; - CeedInt num_elem = 15, P = 5, Q = 8; - CeedInt Nx = num_elem+1, Nu = num_elem*(P-1)+1; - CeedInt ind_x[num_elem*2], ind_u[num_elem*P]; - CeedScalar x[Nx]; - CeedScalar sum; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass; + CeedVector q_data, X, U, V; + const CeedScalar *hv; + CeedInt num_elem = 15, P = 5, Q = 8; + CeedInt Nx = num_elem + 1, Nu = num_elem * (P - 1) + 1; + CeedInt ind_x[num_elem * 2], ind_u[num_elem * P]; + CeedScalar x[Nx]; + CeedScalar sum; CeedInit(argv[1], &ceed); @@ -29,31 +29,27 @@ int main(int argc, char **argv) { CeedVectorCreate(ceed, Nu, &V); - for (CeedInt i=0; i1000.*CEED_EPSILON) - // LCOV_EXCL_START - printf("Computed Area: %f != True Area: 1.0\n", sum); - // LCOV_EXCL_STOP + for (CeedInt i = 0; i < Nu; i++) sum += hv[i]; + if (fabs(sum - 1.) > 1000. 
* CEED_EPSILON) printf("Computed Area: %f != True Area: 1.0\n", sum); CeedVectorRestoreArrayRead(V, &hv); CeedQFunctionDestroy(&qf_setup); diff --git a/tests/t504-operator.c b/tests/t504-operator.c index 3efe16be40..b171422738 100644 --- a/tests/t504-operator.c +++ b/tests/t504-operator.c @@ -2,42 +2,39 @@ /// Test viewing of mass matrix operator /// \test Test viewing of mass matrix operator #include -#include #include +#include #include "t500-operator.h" int main(int argc, char **argv) { - Ceed ceed; + Ceed ceed; CeedElemRestriction elem_restr_x, elem_restr_u, elem_restr_qd_i; - CeedBasis bx, bu; - CeedQFunction qf_setup, qf_mass; - CeedOperator op_setup, op_mass; - CeedVector q_data; - CeedInt num_elem = 15, P = 5, Q = 8; - CeedInt num_nodes_x = num_elem+1, num_nodes_u = num_elem*(P-1)+1; - CeedInt ind_x[num_elem*2], ind_u[num_elem*P]; + CeedBasis bx, bu; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass; + CeedVector q_data; + CeedInt num_elem = 15, P = 5, Q = 8; + CeedInt num_nodes_x = num_elem + 1, num_nodes_u = num_elem * (P - 1) + 1; + CeedInt ind_x[num_elem * 2], ind_u[num_elem * P]; CeedInit(argv[1], &ceed); - for (CeedInt i=0; i -#include #include +#include #include "t500-operator.h" int main(int argc, char **argv) { - Ceed ceed; + Ceed ceed; CeedElemRestriction elem_restr_x, elem_restr_u, elem_restr_qd_i; - CeedBasis basis_x, basis_u; - CeedQFunction qf_setup, qf_mass; - CeedOperator op_setup, op_mass; - CeedVector q_data, X, U, V; - const CeedScalar *hv; - CeedInt num_elem = 15, P = 5, Q = 8; - CeedInt num_nodes_x = num_elem+1, num_nodes_u = num_elem*(P-1)+1; - CeedInt ind_x[num_elem*2], ind_u[num_elem*P]; - CeedScalar x[num_nodes_x]; - CeedScalar sum; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass; + CeedVector q_data, X, U, V; + const CeedScalar *hv; + CeedInt num_elem = 15, P = 5, Q = 8; + CeedInt num_nodes_x = num_elem + 1, num_nodes_u = num_elem * (P - 1) + 1; + CeedInt 
ind_x[num_elem * 2], ind_u[num_elem * P]; + CeedScalar x[num_nodes_x]; + CeedScalar sum; CeedInit(argv[1], &ceed); - for (CeedInt i=0; i1000.*CEED_EPSILON) - // LCOV_EXCL_START - printf("Computed Area: %f != True Area: 1.0\n", sum); - // LCOV_EXCL_STOP + for (CeedInt i = 0; i < num_nodes_u; i++) sum += hv[i]; + if (fabs(sum - 1.) > 1000. * CEED_EPSILON) printf("Computed Area: %f != True Area: 1.0\n", sum); CeedVectorRestoreArrayRead(V, &hv); // Apply with V = 1 @@ -109,12 +96,8 @@ int main(int argc, char **argv) { // Check output CeedVectorGetArrayRead(V, CEED_MEM_HOST, &hv); sum = -num_nodes_u; - for (CeedInt i=0; i1000.*CEED_EPSILON) - // LCOV_EXCL_START - printf("Computed Area: %f != True Area: 1.0\n", sum); - // LCOV_EXCL_STOP + for (CeedInt i = 0; i < num_nodes_u; i++) sum += hv[i]; + if (fabs(sum - (1.)) > 1000. * CEED_EPSILON) printf("Computed Area: %f != True Area: 1.0\n", sum); CeedVectorRestoreArrayRead(V, &hv); CeedQFunctionDestroy(&qf_setup); diff --git a/tests/t506-operator.c b/tests/t506-operator.c index f1f5e0fe06..737a3878a4 100644 --- a/tests/t506-operator.c +++ b/tests/t506-operator.c @@ -2,59 +2,51 @@ /// Test creation reuse of the same QFunction for multiple operators /// \test Test creation reuse of the same QFunction for multiple operators #include -#include #include +#include + #include "t502-operator.h" int main(int argc, char **argv) { - Ceed ceed; - CeedElemRestriction elem_restr_x, elem_restr_u, - elem_restr_qd_i_small, elem_restr_qd_i_large; - CeedBasis basis_x_small, basis_x_large, basis_u_small, basis_u_large; - CeedQFunction qf_setup, qf_mass; - CeedOperator op_setup_small, op_mass_small, - op_setup_large, op_mass_large; - CeedVector q_data_small, q_data_large, X, U, V; - CeedScalar *hu; - const CeedScalar *hv; - CeedInt num_elem = 15, P = 5, Q = 8, scale = 3; - CeedInt num_nodes_x = num_elem+1, num_nodes_u = num_elem*(P-1)+1; - CeedInt ind_x[num_elem*2], ind_u[num_elem*P]; - CeedScalar x[num_nodes_x]; - CeedScalar sum_1, sum_2; + 
Ceed ceed; + CeedElemRestriction elem_restr_x, elem_restr_u, elem_restr_qd_i_small, elem_restr_qd_i_large; + CeedBasis basis_x_small, basis_x_large, basis_u_small, basis_u_large; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup_small, op_mass_small, op_setup_large, op_mass_large; + CeedVector q_data_small, q_data_large, X, U, V; + CeedScalar *hu; + const CeedScalar *hv; + CeedInt num_elem = 15, P = 5, Q = 8, scale = 3; + CeedInt num_nodes_x = num_elem + 1, num_nodes_u = num_elem * (P - 1) + 1; + CeedInt ind_x[num_elem * 2], ind_u[num_elem * P]; + CeedScalar x[num_nodes_x]; + CeedScalar sum_1, sum_2; CeedInit(argv[1], &ceed); - for (CeedInt i=0; i1000.*CEED_EPSILON) - // LCOV_EXCL_START - printf("Computed Area: %f != True Area: 1.0\n", sum_1); - // LCOV_EXCL_STOP - if (fabs(sum_2-2.)>1000.*CEED_EPSILON) - // LCOV_EXCL_START - printf("Computed Area: %f != True Area: 2.0\n", sum_2); - // LCOV_EXCL_STOP + if (fabs(sum_1 - 1.) > 1000. * CEED_EPSILON) printf("Computed Area: %f != True Area: 1.0\n", sum_1); + if (fabs(sum_2 - 2.) > 1000. * CEED_EPSILON) printf("Computed Area: %f != True Area: 2.0\n", sum_2); CeedVectorRestoreArrayRead(V, &hv); // 'Large' operator @@ -153,19 +124,14 @@ int main(int argc, char **argv) { // Check output CeedVectorGetArrayRead(V, CEED_MEM_HOST, &hv); - sum_1 = 0.; sum_2 = 0.; - for (CeedInt i=0; i1000.*CEED_EPSILON) - // LCOV_EXCL_START - printf("Computed Area: %f != True Area: 1.0\n", sum_1); - // LCOV_EXCL_STOP - if (fabs(sum_2-2.)>1000.*CEED_EPSILON) - // LCOV_EXCL_START - printf("Computed Area: %f != True Area: 2.0\n", sum_2); - // LCOV_EXCL_STOP + if (fabs(sum_1 - 1.) > 1000. * CEED_EPSILON) printf("Computed Area: %f != True Area: 1.0\n", sum_1); + if (fabs(sum_2 - 2.) > 1000. 
* CEED_EPSILON) printf("Computed Area: %f != True Area: 2.0\n", sum_2); CeedVectorRestoreArrayRead(V, &hv); CeedQFunctionDestroy(&qf_setup); diff --git a/tests/t507-operator.c b/tests/t507-operator.c index 9e1b53a564..eb2e4e4f4e 100644 --- a/tests/t507-operator.c +++ b/tests/t507-operator.c @@ -1,49 +1,45 @@ /// @file /// Test VLA macro for operator /// \test VLA marco for operator +#include "t507-operator.h" + #include -#include #include - -#include "t507-operator.h" +#include int main(int argc, char **argv) { - Ceed ceed; + Ceed ceed; CeedElemRestriction elem_restr_x, elem_restr_u, elem_restr_qd_i; - CeedBasis basis_x, basis_u; - CeedQFunction qf_setup, qf_mass; - CeedOperator op_setup, op_mass; - CeedVector q_data, X, U, V; - CeedScalar *hu; - const CeedScalar *hv; - CeedInt num_elem = 15, P = 5, Q = 8; - CeedInt num_nodes_x = num_elem+1, num_nodes_u = num_elem*(P-1)+1; - CeedInt ind_x[num_elem*2], ind_u[num_elem*P]; - CeedScalar x[num_nodes_x]; - CeedScalar sum_1, sum_2; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass; + CeedVector q_data, X, U, V; + CeedScalar *hu; + const CeedScalar *hv; + CeedInt num_elem = 15, P = 5, Q = 8; + CeedInt num_nodes_x = num_elem + 1, num_nodes_u = num_elem * (P - 1) + 1; + CeedInt ind_x[num_elem * 2], ind_u[num_elem * P]; + CeedScalar x[num_nodes_x]; + CeedScalar sum_1, sum_2; CeedInit(argv[1], &ceed); - for (CeedInt i=0; i1000.*CEED_EPSILON) - // LCOV_EXCL_START - printf("Computed Area: %f != True Area: 1.0\n", sum_1); - // LCOV_EXCL_STOP - if (fabs(sum_2-2.)>1000.*CEED_EPSILON) - // LCOV_EXCL_START - printf("Computed Area: %f != True Area: 2.0\n", sum_2); - // LCOV_EXCL_STOP + if (fabs(sum_1 - 1.) > 1000. * CEED_EPSILON) printf("Computed Area: %f != True Area: 1.0\n", sum_1); + if (fabs(sum_2 - 2.) > 1000. 
* CEED_EPSILON) printf("Computed Area: %f != True Area: 2.0\n", sum_2); CeedVectorRestoreArrayRead(V, &hv); CeedQFunctionDestroy(&qf_setup); diff --git a/tests/t507-operator.h b/tests/t507-operator.h index 2927435d47..2bee4f2008 100644 --- a/tests/t507-operator.h +++ b/tests/t507-operator.h @@ -7,27 +7,23 @@ #include -CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) { +CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { const CeedScalar *weight = in[0], *dxdX = in[1]; - CeedScalar *rho = out[0]; + CeedScalar *rho = out[0]; - for (CeedInt i=0; i -#include #include +#include #include "t500-operator.h" int main(int argc, char **argv) { - Ceed ceed; + Ceed ceed; CeedQFunction qf, qf_2; - CeedOperator op, op_2; + CeedOperator op, op_2; CeedInit(argv[1], &ceed); CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf); CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_2); - CeedOperatorCreate(ceed, qf, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, - &op); - CeedOperatorCreate(ceed, qf_2, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, - &op_2); + CeedOperatorCreate(ceed, qf, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op); + CeedOperatorCreate(ceed, qf_2, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_2); - CeedOperatorReferenceCopy(op, &op_2); // This destroys the previous op_2 - if (op != op_2) - // LCOV_EXCL_START - printf("Error copying CeedOperator reference\n"); - // LCOV_EXCL_STOP + CeedOperatorReferenceCopy(op, &op_2); // This destroys the previous op_2 + if (op != op_2) printf("Error copying CeedOperator reference\n"); CeedQFunctionDestroy(&qf); CeedQFunctionDestroy(&qf_2); diff --git a/tests/t509-operator.c b/tests/t509-operator.c index 04c392836d..c8be910a64 100644 --- a/tests/t509-operator.c +++ b/tests/t509-operator.c @@ -2,66 +2,55 @@ /// Test creation, action, and destruction for identity operator /// \test Test creation, action, and destruction for 
identity operator #include -#include #include +#include int main(int argc, char **argv) { - Ceed ceed; + Ceed ceed; CeedElemRestriction elem_restr_u, elem_restr_u_i; - CeedBasis basis_u; - CeedQFunction qf_identity; - CeedOperator op_identity; - CeedVector U, V; - const CeedScalar *hv; - CeedInt num_elem = 15, P = 5, Q = 8; - CeedInt elem_size = P, num_nodes = num_elem*(P-1)+1; - CeedInt ind_u[num_elem*P]; + CeedBasis basis_u; + CeedQFunction qf_identity; + CeedOperator op_identity; + CeedVector U, V; + const CeedScalar *hv; + CeedInt num_elem = 15, P = 5, Q = 8; + CeedInt elem_size = P, num_nodes = num_elem * (P - 1) + 1; + CeedInt ind_u[num_elem * P]; CeedInit(argv[1], &ceed); // Restrictions - for (CeedInt i=0; i1e-14) - // LCOV_EXCL_START - printf("[%" CeedInt_FMT "] Computed Value: %f != True Value: 1.0\n", i, hv[i]); - // LCOV_EXCL_STOP + for (CeedInt i = 0; i < elem_size * num_elem; i++) { + if (fabs(hv[i] - 3.) > 1e-14) printf("[%" CeedInt_FMT "] Computed Value: %f != True Value: 1.0\n", i, hv[i]); + } CeedVectorRestoreArrayRead(V, &hv); CeedQFunctionDestroy(&qf_identity); diff --git a/tests/t510-operator.c b/tests/t510-operator.c index 3ec1a5e5ee..50626dcf49 100644 --- a/tests/t510-operator.c +++ b/tests/t510-operator.c @@ -1,79 +1,75 @@ /// @file /// Test creation, action, and destruction for mass matrix operator /// \test Test creation, action, and destruction for mass matrix operator +#include "t510-operator.h" + #include -#include #include +#include + #include "t320-basis.h" -#include "t510-operator.h" int main(int argc, char **argv) { - Ceed ceed; + Ceed ceed; CeedElemRestriction elem_restr_x, elem_restr_u, elem_restr_qd_i; - CeedBasis basis_x, basis_u; - CeedQFunction qf_setup, qf_mass; - CeedOperator op_setup, op_mass; - CeedVector q_data, X, U, V; - const CeedScalar *hv; - CeedInt num_elem = 12, dim = 2, P = 6, Q = 4; - CeedInt nx = 3, ny = 2; - CeedInt row, col, offset; - CeedInt num_dofs = (nx*2+1)*(ny*2+1), num_qpts = num_elem*Q; - CeedInt 
ind_x[num_elem*P]; - CeedScalar x[dim*num_dofs]; - CeedScalar q_ref[dim*Q], q_weight[Q]; - CeedScalar interp[P*Q], grad[dim*P*Q]; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass; + CeedVector q_data, X, U, V; + const CeedScalar *hv; + CeedInt num_elem = 12, dim = 2, P = 6, Q = 4; + CeedInt nx = 3, ny = 2; + CeedInt row, col, offset; + CeedInt num_dofs = (nx * 2 + 1) * (ny * 2 + 1), num_qpts = num_elem * Q; + CeedInt ind_x[num_elem * P]; + CeedScalar x[dim * num_dofs]; + CeedScalar q_ref[dim * Q], q_weight[Q]; + CeedScalar interp[P * Q], grad[dim * P * Q]; CeedInit(argv[1], &ceed); - for (CeedInt i=0; i 1e-14) printf("[%" CeedInt_FMT "] v %g != 0.0\n",i, hv[i]); + for (CeedInt i = 0; i < num_dofs; i++) { + if (fabs(hv[i]) > 1e-14) printf("[%" CeedInt_FMT "] v %g != 0.0\n", i, hv[i]); + } CeedVectorRestoreArrayRead(V, &hv); CeedQFunctionDestroy(&qf_setup); diff --git a/tests/t510-operator.h b/tests/t510-operator.h index 0bbb388102..2dc9cbe914 100644 --- a/tests/t510-operator.h +++ b/tests/t510-operator.h @@ -7,22 +7,19 @@ #include -CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) { +CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { const CeedScalar *weight = in[0], *J = in[1]; - CeedScalar *rho = out[0]; - for (CeedInt i=0; i -#include #include +#include + #include "t320-basis.h" #include "t510-operator.h" int main(int argc, char **argv) { - Ceed ceed; + Ceed ceed; CeedElemRestriction elem_restr_x, elem_restr_u, elem_restr_qd_i; - CeedBasis basis_x, basis_u; - CeedQFunction qf_setup, qf_mass; - CeedOperator op_setup, op_mass; - CeedVector q_data, X, U, V; - const CeedScalar *hv; - CeedInt num_elem = 12, dim = 2, P = 6, Q = 4; - CeedInt nx = 3, ny = 2; - CeedInt row, col, offset; - CeedInt num_dofs = (nx*2+1)*(ny*2+1), num_qpts = num_elem*Q; - CeedInt indx[num_elem*P]; - CeedScalar x[dim*num_dofs]; - 
CeedScalar q_ref[dim*Q], q_weight[Q]; - CeedScalar interp[P*Q], grad[dim*P*Q]; - CeedScalar sum; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass; + CeedVector q_data, X, U, V; + const CeedScalar *hv; + CeedInt num_elem = 12, dim = 2, P = 6, Q = 4; + CeedInt nx = 3, ny = 2; + CeedInt row, col, offset; + CeedInt num_dofs = (nx * 2 + 1) * (ny * 2 + 1), num_qpts = num_elem * Q; + CeedInt indx[num_elem * P]; + CeedScalar x[dim * num_dofs]; + CeedScalar q_ref[dim * Q], q_weight[Q]; + CeedScalar interp[P * Q], grad[dim * P * Q]; + CeedScalar sum; CeedInit(argv[1], &ceed); - for (CeedInt i=0; i1000.*CEED_EPSILON) - // LCOV_EXCL_START - printf("Computed Area: %f != True Area: 1.0\n", sum); - // LCOV_EXCL_STOP + for (CeedInt i = 0; i < num_dofs; i++) sum += hv[i]; + if (fabs(sum - 1.) > 1000. * CEED_EPSILON) printf("Computed Area: %f != True Area: 1.0\n", sum); CeedVectorRestoreArrayRead(V, &hv); CeedQFunctionDestroy(&qf_setup); diff --git a/tests/t520-operator.c b/tests/t520-operator.c index 2561cc14ee..a3d7615c96 100644 --- a/tests/t520-operator.c +++ b/tests/t520-operator.c @@ -2,8 +2,9 @@ /// Test creation, action, and destruction for composite mass matrix operator /// \test Test creation, action, and destruction for composite mass matrix operator #include -#include #include +#include + #include "t320-basis.h" #include "t510-operator.h" @@ -17,43 +18,32 @@ */ int main(int argc, char **argv) { - Ceed ceed; - CeedElemRestriction elem_restr_x_tet, elem_restr_u_tet, - elem_restr_qd_i_tet, - elem_restr_x_hex, elem_restr_u_hex, - elem_restr_qd_i_hex; - CeedBasis basis_x_tet, basis_u_tet, - basis_x_hex, basis_u_hex; - CeedQFunction qf_setup_tet, qf_mass_tet, - qf_setup_hex, qf_mass_hex; - CeedOperator op_setup_tet, op_mass_tet, - op_setup_hex, op_mass_hex, - op_setup, op_mass; - CeedVector q_data_tet, q_data_hex, X, U, V; - const CeedScalar *hv; - CeedInt num_elem_tet = 6, P_tet = 6, Q_tet = 4, - num_elem_hex = 6, P_hex = 3, 
Q_hex = 4, dim = 2; - CeedInt n_x = 3, n_y = 3, - n_x_tet = 3, n_y_tet = 1, n_x_hex = 3; - CeedInt row, col, offset; - CeedInt num_dofs = (n_x*2+1)*(n_y*2+1), - num_qpts_tet = num_elem_tet*Q_tet, - num_qpts_hex = num_elem_hex*Q_hex*Q_hex; - CeedInt ind_x_tet[num_elem_tet*P_tet], - ind_x_hex[num_elem_hex*P_hex*P_hex]; - CeedScalar x[dim*num_dofs]; - CeedScalar q_ref[dim*Q_tet], q_weight[Q_tet]; - CeedScalar interp[P_tet*Q_tet], grad[dim*P_tet*Q_tet]; + Ceed ceed; + CeedElemRestriction elem_restr_x_tet, elem_restr_u_tet, elem_restr_qd_i_tet, elem_restr_x_hex, elem_restr_u_hex, elem_restr_qd_i_hex; + CeedBasis basis_x_tet, basis_u_tet, basis_x_hex, basis_u_hex; + CeedQFunction qf_setup_tet, qf_mass_tet, qf_setup_hex, qf_mass_hex; + CeedOperator op_setup_tet, op_mass_tet, op_setup_hex, op_mass_hex, op_setup, op_mass; + CeedVector q_data_tet, q_data_hex, X, U, V; + const CeedScalar *hv; + CeedInt num_elem_tet = 6, P_tet = 6, Q_tet = 4, num_elem_hex = 6, P_hex = 3, Q_hex = 4, dim = 2; + CeedInt n_x = 3, n_y = 3, n_x_tet = 3, n_y_tet = 1, n_x_hex = 3; + CeedInt row, col, offset; + CeedInt num_dofs = (n_x * 2 + 1) * (n_y * 2 + 1), num_qpts_tet = num_elem_tet * Q_tet, num_qpts_hex = num_elem_hex * Q_hex * Q_hex; + CeedInt ind_x_tet[num_elem_tet * P_tet], ind_x_hex[num_elem_hex * P_hex * P_hex]; + CeedScalar x[dim * num_dofs]; + CeedScalar q_ref[dim * Q_tet], q_weight[Q_tet]; + CeedScalar interp[P_tet * Q_tet], grad[dim * P_tet * Q_tet]; CeedInit(argv[1], &ceed); // DoF Coordinates - for (CeedInt i=0; i 1e-14) printf("[%" CeedInt_FMT "] v %g != 0.0\n",i, hv[i]); + for (CeedInt i = 0; i < num_dofs; i++) { + if (fabs(hv[i]) > 1e-14) printf("[%" CeedInt_FMT "] v %g != 0.0\n", i, hv[i]); + } CeedVectorRestoreArrayRead(V, &hv); // Cleanup diff --git a/tests/t521-operator.c b/tests/t521-operator.c index 5b3207c32c..eff424a9fe 100644 --- a/tests/t521-operator.c +++ b/tests/t521-operator.c @@ -2,8 +2,9 @@ /// Test creation, action, and destruction for composite mass matrix operator 
/// \test Test creation, action, and destruction for composite mass matrix operator #include -#include #include +#include + #include "t320-basis.h" #include "t510-operator.h" @@ -17,44 +18,33 @@ */ int main(int argc, char **argv) { - Ceed ceed; - CeedElemRestriction elem_restr_x_tet, elem_restr_u_tet, - elem_restr_qd_i_tet, - elem_restr_x_hex, elem_restr_u_hex, - elem_restr_qd_i_hex; - CeedBasis basis_x_tet, basis_u_tet, - basis_x_hex, basis_u_hex; - CeedQFunction qf_setup_tet, qf_mass_tet, - qf_setup_hex, qf_mass_hex; - CeedOperator op_setup_tet, op_mass_tet, - op_setup_hex, op_mass_hex, - op_setup, op_mass; - CeedVector q_data_tet, q_data_hex, X, U, V; - const CeedScalar *hv; - CeedInt num_elem_tet = 6, P_tet = 6, Q_tet = 4, - num_elem_hex = 6, P_hex = 3, Q_hex = 4, dim = 2; - CeedInt n_x = 3, n_y = 3, - n_x_tet = 3, n_y_tet = 1, n_x_hex = 3; - CeedInt row, col, offset; - CeedInt num_dofs = (n_x*2+1)*(n_y*2+1), - num_qpts_tet = num_elem_tet*Q_tet, - num_qpts_hex = num_elem_hex*Q_hex*Q_hex; - CeedInt ind_x_tet[num_elem_tet*P_tet], - ind_x_hex[num_elem_hex*P_hex*P_hex]; - CeedScalar x[dim*num_dofs]; - CeedScalar q_ref[dim*Q_tet], q_weight[Q_tet]; - CeedScalar interp[P_tet*Q_tet], grad[dim*P_tet*Q_tet]; - CeedScalar sum; + Ceed ceed; + CeedElemRestriction elem_restr_x_tet, elem_restr_u_tet, elem_restr_qd_i_tet, elem_restr_x_hex, elem_restr_u_hex, elem_restr_qd_i_hex; + CeedBasis basis_x_tet, basis_u_tet, basis_x_hex, basis_u_hex; + CeedQFunction qf_setup_tet, qf_mass_tet, qf_setup_hex, qf_mass_hex; + CeedOperator op_setup_tet, op_mass_tet, op_setup_hex, op_mass_hex, op_setup, op_mass; + CeedVector q_data_tet, q_data_hex, X, U, V; + const CeedScalar *hv; + CeedInt num_elem_tet = 6, P_tet = 6, Q_tet = 4, num_elem_hex = 6, P_hex = 3, Q_hex = 4, dim = 2; + CeedInt n_x = 3, n_y = 3, n_x_tet = 3, n_y_tet = 1, n_x_hex = 3; + CeedInt row, col, offset; + CeedInt num_dofs = (n_x * 2 + 1) * (n_y * 2 + 1), num_qpts_tet = num_elem_tet * Q_tet, num_qpts_hex = num_elem_hex * Q_hex 
* Q_hex; + CeedInt ind_x_tet[num_elem_tet * P_tet], ind_x_hex[num_elem_hex * P_hex * P_hex]; + CeedScalar x[dim * num_dofs]; + CeedScalar q_ref[dim * Q_tet], q_weight[Q_tet]; + CeedScalar interp[P_tet * Q_tet], grad[dim * P_tet * Q_tet]; + CeedScalar sum; CeedInit(argv[1], &ceed); // DoF Coordinates - for (CeedInt i=0; i1000.*CEED_EPSILON) - // LCOV_EXCL_START - printf("Computed Area: %f != True Area: 1.0\n", sum); - // LCOV_EXCL_STOP + for (CeedInt i = 0; i < num_dofs; i++) sum += hv[i]; + if (fabs(sum - 1.) > 1000. * CEED_EPSILON) printf("Computed Area: %f != True Area: 1.0\n", sum); CeedVectorRestoreArrayRead(V, &hv); // Cleanup diff --git a/tests/t522-operator.c b/tests/t522-operator.c index 4f6418eeba..37d635935a 100644 --- a/tests/t522-operator.c +++ b/tests/t522-operator.c @@ -1,11 +1,13 @@ /// @file /// Test creation, action, and destruction for diffusion matrix operator /// \test Test creation, action, and destruction for diffusion matrix operator +#include "t522-operator.h" + #include -#include #include +#include + #include "t320-basis.h" -#include "t522-operator.h" /* The mesh comprises of two rows of 3 quadralaterals followed by one row of 6 triangles: @@ -17,185 +19,141 @@ */ int main(int argc, char **argv) { - Ceed ceed; - CeedElemRestriction elem_restr_x_tet, elem_restr_u_tet, - elem_restr_qd_i_tet, - elem_restr_x_hex, elem_restr_u_hex, - elem_restr_qd_i_hex; - CeedBasis basis_x_tet, basis_u_tet, - basis_x_hex, basis_u_hex; - CeedQFunction qf_setup_tet, qf_diff_tet, - qf_setup_hex, qf_diff_hex; - CeedOperator op_setup_tet, op_diff_tet, - op_setup_hex, op_diff_hex, - op_setup, op_diff; - CeedVector q_data_tet, q_data_hex, X, U, V; - const CeedScalar *hv; - CeedInt num_elem_tet = 6, P_tet = 6, Q_tet = 4, - num_elem_hex = 6, P_hex = 3, Q_hex = 4, dim = 2; - CeedInt n_x = 3, n_y = 3, - n_x_tet = 3, n_y_tet = 1, n_x_hex = 3; - CeedInt row, col, offset; - CeedInt num_dofs = (n_x*2+1)*(n_y*2+1), - num_qpts_tet = num_elem_tet*Q_tet, - num_qpts_hex = 
num_elem_hex*Q_hex*Q_hex; - CeedInt ind_x_tet[num_elem_tet*P_tet], - ind_x_hex[num_elem_hex*P_hex*P_hex]; - CeedScalar x[dim*num_dofs]; - CeedScalar q_ref[dim*Q_tet], q_weight[Q_tet]; - CeedScalar interp[P_tet*Q_tet], grad[dim*P_tet*Q_tet]; + Ceed ceed; + CeedElemRestriction elem_restr_x_tet, elem_restr_u_tet, elem_restr_qd_i_tet, elem_restr_x_hex, elem_restr_u_hex, elem_restr_qd_i_hex; + CeedBasis basis_x_tet, basis_u_tet, basis_x_hex, basis_u_hex; + CeedQFunction qf_setup_tet, qf_diff_tet, qf_setup_hex, qf_diff_hex; + CeedOperator op_setup_tet, op_diff_tet, op_setup_hex, op_diff_hex, op_setup, op_diff; + CeedVector q_data_tet, q_data_hex, X, U, V; + const CeedScalar *hv; + CeedInt num_elem_tet = 6, P_tet = 6, Q_tet = 4, num_elem_hex = 6, P_hex = 3, Q_hex = 4, dim = 2; + CeedInt n_x = 3, n_y = 3, n_x_tet = 3, n_y_tet = 1, n_x_hex = 3; + CeedInt row, col, offset; + CeedInt num_dofs = (n_x * 2 + 1) * (n_y * 2 + 1), num_qpts_tet = num_elem_tet * Q_tet, num_qpts_hex = num_elem_hex * Q_hex * Q_hex; + CeedInt ind_x_tet[num_elem_tet * P_tet], ind_x_hex[num_elem_hex * P_hex * P_hex]; + CeedScalar x[dim * num_dofs]; + CeedScalar q_ref[dim * Q_tet], q_weight[Q_tet]; + CeedScalar interp[P_tet * Q_tet], grad[dim * P_tet * Q_tet]; CeedInit(argv[1], &ceed); // DoF Coordinates - for (CeedInt i=0; i100.*CEED_EPSILON) printf("Computed: %f != True: 0.0\n", hv[i]); + for (CeedInt i = 0; i < num_dofs; i++) { + if (fabs(hv[i]) > 100. 
* CEED_EPSILON) printf("Computed: %f != True: 0.0\n", hv[i]); + } CeedVectorRestoreArrayRead(V, &hv); // Cleanup diff --git a/tests/t522-operator.h b/tests/t522-operator.h index 3d163c7d4b..f7ff45596f 100644 --- a/tests/t522-operator.h +++ b/tests/t522-operator.h @@ -7,54 +7,46 @@ #include -CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) { +CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { const CeedScalar *qw = in[0], *J = in[1]; - CeedScalar *qd = out[0]; + CeedScalar *qd = out[0]; - for (CeedInt i=0; i -#include #include +#include + #include "t320-basis.h" #include "t510-operator.h" @@ -17,87 +18,68 @@ */ int main(int argc, char **argv) { - Ceed ceed; - CeedElemRestriction elem_restr_x_tet, elem_restr_u_tet, - elem_restr_qd_tet, - elem_restr_x_hex, elem_restr_u_hex, - elem_restr_qd_i_hex; - CeedBasis basis_x_tet, basis_u_tet, - basis_x_hex, basis_u_hex; - CeedQFunction qf_setup_tet, qf_mass_tet, - qf_setup_hex, qf_mass_hex; - CeedOperator op_setup_tet, op_mass_tet, - op_setup_hex, op_mass_hex, - op_setup, op_mass; - CeedVector q_data_tet, q_data_hex, X; - CeedInt nelem_tet = 6, P_tet = 6, Q_tet = 4, - nelem_hex = 6, P_hex = 3, Q_hex = 4, dim = 2; - CeedInt nx = 3, ny = 3, - nx_tet = 3, ny_tet = 1, nx_hex = 3; - CeedInt row, col, offset; - CeedInt num_dofs = (nx*2+1)*(ny*2+1), - num_qpts_tet = nelem_tet*Q_tet, - num_qpts_hex = nelem_hex*Q_hex*Q_hex; - CeedInt ind_x_tet[nelem_tet*P_tet], - ind_x_hex[nelem_hex*P_hex*P_hex]; - CeedScalar q_ref[dim*Q_tet], q_weight[Q_tet]; - CeedScalar interp[P_tet*Q_tet], grad[dim*P_tet*Q_tet]; + Ceed ceed; + CeedElemRestriction elem_restr_x_tet, elem_restr_u_tet, elem_restr_qd_tet, elem_restr_x_hex, elem_restr_u_hex, elem_restr_qd_i_hex; + CeedBasis basis_x_tet, basis_u_tet, basis_x_hex, basis_u_hex; + CeedQFunction qf_setup_tet, qf_mass_tet, qf_setup_hex, qf_mass_hex; + CeedOperator op_setup_tet, op_mass_tet, 
op_setup_hex, op_mass_hex, op_setup, op_mass; + CeedVector q_data_tet, q_data_hex, X; + CeedInt nelem_tet = 6, P_tet = 6, Q_tet = 4, nelem_hex = 6, P_hex = 3, Q_hex = 4, dim = 2; + CeedInt nx = 3, ny = 3, nx_tet = 3, ny_tet = 1, nx_hex = 3; + CeedInt row, col, offset; + CeedInt num_dofs = (nx * 2 + 1) * (ny * 2 + 1), num_qpts_tet = nelem_tet * Q_tet, num_qpts_hex = nelem_hex * Q_hex * Q_hex; + CeedInt ind_x_tet[nelem_tet * P_tet], ind_x_hex[nelem_hex * P_hex * P_hex]; + CeedScalar q_ref[dim * Q_tet], q_weight[Q_tet]; + CeedScalar interp[P_tet * Q_tet], grad[dim * P_tet * Q_tet]; CeedInit(argv[1], &ceed); // DoF Coordinates - CeedVectorCreate(ceed, dim*num_dofs, &X); + CeedVectorCreate(ceed, dim * num_dofs, &X); // Qdata Vectors CeedVectorCreate(ceed, num_qpts_tet, &q_data_tet); CeedVectorCreate(ceed, num_qpts_hex, &q_data_hex); // Set up _tet Elements - for (CeedInt i=0; i -#include #include +#include + #include "t320-basis.h" #include "t510-operator.h" @@ -17,44 +18,33 @@ */ int main(int argc, char **argv) { - Ceed ceed; - CeedElemRestriction elem_restr_x_tet, elem_restr_u_tet, - elem_restr_qd_i_tet, - elem_restr_x_hex, elem_restr_u_hex, - elem_restr_qd_i_hex; - CeedBasis basis_x_tet, basis_u_tet, - basis_x_hex, basis_u_hex; - CeedQFunction qf_setup_tet, qf_mass_tet, - qf_setup_hex, qf_mass_hex; - CeedOperator op_setup_tet, op_mass_tet, - op_setup_hex, op_mass_hex, - op_setup, op_mass; - CeedVector q_data_tet, q_data_hex, X, U, V; - const CeedScalar *hv; - CeedInt nelem_tet = 6, P_tet = 6, Q_tet = 4, - nelem_hex = 6, P_hex = 3, Q_hex = 4, dim = 2; - CeedInt nx = 3, ny = 3, - nx_tet = 3, ny_tet = 1, nx_hex = 3; - CeedInt row, col, offset; - CeedInt num_dofs = (nx*2+1)*(ny*2+1), - num_qpts_tet = nelem_tet*Q_tet, - num_qpts_hex = nelem_hex*Q_hex*Q_hex; - CeedInt ind_x_tet[nelem_tet*P_tet], - ind_x_hex[nelem_hex*P_hex*P_hex]; - CeedScalar x[dim*num_dofs]; - CeedScalar q_ref[dim*Q_tet], q_weight[Q_tet]; - CeedScalar interp[P_tet*Q_tet], grad[dim*P_tet*Q_tet]; - 
CeedScalar sum; + Ceed ceed; + CeedElemRestriction elem_restr_x_tet, elem_restr_u_tet, elem_restr_qd_i_tet, elem_restr_x_hex, elem_restr_u_hex, elem_restr_qd_i_hex; + CeedBasis basis_x_tet, basis_u_tet, basis_x_hex, basis_u_hex; + CeedQFunction qf_setup_tet, qf_mass_tet, qf_setup_hex, qf_mass_hex; + CeedOperator op_setup_tet, op_mass_tet, op_setup_hex, op_mass_hex, op_setup, op_mass; + CeedVector q_data_tet, q_data_hex, X, U, V; + const CeedScalar *hv; + CeedInt nelem_tet = 6, P_tet = 6, Q_tet = 4, nelem_hex = 6, P_hex = 3, Q_hex = 4, dim = 2; + CeedInt nx = 3, ny = 3, nx_tet = 3, ny_tet = 1, nx_hex = 3; + CeedInt row, col, offset; + CeedInt num_dofs = (nx * 2 + 1) * (ny * 2 + 1), num_qpts_tet = nelem_tet * Q_tet, num_qpts_hex = nelem_hex * Q_hex * Q_hex; + CeedInt ind_x_tet[nelem_tet * P_tet], ind_x_hex[nelem_hex * P_hex * P_hex]; + CeedScalar x[dim * num_dofs]; + CeedScalar q_ref[dim * Q_tet], q_weight[Q_tet]; + CeedScalar interp[P_tet * Q_tet], grad[dim * P_tet * Q_tet]; + CeedScalar sum; CeedInit(argv[1], &ceed); // DoF Coordinates - for (CeedInt i=0; i1000.*CEED_EPSILON) - // LCOV_EXCL_START - printf("Computed Area: %f != True Area: 1.0\n", sum); - // LCOV_EXCL_STOP + for (CeedInt i = 0; i < num_dofs; i++) sum += hv[i]; + if (fabs(sum - 1.) > 1000. * CEED_EPSILON) printf("Computed Area: %f != True Area: 1.0\n", sum); CeedVectorRestoreArrayRead(V, &hv); // Apply Add @@ -236,12 +188,8 @@ int main(int argc, char **argv) { // Check output CeedVectorGetArrayRead(V, CEED_MEM_HOST, &hv); sum = -num_dofs; - for (CeedInt i=0; i1000.*CEED_EPSILON) - // LCOV_EXCL_START - printf("Computed Area: %f != True Area: 1.0\n", sum); - // LCOV_EXCL_STOP + for (CeedInt i = 0; i < num_dofs; i++) sum += hv[i]; + if (fabs(sum - 1.) > 1000. 
* CEED_EPSILON) printf("Computed Area: %f != True Area: 1.0\n", sum); CeedVectorRestoreArrayRead(V, &hv); // Cleanup diff --git a/tests/t525-operator.c b/tests/t525-operator.c index 0683da4fec..a7122665e4 100644 --- a/tests/t525-operator.c +++ b/tests/t525-operator.c @@ -3,10 +3,11 @@ /// \test Test setting QFunctionContext fields from Operator #include #include + #include "t500-operator.h" typedef struct { - int count; + int count; double other; } TestContext1; @@ -16,66 +17,50 @@ typedef struct { } TestContext2; int main(int argc, char **argv) { - Ceed ceed; - CeedQFunctionContext qf_ctx_sub_1, qf_ctx_sub_2; + Ceed ceed; + CeedQFunctionContext qf_ctx_sub_1, qf_ctx_sub_2; CeedContextFieldLabel count_label, other_label, time_label, bad_label; - CeedQFunction qf_sub_1, qf_sub_2; - CeedOperator op_sub_1, op_sub_2, op_composite; + CeedQFunction qf_sub_1, qf_sub_2; + CeedOperator op_sub_1, op_sub_2, op_composite; TestContext1 ctx_data_1 = { - .count = 42, - .other = -3.0, + .count = 42, + .other = -3.0, }; TestContext2 ctx_data_2 = { - .time = 1.0, - .other = -3.0, + .time = 1.0, + .other = -3.0, }; CeedInit(argv[1], &ceed); // First sub-operator CeedQFunctionContextCreate(ceed, &qf_ctx_sub_1); - CeedQFunctionContextSetData(qf_ctx_sub_1, CEED_MEM_HOST, CEED_USE_POINTER, - sizeof(TestContext1), &ctx_data_1); - CeedQFunctionContextRegisterInt32(qf_ctx_sub_1, "count", - offsetof(TestContext1, count), - 1, "some sort of counter"); - CeedQFunctionContextRegisterDouble(qf_ctx_sub_1, "other", - offsetof(TestContext1, other), - 1, "some other value"); + CeedQFunctionContextSetData(qf_ctx_sub_1, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(TestContext1), &ctx_data_1); + CeedQFunctionContextRegisterInt32(qf_ctx_sub_1, "count", offsetof(TestContext1, count), 1, "some sort of counter"); + CeedQFunctionContextRegisterDouble(qf_ctx_sub_1, "other", offsetof(TestContext1, other), 1, "some other value"); CeedQFunctionCreateInterior(ceed, 1, setup, setup_loc, &qf_sub_1); 
CeedQFunctionSetContext(qf_sub_1, qf_ctx_sub_1); - CeedOperatorCreate(ceed, qf_sub_1, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, - &op_sub_1); + CeedOperatorCreate(ceed, qf_sub_1, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_sub_1); // Check setting field in operator CeedOperatorContextGetFieldLabel(op_sub_1, "count", &count_label); int value_count = 43; CeedOperatorContextSetInt32(op_sub_1, count_label, &value_count); - if (ctx_data_1.count != 43) - // LCOV_EXCL_START - printf("Incorrect context data for count: %" CeedInt_FMT " != 43", - ctx_data_1.count); - // LCOV_EXCL_STOP + if (ctx_data_1.count != 43) printf("Incorrect context data for count: %" CeedInt_FMT " != 43", ctx_data_1.count); // Second sub-operator CeedQFunctionContextCreate(ceed, &qf_ctx_sub_2); - CeedQFunctionContextSetData(qf_ctx_sub_2, CEED_MEM_HOST, CEED_USE_POINTER, - sizeof(TestContext2), &ctx_data_2); - CeedQFunctionContextRegisterDouble(qf_ctx_sub_2, "time", - offsetof(TestContext2, time), - 1, "current time"); - CeedQFunctionContextRegisterDouble(qf_ctx_sub_2, "other", - offsetof(TestContext2, other), - 1, "some other value"); + CeedQFunctionContextSetData(qf_ctx_sub_2, CEED_MEM_HOST, CEED_USE_POINTER, sizeof(TestContext2), &ctx_data_2); + CeedQFunctionContextRegisterDouble(qf_ctx_sub_2, "time", offsetof(TestContext2, time), 1, "current time"); + CeedQFunctionContextRegisterDouble(qf_ctx_sub_2, "other", offsetof(TestContext2, other), 1, "some other value"); CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_sub_2); CeedQFunctionSetContext(qf_sub_2, qf_ctx_sub_2); - CeedOperatorCreate(ceed, qf_sub_2, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, - &op_sub_2); + CeedOperatorCreate(ceed, qf_sub_2, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_sub_2); // Composite operator CeedCompositeOperatorCreate(ceed, &op_composite); @@ -86,10 +71,7 @@ int main(int argc, char **argv) { CeedOperatorContextGetFieldLabel(op_composite, "time", &time_label); double value_time = 2.0; 
CeedOperatorContextSetDouble(op_composite, time_label, &value_time); - if (ctx_data_2.time != 2.0) - // LCOV_EXCL_START - printf("Incorrect context data for time: %f != 2.0\n", ctx_data_2.time); - // LCOV_EXCL_STOP + if (ctx_data_2.time != 2.0) printf("Incorrect context data for time: %f != 2.0\n", ctx_data_2.time); // Check setting field in context of multiple sub-operators for composite operator CeedOperatorContextGetFieldLabel(op_composite, "other", &other_label); @@ -97,21 +79,12 @@ int main(int argc, char **argv) { CeedOperatorContextGetFieldLabel(op_composite, "other", &other_label); double value_other = 9000.; CeedOperatorContextSetDouble(op_composite, other_label, &value_other); - if (ctx_data_1.other != 9000.0) - // LCOV_EXCL_START - printf("Incorrect context data for other: %f != 2.0\n", ctx_data_1.other); - // LCOV_EXCL_STOP - if (ctx_data_2.other != 9000.0) - // LCOV_EXCL_START - printf("Incorrect context data for other: %f != 2.0\n", ctx_data_2.other); - // LCOV_EXCL_STOP + if (ctx_data_1.other != 9000.0) printf("Incorrect context data for other: %f != 2.0\n", ctx_data_1.other); + if (ctx_data_2.other != 9000.0) printf("Incorrect context data for other: %f != 2.0\n", ctx_data_2.other); // Check requesting label for field that doesn't exist returns NULL CeedOperatorContextGetFieldLabel(op_composite, "bad", &bad_label); - if (bad_label) - // LCOV_EXCL_START - printf("Incorrect context label returned\n"); - // LCOV_EXCL_STOP + if (bad_label) printf("Incorrect context label returned\n"); CeedQFunctionContextDestroy(&qf_ctx_sub_1); CeedQFunctionContextDestroy(&qf_ctx_sub_2); diff --git a/tests/t526-operator.c b/tests/t526-operator.c index cdc2c26795..6c2dee994a 100644 --- a/tests/t526-operator.c +++ b/tests/t526-operator.c @@ -2,8 +2,9 @@ /// Test FLOP estimation for composite mass matrix operator /// \test Test FLOP estimation for composite mass matrix operator #include -#include #include +#include + #include "t320-basis.h" /* The mesh comprises of two 
rows of 3 quadralaterals followed by one row @@ -16,29 +17,20 @@ */ int main(int argc, char **argv) { - Ceed ceed; - CeedSize flop_estimate; - CeedElemRestriction elem_restr_x_tet, elem_restr_u_tet, - elem_restr_qd_i_tet, - elem_restr_x_hex, elem_restr_u_hex, - elem_restr_qd_i_hex; - CeedBasis basis_x_tet, basis_u_tet, - basis_x_hex, basis_u_hex; - CeedQFunction qf_mass; - CeedOperator op_mass_tet, op_mass_hex, op_mass; - CeedVector q_data_tet, q_data_hex; - CeedInt num_elem_tet = 6, P_tet = 6, Q_tet = 4, - num_elem_hex = 6, P_hex = 3, Q_hex = 4, dim = 2; - CeedInt n_x = 3, n_y = 3, - n_x_tet = 3, n_y_tet = 1, n_x_hex = 3; - CeedInt row, col, offset; - CeedInt num_dofs = (n_x*2+1)*(n_y*2+1), - num_qpts_tet = num_elem_tet*Q_tet, - num_qpts_hex = num_elem_hex*Q_hex*Q_hex; - CeedInt ind_x_tet[num_elem_tet*P_tet], - ind_x_hex[num_elem_hex*P_hex*P_hex]; - CeedScalar q_ref[dim*Q_tet], q_weight[Q_tet]; - CeedScalar interp[P_tet*Q_tet], grad[dim*P_tet*Q_tet]; + Ceed ceed; + CeedSize flop_estimate; + CeedElemRestriction elem_restr_x_tet, elem_restr_u_tet, elem_restr_qd_i_tet, elem_restr_x_hex, elem_restr_u_hex, elem_restr_qd_i_hex; + CeedBasis basis_x_tet, basis_u_tet, basis_x_hex, basis_u_hex; + CeedQFunction qf_mass; + CeedOperator op_mass_tet, op_mass_hex, op_mass; + CeedVector q_data_tet, q_data_hex; + CeedInt num_elem_tet = 6, P_tet = 6, Q_tet = 4, num_elem_hex = 6, P_hex = 3, Q_hex = 4, dim = 2; + CeedInt n_x = 3, n_y = 3, n_x_tet = 3, n_y_tet = 1, n_x_hex = 3; + CeedInt row, col, offset; + CeedInt num_dofs = (n_x * 2 + 1) * (n_y * 2 + 1), num_qpts_tet = num_elem_tet * Q_tet, num_qpts_hex = num_elem_hex * Q_hex * Q_hex; + CeedInt ind_x_tet[num_elem_tet * P_tet], ind_x_hex[num_elem_hex * P_hex * P_hex]; + CeedScalar q_ref[dim * Q_tet], q_weight[Q_tet]; + CeedScalar interp[P_tet * Q_tet], grad[dim * P_tet * Q_tet]; CeedInit(argv[1], &ceed); @@ -47,103 +39,77 @@ int main(int argc, char **argv) { CeedVectorCreate(ceed, num_qpts_hex, &q_data_hex); // Set up Tet Elements - 
for (CeedInt i=0; i -#include #include +#include + #include "t510-operator.h" int main(int argc, char **argv) { - Ceed ceed; - CeedElemRestriction elem_restr_x, elem_restr_u, - elem_restr_qd_i, elem_restr_lin_i; - CeedBasis basis_x, basis_u; - CeedQFunction qf_setup, qf_mass; - CeedOperator op_setup, op_mass; - CeedVector q_data, X, A, u, v; - const CeedScalar *a, *q; - CeedInt num_elem = 6, P = 3, Q = 4, dim = 2; - CeedInt nx = 3, ny = 2; - CeedInt num_dofs = (nx*2+1)*(ny*2+1), num_qpts = num_elem*Q*Q; - CeedInt ind_x[num_elem*P*P]; - CeedScalar x[dim*num_dofs]; + Ceed ceed; + CeedElemRestriction elem_restr_x, elem_restr_u, elem_restr_qd_i, elem_restr_lin_i; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass; + CeedVector q_data, X, A, u, v; + const CeedScalar *a, *q; + CeedInt num_elem = 6, P = 3, Q = 4, dim = 2; + CeedInt nx = 3, ny = 2; + CeedInt num_dofs = (nx * 2 + 1) * (ny * 2 + 1), num_qpts = num_elem * Q * Q; + CeedInt ind_x[num_elem * P * P]; + CeedScalar x[dim * num_dofs]; CeedInit(argv[1], &ceed); // DoF Coordinates - for (CeedInt i=0; i 1e-9) - // LCOV_EXCL_START - printf("Error: A[%" CeedInt_FMT "] = %f != %f\n", i, a[i], q[i]); - // LCOV_EXCL_STOP + for (CeedInt i = 0; i < num_qpts; i++) + if (fabs(q[i] - a[i]) > 1e-9) printf("Error: A[%" CeedInt_FMT "] = %f != %f\n", i, a[i], q[i]); CeedVectorRestoreArrayRead(A, &a); CeedVectorRestoreArrayRead(q_data, &q); @@ -119,16 +107,12 @@ int main(int argc, char **argv) { CeedOperatorApply(op_mass, u, v, CEED_REQUEST_IMMEDIATE); // Check output - CeedScalar area = 0.0; + CeedScalar area = 0.0; const CeedScalar *vv; CeedVectorGetArrayRead(v, CEED_MEM_HOST, &vv); - for (CeedInt i=0; i 100.*CEED_EPSILON) - // LCOV_EXCL_START - printf("Error: True operator computed area = %f != 1.0\n", area); - // LCOV_EXCL_STOP + if (fabs(area - 1.0) > 100. 
* CEED_EPSILON) printf("Error: True operator computed area = %f != 1.0\n", area); // Switch to new q_data CeedVectorGetArrayRead(A, CEED_MEM_HOST, &a); @@ -141,13 +125,9 @@ int main(int argc, char **argv) { // Check output area = 0.0; CeedVectorGetArrayRead(v, CEED_MEM_HOST, &vv); - for (CeedInt i=0; i 1000.*CEED_EPSILON) - // LCOV_EXCL_START - printf("Error: Linearized operator computed area = %f != 1.0\n", area); - // LCOV_EXCL_STOP + if (fabs(area - 1.0) > 1000. * CEED_EPSILON) printf("Error: Linearized operator computed area = %f != 1.0\n", area); // Cleanup CeedQFunctionDestroy(&qf_setup); diff --git a/tests/t530-operator.h b/tests/t530-operator.h index 0bbb388102..2dc9cbe914 100644 --- a/tests/t530-operator.h +++ b/tests/t530-operator.h @@ -7,22 +7,19 @@ #include -CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) { +CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { const CeedScalar *weight = in[0], *J = in[1]; - CeedScalar *rho = out[0]; - for (CeedInt i=0; i -#include #include -#include "t531-operator.h" +#include int main(int argc, char **argv) { - Ceed ceed; - CeedElemRestriction elem_restr_x, elem_restr_u, - elem_restr_qd_i, elem_restr_lin_i; - CeedBasis basis_x, basis_u; - CeedQFunction qf_setup, qf_diff, qf_diff_lin; - CeedOperator op_setup, op_diff, op_diff_lin; - CeedVector q_data, X, A, u, v; - CeedInt num_elem = 6, P = 3, Q = 4, dim = 2; - CeedInt nx = 3, ny = 2; - CeedInt num_dofs = (nx*2+1)*(ny*2+1), num_qpts = num_elem*Q*Q; - CeedInt ind_x[num_elem*P*P]; - CeedScalar x[dim*num_dofs]; + Ceed ceed; + CeedElemRestriction elem_restr_x, elem_restr_u, elem_restr_qd_i, elem_restr_lin_i; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_diff, qf_diff_lin; + CeedOperator op_setup, op_diff, op_diff_lin; + CeedVector q_data, X, A, u, v; + CeedInt num_elem = 6, P = 3, Q = 4, dim = 2; + CeedInt nx = 3, ny = 2; + CeedInt num_dofs = (nx * 2 + 
1) * (ny * 2 + 1), num_qpts = num_elem * Q * Q; + CeedInt ind_x[num_elem * P * P]; + CeedScalar x[dim * num_dofs]; CeedInit(argv[1], &ceed); // DoF Coordinates - for (CeedInt i=0; i 100.*CEED_EPSILON) - // LCOV_EXCL_START - printf("Error: Operator computed v[%" CeedInt_FMT "] = %f != 0.0\n", i, vv[i]); - // LCOV_EXCL_STOP + for (CeedInt i = 0; i < num_dofs; i++) { + if (fabs(vv[i]) > 100. * CEED_EPSILON) printf("Error: Operator computed v[%" CeedInt_FMT "] = %f != 0.0\n", i, vv[i]); + } CeedVectorRestoreArrayRead(v, &vv); // Assemble QFunction CeedOperatorSetQFunctionAssemblyReuse(op_diff, true); - CeedOperatorLinearAssembleQFunction(op_diff, &A, &elem_restr_lin_i, - CEED_REQUEST_IMMEDIATE); + CeedOperatorLinearAssembleQFunction(op_diff, &A, &elem_restr_lin_i, CEED_REQUEST_IMMEDIATE); // Second call will be no-op since SetQFunctionUpdated was not called CeedOperatorSetQFunctionAssemblyDataUpdateNeeded(op_diff, false); - CeedOperatorLinearAssembleQFunction(op_diff, &A, &elem_restr_lin_i, - CEED_REQUEST_IMMEDIATE); + CeedOperatorLinearAssembleQFunction(op_diff, &A, &elem_restr_lin_i, CEED_REQUEST_IMMEDIATE); // QFunction - apply assembled CeedQFunctionCreateInterior(ceed, 1, diff_lin, diff_lin_loc, &qf_diff_lin); CeedQFunctionAddInput(qf_diff_lin, "du", dim, CEED_EVAL_GRAD); - CeedQFunctionAddInput(qf_diff_lin, "qdata", dim*dim, CEED_EVAL_NONE); + CeedQFunctionAddInput(qf_diff_lin, "qdata", dim * dim, CEED_EVAL_NONE); CeedQFunctionAddOutput(qf_diff_lin, "dv", dim, CEED_EVAL_GRAD); // Operator - apply assembled - CeedOperatorCreate(ceed, qf_diff_lin, CEED_QFUNCTION_NONE, - CEED_QFUNCTION_NONE, &op_diff_lin); - CeedOperatorSetField(op_diff_lin, "du", elem_restr_u, basis_u, - CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op_diff_lin, "qdata", elem_restr_lin_i, - CEED_BASIS_COLLOCATED, A); - CeedOperatorSetField(op_diff_lin, "dv", elem_restr_u, basis_u, - CEED_VECTOR_ACTIVE); + CeedOperatorCreate(ceed, qf_diff_lin, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_diff_lin); + 
CeedOperatorSetField(op_diff_lin, "du", elem_restr_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_diff_lin, "qdata", elem_restr_lin_i, CEED_BASIS_COLLOCATED, A); + CeedOperatorSetField(op_diff_lin, "dv", elem_restr_u, basis_u, CEED_VECTOR_ACTIVE); // Apply new Poisson Operator CeedVectorSetValue(v, 0.0); @@ -140,11 +124,9 @@ int main(int argc, char **argv) { // Check output CeedVectorGetArrayRead(v, CEED_MEM_HOST, &vv); - for (CeedInt i=0; i 100.*CEED_EPSILON) - // LCOV_EXCL_START - printf("Error: Linearized operator computed v[i] = %f != 0.0\n", vv[i]); - // LCOV_EXCL_STOP + for (CeedInt i = 0; i < num_dofs; i++) { + if (fabs(vv[i]) > 100. * CEED_EPSILON) printf("Error: Linearized operator computed v[i] = %f != 0.0\n", vv[i]); + } CeedVectorRestoreArrayRead(v, &vv); // Cleanup diff --git a/tests/t531-operator.h b/tests/t531-operator.h index 4e02a4ada2..77b705947f 100644 --- a/tests/t531-operator.h +++ b/tests/t531-operator.h @@ -7,9 +7,7 @@ #include -CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) { +CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // At every quadrature point, compute qw/det(J).adj(J).adj(J)^T and store // the symmetric part of the result. 
@@ -21,24 +19,23 @@ CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, CeedScalar *qd = out[0]; // Quadrature point loop - for (CeedInt i=0; i -#include #include -#include "t532-operator.h" +#include int main(int argc, char **argv) { - Ceed ceed; - CeedElemRestriction elem_restr_x, elem_restr_u, - elem_restr_qd_mass_i, elem_restr_qd_diff_i, elem_restr_lin_i; - CeedBasis basis_x, basis_u; - CeedQFunction qf_setup_mass, qf_setup_diff, qf_apply, qf_apply_lin; - CeedOperator op_setup_mass, op_setup_diff, op_apply, op_apply_lin; - CeedVector q_data_mass, q_data_diff, X, A, u, v; - CeedInt num_elem = 6, P = 3, Q = 4, dim = 2; - CeedInt nx = 3, ny = 2; - CeedInt num_dofs = (nx*2+1)*(ny*2+1), num_qpts = num_elem*Q*Q; - CeedInt ind_x[num_elem*P*P]; - CeedScalar x[dim*num_dofs]; + Ceed ceed; + CeedElemRestriction elem_restr_x, elem_restr_u, elem_restr_qd_mass_i, elem_restr_qd_diff_i, elem_restr_lin_i; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup_mass, qf_setup_diff, qf_apply, qf_apply_lin; + CeedOperator op_setup_mass, op_setup_diff, op_apply, op_apply_lin; + CeedVector q_data_mass, q_data_diff, X, A, u, v; + CeedInt num_elem = 6, P = 3, Q = 4, dim = 2; + CeedInt nx = 3, ny = 2; + CeedInt num_dofs = (nx * 2 + 1) * (ny * 2 + 1), num_qpts = num_elem * Q * Q; + CeedInt ind_x[num_elem * P * P]; + CeedScalar x[dim * num_dofs]; CeedInit(argv[1], &ceed); // DoF Coordinates - for (CeedInt i=0; i 100.*CEED_EPSILON) - // LCOV_EXCL_START - printf("Error: True operator computed area = %f != 1.0\n", area); - // LCOV_EXCL_STOP + if (fabs(area - 1.0) > 100. 
* CEED_EPSILON) printf("Error: True operator computed area = %f != 1.0\n", area); // Assemble QFunction CeedOperatorSetQFunctionAssemblyReuse(op_apply, true); - CeedOperatorLinearAssembleQFunction(op_apply, &A, &elem_restr_lin_i, - CEED_REQUEST_IMMEDIATE); + CeedOperatorLinearAssembleQFunction(op_apply, &A, &elem_restr_lin_i, CEED_REQUEST_IMMEDIATE); // Second call will be no-op since SetQFunctionUpdated was not called CeedOperatorSetQFunctionAssemblyDataUpdateNeeded(op_apply, false); - CeedOperatorLinearAssembleQFunction(op_apply, &A, &elem_restr_lin_i, - CEED_REQUEST_IMMEDIATE); + CeedOperatorLinearAssembleQFunction(op_apply, &A, &elem_restr_lin_i, CEED_REQUEST_IMMEDIATE); // QFunction - apply assembled CeedQFunctionCreateInterior(ceed, 1, apply_lin, apply_lin_loc, &qf_apply_lin); CeedQFunctionAddInput(qf_apply_lin, "du", dim, CEED_EVAL_GRAD); - CeedQFunctionAddInput(qf_apply_lin, "qdata", (dim+1)*(dim+1), CEED_EVAL_NONE); + CeedQFunctionAddInput(qf_apply_lin, "qdata", (dim + 1) * (dim + 1), CEED_EVAL_NONE); CeedQFunctionAddInput(qf_apply_lin, "u", 1, CEED_EVAL_INTERP); CeedQFunctionAddOutput(qf_apply_lin, "v", 1, CEED_EVAL_INTERP); CeedQFunctionAddOutput(qf_apply_lin, "dv", dim, CEED_EVAL_GRAD); // Operator - apply assembled - CeedOperatorCreate(ceed, qf_apply_lin, CEED_QFUNCTION_NONE, - CEED_QFUNCTION_NONE, &op_apply_lin); - CeedOperatorSetField(op_apply_lin, "du", elem_restr_u, basis_u, - CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op_apply_lin, "qdata", elem_restr_lin_i, - CEED_BASIS_COLLOCATED, A); - CeedOperatorSetField(op_apply_lin, "u", elem_restr_u, basis_u, - CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op_apply_lin, "v", elem_restr_u, basis_u, - CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op_apply_lin, "dv", elem_restr_u, basis_u, - CEED_VECTOR_ACTIVE); + CeedOperatorCreate(ceed, qf_apply_lin, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_apply_lin); + CeedOperatorSetField(op_apply_lin, "du", elem_restr_u, basis_u, CEED_VECTOR_ACTIVE); + 
CeedOperatorSetField(op_apply_lin, "qdata", elem_restr_lin_i, CEED_BASIS_COLLOCATED, A); + CeedOperatorSetField(op_apply_lin, "u", elem_restr_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_apply_lin, "v", elem_restr_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_apply_lin, "dv", elem_restr_u, basis_u, CEED_VECTOR_ACTIVE); // Apply assembled QFunction operator CeedVectorSetValue(v, 0.0); @@ -184,13 +153,9 @@ int main(int argc, char **argv) { // Check output area = 0.0; CeedVectorGetArrayRead(v, CEED_MEM_HOST, &vv); - for (CeedInt i=0; i 100.*CEED_EPSILON) - // LCOV_EXCL_START - printf("Error: Assembled operator computed area = %f != 1.0\n", area); - // LCOV_EXCL_STOP + if (fabs(area - 1.0) > 100. * CEED_EPSILON) printf("Error: Assembled operator computed area = %f != 1.0\n", area); // Cleanup CeedQFunctionDestroy(&qf_setup_mass); diff --git a/tests/t532-operator.h b/tests/t532-operator.h index 9ad804ebe7..567cbe8416 100644 --- a/tests/t532-operator.h +++ b/tests/t532-operator.h @@ -7,20 +7,16 @@ #include -CEED_QFUNCTION(setup_mass)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) { +CEED_QFUNCTION(setup_mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { const CeedScalar *J = in[0], *weight = in[1]; - CeedScalar *rho = out[0]; - for (CeedInt i=0; i -#include #include +#include + #include "t510-operator.h" int main(int argc, char **argv) { - Ceed ceed; - CeedElemRestriction elem_restr_x, elem_restr_u, - elem_restr_qd_i; - CeedBasis basis_x, basis_u; - CeedQFunction qf_setup, qf_mass; - CeedOperator op_setup, op_mass; - CeedVector q_data, X, A, U, V; - CeedInt num_elem = 6, P = 3, Q = 4, dim = 2; - CeedInt nx = 3, ny = 2; - CeedInt num_dofs = (nx*2+1)*(ny*2+1), num_qpts = num_elem*Q*Q; - CeedInt ind_x[num_elem*P*P]; - CeedScalar x[dim*num_dofs], assembled_true[num_dofs]; - CeedScalar *u; - const CeedScalar *a, *v; + Ceed ceed; + CeedElemRestriction elem_restr_x, 
elem_restr_u, elem_restr_qd_i; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass; + CeedVector q_data, X, A, U, V; + CeedInt num_elem = 6, P = 3, Q = 4, dim = 2; + CeedInt nx = 3, ny = 2; + CeedInt num_dofs = (nx * 2 + 1) * (ny * 2 + 1), num_qpts = num_elem * Q * Q; + CeedInt ind_x[num_elem * P * P]; + CeedScalar x[dim * num_dofs], assembled_true[num_dofs]; + CeedScalar *u; + const CeedScalar *a, *v; CeedInit(argv[1], &ceed); // DoF Coordinates - for (CeedInt i=0; i 100.*CEED_EPSILON) + for (int i = 0; i < num_dofs; i++) + if (fabs(a[i] - assembled_true[i]) > 100. * CEED_EPSILON) // LCOV_EXCL_START - printf("[%" CeedInt_FMT "] Error in assembly: %f != %f\n", i, a[i], - assembled_true[i]); + printf("[%" CeedInt_FMT "] Error in assembly: %f != %f\n", i, a[i], assembled_true[i]); // LCOV_EXCL_STOP CeedVectorRestoreArrayRead(A, &a); diff --git a/tests/t534-operator.c b/tests/t534-operator.c index ec0525a189..72639dd555 100644 --- a/tests/t534-operator.c +++ b/tests/t534-operator.c @@ -1,62 +1,59 @@ /// @file /// Test assembly of Poisson operator diagonal /// \test Test assembly of Poisson operator diagonal +#include "t534-operator.h" + #include -#include #include -#include "t534-operator.h" +#include int main(int argc, char **argv) { - Ceed ceed; - CeedElemRestriction elem_restr_x, elem_restr_u, - elem_restr_qd_i; - CeedBasis basis_x, basis_u; - CeedQFunction qf_setup, qf_diff; - CeedOperator op_setup, op_diff; - CeedVector q_data, X, A, U, V; - CeedInt num_elem = 6, P = 3, Q = 4, dim = 2; - CeedInt n_x = 3, n_y = 2; - CeedInt num_dofs = (n_x*2+1)*(n_y*2+1), num_qpts = num_elem*Q*Q; - CeedInt ind_x[num_elem*P*P]; - CeedScalar x[dim*num_dofs], assembled_true[num_dofs]; - CeedScalar *u; - const CeedScalar *a, *v; + Ceed ceed; + CeedElemRestriction elem_restr_x, elem_restr_u, elem_restr_qd_i; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_diff; + CeedOperator op_setup, op_diff; + CeedVector q_data, X, A, U, V; + 
CeedInt num_elem = 6, P = 3, Q = 4, dim = 2; + CeedInt n_x = 3, n_y = 2; + CeedInt num_dofs = (n_x * 2 + 1) * (n_y * 2 + 1), num_qpts = num_elem * Q * Q; + CeedInt ind_x[num_elem * P * P]; + CeedScalar x[dim * num_dofs], assembled_true[num_dofs]; + CeedScalar *u; + const CeedScalar *a, *v; CeedInit(argv[1], &ceed); // DoF Coordinates - for (CeedInt i=0; i 1000.*CEED_EPSILON) - // LCOV_EXCL_START - printf("[%" CeedInt_FMT "] Error in assembly: %f != %f\n", i, a[i], - assembled_true[i]); - // LCOV_EXCL_STOP + for (int i = 0; i < num_dofs; i++) { + if (fabs(a[i] - assembled_true[i]) > 1000. * CEED_EPSILON) printf("[%" CeedInt_FMT "] Error in assembly: %f != %f\n", i, a[i], assembled_true[i]); + } CeedVectorRestoreArrayRead(A, &a); // Cleanup diff --git a/tests/t534-operator.h b/tests/t534-operator.h index 7a76f45ac4..35e0426776 100644 --- a/tests/t534-operator.h +++ b/tests/t534-operator.h @@ -7,9 +7,7 @@ #include -CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) { +CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // At every quadrature point, compute qw/det(J).adj(J).adj(J)^T and store // the symmetric part of the result. 
@@ -21,24 +19,23 @@ CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, CeedScalar *qd = out[0]; // Quadrature point loop - for (CeedInt i=0; i -#include #include -#include "t535-operator.h" +#include int main(int argc, char **argv) { - Ceed ceed; - CeedElemRestriction elem_restr_x, elem_restr_u, - elem_restr_qd_mass_i, elem_restr_qd_diff_i; - CeedBasis basis_x, basis_u; - CeedQFunction qf_setup_mass, qf_setup_diff, qf_apply; - CeedOperator op_setup_mass, op_setup_diff, op_apply; - CeedVector q_data_mass, q_data_diff, X, A, U, V; - CeedInt num_elem = 6, P = 3, Q = 4, dim = 2; - CeedInt n_x = 3, n_y = 2; - CeedInt num_dofs = (n_x*2+1)*(n_y*2+1), num_qpts = num_elem*Q*Q; - CeedInt ind_x[num_elem*P*P]; - CeedScalar x[dim*num_dofs], assembled_true[num_dofs]; - CeedScalar *u; - const CeedScalar *a, *v; + Ceed ceed; + CeedElemRestriction elem_restr_x, elem_restr_u, elem_restr_qd_mass_i, elem_restr_qd_diff_i; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup_mass, qf_setup_diff, qf_apply; + CeedOperator op_setup_mass, op_setup_diff, op_apply; + CeedVector q_data_mass, q_data_diff, X, A, U, V; + CeedInt num_elem = 6, P = 3, Q = 4, dim = 2; + CeedInt n_x = 3, n_y = 2; + CeedInt num_dofs = (n_x * 2 + 1) * (n_y * 2 + 1), num_qpts = num_elem * Q * Q; + CeedInt ind_x[num_elem * P * P]; + CeedScalar x[dim * num_dofs], assembled_true[num_dofs]; + CeedScalar *u; + const CeedScalar *a, *v; CeedInit(argv[1], &ceed); // DoF Coordinates - for (CeedInt i=0; i 100.*CEED_EPSILON) - // LCOV_EXCL_START - printf("[%" CeedInt_FMT "] Error in assembly: %f != %f\n", i, a[i], - assembled_true[i]); - // LCOV_EXCL_STOP + for (int i = 0; i < num_dofs; i++) { + if (fabs(a[i] - assembled_true[i]) > 100. 
* CEED_EPSILON) printf("[%" CeedInt_FMT "] Error in assembly: %f != %f\n", i, a[i], assembled_true[i]); + } CeedVectorRestoreArrayRead(A, &a); // Cleanup diff --git a/tests/t535-operator.h b/tests/t535-operator.h index 3fee235b51..454c48ba17 100644 --- a/tests/t535-operator.h +++ b/tests/t535-operator.h @@ -7,20 +7,16 @@ #include -CEED_QFUNCTION(setup_mass)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) { +CEED_QFUNCTION(setup_mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { const CeedScalar *J = in[0], *weight = in[1]; - CeedScalar *rho = out[0]; - for (CeedInt i=0; i -#include #include +#include + #include "t320-basis.h" #include "t535-operator.h" int main(int argc, char **argv) { - Ceed ceed; - CeedElemRestriction elem_restr_x, elem_restr_u, - elem_restr_qd_mass_i, elem_restr_qd_diff_i; - CeedBasis basis_x, basis_u; - CeedQFunction qf_setup_mass, qf_setup_diff, qf_apply; - CeedOperator op_setup_mass, op_setup_diff, op_apply; - CeedVector q_data_mass, q_data_diff, X, A, U, V; - CeedInt num_elem = 12, dim = 2, P = 6, Q = 4; - CeedInt n_x = 3, n_y = 2; - CeedInt row, col, offset; - CeedInt num_dofs = (n_x*2+1)*(n_y*2+1), num_qpts = num_elem*Q; - CeedInt ind_x[num_elem*P*P]; - CeedScalar x[dim*num_dofs], assembled_true[num_dofs]; - CeedScalar q_ref[dim*Q], q_weight[Q]; - CeedScalar interp[P*Q], grad[dim*P*Q]; - CeedScalar *u; - const CeedScalar *a, *v; + Ceed ceed; + CeedElemRestriction elem_restr_x, elem_restr_u, elem_restr_qd_mass_i, elem_restr_qd_diff_i; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup_mass, qf_setup_diff, qf_apply; + CeedOperator op_setup_mass, op_setup_diff, op_apply; + CeedVector q_data_mass, q_data_diff, X, A, U, V; + CeedInt num_elem = 12, dim = 2, P = 6, Q = 4; + CeedInt n_x = 3, n_y = 2; + CeedInt row, col, offset; + CeedInt num_dofs = (n_x * 2 + 1) * (n_y * 2 + 1), num_qpts = num_elem * Q; + CeedInt ind_x[num_elem * P * P]; + CeedScalar x[dim * num_dofs], 
assembled_true[num_dofs]; + CeedScalar q_ref[dim * Q], q_weight[Q]; + CeedScalar interp[P * Q], grad[dim * P * Q]; + CeedScalar *u; + const CeedScalar *a, *v; CeedInit(argv[1], &ceed); // DoF Coordinates - for (CeedInt i=0; i 100.*CEED_EPSILON) - // LCOV_EXCL_START - printf("[%" CeedInt_FMT "] Error in assembly: %f != %f\n", i, a[i], - assembled_true[i]); - // LCOV_EXCL_STOP + for (int i = 0; i < num_dofs; i++) { + if (fabs(a[i] - assembled_true[i]) > 100. * CEED_EPSILON) printf("[%" CeedInt_FMT "] Error in assembly: %f != %f\n", i, a[i], assembled_true[i]); + } CeedVectorRestoreArrayRead(A, &a); // Cleanup diff --git a/tests/t537-operator.c b/tests/t537-operator.c index c0fe3dc43b..8a72268613 100644 --- a/tests/t537-operator.c +++ b/tests/t537-operator.c @@ -1,71 +1,67 @@ /// @file /// Test assembly of mass matrix operator point block diagonal /// \test Test assembly of mass matrix operator point block diagonal +#include "t537-operator.h" + #include -#include #include -#include "t537-operator.h" +#include int main(int argc, char **argv) { - Ceed ceed; - CeedElemRestriction elem_restr_x, elem_restr_u, - elem_restr_qd_i; - CeedBasis basis_x, basis_u; - CeedQFunction qf_setup, qf_mass; - CeedOperator op_setup, op_mass; - CeedVector q_data, X, A, U, V; - CeedInt num_elem = 6, P = 3, Q = 4, dim = 2, num_comp = 2; - CeedInt nx = 3, ny = 2; - CeedInt num_dofs = (nx*2+1)*(ny*2+1), num_qpts = num_elem*Q*Q; - CeedInt ind_x[num_elem*P*P]; - CeedScalar x[dim*num_dofs], assembled_true[num_comp*num_comp*num_dofs]; - CeedScalar *u; - const CeedScalar *a, *v; + Ceed ceed; + CeedElemRestriction elem_restr_x, elem_restr_u, elem_restr_qd_i; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass; + CeedVector q_data, X, A, U, V; + CeedInt num_elem = 6, P = 3, Q = 4, dim = 2, num_comp = 2; + CeedInt nx = 3, ny = 2; + CeedInt num_dofs = (nx * 2 + 1) * (ny * 2 + 1), num_qpts = num_elem * Q * Q; + CeedInt ind_x[num_elem * P * P]; + CeedScalar 
x[dim * num_dofs], assembled_true[num_comp * num_comp * num_dofs]; + CeedScalar *u; + const CeedScalar *a, *v; CeedInit(argv[1], &ceed); // DoF Coordinates - for (CeedInt i=0; i 0) - u[ind_old] = 0.0; + CeedInt ind = i + j * num_dofs; + u[ind] = 1.0; + if (ind > 0) u[ind_old] = 0.0; ind_old = ind; CeedVectorRestoreArray(U, &u); @@ -120,20 +108,16 @@ int main(int argc, char **argv) { // Retrieve entry CeedVectorGetArrayRead(V, CEED_MEM_HOST, &v); - for (int k = 0; k 100.*CEED_EPSILON) - // LCOV_EXCL_START - printf("[%" CeedInt_FMT "] Error in assembly: %f != %f\n", i, a[i], - assembled_true[i]); - // LCOV_EXCL_STOP + for (int i = 0; i < num_comp * num_comp * num_dofs; i++) { + if (fabs(a[i] - assembled_true[i]) > 100. * CEED_EPSILON) printf("[%" CeedInt_FMT "] Error in assembly: %f != %f\n", i, a[i], assembled_true[i]); + } CeedVectorRestoreArrayRead(A, &a); // Cleanup diff --git a/tests/t537-operator.h b/tests/t537-operator.h index 05f5fbc881..807b03aeba 100644 --- a/tests/t537-operator.h +++ b/tests/t537-operator.h @@ -7,24 +7,20 @@ #include -CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) { +CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { const CeedScalar *weight = in[0], *J = in[1]; - CeedScalar *rho = out[0]; - for (CeedInt i=0; i -#include #include +#include int main(int argc, char **argv) { - Ceed ceed; - CeedElemRestriction elem_restr_x, elem_restr_u, - elem_restr_qd_mass_i, elem_restr_qd_diff_i; - CeedBasis basis_x, basis_u; - CeedQFunction qf_setup_mass, qf_mass, qf_setup_diff, qf_diff; - CeedOperator op_setup_mass, op_mass, op_setup_diff, op_diff, op_apply; - CeedVector q_data_mass, q_data_diff, X, A, U, V; - CeedInt num_elem = 6, P = 3, Q = 4, dim = 2; - CeedInt n_x = 3, n_y = 2; - CeedInt num_dofs = (n_x*2+1)*(n_y*2+1), num_qpts = num_elem*Q*Q; - CeedInt ind_x[num_elem*P*P]; - CeedScalar x[dim*num_dofs], assembled_true[num_dofs]; - 
CeedScalar *u; - const CeedScalar *a, *v; + Ceed ceed; + CeedElemRestriction elem_restr_x, elem_restr_u, elem_restr_qd_mass_i, elem_restr_qd_diff_i; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup_mass, qf_mass, qf_setup_diff, qf_diff; + CeedOperator op_setup_mass, op_mass, op_setup_diff, op_diff, op_apply; + CeedVector q_data_mass, q_data_diff, X, A, U, V; + CeedInt num_elem = 6, P = 3, Q = 4, dim = 2; + CeedInt n_x = 3, n_y = 2; + CeedInt num_dofs = (n_x * 2 + 1) * (n_y * 2 + 1), num_qpts = num_elem * Q * Q; + CeedInt ind_x[num_elem * P * P]; + CeedScalar x[dim * num_dofs], assembled_true[num_dofs]; + CeedScalar *u; + const CeedScalar *a, *v; CeedInit(argv[1], &ceed); // DoF Coordinates - for (CeedInt i=0; i 1000.*CEED_EPSILON) - // LCOV_EXCL_START - printf("[%" CeedInt_FMT "] Error in assembly: %f != %f\n", i, a[i], - assembled_true[i]); - // LCOV_EXCL_STOP + for (int i = 0; i < num_dofs; i++) { + if (fabs(a[i] - assembled_true[i]) > 1000. * CEED_EPSILON) printf("[%" CeedInt_FMT "] Error in assembly: %f != %f\n", i, a[i], assembled_true[i]); + } CeedVectorRestoreArrayRead(A, &a); // Cleanup diff --git a/tests/t540-operator.c b/tests/t540-operator.c index 76c428da13..705f2119df 100644 --- a/tests/t540-operator.c +++ b/tests/t540-operator.c @@ -1,32 +1,34 @@ /// @file /// Test creation and use of FDM element inverse /// \test Test creation and use of FDM element inverse +#include "t540-operator.h" + #include -#include #include -#include "t540-operator.h" +#include int main(int argc, char **argv) { - Ceed ceed; + Ceed ceed; CeedElemRestriction elem_restr_x_i, elem_restr_u_i, elem_restr_qd_i; - CeedBasis basis_x, basis_u; - CeedQFunction qf_setup_mass, qf_apply; - CeedOperator op_setup_mass, op_apply, op_inv; - CeedVector q_data_mass, X, U, V; - CeedInt num_elem = 1, P = 4, Q = 5, dim = 2; - CeedInt num_dofs = P*P, num_qpts = num_elem*Q*Q; - CeedScalar x[dim*num_elem*(2*2)]; - const CeedScalar *u; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup_mass, 
qf_apply; + CeedOperator op_setup_mass, op_apply, op_inv; + CeedVector q_data_mass, X, U, V; + CeedInt num_elem = 1, P = 4, Q = 5, dim = 2; + CeedInt num_dofs = P * P, num_qpts = num_elem * Q * Q; + CeedScalar x[dim * num_elem * (2 * 2)]; + const CeedScalar *u; CeedInit(argv[1], &ceed); // DoF Coordinates - for (CeedInt i=0; i<2; i++) - for (CeedInt j=0; j<2; j++) { - x[i+j*2+0*4] = i; - x[i+j*2+1*4] = j; + for (CeedInt i = 0; i < 2; i++) { + for (CeedInt j = 0; j < 2; j++) { + x[i + j * 2 + 0 * 4] = i; + x[i + j * 2 + 1 * 4] = j; } - CeedVectorCreate(ceed, dim*num_elem*(2*2), &X); + } + CeedVectorCreate(ceed, dim * num_elem * (2 * 2), &X); CeedVectorSetArray(X, CEED_MEM_HOST, CEED_USE_POINTER, x); // Qdata Vector @@ -35,38 +37,30 @@ int main(int argc, char **argv) { // Element Setup // Restrictions - CeedInt strides_x[3] = {1, 2*2, 2*2*dim}; - CeedElemRestrictionCreateStrided(ceed, num_elem, 2*2, dim, dim*num_elem*2*2, - strides_x, &elem_restr_x_i); + CeedInt strides_x[3] = {1, 2 * 2, 2 * 2 * dim}; + CeedElemRestrictionCreateStrided(ceed, num_elem, 2 * 2, dim, dim * num_elem * 2 * 2, strides_x, &elem_restr_x_i); - CeedInt strides_u[3] = {1, P*P, P*P}; - CeedElemRestrictionCreateStrided(ceed, num_elem, P*P, 1, num_dofs, strides_u, - &elem_restr_u_i); + CeedInt strides_u[3] = {1, P * P, P * P}; + CeedElemRestrictionCreateStrided(ceed, num_elem, P * P, 1, num_dofs, strides_u, &elem_restr_u_i); - CeedInt strides_qd[3] = {1, Q*Q, Q*Q}; - CeedElemRestrictionCreateStrided(ceed, num_elem, Q*Q, 1, num_qpts, strides_qd, - &elem_restr_qd_i); + CeedInt strides_qd[3] = {1, Q * Q, Q * Q}; + CeedElemRestrictionCreateStrided(ceed, num_elem, Q * Q, 1, num_qpts, strides_qd, &elem_restr_qd_i); // Bases CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, 2, Q, CEED_GAUSS, &basis_x); CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, P, Q, CEED_GAUSS, &basis_u); // QFunction - setup mass - CeedQFunctionCreateInterior(ceed, 1, setup_mass, setup_mass_loc, - &qf_setup_mass); - 
CeedQFunctionAddInput(qf_setup_mass, "dx", dim*dim, CEED_EVAL_GRAD); + CeedQFunctionCreateInterior(ceed, 1, setup_mass, setup_mass_loc, &qf_setup_mass); + CeedQFunctionAddInput(qf_setup_mass, "dx", dim * dim, CEED_EVAL_GRAD); CeedQFunctionAddInput(qf_setup_mass, "weight", 1, CEED_EVAL_WEIGHT); CeedQFunctionAddOutput(qf_setup_mass, "qdata", 1, CEED_EVAL_NONE); // Operator - setup mass - CeedOperatorCreate(ceed, qf_setup_mass, CEED_QFUNCTION_NONE, - CEED_QFUNCTION_NONE, &op_setup_mass); - CeedOperatorSetField(op_setup_mass, "dx", elem_restr_x_i, basis_x, - CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op_setup_mass, "weight", CEED_ELEMRESTRICTION_NONE, - basis_x, CEED_VECTOR_NONE); - CeedOperatorSetField(op_setup_mass, "qdata", elem_restr_qd_i, - CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE); + CeedOperatorCreate(ceed, qf_setup_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup_mass); + CeedOperatorSetField(op_setup_mass, "dx", elem_restr_x_i, basis_x, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup_mass, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup_mass, "qdata", elem_restr_qd_i, CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE); // Apply Setup Operator CeedOperatorApply(op_setup_mass, X, q_data_mass, CEED_REQUEST_IMMEDIATE); @@ -78,14 +72,10 @@ int main(int argc, char **argv) { CeedQFunctionAddOutput(qf_apply, "v", 1, CEED_EVAL_INTERP); // Operator - apply - CeedOperatorCreate(ceed, qf_apply, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, - &op_apply); - CeedOperatorSetField(op_apply, "u", elem_restr_u_i, basis_u, - CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op_apply, "mass qdata", elem_restr_qd_i, - CEED_BASIS_COLLOCATED, q_data_mass); - CeedOperatorSetField(op_apply, "v", elem_restr_u_i, basis_u, - CEED_VECTOR_ACTIVE); + CeedOperatorCreate(ceed, qf_apply, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_apply); + CeedOperatorSetField(op_apply, "u", elem_restr_u_i, basis_u, CEED_VECTOR_ACTIVE); + 
CeedOperatorSetField(op_apply, "mass qdata", elem_restr_qd_i, CEED_BASIS_COLLOCATED, q_data_mass); + CeedOperatorSetField(op_apply, "v", elem_restr_u_i, basis_u, CEED_VECTOR_ACTIVE); // Apply original operator CeedVectorCreate(ceed, num_dofs, &U); @@ -102,12 +92,9 @@ int main(int argc, char **argv) { // Check output CeedVectorGetArrayRead(U, CEED_MEM_HOST, &u); - for (int i=0; i 500.*CEED_EPSILON) - // LCOV_EXCL_START - printf("[%" CeedInt_FMT "] Error in inverse: %e - 1.0 = %e\n", - i, u[i], u[i] - 1.); - // LCOV_EXCL_STOP + for (int i = 0; i < num_dofs; i++) { + if (fabs(u[i] - 1.0) > 500. * CEED_EPSILON) printf("[%" CeedInt_FMT "] Error in inverse: %e - 1.0 = %e\n", i, u[i], u[i] - 1.); + } CeedVectorRestoreArrayRead(U, &u); // Cleanup diff --git a/tests/t540-operator.h b/tests/t540-operator.h index 91c10583f8..1dc169e617 100644 --- a/tests/t540-operator.h +++ b/tests/t540-operator.h @@ -7,19 +7,16 @@ #include -CEED_QFUNCTION(setup_mass)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) { +CEED_QFUNCTION(setup_mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { const CeedScalar *J = in[0], *weight = in[1]; - CeedScalar *rho = out[0]; - for (CeedInt i=0; i +#include #include #include -#include -#include "t541-operator.h" int main(int argc, char **argv) { - Ceed ceed; + Ceed ceed; CeedElemRestriction elem_restr_x_i, elem_restr_u_i, elem_restr_qd_i; - CeedBasis basis_x, basis_u; - CeedQFunction qf_setup_diff, qf_apply; - CeedOperator op_setup_diff, op_apply, op_inv; - CeedVector q_data_diff, X, U, V, W; - CeedInt num_elem = 1, P = 4, Q = 5, dim = 2; - CeedInt num_dofs = P*P, num_qpts = num_elem*Q*Q, q_data_size = dim*(dim+1)/2; - CeedScalar x[dim*num_elem*(2*2)]; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup_diff, qf_apply; + CeedOperator op_setup_diff, op_apply, op_inv; + CeedVector q_data_diff, X, U, V, W; + CeedInt num_elem = 1, P = 4, Q = 5, dim = 2; + CeedInt num_dofs = P * P, 
num_qpts = num_elem * Q * Q, q_data_size = dim * (dim + 1) / 2; + CeedScalar x[dim * num_elem * (2 * 2)]; CeedInit(argv[1], &ceed); // Test skipped if using single precision - if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { - return CeedError(ceed, CEED_ERROR_UNSUPPORTED, - "Test not implemented in single precision"); - } + if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Test not implemented in single precision"); // DoF Coordinates - for (CeedInt i=0; i<2; i++) - for (CeedInt j=0; j<2; j++) { - x[i+j*2+0*4] = i; - x[i+j*2+1*4] = j; + for (CeedInt i = 0; i < 2; i++) { + for (CeedInt j = 0; j < 2; j++) { + x[i + j * 2 + 0 * 4] = i; + x[i + j * 2 + 1 * 4] = j; } - CeedVectorCreate(ceed, dim*num_elem*(2*2), &X); + } + CeedVectorCreate(ceed, dim * num_elem * (2 * 2), &X); CeedVectorSetArray(X, CEED_MEM_HOST, CEED_USE_POINTER, x); // Qdata Vector - CeedVectorCreate(ceed, q_data_size*num_qpts, &q_data_diff); + CeedVectorCreate(ceed, q_data_size * num_qpts, &q_data_diff); // Element Setup // Restrictions - CeedInt strides_x[3] = {1, 2*2, 2*2*dim}; - CeedElemRestrictionCreateStrided(ceed, num_elem, 2*2, dim, dim*num_elem*2*2, - strides_x, &elem_restr_x_i); + CeedInt strides_x[3] = {1, 2 * 2, 2 * 2 * dim}; + CeedElemRestrictionCreateStrided(ceed, num_elem, 2 * 2, dim, dim * num_elem * 2 * 2, strides_x, &elem_restr_x_i); - CeedInt strides_u[3] = {1, P*P, P*P}; - CeedElemRestrictionCreateStrided(ceed, num_elem, P*P, 1, num_dofs, strides_u, - &elem_restr_u_i); + CeedInt strides_u[3] = {1, P * P, P * P}; + CeedElemRestrictionCreateStrided(ceed, num_elem, P * P, 1, num_dofs, strides_u, &elem_restr_u_i); - CeedInt strides_qd[3] = {1, Q*Q, q_data_size *Q*Q}; - CeedElemRestrictionCreateStrided(ceed, num_elem, Q*Q, q_data_size, - num_qpts*q_data_size, strides_qd, &elem_restr_qd_i); + CeedInt strides_qd[3] = {1, Q * Q, q_data_size * Q * Q}; + CeedElemRestrictionCreateStrided(ceed, num_elem, Q * Q, q_data_size, num_qpts * q_data_size, strides_qd, 
&elem_restr_qd_i); // Bases CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, 2, Q, CEED_GAUSS, &basis_x); CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, P, Q, CEED_GAUSS, &basis_u); // QFunction - setup diff - CeedQFunctionCreateInterior(ceed, 1, setup_diff, setup_diff_loc, - &qf_setup_diff); - CeedQFunctionAddInput(qf_setup_diff, "dx", dim*dim, CEED_EVAL_GRAD); + CeedQFunctionCreateInterior(ceed, 1, setup_diff, setup_diff_loc, &qf_setup_diff); + CeedQFunctionAddInput(qf_setup_diff, "dx", dim * dim, CEED_EVAL_GRAD); CeedQFunctionAddInput(qf_setup_diff, "weight", 1, CEED_EVAL_WEIGHT); CeedQFunctionAddOutput(qf_setup_diff, "qdata", q_data_size, CEED_EVAL_NONE); // Operator - setup diff - CeedOperatorCreate(ceed, qf_setup_diff, CEED_QFUNCTION_NONE, - CEED_QFUNCTION_NONE, &op_setup_diff); - CeedOperatorSetField(op_setup_diff, "dx", elem_restr_x_i, basis_x, - CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op_setup_diff, "weight", CEED_ELEMRESTRICTION_NONE, - basis_x, CEED_VECTOR_NONE); - CeedOperatorSetField(op_setup_diff, "qdata", elem_restr_qd_i, - CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE); + CeedOperatorCreate(ceed, qf_setup_diff, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup_diff); + CeedOperatorSetField(op_setup_diff, "dx", elem_restr_x_i, basis_x, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_setup_diff, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); + CeedOperatorSetField(op_setup_diff, "qdata", elem_restr_qd_i, CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE); // Apply Setup Operator CeedOperatorApply(op_setup_diff, X, q_data_diff, CEED_REQUEST_IMMEDIATE); @@ -84,14 +75,10 @@ int main(int argc, char **argv) { CeedQFunctionAddOutput(qf_apply, "v", dim, CEED_EVAL_GRAD); // Operator - apply - CeedOperatorCreate(ceed, qf_apply, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, - &op_apply); - CeedOperatorSetField(op_apply, "u", elem_restr_u_i, basis_u, - CEED_VECTOR_ACTIVE); - CeedOperatorSetField(op_apply, "qdata_diff", elem_restr_qd_i, - 
CEED_BASIS_COLLOCATED, q_data_diff); - CeedOperatorSetField(op_apply, "v", elem_restr_u_i, basis_u, - CEED_VECTOR_ACTIVE); + CeedOperatorCreate(ceed, qf_apply, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_apply); + CeedOperatorSetField(op_apply, "u", elem_restr_u_i, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetField(op_apply, "qdata_diff", elem_restr_qd_i, CEED_BASIS_COLLOCATED, q_data_diff); + CeedOperatorSetField(op_apply, "v", elem_restr_u_i, basis_u, CEED_VECTOR_ACTIVE); // Create FDM element inverse CeedOperatorCreateFDMElementInverse(op_apply, &op_inv, CEED_REQUEST_IMMEDIATE); @@ -108,10 +95,18 @@ int main(int argc, char **argv) { CeedVectorSetValue(U, 0.0); CeedVectorGetArray(U, CEED_MEM_HOST, &u); switch (i) { - case 0: u[0] = 1.0; break; - case 1: u[P-1] = 1.0; break; - case 2: u[P*P-P] = 1.0; break; - case 3: u[P*P-1] = 1.0; break; + case 0: + u[0] = 1.0; + break; + case 1: + u[P - 1] = 1.0; + break; + case 2: + u[P * P - P] = 1.0; + break; + case 3: + u[P * P - 1] = 1.0; + break; } CeedVectorRestoreArray(U, &u); @@ -119,131 +114,50 @@ int main(int argc, char **argv) { const CeedScalar *v; CeedVectorGetArrayRead(V, CEED_MEM_HOST, &v); - S[0*4+i] = -v[0]; - S[1*4+i] = -v[P-1]; - S[2*4+i] = -v[P*P-P]; - S[3*4+i] = -v[P*P-1]; + S[0 * 4 + i] = -v[0]; + S[1 * 4 + i] = -v[P - 1]; + S[2 * 4 + i] = -v[P * P - P]; + S[3 * 4 + i] = -v[P * P - 1]; CeedVectorRestoreArrayRead(V, &v); } CeedScalar S_inv[16]; { CeedScalar det; - S_inv[0] = S[5] * S[10] * S[15] - - S[5] * S[11] * S[14] - - S[9] * S[6] * S[15] + - S[9] * S[7] * S[14] + - S[13] * S[6] * S[11] - - S[13] * S[7] * S[10]; - - S_inv[4] = -S[4] * S[10] * S[15] + - S[4] * S[11] * S[14] + - S[8] * S[6] * S[15] - - S[8] * S[7] * S[14] - - S[12] * S[6] * S[11] + - S[12] * S[7] * S[10]; - - S_inv[8] = S[4] * S[9] * S[15] - - S[4] * S[11] * S[13] - - S[8] * S[5] * S[15] + - S[8] * S[7] * S[13] + - S[12] * S[5] * S[11] - - S[12] * S[7] * S[9]; - - S_inv[12] = -S[4] * S[9] * S[14] + - S[4] * S[10] * S[13] + - S[8] * 
S[5] * S[14] - - S[8] * S[6] * S[13] - - S[12] * S[5] * S[10] + - S[12] * S[6] * S[9]; - - S_inv[1] = -S[1] * S[10] * S[15] + - S[1] * S[11] * S[14] + - S[9] * S[2] * S[15] - - S[9] * S[3] * S[14] - - S[13] * S[2] * S[11] + - S[13] * S[3] * S[10]; - - S_inv[5] = S[0] * S[10] * S[15] - - S[0] * S[11] * S[14] - - S[8] * S[2] * S[15] + - S[8] * S[3] * S[14] + - S[12] * S[2] * S[11] - - S[12] * S[3] * S[10]; - - S_inv[9] = -S[0] * S[9] * S[15] + - S[0] * S[11] * S[13] + - S[8] * S[1] * S[15] - - S[8] * S[3] * S[13] - - S[12] * S[1] * S[11] + - S[12] * S[3] * S[9]; - - S_inv[13] = S[0] * S[9] * S[14] - - S[0] * S[10] * S[13] - - S[8] * S[1] * S[14] + - S[8] * S[2] * S[13] + - S[12] * S[1] * S[10] - - S[12] * S[2] * S[9]; - - S_inv[2] = S[1] * S[6] * S[15] - - S[1] * S[7] * S[14] - - S[5] * S[2] * S[15] + - S[5] * S[3] * S[14] + - S[13] * S[2] * S[7] - - S[13] * S[3] * S[6]; - - S_inv[6] = -S[0] * S[6] * S[15] + - S[0] * S[7] * S[14] + - S[4] * S[2] * S[15] - - S[4] * S[3] * S[14] - - S[12] * S[2] * S[7] + - S[12] * S[3] * S[6]; - - S_inv[10] = S[0] * S[5] * S[15] - - S[0] * S[7] * S[13] - - S[4] * S[1] * S[15] + - S[4] * S[3] * S[13] + - S[12] * S[1] * S[7] - - S[12] * S[3] * S[5]; - - S_inv[14] = -S[0] * S[5] * S[14] + - S[0] * S[6] * S[13] + - S[4] * S[1] * S[14] - - S[4] * S[2] * S[13] - - S[12] * S[1] * S[6] + - S[12] * S[2] * S[5]; - - S_inv[3] = -S[1] * S[6] * S[11] + - S[1] * S[7] * S[10] + - S[5] * S[2] * S[11] - - S[5] * S[3] * S[10] - - S[9] * S[2] * S[7] + - S[9] * S[3] * S[6]; - - S_inv[7] = S[0] * S[6] * S[11] - - S[0] * S[7] * S[10] - - S[4] * S[2] * S[11] + - S[4] * S[3] * S[10] + - S[8] * S[2] * S[7] - - S[8] * S[3] * S[6]; - - S_inv[11] = -S[0] * S[5] * S[11] + - S[0] * S[7] * S[9] + - S[4] * S[1] * S[11] - - S[4] * S[3] * S[9] - - S[8] * S[1] * S[7] + - S[8] * S[3] * S[5]; - - S_inv[15] = S[0] * S[5] * S[10] - - S[0] * S[6] * S[9] - - S[4] * S[1] * S[10] + - S[4] * S[2] * S[9] + - S[8] * S[1] * S[6] - - S[8] * S[2] * S[5]; - - det = 1/(S[0]*S_inv[0] + 
S[1]*S_inv[4] + S[2]*S_inv[8] + S[3]*S_inv[12]); - - for (CeedInt i = 0; i < 16; i++) - S_inv[i] *= det; + S_inv[0] = S[5] * S[10] * S[15] - S[5] * S[11] * S[14] - S[9] * S[6] * S[15] + S[9] * S[7] * S[14] + S[13] * S[6] * S[11] - S[13] * S[7] * S[10]; + + S_inv[4] = -S[4] * S[10] * S[15] + S[4] * S[11] * S[14] + S[8] * S[6] * S[15] - S[8] * S[7] * S[14] - S[12] * S[6] * S[11] + S[12] * S[7] * S[10]; + + S_inv[8] = S[4] * S[9] * S[15] - S[4] * S[11] * S[13] - S[8] * S[5] * S[15] + S[8] * S[7] * S[13] + S[12] * S[5] * S[11] - S[12] * S[7] * S[9]; + + S_inv[12] = -S[4] * S[9] * S[14] + S[4] * S[10] * S[13] + S[8] * S[5] * S[14] - S[8] * S[6] * S[13] - S[12] * S[5] * S[10] + S[12] * S[6] * S[9]; + + S_inv[1] = -S[1] * S[10] * S[15] + S[1] * S[11] * S[14] + S[9] * S[2] * S[15] - S[9] * S[3] * S[14] - S[13] * S[2] * S[11] + S[13] * S[3] * S[10]; + + S_inv[5] = S[0] * S[10] * S[15] - S[0] * S[11] * S[14] - S[8] * S[2] * S[15] + S[8] * S[3] * S[14] + S[12] * S[2] * S[11] - S[12] * S[3] * S[10]; + + S_inv[9] = -S[0] * S[9] * S[15] + S[0] * S[11] * S[13] + S[8] * S[1] * S[15] - S[8] * S[3] * S[13] - S[12] * S[1] * S[11] + S[12] * S[3] * S[9]; + + S_inv[13] = S[0] * S[9] * S[14] - S[0] * S[10] * S[13] - S[8] * S[1] * S[14] + S[8] * S[2] * S[13] + S[12] * S[1] * S[10] - S[12] * S[2] * S[9]; + + S_inv[2] = S[1] * S[6] * S[15] - S[1] * S[7] * S[14] - S[5] * S[2] * S[15] + S[5] * S[3] * S[14] + S[13] * S[2] * S[7] - S[13] * S[3] * S[6]; + + S_inv[6] = -S[0] * S[6] * S[15] + S[0] * S[7] * S[14] + S[4] * S[2] * S[15] - S[4] * S[3] * S[14] - S[12] * S[2] * S[7] + S[12] * S[3] * S[6]; + + S_inv[10] = S[0] * S[5] * S[15] - S[0] * S[7] * S[13] - S[4] * S[1] * S[15] + S[4] * S[3] * S[13] + S[12] * S[1] * S[7] - S[12] * S[3] * S[5]; + + S_inv[14] = -S[0] * S[5] * S[14] + S[0] * S[6] * S[13] + S[4] * S[1] * S[14] - S[4] * S[2] * S[13] - S[12] * S[1] * S[6] + S[12] * S[2] * S[5]; + + S_inv[3] = -S[1] * S[6] * S[11] + S[1] * S[7] * S[10] + S[5] * S[2] * S[11] - S[5] * S[3] * S[10] - S[9] * 
S[2] * S[7] + S[9] * S[3] * S[6]; + + S_inv[7] = S[0] * S[6] * S[11] - S[0] * S[7] * S[10] - S[4] * S[2] * S[11] + S[4] * S[3] * S[10] + S[8] * S[2] * S[7] - S[8] * S[3] * S[6]; + + S_inv[11] = -S[0] * S[5] * S[11] + S[0] * S[7] * S[9] + S[4] * S[1] * S[11] - S[4] * S[3] * S[9] - S[8] * S[1] * S[7] + S[8] * S[3] * S[5]; + + S_inv[15] = S[0] * S[5] * S[10] - S[0] * S[6] * S[9] - S[4] * S[1] * S[10] + S[4] * S[2] * S[9] + S[8] * S[1] * S[6] - S[8] * S[2] * S[5]; + + det = 1 / (S[0] * S_inv[0] + S[1] * S_inv[4] + S[2] * S_inv[8] + S[3] * S_inv[12]); + + for (CeedInt i = 0; i < 16; i++) S_inv[i] *= det; } // Set initial values @@ -252,10 +166,9 @@ int main(int argc, char **argv) { CeedLobattoQuadrature(P, nodes, NULL); CeedScalar *u; CeedVectorGetArray(U, CEED_MEM_HOST, &u); - for (CeedInt i=0; i 2e-3) + for (CeedInt i = 0; i < P; i++) { + for (CeedInt j = 0; j < P; j++) { + if (fabs(u[i * P + j] - w[i * P + j]) > 2e-3) { // LCOV_EXCL_START - printf("[%" CeedInt_FMT ", %" CeedInt_FMT "] Error in inverse: %e != %e\n", - i, j, w[i*P+j], u[i*P+j]); - // LCOV_EXCL_STOP + printf("[%" CeedInt_FMT ", %" CeedInt_FMT "] Error in inverse: %e != %e\n", i, j, w[i * P + j], u[i * P + j]); + // LCOV_EXCL_STOP + } + } + } CeedVectorRestoreArrayRead(U, &u); CeedVectorRestoreArrayRead(W, &w); } diff --git a/tests/t541-operator.h b/tests/t541-operator.h index 303febe433..9a7a3bb0a9 100644 --- a/tests/t541-operator.h +++ b/tests/t541-operator.h @@ -7,9 +7,7 @@ #include -CEED_QFUNCTION(setup_diff)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) { +CEED_QFUNCTION(setup_diff)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // in[0] is Jacobians with shape [2, nc=2, Q] // in[1] is quadrature weights, size (Q) const CeedScalar *J = in[0], *w = in[1]; @@ -18,25 +16,23 @@ CEED_QFUNCTION(setup_diff)(void *ctx, const CeedInt Q, CeedScalar *q_data = out[0]; // Quadrature point loop - CeedPragmaSIMD - for (CeedInt i=0; i 
-#include #include +#include #include "t502-operator.h" int main(int argc, char **argv) { - Ceed ceed; - CeedElemRestriction elem_restr_x, elem_restr_qd_i, - elem_restr_u_c, elem_restr_u_f; - CeedBasis basis_x, basis_u_c, basis_u_f; - CeedQFunction qf_setup, qf_mass; - CeedOperator op_setup, op_mass_c, op_mass_f, - op_prolong, op_restrict; - CeedVector q_data, X, U_c, U_f, - V_c, V_f, p_mult_f; - const CeedScalar *hv; - CeedInt num_elem = 15, P_c = 3, P_f = 5, Q = 8, num_comp = 2; - CeedInt num_dofs_x = num_elem+1, num_dofs_u_c = num_elem*(P_c-1)+1, - num_dofs_u_f = num_elem*(P_f-1)+1; - CeedInt ind_u_c[num_elem*P_c], ind_u_f[num_elem*P_f], - ind_x[num_elem*2]; - CeedScalar x[num_dofs_x]; - CeedScalar sum; + Ceed ceed; + CeedElemRestriction elem_restr_x, elem_restr_qd_i, elem_restr_u_c, elem_restr_u_f; + CeedBasis basis_x, basis_u_c, basis_u_f; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass_c, op_mass_f, op_prolong, op_restrict; + CeedVector q_data, X, U_c, U_f, V_c, V_f, p_mult_f; + const CeedScalar *hv; + CeedInt num_elem = 15, P_c = 3, P_f = 5, Q = 8, num_comp = 2; + CeedInt num_dofs_x = num_elem + 1, num_dofs_u_c = num_elem * (P_c - 1) + 1, num_dofs_u_f = num_elem * (P_f - 1) + 1; + CeedInt ind_u_c[num_elem * P_c], ind_u_f[num_elem * P_f], ind_x[num_elem * 2]; + CeedScalar x[num_dofs_x]; + CeedScalar sum; CeedInit(argv[1], &ceed); - for (CeedInt i=0; i1000.*CEED_EPSILON) - // LCOV_EXCL_START - printf("Computed Area Coarse Grid: %f != True Area: 2.0\n", sum); - // LCOV_EXCL_STOP + if (fabs(sum - 2.) > 1000. 
* CEED_EPSILON) printf("Computed Area Coarse Grid: %f != True Area: 2.0\n", sum); CeedVectorRestoreArrayRead(V_c, &hv); // Prolong coarse u - CeedVectorCreate(ceed, num_comp*num_dofs_u_f, &U_f); + CeedVectorCreate(ceed, num_comp * num_dofs_u_f, &U_f); CeedOperatorApply(op_prolong, U_c, U_f, CEED_REQUEST_IMMEDIATE); // Fine problem - CeedVectorCreate(ceed, num_comp*num_dofs_u_f, &V_f); + CeedVectorCreate(ceed, num_comp * num_dofs_u_f, &V_f); CeedOperatorApply(op_mass_f, U_f, V_f, CEED_REQUEST_IMMEDIATE); // Check output CeedVectorGetArrayRead(V_f, CEED_MEM_HOST, &hv); sum = 0.; - for (CeedInt i=0; i1000.*CEED_EPSILON) - // LCOV_EXCL_START - printf("Computed Area Fine Grid: %f != True Area: 2.0\n", sum); - // LCOV_EXCL_STOP + if (fabs(sum - 2.) > 1000. * CEED_EPSILON) printf("Computed Area Fine Grid: %f != True Area: 2.0\n", sum); CeedVectorRestoreArrayRead(V_f, &hv); // Restrict state to coarse grid @@ -153,13 +127,10 @@ int main(int argc, char **argv) { // Check output CeedVectorGetArrayRead(V_c, CEED_MEM_HOST, &hv); sum = 0.; - for (CeedInt i=0; i1000.*CEED_EPSILON) - // LCOV_EXCL_START - printf("Computed Area Coarse Grid: %f != True Area: 2.0\n", sum); - // LCOV_EXCL_STOP + if (fabs(sum - 2.) > 1000. 
* CEED_EPSILON) printf("Computed Area Coarse Grid: %f != True Area: 2.0\n", sum); CeedVectorRestoreArrayRead(V_c, &hv); // Cleanup diff --git a/tests/t551-operator.c b/tests/t551-operator.c index dbffdcec6a..79be40fbee 100644 --- a/tests/t551-operator.c +++ b/tests/t551-operator.c @@ -2,90 +2,76 @@ /// Test creation, action, and destruction for mass matrix operator with multigrid level, non-tensor basis and interpolation basis generation /// \test Test creation, action, and destruction for mass matrix operator with multigrid level, non-tensor basis and interpolation basis generation #include -#include #include +#include #include "t502-operator.h" int main(int argc, char **argv) { - Ceed ceed; - CeedElemRestriction elem_restr_x, elem_restr_qd_i, - elem_restr_u_c, elem_restr_u_f; - CeedBasis basis_x, basis_temp, basis_c, basis_f; - CeedQFunction qf_setup, qf_mass; - CeedOperator op_setup, op_mass_c, op_mass_f, - op_prolong, op_restrict; - CeedVector q_data, X, U_c, U_f, - V_c, V_f, p_mult_fine; - const CeedScalar *hv; - CeedInt num_elem = 15, P_c = 3, P_f = 5, Q = 8, num_comp = 2; - CeedInt num_dofs_x = num_elem+1, num_dofs_u_c = num_elem*(P_c-1)+1, - num_dofs_u_f = num_elem*(P_f-1)+1; - CeedInt ind_u_c[num_elem*P_c], ind_u_f[num_elem*P_f], - ind_x[num_elem*2]; - CeedScalar x[num_dofs_x]; - CeedScalar sum; + Ceed ceed; + CeedElemRestriction elem_restr_x, elem_restr_qd_i, elem_restr_u_c, elem_restr_u_f; + CeedBasis basis_x, basis_temp, basis_c, basis_f; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass_c, op_mass_f, op_prolong, op_restrict; + CeedVector q_data, X, U_c, U_f, V_c, V_f, p_mult_fine; + const CeedScalar *hv; + CeedInt num_elem = 15, P_c = 3, P_f = 5, Q = 8, num_comp = 2; + CeedInt num_dofs_x = num_elem + 1, num_dofs_u_c = num_elem * (P_c - 1) + 1, num_dofs_u_f = num_elem * (P_f - 1) + 1; + CeedInt ind_u_c[num_elem * P_c], ind_u_f[num_elem * P_f], ind_x[num_elem * 2]; + CeedScalar x[num_dofs_x]; + CeedScalar sum; CeedInit(argv[1], &ceed); 
- for (CeedInt i=0; i1000.*CEED_EPSILON) - // LCOV_EXCL_START - printf("Computed Area Coarse Grid: %f != True Area: 2.0\n", sum); - // LCOV_EXCL_STOP + if (fabs(sum - 2.) > 1000. * CEED_EPSILON) printf("Computed Area Coarse Grid: %f != True Area: 2.0\n", sum); CeedVectorRestoreArrayRead(V_c, &hv); // Prolong coarse u - CeedVectorCreate(ceed, num_comp*num_dofs_u_f, &U_f); + CeedVectorCreate(ceed, num_comp * num_dofs_u_f, &U_f); CeedOperatorApply(op_prolong, U_c, U_f, CEED_REQUEST_IMMEDIATE); // Fine problem - CeedVectorCreate(ceed, num_comp*num_dofs_u_f, &V_f); + CeedVectorCreate(ceed, num_comp * num_dofs_u_f, &V_f); CeedOperatorApply(op_mass_f, U_f, V_f, CEED_REQUEST_IMMEDIATE); // Check output CeedVectorGetArrayRead(V_f, CEED_MEM_HOST, &hv); sum = 0.; - for (CeedInt i=0; i1000.*CEED_EPSILON) - // LCOV_EXCL_START - printf("Computed Area Fine Grid: %f != True Area: 2.0\n", sum); - // LCOV_EXCL_STOP + if (fabs(sum - 2.) > 1000. * CEED_EPSILON) printf("Computed Area Fine Grid: %f != True Area: 2.0\n", sum); CeedVectorRestoreArrayRead(V_f, &hv); // Restrict state to coarse grid @@ -168,13 +140,10 @@ int main(int argc, char **argv) { // Check output CeedVectorGetArrayRead(V_c, CEED_MEM_HOST, &hv); sum = 0.; - for (CeedInt i=0; i1000.*CEED_EPSILON) - // LCOV_EXCL_START - printf("Computed Area Coarse Grid: %f != True Area: 2.0\n", sum); - // LCOV_EXCL_STOP + if (fabs(sum - 2.) > 1000. 
* CEED_EPSILON) printf("Computed Area Coarse Grid: %f != True Area: 2.0\n", sum); CeedVectorRestoreArrayRead(V_c, &hv); // Cleanup diff --git a/tests/t552-operator.c b/tests/t552-operator.c index 6ccc19abbb..74050122fa 100644 --- a/tests/t552-operator.c +++ b/tests/t552-operator.c @@ -2,72 +2,62 @@ /// Test creation, action, and destruction for mass matrix operator with multigrid level, tensor basis /// \test Test creation, action, and destruction for mass matrix operator with multigrid level, tensor basis #include -#include #include +#include #include "t502-operator.h" int main(int argc, char **argv) { - Ceed ceed; - CeedElemRestriction elem_restr_x, elem_restr_qd, - elem_restr_u_c, elem_restr_c_f; - CeedBasis basis_x, basis_u; - CeedQFunction qf_setup, qf_mass; - CeedOperator op_setup, op_mass_c, op_mass_f, - op_prolong, op_restrict; - CeedVector q_data, X, U_c, U_f, V_c, V_f, p_mult_f; - const CeedScalar *hv; - CeedInt num_elem = 15, P_c = 3, P_f = 5, Q = 8, num_comp = 2; - CeedInt num_dofs_x = num_elem+1, num_dofs_u_c = num_elem*(P_c-1)+1, - num_dofs_u_f = num_elem*(P_f-1)+1; - CeedInt ind_u_c[num_elem*P_c], ind_u_f[num_elem*P_f], - ind_x[num_elem*2]; - CeedScalar x[num_dofs_x]; - CeedScalar sum; + Ceed ceed; + CeedElemRestriction elem_restr_x, elem_restr_qd, elem_restr_u_c, elem_restr_c_f; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass_c, op_mass_f, op_prolong, op_restrict; + CeedVector q_data, X, U_c, U_f, V_c, V_f, p_mult_f; + const CeedScalar *hv; + CeedInt num_elem = 15, P_c = 3, P_f = 5, Q = 8, num_comp = 2; + CeedInt num_dofs_x = num_elem + 1, num_dofs_u_c = num_elem * (P_c - 1) + 1, num_dofs_u_f = num_elem * (P_f - 1) + 1; + CeedInt ind_u_c[num_elem * P_c], ind_u_f[num_elem * P_f], ind_x[num_elem * 2]; + CeedScalar x[num_dofs_x]; + CeedScalar sum; CeedInit(argv[1], &ceed); - for (CeedInt i=0; i1000.*CEED_EPSILON) - // LCOV_EXCL_START - printf("Computed Area Coarse Grid: %f != True Area: 2.0\n", sum); - 
// LCOV_EXCL_STOP + if (fabs(sum - 2.) > 1000. * CEED_EPSILON) printf("Computed Area Coarse Grid: %f != True Area: 2.0\n", sum); CeedVectorRestoreArrayRead(V_c, &hv); // Prolong coarse u - CeedVectorCreate(ceed, num_comp*num_dofs_u_f, &U_f); + CeedVectorCreate(ceed, num_comp * num_dofs_u_f, &U_f); CeedOperatorApply(op_prolong, U_c, U_f, CEED_REQUEST_IMMEDIATE); // Fine problem - CeedVectorCreate(ceed, num_comp*num_dofs_u_f, &V_f); + CeedVectorCreate(ceed, num_comp * num_dofs_u_f, &V_f); CeedOperatorApply(op_mass_f, U_f, V_f, CEED_REQUEST_IMMEDIATE); // Check output CeedVectorGetArrayRead(V_f, CEED_MEM_HOST, &hv); sum = 0.; - for (CeedInt i=0; i1000.*CEED_EPSILON) - // LCOV_EXCL_START - printf("Computed Area Fine Grid: %f != True Area: 2.0\n", sum); - // LCOV_EXCL_STOP + if (fabs(sum - 2.) > 1000. * CEED_EPSILON) printf("Computed Area Fine Grid: %f != True Area: 2.0\n", sum); CeedVectorRestoreArrayRead(V_f, &hv); // Restrict state to coarse grid @@ -158,13 +131,10 @@ int main(int argc, char **argv) { // Check output CeedVectorGetArrayRead(V_c, CEED_MEM_HOST, &hv); sum = 0.; - for (CeedInt i=0; i1000.*CEED_EPSILON) - // LCOV_EXCL_START - printf("Computed Area Coarse Grid: %f != True Area: 2.0\n", sum); - // LCOV_EXCL_STOP + if (fabs(sum - 2.) > 1000. 
* CEED_EPSILON) printf("Computed Area Coarse Grid: %f != True Area: 2.0\n", sum); CeedVectorRestoreArrayRead(V_c, &hv); // Cleanup diff --git a/tests/t553-operator.c b/tests/t553-operator.c index 304e29a43f..ab5c93862f 100644 --- a/tests/t553-operator.c +++ b/tests/t553-operator.c @@ -2,61 +2,49 @@ /// Test creation, action, and destruction for mass matrix operator with multigrid level, nontensor basis /// \test Test creation, action, and destruction for mass matrix operator with multigrid level, nontensor basis #include -#include #include +#include int main(int argc, char **argv) { - Ceed ceed; - CeedElemRestriction elem_restr_x, elem_restr_qd, - elem_restr_u_c, elem_restr_u_f; - CeedBasis basis_x, basis_u; - CeedQFunction qf_setup, qf_mass; - CeedOperator op_setup, op_mass_c, op_mass_f, - op_prolong, op_restrict; - CeedVector q_data, X, U_c, U_f, - V_c, V_f, p_mult_f; - const CeedScalar *hv; - CeedInt num_elem = 15, P_c = 3, P_f = 5, Q = 8; - CeedInt num_dofs_x = num_elem+1, num_dofs_u_c = num_elem*(P_c-1)+1, - num_dofs_u_f = num_elem*(P_f-1)+1; - CeedInt ind_u_c[num_elem*P_c], ind_u_f[num_elem*P_f], - ind_x[num_elem*2]; - CeedScalar x[num_dofs_x]; - CeedScalar sum; + Ceed ceed; + CeedElemRestriction elem_restr_x, elem_restr_qd, elem_restr_u_c, elem_restr_u_f; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass_c, op_mass_f, op_prolong, op_restrict; + CeedVector q_data, X, U_c, U_f, V_c, V_f, p_mult_f; + const CeedScalar *hv; + CeedInt num_elem = 15, P_c = 3, P_f = 5, Q = 8; + CeedInt num_dofs_x = num_elem + 1, num_dofs_u_c = num_elem * (P_c - 1) + 1, num_dofs_u_f = num_elem * (P_f - 1) + 1; + CeedInt ind_u_c[num_elem * P_c], ind_u_f[num_elem * P_f], ind_x[num_elem * 2]; + CeedScalar x[num_dofs_x]; + CeedScalar sum; CeedInit(argv[1], &ceed); - for (CeedInt i=0; i1000.*CEED_EPSILON) - // LCOV_EXCL_START - printf("Computed Area Coarse Grid: %f != True Area: 1.0\n", sum); - // LCOV_EXCL_STOP + if (fabs(sum - 1.) > 1000. 
* CEED_EPSILON) printf("Computed Area Coarse Grid: %f != True Area: 1.0\n", sum); CeedVectorRestoreArrayRead(V_c, &hv); // Prolong coarse u @@ -133,13 +108,10 @@ int main(int argc, char **argv) { // Check output CeedVectorGetArrayRead(V_f, CEED_MEM_HOST, &hv); sum = 0.; - for (CeedInt i=0; i1000.*CEED_EPSILON) - // LCOV_EXCL_START - printf("Computed Area Fine Grid: %f != True Area: 1.0\n", sum); - // LCOV_EXCL_STOP + if (fabs(sum - 1.) > 1000. * CEED_EPSILON) printf("Computed Area Fine Grid: %f != True Area: 1.0\n", sum); CeedVectorRestoreArrayRead(V_f, &hv); // Restrict state to coarse grid @@ -148,13 +120,10 @@ int main(int argc, char **argv) { // Check output CeedVectorGetArrayRead(V_c, CEED_MEM_HOST, &hv); sum = 0.; - for (CeedInt i=0; i1000.*CEED_EPSILON) - // LCOV_EXCL_START - printf("Computed Area Coarse Grid: %f != True Area: 1.0\n", sum); - // LCOV_EXCL_STOP + if (fabs(sum - 1.) > 1000. * CEED_EPSILON) printf("Computed Area Coarse Grid: %f != True Area: 1.0\n", sum); CeedVectorRestoreArrayRead(V_c, &hv); // Cleanup diff --git a/tests/t560-operator.c b/tests/t560-operator.c index 6e53660443..a454c9b7e5 100644 --- a/tests/t560-operator.c +++ b/tests/t560-operator.c @@ -2,62 +2,60 @@ /// Test full assembly of mass matrix operator /// \test Test full assembly of mass matrix operator #include -#include #include +#include + #include "t510-operator.h" int main(int argc, char **argv) { - Ceed ceed; - CeedElemRestriction elem_restr_x, elem_restr_u, - elem_restr_qd_i; - CeedBasis basis_x, basis_u; - CeedQFunction qf_setup, qf_mass; - CeedOperator op_setup, op_mass; - CeedVector q_data, X, U, V; - CeedInt P = 3, Q = 4, dim = 2; - CeedInt n_x = 3, n_y = 2; - CeedInt num_elem = n_x * n_y; - CeedInt num_dofs = (n_x*2+1)*(n_y*2+1), num_qpts = num_elem*Q*Q; - CeedInt ind_x[num_elem*P*P]; - CeedScalar assembled[num_dofs*num_dofs]; - CeedScalar x[dim*num_dofs], assembled_true[num_dofs*num_dofs]; - CeedScalar *u; - const CeedScalar *v; + Ceed ceed; + CeedElemRestriction 
elem_restr_x, elem_restr_u, elem_restr_qd_i; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass; + CeedVector q_data, X, U, V; + CeedInt P = 3, Q = 4, dim = 2; + CeedInt n_x = 3, n_y = 2; + CeedInt num_elem = n_x * n_y; + CeedInt num_dofs = (n_x * 2 + 1) * (n_y * 2 + 1), num_qpts = num_elem * Q * Q; + CeedInt ind_x[num_elem * P * P]; + CeedScalar assembled[num_dofs * num_dofs]; + CeedScalar x[dim * num_dofs], assembled_true[num_dofs * num_dofs]; + CeedScalar *u; + const CeedScalar *v; CeedInit(argv[1], &ceed); // DoF Coordinates - for (CeedInt i=0; i - 100.*CEED_EPSILON) + for (int i = 0; i < num_dofs; i++) { + for (int j = 0; j < num_dofs; j++) { + if (fabs(assembled[j * num_dofs + i] - assembled_true[j * num_dofs + i]) > 100. * CEED_EPSILON) { // LCOV_EXCL_START - printf("[%" CeedInt_FMT ", %" CeedInt_FMT "] Error in assembly: %f != %f\n", - i, j, assembled[j*num_dofs+i], assembled_true[j*num_dofs+i]); - // LCOV_EXCL_STOP + printf("[%" CeedInt_FMT ", %" CeedInt_FMT "] Error in assembly: %f != %f\n", i, j, assembled[j * num_dofs + i], + assembled_true[j * num_dofs + i]); + // LCOV_EXCL_STOP + } + } + } // Cleanup free(rows); diff --git a/tests/t561-operator.c b/tests/t561-operator.c index 56c5454db8..80aa3326be 100644 --- a/tests/t561-operator.c +++ b/tests/t561-operator.c @@ -2,63 +2,60 @@ /// Test full assembly of Poisson operator /// \test Test full assembly of Poisson operator #include -#include #include +#include + #include "t534-operator.h" int main(int argc, char **argv) { - Ceed ceed; - CeedElemRestriction elem_restr_x, elem_restr_u, - elem_restr_qd_i; - CeedBasis basis_x, basis_u; - CeedQFunction qf_setup, qf_diff; - CeedOperator op_setup, op_diff; - CeedVector q_data, X, U, V; - CeedInt P = 3, Q = 4, dim = 2; - CeedInt n_x = 3, n_y = 2; - CeedInt num_elem = n_x * n_y; - CeedInt num_dofs = (n_x*2+1)*(n_y*2+1), num_qpts = num_elem*Q*Q; - CeedInt ind_x[num_elem*P*P]; - CeedScalar assembled[num_dofs*num_dofs]; - 
CeedScalar x[dim*num_dofs], assembled_true[num_dofs*num_dofs]; - CeedScalar *u; - const CeedScalar *v; + Ceed ceed; + CeedElemRestriction elem_restr_x, elem_restr_u, elem_restr_qd_i; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_diff; + CeedOperator op_setup, op_diff; + CeedVector q_data, X, U, V; + CeedInt P = 3, Q = 4, dim = 2; + CeedInt n_x = 3, n_y = 2; + CeedInt num_elem = n_x * n_y; + CeedInt num_dofs = (n_x * 2 + 1) * (n_y * 2 + 1), num_qpts = num_elem * Q * Q; + CeedInt ind_x[num_elem * P * P]; + CeedScalar assembled[num_dofs * num_dofs]; + CeedScalar x[dim * num_dofs], assembled_true[num_dofs * num_dofs]; + CeedScalar *u; + const CeedScalar *v; CeedInit(argv[1], &ceed); // DoF Coordinates - for (CeedInt i=0; i - 100.*CEED_EPSILON) + for (int i = 0; i < num_dofs; i++) { + for (int j = 0; j < num_dofs; j++) { + if (fabs(assembled[j * num_dofs + i] - assembled_true[j * num_dofs + i]) > 100. * CEED_EPSILON) { // LCOV_EXCL_START - printf("[%" CeedInt_FMT ", %" CeedInt_FMT "] Error in assembly: %f != %f\n", - i, j, assembled[j*num_dofs+i], assembled_true[j*num_dofs+i]); - // LCOV_EXCL_STOP + printf("[%" CeedInt_FMT ", %" CeedInt_FMT "] Error in assembly: %f != %f\n", i, j, assembled[j * num_dofs + i], + assembled_true[j * num_dofs + i]); + // LCOV_EXCL_STOP + } + } + } // Cleanup free(rows); diff --git a/tests/t562-operator.c b/tests/t562-operator.c index 0ef7c96c6a..299bc77e8c 100644 --- a/tests/t562-operator.c +++ b/tests/t562-operator.c @@ -2,109 +2,93 @@ /// Test full assembly of mass and Poisson operator (see t535) /// \test Test full assembly of mass and Poisson operator #include -#include #include +#include + #include "t535-operator.h" int main(int argc, char **argv) { - Ceed ceed; - CeedElemRestriction elem_restr_x, elem_restr_u, - elem_restr_qd_mass_i, elem_restr_qd_diff_i; - CeedBasis basis_x, basis_u; - CeedQFunction qf_setup_mass, qf_setup_diff, qf_apply; - CeedOperator op_setup_mass, op_setup_diff, op_apply; - CeedVector q_data_mass, 
q_data_diff, X, U, V; - CeedInt P = 3, Q = 4, dim = 2; - CeedInt n_x = 3, n_y = 2; - CeedInt num_elem = n_x * n_y; - CeedInt num_dofs = (n_x*2+1)*(n_y*2+1), num_qpts = num_elem*Q*Q; - CeedInt ind_x[num_elem*P*P]; - CeedScalar assembled[num_dofs*num_dofs]; - CeedScalar x[dim*num_dofs], assembled_true[num_dofs*num_dofs]; - CeedScalar *u; - const CeedScalar *v; + Ceed ceed; + CeedElemRestriction elem_restr_x, elem_restr_u, elem_restr_qd_mass_i, elem_restr_qd_diff_i; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup_mass, qf_setup_diff, qf_apply; + CeedOperator op_setup_mass, op_setup_diff, op_apply; + CeedVector q_data_mass, q_data_diff, X, U, V; + CeedInt P = 3, Q = 4, dim = 2; + CeedInt n_x = 3, n_y = 2; + CeedInt num_elem = n_x * n_y; + CeedInt num_dofs = (n_x * 2 + 1) * (n_y * 2 + 1), num_qpts = num_elem * Q * Q; + CeedInt ind_x[num_elem * P * P]; + CeedScalar assembled[num_dofs * num_dofs]; + CeedScalar x[dim * num_dofs], assembled_true[num_dofs * num_dofs]; + CeedScalar *u; + const CeedScalar *v; CeedInit(argv[1], &ceed); // DoF Coordinates - for (CeedInt i=0; i - 100.*CEED_EPSILON) + for (int i = 0; i < num_dofs; i++) { + for (int j = 0; j < num_dofs; j++) { + if (fabs(assembled[j * num_dofs + i] - assembled_true[j * num_dofs + i]) > 100. 
* CEED_EPSILON) { // LCOV_EXCL_START - printf("[%" CeedInt_FMT ", %" CeedInt_FMT "] Error in assembly: %f != %f\n", - i, j, assembled[j*num_dofs+i], assembled_true[j*num_dofs+i]); - // LCOV_EXCL_STOP + printf("[%" CeedInt_FMT ", %" CeedInt_FMT "] Error in assembly: %f != %f\n", i, j, assembled[j * num_dofs + i], + assembled_true[j * num_dofs + i]); + // LCOV_EXCL_STOP + } + } + } // Cleanup free(rows); diff --git a/tests/t563-operator.c b/tests/t563-operator.c index 2501dcabad..e271135c30 100644 --- a/tests/t563-operator.c +++ b/tests/t563-operator.c @@ -2,124 +2,106 @@ /// Test full assembly of mass and Poisson operator (see t536) /// \test Test full assembly of mass and Poisson operator #include -#include #include +#include + #include "t320-basis.h" #include "t535-operator.h" int main(int argc, char **argv) { - Ceed ceed; - CeedElemRestriction elem_restr_x, elem_restr_u, - elem_restr_qd_mass_i, elem_restr_qd_diff_i; - CeedBasis basis_x, basis_u; - CeedQFunction qf_setup_mass, qf_setup_diff, qf_apply; - CeedOperator op_setup_mass, op_setup_diff, op_apply; - CeedVector q_data_mass, q_data_diff, X, U, V; - CeedInt num_elem = 12, dim = 2, P = 6, Q = 4; - CeedInt n_x = 3, n_y = 2; - CeedInt row, col, offset; - CeedInt num_dofs = (n_x*2+1)*(n_y*2+1), num_qpts = num_elem*Q; - CeedInt ind_x[num_elem*P*P]; - CeedScalar assembled[num_dofs*num_dofs]; - CeedScalar x[dim*num_dofs], assembled_true[num_dofs*num_dofs]; - CeedScalar q_ref[dim*Q], q_weight[Q]; - CeedScalar interp[P*Q], grad[dim*P*Q]; - CeedScalar *u; - const CeedScalar *v; + Ceed ceed; + CeedElemRestriction elem_restr_x, elem_restr_u, elem_restr_qd_mass_i, elem_restr_qd_diff_i; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup_mass, qf_setup_diff, qf_apply; + CeedOperator op_setup_mass, op_setup_diff, op_apply; + CeedVector q_data_mass, q_data_diff, X, U, V; + CeedInt num_elem = 12, dim = 2, P = 6, Q = 4; + CeedInt n_x = 3, n_y = 2; + CeedInt row, col, offset; + CeedInt num_dofs = (n_x * 2 + 1) * (n_y * 2 + 
1), num_qpts = num_elem * Q; + CeedInt ind_x[num_elem * P * P]; + CeedScalar assembled[num_dofs * num_dofs]; + CeedScalar x[dim * num_dofs], assembled_true[num_dofs * num_dofs]; + CeedScalar q_ref[dim * Q], q_weight[Q]; + CeedScalar interp[P * Q], grad[dim * P * Q]; + CeedScalar *u; + const CeedScalar *v; CeedInit(argv[1], &ceed); // DoF Coordinates - for (CeedInt i=0; i - 100.*CEED_EPSILON) + for (int i = 0; i < num_dofs; i++) { + for (int j = 0; j < num_dofs; j++) { + if (fabs(assembled[j * num_dofs + i] - assembled_true[j * num_dofs + i]) > 100. * CEED_EPSILON) { // LCOV_EXCL_START - printf("[%" CeedInt_FMT ", %" CeedInt_FMT "] Error in assembly: %f != %f\n", - i, j, assembled[j*num_dofs+i], assembled_true[j*num_dofs+i]); - // LCOV_EXCL_STOP + printf("[%" CeedInt_FMT ", %" CeedInt_FMT "] Error in assembly: %f != %f\n", i, j, assembled[j * num_dofs + i], + assembled_true[j * num_dofs + i]); + // LCOV_EXCL_STOP + } + } + } // Cleanup free(rows); diff --git a/tests/t564-operator.c b/tests/t564-operator.c index 586add1400..db296ed726 100644 --- a/tests/t564-operator.c +++ b/tests/t564-operator.c @@ -2,72 +2,68 @@ /// Test assembly of mass matrix operator (multi-component) see t537 /// \test Test assembly of mass matrix operator (multi-component) #include -#include #include +#include + #include "t537-operator.h" int main(int argc, char **argv) { - Ceed ceed; - CeedElemRestriction elem_restr_x, elem_restr_u, - elem_restr_qd_i; - CeedBasis basis_x, basis_u; - CeedQFunction qf_setup, qf_mass; - CeedOperator op_setup, op_mass; - CeedVector q_data, X, U, V; - CeedInt P = 3, Q = 4, dim = 2, num_comp = 2; - CeedInt n_x = 1, n_y = 1; - CeedInt num_elem = n_x * n_y; - CeedInt num_dofs = (n_x*2+1)*(n_y*2+1), num_qpts = num_elem*Q*Q; - CeedInt ind_x[num_elem*P*P]; - CeedScalar assembled[num_comp*num_comp*num_dofs*num_dofs]; - CeedScalar x[dim*num_dofs], assembled_true[num_comp*num_comp*num_dofs*num_dofs]; - CeedScalar *u; - const CeedScalar *v; + Ceed ceed; + 
CeedElemRestriction elem_restr_x, elem_restr_u, elem_restr_qd_i; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass; + CeedVector q_data, X, U, V; + CeedInt P = 3, Q = 4, dim = 2, num_comp = 2; + CeedInt n_x = 1, n_y = 1; + CeedInt num_elem = n_x * n_y; + CeedInt num_dofs = (n_x * 2 + 1) * (n_y * 2 + 1), num_qpts = num_elem * Q * Q; + CeedInt ind_x[num_elem * P * P]; + CeedScalar assembled[num_comp * num_comp * num_dofs * num_dofs]; + CeedScalar x[dim * num_dofs], assembled_true[num_comp * num_comp * num_dofs * num_dofs]; + CeedScalar *u; + const CeedScalar *v; CeedInit(argv[1], &ceed); // DoF Coordinates - for (CeedInt i=0; i 0) - u[indOld] = 0.0; + u[ind] = 1.0; + if (ind > 0) u[indOld] = 0.0; indOld = ind; CeedVectorRestoreArray(U, &u); @@ -132,22 +120,21 @@ int main(int argc, char **argv) { CeedOperatorApply(op_mass, U, V, CEED_REQUEST_IMMEDIATE); CeedVectorGetArrayRead(V, CEED_MEM_HOST, &v); - for (CeedInt k=0; k - 100.*CEED_EPSILON) + for (CeedInt i = 0; i < num_comp * num_dofs; i++) { + for (CeedInt j = 0; j < num_comp * num_dofs; j++) { + if (fabs(assembled[j * num_dofs * num_comp + i] - assembled_true[j * num_dofs * num_comp + i]) > 100. 
* CEED_EPSILON) { // LCOV_EXCL_START - printf("[%" CeedInt_FMT ", %" CeedInt_FMT "] Error in assembly: %f != %f\n", - i, j, assembled[j*num_dofs*num_comp+i], assembled_true[j*num_dofs*num_comp+i]); - // LCOV_EXCL_STOP + printf("[%" CeedInt_FMT ", %" CeedInt_FMT "] Error in assembly: %f != %f\n", i, j, assembled[j * num_dofs * num_comp + i], + assembled_true[j * num_dofs * num_comp + i]); + // LCOV_EXCL_STOP + } + } + } // Cleanup free(rows); diff --git a/tests/t565-operator.c b/tests/t565-operator.c index b10f9e996b..9aa68cc5a7 100644 --- a/tests/t565-operator.c +++ b/tests/t565-operator.c @@ -2,66 +2,62 @@ /// Test full assembly of composite operator (see t538) /// \test Test full assembly of composite operator #include -#include #include +#include int main(int argc, char **argv) { - Ceed ceed; - CeedElemRestriction elem_restr_x, elem_restr_u, - elem_restr_qd_mass_i, elem_restr_qd_diff_i; - CeedBasis basis_x, basis_u; - CeedQFunction qf_setup_mass, qf_mass, qf_setup_diff, qf_diff; - CeedOperator op_setup_mass, op_mass, op_setup_diff, op_diff, op_apply; - CeedVector q_data_mass, q_data_diff, X, U, V; - CeedInt P = 3, Q = 4, dim = 2; - CeedInt n_x = 3, n_y = 2; - CeedInt num_elem = n_x * n_y; - CeedInt num_dofs = (n_x*2+1)*(n_y*2+1), num_qpts = num_elem*Q*Q; - CeedInt ind_x[num_elem*P*P]; - CeedScalar assembled[num_dofs*num_dofs]; - CeedScalar x[dim*num_dofs], assembled_true[num_dofs*num_dofs]; - CeedScalar *u; - const CeedScalar *v; + Ceed ceed; + CeedElemRestriction elem_restr_x, elem_restr_u, elem_restr_qd_mass_i, elem_restr_qd_diff_i; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup_mass, qf_mass, qf_setup_diff, qf_diff; + CeedOperator op_setup_mass, op_mass, op_setup_diff, op_diff, op_apply; + CeedVector q_data_mass, q_data_diff, X, U, V; + CeedInt P = 3, Q = 4, dim = 2; + CeedInt n_x = 3, n_y = 2; + CeedInt num_elem = n_x * n_y; + CeedInt num_dofs = (n_x * 2 + 1) * (n_y * 2 + 1), num_qpts = num_elem * Q * Q; + CeedInt ind_x[num_elem * P * P]; + 
CeedScalar assembled[num_dofs * num_dofs]; + CeedScalar x[dim * num_dofs], assembled_true[num_dofs * num_dofs]; + CeedScalar *u; + const CeedScalar *v; CeedInit(argv[1], &ceed); // DoF Coordinates - for (CeedInt i=0; i - 100.*CEED_EPSILON) + for (int i = 0; i < num_dofs; i++) { + for (int j = 0; j < num_dofs; j++) { + if (fabs(assembled[j * num_dofs + i] - assembled_true[j * num_dofs + i]) > 100. * CEED_EPSILON) { // LCOV_EXCL_START - printf("[%" CeedInt_FMT ", %" CeedInt_FMT "] Error in assembly: %f != %f\n", - i, j, assembled[j*num_dofs+i], assembled_true[j*num_dofs+i]); - // LCOV_EXCL_STOP + printf("[%" CeedInt_FMT ", %" CeedInt_FMT "] Error in assembly: %f != %f\n", i, j, assembled[j * num_dofs + i], + assembled_true[j * num_dofs + i]); + // LCOV_EXCL_STOP + } + } + } // Cleanup free(rows); diff --git a/tests/t566-operator.c b/tests/t566-operator.c index f2db62437d..222632af77 100644 --- a/tests/t566-operator.c +++ b/tests/t566-operator.c @@ -1,73 +1,69 @@ /// @file /// Test assembly of non-symmetric mass matrix operator (multi-component) see t537 /// \test Test assembly of non-symmetric mass matrix operator (multi-component) +#include "t566-operator.h" + #include -#include #include -#include "t566-operator.h" +#include int main(int argc, char **argv) { - Ceed ceed; - CeedElemRestriction elem_restr_x, elem_restr_u, - elem_restr_qd_i; - CeedBasis basis_x, basis_u; - CeedQFunction qf_setup, qf_mass; - CeedOperator op_setup, op_mass; - CeedVector q_data, X, U, V; - CeedInt P = 3, Q = 3, dim = 2, num_comp = 2; - CeedInt n_x = 1, n_y = 1; - CeedInt num_elem = n_x * n_y; - CeedInt num_dofs = (n_x*(P-1)+1)*(n_y*(P-1)+1), num_qpts = num_elem*Q*Q; - CeedInt ind_x[num_elem*P*P]; - CeedScalar assembled[num_comp*num_comp*num_dofs*num_dofs]; - CeedScalar x[dim*num_dofs], assembled_true[num_comp*num_comp*num_dofs*num_dofs]; - CeedScalar *u; - const CeedScalar *v; + Ceed ceed; + CeedElemRestriction elem_restr_x, elem_restr_u, elem_restr_qd_i; + CeedBasis basis_x, basis_u; + 
CeedQFunction qf_setup, qf_mass; + CeedOperator op_setup, op_mass; + CeedVector q_data, X, U, V; + CeedInt P = 3, Q = 3, dim = 2, num_comp = 2; + CeedInt n_x = 1, n_y = 1; + CeedInt num_elem = n_x * n_y; + CeedInt num_dofs = (n_x * (P - 1) + 1) * (n_y * (P - 1) + 1), num_qpts = num_elem * Q * Q; + CeedInt ind_x[num_elem * P * P]; + CeedScalar assembled[num_comp * num_comp * num_dofs * num_dofs]; + CeedScalar x[dim * num_dofs], assembled_true[num_comp * num_comp * num_dofs * num_dofs]; + CeedScalar *u; + const CeedScalar *v; CeedInit(argv[1], &ceed); // DoF Coordinates - for (CeedInt i=0; i 0) - u[indOld] = 0.0; + CeedInt ind = node_in + comp_in * num_dofs; + u[ind] = 1.0; + if (ind > 0) u[indOld] = 0.0; indOld = ind; CeedVectorRestoreArray(U, &u); @@ -134,30 +122,25 @@ int main(int argc, char **argv) { CeedOperatorApply(op_mass, U, V, CEED_REQUEST_IMMEDIATE); CeedVectorGetArrayRead(V, CEED_MEM_HOST, &v); - for (CeedInt k=0; k - 100.*CEED_EPSILON) + if (fabs(assembled_value - assembled_true_value) > 100. 
* CEED_EPSILON) { // LCOV_EXCL_START - printf("[(%" CeedInt_FMT ", %" CeedInt_FMT "), (%" CeedInt_FMT - ", %" CeedInt_FMT ")] Error in assembly: %f != %f\n", - node_out, comp_out, node_in, comp_in, - assembled_value, assembled_true_value); - // LCOV_EXCL_STOP + printf("[(%" CeedInt_FMT ", %" CeedInt_FMT "), (%" CeedInt_FMT ", %" CeedInt_FMT ")] Error in assembly: %f != %f\n", node_out, comp_out, + node_in, comp_in, assembled_value, assembled_true_value); + // LCOV_EXCL_STOP + } } } } diff --git a/tests/t566-operator.h b/tests/t566-operator.h index 4b00885864..f0303f3c52 100644 --- a/tests/t566-operator.h +++ b/tests/t566-operator.h @@ -7,29 +7,25 @@ #include -CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) { +CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { const CeedScalar *weight = in[0], *J = in[1]; - CeedScalar *rho = out[0]; - for (CeedInt i=0; i -#include #include -#include "t567-operator.h" +#include int main(int argc, char **argv) { - Ceed ceed; - CeedElemRestriction elem_restr_x, elem_restr_u, - elem_restr_qd_i; - CeedBasis basis_x, basis_u; - CeedQFunction qf_setup, qf_diff; - CeedOperator op_setup, op_diff; - CeedVector q_data, X, U, V; - CeedInt P = 3, Q = 3, dim = 2, num_comp = 2; - CeedInt n_x = 1, n_y = 1; - CeedInt num_elem = n_x * n_y; - CeedInt num_dofs = (n_x*(P-1)+1)*(n_y*(P-1)+1), num_qpts = num_elem*Q*Q; - CeedInt ind_x[num_elem*P*P]; - CeedScalar assembled[num_comp*num_comp*num_dofs*num_dofs]; - CeedScalar x[dim*num_dofs], assembled_true[num_comp*num_comp*num_dofs*num_dofs]; - CeedScalar *u; - const CeedScalar *v; + Ceed ceed; + CeedElemRestriction elem_restr_x, elem_restr_u, elem_restr_qd_i; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_diff; + CeedOperator op_setup, op_diff; + CeedVector q_data, X, U, V; + CeedInt P = 3, Q = 3, dim = 2, num_comp = 2; + CeedInt n_x = 1, n_y = 1; + CeedInt num_elem = n_x * n_y; + 
CeedInt num_dofs = (n_x * (P - 1) + 1) * (n_y * (P - 1) + 1), num_qpts = num_elem * Q * Q; + CeedInt ind_x[num_elem * P * P]; + CeedScalar assembled[num_comp * num_comp * num_dofs * num_dofs]; + CeedScalar x[dim * num_dofs], assembled_true[num_comp * num_comp * num_dofs * num_dofs]; + CeedScalar *u; + const CeedScalar *v; CeedInit(argv[1], &ceed); // DoF Coordinates - for (CeedInt i=0; i 0) - u[indOld] = 0.0; + CeedInt ind = node_in + comp_in * num_dofs; + u[ind] = 1.0; + if (ind > 0) u[indOld] = 0.0; indOld = ind; CeedVectorRestoreArray(U, &u); @@ -134,30 +122,25 @@ int main(int argc, char **argv) { CeedOperatorApply(op_diff, U, V, CEED_REQUEST_IMMEDIATE); CeedVectorGetArrayRead(V, CEED_MEM_HOST, &v); - for (CeedInt k=0; k - 100.*CEED_EPSILON) + if (fabs(assembled_value - assembled_true_value) > 100. * CEED_EPSILON) { // LCOV_EXCL_START - printf("[(%" CeedInt_FMT ", %" CeedInt_FMT "), (%" CeedInt_FMT - ", %" CeedInt_FMT ")] Error in assembly: %f != %f\n", - node_out, comp_out, node_in, comp_in, - assembled_value, assembled_true_value); - // LCOV_EXCL_STOP + printf("[(%" CeedInt_FMT ", %" CeedInt_FMT "), (%" CeedInt_FMT ", %" CeedInt_FMT ")] Error in assembly: %f != %f\n", node_out, comp_out, + node_in, comp_in, assembled_value, assembled_true_value); + // LCOV_EXCL_STOP + } } } } diff --git a/tests/t567-operator.h b/tests/t567-operator.h index 56b8132380..55a5822c18 100644 --- a/tests/t567-operator.h +++ b/tests/t567-operator.h @@ -7,18 +7,14 @@ #include -CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) { - // *INDENT-OFF* - const CeedScalar *w = in[0], - (*J)[2][CEED_Q_VLA] = (const CeedScalar(*)[2][CEED_Q_VLA])in[1]; - CeedScalar (*q_data)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; - // *INDENT-ON* +CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { + // *INDENT-OFF* + const CeedScalar *w = in[0], (*J)[2][CEED_Q_VLA] = (const 
CeedScalar(*)[2][CEED_Q_VLA])in[1]; + CeedScalar(*q_data)[CEED_Q_VLA] = (CeedScalar(*)[CEED_Q_VLA])out[0]; + // *INDENT-ON* // Quadrature point loop - CeedPragmaSIMD - for (CeedInt i=0; i -#include #include -#include "t568-operator.h" +#include int main(int argc, char **argv) { - Ceed ceed; - CeedElemRestriction elem_restr_x, elem_restr_u, - elem_restr_qd_i; - CeedBasis basis_x, basis_u; - CeedQFunction qf_setup, qf_diff; - CeedOperator op_setup, op_diff; - CeedVector q_data, X, U, V; - CeedInt P = 3, Q = 3, dim = 2, num_comp = 2; - CeedInt n_x = 1, n_y = 1; - CeedInt num_elem = n_x * n_y; - CeedInt num_dofs = (n_x*(P-1)+1)*(n_y*(P-1)+1), num_qpts = num_elem*Q*Q; - CeedInt ind_x[num_elem*P*P]; - CeedScalar assembled[num_comp*num_comp*num_dofs*num_dofs]; - CeedScalar x[dim*num_dofs], assembled_true[num_comp*num_comp*num_dofs*num_dofs]; - CeedScalar *u; - const CeedScalar *v; + Ceed ceed; + CeedElemRestriction elem_restr_x, elem_restr_u, elem_restr_qd_i; + CeedBasis basis_x, basis_u; + CeedQFunction qf_setup, qf_diff; + CeedOperator op_setup, op_diff; + CeedVector q_data, X, U, V; + CeedInt P = 3, Q = 3, dim = 2, num_comp = 2; + CeedInt n_x = 1, n_y = 1; + CeedInt num_elem = n_x * n_y; + CeedInt num_dofs = (n_x * (P - 1) + 1) * (n_y * (P - 1) + 1), num_qpts = num_elem * Q * Q; + CeedInt ind_x[num_elem * P * P]; + CeedScalar assembled[num_comp * num_comp * num_dofs * num_dofs]; + CeedScalar x[dim * num_dofs], assembled_true[num_comp * num_comp * num_dofs * num_dofs]; + CeedScalar *u; + const CeedScalar *v; CeedInit(argv[1], &ceed); // DoF Coordinates - for (CeedInt i=0; i 0) - u[indOld] = 0.0; + CeedInt ind = node_in + comp_in * num_dofs; + u[ind] = 1.0; + if (ind > 0) u[indOld] = 0.0; indOld = ind; CeedVectorRestoreArray(U, &u); @@ -136,30 +123,25 @@ int main(int argc, char **argv) { CeedOperatorApply(op_diff, U, V, CEED_REQUEST_IMMEDIATE); CeedVectorGetArrayRead(V, CEED_MEM_HOST, &v); - for (CeedInt k=0; k -CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, - const 
CeedScalar *const *in, - CeedScalar *const *out) { +CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) { // At every quadrature point, compute qw/det(J).adj(J).adj(J)^T and store // the symmetric part of the result. @@ -21,24 +19,23 @@ CEED_QFUNCTION(setup)(void *ctx, const CeedInt Q, CeedScalar *qd = out[0]; // Quadrature point loop - for (CeedInt i=0; i