diff --git a/docs/algorithms/kem/ml_kem.md b/docs/algorithms/kem/ml_kem.md index a4059bd55..83a89a930 100644 --- a/docs/algorithms/kem/ml_kem.md +++ b/docs/algorithms/kem/ml_kem.md @@ -7,7 +7,7 @@ - **Authors' website**: https://pq-crystals.org/kyber/ and https://csrc.nist.gov/pubs/fips/203 - **Specification version**: ML-KEM. - **Primary Source**: - - **Source**: https://github.com/bhess/mlkem-native/commit/fc3fa5f4618ef2e5567d0595be3cdb002d4baaac + - **Source**: https://github.com/bhess/mlkem-native/commit/8f54f09f21583fc0e29103f200fd5a42ec57665d - **Implementation license (SPDX-Identifier)**: CC0-1.0 or Apache-2.0 diff --git a/docs/algorithms/kem/ml_kem.yml b/docs/algorithms/kem/ml_kem.yml index e9e831270..3d7cc3d89 100644 --- a/docs/algorithms/kem/ml_kem.yml +++ b/docs/algorithms/kem/ml_kem.yml @@ -17,7 +17,7 @@ website: https://pq-crystals.org/kyber/ and https://csrc.nist.gov/pubs/fips/203 nist-round: FIPS203 spec-version: ML-KEM primary-upstream: - source: https://github.com/bhess/mlkem-native/commit/fc3fa5f4618ef2e5567d0595be3cdb002d4baaac + source: https://github.com/bhess/mlkem-native/commit/8f54f09f21583fc0e29103f200fd5a42ec57665d spdx-license-identifier: CC0-1.0 or Apache-2.0 parameter-sets: - name: ML-KEM-512 diff --git a/scripts/copy_from_upstream/copy_from_upstream.yml b/scripts/copy_from_upstream/copy_from_upstream.yml index f9c237289..357bedf77 100644 --- a/scripts/copy_from_upstream/copy_from_upstream.yml +++ b/scripts/copy_from_upstream/copy_from_upstream.yml @@ -33,8 +33,8 @@ upstreams: - name: mlkem-native git_url: https://github.com/bhess/mlkem-native.git - git_branch: updates-2 - git_commit: fc3fa5f4618ef2e5567d0595be3cdb002d4baaac + git_branch: updates-3 + git_commit: 8f54f09f21583fc0e29103f200fd5a42ec57665d kem_meta_path: '{pretty_name_full}_META.yml' kem_scheme_path: '.' 
- diff --git a/src/kem/ml_kem/CMakeLists.txt b/src/kem/ml_kem/CMakeLists.txt index cf49ea0ef..4ccb40555 100644 --- a/src/kem/ml_kem/CMakeLists.txt +++ b/src/kem/ml_kem/CMakeLists.txt @@ -24,7 +24,7 @@ if(OQS_ENABLE_KEM_ml_kem_512_x86_64) endif() if(OQS_ENABLE_KEM_ml_kem_512_aarch64) - add_library(ml_kem_512_aarch64 OBJECT mlkem-native_ml-kem-512_aarch64/aarch64/src/aarch64_zetas.c mlkem-native_ml-kem-512_aarch64/aarch64/src/intt_clean.S mlkem-native_ml-kem-512_aarch64/aarch64/src/intt_opt.S mlkem-native_ml-kem-512_aarch64/aarch64/src/ntt_clean.S mlkem-native_ml-kem-512_aarch64/aarch64/src/ntt_opt.S mlkem-native_ml-kem-512_aarch64/aarch64/src/poly_clean.S mlkem-native_ml-kem-512_aarch64/aarch64/src/poly_opt.S mlkem-native_ml-kem-512_aarch64/aarch64/src/polyvec_clean.S mlkem-native_ml-kem-512_aarch64/aarch64/src/polyvec_opt.S mlkem-native_ml-kem-512_aarch64/aarch64/src/rej_uniform_asm_clean.S mlkem-native_ml-kem-512_aarch64/cbd.c mlkem-native_ml-kem-512_aarch64/debug/debug.c mlkem-native_ml-kem-512_aarch64/indcpa.c mlkem-native_ml-kem-512_aarch64/kem.c mlkem-native_ml-kem-512_aarch64/ntt.c mlkem-native_ml-kem-512_aarch64/poly.c mlkem-native_ml-kem-512_aarch64/polyvec.c mlkem-native_ml-kem-512_aarch64/rej_uniform.c mlkem-native_ml-kem-512_aarch64/verify.c mlkem-native_ml-kem-512_aarch64/zetas.c) + add_library(ml_kem_512_aarch64 OBJECT mlkem-native_ml-kem-512_aarch64/aarch64/src/aarch64_zetas.c mlkem-native_ml-kem-512_aarch64/aarch64/src/intt_clean.S mlkem-native_ml-kem-512_aarch64/aarch64/src/intt_opt.S mlkem-native_ml-kem-512_aarch64/aarch64/src/ntt_clean.S mlkem-native_ml-kem-512_aarch64/aarch64/src/ntt_opt.S mlkem-native_ml-kem-512_aarch64/aarch64/src/poly_clean.S mlkem-native_ml-kem-512_aarch64/aarch64/src/poly_opt.S mlkem-native_ml-kem-512_aarch64/aarch64/src/polyvec_clean.S mlkem-native_ml-kem-512_aarch64/aarch64/src/polyvec_opt.S mlkem-native_ml-kem-512_aarch64/aarch64/src/rej_uniform_asm_clean.S mlkem-native_ml-kem-512_aarch64/aarch64/src/rej_uniform_table.c 
mlkem-native_ml-kem-512_aarch64/cbd.c mlkem-native_ml-kem-512_aarch64/debug/debug.c mlkem-native_ml-kem-512_aarch64/indcpa.c mlkem-native_ml-kem-512_aarch64/kem.c mlkem-native_ml-kem-512_aarch64/ntt.c mlkem-native_ml-kem-512_aarch64/poly.c mlkem-native_ml-kem-512_aarch64/polyvec.c mlkem-native_ml-kem-512_aarch64/rej_uniform.c mlkem-native_ml-kem-512_aarch64/verify.c mlkem-native_ml-kem-512_aarch64/zetas.c) target_include_directories(ml_kem_512_aarch64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-512_aarch64) target_include_directories(ml_kem_512_aarch64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) target_compile_options(ml_kem_512_aarch64 PUBLIC -DMLKEM_K=2 -DFORCE_AARCH64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=AARCH64_OPT -DMLKEM_USE_NATIVE) @@ -50,7 +50,7 @@ if(OQS_ENABLE_KEM_ml_kem_768_x86_64) endif() if(OQS_ENABLE_KEM_ml_kem_768_aarch64) - add_library(ml_kem_768_aarch64 OBJECT mlkem-native_ml-kem-768_aarch64/aarch64/src/aarch64_zetas.c mlkem-native_ml-kem-768_aarch64/aarch64/src/intt_clean.S mlkem-native_ml-kem-768_aarch64/aarch64/src/intt_opt.S mlkem-native_ml-kem-768_aarch64/aarch64/src/ntt_clean.S mlkem-native_ml-kem-768_aarch64/aarch64/src/ntt_opt.S mlkem-native_ml-kem-768_aarch64/aarch64/src/poly_clean.S mlkem-native_ml-kem-768_aarch64/aarch64/src/poly_opt.S mlkem-native_ml-kem-768_aarch64/aarch64/src/polyvec_clean.S mlkem-native_ml-kem-768_aarch64/aarch64/src/polyvec_opt.S mlkem-native_ml-kem-768_aarch64/aarch64/src/rej_uniform_asm_clean.S mlkem-native_ml-kem-768_aarch64/cbd.c mlkem-native_ml-kem-768_aarch64/debug/debug.c mlkem-native_ml-kem-768_aarch64/indcpa.c mlkem-native_ml-kem-768_aarch64/kem.c mlkem-native_ml-kem-768_aarch64/ntt.c mlkem-native_ml-kem-768_aarch64/poly.c mlkem-native_ml-kem-768_aarch64/polyvec.c mlkem-native_ml-kem-768_aarch64/rej_uniform.c mlkem-native_ml-kem-768_aarch64/verify.c mlkem-native_ml-kem-768_aarch64/zetas.c) + add_library(ml_kem_768_aarch64 OBJECT mlkem-native_ml-kem-768_aarch64/aarch64/src/aarch64_zetas.c 
mlkem-native_ml-kem-768_aarch64/aarch64/src/intt_clean.S mlkem-native_ml-kem-768_aarch64/aarch64/src/intt_opt.S mlkem-native_ml-kem-768_aarch64/aarch64/src/ntt_clean.S mlkem-native_ml-kem-768_aarch64/aarch64/src/ntt_opt.S mlkem-native_ml-kem-768_aarch64/aarch64/src/poly_clean.S mlkem-native_ml-kem-768_aarch64/aarch64/src/poly_opt.S mlkem-native_ml-kem-768_aarch64/aarch64/src/polyvec_clean.S mlkem-native_ml-kem-768_aarch64/aarch64/src/polyvec_opt.S mlkem-native_ml-kem-768_aarch64/aarch64/src/rej_uniform_asm_clean.S mlkem-native_ml-kem-768_aarch64/aarch64/src/rej_uniform_table.c mlkem-native_ml-kem-768_aarch64/cbd.c mlkem-native_ml-kem-768_aarch64/debug/debug.c mlkem-native_ml-kem-768_aarch64/indcpa.c mlkem-native_ml-kem-768_aarch64/kem.c mlkem-native_ml-kem-768_aarch64/ntt.c mlkem-native_ml-kem-768_aarch64/poly.c mlkem-native_ml-kem-768_aarch64/polyvec.c mlkem-native_ml-kem-768_aarch64/rej_uniform.c mlkem-native_ml-kem-768_aarch64/verify.c mlkem-native_ml-kem-768_aarch64/zetas.c) target_include_directories(ml_kem_768_aarch64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-768_aarch64) target_include_directories(ml_kem_768_aarch64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) target_compile_options(ml_kem_768_aarch64 PUBLIC -DMLKEM_K=3 -DFORCE_AARCH64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=AARCH64_OPT -DMLKEM_USE_NATIVE) @@ -76,7 +76,7 @@ if(OQS_ENABLE_KEM_ml_kem_1024_x86_64) endif() if(OQS_ENABLE_KEM_ml_kem_1024_aarch64) - add_library(ml_kem_1024_aarch64 OBJECT mlkem-native_ml-kem-1024_aarch64/aarch64/src/aarch64_zetas.c mlkem-native_ml-kem-1024_aarch64/aarch64/src/intt_clean.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/intt_opt.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/ntt_clean.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/ntt_opt.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/poly_clean.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/poly_opt.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/polyvec_clean.S 
mlkem-native_ml-kem-1024_aarch64/aarch64/src/polyvec_opt.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/rej_uniform_asm_clean.S mlkem-native_ml-kem-1024_aarch64/cbd.c mlkem-native_ml-kem-1024_aarch64/debug/debug.c mlkem-native_ml-kem-1024_aarch64/indcpa.c mlkem-native_ml-kem-1024_aarch64/kem.c mlkem-native_ml-kem-1024_aarch64/ntt.c mlkem-native_ml-kem-1024_aarch64/poly.c mlkem-native_ml-kem-1024_aarch64/polyvec.c mlkem-native_ml-kem-1024_aarch64/rej_uniform.c mlkem-native_ml-kem-1024_aarch64/verify.c mlkem-native_ml-kem-1024_aarch64/zetas.c) + add_library(ml_kem_1024_aarch64 OBJECT mlkem-native_ml-kem-1024_aarch64/aarch64/src/aarch64_zetas.c mlkem-native_ml-kem-1024_aarch64/aarch64/src/intt_clean.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/intt_opt.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/ntt_clean.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/ntt_opt.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/poly_clean.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/poly_opt.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/polyvec_clean.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/polyvec_opt.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/rej_uniform_asm_clean.S mlkem-native_ml-kem-1024_aarch64/aarch64/src/rej_uniform_table.c mlkem-native_ml-kem-1024_aarch64/cbd.c mlkem-native_ml-kem-1024_aarch64/debug/debug.c mlkem-native_ml-kem-1024_aarch64/indcpa.c mlkem-native_ml-kem-1024_aarch64/kem.c mlkem-native_ml-kem-1024_aarch64/ntt.c mlkem-native_ml-kem-1024_aarch64/poly.c mlkem-native_ml-kem-1024_aarch64/polyvec.c mlkem-native_ml-kem-1024_aarch64/rej_uniform.c mlkem-native_ml-kem-1024_aarch64/verify.c mlkem-native_ml-kem-1024_aarch64/zetas.c) target_include_directories(ml_kem_1024_aarch64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-1024_aarch64) target_include_directories(ml_kem_1024_aarch64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) target_compile_options(ml_kem_1024_aarch64 PUBLIC -DMLKEM_K=4 -DFORCE_AARCH64 
-DMLKEM_NATIVE_ARITH_BACKEND_NAME=AARCH64_OPT -DMLKEM_USE_NATIVE) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/aarch64_zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/aarch64_zetas.c index ecf1b529a..b638e2081 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/aarch64_zetas.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/aarch64_zetas.c @@ -20,7 +20,7 @@ * Table of zeta values used in the AArch64 forward NTT * See autogenerate_files.py for details. */ -const int16_t aarch64_ntt_zetas_layer01234[] = { +ALIGN const int16_t aarch64_ntt_zetas_layer01234[] = { -1600, -15749, -749, -7373, -40, -394, -687, -6762, 630, 6201, -1432, -14095, 848, 8347, 0, 0, 1062, 10453, 296, 2914, -882, -8682, 0, 0, -1410, -13879, 1339, 13180, 1476, 14529, @@ -31,7 +31,7 @@ const int16_t aarch64_ntt_zetas_layer01234[] = { 0, 0, -1583, -15582, -1355, -13338, 821, 8081, 0, 0, }; -const int16_t aarch64_ntt_zetas_layer56[] = { +ALIGN const int16_t aarch64_ntt_zetas_layer56[] = { 289, 289, 331, 331, -76, -76, -1573, -1573, 2845, 2845, 3258, 3258, -748, -748, -15483, -15483, 17, 17, 583, 583, 1637, 1637, -1041, -1041, 167, 167, 5739, @@ -77,7 +77,7 @@ const int16_t aarch64_ntt_zetas_layer56[] = { 10129, 10129, -3878, -3878, -11566, -11566, }; -const int16_t aarch64_invntt_zetas_layer01234[] = { +ALIGN const int16_t aarch64_invntt_zetas_layer01234[] = { 1583, 15582, -821, -8081, 1355, 13338, 0, 0, -569, -5601, 450, 4429, 936, 9213, 0, 0, 69, 679, 447, 4400, -535, -5266, 0, 0, 543, 5345, 1235, 12156, -1426, -14036, @@ -88,7 +88,7 @@ const int16_t aarch64_invntt_zetas_layer01234[] = { -848, -8347, 1432, 14095, -630, -6201, 687, 6762, 0, 0, }; -const int16_t aarch64_invntt_zetas_layer56[] = { +ALIGN const int16_t aarch64_invntt_zetas_layer56[] = { -910, -910, -1227, -1227, 219, 219, 855, 855, -8957, -8957, -12078, -12078, 2156, 2156, 8416, 8416, 1175, 1175, 394, 394, -1029, -1029, -1212, -1212, 11566, 11566, 
3878, @@ -134,7 +134,7 @@ const int16_t aarch64_invntt_zetas_layer56[] = { -16113, -16113, -5739, -5739, -167, -167, }; -const int16_t aarch64_zetas_mulcache_native[] = { +ALIGN const int16_t aarch64_zetas_mulcache_native[] = { 17, -17, -568, 568, 583, -583, -680, 680, 1637, -1637, 723, -723, -1041, 1041, 1100, -1100, 1409, -1409, -667, 667, -48, 48, 233, -233, 756, -756, -1173, 1173, -314, 314, -279, 279, -1626, @@ -149,7 +149,7 @@ const int16_t aarch64_zetas_mulcache_native[] = { 1219, -394, 394, 885, -885, -1175, 1175, }; -const int16_t aarch64_zetas_mulcache_twisted_native[] = { +ALIGN const int16_t aarch64_zetas_mulcache_twisted_native[] = { 167, -167, -5591, 5591, 5739, -5739, -6693, 6693, 16113, -16113, 7117, -7117, -10247, 10247, 10828, -10828, 13869, -13869, -6565, 6565, -472, 472, 2293, -2293, 7441, -7441, -11546, @@ -168,7 +168,6 @@ const int16_t aarch64_zetas_mulcache_twisted_native[] = { }; #else -#include "params.h" /* Dummy declaration for compilers disliking empty compilation units */ #define empty_cu_aarch64_zetas MLKEM_NAMESPACE(empty_cu_aarch64_zetas) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/arith_native_aarch64.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/arith_native_aarch64.h index 2f3b0ef4f..6a5ee8a7d 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/arith_native_aarch64.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/arith_native_aarch64.h @@ -19,6 +19,7 @@ MLKEM_NAMESPACE(aarch64_zetas_mulcache_native) #define aarch64_zetas_mulcache_twisted_native \ MLKEM_NAMESPACE(aarch64_zetas_mulcache_twisted_native) +#define rej_uniform_table MLKEM_NAMESPACE(rej_uniform_table) extern const int16_t aarch64_ntt_zetas_layer01234[]; extern const int16_t aarch64_ntt_zetas_layer56[]; @@ -26,6 +27,7 @@ extern const int16_t aarch64_invntt_zetas_layer01234[]; extern const int16_t aarch64_invntt_zetas_layer56[]; extern const int16_t aarch64_zetas_mulcache_native[]; 
extern const int16_t aarch64_zetas_mulcache_twisted_native[]; +extern const uint8_t rej_uniform_table[]; #define ntt_asm_clean MLKEM_NAMESPACE(ntt_asm_clean) void ntt_asm_clean(int16_t *, const int16_t *, const int16_t *); @@ -41,7 +43,7 @@ void intt_asm_opt(int16_t *, const int16_t *, const int16_t *); #define rej_uniform_asm_clean MLKEM_NAMESPACE(rej_uniform_asm_clean) unsigned int rej_uniform_asm_clean(int16_t *r, const uint8_t *buf, - unsigned int buflen); + unsigned int buflen, const uint8_t *table); #define poly_reduce_asm_clean MLKEM_NAMESPACE(poly_reduce_asm_clean) void poly_reduce_asm_clean(int16_t *); diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/clean_impl.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/clean_impl.h index c91a7eedd..b0ff3d597 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/clean_impl.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/clean_impl.h @@ -74,7 +74,7 @@ static INLINE int rej_uniform_native(int16_t *r, unsigned int len, { return -1; } - return (int)rej_uniform_asm_clean(r, buf, buflen); + return (int)rej_uniform_asm_clean(r, buf, buflen, rej_uniform_table); } #endif /* MLKEM_NATIVE_ARITH_PROFILE_IMPL_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/common.i b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/common.i deleted file mode 100644 index 6b8880311..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/common.i +++ /dev/null @@ -1,13 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: MIT - */ - -#if __APPLE__ -#define ASM_LOAD(dst, symbol) \ - adrp dst, symbol @PAGE %% add dst, dst, symbol @PAGEOFF -#else /* __APPLE__ */ -#define ASM_LOAD(dst, symbol) \ - adrp dst, symbol; \ - add dst, dst, : lo12 : symbol; -#endif /* __APPLE__ */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/intt_clean.S 
b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/intt_clean.S index 2f05d8cca..623a82ae9 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/intt_clean.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/intt_clean.S @@ -26,10 +26,6 @@ #include "common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) -// Needed to provide ASM_LOAD directive -#include "common.i" -#include "params.h" - // Bounds: // If C is chosen so that |src| < q * C, then |dst| < q * (0.0508 * C + 1/2) // @@ -145,38 +141,7 @@ .text - .global MLKEM_NAMESPACE(intt_asm_clean) - .global _MLKEM_NAMESPACE(intt_asm_clean) - -.p2align 4 -const_addr: .short 3329 - .short 20159 - .short 0 - .short 0 - .short 0 - .short 0 - .short 0 - .short 0 -ninv_addr: .short 512 - .short 512 - .short 512 - .short 512 - .short 512 - .short 512 - .short 512 - .short 512 -ninv_tw_addr: .short 5040 - .short 5040 - .short 5040 - .short 5040 - .short 5040 - .short 5040 - .short 5040 - .short 5040 - -MLKEM_NAMESPACE(intt_asm_clean): -_MLKEM_NAMESPACE(intt_asm_clean): - push_stack + .global MLKEM_ASM_NAMESPACE(intt_asm_clean) in .req x0 r01234_ptr .req x1 @@ -227,16 +192,41 @@ _MLKEM_NAMESPACE(intt_asm_clean): t2 .req v27 t3 .req v28 - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - ninv .req v29 + q_ninv .req q29 ninv_tw .req v30 + q_ninv_tw .req q30 + +/* Literal pool */ +.macro dup8h c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c +.endm + +.p2align 4 +c_consts: .short 3329 + .short 20159 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 +c_ninv: dup8h 512 +c_ninv_tw: dup8h 5040 + +MLKEM_ASM_NAMESPACE(intt_asm_clean): + push_stack - ASM_LOAD(xtmp, ninv_addr) - ld1r {ninv.8h}, [xtmp] - ASM_LOAD(xtmp, ninv_tw_addr) - ld1r {ninv_tw.8h}, [xtmp] + ldr q_consts, c_consts + ldr q_ninv, c_ninv + ldr q_ninv_tw, c_ninv_tw mov inp, in mov count, #8 diff --git 
a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/intt_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/intt_opt.S index fc720e504..e332efef8 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/intt_opt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/intt_opt.S @@ -26,10 +26,6 @@ #include "common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) -// Needed to provide ASM_LOAD directive -#include "common.i" -#include "params.h" - // Bounds: // If C is chosen so that |src| < q * C, then |dst| < q * (0.0508 * C + 1/2) // @@ -145,38 +141,7 @@ .text - .global MLKEM_NAMESPACE(intt_asm_opt) - .global _MLKEM_NAMESPACE(intt_asm_opt) - -.p2align 4 -const_addr: .short 3329 - .short 20159 - .short 0 - .short 0 - .short 0 - .short 0 - .short 0 - .short 0 -ninv_addr: .short 512 - .short 512 - .short 512 - .short 512 - .short 512 - .short 512 - .short 512 - .short 512 -ninv_tw_addr: .short 5040 - .short 5040 - .short 5040 - .short 5040 - .short 5040 - .short 5040 - .short 5040 - .short 5040 - -MLKEM_NAMESPACE(intt_asm_opt): -_MLKEM_NAMESPACE(intt_asm_opt): - push_stack + .global MLKEM_ASM_NAMESPACE(intt_asm_opt) in .req x0 r01234_ptr .req x1 @@ -227,16 +192,41 @@ _MLKEM_NAMESPACE(intt_asm_opt): t2 .req v27 t3 .req v28 - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - ninv .req v29 + q_ninv .req q29 ninv_tw .req v30 + q_ninv_tw .req q30 + +/* Literal pool */ +.macro dup8h c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c +.endm + +.p2align 4 +c_consts: .short 3329 + .short 20159 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 +c_ninv: dup8h 512 +c_ninv_tw: dup8h 5040 + +MLKEM_ASM_NAMESPACE(intt_asm_opt): + push_stack - ASM_LOAD(xtmp, ninv_addr) - ld1r {ninv.8h}, [xtmp] - ASM_LOAD(xtmp, ninv_tw_addr) - ld1r {ninv_tw.8h}, [xtmp] + ldr q_consts, c_consts + ldr q_ninv, c_ninv + ldr q_ninv_tw, c_ninv_tw mov inp, in mov count, #8 diff 
--git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/ntt_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/ntt_clean.S index b00d08810..877a5f689 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/ntt_clean.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/ntt_clean.S @@ -27,10 +27,6 @@ #include "common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) -// Needed to provide ASM_LOAD directive -#include "common.i" -#include "params.h" - // Bounds: // If C is chosen so that |src| < q * C, then |dst| < q * (0.0508 * C + 1/2) // @@ -64,12 +60,6 @@ add \a\().8h, \a\().8h, tmp.8h .endm -.macro barrett_reduce a - sqdmulh t0.8h, \a\().8h, consts.h[1] - srshr t0.8h, t0.8h, #11 - mls \a\().8h, t0.8h, consts.h[0] -.endm - .macro load_roots_012 ldr q_root0, [r01234_ptr], #32 ldr q_root1, [r01234_ptr, #-16] @@ -100,13 +90,6 @@ trn1 \data\()1.2d, t1.2d, t3.2d .endm -.macro transpose_single data_out, data_in - trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s - trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s - trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s - trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s -.endm - .macro save_vregs sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] @@ -182,11 +165,11 @@ t3 .req v28 .text - .global MLKEM_NAMESPACE(ntt_asm_clean) - .global _MLKEM_NAMESPACE(ntt_asm_clean) + .global MLKEM_ASM_NAMESPACE(ntt_asm_clean) +/* Literal pool */ .p2align 4 -const_addr: +c_consts: .short 3329 .short 20159 .short 0 @@ -196,12 +179,9 @@ const_addr: .short 0 .short 0 -MLKEM_NAMESPACE(ntt_asm_clean): -_MLKEM_NAMESPACE(ntt_asm_clean): +MLKEM_ASM_NAMESPACE(ntt_asm_clean): push_stack - ASM_LOAD(xtmp, const_addr) - - ld1 {consts.8h}, [xtmp] + ldr q_consts, c_consts mov inp, in mov count, #4 diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/ntt_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/ntt_opt.S index 
b8a22ecfc..15103a595 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/ntt_opt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/ntt_opt.S @@ -27,10 +27,6 @@ #include "common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) -// Needed to provide ASM_LOAD directive -#include "common.i" -#include "params.h" - // Bounds: // If C is chosen so that |src| < q * C, then |dst| < q * (0.0508 * C + 1/2) @@ -64,12 +60,6 @@ add \a\().8h, \a\().8h, tmp.8h .endm -.macro barrett_reduce a - sqdmulh t0.8h, \a\().8h, consts.h[1] - srshr t0.8h, t0.8h, #11 - mls \a\().8h, t0.8h, consts.h[0] -.endm - .macro load_roots_012 ldr q_root0, [r01234_ptr], #32 ldr q_root1, [r01234_ptr, #-16] @@ -100,13 +90,6 @@ trn1 \data\()1.2d, t1.2d, t3.2d .endm -.macro transpose_single data_out, data_in - trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s - trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s - trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s - trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s -.endm - .macro save_vregs sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] @@ -182,11 +165,11 @@ t3 .req v28 .text - .global MLKEM_NAMESPACE(ntt_asm_opt) - .global _MLKEM_NAMESPACE(ntt_asm_opt) + .global MLKEM_ASM_NAMESPACE(ntt_asm_opt) +/* Literal pool */ .p2align 4 -const_addr: +c_consts: .short 3329 .short 20159 .short 0 @@ -196,12 +179,9 @@ const_addr: .short 0 .short 0 -MLKEM_NAMESPACE(ntt_asm_opt): -_MLKEM_NAMESPACE(ntt_asm_opt): +MLKEM_ASM_NAMESPACE(ntt_asm_opt): push_stack - ASM_LOAD(xtmp, const_addr) - - ld1 {consts.8h}, [xtmp] + ldr q_consts, c_consts mov inp, in mov count, #4 diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/opt_impl.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/opt_impl.h index 5e4d00e1f..b22674026 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/opt_impl.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/opt_impl.h @@ -75,7 
+75,7 @@ static INLINE int rej_uniform_native(int16_t *r, unsigned int len, { return -1; } - return (int)rej_uniform_asm_clean(r, buf, buflen); + return (int)rej_uniform_asm_clean(r, buf, buflen, rej_uniform_table); } #endif /* MLKEM_NATIVE_ARITH_PROFILE_IMPL_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/poly_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/poly_clean.S index 3e1bc5cf4..f70a40221 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/poly_clean.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/poly_clean.S @@ -6,65 +6,85 @@ #include "common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) -// Needed to provide ASM_LOAD directive -#include "common.i" -#include "params.h" +/* We use a single literal pool for all functions in this file. + * This is OK even when the file gets expanded through SLOTHY, + * since PC-relative offsets are up to 1MB in AArch64. + * + * The use of dup8h to build constant vectors in memory + * is slightly wasteful and could be avoided with a GPR-load + * followed by Neon `dup`, but we're ultimately only talking + * about 64 bytes, so it seems OK. 
+ */ -.macro barrett_reduce a - sqdmulh t0.8h, \a\().8h, consts.h[1] - srshr t0.8h, t0.8h, #11 - mls \a\().8h, t0.8h, consts.h[0] +.macro dup8h c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c .endm -.macro scalar_signed_to_unsigned a - sshr mask.8h, \a\().8h, #15 - and mask.16b, modulus.16b, mask.16b - add \a\().8h, \a\().8h, mask.8h +.p2align 4 +c_modulus: dup8h 3329 // ML-KEM modulus +c_modulus_twisted: dup8h 20159 // Barrett twist of 1 wrt 2^27 +c_mont_constant: dup8h -1044 // 2^16 % 3329 +c_barrett_twist: dup8h -10276 // Barrett twist of -1044 (wrt 2^16) + +/* + * Some modular arithmetic macros + */ + +/* Barrett reduction */ +.macro barrett_reduce a + sqdmulh tmp.8h, \a\().8h, modulus_twisted.h[0] + srshr tmp.8h, tmp.8h, #11 + mls \a\().8h, tmp.8h, modulus.h[0] .endm +/* Montgomery multiplication, with precomputed Montgomery twist + * Expects modulus in modulus.h[0]. */ .macro mulmod dst, src, const, const_twisted sqrdmulh tmp0.8h, \src\().8h, \const_twisted\().8h mul \dst\().8h, \src\().8h, \const\().8h - mls \dst\().8h, tmp0.8h, consts.h[0] + mls \dst\().8h, tmp0.8h, modulus.h[0] +.endm + +/* Turns signed-canonical to unsigned canonical representative + * through conditional addition of the modulus. + * + * Expected modulus in `modulus`. 
*/ +.macro scalar_signed_to_unsigned a + sshr mask.8h, \a\().8h, #15 + and mask.16b, modulus.16b, mask.16b + add \a\().8h, \a\().8h, mask.8h .endm /********************************** * poly_reduce() * **********************************/ -.global MLKEM_NAMESPACE(poly_reduce_asm_clean) -.global _MLKEM_NAMESPACE(poly_reduce_asm_clean) -.p2align 4 -const_addr: - .short 3329 // ML-KEM modulus - .short 20159 // Barrett twist of 1 wrt 2^27 - .short -1044 // 2^16 % 3329 - .short -10276 // Barrett twist of -1044 (wrt 2^16) - .short 0 - .short 0 - .short 0 - .short 0 - - ptr .req x0 - count .req x1 - xtmp .req x2 - wmodulus .req w3 - - q_data .req q0 - data .req v0 - t0 .req v1 - consts .req v2 - mask .req v3 - modulus .req v4 - -MLKEM_NAMESPACE(poly_reduce_asm_clean): -_MLKEM_NAMESPACE(poly_reduce_asm_clean): - - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - - mov wmodulus, #3329 - dup modulus.8h, wmodulus +.global MLKEM_ASM_NAMESPACE(poly_reduce_asm_clean) + + ptr .req x0 + count .req x1 + + data .req v0 + q_data .req q0 + + tmp .req v1 + mask .req v2 + modulus .req v3 + q_modulus .req q3 + modulus_twisted .req v4 + q_modulus_twisted .req q4 + +MLKEM_ASM_NAMESPACE(poly_reduce_asm_clean): + + ldr q_modulus, c_modulus + ldr q_modulus_twisted, c_modulus_twisted mov count, #8 loop_start: @@ -95,57 +115,64 @@ loop_start: .unreq ptr .unreq count - .unreq xtmp - .unreq q_data + .unreq data - .unreq t0 - .unreq consts + .unreq q_data + + .unreq tmp .unreq mask - .unreq wmodulus .unreq modulus + .unreq q_modulus + .unreq modulus_twisted + .unreq q_modulus_twisted /******************************************** * poly_mulcache_compute() * ********************************************/ -.global MLKEM_NAMESPACE(poly_mulcache_compute_asm_clean) -.global _MLKEM_NAMESPACE(poly_mulcache_compute_asm_clean) - - cache_ptr .req x0 - data_ptr .req x1 - zeta_ptr .req x2 - zeta_twisted_ptr .req x3 - xtmp .req x4 - - count .req x5 - - data_odd .req v1 - - zeta .req v2 - zeta_twisted .req v3 
- q_zeta .req q2 - q_zeta_twisted .req q3 - - q_tmp0 .req q4 - q_tmp1 .req q5 - tmp0 .req v4 - tmp1 .req v5 - consts .req v6 - dst .req v7 - q_dst .req q7 - -MLKEM_NAMESPACE(poly_mulcache_compute_asm_clean): -_MLKEM_NAMESPACE(poly_mulcache_compute_asm_clean): - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - - mov count, #(16) + +.global MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_clean) + + cache_ptr .req x0 + data_ptr .req x1 + zeta_ptr .req x2 + zeta_twisted_ptr .req x3 + count .req x4 + + data_odd .req v0 + zeta .req v1 + q_zeta .req q1 + zeta_twisted .req v2 + q_zeta_twisted .req q2 + + tmp0 .req v3 + q_tmp0 .req q3 + tmp1 .req v4 + q_tmp1 .req q4 + dst .req v5 + q_dst .req q5 + + modulus .req v6 + q_modulus .req q6 + modulus_twisted .req v7 + q_modulus_twisted .req q7 + +MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_clean): + ldr q_modulus, c_modulus + ldr q_modulus_twisted, c_modulus_twisted + + mov count, #16 mulcache_compute_loop_start: ldr q_tmp0, [data_ptr], #32 ldr q_tmp1, [data_ptr, #-16] ldr q_zeta, [zeta_ptr], #16 ldr q_zeta_twisted, [zeta_twisted_ptr], #16 + + // The mulcache of a polynomial a + b*X in Fq[X^2-zeta] is b*zeta; + // Since tmp0 || tmp1 represents multiple such polynomials as + // (a0,b0,a1,b1,...), extract only the odd elements. 
uzp2 data_odd.8h, tmp0.8h, tmp1.8h mulmod dst, data_odd, zeta, zeta_twisted + str q_dst, [cache_ptr], #16 subs count, count, #1 @@ -156,7 +183,7 @@ mulcache_compute_loop_start: .unreq cache_ptr .unreq data_ptr .unreq zeta_ptr - .unreq xtmp + .unreq zeta_twisted_ptr .unreq count .unreq data_odd @@ -165,34 +192,35 @@ mulcache_compute_loop_start: .unreq zeta_twisted .unreq q_zeta_twisted - .unreq q_tmp0 - .unreq q_tmp1 .unreq tmp0 + .unreq q_tmp0 .unreq tmp1 - .unreq consts + .unreq q_tmp1 .unreq dst + .unreq q_dst + + .unreq modulus + .unreq q_modulus + .unreq modulus_twisted + .unreq q_modulus_twisted /******************************************** * poly_tobytes() * ********************************************/ -.global MLKEM_NAMESPACE(poly_tobytes_asm_clean) -.global _MLKEM_NAMESPACE(poly_tobytes_asm_clean) +.global MLKEM_ASM_NAMESPACE(poly_tobytes_asm_clean) data0 .req v0 data1 .req v1 - out0 .req v2 out1 .req v3 out2 .req v4 - - tmp .req v5 + tmp .req v5 dst .req x0 src .req x1 count .req x2 -MLKEM_NAMESPACE(poly_tobytes_asm_clean): -_MLKEM_NAMESPACE(poly_tobytes_asm_clean): +MLKEM_ASM_NAMESPACE(poly_tobytes_asm_clean): mov count, #16 poly_tobytes_asm_clean_asm_loop_start: @@ -229,37 +257,33 @@ poly_tobytes_asm_clean_asm_loop_start: /********************************** * poly_tomont() * **********************************/ -.global MLKEM_NAMESPACE(poly_tomont_asm_clean) -.global _MLKEM_NAMESPACE(poly_tomont_asm_clean) - - src .req x0 - count .req x1 - xtmp .req x2 - - wtmp2 .req w3 +.global MLKEM_ASM_NAMESPACE(poly_tomont_asm_clean) - data .req v0 - q_data .req q0 - res .req v1 - q_res .req q1 + src .req x0 + count .req x1 - consts .req v2 + data .req v0 + q_data .req q0 + res .req v1 + q_res .req q1 - factor .req v3 - factor_t .req v4 + factor .req v2 + q_factor .req q2 + factor_t .req v3 + q_factor_t .req q3 + modulus .req v4 + q_modulus .req q4 + modulus_twisted .req v5 + q_modulus_twisted .req q5 - tmp0 .req v5 + tmp0 .req v6 
-MLKEM_NAMESPACE(poly_tomont_asm_clean): -_MLKEM_NAMESPACE(poly_tomont_asm_clean): +MLKEM_ASM_NAMESPACE(poly_tomont_asm_clean): - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - - ldrsh wtmp2, [xtmp, #(2*2)] - dup factor.8h, wtmp2 - ldrh wtmp2, [xtmp, #(3*2)] - dup factor_t.8h, wtmp2 + ldr q_modulus, c_modulus + ldr q_modulus_twisted, c_modulus_twisted + ldr q_factor, c_mont_constant + ldr q_factor_t, c_barrett_twist mov count, #8 poly_tomont_asm_loop: @@ -285,4 +309,23 @@ poly_tomont_asm_loop: ret + .unreq src + .unreq count + + .unreq data + .unreq q_data + .unreq res + .unreq q_res + + .unreq factor + .unreq q_factor + .unreq factor_t + .unreq q_factor_t + .unreq modulus + .unreq q_modulus + .unreq modulus_twisted + .unreq q_modulus_twisted + + .unreq tmp0 + #endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/poly_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/poly_opt.S index df3b21008..e58ee77c4 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/poly_opt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/poly_opt.S @@ -6,117 +6,137 @@ #include "common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) -// Needed to provide ASM_LOAD directive -#include "common.i" -#include "params.h" +/* We use a single literal pool for all functions in this file. + * This is OK even when the file gets expanded through SLOTHY, + * since PC-relative offsets are up to 1MB in AArch64. + * + * The use of dup8h to build constant vectors in memory + * is slightly wasteful and could be avoided with a GPR-load + * followed by Neon `dup`, but we're ultimately only talking + * about 64 bytes, so it seems OK. 
+ */ -.macro barrett_reduce a - sqdmulh t0.8h, \a\().8h, consts.h[1] - srshr t0.8h, t0.8h, #11 - mls \a\().8h, t0.8h, consts.h[0] +.macro dup8h c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c .endm -.macro scalar_signed_to_unsigned a - sshr mask.8h, \a\().8h, #15 - and mask.16b, modulus.16b, mask.16b - add \a\().8h, \a\().8h, mask.8h +.p2align 4 +c_modulus: dup8h 3329 // ML-KEM modulus +c_modulus_twisted: dup8h 20159 // Barrett twist of 1 wrt 2^27 +c_mont_constant: dup8h -1044 // 2^16 % 3329 +c_barrett_twist: dup8h -10276 // Barrett twist of -1044 (wrt 2^16) + +/* + * Some modular arithmetic macros + */ + +/* Barrett reduction */ +.macro barrett_reduce a + sqdmulh tmp.8h, \a\().8h, modulus_twisted.h[0] + srshr tmp.8h, tmp.8h, #11 + mls \a\().8h, tmp.8h, modulus.h[0] .endm +/* Montgomery multiplication, with precomputed Montgomery twist + * Expects modulus in modulus.h[0]. */ .macro mulmod dst, src, const, const_twisted sqrdmulh tmp0.8h, \src\().8h, \const_twisted\().8h mul \dst\().8h, \src\().8h, \const\().8h - mls \dst\().8h, tmp0.8h, consts.h[0] + mls \dst\().8h, tmp0.8h, modulus.h[0] +.endm + +/* Turns signed-canonical to unsigned canonical representative + * through conditional addition of the modulus. + * + * Expects modulus in `modulus`. 
*/ +.macro scalar_signed_to_unsigned a + sshr mask.8h, \a\().8h, #15 + and mask.16b, modulus.16b, mask.16b + add \a\().8h, \a\().8h, mask.8h .endm /********************************** * poly_reduce() * **********************************/ -.global MLKEM_NAMESPACE(poly_reduce_asm_opt) -.global _MLKEM_NAMESPACE(poly_reduce_asm_opt) -.p2align 4 -const_addr: - .short 3329 // ML-KEM modulus - .short 20159 // Barrett twist of 1 wrt 2^27 - .short -1044 // 2^16 % 3329 - .short -10276 // Barrett twist of -1044 (wrt 2^16) - .short 0 - .short 0 - .short 0 - .short 0 - - ptr .req x0 - count .req x1 - xtmp .req x2 - wmodulus .req w3 - - q_data .req q0 - data .req v0 - t0 .req v1 - consts .req v2 - mask .req v3 - modulus .req v4 - -MLKEM_NAMESPACE(poly_reduce_asm_opt): -_MLKEM_NAMESPACE(poly_reduce_asm_opt): - - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - - mov wmodulus, #3329 - dup modulus.8h, wmodulus +.global MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt) + + ptr .req x0 + count .req x1 + + data .req v0 + q_data .req q0 + + tmp .req v1 + mask .req v2 + modulus .req v3 + q_modulus .req q3 + modulus_twisted .req v4 + q_modulus_twisted .req q4 + +MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt): + + ldr q_modulus, c_modulus + ldr q_modulus_twisted, c_modulus_twisted mov count, #8 - // Instructions: 15 - // Expected cycles: 22 - // Expected IPC: 0.68 + // Instructions: 15 + // Expected cycles: 22 + // Expected IPC: 0.68 - // Cycle bound: 22.0 - // IPC bound: 0.68 + // Cycle bound: 22.0 + // IPC bound: 0.68 - // Wall time: 0.04s - // User time: 0.04s + // Wall time: 0.05s + // User time: 0.05s - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldr q21, [x0, #48] // *............................. - ldr q3, [x0, #32] // ..*........................... - sqdmulh v6.8H, v21.8H, v2.H[1] // ....*......................... - sqdmulh v30.8H, v3.8H, v2.H[1] // ......*....................... - srshr v6.8H, v6.8H, #11 // ........*..................... 
- srshr v30.8H, v30.8H, #11 // ..........*................... - mls v21.8H, v6.8H, v2.H[0] // ...........*.................. - mls v3.8H, v30.8H, v2.H[0] // .............*................ - ldr q22, [x0], #64 // ..............*............... - sshr v6.8H, v21.8H, #15 // ................*............. - sshr v30.8H, v3.8H, #15 // .................*............ - and v6.16B, v4.16B, v6.16B // ..................*........... - add v6.8H, v21.8H, v6.8H // ...................*.......... - and v21.16B, v4.16B, v30.16B // ....................*......... - add v21.8H, v3.8H, v21.8H // .....................*........ + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr q21, [x0, #32] // *............................. + ldr q23, [x0, #48] // ..*........................... + sqdmulh v7.8H, v21.8H, v4.H[0] // ....*......................... + sqdmulh v30.8H, v23.8H, v4.H[0] // ......*....................... + srshr v7.8H, v7.8H, #11 // ........*..................... + srshr v30.8H, v30.8H, #11 // ..........*................... + mls v21.8H, v7.8H, v3.H[0] // ...........*.................. + mls v23.8H, v30.8H, v3.H[0] // .............*................ + ldr q5, [x0, #16] // ..............*............... + sshr v7.8H, v21.8H, #15 // ................*............. + sshr v30.8H, v23.8H, #15 // .................*............ + and v7.16B, v3.16B, v7.16B // ..................*........... + add v21.8H, v21.8H, v7.8H // ...................*.......... + and v7.16B, v3.16B, v30.16B // ....................*......... + add v16.8H, v23.8H, v7.8H // .....................*........ // ------ cycle (expected) ------> // 0 25 // |------------------------|----- - // ldr q31, [x0, #48] // *.............................. - // sqdmulh v24.8H, v31.8H, v2.H[1] // ....*.......................... - // srshr v6.8H, v24.8H, #11 // ........*...................... - // mls v31.8H, v6.8H, v2.H[0] // ...........*................... 
- // ldr q30, [x0, #32] // ..*............................ - // sshr v16.8H, v31.8H, #15 // ................*.............. - // and v28.16B, v4.16B, v16.16B // ..................*............ - // sqdmulh v23.8H, v30.8H, v2.H[1] // ......*........................ - // add v6.8H, v31.8H, v28.8H // ...................*........... - // srshr v18.8H, v23.8H, #11 // ..........*.................... - // mls v30.8H, v18.8H, v2.H[0] // .............*................. - // ldr q22, [x0], #64 // ..............*................ - // sshr v0.8H, v30.8H, #15 // .................*............. - // and v29.16B, v4.16B, v0.16B // ....................*.......... - // add v21.8H, v30.8H, v29.8H // .....................*......... + // ldr q30, [x0, #32] // *.............................. + // sqdmulh v22.8H, v30.8H, v4.H[0] // ....*.......................... + // ldr q2, [x0, #48] // ..*............................ + // srshr v19.8H, v22.8H, #11 // ........*...................... + // mls v30.8H, v19.8H, v3.H[0] // ...........*................... + // sqdmulh v25.8H, v2.8H, v4.H[0] // ......*........................ + // sshr v31.8H, v30.8H, #15 // ................*.............. + // srshr v25.8H, v25.8H, #11 // ..........*.................... + // and v18.16B, v3.16B, v31.16B // ..................*............ + // mls v2.8H, v25.8H, v3.H[0] // .............*................. + // add v21.8H, v30.8H, v18.8H // ...................*........... + // ldr q5, [x0, #16] // ..............*................ + // sshr v18.8H, v2.8H, #15 // .................*............. + // and v27.16B, v3.16B, v18.16B // ....................*.......... + // add v16.8H, v2.8H, v27.8H // .....................*......... 
sub count, count, #1 -loop_start: +1: // Instructions: 32 // Expected cycles: 36 // Expected IPC: 0.89 @@ -130,77 +150,77 @@ loop_start: // -------- cycle (expected) ---------> // 0 25 // |------------------------|---------- - sqdmulh v0.8H, v22.8H, v2.H[1] // *................................... - str q6, [x0, #-16] // .*.................................. - ldr q31, [x0, #48] // ..e................................. - srshr v5.8H, v0.8H, #11 // ....*............................... - str q21, [x0, #-32] // .....*.............................. - sqdmulh v24.8H, v31.8H, v2.H[1] // ......e............................. - mls v22.8H, v5.8H, v2.H[0] // .......*............................ - ldr q20, [x0, #-48] // ........*........................... - srshr v6.8H, v24.8H, #11 // ..........e......................... - sshr v0.8H, v22.8H, #15 // ...........*........................ - sqdmulh v21.8H, v20.8H, v2.H[1] // ............*....................... - mls v31.8H, v6.8H, v2.H[0] // .............e...................... - ldr q30, [x0, #32] // ..............e..................... - srshr v23.8H, v21.8H, #11 // ................*................... - and v19.16B, v4.16B, v0.16B // .................*.................. - sshr v16.8H, v31.8H, #15 // ..................e................. - mls v20.8H, v23.8H, v2.H[0] // ...................*................ - and v28.16B, v4.16B, v16.16B // ....................e............... - sqdmulh v23.8H, v30.8H, v2.H[1] // .....................e.............. - add v6.8H, v31.8H, v28.8H // ......................e............. - add v31.8H, v22.8H, v19.8H // .......................*............ - sshr v17.8H, v20.8H, #15 // ........................*........... - srshr v18.8H, v23.8H, #11 // .........................e.......... - and v26.16B, v4.16B, v17.16B // ..........................*......... - add v7.8H, v20.8H, v26.8H // ...........................*........ - mls v30.8H, v18.8H, v2.H[0] // ............................e....... 
- ldr q22, [x0], #64 // .............................e...... - str q7, [x0, #-112] // ...............................*.... - sshr v0.8H, v30.8H, #15 // ................................e... - str q31, [x0, #-128] // .................................*.. - and v29.16B, v4.16B, v0.16B // ..................................e. - add v21.8H, v30.8H, v29.8H // ...................................e - - // ------------------------ cycle (expected) -------------------------> - // 0 25 50 - // |------------------------|------------------------|----------------- - // ldr q0, [x0], #64 // ...........................e......'............................~.... - // sqdmulh v1.8h, v0.8h, v2.h[1] // ..................................*................................. - // srshr v1.8h, v1.8h, #11 // ..~...............................'...*............................. - // mls v0.8h, v1.8h, v2.h[0] // .....~............................'......*.......................... - // sshr v3.8h, v0.8h, #15 // .........~........................'..........*...................... - // and v3.16b, v4.16b, v3.16b // ...............~..................'................*................ - // add v0.8h, v0.8h, v3.8h // .....................~............'......................*.......... - // str q0, [x0, #-64] // ...............................~..'................................* - // ldr q0, [x0, #-48] // ......~...........................'.......*......................... - // sqdmulh v1.8h, v0.8h, v2.h[1] // ..........~.......................'...........*..................... - // srshr v1.8h, v1.8h, #11 // ..............~...................'...............*................. - // mls v0.8h, v1.8h, v2.h[0] // .................~................'..................*.............. - // sshr v3.8h, v0.8h, #15 // ......................~...........'.......................*......... - // and v3.16b, v4.16b, v3.16b // ........................~.........'.........................*....... 
- // add v0.8h, v0.8h, v3.8h // .........................~........'..........................*...... - // str q0, [x0, #-48] // .............................~....'..............................*.. - // ldr q0, [x0, #-32] // ............e.....................'.............~................... - // sqdmulh v1.8h, v0.8h, v2.h[1] // ...................e..............'....................~............ - // srshr v1.8h, v1.8h, #11 // .......................e..........'........................~........ - // mls v0.8h, v1.8h, v2.h[0] // ..........................e.......'...........................~..... - // sshr v3.8h, v0.8h, #15 // ..............................e...'...............................~. - // and v3.16b, v4.16b, v3.16b // ................................e.'................................. - // add v0.8h, v0.8h, v3.8h // .................................e'................................. - // str q0, [x0, #-32] // ...~..............................'....*............................ - // ldr q0, [x0, #-16] // e.................................'.~............................... - // sqdmulh v1.8h, v0.8h, v2.h[1] // ....e.............................'.....~........................... - // srshr v1.8h, v1.8h, #11 // ........e.........................'.........~....................... - // mls v0.8h, v1.8h, v2.h[0] // ...........e......................'............~.................... - // sshr v3.8h, v0.8h, #15 // ................e.................'.................~............... - // and v3.16b, v4.16b, v3.16b // ..................e...............'...................~............. - // add v0.8h, v0.8h, v3.8h // ....................e.............'.....................~........... - // str q0, [x0, #-16] // ..................................'*................................ - - sub count, count, #1 - cbnz count, loop_start + ldr q6, [x0], #64 // *................................... + ldr q30, [x0, #32] // ..e................................. 
+ sqdmulh v31.8H, v6.8H, v4.H[0] // ....*............................... + sqdmulh v29.8H, v5.8H, v4.H[0] // .....*.............................. + sqdmulh v22.8H, v30.8H, v4.H[0] // ......e............................. + str q16, [x0, #-16] // .......*............................ + srshr v20.8H, v31.8H, #11 // ........*........................... + srshr v28.8H, v29.8H, #11 // .........*.......................... + str q21, [x0, #-32] // ..........*......................... + mls v6.8H, v20.8H, v3.H[0] // ...........*........................ + mls v5.8H, v28.8H, v3.H[0] // ............*....................... + ldr q2, [x0, #48] // .............e...................... + sshr v31.8H, v6.8H, #15 // ...............*.................... + srshr v19.8H, v22.8H, #11 // ................e................... + and v22.16B, v3.16B, v31.16B // .................*.................. + add v0.8H, v6.8H, v22.8H // ..................*................. + mls v30.8H, v19.8H, v3.H[0] // ...................e................ + sshr v26.8H, v5.8H, #15 // ....................*............... + sqdmulh v25.8H, v2.8H, v4.H[0] // .....................e.............. + and v17.16B, v3.16B, v26.16B // ......................*............. + add v1.8H, v5.8H, v17.8H // .......................*............ + sshr v31.8H, v30.8H, #15 // ........................e........... + srshr v25.8H, v25.8H, #11 // .........................e.......... + str q1, [x0, #-48] // ..........................*......... + and v18.16B, v3.16B, v31.16B // ...........................e........ + mls v2.8H, v25.8H, v3.H[0] // ............................e....... + add v21.8H, v30.8H, v18.8H // .............................e...... + ldr q5, [x0, #16] // ..............................e..... + sshr v18.8H, v2.8H, #15 // ................................e... + str q0, [x0, #-64] // .................................*.. + and v27.16B, v3.16B, v18.16B // ..................................e. 
+ add v16.8H, v2.8H, v27.8H // ...................................e + + // ------------------------ cycle (expected) -------------------------> + // 0 25 50 + // |------------------------|------------------------|----------------- + // ldr q0, [x0], #64 // ..................................*................................. + // sqdmulh v1.8h, v0.8h, v4.h[0] // ..~...............................'...*............................. + // srshr v1.8h, v1.8h, #11 // ......~...........................'.......*......................... + // mls v0.8h, v1.8h, v3.h[0] // .........~........................'..........*...................... + // sshr v2.8h, v0.8h, #15 // .............~....................'..............*.................. + // and v2.16b, v3.16b, v2.16b // ...............~..................'................*................ + // add v0.8h, v0.8h, v2.8h // ................~.................'.................*............... + // str q0, [x0, #-64] // ...............................~..'................................* + // ldr q0, [x0, #-48] // ............................e.....'.............................~... + // sqdmulh v1.8h, v0.8h, v4.h[0] // ...~..............................'....*............................ + // srshr v1.8h, v1.8h, #11 // .......~..........................'........*........................ + // mls v0.8h, v1.8h, v3.h[0] // ..........~.......................'...........*..................... + // sshr v2.8h, v0.8h, #15 // ..................~...............'...................*............. + // and v2.16b, v3.16b, v2.16b // ....................~.............'.....................*........... + // add v0.8h, v0.8h, v2.8h // .....................~............'......................*.......... + // str q0, [x0, #-48] // ........................~.........'.........................*....... + // ldr q0, [x0, #-32] // e.................................'.~............................... 
+ // sqdmulh v1.8h, v0.8h, v4.h[0] // ....e.............................'.....~........................... + // srshr v1.8h, v1.8h, #11 // ..............e...................'...............~................. + // mls v0.8h, v1.8h, v3.h[0] // .................e................'..................~.............. + // sshr v2.8h, v0.8h, #15 // ......................e...........'.......................~......... + // and v2.16b, v3.16b, v2.16b // .........................e........'..........................~...... + // add v0.8h, v0.8h, v2.8h // ...........................e......'............................~.... + // str q0, [x0, #-32] // ........~.........................'.........*....................... + // ldr q0, [x0, #-16] // ...........e......................'............~.................... + // sqdmulh v1.8h, v0.8h, v4.h[0] // ...................e..............'....................~............ + // srshr v1.8h, v1.8h, #11 // .......................e..........'........................~........ + // mls v0.8h, v1.8h, v3.h[0] // ..........................e.......'...........................~..... + // sshr v2.8h, v0.8h, #15 // ..............................e...'...............................~. + // and v2.16b, v3.16b, v2.16b // ................................e.'................................. + // add v0.8h, v0.8h, v2.8h // .................................e'................................. + // str q0, [x0, #-16] // .....~............................'......*.......................... + + sub count, count, 1 + cbnz count, 1b // Instructions: 17 // Expected cycles: 23 // Expected IPC: 0.74 @@ -214,94 +234,96 @@ loop_start: // ----- cycle (expected) ------> // 0 25 // |------------------------|---- - ldr q29, [x0, #-48] // *............................. - sqdmulh v1.8H, v22.8H, v2.H[1] // ..*........................... + sqdmulh v20.8H, v5.8H, v4.H[0] // *............................. + ldr q24, [x0], #64 // .*............................ 
str q21, [x0, #-32] // ...*.......................... - sqdmulh v25.8H, v29.8H, v2.H[1] // ....*......................... - str q6, [x0, #-16] // .....*........................ - srshr v30.8H, v1.8H, #11 // ......*....................... - srshr v20.8H, v25.8H, #11 // ........*..................... - mls v22.8H, v30.8H, v2.H[0] // .........*.................... - mls v29.8H, v20.8H, v2.H[0] // ...........*.................. - sshr v24.8H, v22.8H, #15 // .............*................ - and v31.16B, v4.16B, v24.16B // ...............*.............. - sshr v24.8H, v29.8H, #15 // ................*............. - add v3.8H, v22.8H, v31.8H // .................*............ - and v17.16B, v4.16B, v24.16B // ..................*........... - add v25.8H, v29.8H, v17.8H // ...................*.......... - str q3, [x0, #-64] // ....................*......... - str q25, [x0, #-48] // ......................*....... + srshr v20.8H, v20.8H, #11 // ....*......................... + sqdmulh v25.8H, v24.8H, v4.H[0] // .....*........................ + str q16, [x0, #-16] // ......*....................... + mls v5.8H, v20.8H, v3.H[0] // .......*...................... + srshr v20.8H, v25.8H, #11 // .........*.................... + sshr v2.8H, v5.8H, #15 // ...........*.................. + mls v24.8H, v20.8H, v3.H[0] // ............*................. + and v20.16B, v3.16B, v2.16B // .............*................ + add v31.8H, v5.8H, v20.8H // ..............*............... + sshr v20.8H, v24.8H, #15 // ................*............. + str q31, [x0, #-48] // .................*............ + and v31.16B, v3.16B, v20.16B // ..................*........... + add v24.8H, v24.8H, v31.8H // ...................*.......... + str q24, [x0, #-64] // ......................*....... - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // sqdmulh v0.8H, v22.8H, v2.H[1] // ..*............................ - // str q6, [x0, #-16] // .....*......................... 
- // srshr v5.8H, v0.8H, #11 // ......*........................ - // str q21, [x0, #-32] // ...*........................... - // mls v22.8H, v5.8H, v2.H[0] // .........*..................... - // ldr q20, [x0, #-48] // *.............................. - // sshr v0.8H, v22.8H, #15 // .............*................. - // sqdmulh v21.8H, v20.8H, v2.H[1] // ....*.......................... - // srshr v23.8H, v21.8H, #11 // ........*...................... - // and v19.16B, v4.16B, v0.16B // ...............*............... - // mls v20.8H, v23.8H, v2.H[0] // ...........*................... - // add v31.8H, v22.8H, v19.8H // .................*............. - // sshr v17.8H, v20.8H, #15 // ................*.............. - // and v26.16B, v4.16B, v17.16B // ..................*............ - // add v7.8H, v20.8H, v26.8H // ...................*........... - // str q7, [x0, #-48] // ......................*........ - // str q31, [x0, #-64] // ....................*.......... + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr q6, [x0], #64 // .*............................. + // sqdmulh v31.8H, v6.8H, v4.H[0] // .....*......................... + // sqdmulh v29.8H, v5.8H, v4.H[0] // *.............................. + // str q16, [x0, #-16] // ......*........................ + // srshr v20.8H, v31.8H, #11 // .........*..................... + // srshr v28.8H, v29.8H, #11 // ....*.......................... + // str q21, [x0, #-32] // ...*........................... + // mls v6.8H, v20.8H, v3.H[0] // ............*.................. + // mls v5.8H, v28.8H, v3.H[0] // .......*....................... + // sshr v31.8H, v6.8H, #15 // ................*.............. + // and v22.16B, v3.16B, v31.16B // ..................*............ + // add v0.8H, v6.8H, v22.8H // ...................*........... + // sshr v26.8H, v5.8H, #15 // ...........*................... + // and v17.16B, v3.16B, v26.16B // .............*................. 
+ // add v1.8H, v5.8H, v17.8H // ..............*................ + // str q1, [x0, #-48] // .................*............. + // str q0, [x0, #-64] // ......................*........ ret .unreq ptr .unreq count - .unreq xtmp - .unreq q_data + .unreq data - .unreq t0 - .unreq consts + .unreq q_data + + .unreq tmp .unreq mask - .unreq wmodulus .unreq modulus + .unreq q_modulus + .unreq modulus_twisted + .unreq q_modulus_twisted /******************************************** * poly_mulcache_compute() * ********************************************/ -.global MLKEM_NAMESPACE(poly_mulcache_compute_asm_opt) -.global _MLKEM_NAMESPACE(poly_mulcache_compute_asm_opt) - - cache_ptr .req x0 - data_ptr .req x1 - zeta_ptr .req x2 - zeta_twisted_ptr .req x3 - xtmp .req x4 - - count .req x5 - - data_odd .req v1 - - zeta .req v2 - zeta_twisted .req v3 - q_zeta .req q2 - q_zeta_twisted .req q3 - - q_tmp0 .req q4 - q_tmp1 .req q5 - tmp0 .req v4 - tmp1 .req v5 - consts .req v6 - dst .req v7 - q_dst .req q7 - -MLKEM_NAMESPACE(poly_mulcache_compute_asm_opt): -_MLKEM_NAMESPACE(poly_mulcache_compute_asm_opt): - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - - mov count, #(16) + +.global MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt) + + cache_ptr .req x0 + data_ptr .req x1 + zeta_ptr .req x2 + zeta_twisted_ptr .req x3 + count .req x4 + + data_odd .req v0 + zeta .req v1 + q_zeta .req q1 + zeta_twisted .req v2 + q_zeta_twisted .req q2 + + tmp0 .req v3 + q_tmp0 .req q3 + tmp1 .req v4 + q_tmp1 .req q4 + dst .req v5 + q_dst .req q5 + + modulus .req v6 + q_modulus .req q6 + modulus_twisted .req v7 + q_modulus_twisted .req q7 + +MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt): + ldr q_modulus, c_modulus + ldr q_modulus_twisted, c_modulus_twisted + + mov count, #16 // Instructions: 7 // Expected cycles: 12 // Expected IPC: 0.58 @@ -315,65 +337,65 @@ _MLKEM_NAMESPACE(poly_mulcache_compute_asm_opt): // ----- cycle (expected) ------> // 0 25 // |------------------------|---- - ldr q3, 
[x1, #16] // *............................. - ldr q1, [x1], #32 // ..*........................... - ldr q26, [x3], #16 // ....*......................... - uzp2 v30.8H, v1.8H, v3.8H // ......*....................... - ldr q21, [x2], #16 // .......*...................... - sqrdmulh v3.8H, v30.8H, v26.8H // .........*.................... - mul v26.8H, v30.8H, v21.8H // ...........*.................. - - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldr q30, [x1, #16] // *.............................. - // ldr q21, [x1], #32 // ..*............................ - // ldr q3, [x3], #16 // ....*.......................... - // uzp2 v24.8H, v21.8H, v30.8H // ......*........................ - // ldr q21, [x2], #16 // .......*....................... - // sqrdmulh v3.8H, v24.8H, v3.8H // .........*..................... - // mul v26.8H, v24.8H, v21.8H // ...........*................... + ldr q1, [x1, #16] // *............................. + ldr q27, [x1], #32 // ..*........................... + ldr q23, [x2], #16 // ....*......................... + uzp2 v27.8H, v27.8H, v1.8H // ......*....................... + ldr q1, [x3], #16 // .......*...................... + mul v2.8H, v27.8H, v23.8H // .........*.................... + sqrdmulh v27.8H, v27.8H, v1.8H // ...........*.................. + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr q29, [x1, #16] // *.............................. + // ldr q21, [x2], #16 // ....*.......................... + // ldr q27, [x1], #32 // ..*............................ + // ldr q7, [x3], #16 // .......*....................... + // uzp2 v28.8H, v27.8H, v29.8H // ......*........................ + // mul v2.8H, v28.8H, v21.8H // .........*..................... + // sqrdmulh v27.8H, v28.8H, v7.8H // ...........*................... 
sub count, count, #1 -mulcache_compute_loop_start: - // Instructions: 9 - // Expected cycles: 13 - // Expected IPC: 0.69 +1: + // Instructions: 9 + // Expected cycles: 13 + // Expected IPC: 0.69 - // Cycle bound: 13.0 - // IPC bound: 0.69 + // Cycle bound: 13.0 + // IPC bound: 0.69 - // Wall time: 0.08s - // User time: 0.08s + // Wall time: 0.09s + // User time: 0.09s - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldr q30, [x1, #16] // e............................. - ldr q21, [x1], #32 // ..e........................... - mls v26.8H, v3.8H, v6.H[0] // ....*......................... - ldr q3, [x3], #16 // .....e........................ - uzp2 v24.8H, v21.8H, v30.8H // .......e...................... - ldr q21, [x2], #16 // ........e..................... - str q26, [x0], #16 // ..........*................... - sqrdmulh v3.8H, v24.8H, v3.8H // ...........e.................. - mul v26.8H, v24.8H, v21.8H // ............e................. + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr q29, [x1, #16] // e............................. + ldr q21, [x2], #16 // ..e........................... + mls v2.8H, v27.8H, v6.H[0] // ....*......................... + ldr q27, [x1], #32 // .....e........................ + ldr q7, [x3], #16 // .......e...................... + uzp2 v28.8H, v27.8H, v29.8H // .........e.................... + str q2, [x0], #16 // ..........*................... + mul v2.8H, v28.8H, v21.8H // ...........e.................. + sqrdmulh v27.8H, v28.8H, v7.8H // ............e................. // ------ cycle (expected) ------> // 0 25 // |------------------------|----- - // ldr q4, [x1], #32 // ..e..........'.~..........'.~.. - // ldr q5, [x1, #-16] // e............~............~.... - // ldr q2, [x2], #16 // ........e....'.......~....'.... - // ldr q3, [x3], #16 // .....e.......'....~.......'.... - // uzp2 v1.8h, v4.8h, v5.8h // .......e.....'......~.....'.... 
- // sqrdmulh v4.8h, v1.8h, v3.8h // ...........e.'..........~.'.... - // mul v7.8h, v1.8h, v2.8h // ............e'...........~'.... - // mls v7.8h, v4.8h, v6.h[0] // ....~........'...*........'.... - // str q7, [x0], #16 // ..........~..'.........*..'.... - - sub count, count, #1 - cbnz count, mulcache_compute_loop_start + // ldr q3, [x1], #32 // .....e.......'....~.......'.... + // ldr q4, [x1, #-16] // e............~............~.... + // ldr q1, [x2], #16 // ..e..........'.~..........'.~.. + // ldr q2, [x3], #16 // .......e.....'......~.....'.... + // uzp2 v0.8h, v3.8h, v4.8h // .........e...'........~...'.... + // sqrdmulh v3.8h, v0.8h, v2.8h // ............e'...........~'.... + // mul v5.8h, v0.8h, v1.8h // ...........e.'..........~.'.... + // mls v5.8h, v3.8h, v6.h[0] // ....~........'...*........'.... + // str q5, [x0], #16 // ..........~..'.........*..'.... + + sub count, count, 1 + cbnz count, 1b // Instructions: 2 // Expected cycles: 5 // Expected IPC: 0.40 @@ -387,14 +409,14 @@ mulcache_compute_loop_start: // ----- cycle (expected) ------> // 0 25 // |------------------------|---- - mls v26.8H, v3.8H, v6.H[0] // *............................. - str q26, [x0], #16 // ....*......................... + mls v2.8H, v27.8H, v6.H[0] // *............................. + str q2, [x0], #16 // ....*......................... // ------ cycle (expected) ------> // 0 25 // |------------------------|----- - // mls v26.8H, v3.8H, v6.H[0] // *.............................. - // str q26, [x0], #16 // ....*.......................... + // mls v2.8H, v27.8H, v6.H[0] // *.............................. + // str q2, [x0], #16 // ....*.......................... 
ret @@ -402,7 +424,7 @@ mulcache_compute_loop_start: .unreq cache_ptr .unreq data_ptr .unreq zeta_ptr - .unreq xtmp + .unreq zeta_twisted_ptr .unreq count .unreq data_odd @@ -411,34 +433,35 @@ mulcache_compute_loop_start: .unreq zeta_twisted .unreq q_zeta_twisted - .unreq q_tmp0 - .unreq q_tmp1 .unreq tmp0 + .unreq q_tmp0 .unreq tmp1 - .unreq consts + .unreq q_tmp1 .unreq dst + .unreq q_dst + + .unreq modulus + .unreq q_modulus + .unreq modulus_twisted + .unreq q_modulus_twisted /******************************************** * poly_tobytes() * ********************************************/ -.global MLKEM_NAMESPACE(poly_tobytes_asm_opt) -.global _MLKEM_NAMESPACE(poly_tobytes_asm_opt) +.global MLKEM_ASM_NAMESPACE(poly_tobytes_asm_opt) data0 .req v0 data1 .req v1 - out0 .req v2 out1 .req v3 out2 .req v4 - - tmp .req v5 + tmp .req v5 dst .req x0 src .req x1 count .req x2 -MLKEM_NAMESPACE(poly_tobytes_asm_opt): -_MLKEM_NAMESPACE(poly_tobytes_asm_opt): +MLKEM_ASM_NAMESPACE(poly_tobytes_asm_opt): mov count, #16 poly_tobytes_asm_opt_asm_loop_start: @@ -475,69 +498,65 @@ poly_tobytes_asm_opt_asm_loop_start: /********************************** * poly_tomont() * **********************************/ -.global MLKEM_NAMESPACE(poly_tomont_asm_opt) -.global _MLKEM_NAMESPACE(poly_tomont_asm_opt) - - src .req x0 - count .req x1 - xtmp .req x2 - - wtmp2 .req w3 +.global MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt) - data .req v0 - q_data .req q0 - res .req v1 - q_res .req q1 + src .req x0 + count .req x1 - consts .req v2 + data .req v0 + q_data .req q0 + res .req v1 + q_res .req q1 - factor .req v3 - factor_t .req v4 + factor .req v2 + q_factor .req q2 + factor_t .req v3 + q_factor_t .req q3 + modulus .req v4 + q_modulus .req q4 + modulus_twisted .req v5 + q_modulus_twisted .req q5 - tmp0 .req v5 + tmp0 .req v6 -MLKEM_NAMESPACE(poly_tomont_asm_opt): -_MLKEM_NAMESPACE(poly_tomont_asm_opt): +MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt): - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - - 
ldrsh wtmp2, [xtmp, #(2*2)] - dup factor.8h, wtmp2 - ldrh wtmp2, [xtmp, #(3*2)] - dup factor_t.8h, wtmp2 + ldr q_modulus, c_modulus + ldr q_modulus_twisted, c_modulus_twisted + ldr q_factor, c_mont_constant + ldr q_factor_t, c_barrett_twist mov count, #8 - // Instructions: 5 - // Expected cycles: 10 - // Expected IPC: 0.50 - // - // Cycle bound: 10.0 - // IPC bound: 0.50 - // - // Wall time: 0.01s - // User time: 0.01s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldr q6, [x0, #16] // *............................. - ldr q20, [x0, #32] // ..*........................... - mul v21.8H, v6.8H, v3.8H // ....*......................... - sqrdmulh v6.8H, v6.8H, v4.8H // .....*........................ - mls v21.8H, v6.8H, v2.H[0] // .........*.................... - - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldr q16, [x0, #16] // *.............................. - // mul v21.8H, v16.8H, v3.8H // ....*.......................... - // sqrdmulh v0.8H, v16.8H, v4.8H // .....*......................... - // mls v21.8H, v0.8H, v2.H[0] // .........*..................... - // ldr q20, [x0, #32] // ..*............................ + // Instructions: 5 + // Expected cycles: 7 + // Expected IPC: 0.71 + // + // Cycle bound: 7.0 + // IPC bound: 0.71 + // + // Wall time: 0.01s + // User time: 0.01s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr q26, [x0, #48] // *............................. + ldr q23, [x0, #16] // ..*........................... + mul v17.8H, v26.8H, v2.8H // ....*......................... + sqrdmulh v7.8H, v26.8H, v3.8H // .....*........................ + ldr q27, [x0, #32] // ......*....................... + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr q7, [x0, #48] // *.............................. + // ldr q23, [x0, #16] // ..*............................ 
+ // mul v17.8H, v7.8H, v2.8H // ....*.......................... + // sqrdmulh v7.8H, v7.8H, v3.8H // .....*......................... + // ldr q27, [x0, #32] // ......*........................ sub count, count, #1 -poly_tomont_asm_loop: +1: // Instructions: 20 // Expected cycles: 24 // Expected IPC: 0.83 @@ -545,108 +564,127 @@ poly_tomont_asm_loop: // Cycle bound: 24.0 // IPC bound: 0.83 // - // Wall time: 0.58s - // User time: 0.58s + // Wall time: 0.73s + // User time: 0.73s // // ----- cycle (expected) ------> // 0 25 // |------------------------|---- - ldr q18, [x0], #64 // *............................. - str q21, [x0, #-48] // ..*........................... - ldr q16, [x0, #16] // ...e.......................... - sqrdmulh v27.8H, v20.8H, v4.8H // .....*........................ - mul v1.8H, v20.8H, v3.8H // ......*....................... - mul v21.8H, v16.8H, v3.8H // .......e...................... - sqrdmulh v0.8H, v16.8H, v4.8H // ........e..................... - ldr q30, [x0, #-16] // .........*.................... - sqrdmulh v6.8H, v18.8H, v4.8H // ...........*.................. - mul v18.8H, v18.8H, v3.8H // ............*................. - mls v1.8H, v27.8H, v2.H[0] // .............*................ - mul v5.8H, v30.8H, v3.8H // ..............*............... - sqrdmulh v22.8H, v30.8H, v4.8H // ...............*.............. - mls v18.8H, v6.8H, v2.H[0] // ................*............. - str q1, [x0, #-32] // .................*............ - mls v21.8H, v0.8H, v2.H[0] // ..................e........... - mls v5.8H, v22.8H, v2.H[0] // ...................*.......... - str q18, [x0, #-64] // ....................*......... - ldr q20, [x0, #32] // .....................e........ - str q5, [x0, #-16] // .......................*...... - - // ------------- cycle (expected) -------------> + mls v17.8H, v7.8H, v4.H[0] // *............................. + sqrdmulh v5.8H, v23.8H, v3.8H // .*............................ 
+ ldr q7, [x0], #64 // ..*........................... + str q17, [x0, #-16] // ....*......................... + sqrdmulh v29.8H, v27.8H, v3.8H // .....*........................ + sqrdmulh v19.8H, v7.8H, v3.8H // ......*....................... + mul v25.8H, v23.8H, v2.8H // .......*...................... + mul v0.8H, v7.8H, v2.8H // ........*..................... + mul v26.8H, v27.8H, v2.8H // .........*.................... + ldr q7, [x0, #48] // ..........e................... + mls v25.8H, v5.8H, v4.H[0] // ............*................. + ldr q23, [x0, #16] // .............e................ + mls v26.8H, v29.8H, v4.H[0] // ...............*.............. + mls v0.8H, v19.8H, v4.H[0] // ................*............. + str q25, [x0, #-48] // .................*............ + mul v17.8H, v7.8H, v2.8H // ..................e........... + sqrdmulh v7.8H, v7.8H, v3.8H // ...................e.......... + str q0, [x0, #-64] // ....................*......... + ldr q27, [x0, #32] // .....................e........ + str q26, [x0, #-32] // .......................*...... + + // --------- cycle (expected) ----------> // 0 25 - // |------------------------|------------------- - // ldr q0, [x0], #64 // .....................*....................... - // sqrdmulh v5.8h, v0.8h, v4.8h // ........~............'..........*............ - // mul v1.8h, v0.8h, v3.8h // .........~...........'...........*........... - // mls v1.8h, v5.8h, v2.h[0] // .............~.......'...............*....... - // str q1, [x0, #-64] // .................~...'...................*... - // ldr q0, [x0, #-48] // e....................'..~.................... - // sqrdmulh v5.8h, v0.8h, v4.8h // .....e...............'.......~............... - // mul v1.8h, v0.8h, v3.8h // ....e................'......~................ - // mls v1.8h, v5.8h, v2.h[0] // ...............e.....'.................~..... - // str q1, [x0, #-48] // .....................'.*..................... 
- // ldr q0, [x0, #-32] // ..................e..'....................~.. - // sqrdmulh v5.8h, v0.8h, v4.8h // ..~..................'....*.................. - // mul v1.8h, v0.8h, v3.8h // ...~.................'.....*................. - // mls v1.8h, v5.8h, v2.h[0] // ..........~..........'............*.......... - // str q1, [x0, #-32] // ..............~......'................*...... - // ldr q0, [x0, #-16] // ......~..............'........*.............. - // sqrdmulh v5.8h, v0.8h, v4.8h // ............~........'..............*........ - // mul v1.8h, v0.8h, v3.8h // ...........~.........'.............*......... - // mls v1.8h, v5.8h, v2.h[0] // ................~....'..................*.... - // str q1, [x0, #-16] // ....................~'......................* - - sub count, count, #1 - cbnz count, poly_tomont_asm_loop + // |------------------------|------------ + // ldr q0, [x0], #64 // ..............'.*..................... + // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'.....*................. + // mul v1.8h, v0.8h, v2.8h // ..............'.......*............... + // mls v1.8h, v6.8h, v4.h[0] // ......~.......'...............*....... + // str q1, [x0, #-64] // ..........~...'...................*... + // ldr q0, [x0, #-48] // ...e..........'............~.......... + // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'*...................... + // mul v1.8h, v0.8h, v2.8h // ..............'......*................ + // mls v1.8h, v6.8h, v4.h[0] // ..~...........'...........*........... + // str q1, [x0, #-48] // .......~......'................*...... + // ldr q0, [x0, #-32] // ...........e..'....................~.. + // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'....*.................. + // mul v1.8h, v0.8h, v2.8h // ..............'........*.............. + // mls v1.8h, v6.8h, v4.h[0] // .....~........'..............*........ + // str q1, [x0, #-32] // .............~'......................* + // ldr q0, [x0, #-16] // e.............'.........~............. 
+ // sqrdmulh v6.8h, v0.8h, v3.8h // .........e....'..................~.... + // mul v1.8h, v0.8h, v2.8h // ........e.....'.................~..... + // mls v1.8h, v6.8h, v4.h[0] // ..............*....................... + // str q1, [x0, #-16] // ..............'...*................... + + sub count, count, 1 + cbnz count, 1b // Instructions: 15 - // Expected cycles: 19 - // Expected IPC: 0.79 + // Expected cycles: 18 + // Expected IPC: 0.83 // - // Cycle bound: 19.0 - // IPC bound: 0.79 + // Cycle bound: 18.0 + // IPC bound: 0.83 // - // Wall time: 0.04s - // User time: 0.04s + // Wall time: 0.07s + // User time: 0.07s // // ----- cycle (expected) ------> // 0 25 // |------------------------|---- - mul v31.8H, v20.8H, v3.8H // *............................. - sqrdmulh v25.8H, v20.8H, v4.8H // .*............................ - ldr q27, [x0], #64 // ..*........................... - ldr q23, [x0, #-16] // ....*......................... - mls v31.8H, v25.8H, v2.H[0] // ......*....................... - sqrdmulh v30.8H, v27.8H, v4.8H // .......*...................... - mul v6.8H, v27.8H, v3.8H // ........*..................... - mul v20.8H, v23.8H, v3.8H // .........*.................... - sqrdmulh v26.8H, v23.8H, v4.8H // ..........*................... - str q31, [x0, #-32] // ...........*.................. - mls v6.8H, v30.8H, v2.H[0] // ............*................. - str q21, [x0, #-48] // .............*................ - mls v20.8H, v26.8H, v2.H[0] // ..............*............... - str q6, [x0, #-64] // ................*............. - str q20, [x0, #-16] // ..................*........... + mls v17.8H, v7.8H, v4.H[0] // *............................. + sqrdmulh v7.8H, v23.8H, v3.8H // .*............................ + mul v26.8H, v23.8H, v2.8H // ..*........................... + sqrdmulh v25.8H, v27.8H, v3.8H // ...*.......................... + ldr q23, [x0], #64 // ....*......................... + mul v27.8H, v27.8H, v2.8H // ......*....................... 
+ mls v26.8H, v7.8H, v4.H[0] // .......*...................... + sqrdmulh v7.8H, v23.8H, v3.8H // ........*..................... + mul v23.8H, v23.8H, v2.8H // .........*.................... + str q17, [x0, #-16] // ..........*................... + mls v27.8H, v25.8H, v4.H[0] // ...........*.................. + str q26, [x0, #-48] // ............*................. + mls v23.8H, v7.8H, v4.H[0] // .............*................ + str q27, [x0, #-32] // ...............*.............. + str q23, [x0, #-64] // .................*............ // ------ cycle (expected) ------> // 0 25 // |------------------------|----- - // ldr q18, [x0], #64 // ..*............................ - // str q21, [x0, #-48] // .............*................. - // sqrdmulh v27.8H, v20.8H, v4.8H // .*............................. - // mul v1.8H, v20.8H, v3.8H // *.............................. - // ldr q30, [x0, #-16] // ....*.......................... - // sqrdmulh v6.8H, v18.8H, v4.8H // .......*....................... - // mul v18.8H, v18.8H, v3.8H // ........*...................... - // mls v1.8H, v27.8H, v2.H[0] // ......*........................ - // mul v5.8H, v30.8H, v3.8H // .........*..................... - // sqrdmulh v22.8H, v30.8H, v4.8H // ..........*.................... - // mls v18.8H, v6.8H, v2.H[0] // ............*.................. - // str q1, [x0, #-32] // ...........*................... - // mls v5.8H, v22.8H, v2.H[0] // ..............*................ - // str q18, [x0, #-64] // ................*.............. - // str q5, [x0, #-16] // ..................*............ + // mls v17.8H, v7.8H, v4.H[0] // *.............................. + // sqrdmulh v5.8H, v23.8H, v3.8H // .*............................. + // ldr q7, [x0], #64 // ....*.......................... + // str q17, [x0, #-16] // ..........*.................... + // sqrdmulh v29.8H, v27.8H, v3.8H // ...*........................... + // sqrdmulh v19.8H, v7.8H, v3.8H // ........*...................... 
+ // mul v25.8H, v23.8H, v2.8H // ..*............................ + // mul v0.8H, v7.8H, v2.8H // .........*..................... + // mul v26.8H, v27.8H, v2.8H // ......*........................ + // mls v25.8H, v5.8H, v4.H[0] // .......*....................... + // mls v26.8H, v29.8H, v4.H[0] // ...........*................... + // mls v0.8H, v19.8H, v4.H[0] // .............*................. + // str q25, [x0, #-48] // ............*.................. + // str q0, [x0, #-64] // .................*............. + // str q26, [x0, #-32] // ...............*............... ret + .unreq src + .unreq count + + .unreq data + .unreq q_data + .unreq res + .unreq q_res + + .unreq factor + .unreq q_factor + .unreq factor_t + .unreq q_factor_t + .unreq modulus + .unreq q_modulus + .unreq modulus_twisted + .unreq q_modulus_twisted + + .unreq tmp0 + #endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/polyvec_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/polyvec_clean.S index bfd1d2b8a..99fb05de5 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/polyvec_clean.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/polyvec_clean.S @@ -12,10 +12,30 @@ #include "common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) -#include "params.h" +/* We use a single literal pool for all functions in this file. + * This is OK even when the file gets expanded through SLOTHY, + * since PC-relative offets are up to 1MB in AArch64. + * + * The use of dup8h to build constant vectors in memory + * is slightly wasteful and could be avoided with a GPR-load + * followed by Neon `dup`, but we're ultimately only talking + * about 64 bytes, so it seems OK. 
+ */ + +.macro dup8h c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c +.endm -// Needed to provide ASM_LOAD directive -#include "common.i" +.p2align 4 +c_modulus: dup8h 3329 // ML-KEM modulus +c_modulus_twisted: dup8h 3327 // Input: // - Vectors al, ah of 32-bit entries @@ -23,9 +43,9 @@ // - Montgomery reductions of al || ah, stored in al .macro montgomery_reduce_long x, a uzp1 t0.8h, \a\()l.8h, \a\()h.8h - mul t0.8h, t0.8h, consts.h[2] - smlal \a\()l.4s, t0.4h, constN.4h - smlal2 \a\()h.4s, t0.8h, constN.8h + mul t0.8h, t0.8h, modulus_twisted.8h + smlal \a\()l.4s, t0.4h, modulus.4h + smlal2 \a\()h.4s, t0.8h, modulus.8h uzp2 \x\().8h, \a\()l.8h, \a\()h.8h .endm @@ -58,10 +78,24 @@ smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h .endm +.macro ld2_wrap a, ptr + ldr q_tmp0, [\ptr\()], #32 + ldr q_tmp1, [\ptr\(), #-16] + uzp1 \a\()0.8h, tmp0.8h, tmp1.8h + uzp2 \a\()1.8h, tmp0.8h, tmp1.8h +.endm + +.macro st2_wrap a, ptr + zip1 tmp0.8h, \a\()0.8h, \a\()1.8h + zip2 tmp1.8h, \a\()0.8h, \a\()1.8h + str q_tmp0, [\ptr\()], #32 + str q_tmp1, [\ptr\(), #-16] +.endm + .macro load_polys a, b, a_ptr, b_ptr, b_cache_ptr - ld2 {\a\()0.8h, \a\()1.8h}, [\a_ptr], #32 - ld2 {\b\()0.8h, \b\()1.8h}, [\b_ptr], #32 - ld1 {\b\()1t.8h}, [\b_cache_ptr], #16 + ld2_wrap \a\(), \a_ptr + ld2_wrap \b\(), \b_ptr + ld1 {\b\()1t.8h}, [\b_cache_ptr], #16 .endm .macro save_vregs @@ -101,14 +135,12 @@ a3_ptr .req x10 b3_ptr .req x11 b3_cache_ptr .req x12 - count .req x13 - xtmp .req x14 - modulus .req w15 - constN .req v0 - constX .req v1 - consts .req v2 + modulus .req v0 + q_modulus .req q0 + modulus_twisted .req v2 + q_modulus_twisted .req q2 aa0 .req v3 aa1 .req v4 @@ -131,29 +163,13 @@ t0 .req v28 -.p2align 4 -const_addr: - .short 3329 - .short 20159 - .short 3327 - .short 0 - .short 0 - .short 0 - .short 0 - .short 0 - #if MLKEM_K == 2 -.global MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean) -.global 
_MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean) +.global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean) -MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean): -_MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean): +MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean): push_stack - - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - ldrsh modulus, [xtmp] - dup constN.8h, modulus + ldr q_modulus, c_modulus + ldr q_modulus_twisted, c_modulus_twisted // Computed bases of vector entries @@ -172,7 +188,7 @@ k2_loop_start: montgomery_reduce_long out0, res0 montgomery_reduce_long out1, res1 - st2 {out0.8h, out1.8h}, [out], #32 + st2_wrap out, out subs count, count, #1 cbnz count, k2_loop_start @@ -182,17 +198,12 @@ k2_loop_start: #endif /* MLKEM_K == 2 */ #if MLKEM_K == 3 -.global MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean) -.global _MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean) +.global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean) -MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean): -_MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean): +MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean): push_stack - - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - ldrsh modulus, [xtmp] - dup constN.8h, modulus + ldr q_modulus, c_modulus + ldr q_modulus_twisted, c_modulus_twisted // Computed bases of vector entries @@ -216,7 +227,7 @@ k3_loop_start: montgomery_reduce_long out0, res0 montgomery_reduce_long out1, res1 - st2 {out0.8h, out1.8h}, [out], #32 + st2_wrap out, out subs count, count, #1 cbnz count, k3_loop_start @@ -226,17 +237,12 @@ k3_loop_start: #endif /* MLKEM_K == 3 */ #if MLKEM_K == 4 -.global MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean) -.global _MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean) +.global 
MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean) -MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean): -_MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean): +MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean): push_stack - - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - ldrsh modulus, [xtmp] - dup constN.8h, modulus + ldr q_modulus, c_modulus + ldr q_modulus_twisted, c_modulus_twisted // Computed bases of vector entries @@ -270,7 +276,7 @@ k4_loop_start: montgomery_reduce_long out0, res0 montgomery_reduce_long out1, res1 - st2 {out0.8h, out1.8h}, [out], #32 + st2_wrap out, out subs count, count, #1 cbnz count, k4_loop_start diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/polyvec_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/polyvec_opt.S index 07dc98efd..16ed77c3f 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/polyvec_opt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/polyvec_opt.S @@ -11,10 +11,31 @@ #include "common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) -#include "params.h" -// Needed to provide ASM_LOAD directive -#include "common.i" +/* We use a single literal pool for all functions in this file. + * This is OK even when the file gets expanded through SLOTHY, + * since PC-relative offsets are up to 1MB in AArch64. + * + * The use of dup8h to build constant vectors in memory + * is slightly wasteful and could be avoided with a GPR-load + * followed by Neon `dup`, but we're ultimately only talking + * about 64 bytes, so it seems OK. 
+ */ + +.macro dup8h c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c +.endm + +.p2align 4 +c_modulus: dup8h 3329 // ML-KEM modulus +c_modulus_twisted: dup8h 3327 // Input: // - Vectors al, ah of 32-bit entries @@ -22,13 +43,17 @@ // - Montgomery reductions of al || ah, stored in al .macro montgomery_reduce_long x, a uzp1 t0.8h, \a\()l.8h, \a\()h.8h - mul t0.8h, t0.8h, consts.h[2] - smlal \a\()l.4s, t0.4h, constN.4h - smlal2 \a\()h.4s, t0.8h, constN.8h + mul t0.8h, t0.8h, modulus_twisted.8h + smlal \a\()l.4s, t0.4h, modulus.4h + smlal2 \a\()h.4s, t0.8h, modulus.8h uzp2 \x\().8h, \a\()l.8h, \a\()h.8h .endm // Computes products (a0*b0 + a0*b0t, a0*b1 + a1*b0) in 32-bit. + +// Bounds: +// - Assume |a| < 4096, +// - Result: < 2*4096*2^15 = 2^28 .macro pmull d, a, b smull \d\()0l.4s, \a\()0.4h, \b\()0.4h smull2 \d\()0h.4s, \a\()0.8h, \b\()0.8h @@ -53,10 +78,24 @@ smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h .endm +.macro ld2_wrap a, ptr + ldr q_tmp0, [\ptr\()], #32 + ldr q_tmp1, [\ptr\(), #-16] + uzp1 \a\()0.8h, tmp0.8h, tmp1.8h + uzp2 \a\()1.8h, tmp0.8h, tmp1.8h +.endm + +.macro st2_wrap a, ptr + zip1 tmp0.8h, \a\()0.8h, \a\()1.8h + zip2 tmp1.8h, \a\()0.8h, \a\()1.8h + str q_tmp0, [\ptr\()], #32 + str q_tmp1, [\ptr\(), #-16] +.endm + .macro load_polys a, b, a_ptr, b_ptr, b_cache_ptr - ld2 {\a\()0.8h, \a\()1.8h}, [\a_ptr], #32 - ld2 {\b\()0.8h, \b\()1.8h}, [\b_ptr], #32 - ld1 {\b\()1t.8h}, [\b_cache_ptr], #16 + ld2_wrap \a\(), \a_ptr + ld2_wrap \b\(), \b_ptr + ld1 {\b\()1t.8h}, [\b_cache_ptr], #16 .endm .macro save_vregs @@ -96,14 +135,12 @@ a3_ptr .req x10 b3_ptr .req x11 b3_cache_ptr .req x12 - count .req x13 - xtmp .req x14 - modulus .req w15 - constN .req v0 - constX .req v1 - consts .req v2 + modulus .req v0 + q_modulus .req q0 + modulus_twisted .req v2 + q_modulus_twisted .req q2 aa0 .req v3 aa1 .req v4 @@ -126,29 +163,13 @@ t0 .req v28 -.p2align 4 -const_addr: - .short 3329 - .short 20159 - .short 3327 - .short 0 - 
.short 0 - .short 0 - .short 0 - .short 0 - #if MLKEM_K == 2 -.global MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt) -.global _MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt) +.global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt) -MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): -_MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): +MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): push_stack - - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - ldrsh modulus, [xtmp] - dup constN.8h, modulus + ldr q_modulus, c_modulus + ldr q_modulus_twisted, c_modulus_twisted // Computed bases of vector entries @@ -157,195 +178,351 @@ _MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2) mov count, #(MLKEM_N / 16) - // Instructions: 6 - // Expected cycles: 11 - // Expected IPC: 0.55 - - // Cycle bound: 11.0 - // IPC bound: 0.55 - - // Wall time: 0.01s - // User time: 0.01s - - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ld2 {v6.8H, v7.8H}, [x1], #32 // *............................. - ld2 {v4.8H, v5.8H}, [x2], #32 // ..*........................... - ld1 {v12.8H}, [x3], #16 // ....*......................... - ld2 {v15.8H, v16.8H}, [x4], #32 // ......*....................... - ld2 {v29.8H, v30.8H}, [x5], #32 // ........*..................... - ld1 {v31.8H}, [x6], #16 // ..........*................... - - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ld2 {v6.8H, v7.8H}, [x1], #32 // *.............................. - // ld2 {v4.8H, v5.8H}, [x2], #32 // ..*............................ - // ld1 {v12.8H}, [x3], #16 // ....*.......................... - // ld2 {v15.8H, v16.8H}, [x4], #32 // ......*........................ - // ld2 {v29.8H, v30.8H}, [x5], #32 // ........*...................... - // ld1 {v31.8H}, [x6], #16 // ..........*.................... 
+ // Instructions: 75 + // Expected cycles: 94 + // Expected IPC: 0.80 + + // Cycle bound: 94.0 + // IPC bound: 0.80 + + // Wall time: 1.49s + // User time: 1.49s + + // --------------------------- original position ----------------------------> + // 0 25 50 + // |------------------------|------------------------| + ldr q9, [x4], #32 // *.......................................................................... + ldr q5, [x4, #-16] // ......*.................................................................... + ldr q11, [x5], #32 // .*......................................................................... + uzp1 v23.8H, v9.8H, v5.8H // .........*................................................................. + uzp2 v9.8H, v9.8H, v5.8H // .....................*..................................................... + ldr q5, [x2], #32 // ..*........................................................................ + ldr q7, [x5, #-16] // ..............*............................................................ + ldr q21, [x2, #-16] // ...*....................................................................... + uzp2 v10.8H, v11.8H, v7.8H // .................*......................................................... + uzp1 v11.8H, v11.8H, v7.8H // ..................*........................................................ + uzp1 v7.8H, v5.8H, v21.8H // ....*...................................................................... + uzp2 v5.8H, v5.8H, v21.8H // .....*..................................................................... + ldr q21, [x1], #32 // .......*................................................................... + ldr q25, [x1, #-16] // ........*.................................................................. + ld1 {v6.8H}, [x3], #16 // ............................*.............................................. + uzp1 v26.8H, v21.8H, v25.8H // ..........*................................................................ 
+ uzp2 v21.8H, v21.8H, v25.8H // ...........*............................................................... + smull v25.4S, v26.4H, v5.4H // ............*.............................................................. + smull2 v5.4S, v26.8H, v5.8H // .............*............................................................. + smull v19.4S, v26.4H, v7.4H // ..........................*................................................ + smull2 v26.4S, v26.8H, v7.8H // ..............................*............................................ + smlal v25.4S, v21.4H, v7.4H // ...............*........................................................... + smlal2 v5.4S, v21.8H, v7.8H // ................*.......................................................... + smlal v19.4S, v21.4H, v6.4H // ...................................*....................................... + smlal2 v26.4S, v21.8H, v6.8H // .................................*......................................... + smlal v25.4S, v23.4H, v10.4H // ...................*....................................................... + smlal2 v5.4S, v23.8H, v10.8H // ....................*...................................................... + smlal v19.4S, v23.4H, v11.4H // ......................................*.................................... + smlal2 v26.4S, v23.8H, v11.8H // ....................................*...................................... + ld1 {v23.8H}, [x6], #16 // ........................*.................................................. + smlal v25.4S, v9.4H, v11.4H // ......................*.................................................... + smlal2 v5.4S, v9.8H, v11.8H // .......................*................................................... + smlal2 v26.4S, v9.8H, v23.8H // .......................................*................................... + smlal v19.4S, v9.4H, v23.4H // .........................................*................................. 
+ ldr q9, [x4], #32 // ...............................*........................................... + uzp1 v11.8H, v25.8H, v5.8H // .........................*................................................. + uzp1 v23.8H, v19.8H, v26.8H // .............................................*............................. + mul v11.8H, v11.8H, v2.8H // ...........................*............................................... + mul v23.8H, v23.8H, v2.8H // ..............................................*............................ + ldr q7, [x5], #32 // ................................*.......................................... + smlal2 v5.4S, v11.8H, v0.8H // .............................*............................................. + smlal v25.4S, v11.4H, v0.4H // ..................................*........................................ + ldr q11, [x2], #32 // .....................................*..................................... + ldr q21, [x2, #-16] // ........................................*.................................. + ldr q6, [x4, #-16] // ...............................................*........................... + uzp1 v17.8H, v11.8H, v21.8H // ...........................................*............................... + ldr q10, [x1], #32 // ................................................*.......................... + ldr q29, [x1, #-16] // .................................................*......................... + uzp2 v11.8H, v11.8H, v21.8H // ............................................*.............................. + uzp1 v13.8H, v9.8H, v6.8H // ...................................................*....................... + uzp1 v3.8H, v10.8H, v29.8H // ....................................................*...................... + uzp2 v10.8H, v10.8H, v29.8H // .....................................................*..................... + smull v12.4S, v3.4H, v11.4H // ......................................................*.................... 
+ smull2 v11.4S, v3.8H, v11.8H // .......................................................*................... + ldr q21, [x5, #-16] // ........................................................*.................. + smlal v12.4S, v10.4H, v17.4H // .........................................................*................. + smlal2 v11.4S, v10.8H, v17.8H // ..........................................................*................ + uzp2 v29.8H, v7.8H, v21.8H // ...........................................................*............... + uzp1 v15.8H, v7.8H, v21.8H // ............................................................*.............. + smlal v12.4S, v13.4H, v29.4H // .............................................................*............. + smlal2 v11.4S, v13.8H, v29.8H // ..............................................................*............ + uzp2 v28.8H, v9.8H, v6.8H // ...............................................................*........... + smlal2 v26.4S, v23.8H, v0.8H // ..................................................*........................ + smlal v12.4S, v28.4H, v15.4H // .................................................................*......... + smlal2 v11.4S, v28.8H, v15.8H // ..................................................................*........ + smlal v19.4S, v23.4H, v0.4H // ................................................................*.......... + uzp2 v27.8H, v25.8H, v5.8H // ..........................................*................................ + smull v23.4S, v3.4H, v17.4H // ......................................................................*.... + uzp1 v9.8H, v12.8H, v11.8H // .....................................................................*..... + uzp2 v19.8H, v19.8H, v26.8H // ....................................................................*...... + mul v14.8H, v9.8H, v2.8H // .......................................................................*... 
+ ld1 {v22.8H}, [x6], #16 // ...................................................................*....... + zip2 v9.8H, v19.8H, v27.8H // ........................................................................*.. + smlal2 v11.4S, v14.8H, v0.8H // ..........................................................................* + ld1 {v4.8H}, [x3], #16 // .........................................................................*. + + // ------------------------------ new position ------------------------------> + // 0 25 50 + // |------------------------|------------------------|------------------------ + // ldr q18, [x4], #32 // *.......................................................................... + // ldr q30, [x5], #32 // ..*........................................................................ + // ldr q8, [x2], #32 // .....*..................................................................... + // ldr q9, [x2, #-16] // .......*................................................................... + // uzp1 v17.8H, v8.8H, v9.8H // ..........*................................................................ + // uzp2 v4.8H, v8.8H, v9.8H // ...........*............................................................... + // ldr q19, [x4, #-16] // .*......................................................................... + // ldr q29, [x1], #32 // ............*.............................................................. + // ldr q12, [x1, #-16] // .............*............................................................. + // uzp1 v13.8H, v18.8H, v19.8H // ...*....................................................................... + // uzp1 v3.8H, v29.8H, v12.8H // ...............*........................................................... + // uzp2 v10.8H, v29.8H, v12.8H // ................*.......................................................... + // smull v12.4S, v3.4H, v4.4H // .................*......................................................... 
+ // smull2 v11.4S, v3.8H, v4.8H // ..................*........................................................ + // ldr q5, [x5, #-16] // ......*.................................................................... + // smlal v12.4S, v10.4H, v17.4H // .....................*..................................................... + // smlal2 v11.4S, v10.8H, v17.8H // ......................*.................................................... + // uzp2 v14.8H, v30.8H, v5.8H // ........*.................................................................. + // uzp1 v15.8H, v30.8H, v5.8H // .........*................................................................. + // smlal v12.4S, v13.4H, v14.4H // .........................*................................................. + // smlal2 v11.4S, v13.8H, v14.8H // ..........................*................................................ + // uzp2 v28.8H, v18.8H, v19.8H // ....*...................................................................... + // smlal v12.4S, v28.4H, v15.4H // ..............................*............................................ + // smlal2 v11.4S, v28.8H, v15.8H // ...............................*........................................... + // ld1 {v22.8H}, [x6], #16 // .............................*............................................. + // uzp1 v1.8H, v12.8H, v11.8H // ...................................*....................................... + // smull v23.4S, v3.4H, v17.4H // ...................*....................................................... + // mul v14.8H, v1.8H, v2.8H // .....................................*..................................... + // ld1 {v4.8H}, [x3], #16 // ..............*............................................................ + // smlal2 v11.4S, v14.8H, v0.8H // ........................................*.................................. 
+ // smull2 v20.4S, v3.8H, v17.8H // ....................*...................................................... + // ldr q18, [x4], #32 // ..................................*........................................ + // ldr q30, [x5], #32 // .......................................*................................... + // smlal2 v20.4S, v10.8H, v4.8H // ........................*.................................................. + // smlal v12.4S, v14.4H, v0.4H // .........................................*................................. + // smlal v23.4S, v10.4H, v4.4H // .......................*................................................... + // smlal2 v20.4S, v13.8H, v15.8H // ............................*.............................................. + // ldr q8, [x2], #32 // ..........................................*................................ + // smlal v23.4S, v13.4H, v15.4H // ...........................*............................................... + // smlal2 v20.4S, v28.8H, v22.8H // ................................*.......................................... + // ldr q9, [x2, #-16] // ...........................................*............................... + // smlal v23.4S, v28.4H, v22.4H // .................................*......................................... + // uzp2 v27.8H, v12.8H, v11.8H // ..................................................................*........ + // uzp1 v17.8H, v8.8H, v9.8H // .............................................*............................. + // uzp2 v4.8H, v8.8H, v9.8H // ................................................*.......................... + // uzp1 v5.8H, v23.8H, v20.8H // ....................................*...................................... + // mul v31.8H, v5.8H, v2.8H // ......................................*.................................... + // ldr q19, [x4, #-16] // ............................................*.............................. 
+ // ldr q29, [x1], #32 // ..............................................*............................ + // ldr q12, [x1, #-16] // ...............................................*........................... + // smlal2 v20.4S, v31.8H, v0.8H // ..............................................................*............ + // uzp1 v13.8H, v18.8H, v19.8H // .................................................*......................... + // uzp1 v3.8H, v29.8H, v12.8H // ..................................................*........................ + // uzp2 v10.8H, v29.8H, v12.8H // ...................................................*....................... + // smull v12.4S, v3.4H, v4.4H // ....................................................*...................... + // smull2 v11.4S, v3.8H, v4.8H // .....................................................*..................... + // ldr q5, [x5, #-16] // ......................................................*.................... + // smlal v12.4S, v10.4H, v17.4H // .......................................................*................... + // smlal2 v11.4S, v10.8H, v17.8H // ........................................................*.................. + // uzp2 v14.8H, v30.8H, v5.8H // .........................................................*................. + // uzp1 v15.8H, v30.8H, v5.8H // ..........................................................*................ + // smlal v12.4S, v13.4H, v14.4H // ...........................................................*............... + // smlal2 v11.4S, v13.8H, v14.8H // ............................................................*.............. + // uzp2 v28.8H, v18.8H, v19.8H // .............................................................*............. + // smlal v23.4S, v31.4H, v0.4H // .................................................................*......... + // smlal v12.4S, v28.4H, v15.4H // ...............................................................*........... 
+ // smlal2 v11.4S, v28.8H, v15.8H // ................................................................*.......... + // ld1 {v22.8H}, [x6], #16 // .......................................................................*... + // uzp2 v19.8H, v23.8H, v20.8H // .....................................................................*..... + // uzp1 v1.8H, v12.8H, v11.8H // ....................................................................*...... + // smull v23.4S, v3.4H, v17.4H // ...................................................................*....... + // mul v14.8H, v1.8H, v2.8H // ......................................................................*.... + // zip2 v9.8H, v19.8H, v27.8H // ........................................................................*.. + // ld1 {v4.8H}, [x3], #16 // ..........................................................................* + // smlal2 v11.4S, v14.8H, v0.8H // .........................................................................*. - sub count, count, #1 -k2_loop_start: - // Instructions: 33 - // Expected cycles: 40 - // Expected IPC: 0.82 - - // Cycle bound: 40.0 - // IPC bound: 0.82 - - // Wall time: 1.67s - // User time: 1.67s - - // ---------- cycle (expected) -----------> - // 0 25 - // |------------------------|-------------- - smull v13.4S, v6.4H, v4.4H // *....................................... - smull2 v10.4S, v6.8H, v4.8H // .*...................................... - smull v21.4S, v6.4H, v5.4H // ..*..................................... - smull2 v3.4S, v6.8H, v5.8H // ...*.................................... - smlal v13.4S, v7.4H, v12.4H // ....*................................... - smlal2 v10.4S, v7.8H, v12.8H // .....*.................................. - smlal v21.4S, v7.4H, v4.4H // ......*................................. - smlal2 v3.4S, v7.8H, v4.8H // .......*................................ - smlal v13.4S, v15.4H, v29.4H // ........*............................... 
- smlal2 v10.4S, v15.8H, v29.8H // .........*.............................. - smlal v21.4S, v15.4H, v30.4H // ..........*............................. - smlal2 v3.4S, v15.8H, v30.8H // ...........*............................ - smlal v13.4S, v16.4H, v31.4H // ............*........................... - smlal2 v10.4S, v16.8H, v31.8H // .............*.......................... - smlal v21.4S, v16.4H, v29.4H // ..............*......................... - smlal2 v3.4S, v16.8H, v29.8H // ...............*........................ - ld2 {v6.8H, v7.8H}, [x1], #32 // ................e....................... - uzp1 v12.8H, v13.8H, v10.8H // ..................*..................... - uzp1 v30.8H, v21.8H, v3.8H // ...................*.................... - mul v12.8H, v12.8H, v2.H[2] // ....................*................... - mul v30.8H, v30.8H, v2.H[2] // .....................*.................. - ld2 {v4.8H, v5.8H}, [x2], #32 // ......................e................. - smlal v13.4S, v12.4H, v0.4H // ........................*............... - smlal2 v10.4S, v12.8H, v0.8H // .........................*.............. - smlal v21.4S, v30.4H, v0.4H // ..........................*............. - smlal2 v3.4S, v30.8H, v0.8H // ...........................*............ - ld1 {v12.8H}, [x3], #16 // ............................e........... - uzp2 v13.8H, v13.8H, v10.8H // ..............................*......... - uzp2 v14.8H, v21.8H, v3.8H // ...............................*........ - ld2 {v15.8H, v16.8H}, [x4], #32 // ................................e....... - st2 {v13.8H, v14.8H}, [x0], #32 // ..................................*..... - ld2 {v29.8H, v30.8H}, [x5], #32 // ....................................e... - ld1 {v31.8H}, [x6], #16 // ......................................e. 
- - // -------------------- cycle (expected) --------------------> - // 0 25 50 - // |------------------------|------------------------|-------- - // ld2 {v3.8h, v4.8h}, [x1], #32 // e.......................'...............~.................. - // ld2 {v5.8h, v6.8h}, [x2], #32 // ......e.................'.....................~............ - // ld1 {v7.8h}, [x3], #16 // ............e...........'...........................~...... - // smull v8.4s, v3.4h, v5.4h // ........................*.................................. - // smull2 v10.4s, v3.8h, v5.8h // ........................'*................................. - // smlal v8.4s, v4.4h, v7.4h // ........................'...*.............................. - // smlal2 v10.4s, v4.8h, v7.8h // ........................'....*............................. - // smull v9.4s, v3.4h, v6.4h // ........................'.*................................ - // smull2 v11.4s, v3.8h, v6.8h // ........................'..*............................... - // smlal v9.4s, v4.4h, v5.4h // ........................'.....*............................ - // smlal2 v11.4s, v4.8h, v5.8h // ........................'......*........................... - // ld2 {v3.8h, v4.8h}, [x4], #32 // ................e.......'...............................~.. - // ld2 {v5.8h, v6.8h}, [x5], #32 // ....................e...'.................................. - // ld1 {v7.8h}, [x6], #16 // ......................e.'.................................. - // smlal v8.4s, v3.4h, v5.4h // ........................'.......*.......................... - // smlal2 v10.4s, v3.8h, v5.8h // ........................'........*......................... - // smlal v8.4s, v4.4h, v7.4h // ........................'...........*...................... - // smlal2 v10.4s, v4.8h, v7.8h // ........................'............*..................... - // smlal v9.4s, v3.4h, v6.4h // ........................'.........*........................ 
- // smlal2 v11.4s, v3.8h, v6.8h // ........................'..........*....................... - // smlal v9.4s, v4.4h, v5.4h // ........................'.............*.................... - // smlal2 v11.4s, v4.8h, v5.8h // ........................'..............*................... - // uzp1 v28.8h, v8.8h, v10.8h // ..~.....................'.................*................ - // mul v28.8h, v28.8h, v2.h[2] // ....~...................'...................*.............. - // smlal v8.4s, v28.4h, v0.4h // ........~...............'.......................*.......... - // smlal2 v10.4s, v28.8h, v0.8h // .........~..............'........................*......... - // uzp2 v26.8h, v8.8h, v10.8h // ..............~.........'.............................*.... - // uzp1 v28.8h, v9.8h, v11.8h // ...~....................'..................*............... - // mul v28.8h, v28.8h, v2.h[2] // .....~..................'....................*............. - // smlal v9.4s, v28.4h, v0.4h // ..........~.............'.........................*........ - // smlal2 v11.4s, v28.8h, v0.8h // ...........~............'..........................*....... - // uzp2 v27.8h, v9.8h, v11.8h // ...............~........'..............................*... - // st2 {v26.8h, v27.8h}, [x0], #32 // ..................~.....'.................................* + sub count, count, #2 +1: + // Instructions: 48 + // Expected cycles: 58 + // Expected IPC: 0.83 + + // Cycle bound: 58.0 + // IPC bound: 0.83 + + // Wall time: 6.39s + // User time: 6.39s + + // -------------- original position --------------> + // 0 25 + // |------------------------|---------------------- + smull2 v20.4S, v3.8H, v17.8H // ..........*..................................... + ldr q18, [x4], #32 // .................e.............................. + ldr q30, [x5], #32 // .....................e.......................... + smlal2 v20.4S, v10.8H, v4.8H // ............*................................... 
+ smlal v12.4S, v14.4H, v0.4H // .........................................*...... + smlal v23.4S, v10.4H, v4.4H // ...........*.................................... + str q9, [x0, #16] // ...............................................l + smlal2 v20.4S, v13.8H, v15.8H // ...........................*.................... + ldr q8, [x2], #32 // ....e........................................... + smlal v23.4S, v13.4H, v15.4H // ..........................*..................... + smlal2 v20.4S, v28.8H, v22.8H // .............................*.................. + zip1 v26.8H, v19.8H, v27.8H // ............................................l... + ldr q9, [x2, #-16] // .....e.......................................... + smlal v23.4S, v28.4H, v22.4H // ............................*................... + uzp2 v27.8H, v12.8H, v11.8H // ...........................................*.... + uzp1 v17.8H, v8.8H, v9.8H // ......e......................................... + uzp2 v4.8H, v8.8H, v9.8H // .......e........................................ + uzp1 v5.8H, v23.8H, v20.8H // ..................................*............. + str q26, [x0], #32 // ..............................................l. + mul v31.8H, v5.8H, v2.8H // ...................................*............ + ldr q19, [x4, #-16] // ..................e............................. + ldr q29, [x1], #32 // e............................................... + ldr q12, [x1, #-16] // .e.............................................. + smlal2 v20.4S, v31.8H, v0.8H // .....................................*.......... + uzp1 v13.8H, v18.8H, v19.8H // ...................e............................ + uzp1 v3.8H, v29.8H, v12.8H // ..e............................................. + uzp2 v10.8H, v29.8H, v12.8H // ...e............................................ + smull v12.4S, v3.4H, v4.4H // .............e.................................. + smull2 v11.4S, v3.8H, v4.8H // ..............e................................. 
+ ldr q5, [x5, #-16] // ......................e......................... + smlal v12.4S, v10.4H, v17.4H // ...............e................................ + smlal2 v11.4S, v10.8H, v17.8H // ................e............................... + uzp2 v14.8H, v30.8H, v5.8H // ........................e....................... + uzp1 v15.8H, v30.8H, v5.8H // .......................e........................ + smlal v12.4S, v13.4H, v14.4H // ..............................e................. + smlal2 v11.4S, v13.8H, v14.8H // ...............................e................ + uzp2 v28.8H, v18.8H, v19.8H // ....................e........................... + smlal v23.4S, v31.4H, v0.4H // ....................................*........... + smlal v12.4S, v28.4H, v15.4H // ................................e............... + smlal2 v11.4S, v28.8H, v15.8H // .................................e.............. + ld1 {v22.8H}, [x6], #16 // .........................e...................... + uzp2 v19.8H, v23.8H, v20.8H // ......................................*......... + uzp1 v1.8H, v12.8H, v11.8H // .......................................e........ + smull v23.4S, v3.4H, v17.4H // .........e...................................... + mul v14.8H, v1.8H, v2.8H // ........................................e....... + zip2 v9.8H, v19.8H, v27.8H // .............................................*.. + ld1 {v4.8H}, [x3], #16 // ........e....................................... + smlal2 v11.4S, v14.8H, v0.8H // ..........................................e..... + + // ------------------------------------------------- new position --------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------------- + // ldr q12, [x1], #32 // ....................e..........................'....................~..........................'.................. 
+ // ldr q13, [x1, #-16] // .....................e.........................'.....................~.........................'.................. + // uzp1 v3.8h, v12.8h, v13.8h // ........................e......................'........................~......................'.................. + // uzp2 v4.8h, v12.8h, v13.8h // .........................e.....................'.........................~.....................'.................. + // ldr q12, [x2], #32 // .......e.......................................'.......~.......................................'.......~.......... + // ldr q13, [x2, #-16] // ...........e...................................'...........~...................................'...........~...... + // uzp1 v5.8h, v12.8h, v13.8h // ..............e................................'..............~................................'..............~... + // uzp2 v6.8h, v12.8h, v13.8h // ...............e...............................'...............~...............................'...............~.. + // ld1 {v7.8h}, [x3], #16 // .............................................e.'.............................................~.'.................. + // smull v8.4s, v3.4h, v5.4h // ..........................................e....'..........................................~....'.................. + // smull2 v10.4s, v3.8h, v5.8h // ...............................................*...............................................~.................. + // smlal v8.4s, v4.4h, v7.4h // ....~..........................................'....*..........................................'....~............. + // smlal2 v10.4s, v4.8h, v7.8h // ..~............................................'..*............................................'..~............... + // smull v9.4s, v3.4h, v6.4h // ..........................e....................'..........................~....................'.................. 
+ // smull2 v11.4s, v3.8h, v6.8h // ...........................e...................'...........................~...................'.................. + // smlal v9.4s, v4.4h, v5.4h // .............................e.................'.............................~.................'.................. + // smlal2 v11.4s, v4.8h, v5.8h // ..............................e................'..............................~................'.................. + // ldr q12, [x4], #32 // e..............................................'~..............................................'~................. + // ldr q13, [x4, #-16] // ...................e...........................'...................~...........................'.................. + // uzp1 v3.8h, v12.8h, v13.8h // .......................e.......................'.......................~.......................'.................. + // uzp2 v4.8h, v12.8h, v13.8h // ...................................e...........'...................................~...........'.................. + // ldr q12, [x5], #32 // .e.............................................'.~.............................................'.~................ + // ldr q13, [x5, #-16] // ............................e..................'............................~..................'.................. + // uzp1 v5.8h, v12.8h, v13.8h // ................................e..............'................................~..............'.................. + // uzp2 v6.8h, v12.8h, v13.8h // ...............................e...............'...............................~...............'.................. + // ld1 {v7.8h}, [x6], #16 // .......................................e.......'.......................................~.......'.................. + // smlal v8.4s, v3.4h, v5.4h // ........~......................................'........*......................................'........~......... 
+ // smlal2 v10.4s, v3.8h, v5.8h // ......~........................................'......*........................................'......~........... + // smlal v8.4s, v4.4h, v7.4h // ............~..................................'............*..................................'............~..... + // smlal2 v10.4s, v4.8h, v7.8h // .........~.....................................'.........*.....................................'.........~........ + // smlal v9.4s, v3.4h, v6.4h // .................................e.............'.................................~.............'.................. + // smlal2 v11.4s, v3.8h, v6.8h // ..................................e............'..................................~............'.................. + // smlal v9.4s, v4.4h, v5.4h // .....................................e.........'.....................................~.........'.................. + // smlal2 v11.4s, v4.8h, v5.8h // ......................................e........'......................................~........'.................. + // uzp1 v28.8h, v8.8h, v10.8h // ................~..............................'................*..............................'................~. + // mul v28.8h, v28.8h, v2.8h // ..................~............................'..................*............................'.................. + // smlal v8.4s, v28.4h, v0.4h // ....................................~..........'....................................*..........'.................. + // smlal2 v10.4s, v28.8h, v0.8h // ......................~........................'......................*........................'.................. + // uzp2 v26.8h, v8.8h, v10.8h // ........................................~......'........................................*......'.................. + // uzp1 v28.8h, v9.8h, v11.8h // .........................................e.....'.........................................~.....'.................. 
+ // mul v28.8h, v28.8h, v2.8h // ...........................................e...'...........................................~...'.................. + // smlal v9.4s, v28.4h, v0.4h // ...~...........................................'...*...........................................'...~.............. + // smlal2 v11.4s, v28.8h, v0.8h // ..............................................e'..............................................~'.................. + // uzp2 v27.8h, v9.8h, v11.8h // .............~.................................'.............*.................................'.............~.... + // zip1 v12.8h, v26.8h, v27.8h // ..........~....................................'..........~....................................'..........l....... + // zip2 v13.8h, v26.8h, v27.8h // ............................................~..'............................................*..'.................. + // str q12, [x0], #32 // .................~.............................'.................~.............................'.................l + // str q13, [x0, #-16] // .....~.........................................'.....~.........................................'.....l............ sub count, count, #1 - cbnz count, k2_loop_start - // Instructions: 27 - // Expected cycles: 34 - // Expected IPC: 0.79 - - // Cycle bound: 34.0 - // IPC bound: 0.79 - - // Wall time: 0.25s - // User time: 0.25s - - // ------- cycle (expected) --------> - // 0 25 - // |------------------------|-------- - smull v13.4S, v6.4H, v5.4H // *................................. - smull2 v10.4S, v6.8H, v5.8H // .*................................ - smull v21.4S, v6.4H, v4.4H // ..*............................... - smull2 v3.4S, v6.8H, v4.8H // ...*.............................. - smlal v13.4S, v7.4H, v4.4H // ....*............................. - smlal2 v10.4S, v7.8H, v4.8H // .....*............................ - smlal v21.4S, v7.4H, v12.4H // ......*........................... 
- smlal2 v3.4S, v7.8H, v12.8H // .......*.......................... - smlal v13.4S, v15.4H, v30.4H // ........*......................... - smlal2 v10.4S, v15.8H, v30.8H // .........*........................ - smlal v21.4S, v15.4H, v29.4H // ..........*....................... - smlal2 v3.4S, v15.8H, v29.8H // ...........*...................... - smlal v13.4S, v16.4H, v29.4H // ............*..................... - smlal2 v10.4S, v16.8H, v29.8H // .............*.................... - smlal v21.4S, v16.4H, v31.4H // ..............*................... - smlal2 v3.4S, v16.8H, v31.8H // ...............*.................. - uzp1 v12.8H, v13.8H, v10.8H // .................*................ - uzp1 v6.8H, v21.8H, v3.8H // ...................*.............. - mul v12.8H, v12.8H, v2.H[2] // ....................*............. - mul v6.8H, v6.8H, v2.H[2] // .....................*............ - smlal v13.4S, v12.4H, v0.4H // ........................*......... - smlal v21.4S, v6.4H, v0.4H // .........................*........ - smlal2 v3.4S, v6.8H, v0.8H // ..........................*....... - smlal2 v10.4S, v12.8H, v0.8H // ...........................*...... - uzp2 v21.8H, v21.8H, v3.8H // ..............................*... - uzp2 v22.8H, v13.8H, v10.8H // ...............................*.. - st2 {v21.8H, v22.8H}, [x0], #32 // .................................* - - // ------- cycle (expected) --------> - // 0 25 - // |------------------------|-------- - // smull v13.4S, v6.4H, v4.4H // ..*............................... - // smull2 v10.4S, v6.8H, v4.8H // ...*.............................. - // smull v21.4S, v6.4H, v5.4H // *................................. - // smull2 v3.4S, v6.8H, v5.8H // .*................................ - // smlal v13.4S, v7.4H, v12.4H // ......*........................... - // smlal2 v10.4S, v7.8H, v12.8H // .......*.......................... - // smlal v21.4S, v7.4H, v4.4H // ....*............................. 
- // smlal2 v3.4S, v7.8H, v4.8H // .....*............................ - // smlal v13.4S, v15.4H, v29.4H // ..........*....................... - // smlal2 v10.4S, v15.8H, v29.8H // ...........*...................... - // smlal v21.4S, v15.4H, v30.4H // ........*......................... - // smlal2 v3.4S, v15.8H, v30.8H // .........*........................ - // smlal v13.4S, v16.4H, v31.4H // ..............*................... - // smlal2 v10.4S, v16.8H, v31.8H // ...............*.................. - // smlal v21.4S, v16.4H, v29.4H // ............*..................... - // smlal2 v3.4S, v16.8H, v29.8H // .............*.................... - // uzp1 v12.8H, v13.8H, v10.8H // ...................*.............. - // uzp1 v30.8H, v21.8H, v3.8H // .................*................ - // mul v12.8H, v12.8H, v2.H[2] // .....................*............ - // mul v30.8H, v30.8H, v2.H[2] // ....................*............. - // smlal v13.4S, v12.4H, v0.4H // .........................*........ - // smlal2 v10.4S, v12.8H, v0.8H // ..........................*....... - // smlal v21.4S, v30.4H, v0.4H // ........................*......... - // smlal2 v3.4S, v30.8H, v0.8H // ...........................*...... - // uzp2 v13.8H, v13.8H, v10.8H // ..............................*... - // uzp2 v14.8H, v21.8H, v3.8H // ...............................*.. - // st2 {v13.8H, v14.8H}, [x0], #32 // .................................* + cbnz count, 1b + // Instructions: 21 + // Expected cycles: 35 + // Expected IPC: 0.60 + + // Cycle bound: 35.0 + // IPC bound: 0.60 + + // Wall time: 0.08s + // User time: 0.08s + + // ----- original position -----> + // 0 25 + // |------------------------|---- + smull2 v5.4S, v3.8H, v17.8H // *............................. + smlal v12.4S, v14.4H, v0.4H // ..*........................... + smlal v23.4S, v10.4H, v4.4H // ...*.......................... + str q9, [x0, #16] // ....*......................... 
+ smlal2 v5.4S, v10.8H, v4.8H // .*............................ + uzp2 v11.8H, v12.8H, v11.8H // ..........*................... + zip1 v9.8H, v19.8H, v27.8H // ........*..................... + smlal v23.4S, v13.4H, v15.4H // ......*....................... + smlal2 v5.4S, v13.8H, v15.8H // .....*........................ + str q9, [x0], #32 // ............*................. + smlal v23.4S, v28.4H, v22.4H // .........*.................... + smlal2 v5.4S, v28.8H, v22.8H // .......*...................... + uzp1 v9.8H, v23.8H, v5.8H // ...........*.................. + mul v9.8H, v9.8H, v2.8H // .............*................ + smlal2 v5.4S, v9.8H, v0.8H // ..............*............... + smlal v23.4S, v9.4H, v0.4H // ...............*.............. + uzp2 v9.8H, v23.8H, v5.8H // ................*............. + zip2 v5.8H, v9.8H, v11.8H // .................*............ + zip1 v9.8H, v9.8H, v11.8H // ...................*.......... + str q5, [x0, #16] // ..................*........... + str q9, [x0], #32 // ....................*......... + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // smull2 v20.4S, v3.8H, v17.8H // *.............................. + // smlal2 v20.4S, v10.8H, v4.8H // ....*.......................... + // smlal v12.4S, v14.4H, v0.4H // .*............................. + // smlal v23.4S, v10.4H, v4.4H // ..*............................ + // str q9, [x0, #16] // ...*........................... + // smlal2 v20.4S, v13.8H, v15.8H // ........*...................... + // smlal v23.4S, v13.4H, v15.4H // .......*....................... + // smlal2 v20.4S, v28.8H, v22.8H // ...........*................... + // zip1 v26.8H, v19.8H, v27.8H // ......*........................ + // smlal v23.4S, v28.4H, v22.4H // ..........*.................... + // uzp2 v27.8H, v12.8H, v11.8H // .....*......................... + // uzp1 v5.8H, v23.8H, v20.8H // ............*.................. 
+ // str q26, [x0], #32 // .........*..................... + // mul v31.8H, v5.8H, v2.8H // .............*................. + // smlal2 v20.4S, v31.8H, v0.8H // ..............*................ + // smlal v23.4S, v31.4H, v0.4H // ...............*............... + // uzp2 v19.8H, v23.8H, v20.8H // ................*.............. + // zip2 v9.8H, v19.8H, v27.8H // .................*............. + // str q9, [x0, #16] // ...................*........... + // zip1 v26.8H, v19.8H, v27.8H // ..................*............ + // str q26, [x0], #32 // ....................*.......... pop_stack @@ -353,17 +530,12 @@ k2_loop_start: #endif /* MLKEM_K == 2 */ #if MLKEM_K == 3 -.global MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt) -.global _MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt) +.global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt) -MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): -_MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): +MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): push_stack - - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - ldrsh modulus, [xtmp] - dup constN.8h, modulus + ldr q_modulus, c_modulus + ldr q_modulus_twisted, c_modulus_twisted // Computed bases of vector entries @@ -375,327 +547,453 @@ _MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): add b2_cache_ptr, b0_cache_ptr, #(2 * 512/2) mov count, #(MLKEM_N / 16) - // Instructions: 52 - // Expected cycles: 66 - // Expected IPC: 0.79 - - // Cycle bound: 66.0 - // IPC bound: 0.79 - - // Wall time: 8.09s - // User time: 8.09s - - // ----------------------- cycle (expected) ------------------------> - // 0 25 50 - // |------------------------|------------------------|--------------- - ld2 {v7.8H, v8.8H}, [x2], #32 // *................................................................. 
- ld2 {v28.8H, v29.8H}, [x1], #32 // ..*............................................................... - ld1 {v10.8H}, [x3], #16 // ....*............................................................. - smull2 v15.4S, v28.8H, v8.8H // ......*........................................................... - smull v9.4S, v28.4H, v8.4H // .......*.......................................................... - ld1 {v31.8H}, [x3], #16 // ........*......................................................... - smlal2 v15.4S, v29.8H, v7.8H // ..........*....................................................... - ld2 {v19.8H, v20.8H}, [x5], #32 // ...........*...................................................... - smlal v9.4S, v29.4H, v7.4H // .............*.................................................... - ld2 {v16.8H, v17.8H}, [x4], #32 // ..............*................................................... - ld2 {v3.8H, v4.8H}, [x7], #32 // ................*................................................. - smlal v9.4S, v16.4H, v20.4H // ..................*............................................... - smlal2 v15.4S, v16.8H, v20.8H // ...................*.............................................. - smull v1.4S, v28.4H, v7.4H // ....................*............................................. - smull2 v22.4S, v28.8H, v7.8H // .....................*............................................ - smlal v9.4S, v17.4H, v19.4H // ......................*........................................... - smlal2 v15.4S, v17.8H, v19.8H // .......................*.......................................... - ld2 {v6.8H, v7.8H}, [x8], #32 // ........................*......................................... - smlal v1.4S, v29.4H, v10.4H // ..........................*....................................... - smlal2 v22.4S, v29.8H, v10.8H // ...........................*...................................... 
- smlal2 v15.4S, v3.8H, v7.8H // ............................*..................................... - smlal v9.4S, v3.4H, v7.4H // .............................*.................................... - smlal v1.4S, v16.4H, v19.4H // ..............................*................................... - smlal2 v22.4S, v16.8H, v19.8H // ...............................*.................................. - ld1 {v5.8H}, [x6], #16 // ................................*................................. - smlal v9.4S, v4.4H, v6.4H // ..................................*............................... - smlal2 v15.4S, v4.8H, v6.8H // ...................................*.............................. - smlal v1.4S, v17.4H, v5.4H // ....................................*............................. - smlal2 v22.4S, v17.8H, v5.8H // .....................................*............................ - ld1 {v30.8H}, [x9], #16 // ......................................*........................... - smlal v1.4S, v3.4H, v6.4H // ........................................*......................... - smlal2 v22.4S, v3.8H, v6.8H // .........................................*........................ - ld2 {v11.8H, v12.8H}, [x1], #32 // ..........................................*....................... - smlal v1.4S, v4.4H, v30.4H // ............................................*..................... - smlal2 v22.4S, v4.8H, v30.8H // .............................................*.................... - uzp1 v21.8H, v9.8H, v15.8H // ..............................................*................... - ld1 {v14.8H}, [x6], #16 // ...............................................*.................. - uzp1 v13.8H, v1.8H, v22.8H // .................................................*................ - mul v10.8H, v21.8H, v2.H[2] // ..................................................*............... - mul v21.8H, v13.8H, v2.H[2] // ...................................................*.............. 
- ld2 {v23.8H, v24.8H}, [x2], #32 // ....................................................*............. - smlal2 v15.4S, v10.8H, v0.8H // ......................................................*........... - smlal2 v22.4S, v21.8H, v0.8H // .......................................................*.......... - smlal v1.4S, v21.4H, v0.4H // ........................................................*......... - smull2 v13.4S, v11.8H, v23.8H // .........................................................*........ - ld2 {v25.8H, v26.8H}, [x5], #32 // ..........................................................*....... - smlal v9.4S, v10.4H, v0.4H // ............................................................*..... - smlal2 v13.4S, v12.8H, v31.8H // .............................................................*.... - smull2 v27.4S, v11.8H, v24.8H // ..............................................................*... - uzp2 v16.8H, v1.8H, v22.8H // ...............................................................*.. - uzp2 v17.8H, v9.8H, v15.8H // ................................................................*. - ld2 {v6.8H, v7.8H}, [x4], #32 // .................................................................* - - // ----------------------- cycle (expected) ------------------------> - // 0 25 50 - // |------------------------|------------------------|--------------- - // ld2 {v23.8H, v24.8H}, [x2], #32 // *................................................................. - // ld2 {v11.8H, v12.8H}, [x1], #32 // ..*............................................................... - // ld2 {v6.8H, v7.8H}, [x4], #32 // ..............*................................................... - // ld1 {v31.8H}, [x3], #16 // ....*............................................................. - // ld2 {v25.8H, v26.8H}, [x5], #32 // ...........*...................................................... 
- // smull2 v13.4S, v11.8H, v23.8H // .....................*............................................ - // ld1 {v14.8H}, [x6], #16 // ................................*................................. - // smlal2 v13.4S, v12.8H, v31.8H // ...........................*...................................... - // smull2 v27.4S, v11.8H, v24.8H // ......*........................................................... - // smlal2 v13.4S, v6.8H, v25.8H // ...............................*.................................. - // smull v20.4S, v11.4H, v24.4H // .......*.......................................................... - // smull v17.4S, v11.4H, v23.4H // ....................*............................................. - // smlal2 v27.4S, v12.8H, v23.8H // ..........*....................................................... - // smlal2 v13.4S, v7.8H, v14.8H // .....................................*............................ - // smlal v20.4S, v12.4H, v23.4H // .............*.................................................... - // smlal v17.4S, v12.4H, v31.4H // ..........................*....................................... - // smlal2 v27.4S, v6.8H, v26.8H // ...................*.............................................. - // ld1 {v5.8H}, [x9], #16 // ......................................*........................... - // smlal v17.4S, v6.4H, v25.4H // ..............................*................................... - // smlal v20.4S, v6.4H, v26.4H // ..................*............................................... - // ld2 {v23.8H, v24.8H}, [x2], #32 // ....................................................*............. - // ld2 {v15.8H, v16.8H}, [x8], #32 // ........................*......................................... - // smlal v17.4S, v7.4H, v14.4H // ....................................*............................. - // ld2 {v8.8H, v9.8H}, [x7], #32 // ................*................................................. 
- // smlal v20.4S, v7.4H, v25.4H // ......................*........................................... - // smlal2 v27.4S, v7.8H, v25.8H // .......................*.......................................... - // smlal2 v13.4S, v8.8H, v15.8H // .........................................*........................ - // smlal v17.4S, v8.4H, v15.4H // ........................................*......................... - // smlal v20.4S, v8.4H, v16.4H // .............................*.................................... - // smlal2 v27.4S, v8.8H, v16.8H // ............................*..................................... - // smlal2 v13.4S, v9.8H, v5.8H // .............................................*.................... - // smlal v17.4S, v9.4H, v5.4H // ............................................*..................... - // smlal v20.4S, v9.4H, v15.4H // ..................................*............................... - // smlal2 v27.4S, v9.8H, v15.8H // ...................................*.............................. - // ld2 {v11.8H, v12.8H}, [x1], #32 // ..........................................*....................... - // uzp1 v31.8H, v17.8H, v13.8H // .................................................*................ - // uzp1 v8.8H, v20.8H, v27.8H // ..............................................*................... - // ld2 {v6.8H, v7.8H}, [x4], #32 // .................................................................* - // mul v18.8H, v31.8H, v2.H[2] // ...................................................*.............. - // mul v14.8H, v8.8H, v2.H[2] // ..................................................*............... - // ld1 {v31.8H}, [x3], #16 // ........*......................................................... - // smlal v17.4S, v18.4H, v0.4H // ........................................................*......... - // smlal2 v27.4S, v14.8H, v0.8H // ......................................................*........... 
- // smlal2 v13.4S, v18.8H, v0.8H // .......................................................*.......... - // smlal v20.4S, v14.4H, v0.4H // ............................................................*..... - // ld2 {v25.8H, v26.8H}, [x5], #32 // ..........................................................*....... - // uzp2 v16.8H, v17.8H, v13.8H // ...............................................................*.. - // smull2 v13.4S, v11.8H, v23.8H // .........................................................*........ - // uzp2 v17.8H, v20.8H, v27.8H // ................................................................*. - // ld1 {v14.8H}, [x6], #16 // ...............................................*.................. - // smlal2 v13.4S, v12.8H, v31.8H // .............................................................*.... - // smull2 v27.4S, v11.8H, v24.8H // ..............................................................*... + // Instructions: 75 + // Expected cycles: 103 + // Expected IPC: 0.73 + + // Cycle bound: 103.0 + // IPC bound: 0.73 + + // Wall time: 0.94s + // User time: 0.94s + + // --------------------------- original position ----------------------------> + // 0 25 50 + // |------------------------|------------------------| + ldr q7, [x2, #16] // *.......................................................................... + ldr q20, [x2], #32 // ..*........................................................................ + ldr q15, [x1, #16] // .*......................................................................... + uzp1 v8.8H, v20.8H, v7.8H // ...............*........................................................... + uzp2 v7.8H, v20.8H, v7.8H // ................*.......................................................... + ld1 {v20.8H}, [x3], #16 // ...*....................................................................... + ldr q30, [x1], #32 // ..............*............................................................ 
+ ldr q11, [x4], #32 // ....*...................................................................... + uzp1 v16.8H, v30.8H, v15.8H // .................*......................................................... + uzp2 v15.8H, v30.8H, v15.8H // ..................*........................................................ + smull v30.4S, v16.4H, v7.4H // ...................*....................................................... + smull2 v7.4S, v16.8H, v7.8H // ....................*...................................................... + smull v9.4S, v16.4H, v8.4H // .....................*..................................................... + smull2 v16.4S, v16.8H, v8.8H // ......................*.................................................... + smlal v30.4S, v15.4H, v8.4H // .......................*................................................... + smlal2 v7.4S, v15.8H, v8.8H // ........................*.................................................. + smlal v9.4S, v15.4H, v20.4H // .........................*................................................. + smlal2 v16.4S, v15.8H, v20.8H // ..........................*................................................ + ldr q20, [x4, #-16] // .....*..................................................................... + ldr q15, [x5], #32 // ......*.................................................................... + uzp1 v8.8H, v11.8H, v20.8H // ...........................*............................................... + uzp2 v20.8H, v11.8H, v20.8H // ............................*.............................................. + ldr q11, [x5, #-16] // .......*................................................................... + ld1 {v27.8H}, [x6], #16 // ........*.................................................................. + uzp1 v10.8H, v15.8H, v11.8H // .............................*............................................. 
+ uzp2 v15.8H, v15.8H, v11.8H // ..............................*............................................ + smlal v9.4S, v8.4H, v10.4H // ...............................*........................................... + smlal2 v16.4S, v8.8H, v10.8H // ................................*.......................................... + smlal v30.4S, v8.4H, v15.4H // .................................*......................................... + smlal2 v7.4S, v8.8H, v15.8H // ..................................*........................................ + smlal v9.4S, v20.4H, v27.4H // ...................................*....................................... + smlal2 v16.4S, v20.8H, v27.8H // ....................................*...................................... + smlal v30.4S, v20.4H, v10.4H // .....................................*..................................... + smlal2 v7.4S, v20.8H, v10.8H // ......................................*.................................... + ldr q20, [x7], #32 // .........*................................................................. + ldr q15, [x7, #-16] // ..........*................................................................ + ldr q8, [x8], #32 // ...........*............................................................... + uzp1 v11.8H, v20.8H, v15.8H // .......................................*................................... + uzp2 v20.8H, v20.8H, v15.8H // ........................................*.................................. + ldr q15, [x8, #-16] // ............*.............................................................. + ld1 {v27.8H}, [x9], #16 // .............*............................................................. + uzp1 v10.8H, v8.8H, v15.8H // .........................................*................................. + uzp2 v15.8H, v8.8H, v15.8H // ..........................................*................................ 
+ smlal v9.4S, v11.4H, v10.4H // ...........................................*............................... + smlal2 v16.4S, v11.8H, v10.8H // ............................................*.............................. + smlal v30.4S, v11.4H, v15.4H // .............................................*............................. + smlal2 v7.4S, v11.8H, v15.8H // ..............................................*............................ + smlal v9.4S, v20.4H, v27.4H // ...............................................*........................... + smlal2 v16.4S, v20.8H, v27.8H // ................................................*.......................... + smlal v30.4S, v20.4H, v10.4H // .................................................*......................... + smlal2 v7.4S, v20.8H, v10.8H // ..................................................*........................ + ldr q15, [x2], #32 // ...............................................................*........... + uzp1 v20.8H, v9.8H, v16.8H // ....................................................*...................... + uzp1 v8.8H, v30.8H, v7.8H // .....................................................*..................... + mul v20.8H, v20.8H, v2.8H // ......................................................*.................... + mul v8.8H, v8.8H, v2.8H // .......................................................*................... + ldr q21, [x4], #32 // .................................................................*......... + smlal v9.4S, v20.4H, v0.4H // ........................................................*.................. + smlal2 v16.4S, v20.8H, v0.8H // .........................................................*................. + smlal v30.4S, v8.4H, v0.4H // ..........................................................*................ + smlal2 v7.4S, v8.8H, v0.8H // ...........................................................*............... 
+ ldr q6, [x4, #-16] // ..................................................................*........ + uzp2 v27.8H, v9.8H, v16.8H // ............................................................*.............. + uzp2 v10.8H, v30.8H, v7.8H // .............................................................*............. + ldr q16, [x2, #-16] // ...................................................*....................... + ldr q30, [x1, #16] // ..............................................................*............ + ld1 {v9.8H}, [x3], #16 // ................................................................*.......... + ldr q1, [x5], #32 // ...................................................................*....... + ldr q12, [x5, #-16] // ....................................................................*...... + ld1 {v24.8H}, [x6], #16 // .....................................................................*..... + ldr q19, [x7], #32 // ......................................................................*.... + ldr q31, [x7, #-16] // .......................................................................*... + ldr q17, [x8], #32 // ........................................................................*.. + ldr q18, [x8, #-16] // .........................................................................*. + ld1 {v25.8H}, [x9], #16 // ..........................................................................* + + // ------------------------------ new position ------------------------------> + // 0 25 50 + // |------------------------|------------------------|------------------------ + // ldr q16, [x2, #16] // *.......................................................................... + // ldr q30, [x1, #16] // ..*........................................................................ + // ldr q15, [x2], #32 // .*......................................................................... 
+ // ld1 {v9.8H}, [x3], #16 // .....*..................................................................... + // ldr q21, [x4], #32 // .......*................................................................... + // ldr q6, [x4, #-16] // ..................*........................................................ + // ldr q1, [x5], #32 // ...................*....................................................... + // ldr q12, [x5, #-16] // ......................*.................................................... + // ld1 {v24.8H}, [x6], #16 // .......................*................................................... + // ldr q19, [x7], #32 // ..................................*........................................ + // ldr q31, [x7, #-16] // ...................................*....................................... + // ldr q17, [x8], #32 // ....................................*...................................... + // ldr q18, [x8, #-16] // .......................................*................................... + // ld1 {v25.8H}, [x9], #16 // ........................................*.................................. + // ldr q20, [x1], #32 // ......*.................................................................... + // uzp1 v7.8H, v15.8H, v16.8H // ...*....................................................................... + // uzp2 v15.8H, v15.8H, v16.8H // ....*...................................................................... + // uzp1 v8.8H, v20.8H, v30.8H // ........*.................................................................. + // uzp2 v20.8H, v20.8H, v30.8H // .........*................................................................. + // smull v30.4S, v8.4H, v15.4H // ..........*................................................................ + // smull2 v15.4S, v8.8H, v15.8H // ...........*............................................................... 
+ // smull v11.4S, v8.4H, v7.4H // ............*.............................................................. + // smull2 v8.4S, v8.8H, v7.8H // .............*............................................................. + // smlal v30.4S, v20.4H, v7.4H // ..............*............................................................ + // smlal2 v15.4S, v20.8H, v7.8H // ...............*........................................................... + // smlal v11.4S, v20.4H, v9.4H // ................*.......................................................... + // smlal2 v8.4S, v20.8H, v9.8H // .................*......................................................... + // uzp1 v7.8H, v21.8H, v6.8H // ....................*...................................................... + // uzp2 v20.8H, v21.8H, v6.8H // .....................*..................................................... + // uzp1 v16.8H, v1.8H, v12.8H // ........................*.................................................. + // uzp2 v9.8H, v1.8H, v12.8H // .........................*................................................. + // smlal v11.4S, v7.4H, v16.4H // ..........................*................................................ + // smlal2 v8.4S, v7.8H, v16.8H // ...........................*............................................... + // smlal v30.4S, v7.4H, v9.4H // ............................*.............................................. + // smlal2 v15.4S, v7.8H, v9.8H // .............................*............................................. + // smlal v11.4S, v20.4H, v24.4H // ..............................*............................................ + // smlal2 v8.4S, v20.8H, v24.8H // ...............................*........................................... + // smlal v30.4S, v20.4H, v16.4H // ................................*.......................................... 
+ // smlal2 v15.4S, v20.8H, v16.8H // .................................*......................................... + // uzp1 v7.8H, v19.8H, v31.8H // .....................................*..................................... + // uzp2 v20.8H, v19.8H, v31.8H // ......................................*.................................... + // uzp1 v16.8H, v17.8H, v18.8H // .........................................*................................. + // uzp2 v9.8H, v17.8H, v18.8H // ..........................................*................................ + // smlal v11.4S, v7.4H, v16.4H // ...........................................*............................... + // smlal2 v8.4S, v7.8H, v16.8H // ............................................*.............................. + // smlal v30.4S, v7.4H, v9.4H // .............................................*............................. + // smlal2 v15.4S, v7.8H, v9.8H // ..............................................*............................ + // smlal v11.4S, v20.4H, v25.4H // ...............................................*........................... + // smlal2 v8.4S, v20.8H, v25.8H // ................................................*.......................... + // smlal v30.4S, v20.4H, v16.4H // .................................................*......................... + // smlal2 v15.4S, v20.8H, v16.8H // ..................................................*........................ + // ldr q16, [x2, #16] // ................................................................*.......... + // uzp1 v7.8H, v11.8H, v8.8H // ....................................................*...................... + // uzp1 v20.8H, v30.8H, v15.8H // .....................................................*..................... + // mul v7.8H, v7.8H, v2.8H // ......................................................*.................... 
+ // mul v20.8H, v20.8H, v2.8H // .......................................................*................... + // smlal v11.4S, v7.4H, v0.4H // .........................................................*................. + // smlal2 v8.4S, v7.8H, v0.8H // ..........................................................*................ + // smlal v30.4S, v20.4H, v0.4H // ...........................................................*............... + // smlal2 v15.4S, v20.8H, v0.8H // ............................................................*.............. + // uzp2 v27.8H, v11.8H, v8.8H // ..............................................................*............ + // uzp2 v10.8H, v30.8H, v15.8H // ...............................................................*........... + // ldr q30, [x1, #16] // .................................................................*......... + // ldr q15, [x2], #32 // ...................................................*....................... + // ld1 {v9.8H}, [x3], #16 // ..................................................................*........ + // ldr q21, [x4], #32 // ........................................................*.................. + // ldr q6, [x4, #-16] // .............................................................*............. + // ldr q1, [x5], #32 // ...................................................................*....... + // ldr q12, [x5, #-16] // ....................................................................*...... + // ld1 {v24.8H}, [x6], #16 // .....................................................................*..... + // ldr q19, [x7], #32 // ......................................................................*.... + // ldr q31, [x7, #-16] // .......................................................................*... + // ldr q17, [x8], #32 // ........................................................................*.. 
+ // ldr q18, [x8, #-16] // .........................................................................*. + // ld1 {v25.8H}, [x9], #16 // ..........................................................................* sub count, count, #2 -k3_loop_start: - // Instructions: 44 - // Expected cycles: 54 - // Expected IPC: 0.81 - - // Cycle bound: 54.0 - // IPC bound: 0.81 - - // Wall time: 4.51s - // User time: 4.51s - - // ----------------- cycle (expected) ------------------> - // 0 25 50 - // |------------------------|------------------------|--- - st2 {v16.8H, v17.8H}, [x0], #32 // l..................................................... - smlal2 v13.4S, v6.8H, v25.8H // ..*................................................... - smull v20.4S, v11.4H, v24.4H // ...*.................................................. - smull v17.4S, v11.4H, v23.4H // ....*................................................. - smlal2 v27.4S, v12.8H, v23.8H // .....*................................................ - smlal2 v13.4S, v7.8H, v14.8H // ......*............................................... - smlal v20.4S, v12.4H, v23.4H // .......*.............................................. - smlal v17.4S, v12.4H, v31.4H // ........*............................................. - smlal2 v27.4S, v6.8H, v26.8H // .........*............................................ - ld1 {v5.8H}, [x9], #16 // ..........*........................................... - smlal v17.4S, v6.4H, v25.4H // ............*......................................... - smlal v20.4S, v6.4H, v26.4H // .............*........................................ - ld2 {v23.8H, v24.8H}, [x2], #32 // ..............e....................................... - ld2 {v15.8H, v16.8H}, [x8], #32 // ................*..................................... - smlal v17.4S, v7.4H, v14.4H // ..................*................................... - ld2 {v8.8H, v9.8H}, [x7], #32 // ...................*.................................. 
- smlal v20.4S, v7.4H, v25.4H // .....................*................................ - smlal2 v27.4S, v7.8H, v25.8H // ......................*............................... - smlal2 v13.4S, v8.8H, v15.8H // .......................*.............................. - smlal v17.4S, v8.4H, v15.4H // ........................*............................. - smlal v20.4S, v8.4H, v16.4H // .........................*............................ - smlal2 v27.4S, v8.8H, v16.8H // ..........................*........................... - smlal2 v13.4S, v9.8H, v5.8H // ...........................*.......................... - smlal v17.4S, v9.4H, v5.4H // ............................*......................... - smlal v20.4S, v9.4H, v15.4H // .............................*........................ - smlal2 v27.4S, v9.8H, v15.8H // ..............................*....................... - ld2 {v11.8H, v12.8H}, [x1], #32 // ...............................e...................... - uzp1 v31.8H, v17.8H, v13.8H // .................................*.................... - uzp1 v8.8H, v20.8H, v27.8H // ..................................*................... - ld2 {v6.8H, v7.8H}, [x4], #32 // ...................................e.................. - mul v18.8H, v31.8H, v2.H[2] // .....................................*................ - mul v14.8H, v8.8H, v2.H[2] // ......................................*............... - ld1 {v31.8H}, [x3], #16 // .......................................e.............. - smlal v17.4S, v18.4H, v0.4H // .........................................*............ - smlal2 v27.4S, v14.8H, v0.8H // ..........................................*........... - smlal2 v13.4S, v18.8H, v0.8H // ...........................................*.......... - smlal v20.4S, v14.4H, v0.4H // ............................................*......... - ld2 {v25.8H, v26.8H}, [x5], #32 // .............................................e........ 
- uzp2 v16.8H, v17.8H, v13.8H // ...............................................*...... - smull2 v13.4S, v11.8H, v23.8H // ................................................e..... - uzp2 v17.8H, v20.8H, v27.8H // .................................................*.... - ld1 {v14.8H}, [x6], #16 // ..................................................e... - smlal2 v13.4S, v12.8H, v31.8H // ....................................................e. - smull2 v27.4S, v11.8H, v24.8H // .....................................................e - - // -------------------------------------- cycle (expected) --------------------------------------> - // 0 25 50 75 - // |------------------------|------------------------|------------------------|------------------- - // ld2 {v3.8h, v4.8h}, [x1], #32 // .................e......................'..............................~....................... - // ld2 {v5.8h, v6.8h}, [x2], #32 // e.......................................'.............~........................................ - // ld1 {v7.8h}, [x3], #16 // .........................e..............'......................................~............... - // smull v8.4s, v3.4h, v5.4h // ........................................'...*.................................................. - // smull2 v10.4s, v3.8h, v5.8h // ..................................e.....'...............................................~...... - // smlal v8.4s, v4.4h, v7.4h // ........................................'.......*.............................................. - // smlal2 v10.4s, v4.8h, v7.8h // ......................................e.'...................................................~.. - // smull v9.4s, v3.4h, v6.4h // ........................................'..*................................................... - // smull2 v11.4s, v3.8h, v6.8h // .......................................e'....................................................~. 
- // smlal v9.4s, v4.4h, v5.4h // ........................................'......*............................................... - // smlal2 v11.4s, v4.8h, v5.8h // ........................................'....*................................................. - // ld2 {v3.8h, v4.8h}, [x4], #32 // .....................e..................'..................................~................... - // ld2 {v5.8h, v6.8h}, [x5], #32 // ...............................e........'............................................~......... - // ld1 {v7.8h}, [x6], #16 // ....................................e...'.................................................~.... - // smlal v8.4s, v3.4h, v5.4h // ........................................'...........*.......................................... - // smlal2 v10.4s, v3.8h, v5.8h // ........................................'.*.................................................... - // smlal v8.4s, v4.4h, v7.4h // ....~...................................'.................*.................................... - // smlal2 v10.4s, v4.8h, v7.8h // ........................................'.....*................................................ - // smlal v9.4s, v3.4h, v6.4h // ........................................'............*......................................... - // smlal2 v11.4s, v3.8h, v6.8h // ........................................'........*............................................. - // smlal v9.4s, v4.4h, v5.4h // .......~................................'....................*................................. - // smlal2 v11.4s, v4.8h, v5.8h // ........~...............................'.....................*................................ - // ld2 {v3.8h, v4.8h}, [x7], #32 // .....~..................................'..................*................................... - // ld2 {v5.8h, v6.8h}, [x8], #32 // ..~.....................................'...............*...................................... 
- // ld1 {v7.8h}, [x9], #16 // ........................................'.........*............................................ - // smlal v8.4s, v3.4h, v5.4h // ..........~.............................'.......................*.............................. - // smlal2 v10.4s, v3.8h, v5.8h // .........~..............................'......................*............................... - // smlal v8.4s, v4.4h, v7.4h // ..............~.........................'...........................*.......................... - // smlal2 v10.4s, v4.8h, v7.8h // .............~..........................'..........................*........................... - // smlal v9.4s, v3.4h, v6.4h // ...........~............................'........................*............................. - // smlal2 v11.4s, v3.8h, v6.8h // ............~...........................'.........................*............................ - // smlal v9.4s, v4.4h, v5.4h // ...............~........................'............................*......................... - // smlal2 v11.4s, v4.8h, v5.8h // ................~.......................'.............................*........................ - // uzp1 v28.8h, v8.8h, v10.8h // ...................~....................'................................*..................... - // mul v28.8h, v28.8h, v2.h[2] // .......................~................'....................................*................. - // smlal v8.4s, v28.4h, v0.4h // ...........................~............'........................................*............. - // smlal2 v10.4s, v28.8h, v0.8h // .............................~..........'..........................................*........... - // uzp2 v26.8h, v8.8h, v10.8h // .................................~......'..............................................*....... - // uzp1 v28.8h, v9.8h, v11.8h // ....................~...................'.................................*.................... 
- // mul v28.8h, v28.8h, v2.h[2] // ........................~...............'.....................................*................ - // smlal v9.4s, v28.4h, v0.4h // ..............................~.........'...........................................*.......... - // smlal2 v11.4s, v28.8h, v0.8h // ............................~...........'.........................................*............ - // uzp2 v27.8h, v9.8h, v11.8h // ...................................~....'................................................*..... - // st2 {v26.8h, v27.8h}, [x0], #32 // ........................................~.....................................................l +1: + // Instructions: 65 + // Expected cycles: 80 + // Expected IPC: 0.81 + + // Cycle bound: 80.0 + // IPC bound: 0.81 + + // Wall time: 11.64s + // User time: 11.64s + + // ---------------------- original position -----------------------> + // 0 25 50 + // |------------------------|------------------------|-------------- + ldr q20, [x1], #32 // *................................................................ + uzp1 v7.8H, v15.8H, v16.8H // ......*.......................................................... + uzp2 v15.8H, v15.8H, v16.8H // .......*......................................................... + uzp1 v8.8H, v20.8H, v30.8H // ..*.............................................................. + uzp2 v20.8H, v20.8H, v30.8H // ...*............................................................. + smull v30.4S, v8.4H, v15.4H // .............*................................................... + smull2 v15.4S, v8.8H, v15.8H // ..............*.................................................. + smull v11.4S, v8.4H, v7.4H // .........*....................................................... + smull2 v8.4S, v8.8H, v7.8H // ..........*...................................................... + smlal v30.4S, v20.4H, v7.4H // ...............*................................................. 
+ smlal2 v15.4S, v20.8H, v7.8H // ................*................................................ + smlal v11.4S, v20.4H, v9.4H // ...........*..................................................... + smlal2 v8.4S, v20.8H, v9.8H // ............*.................................................... + uzp1 v7.8H, v21.8H, v6.8H // ...................*............................................. + uzp2 v20.8H, v21.8H, v6.8H // ....................*............................................ + uzp1 v16.8H, v1.8H, v12.8H // .......................*......................................... + uzp2 v9.8H, v1.8H, v12.8H // ........................*........................................ + smlal v11.4S, v7.4H, v16.4H // ..........................*...................................... + smlal2 v8.4S, v7.8H, v16.8H // ...........................*..................................... + smlal v30.4S, v7.4H, v9.4H // ..............................*.................................. + smlal2 v15.4S, v7.8H, v9.8H // ...............................*................................. + smlal v11.4S, v20.4H, v24.4H // ............................*.................................... + smlal2 v8.4S, v20.8H, v24.8H // .............................*................................... + smlal v30.4S, v20.4H, v16.4H // ................................*................................ + smlal2 v15.4S, v20.8H, v16.8H // .................................*............................... + uzp1 v7.8H, v19.8H, v31.8H // ....................................*............................ + uzp2 v20.8H, v19.8H, v31.8H // .....................................*........................... + uzp1 v16.8H, v17.8H, v18.8H // ........................................*........................ + uzp2 v9.8H, v17.8H, v18.8H // .........................................*....................... + smlal v11.4S, v7.4H, v16.4H // ...........................................*..................... 
+ smlal2 v8.4S, v7.8H, v16.8H // ............................................*.................... + smlal v30.4S, v7.4H, v9.4H // ...............................................*................. + smlal2 v15.4S, v7.8H, v9.8H // ................................................*................ + smlal v11.4S, v20.4H, v25.4H // .............................................*................... + smlal2 v8.4S, v20.8H, v25.8H // ..............................................*.................. + smlal v30.4S, v20.4H, v16.4H // .................................................*............... + smlal2 v15.4S, v20.8H, v16.8H // ..................................................*.............. + ldr q16, [x2, #16] // .....e........................................................... + uzp1 v7.8H, v11.8H, v8.8H // ...................................................*............. + uzp1 v20.8H, v30.8H, v15.8H // ........................................................*........ + mul v7.8H, v7.8H, v2.8H // ....................................................*............ + mul v20.8H, v20.8H, v2.8H // .........................................................*....... + zip2 v9.8H, v27.8H, v10.8H // ..............................................................l.. + zip1 v27.8H, v27.8H, v10.8H // .............................................................l... + smlal v11.4S, v7.4H, v0.4H // .....................................................*........... + smlal2 v8.4S, v7.8H, v0.8H // ......................................................*.......... + smlal v30.4S, v20.4H, v0.4H // ..........................................................*...... + smlal2 v15.4S, v20.8H, v0.8H // ...........................................................*..... + str q27, [x0], #32 // ...............................................................l. + uzp2 v27.8H, v11.8H, v8.8H // .......................................................*......... 
+ str q9, [x0, #-16] // ................................................................l + uzp2 v10.8H, v30.8H, v15.8H // ............................................................*.... + ldr q30, [x1, #16] // .e............................................................... + ldr q15, [x2], #32 // ....e............................................................ + ld1 {v9.8H}, [x3], #16 // ........e........................................................ + ldr q21, [x4], #32 // .................e............................................... + ldr q6, [x4, #-16] // ..................e.............................................. + ldr q1, [x5], #32 // .....................e........................................... + ldr q12, [x5, #-16] // ......................e.......................................... + ld1 {v24.8H}, [x6], #16 // .........................e....................................... + ldr q19, [x7], #32 // ..................................e.............................. + ldr q31, [x7, #-16] // ...................................e............................. + ldr q17, [x8], #32 // ......................................e.......................... + ldr q18, [x8, #-16] // .......................................e......................... + ld1 {v25.8H}, [x9], #16 // ..........................................e...................... + + // ---------------------------------------------------------------- new position -----------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------ + // ldr q12, [x1], #32 // ............................*................................................................~.................................................. 
+ // ldr q13, [x1, #-16] // ...............e............'...................................................~............'.................................................. + // uzp1 v3.8h, v12.8h, v13.8h // ............................'..*.............................................................'..~............................................... + // uzp2 v4.8h, v12.8h, v13.8h // ............................'...*............................................................'...~.............................................. + // ldr q12, [x2], #32 // ................e...........'....................................................~...........'.................................................. + // ldr q13, [x2, #-16] // e...........................'....................................~...........................'....................................~............. + // uzp1 v5.8h, v12.8h, v13.8h // ............................'*...............................................................'~................................................. + // uzp2 v6.8h, v12.8h, v13.8h // ............................'.*..............................................................'.~................................................ + // ld1 {v7.8h}, [x3], #16 // .................e..........'.....................................................~..........'.................................................. + // smull v8.4s, v3.4h, v5.4h // ............................'......*.........................................................'......~........................................... + // smull2 v10.4s, v3.8h, v5.8h // ............................'.......*........................................................'.......~.......................................... + // smlal v8.4s, v4.4h, v7.4h // ............................'..........*.....................................................'..........~....................................... 
+ // smlal2 v10.4s, v4.8h, v7.8h // ............................'...........*....................................................'...........~...................................... + // smull v9.4s, v3.4h, v6.4h // ............................'....*...........................................................'....~............................................. + // smull2 v11.4s, v3.8h, v6.8h // ............................'.....*..........................................................'.....~............................................ + // smlal v9.4s, v4.4h, v5.4h // ............................'........*.......................................................'........~......................................... + // smlal2 v11.4s, v4.8h, v5.8h // ............................'.........*......................................................'.........~........................................ + // ldr q12, [x4], #32 // ..................e.........'......................................................~.........'.................................................. + // ldr q13, [x4, #-16] // ...................e........'.......................................................~........'.................................................. + // uzp1 v3.8h, v12.8h, v13.8h // ............................'............*...................................................'............~..................................... + // uzp2 v4.8h, v12.8h, v13.8h // ............................'.............*..................................................'.............~.................................... + // ldr q12, [x5], #32 // ....................e.......'........................................................~.......'.................................................. + // ldr q13, [x5, #-16] // .....................e......'.........................................................~......'.................................................. 
+ // uzp1 v5.8h, v12.8h, v13.8h // ............................'..............*.................................................'..............~................................... + // uzp2 v6.8h, v12.8h, v13.8h // ............................'...............*................................................'...............~.................................. + // ld1 {v7.8h}, [x6], #16 // ......................e.....'..........................................................~.....'.................................................. + // smlal v8.4s, v3.4h, v5.4h // ............................'................*...............................................'................~................................. + // smlal2 v10.4s, v3.8h, v5.8h // ............................'.................*..............................................'.................~................................ + // smlal v8.4s, v4.4h, v7.4h // ............................'....................*...........................................'....................~............................. + // smlal2 v10.4s, v4.8h, v7.8h // ............................'.....................*..........................................'.....................~............................ + // smlal v9.4s, v3.4h, v6.4h // ............................'..................*.............................................'..................~............................... + // smlal2 v11.4s, v3.8h, v6.8h // ............................'...................*............................................'...................~.............................. + // smlal v9.4s, v4.4h, v5.4h // ............................'......................*.........................................'......................~........................... + // smlal2 v11.4s, v4.8h, v5.8h // ............................'.......................*........................................'.......................~.......................... 
+ // ldr q12, [x7], #32 // .......................e....'...........................................................~....'.................................................. + // ldr q13, [x7, #-16] // ........................e...'............................................................~...'.................................................. + // uzp1 v3.8h, v12.8h, v13.8h // ............................'........................*.......................................'........................~......................... + // uzp2 v4.8h, v12.8h, v13.8h // ............................'.........................*......................................'.........................~........................ + // ldr q12, [x8], #32 // .........................e..'.............................................................~..'.................................................. + // ldr q13, [x8, #-16] // ..........................e.'..............................................................~.'.................................................. + // uzp1 v5.8h, v12.8h, v13.8h // ............................'..........................*.....................................'..........................~....................... + // uzp2 v6.8h, v12.8h, v13.8h // ............................'...........................*....................................'...........................~...................... + // ld1 {v7.8h}, [x9], #16 // ...........................e'...............................................................~'.................................................. + // smlal v8.4s, v3.4h, v5.4h // ............................'............................*...................................'............................~..................... + // smlal2 v10.4s, v3.8h, v5.8h // ............................'.............................*..................................'.............................~.................... 
+ // smlal v8.4s, v4.4h, v7.4h // ............................'................................*...............................'................................~................. + // smlal2 v10.4s, v4.8h, v7.8h // ............................'.................................*..............................'.................................~................ + // smlal v9.4s, v3.4h, v6.4h // ............................'..............................*.................................'..............................~................... + // smlal2 v11.4s, v3.8h, v6.8h // ............................'...............................*................................'...............................~.................. + // smlal v9.4s, v4.4h, v5.4h // ............................'..................................*.............................'..................................~............... + // smlal2 v11.4s, v4.8h, v5.8h // ............................'...................................*............................'...................................~.............. + // uzp1 v28.8h, v8.8h, v10.8h // .~..........................'.....................................*..........................'.....................................~............ + // mul v28.8h, v28.8h, v2.8h // ...~........................'.......................................*........................'.......................................~.......... + // smlal v8.4s, v28.4h, v0.4h // .......~....................'...........................................*....................'...........................................~...... + // smlal2 v10.4s, v28.8h, v0.8h // ........~...................'............................................*...................'............................................~..... + // uzp2 v26.8h, v8.8h, v10.8h // ............~...............'................................................*...............'................................................~. 
+ // uzp1 v28.8h, v9.8h, v11.8h // ..~.........................'......................................*.........................'......................................~........... + // mul v28.8h, v28.8h, v2.8h // ....~.......................'........................................*.......................'........................................~......... + // smlal v9.4s, v28.4h, v0.4h // .........~..................'.............................................*..................'.............................................~.... + // smlal2 v11.4s, v28.8h, v0.8h // ..........~.................'..............................................*.................'..............................................~... + // uzp2 v27.8h, v9.8h, v11.8h // ..............~.............'..................................................*.............'.................................................. + // zip1 v12.8h, v26.8h, v27.8h // ......~.....................'..........................................~.....................'..........................................l....... + // zip2 v13.8h, v26.8h, v27.8h // .....~......................'.........................................~......................'.........................................l........ + // str q12, [x0], #32 // ...........~................'...............................................~................'...............................................l.. 
+ // str q13, [x0, #-16] // .............~..............'.................................................~..............'.................................................l sub count, count, #1 - cbnz count, k3_loop_start - // Instructions: 36 - // Expected cycles: 45 - // Expected IPC: 0.80 - - // Cycle bound: 45.0 - // IPC bound: 0.80 - - // Wall time: 0.30s - // User time: 0.30s - - // ------------- cycle (expected) -------------> - // 0 25 - // |------------------------|------------------- - ld2 {v21.8H, v22.8H}, [x8], #32 // *............................................ - smull v10.4S, v11.4H, v24.4H // ..*.......................................... - smlal2 v13.4S, v6.8H, v25.8H // ...*......................................... - smlal2 v27.4S, v12.8H, v23.8H // ....*........................................ - smull v18.4S, v11.4H, v23.4H // .....*....................................... - smlal v10.4S, v12.4H, v23.4H // ......*...................................... - ld2 {v29.8H, v30.8H}, [x7], #32 // .......*..................................... - smlal2 v27.4S, v6.8H, v26.8H // .........*................................... - smlal v10.4S, v6.4H, v26.4H // ..........*.................................. - smlal v18.4S, v12.4H, v31.4H // ...........*................................. - smlal2 v13.4S, v7.8H, v14.8H // ............*................................ - smlal2 v27.4S, v7.8H, v25.8H // .............*............................... - smlal v10.4S, v7.4H, v25.4H // ..............*.............................. - smlal v18.4S, v6.4H, v25.4H // ...............*............................. - smlal2 v13.4S, v29.8H, v21.8H // ................*............................ - smlal2 v27.4S, v29.8H, v22.8H // .................*........................... - smlal v10.4S, v29.4H, v22.4H // ..................*.......................... - smlal v18.4S, v7.4H, v14.4H // ...................*......................... 
- ld1 {v3.8H}, [x9], #16 // ....................*........................ - smlal v10.4S, v30.4H, v21.4H // ......................*...................... - smlal v18.4S, v29.4H, v21.4H // .......................*..................... - smlal2 v27.4S, v30.8H, v21.8H // ........................*.................... - smlal2 v13.4S, v30.8H, v3.8H // .........................*................... - smlal v18.4S, v30.4H, v3.4H // ...........................*................. - uzp1 v8.8H, v10.8H, v27.8H // ............................*................ - mul v23.8H, v8.8H, v2.H[2] // ..............................*.............. - uzp1 v30.8H, v18.8H, v13.8H // ...............................*............. - mul v19.8H, v30.8H, v2.H[2] // .................................*........... - smlal2 v27.4S, v23.8H, v0.8H // ..................................*.......... - smlal v10.4S, v23.4H, v0.4H // ...................................*......... - smlal v18.4S, v19.4H, v0.4H // .....................................*....... - smlal2 v13.4S, v19.8H, v0.8H // ......................................*...... - st2 {v16.8H, v17.8H}, [x0], #32 // .......................................*..... - uzp2 v12.8H, v10.8H, v27.8H // .........................................*... - uzp2 v11.8H, v18.8H, v13.8H // ..........................................*.. - st2 {v11.8H, v12.8H}, [x0], #32 // ............................................* - - // ------------- cycle (expected) -------------> - // 0 25 - // |------------------------|------------------- - // st2 {v16.8H, v17.8H}, [x0], #32 // .......................................*..... - // smlal2 v13.4S, v6.8H, v25.8H // ...*......................................... - // smull v20.4S, v11.4H, v24.4H // ..*.......................................... - // smull v17.4S, v11.4H, v23.4H // .....*....................................... - // smlal2 v27.4S, v12.8H, v23.8H // ....*........................................ 
- // smlal2 v13.4S, v7.8H, v14.8H // ............*................................ - // smlal v20.4S, v12.4H, v23.4H // ......*...................................... - // smlal v17.4S, v12.4H, v31.4H // ...........*................................. - // smlal2 v27.4S, v6.8H, v26.8H // .........*................................... - // ld1 {v5.8H}, [x9], #16 // ....................*........................ - // smlal v17.4S, v6.4H, v25.4H // ...............*............................. - // smlal v20.4S, v6.4H, v26.4H // ..........*.................................. - // ld2 {v15.8H, v16.8H}, [x8], #32 // *............................................ - // smlal v17.4S, v7.4H, v14.4H // ...................*......................... - // ld2 {v8.8H, v9.8H}, [x7], #32 // .......*..................................... - // smlal v20.4S, v7.4H, v25.4H // ..............*.............................. - // smlal2 v27.4S, v7.8H, v25.8H // .............*............................... - // smlal2 v13.4S, v8.8H, v15.8H // ................*............................ - // smlal v17.4S, v8.4H, v15.4H // .......................*..................... - // smlal v20.4S, v8.4H, v16.4H // ..................*.......................... - // smlal2 v27.4S, v8.8H, v16.8H // .................*........................... - // smlal2 v13.4S, v9.8H, v5.8H // .........................*................... - // smlal v17.4S, v9.4H, v5.4H // ...........................*................. - // smlal v20.4S, v9.4H, v15.4H // ......................*...................... - // smlal2 v27.4S, v9.8H, v15.8H // ........................*.................... - // uzp1 v31.8H, v17.8H, v13.8H // ...............................*............. - // uzp1 v8.8H, v20.8H, v27.8H // ............................*................ - // mul v18.8H, v31.8H, v2.H[2] // .................................*........... - // mul v14.8H, v8.8H, v2.H[2] // ..............................*.............. 
- // smlal v17.4S, v18.4H, v0.4H // .....................................*....... - // smlal2 v27.4S, v14.8H, v0.8H // ..................................*.......... - // smlal2 v13.4S, v18.8H, v0.8H // ......................................*...... - // smlal v20.4S, v14.4H, v0.4H // ...................................*......... - // uzp2 v16.8H, v17.8H, v13.8H // ..........................................*.. - // uzp2 v17.8H, v20.8H, v27.8H // .........................................*... - // st2 {v16.8H, v17.8H}, [x0], #32 // ............................................* + cbnz count, 1b + // Instructions: 55 + // Expected cycles: 61 + // Expected IPC: 0.90 + + // Cycle bound: 61.0 + // IPC bound: 0.90 + + // Wall time: 8.41s + // User time: 8.41s + + // ----------------- original position ------------------> + // 0 25 50 + // |------------------------|------------------------|---- + ldr q7, [x1], #32 // *...................................................... + uzp1 v20.8H, v15.8H, v16.8H // .*..................................................... + uzp2 v15.8H, v15.8H, v16.8H // ..*.................................................... + uzp1 v23.8H, v7.8H, v30.8H // ...*................................................... + uzp2 v11.8H, v7.8H, v30.8H // ....*.................................................. + smull2 v8.4S, v23.8H, v20.8H // ........*.............................................. + smull v5.4S, v23.4H, v20.4H // .......*............................................... + smull2 v30.4S, v23.8H, v15.8H // ......*................................................ + uzp1 v28.8H, v1.8H, v12.8H // ...............*....................................... + smlal2 v8.4S, v11.8H, v9.8H // ............*.......................................... + smlal v5.4S, v11.4H, v9.4H // ...........*........................................... + uzp1 v3.8H, v21.8H, v6.8H // .............*......................................... 
+ smull v16.4S, v23.4H, v15.4H // .....*................................................. + smlal2 v8.4S, v3.8H, v28.8H // ..................*.................................... + smlal v5.4S, v3.4H, v28.4H // .................*..................................... + uzp2 v29.8H, v21.8H, v6.8H // ..............*........................................ + uzp1 v7.8H, v17.8H, v18.8H // ...........................*........................... + smlal2 v8.4S, v29.8H, v24.8H // ......................*................................ + uzp1 v14.8H, v19.8H, v31.8H // .........................*............................. + smlal v16.4S, v11.4H, v20.4H // .........*............................................. + smlal2 v30.4S, v11.8H, v20.8H // ..........*............................................ + smlal2 v8.4S, v14.8H, v7.8H // ..............................*........................ + uzp2 v20.8H, v1.8H, v12.8H // ................*...................................... + uzp2 v21.8H, v19.8H, v31.8H // ..........................*............................ + smlal2 v30.4S, v3.8H, v20.8H // ....................*.................................. + smlal v16.4S, v3.4H, v20.4H // ...................*................................... + smlal v5.4S, v29.4H, v24.4H // .....................*................................. + uzp2 v9.8H, v17.8H, v18.8H // ............................*.......................... + smlal2 v30.4S, v29.8H, v28.8H // ........................*.............................. + smlal v16.4S, v29.4H, v28.4H // .......................*............................... + smlal v5.4S, v14.4H, v7.4H // .............................*......................... + smlal2 v8.4S, v21.8H, v25.8H // ..................................*.................... + smlal2 v30.4S, v14.8H, v9.8H // ................................*...................... + smlal v16.4S, v14.4H, v9.4H // ...............................*....................... 
+ smlal v5.4S, v21.4H, v25.4H // .................................*..................... + zip1 v20.8H, v27.8H, v10.8H // ..........................................*............ + smlal2 v30.4S, v21.8H, v7.8H // ....................................*.................. + smlal v16.4S, v21.4H, v7.4H // ...................................*................... + uzp1 v7.8H, v5.8H, v8.8H // .....................................*................. + str q20, [x0], #32 // ...............................................*....... + mul v15.8H, v7.8H, v2.8H // .......................................*............... + uzp1 v7.8H, v16.8H, v30.8H // ......................................*................ + zip2 v31.8H, v27.8H, v10.8H // .........................................*............. + mul v20.8H, v7.8H, v2.8H // ........................................*.............. + smlal v5.4S, v15.4H, v0.4H // ...........................................*........... + smlal2 v8.4S, v15.8H, v0.8H // ............................................*.......... + str q31, [x0, #-16] // .................................................*..... + smlal2 v30.4S, v20.8H, v0.8H // ..............................................*........ + smlal v16.4S, v20.4H, v0.4H // .............................................*......... + uzp2 v15.8H, v5.8H, v8.8H // ................................................*...... + uzp2 v20.8H, v16.8H, v30.8H // ..................................................*.... + zip1 v7.8H, v15.8H, v20.8H // ....................................................*.. + zip2 v20.8H, v15.8H, v20.8H // ...................................................*... + str q7, [x0], #32 // .....................................................*. 
+ str q20, [x0, #-16] // ......................................................* + + // -------------------- new position --------------------> + // 0 25 50 + // |------------------------|------------------------|---- + // ldr q20, [x1], #32 // *...................................................... + // uzp1 v7.8H, v15.8H, v16.8H // .*..................................................... + // uzp2 v15.8H, v15.8H, v16.8H // ..*.................................................... + // uzp1 v8.8H, v20.8H, v30.8H // ...*................................................... + // uzp2 v20.8H, v20.8H, v30.8H // ....*.................................................. + // smull v30.4S, v8.4H, v15.4H // ............*.......................................... + // smull2 v15.4S, v8.8H, v15.8H // .......*............................................... + // smull v11.4S, v8.4H, v7.4H // ......*................................................ + // smull2 v8.4S, v8.8H, v7.8H // .....*................................................. + // smlal v30.4S, v20.4H, v7.4H // ...................*................................... + // smlal2 v15.4S, v20.8H, v7.8H // ....................*.................................. + // smlal v11.4S, v20.4H, v9.4H // ..........*............................................ + // smlal2 v8.4S, v20.8H, v9.8H // .........*............................................. + // uzp1 v7.8H, v21.8H, v6.8H // ...........*........................................... + // uzp2 v20.8H, v21.8H, v6.8H // ...............*....................................... + // uzp1 v16.8H, v1.8H, v12.8H // ........*.............................................. + // uzp2 v9.8H, v1.8H, v12.8H // ......................*................................ + // smlal v11.4S, v7.4H, v16.4H // ..............*........................................ + // smlal2 v8.4S, v7.8H, v16.8H // .............*......................................... 
+ // smlal v30.4S, v7.4H, v9.4H // .........................*............................. + // smlal2 v15.4S, v7.8H, v9.8H // ........................*.............................. + // smlal v11.4S, v20.4H, v24.4H // ..........................*............................ + // smlal2 v8.4S, v20.8H, v24.8H // .................*..................................... + // smlal v30.4S, v20.4H, v16.4H // .............................*......................... + // smlal2 v15.4S, v20.8H, v16.8H // ............................*.......................... + // uzp1 v7.8H, v19.8H, v31.8H // ..................*.................................... + // uzp2 v20.8H, v19.8H, v31.8H // .......................*............................... + // uzp1 v16.8H, v17.8H, v18.8H // ................*...................................... + // uzp2 v9.8H, v17.8H, v18.8H // ...........................*........................... + // smlal v11.4S, v7.4H, v16.4H // ..............................*........................ + // smlal2 v8.4S, v7.8H, v16.8H // .....................*................................. + // smlal v30.4S, v7.4H, v9.4H // .................................*..................... + // smlal2 v15.4S, v7.8H, v9.8H // ................................*...................... + // smlal v11.4S, v20.4H, v25.4H // ..................................*.................... + // smlal2 v8.4S, v20.8H, v25.8H // ...............................*....................... + // smlal v30.4S, v20.4H, v16.4H // .....................................*................. + // smlal2 v15.4S, v20.8H, v16.8H // ....................................*.................. + // uzp1 v7.8H, v11.8H, v8.8H // ......................................*................ + // uzp1 v20.8H, v30.8H, v15.8H // .........................................*............. + // mul v7.8H, v7.8H, v2.8H // ........................................*.............. 
+ // mul v20.8H, v20.8H, v2.8H // ...........................................*........... + // zip2 v9.8H, v27.8H, v10.8H // ..........................................*............ + // zip1 v27.8H, v27.8H, v10.8H // ...................................*................... + // smlal v11.4S, v7.4H, v0.4H // ............................................*.......... + // smlal2 v8.4S, v7.8H, v0.8H // .............................................*......... + // smlal v30.4S, v20.4H, v0.4H // ................................................*...... + // smlal2 v15.4S, v20.8H, v0.8H // ...............................................*....... + // str q27, [x0], #32 // .......................................*............... + // uzp2 v27.8H, v11.8H, v8.8H // .................................................*..... + // str q9, [x0, #-16] // ..............................................*........ + // uzp2 v10.8H, v30.8H, v15.8H // ..................................................*.... + // zip2 v9.8H, v27.8H, v10.8H // ....................................................*.. + // zip1 v27.8H, v27.8H, v10.8H // ...................................................*... + // str q27, [x0], #32 // .....................................................*. 
+ // str q9, [x0, #-16] // ......................................................* pop_stack @@ -703,17 +1001,12 @@ k3_loop_start: #endif /* MLKEM_K == 3 */ #if MLKEM_K == 4 -.global MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt) -.global _MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt) +.global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt) -MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): -_MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): +MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): push_stack - - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - ldrsh modulus, [xtmp] - dup constN.8h, modulus + ldr q_modulus, c_modulus + ldr q_modulus_twisted, c_modulus_twisted // Computed bases of vector entries @@ -727,394 +1020,561 @@ _MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): add b3_ptr, b0_ptr, #(3 * 512) add b3_cache_ptr, b0_cache_ptr, #(3 * 512/2) + // Bounds: + + // Each pmull is bound by 2*4096*2^15=2^28, so the final value + // before Montgomery reduction is bound by 2^30. + mov count, #(MLKEM_N / 16) - // Instructions: 18 - // Expected cycles: 27 - // Expected IPC: 0.67 - // - // Cycle bound: 27.0 - // IPC bound: 0.67 - // - // Wall time: 0.03s - // User time: 0.03s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ld2 {v3.8H, v4.8H}, [x1], #32 // *............................. - ld2 {v13.8H, v14.8H}, [x2], #32 // ..*........................... - ld1 {v31.8H}, [x3], #16 // ....*......................... - smull v9.4S, v3.4H, v14.4H // ......*....................... - smull2 v6.4S, v3.8H, v14.8H // .......*...................... - smull2 v29.4S, v3.8H, v13.8H // ........*..................... - smull v11.4S, v3.4H, v13.4H // .........*.................... - smlal v9.4S, v4.4H, v13.4H // ..........*................... - smlal2 v6.4S, v4.8H, v13.8H // ...........*.................. 
- smlal2 v29.4S, v4.8H, v31.8H // ............*................. - smlal v11.4S, v4.4H, v31.4H // .............*................ - ld2 {v27.8H, v28.8H}, [x7], #32 // ..............*............... - ld2 {v18.8H, v19.8H}, [x10], #32 // ................*............. - ld2 {v20.8H, v21.8H}, [x8], #32 // ..................*........... - ld2 {v25.8H, v26.8H}, [x11], #32 // ....................*......... - ld2 {v16.8H, v17.8H}, [x5], #32 // ......................*....... - ld2 {v7.8H, v8.8H}, [x1], #32 // ........................*..... - ld1 {v3.8H}, [x3], #16 // ..........................*... - - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ld2 {v7.8H, v8.8H}, [x1], #32 // *.............................. - // ld1 {v3.8H}, [x3], #16 // ....*.......................... - // ld2 {v4.8H, v5.8H}, [x2], #32 // ..*............................ - // ld2 {v27.8H, v28.8H}, [x7], #32 // ..............*................ - // ld2 {v18.8H, v19.8H}, [x10], #32 // ................*.............. - // smull v9.4S, v7.4H, v5.4H // ......*........................ - // ld2 {v20.8H, v21.8H}, [x8], #32 // ..................*............ - // smlal v9.4S, v8.4H, v4.4H // ..........*.................... - // smull2 v29.4S, v7.8H, v4.8H // ........*...................... - // smull2 v6.4S, v7.8H, v5.8H // .......*....................... - // ld2 {v25.8H, v26.8H}, [x11], #32 // ....................*.......... - // smull v11.4S, v7.4H, v4.4H // .........*..................... - // smlal2 v6.4S, v8.8H, v4.8H // ...........*................... - // smlal2 v29.4S, v8.8H, v3.8H // ............*.................. - // ld2 {v16.8H, v17.8H}, [x5], #32 // ......................*........ - // smlal v11.4S, v8.4H, v3.4H // .............*................. - // ld2 {v7.8H, v8.8H}, [x1], #32 // ........................*...... - // ld1 {v3.8H}, [x3], #16 // ..........................*.... 
+ // Instructions: 114 + // Expected cycles: 153 + // Expected IPC: 0.75 + // + // Cycle bound: 153.0 + // IPC bound: 0.75 + // + // Wall time: 0.69s + // User time: 0.69s + // + // ----------------------------------------------- original position -----------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------------- + ldr q23, [x2, #16] // .*................................................................................................................ + ldr q19, [x2], #32 // *................................................................................................................. + ldr q17, [x5], #32 // ..*............................................................................................................... + uzp2 v13.8H, v19.8H, v23.8H // ..........*....................................................................................................... + uzp1 v19.8H, v19.8H, v23.8H // ...........*...................................................................................................... + ldr q23, [x5, #-16] // ...*.............................................................................................................. + ldr q30, [x1, #16] // .....*............................................................................................................ + uzp2 v9.8H, v17.8H, v23.8H // ....*............................................................................................................. + uzp1 v23.8H, v17.8H, v23.8H // .......*.......................................................................................................... + ldr q17, [x1], #32 // ......*........................................................................................................... 
+ ldr q10, [x7, #16] // .............*.................................................................................................... + uzp1 v12.8H, v17.8H, v30.8H // ........*......................................................................................................... + uzp2 v17.8H, v17.8H, v30.8H // .........*........................................................................................................ + smull2 v30.4S, v12.8H, v13.8H // ............*..................................................................................................... + smull v13.4S, v12.4H, v13.4H // ............................................*..................................................................... + smull2 v22.4S, v12.8H, v19.8H // .....................................*............................................................................ + smull v12.4S, v12.4H, v19.4H // ..........................................*....................................................................... + smlal2 v30.4S, v17.8H, v19.8H // ...............................*.................................................................................. + smlal v13.4S, v17.4H, v19.4H // ...............................................*.................................................................. + ldr q19, [x4], #32 // ....................*............................................................................................. + ldr q16, [x4, #-16] // .....................*............................................................................................ + ld1 {v8.8H}, [x3], #16 // ................................*................................................................................. + uzp1 v26.8H, v19.8H, v16.8H // .......................*.......................................................................................... 
+ uzp2 v19.8H, v19.8H, v16.8H // ........................*......................................................................................... + smlal2 v30.4S, v26.8H, v9.8H // .................................*................................................................................ + smlal v13.4S, v26.4H, v9.4H // ..................................................*............................................................... + smlal2 v22.4S, v17.8H, v8.8H // ........................................*......................................................................... + smlal v12.4S, v17.4H, v8.4H // .................................................*................................................................ + smlal2 v30.4S, v19.8H, v23.8H // ...................................*.............................................................................. + smlal v13.4S, v19.4H, v23.4H // .......................................................*.......................................................... + smlal2 v22.4S, v26.8H, v23.8H // ...........................................*...................................................................... + smlal v12.4S, v26.4H, v23.4H // .....................................................*............................................................ + ldr q23, [x7], #32 // ......................*........................................................................................... + ldr q17, [x8, #16] // ..............*................................................................................................... + uzp1 v9.8H, v23.8H, v10.8H // ..........................*....................................................................................... + uzp2 v23.8H, v23.8H, v10.8H // ....................................*............................................................................. 
+ ldr q10, [x10], #32 // ...............*.................................................................................................. + ldr q16, [x10, #-16] // ................*................................................................................................. + ld1 {v8.8H}, [x12], #16 // .................*................................................................................................ + uzp1 v26.8H, v10.8H, v16.8H // ..................*............................................................................................... + uzp2 v10.8H, v10.8H, v16.8H // ...................*.............................................................................................. + ld1 {v16.8H}, [x6], #16 // .........................*........................................................................................ + ldr q3, [x11, #16] // ...........................*...................................................................................... + smlal2 v22.4S, v19.8H, v16.8H // ..............................................*................................................................... + smlal v12.4S, v19.4H, v16.4H // ........................................................*......................................................... + ldr q19, [x11], #32 // ............................*..................................................................................... + ld1 {v16.8H}, [x9], #16 // .............................*.................................................................................... + uzp1 v4.8H, v19.8H, v3.8H // ..................................*............................................................................... + uzp2 v19.8H, v19.8H, v3.8H // .......................................*.......................................................................... 
+ ldr q3, [x8], #32 // ..............................*................................................................................... + ldr q31, [x2], #32 // ......................................*........................................................................... + uzp1 v6.8H, v3.8H, v17.8H // ...................................................*.............................................................. + uzp2 v17.8H, v3.8H, v17.8H // .........................................................*........................................................ + smlal2 v22.4S, v9.8H, v6.8H // ..........................................................*....................................................... + smlal2 v30.4S, v9.8H, v17.8H // ...........................................................*...................................................... + smlal v13.4S, v9.4H, v17.4H // ............................................................*..................................................... + smlal v12.4S, v9.4H, v6.4H // .............................................................*.................................................... + smlal2 v22.4S, v23.8H, v16.8H // ..............................................................*................................................... + smlal2 v30.4S, v23.8H, v6.8H // ...............................................................*.................................................. + smlal v13.4S, v23.4H, v6.4H // ................................................................*................................................. + smlal v12.4S, v23.4H, v16.4H // .................................................................*................................................ + smlal2 v22.4S, v26.8H, v4.8H // ..................................................................*............................................... 
+ smlal2 v30.4S, v26.8H, v19.8H // ...................................................................*.............................................. + smlal v13.4S, v26.4H, v19.4H // ....................................................................*............................................. + smlal v12.4S, v26.4H, v4.4H // .....................................................................*............................................ + smlal2 v22.4S, v10.8H, v8.8H // ......................................................................*........................................... + smlal2 v30.4S, v10.8H, v4.8H // .......................................................................*.......................................... + smlal v13.4S, v10.4H, v4.4H // ........................................................................*......................................... + smlal v12.4S, v10.4H, v8.4H // .........................................................................*........................................ + ldr q19, [x2, #-16] // .........................................*........................................................................ + uzp1 v23.8H, v13.8H, v30.8H // ...........................................................................*...................................... + uzp1 v17.8H, v12.8H, v22.8H // ....................................................................................*............................. + mul v23.8H, v23.8H, v2.8H // .............................................................................*.................................... + uzp2 v21.8H, v31.8H, v19.8H // ................................................................................*................................. + uzp1 v19.8H, v31.8H, v19.8H // ...................................................................................*.............................. 
+ mul v17.8H, v17.8H, v2.8H // .....................................................................................*............................ + smlal v13.4S, v23.4H, v0.4H // .................................................................................*................................ + smlal2 v30.4S, v23.8H, v0.8H // ..................................................................................*............................... + ldr q23, [x5], #32 // .............................................*.................................................................... + smlal2 v22.4S, v17.8H, v0.8H // ...........................................................................................................*...... + uzp2 v15.8H, v13.8H, v30.8H // ......................................................................................*........................... + smlal v12.4S, v17.4H, v0.4H // ............................................................................................................*..... + ldr q17, [x5, #-16] // ................................................*................................................................. + ldr q13, [x1, #16] // ......................................................*........................................................... + uzp2 v27.8H, v23.8H, v17.8H // ....................................................*............................................................. + uzp1 v28.8H, v23.8H, v17.8H // ............................................................................*..................................... + uzp2 v7.8H, v12.8H, v22.8H // ...............................................................................................................*.. + ldr q23, [x1], #32 // ..........................................................................*....................................... 
+ zip1 v5.8H, v7.8H, v15.8H // .................................................................................................................* + ldr q3, [x7, #16] // ........................................................................................*......................... + uzp1 v31.8H, v23.8H, v13.8H // ..............................................................................*................................... + uzp2 v16.8H, v23.8H, v13.8H // ...............................................................................*.................................. + smull2 v24.4S, v31.8H, v21.8H // .......................................................................................*.......................... + ldr q6, [x8, #16] // .........................................................................................*........................ + ldr q23, [x10], #32 // ..........................................................................................*....................... + smlal2 v24.4S, v16.8H, v19.8H // ..........................................................................................................*....... + ldr q17, [x10, #-16] // ...........................................................................................*...................... + ld1 {v22.8H}, [x12], #16 // ............................................................................................*..................... + uzp1 v30.8H, v23.8H, v17.8H // .............................................................................................*.................... + uzp2 v11.8H, v23.8H, v17.8H // ..............................................................................................*................... + ldr q23, [x4], #32 // ...............................................................................................*.................. 
+ ldr q17, [x4, #-16] // ................................................................................................*................. + ldr q4, [x7], #32 // .................................................................................................*................ + uzp1 v20.8H, v23.8H, v17.8H // ..................................................................................................*............... + uzp2 v26.8H, v23.8H, v17.8H // ...................................................................................................*.............. + uzp1 v9.8H, v4.8H, v3.8H // .....................................................................................................*............ + smlal2 v24.4S, v20.8H, v27.8H // ..............................................................................................................*... + ld1 {v8.8H}, [x6], #16 // ....................................................................................................*............. + ldr q25, [x11, #16] // ......................................................................................................*........... + ldr q29, [x11], #32 // .......................................................................................................*.......... + ld1 {v12.8H}, [x9], #16 // ........................................................................................................*......... + uzp1 v10.8H, v29.8H, v25.8H // ................................................................................................................*. + ldr q14, [x8], #32 // .........................................................................................................*........ + ld1 {v23.8H}, [x3], #16 // .............................................................................................................*.... 
+ + // ------------------------------------------------- new position --------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------------- + // ldr q3, [x2], #32 // .*................................................................................................................ + // ldr q17, [x2, #-16] // *................................................................................................................. + // ldr q21, [x5], #32 // ..*............................................................................................................... + // ldr q19, [x5, #-16] // .....*............................................................................................................ + // uzp2 v27.8H, v21.8H, v19.8H // .......*.......................................................................................................... + // ldr q25, [x1, #16] // ......*........................................................................................................... + // ldr q22, [x1], #32 // .........*........................................................................................................ + // uzp1 v28.8H, v21.8H, v19.8H // ........*......................................................................................................... + // uzp1 v31.8H, v22.8H, v25.8H // ...........*...................................................................................................... + // uzp2 v16.8H, v22.8H, v25.8H // ............*..................................................................................................... + // uzp2 v21.8H, v3.8H, v17.8H // ...*.............................................................................................................. 
+ // uzp1 v19.8H, v3.8H, v17.8H // ....*............................................................................................................. + // smull2 v24.4S, v31.8H, v21.8H // .............*.................................................................................................... + // ldr q3, [x7, #16] // ..........*....................................................................................................... + // ldr q6, [x8, #16] // .................................*................................................................................ + // ldr q8, [x10], #32 // ....................................*............................................................................. + // ldr q26, [x10, #-16] // .....................................*............................................................................ + // ld1 {v22.8H}, [x12], #16 // ......................................*........................................................................... + // uzp1 v30.8H, v8.8H, v26.8H // .......................................*.......................................................................... + // uzp2 v11.8H, v8.8H, v26.8H // ........................................*......................................................................... + // ldr q8, [x4], #32 // ...................*.............................................................................................. + // ldr q26, [x4, #-16] // ....................*............................................................................................. + // ldr q4, [x7], #32 // ................................*................................................................................. + // uzp1 v20.8H, v8.8H, v26.8H // ......................*........................................................................................... 
+ // uzp2 v26.8H, v8.8H, v26.8H // .......................*.......................................................................................... + // ld1 {v8.8H}, [x6], #16 // .........................................*........................................................................ + // uzp1 v9.8H, v4.8H, v3.8H // ..................................*............................................................................... + // ldr q25, [x11, #16] // ..........................................*....................................................................... + // ldr q29, [x11], #32 // .............................................*.................................................................... + // ld1 {v12.8H}, [x9], #16 // ..............................................*................................................................... + // ldr q14, [x8], #32 // .................................................*................................................................ + // smlal2 v24.4S, v16.8H, v19.8H // .................*................................................................................................ + // ld1 {v23.8H}, [x3], #16 // .....................*............................................................................................ + // smlal2 v24.4S, v20.8H, v27.8H // ........................*......................................................................................... + // uzp1 v10.8H, v29.8H, v25.8H // ...............................................*.................................................................. + // smlal2 v24.4S, v26.8H, v28.8H // ............................*..................................................................................... + // uzp2 v4.8H, v4.8H, v3.8H // ...................................*.............................................................................. 
+ // smull2 v13.4S, v31.8H, v19.8H // ...............*.................................................................................................. + // ldr q3, [x2], #32 // ..................................................*............................................................... + // uzp2 v1.8H, v29.8H, v25.8H // ................................................*................................................................. + // smlal2 v13.4S, v16.8H, v23.8H // ..........................*....................................................................................... + // ldr q17, [x2, #-16] // .....................................................................*............................................ + // smull v18.4S, v31.4H, v19.4H // ................*................................................................................................. + // smlal2 v13.4S, v20.8H, v28.8H // ..............................*................................................................................... + // smull v29.4S, v31.4H, v21.4H // ..............*................................................................................................... + // ldr q21, [x5], #32 // ..............................................................................*................................... + // smlal2 v13.4S, v26.8H, v8.8H // ...........................................*...................................................................... + // smlal v29.4S, v16.4H, v19.4H // ..................*............................................................................................... + // ldr q19, [x5, #-16] // ..................................................................................*............................... + // smlal v18.4S, v16.4H, v23.4H // ...........................*...................................................................................... 
+ // smlal v29.4S, v20.4H, v27.4H // .........................*........................................................................................ + // uzp1 v31.8H, v14.8H, v6.8H // ...................................................*.............................................................. + // uzp2 v27.8H, v21.8H, v19.8H // ....................................................................................*............................. + // smlal v18.4S, v20.4H, v28.4H // ...............................*.................................................................................. + // ldr q25, [x1, #16] // ...................................................................................*.............................. + // smlal v29.4S, v26.4H, v28.4H // .............................*.................................................................................... + // smlal v18.4S, v26.4H, v8.4H // ............................................*..................................................................... + // uzp2 v26.8H, v14.8H, v6.8H // ....................................................*............................................................. + // smlal2 v13.4S, v9.8H, v31.8H // .....................................................*............................................................ + // smlal2 v24.4S, v9.8H, v26.8H // ......................................................*........................................................... + // smlal v29.4S, v9.4H, v26.4H // .......................................................*.......................................................... + // smlal v18.4S, v9.4H, v31.4H // ........................................................*......................................................... + // smlal2 v13.4S, v4.8H, v12.8H // .........................................................*........................................................ 
+ // smlal2 v24.4S, v4.8H, v31.8H // ..........................................................*....................................................... + // smlal v29.4S, v4.4H, v31.4H // ...........................................................*...................................................... + // smlal v18.4S, v4.4H, v12.4H // ............................................................*..................................................... + // smlal2 v13.4S, v30.8H, v10.8H // .............................................................*.................................................... + // smlal2 v24.4S, v30.8H, v1.8H // ..............................................................*................................................... + // smlal v29.4S, v30.4H, v1.4H // ...............................................................*.................................................. + // smlal v18.4S, v30.4H, v10.4H // ................................................................*................................................. + // smlal2 v13.4S, v11.8H, v22.8H // .................................................................*................................................ + // smlal2 v24.4S, v11.8H, v10.8H // ..................................................................*............................................... + // smlal v29.4S, v11.4H, v10.4H // ...................................................................*.............................................. + // smlal v18.4S, v11.4H, v22.4H // ....................................................................*............................................. + // ldr q22, [x1], #32 // .......................................................................................*.......................... + // uzp1 v31.8H, v29.8H, v24.8H // ......................................................................*........................................... 
+ // uzp1 v28.8H, v21.8H, v19.8H // .....................................................................................*............................ + // mul v19.8H, v31.8H, v2.8H // ........................................................................*......................................... + // uzp1 v31.8H, v22.8H, v25.8H // ..........................................................................................*....................... + // uzp2 v16.8H, v22.8H, v25.8H // ...........................................................................................*...................... + // uzp2 v21.8H, v3.8H, v17.8H // .........................................................................*........................................ + // smlal v29.4S, v19.4H, v0.4H // ............................................................................*..................................... + // smlal2 v24.4S, v19.8H, v0.8H // .............................................................................*.................................... + // uzp1 v19.8H, v3.8H, v17.8H // ..........................................................................*....................................... + // uzp1 v26.8H, v18.8H, v13.8H // .......................................................................*.......................................... + // mul v23.8H, v26.8H, v2.8H // ...........................................................................*...................................... + // uzp2 v15.8H, v29.8H, v24.8H // ................................................................................*................................. + // smull2 v24.4S, v31.8H, v21.8H // ............................................................................................*..................... + // ldr q3, [x7, #16] // .........................................................................................*........................ 
+ // ldr q6, [x8, #16] // .............................................................................................*.................... + // ldr q8, [x10], #32 // ..............................................................................................*................... + // ldr q26, [x10, #-16] // ................................................................................................*................. + // ld1 {v22.8H}, [x12], #16 // .................................................................................................*................ + // uzp1 v30.8H, v8.8H, v26.8H // ..................................................................................................*............... + // uzp2 v11.8H, v8.8H, v26.8H // ...................................................................................................*.............. + // ldr q8, [x4], #32 // ....................................................................................................*............. + // ldr q26, [x4, #-16] // .....................................................................................................*............ + // ldr q4, [x7], #32 // ......................................................................................................*........... + // uzp1 v20.8H, v8.8H, v26.8H // .......................................................................................................*.......... + // uzp2 v26.8H, v8.8H, v26.8H // ........................................................................................................*......... + // ld1 {v8.8H}, [x6], #16 // ...........................................................................................................*...... + // uzp1 v9.8H, v4.8H, v3.8H // .........................................................................................................*........ 
+ // ldr q25, [x11, #16] // ............................................................................................................*..... + // ldr q29, [x11], #32 // .............................................................................................................*.... + // ld1 {v12.8H}, [x9], #16 // ..............................................................................................................*... + // ldr q14, [x8], #32 // ................................................................................................................*. + // smlal2 v24.4S, v16.8H, v19.8H // ...............................................................................................*.................. + // smlal2 v13.4S, v23.8H, v0.8H // ...............................................................................*.................................. + // smlal v18.4S, v23.4H, v0.4H // .................................................................................*................................ + // ld1 {v23.8H}, [x3], #16 // .................................................................................................................* + // smlal2 v24.4S, v20.8H, v27.8H // ..........................................................................................................*....... + // uzp2 v7.8H, v18.8H, v13.8H // ......................................................................................*........................... + // uzp1 v10.8H, v29.8H, v25.8H // ...............................................................................................................*.. + // zip1 v5.8H, v7.8H, v15.8H // ........................................................................................*......................... 
sub count, count, #2 -k4_loop_start: - // Instructions: 55 - // Expected cycles: 68 - // Expected IPC: 0.81 - // - // Cycle bound: 68.0 - // IPC bound: 0.81 - // - // Wall time: 13.63s - // User time: 13.63s - // - // ------------------------ cycle (expected) -------------------------> - // 0 25 50 - // |------------------------|------------------------|----------------- - ld2 {v13.8H, v14.8H}, [x4], #32 // l................................................................... - ld1 {v15.8H}, [x6], #16 // ..l................................................................. - smlal2 v29.4S, v13.8H, v16.8H // ....l............................................................... - smlal v9.4S, v13.4H, v17.4H // .....l.............................................................. - smlal2 v6.4S, v13.8H, v17.8H // ......l............................................................. - smlal v11.4S, v13.4H, v16.4H // .......l............................................................ - smlal2 v29.4S, v14.8H, v15.8H // ........l........................................................... - smlal v9.4S, v14.4H, v16.4H // .........l.......................................................... - smlal2 v6.4S, v14.8H, v16.8H // ..........l......................................................... - ld1 {v30.8H}, [x9], #16 // ...........l........................................................ - smlal v9.4S, v27.4H, v21.4H // .............l...................................................... - smlal2 v6.4S, v27.8H, v21.8H // ..............l..................................................... - smlal v11.4S, v14.4H, v15.4H // ...............l.................................................... - smlal2 v29.4S, v27.8H, v20.8H // ................l................................................... - smlal v9.4S, v28.4H, v20.4H // .................l.................................................. 
- smlal2 v6.4S, v28.8H, v20.8H // ..................l................................................. - smlal v11.4S, v27.4H, v20.4H // ...................l................................................ - smlal2 v29.4S, v28.8H, v30.8H // ....................l............................................... - smlal v9.4S, v18.4H, v26.4H // .....................l.............................................. - smlal2 v6.4S, v18.8H, v26.8H // ......................l............................................. - smlal v11.4S, v28.4H, v30.4H // .......................l............................................ - smlal2 v29.4S, v18.8H, v25.8H // ........................l........................................... - smlal v9.4S, v19.4H, v25.4H // .........................l.......................................... - smlal2 v6.4S, v19.8H, v25.8H // ..........................l......................................... - ld1 {v23.8H}, [x12], #16 // ...........................l........................................ - smlal v11.4S, v18.4H, v25.4H // .............................l...................................... - uzp1 v14.8H, v9.8H, v6.8H // ..............................l..................................... - smlal2 v29.4S, v19.8H, v23.8H // ...............................l.................................... - mul v27.8H, v14.8H, v2.H[2] // ................................l................................... - smlal v11.4S, v19.4H, v23.4H // .................................l.................................. - ld2 {v4.8H, v5.8H}, [x2], #32 // ..................................*................................. - smlal v9.4S, v27.4H, v0.4H // ....................................l............................... - smlal2 v6.4S, v27.8H, v0.8H // .....................................l.............................. - ld2 {v27.8H, v28.8H}, [x7], #32 // ......................................*............................. 
- uzp1 v13.8H, v11.8H, v29.8H // ........................................l........................... - uzp2 v15.8H, v9.8H, v6.8H // .........................................l.......................... - mul v13.8H, v13.8H, v2.H[2] // ..........................................l......................... - ld2 {v18.8H, v19.8H}, [x10], #32 // ...........................................*........................ - smull v9.4S, v7.4H, v5.4H // .............................................*...................... - smlal v11.4S, v13.4H, v0.4H // ..............................................l..................... - smlal2 v29.4S, v13.8H, v0.8H // ...............................................l.................... - ld2 {v20.8H, v21.8H}, [x8], #32 // ................................................*................... - smlal v9.4S, v8.4H, v4.4H // ..................................................*................. - uzp2 v14.8H, v11.8H, v29.8H // ...................................................l................ - smull2 v29.4S, v7.8H, v4.8H // ....................................................*............... - smull2 v6.4S, v7.8H, v5.8H // .....................................................*.............. - ld2 {v25.8H, v26.8H}, [x11], #32 // ......................................................*............. - smull v11.4S, v7.4H, v4.4H // ........................................................*........... - smlal2 v6.4S, v8.8H, v4.8H // .........................................................*.......... - smlal2 v29.4S, v8.8H, v3.8H // ..........................................................*......... - ld2 {v16.8H, v17.8H}, [x5], #32 // ...........................................................*........ - smlal v11.4S, v8.4H, v3.4H // .............................................................*...... - ld2 {v7.8H, v8.8H}, [x1], #32 // ..............................................................e..... 
- ld1 {v3.8H}, [x3], #16 // ................................................................e... - st2 {v14.8H, v15.8H}, [x0], #32 // ..................................................................l. - - // ------------------------------------------------------------- cycle (expected) -------------------------------------------------------------> - // 0 25 50 75 100 125 - // |------------------------|------------------------|------------------------|------------------------|------------------------|--------------- - // ld2 {v3.8h, v4.8h}, [x1], #32 // e.....'.............................................................~.....'.............................................................~.... - // ld2 {v5.8h, v6.8h}, [x2], #32 // ......'.................................*.................................'.................................~................................ - // ld1 {v7.8h}, [x3], #16 // ..e...'...............................................................~...'...............................................................~.. - // smull v8.4s, v3.4h, v5.4h // ......'.......................................................*...........'.......................................................~.......... - // smull2 v10.4s, v3.8h, v5.8h // ......'...................................................*...............'...................................................~.............. - // smlal v8.4s, v4.4h, v7.4h // ......'............................................................*......'............................................................~..... - // smlal2 v10.4s, v4.8h, v7.8h // ......'.........................................................*.........'.........................................................~........ - // smull v9.4s, v3.4h, v6.4h // ......'............................................*......................'............................................~..................... 
- // smull2 v11.4s, v3.8h, v6.8h // ......'....................................................*..............'....................................................~............. - // smlal v9.4s, v4.4h, v5.4h // ......'.................................................*.................'.................................................~................ - // smlal2 v11.4s, v4.8h, v5.8h // ......'........................................................*..........'........................................................~......... - // ld2 {v3.8h, v4.8h}, [x4], #32 // ......~...................................................................l.................................................................. - // ld2 {v5.8h, v6.8h}, [x5], #32 // ......'..........................................................*........'..........................................................~....... - // ld1 {v7.8h}, [x6], #16 // ......'.~.................................................................'.l................................................................ - // smlal v8.4s, v3.4h, v5.4h // ......'......~............................................................'......l........................................................... - // smlal2 v10.4s, v3.8h, v5.8h // ......'...~...............................................................'...l.............................................................. - // smlal v8.4s, v4.4h, v7.4h // ......'..............~....................................................'..............l................................................... - // smlal2 v10.4s, v4.8h, v7.8h // ......'.......~...........................................................'.......l.......................................................... - // smlal v9.4s, v3.4h, v6.4h // ......'....~..............................................................'....l............................................................. 
- // smlal2 v11.4s, v3.8h, v6.8h // ......'.....~.............................................................'.....l............................................................ - // smlal v9.4s, v4.4h, v5.4h // ......'........~..........................................................'........l......................................................... - // smlal2 v11.4s, v4.8h, v5.8h // ......'.........~.........................................................'.........l........................................................ - // ld2 {v3.8h, v4.8h}, [x7], #32 // ......'.....................................*.............................'.....................................~............................ - // ld2 {v5.8h, v6.8h}, [x8], #32 // ......'...............................................*...................'...............................................~.................. - // ld1 {v7.8h}, [x9], #16 // ......'..........~........................................................'..........l....................................................... - // smlal v8.4s, v3.4h, v5.4h // ......'..................~................................................'..................l............................................... - // smlal2 v10.4s, v3.8h, v5.8h // ......'...............~...................................................'...............l.................................................. - // smlal v8.4s, v4.4h, v7.4h // ......'......................~............................................'......................l........................................... - // smlal2 v10.4s, v4.8h, v7.8h // ......'...................~...............................................'...................l.............................................. - // smlal v9.4s, v3.4h, v6.4h // ......'............~......................................................'............l..................................................... 
- // smlal2 v11.4s, v3.8h, v6.8h // ......'.............~.....................................................'.............l.................................................... - // smlal v9.4s, v4.4h, v5.4h // ......'................~..................................................'................l................................................. - // smlal2 v11.4s, v4.8h, v5.8h // ......'.................~.................................................'.................l................................................ - // ld2 {v3.8h, v4.8h}, [x10], #32 // ......'..........................................*........................'..........................................~....................... - // ld2 {v5.8h, v6.8h}, [x11], #32 // ......'.....................................................*.............'.....................................................~............ - // ld1 {v7.8h}, [x12], #16 // ......'..........................~........................................'..........................l....................................... - // smlal v8.4s, v3.4h, v5.4h // ......'............................~......................................'............................l..................................... - // smlal2 v10.4s, v3.8h, v5.8h // ......'.......................~...........................................'.......................l.......................................... - // smlal v8.4s, v4.4h, v7.4h // ......'................................~..................................'................................l................................. - // smlal2 v10.4s, v4.8h, v7.8h // ......'..............................~....................................'..............................l................................... - // smlal v9.4s, v3.4h, v6.4h // ......'....................~..............................................'....................l............................................. 
- // smlal2 v11.4s, v3.8h, v6.8h // ......'.....................~.............................................'.....................l............................................ - // smlal v9.4s, v4.4h, v5.4h // ......'........................~..........................................'........................l......................................... - // smlal2 v11.4s, v4.8h, v5.8h // ......'.........................~.........................................'.........................l........................................ - // uzp1 v28.8h, v8.8h, v10.8h // ......'.......................................~...........................'.......................................l.......................... - // mul v28.8h, v28.8h, v2.h[2] // ......'.........................................~.........................'.........................................l........................ - // smlal v8.4s, v28.4h, v0.4h // ......'.............................................~.....................'.............................................l.................... - // smlal2 v10.4s, v28.8h, v0.8h // ......'..............................................~....................'..............................................l................... - // uzp2 v26.8h, v8.8h, v10.8h // ......'..................................................~................'..................................................l............... - // uzp1 v28.8h, v9.8h, v11.8h // ......'.............................~.....................................'.............................l.................................... - // mul v28.8h, v28.8h, v2.h[2] // ......'...............................~...................................'...............................l.................................. - // smlal v9.4s, v28.4h, v0.4h // ......'...................................~...............................'...................................l.............................. 
- // smlal2 v11.4s, v28.8h, v0.8h // ......'....................................~..............................'....................................l............................. - // uzp2 v27.8h, v9.8h, v11.8h // ......'........................................~..........................'........................................l......................... - // st2 {v26.8h, v27.8h}, [x0], #32 // ....~.'.................................................................~.'.................................................................l +1: + // Instructions: 82 + // Expected cycles: 102 + // Expected IPC: 0.80 + // + // Cycle bound: 102.0 + // IPC bound: 0.80 + // + // Wall time: 15.93s + // User time: 15.93s + // + // ------------------------------- original position -------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|------ + smlal2 v24.4S, v26.8H, v28.8H // .................................*................................................ + uzp2 v4.8H, v4.8H, v3.8H // .....................................*............................................ + smull2 v13.4S, v31.8H, v19.8H // ..........*....................................................................... + ldr q3, [x2], #32 // ....e............................................................................. + uzp2 v1.8H, v29.8H, v25.8H // ..........................................................*....................... + smlal2 v13.4S, v16.8H, v23.8H // ............*..................................................................... + ldr q17, [x2, #-16] // .....e............................................................................ + smull v18.4S, v31.4H, v19.4H // .........*........................................................................ + smlal2 v13.4S, v20.8H, v28.8H // ...........................*...................................................... 
+ smull v29.4S, v31.4H, v21.4H // .............*.................................................................... + ldr q21, [x5], #32 // .....................e............................................................ + smlal2 v13.4S, v26.8H, v8.8H // .............................*.................................................... + smlal v29.4S, v16.4H, v19.4H // ...............*.................................................................. + ldr q19, [x5, #-16] // ......................e........................................................... + smlal v18.4S, v16.4H, v23.4H // ...........*...................................................................... + smlal v29.4S, v20.4H, v27.4H // ..............................*................................................... + uzp1 v31.8H, v14.8H, v6.8H // ........................................*......................................... + uzp2 v27.8H, v21.8H, v19.8H // ........................e......................................................... + smlal v18.4S, v20.4H, v28.4H // ..........................*....................................................... + ldr q25, [x1, #16] // .e................................................................................ + smlal v29.4S, v26.4H, v28.4H // ................................*................................................. + smlal v18.4S, v26.4H, v8.4H // ............................*..................................................... + uzp2 v26.8H, v14.8H, v6.8H // .........................................*........................................ + smlal2 v13.4S, v9.8H, v31.8H // ............................................*..................................... + smlal2 v24.4S, v9.8H, v26.8H // ................................................*................................. + smlal v29.4S, v9.4H, v26.4H // ...............................................*.................................. 
+ smlal v18.4S, v9.4H, v31.4H // ...........................................*...................................... + smlal2 v13.4S, v4.8H, v12.8H // ..............................................*................................... + smlal2 v24.4S, v4.8H, v31.8H // ..................................................*............................... + smlal v29.4S, v4.4H, v31.4H // .................................................*................................ + smlal v18.4S, v4.4H, v12.4H // .............................................*.................................... + smlal2 v13.4S, v30.8H, v10.8H // .............................................................*.................... + smlal2 v24.4S, v30.8H, v1.8H // .................................................................*................ + smlal v29.4S, v30.4H, v1.4H // ................................................................*................. + smlal v18.4S, v30.4H, v10.4H // ............................................................*..................... + smlal2 v13.4S, v11.8H, v22.8H // ...............................................................*.................. + smlal2 v24.4S, v11.8H, v10.8H // ...................................................................*.............. + smlal v29.4S, v11.4H, v10.4H // ..................................................................*............... + smlal v18.4S, v11.4H, v22.4H // ..............................................................*................... + ldr q22, [x1], #32 // e................................................................................. + uzp1 v31.8H, v29.8H, v24.8H // .........................................................................*........ + uzp1 v28.8H, v21.8H, v19.8H // .......................e.......................................................... + mul v19.8H, v31.8H, v2.8H // ..........................................................................*....... 
+ uzp1 v31.8H, v22.8H, v25.8H // ..e............................................................................... + uzp2 v16.8H, v22.8H, v25.8H // ...e.............................................................................. + uzp2 v21.8H, v3.8H, v17.8H // .......e.......................................................................... + smlal v29.4S, v19.4H, v0.4H // ...........................................................................*...... + smlal2 v24.4S, v19.8H, v0.8H // ............................................................................*..... + uzp1 v19.8H, v3.8H, v17.8H // ......e........................................................................... + uzp1 v26.8H, v18.8H, v13.8H // ....................................................................*............. + zip2 v14.8H, v7.8H, v15.8H // ...............................................................................l.. + mul v23.8H, v26.8H, v2.8H // .....................................................................*............ + uzp2 v15.8H, v29.8H, v24.8H // .............................................................................*.... + smull2 v24.4S, v31.8H, v21.8H // ..............e................................................................... + str q14, [x0, #16] // .................................................................................l + ldr q3, [x7, #16] // ...................................e.............................................. + ldr q6, [x8, #16] // .......................................e.......................................... + ldr q8, [x10], #32 // ...................................................e.............................. + ldr q26, [x10, #-16] // ....................................................e............................. + ld1 {v22.8H}, [x12], #16 // ...........................................................e...................... 
+ uzp1 v30.8H, v8.8H, v26.8H // .....................................................e............................ + uzp2 v11.8H, v8.8H, v26.8H // ......................................................e........................... + ldr q8, [x4], #32 // .................e................................................................ + ldr q26, [x4, #-16] // ..................e............................................................... + ldr q4, [x7], #32 // ..................................e............................................... + uzp1 v20.8H, v8.8H, v26.8H // ...................e.............................................................. + uzp2 v26.8H, v8.8H, v26.8H // ....................e............................................................. + ld1 {v8.8H}, [x6], #16 // .........................e........................................................ + uzp1 v9.8H, v4.8H, v3.8H // ....................................e............................................. + ldr q25, [x11, #16] // ........................................................e......................... + ldr q29, [x11], #32 // .......................................................e.......................... + ld1 {v12.8H}, [x9], #16 // ..........................................e....................................... + ldr q14, [x8], #32 // ......................................e........................................... + smlal2 v24.4S, v16.8H, v19.8H // ................e................................................................. + smlal2 v13.4S, v23.8H, v0.8H // .......................................................................*.......... + smlal v18.4S, v23.4H, v0.4H // ......................................................................*........... + ld1 {v23.8H}, [x3], #16 // ........e......................................................................... 
+ smlal2 v24.4S, v20.8H, v27.8H // ...............................e.................................................. + uzp2 v7.8H, v18.8H, v13.8H // ........................................................................*......... + uzp1 v10.8H, v29.8H, v25.8H // .........................................................e........................ + str q5, [x0], #32 // ................................................................................l. + zip1 v5.8H, v7.8H, v15.8H // ..............................................................................*... + + // ----------------------------------------------------------------------------------------------------------------- new position ------------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---------------- + // ldr q12, [x1], #32 // ....................................e..........................................'......................................~..........................................'......................................~......................................... + // ldr q13, [x1, #-16] // ................e..............................................................'..................~..............................................................'..................~............................................................. + // uzp1 v3.8h, v12.8h, v13.8h // ........................................e......................................'..........................................~......................................'..........................................~..................................... 
+ // uzp2 v4.8h, v12.8h, v13.8h // .........................................e.....................................'...........................................~.....................................'...........................................~.................................... + // ldr q12, [x2], #32 // e..............................................................................'..~..............................................................................'..~............................................................................. + // ldr q13, [x2, #-16] // ...e...........................................................................'.....~...........................................................................'.....~.......................................................................... + // uzp1 v5.8h, v12.8h, v13.8h // .............................................e.................................'...............................................~.................................'...............................................~................................ + // uzp2 v6.8h, v12.8h, v13.8h // ..........................................e....................................'............................................~....................................'............................................~................................... + // ld1 {v7.8h}, [x3], #16 // .........................................................................e.....'...........................................................................~.....'...........................................................................~.... + // smull v8.4s, v3.4h, v5.4h // ....~..........................................................................'......*..........................................................................'......~......................................................................... 
+ // smull2 v10.4s, v3.8h, v5.8h // ...............................................................................'.*...............................................................................'.~.............................................................................. + // smlal v8.4s, v4.4h, v7.4h // ...........~...................................................................'.............*...................................................................'.............~.................................................................. + // smlal2 v10.4s, v4.8h, v7.8h // ..~............................................................................'....*............................................................................'....~........................................................................... + // smull v9.4s, v3.4h, v6.4h // ......~........................................................................'........*........................................................................'........~....................................................................... + // smull2 v11.4s, v3.8h, v6.8h // ..................................................e............................'....................................................~............................'....................................................~........................... + // smlal v9.4s, v4.4h, v5.4h // .........~.....................................................................'...........*.....................................................................'...........~.................................................................... + // smlal2 v11.4s, v4.8h, v5.8h // ......................................................................e........'........................................................................~........'........................................................................~....... 
+ // ldr q12, [x4], #32 // ...........................................................e...................'.............................................................~...................'.............................................................~.................. + // ldr q13, [x4, #-16] // ............................................................e..................'..............................................................~..................'..............................................................~................. + // uzp1 v3.8h, v12.8h, v13.8h // ..............................................................e................'................................................................~................'................................................................~............... + // uzp2 v4.8h, v12.8h, v13.8h // ...............................................................e...............'.................................................................~...............'.................................................................~.............. + // ldr q12, [x5], #32 // .......e.......................................................................'.........~.......................................................................'.........~...................................................................... + // ldr q13, [x5, #-16] // ..........e....................................................................'............~....................................................................'............~................................................................... + // uzp1 v5.8h, v12.8h, v13.8h // ......................................e........................................'........................................~........................................'........................................~....................................... 
+ // uzp2 v6.8h, v12.8h, v13.8h // ..............e................................................................'................~................................................................'................~............................................................... + // ld1 {v7.8h}, [x6], #16 // ................................................................e..............'..................................................................~..............'..................................................................~............. + // smlal v8.4s, v3.4h, v5.4h // ...............~...............................................................'.................*...............................................................'.................~.............................................................. + // smlal2 v10.4s, v3.8h, v5.8h // .....~.........................................................................'.......*.........................................................................'.......~........................................................................ + // smlal v8.4s, v4.4h, v7.4h // ..................~............................................................'....................*............................................................'....................~........................................................... + // smlal2 v10.4s, v4.8h, v7.8h // ........~......................................................................'..........*......................................................................'..........~..................................................................... + // smlal v9.4s, v3.4h, v6.4h // ............~..................................................................'..............*..................................................................'..............~................................................................. 
+ // smlal2 v11.4s, v3.8h, v6.8h // ..........................................................................e....'............................................................................~....'............................................................................~... + // smlal v9.4s, v4.4h, v5.4h // .................~.............................................................'...................*.............................................................'...................~............................................................ + // smlal2 v11.4s, v4.8h, v5.8h // ...............................................................................*.................................................................................~................................................................................ + // ldr q12, [x7], #32 // .............................................................e.................'...............................................................~.................'...............................................................~................ + // ldr q13, [x7, #-16] // ....................................................e..........................'......................................................~..........................'......................................................~......................... + // uzp1 v3.8h, v12.8h, v13.8h // .................................................................e.............'...................................................................~.............'...................................................................~............ + // uzp2 v4.8h, v12.8h, v13.8h // ...............................................................................'*................................................................................'~............................................................................... 
+ // ldr q12, [x8], #32 // .....................................................................e.........'.......................................................................~.........'.......................................................................~........ + // ldr q13, [x8, #-16] // .....................................................e.........................'.......................................................~.........................'.......................................................~........................ + // uzp1 v5.8h, v12.8h, v13.8h // .............~.................................................................'...............*.................................................................'...............~................................................................ + // uzp2 v6.8h, v12.8h, v13.8h // ...................~...........................................................'.....................*...........................................................'.....................~.......................................................... + // ld1 {v7.8h}, [x9], #16 // ....................................................................e..........'......................................................................~..........'......................................................................~......... + // smlal v8.4s, v3.4h, v5.4h // .......................~.......................................................'.........................*.......................................................'.........................~...................................................... + // smlal2 v10.4s, v3.8h, v5.8h // ....................~..........................................................'......................*..........................................................'......................~......................................................... 
+ // smlal v8.4s, v4.4h, v7.4h // ...........................~...................................................'.............................*...................................................'.............................~.................................................. + // smlal2 v10.4s, v4.8h, v7.8h // ........................~......................................................'..........................*......................................................'..........................~..................................................... + // smlal v9.4s, v3.4h, v6.4h // ......................~........................................................'........................*........................................................'........................~....................................................... + // smlal2 v11.4s, v3.8h, v6.8h // .....................~.........................................................'.......................*.........................................................'.......................~........................................................ + // smlal v9.4s, v4.4h, v5.4h // ..........................~....................................................'............................*....................................................'............................~................................................... + // smlal2 v11.4s, v4.8h, v5.8h // .........................~.....................................................'...........................*.....................................................'...........................~.................................................... + // ldr q12, [x10], #32 // ......................................................e........................'........................................................~........................'........................................................~....................... 
+ // ldr q13, [x10, #-16] // .......................................................e.......................'.........................................................~.......................'.........................................................~...................... + // uzp1 v3.8h, v12.8h, v13.8h // .........................................................e.....................'...........................................................~.....................'...........................................................~.................... + // uzp2 v4.8h, v12.8h, v13.8h // ..........................................................e....................'............................................................~....................'............................................................~................... + // ldr q12, [x11], #32 // ...................................................................e...........'.....................................................................~...........'.....................................................................~.......... + // ldr q13, [x11, #-16] // ..................................................................e............'....................................................................~............'....................................................................~........... + // uzp1 v5.8h, v12.8h, v13.8h // ............................................................................e..'..............................................................................~..'..............................................................................~. + // uzp2 v6.8h, v12.8h, v13.8h // .~.............................................................................'...*.............................................................................'...~............................................................................ 
+ // ld1 {v7.8h}, [x12], #16 // ........................................................e......................'..........................................................~......................'..........................................................~..................... + // smlal v8.4s, v3.4h, v5.4h // ...............................~...............................................'.................................*...............................................'.................................~.............................................. + // smlal2 v10.4s, v3.8h, v5.8h // ............................~..................................................'..............................*..................................................'..............................~................................................. + // smlal v8.4s, v4.4h, v7.4h // ...................................~...........................................'.....................................*...........................................'.....................................~.......................................... + // smlal2 v10.4s, v4.8h, v7.8h // ................................~..............................................'..................................*..............................................'..................................~............................................. + // smlal v9.4s, v3.4h, v6.4h // ..............................~................................................'................................*................................................'................................~............................................... + // smlal2 v11.4s, v3.8h, v6.8h // .............................~.................................................'...............................*.................................................'...............................~................................................ 
+ // smlal v9.4s, v4.4h, v5.4h // ..................................~............................................'....................................*............................................'....................................~........................................... + // smlal2 v11.4s, v4.8h, v5.8h // .................................~.............................................'...................................*.............................................'...................................~............................................ + // uzp1 v28.8h, v8.8h, v10.8h // ..............................................~................................'................................................*................................'................................................~............................... + // mul v28.8h, v28.8h, v2.8h // ................................................~..............................'..................................................*..............................'..................................................~............................. + // smlal v8.4s, v28.4h, v0.4h // ........................................................................~......'..........................................................................*......'..........................................................................~..... + // smlal2 v10.4s, v28.8h, v0.8h // .......................................................................~.......'.........................................................................*.......'.........................................................................~...... + // uzp2 v26.8h, v8.8h, v10.8h // ...........................................................................~...'.............................................................................*...'.............................................................................~.. 
+ // uzp1 v28.8h, v9.8h, v11.8h // .....................................~.........................................'.......................................*.........................................'.......................................~........................................ + // mul v28.8h, v28.8h, v2.8h // .......................................~.......................................'.........................................*.......................................'.........................................~...................................... + // smlal v9.4s, v28.4h, v0.4h // ...........................................~...................................'.............................................*...................................'.............................................~.................................. + // smlal2 v11.4s, v28.8h, v0.8h // ............................................~..................................'..............................................*..................................'..............................................~................................. + // uzp2 v27.8h, v9.8h, v11.8h // .................................................~.............................'...................................................*.............................'...................................................~............................ + // zip1 v12.8h, v26.8h, v27.8h // ..............................................................................~'................................................................................*'................................................................................ + // zip2 v13.8h, v26.8h, v27.8h // ...............................................~...............................'.................................................~...............................'.................................................l.............................. 
+ // str q12, [x0], #32 // .............................................................................~.'...............................................................................~.'...............................................................................l + // str q13, [x0, #-16] // ...................................................~...........................'.....................................................~...........................'.....................................................l.......................... sub count, count, #1 - cbnz count, k4_loop_start - // Instructions: 92 - // Expected cycles: 107 - // Expected IPC: 0.86 - // - // Cycle bound: 107.0 - // IPC bound: 0.86 - // - // Wall time: 15.26s - // User time: 15.26s - // - // -------------------------------------------- cycle (expected) --------------------------------------------> - // 0 25 50 75 100 - // |------------------------|------------------------|------------------------|------------------------|------ - ld2 {v14.8H, v15.8H}, [x4], #32 // *.......................................................................................................... - ld1 {v1.8H}, [x6], #16 // ..*........................................................................................................ - smlal2 v29.4S, v14.8H, v16.8H // ....*...................................................................................................... - smlal v11.4S, v14.4H, v16.4H // .....*..................................................................................................... - ld2 {v12.8H, v13.8H}, [x5], #32 // ......*.................................................................................................... - smlal2 v29.4S, v15.8H, v1.8H // ........*.................................................................................................. 
- smlal v11.4S, v15.4H, v1.4H // .........*................................................................................................. - ld1 {v10.8H}, [x9], #16 // ..........*................................................................................................ - smlal2 v29.4S, v27.8H, v20.8H // ............*.............................................................................................. - smlal v11.4S, v27.4H, v20.4H // .............*............................................................................................. - ld2 {v22.8H, v23.8H}, [x2], #32 // ..............*............................................................................................ - smlal2 v29.4S, v28.8H, v10.8H // ................*.......................................................................................... - smlal v11.4S, v28.4H, v10.4H // .................*......................................................................................... - ld1 {v31.8H}, [x12], #16 // ..................*........................................................................................ - smlal2 v29.4S, v18.8H, v25.8H // ....................*...................................................................................... - smlal v11.4S, v18.4H, v25.4H // .....................*..................................................................................... - smull2 v1.4S, v7.8H, v23.8H // ......................*.................................................................................... - smlal v9.4S, v14.4H, v17.4H // .......................*................................................................................... - smlal2 v29.4S, v19.8H, v31.8H // ........................*.................................................................................. - smlal v11.4S, v19.4H, v31.4H // .........................*................................................................................. 
- smull v31.4S, v7.4H, v23.4H // ..........................*................................................................................ - smlal v9.4S, v15.4H, v16.4H // ...........................*............................................................................... - smull v10.4S, v7.4H, v22.4H // ............................*.............................................................................. - smlal2 v1.4S, v8.8H, v22.8H // .............................*............................................................................. - smlal v31.4S, v8.4H, v22.4H // ..............................*............................................................................ - ld2 {v4.8H, v5.8H}, [x4], #32 // ...............................*........................................................................... - ld2 {v23.8H, v24.8H}, [x11], #32 // .................................*......................................................................... - smlal2 v1.4S, v4.8H, v13.8H // ...................................*....................................................................... - smlal v31.4S, v4.4H, v13.4H // ....................................*...................................................................... - ld1 {v30.8H}, [x6], #16 // .....................................*..................................................................... - smlal2 v1.4S, v5.8H, v12.8H // .......................................*................................................................... - smlal v10.4S, v8.4H, v3.4H // ........................................*.................................................................. - smull2 v7.4S, v7.8H, v22.8H // .........................................*................................................................. - uzp1 v22.8H, v11.8H, v29.8H // ..........................................*................................................................ 
- smlal2 v6.4S, v14.8H, v17.8H // ...........................................*............................................................... - smlal v10.4S, v4.4H, v12.4H // ............................................*.............................................................. - smlal2 v7.4S, v8.8H, v3.8H // .............................................*............................................................. - ld2 {v13.8H, v14.8H}, [x7], #32 // ..............................................*............................................................ - mul v17.8H, v22.8H, v2.H[2] // ................................................*.......................................................... - smlal2 v7.4S, v4.8H, v12.8H // .................................................*......................................................... - smlal v10.4S, v5.4H, v30.4H // ..................................................*........................................................ - smlal2 v6.4S, v15.8H, v16.8H // ...................................................*....................................................... - smlal v11.4S, v17.4H, v0.4H // ....................................................*...................................................... - ld2 {v3.8H, v4.8H}, [x8], #32 // .....................................................*..................................................... - smlal2 v7.4S, v5.8H, v30.8H // .......................................................*................................................... - ld1 {v30.8H}, [x9], #16 // ........................................................*.................................................. - smlal v10.4S, v13.4H, v3.4H // ..........................................................*................................................ - smlal2 v7.4S, v13.8H, v3.8H // ...........................................................*............................................... 
- ld2 {v15.8H, v16.8H}, [x10], #32 // ............................................................*.............................................. - smlal v10.4S, v14.4H, v30.4H // ..............................................................*............................................ - smlal v9.4S, v27.4H, v21.4H // ...............................................................*........................................... - smlal2 v7.4S, v14.8H, v30.8H // ................................................................*.......................................... - smlal2 v6.4S, v27.8H, v21.8H // .................................................................*......................................... - smlal v10.4S, v15.4H, v23.4H // ..................................................................*........................................ - ld1 {v27.8H}, [x12], #16 // ...................................................................*....................................... - smlal2 v7.4S, v15.8H, v23.8H // .....................................................................*..................................... - smlal v31.4S, v5.4H, v12.4H // ......................................................................*.................................... - smlal2 v6.4S, v28.8H, v20.8H // .......................................................................*................................... - smlal v10.4S, v16.4H, v27.4H // ........................................................................*.................................. - smlal2 v7.4S, v16.8H, v27.8H // .........................................................................*................................. - smlal v31.4S, v13.4H, v4.4H // ..........................................................................*................................ - smlal2 v6.4S, v18.8H, v26.8H // ...........................................................................*............................... 
- smlal2 v1.4S, v13.8H, v4.8H // ............................................................................*.............................. - smlal v9.4S, v28.4H, v20.4H // .............................................................................*............................. - smlal v31.4S, v14.4H, v3.4H // ..............................................................................*............................ - uzp1 v13.8H, v10.8H, v7.8H // ...............................................................................*........................... - smlal2 v1.4S, v14.8H, v3.8H // ................................................................................*.......................... - smlal v9.4S, v18.4H, v26.4H // .................................................................................*......................... - smlal v31.4S, v15.4H, v24.4H // ..................................................................................*........................ - smlal2 v6.4S, v19.8H, v25.8H // ...................................................................................*....................... - smlal2 v1.4S, v15.8H, v24.8H // ....................................................................................*...................... - smlal v9.4S, v19.4H, v25.4H // .....................................................................................*..................... - smlal v31.4S, v16.4H, v23.4H // ......................................................................................*.................... - mul v13.8H, v13.8H, v2.H[2] // .......................................................................................*................... - smlal2 v1.4S, v16.8H, v23.8H // ........................................................................................*.................. - uzp1 v23.8H, v9.8H, v6.8H // .........................................................................................*................. 
- smlal2 v29.4S, v17.8H, v0.8H // ..........................................................................................*................ - mul v12.8H, v23.8H, v2.H[2] // ...........................................................................................*............... - uzp1 v4.8H, v31.8H, v1.8H // ............................................................................................*.............. - smlal v10.4S, v13.4H, v0.4H // .............................................................................................*............. - mul v23.8H, v4.8H, v2.H[2] // ..............................................................................................*............ - smlal v9.4S, v12.4H, v0.4H // ...............................................................................................*........... - smlal2 v7.4S, v13.8H, v0.8H // ................................................................................................*.......... - smlal2 v6.4S, v12.8H, v0.8H // .................................................................................................*......... - smlal v31.4S, v23.4H, v0.4H // ..................................................................................................*........ - smlal2 v1.4S, v23.8H, v0.8H // ...................................................................................................*....... - uzp2 v12.8H, v10.8H, v7.8H // ....................................................................................................*...... - uzp2 v25.8H, v9.8H, v6.8H // .....................................................................................................*..... - uzp2 v24.8H, v11.8H, v29.8H // ......................................................................................................*.... - uzp2 v13.8H, v31.8H, v1.8H // .......................................................................................................*... 
- st2 {v24.8H, v25.8H}, [x0], #32 // ........................................................................................................*.. - st2 {v12.8H, v13.8H}, [x0], #32 // ..........................................................................................................* - - // -------------------------------------------- cycle (expected) --------------------------------------------> - // 0 25 50 75 100 - // |------------------------|------------------------|------------------------|------------------------|------ - // ld2 {v13.8H, v14.8H}, [x4], #32 // *.......................................................................................................... - // ld1 {v15.8H}, [x6], #16 // ..*........................................................................................................ - // smlal2 v29.4S, v13.8H, v16.8H // ....*...................................................................................................... - // smlal v9.4S, v13.4H, v17.4H // .......................*................................................................................... - // smlal2 v6.4S, v13.8H, v17.8H // ...........................................*............................................................... - // smlal v11.4S, v13.4H, v16.4H // .....*..................................................................................................... - // smlal2 v29.4S, v14.8H, v15.8H // ........*.................................................................................................. - // smlal v9.4S, v14.4H, v16.4H // ...........................*............................................................................... - // smlal2 v6.4S, v14.8H, v16.8H // ...................................................*....................................................... - // ld1 {v30.8H}, [x9], #16 // ..........*................................................................................................ 
- // smlal v9.4S, v27.4H, v21.4H // ...............................................................*........................................... - // smlal2 v6.4S, v27.8H, v21.8H // .................................................................*......................................... - // smlal v11.4S, v14.4H, v15.4H // .........*................................................................................................. - // smlal2 v29.4S, v27.8H, v20.8H // ............*.............................................................................................. - // smlal v9.4S, v28.4H, v20.4H // .............................................................................*............................. - // smlal2 v6.4S, v28.8H, v20.8H // .......................................................................*................................... - // smlal v11.4S, v27.4H, v20.4H // .............*............................................................................................. - // smlal2 v29.4S, v28.8H, v30.8H // ................*.......................................................................................... - // smlal v9.4S, v18.4H, v26.4H // .................................................................................*......................... - // smlal2 v6.4S, v18.8H, v26.8H // ...........................................................................*............................... - // smlal v11.4S, v28.4H, v30.4H // .................*......................................................................................... - // smlal2 v29.4S, v18.8H, v25.8H // ....................*...................................................................................... - // smlal v9.4S, v19.4H, v25.4H // .....................................................................................*..................... 
- // smlal2 v6.4S, v19.8H, v25.8H // ...................................................................................*....................... - // ld1 {v23.8H}, [x12], #16 // ..................*........................................................................................ - // smlal v11.4S, v18.4H, v25.4H // .....................*..................................................................................... - // uzp1 v14.8H, v9.8H, v6.8H // .........................................................................................*................. - // smlal2 v29.4S, v19.8H, v23.8H // ........................*.................................................................................. - // mul v27.8H, v14.8H, v2.H[2] // ...........................................................................................*............... - // smlal v11.4S, v19.4H, v23.4H // .........................*................................................................................. - // ld2 {v4.8H, v5.8H}, [x2], #32 // ..............*............................................................................................ - // smlal v9.4S, v27.4H, v0.4H // ...............................................................................................*........... - // smlal2 v6.4S, v27.8H, v0.8H // .................................................................................................*......... - // ld2 {v27.8H, v28.8H}, [x7], #32 // ..............................................*............................................................ - // uzp1 v13.8H, v11.8H, v29.8H // ..........................................*................................................................ - // uzp2 v15.8H, v9.8H, v6.8H // .....................................................................................................*..... 
- // mul v13.8H, v13.8H, v2.H[2] // ................................................*.......................................................... - // ld2 {v18.8H, v19.8H}, [x10], #32 // ............................................................*.............................................. - // smull v9.4S, v7.4H, v5.4H // ..........................*................................................................................ - // smlal v11.4S, v13.4H, v0.4H // ....................................................*...................................................... - // smlal2 v29.4S, v13.8H, v0.8H // ..........................................................................................*................ - // ld2 {v20.8H, v21.8H}, [x8], #32 // .....................................................*..................................................... - // smlal v9.4S, v8.4H, v4.4H // ..............................*............................................................................ - // uzp2 v14.8H, v11.8H, v29.8H // ......................................................................................................*.... - // smull2 v29.4S, v7.8H, v4.8H // .........................................*................................................................. - // smull2 v6.4S, v7.8H, v5.8H // ......................*.................................................................................... - // ld2 {v25.8H, v26.8H}, [x11], #32 // .................................*......................................................................... - // smull v11.4S, v7.4H, v4.4H // ............................*.............................................................................. - // smlal2 v6.4S, v8.8H, v4.8H // .............................*............................................................................. 
- // smlal2 v29.4S, v8.8H, v3.8H // .............................................*............................................................. - // ld2 {v16.8H, v17.8H}, [x5], #32 // ......*.................................................................................................... - // smlal v11.4S, v8.4H, v3.4H // ........................................*.................................................................. - // st2 {v14.8H, v15.8H}, [x0], #32 // ........................................................................................................*.. - // ld2 {v13.8H, v14.8H}, [x4], #32 // ...............................*........................................................................... - // ld1 {v15.8H}, [x6], #16 // .....................................*..................................................................... - // smlal2 v29.4S, v13.8H, v16.8H // .................................................*......................................................... - // smlal v9.4S, v13.4H, v17.4H // ....................................*...................................................................... - // smlal2 v6.4S, v13.8H, v17.8H // ...................................*....................................................................... - // smlal v11.4S, v13.4H, v16.4H // ............................................*.............................................................. - // smlal2 v29.4S, v14.8H, v15.8H // .......................................................*................................................... - // smlal v9.4S, v14.4H, v16.4H // ......................................................................*.................................... - // smlal2 v6.4S, v14.8H, v16.8H // .......................................*................................................................... 
- // ld1 {v30.8H}, [x9], #16 // ........................................................*.................................................. - // smlal v9.4S, v27.4H, v21.4H // ..........................................................................*................................ - // smlal2 v6.4S, v27.8H, v21.8H // ............................................................................*.............................. - // smlal v11.4S, v14.4H, v15.4H // ..................................................*........................................................ - // smlal2 v29.4S, v27.8H, v20.8H // ...........................................................*............................................... - // smlal v9.4S, v28.4H, v20.4H // ..............................................................................*............................ - // smlal2 v6.4S, v28.8H, v20.8H // ................................................................................*.......................... - // smlal v11.4S, v27.4H, v20.4H // ..........................................................*................................................ - // smlal2 v29.4S, v28.8H, v30.8H // ................................................................*.......................................... - // smlal v9.4S, v18.4H, v26.4H // ..................................................................................*........................ - // smlal2 v6.4S, v18.8H, v26.8H // ....................................................................................*...................... - // smlal v11.4S, v28.4H, v30.4H // ..............................................................*............................................ - // smlal2 v29.4S, v18.8H, v25.8H // .....................................................................*..................................... 
- // smlal v9.4S, v19.4H, v25.4H // ......................................................................................*.................... - // smlal2 v6.4S, v19.8H, v25.8H // ........................................................................................*.................. - // ld1 {v23.8H}, [x12], #16 // ...................................................................*....................................... - // smlal v11.4S, v18.4H, v25.4H // ..................................................................*........................................ - // uzp1 v14.8H, v9.8H, v6.8H // ............................................................................................*.............. - // smlal2 v29.4S, v19.8H, v23.8H // .........................................................................*................................. - // mul v27.8H, v14.8H, v2.H[2] // ..............................................................................................*............ - // smlal v11.4S, v19.4H, v23.4H // ........................................................................*.................................. - // smlal v9.4S, v27.4H, v0.4H // ..................................................................................................*........ - // smlal2 v6.4S, v27.8H, v0.8H // ...................................................................................................*....... - // uzp1 v13.8H, v11.8H, v29.8H // ...............................................................................*........................... - // uzp2 v15.8H, v9.8H, v6.8H // .......................................................................................................*... - // mul v13.8H, v13.8H, v2.H[2] // .......................................................................................*................... 
- // smlal v11.4S, v13.4H, v0.4H // .............................................................................................*............. - // smlal2 v29.4S, v13.8H, v0.8H // ................................................................................................*.......... - // uzp2 v14.8H, v11.8H, v29.8H // ....................................................................................................*...... - // st2 {v14.8H, v15.8H}, [x0], #32 // ..........................................................................................................* + cbnz count, 1b + // Instructions: 50 + // Expected cycles: 56 + // Expected IPC: 0.89 + // + // Cycle bound: 56.0 + // IPC bound: 0.89 + // + // Wall time: 4.16s + // User time: 4.16s + // + // --------------- original position ---------------> + // 0 25 + // |------------------------| + smull2 v17.4S, v31.8H, v19.8H // ..*............................................... + uzp2 v1.8H, v14.8H, v6.8H // ................*................................. + smull v18.4S, v31.4H, v21.4H // .......*.......................................... + smlal2 v24.4S, v26.8H, v28.8H // *................................................. + smlal2 v17.4S, v16.8H, v23.8H // ....*............................................. + smull v21.4S, v31.4H, v19.4H // .....*............................................ + smlal v18.4S, v16.4H, v19.4H // .........*........................................ + uzp2 v31.8H, v4.8H, v3.8H // .*................................................ + uzp1 v3.8H, v14.8H, v6.8H // ............*..................................... + smlal v21.4S, v16.4H, v23.4H // ..........*....................................... + smlal v18.4S, v20.4H, v27.4H // ...........*...................................... + uzp2 v14.8H, v29.8H, v25.8H // ...*.............................................. + smlal2 v17.4S, v20.8H, v28.8H // ......*........................................... 
+ smlal v21.4S, v20.4H, v28.4H // .............*.................................... + smlal v18.4S, v26.4H, v28.4H // ..............*................................... + smlal2 v24.4S, v9.8H, v1.8H // ..................*............................... + smlal2 v17.4S, v26.8H, v8.8H // ........*......................................... + smlal v21.4S, v26.4H, v8.4H // ...............*.................................. + smlal v18.4S, v9.4H, v1.4H // ...................*.............................. + smlal2 v24.4S, v31.8H, v3.8H // ......................*........................... + smlal2 v17.4S, v9.8H, v3.8H // .................*................................ + smlal v21.4S, v9.4H, v3.4H // ....................*............................. + smlal v18.4S, v31.4H, v3.4H // .......................*.......................... + smlal2 v24.4S, v30.8H, v14.8H // ..........................*....................... + smlal2 v17.4S, v31.8H, v12.8H // .....................*............................ + smlal v21.4S, v31.4H, v12.4H // ........................*......................... + smlal v18.4S, v30.4H, v14.4H // ...........................*...................... + smlal2 v24.4S, v11.8H, v10.8H // ..............................*................... + smlal2 v17.4S, v30.8H, v10.8H // .........................*........................ + smlal v21.4S, v30.4H, v10.4H // ............................*..................... + smlal v18.4S, v11.4H, v10.4H // ...............................*.................. + zip2 v19.8H, v7.8H, v15.8H // ......................................*........... + smlal2 v17.4S, v11.8H, v22.8H // .............................*.................... + smlal v21.4S, v11.4H, v22.4H // ................................*................. + uzp1 v23.8H, v18.8H, v24.8H // .................................*................ + str q19, [x0, #16] // .........................................*........ 
+ mul v19.8H, v23.8H, v2.8H // ..................................*............... + uzp1 v23.8H, v21.8H, v17.8H // .....................................*............ + str q5, [x0], #32 // .............................................*.... + mul v26.8H, v23.8H, v2.8H // .......................................*.......... + smlal v18.4S, v19.4H, v0.4H // ...................................*.............. + smlal2 v24.4S, v19.8H, v0.8H // ....................................*............. + smlal v21.4S, v26.4H, v0.4H // ...........................................*...... + smlal2 v17.4S, v26.8H, v0.8H // ..........................................*....... + uzp2 v13.8H, v18.8H, v24.8H // ........................................*......... + uzp2 v19.8H, v21.8H, v17.8H // ............................................*..... + zip1 v23.8H, v19.8H, v13.8H // ..............................................*... + zip2 v19.8H, v19.8H, v13.8H // ...............................................*.. + str q23, [x0], #32 // .................................................* + str q19, [x0, #-16] // ................................................*. + + // ----------------- new position ------------------> + // 0 25 + // |------------------------|------------------------ + // smlal2 v24.4S, v26.8H, v28.8H // ...*.............................................. + // uzp2 v4.8H, v4.8H, v3.8H // .......*.......................................... + // smull2 v13.4S, v31.8H, v19.8H // *................................................. + // uzp2 v1.8H, v29.8H, v25.8H // ...........*...................................... + // smlal2 v13.4S, v16.8H, v23.8H // ....*............................................. + // smull v18.4S, v31.4H, v19.4H // .....*............................................ + // smlal2 v13.4S, v20.8H, v28.8H // ............*..................................... + // smull v29.4S, v31.4H, v21.4H // ..*............................................... 
+ // smlal2 v13.4S, v26.8H, v8.8H // ................*................................. + // smlal v29.4S, v16.4H, v19.4H // ......*........................................... + // smlal v18.4S, v16.4H, v23.4H // .........*........................................ + // smlal v29.4S, v20.4H, v27.4H // ..........*....................................... + // uzp1 v31.8H, v14.8H, v6.8H // ........*......................................... + // smlal v18.4S, v20.4H, v28.4H // .............*.................................... + // smlal v29.4S, v26.4H, v28.4H // ..............*................................... + // smlal v18.4S, v26.4H, v8.4H // .................*................................ + // uzp2 v26.8H, v14.8H, v6.8H // .*................................................ + // smlal2 v13.4S, v9.8H, v31.8H // ....................*............................. + // smlal2 v24.4S, v9.8H, v26.8H // ...............*.................................. + // smlal v29.4S, v9.4H, v26.4H // ..................*............................... + // smlal v18.4S, v9.4H, v31.4H // .....................*............................ + // smlal2 v13.4S, v4.8H, v12.8H // ........................*......................... + // smlal2 v24.4S, v4.8H, v31.8H // ...................*.............................. + // smlal v29.4S, v4.4H, v31.4H // ......................*........................... + // smlal v18.4S, v4.4H, v12.4H // .........................*........................ + // smlal2 v13.4S, v30.8H, v10.8H // ............................*..................... + // smlal2 v24.4S, v30.8H, v1.8H // .......................*.......................... + // smlal v29.4S, v30.4H, v1.4H // ..........................*....................... + // smlal v18.4S, v30.4H, v10.4H // .............................*.................... + // smlal2 v13.4S, v11.8H, v22.8H // ................................*................. 
+ // smlal2 v24.4S, v11.8H, v10.8H // ...........................*...................... + // smlal v29.4S, v11.4H, v10.4H // ..............................*................... + // smlal v18.4S, v11.4H, v22.4H // .................................*................ + // uzp1 v31.8H, v29.8H, v24.8H // ..................................*............... + // mul v19.8H, v31.8H, v2.8H // ....................................*............. + // smlal v29.4S, v19.4H, v0.4H // ........................................*......... + // smlal2 v24.4S, v19.8H, v0.8H // .........................................*........ + // uzp1 v26.8H, v18.8H, v13.8H // .....................................*............ + // zip2 v14.8H, v7.8H, v15.8H // ...............................*.................. + // mul v23.8H, v26.8H, v2.8H // .......................................*.......... + // uzp2 v15.8H, v29.8H, v24.8H // ............................................*..... + // str q14, [x0, #16] // ...................................*.............. + // smlal2 v13.4S, v23.8H, v0.8H // ...........................................*...... + // smlal v18.4S, v23.4H, v0.4H // ..........................................*....... + // uzp2 v7.8H, v18.8H, v13.8H // .............................................*.... + // str q5, [x0], #32 // ......................................*........... + // zip1 v5.8H, v7.8H, v15.8H // ..............................................*... + // zip2 v14.8H, v7.8H, v15.8H // ...............................................*.. + // str q14, [x0, #16] // .................................................* + // str q5, [x0], #32 // ................................................*. 
pop_stack diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/rej_uniform_asm_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/rej_uniform_asm_clean.S index c51e53188..722dc0f49 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/rej_uniform_asm_clean.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/aarch64/src/rej_uniform_asm_clean.S @@ -22,272 +22,6 @@ #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \ defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) -// Needed to provide ASM_LOAD directive -#include "common.i" -#include "params.h" - -.data -table_data: - .byte -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 0 - .byte 0, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 1 - .byte 2, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 2 - .byte 0, 1, 2, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 3 - .byte 4, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 4 - .byte 0, 1, 4, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 5 - .byte 2, 3, 4, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 6 - .byte 0, 1, 2, 3, 4, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 7 - .byte 6, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 8 - .byte 0, 1, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 9 - .byte 2, 3, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 10 - .byte 0, 1, 2, 3, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 11 - .byte 4, 5, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 12 - .byte 0, 1, 4, 5, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 13 - .byte 2, 3, 4, 5, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 14 - .byte 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1 // 15 - .byte 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 16 - .byte 0, 1, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 17 - .byte 2, 3, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, 
-1, -1, -1, -1 // 18 - .byte 0, 1, 2, 3, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 19 - .byte 4, 5, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 20 - .byte 0, 1, 4, 5, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 21 - .byte 2, 3, 4, 5, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 22 - .byte 0, 1, 2, 3, 4, 5, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1 // 23 - .byte 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 24 - .byte 0, 1, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 25 - .byte 2, 3, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 26 - .byte 0, 1, 2, 3, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1 // 27 - .byte 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 28 - .byte 0, 1, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1 // 29 - .byte 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1 // 30 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1 // 31 - .byte 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 32 - .byte 0, 1, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 33 - .byte 2, 3, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 34 - .byte 0, 1, 2, 3, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 35 - .byte 4, 5, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 36 - .byte 0, 1, 4, 5, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 37 - .byte 2, 3, 4, 5, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 38 - .byte 0, 1, 2, 3, 4, 5, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1 // 39 - .byte 6, 7, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 40 - .byte 0, 1, 6, 7, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 41 - .byte 2, 3, 6, 7, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 42 - .byte 0, 1, 2, 3, 6, 7, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1 // 43 - .byte 4, 5, 6, 7, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 44 - .byte 0, 1, 4, 5, 6, 7, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1 // 45 - .byte 2, 3, 4, 5, 6, 7, 10, 11, 
-1, -1, -1, -1, -1, -1, -1, -1 // 46 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, -1, -1, -1, -1, -1, -1 // 47 - .byte 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 48 - .byte 0, 1, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 49 - .byte 2, 3, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 50 - .byte 0, 1, 2, 3, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1 // 51 - .byte 4, 5, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 52 - .byte 0, 1, 4, 5, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1 // 53 - .byte 2, 3, 4, 5, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1 // 54 - .byte 0, 1, 2, 3, 4, 5, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1 // 55 - .byte 6, 7, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 56 - .byte 0, 1, 6, 7, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1 // 57 - .byte 2, 3, 6, 7, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1 // 58 - .byte 0, 1, 2, 3, 6, 7, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1 // 59 - .byte 4, 5, 6, 7, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1 // 60 - .byte 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1 // 61 - .byte 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1 // 62 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1, -1, -1 // 63 - .byte 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 64 - .byte 0, 1, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 65 - .byte 2, 3, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 66 - .byte 0, 1, 2, 3, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 67 - .byte 4, 5, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 68 - .byte 0, 1, 4, 5, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 69 - .byte 2, 3, 4, 5, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 70 - .byte 0, 1, 2, 3, 4, 5, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 71 - .byte 6, 7, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 72 - .byte 0, 1, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 73 - .byte 2, 3, 6, 
7, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 74 - .byte 0, 1, 2, 3, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 75 - .byte 4, 5, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 76 - .byte 0, 1, 4, 5, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 77 - .byte 2, 3, 4, 5, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 78 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1 // 79 - .byte 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 80 - .byte 0, 1, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 81 - .byte 2, 3, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 82 - .byte 0, 1, 2, 3, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 83 - .byte 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 84 - .byte 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 85 - .byte 2, 3, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 86 - .byte 0, 1, 2, 3, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1 // 87 - .byte 6, 7, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 88 - .byte 0, 1, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 89 - .byte 2, 3, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 90 - .byte 0, 1, 2, 3, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1 // 91 - .byte 4, 5, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 92 - .byte 0, 1, 4, 5, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1 // 93 - .byte 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1 // 94 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1 // 95 - .byte 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 96 - .byte 0, 1, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 97 - .byte 2, 3, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 98 - .byte 0, 1, 2, 3, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 99 - .byte 4, 5, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 100 - .byte 0, 1, 4, 5, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 101 - 
.byte 2, 3, 4, 5, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 102 - .byte 0, 1, 2, 3, 4, 5, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1 // 103 - .byte 6, 7, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 104 - .byte 0, 1, 6, 7, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 105 - .byte 2, 3, 6, 7, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 106 - .byte 0, 1, 2, 3, 6, 7, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1 // 107 - .byte 4, 5, 6, 7, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 108 - .byte 0, 1, 4, 5, 6, 7, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1 // 109 - .byte 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1 // 110 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, -1, -1, -1, -1 // 111 - .byte 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 112 - .byte 0, 1, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 113 - .byte 2, 3, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 114 - .byte 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1 // 115 - .byte 4, 5, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 116 - .byte 0, 1, 4, 5, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1 // 117 - .byte 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1 // 118 - .byte 0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1 // 119 - .byte 6, 7, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 120 - .byte 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1 // 121 - .byte 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1 // 122 - .byte 0, 1, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1 // 123 - .byte 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1 // 124 - .byte 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1 // 125 - .byte 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1 // 126 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1 // 127 - .byte 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 128 - .byte 0, 1, 14, 15, -1, -1, -1, -1, -1, 
-1, -1, -1, -1, -1, -1, -1 // 129 - .byte 2, 3, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 130 - .byte 0, 1, 2, 3, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 131 - .byte 4, 5, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 132 - .byte 0, 1, 4, 5, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 133 - .byte 2, 3, 4, 5, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 134 - .byte 0, 1, 2, 3, 4, 5, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 135 - .byte 6, 7, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 136 - .byte 0, 1, 6, 7, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 137 - .byte 2, 3, 6, 7, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 138 - .byte 0, 1, 2, 3, 6, 7, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 139 - .byte 4, 5, 6, 7, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 140 - .byte 0, 1, 4, 5, 6, 7, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 141 - .byte 2, 3, 4, 5, 6, 7, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 142 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 14, 15, -1, -1, -1, -1, -1, -1 // 143 - .byte 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 144 - .byte 0, 1, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 145 - .byte 2, 3, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 146 - .byte 0, 1, 2, 3, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 147 - .byte 4, 5, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 148 - .byte 0, 1, 4, 5, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 149 - .byte 2, 3, 4, 5, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 150 - .byte 0, 1, 2, 3, 4, 5, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1 // 151 - .byte 6, 7, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 152 - .byte 0, 1, 6, 7, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 153 - .byte 2, 3, 6, 7, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 154 - .byte 0, 1, 2, 3, 6, 7, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1 // 155 - .byte 4, 5, 6, 7, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, 
-1 // 156 - .byte 0, 1, 4, 5, 6, 7, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1 // 157 - .byte 2, 3, 4, 5, 6, 7, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1 // 158 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 14, 15, -1, -1, -1, -1 // 159 - .byte 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 160 - .byte 0, 1, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 161 - .byte 2, 3, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 162 - .byte 0, 1, 2, 3, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 163 - .byte 4, 5, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 164 - .byte 0, 1, 4, 5, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 165 - .byte 2, 3, 4, 5, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 166 - .byte 0, 1, 2, 3, 4, 5, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1 // 167 - .byte 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 168 - .byte 0, 1, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 169 - .byte 2, 3, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 170 - .byte 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1 // 171 - .byte 4, 5, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 172 - .byte 0, 1, 4, 5, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1 // 173 - .byte 2, 3, 4, 5, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1 // 174 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1 // 175 - .byte 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 176 - .byte 0, 1, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 177 - .byte 2, 3, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 178 - .byte 0, 1, 2, 3, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1 // 179 - .byte 4, 5, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 180 - .byte 0, 1, 4, 5, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1 // 181 - .byte 2, 3, 4, 5, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1 // 182 - .byte 0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1 // 183 - .byte 6, 7, 
8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 184 - .byte 0, 1, 6, 7, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1 // 185 - .byte 2, 3, 6, 7, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1 // 186 - .byte 0, 1, 2, 3, 6, 7, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1 // 187 - .byte 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1 // 188 - .byte 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1 // 189 - .byte 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1 // 190 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, -1, -1 // 191 - .byte 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 192 - .byte 0, 1, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 193 - .byte 2, 3, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 194 - .byte 0, 1, 2, 3, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 195 - .byte 4, 5, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 196 - .byte 0, 1, 4, 5, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 197 - .byte 2, 3, 4, 5, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 198 - .byte 0, 1, 2, 3, 4, 5, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 199 - .byte 6, 7, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 200 - .byte 0, 1, 6, 7, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 201 - .byte 2, 3, 6, 7, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 202 - .byte 0, 1, 2, 3, 6, 7, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 203 - .byte 4, 5, 6, 7, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 204 - .byte 0, 1, 4, 5, 6, 7, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 205 - .byte 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 206 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, -1, -1, -1, -1 // 207 - .byte 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 208 - .byte 0, 1, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 209 - .byte 2, 3, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 210 - .byte 0, 1, 2, 3, 8, 9, 12, 13, 14, 
15, -1, -1, -1, -1, -1, -1 // 211 - .byte 4, 5, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 212 - .byte 0, 1, 4, 5, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 213 - .byte 2, 3, 4, 5, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 214 - .byte 0, 1, 2, 3, 4, 5, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1 // 215 - .byte 6, 7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 216 - .byte 0, 1, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 217 - .byte 2, 3, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 218 - .byte 0, 1, 2, 3, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1 // 219 - .byte 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 220 - .byte 0, 1, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1 // 221 - .byte 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1 // 222 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1 // 223 - .byte 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 224 - .byte 0, 1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 225 - .byte 2, 3, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 226 - .byte 0, 1, 2, 3, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 227 - .byte 4, 5, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 228 - .byte 0, 1, 4, 5, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 229 - .byte 2, 3, 4, 5, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 230 - .byte 0, 1, 2, 3, 4, 5, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1 // 231 - .byte 6, 7, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 232 - .byte 0, 1, 6, 7, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 233 - .byte 2, 3, 6, 7, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 234 - .byte 0, 1, 2, 3, 6, 7, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1 // 235 - .byte 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 236 - .byte 0, 1, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1 // 237 - .byte 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1 // 
238 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, -1, -1 // 239 - .byte 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 240 - .byte 0, 1, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 241 - .byte 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 242 - .byte 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1 // 243 - .byte 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 244 - .byte 0, 1, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1 // 245 - .byte 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1 // 246 - .byte 0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1 // 247 - .byte 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 248 - .byte 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1 // 249 - .byte 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1 // 250 - .byte 0, 1, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1 // 251 - .byte 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1 // 252 - .byte 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1 // 253 - .byte 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1 // 254 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 // 255 - -bit_table_data: - .short 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 - // We save the output on the stack first, and copy to the actual // output buffer only in the end. This is because the main loop can overwrite // by up to 62 bytes, which we account for here (we use 64 bytes for alignment). 
@@ -306,11 +40,9 @@ bit_table_data: output .req x0 buf .req x1 buflen .req w2 - len .req w3 + table_idx .req x3 - /* Pointers to static data */ - table_idx .req x5 - bit_table .req x6 + len .req w4 /* Temporary output on the stack */ output_tmp .req x7 @@ -381,17 +113,17 @@ bit_table_data: bits_q .req q31 .text +/* Literal pool */ +.p2align 4 +c_bit_table: + .short 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 + .align 4 -.global MLKEM_NAMESPACE(rej_uniform_asm_clean) -.global _MLKEM_NAMESPACE(rej_uniform_asm_clean) -MLKEM_NAMESPACE(rej_uniform_asm_clean): -_MLKEM_NAMESPACE(rej_uniform_asm_clean): +.global MLKEM_ASM_NAMESPACE(rej_uniform_asm_clean) +MLKEM_ASM_NAMESPACE(rej_uniform_asm_clean): push_stack - ASM_LOAD(bit_table, bit_table_data) - ASM_LOAD(table_idx, table_data) - - ldr bits_q, [bit_table] + ldr bits_q, c_bit_table movz tmp, #MLKEM_Q dup mlkem_q.8h, tmp diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/common.h index 3469d2739..8177b0b50 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/common.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/common.h @@ -26,4 +26,17 @@ * since the backend choice may be part of the namespace. */ #include "namespace.h" +/* On Apple platforms, we need to emit leading underscore + * in front of assembly symbols. We thus introduce a separate + * namespace wrapper for ASM symbols. 
*/ +#if !defined(__APPLE__) +#define MLKEM_ASM_NAMESPACE(sym) MLKEM_NAMESPACE(sym) +#define FIPS202_ASM_NAMESPACE(sym) FIPS202_NAMESPACE(sym) +#else +#define _PREFIX_UNDERSCORE(sym) _##sym +#define PREFIX_UNDERSCORE(sym) _PREFIX_UNDERSCORE(sym) +#define MLKEM_ASM_NAMESPACE(sym) PREFIX_UNDERSCORE(MLKEM_NAMESPACE(sym)) +#define FIPS202_ASM_NAMESPACE(sym) PREFIX_UNDERSCORE(FIPS202_NAMESPACE(sym)) +#endif + #endif /* MLKEM_NATIVE_COMMON_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/config.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/config.h index ac27408eb..31040a471 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/config.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/config.h @@ -39,23 +39,19 @@ /****************************************************************************** * Name: MLKEM_NAMESPACE - * _MLKEM_NAMESPACE * * Description: The macros to use to namespace global symbols * from mlkem/. *****************************************************************************/ #define MLKEM_NAMESPACE(sym) MLKEM_DEFAULT_NAMESPACE(sym) -#define _MLKEM_NAMESPACE(sym) _MLKEM_DEFAULT_NAMESPACE(sym) /****************************************************************************** * Name: FIPS202_NAMESPACE - * _FIPS202_NAMESPACE * * Description: The macros to use to namespace global symbols * from mlkem/fips202/. 
*****************************************************************************/ #define FIPS202_NAMESPACE(sym) FIPS202_DEFAULT_NAMESPACE(sym) -#define _FIPS202_NAMESPACE(sym) _FIPS202_DEFAULT_NAMESPACE(sym) /****************************************************************************** * Name: MLKEM_USE_NATIVE diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/zetas.c index f52b2ff5a..8ab46ce0b 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/zetas.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/zetas.c @@ -14,7 +14,7 @@ * Table of zeta values used in the reference NTT and inverse NTT. * See autogenerate_files.py for details. */ -const int16_t zetas[128] = { +ALIGN const int16_t zetas[128] = { -1044, -758, -359, -1517, 1493, 1422, 287, 202, -171, 622, 1577, 182, 962, -1202, -1474, 1468, 573, -1325, 264, 383, -829, 1458, -1602, -130, -681, 1017, 732, 608, -1542, 411, -205, -1571, 1223, diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/common.h index 3469d2739..8177b0b50 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/common.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/common.h @@ -26,4 +26,17 @@ * since the backend choice may be part of the namespace. */ #include "namespace.h" +/* On Apple platforms, we need to emit leading underscore + * in front of assembly symbols. We thus introduce a separate + * namespace wrapper for ASM symbols. 
*/ +#if !defined(__APPLE__) +#define MLKEM_ASM_NAMESPACE(sym) MLKEM_NAMESPACE(sym) +#define FIPS202_ASM_NAMESPACE(sym) FIPS202_NAMESPACE(sym) +#else +#define _PREFIX_UNDERSCORE(sym) _##sym +#define PREFIX_UNDERSCORE(sym) _PREFIX_UNDERSCORE(sym) +#define MLKEM_ASM_NAMESPACE(sym) PREFIX_UNDERSCORE(MLKEM_NAMESPACE(sym)) +#define FIPS202_ASM_NAMESPACE(sym) PREFIX_UNDERSCORE(FIPS202_NAMESPACE(sym)) +#endif + #endif /* MLKEM_NATIVE_COMMON_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/config.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/config.h index ac27408eb..31040a471 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/config.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/config.h @@ -39,23 +39,19 @@ /****************************************************************************** * Name: MLKEM_NAMESPACE - * _MLKEM_NAMESPACE * * Description: The macros to use to namespace global symbols * from mlkem/. *****************************************************************************/ #define MLKEM_NAMESPACE(sym) MLKEM_DEFAULT_NAMESPACE(sym) -#define _MLKEM_NAMESPACE(sym) _MLKEM_DEFAULT_NAMESPACE(sym) /****************************************************************************** * Name: FIPS202_NAMESPACE - * _FIPS202_NAMESPACE * * Description: The macros to use to namespace global symbols * from mlkem/fips202/. 
*****************************************************************************/ #define FIPS202_NAMESPACE(sym) FIPS202_DEFAULT_NAMESPACE(sym) -#define _FIPS202_NAMESPACE(sym) _FIPS202_DEFAULT_NAMESPACE(sym) /****************************************************************************** * Name: MLKEM_USE_NATIVE diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/zetas.c index f52b2ff5a..8ab46ce0b 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/zetas.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/zetas.c @@ -14,7 +14,7 @@ * Table of zeta values used in the reference NTT and inverse NTT. * See autogenerate_files.py for details. */ -const int16_t zetas[128] = { +ALIGN const int16_t zetas[128] = { -1044, -758, -359, -1517, 1493, 1422, 287, 202, -171, 622, 1577, 182, 962, -1202, -1474, 1468, 573, -1325, 264, 383, -829, 1458, -1602, -130, -681, 1017, 732, 608, -1542, 411, -205, -1571, 1223, diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/common.h index 3469d2739..8177b0b50 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/common.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/common.h @@ -26,4 +26,17 @@ * since the backend choice may be part of the namespace. */ #include "namespace.h" +/* On Apple platforms, we need to emit leading underscore + * in front of assembly symbols. We thus introduce a separate + * namespace wrapper for ASM symbols. 
*/ +#if !defined(__APPLE__) +#define MLKEM_ASM_NAMESPACE(sym) MLKEM_NAMESPACE(sym) +#define FIPS202_ASM_NAMESPACE(sym) FIPS202_NAMESPACE(sym) +#else +#define _PREFIX_UNDERSCORE(sym) _##sym +#define PREFIX_UNDERSCORE(sym) _PREFIX_UNDERSCORE(sym) +#define MLKEM_ASM_NAMESPACE(sym) PREFIX_UNDERSCORE(MLKEM_NAMESPACE(sym)) +#define FIPS202_ASM_NAMESPACE(sym) PREFIX_UNDERSCORE(FIPS202_NAMESPACE(sym)) +#endif + #endif /* MLKEM_NATIVE_COMMON_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/config.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/config.h index ac27408eb..31040a471 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/config.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/config.h @@ -39,23 +39,19 @@ /****************************************************************************** * Name: MLKEM_NAMESPACE - * _MLKEM_NAMESPACE * * Description: The macros to use to namespace global symbols * from mlkem/. *****************************************************************************/ #define MLKEM_NAMESPACE(sym) MLKEM_DEFAULT_NAMESPACE(sym) -#define _MLKEM_NAMESPACE(sym) _MLKEM_DEFAULT_NAMESPACE(sym) /****************************************************************************** * Name: FIPS202_NAMESPACE - * _FIPS202_NAMESPACE * * Description: The macros to use to namespace global symbols * from mlkem/fips202/. 
*****************************************************************************/ #define FIPS202_NAMESPACE(sym) FIPS202_DEFAULT_NAMESPACE(sym) -#define _FIPS202_NAMESPACE(sym) _FIPS202_DEFAULT_NAMESPACE(sym) /****************************************************************************** * Name: MLKEM_USE_NATIVE diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/basemul.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/basemul.S index 503fbeb51..c51606b3f 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/basemul.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/basemul.S @@ -113,8 +113,8 @@ vmovdqa %ymm11,(64*\off+48)*2(%rdi) .endm .text -.global MLKEM_NAMESPACE(basemul_avx2) -MLKEM_NAMESPACE(basemul_avx2): +.global MLKEM_ASM_NAMESPACE(basemul_avx2) +MLKEM_ASM_NAMESPACE(basemul_avx2): mov %rsp,%r8 and $-32,%rsp sub $32,%rsp diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/fq.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/fq.S index 50ef190b7..61560fb46 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/fq.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/fq.S @@ -60,8 +60,8 @@ vmovdqa %ymm9,224(%rdi) ret -.global MLKEM_NAMESPACE(reduce_avx2) -MLKEM_NAMESPACE(reduce_avx2): +.global MLKEM_ASM_NAMESPACE(reduce_avx2) +MLKEM_ASM_NAMESPACE(reduce_avx2): #consts vmovdqa _16XQ*2(%rsi),%ymm0 vmovdqa _16XV*2(%rsi),%ymm1 @@ -103,8 +103,8 @@ vmovdqa %ymm10,224(%rdi) ret -.global MLKEM_NAMESPACE(tomont_avx2) -MLKEM_NAMESPACE(tomont_avx2): +.global MLKEM_ASM_NAMESPACE(tomont_avx2) +MLKEM_ASM_NAMESPACE(tomont_avx2): #consts vmovdqa _16XQ*2(%rsi),%ymm0 vmovdqa _16XMONTSQLO*2(%rsi),%ymm1 diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/intt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/intt.S index 4860985ed..fda9ec0d3 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/intt.S +++ 
b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/intt.S @@ -242,8 +242,8 @@ vmovdqa %ymm11,(64*\off+176)*2(%rdi) .endm .text -.global MLKEM_NAMESPACE(invntt_avx2) -MLKEM_NAMESPACE(invntt_avx2): +.global MLKEM_ASM_NAMESPACE(invntt_avx2) +MLKEM_ASM_NAMESPACE(invntt_avx2): vmovdqa _16XQ*2(%rsi),%ymm0 intt_levels0t5 0 diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/ntt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/ntt.S index a0b6f734c..dc3776b37 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/ntt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/ntt.S @@ -206,8 +206,8 @@ vmovdqa %ymm11,(128*\off+112)*2(%rdi) .endm .text -.global MLKEM_NAMESPACE(ntt_avx2) -MLKEM_NAMESPACE(ntt_avx2): +.global MLKEM_ASM_NAMESPACE(ntt_avx2) +MLKEM_ASM_NAMESPACE(ntt_avx2): vmovdqa _16XQ*2(%rsi),%ymm0 level0 0 diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/shuffle.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/shuffle.S index 34f6b30b0..0d23c11b3 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/shuffle.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/x86_64/src/shuffle.S @@ -15,8 +15,8 @@ #include "fq.inc" #include "shuffle.inc" -.global MLKEM_NAMESPACE(nttpack_avx2) -MLKEM_NAMESPACE(nttpack_avx2): +.global MLKEM_ASM_NAMESPACE(nttpack_avx2) +MLKEM_ASM_NAMESPACE(nttpack_avx2): #load vmovdqa (%rdi),%ymm4 vmovdqa 32(%rdi),%ymm5 @@ -102,8 +102,8 @@ vmovdqa %ymm11,224(%rdi) ret -.global MLKEM_NAMESPACE(nttunpack_avx2) -MLKEM_NAMESPACE(nttunpack_avx2): +.global MLKEM_ASM_NAMESPACE(nttunpack_avx2) +MLKEM_ASM_NAMESPACE(nttunpack_avx2): call nttunpack128_avx2 add $256,%rdi call nttunpack128_avx2 @@ -169,8 +169,8 @@ vmovdqu %ymm9,160(%rdi) ret -.global MLKEM_NAMESPACE(ntttobytes_avx2) -MLKEM_NAMESPACE(ntttobytes_avx2): +.global MLKEM_ASM_NAMESPACE(ntttobytes_avx2) +MLKEM_ASM_NAMESPACE(ntttobytes_avx2): #consts vmovdqa _16XQ*2(%rdx),%ymm0 call 
ntttobytes128_avx @@ -245,8 +245,8 @@ vmovdqa %ymm1,224(%rdi) ret -.global MLKEM_NAMESPACE(nttfrombytes_avx2) -MLKEM_NAMESPACE(nttfrombytes_avx2): +.global MLKEM_ASM_NAMESPACE(nttfrombytes_avx2) +MLKEM_ASM_NAMESPACE(nttfrombytes_avx2): #consts vmovdqa _16XMASK*2(%rdx),%ymm0 call nttfrombytes128_avx diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/zetas.c index f52b2ff5a..8ab46ce0b 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/zetas.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/zetas.c @@ -14,7 +14,7 @@ * Table of zeta values used in the reference NTT and inverse NTT. * See autogenerate_files.py for details. */ -const int16_t zetas[128] = { +ALIGN const int16_t zetas[128] = { -1044, -758, -359, -1517, 1493, 1422, 287, 202, -171, 622, 1577, 182, 962, -1202, -1474, 1468, 573, -1325, 264, 383, -829, 1458, -1602, -130, -681, 1017, 732, 608, -1542, 411, -205, -1571, 1223, diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/aarch64_zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/aarch64_zetas.c index ecf1b529a..b638e2081 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/aarch64_zetas.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/aarch64_zetas.c @@ -20,7 +20,7 @@ * Table of zeta values used in the AArch64 forward NTT * See autogenerate_files.py for details. 
*/ -const int16_t aarch64_ntt_zetas_layer01234[] = { +ALIGN const int16_t aarch64_ntt_zetas_layer01234[] = { -1600, -15749, -749, -7373, -40, -394, -687, -6762, 630, 6201, -1432, -14095, 848, 8347, 0, 0, 1062, 10453, 296, 2914, -882, -8682, 0, 0, -1410, -13879, 1339, 13180, 1476, 14529, @@ -31,7 +31,7 @@ const int16_t aarch64_ntt_zetas_layer01234[] = { 0, 0, -1583, -15582, -1355, -13338, 821, 8081, 0, 0, }; -const int16_t aarch64_ntt_zetas_layer56[] = { +ALIGN const int16_t aarch64_ntt_zetas_layer56[] = { 289, 289, 331, 331, -76, -76, -1573, -1573, 2845, 2845, 3258, 3258, -748, -748, -15483, -15483, 17, 17, 583, 583, 1637, 1637, -1041, -1041, 167, 167, 5739, @@ -77,7 +77,7 @@ const int16_t aarch64_ntt_zetas_layer56[] = { 10129, 10129, -3878, -3878, -11566, -11566, }; -const int16_t aarch64_invntt_zetas_layer01234[] = { +ALIGN const int16_t aarch64_invntt_zetas_layer01234[] = { 1583, 15582, -821, -8081, 1355, 13338, 0, 0, -569, -5601, 450, 4429, 936, 9213, 0, 0, 69, 679, 447, 4400, -535, -5266, 0, 0, 543, 5345, 1235, 12156, -1426, -14036, @@ -88,7 +88,7 @@ const int16_t aarch64_invntt_zetas_layer01234[] = { -848, -8347, 1432, 14095, -630, -6201, 687, 6762, 0, 0, }; -const int16_t aarch64_invntt_zetas_layer56[] = { +ALIGN const int16_t aarch64_invntt_zetas_layer56[] = { -910, -910, -1227, -1227, 219, 219, 855, 855, -8957, -8957, -12078, -12078, 2156, 2156, 8416, 8416, 1175, 1175, 394, 394, -1029, -1029, -1212, -1212, 11566, 11566, 3878, @@ -134,7 +134,7 @@ const int16_t aarch64_invntt_zetas_layer56[] = { -16113, -16113, -5739, -5739, -167, -167, }; -const int16_t aarch64_zetas_mulcache_native[] = { +ALIGN const int16_t aarch64_zetas_mulcache_native[] = { 17, -17, -568, 568, 583, -583, -680, 680, 1637, -1637, 723, -723, -1041, 1041, 1100, -1100, 1409, -1409, -667, 667, -48, 48, 233, -233, 756, -756, -1173, 1173, -314, 314, -279, 279, -1626, @@ -149,7 +149,7 @@ const int16_t aarch64_zetas_mulcache_native[] = { 1219, -394, 394, 885, -885, -1175, 1175, }; -const int16_t 
aarch64_zetas_mulcache_twisted_native[] = { +ALIGN const int16_t aarch64_zetas_mulcache_twisted_native[] = { 167, -167, -5591, 5591, 5739, -5739, -6693, 6693, 16113, -16113, 7117, -7117, -10247, 10247, 10828, -10828, 13869, -13869, -6565, 6565, -472, 472, 2293, -2293, 7441, -7441, -11546, @@ -168,7 +168,6 @@ const int16_t aarch64_zetas_mulcache_twisted_native[] = { }; #else -#include "params.h" /* Dummy declaration for compilers disliking empty compilation units */ #define empty_cu_aarch64_zetas MLKEM_NAMESPACE(empty_cu_aarch64_zetas) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/arith_native_aarch64.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/arith_native_aarch64.h index 2f3b0ef4f..6a5ee8a7d 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/arith_native_aarch64.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/arith_native_aarch64.h @@ -19,6 +19,7 @@ MLKEM_NAMESPACE(aarch64_zetas_mulcache_native) #define aarch64_zetas_mulcache_twisted_native \ MLKEM_NAMESPACE(aarch64_zetas_mulcache_twisted_native) +#define rej_uniform_table MLKEM_NAMESPACE(rej_uniform_table) extern const int16_t aarch64_ntt_zetas_layer01234[]; extern const int16_t aarch64_ntt_zetas_layer56[]; @@ -26,6 +27,7 @@ extern const int16_t aarch64_invntt_zetas_layer01234[]; extern const int16_t aarch64_invntt_zetas_layer56[]; extern const int16_t aarch64_zetas_mulcache_native[]; extern const int16_t aarch64_zetas_mulcache_twisted_native[]; +extern const uint8_t rej_uniform_table[]; #define ntt_asm_clean MLKEM_NAMESPACE(ntt_asm_clean) void ntt_asm_clean(int16_t *, const int16_t *, const int16_t *); @@ -41,7 +43,7 @@ void intt_asm_opt(int16_t *, const int16_t *, const int16_t *); #define rej_uniform_asm_clean MLKEM_NAMESPACE(rej_uniform_asm_clean) unsigned int rej_uniform_asm_clean(int16_t *r, const uint8_t *buf, - unsigned int buflen); + unsigned int buflen, const uint8_t *table); #define poly_reduce_asm_clean 
MLKEM_NAMESPACE(poly_reduce_asm_clean) void poly_reduce_asm_clean(int16_t *); diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/clean_impl.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/clean_impl.h index c91a7eedd..b0ff3d597 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/clean_impl.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/clean_impl.h @@ -74,7 +74,7 @@ static INLINE int rej_uniform_native(int16_t *r, unsigned int len, { return -1; } - return (int)rej_uniform_asm_clean(r, buf, buflen); + return (int)rej_uniform_asm_clean(r, buf, buflen, rej_uniform_table); } #endif /* MLKEM_NATIVE_ARITH_PROFILE_IMPL_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/common.i b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/common.i deleted file mode 100644 index 6b8880311..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/common.i +++ /dev/null @@ -1,13 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: MIT - */ - -#if __APPLE__ -#define ASM_LOAD(dst, symbol) \ - adrp dst, symbol @PAGE %% add dst, dst, symbol @PAGEOFF -#else /* __APPLE__ */ -#define ASM_LOAD(dst, symbol) \ - adrp dst, symbol; \ - add dst, dst, : lo12 : symbol; -#endif /* __APPLE__ */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/intt_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/intt_clean.S index 2f05d8cca..623a82ae9 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/intt_clean.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/intt_clean.S @@ -26,10 +26,6 @@ #include "common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) -// Needed to provide ASM_LOAD directive -#include "common.i" -#include "params.h" - // Bounds: // If C is chosen so that |src| < q * C, then |dst| < q * (0.0508 * C + 1/2) // @@ -145,38 +141,7 @@ .text - 
.global MLKEM_NAMESPACE(intt_asm_clean) - .global _MLKEM_NAMESPACE(intt_asm_clean) - -.p2align 4 -const_addr: .short 3329 - .short 20159 - .short 0 - .short 0 - .short 0 - .short 0 - .short 0 - .short 0 -ninv_addr: .short 512 - .short 512 - .short 512 - .short 512 - .short 512 - .short 512 - .short 512 - .short 512 -ninv_tw_addr: .short 5040 - .short 5040 - .short 5040 - .short 5040 - .short 5040 - .short 5040 - .short 5040 - .short 5040 - -MLKEM_NAMESPACE(intt_asm_clean): -_MLKEM_NAMESPACE(intt_asm_clean): - push_stack + .global MLKEM_ASM_NAMESPACE(intt_asm_clean) in .req x0 r01234_ptr .req x1 @@ -227,16 +192,41 @@ _MLKEM_NAMESPACE(intt_asm_clean): t2 .req v27 t3 .req v28 - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - ninv .req v29 + q_ninv .req q29 ninv_tw .req v30 + q_ninv_tw .req q30 + +/* Literal pool */ +.macro dup8h c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c +.endm + +.p2align 4 +c_consts: .short 3329 + .short 20159 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 +c_ninv: dup8h 512 +c_ninv_tw: dup8h 5040 + +MLKEM_ASM_NAMESPACE(intt_asm_clean): + push_stack - ASM_LOAD(xtmp, ninv_addr) - ld1r {ninv.8h}, [xtmp] - ASM_LOAD(xtmp, ninv_tw_addr) - ld1r {ninv_tw.8h}, [xtmp] + ldr q_consts, c_consts + ldr q_ninv, c_ninv + ldr q_ninv_tw, c_ninv_tw mov inp, in mov count, #8 diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/intt_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/intt_opt.S index fc720e504..e332efef8 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/intt_opt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/intt_opt.S @@ -26,10 +26,6 @@ #include "common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) -// Needed to provide ASM_LOAD directive -#include "common.i" -#include "params.h" - // Bounds: // If C is chosen so that |src| < q * C, then |dst| < q * (0.0508 * C + 1/2) // @@ -145,38 +141,7 
@@ .text - .global MLKEM_NAMESPACE(intt_asm_opt) - .global _MLKEM_NAMESPACE(intt_asm_opt) - -.p2align 4 -const_addr: .short 3329 - .short 20159 - .short 0 - .short 0 - .short 0 - .short 0 - .short 0 - .short 0 -ninv_addr: .short 512 - .short 512 - .short 512 - .short 512 - .short 512 - .short 512 - .short 512 - .short 512 -ninv_tw_addr: .short 5040 - .short 5040 - .short 5040 - .short 5040 - .short 5040 - .short 5040 - .short 5040 - .short 5040 - -MLKEM_NAMESPACE(intt_asm_opt): -_MLKEM_NAMESPACE(intt_asm_opt): - push_stack + .global MLKEM_ASM_NAMESPACE(intt_asm_opt) in .req x0 r01234_ptr .req x1 @@ -227,16 +192,41 @@ _MLKEM_NAMESPACE(intt_asm_opt): t2 .req v27 t3 .req v28 - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - ninv .req v29 + q_ninv .req q29 ninv_tw .req v30 + q_ninv_tw .req q30 + +/* Literal pool */ +.macro dup8h c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c +.endm + +.p2align 4 +c_consts: .short 3329 + .short 20159 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 +c_ninv: dup8h 512 +c_ninv_tw: dup8h 5040 + +MLKEM_ASM_NAMESPACE(intt_asm_opt): + push_stack - ASM_LOAD(xtmp, ninv_addr) - ld1r {ninv.8h}, [xtmp] - ASM_LOAD(xtmp, ninv_tw_addr) - ld1r {ninv_tw.8h}, [xtmp] + ldr q_consts, c_consts + ldr q_ninv, c_ninv + ldr q_ninv_tw, c_ninv_tw mov inp, in mov count, #8 diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/ntt_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/ntt_clean.S index b00d08810..877a5f689 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/ntt_clean.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/ntt_clean.S @@ -27,10 +27,6 @@ #include "common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) -// Needed to provide ASM_LOAD directive -#include "common.i" -#include "params.h" - // Bounds: // If C is chosen so that |src| < q * C, then |dst| < q * (0.0508 * C + 1/2) // @@ -64,12 
+60,6 @@ add \a\().8h, \a\().8h, tmp.8h .endm -.macro barrett_reduce a - sqdmulh t0.8h, \a\().8h, consts.h[1] - srshr t0.8h, t0.8h, #11 - mls \a\().8h, t0.8h, consts.h[0] -.endm - .macro load_roots_012 ldr q_root0, [r01234_ptr], #32 ldr q_root1, [r01234_ptr, #-16] @@ -100,13 +90,6 @@ trn1 \data\()1.2d, t1.2d, t3.2d .endm -.macro transpose_single data_out, data_in - trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s - trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s - trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s - trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s -.endm - .macro save_vregs sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] @@ -182,11 +165,11 @@ t3 .req v28 .text - .global MLKEM_NAMESPACE(ntt_asm_clean) - .global _MLKEM_NAMESPACE(ntt_asm_clean) + .global MLKEM_ASM_NAMESPACE(ntt_asm_clean) +/* Literal pool */ .p2align 4 -const_addr: +c_consts: .short 3329 .short 20159 .short 0 @@ -196,12 +179,9 @@ const_addr: .short 0 .short 0 -MLKEM_NAMESPACE(ntt_asm_clean): -_MLKEM_NAMESPACE(ntt_asm_clean): +MLKEM_ASM_NAMESPACE(ntt_asm_clean): push_stack - ASM_LOAD(xtmp, const_addr) - - ld1 {consts.8h}, [xtmp] + ldr q_consts, c_consts mov inp, in mov count, #4 diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/ntt_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/ntt_opt.S index b8a22ecfc..15103a595 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/ntt_opt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/ntt_opt.S @@ -27,10 +27,6 @@ #include "common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) -// Needed to provide ASM_LOAD directive -#include "common.i" -#include "params.h" - // Bounds: // If C is chosen so that |src| < q * C, then |dst| < q * (0.0508 * C + 1/2) @@ -64,12 +60,6 @@ add \a\().8h, \a\().8h, tmp.8h .endm -.macro barrett_reduce a - sqdmulh t0.8h, \a\().8h, consts.h[1] - srshr t0.8h, t0.8h, #11 - mls \a\().8h, t0.8h, consts.h[0] -.endm - .macro 
load_roots_012 ldr q_root0, [r01234_ptr], #32 ldr q_root1, [r01234_ptr, #-16] @@ -100,13 +90,6 @@ trn1 \data\()1.2d, t1.2d, t3.2d .endm -.macro transpose_single data_out, data_in - trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s - trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s - trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s - trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s -.endm - .macro save_vregs sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] @@ -182,11 +165,11 @@ t3 .req v28 .text - .global MLKEM_NAMESPACE(ntt_asm_opt) - .global _MLKEM_NAMESPACE(ntt_asm_opt) + .global MLKEM_ASM_NAMESPACE(ntt_asm_opt) +/* Literal pool */ .p2align 4 -const_addr: +c_consts: .short 3329 .short 20159 .short 0 @@ -196,12 +179,9 @@ const_addr: .short 0 .short 0 -MLKEM_NAMESPACE(ntt_asm_opt): -_MLKEM_NAMESPACE(ntt_asm_opt): +MLKEM_ASM_NAMESPACE(ntt_asm_opt): push_stack - ASM_LOAD(xtmp, const_addr) - - ld1 {consts.8h}, [xtmp] + ldr q_consts, c_consts mov inp, in mov count, #4 diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/opt_impl.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/opt_impl.h index 5e4d00e1f..b22674026 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/opt_impl.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/opt_impl.h @@ -75,7 +75,7 @@ static INLINE int rej_uniform_native(int16_t *r, unsigned int len, { return -1; } - return (int)rej_uniform_asm_clean(r, buf, buflen); + return (int)rej_uniform_asm_clean(r, buf, buflen, rej_uniform_table); } #endif /* MLKEM_NATIVE_ARITH_PROFILE_IMPL_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/poly_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/poly_clean.S index 3e1bc5cf4..f70a40221 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/poly_clean.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/poly_clean.S @@ -6,65 +6,85 @@ #include
"common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) -// Needed to provide ASM_LOAD directive -#include "common.i" -#include "params.h" +/* We use a single literal pool for all functions in this file. + * This is OK even when the file gets expanded through SLOTHY, + * since PC-relative offsets are up to 1MB in AArch64. + * + * The use of dup8h to build constant vectors in memory + * is slightly wasteful and could be avoided with a GPR-load + * followed by Neon `dup`, but we're ultimately only talking + * about 64 bytes, so it seems OK. + */ -.macro barrett_reduce a - sqdmulh t0.8h, \a\().8h, consts.h[1] - srshr t0.8h, t0.8h, #11 - mls \a\().8h, t0.8h, consts.h[0] +.macro dup8h c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c .endm -.macro scalar_signed_to_unsigned a - sshr mask.8h, \a\().8h, #15 - and mask.16b, modulus.16b, mask.16b - add \a\().8h, \a\().8h, mask.8h +.p2align 4 +c_modulus: dup8h 3329 // ML-KEM modulus +c_modulus_twisted: dup8h 20159 // Barrett twist of 1 wrt 2^27 +c_mont_constant: dup8h -1044 // 2^16 % 3329 +c_barrett_twist: dup8h -10276 // Barrett twist of -1044 (wrt 2^16) + +/* + * Some modular arithmetic macros + */ + +/* Barrett reduction */ +.macro barrett_reduce a + sqdmulh tmp.8h, \a\().8h, modulus_twisted.h[0] + srshr tmp.8h, tmp.8h, #11 + mls \a\().8h, tmp.8h, modulus.h[0] .endm +/* Montgomery multiplication, with precomputed Montgomery twist + * Expects modulus in modulus.h[0]. */ .macro mulmod dst, src, const, const_twisted sqrdmulh tmp0.8h, \src\().8h, \const_twisted\().8h mul \dst\().8h, \src\().8h, \const\().8h - mls \dst\().8h, tmp0.8h, consts.h[0] + mls \dst\().8h, tmp0.8h, modulus.h[0] +.endm + +/* Turns signed-canonical to unsigned canonical representative + * through conditional addition of the modulus. + * + * Expected modulus in `modulus`.
*/ +.macro scalar_signed_to_unsigned a + sshr mask.8h, \a\().8h, #15 + and mask.16b, modulus.16b, mask.16b + add \a\().8h, \a\().8h, mask.8h .endm /********************************** * poly_reduce() * **********************************/ -.global MLKEM_NAMESPACE(poly_reduce_asm_clean) -.global _MLKEM_NAMESPACE(poly_reduce_asm_clean) -.p2align 4 -const_addr: - .short 3329 // ML-KEM modulus - .short 20159 // Barrett twist of 1 wrt 2^27 - .short -1044 // 2^16 % 3329 - .short -10276 // Barrett twist of -1044 (wrt 2^16) - .short 0 - .short 0 - .short 0 - .short 0 - - ptr .req x0 - count .req x1 - xtmp .req x2 - wmodulus .req w3 - - q_data .req q0 - data .req v0 - t0 .req v1 - consts .req v2 - mask .req v3 - modulus .req v4 - -MLKEM_NAMESPACE(poly_reduce_asm_clean): -_MLKEM_NAMESPACE(poly_reduce_asm_clean): - - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - - mov wmodulus, #3329 - dup modulus.8h, wmodulus +.global MLKEM_ASM_NAMESPACE(poly_reduce_asm_clean) + + ptr .req x0 + count .req x1 + + data .req v0 + q_data .req q0 + + tmp .req v1 + mask .req v2 + modulus .req v3 + q_modulus .req q3 + modulus_twisted .req v4 + q_modulus_twisted .req q4 + +MLKEM_ASM_NAMESPACE(poly_reduce_asm_clean): + + ldr q_modulus, c_modulus + ldr q_modulus_twisted, c_modulus_twisted mov count, #8 loop_start: @@ -95,57 +115,64 @@ loop_start: .unreq ptr .unreq count - .unreq xtmp - .unreq q_data + .unreq data - .unreq t0 - .unreq consts + .unreq q_data + + .unreq tmp .unreq mask - .unreq wmodulus .unreq modulus + .unreq q_modulus + .unreq modulus_twisted + .unreq q_modulus_twisted /******************************************** * poly_mulcache_compute() * ********************************************/ -.global MLKEM_NAMESPACE(poly_mulcache_compute_asm_clean) -.global _MLKEM_NAMESPACE(poly_mulcache_compute_asm_clean) - - cache_ptr .req x0 - data_ptr .req x1 - zeta_ptr .req x2 - zeta_twisted_ptr .req x3 - xtmp .req x4 - - count .req x5 - - data_odd .req v1 - - zeta .req v2 - zeta_twisted .req v3 
- q_zeta .req q2 - q_zeta_twisted .req q3 - - q_tmp0 .req q4 - q_tmp1 .req q5 - tmp0 .req v4 - tmp1 .req v5 - consts .req v6 - dst .req v7 - q_dst .req q7 - -MLKEM_NAMESPACE(poly_mulcache_compute_asm_clean): -_MLKEM_NAMESPACE(poly_mulcache_compute_asm_clean): - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - - mov count, #(16) + +.global MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_clean) + + cache_ptr .req x0 + data_ptr .req x1 + zeta_ptr .req x2 + zeta_twisted_ptr .req x3 + count .req x4 + + data_odd .req v0 + zeta .req v1 + q_zeta .req q1 + zeta_twisted .req v2 + q_zeta_twisted .req q2 + + tmp0 .req v3 + q_tmp0 .req q3 + tmp1 .req v4 + q_tmp1 .req q4 + dst .req v5 + q_dst .req q5 + + modulus .req v6 + q_modulus .req q6 + modulus_twisted .req v7 + q_modulus_twisted .req q7 + +MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_clean): + ldr q_modulus, c_modulus + ldr q_modulus_twisted, c_modulus_twisted + + mov count, #16 mulcache_compute_loop_start: ldr q_tmp0, [data_ptr], #32 ldr q_tmp1, [data_ptr, #-16] ldr q_zeta, [zeta_ptr], #16 ldr q_zeta_twisted, [zeta_twisted_ptr], #16 + + // The mulcache of a polynomial a + b*X in Fq[X^2-zeta] is b*zeta; + // Since tmp0 || tmp1 represents multiple such polynomails as + // (a0,b0,a1,b1,...), extract only the odd elements. 
uzp2 data_odd.8h, tmp0.8h, tmp1.8h mulmod dst, data_odd, zeta, zeta_twisted + str q_dst, [cache_ptr], #16 subs count, count, #1 @@ -156,7 +183,7 @@ mulcache_compute_loop_start: .unreq cache_ptr .unreq data_ptr .unreq zeta_ptr - .unreq xtmp + .unreq zeta_twisted_ptr .unreq count .unreq data_odd @@ -165,34 +192,35 @@ mulcache_compute_loop_start: .unreq zeta_twisted .unreq q_zeta_twisted - .unreq q_tmp0 - .unreq q_tmp1 .unreq tmp0 + .unreq q_tmp0 .unreq tmp1 - .unreq consts + .unreq q_tmp1 .unreq dst + .unreq q_dst + + .unreq modulus + .unreq q_modulus + .unreq modulus_twisted + .unreq q_modulus_twisted /******************************************** * poly_tobytes() * ********************************************/ -.global MLKEM_NAMESPACE(poly_tobytes_asm_clean) -.global _MLKEM_NAMESPACE(poly_tobytes_asm_clean) +.global MLKEM_ASM_NAMESPACE(poly_tobytes_asm_clean) data0 .req v0 data1 .req v1 - out0 .req v2 out1 .req v3 out2 .req v4 - - tmp .req v5 + tmp .req v5 dst .req x0 src .req x1 count .req x2 -MLKEM_NAMESPACE(poly_tobytes_asm_clean): -_MLKEM_NAMESPACE(poly_tobytes_asm_clean): +MLKEM_ASM_NAMESPACE(poly_tobytes_asm_clean): mov count, #16 poly_tobytes_asm_clean_asm_loop_start: @@ -229,37 +257,33 @@ poly_tobytes_asm_clean_asm_loop_start: /********************************** * poly_tomont() * **********************************/ -.global MLKEM_NAMESPACE(poly_tomont_asm_clean) -.global _MLKEM_NAMESPACE(poly_tomont_asm_clean) - - src .req x0 - count .req x1 - xtmp .req x2 - - wtmp2 .req w3 +.global MLKEM_ASM_NAMESPACE(poly_tomont_asm_clean) - data .req v0 - q_data .req q0 - res .req v1 - q_res .req q1 + src .req x0 + count .req x1 - consts .req v2 + data .req v0 + q_data .req q0 + res .req v1 + q_res .req q1 - factor .req v3 - factor_t .req v4 + factor .req v2 + q_factor .req q2 + factor_t .req v3 + q_factor_t .req q3 + modulus .req v4 + q_modulus .req q4 + modulus_twisted .req v5 + q_modulus_twisted .req q5 - tmp0 .req v5 + tmp0 .req v6 
-MLKEM_NAMESPACE(poly_tomont_asm_clean): -_MLKEM_NAMESPACE(poly_tomont_asm_clean): +MLKEM_ASM_NAMESPACE(poly_tomont_asm_clean): - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - - ldrsh wtmp2, [xtmp, #(2*2)] - dup factor.8h, wtmp2 - ldrh wtmp2, [xtmp, #(3*2)] - dup factor_t.8h, wtmp2 + ldr q_modulus, c_modulus + ldr q_modulus_twisted, c_modulus_twisted + ldr q_factor, c_mont_constant + ldr q_factor_t, c_barrett_twist mov count, #8 poly_tomont_asm_loop: @@ -285,4 +309,23 @@ poly_tomont_asm_loop: ret + .unreq src + .unreq count + + .unreq data + .unreq q_data + .unreq res + .unreq q_res + + .unreq factor + .unreq q_factor + .unreq factor_t + .unreq q_factor_t + .unreq modulus + .unreq q_modulus + .unreq modulus_twisted + .unreq q_modulus_twisted + + .unreq tmp0 + #endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/poly_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/poly_opt.S index df3b21008..e58ee77c4 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/poly_opt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/poly_opt.S @@ -6,117 +6,137 @@ #include "common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) -// Needed to provide ASM_LOAD directive -#include "common.i" -#include "params.h" +/* We use a single literal pool for all functions in this file. + * This is OK even when the file gets expanded through SLOTHY, + * since PC-relative offsets are up to 1MB in AArch64. + * + * The use of dup8h to build constant vectors in memory + * is slightly wasteful and could be avoided with a GPR-load + * followed by Neon `dup`, but we're ultimately only talking + * about 64 bytes, so it seems OK.
+ */ -.macro barrett_reduce a - sqdmulh t0.8h, \a\().8h, consts.h[1] - srshr t0.8h, t0.8h, #11 - mls \a\().8h, t0.8h, consts.h[0] +.macro dup8h c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c .endm -.macro scalar_signed_to_unsigned a - sshr mask.8h, \a\().8h, #15 - and mask.16b, modulus.16b, mask.16b - add \a\().8h, \a\().8h, mask.8h +.p2align 4 +c_modulus: dup8h 3329 // ML-KEM modulus +c_modulus_twisted: dup8h 20159 // Barrett twist of 1 wrt 2^27 +c_mont_constant: dup8h -1044 // 2^16 % 3329 +c_barrett_twist: dup8h -10276 // Barrett twist of -1044 (wrt 2^16) + +/* + * Some modular arithmetic macros + */ + +/* Barrett reduction */ +.macro barrett_reduce a + sqdmulh tmp.8h, \a\().8h, modulus_twisted.h[0] + srshr tmp.8h, tmp.8h, #11 + mls \a\().8h, tmp.8h, modulus.h[0] .endm +/* Montgomery multiplication, with precomputed Montgomery twist + * Expects modulus in modulus.h[0]. */ .macro mulmod dst, src, const, const_twisted sqrdmulh tmp0.8h, \src\().8h, \const_twisted\().8h mul \dst\().8h, \src\().8h, \const\().8h - mls \dst\().8h, tmp0.8h, consts.h[0] + mls \dst\().8h, tmp0.8h, modulus.h[0] +.endm + +/* Turns signed-canonical to unsigned canonical representative + * through conditional addition of the modulus. + * + * Expected modulus in `modulus`.
*/ +.macro scalar_signed_to_unsigned a + sshr mask.8h, \a\().8h, #15 + and mask.16b, modulus.16b, mask.16b + add \a\().8h, \a\().8h, mask.8h .endm /********************************** * poly_reduce() * **********************************/ -.global MLKEM_NAMESPACE(poly_reduce_asm_opt) -.global _MLKEM_NAMESPACE(poly_reduce_asm_opt) -.p2align 4 -const_addr: - .short 3329 // ML-KEM modulus - .short 20159 // Barrett twist of 1 wrt 2^27 - .short -1044 // 2^16 % 3329 - .short -10276 // Barrett twist of -1044 (wrt 2^16) - .short 0 - .short 0 - .short 0 - .short 0 - - ptr .req x0 - count .req x1 - xtmp .req x2 - wmodulus .req w3 - - q_data .req q0 - data .req v0 - t0 .req v1 - consts .req v2 - mask .req v3 - modulus .req v4 - -MLKEM_NAMESPACE(poly_reduce_asm_opt): -_MLKEM_NAMESPACE(poly_reduce_asm_opt): - - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - - mov wmodulus, #3329 - dup modulus.8h, wmodulus +.global MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt) + + ptr .req x0 + count .req x1 + + data .req v0 + q_data .req q0 + + tmp .req v1 + mask .req v2 + modulus .req v3 + q_modulus .req q3 + modulus_twisted .req v4 + q_modulus_twisted .req q4 + +MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt): + + ldr q_modulus, c_modulus + ldr q_modulus_twisted, c_modulus_twisted mov count, #8 - // Instructions: 15 - // Expected cycles: 22 - // Expected IPC: 0.68 + // Instructions: 15 + // Expected cycles: 22 + // Expected IPC: 0.68 - // Cycle bound: 22.0 - // IPC bound: 0.68 + // Cycle bound: 22.0 + // IPC bound: 0.68 - // Wall time: 0.04s - // User time: 0.04s + // Wall time: 0.05s + // User time: 0.05s - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldr q21, [x0, #48] // *............................. - ldr q3, [x0, #32] // ..*........................... - sqdmulh v6.8H, v21.8H, v2.H[1] // ....*......................... - sqdmulh v30.8H, v3.8H, v2.H[1] // ......*....................... - srshr v6.8H, v6.8H, #11 // ........*..................... 
- srshr v30.8H, v30.8H, #11 // ..........*................... - mls v21.8H, v6.8H, v2.H[0] // ...........*.................. - mls v3.8H, v30.8H, v2.H[0] // .............*................ - ldr q22, [x0], #64 // ..............*............... - sshr v6.8H, v21.8H, #15 // ................*............. - sshr v30.8H, v3.8H, #15 // .................*............ - and v6.16B, v4.16B, v6.16B // ..................*........... - add v6.8H, v21.8H, v6.8H // ...................*.......... - and v21.16B, v4.16B, v30.16B // ....................*......... - add v21.8H, v3.8H, v21.8H // .....................*........ + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr q21, [x0, #32] // *............................. + ldr q23, [x0, #48] // ..*........................... + sqdmulh v7.8H, v21.8H, v4.H[0] // ....*......................... + sqdmulh v30.8H, v23.8H, v4.H[0] // ......*....................... + srshr v7.8H, v7.8H, #11 // ........*..................... + srshr v30.8H, v30.8H, #11 // ..........*................... + mls v21.8H, v7.8H, v3.H[0] // ...........*.................. + mls v23.8H, v30.8H, v3.H[0] // .............*................ + ldr q5, [x0, #16] // ..............*............... + sshr v7.8H, v21.8H, #15 // ................*............. + sshr v30.8H, v23.8H, #15 // .................*............ + and v7.16B, v3.16B, v7.16B // ..................*........... + add v21.8H, v21.8H, v7.8H // ...................*.......... + and v7.16B, v3.16B, v30.16B // ....................*......... + add v16.8H, v23.8H, v7.8H // .....................*........ // ------ cycle (expected) ------> // 0 25 // |------------------------|----- - // ldr q31, [x0, #48] // *.............................. - // sqdmulh v24.8H, v31.8H, v2.H[1] // ....*.......................... - // srshr v6.8H, v24.8H, #11 // ........*...................... - // mls v31.8H, v6.8H, v2.H[0] // ...........*................... 
- // ldr q30, [x0, #32] // ..*............................ - // sshr v16.8H, v31.8H, #15 // ................*.............. - // and v28.16B, v4.16B, v16.16B // ..................*............ - // sqdmulh v23.8H, v30.8H, v2.H[1] // ......*........................ - // add v6.8H, v31.8H, v28.8H // ...................*........... - // srshr v18.8H, v23.8H, #11 // ..........*.................... - // mls v30.8H, v18.8H, v2.H[0] // .............*................. - // ldr q22, [x0], #64 // ..............*................ - // sshr v0.8H, v30.8H, #15 // .................*............. - // and v29.16B, v4.16B, v0.16B // ....................*.......... - // add v21.8H, v30.8H, v29.8H // .....................*......... + // ldr q30, [x0, #32] // *.............................. + // sqdmulh v22.8H, v30.8H, v4.H[0] // ....*.......................... + // ldr q2, [x0, #48] // ..*............................ + // srshr v19.8H, v22.8H, #11 // ........*...................... + // mls v30.8H, v19.8H, v3.H[0] // ...........*................... + // sqdmulh v25.8H, v2.8H, v4.H[0] // ......*........................ + // sshr v31.8H, v30.8H, #15 // ................*.............. + // srshr v25.8H, v25.8H, #11 // ..........*.................... + // and v18.16B, v3.16B, v31.16B // ..................*............ + // mls v2.8H, v25.8H, v3.H[0] // .............*................. + // add v21.8H, v30.8H, v18.8H // ...................*........... + // ldr q5, [x0, #16] // ..............*................ + // sshr v18.8H, v2.8H, #15 // .................*............. + // and v27.16B, v3.16B, v18.16B // ....................*.......... + // add v16.8H, v2.8H, v27.8H // .....................*......... 
sub count, count, #1 -loop_start: +1: // Instructions: 32 // Expected cycles: 36 // Expected IPC: 0.89 @@ -130,77 +150,77 @@ loop_start: // -------- cycle (expected) ---------> // 0 25 // |------------------------|---------- - sqdmulh v0.8H, v22.8H, v2.H[1] // *................................... - str q6, [x0, #-16] // .*.................................. - ldr q31, [x0, #48] // ..e................................. - srshr v5.8H, v0.8H, #11 // ....*............................... - str q21, [x0, #-32] // .....*.............................. - sqdmulh v24.8H, v31.8H, v2.H[1] // ......e............................. - mls v22.8H, v5.8H, v2.H[0] // .......*............................ - ldr q20, [x0, #-48] // ........*........................... - srshr v6.8H, v24.8H, #11 // ..........e......................... - sshr v0.8H, v22.8H, #15 // ...........*........................ - sqdmulh v21.8H, v20.8H, v2.H[1] // ............*....................... - mls v31.8H, v6.8H, v2.H[0] // .............e...................... - ldr q30, [x0, #32] // ..............e..................... - srshr v23.8H, v21.8H, #11 // ................*................... - and v19.16B, v4.16B, v0.16B // .................*.................. - sshr v16.8H, v31.8H, #15 // ..................e................. - mls v20.8H, v23.8H, v2.H[0] // ...................*................ - and v28.16B, v4.16B, v16.16B // ....................e............... - sqdmulh v23.8H, v30.8H, v2.H[1] // .....................e.............. - add v6.8H, v31.8H, v28.8H // ......................e............. - add v31.8H, v22.8H, v19.8H // .......................*............ - sshr v17.8H, v20.8H, #15 // ........................*........... - srshr v18.8H, v23.8H, #11 // .........................e.......... - and v26.16B, v4.16B, v17.16B // ..........................*......... - add v7.8H, v20.8H, v26.8H // ...........................*........ - mls v30.8H, v18.8H, v2.H[0] // ............................e....... 
- ldr q22, [x0], #64 // .............................e...... - str q7, [x0, #-112] // ...............................*.... - sshr v0.8H, v30.8H, #15 // ................................e... - str q31, [x0, #-128] // .................................*.. - and v29.16B, v4.16B, v0.16B // ..................................e. - add v21.8H, v30.8H, v29.8H // ...................................e - - // ------------------------ cycle (expected) -------------------------> - // 0 25 50 - // |------------------------|------------------------|----------------- - // ldr q0, [x0], #64 // ...........................e......'............................~.... - // sqdmulh v1.8h, v0.8h, v2.h[1] // ..................................*................................. - // srshr v1.8h, v1.8h, #11 // ..~...............................'...*............................. - // mls v0.8h, v1.8h, v2.h[0] // .....~............................'......*.......................... - // sshr v3.8h, v0.8h, #15 // .........~........................'..........*...................... - // and v3.16b, v4.16b, v3.16b // ...............~..................'................*................ - // add v0.8h, v0.8h, v3.8h // .....................~............'......................*.......... - // str q0, [x0, #-64] // ...............................~..'................................* - // ldr q0, [x0, #-48] // ......~...........................'.......*......................... - // sqdmulh v1.8h, v0.8h, v2.h[1] // ..........~.......................'...........*..................... - // srshr v1.8h, v1.8h, #11 // ..............~...................'...............*................. - // mls v0.8h, v1.8h, v2.h[0] // .................~................'..................*.............. - // sshr v3.8h, v0.8h, #15 // ......................~...........'.......................*......... - // and v3.16b, v4.16b, v3.16b // ........................~.........'.........................*....... 
- // add v0.8h, v0.8h, v3.8h // .........................~........'..........................*...... - // str q0, [x0, #-48] // .............................~....'..............................*.. - // ldr q0, [x0, #-32] // ............e.....................'.............~................... - // sqdmulh v1.8h, v0.8h, v2.h[1] // ...................e..............'....................~............ - // srshr v1.8h, v1.8h, #11 // .......................e..........'........................~........ - // mls v0.8h, v1.8h, v2.h[0] // ..........................e.......'...........................~..... - // sshr v3.8h, v0.8h, #15 // ..............................e...'...............................~. - // and v3.16b, v4.16b, v3.16b // ................................e.'................................. - // add v0.8h, v0.8h, v3.8h // .................................e'................................. - // str q0, [x0, #-32] // ...~..............................'....*............................ - // ldr q0, [x0, #-16] // e.................................'.~............................... - // sqdmulh v1.8h, v0.8h, v2.h[1] // ....e.............................'.....~........................... - // srshr v1.8h, v1.8h, #11 // ........e.........................'.........~....................... - // mls v0.8h, v1.8h, v2.h[0] // ...........e......................'............~.................... - // sshr v3.8h, v0.8h, #15 // ................e.................'.................~............... - // and v3.16b, v4.16b, v3.16b // ..................e...............'...................~............. - // add v0.8h, v0.8h, v3.8h // ....................e.............'.....................~........... - // str q0, [x0, #-16] // ..................................'*................................ - - sub count, count, #1 - cbnz count, loop_start + ldr q6, [x0], #64 // *................................... + ldr q30, [x0, #32] // ..e................................. 
+ sqdmulh v31.8H, v6.8H, v4.H[0] // ....*............................... + sqdmulh v29.8H, v5.8H, v4.H[0] // .....*.............................. + sqdmulh v22.8H, v30.8H, v4.H[0] // ......e............................. + str q16, [x0, #-16] // .......*............................ + srshr v20.8H, v31.8H, #11 // ........*........................... + srshr v28.8H, v29.8H, #11 // .........*.......................... + str q21, [x0, #-32] // ..........*......................... + mls v6.8H, v20.8H, v3.H[0] // ...........*........................ + mls v5.8H, v28.8H, v3.H[0] // ............*....................... + ldr q2, [x0, #48] // .............e...................... + sshr v31.8H, v6.8H, #15 // ...............*.................... + srshr v19.8H, v22.8H, #11 // ................e................... + and v22.16B, v3.16B, v31.16B // .................*.................. + add v0.8H, v6.8H, v22.8H // ..................*................. + mls v30.8H, v19.8H, v3.H[0] // ...................e................ + sshr v26.8H, v5.8H, #15 // ....................*............... + sqdmulh v25.8H, v2.8H, v4.H[0] // .....................e.............. + and v17.16B, v3.16B, v26.16B // ......................*............. + add v1.8H, v5.8H, v17.8H // .......................*............ + sshr v31.8H, v30.8H, #15 // ........................e........... + srshr v25.8H, v25.8H, #11 // .........................e.......... + str q1, [x0, #-48] // ..........................*......... + and v18.16B, v3.16B, v31.16B // ...........................e........ + mls v2.8H, v25.8H, v3.H[0] // ............................e....... + add v21.8H, v30.8H, v18.8H // .............................e...... + ldr q5, [x0, #16] // ..............................e..... + sshr v18.8H, v2.8H, #15 // ................................e... + str q0, [x0, #-64] // .................................*.. + and v27.16B, v3.16B, v18.16B // ..................................e. 
+ add v16.8H, v2.8H, v27.8H // ...................................e + + // ------------------------ cycle (expected) -------------------------> + // 0 25 50 + // |------------------------|------------------------|----------------- + // ldr q0, [x0], #64 // ..................................*................................. + // sqdmulh v1.8h, v0.8h, v4.h[0] // ..~...............................'...*............................. + // srshr v1.8h, v1.8h, #11 // ......~...........................'.......*......................... + // mls v0.8h, v1.8h, v3.h[0] // .........~........................'..........*...................... + // sshr v2.8h, v0.8h, #15 // .............~....................'..............*.................. + // and v2.16b, v3.16b, v2.16b // ...............~..................'................*................ + // add v0.8h, v0.8h, v2.8h // ................~.................'.................*............... + // str q0, [x0, #-64] // ...............................~..'................................* + // ldr q0, [x0, #-48] // ............................e.....'.............................~... + // sqdmulh v1.8h, v0.8h, v4.h[0] // ...~..............................'....*............................ + // srshr v1.8h, v1.8h, #11 // .......~..........................'........*........................ + // mls v0.8h, v1.8h, v3.h[0] // ..........~.......................'...........*..................... + // sshr v2.8h, v0.8h, #15 // ..................~...............'...................*............. + // and v2.16b, v3.16b, v2.16b // ....................~.............'.....................*........... + // add v0.8h, v0.8h, v2.8h // .....................~............'......................*.......... + // str q0, [x0, #-48] // ........................~.........'.........................*....... + // ldr q0, [x0, #-32] // e.................................'.~............................... 
+ // sqdmulh v1.8h, v0.8h, v4.h[0] // ....e.............................'.....~........................... + // srshr v1.8h, v1.8h, #11 // ..............e...................'...............~................. + // mls v0.8h, v1.8h, v3.h[0] // .................e................'..................~.............. + // sshr v2.8h, v0.8h, #15 // ......................e...........'.......................~......... + // and v2.16b, v3.16b, v2.16b // .........................e........'..........................~...... + // add v0.8h, v0.8h, v2.8h // ...........................e......'............................~.... + // str q0, [x0, #-32] // ........~.........................'.........*....................... + // ldr q0, [x0, #-16] // ...........e......................'............~.................... + // sqdmulh v1.8h, v0.8h, v4.h[0] // ...................e..............'....................~............ + // srshr v1.8h, v1.8h, #11 // .......................e..........'........................~........ + // mls v0.8h, v1.8h, v3.h[0] // ..........................e.......'...........................~..... + // sshr v2.8h, v0.8h, #15 // ..............................e...'...............................~. + // and v2.16b, v3.16b, v2.16b // ................................e.'................................. + // add v0.8h, v0.8h, v2.8h // .................................e'................................. + // str q0, [x0, #-16] // .....~............................'......*.......................... + + sub count, count, 1 + cbnz count, 1b // Instructions: 17 // Expected cycles: 23 // Expected IPC: 0.74 @@ -214,94 +234,96 @@ loop_start: // ----- cycle (expected) ------> // 0 25 // |------------------------|---- - ldr q29, [x0, #-48] // *............................. - sqdmulh v1.8H, v22.8H, v2.H[1] // ..*........................... + sqdmulh v20.8H, v5.8H, v4.H[0] // *............................. + ldr q24, [x0], #64 // .*............................ 
str q21, [x0, #-32] // ...*.......................... - sqdmulh v25.8H, v29.8H, v2.H[1] // ....*......................... - str q6, [x0, #-16] // .....*........................ - srshr v30.8H, v1.8H, #11 // ......*....................... - srshr v20.8H, v25.8H, #11 // ........*..................... - mls v22.8H, v30.8H, v2.H[0] // .........*.................... - mls v29.8H, v20.8H, v2.H[0] // ...........*.................. - sshr v24.8H, v22.8H, #15 // .............*................ - and v31.16B, v4.16B, v24.16B // ...............*.............. - sshr v24.8H, v29.8H, #15 // ................*............. - add v3.8H, v22.8H, v31.8H // .................*............ - and v17.16B, v4.16B, v24.16B // ..................*........... - add v25.8H, v29.8H, v17.8H // ...................*.......... - str q3, [x0, #-64] // ....................*......... - str q25, [x0, #-48] // ......................*....... + srshr v20.8H, v20.8H, #11 // ....*......................... + sqdmulh v25.8H, v24.8H, v4.H[0] // .....*........................ + str q16, [x0, #-16] // ......*....................... + mls v5.8H, v20.8H, v3.H[0] // .......*...................... + srshr v20.8H, v25.8H, #11 // .........*.................... + sshr v2.8H, v5.8H, #15 // ...........*.................. + mls v24.8H, v20.8H, v3.H[0] // ............*................. + and v20.16B, v3.16B, v2.16B // .............*................ + add v31.8H, v5.8H, v20.8H // ..............*............... + sshr v20.8H, v24.8H, #15 // ................*............. + str q31, [x0, #-48] // .................*............ + and v31.16B, v3.16B, v20.16B // ..................*........... + add v24.8H, v24.8H, v31.8H // ...................*.......... + str q24, [x0, #-64] // ......................*....... - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // sqdmulh v0.8H, v22.8H, v2.H[1] // ..*............................ - // str q6, [x0, #-16] // .....*......................... 
- // srshr v5.8H, v0.8H, #11 // ......*........................ - // str q21, [x0, #-32] // ...*........................... - // mls v22.8H, v5.8H, v2.H[0] // .........*..................... - // ldr q20, [x0, #-48] // *.............................. - // sshr v0.8H, v22.8H, #15 // .............*................. - // sqdmulh v21.8H, v20.8H, v2.H[1] // ....*.......................... - // srshr v23.8H, v21.8H, #11 // ........*...................... - // and v19.16B, v4.16B, v0.16B // ...............*............... - // mls v20.8H, v23.8H, v2.H[0] // ...........*................... - // add v31.8H, v22.8H, v19.8H // .................*............. - // sshr v17.8H, v20.8H, #15 // ................*.............. - // and v26.16B, v4.16B, v17.16B // ..................*............ - // add v7.8H, v20.8H, v26.8H // ...................*........... - // str q7, [x0, #-48] // ......................*........ - // str q31, [x0, #-64] // ....................*.......... + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr q6, [x0], #64 // .*............................. + // sqdmulh v31.8H, v6.8H, v4.H[0] // .....*......................... + // sqdmulh v29.8H, v5.8H, v4.H[0] // *.............................. + // str q16, [x0, #-16] // ......*........................ + // srshr v20.8H, v31.8H, #11 // .........*..................... + // srshr v28.8H, v29.8H, #11 // ....*.......................... + // str q21, [x0, #-32] // ...*........................... + // mls v6.8H, v20.8H, v3.H[0] // ............*.................. + // mls v5.8H, v28.8H, v3.H[0] // .......*....................... + // sshr v31.8H, v6.8H, #15 // ................*.............. + // and v22.16B, v3.16B, v31.16B // ..................*............ + // add v0.8H, v6.8H, v22.8H // ...................*........... + // sshr v26.8H, v5.8H, #15 // ...........*................... + // and v17.16B, v3.16B, v26.16B // .............*................. 
+ // add v1.8H, v5.8H, v17.8H // ..............*................ + // str q1, [x0, #-48] // .................*............. + // str q0, [x0, #-64] // ......................*........ ret .unreq ptr .unreq count - .unreq xtmp - .unreq q_data + .unreq data - .unreq t0 - .unreq consts + .unreq q_data + + .unreq tmp .unreq mask - .unreq wmodulus .unreq modulus + .unreq q_modulus + .unreq modulus_twisted + .unreq q_modulus_twisted /******************************************** * poly_mulcache_compute() * ********************************************/ -.global MLKEM_NAMESPACE(poly_mulcache_compute_asm_opt) -.global _MLKEM_NAMESPACE(poly_mulcache_compute_asm_opt) - - cache_ptr .req x0 - data_ptr .req x1 - zeta_ptr .req x2 - zeta_twisted_ptr .req x3 - xtmp .req x4 - - count .req x5 - - data_odd .req v1 - - zeta .req v2 - zeta_twisted .req v3 - q_zeta .req q2 - q_zeta_twisted .req q3 - - q_tmp0 .req q4 - q_tmp1 .req q5 - tmp0 .req v4 - tmp1 .req v5 - consts .req v6 - dst .req v7 - q_dst .req q7 - -MLKEM_NAMESPACE(poly_mulcache_compute_asm_opt): -_MLKEM_NAMESPACE(poly_mulcache_compute_asm_opt): - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - - mov count, #(16) + +.global MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt) + + cache_ptr .req x0 + data_ptr .req x1 + zeta_ptr .req x2 + zeta_twisted_ptr .req x3 + count .req x4 + + data_odd .req v0 + zeta .req v1 + q_zeta .req q1 + zeta_twisted .req v2 + q_zeta_twisted .req q2 + + tmp0 .req v3 + q_tmp0 .req q3 + tmp1 .req v4 + q_tmp1 .req q4 + dst .req v5 + q_dst .req q5 + + modulus .req v6 + q_modulus .req q6 + modulus_twisted .req v7 + q_modulus_twisted .req q7 + +MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt): + ldr q_modulus, c_modulus + ldr q_modulus_twisted, c_modulus_twisted + + mov count, #16 // Instructions: 7 // Expected cycles: 12 // Expected IPC: 0.58 @@ -315,65 +337,65 @@ _MLKEM_NAMESPACE(poly_mulcache_compute_asm_opt): // ----- cycle (expected) ------> // 0 25 // |------------------------|---- - ldr q3, 
[x1, #16] // *............................. - ldr q1, [x1], #32 // ..*........................... - ldr q26, [x3], #16 // ....*......................... - uzp2 v30.8H, v1.8H, v3.8H // ......*....................... - ldr q21, [x2], #16 // .......*...................... - sqrdmulh v3.8H, v30.8H, v26.8H // .........*.................... - mul v26.8H, v30.8H, v21.8H // ...........*.................. - - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldr q30, [x1, #16] // *.............................. - // ldr q21, [x1], #32 // ..*............................ - // ldr q3, [x3], #16 // ....*.......................... - // uzp2 v24.8H, v21.8H, v30.8H // ......*........................ - // ldr q21, [x2], #16 // .......*....................... - // sqrdmulh v3.8H, v24.8H, v3.8H // .........*..................... - // mul v26.8H, v24.8H, v21.8H // ...........*................... + ldr q1, [x1, #16] // *............................. + ldr q27, [x1], #32 // ..*........................... + ldr q23, [x2], #16 // ....*......................... + uzp2 v27.8H, v27.8H, v1.8H // ......*....................... + ldr q1, [x3], #16 // .......*...................... + mul v2.8H, v27.8H, v23.8H // .........*.................... + sqrdmulh v27.8H, v27.8H, v1.8H // ...........*.................. + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr q29, [x1, #16] // *.............................. + // ldr q21, [x2], #16 // ....*.......................... + // ldr q27, [x1], #32 // ..*............................ + // ldr q7, [x3], #16 // .......*....................... + // uzp2 v28.8H, v27.8H, v29.8H // ......*........................ + // mul v2.8H, v28.8H, v21.8H // .........*..................... + // sqrdmulh v27.8H, v28.8H, v7.8H // ...........*................... 
sub count, count, #1 -mulcache_compute_loop_start: - // Instructions: 9 - // Expected cycles: 13 - // Expected IPC: 0.69 +1: + // Instructions: 9 + // Expected cycles: 13 + // Expected IPC: 0.69 - // Cycle bound: 13.0 - // IPC bound: 0.69 + // Cycle bound: 13.0 + // IPC bound: 0.69 - // Wall time: 0.08s - // User time: 0.08s + // Wall time: 0.09s + // User time: 0.09s - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldr q30, [x1, #16] // e............................. - ldr q21, [x1], #32 // ..e........................... - mls v26.8H, v3.8H, v6.H[0] // ....*......................... - ldr q3, [x3], #16 // .....e........................ - uzp2 v24.8H, v21.8H, v30.8H // .......e...................... - ldr q21, [x2], #16 // ........e..................... - str q26, [x0], #16 // ..........*................... - sqrdmulh v3.8H, v24.8H, v3.8H // ...........e.................. - mul v26.8H, v24.8H, v21.8H // ............e................. + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr q29, [x1, #16] // e............................. + ldr q21, [x2], #16 // ..e........................... + mls v2.8H, v27.8H, v6.H[0] // ....*......................... + ldr q27, [x1], #32 // .....e........................ + ldr q7, [x3], #16 // .......e...................... + uzp2 v28.8H, v27.8H, v29.8H // .........e.................... + str q2, [x0], #16 // ..........*................... + mul v2.8H, v28.8H, v21.8H // ...........e.................. + sqrdmulh v27.8H, v28.8H, v7.8H // ............e................. // ------ cycle (expected) ------> // 0 25 // |------------------------|----- - // ldr q4, [x1], #32 // ..e..........'.~..........'.~.. - // ldr q5, [x1, #-16] // e............~............~.... - // ldr q2, [x2], #16 // ........e....'.......~....'.... - // ldr q3, [x3], #16 // .....e.......'....~.......'.... - // uzp2 v1.8h, v4.8h, v5.8h // .......e.....'......~.....'.... 
- // sqrdmulh v4.8h, v1.8h, v3.8h // ...........e.'..........~.'.... - // mul v7.8h, v1.8h, v2.8h // ............e'...........~'.... - // mls v7.8h, v4.8h, v6.h[0] // ....~........'...*........'.... - // str q7, [x0], #16 // ..........~..'.........*..'.... - - sub count, count, #1 - cbnz count, mulcache_compute_loop_start + // ldr q3, [x1], #32 // .....e.......'....~.......'.... + // ldr q4, [x1, #-16] // e............~............~.... + // ldr q1, [x2], #16 // ..e..........'.~..........'.~.. + // ldr q2, [x3], #16 // .......e.....'......~.....'.... + // uzp2 v0.8h, v3.8h, v4.8h // .........e...'........~...'.... + // sqrdmulh v3.8h, v0.8h, v2.8h // ............e'...........~'.... + // mul v5.8h, v0.8h, v1.8h // ...........e.'..........~.'.... + // mls v5.8h, v3.8h, v6.h[0] // ....~........'...*........'.... + // str q5, [x0], #16 // ..........~..'.........*..'.... + + sub count, count, 1 + cbnz count, 1b // Instructions: 2 // Expected cycles: 5 // Expected IPC: 0.40 @@ -387,14 +409,14 @@ mulcache_compute_loop_start: // ----- cycle (expected) ------> // 0 25 // |------------------------|---- - mls v26.8H, v3.8H, v6.H[0] // *............................. - str q26, [x0], #16 // ....*......................... + mls v2.8H, v27.8H, v6.H[0] // *............................. + str q2, [x0], #16 // ....*......................... // ------ cycle (expected) ------> // 0 25 // |------------------------|----- - // mls v26.8H, v3.8H, v6.H[0] // *.............................. - // str q26, [x0], #16 // ....*.......................... + // mls v2.8H, v27.8H, v6.H[0] // *.............................. + // str q2, [x0], #16 // ....*.......................... 
ret @@ -402,7 +424,7 @@ mulcache_compute_loop_start: .unreq cache_ptr .unreq data_ptr .unreq zeta_ptr - .unreq xtmp + .unreq zeta_twisted_ptr .unreq count .unreq data_odd @@ -411,34 +433,35 @@ mulcache_compute_loop_start: .unreq zeta_twisted .unreq q_zeta_twisted - .unreq q_tmp0 - .unreq q_tmp1 .unreq tmp0 + .unreq q_tmp0 .unreq tmp1 - .unreq consts + .unreq q_tmp1 .unreq dst + .unreq q_dst + + .unreq modulus + .unreq q_modulus + .unreq modulus_twisted + .unreq q_modulus_twisted /******************************************** * poly_tobytes() * ********************************************/ -.global MLKEM_NAMESPACE(poly_tobytes_asm_opt) -.global _MLKEM_NAMESPACE(poly_tobytes_asm_opt) +.global MLKEM_ASM_NAMESPACE(poly_tobytes_asm_opt) data0 .req v0 data1 .req v1 - out0 .req v2 out1 .req v3 out2 .req v4 - - tmp .req v5 + tmp .req v5 dst .req x0 src .req x1 count .req x2 -MLKEM_NAMESPACE(poly_tobytes_asm_opt): -_MLKEM_NAMESPACE(poly_tobytes_asm_opt): +MLKEM_ASM_NAMESPACE(poly_tobytes_asm_opt): mov count, #16 poly_tobytes_asm_opt_asm_loop_start: @@ -475,69 +498,65 @@ poly_tobytes_asm_opt_asm_loop_start: /********************************** * poly_tomont() * **********************************/ -.global MLKEM_NAMESPACE(poly_tomont_asm_opt) -.global _MLKEM_NAMESPACE(poly_tomont_asm_opt) - - src .req x0 - count .req x1 - xtmp .req x2 - - wtmp2 .req w3 +.global MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt) - data .req v0 - q_data .req q0 - res .req v1 - q_res .req q1 + src .req x0 + count .req x1 - consts .req v2 + data .req v0 + q_data .req q0 + res .req v1 + q_res .req q1 - factor .req v3 - factor_t .req v4 + factor .req v2 + q_factor .req q2 + factor_t .req v3 + q_factor_t .req q3 + modulus .req v4 + q_modulus .req q4 + modulus_twisted .req v5 + q_modulus_twisted .req q5 - tmp0 .req v5 + tmp0 .req v6 -MLKEM_NAMESPACE(poly_tomont_asm_opt): -_MLKEM_NAMESPACE(poly_tomont_asm_opt): +MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt): - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - - 
ldrsh wtmp2, [xtmp, #(2*2)] - dup factor.8h, wtmp2 - ldrh wtmp2, [xtmp, #(3*2)] - dup factor_t.8h, wtmp2 + ldr q_modulus, c_modulus + ldr q_modulus_twisted, c_modulus_twisted + ldr q_factor, c_mont_constant + ldr q_factor_t, c_barrett_twist mov count, #8 - // Instructions: 5 - // Expected cycles: 10 - // Expected IPC: 0.50 - // - // Cycle bound: 10.0 - // IPC bound: 0.50 - // - // Wall time: 0.01s - // User time: 0.01s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldr q6, [x0, #16] // *............................. - ldr q20, [x0, #32] // ..*........................... - mul v21.8H, v6.8H, v3.8H // ....*......................... - sqrdmulh v6.8H, v6.8H, v4.8H // .....*........................ - mls v21.8H, v6.8H, v2.H[0] // .........*.................... - - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldr q16, [x0, #16] // *.............................. - // mul v21.8H, v16.8H, v3.8H // ....*.......................... - // sqrdmulh v0.8H, v16.8H, v4.8H // .....*......................... - // mls v21.8H, v0.8H, v2.H[0] // .........*..................... - // ldr q20, [x0, #32] // ..*............................ + // Instructions: 5 + // Expected cycles: 7 + // Expected IPC: 0.71 + // + // Cycle bound: 7.0 + // IPC bound: 0.71 + // + // Wall time: 0.01s + // User time: 0.01s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr q26, [x0, #48] // *............................. + ldr q23, [x0, #16] // ..*........................... + mul v17.8H, v26.8H, v2.8H // ....*......................... + sqrdmulh v7.8H, v26.8H, v3.8H // .....*........................ + ldr q27, [x0, #32] // ......*....................... + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr q7, [x0, #48] // *.............................. + // ldr q23, [x0, #16] // ..*............................ 
+ // mul v17.8H, v7.8H, v2.8H // ....*.......................... + // sqrdmulh v7.8H, v7.8H, v3.8H // .....*......................... + // ldr q27, [x0, #32] // ......*........................ sub count, count, #1 -poly_tomont_asm_loop: +1: // Instructions: 20 // Expected cycles: 24 // Expected IPC: 0.83 @@ -545,108 +564,127 @@ poly_tomont_asm_loop: // Cycle bound: 24.0 // IPC bound: 0.83 // - // Wall time: 0.58s - // User time: 0.58s + // Wall time: 0.73s + // User time: 0.73s // // ----- cycle (expected) ------> // 0 25 // |------------------------|---- - ldr q18, [x0], #64 // *............................. - str q21, [x0, #-48] // ..*........................... - ldr q16, [x0, #16] // ...e.......................... - sqrdmulh v27.8H, v20.8H, v4.8H // .....*........................ - mul v1.8H, v20.8H, v3.8H // ......*....................... - mul v21.8H, v16.8H, v3.8H // .......e...................... - sqrdmulh v0.8H, v16.8H, v4.8H // ........e..................... - ldr q30, [x0, #-16] // .........*.................... - sqrdmulh v6.8H, v18.8H, v4.8H // ...........*.................. - mul v18.8H, v18.8H, v3.8H // ............*................. - mls v1.8H, v27.8H, v2.H[0] // .............*................ - mul v5.8H, v30.8H, v3.8H // ..............*............... - sqrdmulh v22.8H, v30.8H, v4.8H // ...............*.............. - mls v18.8H, v6.8H, v2.H[0] // ................*............. - str q1, [x0, #-32] // .................*............ - mls v21.8H, v0.8H, v2.H[0] // ..................e........... - mls v5.8H, v22.8H, v2.H[0] // ...................*.......... - str q18, [x0, #-64] // ....................*......... - ldr q20, [x0, #32] // .....................e........ - str q5, [x0, #-16] // .......................*...... - - // ------------- cycle (expected) -------------> + mls v17.8H, v7.8H, v4.H[0] // *............................. + sqrdmulh v5.8H, v23.8H, v3.8H // .*............................ 
+ ldr q7, [x0], #64 // ..*........................... + str q17, [x0, #-16] // ....*......................... + sqrdmulh v29.8H, v27.8H, v3.8H // .....*........................ + sqrdmulh v19.8H, v7.8H, v3.8H // ......*....................... + mul v25.8H, v23.8H, v2.8H // .......*...................... + mul v0.8H, v7.8H, v2.8H // ........*..................... + mul v26.8H, v27.8H, v2.8H // .........*.................... + ldr q7, [x0, #48] // ..........e................... + mls v25.8H, v5.8H, v4.H[0] // ............*................. + ldr q23, [x0, #16] // .............e................ + mls v26.8H, v29.8H, v4.H[0] // ...............*.............. + mls v0.8H, v19.8H, v4.H[0] // ................*............. + str q25, [x0, #-48] // .................*............ + mul v17.8H, v7.8H, v2.8H // ..................e........... + sqrdmulh v7.8H, v7.8H, v3.8H // ...................e.......... + str q0, [x0, #-64] // ....................*......... + ldr q27, [x0, #32] // .....................e........ + str q26, [x0, #-32] // .......................*...... + + // --------- cycle (expected) ----------> // 0 25 - // |------------------------|------------------- - // ldr q0, [x0], #64 // .....................*....................... - // sqrdmulh v5.8h, v0.8h, v4.8h // ........~............'..........*............ - // mul v1.8h, v0.8h, v3.8h // .........~...........'...........*........... - // mls v1.8h, v5.8h, v2.h[0] // .............~.......'...............*....... - // str q1, [x0, #-64] // .................~...'...................*... - // ldr q0, [x0, #-48] // e....................'..~.................... - // sqrdmulh v5.8h, v0.8h, v4.8h // .....e...............'.......~............... - // mul v1.8h, v0.8h, v3.8h // ....e................'......~................ - // mls v1.8h, v5.8h, v2.h[0] // ...............e.....'.................~..... - // str q1, [x0, #-48] // .....................'.*..................... 
- // ldr q0, [x0, #-32] // ..................e..'....................~.. - // sqrdmulh v5.8h, v0.8h, v4.8h // ..~..................'....*.................. - // mul v1.8h, v0.8h, v3.8h // ...~.................'.....*................. - // mls v1.8h, v5.8h, v2.h[0] // ..........~..........'............*.......... - // str q1, [x0, #-32] // ..............~......'................*...... - // ldr q0, [x0, #-16] // ......~..............'........*.............. - // sqrdmulh v5.8h, v0.8h, v4.8h // ............~........'..............*........ - // mul v1.8h, v0.8h, v3.8h // ...........~.........'.............*......... - // mls v1.8h, v5.8h, v2.h[0] // ................~....'..................*.... - // str q1, [x0, #-16] // ....................~'......................* - - sub count, count, #1 - cbnz count, poly_tomont_asm_loop + // |------------------------|------------ + // ldr q0, [x0], #64 // ..............'.*..................... + // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'.....*................. + // mul v1.8h, v0.8h, v2.8h // ..............'.......*............... + // mls v1.8h, v6.8h, v4.h[0] // ......~.......'...............*....... + // str q1, [x0, #-64] // ..........~...'...................*... + // ldr q0, [x0, #-48] // ...e..........'............~.......... + // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'*...................... + // mul v1.8h, v0.8h, v2.8h // ..............'......*................ + // mls v1.8h, v6.8h, v4.h[0] // ..~...........'...........*........... + // str q1, [x0, #-48] // .......~......'................*...... + // ldr q0, [x0, #-32] // ...........e..'....................~.. + // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'....*.................. + // mul v1.8h, v0.8h, v2.8h // ..............'........*.............. + // mls v1.8h, v6.8h, v4.h[0] // .....~........'..............*........ + // str q1, [x0, #-32] // .............~'......................* + // ldr q0, [x0, #-16] // e.............'.........~............. 
+ // sqrdmulh v6.8h, v0.8h, v3.8h // .........e....'..................~.... + // mul v1.8h, v0.8h, v2.8h // ........e.....'.................~..... + // mls v1.8h, v6.8h, v4.h[0] // ..............*....................... + // str q1, [x0, #-16] // ..............'...*................... + + sub count, count, 1 + cbnz count, 1b // Instructions: 15 - // Expected cycles: 19 - // Expected IPC: 0.79 + // Expected cycles: 18 + // Expected IPC: 0.83 // - // Cycle bound: 19.0 - // IPC bound: 0.79 + // Cycle bound: 18.0 + // IPC bound: 0.83 // - // Wall time: 0.04s - // User time: 0.04s + // Wall time: 0.07s + // User time: 0.07s // // ----- cycle (expected) ------> // 0 25 // |------------------------|---- - mul v31.8H, v20.8H, v3.8H // *............................. - sqrdmulh v25.8H, v20.8H, v4.8H // .*............................ - ldr q27, [x0], #64 // ..*........................... - ldr q23, [x0, #-16] // ....*......................... - mls v31.8H, v25.8H, v2.H[0] // ......*....................... - sqrdmulh v30.8H, v27.8H, v4.8H // .......*...................... - mul v6.8H, v27.8H, v3.8H // ........*..................... - mul v20.8H, v23.8H, v3.8H // .........*.................... - sqrdmulh v26.8H, v23.8H, v4.8H // ..........*................... - str q31, [x0, #-32] // ...........*.................. - mls v6.8H, v30.8H, v2.H[0] // ............*................. - str q21, [x0, #-48] // .............*................ - mls v20.8H, v26.8H, v2.H[0] // ..............*............... - str q6, [x0, #-64] // ................*............. - str q20, [x0, #-16] // ..................*........... + mls v17.8H, v7.8H, v4.H[0] // *............................. + sqrdmulh v7.8H, v23.8H, v3.8H // .*............................ + mul v26.8H, v23.8H, v2.8H // ..*........................... + sqrdmulh v25.8H, v27.8H, v3.8H // ...*.......................... + ldr q23, [x0], #64 // ....*......................... + mul v27.8H, v27.8H, v2.8H // ......*....................... 
+ mls v26.8H, v7.8H, v4.H[0] // .......*...................... + sqrdmulh v7.8H, v23.8H, v3.8H // ........*..................... + mul v23.8H, v23.8H, v2.8H // .........*.................... + str q17, [x0, #-16] // ..........*................... + mls v27.8H, v25.8H, v4.H[0] // ...........*.................. + str q26, [x0, #-48] // ............*................. + mls v23.8H, v7.8H, v4.H[0] // .............*................ + str q27, [x0, #-32] // ...............*.............. + str q23, [x0, #-64] // .................*............ // ------ cycle (expected) ------> // 0 25 // |------------------------|----- - // ldr q18, [x0], #64 // ..*............................ - // str q21, [x0, #-48] // .............*................. - // sqrdmulh v27.8H, v20.8H, v4.8H // .*............................. - // mul v1.8H, v20.8H, v3.8H // *.............................. - // ldr q30, [x0, #-16] // ....*.......................... - // sqrdmulh v6.8H, v18.8H, v4.8H // .......*....................... - // mul v18.8H, v18.8H, v3.8H // ........*...................... - // mls v1.8H, v27.8H, v2.H[0] // ......*........................ - // mul v5.8H, v30.8H, v3.8H // .........*..................... - // sqrdmulh v22.8H, v30.8H, v4.8H // ..........*.................... - // mls v18.8H, v6.8H, v2.H[0] // ............*.................. - // str q1, [x0, #-32] // ...........*................... - // mls v5.8H, v22.8H, v2.H[0] // ..............*................ - // str q18, [x0, #-64] // ................*.............. - // str q5, [x0, #-16] // ..................*............ + // mls v17.8H, v7.8H, v4.H[0] // *.............................. + // sqrdmulh v5.8H, v23.8H, v3.8H // .*............................. + // ldr q7, [x0], #64 // ....*.......................... + // str q17, [x0, #-16] // ..........*.................... + // sqrdmulh v29.8H, v27.8H, v3.8H // ...*........................... + // sqrdmulh v19.8H, v7.8H, v3.8H // ........*...................... 
+ // mul v25.8H, v23.8H, v2.8H // ..*............................ + // mul v0.8H, v7.8H, v2.8H // .........*..................... + // mul v26.8H, v27.8H, v2.8H // ......*........................ + // mls v25.8H, v5.8H, v4.H[0] // .......*....................... + // mls v26.8H, v29.8H, v4.H[0] // ...........*................... + // mls v0.8H, v19.8H, v4.H[0] // .............*................. + // str q25, [x0, #-48] // ............*.................. + // str q0, [x0, #-64] // .................*............. + // str q26, [x0, #-32] // ...............*............... ret + .unreq src + .unreq count + + .unreq data + .unreq q_data + .unreq res + .unreq q_res + + .unreq factor + .unreq q_factor + .unreq factor_t + .unreq q_factor_t + .unreq modulus + .unreq q_modulus + .unreq modulus_twisted + .unreq q_modulus_twisted + + .unreq tmp0 + #endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/polyvec_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/polyvec_clean.S index bfd1d2b8a..99fb05de5 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/polyvec_clean.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/polyvec_clean.S @@ -12,10 +12,30 @@ #include "common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) -#include "params.h" +/* We use a single literal pool for all functions in this file. + * This is OK even when the file gets expanded through SLOTHY, + * since PC-relative offsets are up to 1MB in AArch64. + * + * The use of dup8h to build constant vectors in memory + * is slightly wasteful and could be avoided with a GPR-load + * followed by Neon `dup`, but we're ultimately only talking + * about 64 bytes, so it seems OK. 
+ */ + +.macro dup8h c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c +.endm -// Needed to provide ASM_LOAD directive -#include "common.i" +.p2align 4 +c_modulus: dup8h 3329 // ML-KEM modulus +c_modulus_twisted: dup8h 3327 // Input: // - Vectors al, ah of 32-bit entries @@ -23,9 +43,9 @@ // - Montgomery reductions of al || ah, stored in al .macro montgomery_reduce_long x, a uzp1 t0.8h, \a\()l.8h, \a\()h.8h - mul t0.8h, t0.8h, consts.h[2] - smlal \a\()l.4s, t0.4h, constN.4h - smlal2 \a\()h.4s, t0.8h, constN.8h + mul t0.8h, t0.8h, modulus_twisted.8h + smlal \a\()l.4s, t0.4h, modulus.4h + smlal2 \a\()h.4s, t0.8h, modulus.8h uzp2 \x\().8h, \a\()l.8h, \a\()h.8h .endm @@ -58,10 +78,24 @@ smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h .endm +.macro ld2_wrap a, ptr + ldr q_tmp0, [\ptr\()], #32 + ldr q_tmp1, [\ptr\(), #-16] + uzp1 \a\()0.8h, tmp0.8h, tmp1.8h + uzp2 \a\()1.8h, tmp0.8h, tmp1.8h +.endm + +.macro st2_wrap a, ptr + zip1 tmp0.8h, \a\()0.8h, \a\()1.8h + zip2 tmp1.8h, \a\()0.8h, \a\()1.8h + str q_tmp0, [\ptr\()], #32 + str q_tmp1, [\ptr\(), #-16] +.endm + .macro load_polys a, b, a_ptr, b_ptr, b_cache_ptr - ld2 {\a\()0.8h, \a\()1.8h}, [\a_ptr], #32 - ld2 {\b\()0.8h, \b\()1.8h}, [\b_ptr], #32 - ld1 {\b\()1t.8h}, [\b_cache_ptr], #16 + ld2_wrap \a\(), \a_ptr + ld2_wrap \b\(), \b_ptr + ld1 {\b\()1t.8h}, [\b_cache_ptr], #16 .endm .macro save_vregs @@ -101,14 +135,12 @@ a3_ptr .req x10 b3_ptr .req x11 b3_cache_ptr .req x12 - count .req x13 - xtmp .req x14 - modulus .req w15 - constN .req v0 - constX .req v1 - consts .req v2 + modulus .req v0 + q_modulus .req q0 + modulus_twisted .req v2 + q_modulus_twisted .req q2 aa0 .req v3 aa1 .req v4 @@ -131,29 +163,13 @@ t0 .req v28 -.p2align 4 -const_addr: - .short 3329 - .short 20159 - .short 3327 - .short 0 - .short 0 - .short 0 - .short 0 - .short 0 - #if MLKEM_K == 2 -.global MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean) -.global 
_MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean) +.global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean) -MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean): -_MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean): +MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean): push_stack - - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - ldrsh modulus, [xtmp] - dup constN.8h, modulus + ldr q_modulus, c_modulus + ldr q_modulus_twisted, c_modulus_twisted // Computed bases of vector entries @@ -172,7 +188,7 @@ k2_loop_start: montgomery_reduce_long out0, res0 montgomery_reduce_long out1, res1 - st2 {out0.8h, out1.8h}, [out], #32 + st2_wrap out, out subs count, count, #1 cbnz count, k2_loop_start @@ -182,17 +198,12 @@ k2_loop_start: #endif /* MLKEM_K == 2 */ #if MLKEM_K == 3 -.global MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean) -.global _MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean) +.global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean) -MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean): -_MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean): +MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean): push_stack - - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - ldrsh modulus, [xtmp] - dup constN.8h, modulus + ldr q_modulus, c_modulus + ldr q_modulus_twisted, c_modulus_twisted // Computed bases of vector entries @@ -216,7 +227,7 @@ k3_loop_start: montgomery_reduce_long out0, res0 montgomery_reduce_long out1, res1 - st2 {out0.8h, out1.8h}, [out], #32 + st2_wrap out, out subs count, count, #1 cbnz count, k3_loop_start @@ -226,17 +237,12 @@ k3_loop_start: #endif /* MLKEM_K == 3 */ #if MLKEM_K == 4 -.global MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean) -.global _MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean) +.global 
MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean) -MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean): -_MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean): +MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean): push_stack - - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - ldrsh modulus, [xtmp] - dup constN.8h, modulus + ldr q_modulus, c_modulus + ldr q_modulus_twisted, c_modulus_twisted // Computed bases of vector entries @@ -270,7 +276,7 @@ k4_loop_start: montgomery_reduce_long out0, res0 montgomery_reduce_long out1, res1 - st2 {out0.8h, out1.8h}, [out], #32 + st2_wrap out, out subs count, count, #1 cbnz count, k4_loop_start diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/polyvec_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/polyvec_opt.S index 07dc98efd..16ed77c3f 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/polyvec_opt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/polyvec_opt.S @@ -11,10 +11,31 @@ #include "common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) -#include "params.h" -// Needed to provide ASM_LOAD directive -#include "common.i" +/* We use a single literal pool for all functions in this file. + * This is OK even when the file gets expanded through SLOTHY, + * since PC-relative offsets are up to 1MB in AArch64. + * + * The use of dup8h to build constant vectors in memory + * is slightly wasteful and could be avoided with a GPR-load + * followed by Neon `dup`, but we're ultimately only talking + * about 64 bytes, so it seems OK. 
+ */ + +.macro dup8h c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c +.endm + +.p2align 4 +c_modulus: dup8h 3329 // ML-KEM modulus +c_modulus_twisted: dup8h 3327 // Input: // - Vectors al, ah of 32-bit entries @@ -22,13 +43,17 @@ // - Montgomery reductions of al || ah, stored in al .macro montgomery_reduce_long x, a uzp1 t0.8h, \a\()l.8h, \a\()h.8h - mul t0.8h, t0.8h, consts.h[2] - smlal \a\()l.4s, t0.4h, constN.4h - smlal2 \a\()h.4s, t0.8h, constN.8h + mul t0.8h, t0.8h, modulus_twisted.8h + smlal \a\()l.4s, t0.4h, modulus.4h + smlal2 \a\()h.4s, t0.8h, modulus.8h uzp2 \x\().8h, \a\()l.8h, \a\()h.8h .endm // Computes products (a0*b0 + a0*b0t, a0*b1 + a1*b0) in 32-bit. + +// Bounds: +// - Assume |a| < 4096, +// - Result: < 2*4096*2^15 = 2^28 .macro pmull d, a, b smull \d\()0l.4s, \a\()0.4h, \b\()0.4h smull2 \d\()0h.4s, \a\()0.8h, \b\()0.8h @@ -53,10 +78,24 @@ smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h .endm +.macro ld2_wrap a, ptr + ldr q_tmp0, [\ptr\()], #32 + ldr q_tmp1, [\ptr\(), #-16] + uzp1 \a\()0.8h, tmp0.8h, tmp1.8h + uzp2 \a\()1.8h, tmp0.8h, tmp1.8h +.endm + +.macro st2_wrap a, ptr + zip1 tmp0.8h, \a\()0.8h, \a\()1.8h + zip2 tmp1.8h, \a\()0.8h, \a\()1.8h + str q_tmp0, [\ptr\()], #32 + str q_tmp1, [\ptr\(), #-16] +.endm + .macro load_polys a, b, a_ptr, b_ptr, b_cache_ptr - ld2 {\a\()0.8h, \a\()1.8h}, [\a_ptr], #32 - ld2 {\b\()0.8h, \b\()1.8h}, [\b_ptr], #32 - ld1 {\b\()1t.8h}, [\b_cache_ptr], #16 + ld2_wrap \a\(), \a_ptr + ld2_wrap \b\(), \b_ptr + ld1 {\b\()1t.8h}, [\b_cache_ptr], #16 .endm .macro save_vregs @@ -96,14 +135,12 @@ a3_ptr .req x10 b3_ptr .req x11 b3_cache_ptr .req x12 - count .req x13 - xtmp .req x14 - modulus .req w15 - constN .req v0 - constX .req v1 - consts .req v2 + modulus .req v0 + q_modulus .req q0 + modulus_twisted .req v2 + q_modulus_twisted .req q2 aa0 .req v3 aa1 .req v4 @@ -126,29 +163,13 @@ t0 .req v28 -.p2align 4 -const_addr: - .short 3329 - .short 20159 - .short 3327 - .short 0 - 
.short 0 - .short 0 - .short 0 - .short 0 - #if MLKEM_K == 2 -.global MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt) -.global _MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt) +.global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt) -MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): -_MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): +MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): push_stack - - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - ldrsh modulus, [xtmp] - dup constN.8h, modulus + ldr q_modulus, c_modulus + ldr q_modulus_twisted, c_modulus_twisted // Computed bases of vector entries @@ -157,195 +178,351 @@ _MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2) mov count, #(MLKEM_N / 16) - // Instructions: 6 - // Expected cycles: 11 - // Expected IPC: 0.55 - - // Cycle bound: 11.0 - // IPC bound: 0.55 - - // Wall time: 0.01s - // User time: 0.01s - - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ld2 {v6.8H, v7.8H}, [x1], #32 // *............................. - ld2 {v4.8H, v5.8H}, [x2], #32 // ..*........................... - ld1 {v12.8H}, [x3], #16 // ....*......................... - ld2 {v15.8H, v16.8H}, [x4], #32 // ......*....................... - ld2 {v29.8H, v30.8H}, [x5], #32 // ........*..................... - ld1 {v31.8H}, [x6], #16 // ..........*................... - - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ld2 {v6.8H, v7.8H}, [x1], #32 // *.............................. - // ld2 {v4.8H, v5.8H}, [x2], #32 // ..*............................ - // ld1 {v12.8H}, [x3], #16 // ....*.......................... - // ld2 {v15.8H, v16.8H}, [x4], #32 // ......*........................ - // ld2 {v29.8H, v30.8H}, [x5], #32 // ........*...................... - // ld1 {v31.8H}, [x6], #16 // ..........*.................... 
+ // Instructions: 75 + // Expected cycles: 94 + // Expected IPC: 0.80 + + // Cycle bound: 94.0 + // IPC bound: 0.80 + + // Wall time: 1.49s + // User time: 1.49s + + // --------------------------- original position ----------------------------> + // 0 25 50 + // |------------------------|------------------------| + ldr q9, [x4], #32 // *.......................................................................... + ldr q5, [x4, #-16] // ......*.................................................................... + ldr q11, [x5], #32 // .*......................................................................... + uzp1 v23.8H, v9.8H, v5.8H // .........*................................................................. + uzp2 v9.8H, v9.8H, v5.8H // .....................*..................................................... + ldr q5, [x2], #32 // ..*........................................................................ + ldr q7, [x5, #-16] // ..............*............................................................ + ldr q21, [x2, #-16] // ...*....................................................................... + uzp2 v10.8H, v11.8H, v7.8H // .................*......................................................... + uzp1 v11.8H, v11.8H, v7.8H // ..................*........................................................ + uzp1 v7.8H, v5.8H, v21.8H // ....*...................................................................... + uzp2 v5.8H, v5.8H, v21.8H // .....*..................................................................... + ldr q21, [x1], #32 // .......*................................................................... + ldr q25, [x1, #-16] // ........*.................................................................. + ld1 {v6.8H}, [x3], #16 // ............................*.............................................. + uzp1 v26.8H, v21.8H, v25.8H // ..........*................................................................ 
+ uzp2 v21.8H, v21.8H, v25.8H // ...........*............................................................... + smull v25.4S, v26.4H, v5.4H // ............*.............................................................. + smull2 v5.4S, v26.8H, v5.8H // .............*............................................................. + smull v19.4S, v26.4H, v7.4H // ..........................*................................................ + smull2 v26.4S, v26.8H, v7.8H // ..............................*............................................ + smlal v25.4S, v21.4H, v7.4H // ...............*........................................................... + smlal2 v5.4S, v21.8H, v7.8H // ................*.......................................................... + smlal v19.4S, v21.4H, v6.4H // ...................................*....................................... + smlal2 v26.4S, v21.8H, v6.8H // .................................*......................................... + smlal v25.4S, v23.4H, v10.4H // ...................*....................................................... + smlal2 v5.4S, v23.8H, v10.8H // ....................*...................................................... + smlal v19.4S, v23.4H, v11.4H // ......................................*.................................... + smlal2 v26.4S, v23.8H, v11.8H // ....................................*...................................... + ld1 {v23.8H}, [x6], #16 // ........................*.................................................. + smlal v25.4S, v9.4H, v11.4H // ......................*.................................................... + smlal2 v5.4S, v9.8H, v11.8H // .......................*................................................... + smlal2 v26.4S, v9.8H, v23.8H // .......................................*................................... + smlal v19.4S, v9.4H, v23.4H // .........................................*................................. 
+ ldr q9, [x4], #32 // ...............................*........................................... + uzp1 v11.8H, v25.8H, v5.8H // .........................*................................................. + uzp1 v23.8H, v19.8H, v26.8H // .............................................*............................. + mul v11.8H, v11.8H, v2.8H // ...........................*............................................... + mul v23.8H, v23.8H, v2.8H // ..............................................*............................ + ldr q7, [x5], #32 // ................................*.......................................... + smlal2 v5.4S, v11.8H, v0.8H // .............................*............................................. + smlal v25.4S, v11.4H, v0.4H // ..................................*........................................ + ldr q11, [x2], #32 // .....................................*..................................... + ldr q21, [x2, #-16] // ........................................*.................................. + ldr q6, [x4, #-16] // ...............................................*........................... + uzp1 v17.8H, v11.8H, v21.8H // ...........................................*............................... + ldr q10, [x1], #32 // ................................................*.......................... + ldr q29, [x1, #-16] // .................................................*......................... + uzp2 v11.8H, v11.8H, v21.8H // ............................................*.............................. + uzp1 v13.8H, v9.8H, v6.8H // ...................................................*....................... + uzp1 v3.8H, v10.8H, v29.8H // ....................................................*...................... + uzp2 v10.8H, v10.8H, v29.8H // .....................................................*..................... + smull v12.4S, v3.4H, v11.4H // ......................................................*.................... 
+ smull2 v11.4S, v3.8H, v11.8H // .......................................................*................... + ldr q21, [x5, #-16] // ........................................................*.................. + smlal v12.4S, v10.4H, v17.4H // .........................................................*................. + smlal2 v11.4S, v10.8H, v17.8H // ..........................................................*................ + uzp2 v29.8H, v7.8H, v21.8H // ...........................................................*............... + uzp1 v15.8H, v7.8H, v21.8H // ............................................................*.............. + smlal v12.4S, v13.4H, v29.4H // .............................................................*............. + smlal2 v11.4S, v13.8H, v29.8H // ..............................................................*............ + uzp2 v28.8H, v9.8H, v6.8H // ...............................................................*........... + smlal2 v26.4S, v23.8H, v0.8H // ..................................................*........................ + smlal v12.4S, v28.4H, v15.4H // .................................................................*......... + smlal2 v11.4S, v28.8H, v15.8H // ..................................................................*........ + smlal v19.4S, v23.4H, v0.4H // ................................................................*.......... + uzp2 v27.8H, v25.8H, v5.8H // ..........................................*................................ + smull v23.4S, v3.4H, v17.4H // ......................................................................*.... + uzp1 v9.8H, v12.8H, v11.8H // .....................................................................*..... + uzp2 v19.8H, v19.8H, v26.8H // ....................................................................*...... + mul v14.8H, v9.8H, v2.8H // .......................................................................*... 
+ ld1 {v22.8H}, [x6], #16 // ...................................................................*....... + zip2 v9.8H, v19.8H, v27.8H // ........................................................................*.. + smlal2 v11.4S, v14.8H, v0.8H // ..........................................................................* + ld1 {v4.8H}, [x3], #16 // .........................................................................*. + + // ------------------------------ new position ------------------------------> + // 0 25 50 + // |------------------------|------------------------|------------------------ + // ldr q18, [x4], #32 // *.......................................................................... + // ldr q30, [x5], #32 // ..*........................................................................ + // ldr q8, [x2], #32 // .....*..................................................................... + // ldr q9, [x2, #-16] // .......*................................................................... + // uzp1 v17.8H, v8.8H, v9.8H // ..........*................................................................ + // uzp2 v4.8H, v8.8H, v9.8H // ...........*............................................................... + // ldr q19, [x4, #-16] // .*......................................................................... + // ldr q29, [x1], #32 // ............*.............................................................. + // ldr q12, [x1, #-16] // .............*............................................................. + // uzp1 v13.8H, v18.8H, v19.8H // ...*....................................................................... + // uzp1 v3.8H, v29.8H, v12.8H // ...............*........................................................... + // uzp2 v10.8H, v29.8H, v12.8H // ................*.......................................................... + // smull v12.4S, v3.4H, v4.4H // .................*......................................................... 
+ // smull2 v11.4S, v3.8H, v4.8H // ..................*........................................................ + // ldr q5, [x5, #-16] // ......*.................................................................... + // smlal v12.4S, v10.4H, v17.4H // .....................*..................................................... + // smlal2 v11.4S, v10.8H, v17.8H // ......................*.................................................... + // uzp2 v14.8H, v30.8H, v5.8H // ........*.................................................................. + // uzp1 v15.8H, v30.8H, v5.8H // .........*................................................................. + // smlal v12.4S, v13.4H, v14.4H // .........................*................................................. + // smlal2 v11.4S, v13.8H, v14.8H // ..........................*................................................ + // uzp2 v28.8H, v18.8H, v19.8H // ....*...................................................................... + // smlal v12.4S, v28.4H, v15.4H // ..............................*............................................ + // smlal2 v11.4S, v28.8H, v15.8H // ...............................*........................................... + // ld1 {v22.8H}, [x6], #16 // .............................*............................................. + // uzp1 v1.8H, v12.8H, v11.8H // ...................................*....................................... + // smull v23.4S, v3.4H, v17.4H // ...................*....................................................... + // mul v14.8H, v1.8H, v2.8H // .....................................*..................................... + // ld1 {v4.8H}, [x3], #16 // ..............*............................................................ + // smlal2 v11.4S, v14.8H, v0.8H // ........................................*.................................. 
+ // smull2 v20.4S, v3.8H, v17.8H // ....................*...................................................... + // ldr q18, [x4], #32 // ..................................*........................................ + // ldr q30, [x5], #32 // .......................................*................................... + // smlal2 v20.4S, v10.8H, v4.8H // ........................*.................................................. + // smlal v12.4S, v14.4H, v0.4H // .........................................*................................. + // smlal v23.4S, v10.4H, v4.4H // .......................*................................................... + // smlal2 v20.4S, v13.8H, v15.8H // ............................*.............................................. + // ldr q8, [x2], #32 // ..........................................*................................ + // smlal v23.4S, v13.4H, v15.4H // ...........................*............................................... + // smlal2 v20.4S, v28.8H, v22.8H // ................................*.......................................... + // ldr q9, [x2, #-16] // ...........................................*............................... + // smlal v23.4S, v28.4H, v22.4H // .................................*......................................... + // uzp2 v27.8H, v12.8H, v11.8H // ..................................................................*........ + // uzp1 v17.8H, v8.8H, v9.8H // .............................................*............................. + // uzp2 v4.8H, v8.8H, v9.8H // ................................................*.......................... + // uzp1 v5.8H, v23.8H, v20.8H // ....................................*...................................... + // mul v31.8H, v5.8H, v2.8H // ......................................*.................................... + // ldr q19, [x4, #-16] // ............................................*.............................. 
+ // ldr q29, [x1], #32 // ..............................................*............................ + // ldr q12, [x1, #-16] // ...............................................*........................... + // smlal2 v20.4S, v31.8H, v0.8H // ..............................................................*............ + // uzp1 v13.8H, v18.8H, v19.8H // .................................................*......................... + // uzp1 v3.8H, v29.8H, v12.8H // ..................................................*........................ + // uzp2 v10.8H, v29.8H, v12.8H // ...................................................*....................... + // smull v12.4S, v3.4H, v4.4H // ....................................................*...................... + // smull2 v11.4S, v3.8H, v4.8H // .....................................................*..................... + // ldr q5, [x5, #-16] // ......................................................*.................... + // smlal v12.4S, v10.4H, v17.4H // .......................................................*................... + // smlal2 v11.4S, v10.8H, v17.8H // ........................................................*.................. + // uzp2 v14.8H, v30.8H, v5.8H // .........................................................*................. + // uzp1 v15.8H, v30.8H, v5.8H // ..........................................................*................ + // smlal v12.4S, v13.4H, v14.4H // ...........................................................*............... + // smlal2 v11.4S, v13.8H, v14.8H // ............................................................*.............. + // uzp2 v28.8H, v18.8H, v19.8H // .............................................................*............. + // smlal v23.4S, v31.4H, v0.4H // .................................................................*......... + // smlal v12.4S, v28.4H, v15.4H // ...............................................................*........... 
+ // smlal2 v11.4S, v28.8H, v15.8H // ................................................................*.......... + // ld1 {v22.8H}, [x6], #16 // .......................................................................*... + // uzp2 v19.8H, v23.8H, v20.8H // .....................................................................*..... + // uzp1 v1.8H, v12.8H, v11.8H // ....................................................................*...... + // smull v23.4S, v3.4H, v17.4H // ...................................................................*....... + // mul v14.8H, v1.8H, v2.8H // ......................................................................*.... + // zip2 v9.8H, v19.8H, v27.8H // ........................................................................*.. + // ld1 {v4.8H}, [x3], #16 // ..........................................................................* + // smlal2 v11.4S, v14.8H, v0.8H // .........................................................................*. - sub count, count, #1 -k2_loop_start: - // Instructions: 33 - // Expected cycles: 40 - // Expected IPC: 0.82 - - // Cycle bound: 40.0 - // IPC bound: 0.82 - - // Wall time: 1.67s - // User time: 1.67s - - // ---------- cycle (expected) -----------> - // 0 25 - // |------------------------|-------------- - smull v13.4S, v6.4H, v4.4H // *....................................... - smull2 v10.4S, v6.8H, v4.8H // .*...................................... - smull v21.4S, v6.4H, v5.4H // ..*..................................... - smull2 v3.4S, v6.8H, v5.8H // ...*.................................... - smlal v13.4S, v7.4H, v12.4H // ....*................................... - smlal2 v10.4S, v7.8H, v12.8H // .....*.................................. - smlal v21.4S, v7.4H, v4.4H // ......*................................. - smlal2 v3.4S, v7.8H, v4.8H // .......*................................ - smlal v13.4S, v15.4H, v29.4H // ........*............................... 
- smlal2 v10.4S, v15.8H, v29.8H // .........*.............................. - smlal v21.4S, v15.4H, v30.4H // ..........*............................. - smlal2 v3.4S, v15.8H, v30.8H // ...........*............................ - smlal v13.4S, v16.4H, v31.4H // ............*........................... - smlal2 v10.4S, v16.8H, v31.8H // .............*.......................... - smlal v21.4S, v16.4H, v29.4H // ..............*......................... - smlal2 v3.4S, v16.8H, v29.8H // ...............*........................ - ld2 {v6.8H, v7.8H}, [x1], #32 // ................e....................... - uzp1 v12.8H, v13.8H, v10.8H // ..................*..................... - uzp1 v30.8H, v21.8H, v3.8H // ...................*.................... - mul v12.8H, v12.8H, v2.H[2] // ....................*................... - mul v30.8H, v30.8H, v2.H[2] // .....................*.................. - ld2 {v4.8H, v5.8H}, [x2], #32 // ......................e................. - smlal v13.4S, v12.4H, v0.4H // ........................*............... - smlal2 v10.4S, v12.8H, v0.8H // .........................*.............. - smlal v21.4S, v30.4H, v0.4H // ..........................*............. - smlal2 v3.4S, v30.8H, v0.8H // ...........................*............ - ld1 {v12.8H}, [x3], #16 // ............................e........... - uzp2 v13.8H, v13.8H, v10.8H // ..............................*......... - uzp2 v14.8H, v21.8H, v3.8H // ...............................*........ - ld2 {v15.8H, v16.8H}, [x4], #32 // ................................e....... - st2 {v13.8H, v14.8H}, [x0], #32 // ..................................*..... - ld2 {v29.8H, v30.8H}, [x5], #32 // ....................................e... - ld1 {v31.8H}, [x6], #16 // ......................................e. 
- - // -------------------- cycle (expected) --------------------> - // 0 25 50 - // |------------------------|------------------------|-------- - // ld2 {v3.8h, v4.8h}, [x1], #32 // e.......................'...............~.................. - // ld2 {v5.8h, v6.8h}, [x2], #32 // ......e.................'.....................~............ - // ld1 {v7.8h}, [x3], #16 // ............e...........'...........................~...... - // smull v8.4s, v3.4h, v5.4h // ........................*.................................. - // smull2 v10.4s, v3.8h, v5.8h // ........................'*................................. - // smlal v8.4s, v4.4h, v7.4h // ........................'...*.............................. - // smlal2 v10.4s, v4.8h, v7.8h // ........................'....*............................. - // smull v9.4s, v3.4h, v6.4h // ........................'.*................................ - // smull2 v11.4s, v3.8h, v6.8h // ........................'..*............................... - // smlal v9.4s, v4.4h, v5.4h // ........................'.....*............................ - // smlal2 v11.4s, v4.8h, v5.8h // ........................'......*........................... - // ld2 {v3.8h, v4.8h}, [x4], #32 // ................e.......'...............................~.. - // ld2 {v5.8h, v6.8h}, [x5], #32 // ....................e...'.................................. - // ld1 {v7.8h}, [x6], #16 // ......................e.'.................................. - // smlal v8.4s, v3.4h, v5.4h // ........................'.......*.......................... - // smlal2 v10.4s, v3.8h, v5.8h // ........................'........*......................... - // smlal v8.4s, v4.4h, v7.4h // ........................'...........*...................... - // smlal2 v10.4s, v4.8h, v7.8h // ........................'............*..................... - // smlal v9.4s, v3.4h, v6.4h // ........................'.........*........................ 
- // smlal2 v11.4s, v3.8h, v6.8h // ........................'..........*....................... - // smlal v9.4s, v4.4h, v5.4h // ........................'.............*.................... - // smlal2 v11.4s, v4.8h, v5.8h // ........................'..............*................... - // uzp1 v28.8h, v8.8h, v10.8h // ..~.....................'.................*................ - // mul v28.8h, v28.8h, v2.h[2] // ....~...................'...................*.............. - // smlal v8.4s, v28.4h, v0.4h // ........~...............'.......................*.......... - // smlal2 v10.4s, v28.8h, v0.8h // .........~..............'........................*......... - // uzp2 v26.8h, v8.8h, v10.8h // ..............~.........'.............................*.... - // uzp1 v28.8h, v9.8h, v11.8h // ...~....................'..................*............... - // mul v28.8h, v28.8h, v2.h[2] // .....~..................'....................*............. - // smlal v9.4s, v28.4h, v0.4h // ..........~.............'.........................*........ - // smlal2 v11.4s, v28.8h, v0.8h // ...........~............'..........................*....... - // uzp2 v27.8h, v9.8h, v11.8h // ...............~........'..............................*... - // st2 {v26.8h, v27.8h}, [x0], #32 // ..................~.....'.................................* + sub count, count, #2 +1: + // Instructions: 48 + // Expected cycles: 58 + // Expected IPC: 0.83 + + // Cycle bound: 58.0 + // IPC bound: 0.83 + + // Wall time: 6.39s + // User time: 6.39s + + // -------------- original position --------------> + // 0 25 + // |------------------------|---------------------- + smull2 v20.4S, v3.8H, v17.8H // ..........*..................................... + ldr q18, [x4], #32 // .................e.............................. + ldr q30, [x5], #32 // .....................e.......................... + smlal2 v20.4S, v10.8H, v4.8H // ............*................................... 
+ smlal v12.4S, v14.4H, v0.4H // .........................................*...... + smlal v23.4S, v10.4H, v4.4H // ...........*.................................... + str q9, [x0, #16] // ...............................................l + smlal2 v20.4S, v13.8H, v15.8H // ...........................*.................... + ldr q8, [x2], #32 // ....e........................................... + smlal v23.4S, v13.4H, v15.4H // ..........................*..................... + smlal2 v20.4S, v28.8H, v22.8H // .............................*.................. + zip1 v26.8H, v19.8H, v27.8H // ............................................l... + ldr q9, [x2, #-16] // .....e.......................................... + smlal v23.4S, v28.4H, v22.4H // ............................*................... + uzp2 v27.8H, v12.8H, v11.8H // ...........................................*.... + uzp1 v17.8H, v8.8H, v9.8H // ......e......................................... + uzp2 v4.8H, v8.8H, v9.8H // .......e........................................ + uzp1 v5.8H, v23.8H, v20.8H // ..................................*............. + str q26, [x0], #32 // ..............................................l. + mul v31.8H, v5.8H, v2.8H // ...................................*............ + ldr q19, [x4, #-16] // ..................e............................. + ldr q29, [x1], #32 // e............................................... + ldr q12, [x1, #-16] // .e.............................................. + smlal2 v20.4S, v31.8H, v0.8H // .....................................*.......... + uzp1 v13.8H, v18.8H, v19.8H // ...................e............................ + uzp1 v3.8H, v29.8H, v12.8H // ..e............................................. + uzp2 v10.8H, v29.8H, v12.8H // ...e............................................ + smull v12.4S, v3.4H, v4.4H // .............e.................................. + smull2 v11.4S, v3.8H, v4.8H // ..............e................................. 
+ ldr q5, [x5, #-16] // ......................e......................... + smlal v12.4S, v10.4H, v17.4H // ...............e................................ + smlal2 v11.4S, v10.8H, v17.8H // ................e............................... + uzp2 v14.8H, v30.8H, v5.8H // ........................e....................... + uzp1 v15.8H, v30.8H, v5.8H // .......................e........................ + smlal v12.4S, v13.4H, v14.4H // ..............................e................. + smlal2 v11.4S, v13.8H, v14.8H // ...............................e................ + uzp2 v28.8H, v18.8H, v19.8H // ....................e........................... + smlal v23.4S, v31.4H, v0.4H // ....................................*........... + smlal v12.4S, v28.4H, v15.4H // ................................e............... + smlal2 v11.4S, v28.8H, v15.8H // .................................e.............. + ld1 {v22.8H}, [x6], #16 // .........................e...................... + uzp2 v19.8H, v23.8H, v20.8H // ......................................*......... + uzp1 v1.8H, v12.8H, v11.8H // .......................................e........ + smull v23.4S, v3.4H, v17.4H // .........e...................................... + mul v14.8H, v1.8H, v2.8H // ........................................e....... + zip2 v9.8H, v19.8H, v27.8H // .............................................*.. + ld1 {v4.8H}, [x3], #16 // ........e....................................... + smlal2 v11.4S, v14.8H, v0.8H // ..........................................e..... + + // ------------------------------------------------- new position --------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------------- + // ldr q12, [x1], #32 // ....................e..........................'....................~..........................'.................. 
+ // ldr q13, [x1, #-16] // .....................e.........................'.....................~.........................'.................. + // uzp1 v3.8h, v12.8h, v13.8h // ........................e......................'........................~......................'.................. + // uzp2 v4.8h, v12.8h, v13.8h // .........................e.....................'.........................~.....................'.................. + // ldr q12, [x2], #32 // .......e.......................................'.......~.......................................'.......~.......... + // ldr q13, [x2, #-16] // ...........e...................................'...........~...................................'...........~...... + // uzp1 v5.8h, v12.8h, v13.8h // ..............e................................'..............~................................'..............~... + // uzp2 v6.8h, v12.8h, v13.8h // ...............e...............................'...............~...............................'...............~.. + // ld1 {v7.8h}, [x3], #16 // .............................................e.'.............................................~.'.................. + // smull v8.4s, v3.4h, v5.4h // ..........................................e....'..........................................~....'.................. + // smull2 v10.4s, v3.8h, v5.8h // ...............................................*...............................................~.................. + // smlal v8.4s, v4.4h, v7.4h // ....~..........................................'....*..........................................'....~............. + // smlal2 v10.4s, v4.8h, v7.8h // ..~............................................'..*............................................'..~............... + // smull v9.4s, v3.4h, v6.4h // ..........................e....................'..........................~....................'.................. 
+ // smull2 v11.4s, v3.8h, v6.8h // ...........................e...................'...........................~...................'.................. + // smlal v9.4s, v4.4h, v5.4h // .............................e.................'.............................~.................'.................. + // smlal2 v11.4s, v4.8h, v5.8h // ..............................e................'..............................~................'.................. + // ldr q12, [x4], #32 // e..............................................'~..............................................'~................. + // ldr q13, [x4, #-16] // ...................e...........................'...................~...........................'.................. + // uzp1 v3.8h, v12.8h, v13.8h // .......................e.......................'.......................~.......................'.................. + // uzp2 v4.8h, v12.8h, v13.8h // ...................................e...........'...................................~...........'.................. + // ldr q12, [x5], #32 // .e.............................................'.~.............................................'.~................ + // ldr q13, [x5, #-16] // ............................e..................'............................~..................'.................. + // uzp1 v5.8h, v12.8h, v13.8h // ................................e..............'................................~..............'.................. + // uzp2 v6.8h, v12.8h, v13.8h // ...............................e...............'...............................~...............'.................. + // ld1 {v7.8h}, [x6], #16 // .......................................e.......'.......................................~.......'.................. + // smlal v8.4s, v3.4h, v5.4h // ........~......................................'........*......................................'........~......... 
+ // smlal2 v10.4s, v3.8h, v5.8h // ......~........................................'......*........................................'......~........... + // smlal v8.4s, v4.4h, v7.4h // ............~..................................'............*..................................'............~..... + // smlal2 v10.4s, v4.8h, v7.8h // .........~.....................................'.........*.....................................'.........~........ + // smlal v9.4s, v3.4h, v6.4h // .................................e.............'.................................~.............'.................. + // smlal2 v11.4s, v3.8h, v6.8h // ..................................e............'..................................~............'.................. + // smlal v9.4s, v4.4h, v5.4h // .....................................e.........'.....................................~.........'.................. + // smlal2 v11.4s, v4.8h, v5.8h // ......................................e........'......................................~........'.................. + // uzp1 v28.8h, v8.8h, v10.8h // ................~..............................'................*..............................'................~. + // mul v28.8h, v28.8h, v2.8h // ..................~............................'..................*............................'.................. + // smlal v8.4s, v28.4h, v0.4h // ....................................~..........'....................................*..........'.................. + // smlal2 v10.4s, v28.8h, v0.8h // ......................~........................'......................*........................'.................. + // uzp2 v26.8h, v8.8h, v10.8h // ........................................~......'........................................*......'.................. + // uzp1 v28.8h, v9.8h, v11.8h // .........................................e.....'.........................................~.....'.................. 
+ // mul v28.8h, v28.8h, v2.8h // ...........................................e...'...........................................~...'.................. + // smlal v9.4s, v28.4h, v0.4h // ...~...........................................'...*...........................................'...~.............. + // smlal2 v11.4s, v28.8h, v0.8h // ..............................................e'..............................................~'.................. + // uzp2 v27.8h, v9.8h, v11.8h // .............~.................................'.............*.................................'.............~.... + // zip1 v12.8h, v26.8h, v27.8h // ..........~....................................'..........~....................................'..........l....... + // zip2 v13.8h, v26.8h, v27.8h // ............................................~..'............................................*..'.................. + // str q12, [x0], #32 // .................~.............................'.................~.............................'.................l + // str q13, [x0, #-16] // .....~.........................................'.....~.........................................'.....l............ sub count, count, #1 - cbnz count, k2_loop_start - // Instructions: 27 - // Expected cycles: 34 - // Expected IPC: 0.79 - - // Cycle bound: 34.0 - // IPC bound: 0.79 - - // Wall time: 0.25s - // User time: 0.25s - - // ------- cycle (expected) --------> - // 0 25 - // |------------------------|-------- - smull v13.4S, v6.4H, v5.4H // *................................. - smull2 v10.4S, v6.8H, v5.8H // .*................................ - smull v21.4S, v6.4H, v4.4H // ..*............................... - smull2 v3.4S, v6.8H, v4.8H // ...*.............................. - smlal v13.4S, v7.4H, v4.4H // ....*............................. - smlal2 v10.4S, v7.8H, v4.8H // .....*............................ - smlal v21.4S, v7.4H, v12.4H // ......*........................... 
- smlal2 v3.4S, v7.8H, v12.8H // .......*.......................... - smlal v13.4S, v15.4H, v30.4H // ........*......................... - smlal2 v10.4S, v15.8H, v30.8H // .........*........................ - smlal v21.4S, v15.4H, v29.4H // ..........*....................... - smlal2 v3.4S, v15.8H, v29.8H // ...........*...................... - smlal v13.4S, v16.4H, v29.4H // ............*..................... - smlal2 v10.4S, v16.8H, v29.8H // .............*.................... - smlal v21.4S, v16.4H, v31.4H // ..............*................... - smlal2 v3.4S, v16.8H, v31.8H // ...............*.................. - uzp1 v12.8H, v13.8H, v10.8H // .................*................ - uzp1 v6.8H, v21.8H, v3.8H // ...................*.............. - mul v12.8H, v12.8H, v2.H[2] // ....................*............. - mul v6.8H, v6.8H, v2.H[2] // .....................*............ - smlal v13.4S, v12.4H, v0.4H // ........................*......... - smlal v21.4S, v6.4H, v0.4H // .........................*........ - smlal2 v3.4S, v6.8H, v0.8H // ..........................*....... - smlal2 v10.4S, v12.8H, v0.8H // ...........................*...... - uzp2 v21.8H, v21.8H, v3.8H // ..............................*... - uzp2 v22.8H, v13.8H, v10.8H // ...............................*.. - st2 {v21.8H, v22.8H}, [x0], #32 // .................................* - - // ------- cycle (expected) --------> - // 0 25 - // |------------------------|-------- - // smull v13.4S, v6.4H, v4.4H // ..*............................... - // smull2 v10.4S, v6.8H, v4.8H // ...*.............................. - // smull v21.4S, v6.4H, v5.4H // *................................. - // smull2 v3.4S, v6.8H, v5.8H // .*................................ - // smlal v13.4S, v7.4H, v12.4H // ......*........................... - // smlal2 v10.4S, v7.8H, v12.8H // .......*.......................... - // smlal v21.4S, v7.4H, v4.4H // ....*............................. 
- // smlal2 v3.4S, v7.8H, v4.8H // .....*............................ - // smlal v13.4S, v15.4H, v29.4H // ..........*....................... - // smlal2 v10.4S, v15.8H, v29.8H // ...........*...................... - // smlal v21.4S, v15.4H, v30.4H // ........*......................... - // smlal2 v3.4S, v15.8H, v30.8H // .........*........................ - // smlal v13.4S, v16.4H, v31.4H // ..............*................... - // smlal2 v10.4S, v16.8H, v31.8H // ...............*.................. - // smlal v21.4S, v16.4H, v29.4H // ............*..................... - // smlal2 v3.4S, v16.8H, v29.8H // .............*.................... - // uzp1 v12.8H, v13.8H, v10.8H // ...................*.............. - // uzp1 v30.8H, v21.8H, v3.8H // .................*................ - // mul v12.8H, v12.8H, v2.H[2] // .....................*............ - // mul v30.8H, v30.8H, v2.H[2] // ....................*............. - // smlal v13.4S, v12.4H, v0.4H // .........................*........ - // smlal2 v10.4S, v12.8H, v0.8H // ..........................*....... - // smlal v21.4S, v30.4H, v0.4H // ........................*......... - // smlal2 v3.4S, v30.8H, v0.8H // ...........................*...... - // uzp2 v13.8H, v13.8H, v10.8H // ..............................*... - // uzp2 v14.8H, v21.8H, v3.8H // ...............................*.. - // st2 {v13.8H, v14.8H}, [x0], #32 // .................................* + cbnz count, 1b + // Instructions: 21 + // Expected cycles: 35 + // Expected IPC: 0.60 + + // Cycle bound: 35.0 + // IPC bound: 0.60 + + // Wall time: 0.08s + // User time: 0.08s + + // ----- original position -----> + // 0 25 + // |------------------------|---- + smull2 v5.4S, v3.8H, v17.8H // *............................. + smlal v12.4S, v14.4H, v0.4H // ..*........................... + smlal v23.4S, v10.4H, v4.4H // ...*.......................... + str q9, [x0, #16] // ....*......................... 
+ smlal2 v5.4S, v10.8H, v4.8H // .*............................ + uzp2 v11.8H, v12.8H, v11.8H // ..........*................... + zip1 v9.8H, v19.8H, v27.8H // ........*..................... + smlal v23.4S, v13.4H, v15.4H // ......*....................... + smlal2 v5.4S, v13.8H, v15.8H // .....*........................ + str q9, [x0], #32 // ............*................. + smlal v23.4S, v28.4H, v22.4H // .........*.................... + smlal2 v5.4S, v28.8H, v22.8H // .......*...................... + uzp1 v9.8H, v23.8H, v5.8H // ...........*.................. + mul v9.8H, v9.8H, v2.8H // .............*................ + smlal2 v5.4S, v9.8H, v0.8H // ..............*............... + smlal v23.4S, v9.4H, v0.4H // ...............*.............. + uzp2 v9.8H, v23.8H, v5.8H // ................*............. + zip2 v5.8H, v9.8H, v11.8H // .................*............ + zip1 v9.8H, v9.8H, v11.8H // ...................*.......... + str q5, [x0, #16] // ..................*........... + str q9, [x0], #32 // ....................*......... + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // smull2 v20.4S, v3.8H, v17.8H // *.............................. + // smlal2 v20.4S, v10.8H, v4.8H // ....*.......................... + // smlal v12.4S, v14.4H, v0.4H // .*............................. + // smlal v23.4S, v10.4H, v4.4H // ..*............................ + // str q9, [x0, #16] // ...*........................... + // smlal2 v20.4S, v13.8H, v15.8H // ........*...................... + // smlal v23.4S, v13.4H, v15.4H // .......*....................... + // smlal2 v20.4S, v28.8H, v22.8H // ...........*................... + // zip1 v26.8H, v19.8H, v27.8H // ......*........................ + // smlal v23.4S, v28.4H, v22.4H // ..........*.................... + // uzp2 v27.8H, v12.8H, v11.8H // .....*......................... + // uzp1 v5.8H, v23.8H, v20.8H // ............*.................. 
+ // str q26, [x0], #32 // .........*..................... + // mul v31.8H, v5.8H, v2.8H // .............*................. + // smlal2 v20.4S, v31.8H, v0.8H // ..............*................ + // smlal v23.4S, v31.4H, v0.4H // ...............*............... + // uzp2 v19.8H, v23.8H, v20.8H // ................*.............. + // zip2 v9.8H, v19.8H, v27.8H // .................*............. + // str q9, [x0, #16] // ...................*........... + // zip1 v26.8H, v19.8H, v27.8H // ..................*............ + // str q26, [x0], #32 // ....................*.......... pop_stack @@ -353,17 +530,12 @@ k2_loop_start: #endif /* MLKEM_K == 2 */ #if MLKEM_K == 3 -.global MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt) -.global _MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt) +.global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt) -MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): -_MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): +MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): push_stack - - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - ldrsh modulus, [xtmp] - dup constN.8h, modulus + ldr q_modulus, c_modulus + ldr q_modulus_twisted, c_modulus_twisted // Computed bases of vector entries @@ -375,327 +547,453 @@ _MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): add b2_cache_ptr, b0_cache_ptr, #(2 * 512/2) mov count, #(MLKEM_N / 16) - // Instructions: 52 - // Expected cycles: 66 - // Expected IPC: 0.79 - - // Cycle bound: 66.0 - // IPC bound: 0.79 - - // Wall time: 8.09s - // User time: 8.09s - - // ----------------------- cycle (expected) ------------------------> - // 0 25 50 - // |------------------------|------------------------|--------------- - ld2 {v7.8H, v8.8H}, [x2], #32 // *................................................................. 
- ld2 {v28.8H, v29.8H}, [x1], #32 // ..*............................................................... - ld1 {v10.8H}, [x3], #16 // ....*............................................................. - smull2 v15.4S, v28.8H, v8.8H // ......*........................................................... - smull v9.4S, v28.4H, v8.4H // .......*.......................................................... - ld1 {v31.8H}, [x3], #16 // ........*......................................................... - smlal2 v15.4S, v29.8H, v7.8H // ..........*....................................................... - ld2 {v19.8H, v20.8H}, [x5], #32 // ...........*...................................................... - smlal v9.4S, v29.4H, v7.4H // .............*.................................................... - ld2 {v16.8H, v17.8H}, [x4], #32 // ..............*................................................... - ld2 {v3.8H, v4.8H}, [x7], #32 // ................*................................................. - smlal v9.4S, v16.4H, v20.4H // ..................*............................................... - smlal2 v15.4S, v16.8H, v20.8H // ...................*.............................................. - smull v1.4S, v28.4H, v7.4H // ....................*............................................. - smull2 v22.4S, v28.8H, v7.8H // .....................*............................................ - smlal v9.4S, v17.4H, v19.4H // ......................*........................................... - smlal2 v15.4S, v17.8H, v19.8H // .......................*.......................................... - ld2 {v6.8H, v7.8H}, [x8], #32 // ........................*......................................... - smlal v1.4S, v29.4H, v10.4H // ..........................*....................................... - smlal2 v22.4S, v29.8H, v10.8H // ...........................*...................................... 
- smlal2 v15.4S, v3.8H, v7.8H // ............................*..................................... - smlal v9.4S, v3.4H, v7.4H // .............................*.................................... - smlal v1.4S, v16.4H, v19.4H // ..............................*................................... - smlal2 v22.4S, v16.8H, v19.8H // ...............................*.................................. - ld1 {v5.8H}, [x6], #16 // ................................*................................. - smlal v9.4S, v4.4H, v6.4H // ..................................*............................... - smlal2 v15.4S, v4.8H, v6.8H // ...................................*.............................. - smlal v1.4S, v17.4H, v5.4H // ....................................*............................. - smlal2 v22.4S, v17.8H, v5.8H // .....................................*............................ - ld1 {v30.8H}, [x9], #16 // ......................................*........................... - smlal v1.4S, v3.4H, v6.4H // ........................................*......................... - smlal2 v22.4S, v3.8H, v6.8H // .........................................*........................ - ld2 {v11.8H, v12.8H}, [x1], #32 // ..........................................*....................... - smlal v1.4S, v4.4H, v30.4H // ............................................*..................... - smlal2 v22.4S, v4.8H, v30.8H // .............................................*.................... - uzp1 v21.8H, v9.8H, v15.8H // ..............................................*................... - ld1 {v14.8H}, [x6], #16 // ...............................................*.................. - uzp1 v13.8H, v1.8H, v22.8H // .................................................*................ - mul v10.8H, v21.8H, v2.H[2] // ..................................................*............... - mul v21.8H, v13.8H, v2.H[2] // ...................................................*.............. 
- ld2 {v23.8H, v24.8H}, [x2], #32 // ....................................................*............. - smlal2 v15.4S, v10.8H, v0.8H // ......................................................*........... - smlal2 v22.4S, v21.8H, v0.8H // .......................................................*.......... - smlal v1.4S, v21.4H, v0.4H // ........................................................*......... - smull2 v13.4S, v11.8H, v23.8H // .........................................................*........ - ld2 {v25.8H, v26.8H}, [x5], #32 // ..........................................................*....... - smlal v9.4S, v10.4H, v0.4H // ............................................................*..... - smlal2 v13.4S, v12.8H, v31.8H // .............................................................*.... - smull2 v27.4S, v11.8H, v24.8H // ..............................................................*... - uzp2 v16.8H, v1.8H, v22.8H // ...............................................................*.. - uzp2 v17.8H, v9.8H, v15.8H // ................................................................*. - ld2 {v6.8H, v7.8H}, [x4], #32 // .................................................................* - - // ----------------------- cycle (expected) ------------------------> - // 0 25 50 - // |------------------------|------------------------|--------------- - // ld2 {v23.8H, v24.8H}, [x2], #32 // *................................................................. - // ld2 {v11.8H, v12.8H}, [x1], #32 // ..*............................................................... - // ld2 {v6.8H, v7.8H}, [x4], #32 // ..............*................................................... - // ld1 {v31.8H}, [x3], #16 // ....*............................................................. - // ld2 {v25.8H, v26.8H}, [x5], #32 // ...........*...................................................... 
- // smull2 v13.4S, v11.8H, v23.8H // .....................*............................................ - // ld1 {v14.8H}, [x6], #16 // ................................*................................. - // smlal2 v13.4S, v12.8H, v31.8H // ...........................*...................................... - // smull2 v27.4S, v11.8H, v24.8H // ......*........................................................... - // smlal2 v13.4S, v6.8H, v25.8H // ...............................*.................................. - // smull v20.4S, v11.4H, v24.4H // .......*.......................................................... - // smull v17.4S, v11.4H, v23.4H // ....................*............................................. - // smlal2 v27.4S, v12.8H, v23.8H // ..........*....................................................... - // smlal2 v13.4S, v7.8H, v14.8H // .....................................*............................ - // smlal v20.4S, v12.4H, v23.4H // .............*.................................................... - // smlal v17.4S, v12.4H, v31.4H // ..........................*....................................... - // smlal2 v27.4S, v6.8H, v26.8H // ...................*.............................................. - // ld1 {v5.8H}, [x9], #16 // ......................................*........................... - // smlal v17.4S, v6.4H, v25.4H // ..............................*................................... - // smlal v20.4S, v6.4H, v26.4H // ..................*............................................... - // ld2 {v23.8H, v24.8H}, [x2], #32 // ....................................................*............. - // ld2 {v15.8H, v16.8H}, [x8], #32 // ........................*......................................... - // smlal v17.4S, v7.4H, v14.4H // ....................................*............................. - // ld2 {v8.8H, v9.8H}, [x7], #32 // ................*................................................. 
- // smlal v20.4S, v7.4H, v25.4H // ......................*........................................... - // smlal2 v27.4S, v7.8H, v25.8H // .......................*.......................................... - // smlal2 v13.4S, v8.8H, v15.8H // .........................................*........................ - // smlal v17.4S, v8.4H, v15.4H // ........................................*......................... - // smlal v20.4S, v8.4H, v16.4H // .............................*.................................... - // smlal2 v27.4S, v8.8H, v16.8H // ............................*..................................... - // smlal2 v13.4S, v9.8H, v5.8H // .............................................*.................... - // smlal v17.4S, v9.4H, v5.4H // ............................................*..................... - // smlal v20.4S, v9.4H, v15.4H // ..................................*............................... - // smlal2 v27.4S, v9.8H, v15.8H // ...................................*.............................. - // ld2 {v11.8H, v12.8H}, [x1], #32 // ..........................................*....................... - // uzp1 v31.8H, v17.8H, v13.8H // .................................................*................ - // uzp1 v8.8H, v20.8H, v27.8H // ..............................................*................... - // ld2 {v6.8H, v7.8H}, [x4], #32 // .................................................................* - // mul v18.8H, v31.8H, v2.H[2] // ...................................................*.............. - // mul v14.8H, v8.8H, v2.H[2] // ..................................................*............... - // ld1 {v31.8H}, [x3], #16 // ........*......................................................... - // smlal v17.4S, v18.4H, v0.4H // ........................................................*......... - // smlal2 v27.4S, v14.8H, v0.8H // ......................................................*........... 
- // smlal2 v13.4S, v18.8H, v0.8H // .......................................................*.......... - // smlal v20.4S, v14.4H, v0.4H // ............................................................*..... - // ld2 {v25.8H, v26.8H}, [x5], #32 // ..........................................................*....... - // uzp2 v16.8H, v17.8H, v13.8H // ...............................................................*.. - // smull2 v13.4S, v11.8H, v23.8H // .........................................................*........ - // uzp2 v17.8H, v20.8H, v27.8H // ................................................................*. - // ld1 {v14.8H}, [x6], #16 // ...............................................*.................. - // smlal2 v13.4S, v12.8H, v31.8H // .............................................................*.... - // smull2 v27.4S, v11.8H, v24.8H // ..............................................................*... + // Instructions: 75 + // Expected cycles: 103 + // Expected IPC: 0.73 + + // Cycle bound: 103.0 + // IPC bound: 0.73 + + // Wall time: 0.94s + // User time: 0.94s + + // --------------------------- original position ----------------------------> + // 0 25 50 + // |------------------------|------------------------| + ldr q7, [x2, #16] // *.......................................................................... + ldr q20, [x2], #32 // ..*........................................................................ + ldr q15, [x1, #16] // .*......................................................................... + uzp1 v8.8H, v20.8H, v7.8H // ...............*........................................................... + uzp2 v7.8H, v20.8H, v7.8H // ................*.......................................................... + ld1 {v20.8H}, [x3], #16 // ...*....................................................................... + ldr q30, [x1], #32 // ..............*............................................................ 
+ ldr q11, [x4], #32 // ....*...................................................................... + uzp1 v16.8H, v30.8H, v15.8H // .................*......................................................... + uzp2 v15.8H, v30.8H, v15.8H // ..................*........................................................ + smull v30.4S, v16.4H, v7.4H // ...................*....................................................... + smull2 v7.4S, v16.8H, v7.8H // ....................*...................................................... + smull v9.4S, v16.4H, v8.4H // .....................*..................................................... + smull2 v16.4S, v16.8H, v8.8H // ......................*.................................................... + smlal v30.4S, v15.4H, v8.4H // .......................*................................................... + smlal2 v7.4S, v15.8H, v8.8H // ........................*.................................................. + smlal v9.4S, v15.4H, v20.4H // .........................*................................................. + smlal2 v16.4S, v15.8H, v20.8H // ..........................*................................................ + ldr q20, [x4, #-16] // .....*..................................................................... + ldr q15, [x5], #32 // ......*.................................................................... + uzp1 v8.8H, v11.8H, v20.8H // ...........................*............................................... + uzp2 v20.8H, v11.8H, v20.8H // ............................*.............................................. + ldr q11, [x5, #-16] // .......*................................................................... + ld1 {v27.8H}, [x6], #16 // ........*.................................................................. + uzp1 v10.8H, v15.8H, v11.8H // .............................*............................................. 
+ uzp2 v15.8H, v15.8H, v11.8H // ..............................*............................................ + smlal v9.4S, v8.4H, v10.4H // ...............................*........................................... + smlal2 v16.4S, v8.8H, v10.8H // ................................*.......................................... + smlal v30.4S, v8.4H, v15.4H // .................................*......................................... + smlal2 v7.4S, v8.8H, v15.8H // ..................................*........................................ + smlal v9.4S, v20.4H, v27.4H // ...................................*....................................... + smlal2 v16.4S, v20.8H, v27.8H // ....................................*...................................... + smlal v30.4S, v20.4H, v10.4H // .....................................*..................................... + smlal2 v7.4S, v20.8H, v10.8H // ......................................*.................................... + ldr q20, [x7], #32 // .........*................................................................. + ldr q15, [x7, #-16] // ..........*................................................................ + ldr q8, [x8], #32 // ...........*............................................................... + uzp1 v11.8H, v20.8H, v15.8H // .......................................*................................... + uzp2 v20.8H, v20.8H, v15.8H // ........................................*.................................. + ldr q15, [x8, #-16] // ............*.............................................................. + ld1 {v27.8H}, [x9], #16 // .............*............................................................. + uzp1 v10.8H, v8.8H, v15.8H // .........................................*................................. + uzp2 v15.8H, v8.8H, v15.8H // ..........................................*................................ 
+ smlal v9.4S, v11.4H, v10.4H // ...........................................*............................... + smlal2 v16.4S, v11.8H, v10.8H // ............................................*.............................. + smlal v30.4S, v11.4H, v15.4H // .............................................*............................. + smlal2 v7.4S, v11.8H, v15.8H // ..............................................*............................ + smlal v9.4S, v20.4H, v27.4H // ...............................................*........................... + smlal2 v16.4S, v20.8H, v27.8H // ................................................*.......................... + smlal v30.4S, v20.4H, v10.4H // .................................................*......................... + smlal2 v7.4S, v20.8H, v10.8H // ..................................................*........................ + ldr q15, [x2], #32 // ...............................................................*........... + uzp1 v20.8H, v9.8H, v16.8H // ....................................................*...................... + uzp1 v8.8H, v30.8H, v7.8H // .....................................................*..................... + mul v20.8H, v20.8H, v2.8H // ......................................................*.................... + mul v8.8H, v8.8H, v2.8H // .......................................................*................... + ldr q21, [x4], #32 // .................................................................*......... + smlal v9.4S, v20.4H, v0.4H // ........................................................*.................. + smlal2 v16.4S, v20.8H, v0.8H // .........................................................*................. + smlal v30.4S, v8.4H, v0.4H // ..........................................................*................ + smlal2 v7.4S, v8.8H, v0.8H // ...........................................................*............... 
+ ldr q6, [x4, #-16] // ..................................................................*........ + uzp2 v27.8H, v9.8H, v16.8H // ............................................................*.............. + uzp2 v10.8H, v30.8H, v7.8H // .............................................................*............. + ldr q16, [x2, #-16] // ...................................................*....................... + ldr q30, [x1, #16] // ..............................................................*............ + ld1 {v9.8H}, [x3], #16 // ................................................................*.......... + ldr q1, [x5], #32 // ...................................................................*....... + ldr q12, [x5, #-16] // ....................................................................*...... + ld1 {v24.8H}, [x6], #16 // .....................................................................*..... + ldr q19, [x7], #32 // ......................................................................*.... + ldr q31, [x7, #-16] // .......................................................................*... + ldr q17, [x8], #32 // ........................................................................*.. + ldr q18, [x8, #-16] // .........................................................................*. + ld1 {v25.8H}, [x9], #16 // ..........................................................................* + + // ------------------------------ new position ------------------------------> + // 0 25 50 + // |------------------------|------------------------|------------------------ + // ldr q16, [x2, #16] // *.......................................................................... + // ldr q30, [x1, #16] // ..*........................................................................ + // ldr q15, [x2], #32 // .*......................................................................... 
+ // ld1 {v9.8H}, [x3], #16 // .....*..................................................................... + // ldr q21, [x4], #32 // .......*................................................................... + // ldr q6, [x4, #-16] // ..................*........................................................ + // ldr q1, [x5], #32 // ...................*....................................................... + // ldr q12, [x5, #-16] // ......................*.................................................... + // ld1 {v24.8H}, [x6], #16 // .......................*................................................... + // ldr q19, [x7], #32 // ..................................*........................................ + // ldr q31, [x7, #-16] // ...................................*....................................... + // ldr q17, [x8], #32 // ....................................*...................................... + // ldr q18, [x8, #-16] // .......................................*................................... + // ld1 {v25.8H}, [x9], #16 // ........................................*.................................. + // ldr q20, [x1], #32 // ......*.................................................................... + // uzp1 v7.8H, v15.8H, v16.8H // ...*....................................................................... + // uzp2 v15.8H, v15.8H, v16.8H // ....*...................................................................... + // uzp1 v8.8H, v20.8H, v30.8H // ........*.................................................................. + // uzp2 v20.8H, v20.8H, v30.8H // .........*................................................................. + // smull v30.4S, v8.4H, v15.4H // ..........*................................................................ + // smull2 v15.4S, v8.8H, v15.8H // ...........*............................................................... 
+ // smull v11.4S, v8.4H, v7.4H // ............*.............................................................. + // smull2 v8.4S, v8.8H, v7.8H // .............*............................................................. + // smlal v30.4S, v20.4H, v7.4H // ..............*............................................................ + // smlal2 v15.4S, v20.8H, v7.8H // ...............*........................................................... + // smlal v11.4S, v20.4H, v9.4H // ................*.......................................................... + // smlal2 v8.4S, v20.8H, v9.8H // .................*......................................................... + // uzp1 v7.8H, v21.8H, v6.8H // ....................*...................................................... + // uzp2 v20.8H, v21.8H, v6.8H // .....................*..................................................... + // uzp1 v16.8H, v1.8H, v12.8H // ........................*.................................................. + // uzp2 v9.8H, v1.8H, v12.8H // .........................*................................................. + // smlal v11.4S, v7.4H, v16.4H // ..........................*................................................ + // smlal2 v8.4S, v7.8H, v16.8H // ...........................*............................................... + // smlal v30.4S, v7.4H, v9.4H // ............................*.............................................. + // smlal2 v15.4S, v7.8H, v9.8H // .............................*............................................. + // smlal v11.4S, v20.4H, v24.4H // ..............................*............................................ + // smlal2 v8.4S, v20.8H, v24.8H // ...............................*........................................... + // smlal v30.4S, v20.4H, v16.4H // ................................*.......................................... 
+ // smlal2 v15.4S, v20.8H, v16.8H // .................................*......................................... + // uzp1 v7.8H, v19.8H, v31.8H // .....................................*..................................... + // uzp2 v20.8H, v19.8H, v31.8H // ......................................*.................................... + // uzp1 v16.8H, v17.8H, v18.8H // .........................................*................................. + // uzp2 v9.8H, v17.8H, v18.8H // ..........................................*................................ + // smlal v11.4S, v7.4H, v16.4H // ...........................................*............................... + // smlal2 v8.4S, v7.8H, v16.8H // ............................................*.............................. + // smlal v30.4S, v7.4H, v9.4H // .............................................*............................. + // smlal2 v15.4S, v7.8H, v9.8H // ..............................................*............................ + // smlal v11.4S, v20.4H, v25.4H // ...............................................*........................... + // smlal2 v8.4S, v20.8H, v25.8H // ................................................*.......................... + // smlal v30.4S, v20.4H, v16.4H // .................................................*......................... + // smlal2 v15.4S, v20.8H, v16.8H // ..................................................*........................ + // ldr q16, [x2, #16] // ................................................................*.......... + // uzp1 v7.8H, v11.8H, v8.8H // ....................................................*...................... + // uzp1 v20.8H, v30.8H, v15.8H // .....................................................*..................... + // mul v7.8H, v7.8H, v2.8H // ......................................................*.................... 
+ // mul v20.8H, v20.8H, v2.8H // .......................................................*................... + // smlal v11.4S, v7.4H, v0.4H // .........................................................*................. + // smlal2 v8.4S, v7.8H, v0.8H // ..........................................................*................ + // smlal v30.4S, v20.4H, v0.4H // ...........................................................*............... + // smlal2 v15.4S, v20.8H, v0.8H // ............................................................*.............. + // uzp2 v27.8H, v11.8H, v8.8H // ..............................................................*............ + // uzp2 v10.8H, v30.8H, v15.8H // ...............................................................*........... + // ldr q30, [x1, #16] // .................................................................*......... + // ldr q15, [x2], #32 // ...................................................*....................... + // ld1 {v9.8H}, [x3], #16 // ..................................................................*........ + // ldr q21, [x4], #32 // ........................................................*.................. + // ldr q6, [x4, #-16] // .............................................................*............. + // ldr q1, [x5], #32 // ...................................................................*....... + // ldr q12, [x5, #-16] // ....................................................................*...... + // ld1 {v24.8H}, [x6], #16 // .....................................................................*..... + // ldr q19, [x7], #32 // ......................................................................*.... + // ldr q31, [x7, #-16] // .......................................................................*... + // ldr q17, [x8], #32 // ........................................................................*.. 
+ // ldr q18, [x8, #-16] // .........................................................................*. + // ld1 {v25.8H}, [x9], #16 // ..........................................................................* sub count, count, #2 -k3_loop_start: - // Instructions: 44 - // Expected cycles: 54 - // Expected IPC: 0.81 - - // Cycle bound: 54.0 - // IPC bound: 0.81 - - // Wall time: 4.51s - // User time: 4.51s - - // ----------------- cycle (expected) ------------------> - // 0 25 50 - // |------------------------|------------------------|--- - st2 {v16.8H, v17.8H}, [x0], #32 // l..................................................... - smlal2 v13.4S, v6.8H, v25.8H // ..*................................................... - smull v20.4S, v11.4H, v24.4H // ...*.................................................. - smull v17.4S, v11.4H, v23.4H // ....*................................................. - smlal2 v27.4S, v12.8H, v23.8H // .....*................................................ - smlal2 v13.4S, v7.8H, v14.8H // ......*............................................... - smlal v20.4S, v12.4H, v23.4H // .......*.............................................. - smlal v17.4S, v12.4H, v31.4H // ........*............................................. - smlal2 v27.4S, v6.8H, v26.8H // .........*............................................ - ld1 {v5.8H}, [x9], #16 // ..........*........................................... - smlal v17.4S, v6.4H, v25.4H // ............*......................................... - smlal v20.4S, v6.4H, v26.4H // .............*........................................ - ld2 {v23.8H, v24.8H}, [x2], #32 // ..............e....................................... - ld2 {v15.8H, v16.8H}, [x8], #32 // ................*..................................... - smlal v17.4S, v7.4H, v14.4H // ..................*................................... - ld2 {v8.8H, v9.8H}, [x7], #32 // ...................*.................................. 
- smlal v20.4S, v7.4H, v25.4H // .....................*................................ - smlal2 v27.4S, v7.8H, v25.8H // ......................*............................... - smlal2 v13.4S, v8.8H, v15.8H // .......................*.............................. - smlal v17.4S, v8.4H, v15.4H // ........................*............................. - smlal v20.4S, v8.4H, v16.4H // .........................*............................ - smlal2 v27.4S, v8.8H, v16.8H // ..........................*........................... - smlal2 v13.4S, v9.8H, v5.8H // ...........................*.......................... - smlal v17.4S, v9.4H, v5.4H // ............................*......................... - smlal v20.4S, v9.4H, v15.4H // .............................*........................ - smlal2 v27.4S, v9.8H, v15.8H // ..............................*....................... - ld2 {v11.8H, v12.8H}, [x1], #32 // ...............................e...................... - uzp1 v31.8H, v17.8H, v13.8H // .................................*.................... - uzp1 v8.8H, v20.8H, v27.8H // ..................................*................... - ld2 {v6.8H, v7.8H}, [x4], #32 // ...................................e.................. - mul v18.8H, v31.8H, v2.H[2] // .....................................*................ - mul v14.8H, v8.8H, v2.H[2] // ......................................*............... - ld1 {v31.8H}, [x3], #16 // .......................................e.............. - smlal v17.4S, v18.4H, v0.4H // .........................................*............ - smlal2 v27.4S, v14.8H, v0.8H // ..........................................*........... - smlal2 v13.4S, v18.8H, v0.8H // ...........................................*.......... - smlal v20.4S, v14.4H, v0.4H // ............................................*......... - ld2 {v25.8H, v26.8H}, [x5], #32 // .............................................e........ 
- uzp2 v16.8H, v17.8H, v13.8H // ...............................................*...... - smull2 v13.4S, v11.8H, v23.8H // ................................................e..... - uzp2 v17.8H, v20.8H, v27.8H // .................................................*.... - ld1 {v14.8H}, [x6], #16 // ..................................................e... - smlal2 v13.4S, v12.8H, v31.8H // ....................................................e. - smull2 v27.4S, v11.8H, v24.8H // .....................................................e - - // -------------------------------------- cycle (expected) --------------------------------------> - // 0 25 50 75 - // |------------------------|------------------------|------------------------|------------------- - // ld2 {v3.8h, v4.8h}, [x1], #32 // .................e......................'..............................~....................... - // ld2 {v5.8h, v6.8h}, [x2], #32 // e.......................................'.............~........................................ - // ld1 {v7.8h}, [x3], #16 // .........................e..............'......................................~............... - // smull v8.4s, v3.4h, v5.4h // ........................................'...*.................................................. - // smull2 v10.4s, v3.8h, v5.8h // ..................................e.....'...............................................~...... - // smlal v8.4s, v4.4h, v7.4h // ........................................'.......*.............................................. - // smlal2 v10.4s, v4.8h, v7.8h // ......................................e.'...................................................~.. - // smull v9.4s, v3.4h, v6.4h // ........................................'..*................................................... - // smull2 v11.4s, v3.8h, v6.8h // .......................................e'....................................................~. 
- // smlal v9.4s, v4.4h, v5.4h // ........................................'......*............................................... - // smlal2 v11.4s, v4.8h, v5.8h // ........................................'....*................................................. - // ld2 {v3.8h, v4.8h}, [x4], #32 // .....................e..................'..................................~................... - // ld2 {v5.8h, v6.8h}, [x5], #32 // ...............................e........'............................................~......... - // ld1 {v7.8h}, [x6], #16 // ....................................e...'.................................................~.... - // smlal v8.4s, v3.4h, v5.4h // ........................................'...........*.......................................... - // smlal2 v10.4s, v3.8h, v5.8h // ........................................'.*.................................................... - // smlal v8.4s, v4.4h, v7.4h // ....~...................................'.................*.................................... - // smlal2 v10.4s, v4.8h, v7.8h // ........................................'.....*................................................ - // smlal v9.4s, v3.4h, v6.4h // ........................................'............*......................................... - // smlal2 v11.4s, v3.8h, v6.8h // ........................................'........*............................................. - // smlal v9.4s, v4.4h, v5.4h // .......~................................'....................*................................. - // smlal2 v11.4s, v4.8h, v5.8h // ........~...............................'.....................*................................ - // ld2 {v3.8h, v4.8h}, [x7], #32 // .....~..................................'..................*................................... - // ld2 {v5.8h, v6.8h}, [x8], #32 // ..~.....................................'...............*...................................... 
- // ld1 {v7.8h}, [x9], #16 // ........................................'.........*............................................ - // smlal v8.4s, v3.4h, v5.4h // ..........~.............................'.......................*.............................. - // smlal2 v10.4s, v3.8h, v5.8h // .........~..............................'......................*............................... - // smlal v8.4s, v4.4h, v7.4h // ..............~.........................'...........................*.......................... - // smlal2 v10.4s, v4.8h, v7.8h // .............~..........................'..........................*........................... - // smlal v9.4s, v3.4h, v6.4h // ...........~............................'........................*............................. - // smlal2 v11.4s, v3.8h, v6.8h // ............~...........................'.........................*............................ - // smlal v9.4s, v4.4h, v5.4h // ...............~........................'............................*......................... - // smlal2 v11.4s, v4.8h, v5.8h // ................~.......................'.............................*........................ - // uzp1 v28.8h, v8.8h, v10.8h // ...................~....................'................................*..................... - // mul v28.8h, v28.8h, v2.h[2] // .......................~................'....................................*................. - // smlal v8.4s, v28.4h, v0.4h // ...........................~............'........................................*............. - // smlal2 v10.4s, v28.8h, v0.8h // .............................~..........'..........................................*........... - // uzp2 v26.8h, v8.8h, v10.8h // .................................~......'..............................................*....... - // uzp1 v28.8h, v9.8h, v11.8h // ....................~...................'.................................*.................... 
- // mul v28.8h, v28.8h, v2.h[2] // ........................~...............'.....................................*................ - // smlal v9.4s, v28.4h, v0.4h // ..............................~.........'...........................................*.......... - // smlal2 v11.4s, v28.8h, v0.8h // ............................~...........'.........................................*............ - // uzp2 v27.8h, v9.8h, v11.8h // ...................................~....'................................................*..... - // st2 {v26.8h, v27.8h}, [x0], #32 // ........................................~.....................................................l +1: + // Instructions: 65 + // Expected cycles: 80 + // Expected IPC: 0.81 + + // Cycle bound: 80.0 + // IPC bound: 0.81 + + // Wall time: 11.64s + // User time: 11.64s + + // ---------------------- original position -----------------------> + // 0 25 50 + // |------------------------|------------------------|-------------- + ldr q20, [x1], #32 // *................................................................ + uzp1 v7.8H, v15.8H, v16.8H // ......*.......................................................... + uzp2 v15.8H, v15.8H, v16.8H // .......*......................................................... + uzp1 v8.8H, v20.8H, v30.8H // ..*.............................................................. + uzp2 v20.8H, v20.8H, v30.8H // ...*............................................................. + smull v30.4S, v8.4H, v15.4H // .............*................................................... + smull2 v15.4S, v8.8H, v15.8H // ..............*.................................................. + smull v11.4S, v8.4H, v7.4H // .........*....................................................... + smull2 v8.4S, v8.8H, v7.8H // ..........*...................................................... + smlal v30.4S, v20.4H, v7.4H // ...............*................................................. 
+ smlal2 v15.4S, v20.8H, v7.8H // ................*................................................ + smlal v11.4S, v20.4H, v9.4H // ...........*..................................................... + smlal2 v8.4S, v20.8H, v9.8H // ............*.................................................... + uzp1 v7.8H, v21.8H, v6.8H // ...................*............................................. + uzp2 v20.8H, v21.8H, v6.8H // ....................*............................................ + uzp1 v16.8H, v1.8H, v12.8H // .......................*......................................... + uzp2 v9.8H, v1.8H, v12.8H // ........................*........................................ + smlal v11.4S, v7.4H, v16.4H // ..........................*...................................... + smlal2 v8.4S, v7.8H, v16.8H // ...........................*..................................... + smlal v30.4S, v7.4H, v9.4H // ..............................*.................................. + smlal2 v15.4S, v7.8H, v9.8H // ...............................*................................. + smlal v11.4S, v20.4H, v24.4H // ............................*.................................... + smlal2 v8.4S, v20.8H, v24.8H // .............................*................................... + smlal v30.4S, v20.4H, v16.4H // ................................*................................ + smlal2 v15.4S, v20.8H, v16.8H // .................................*............................... + uzp1 v7.8H, v19.8H, v31.8H // ....................................*............................ + uzp2 v20.8H, v19.8H, v31.8H // .....................................*........................... + uzp1 v16.8H, v17.8H, v18.8H // ........................................*........................ + uzp2 v9.8H, v17.8H, v18.8H // .........................................*....................... + smlal v11.4S, v7.4H, v16.4H // ...........................................*..................... 
+ smlal2 v8.4S, v7.8H, v16.8H // ............................................*.................... + smlal v30.4S, v7.4H, v9.4H // ...............................................*................. + smlal2 v15.4S, v7.8H, v9.8H // ................................................*................ + smlal v11.4S, v20.4H, v25.4H // .............................................*................... + smlal2 v8.4S, v20.8H, v25.8H // ..............................................*.................. + smlal v30.4S, v20.4H, v16.4H // .................................................*............... + smlal2 v15.4S, v20.8H, v16.8H // ..................................................*.............. + ldr q16, [x2, #16] // .....e........................................................... + uzp1 v7.8H, v11.8H, v8.8H // ...................................................*............. + uzp1 v20.8H, v30.8H, v15.8H // ........................................................*........ + mul v7.8H, v7.8H, v2.8H // ....................................................*............ + mul v20.8H, v20.8H, v2.8H // .........................................................*....... + zip2 v9.8H, v27.8H, v10.8H // ..............................................................l.. + zip1 v27.8H, v27.8H, v10.8H // .............................................................l... + smlal v11.4S, v7.4H, v0.4H // .....................................................*........... + smlal2 v8.4S, v7.8H, v0.8H // ......................................................*.......... + smlal v30.4S, v20.4H, v0.4H // ..........................................................*...... + smlal2 v15.4S, v20.8H, v0.8H // ...........................................................*..... + str q27, [x0], #32 // ...............................................................l. + uzp2 v27.8H, v11.8H, v8.8H // .......................................................*......... 
+ str q9, [x0, #-16] // ................................................................l + uzp2 v10.8H, v30.8H, v15.8H // ............................................................*.... + ldr q30, [x1, #16] // .e............................................................... + ldr q15, [x2], #32 // ....e............................................................ + ld1 {v9.8H}, [x3], #16 // ........e........................................................ + ldr q21, [x4], #32 // .................e............................................... + ldr q6, [x4, #-16] // ..................e.............................................. + ldr q1, [x5], #32 // .....................e........................................... + ldr q12, [x5, #-16] // ......................e.......................................... + ld1 {v24.8H}, [x6], #16 // .........................e....................................... + ldr q19, [x7], #32 // ..................................e.............................. + ldr q31, [x7, #-16] // ...................................e............................. + ldr q17, [x8], #32 // ......................................e.......................... + ldr q18, [x8, #-16] // .......................................e......................... + ld1 {v25.8H}, [x9], #16 // ..........................................e...................... + + // ---------------------------------------------------------------- new position -----------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------ + // ldr q12, [x1], #32 // ............................*................................................................~.................................................. 
+ // ldr q13, [x1, #-16] // ...............e............'...................................................~............'.................................................. + // uzp1 v3.8h, v12.8h, v13.8h // ............................'..*.............................................................'..~............................................... + // uzp2 v4.8h, v12.8h, v13.8h // ............................'...*............................................................'...~.............................................. + // ldr q12, [x2], #32 // ................e...........'....................................................~...........'.................................................. + // ldr q13, [x2, #-16] // e...........................'....................................~...........................'....................................~............. + // uzp1 v5.8h, v12.8h, v13.8h // ............................'*...............................................................'~................................................. + // uzp2 v6.8h, v12.8h, v13.8h // ............................'.*..............................................................'.~................................................ + // ld1 {v7.8h}, [x3], #16 // .................e..........'.....................................................~..........'.................................................. + // smull v8.4s, v3.4h, v5.4h // ............................'......*.........................................................'......~........................................... + // smull2 v10.4s, v3.8h, v5.8h // ............................'.......*........................................................'.......~.......................................... + // smlal v8.4s, v4.4h, v7.4h // ............................'..........*.....................................................'..........~....................................... 
+ // smlal2 v10.4s, v4.8h, v7.8h // ............................'...........*....................................................'...........~...................................... + // smull v9.4s, v3.4h, v6.4h // ............................'....*...........................................................'....~............................................. + // smull2 v11.4s, v3.8h, v6.8h // ............................'.....*..........................................................'.....~............................................ + // smlal v9.4s, v4.4h, v5.4h // ............................'........*.......................................................'........~......................................... + // smlal2 v11.4s, v4.8h, v5.8h // ............................'.........*......................................................'.........~........................................ + // ldr q12, [x4], #32 // ..................e.........'......................................................~.........'.................................................. + // ldr q13, [x4, #-16] // ...................e........'.......................................................~........'.................................................. + // uzp1 v3.8h, v12.8h, v13.8h // ............................'............*...................................................'............~..................................... + // uzp2 v4.8h, v12.8h, v13.8h // ............................'.............*..................................................'.............~.................................... + // ldr q12, [x5], #32 // ....................e.......'........................................................~.......'.................................................. + // ldr q13, [x5, #-16] // .....................e......'.........................................................~......'.................................................. 
+ // uzp1 v5.8h, v12.8h, v13.8h // ............................'..............*.................................................'..............~................................... + // uzp2 v6.8h, v12.8h, v13.8h // ............................'...............*................................................'...............~.................................. + // ld1 {v7.8h}, [x6], #16 // ......................e.....'..........................................................~.....'.................................................. + // smlal v8.4s, v3.4h, v5.4h // ............................'................*...............................................'................~................................. + // smlal2 v10.4s, v3.8h, v5.8h // ............................'.................*..............................................'.................~................................ + // smlal v8.4s, v4.4h, v7.4h // ............................'....................*...........................................'....................~............................. + // smlal2 v10.4s, v4.8h, v7.8h // ............................'.....................*..........................................'.....................~............................ + // smlal v9.4s, v3.4h, v6.4h // ............................'..................*.............................................'..................~............................... + // smlal2 v11.4s, v3.8h, v6.8h // ............................'...................*............................................'...................~.............................. + // smlal v9.4s, v4.4h, v5.4h // ............................'......................*.........................................'......................~........................... + // smlal2 v11.4s, v4.8h, v5.8h // ............................'.......................*........................................'.......................~.......................... 
+ // ldr q12, [x7], #32 // .......................e....'...........................................................~....'.................................................. + // ldr q13, [x7, #-16] // ........................e...'............................................................~...'.................................................. + // uzp1 v3.8h, v12.8h, v13.8h // ............................'........................*.......................................'........................~......................... + // uzp2 v4.8h, v12.8h, v13.8h // ............................'.........................*......................................'.........................~........................ + // ldr q12, [x8], #32 // .........................e..'.............................................................~..'.................................................. + // ldr q13, [x8, #-16] // ..........................e.'..............................................................~.'.................................................. + // uzp1 v5.8h, v12.8h, v13.8h // ............................'..........................*.....................................'..........................~....................... + // uzp2 v6.8h, v12.8h, v13.8h // ............................'...........................*....................................'...........................~...................... + // ld1 {v7.8h}, [x9], #16 // ...........................e'...............................................................~'.................................................. + // smlal v8.4s, v3.4h, v5.4h // ............................'............................*...................................'............................~..................... + // smlal2 v10.4s, v3.8h, v5.8h // ............................'.............................*..................................'.............................~.................... 
+ // smlal v8.4s, v4.4h, v7.4h // ............................'................................*...............................'................................~................. + // smlal2 v10.4s, v4.8h, v7.8h // ............................'.................................*..............................'.................................~................ + // smlal v9.4s, v3.4h, v6.4h // ............................'..............................*.................................'..............................~................... + // smlal2 v11.4s, v3.8h, v6.8h // ............................'...............................*................................'...............................~.................. + // smlal v9.4s, v4.4h, v5.4h // ............................'..................................*.............................'..................................~............... + // smlal2 v11.4s, v4.8h, v5.8h // ............................'...................................*............................'...................................~.............. + // uzp1 v28.8h, v8.8h, v10.8h // .~..........................'.....................................*..........................'.....................................~............ + // mul v28.8h, v28.8h, v2.8h // ...~........................'.......................................*........................'.......................................~.......... + // smlal v8.4s, v28.4h, v0.4h // .......~....................'...........................................*....................'...........................................~...... + // smlal2 v10.4s, v28.8h, v0.8h // ........~...................'............................................*...................'............................................~..... + // uzp2 v26.8h, v8.8h, v10.8h // ............~...............'................................................*...............'................................................~. 
+ // uzp1 v28.8h, v9.8h, v11.8h // ..~.........................'......................................*.........................'......................................~........... + // mul v28.8h, v28.8h, v2.8h // ....~.......................'........................................*.......................'........................................~......... + // smlal v9.4s, v28.4h, v0.4h // .........~..................'.............................................*..................'.............................................~.... + // smlal2 v11.4s, v28.8h, v0.8h // ..........~.................'..............................................*.................'..............................................~... + // uzp2 v27.8h, v9.8h, v11.8h // ..............~.............'..................................................*.............'.................................................. + // zip1 v12.8h, v26.8h, v27.8h // ......~.....................'..........................................~.....................'..........................................l....... + // zip2 v13.8h, v26.8h, v27.8h // .....~......................'.........................................~......................'.........................................l........ + // str q12, [x0], #32 // ...........~................'...............................................~................'...............................................l.. 
+ // str q13, [x0, #-16] // .............~..............'.................................................~..............'.................................................l sub count, count, #1 - cbnz count, k3_loop_start - // Instructions: 36 - // Expected cycles: 45 - // Expected IPC: 0.80 - - // Cycle bound: 45.0 - // IPC bound: 0.80 - - // Wall time: 0.30s - // User time: 0.30s - - // ------------- cycle (expected) -------------> - // 0 25 - // |------------------------|------------------- - ld2 {v21.8H, v22.8H}, [x8], #32 // *............................................ - smull v10.4S, v11.4H, v24.4H // ..*.......................................... - smlal2 v13.4S, v6.8H, v25.8H // ...*......................................... - smlal2 v27.4S, v12.8H, v23.8H // ....*........................................ - smull v18.4S, v11.4H, v23.4H // .....*....................................... - smlal v10.4S, v12.4H, v23.4H // ......*...................................... - ld2 {v29.8H, v30.8H}, [x7], #32 // .......*..................................... - smlal2 v27.4S, v6.8H, v26.8H // .........*................................... - smlal v10.4S, v6.4H, v26.4H // ..........*.................................. - smlal v18.4S, v12.4H, v31.4H // ...........*................................. - smlal2 v13.4S, v7.8H, v14.8H // ............*................................ - smlal2 v27.4S, v7.8H, v25.8H // .............*............................... - smlal v10.4S, v7.4H, v25.4H // ..............*.............................. - smlal v18.4S, v6.4H, v25.4H // ...............*............................. - smlal2 v13.4S, v29.8H, v21.8H // ................*............................ - smlal2 v27.4S, v29.8H, v22.8H // .................*........................... - smlal v10.4S, v29.4H, v22.4H // ..................*.......................... - smlal v18.4S, v7.4H, v14.4H // ...................*......................... 
- ld1 {v3.8H}, [x9], #16 // ....................*........................ - smlal v10.4S, v30.4H, v21.4H // ......................*...................... - smlal v18.4S, v29.4H, v21.4H // .......................*..................... - smlal2 v27.4S, v30.8H, v21.8H // ........................*.................... - smlal2 v13.4S, v30.8H, v3.8H // .........................*................... - smlal v18.4S, v30.4H, v3.4H // ...........................*................. - uzp1 v8.8H, v10.8H, v27.8H // ............................*................ - mul v23.8H, v8.8H, v2.H[2] // ..............................*.............. - uzp1 v30.8H, v18.8H, v13.8H // ...............................*............. - mul v19.8H, v30.8H, v2.H[2] // .................................*........... - smlal2 v27.4S, v23.8H, v0.8H // ..................................*.......... - smlal v10.4S, v23.4H, v0.4H // ...................................*......... - smlal v18.4S, v19.4H, v0.4H // .....................................*....... - smlal2 v13.4S, v19.8H, v0.8H // ......................................*...... - st2 {v16.8H, v17.8H}, [x0], #32 // .......................................*..... - uzp2 v12.8H, v10.8H, v27.8H // .........................................*... - uzp2 v11.8H, v18.8H, v13.8H // ..........................................*.. - st2 {v11.8H, v12.8H}, [x0], #32 // ............................................* - - // ------------- cycle (expected) -------------> - // 0 25 - // |------------------------|------------------- - // st2 {v16.8H, v17.8H}, [x0], #32 // .......................................*..... - // smlal2 v13.4S, v6.8H, v25.8H // ...*......................................... - // smull v20.4S, v11.4H, v24.4H // ..*.......................................... - // smull v17.4S, v11.4H, v23.4H // .....*....................................... - // smlal2 v27.4S, v12.8H, v23.8H // ....*........................................ 
- // smlal2 v13.4S, v7.8H, v14.8H // ............*................................ - // smlal v20.4S, v12.4H, v23.4H // ......*...................................... - // smlal v17.4S, v12.4H, v31.4H // ...........*................................. - // smlal2 v27.4S, v6.8H, v26.8H // .........*................................... - // ld1 {v5.8H}, [x9], #16 // ....................*........................ - // smlal v17.4S, v6.4H, v25.4H // ...............*............................. - // smlal v20.4S, v6.4H, v26.4H // ..........*.................................. - // ld2 {v15.8H, v16.8H}, [x8], #32 // *............................................ - // smlal v17.4S, v7.4H, v14.4H // ...................*......................... - // ld2 {v8.8H, v9.8H}, [x7], #32 // .......*..................................... - // smlal v20.4S, v7.4H, v25.4H // ..............*.............................. - // smlal2 v27.4S, v7.8H, v25.8H // .............*............................... - // smlal2 v13.4S, v8.8H, v15.8H // ................*............................ - // smlal v17.4S, v8.4H, v15.4H // .......................*..................... - // smlal v20.4S, v8.4H, v16.4H // ..................*.......................... - // smlal2 v27.4S, v8.8H, v16.8H // .................*........................... - // smlal2 v13.4S, v9.8H, v5.8H // .........................*................... - // smlal v17.4S, v9.4H, v5.4H // ...........................*................. - // smlal v20.4S, v9.4H, v15.4H // ......................*...................... - // smlal2 v27.4S, v9.8H, v15.8H // ........................*.................... - // uzp1 v31.8H, v17.8H, v13.8H // ...............................*............. - // uzp1 v8.8H, v20.8H, v27.8H // ............................*................ - // mul v18.8H, v31.8H, v2.H[2] // .................................*........... - // mul v14.8H, v8.8H, v2.H[2] // ..............................*.............. 
- // smlal v17.4S, v18.4H, v0.4H // .....................................*....... - // smlal2 v27.4S, v14.8H, v0.8H // ..................................*.......... - // smlal2 v13.4S, v18.8H, v0.8H // ......................................*...... - // smlal v20.4S, v14.4H, v0.4H // ...................................*......... - // uzp2 v16.8H, v17.8H, v13.8H // ..........................................*.. - // uzp2 v17.8H, v20.8H, v27.8H // .........................................*... - // st2 {v16.8H, v17.8H}, [x0], #32 // ............................................* + cbnz count, 1b + // Instructions: 55 + // Expected cycles: 61 + // Expected IPC: 0.90 + + // Cycle bound: 61.0 + // IPC bound: 0.90 + + // Wall time: 8.41s + // User time: 8.41s + + // ----------------- original position ------------------> + // 0 25 50 + // |------------------------|------------------------|---- + ldr q7, [x1], #32 // *...................................................... + uzp1 v20.8H, v15.8H, v16.8H // .*..................................................... + uzp2 v15.8H, v15.8H, v16.8H // ..*.................................................... + uzp1 v23.8H, v7.8H, v30.8H // ...*................................................... + uzp2 v11.8H, v7.8H, v30.8H // ....*.................................................. + smull2 v8.4S, v23.8H, v20.8H // ........*.............................................. + smull v5.4S, v23.4H, v20.4H // .......*............................................... + smull2 v30.4S, v23.8H, v15.8H // ......*................................................ + uzp1 v28.8H, v1.8H, v12.8H // ...............*....................................... + smlal2 v8.4S, v11.8H, v9.8H // ............*.......................................... + smlal v5.4S, v11.4H, v9.4H // ...........*........................................... + uzp1 v3.8H, v21.8H, v6.8H // .............*......................................... 
+ smull v16.4S, v23.4H, v15.4H // .....*................................................. + smlal2 v8.4S, v3.8H, v28.8H // ..................*.................................... + smlal v5.4S, v3.4H, v28.4H // .................*..................................... + uzp2 v29.8H, v21.8H, v6.8H // ..............*........................................ + uzp1 v7.8H, v17.8H, v18.8H // ...........................*........................... + smlal2 v8.4S, v29.8H, v24.8H // ......................*................................ + uzp1 v14.8H, v19.8H, v31.8H // .........................*............................. + smlal v16.4S, v11.4H, v20.4H // .........*............................................. + smlal2 v30.4S, v11.8H, v20.8H // ..........*............................................ + smlal2 v8.4S, v14.8H, v7.8H // ..............................*........................ + uzp2 v20.8H, v1.8H, v12.8H // ................*...................................... + uzp2 v21.8H, v19.8H, v31.8H // ..........................*............................ + smlal2 v30.4S, v3.8H, v20.8H // ....................*.................................. + smlal v16.4S, v3.4H, v20.4H // ...................*................................... + smlal v5.4S, v29.4H, v24.4H // .....................*................................. + uzp2 v9.8H, v17.8H, v18.8H // ............................*.......................... + smlal2 v30.4S, v29.8H, v28.8H // ........................*.............................. + smlal v16.4S, v29.4H, v28.4H // .......................*............................... + smlal v5.4S, v14.4H, v7.4H // .............................*......................... + smlal2 v8.4S, v21.8H, v25.8H // ..................................*.................... + smlal2 v30.4S, v14.8H, v9.8H // ................................*...................... + smlal v16.4S, v14.4H, v9.4H // ...............................*....................... 
+ smlal v5.4S, v21.4H, v25.4H // .................................*..................... + zip1 v20.8H, v27.8H, v10.8H // ..........................................*............ + smlal2 v30.4S, v21.8H, v7.8H // ....................................*.................. + smlal v16.4S, v21.4H, v7.4H // ...................................*................... + uzp1 v7.8H, v5.8H, v8.8H // .....................................*................. + str q20, [x0], #32 // ...............................................*....... + mul v15.8H, v7.8H, v2.8H // .......................................*............... + uzp1 v7.8H, v16.8H, v30.8H // ......................................*................ + zip2 v31.8H, v27.8H, v10.8H // .........................................*............. + mul v20.8H, v7.8H, v2.8H // ........................................*.............. + smlal v5.4S, v15.4H, v0.4H // ...........................................*........... + smlal2 v8.4S, v15.8H, v0.8H // ............................................*.......... + str q31, [x0, #-16] // .................................................*..... + smlal2 v30.4S, v20.8H, v0.8H // ..............................................*........ + smlal v16.4S, v20.4H, v0.4H // .............................................*......... + uzp2 v15.8H, v5.8H, v8.8H // ................................................*...... + uzp2 v20.8H, v16.8H, v30.8H // ..................................................*.... + zip1 v7.8H, v15.8H, v20.8H // ....................................................*.. + zip2 v20.8H, v15.8H, v20.8H // ...................................................*... + str q7, [x0], #32 // .....................................................*. 
+ str q20, [x0, #-16] // ......................................................* + + // -------------------- new position --------------------> + // 0 25 50 + // |------------------------|------------------------|---- + // ldr q20, [x1], #32 // *...................................................... + // uzp1 v7.8H, v15.8H, v16.8H // .*..................................................... + // uzp2 v15.8H, v15.8H, v16.8H // ..*.................................................... + // uzp1 v8.8H, v20.8H, v30.8H // ...*................................................... + // uzp2 v20.8H, v20.8H, v30.8H // ....*.................................................. + // smull v30.4S, v8.4H, v15.4H // ............*.......................................... + // smull2 v15.4S, v8.8H, v15.8H // .......*............................................... + // smull v11.4S, v8.4H, v7.4H // ......*................................................ + // smull2 v8.4S, v8.8H, v7.8H // .....*................................................. + // smlal v30.4S, v20.4H, v7.4H // ...................*................................... + // smlal2 v15.4S, v20.8H, v7.8H // ....................*.................................. + // smlal v11.4S, v20.4H, v9.4H // ..........*............................................ + // smlal2 v8.4S, v20.8H, v9.8H // .........*............................................. + // uzp1 v7.8H, v21.8H, v6.8H // ...........*........................................... + // uzp2 v20.8H, v21.8H, v6.8H // ...............*....................................... + // uzp1 v16.8H, v1.8H, v12.8H // ........*.............................................. + // uzp2 v9.8H, v1.8H, v12.8H // ......................*................................ + // smlal v11.4S, v7.4H, v16.4H // ..............*........................................ + // smlal2 v8.4S, v7.8H, v16.8H // .............*......................................... 
+ // smlal v30.4S, v7.4H, v9.4H // .........................*............................. + // smlal2 v15.4S, v7.8H, v9.8H // ........................*.............................. + // smlal v11.4S, v20.4H, v24.4H // ..........................*............................ + // smlal2 v8.4S, v20.8H, v24.8H // .................*..................................... + // smlal v30.4S, v20.4H, v16.4H // .............................*......................... + // smlal2 v15.4S, v20.8H, v16.8H // ............................*.......................... + // uzp1 v7.8H, v19.8H, v31.8H // ..................*.................................... + // uzp2 v20.8H, v19.8H, v31.8H // .......................*............................... + // uzp1 v16.8H, v17.8H, v18.8H // ................*...................................... + // uzp2 v9.8H, v17.8H, v18.8H // ...........................*........................... + // smlal v11.4S, v7.4H, v16.4H // ..............................*........................ + // smlal2 v8.4S, v7.8H, v16.8H // .....................*................................. + // smlal v30.4S, v7.4H, v9.4H // .................................*..................... + // smlal2 v15.4S, v7.8H, v9.8H // ................................*...................... + // smlal v11.4S, v20.4H, v25.4H // ..................................*.................... + // smlal2 v8.4S, v20.8H, v25.8H // ...............................*....................... + // smlal v30.4S, v20.4H, v16.4H // .....................................*................. + // smlal2 v15.4S, v20.8H, v16.8H // ....................................*.................. + // uzp1 v7.8H, v11.8H, v8.8H // ......................................*................ + // uzp1 v20.8H, v30.8H, v15.8H // .........................................*............. + // mul v7.8H, v7.8H, v2.8H // ........................................*.............. 
+ // mul v20.8H, v20.8H, v2.8H // ...........................................*........... + // zip2 v9.8H, v27.8H, v10.8H // ..........................................*............ + // zip1 v27.8H, v27.8H, v10.8H // ...................................*................... + // smlal v11.4S, v7.4H, v0.4H // ............................................*.......... + // smlal2 v8.4S, v7.8H, v0.8H // .............................................*......... + // smlal v30.4S, v20.4H, v0.4H // ................................................*...... + // smlal2 v15.4S, v20.8H, v0.8H // ...............................................*....... + // str q27, [x0], #32 // .......................................*............... + // uzp2 v27.8H, v11.8H, v8.8H // .................................................*..... + // str q9, [x0, #-16] // ..............................................*........ + // uzp2 v10.8H, v30.8H, v15.8H // ..................................................*.... + // zip2 v9.8H, v27.8H, v10.8H // ....................................................*.. + // zip1 v27.8H, v27.8H, v10.8H // ...................................................*... + // str q27, [x0], #32 // .....................................................*. 
+ // str q9, [x0, #-16] // ......................................................* pop_stack @@ -703,17 +1001,12 @@ k3_loop_start: #endif /* MLKEM_K == 3 */ #if MLKEM_K == 4 -.global MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt) -.global _MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt) +.global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt) -MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): -_MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): +MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): push_stack - - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - ldrsh modulus, [xtmp] - dup constN.8h, modulus + ldr q_modulus, c_modulus + ldr q_modulus_twisted, c_modulus_twisted // Computed bases of vector entries @@ -727,394 +1020,561 @@ _MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): add b3_ptr, b0_ptr, #(3 * 512) add b3_cache_ptr, b0_cache_ptr, #(3 * 512/2) + // Bounds: + + // Each pmull is bound by 2*4096*2^15=2^28, so the final value + // before Montgomery reduction is bound by 2^30. + mov count, #(MLKEM_N / 16) - // Instructions: 18 - // Expected cycles: 27 - // Expected IPC: 0.67 - // - // Cycle bound: 27.0 - // IPC bound: 0.67 - // - // Wall time: 0.03s - // User time: 0.03s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ld2 {v3.8H, v4.8H}, [x1], #32 // *............................. - ld2 {v13.8H, v14.8H}, [x2], #32 // ..*........................... - ld1 {v31.8H}, [x3], #16 // ....*......................... - smull v9.4S, v3.4H, v14.4H // ......*....................... - smull2 v6.4S, v3.8H, v14.8H // .......*...................... - smull2 v29.4S, v3.8H, v13.8H // ........*..................... - smull v11.4S, v3.4H, v13.4H // .........*.................... - smlal v9.4S, v4.4H, v13.4H // ..........*................... - smlal2 v6.4S, v4.8H, v13.8H // ...........*.................. 
- smlal2 v29.4S, v4.8H, v31.8H // ............*................. - smlal v11.4S, v4.4H, v31.4H // .............*................ - ld2 {v27.8H, v28.8H}, [x7], #32 // ..............*............... - ld2 {v18.8H, v19.8H}, [x10], #32 // ................*............. - ld2 {v20.8H, v21.8H}, [x8], #32 // ..................*........... - ld2 {v25.8H, v26.8H}, [x11], #32 // ....................*......... - ld2 {v16.8H, v17.8H}, [x5], #32 // ......................*....... - ld2 {v7.8H, v8.8H}, [x1], #32 // ........................*..... - ld1 {v3.8H}, [x3], #16 // ..........................*... - - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ld2 {v7.8H, v8.8H}, [x1], #32 // *.............................. - // ld1 {v3.8H}, [x3], #16 // ....*.......................... - // ld2 {v4.8H, v5.8H}, [x2], #32 // ..*............................ - // ld2 {v27.8H, v28.8H}, [x7], #32 // ..............*................ - // ld2 {v18.8H, v19.8H}, [x10], #32 // ................*.............. - // smull v9.4S, v7.4H, v5.4H // ......*........................ - // ld2 {v20.8H, v21.8H}, [x8], #32 // ..................*............ - // smlal v9.4S, v8.4H, v4.4H // ..........*.................... - // smull2 v29.4S, v7.8H, v4.8H // ........*...................... - // smull2 v6.4S, v7.8H, v5.8H // .......*....................... - // ld2 {v25.8H, v26.8H}, [x11], #32 // ....................*.......... - // smull v11.4S, v7.4H, v4.4H // .........*..................... - // smlal2 v6.4S, v8.8H, v4.8H // ...........*................... - // smlal2 v29.4S, v8.8H, v3.8H // ............*.................. - // ld2 {v16.8H, v17.8H}, [x5], #32 // ......................*........ - // smlal v11.4S, v8.4H, v3.4H // .............*................. - // ld2 {v7.8H, v8.8H}, [x1], #32 // ........................*...... - // ld1 {v3.8H}, [x3], #16 // ..........................*.... 
+ // Instructions: 114 + // Expected cycles: 153 + // Expected IPC: 0.75 + // + // Cycle bound: 153.0 + // IPC bound: 0.75 + // + // Wall time: 0.69s + // User time: 0.69s + // + // ----------------------------------------------- original position -----------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------------- + ldr q23, [x2, #16] // .*................................................................................................................ + ldr q19, [x2], #32 // *................................................................................................................. + ldr q17, [x5], #32 // ..*............................................................................................................... + uzp2 v13.8H, v19.8H, v23.8H // ..........*....................................................................................................... + uzp1 v19.8H, v19.8H, v23.8H // ...........*...................................................................................................... + ldr q23, [x5, #-16] // ...*.............................................................................................................. + ldr q30, [x1, #16] // .....*............................................................................................................ + uzp2 v9.8H, v17.8H, v23.8H // ....*............................................................................................................. + uzp1 v23.8H, v17.8H, v23.8H // .......*.......................................................................................................... + ldr q17, [x1], #32 // ......*........................................................................................................... 
+ ldr q10, [x7, #16] // .............*.................................................................................................... + uzp1 v12.8H, v17.8H, v30.8H // ........*......................................................................................................... + uzp2 v17.8H, v17.8H, v30.8H // .........*........................................................................................................ + smull2 v30.4S, v12.8H, v13.8H // ............*..................................................................................................... + smull v13.4S, v12.4H, v13.4H // ............................................*..................................................................... + smull2 v22.4S, v12.8H, v19.8H // .....................................*............................................................................ + smull v12.4S, v12.4H, v19.4H // ..........................................*....................................................................... + smlal2 v30.4S, v17.8H, v19.8H // ...............................*.................................................................................. + smlal v13.4S, v17.4H, v19.4H // ...............................................*.................................................................. + ldr q19, [x4], #32 // ....................*............................................................................................. + ldr q16, [x4, #-16] // .....................*............................................................................................ + ld1 {v8.8H}, [x3], #16 // ................................*................................................................................. + uzp1 v26.8H, v19.8H, v16.8H // .......................*.......................................................................................... 
+ uzp2 v19.8H, v19.8H, v16.8H // ........................*......................................................................................... + smlal2 v30.4S, v26.8H, v9.8H // .................................*................................................................................ + smlal v13.4S, v26.4H, v9.4H // ..................................................*............................................................... + smlal2 v22.4S, v17.8H, v8.8H // ........................................*......................................................................... + smlal v12.4S, v17.4H, v8.4H // .................................................*................................................................ + smlal2 v30.4S, v19.8H, v23.8H // ...................................*.............................................................................. + smlal v13.4S, v19.4H, v23.4H // .......................................................*.......................................................... + smlal2 v22.4S, v26.8H, v23.8H // ...........................................*...................................................................... + smlal v12.4S, v26.4H, v23.4H // .....................................................*............................................................ + ldr q23, [x7], #32 // ......................*........................................................................................... + ldr q17, [x8, #16] // ..............*................................................................................................... + uzp1 v9.8H, v23.8H, v10.8H // ..........................*....................................................................................... + uzp2 v23.8H, v23.8H, v10.8H // ....................................*............................................................................. 
+ ldr q10, [x10], #32 // ...............*.................................................................................................. + ldr q16, [x10, #-16] // ................*................................................................................................. + ld1 {v8.8H}, [x12], #16 // .................*................................................................................................ + uzp1 v26.8H, v10.8H, v16.8H // ..................*............................................................................................... + uzp2 v10.8H, v10.8H, v16.8H // ...................*.............................................................................................. + ld1 {v16.8H}, [x6], #16 // .........................*........................................................................................ + ldr q3, [x11, #16] // ...........................*...................................................................................... + smlal2 v22.4S, v19.8H, v16.8H // ..............................................*................................................................... + smlal v12.4S, v19.4H, v16.4H // ........................................................*......................................................... + ldr q19, [x11], #32 // ............................*..................................................................................... + ld1 {v16.8H}, [x9], #16 // .............................*.................................................................................... + uzp1 v4.8H, v19.8H, v3.8H // ..................................*............................................................................... + uzp2 v19.8H, v19.8H, v3.8H // .......................................*.......................................................................... 
+ ldr q3, [x8], #32 // ..............................*................................................................................... + ldr q31, [x2], #32 // ......................................*........................................................................... + uzp1 v6.8H, v3.8H, v17.8H // ...................................................*.............................................................. + uzp2 v17.8H, v3.8H, v17.8H // .........................................................*........................................................ + smlal2 v22.4S, v9.8H, v6.8H // ..........................................................*....................................................... + smlal2 v30.4S, v9.8H, v17.8H // ...........................................................*...................................................... + smlal v13.4S, v9.4H, v17.4H // ............................................................*..................................................... + smlal v12.4S, v9.4H, v6.4H // .............................................................*.................................................... + smlal2 v22.4S, v23.8H, v16.8H // ..............................................................*................................................... + smlal2 v30.4S, v23.8H, v6.8H // ...............................................................*.................................................. + smlal v13.4S, v23.4H, v6.4H // ................................................................*................................................. + smlal v12.4S, v23.4H, v16.4H // .................................................................*................................................ + smlal2 v22.4S, v26.8H, v4.8H // ..................................................................*............................................... 
+ smlal2 v30.4S, v26.8H, v19.8H // ...................................................................*.............................................. + smlal v13.4S, v26.4H, v19.4H // ....................................................................*............................................. + smlal v12.4S, v26.4H, v4.4H // .....................................................................*............................................ + smlal2 v22.4S, v10.8H, v8.8H // ......................................................................*........................................... + smlal2 v30.4S, v10.8H, v4.8H // .......................................................................*.......................................... + smlal v13.4S, v10.4H, v4.4H // ........................................................................*......................................... + smlal v12.4S, v10.4H, v8.4H // .........................................................................*........................................ + ldr q19, [x2, #-16] // .........................................*........................................................................ + uzp1 v23.8H, v13.8H, v30.8H // ...........................................................................*...................................... + uzp1 v17.8H, v12.8H, v22.8H // ....................................................................................*............................. + mul v23.8H, v23.8H, v2.8H // .............................................................................*.................................... + uzp2 v21.8H, v31.8H, v19.8H // ................................................................................*................................. + uzp1 v19.8H, v31.8H, v19.8H // ...................................................................................*.............................. 
+ mul v17.8H, v17.8H, v2.8H // .....................................................................................*............................ + smlal v13.4S, v23.4H, v0.4H // .................................................................................*................................ + smlal2 v30.4S, v23.8H, v0.8H // ..................................................................................*............................... + ldr q23, [x5], #32 // .............................................*.................................................................... + smlal2 v22.4S, v17.8H, v0.8H // ...........................................................................................................*...... + uzp2 v15.8H, v13.8H, v30.8H // ......................................................................................*........................... + smlal v12.4S, v17.4H, v0.4H // ............................................................................................................*..... + ldr q17, [x5, #-16] // ................................................*................................................................. + ldr q13, [x1, #16] // ......................................................*........................................................... + uzp2 v27.8H, v23.8H, v17.8H // ....................................................*............................................................. + uzp1 v28.8H, v23.8H, v17.8H // ............................................................................*..................................... + uzp2 v7.8H, v12.8H, v22.8H // ...............................................................................................................*.. + ldr q23, [x1], #32 // ..........................................................................*....................................... 
+ zip1 v5.8H, v7.8H, v15.8H // .................................................................................................................* + ldr q3, [x7, #16] // ........................................................................................*......................... + uzp1 v31.8H, v23.8H, v13.8H // ..............................................................................*................................... + uzp2 v16.8H, v23.8H, v13.8H // ...............................................................................*.................................. + smull2 v24.4S, v31.8H, v21.8H // .......................................................................................*.......................... + ldr q6, [x8, #16] // .........................................................................................*........................ + ldr q23, [x10], #32 // ..........................................................................................*....................... + smlal2 v24.4S, v16.8H, v19.8H // ..........................................................................................................*....... + ldr q17, [x10, #-16] // ...........................................................................................*...................... + ld1 {v22.8H}, [x12], #16 // ............................................................................................*..................... + uzp1 v30.8H, v23.8H, v17.8H // .............................................................................................*.................... + uzp2 v11.8H, v23.8H, v17.8H // ..............................................................................................*................... + ldr q23, [x4], #32 // ...............................................................................................*.................. 
+ ldr q17, [x4, #-16] // ................................................................................................*................. + ldr q4, [x7], #32 // .................................................................................................*................ + uzp1 v20.8H, v23.8H, v17.8H // ..................................................................................................*............... + uzp2 v26.8H, v23.8H, v17.8H // ...................................................................................................*.............. + uzp1 v9.8H, v4.8H, v3.8H // .....................................................................................................*............ + smlal2 v24.4S, v20.8H, v27.8H // ..............................................................................................................*... + ld1 {v8.8H}, [x6], #16 // ....................................................................................................*............. + ldr q25, [x11, #16] // ......................................................................................................*........... + ldr q29, [x11], #32 // .......................................................................................................*.......... + ld1 {v12.8H}, [x9], #16 // ........................................................................................................*......... + uzp1 v10.8H, v29.8H, v25.8H // ................................................................................................................*. + ldr q14, [x8], #32 // .........................................................................................................*........ + ld1 {v23.8H}, [x3], #16 // .............................................................................................................*.... 
+ + // ------------------------------------------------- new position --------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------------- + // ldr q3, [x2], #32 // .*................................................................................................................ + // ldr q17, [x2, #-16] // *................................................................................................................. + // ldr q21, [x5], #32 // ..*............................................................................................................... + // ldr q19, [x5, #-16] // .....*............................................................................................................ + // uzp2 v27.8H, v21.8H, v19.8H // .......*.......................................................................................................... + // ldr q25, [x1, #16] // ......*........................................................................................................... + // ldr q22, [x1], #32 // .........*........................................................................................................ + // uzp1 v28.8H, v21.8H, v19.8H // ........*......................................................................................................... + // uzp1 v31.8H, v22.8H, v25.8H // ...........*...................................................................................................... + // uzp2 v16.8H, v22.8H, v25.8H // ............*..................................................................................................... + // uzp2 v21.8H, v3.8H, v17.8H // ...*.............................................................................................................. 
+ // uzp1 v19.8H, v3.8H, v17.8H // ....*............................................................................................................. + // smull2 v24.4S, v31.8H, v21.8H // .............*.................................................................................................... + // ldr q3, [x7, #16] // ..........*....................................................................................................... + // ldr q6, [x8, #16] // .................................*................................................................................ + // ldr q8, [x10], #32 // ....................................*............................................................................. + // ldr q26, [x10, #-16] // .....................................*............................................................................ + // ld1 {v22.8H}, [x12], #16 // ......................................*........................................................................... + // uzp1 v30.8H, v8.8H, v26.8H // .......................................*.......................................................................... + // uzp2 v11.8H, v8.8H, v26.8H // ........................................*......................................................................... + // ldr q8, [x4], #32 // ...................*.............................................................................................. + // ldr q26, [x4, #-16] // ....................*............................................................................................. + // ldr q4, [x7], #32 // ................................*................................................................................. + // uzp1 v20.8H, v8.8H, v26.8H // ......................*........................................................................................... 
+ // uzp2 v26.8H, v8.8H, v26.8H // .......................*.......................................................................................... + // ld1 {v8.8H}, [x6], #16 // .........................................*........................................................................ + // uzp1 v9.8H, v4.8H, v3.8H // ..................................*............................................................................... + // ldr q25, [x11, #16] // ..........................................*....................................................................... + // ldr q29, [x11], #32 // .............................................*.................................................................... + // ld1 {v12.8H}, [x9], #16 // ..............................................*................................................................... + // ldr q14, [x8], #32 // .................................................*................................................................ + // smlal2 v24.4S, v16.8H, v19.8H // .................*................................................................................................ + // ld1 {v23.8H}, [x3], #16 // .....................*............................................................................................ + // smlal2 v24.4S, v20.8H, v27.8H // ........................*......................................................................................... + // uzp1 v10.8H, v29.8H, v25.8H // ...............................................*.................................................................. + // smlal2 v24.4S, v26.8H, v28.8H // ............................*..................................................................................... + // uzp2 v4.8H, v4.8H, v3.8H // ...................................*.............................................................................. 
+ // smull2 v13.4S, v31.8H, v19.8H // ...............*.................................................................................................. + // ldr q3, [x2], #32 // ..................................................*............................................................... + // uzp2 v1.8H, v29.8H, v25.8H // ................................................*................................................................. + // smlal2 v13.4S, v16.8H, v23.8H // ..........................*....................................................................................... + // ldr q17, [x2, #-16] // .....................................................................*............................................ + // smull v18.4S, v31.4H, v19.4H // ................*................................................................................................. + // smlal2 v13.4S, v20.8H, v28.8H // ..............................*................................................................................... + // smull v29.4S, v31.4H, v21.4H // ..............*................................................................................................... + // ldr q21, [x5], #32 // ..............................................................................*................................... + // smlal2 v13.4S, v26.8H, v8.8H // ...........................................*...................................................................... + // smlal v29.4S, v16.4H, v19.4H // ..................*............................................................................................... + // ldr q19, [x5, #-16] // ..................................................................................*............................... + // smlal v18.4S, v16.4H, v23.4H // ...........................*...................................................................................... 
+ // smlal v29.4S, v20.4H, v27.4H // .........................*........................................................................................ + // uzp1 v31.8H, v14.8H, v6.8H // ...................................................*.............................................................. + // uzp2 v27.8H, v21.8H, v19.8H // ....................................................................................*............................. + // smlal v18.4S, v20.4H, v28.4H // ...............................*.................................................................................. + // ldr q25, [x1, #16] // ...................................................................................*.............................. + // smlal v29.4S, v26.4H, v28.4H // .............................*.................................................................................... + // smlal v18.4S, v26.4H, v8.4H // ............................................*..................................................................... + // uzp2 v26.8H, v14.8H, v6.8H // ....................................................*............................................................. + // smlal2 v13.4S, v9.8H, v31.8H // .....................................................*............................................................ + // smlal2 v24.4S, v9.8H, v26.8H // ......................................................*........................................................... + // smlal v29.4S, v9.4H, v26.4H // .......................................................*.......................................................... + // smlal v18.4S, v9.4H, v31.4H // ........................................................*......................................................... + // smlal2 v13.4S, v4.8H, v12.8H // .........................................................*........................................................ 
+ // smlal2 v24.4S, v4.8H, v31.8H // ..........................................................*....................................................... + // smlal v29.4S, v4.4H, v31.4H // ...........................................................*...................................................... + // smlal v18.4S, v4.4H, v12.4H // ............................................................*..................................................... + // smlal2 v13.4S, v30.8H, v10.8H // .............................................................*.................................................... + // smlal2 v24.4S, v30.8H, v1.8H // ..............................................................*................................................... + // smlal v29.4S, v30.4H, v1.4H // ...............................................................*.................................................. + // smlal v18.4S, v30.4H, v10.4H // ................................................................*................................................. + // smlal2 v13.4S, v11.8H, v22.8H // .................................................................*................................................ + // smlal2 v24.4S, v11.8H, v10.8H // ..................................................................*............................................... + // smlal v29.4S, v11.4H, v10.4H // ...................................................................*.............................................. + // smlal v18.4S, v11.4H, v22.4H // ....................................................................*............................................. + // ldr q22, [x1], #32 // .......................................................................................*.......................... + // uzp1 v31.8H, v29.8H, v24.8H // ......................................................................*........................................... 
+ // uzp1 v28.8H, v21.8H, v19.8H // .....................................................................................*............................ + // mul v19.8H, v31.8H, v2.8H // ........................................................................*......................................... + // uzp1 v31.8H, v22.8H, v25.8H // ..........................................................................................*....................... + // uzp2 v16.8H, v22.8H, v25.8H // ...........................................................................................*...................... + // uzp2 v21.8H, v3.8H, v17.8H // .........................................................................*........................................ + // smlal v29.4S, v19.4H, v0.4H // ............................................................................*..................................... + // smlal2 v24.4S, v19.8H, v0.8H // .............................................................................*.................................... + // uzp1 v19.8H, v3.8H, v17.8H // ..........................................................................*....................................... + // uzp1 v26.8H, v18.8H, v13.8H // .......................................................................*.......................................... + // mul v23.8H, v26.8H, v2.8H // ...........................................................................*...................................... + // uzp2 v15.8H, v29.8H, v24.8H // ................................................................................*................................. + // smull2 v24.4S, v31.8H, v21.8H // ............................................................................................*..................... + // ldr q3, [x7, #16] // .........................................................................................*........................ 
+ // ldr q6, [x8, #16] // .............................................................................................*.................... + // ldr q8, [x10], #32 // ..............................................................................................*................... + // ldr q26, [x10, #-16] // ................................................................................................*................. + // ld1 {v22.8H}, [x12], #16 // .................................................................................................*................ + // uzp1 v30.8H, v8.8H, v26.8H // ..................................................................................................*............... + // uzp2 v11.8H, v8.8H, v26.8H // ...................................................................................................*.............. + // ldr q8, [x4], #32 // ....................................................................................................*............. + // ldr q26, [x4, #-16] // .....................................................................................................*............ + // ldr q4, [x7], #32 // ......................................................................................................*........... + // uzp1 v20.8H, v8.8H, v26.8H // .......................................................................................................*.......... + // uzp2 v26.8H, v8.8H, v26.8H // ........................................................................................................*......... + // ld1 {v8.8H}, [x6], #16 // ...........................................................................................................*...... + // uzp1 v9.8H, v4.8H, v3.8H // .........................................................................................................*........ 
+ // ldr q25, [x11, #16] // ............................................................................................................*..... + // ldr q29, [x11], #32 // .............................................................................................................*.... + // ld1 {v12.8H}, [x9], #16 // ..............................................................................................................*... + // ldr q14, [x8], #32 // ................................................................................................................*. + // smlal2 v24.4S, v16.8H, v19.8H // ...............................................................................................*.................. + // smlal2 v13.4S, v23.8H, v0.8H // ...............................................................................*.................................. + // smlal v18.4S, v23.4H, v0.4H // .................................................................................*................................ + // ld1 {v23.8H}, [x3], #16 // .................................................................................................................* + // smlal2 v24.4S, v20.8H, v27.8H // ..........................................................................................................*....... + // uzp2 v7.8H, v18.8H, v13.8H // ......................................................................................*........................... + // uzp1 v10.8H, v29.8H, v25.8H // ...............................................................................................................*.. + // zip1 v5.8H, v7.8H, v15.8H // ........................................................................................*......................... 
sub count, count, #2 -k4_loop_start: - // Instructions: 55 - // Expected cycles: 68 - // Expected IPC: 0.81 - // - // Cycle bound: 68.0 - // IPC bound: 0.81 - // - // Wall time: 13.63s - // User time: 13.63s - // - // ------------------------ cycle (expected) -------------------------> - // 0 25 50 - // |------------------------|------------------------|----------------- - ld2 {v13.8H, v14.8H}, [x4], #32 // l................................................................... - ld1 {v15.8H}, [x6], #16 // ..l................................................................. - smlal2 v29.4S, v13.8H, v16.8H // ....l............................................................... - smlal v9.4S, v13.4H, v17.4H // .....l.............................................................. - smlal2 v6.4S, v13.8H, v17.8H // ......l............................................................. - smlal v11.4S, v13.4H, v16.4H // .......l............................................................ - smlal2 v29.4S, v14.8H, v15.8H // ........l........................................................... - smlal v9.4S, v14.4H, v16.4H // .........l.......................................................... - smlal2 v6.4S, v14.8H, v16.8H // ..........l......................................................... - ld1 {v30.8H}, [x9], #16 // ...........l........................................................ - smlal v9.4S, v27.4H, v21.4H // .............l...................................................... - smlal2 v6.4S, v27.8H, v21.8H // ..............l..................................................... - smlal v11.4S, v14.4H, v15.4H // ...............l.................................................... - smlal2 v29.4S, v27.8H, v20.8H // ................l................................................... - smlal v9.4S, v28.4H, v20.4H // .................l.................................................. 
- smlal2 v6.4S, v28.8H, v20.8H // ..................l................................................. - smlal v11.4S, v27.4H, v20.4H // ...................l................................................ - smlal2 v29.4S, v28.8H, v30.8H // ....................l............................................... - smlal v9.4S, v18.4H, v26.4H // .....................l.............................................. - smlal2 v6.4S, v18.8H, v26.8H // ......................l............................................. - smlal v11.4S, v28.4H, v30.4H // .......................l............................................ - smlal2 v29.4S, v18.8H, v25.8H // ........................l........................................... - smlal v9.4S, v19.4H, v25.4H // .........................l.......................................... - smlal2 v6.4S, v19.8H, v25.8H // ..........................l......................................... - ld1 {v23.8H}, [x12], #16 // ...........................l........................................ - smlal v11.4S, v18.4H, v25.4H // .............................l...................................... - uzp1 v14.8H, v9.8H, v6.8H // ..............................l..................................... - smlal2 v29.4S, v19.8H, v23.8H // ...............................l.................................... - mul v27.8H, v14.8H, v2.H[2] // ................................l................................... - smlal v11.4S, v19.4H, v23.4H // .................................l.................................. - ld2 {v4.8H, v5.8H}, [x2], #32 // ..................................*................................. - smlal v9.4S, v27.4H, v0.4H // ....................................l............................... - smlal2 v6.4S, v27.8H, v0.8H // .....................................l.............................. - ld2 {v27.8H, v28.8H}, [x7], #32 // ......................................*............................. 
- uzp1 v13.8H, v11.8H, v29.8H // ........................................l........................... - uzp2 v15.8H, v9.8H, v6.8H // .........................................l.......................... - mul v13.8H, v13.8H, v2.H[2] // ..........................................l......................... - ld2 {v18.8H, v19.8H}, [x10], #32 // ...........................................*........................ - smull v9.4S, v7.4H, v5.4H // .............................................*...................... - smlal v11.4S, v13.4H, v0.4H // ..............................................l..................... - smlal2 v29.4S, v13.8H, v0.8H // ...............................................l.................... - ld2 {v20.8H, v21.8H}, [x8], #32 // ................................................*................... - smlal v9.4S, v8.4H, v4.4H // ..................................................*................. - uzp2 v14.8H, v11.8H, v29.8H // ...................................................l................ - smull2 v29.4S, v7.8H, v4.8H // ....................................................*............... - smull2 v6.4S, v7.8H, v5.8H // .....................................................*.............. - ld2 {v25.8H, v26.8H}, [x11], #32 // ......................................................*............. - smull v11.4S, v7.4H, v4.4H // ........................................................*........... - smlal2 v6.4S, v8.8H, v4.8H // .........................................................*.......... - smlal2 v29.4S, v8.8H, v3.8H // ..........................................................*......... - ld2 {v16.8H, v17.8H}, [x5], #32 // ...........................................................*........ - smlal v11.4S, v8.4H, v3.4H // .............................................................*...... - ld2 {v7.8H, v8.8H}, [x1], #32 // ..............................................................e..... 
- ld1 {v3.8H}, [x3], #16 // ................................................................e... - st2 {v14.8H, v15.8H}, [x0], #32 // ..................................................................l. - - // ------------------------------------------------------------- cycle (expected) -------------------------------------------------------------> - // 0 25 50 75 100 125 - // |------------------------|------------------------|------------------------|------------------------|------------------------|--------------- - // ld2 {v3.8h, v4.8h}, [x1], #32 // e.....'.............................................................~.....'.............................................................~.... - // ld2 {v5.8h, v6.8h}, [x2], #32 // ......'.................................*.................................'.................................~................................ - // ld1 {v7.8h}, [x3], #16 // ..e...'...............................................................~...'...............................................................~.. - // smull v8.4s, v3.4h, v5.4h // ......'.......................................................*...........'.......................................................~.......... - // smull2 v10.4s, v3.8h, v5.8h // ......'...................................................*...............'...................................................~.............. - // smlal v8.4s, v4.4h, v7.4h // ......'............................................................*......'............................................................~..... - // smlal2 v10.4s, v4.8h, v7.8h // ......'.........................................................*.........'.........................................................~........ - // smull v9.4s, v3.4h, v6.4h // ......'............................................*......................'............................................~..................... 
- // smull2 v11.4s, v3.8h, v6.8h // ......'....................................................*..............'....................................................~............. - // smlal v9.4s, v4.4h, v5.4h // ......'.................................................*.................'.................................................~................ - // smlal2 v11.4s, v4.8h, v5.8h // ......'........................................................*..........'........................................................~......... - // ld2 {v3.8h, v4.8h}, [x4], #32 // ......~...................................................................l.................................................................. - // ld2 {v5.8h, v6.8h}, [x5], #32 // ......'..........................................................*........'..........................................................~....... - // ld1 {v7.8h}, [x6], #16 // ......'.~.................................................................'.l................................................................ - // smlal v8.4s, v3.4h, v5.4h // ......'......~............................................................'......l........................................................... - // smlal2 v10.4s, v3.8h, v5.8h // ......'...~...............................................................'...l.............................................................. - // smlal v8.4s, v4.4h, v7.4h // ......'..............~....................................................'..............l................................................... - // smlal2 v10.4s, v4.8h, v7.8h // ......'.......~...........................................................'.......l.......................................................... - // smlal v9.4s, v3.4h, v6.4h // ......'....~..............................................................'....l............................................................. 
- // smlal2 v11.4s, v3.8h, v6.8h // ......'.....~.............................................................'.....l............................................................ - // smlal v9.4s, v4.4h, v5.4h // ......'........~..........................................................'........l......................................................... - // smlal2 v11.4s, v4.8h, v5.8h // ......'.........~.........................................................'.........l........................................................ - // ld2 {v3.8h, v4.8h}, [x7], #32 // ......'.....................................*.............................'.....................................~............................ - // ld2 {v5.8h, v6.8h}, [x8], #32 // ......'...............................................*...................'...............................................~.................. - // ld1 {v7.8h}, [x9], #16 // ......'..........~........................................................'..........l....................................................... - // smlal v8.4s, v3.4h, v5.4h // ......'..................~................................................'..................l............................................... - // smlal2 v10.4s, v3.8h, v5.8h // ......'...............~...................................................'...............l.................................................. - // smlal v8.4s, v4.4h, v7.4h // ......'......................~............................................'......................l........................................... - // smlal2 v10.4s, v4.8h, v7.8h // ......'...................~...............................................'...................l.............................................. - // smlal v9.4s, v3.4h, v6.4h // ......'............~......................................................'............l..................................................... 
- // smlal2 v11.4s, v3.8h, v6.8h // ......'.............~.....................................................'.............l.................................................... - // smlal v9.4s, v4.4h, v5.4h // ......'................~..................................................'................l................................................. - // smlal2 v11.4s, v4.8h, v5.8h // ......'.................~.................................................'.................l................................................ - // ld2 {v3.8h, v4.8h}, [x10], #32 // ......'..........................................*........................'..........................................~....................... - // ld2 {v5.8h, v6.8h}, [x11], #32 // ......'.....................................................*.............'.....................................................~............ - // ld1 {v7.8h}, [x12], #16 // ......'..........................~........................................'..........................l....................................... - // smlal v8.4s, v3.4h, v5.4h // ......'............................~......................................'............................l..................................... - // smlal2 v10.4s, v3.8h, v5.8h // ......'.......................~...........................................'.......................l.......................................... - // smlal v8.4s, v4.4h, v7.4h // ......'................................~..................................'................................l................................. - // smlal2 v10.4s, v4.8h, v7.8h // ......'..............................~....................................'..............................l................................... - // smlal v9.4s, v3.4h, v6.4h // ......'....................~..............................................'....................l............................................. 
- // smlal2 v11.4s, v3.8h, v6.8h // ......'.....................~.............................................'.....................l............................................ - // smlal v9.4s, v4.4h, v5.4h // ......'........................~..........................................'........................l......................................... - // smlal2 v11.4s, v4.8h, v5.8h // ......'.........................~.........................................'.........................l........................................ - // uzp1 v28.8h, v8.8h, v10.8h // ......'.......................................~...........................'.......................................l.......................... - // mul v28.8h, v28.8h, v2.h[2] // ......'.........................................~.........................'.........................................l........................ - // smlal v8.4s, v28.4h, v0.4h // ......'.............................................~.....................'.............................................l.................... - // smlal2 v10.4s, v28.8h, v0.8h // ......'..............................................~....................'..............................................l................... - // uzp2 v26.8h, v8.8h, v10.8h // ......'..................................................~................'..................................................l............... - // uzp1 v28.8h, v9.8h, v11.8h // ......'.............................~.....................................'.............................l.................................... - // mul v28.8h, v28.8h, v2.h[2] // ......'...............................~...................................'...............................l.................................. - // smlal v9.4s, v28.4h, v0.4h // ......'...................................~...............................'...................................l.............................. 
- // smlal2 v11.4s, v28.8h, v0.8h // ......'....................................~..............................'....................................l............................. - // uzp2 v27.8h, v9.8h, v11.8h // ......'........................................~..........................'........................................l......................... - // st2 {v26.8h, v27.8h}, [x0], #32 // ....~.'.................................................................~.'.................................................................l +1: + // Instructions: 82 + // Expected cycles: 102 + // Expected IPC: 0.80 + // + // Cycle bound: 102.0 + // IPC bound: 0.80 + // + // Wall time: 15.93s + // User time: 15.93s + // + // ------------------------------- original position -------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|------ + smlal2 v24.4S, v26.8H, v28.8H // .................................*................................................ + uzp2 v4.8H, v4.8H, v3.8H // .....................................*............................................ + smull2 v13.4S, v31.8H, v19.8H // ..........*....................................................................... + ldr q3, [x2], #32 // ....e............................................................................. + uzp2 v1.8H, v29.8H, v25.8H // ..........................................................*....................... + smlal2 v13.4S, v16.8H, v23.8H // ............*..................................................................... + ldr q17, [x2, #-16] // .....e............................................................................ + smull v18.4S, v31.4H, v19.4H // .........*........................................................................ + smlal2 v13.4S, v20.8H, v28.8H // ...........................*...................................................... 
+ smull v29.4S, v31.4H, v21.4H // .............*.................................................................... + ldr q21, [x5], #32 // .....................e............................................................ + smlal2 v13.4S, v26.8H, v8.8H // .............................*.................................................... + smlal v29.4S, v16.4H, v19.4H // ...............*.................................................................. + ldr q19, [x5, #-16] // ......................e........................................................... + smlal v18.4S, v16.4H, v23.4H // ...........*...................................................................... + smlal v29.4S, v20.4H, v27.4H // ..............................*................................................... + uzp1 v31.8H, v14.8H, v6.8H // ........................................*......................................... + uzp2 v27.8H, v21.8H, v19.8H // ........................e......................................................... + smlal v18.4S, v20.4H, v28.4H // ..........................*....................................................... + ldr q25, [x1, #16] // .e................................................................................ + smlal v29.4S, v26.4H, v28.4H // ................................*................................................. + smlal v18.4S, v26.4H, v8.4H // ............................*..................................................... + uzp2 v26.8H, v14.8H, v6.8H // .........................................*........................................ + smlal2 v13.4S, v9.8H, v31.8H // ............................................*..................................... + smlal2 v24.4S, v9.8H, v26.8H // ................................................*................................. + smlal v29.4S, v9.4H, v26.4H // ...............................................*.................................. 
+ smlal v18.4S, v9.4H, v31.4H // ...........................................*...................................... + smlal2 v13.4S, v4.8H, v12.8H // ..............................................*................................... + smlal2 v24.4S, v4.8H, v31.8H // ..................................................*............................... + smlal v29.4S, v4.4H, v31.4H // .................................................*................................ + smlal v18.4S, v4.4H, v12.4H // .............................................*.................................... + smlal2 v13.4S, v30.8H, v10.8H // .............................................................*.................... + smlal2 v24.4S, v30.8H, v1.8H // .................................................................*................ + smlal v29.4S, v30.4H, v1.4H // ................................................................*................. + smlal v18.4S, v30.4H, v10.4H // ............................................................*..................... + smlal2 v13.4S, v11.8H, v22.8H // ...............................................................*.................. + smlal2 v24.4S, v11.8H, v10.8H // ...................................................................*.............. + smlal v29.4S, v11.4H, v10.4H // ..................................................................*............... + smlal v18.4S, v11.4H, v22.4H // ..............................................................*................... + ldr q22, [x1], #32 // e................................................................................. + uzp1 v31.8H, v29.8H, v24.8H // .........................................................................*........ + uzp1 v28.8H, v21.8H, v19.8H // .......................e.......................................................... + mul v19.8H, v31.8H, v2.8H // ..........................................................................*....... 
+ uzp1 v31.8H, v22.8H, v25.8H // ..e............................................................................... + uzp2 v16.8H, v22.8H, v25.8H // ...e.............................................................................. + uzp2 v21.8H, v3.8H, v17.8H // .......e.......................................................................... + smlal v29.4S, v19.4H, v0.4H // ...........................................................................*...... + smlal2 v24.4S, v19.8H, v0.8H // ............................................................................*..... + uzp1 v19.8H, v3.8H, v17.8H // ......e........................................................................... + uzp1 v26.8H, v18.8H, v13.8H // ....................................................................*............. + zip2 v14.8H, v7.8H, v15.8H // ...............................................................................l.. + mul v23.8H, v26.8H, v2.8H // .....................................................................*............ + uzp2 v15.8H, v29.8H, v24.8H // .............................................................................*.... + smull2 v24.4S, v31.8H, v21.8H // ..............e................................................................... + str q14, [x0, #16] // .................................................................................l + ldr q3, [x7, #16] // ...................................e.............................................. + ldr q6, [x8, #16] // .......................................e.......................................... + ldr q8, [x10], #32 // ...................................................e.............................. + ldr q26, [x10, #-16] // ....................................................e............................. + ld1 {v22.8H}, [x12], #16 // ...........................................................e...................... 
+ uzp1 v30.8H, v8.8H, v26.8H // .....................................................e............................ + uzp2 v11.8H, v8.8H, v26.8H // ......................................................e........................... + ldr q8, [x4], #32 // .................e................................................................ + ldr q26, [x4, #-16] // ..................e............................................................... + ldr q4, [x7], #32 // ..................................e............................................... + uzp1 v20.8H, v8.8H, v26.8H // ...................e.............................................................. + uzp2 v26.8H, v8.8H, v26.8H // ....................e............................................................. + ld1 {v8.8H}, [x6], #16 // .........................e........................................................ + uzp1 v9.8H, v4.8H, v3.8H // ....................................e............................................. + ldr q25, [x11, #16] // ........................................................e......................... + ldr q29, [x11], #32 // .......................................................e.......................... + ld1 {v12.8H}, [x9], #16 // ..........................................e....................................... + ldr q14, [x8], #32 // ......................................e........................................... + smlal2 v24.4S, v16.8H, v19.8H // ................e................................................................. + smlal2 v13.4S, v23.8H, v0.8H // .......................................................................*.......... + smlal v18.4S, v23.4H, v0.4H // ......................................................................*........... + ld1 {v23.8H}, [x3], #16 // ........e......................................................................... 
+ smlal2 v24.4S, v20.8H, v27.8H // ...............................e.................................................. + uzp2 v7.8H, v18.8H, v13.8H // ........................................................................*......... + uzp1 v10.8H, v29.8H, v25.8H // .........................................................e........................ + str q5, [x0], #32 // ................................................................................l. + zip1 v5.8H, v7.8H, v15.8H // ..............................................................................*... + + // ----------------------------------------------------------------------------------------------------------------- new position ------------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---------------- + // ldr q12, [x1], #32 // ....................................e..........................................'......................................~..........................................'......................................~......................................... + // ldr q13, [x1, #-16] // ................e..............................................................'..................~..............................................................'..................~............................................................. + // uzp1 v3.8h, v12.8h, v13.8h // ........................................e......................................'..........................................~......................................'..........................................~..................................... 
+ // uzp2 v4.8h, v12.8h, v13.8h // .........................................e.....................................'...........................................~.....................................'...........................................~.................................... + // ldr q12, [x2], #32 // e..............................................................................'..~..............................................................................'..~............................................................................. + // ldr q13, [x2, #-16] // ...e...........................................................................'.....~...........................................................................'.....~.......................................................................... + // uzp1 v5.8h, v12.8h, v13.8h // .............................................e.................................'...............................................~.................................'...............................................~................................ + // uzp2 v6.8h, v12.8h, v13.8h // ..........................................e....................................'............................................~....................................'............................................~................................... + // ld1 {v7.8h}, [x3], #16 // .........................................................................e.....'...........................................................................~.....'...........................................................................~.... + // smull v8.4s, v3.4h, v5.4h // ....~..........................................................................'......*..........................................................................'......~......................................................................... 
+ // smull2 v10.4s, v3.8h, v5.8h // ...............................................................................'.*...............................................................................'.~.............................................................................. + // smlal v8.4s, v4.4h, v7.4h // ...........~...................................................................'.............*...................................................................'.............~.................................................................. + // smlal2 v10.4s, v4.8h, v7.8h // ..~............................................................................'....*............................................................................'....~........................................................................... + // smull v9.4s, v3.4h, v6.4h // ......~........................................................................'........*........................................................................'........~....................................................................... + // smull2 v11.4s, v3.8h, v6.8h // ..................................................e............................'....................................................~............................'....................................................~........................... + // smlal v9.4s, v4.4h, v5.4h // .........~.....................................................................'...........*.....................................................................'...........~.................................................................... + // smlal2 v11.4s, v4.8h, v5.8h // ......................................................................e........'........................................................................~........'........................................................................~....... 
+ // ldr q12, [x4], #32 // ...........................................................e...................'.............................................................~...................'.............................................................~.................. + // ldr q13, [x4, #-16] // ............................................................e..................'..............................................................~..................'..............................................................~................. + // uzp1 v3.8h, v12.8h, v13.8h // ..............................................................e................'................................................................~................'................................................................~............... + // uzp2 v4.8h, v12.8h, v13.8h // ...............................................................e...............'.................................................................~...............'.................................................................~.............. + // ldr q12, [x5], #32 // .......e.......................................................................'.........~.......................................................................'.........~...................................................................... + // ldr q13, [x5, #-16] // ..........e....................................................................'............~....................................................................'............~................................................................... + // uzp1 v5.8h, v12.8h, v13.8h // ......................................e........................................'........................................~........................................'........................................~....................................... 
+ // uzp2 v6.8h, v12.8h, v13.8h // ..............e................................................................'................~................................................................'................~............................................................... + // ld1 {v7.8h}, [x6], #16 // ................................................................e..............'..................................................................~..............'..................................................................~............. + // smlal v8.4s, v3.4h, v5.4h // ...............~...............................................................'.................*...............................................................'.................~.............................................................. + // smlal2 v10.4s, v3.8h, v5.8h // .....~.........................................................................'.......*.........................................................................'.......~........................................................................ + // smlal v8.4s, v4.4h, v7.4h // ..................~............................................................'....................*............................................................'....................~........................................................... + // smlal2 v10.4s, v4.8h, v7.8h // ........~......................................................................'..........*......................................................................'..........~..................................................................... + // smlal v9.4s, v3.4h, v6.4h // ............~..................................................................'..............*..................................................................'..............~................................................................. 
+ // smlal2 v11.4s, v3.8h, v6.8h // ..........................................................................e....'............................................................................~....'............................................................................~... + // smlal v9.4s, v4.4h, v5.4h // .................~.............................................................'...................*.............................................................'...................~............................................................ + // smlal2 v11.4s, v4.8h, v5.8h // ...............................................................................*.................................................................................~................................................................................ + // ldr q12, [x7], #32 // .............................................................e.................'...............................................................~.................'...............................................................~................ + // ldr q13, [x7, #-16] // ....................................................e..........................'......................................................~..........................'......................................................~......................... + // uzp1 v3.8h, v12.8h, v13.8h // .................................................................e.............'...................................................................~.............'...................................................................~............ + // uzp2 v4.8h, v12.8h, v13.8h // ...............................................................................'*................................................................................'~............................................................................... 
+ // ldr q12, [x8], #32 // .....................................................................e.........'.......................................................................~.........'.......................................................................~........ + // ldr q13, [x8, #-16] // .....................................................e.........................'.......................................................~.........................'.......................................................~........................ + // uzp1 v5.8h, v12.8h, v13.8h // .............~.................................................................'...............*.................................................................'...............~................................................................ + // uzp2 v6.8h, v12.8h, v13.8h // ...................~...........................................................'.....................*...........................................................'.....................~.......................................................... + // ld1 {v7.8h}, [x9], #16 // ....................................................................e..........'......................................................................~..........'......................................................................~......... + // smlal v8.4s, v3.4h, v5.4h // .......................~.......................................................'.........................*.......................................................'.........................~...................................................... + // smlal2 v10.4s, v3.8h, v5.8h // ....................~..........................................................'......................*..........................................................'......................~......................................................... 
+ // smlal v8.4s, v4.4h, v7.4h // ...........................~...................................................'.............................*...................................................'.............................~.................................................. + // smlal2 v10.4s, v4.8h, v7.8h // ........................~......................................................'..........................*......................................................'..........................~..................................................... + // smlal v9.4s, v3.4h, v6.4h // ......................~........................................................'........................*........................................................'........................~....................................................... + // smlal2 v11.4s, v3.8h, v6.8h // .....................~.........................................................'.......................*.........................................................'.......................~........................................................ + // smlal v9.4s, v4.4h, v5.4h // ..........................~....................................................'............................*....................................................'............................~................................................... + // smlal2 v11.4s, v4.8h, v5.8h // .........................~.....................................................'...........................*.....................................................'...........................~.................................................... + // ldr q12, [x10], #32 // ......................................................e........................'........................................................~........................'........................................................~....................... 
+ // ldr q13, [x10, #-16] // .......................................................e.......................'.........................................................~.......................'.........................................................~...................... + // uzp1 v3.8h, v12.8h, v13.8h // .........................................................e.....................'...........................................................~.....................'...........................................................~.................... + // uzp2 v4.8h, v12.8h, v13.8h // ..........................................................e....................'............................................................~....................'............................................................~................... + // ldr q12, [x11], #32 // ...................................................................e...........'.....................................................................~...........'.....................................................................~.......... + // ldr q13, [x11, #-16] // ..................................................................e............'....................................................................~............'....................................................................~........... + // uzp1 v5.8h, v12.8h, v13.8h // ............................................................................e..'..............................................................................~..'..............................................................................~. + // uzp2 v6.8h, v12.8h, v13.8h // .~.............................................................................'...*.............................................................................'...~............................................................................ 
+ // ld1 {v7.8h}, [x12], #16 // ........................................................e......................'..........................................................~......................'..........................................................~..................... + // smlal v8.4s, v3.4h, v5.4h // ...............................~...............................................'.................................*...............................................'.................................~.............................................. + // smlal2 v10.4s, v3.8h, v5.8h // ............................~..................................................'..............................*..................................................'..............................~................................................. + // smlal v8.4s, v4.4h, v7.4h // ...................................~...........................................'.....................................*...........................................'.....................................~.......................................... + // smlal2 v10.4s, v4.8h, v7.8h // ................................~..............................................'..................................*..............................................'..................................~............................................. + // smlal v9.4s, v3.4h, v6.4h // ..............................~................................................'................................*................................................'................................~............................................... + // smlal2 v11.4s, v3.8h, v6.8h // .............................~.................................................'...............................*.................................................'...............................~................................................ 
+ // smlal v9.4s, v4.4h, v5.4h // ..................................~............................................'....................................*............................................'....................................~........................................... + // smlal2 v11.4s, v4.8h, v5.8h // .................................~.............................................'...................................*.............................................'...................................~............................................ + // uzp1 v28.8h, v8.8h, v10.8h // ..............................................~................................'................................................*................................'................................................~............................... + // mul v28.8h, v28.8h, v2.8h // ................................................~..............................'..................................................*..............................'..................................................~............................. + // smlal v8.4s, v28.4h, v0.4h // ........................................................................~......'..........................................................................*......'..........................................................................~..... + // smlal2 v10.4s, v28.8h, v0.8h // .......................................................................~.......'.........................................................................*.......'.........................................................................~...... + // uzp2 v26.8h, v8.8h, v10.8h // ...........................................................................~...'.............................................................................*...'.............................................................................~.. 
+ // uzp1 v28.8h, v9.8h, v11.8h // .....................................~.........................................'.......................................*.........................................'.......................................~........................................ + // mul v28.8h, v28.8h, v2.8h // .......................................~.......................................'.........................................*.......................................'.........................................~...................................... + // smlal v9.4s, v28.4h, v0.4h // ...........................................~...................................'.............................................*...................................'.............................................~.................................. + // smlal2 v11.4s, v28.8h, v0.8h // ............................................~..................................'..............................................*..................................'..............................................~................................. + // uzp2 v27.8h, v9.8h, v11.8h // .................................................~.............................'...................................................*.............................'...................................................~............................ + // zip1 v12.8h, v26.8h, v27.8h // ..............................................................................~'................................................................................*'................................................................................ + // zip2 v13.8h, v26.8h, v27.8h // ...............................................~...............................'.................................................~...............................'.................................................l.............................. 
+ // str q12, [x0], #32 // .............................................................................~.'...............................................................................~.'...............................................................................l + // str q13, [x0, #-16] // ...................................................~...........................'.....................................................~...........................'.....................................................l.......................... sub count, count, #1 - cbnz count, k4_loop_start - // Instructions: 92 - // Expected cycles: 107 - // Expected IPC: 0.86 - // - // Cycle bound: 107.0 - // IPC bound: 0.86 - // - // Wall time: 15.26s - // User time: 15.26s - // - // -------------------------------------------- cycle (expected) --------------------------------------------> - // 0 25 50 75 100 - // |------------------------|------------------------|------------------------|------------------------|------ - ld2 {v14.8H, v15.8H}, [x4], #32 // *.......................................................................................................... - ld1 {v1.8H}, [x6], #16 // ..*........................................................................................................ - smlal2 v29.4S, v14.8H, v16.8H // ....*...................................................................................................... - smlal v11.4S, v14.4H, v16.4H // .....*..................................................................................................... - ld2 {v12.8H, v13.8H}, [x5], #32 // ......*.................................................................................................... - smlal2 v29.4S, v15.8H, v1.8H // ........*.................................................................................................. 
- smlal v11.4S, v15.4H, v1.4H // .........*................................................................................................. - ld1 {v10.8H}, [x9], #16 // ..........*................................................................................................ - smlal2 v29.4S, v27.8H, v20.8H // ............*.............................................................................................. - smlal v11.4S, v27.4H, v20.4H // .............*............................................................................................. - ld2 {v22.8H, v23.8H}, [x2], #32 // ..............*............................................................................................ - smlal2 v29.4S, v28.8H, v10.8H // ................*.......................................................................................... - smlal v11.4S, v28.4H, v10.4H // .................*......................................................................................... - ld1 {v31.8H}, [x12], #16 // ..................*........................................................................................ - smlal2 v29.4S, v18.8H, v25.8H // ....................*...................................................................................... - smlal v11.4S, v18.4H, v25.4H // .....................*..................................................................................... - smull2 v1.4S, v7.8H, v23.8H // ......................*.................................................................................... - smlal v9.4S, v14.4H, v17.4H // .......................*................................................................................... - smlal2 v29.4S, v19.8H, v31.8H // ........................*.................................................................................. - smlal v11.4S, v19.4H, v31.4H // .........................*................................................................................. 
- smull v31.4S, v7.4H, v23.4H // ..........................*................................................................................ - smlal v9.4S, v15.4H, v16.4H // ...........................*............................................................................... - smull v10.4S, v7.4H, v22.4H // ............................*.............................................................................. - smlal2 v1.4S, v8.8H, v22.8H // .............................*............................................................................. - smlal v31.4S, v8.4H, v22.4H // ..............................*............................................................................ - ld2 {v4.8H, v5.8H}, [x4], #32 // ...............................*........................................................................... - ld2 {v23.8H, v24.8H}, [x11], #32 // .................................*......................................................................... - smlal2 v1.4S, v4.8H, v13.8H // ...................................*....................................................................... - smlal v31.4S, v4.4H, v13.4H // ....................................*...................................................................... - ld1 {v30.8H}, [x6], #16 // .....................................*..................................................................... - smlal2 v1.4S, v5.8H, v12.8H // .......................................*................................................................... - smlal v10.4S, v8.4H, v3.4H // ........................................*.................................................................. - smull2 v7.4S, v7.8H, v22.8H // .........................................*................................................................. - uzp1 v22.8H, v11.8H, v29.8H // ..........................................*................................................................ 
- smlal2 v6.4S, v14.8H, v17.8H // ...........................................*............................................................... - smlal v10.4S, v4.4H, v12.4H // ............................................*.............................................................. - smlal2 v7.4S, v8.8H, v3.8H // .............................................*............................................................. - ld2 {v13.8H, v14.8H}, [x7], #32 // ..............................................*............................................................ - mul v17.8H, v22.8H, v2.H[2] // ................................................*.......................................................... - smlal2 v7.4S, v4.8H, v12.8H // .................................................*......................................................... - smlal v10.4S, v5.4H, v30.4H // ..................................................*........................................................ - smlal2 v6.4S, v15.8H, v16.8H // ...................................................*....................................................... - smlal v11.4S, v17.4H, v0.4H // ....................................................*...................................................... - ld2 {v3.8H, v4.8H}, [x8], #32 // .....................................................*..................................................... - smlal2 v7.4S, v5.8H, v30.8H // .......................................................*................................................... - ld1 {v30.8H}, [x9], #16 // ........................................................*.................................................. - smlal v10.4S, v13.4H, v3.4H // ..........................................................*................................................ - smlal2 v7.4S, v13.8H, v3.8H // ...........................................................*............................................... 
- ld2 {v15.8H, v16.8H}, [x10], #32 // ............................................................*.............................................. - smlal v10.4S, v14.4H, v30.4H // ..............................................................*............................................ - smlal v9.4S, v27.4H, v21.4H // ...............................................................*........................................... - smlal2 v7.4S, v14.8H, v30.8H // ................................................................*.......................................... - smlal2 v6.4S, v27.8H, v21.8H // .................................................................*......................................... - smlal v10.4S, v15.4H, v23.4H // ..................................................................*........................................ - ld1 {v27.8H}, [x12], #16 // ...................................................................*....................................... - smlal2 v7.4S, v15.8H, v23.8H // .....................................................................*..................................... - smlal v31.4S, v5.4H, v12.4H // ......................................................................*.................................... - smlal2 v6.4S, v28.8H, v20.8H // .......................................................................*................................... - smlal v10.4S, v16.4H, v27.4H // ........................................................................*.................................. - smlal2 v7.4S, v16.8H, v27.8H // .........................................................................*................................. - smlal v31.4S, v13.4H, v4.4H // ..........................................................................*................................ - smlal2 v6.4S, v18.8H, v26.8H // ...........................................................................*............................... 
- smlal2 v1.4S, v13.8H, v4.8H // ............................................................................*.............................. - smlal v9.4S, v28.4H, v20.4H // .............................................................................*............................. - smlal v31.4S, v14.4H, v3.4H // ..............................................................................*............................ - uzp1 v13.8H, v10.8H, v7.8H // ...............................................................................*........................... - smlal2 v1.4S, v14.8H, v3.8H // ................................................................................*.......................... - smlal v9.4S, v18.4H, v26.4H // .................................................................................*......................... - smlal v31.4S, v15.4H, v24.4H // ..................................................................................*........................ - smlal2 v6.4S, v19.8H, v25.8H // ...................................................................................*....................... - smlal2 v1.4S, v15.8H, v24.8H // ....................................................................................*...................... - smlal v9.4S, v19.4H, v25.4H // .....................................................................................*..................... - smlal v31.4S, v16.4H, v23.4H // ......................................................................................*.................... - mul v13.8H, v13.8H, v2.H[2] // .......................................................................................*................... - smlal2 v1.4S, v16.8H, v23.8H // ........................................................................................*.................. - uzp1 v23.8H, v9.8H, v6.8H // .........................................................................................*................. 
- smlal2 v29.4S, v17.8H, v0.8H // ..........................................................................................*................ - mul v12.8H, v23.8H, v2.H[2] // ...........................................................................................*............... - uzp1 v4.8H, v31.8H, v1.8H // ............................................................................................*.............. - smlal v10.4S, v13.4H, v0.4H // .............................................................................................*............. - mul v23.8H, v4.8H, v2.H[2] // ..............................................................................................*............ - smlal v9.4S, v12.4H, v0.4H // ...............................................................................................*........... - smlal2 v7.4S, v13.8H, v0.8H // ................................................................................................*.......... - smlal2 v6.4S, v12.8H, v0.8H // .................................................................................................*......... - smlal v31.4S, v23.4H, v0.4H // ..................................................................................................*........ - smlal2 v1.4S, v23.8H, v0.8H // ...................................................................................................*....... - uzp2 v12.8H, v10.8H, v7.8H // ....................................................................................................*...... - uzp2 v25.8H, v9.8H, v6.8H // .....................................................................................................*..... - uzp2 v24.8H, v11.8H, v29.8H // ......................................................................................................*.... - uzp2 v13.8H, v31.8H, v1.8H // .......................................................................................................*... 
- st2 {v24.8H, v25.8H}, [x0], #32 // ........................................................................................................*.. - st2 {v12.8H, v13.8H}, [x0], #32 // ..........................................................................................................* - - // -------------------------------------------- cycle (expected) --------------------------------------------> - // 0 25 50 75 100 - // |------------------------|------------------------|------------------------|------------------------|------ - // ld2 {v13.8H, v14.8H}, [x4], #32 // *.......................................................................................................... - // ld1 {v15.8H}, [x6], #16 // ..*........................................................................................................ - // smlal2 v29.4S, v13.8H, v16.8H // ....*...................................................................................................... - // smlal v9.4S, v13.4H, v17.4H // .......................*................................................................................... - // smlal2 v6.4S, v13.8H, v17.8H // ...........................................*............................................................... - // smlal v11.4S, v13.4H, v16.4H // .....*..................................................................................................... - // smlal2 v29.4S, v14.8H, v15.8H // ........*.................................................................................................. - // smlal v9.4S, v14.4H, v16.4H // ...........................*............................................................................... - // smlal2 v6.4S, v14.8H, v16.8H // ...................................................*....................................................... - // ld1 {v30.8H}, [x9], #16 // ..........*................................................................................................ 
- // smlal v9.4S, v27.4H, v21.4H // ...............................................................*........................................... - // smlal2 v6.4S, v27.8H, v21.8H // .................................................................*......................................... - // smlal v11.4S, v14.4H, v15.4H // .........*................................................................................................. - // smlal2 v29.4S, v27.8H, v20.8H // ............*.............................................................................................. - // smlal v9.4S, v28.4H, v20.4H // .............................................................................*............................. - // smlal2 v6.4S, v28.8H, v20.8H // .......................................................................*................................... - // smlal v11.4S, v27.4H, v20.4H // .............*............................................................................................. - // smlal2 v29.4S, v28.8H, v30.8H // ................*.......................................................................................... - // smlal v9.4S, v18.4H, v26.4H // .................................................................................*......................... - // smlal2 v6.4S, v18.8H, v26.8H // ...........................................................................*............................... - // smlal v11.4S, v28.4H, v30.4H // .................*......................................................................................... - // smlal2 v29.4S, v18.8H, v25.8H // ....................*...................................................................................... - // smlal v9.4S, v19.4H, v25.4H // .....................................................................................*..................... 
- // smlal2 v6.4S, v19.8H, v25.8H // ...................................................................................*....................... - // ld1 {v23.8H}, [x12], #16 // ..................*........................................................................................ - // smlal v11.4S, v18.4H, v25.4H // .....................*..................................................................................... - // uzp1 v14.8H, v9.8H, v6.8H // .........................................................................................*................. - // smlal2 v29.4S, v19.8H, v23.8H // ........................*.................................................................................. - // mul v27.8H, v14.8H, v2.H[2] // ...........................................................................................*............... - // smlal v11.4S, v19.4H, v23.4H // .........................*................................................................................. - // ld2 {v4.8H, v5.8H}, [x2], #32 // ..............*............................................................................................ - // smlal v9.4S, v27.4H, v0.4H // ...............................................................................................*........... - // smlal2 v6.4S, v27.8H, v0.8H // .................................................................................................*......... - // ld2 {v27.8H, v28.8H}, [x7], #32 // ..............................................*............................................................ - // uzp1 v13.8H, v11.8H, v29.8H // ..........................................*................................................................ - // uzp2 v15.8H, v9.8H, v6.8H // .....................................................................................................*..... 
- // mul v13.8H, v13.8H, v2.H[2] // ................................................*.......................................................... - // ld2 {v18.8H, v19.8H}, [x10], #32 // ............................................................*.............................................. - // smull v9.4S, v7.4H, v5.4H // ..........................*................................................................................ - // smlal v11.4S, v13.4H, v0.4H // ....................................................*...................................................... - // smlal2 v29.4S, v13.8H, v0.8H // ..........................................................................................*................ - // ld2 {v20.8H, v21.8H}, [x8], #32 // .....................................................*..................................................... - // smlal v9.4S, v8.4H, v4.4H // ..............................*............................................................................ - // uzp2 v14.8H, v11.8H, v29.8H // ......................................................................................................*.... - // smull2 v29.4S, v7.8H, v4.8H // .........................................*................................................................. - // smull2 v6.4S, v7.8H, v5.8H // ......................*.................................................................................... - // ld2 {v25.8H, v26.8H}, [x11], #32 // .................................*......................................................................... - // smull v11.4S, v7.4H, v4.4H // ............................*.............................................................................. - // smlal2 v6.4S, v8.8H, v4.8H // .............................*............................................................................. 
- // smlal2 v29.4S, v8.8H, v3.8H // .............................................*............................................................. - // ld2 {v16.8H, v17.8H}, [x5], #32 // ......*.................................................................................................... - // smlal v11.4S, v8.4H, v3.4H // ........................................*.................................................................. - // st2 {v14.8H, v15.8H}, [x0], #32 // ........................................................................................................*.. - // ld2 {v13.8H, v14.8H}, [x4], #32 // ...............................*........................................................................... - // ld1 {v15.8H}, [x6], #16 // .....................................*..................................................................... - // smlal2 v29.4S, v13.8H, v16.8H // .................................................*......................................................... - // smlal v9.4S, v13.4H, v17.4H // ....................................*...................................................................... - // smlal2 v6.4S, v13.8H, v17.8H // ...................................*....................................................................... - // smlal v11.4S, v13.4H, v16.4H // ............................................*.............................................................. - // smlal2 v29.4S, v14.8H, v15.8H // .......................................................*................................................... - // smlal v9.4S, v14.4H, v16.4H // ......................................................................*.................................... - // smlal2 v6.4S, v14.8H, v16.8H // .......................................*................................................................... 
- // ld1 {v30.8H}, [x9], #16 // ........................................................*.................................................. - // smlal v9.4S, v27.4H, v21.4H // ..........................................................................*................................ - // smlal2 v6.4S, v27.8H, v21.8H // ............................................................................*.............................. - // smlal v11.4S, v14.4H, v15.4H // ..................................................*........................................................ - // smlal2 v29.4S, v27.8H, v20.8H // ...........................................................*............................................... - // smlal v9.4S, v28.4H, v20.4H // ..............................................................................*............................ - // smlal2 v6.4S, v28.8H, v20.8H // ................................................................................*.......................... - // smlal v11.4S, v27.4H, v20.4H // ..........................................................*................................................ - // smlal2 v29.4S, v28.8H, v30.8H // ................................................................*.......................................... - // smlal v9.4S, v18.4H, v26.4H // ..................................................................................*........................ - // smlal2 v6.4S, v18.8H, v26.8H // ....................................................................................*...................... - // smlal v11.4S, v28.4H, v30.4H // ..............................................................*............................................ - // smlal2 v29.4S, v18.8H, v25.8H // .....................................................................*..................................... 
- // smlal v9.4S, v19.4H, v25.4H // ......................................................................................*.................... - // smlal2 v6.4S, v19.8H, v25.8H // ........................................................................................*.................. - // ld1 {v23.8H}, [x12], #16 // ...................................................................*....................................... - // smlal v11.4S, v18.4H, v25.4H // ..................................................................*........................................ - // uzp1 v14.8H, v9.8H, v6.8H // ............................................................................................*.............. - // smlal2 v29.4S, v19.8H, v23.8H // .........................................................................*................................. - // mul v27.8H, v14.8H, v2.H[2] // ..............................................................................................*............ - // smlal v11.4S, v19.4H, v23.4H // ........................................................................*.................................. - // smlal v9.4S, v27.4H, v0.4H // ..................................................................................................*........ - // smlal2 v6.4S, v27.8H, v0.8H // ...................................................................................................*....... - // uzp1 v13.8H, v11.8H, v29.8H // ...............................................................................*........................... - // uzp2 v15.8H, v9.8H, v6.8H // .......................................................................................................*... - // mul v13.8H, v13.8H, v2.H[2] // .......................................................................................*................... 
- // smlal v11.4S, v13.4H, v0.4H // .............................................................................................*............. - // smlal2 v29.4S, v13.8H, v0.8H // ................................................................................................*.......... - // uzp2 v14.8H, v11.8H, v29.8H // ....................................................................................................*...... - // st2 {v14.8H, v15.8H}, [x0], #32 // ..........................................................................................................* + cbnz count, 1b + // Instructions: 50 + // Expected cycles: 56 + // Expected IPC: 0.89 + // + // Cycle bound: 56.0 + // IPC bound: 0.89 + // + // Wall time: 4.16s + // User time: 4.16s + // + // --------------- original position ---------------> + // 0 25 + // |------------------------| + smull2 v17.4S, v31.8H, v19.8H // ..*............................................... + uzp2 v1.8H, v14.8H, v6.8H // ................*................................. + smull v18.4S, v31.4H, v21.4H // .......*.......................................... + smlal2 v24.4S, v26.8H, v28.8H // *................................................. + smlal2 v17.4S, v16.8H, v23.8H // ....*............................................. + smull v21.4S, v31.4H, v19.4H // .....*............................................ + smlal v18.4S, v16.4H, v19.4H // .........*........................................ + uzp2 v31.8H, v4.8H, v3.8H // .*................................................ + uzp1 v3.8H, v14.8H, v6.8H // ............*..................................... + smlal v21.4S, v16.4H, v23.4H // ..........*....................................... + smlal v18.4S, v20.4H, v27.4H // ...........*...................................... + uzp2 v14.8H, v29.8H, v25.8H // ...*.............................................. + smlal2 v17.4S, v20.8H, v28.8H // ......*........................................... 
+ smlal v21.4S, v20.4H, v28.4H // .............*.................................... + smlal v18.4S, v26.4H, v28.4H // ..............*................................... + smlal2 v24.4S, v9.8H, v1.8H // ..................*............................... + smlal2 v17.4S, v26.8H, v8.8H // ........*......................................... + smlal v21.4S, v26.4H, v8.4H // ...............*.................................. + smlal v18.4S, v9.4H, v1.4H // ...................*.............................. + smlal2 v24.4S, v31.8H, v3.8H // ......................*........................... + smlal2 v17.4S, v9.8H, v3.8H // .................*................................ + smlal v21.4S, v9.4H, v3.4H // ....................*............................. + smlal v18.4S, v31.4H, v3.4H // .......................*.......................... + smlal2 v24.4S, v30.8H, v14.8H // ..........................*....................... + smlal2 v17.4S, v31.8H, v12.8H // .....................*............................ + smlal v21.4S, v31.4H, v12.4H // ........................*......................... + smlal v18.4S, v30.4H, v14.4H // ...........................*...................... + smlal2 v24.4S, v11.8H, v10.8H // ..............................*................... + smlal2 v17.4S, v30.8H, v10.8H // .........................*........................ + smlal v21.4S, v30.4H, v10.4H // ............................*..................... + smlal v18.4S, v11.4H, v10.4H // ...............................*.................. + zip2 v19.8H, v7.8H, v15.8H // ......................................*........... + smlal2 v17.4S, v11.8H, v22.8H // .............................*.................... + smlal v21.4S, v11.4H, v22.4H // ................................*................. + uzp1 v23.8H, v18.8H, v24.8H // .................................*................ + str q19, [x0, #16] // .........................................*........ 
+ mul v19.8H, v23.8H, v2.8H // ..................................*............... + uzp1 v23.8H, v21.8H, v17.8H // .....................................*............ + str q5, [x0], #32 // .............................................*.... + mul v26.8H, v23.8H, v2.8H // .......................................*.......... + smlal v18.4S, v19.4H, v0.4H // ...................................*.............. + smlal2 v24.4S, v19.8H, v0.8H // ....................................*............. + smlal v21.4S, v26.4H, v0.4H // ...........................................*...... + smlal2 v17.4S, v26.8H, v0.8H // ..........................................*....... + uzp2 v13.8H, v18.8H, v24.8H // ........................................*......... + uzp2 v19.8H, v21.8H, v17.8H // ............................................*..... + zip1 v23.8H, v19.8H, v13.8H // ..............................................*... + zip2 v19.8H, v19.8H, v13.8H // ...............................................*.. + str q23, [x0], #32 // .................................................* + str q19, [x0, #-16] // ................................................*. + + // ----------------- new position ------------------> + // 0 25 + // |------------------------|------------------------ + // smlal2 v24.4S, v26.8H, v28.8H // ...*.............................................. + // uzp2 v4.8H, v4.8H, v3.8H // .......*.......................................... + // smull2 v13.4S, v31.8H, v19.8H // *................................................. + // uzp2 v1.8H, v29.8H, v25.8H // ...........*...................................... + // smlal2 v13.4S, v16.8H, v23.8H // ....*............................................. + // smull v18.4S, v31.4H, v19.4H // .....*............................................ + // smlal2 v13.4S, v20.8H, v28.8H // ............*..................................... + // smull v29.4S, v31.4H, v21.4H // ..*............................................... 
+ // smlal2 v13.4S, v26.8H, v8.8H // ................*................................. + // smlal v29.4S, v16.4H, v19.4H // ......*........................................... + // smlal v18.4S, v16.4H, v23.4H // .........*........................................ + // smlal v29.4S, v20.4H, v27.4H // ..........*....................................... + // uzp1 v31.8H, v14.8H, v6.8H // ........*......................................... + // smlal v18.4S, v20.4H, v28.4H // .............*.................................... + // smlal v29.4S, v26.4H, v28.4H // ..............*................................... + // smlal v18.4S, v26.4H, v8.4H // .................*................................ + // uzp2 v26.8H, v14.8H, v6.8H // .*................................................ + // smlal2 v13.4S, v9.8H, v31.8H // ....................*............................. + // smlal2 v24.4S, v9.8H, v26.8H // ...............*.................................. + // smlal v29.4S, v9.4H, v26.4H // ..................*............................... + // smlal v18.4S, v9.4H, v31.4H // .....................*............................ + // smlal2 v13.4S, v4.8H, v12.8H // ........................*......................... + // smlal2 v24.4S, v4.8H, v31.8H // ...................*.............................. + // smlal v29.4S, v4.4H, v31.4H // ......................*........................... + // smlal v18.4S, v4.4H, v12.4H // .........................*........................ + // smlal2 v13.4S, v30.8H, v10.8H // ............................*..................... + // smlal2 v24.4S, v30.8H, v1.8H // .......................*.......................... + // smlal v29.4S, v30.4H, v1.4H // ..........................*....................... + // smlal v18.4S, v30.4H, v10.4H // .............................*.................... + // smlal2 v13.4S, v11.8H, v22.8H // ................................*................. 
+ // smlal2 v24.4S, v11.8H, v10.8H // ...........................*...................... + // smlal v29.4S, v11.4H, v10.4H // ..............................*................... + // smlal v18.4S, v11.4H, v22.4H // .................................*................ + // uzp1 v31.8H, v29.8H, v24.8H // ..................................*............... + // mul v19.8H, v31.8H, v2.8H // ....................................*............. + // smlal v29.4S, v19.4H, v0.4H // ........................................*......... + // smlal2 v24.4S, v19.8H, v0.8H // .........................................*........ + // uzp1 v26.8H, v18.8H, v13.8H // .....................................*............ + // zip2 v14.8H, v7.8H, v15.8H // ...............................*.................. + // mul v23.8H, v26.8H, v2.8H // .......................................*.......... + // uzp2 v15.8H, v29.8H, v24.8H // ............................................*..... + // str q14, [x0, #16] // ...................................*.............. + // smlal2 v13.4S, v23.8H, v0.8H // ...........................................*...... + // smlal v18.4S, v23.4H, v0.4H // ..........................................*....... + // uzp2 v7.8H, v18.8H, v13.8H // .............................................*.... + // str q5, [x0], #32 // ......................................*........... + // zip1 v5.8H, v7.8H, v15.8H // ..............................................*... + // zip2 v14.8H, v7.8H, v15.8H // ...............................................*.. + // str q14, [x0, #16] // .................................................* + // str q5, [x0], #32 // ................................................*. 
pop_stack diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/rej_uniform_asm_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/rej_uniform_asm_clean.S index c51e53188..722dc0f49 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/rej_uniform_asm_clean.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/aarch64/src/rej_uniform_asm_clean.S @@ -22,272 +22,6 @@ #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \ defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) -// Needed to provide ASM_LOAD directive -#include "common.i" -#include "params.h" - -.data -table_data: - .byte -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 0 - .byte 0, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 1 - .byte 2, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 2 - .byte 0, 1, 2, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 3 - .byte 4, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 4 - .byte 0, 1, 4, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 5 - .byte 2, 3, 4, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 6 - .byte 0, 1, 2, 3, 4, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 7 - .byte 6, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 8 - .byte 0, 1, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 9 - .byte 2, 3, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 10 - .byte 0, 1, 2, 3, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 11 - .byte 4, 5, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 12 - .byte 0, 1, 4, 5, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 13 - .byte 2, 3, 4, 5, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 14 - .byte 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1 // 15 - .byte 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 16 - .byte 0, 1, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 17 - .byte 2, 3, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, 
-1, -1, -1 // 18 - .byte 0, 1, 2, 3, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 19 - .byte 4, 5, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 20 - .byte 0, 1, 4, 5, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 21 - .byte 2, 3, 4, 5, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 22 - .byte 0, 1, 2, 3, 4, 5, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1 // 23 - .byte 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 24 - .byte 0, 1, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 25 - .byte 2, 3, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 26 - .byte 0, 1, 2, 3, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1 // 27 - .byte 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 28 - .byte 0, 1, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1 // 29 - .byte 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1 // 30 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1 // 31 - .byte 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 32 - .byte 0, 1, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 33 - .byte 2, 3, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 34 - .byte 0, 1, 2, 3, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 35 - .byte 4, 5, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 36 - .byte 0, 1, 4, 5, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 37 - .byte 2, 3, 4, 5, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 38 - .byte 0, 1, 2, 3, 4, 5, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1 // 39 - .byte 6, 7, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 40 - .byte 0, 1, 6, 7, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 41 - .byte 2, 3, 6, 7, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 42 - .byte 0, 1, 2, 3, 6, 7, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1 // 43 - .byte 4, 5, 6, 7, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 44 - .byte 0, 1, 4, 5, 6, 7, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1 // 45 - .byte 2, 3, 4, 5, 6, 7, 10, 11, -1, 
-1, -1, -1, -1, -1, -1, -1 // 46 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, -1, -1, -1, -1, -1, -1 // 47 - .byte 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 48 - .byte 0, 1, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 49 - .byte 2, 3, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 50 - .byte 0, 1, 2, 3, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1 // 51 - .byte 4, 5, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 52 - .byte 0, 1, 4, 5, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1 // 53 - .byte 2, 3, 4, 5, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1 // 54 - .byte 0, 1, 2, 3, 4, 5, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1 // 55 - .byte 6, 7, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 56 - .byte 0, 1, 6, 7, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1 // 57 - .byte 2, 3, 6, 7, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1 // 58 - .byte 0, 1, 2, 3, 6, 7, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1 // 59 - .byte 4, 5, 6, 7, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1 // 60 - .byte 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1 // 61 - .byte 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1 // 62 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1, -1, -1 // 63 - .byte 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 64 - .byte 0, 1, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 65 - .byte 2, 3, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 66 - .byte 0, 1, 2, 3, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 67 - .byte 4, 5, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 68 - .byte 0, 1, 4, 5, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 69 - .byte 2, 3, 4, 5, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 70 - .byte 0, 1, 2, 3, 4, 5, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 71 - .byte 6, 7, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 72 - .byte 0, 1, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 73 - .byte 2, 3, 6, 7, 
12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 74 - .byte 0, 1, 2, 3, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 75 - .byte 4, 5, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 76 - .byte 0, 1, 4, 5, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 77 - .byte 2, 3, 4, 5, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 78 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1 // 79 - .byte 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 80 - .byte 0, 1, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 81 - .byte 2, 3, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 82 - .byte 0, 1, 2, 3, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 83 - .byte 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 84 - .byte 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 85 - .byte 2, 3, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 86 - .byte 0, 1, 2, 3, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1 // 87 - .byte 6, 7, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 88 - .byte 0, 1, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 89 - .byte 2, 3, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 90 - .byte 0, 1, 2, 3, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1 // 91 - .byte 4, 5, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 92 - .byte 0, 1, 4, 5, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1 // 93 - .byte 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1 // 94 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1 // 95 - .byte 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 96 - .byte 0, 1, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 97 - .byte 2, 3, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 98 - .byte 0, 1, 2, 3, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 99 - .byte 4, 5, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 100 - .byte 0, 1, 4, 5, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 101 - 
.byte 2, 3, 4, 5, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 102 - .byte 0, 1, 2, 3, 4, 5, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1 // 103 - .byte 6, 7, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 104 - .byte 0, 1, 6, 7, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 105 - .byte 2, 3, 6, 7, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 106 - .byte 0, 1, 2, 3, 6, 7, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1 // 107 - .byte 4, 5, 6, 7, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 108 - .byte 0, 1, 4, 5, 6, 7, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1 // 109 - .byte 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1 // 110 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, -1, -1, -1, -1 // 111 - .byte 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 112 - .byte 0, 1, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 113 - .byte 2, 3, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 114 - .byte 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1 // 115 - .byte 4, 5, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 116 - .byte 0, 1, 4, 5, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1 // 117 - .byte 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1 // 118 - .byte 0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1 // 119 - .byte 6, 7, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 120 - .byte 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1 // 121 - .byte 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1 // 122 - .byte 0, 1, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1 // 123 - .byte 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1 // 124 - .byte 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1 // 125 - .byte 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1 // 126 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1 // 127 - .byte 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 128 - .byte 0, 1, 14, 15, -1, -1, -1, -1, -1, 
-1, -1, -1, -1, -1, -1, -1 // 129 - .byte 2, 3, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 130 - .byte 0, 1, 2, 3, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 131 - .byte 4, 5, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 132 - .byte 0, 1, 4, 5, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 133 - .byte 2, 3, 4, 5, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 134 - .byte 0, 1, 2, 3, 4, 5, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 135 - .byte 6, 7, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 136 - .byte 0, 1, 6, 7, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 137 - .byte 2, 3, 6, 7, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 138 - .byte 0, 1, 2, 3, 6, 7, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 139 - .byte 4, 5, 6, 7, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 140 - .byte 0, 1, 4, 5, 6, 7, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 141 - .byte 2, 3, 4, 5, 6, 7, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 142 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 14, 15, -1, -1, -1, -1, -1, -1 // 143 - .byte 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 144 - .byte 0, 1, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 145 - .byte 2, 3, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 146 - .byte 0, 1, 2, 3, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 147 - .byte 4, 5, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 148 - .byte 0, 1, 4, 5, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 149 - .byte 2, 3, 4, 5, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 150 - .byte 0, 1, 2, 3, 4, 5, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1 // 151 - .byte 6, 7, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 152 - .byte 0, 1, 6, 7, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 153 - .byte 2, 3, 6, 7, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 154 - .byte 0, 1, 2, 3, 6, 7, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1 // 155 - .byte 4, 5, 6, 7, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, 
-1 // 156 - .byte 0, 1, 4, 5, 6, 7, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1 // 157 - .byte 2, 3, 4, 5, 6, 7, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1 // 158 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 14, 15, -1, -1, -1, -1 // 159 - .byte 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 160 - .byte 0, 1, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 161 - .byte 2, 3, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 162 - .byte 0, 1, 2, 3, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 163 - .byte 4, 5, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 164 - .byte 0, 1, 4, 5, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 165 - .byte 2, 3, 4, 5, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 166 - .byte 0, 1, 2, 3, 4, 5, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1 // 167 - .byte 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 168 - .byte 0, 1, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 169 - .byte 2, 3, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 170 - .byte 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1 // 171 - .byte 4, 5, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 172 - .byte 0, 1, 4, 5, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1 // 173 - .byte 2, 3, 4, 5, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1 // 174 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1 // 175 - .byte 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 176 - .byte 0, 1, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 177 - .byte 2, 3, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 178 - .byte 0, 1, 2, 3, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1 // 179 - .byte 4, 5, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 180 - .byte 0, 1, 4, 5, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1 // 181 - .byte 2, 3, 4, 5, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1 // 182 - .byte 0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1 // 183 - .byte 6, 7, 
8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 184 - .byte 0, 1, 6, 7, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1 // 185 - .byte 2, 3, 6, 7, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1 // 186 - .byte 0, 1, 2, 3, 6, 7, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1 // 187 - .byte 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1 // 188 - .byte 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1 // 189 - .byte 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1 // 190 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, -1, -1 // 191 - .byte 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 192 - .byte 0, 1, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 193 - .byte 2, 3, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 194 - .byte 0, 1, 2, 3, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 195 - .byte 4, 5, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 196 - .byte 0, 1, 4, 5, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 197 - .byte 2, 3, 4, 5, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 198 - .byte 0, 1, 2, 3, 4, 5, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 199 - .byte 6, 7, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 200 - .byte 0, 1, 6, 7, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 201 - .byte 2, 3, 6, 7, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 202 - .byte 0, 1, 2, 3, 6, 7, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 203 - .byte 4, 5, 6, 7, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 204 - .byte 0, 1, 4, 5, 6, 7, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 205 - .byte 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 206 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, -1, -1, -1, -1 // 207 - .byte 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 208 - .byte 0, 1, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 209 - .byte 2, 3, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 210 - .byte 0, 1, 2, 3, 8, 9, 12, 13, 14, 
15, -1, -1, -1, -1, -1, -1 // 211 - .byte 4, 5, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 212 - .byte 0, 1, 4, 5, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 213 - .byte 2, 3, 4, 5, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 214 - .byte 0, 1, 2, 3, 4, 5, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1 // 215 - .byte 6, 7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 216 - .byte 0, 1, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 217 - .byte 2, 3, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 218 - .byte 0, 1, 2, 3, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1 // 219 - .byte 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 220 - .byte 0, 1, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1 // 221 - .byte 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1 // 222 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1 // 223 - .byte 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 224 - .byte 0, 1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 225 - .byte 2, 3, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 226 - .byte 0, 1, 2, 3, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 227 - .byte 4, 5, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 228 - .byte 0, 1, 4, 5, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 229 - .byte 2, 3, 4, 5, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 230 - .byte 0, 1, 2, 3, 4, 5, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1 // 231 - .byte 6, 7, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 232 - .byte 0, 1, 6, 7, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 233 - .byte 2, 3, 6, 7, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 234 - .byte 0, 1, 2, 3, 6, 7, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1 // 235 - .byte 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 236 - .byte 0, 1, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1 // 237 - .byte 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1 // 
238 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, -1, -1 // 239 - .byte 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 240 - .byte 0, 1, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 241 - .byte 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 242 - .byte 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1 // 243 - .byte 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 244 - .byte 0, 1, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1 // 245 - .byte 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1 // 246 - .byte 0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1 // 247 - .byte 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 248 - .byte 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1 // 249 - .byte 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1 // 250 - .byte 0, 1, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1 // 251 - .byte 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1 // 252 - .byte 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1 // 253 - .byte 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1 // 254 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 // 255 - -bit_table_data: - .short 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 - // We save the output on the stack first, and copy to the actual // output buffer only in the end. This is because the main loop can overwrite // by up to 62 bytes, which we account for here (we use 64 bytes for alignment). 
@@ -306,11 +40,9 @@ bit_table_data: output .req x0 buf .req x1 buflen .req w2 - len .req w3 + table_idx .req x3 - /* Pointers to static data */ - table_idx .req x5 - bit_table .req x6 + len .req w4 /* Temporary output on the stack */ output_tmp .req x7 @@ -381,17 +113,17 @@ bit_table_data: bits_q .req q31 .text +/* Literal pool */ +.p2align 4 +c_bit_table: + .short 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 + .align 4 -.global MLKEM_NAMESPACE(rej_uniform_asm_clean) -.global _MLKEM_NAMESPACE(rej_uniform_asm_clean) -MLKEM_NAMESPACE(rej_uniform_asm_clean): -_MLKEM_NAMESPACE(rej_uniform_asm_clean): +.global MLKEM_ASM_NAMESPACE(rej_uniform_asm_clean) +MLKEM_ASM_NAMESPACE(rej_uniform_asm_clean): push_stack - ASM_LOAD(bit_table, bit_table_data) - ASM_LOAD(table_idx, table_data) - - ldr bits_q, [bit_table] + ldr bits_q, c_bit_table movz tmp, #MLKEM_Q dup mlkem_q.8h, tmp diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/common.h index 3469d2739..8177b0b50 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/common.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/common.h @@ -26,4 +26,17 @@ * since the backend choice may be part of the namespace. */ #include "namespace.h" +/* On Apple platforms, we need to emit leading underscore + * in front of assembly symbols. We thus introduce a separate + * namespace wrapper for ASM symbols.
*/ +#if !defined(__APPLE__) +#define MLKEM_ASM_NAMESPACE(sym) MLKEM_NAMESPACE(sym) +#define FIPS202_ASM_NAMESPACE(sym) FIPS202_NAMESPACE(sym) +#else +#define _PREFIX_UNDERSCORE(sym) _##sym +#define PREFIX_UNDERSCORE(sym) _PREFIX_UNDERSCORE(sym) +#define MLKEM_ASM_NAMESPACE(sym) PREFIX_UNDERSCORE(MLKEM_NAMESPACE(sym)) +#define FIPS202_ASM_NAMESPACE(sym) PREFIX_UNDERSCORE(FIPS202_NAMESPACE(sym)) +#endif + #endif /* MLKEM_NATIVE_COMMON_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/config.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/config.h index ac27408eb..31040a471 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/config.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/config.h @@ -39,23 +39,19 @@ /****************************************************************************** * Name: MLKEM_NAMESPACE - * _MLKEM_NAMESPACE * * Description: The macros to use to namespace global symbols * from mlkem/. *****************************************************************************/ #define MLKEM_NAMESPACE(sym) MLKEM_DEFAULT_NAMESPACE(sym) -#define _MLKEM_NAMESPACE(sym) _MLKEM_DEFAULT_NAMESPACE(sym) /****************************************************************************** * Name: FIPS202_NAMESPACE - * _FIPS202_NAMESPACE * * Description: The macros to use to namespace global symbols * from mlkem/fips202/. 
*****************************************************************************/ #define FIPS202_NAMESPACE(sym) FIPS202_DEFAULT_NAMESPACE(sym) -#define _FIPS202_NAMESPACE(sym) _FIPS202_DEFAULT_NAMESPACE(sym) /****************************************************************************** * Name: MLKEM_USE_NATIVE diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/zetas.c index f52b2ff5a..8ab46ce0b 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/zetas.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/zetas.c @@ -14,7 +14,7 @@ * Table of zeta values used in the reference NTT and inverse NTT. * See autogenerate_files.py for details. */ -const int16_t zetas[128] = { +ALIGN const int16_t zetas[128] = { -1044, -758, -359, -1517, 1493, 1422, 287, 202, -171, 622, 1577, 182, 962, -1202, -1474, 1468, 573, -1325, 264, 383, -829, 1458, -1602, -130, -681, 1017, 732, 608, -1542, 411, -205, -1571, 1223, diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/common.h index 3469d2739..8177b0b50 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/common.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/common.h @@ -26,4 +26,17 @@ * since the backend choice may be part of the namespace. */ #include "namespace.h" +/* On Apple platforms, we need to emit leading underscore + * in front of assembly symbols. We thus introduce a separate + * namespace wrapper for ASM symbols.
*/ +#if !defined(__APPLE__) +#define MLKEM_ASM_NAMESPACE(sym) MLKEM_NAMESPACE(sym) +#define FIPS202_ASM_NAMESPACE(sym) FIPS202_NAMESPACE(sym) +#else +#define _PREFIX_UNDERSCORE(sym) _##sym +#define PREFIX_UNDERSCORE(sym) _PREFIX_UNDERSCORE(sym) +#define MLKEM_ASM_NAMESPACE(sym) PREFIX_UNDERSCORE(MLKEM_NAMESPACE(sym)) +#define FIPS202_ASM_NAMESPACE(sym) PREFIX_UNDERSCORE(FIPS202_NAMESPACE(sym)) +#endif + #endif /* MLKEM_NATIVE_COMMON_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/config.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/config.h index ac27408eb..31040a471 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/config.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/config.h @@ -39,23 +39,19 @@ /****************************************************************************** * Name: MLKEM_NAMESPACE - * _MLKEM_NAMESPACE * * Description: The macros to use to namespace global symbols * from mlkem/. *****************************************************************************/ #define MLKEM_NAMESPACE(sym) MLKEM_DEFAULT_NAMESPACE(sym) -#define _MLKEM_NAMESPACE(sym) _MLKEM_DEFAULT_NAMESPACE(sym) /****************************************************************************** * Name: FIPS202_NAMESPACE - * _FIPS202_NAMESPACE * * Description: The macros to use to namespace global symbols * from mlkem/fips202/. 
*****************************************************************************/ #define FIPS202_NAMESPACE(sym) FIPS202_DEFAULT_NAMESPACE(sym) -#define _FIPS202_NAMESPACE(sym) _FIPS202_DEFAULT_NAMESPACE(sym) /****************************************************************************** * Name: MLKEM_USE_NATIVE diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/zetas.c index f52b2ff5a..8ab46ce0b 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/zetas.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/zetas.c @@ -14,7 +14,7 @@ * Table of zeta values used in the reference NTT and inverse NTT. * See autogenerate_files.py for details. */ -const int16_t zetas[128] = { +ALIGN const int16_t zetas[128] = { -1044, -758, -359, -1517, 1493, 1422, 287, 202, -171, 622, 1577, 182, 962, -1202, -1474, 1468, 573, -1325, 264, 383, -829, 1458, -1602, -130, -681, 1017, 732, 608, -1542, 411, -205, -1571, 1223, diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/common.h index 3469d2739..8177b0b50 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/common.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/common.h @@ -26,4 +26,17 @@ * since the backend choice may be part of the namespace. */ #include "namespace.h" +/* On Apple platforms, we need to emit leading underscore + * in front of assembly symbols. We thus introduce a separate + * namespace wrapper for ASM symbols.
*/ +#if !defined(__APPLE__) +#define MLKEM_ASM_NAMESPACE(sym) MLKEM_NAMESPACE(sym) +#define FIPS202_ASM_NAMESPACE(sym) FIPS202_NAMESPACE(sym) +#else +#define _PREFIX_UNDERSCORE(sym) _##sym +#define PREFIX_UNDERSCORE(sym) _PREFIX_UNDERSCORE(sym) +#define MLKEM_ASM_NAMESPACE(sym) PREFIX_UNDERSCORE(MLKEM_NAMESPACE(sym)) +#define FIPS202_ASM_NAMESPACE(sym) PREFIX_UNDERSCORE(FIPS202_NAMESPACE(sym)) +#endif + #endif /* MLKEM_NATIVE_COMMON_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/config.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/config.h index ac27408eb..31040a471 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/config.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/config.h @@ -39,23 +39,19 @@ /****************************************************************************** * Name: MLKEM_NAMESPACE - * _MLKEM_NAMESPACE * * Description: The macros to use to namespace global symbols * from mlkem/. *****************************************************************************/ #define MLKEM_NAMESPACE(sym) MLKEM_DEFAULT_NAMESPACE(sym) -#define _MLKEM_NAMESPACE(sym) _MLKEM_DEFAULT_NAMESPACE(sym) /****************************************************************************** * Name: FIPS202_NAMESPACE - * _FIPS202_NAMESPACE * * Description: The macros to use to namespace global symbols * from mlkem/fips202/. 
*****************************************************************************/ #define FIPS202_NAMESPACE(sym) FIPS202_DEFAULT_NAMESPACE(sym) -#define _FIPS202_NAMESPACE(sym) _FIPS202_DEFAULT_NAMESPACE(sym) /****************************************************************************** * Name: MLKEM_USE_NATIVE diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/basemul.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/basemul.S index 503fbeb51..c51606b3f 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/basemul.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/basemul.S @@ -113,8 +113,8 @@ vmovdqa %ymm11,(64*\off+48)*2(%rdi) .endm .text -.global MLKEM_NAMESPACE(basemul_avx2) -MLKEM_NAMESPACE(basemul_avx2): +.global MLKEM_ASM_NAMESPACE(basemul_avx2) +MLKEM_ASM_NAMESPACE(basemul_avx2): mov %rsp,%r8 and $-32,%rsp sub $32,%rsp diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/fq.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/fq.S index 50ef190b7..61560fb46 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/fq.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/fq.S @@ -60,8 +60,8 @@ vmovdqa %ymm9,224(%rdi) ret -.global MLKEM_NAMESPACE(reduce_avx2) -MLKEM_NAMESPACE(reduce_avx2): +.global MLKEM_ASM_NAMESPACE(reduce_avx2) +MLKEM_ASM_NAMESPACE(reduce_avx2): #consts vmovdqa _16XQ*2(%rsi),%ymm0 vmovdqa _16XV*2(%rsi),%ymm1 @@ -103,8 +103,8 @@ vmovdqa %ymm10,224(%rdi) ret -.global MLKEM_NAMESPACE(tomont_avx2) -MLKEM_NAMESPACE(tomont_avx2): +.global MLKEM_ASM_NAMESPACE(tomont_avx2) +MLKEM_ASM_NAMESPACE(tomont_avx2): #consts vmovdqa _16XQ*2(%rsi),%ymm0 vmovdqa _16XMONTSQLO*2(%rsi),%ymm1 diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/intt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/intt.S index 4860985ed..fda9ec0d3 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/intt.S +++ 
b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/intt.S @@ -242,8 +242,8 @@ vmovdqa %ymm11,(64*\off+176)*2(%rdi) .endm .text -.global MLKEM_NAMESPACE(invntt_avx2) -MLKEM_NAMESPACE(invntt_avx2): +.global MLKEM_ASM_NAMESPACE(invntt_avx2) +MLKEM_ASM_NAMESPACE(invntt_avx2): vmovdqa _16XQ*2(%rsi),%ymm0 intt_levels0t5 0 diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/ntt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/ntt.S index a0b6f734c..dc3776b37 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/ntt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/ntt.S @@ -206,8 +206,8 @@ vmovdqa %ymm11,(128*\off+112)*2(%rdi) .endm .text -.global MLKEM_NAMESPACE(ntt_avx2) -MLKEM_NAMESPACE(ntt_avx2): +.global MLKEM_ASM_NAMESPACE(ntt_avx2) +MLKEM_ASM_NAMESPACE(ntt_avx2): vmovdqa _16XQ*2(%rsi),%ymm0 level0 0 diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/shuffle.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/shuffle.S index 34f6b30b0..0d23c11b3 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/shuffle.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/x86_64/src/shuffle.S @@ -15,8 +15,8 @@ #include "fq.inc" #include "shuffle.inc" -.global MLKEM_NAMESPACE(nttpack_avx2) -MLKEM_NAMESPACE(nttpack_avx2): +.global MLKEM_ASM_NAMESPACE(nttpack_avx2) +MLKEM_ASM_NAMESPACE(nttpack_avx2): #load vmovdqa (%rdi),%ymm4 vmovdqa 32(%rdi),%ymm5 @@ -102,8 +102,8 @@ vmovdqa %ymm11,224(%rdi) ret -.global MLKEM_NAMESPACE(nttunpack_avx2) -MLKEM_NAMESPACE(nttunpack_avx2): +.global MLKEM_ASM_NAMESPACE(nttunpack_avx2) +MLKEM_ASM_NAMESPACE(nttunpack_avx2): call nttunpack128_avx2 add $256,%rdi call nttunpack128_avx2 @@ -169,8 +169,8 @@ vmovdqu %ymm9,160(%rdi) ret -.global MLKEM_NAMESPACE(ntttobytes_avx2) -MLKEM_NAMESPACE(ntttobytes_avx2): +.global MLKEM_ASM_NAMESPACE(ntttobytes_avx2) +MLKEM_ASM_NAMESPACE(ntttobytes_avx2): #consts vmovdqa _16XQ*2(%rdx),%ymm0 call 
ntttobytes128_avx @@ -245,8 +245,8 @@ vmovdqa %ymm1,224(%rdi) ret -.global MLKEM_NAMESPACE(nttfrombytes_avx2) -MLKEM_NAMESPACE(nttfrombytes_avx2): +.global MLKEM_ASM_NAMESPACE(nttfrombytes_avx2) +MLKEM_ASM_NAMESPACE(nttfrombytes_avx2): #consts vmovdqa _16XMASK*2(%rdx),%ymm0 call nttfrombytes128_avx diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/zetas.c index f52b2ff5a..8ab46ce0b 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/zetas.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/zetas.c @@ -14,7 +14,7 @@ * Table of zeta values used in the reference NTT and inverse NTT. * See autogenerate_files.py for details. */ -const int16_t zetas[128] = { +ALIGN const int16_t zetas[128] = { -1044, -758, -359, -1517, 1493, 1422, 287, 202, -171, 622, 1577, 182, 962, -1202, -1474, 1468, 573, -1325, 264, 383, -829, 1458, -1602, -130, -681, 1017, 732, 608, -1542, 411, -205, -1571, 1223, diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/aarch64_zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/aarch64_zetas.c index ecf1b529a..b638e2081 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/aarch64_zetas.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/aarch64_zetas.c @@ -20,7 +20,7 @@ * Table of zeta values used in the AArch64 forward NTT * See autogenerate_files.py for details. 
*/ -const int16_t aarch64_ntt_zetas_layer01234[] = { +ALIGN const int16_t aarch64_ntt_zetas_layer01234[] = { -1600, -15749, -749, -7373, -40, -394, -687, -6762, 630, 6201, -1432, -14095, 848, 8347, 0, 0, 1062, 10453, 296, 2914, -882, -8682, 0, 0, -1410, -13879, 1339, 13180, 1476, 14529, @@ -31,7 +31,7 @@ const int16_t aarch64_ntt_zetas_layer01234[] = { 0, 0, -1583, -15582, -1355, -13338, 821, 8081, 0, 0, }; -const int16_t aarch64_ntt_zetas_layer56[] = { +ALIGN const int16_t aarch64_ntt_zetas_layer56[] = { 289, 289, 331, 331, -76, -76, -1573, -1573, 2845, 2845, 3258, 3258, -748, -748, -15483, -15483, 17, 17, 583, 583, 1637, 1637, -1041, -1041, 167, 167, 5739, @@ -77,7 +77,7 @@ const int16_t aarch64_ntt_zetas_layer56[] = { 10129, 10129, -3878, -3878, -11566, -11566, }; -const int16_t aarch64_invntt_zetas_layer01234[] = { +ALIGN const int16_t aarch64_invntt_zetas_layer01234[] = { 1583, 15582, -821, -8081, 1355, 13338, 0, 0, -569, -5601, 450, 4429, 936, 9213, 0, 0, 69, 679, 447, 4400, -535, -5266, 0, 0, 543, 5345, 1235, 12156, -1426, -14036, @@ -88,7 +88,7 @@ const int16_t aarch64_invntt_zetas_layer01234[] = { -848, -8347, 1432, 14095, -630, -6201, 687, 6762, 0, 0, }; -const int16_t aarch64_invntt_zetas_layer56[] = { +ALIGN const int16_t aarch64_invntt_zetas_layer56[] = { -910, -910, -1227, -1227, 219, 219, 855, 855, -8957, -8957, -12078, -12078, 2156, 2156, 8416, 8416, 1175, 1175, 394, 394, -1029, -1029, -1212, -1212, 11566, 11566, 3878, @@ -134,7 +134,7 @@ const int16_t aarch64_invntt_zetas_layer56[] = { -16113, -16113, -5739, -5739, -167, -167, }; -const int16_t aarch64_zetas_mulcache_native[] = { +ALIGN const int16_t aarch64_zetas_mulcache_native[] = { 17, -17, -568, 568, 583, -583, -680, 680, 1637, -1637, 723, -723, -1041, 1041, 1100, -1100, 1409, -1409, -667, 667, -48, 48, 233, -233, 756, -756, -1173, 1173, -314, 314, -279, 279, -1626, @@ -149,7 +149,7 @@ const int16_t aarch64_zetas_mulcache_native[] = { 1219, -394, 394, 885, -885, -1175, 1175, }; -const int16_t 
aarch64_zetas_mulcache_twisted_native[] = { +ALIGN const int16_t aarch64_zetas_mulcache_twisted_native[] = { 167, -167, -5591, 5591, 5739, -5739, -6693, 6693, 16113, -16113, 7117, -7117, -10247, 10247, 10828, -10828, 13869, -13869, -6565, 6565, -472, 472, 2293, -2293, 7441, -7441, -11546, @@ -168,7 +168,6 @@ const int16_t aarch64_zetas_mulcache_twisted_native[] = { }; #else -#include "params.h" /* Dummy declaration for compilers disliking empty compilation units */ #define empty_cu_aarch64_zetas MLKEM_NAMESPACE(empty_cu_aarch64_zetas) diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/arith_native_aarch64.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/arith_native_aarch64.h index 2f3b0ef4f..6a5ee8a7d 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/arith_native_aarch64.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/arith_native_aarch64.h @@ -19,6 +19,7 @@ MLKEM_NAMESPACE(aarch64_zetas_mulcache_native) #define aarch64_zetas_mulcache_twisted_native \ MLKEM_NAMESPACE(aarch64_zetas_mulcache_twisted_native) +#define rej_uniform_table MLKEM_NAMESPACE(rej_uniform_table) extern const int16_t aarch64_ntt_zetas_layer01234[]; extern const int16_t aarch64_ntt_zetas_layer56[]; @@ -26,6 +27,7 @@ extern const int16_t aarch64_invntt_zetas_layer01234[]; extern const int16_t aarch64_invntt_zetas_layer56[]; extern const int16_t aarch64_zetas_mulcache_native[]; extern const int16_t aarch64_zetas_mulcache_twisted_native[]; +extern const uint8_t rej_uniform_table[]; #define ntt_asm_clean MLKEM_NAMESPACE(ntt_asm_clean) void ntt_asm_clean(int16_t *, const int16_t *, const int16_t *); @@ -41,7 +43,7 @@ void intt_asm_opt(int16_t *, const int16_t *, const int16_t *); #define rej_uniform_asm_clean MLKEM_NAMESPACE(rej_uniform_asm_clean) unsigned int rej_uniform_asm_clean(int16_t *r, const uint8_t *buf, - unsigned int buflen); + unsigned int buflen, const uint8_t *table); #define poly_reduce_asm_clean 
MLKEM_NAMESPACE(poly_reduce_asm_clean) void poly_reduce_asm_clean(int16_t *); diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/clean_impl.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/clean_impl.h index c91a7eedd..b0ff3d597 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/clean_impl.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/clean_impl.h @@ -74,7 +74,7 @@ static INLINE int rej_uniform_native(int16_t *r, unsigned int len, { return -1; } - return (int)rej_uniform_asm_clean(r, buf, buflen); + return (int)rej_uniform_asm_clean(r, buf, buflen, rej_uniform_table); } #endif /* MLKEM_NATIVE_ARITH_PROFILE_IMPL_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/common.i b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/common.i deleted file mode 100644 index 6b8880311..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/common.i +++ /dev/null @@ -1,13 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: MIT - */ - -#if __APPLE__ -#define ASM_LOAD(dst, symbol) \ - adrp dst, symbol @PAGE %% add dst, dst, symbol @PAGEOFF -#else /* __APPLE__ */ -#define ASM_LOAD(dst, symbol) \ - adrp dst, symbol; \ - add dst, dst, : lo12 : symbol; -#endif /* __APPLE__ */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/intt_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/intt_clean.S index 2f05d8cca..623a82ae9 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/intt_clean.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/intt_clean.S @@ -26,10 +26,6 @@ #include "common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) -// Needed to provide ASM_LOAD directive -#include "common.i" -#include "params.h" - // Bounds: // If C is chosen so that |src| < q * C, then |dst| < q * (0.0508 * C + 1/2) // @@ -145,38 +141,7 @@ .text - 
.global MLKEM_NAMESPACE(intt_asm_clean) - .global _MLKEM_NAMESPACE(intt_asm_clean) - -.p2align 4 -const_addr: .short 3329 - .short 20159 - .short 0 - .short 0 - .short 0 - .short 0 - .short 0 - .short 0 -ninv_addr: .short 512 - .short 512 - .short 512 - .short 512 - .short 512 - .short 512 - .short 512 - .short 512 -ninv_tw_addr: .short 5040 - .short 5040 - .short 5040 - .short 5040 - .short 5040 - .short 5040 - .short 5040 - .short 5040 - -MLKEM_NAMESPACE(intt_asm_clean): -_MLKEM_NAMESPACE(intt_asm_clean): - push_stack + .global MLKEM_ASM_NAMESPACE(intt_asm_clean) in .req x0 r01234_ptr .req x1 @@ -227,16 +192,41 @@ _MLKEM_NAMESPACE(intt_asm_clean): t2 .req v27 t3 .req v28 - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - ninv .req v29 + q_ninv .req q29 ninv_tw .req v30 + q_ninv_tw .req q30 + +/* Literal pool */ +.macro dup8h c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c +.endm + +.p2align 4 +c_consts: .short 3329 + .short 20159 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 +c_ninv: dup8h 512 +c_ninv_tw: dup8h 5040 + +MLKEM_ASM_NAMESPACE(intt_asm_clean): + push_stack - ASM_LOAD(xtmp, ninv_addr) - ld1r {ninv.8h}, [xtmp] - ASM_LOAD(xtmp, ninv_tw_addr) - ld1r {ninv_tw.8h}, [xtmp] + ldr q_consts, c_consts + ldr q_ninv, c_ninv + ldr q_ninv_tw, c_ninv_tw mov inp, in mov count, #8 diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/intt_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/intt_opt.S index fc720e504..e332efef8 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/intt_opt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/intt_opt.S @@ -26,10 +26,6 @@ #include "common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) -// Needed to provide ASM_LOAD directive -#include "common.i" -#include "params.h" - // Bounds: // If C is chosen so that |src| < q * C, then |dst| < q * (0.0508 * C + 1/2) // @@ -145,38 +141,7 
@@ .text - .global MLKEM_NAMESPACE(intt_asm_opt) - .global _MLKEM_NAMESPACE(intt_asm_opt) - -.p2align 4 -const_addr: .short 3329 - .short 20159 - .short 0 - .short 0 - .short 0 - .short 0 - .short 0 - .short 0 -ninv_addr: .short 512 - .short 512 - .short 512 - .short 512 - .short 512 - .short 512 - .short 512 - .short 512 -ninv_tw_addr: .short 5040 - .short 5040 - .short 5040 - .short 5040 - .short 5040 - .short 5040 - .short 5040 - .short 5040 - -MLKEM_NAMESPACE(intt_asm_opt): -_MLKEM_NAMESPACE(intt_asm_opt): - push_stack + .global MLKEM_ASM_NAMESPACE(intt_asm_opt) in .req x0 r01234_ptr .req x1 @@ -227,16 +192,41 @@ _MLKEM_NAMESPACE(intt_asm_opt): t2 .req v27 t3 .req v28 - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - ninv .req v29 + q_ninv .req q29 ninv_tw .req v30 + q_ninv_tw .req q30 + +/* Literal pool */ +.macro dup8h c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c +.endm + +.p2align 4 +c_consts: .short 3329 + .short 20159 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 +c_ninv: dup8h 512 +c_ninv_tw: dup8h 5040 + +MLKEM_ASM_NAMESPACE(intt_asm_opt): + push_stack - ASM_LOAD(xtmp, ninv_addr) - ld1r {ninv.8h}, [xtmp] - ASM_LOAD(xtmp, ninv_tw_addr) - ld1r {ninv_tw.8h}, [xtmp] + ldr q_consts, c_consts + ldr q_ninv, c_ninv + ldr q_ninv_tw, c_ninv_tw mov inp, in mov count, #8 diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/ntt_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/ntt_clean.S index b00d08810..877a5f689 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/ntt_clean.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/ntt_clean.S @@ -27,10 +27,6 @@ #include "common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) -// Needed to provide ASM_LOAD directive -#include "common.i" -#include "params.h" - // Bounds: // If C is chosen so that |src| < q * C, then |dst| < q * (0.0508 * C + 1/2) // @@ -64,12 
+60,6 @@ add \a\().8h, \a\().8h, tmp.8h .endm -.macro barrett_reduce a - sqdmulh t0.8h, \a\().8h, consts.h[1] - srshr t0.8h, t0.8h, #11 - mls \a\().8h, t0.8h, consts.h[0] -.endm - .macro load_roots_012 ldr q_root0, [r01234_ptr], #32 ldr q_root1, [r01234_ptr, #-16] @@ -100,13 +90,6 @@ trn1 \data\()1.2d, t1.2d, t3.2d .endm -.macro transpose_single data_out, data_in - trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s - trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s - trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s - trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s -.endm - .macro save_vregs sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] @@ -182,11 +165,11 @@ t3 .req v28 .text - .global MLKEM_NAMESPACE(ntt_asm_clean) - .global _MLKEM_NAMESPACE(ntt_asm_clean) + .global MLKEM_ASM_NAMESPACE(ntt_asm_clean) +/* Literal pool */ .p2align 4 -const_addr: +c_consts: .short 3329 .short 20159 .short 0 @@ -196,12 +179,9 @@ const_addr: .short 0 .short 0 -MLKEM_NAMESPACE(ntt_asm_clean): -_MLKEM_NAMESPACE(ntt_asm_clean): +MLKEM_ASM_NAMESPACE(ntt_asm_clean): push_stack - ASM_LOAD(xtmp, const_addr) - - ld1 {consts.8h}, [xtmp] + ldr q_consts, c_consts mov inp, in mov count, #4 diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/ntt_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/ntt_opt.S index b8a22ecfc..15103a595 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/ntt_opt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/ntt_opt.S @@ -27,10 +27,6 @@ #include "common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) -// Needed to provide ASM_LOAD directive -#include "common.i" -#include "params.h" - // Bounds: // If C is chosen so that |src| < q * C, then |dst| < q * (0.0508 * C + 1/2) @@ -64,12 +60,6 @@ add \a\().8h, \a\().8h, tmp.8h .endm -.macro barrett_reduce a - sqdmulh t0.8h, \a\().8h, consts.h[1] - srshr t0.8h, t0.8h, #11 - mls \a\().8h, t0.8h, consts.h[0] -.endm - .macro 
load_roots_012 ldr q_root0, [r01234_ptr], #32 ldr q_root1, [r01234_ptr, #-16] @@ -100,13 +90,6 @@ trn1 \data\()1.2d, t1.2d, t3.2d .endm -.macro transpose_single data_out, data_in - trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s - trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s - trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s - trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s -.endm - .macro save_vregs sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] @@ -182,11 +165,11 @@ t3 .req v28 .text - .global MLKEM_NAMESPACE(ntt_asm_opt) - .global _MLKEM_NAMESPACE(ntt_asm_opt) + .global MLKEM_ASM_NAMESPACE(ntt_asm_opt) +/* Literal pool */ .p2align 4 -const_addr: +c_consts: .short 3329 .short 20159 .short 0 @@ -196,12 +179,9 @@ const_addr: .short 0 .short 0 -MLKEM_NAMESPACE(ntt_asm_opt): -_MLKEM_NAMESPACE(ntt_asm_opt): +MLKEM_ASM_NAMESPACE(ntt_asm_opt): push_stack - ASM_LOAD(xtmp, const_addr) - - ld1 {consts.8h}, [xtmp] + ldr q_consts, c_consts mov inp, in mov count, #4 diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/opt_impl.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/opt_impl.h index 5e4d00e1f..b22674026 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/opt_impl.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/opt_impl.h @@ -75,7 +75,7 @@ static INLINE int rej_uniform_native(int16_t *r, unsigned int len, { return -1; } - return (int)rej_uniform_asm_clean(r, buf, buflen); + return (int)rej_uniform_asm_clean(r, buf, buflen, rej_uniform_table); } #endif /* MLKEM_NATIVE_ARITH_PROFILE_IMPL_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/poly_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/poly_clean.S index 3e1bc5cf4..f70a40221 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/poly_clean.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/poly_clean.S @@ -6,65 +6,85 @@ #include 
"common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) -// Needed to provide ASM_LOAD directive -#include "common.i" -#include "params.h" +/* We use a single literal pool for all functions in this file. + * This is OK even when the file gets expanded through SLOTHY, + * since PC-relative offsets are up to 1MB in AArch64. + * + * The use of dup8h to build constant vectors in memory + * is slightly wasteful and could be avoided with a GPR-load + * followed by Neon `dup`, but we're ultimately only talking + * about 64 bytes, so it seems OK. + */ -.macro barrett_reduce a - sqdmulh t0.8h, \a\().8h, consts.h[1] - srshr t0.8h, t0.8h, #11 - mls \a\().8h, t0.8h, consts.h[0] +.macro dup8h c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c .endm -.macro scalar_signed_to_unsigned a - sshr mask.8h, \a\().8h, #15 - and mask.16b, modulus.16b, mask.16b - add \a\().8h, \a\().8h, mask.8h +.p2align 4 +c_modulus: dup8h 3329 // ML-KEM modulus +c_modulus_twisted: dup8h 20159 // Barrett twist of 1 wrt 2^27 +c_mont_constant: dup8h -1044 // 2^16 % 3329 +c_barrett_twist: dup8h -10276 // Barrett twist of -1044 (wrt 2^16) + +/* + * Some modular arithmetic macros + */ + +/* Barrett reduction */ +.macro barrett_reduce a + sqdmulh tmp.8h, \a\().8h, modulus_twisted.h[0] + srshr tmp.8h, tmp.8h, #11 + mls \a\().8h, tmp.8h, modulus.h[0] .endm +/* Montgomery multiplication, with precomputed Montgomery twist + * Expects modulus in modulus.h[0]. */ .macro mulmod dst, src, const, const_twisted sqrdmulh tmp0.8h, \src\().8h, \const_twisted\().8h mul \dst\().8h, \src\().8h, \const\().8h - mls \dst\().8h, tmp0.8h, consts.h[0] + mls \dst\().8h, tmp0.8h, modulus.h[0] +.endm + +/* Turns signed-canonical to unsigned canonical representative + * through conditional addition of the modulus. + * + * Expected modulus in `modulus`.
*/ +.macro scalar_signed_to_unsigned a + sshr mask.8h, \a\().8h, #15 + and mask.16b, modulus.16b, mask.16b + add \a\().8h, \a\().8h, mask.8h .endm /********************************** * poly_reduce() * **********************************/ -.global MLKEM_NAMESPACE(poly_reduce_asm_clean) -.global _MLKEM_NAMESPACE(poly_reduce_asm_clean) -.p2align 4 -const_addr: - .short 3329 // ML-KEM modulus - .short 20159 // Barrett twist of 1 wrt 2^27 - .short -1044 // 2^16 % 3329 - .short -10276 // Barrett twist of -1044 (wrt 2^16) - .short 0 - .short 0 - .short 0 - .short 0 - - ptr .req x0 - count .req x1 - xtmp .req x2 - wmodulus .req w3 - - q_data .req q0 - data .req v0 - t0 .req v1 - consts .req v2 - mask .req v3 - modulus .req v4 - -MLKEM_NAMESPACE(poly_reduce_asm_clean): -_MLKEM_NAMESPACE(poly_reduce_asm_clean): - - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - - mov wmodulus, #3329 - dup modulus.8h, wmodulus +.global MLKEM_ASM_NAMESPACE(poly_reduce_asm_clean) + + ptr .req x0 + count .req x1 + + data .req v0 + q_data .req q0 + + tmp .req v1 + mask .req v2 + modulus .req v3 + q_modulus .req q3 + modulus_twisted .req v4 + q_modulus_twisted .req q4 + +MLKEM_ASM_NAMESPACE(poly_reduce_asm_clean): + + ldr q_modulus, c_modulus + ldr q_modulus_twisted, c_modulus_twisted mov count, #8 loop_start: @@ -95,57 +115,64 @@ loop_start: .unreq ptr .unreq count - .unreq xtmp - .unreq q_data + .unreq data - .unreq t0 - .unreq consts + .unreq q_data + + .unreq tmp .unreq mask - .unreq wmodulus .unreq modulus + .unreq q_modulus + .unreq modulus_twisted + .unreq q_modulus_twisted /******************************************** * poly_mulcache_compute() * ********************************************/ -.global MLKEM_NAMESPACE(poly_mulcache_compute_asm_clean) -.global _MLKEM_NAMESPACE(poly_mulcache_compute_asm_clean) - - cache_ptr .req x0 - data_ptr .req x1 - zeta_ptr .req x2 - zeta_twisted_ptr .req x3 - xtmp .req x4 - - count .req x5 - - data_odd .req v1 - - zeta .req v2 - zeta_twisted .req v3 
- q_zeta .req q2 - q_zeta_twisted .req q3 - - q_tmp0 .req q4 - q_tmp1 .req q5 - tmp0 .req v4 - tmp1 .req v5 - consts .req v6 - dst .req v7 - q_dst .req q7 - -MLKEM_NAMESPACE(poly_mulcache_compute_asm_clean): -_MLKEM_NAMESPACE(poly_mulcache_compute_asm_clean): - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - - mov count, #(16) + +.global MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_clean) + + cache_ptr .req x0 + data_ptr .req x1 + zeta_ptr .req x2 + zeta_twisted_ptr .req x3 + count .req x4 + + data_odd .req v0 + zeta .req v1 + q_zeta .req q1 + zeta_twisted .req v2 + q_zeta_twisted .req q2 + + tmp0 .req v3 + q_tmp0 .req q3 + tmp1 .req v4 + q_tmp1 .req q4 + dst .req v5 + q_dst .req q5 + + modulus .req v6 + q_modulus .req q6 + modulus_twisted .req v7 + q_modulus_twisted .req q7 + +MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_clean): + ldr q_modulus, c_modulus + ldr q_modulus_twisted, c_modulus_twisted + + mov count, #16 mulcache_compute_loop_start: ldr q_tmp0, [data_ptr], #32 ldr q_tmp1, [data_ptr, #-16] ldr q_zeta, [zeta_ptr], #16 ldr q_zeta_twisted, [zeta_twisted_ptr], #16 + + // The mulcache of a polynomial a + b*X in Fq[X]/(X^2-zeta) is b*zeta; + // Since tmp0 || tmp1 represents multiple such polynomials as + // (a0,b0,a1,b1,...), extract only the odd elements.
uzp2 data_odd.8h, tmp0.8h, tmp1.8h mulmod dst, data_odd, zeta, zeta_twisted + str q_dst, [cache_ptr], #16 subs count, count, #1 @@ -156,7 +183,7 @@ mulcache_compute_loop_start: .unreq cache_ptr .unreq data_ptr .unreq zeta_ptr - .unreq xtmp + .unreq zeta_twisted_ptr .unreq count .unreq data_odd @@ -165,34 +192,35 @@ mulcache_compute_loop_start: .unreq zeta_twisted .unreq q_zeta_twisted - .unreq q_tmp0 - .unreq q_tmp1 .unreq tmp0 + .unreq q_tmp0 .unreq tmp1 - .unreq consts + .unreq q_tmp1 .unreq dst + .unreq q_dst + + .unreq modulus + .unreq q_modulus + .unreq modulus_twisted + .unreq q_modulus_twisted /******************************************** * poly_tobytes() * ********************************************/ -.global MLKEM_NAMESPACE(poly_tobytes_asm_clean) -.global _MLKEM_NAMESPACE(poly_tobytes_asm_clean) +.global MLKEM_ASM_NAMESPACE(poly_tobytes_asm_clean) data0 .req v0 data1 .req v1 - out0 .req v2 out1 .req v3 out2 .req v4 - - tmp .req v5 + tmp .req v5 dst .req x0 src .req x1 count .req x2 -MLKEM_NAMESPACE(poly_tobytes_asm_clean): -_MLKEM_NAMESPACE(poly_tobytes_asm_clean): +MLKEM_ASM_NAMESPACE(poly_tobytes_asm_clean): mov count, #16 poly_tobytes_asm_clean_asm_loop_start: @@ -229,37 +257,33 @@ poly_tobytes_asm_clean_asm_loop_start: /********************************** * poly_tomont() * **********************************/ -.global MLKEM_NAMESPACE(poly_tomont_asm_clean) -.global _MLKEM_NAMESPACE(poly_tomont_asm_clean) - - src .req x0 - count .req x1 - xtmp .req x2 - - wtmp2 .req w3 +.global MLKEM_ASM_NAMESPACE(poly_tomont_asm_clean) - data .req v0 - q_data .req q0 - res .req v1 - q_res .req q1 + src .req x0 + count .req x1 - consts .req v2 + data .req v0 + q_data .req q0 + res .req v1 + q_res .req q1 - factor .req v3 - factor_t .req v4 + factor .req v2 + q_factor .req q2 + factor_t .req v3 + q_factor_t .req q3 + modulus .req v4 + q_modulus .req q4 + modulus_twisted .req v5 + q_modulus_twisted .req q5 - tmp0 .req v5 + tmp0 .req v6 
-MLKEM_NAMESPACE(poly_tomont_asm_clean): -_MLKEM_NAMESPACE(poly_tomont_asm_clean): +MLKEM_ASM_NAMESPACE(poly_tomont_asm_clean): - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - - ldrsh wtmp2, [xtmp, #(2*2)] - dup factor.8h, wtmp2 - ldrh wtmp2, [xtmp, #(3*2)] - dup factor_t.8h, wtmp2 + ldr q_modulus, c_modulus + ldr q_modulus_twisted, c_modulus_twisted + ldr q_factor, c_mont_constant + ldr q_factor_t, c_barrett_twist mov count, #8 poly_tomont_asm_loop: @@ -285,4 +309,23 @@ poly_tomont_asm_loop: ret + .unreq src + .unreq count + + .unreq data + .unreq q_data + .unreq res + .unreq q_res + + .unreq factor + .unreq q_factor + .unreq factor_t + .unreq q_factor_t + .unreq modulus + .unreq q_modulus + .unreq modulus_twisted + .unreq q_modulus_twisted + + .unreq tmp0 + #endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/poly_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/poly_opt.S index df3b21008..e58ee77c4 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/poly_opt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/poly_opt.S @@ -6,117 +6,137 @@ #include "common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) -// Needed to provide ASM_LOAD directive -#include "common.i" -#include "params.h" +/* We use a single literal pool for all functions in this file. + * This is OK even when the file gets expanded through SLOTHY, + * since PC-relative offsets are up to 1MB in AArch64. + * + * The use of dup8h to build constant vectors in memory + * is slightly wasteful and could be avoided with a GPR-load + * followed by Neon `dup`, but we're ultimately only talking + * about 64 bytes, so it seems OK.
+ */ -.macro barrett_reduce a - sqdmulh t0.8h, \a\().8h, consts.h[1] - srshr t0.8h, t0.8h, #11 - mls \a\().8h, t0.8h, consts.h[0] +.macro dup8h c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c .endm -.macro scalar_signed_to_unsigned a - sshr mask.8h, \a\().8h, #15 - and mask.16b, modulus.16b, mask.16b - add \a\().8h, \a\().8h, mask.8h +.p2align 4 +c_modulus: dup8h 3329 // ML-KEM modulus +c_modulus_twisted: dup8h 20159 // Barrett twist of 1 wrt 2^27 +c_mont_constant: dup8h -1044 // 2^16 % 3329 +c_barrett_twist: dup8h -10276 // Barrett twist of -1044 (wrt 2^16) + +/* + * Some modular arithmetic macros + */ + +/* Barrett reduction */ +.macro barrett_reduce a + sqdmulh tmp.8h, \a\().8h, modulus_twisted.h[0] + srshr tmp.8h, tmp.8h, #11 + mls \a\().8h, tmp.8h, modulus.h[0] .endm +/* Montgomery multiplication, with precomputed Montgomery twist + * Expects modulus in modulus.h[0]. */ .macro mulmod dst, src, const, const_twisted sqrdmulh tmp0.8h, \src\().8h, \const_twisted\().8h mul \dst\().8h, \src\().8h, \const\().8h - mls \dst\().8h, tmp0.8h, consts.h[0] + mls \dst\().8h, tmp0.8h, modulus.h[0] +.endm + +/* Turns signed-canonical to unsigned canonical representative + * through conditional addition of the modulus. + * + * Expected modulus in `modulus`.
*/ +.macro scalar_signed_to_unsigned a + sshr mask.8h, \a\().8h, #15 + and mask.16b, modulus.16b, mask.16b + add \a\().8h, \a\().8h, mask.8h .endm /********************************** * poly_reduce() * **********************************/ -.global MLKEM_NAMESPACE(poly_reduce_asm_opt) -.global _MLKEM_NAMESPACE(poly_reduce_asm_opt) -.p2align 4 -const_addr: - .short 3329 // ML-KEM modulus - .short 20159 // Barrett twist of 1 wrt 2^27 - .short -1044 // 2^16 % 3329 - .short -10276 // Barrett twist of -1044 (wrt 2^16) - .short 0 - .short 0 - .short 0 - .short 0 - - ptr .req x0 - count .req x1 - xtmp .req x2 - wmodulus .req w3 - - q_data .req q0 - data .req v0 - t0 .req v1 - consts .req v2 - mask .req v3 - modulus .req v4 - -MLKEM_NAMESPACE(poly_reduce_asm_opt): -_MLKEM_NAMESPACE(poly_reduce_asm_opt): - - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - - mov wmodulus, #3329 - dup modulus.8h, wmodulus +.global MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt) + + ptr .req x0 + count .req x1 + + data .req v0 + q_data .req q0 + + tmp .req v1 + mask .req v2 + modulus .req v3 + q_modulus .req q3 + modulus_twisted .req v4 + q_modulus_twisted .req q4 + +MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt): + + ldr q_modulus, c_modulus + ldr q_modulus_twisted, c_modulus_twisted mov count, #8 - // Instructions: 15 - // Expected cycles: 22 - // Expected IPC: 0.68 + // Instructions: 15 + // Expected cycles: 22 + // Expected IPC: 0.68 - // Cycle bound: 22.0 - // IPC bound: 0.68 + // Cycle bound: 22.0 + // IPC bound: 0.68 - // Wall time: 0.04s - // User time: 0.04s + // Wall time: 0.05s + // User time: 0.05s - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldr q21, [x0, #48] // *............................. - ldr q3, [x0, #32] // ..*........................... - sqdmulh v6.8H, v21.8H, v2.H[1] // ....*......................... - sqdmulh v30.8H, v3.8H, v2.H[1] // ......*....................... - srshr v6.8H, v6.8H, #11 // ........*..................... 
- srshr v30.8H, v30.8H, #11 // ..........*................... - mls v21.8H, v6.8H, v2.H[0] // ...........*.................. - mls v3.8H, v30.8H, v2.H[0] // .............*................ - ldr q22, [x0], #64 // ..............*............... - sshr v6.8H, v21.8H, #15 // ................*............. - sshr v30.8H, v3.8H, #15 // .................*............ - and v6.16B, v4.16B, v6.16B // ..................*........... - add v6.8H, v21.8H, v6.8H // ...................*.......... - and v21.16B, v4.16B, v30.16B // ....................*......... - add v21.8H, v3.8H, v21.8H // .....................*........ + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr q21, [x0, #32] // *............................. + ldr q23, [x0, #48] // ..*........................... + sqdmulh v7.8H, v21.8H, v4.H[0] // ....*......................... + sqdmulh v30.8H, v23.8H, v4.H[0] // ......*....................... + srshr v7.8H, v7.8H, #11 // ........*..................... + srshr v30.8H, v30.8H, #11 // ..........*................... + mls v21.8H, v7.8H, v3.H[0] // ...........*.................. + mls v23.8H, v30.8H, v3.H[0] // .............*................ + ldr q5, [x0, #16] // ..............*............... + sshr v7.8H, v21.8H, #15 // ................*............. + sshr v30.8H, v23.8H, #15 // .................*............ + and v7.16B, v3.16B, v7.16B // ..................*........... + add v21.8H, v21.8H, v7.8H // ...................*.......... + and v7.16B, v3.16B, v30.16B // ....................*......... + add v16.8H, v23.8H, v7.8H // .....................*........ // ------ cycle (expected) ------> // 0 25 // |------------------------|----- - // ldr q31, [x0, #48] // *.............................. - // sqdmulh v24.8H, v31.8H, v2.H[1] // ....*.......................... - // srshr v6.8H, v24.8H, #11 // ........*...................... - // mls v31.8H, v6.8H, v2.H[0] // ...........*................... 
- // ldr q30, [x0, #32] // ..*............................ - // sshr v16.8H, v31.8H, #15 // ................*.............. - // and v28.16B, v4.16B, v16.16B // ..................*............ - // sqdmulh v23.8H, v30.8H, v2.H[1] // ......*........................ - // add v6.8H, v31.8H, v28.8H // ...................*........... - // srshr v18.8H, v23.8H, #11 // ..........*.................... - // mls v30.8H, v18.8H, v2.H[0] // .............*................. - // ldr q22, [x0], #64 // ..............*................ - // sshr v0.8H, v30.8H, #15 // .................*............. - // and v29.16B, v4.16B, v0.16B // ....................*.......... - // add v21.8H, v30.8H, v29.8H // .....................*......... + // ldr q30, [x0, #32] // *.............................. + // sqdmulh v22.8H, v30.8H, v4.H[0] // ....*.......................... + // ldr q2, [x0, #48] // ..*............................ + // srshr v19.8H, v22.8H, #11 // ........*...................... + // mls v30.8H, v19.8H, v3.H[0] // ...........*................... + // sqdmulh v25.8H, v2.8H, v4.H[0] // ......*........................ + // sshr v31.8H, v30.8H, #15 // ................*.............. + // srshr v25.8H, v25.8H, #11 // ..........*.................... + // and v18.16B, v3.16B, v31.16B // ..................*............ + // mls v2.8H, v25.8H, v3.H[0] // .............*................. + // add v21.8H, v30.8H, v18.8H // ...................*........... + // ldr q5, [x0, #16] // ..............*................ + // sshr v18.8H, v2.8H, #15 // .................*............. + // and v27.16B, v3.16B, v18.16B // ....................*.......... + // add v16.8H, v2.8H, v27.8H // .....................*......... 
sub count, count, #1 -loop_start: +1: // Instructions: 32 // Expected cycles: 36 // Expected IPC: 0.89 @@ -130,77 +150,77 @@ loop_start: // -------- cycle (expected) ---------> // 0 25 // |------------------------|---------- - sqdmulh v0.8H, v22.8H, v2.H[1] // *................................... - str q6, [x0, #-16] // .*.................................. - ldr q31, [x0, #48] // ..e................................. - srshr v5.8H, v0.8H, #11 // ....*............................... - str q21, [x0, #-32] // .....*.............................. - sqdmulh v24.8H, v31.8H, v2.H[1] // ......e............................. - mls v22.8H, v5.8H, v2.H[0] // .......*............................ - ldr q20, [x0, #-48] // ........*........................... - srshr v6.8H, v24.8H, #11 // ..........e......................... - sshr v0.8H, v22.8H, #15 // ...........*........................ - sqdmulh v21.8H, v20.8H, v2.H[1] // ............*....................... - mls v31.8H, v6.8H, v2.H[0] // .............e...................... - ldr q30, [x0, #32] // ..............e..................... - srshr v23.8H, v21.8H, #11 // ................*................... - and v19.16B, v4.16B, v0.16B // .................*.................. - sshr v16.8H, v31.8H, #15 // ..................e................. - mls v20.8H, v23.8H, v2.H[0] // ...................*................ - and v28.16B, v4.16B, v16.16B // ....................e............... - sqdmulh v23.8H, v30.8H, v2.H[1] // .....................e.............. - add v6.8H, v31.8H, v28.8H // ......................e............. - add v31.8H, v22.8H, v19.8H // .......................*............ - sshr v17.8H, v20.8H, #15 // ........................*........... - srshr v18.8H, v23.8H, #11 // .........................e.......... - and v26.16B, v4.16B, v17.16B // ..........................*......... - add v7.8H, v20.8H, v26.8H // ...........................*........ - mls v30.8H, v18.8H, v2.H[0] // ............................e....... 
- ldr q22, [x0], #64 // .............................e...... - str q7, [x0, #-112] // ...............................*.... - sshr v0.8H, v30.8H, #15 // ................................e... - str q31, [x0, #-128] // .................................*.. - and v29.16B, v4.16B, v0.16B // ..................................e. - add v21.8H, v30.8H, v29.8H // ...................................e - - // ------------------------ cycle (expected) -------------------------> - // 0 25 50 - // |------------------------|------------------------|----------------- - // ldr q0, [x0], #64 // ...........................e......'............................~.... - // sqdmulh v1.8h, v0.8h, v2.h[1] // ..................................*................................. - // srshr v1.8h, v1.8h, #11 // ..~...............................'...*............................. - // mls v0.8h, v1.8h, v2.h[0] // .....~............................'......*.......................... - // sshr v3.8h, v0.8h, #15 // .........~........................'..........*...................... - // and v3.16b, v4.16b, v3.16b // ...............~..................'................*................ - // add v0.8h, v0.8h, v3.8h // .....................~............'......................*.......... - // str q0, [x0, #-64] // ...............................~..'................................* - // ldr q0, [x0, #-48] // ......~...........................'.......*......................... - // sqdmulh v1.8h, v0.8h, v2.h[1] // ..........~.......................'...........*..................... - // srshr v1.8h, v1.8h, #11 // ..............~...................'...............*................. - // mls v0.8h, v1.8h, v2.h[0] // .................~................'..................*.............. - // sshr v3.8h, v0.8h, #15 // ......................~...........'.......................*......... - // and v3.16b, v4.16b, v3.16b // ........................~.........'.........................*....... 
- // add v0.8h, v0.8h, v3.8h // .........................~........'..........................*...... - // str q0, [x0, #-48] // .............................~....'..............................*.. - // ldr q0, [x0, #-32] // ............e.....................'.............~................... - // sqdmulh v1.8h, v0.8h, v2.h[1] // ...................e..............'....................~............ - // srshr v1.8h, v1.8h, #11 // .......................e..........'........................~........ - // mls v0.8h, v1.8h, v2.h[0] // ..........................e.......'...........................~..... - // sshr v3.8h, v0.8h, #15 // ..............................e...'...............................~. - // and v3.16b, v4.16b, v3.16b // ................................e.'................................. - // add v0.8h, v0.8h, v3.8h // .................................e'................................. - // str q0, [x0, #-32] // ...~..............................'....*............................ - // ldr q0, [x0, #-16] // e.................................'.~............................... - // sqdmulh v1.8h, v0.8h, v2.h[1] // ....e.............................'.....~........................... - // srshr v1.8h, v1.8h, #11 // ........e.........................'.........~....................... - // mls v0.8h, v1.8h, v2.h[0] // ...........e......................'............~.................... - // sshr v3.8h, v0.8h, #15 // ................e.................'.................~............... - // and v3.16b, v4.16b, v3.16b // ..................e...............'...................~............. - // add v0.8h, v0.8h, v3.8h // ....................e.............'.....................~........... - // str q0, [x0, #-16] // ..................................'*................................ - - sub count, count, #1 - cbnz count, loop_start + ldr q6, [x0], #64 // *................................... + ldr q30, [x0, #32] // ..e................................. 
+ sqdmulh v31.8H, v6.8H, v4.H[0] // ....*............................... + sqdmulh v29.8H, v5.8H, v4.H[0] // .....*.............................. + sqdmulh v22.8H, v30.8H, v4.H[0] // ......e............................. + str q16, [x0, #-16] // .......*............................ + srshr v20.8H, v31.8H, #11 // ........*........................... + srshr v28.8H, v29.8H, #11 // .........*.......................... + str q21, [x0, #-32] // ..........*......................... + mls v6.8H, v20.8H, v3.H[0] // ...........*........................ + mls v5.8H, v28.8H, v3.H[0] // ............*....................... + ldr q2, [x0, #48] // .............e...................... + sshr v31.8H, v6.8H, #15 // ...............*.................... + srshr v19.8H, v22.8H, #11 // ................e................... + and v22.16B, v3.16B, v31.16B // .................*.................. + add v0.8H, v6.8H, v22.8H // ..................*................. + mls v30.8H, v19.8H, v3.H[0] // ...................e................ + sshr v26.8H, v5.8H, #15 // ....................*............... + sqdmulh v25.8H, v2.8H, v4.H[0] // .....................e.............. + and v17.16B, v3.16B, v26.16B // ......................*............. + add v1.8H, v5.8H, v17.8H // .......................*............ + sshr v31.8H, v30.8H, #15 // ........................e........... + srshr v25.8H, v25.8H, #11 // .........................e.......... + str q1, [x0, #-48] // ..........................*......... + and v18.16B, v3.16B, v31.16B // ...........................e........ + mls v2.8H, v25.8H, v3.H[0] // ............................e....... + add v21.8H, v30.8H, v18.8H // .............................e...... + ldr q5, [x0, #16] // ..............................e..... + sshr v18.8H, v2.8H, #15 // ................................e... + str q0, [x0, #-64] // .................................*.. + and v27.16B, v3.16B, v18.16B // ..................................e. 
+ add v16.8H, v2.8H, v27.8H // ...................................e + + // ------------------------ cycle (expected) -------------------------> + // 0 25 50 + // |------------------------|------------------------|----------------- + // ldr q0, [x0], #64 // ..................................*................................. + // sqdmulh v1.8h, v0.8h, v4.h[0] // ..~...............................'...*............................. + // srshr v1.8h, v1.8h, #11 // ......~...........................'.......*......................... + // mls v0.8h, v1.8h, v3.h[0] // .........~........................'..........*...................... + // sshr v2.8h, v0.8h, #15 // .............~....................'..............*.................. + // and v2.16b, v3.16b, v2.16b // ...............~..................'................*................ + // add v0.8h, v0.8h, v2.8h // ................~.................'.................*............... + // str q0, [x0, #-64] // ...............................~..'................................* + // ldr q0, [x0, #-48] // ............................e.....'.............................~... + // sqdmulh v1.8h, v0.8h, v4.h[0] // ...~..............................'....*............................ + // srshr v1.8h, v1.8h, #11 // .......~..........................'........*........................ + // mls v0.8h, v1.8h, v3.h[0] // ..........~.......................'...........*..................... + // sshr v2.8h, v0.8h, #15 // ..................~...............'...................*............. + // and v2.16b, v3.16b, v2.16b // ....................~.............'.....................*........... + // add v0.8h, v0.8h, v2.8h // .....................~............'......................*.......... + // str q0, [x0, #-48] // ........................~.........'.........................*....... + // ldr q0, [x0, #-32] // e.................................'.~............................... 
+ // sqdmulh v1.8h, v0.8h, v4.h[0] // ....e.............................'.....~........................... + // srshr v1.8h, v1.8h, #11 // ..............e...................'...............~................. + // mls v0.8h, v1.8h, v3.h[0] // .................e................'..................~.............. + // sshr v2.8h, v0.8h, #15 // ......................e...........'.......................~......... + // and v2.16b, v3.16b, v2.16b // .........................e........'..........................~...... + // add v0.8h, v0.8h, v2.8h // ...........................e......'............................~.... + // str q0, [x0, #-32] // ........~.........................'.........*....................... + // ldr q0, [x0, #-16] // ...........e......................'............~.................... + // sqdmulh v1.8h, v0.8h, v4.h[0] // ...................e..............'....................~............ + // srshr v1.8h, v1.8h, #11 // .......................e..........'........................~........ + // mls v0.8h, v1.8h, v3.h[0] // ..........................e.......'...........................~..... + // sshr v2.8h, v0.8h, #15 // ..............................e...'...............................~. + // and v2.16b, v3.16b, v2.16b // ................................e.'................................. + // add v0.8h, v0.8h, v2.8h // .................................e'................................. + // str q0, [x0, #-16] // .....~............................'......*.......................... + + sub count, count, 1 + cbnz count, 1b // Instructions: 17 // Expected cycles: 23 // Expected IPC: 0.74 @@ -214,94 +234,96 @@ loop_start: // ----- cycle (expected) ------> // 0 25 // |------------------------|---- - ldr q29, [x0, #-48] // *............................. - sqdmulh v1.8H, v22.8H, v2.H[1] // ..*........................... + sqdmulh v20.8H, v5.8H, v4.H[0] // *............................. + ldr q24, [x0], #64 // .*............................ 
str q21, [x0, #-32] // ...*.......................... - sqdmulh v25.8H, v29.8H, v2.H[1] // ....*......................... - str q6, [x0, #-16] // .....*........................ - srshr v30.8H, v1.8H, #11 // ......*....................... - srshr v20.8H, v25.8H, #11 // ........*..................... - mls v22.8H, v30.8H, v2.H[0] // .........*.................... - mls v29.8H, v20.8H, v2.H[0] // ...........*.................. - sshr v24.8H, v22.8H, #15 // .............*................ - and v31.16B, v4.16B, v24.16B // ...............*.............. - sshr v24.8H, v29.8H, #15 // ................*............. - add v3.8H, v22.8H, v31.8H // .................*............ - and v17.16B, v4.16B, v24.16B // ..................*........... - add v25.8H, v29.8H, v17.8H // ...................*.......... - str q3, [x0, #-64] // ....................*......... - str q25, [x0, #-48] // ......................*....... + srshr v20.8H, v20.8H, #11 // ....*......................... + sqdmulh v25.8H, v24.8H, v4.H[0] // .....*........................ + str q16, [x0, #-16] // ......*....................... + mls v5.8H, v20.8H, v3.H[0] // .......*...................... + srshr v20.8H, v25.8H, #11 // .........*.................... + sshr v2.8H, v5.8H, #15 // ...........*.................. + mls v24.8H, v20.8H, v3.H[0] // ............*................. + and v20.16B, v3.16B, v2.16B // .............*................ + add v31.8H, v5.8H, v20.8H // ..............*............... + sshr v20.8H, v24.8H, #15 // ................*............. + str q31, [x0, #-48] // .................*............ + and v31.16B, v3.16B, v20.16B // ..................*........... + add v24.8H, v24.8H, v31.8H // ...................*.......... + str q24, [x0, #-64] // ......................*....... - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // sqdmulh v0.8H, v22.8H, v2.H[1] // ..*............................ - // str q6, [x0, #-16] // .....*......................... 
- // srshr v5.8H, v0.8H, #11 // ......*........................ - // str q21, [x0, #-32] // ...*........................... - // mls v22.8H, v5.8H, v2.H[0] // .........*..................... - // ldr q20, [x0, #-48] // *.............................. - // sshr v0.8H, v22.8H, #15 // .............*................. - // sqdmulh v21.8H, v20.8H, v2.H[1] // ....*.......................... - // srshr v23.8H, v21.8H, #11 // ........*...................... - // and v19.16B, v4.16B, v0.16B // ...............*............... - // mls v20.8H, v23.8H, v2.H[0] // ...........*................... - // add v31.8H, v22.8H, v19.8H // .................*............. - // sshr v17.8H, v20.8H, #15 // ................*.............. - // and v26.16B, v4.16B, v17.16B // ..................*............ - // add v7.8H, v20.8H, v26.8H // ...................*........... - // str q7, [x0, #-48] // ......................*........ - // str q31, [x0, #-64] // ....................*.......... + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr q6, [x0], #64 // .*............................. + // sqdmulh v31.8H, v6.8H, v4.H[0] // .....*......................... + // sqdmulh v29.8H, v5.8H, v4.H[0] // *.............................. + // str q16, [x0, #-16] // ......*........................ + // srshr v20.8H, v31.8H, #11 // .........*..................... + // srshr v28.8H, v29.8H, #11 // ....*.......................... + // str q21, [x0, #-32] // ...*........................... + // mls v6.8H, v20.8H, v3.H[0] // ............*.................. + // mls v5.8H, v28.8H, v3.H[0] // .......*....................... + // sshr v31.8H, v6.8H, #15 // ................*.............. + // and v22.16B, v3.16B, v31.16B // ..................*............ + // add v0.8H, v6.8H, v22.8H // ...................*........... + // sshr v26.8H, v5.8H, #15 // ...........*................... + // and v17.16B, v3.16B, v26.16B // .............*................. 
+ // add v1.8H, v5.8H, v17.8H // ..............*................ + // str q1, [x0, #-48] // .................*............. + // str q0, [x0, #-64] // ......................*........ ret .unreq ptr .unreq count - .unreq xtmp - .unreq q_data + .unreq data - .unreq t0 - .unreq consts + .unreq q_data + + .unreq tmp .unreq mask - .unreq wmodulus .unreq modulus + .unreq q_modulus + .unreq modulus_twisted + .unreq q_modulus_twisted /******************************************** * poly_mulcache_compute() * ********************************************/ -.global MLKEM_NAMESPACE(poly_mulcache_compute_asm_opt) -.global _MLKEM_NAMESPACE(poly_mulcache_compute_asm_opt) - - cache_ptr .req x0 - data_ptr .req x1 - zeta_ptr .req x2 - zeta_twisted_ptr .req x3 - xtmp .req x4 - - count .req x5 - - data_odd .req v1 - - zeta .req v2 - zeta_twisted .req v3 - q_zeta .req q2 - q_zeta_twisted .req q3 - - q_tmp0 .req q4 - q_tmp1 .req q5 - tmp0 .req v4 - tmp1 .req v5 - consts .req v6 - dst .req v7 - q_dst .req q7 - -MLKEM_NAMESPACE(poly_mulcache_compute_asm_opt): -_MLKEM_NAMESPACE(poly_mulcache_compute_asm_opt): - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - - mov count, #(16) + +.global MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt) + + cache_ptr .req x0 + data_ptr .req x1 + zeta_ptr .req x2 + zeta_twisted_ptr .req x3 + count .req x4 + + data_odd .req v0 + zeta .req v1 + q_zeta .req q1 + zeta_twisted .req v2 + q_zeta_twisted .req q2 + + tmp0 .req v3 + q_tmp0 .req q3 + tmp1 .req v4 + q_tmp1 .req q4 + dst .req v5 + q_dst .req q5 + + modulus .req v6 + q_modulus .req q6 + modulus_twisted .req v7 + q_modulus_twisted .req q7 + +MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt): + ldr q_modulus, c_modulus + ldr q_modulus_twisted, c_modulus_twisted + + mov count, #16 // Instructions: 7 // Expected cycles: 12 // Expected IPC: 0.58 @@ -315,65 +337,65 @@ _MLKEM_NAMESPACE(poly_mulcache_compute_asm_opt): // ----- cycle (expected) ------> // 0 25 // |------------------------|---- - ldr q3, 
[x1, #16] // *............................. - ldr q1, [x1], #32 // ..*........................... - ldr q26, [x3], #16 // ....*......................... - uzp2 v30.8H, v1.8H, v3.8H // ......*....................... - ldr q21, [x2], #16 // .......*...................... - sqrdmulh v3.8H, v30.8H, v26.8H // .........*.................... - mul v26.8H, v30.8H, v21.8H // ...........*.................. - - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldr q30, [x1, #16] // *.............................. - // ldr q21, [x1], #32 // ..*............................ - // ldr q3, [x3], #16 // ....*.......................... - // uzp2 v24.8H, v21.8H, v30.8H // ......*........................ - // ldr q21, [x2], #16 // .......*....................... - // sqrdmulh v3.8H, v24.8H, v3.8H // .........*..................... - // mul v26.8H, v24.8H, v21.8H // ...........*................... + ldr q1, [x1, #16] // *............................. + ldr q27, [x1], #32 // ..*........................... + ldr q23, [x2], #16 // ....*......................... + uzp2 v27.8H, v27.8H, v1.8H // ......*....................... + ldr q1, [x3], #16 // .......*...................... + mul v2.8H, v27.8H, v23.8H // .........*.................... + sqrdmulh v27.8H, v27.8H, v1.8H // ...........*.................. + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr q29, [x1, #16] // *.............................. + // ldr q21, [x2], #16 // ....*.......................... + // ldr q27, [x1], #32 // ..*............................ + // ldr q7, [x3], #16 // .......*....................... + // uzp2 v28.8H, v27.8H, v29.8H // ......*........................ + // mul v2.8H, v28.8H, v21.8H // .........*..................... + // sqrdmulh v27.8H, v28.8H, v7.8H // ...........*................... 
sub count, count, #1 -mulcache_compute_loop_start: - // Instructions: 9 - // Expected cycles: 13 - // Expected IPC: 0.69 +1: + // Instructions: 9 + // Expected cycles: 13 + // Expected IPC: 0.69 - // Cycle bound: 13.0 - // IPC bound: 0.69 + // Cycle bound: 13.0 + // IPC bound: 0.69 - // Wall time: 0.08s - // User time: 0.08s + // Wall time: 0.09s + // User time: 0.09s - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldr q30, [x1, #16] // e............................. - ldr q21, [x1], #32 // ..e........................... - mls v26.8H, v3.8H, v6.H[0] // ....*......................... - ldr q3, [x3], #16 // .....e........................ - uzp2 v24.8H, v21.8H, v30.8H // .......e...................... - ldr q21, [x2], #16 // ........e..................... - str q26, [x0], #16 // ..........*................... - sqrdmulh v3.8H, v24.8H, v3.8H // ...........e.................. - mul v26.8H, v24.8H, v21.8H // ............e................. + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr q29, [x1, #16] // e............................. + ldr q21, [x2], #16 // ..e........................... + mls v2.8H, v27.8H, v6.H[0] // ....*......................... + ldr q27, [x1], #32 // .....e........................ + ldr q7, [x3], #16 // .......e...................... + uzp2 v28.8H, v27.8H, v29.8H // .........e.................... + str q2, [x0], #16 // ..........*................... + mul v2.8H, v28.8H, v21.8H // ...........e.................. + sqrdmulh v27.8H, v28.8H, v7.8H // ............e................. // ------ cycle (expected) ------> // 0 25 // |------------------------|----- - // ldr q4, [x1], #32 // ..e..........'.~..........'.~.. - // ldr q5, [x1, #-16] // e............~............~.... - // ldr q2, [x2], #16 // ........e....'.......~....'.... - // ldr q3, [x3], #16 // .....e.......'....~.......'.... - // uzp2 v1.8h, v4.8h, v5.8h // .......e.....'......~.....'.... 
- // sqrdmulh v4.8h, v1.8h, v3.8h // ...........e.'..........~.'.... - // mul v7.8h, v1.8h, v2.8h // ............e'...........~'.... - // mls v7.8h, v4.8h, v6.h[0] // ....~........'...*........'.... - // str q7, [x0], #16 // ..........~..'.........*..'.... - - sub count, count, #1 - cbnz count, mulcache_compute_loop_start + // ldr q3, [x1], #32 // .....e.......'....~.......'.... + // ldr q4, [x1, #-16] // e............~............~.... + // ldr q1, [x2], #16 // ..e..........'.~..........'.~.. + // ldr q2, [x3], #16 // .......e.....'......~.....'.... + // uzp2 v0.8h, v3.8h, v4.8h // .........e...'........~...'.... + // sqrdmulh v3.8h, v0.8h, v2.8h // ............e'...........~'.... + // mul v5.8h, v0.8h, v1.8h // ...........e.'..........~.'.... + // mls v5.8h, v3.8h, v6.h[0] // ....~........'...*........'.... + // str q5, [x0], #16 // ..........~..'.........*..'.... + + sub count, count, 1 + cbnz count, 1b // Instructions: 2 // Expected cycles: 5 // Expected IPC: 0.40 @@ -387,14 +409,14 @@ mulcache_compute_loop_start: // ----- cycle (expected) ------> // 0 25 // |------------------------|---- - mls v26.8H, v3.8H, v6.H[0] // *............................. - str q26, [x0], #16 // ....*......................... + mls v2.8H, v27.8H, v6.H[0] // *............................. + str q2, [x0], #16 // ....*......................... // ------ cycle (expected) ------> // 0 25 // |------------------------|----- - // mls v26.8H, v3.8H, v6.H[0] // *.............................. - // str q26, [x0], #16 // ....*.......................... + // mls v2.8H, v27.8H, v6.H[0] // *.............................. + // str q2, [x0], #16 // ....*.......................... 
ret @@ -402,7 +424,7 @@ mulcache_compute_loop_start: .unreq cache_ptr .unreq data_ptr .unreq zeta_ptr - .unreq xtmp + .unreq zeta_twisted_ptr .unreq count .unreq data_odd @@ -411,34 +433,35 @@ mulcache_compute_loop_start: .unreq zeta_twisted .unreq q_zeta_twisted - .unreq q_tmp0 - .unreq q_tmp1 .unreq tmp0 + .unreq q_tmp0 .unreq tmp1 - .unreq consts + .unreq q_tmp1 .unreq dst + .unreq q_dst + + .unreq modulus + .unreq q_modulus + .unreq modulus_twisted + .unreq q_modulus_twisted /******************************************** * poly_tobytes() * ********************************************/ -.global MLKEM_NAMESPACE(poly_tobytes_asm_opt) -.global _MLKEM_NAMESPACE(poly_tobytes_asm_opt) +.global MLKEM_ASM_NAMESPACE(poly_tobytes_asm_opt) data0 .req v0 data1 .req v1 - out0 .req v2 out1 .req v3 out2 .req v4 - - tmp .req v5 + tmp .req v5 dst .req x0 src .req x1 count .req x2 -MLKEM_NAMESPACE(poly_tobytes_asm_opt): -_MLKEM_NAMESPACE(poly_tobytes_asm_opt): +MLKEM_ASM_NAMESPACE(poly_tobytes_asm_opt): mov count, #16 poly_tobytes_asm_opt_asm_loop_start: @@ -475,69 +498,65 @@ poly_tobytes_asm_opt_asm_loop_start: /********************************** * poly_tomont() * **********************************/ -.global MLKEM_NAMESPACE(poly_tomont_asm_opt) -.global _MLKEM_NAMESPACE(poly_tomont_asm_opt) - - src .req x0 - count .req x1 - xtmp .req x2 - - wtmp2 .req w3 +.global MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt) - data .req v0 - q_data .req q0 - res .req v1 - q_res .req q1 + src .req x0 + count .req x1 - consts .req v2 + data .req v0 + q_data .req q0 + res .req v1 + q_res .req q1 - factor .req v3 - factor_t .req v4 + factor .req v2 + q_factor .req q2 + factor_t .req v3 + q_factor_t .req q3 + modulus .req v4 + q_modulus .req q4 + modulus_twisted .req v5 + q_modulus_twisted .req q5 - tmp0 .req v5 + tmp0 .req v6 -MLKEM_NAMESPACE(poly_tomont_asm_opt): -_MLKEM_NAMESPACE(poly_tomont_asm_opt): +MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt): - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - - 
ldrsh wtmp2, [xtmp, #(2*2)] - dup factor.8h, wtmp2 - ldrh wtmp2, [xtmp, #(3*2)] - dup factor_t.8h, wtmp2 + ldr q_modulus, c_modulus + ldr q_modulus_twisted, c_modulus_twisted + ldr q_factor, c_mont_constant + ldr q_factor_t, c_barrett_twist mov count, #8 - // Instructions: 5 - // Expected cycles: 10 - // Expected IPC: 0.50 - // - // Cycle bound: 10.0 - // IPC bound: 0.50 - // - // Wall time: 0.01s - // User time: 0.01s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldr q6, [x0, #16] // *............................. - ldr q20, [x0, #32] // ..*........................... - mul v21.8H, v6.8H, v3.8H // ....*......................... - sqrdmulh v6.8H, v6.8H, v4.8H // .....*........................ - mls v21.8H, v6.8H, v2.H[0] // .........*.................... - - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldr q16, [x0, #16] // *.............................. - // mul v21.8H, v16.8H, v3.8H // ....*.......................... - // sqrdmulh v0.8H, v16.8H, v4.8H // .....*......................... - // mls v21.8H, v0.8H, v2.H[0] // .........*..................... - // ldr q20, [x0, #32] // ..*............................ + // Instructions: 5 + // Expected cycles: 7 + // Expected IPC: 0.71 + // + // Cycle bound: 7.0 + // IPC bound: 0.71 + // + // Wall time: 0.01s + // User time: 0.01s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr q26, [x0, #48] // *............................. + ldr q23, [x0, #16] // ..*........................... + mul v17.8H, v26.8H, v2.8H // ....*......................... + sqrdmulh v7.8H, v26.8H, v3.8H // .....*........................ + ldr q27, [x0, #32] // ......*....................... + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr q7, [x0, #48] // *.............................. + // ldr q23, [x0, #16] // ..*............................ 
+ // mul v17.8H, v7.8H, v2.8H // ....*.......................... + // sqrdmulh v7.8H, v7.8H, v3.8H // .....*......................... + // ldr q27, [x0, #32] // ......*........................ sub count, count, #1 -poly_tomont_asm_loop: +1: // Instructions: 20 // Expected cycles: 24 // Expected IPC: 0.83 @@ -545,108 +564,127 @@ poly_tomont_asm_loop: // Cycle bound: 24.0 // IPC bound: 0.83 // - // Wall time: 0.58s - // User time: 0.58s + // Wall time: 0.73s + // User time: 0.73s // // ----- cycle (expected) ------> // 0 25 // |------------------------|---- - ldr q18, [x0], #64 // *............................. - str q21, [x0, #-48] // ..*........................... - ldr q16, [x0, #16] // ...e.......................... - sqrdmulh v27.8H, v20.8H, v4.8H // .....*........................ - mul v1.8H, v20.8H, v3.8H // ......*....................... - mul v21.8H, v16.8H, v3.8H // .......e...................... - sqrdmulh v0.8H, v16.8H, v4.8H // ........e..................... - ldr q30, [x0, #-16] // .........*.................... - sqrdmulh v6.8H, v18.8H, v4.8H // ...........*.................. - mul v18.8H, v18.8H, v3.8H // ............*................. - mls v1.8H, v27.8H, v2.H[0] // .............*................ - mul v5.8H, v30.8H, v3.8H // ..............*............... - sqrdmulh v22.8H, v30.8H, v4.8H // ...............*.............. - mls v18.8H, v6.8H, v2.H[0] // ................*............. - str q1, [x0, #-32] // .................*............ - mls v21.8H, v0.8H, v2.H[0] // ..................e........... - mls v5.8H, v22.8H, v2.H[0] // ...................*.......... - str q18, [x0, #-64] // ....................*......... - ldr q20, [x0, #32] // .....................e........ - str q5, [x0, #-16] // .......................*...... - - // ------------- cycle (expected) -------------> + mls v17.8H, v7.8H, v4.H[0] // *............................. + sqrdmulh v5.8H, v23.8H, v3.8H // .*............................ 
+ ldr q7, [x0], #64 // ..*........................... + str q17, [x0, #-16] // ....*......................... + sqrdmulh v29.8H, v27.8H, v3.8H // .....*........................ + sqrdmulh v19.8H, v7.8H, v3.8H // ......*....................... + mul v25.8H, v23.8H, v2.8H // .......*...................... + mul v0.8H, v7.8H, v2.8H // ........*..................... + mul v26.8H, v27.8H, v2.8H // .........*.................... + ldr q7, [x0, #48] // ..........e................... + mls v25.8H, v5.8H, v4.H[0] // ............*................. + ldr q23, [x0, #16] // .............e................ + mls v26.8H, v29.8H, v4.H[0] // ...............*.............. + mls v0.8H, v19.8H, v4.H[0] // ................*............. + str q25, [x0, #-48] // .................*............ + mul v17.8H, v7.8H, v2.8H // ..................e........... + sqrdmulh v7.8H, v7.8H, v3.8H // ...................e.......... + str q0, [x0, #-64] // ....................*......... + ldr q27, [x0, #32] // .....................e........ + str q26, [x0, #-32] // .......................*...... + + // --------- cycle (expected) ----------> // 0 25 - // |------------------------|------------------- - // ldr q0, [x0], #64 // .....................*....................... - // sqrdmulh v5.8h, v0.8h, v4.8h // ........~............'..........*............ - // mul v1.8h, v0.8h, v3.8h // .........~...........'...........*........... - // mls v1.8h, v5.8h, v2.h[0] // .............~.......'...............*....... - // str q1, [x0, #-64] // .................~...'...................*... - // ldr q0, [x0, #-48] // e....................'..~.................... - // sqrdmulh v5.8h, v0.8h, v4.8h // .....e...............'.......~............... - // mul v1.8h, v0.8h, v3.8h // ....e................'......~................ - // mls v1.8h, v5.8h, v2.h[0] // ...............e.....'.................~..... - // str q1, [x0, #-48] // .....................'.*..................... 
- // ldr q0, [x0, #-32] // ..................e..'....................~.. - // sqrdmulh v5.8h, v0.8h, v4.8h // ..~..................'....*.................. - // mul v1.8h, v0.8h, v3.8h // ...~.................'.....*................. - // mls v1.8h, v5.8h, v2.h[0] // ..........~..........'............*.......... - // str q1, [x0, #-32] // ..............~......'................*...... - // ldr q0, [x0, #-16] // ......~..............'........*.............. - // sqrdmulh v5.8h, v0.8h, v4.8h // ............~........'..............*........ - // mul v1.8h, v0.8h, v3.8h // ...........~.........'.............*......... - // mls v1.8h, v5.8h, v2.h[0] // ................~....'..................*.... - // str q1, [x0, #-16] // ....................~'......................* - - sub count, count, #1 - cbnz count, poly_tomont_asm_loop + // |------------------------|------------ + // ldr q0, [x0], #64 // ..............'.*..................... + // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'.....*................. + // mul v1.8h, v0.8h, v2.8h // ..............'.......*............... + // mls v1.8h, v6.8h, v4.h[0] // ......~.......'...............*....... + // str q1, [x0, #-64] // ..........~...'...................*... + // ldr q0, [x0, #-48] // ...e..........'............~.......... + // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'*...................... + // mul v1.8h, v0.8h, v2.8h // ..............'......*................ + // mls v1.8h, v6.8h, v4.h[0] // ..~...........'...........*........... + // str q1, [x0, #-48] // .......~......'................*...... + // ldr q0, [x0, #-32] // ...........e..'....................~.. + // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'....*.................. + // mul v1.8h, v0.8h, v2.8h // ..............'........*.............. + // mls v1.8h, v6.8h, v4.h[0] // .....~........'..............*........ + // str q1, [x0, #-32] // .............~'......................* + // ldr q0, [x0, #-16] // e.............'.........~............. 
+ // sqrdmulh v6.8h, v0.8h, v3.8h // .........e....'..................~.... + // mul v1.8h, v0.8h, v2.8h // ........e.....'.................~..... + // mls v1.8h, v6.8h, v4.h[0] // ..............*....................... + // str q1, [x0, #-16] // ..............'...*................... + + sub count, count, 1 + cbnz count, 1b // Instructions: 15 - // Expected cycles: 19 - // Expected IPC: 0.79 + // Expected cycles: 18 + // Expected IPC: 0.83 // - // Cycle bound: 19.0 - // IPC bound: 0.79 + // Cycle bound: 18.0 + // IPC bound: 0.83 // - // Wall time: 0.04s - // User time: 0.04s + // Wall time: 0.07s + // User time: 0.07s // // ----- cycle (expected) ------> // 0 25 // |------------------------|---- - mul v31.8H, v20.8H, v3.8H // *............................. - sqrdmulh v25.8H, v20.8H, v4.8H // .*............................ - ldr q27, [x0], #64 // ..*........................... - ldr q23, [x0, #-16] // ....*......................... - mls v31.8H, v25.8H, v2.H[0] // ......*....................... - sqrdmulh v30.8H, v27.8H, v4.8H // .......*...................... - mul v6.8H, v27.8H, v3.8H // ........*..................... - mul v20.8H, v23.8H, v3.8H // .........*.................... - sqrdmulh v26.8H, v23.8H, v4.8H // ..........*................... - str q31, [x0, #-32] // ...........*.................. - mls v6.8H, v30.8H, v2.H[0] // ............*................. - str q21, [x0, #-48] // .............*................ - mls v20.8H, v26.8H, v2.H[0] // ..............*............... - str q6, [x0, #-64] // ................*............. - str q20, [x0, #-16] // ..................*........... + mls v17.8H, v7.8H, v4.H[0] // *............................. + sqrdmulh v7.8H, v23.8H, v3.8H // .*............................ + mul v26.8H, v23.8H, v2.8H // ..*........................... + sqrdmulh v25.8H, v27.8H, v3.8H // ...*.......................... + ldr q23, [x0], #64 // ....*......................... + mul v27.8H, v27.8H, v2.8H // ......*....................... 
+ mls v26.8H, v7.8H, v4.H[0] // .......*...................... + sqrdmulh v7.8H, v23.8H, v3.8H // ........*..................... + mul v23.8H, v23.8H, v2.8H // .........*.................... + str q17, [x0, #-16] // ..........*................... + mls v27.8H, v25.8H, v4.H[0] // ...........*.................. + str q26, [x0, #-48] // ............*................. + mls v23.8H, v7.8H, v4.H[0] // .............*................ + str q27, [x0, #-32] // ...............*.............. + str q23, [x0, #-64] // .................*............ // ------ cycle (expected) ------> // 0 25 // |------------------------|----- - // ldr q18, [x0], #64 // ..*............................ - // str q21, [x0, #-48] // .............*................. - // sqrdmulh v27.8H, v20.8H, v4.8H // .*............................. - // mul v1.8H, v20.8H, v3.8H // *.............................. - // ldr q30, [x0, #-16] // ....*.......................... - // sqrdmulh v6.8H, v18.8H, v4.8H // .......*....................... - // mul v18.8H, v18.8H, v3.8H // ........*...................... - // mls v1.8H, v27.8H, v2.H[0] // ......*........................ - // mul v5.8H, v30.8H, v3.8H // .........*..................... - // sqrdmulh v22.8H, v30.8H, v4.8H // ..........*.................... - // mls v18.8H, v6.8H, v2.H[0] // ............*.................. - // str q1, [x0, #-32] // ...........*................... - // mls v5.8H, v22.8H, v2.H[0] // ..............*................ - // str q18, [x0, #-64] // ................*.............. - // str q5, [x0, #-16] // ..................*............ + // mls v17.8H, v7.8H, v4.H[0] // *.............................. + // sqrdmulh v5.8H, v23.8H, v3.8H // .*............................. + // ldr q7, [x0], #64 // ....*.......................... + // str q17, [x0, #-16] // ..........*.................... + // sqrdmulh v29.8H, v27.8H, v3.8H // ...*........................... + // sqrdmulh v19.8H, v7.8H, v3.8H // ........*...................... 
+ // mul v25.8H, v23.8H, v2.8H // ..*............................ + // mul v0.8H, v7.8H, v2.8H // .........*..................... + // mul v26.8H, v27.8H, v2.8H // ......*........................ + // mls v25.8H, v5.8H, v4.H[0] // .......*....................... + // mls v26.8H, v29.8H, v4.H[0] // ...........*................... + // mls v0.8H, v19.8H, v4.H[0] // .............*................. + // str q25, [x0, #-48] // ............*.................. + // str q0, [x0, #-64] // .................*............. + // str q26, [x0, #-32] // ...............*............... ret + .unreq src + .unreq count + + .unreq data + .unreq q_data + .unreq res + .unreq q_res + + .unreq factor + .unreq q_factor + .unreq factor_t + .unreq q_factor_t + .unreq modulus + .unreq q_modulus + .unreq modulus_twisted + .unreq q_modulus_twisted + + .unreq tmp0 + #endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/polyvec_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/polyvec_clean.S index bfd1d2b8a..99fb05de5 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/polyvec_clean.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/polyvec_clean.S @@ -12,10 +12,30 @@ #include "common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) -#include "params.h" +/* We use a single literal pool for all functions in this file. + * This is OK even when the file gets expanded through SLOTHY, + * since PC-relative offsets are up to 1MB in AArch64. + * + * The use of dup8h to build constant vectors in memory + * is slightly wasteful and could be avoided with a GPR-load + * followed by Neon `dup`, but we're ultimately only talking + * about 64 bytes, so it seems OK. 
+ */ + +.macro dup8h c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c +.endm -// Needed to provide ASM_LOAD directive -#include "common.i" +.p2align 4 +c_modulus: dup8h 3329 // ML-KEM modulus +c_modulus_twisted: dup8h 3327 // Input: // - Vectors al, ah of 32-bit entries @@ -23,9 +43,9 @@ // - Montgomery reductions of al || ah, stored in al .macro montgomery_reduce_long x, a uzp1 t0.8h, \a\()l.8h, \a\()h.8h - mul t0.8h, t0.8h, consts.h[2] - smlal \a\()l.4s, t0.4h, constN.4h - smlal2 \a\()h.4s, t0.8h, constN.8h + mul t0.8h, t0.8h, modulus_twisted.8h + smlal \a\()l.4s, t0.4h, modulus.4h + smlal2 \a\()h.4s, t0.8h, modulus.8h uzp2 \x\().8h, \a\()l.8h, \a\()h.8h .endm @@ -58,10 +78,24 @@ smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h .endm +.macro ld2_wrap a, ptr + ldr q_tmp0, [\ptr\()], #32 + ldr q_tmp1, [\ptr\(), #-16] + uzp1 \a\()0.8h, tmp0.8h, tmp1.8h + uzp2 \a\()1.8h, tmp0.8h, tmp1.8h +.endm + +.macro st2_wrap a, ptr + zip1 tmp0.8h, \a\()0.8h, \a\()1.8h + zip2 tmp1.8h, \a\()0.8h, \a\()1.8h + str q_tmp0, [\ptr\()], #32 + str q_tmp1, [\ptr\(), #-16] +.endm + .macro load_polys a, b, a_ptr, b_ptr, b_cache_ptr - ld2 {\a\()0.8h, \a\()1.8h}, [\a_ptr], #32 - ld2 {\b\()0.8h, \b\()1.8h}, [\b_ptr], #32 - ld1 {\b\()1t.8h}, [\b_cache_ptr], #16 + ld2_wrap \a\(), \a_ptr + ld2_wrap \b\(), \b_ptr + ld1 {\b\()1t.8h}, [\b_cache_ptr], #16 .endm .macro save_vregs @@ -101,14 +135,12 @@ a3_ptr .req x10 b3_ptr .req x11 b3_cache_ptr .req x12 - count .req x13 - xtmp .req x14 - modulus .req w15 - constN .req v0 - constX .req v1 - consts .req v2 + modulus .req v0 + q_modulus .req q0 + modulus_twisted .req v2 + q_modulus_twisted .req q2 aa0 .req v3 aa1 .req v4 @@ -131,29 +163,13 @@ t0 .req v28 -.p2align 4 -const_addr: - .short 3329 - .short 20159 - .short 3327 - .short 0 - .short 0 - .short 0 - .short 0 - .short 0 - #if MLKEM_K == 2 -.global MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean) -.global 
_MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean) +.global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean) -MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean): -_MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean): +MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean): push_stack - - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - ldrsh modulus, [xtmp] - dup constN.8h, modulus + ldr q_modulus, c_modulus + ldr q_modulus_twisted, c_modulus_twisted // Computed bases of vector entries @@ -172,7 +188,7 @@ k2_loop_start: montgomery_reduce_long out0, res0 montgomery_reduce_long out1, res1 - st2 {out0.8h, out1.8h}, [out], #32 + st2_wrap out, out subs count, count, #1 cbnz count, k2_loop_start @@ -182,17 +198,12 @@ k2_loop_start: #endif /* MLKEM_K == 2 */ #if MLKEM_K == 3 -.global MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean) -.global _MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean) +.global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean) -MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean): -_MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean): +MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean): push_stack - - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - ldrsh modulus, [xtmp] - dup constN.8h, modulus + ldr q_modulus, c_modulus + ldr q_modulus_twisted, c_modulus_twisted // Computed bases of vector entries @@ -216,7 +227,7 @@ k3_loop_start: montgomery_reduce_long out0, res0 montgomery_reduce_long out1, res1 - st2 {out0.8h, out1.8h}, [out], #32 + st2_wrap out, out subs count, count, #1 cbnz count, k3_loop_start @@ -226,17 +237,12 @@ k3_loop_start: #endif /* MLKEM_K == 3 */ #if MLKEM_K == 4 -.global MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean) -.global _MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean) +.global 
MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean) -MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean): -_MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean): +MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean): push_stack - - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - ldrsh modulus, [xtmp] - dup constN.8h, modulus + ldr q_modulus, c_modulus + ldr q_modulus_twisted, c_modulus_twisted // Computed bases of vector entries @@ -270,7 +276,7 @@ k4_loop_start: montgomery_reduce_long out0, res0 montgomery_reduce_long out1, res1 - st2 {out0.8h, out1.8h}, [out], #32 + st2_wrap out, out subs count, count, #1 cbnz count, k4_loop_start diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/polyvec_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/polyvec_opt.S index 07dc98efd..16ed77c3f 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/polyvec_opt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/polyvec_opt.S @@ -11,10 +11,31 @@ #include "common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) -#include "params.h" -// Needed to provide ASM_LOAD directive -#include "common.i" +/* We use a single literal pool for all functions in this file. + * This is OK even when the file gets expanded through SLOTHY, + * since PC-relative offsets are up to 1MB in AArch64. + * + * The use of dup8h to build constant vectors in memory + * is slightly wasteful and could be avoided with a GPR-load + * followed by Neon `dup`, but we're ultimately only talking + * about 64 bytes, so it seems OK. 
+ */ + +.macro dup8h c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c + .short \c +.endm + +.p2align 4 +c_modulus: dup8h 3329 // ML-KEM modulus +c_modulus_twisted: dup8h 3327 // Input: // - Vectors al, ah of 32-bit entries @@ -22,13 +43,17 @@ // - Montgomery reductions of al || ah, stored in al .macro montgomery_reduce_long x, a uzp1 t0.8h, \a\()l.8h, \a\()h.8h - mul t0.8h, t0.8h, consts.h[2] - smlal \a\()l.4s, t0.4h, constN.4h - smlal2 \a\()h.4s, t0.8h, constN.8h + mul t0.8h, t0.8h, modulus_twisted.8h + smlal \a\()l.4s, t0.4h, modulus.4h + smlal2 \a\()h.4s, t0.8h, modulus.8h uzp2 \x\().8h, \a\()l.8h, \a\()h.8h .endm // Computes products (a0*b0 + a0*b0t, a0*b1 + a1*b0) in 32-bit. + +// Bounds: +// - Assume |a| < 4096, +// - Result: < 2*4096*2^15 = 2^28 .macro pmull d, a, b smull \d\()0l.4s, \a\()0.4h, \b\()0.4h smull2 \d\()0h.4s, \a\()0.8h, \b\()0.8h @@ -53,10 +78,24 @@ smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h .endm +.macro ld2_wrap a, ptr + ldr q_tmp0, [\ptr\()], #32 + ldr q_tmp1, [\ptr\(), #-16] + uzp1 \a\()0.8h, tmp0.8h, tmp1.8h + uzp2 \a\()1.8h, tmp0.8h, tmp1.8h +.endm + +.macro st2_wrap a, ptr + zip1 tmp0.8h, \a\()0.8h, \a\()1.8h + zip2 tmp1.8h, \a\()0.8h, \a\()1.8h + str q_tmp0, [\ptr\()], #32 + str q_tmp1, [\ptr\(), #-16] +.endm + .macro load_polys a, b, a_ptr, b_ptr, b_cache_ptr - ld2 {\a\()0.8h, \a\()1.8h}, [\a_ptr], #32 - ld2 {\b\()0.8h, \b\()1.8h}, [\b_ptr], #32 - ld1 {\b\()1t.8h}, [\b_cache_ptr], #16 + ld2_wrap \a\(), \a_ptr + ld2_wrap \b\(), \b_ptr + ld1 {\b\()1t.8h}, [\b_cache_ptr], #16 .endm .macro save_vregs @@ -96,14 +135,12 @@ a3_ptr .req x10 b3_ptr .req x11 b3_cache_ptr .req x12 - count .req x13 - xtmp .req x14 - modulus .req w15 - constN .req v0 - constX .req v1 - consts .req v2 + modulus .req v0 + q_modulus .req q0 + modulus_twisted .req v2 + q_modulus_twisted .req q2 aa0 .req v3 aa1 .req v4 @@ -126,29 +163,13 @@ t0 .req v28 -.p2align 4 -const_addr: - .short 3329 - .short 20159 - .short 3327 - .short 0 - 
.short 0 - .short 0 - .short 0 - .short 0 - #if MLKEM_K == 2 -.global MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt) -.global _MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt) +.global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt) -MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): -_MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): +MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): push_stack - - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - ldrsh modulus, [xtmp] - dup constN.8h, modulus + ldr q_modulus, c_modulus + ldr q_modulus_twisted, c_modulus_twisted // Computed bases of vector entries @@ -157,195 +178,351 @@ _MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2) mov count, #(MLKEM_N / 16) - // Instructions: 6 - // Expected cycles: 11 - // Expected IPC: 0.55 - - // Cycle bound: 11.0 - // IPC bound: 0.55 - - // Wall time: 0.01s - // User time: 0.01s - - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ld2 {v6.8H, v7.8H}, [x1], #32 // *............................. - ld2 {v4.8H, v5.8H}, [x2], #32 // ..*........................... - ld1 {v12.8H}, [x3], #16 // ....*......................... - ld2 {v15.8H, v16.8H}, [x4], #32 // ......*....................... - ld2 {v29.8H, v30.8H}, [x5], #32 // ........*..................... - ld1 {v31.8H}, [x6], #16 // ..........*................... - - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ld2 {v6.8H, v7.8H}, [x1], #32 // *.............................. - // ld2 {v4.8H, v5.8H}, [x2], #32 // ..*............................ - // ld1 {v12.8H}, [x3], #16 // ....*.......................... - // ld2 {v15.8H, v16.8H}, [x4], #32 // ......*........................ - // ld2 {v29.8H, v30.8H}, [x5], #32 // ........*...................... - // ld1 {v31.8H}, [x6], #16 // ..........*.................... 
+ // Instructions: 75 + // Expected cycles: 94 + // Expected IPC: 0.80 + + // Cycle bound: 94.0 + // IPC bound: 0.80 + + // Wall time: 1.49s + // User time: 1.49s + + // --------------------------- original position ----------------------------> + // 0 25 50 + // |------------------------|------------------------| + ldr q9, [x4], #32 // *.......................................................................... + ldr q5, [x4, #-16] // ......*.................................................................... + ldr q11, [x5], #32 // .*......................................................................... + uzp1 v23.8H, v9.8H, v5.8H // .........*................................................................. + uzp2 v9.8H, v9.8H, v5.8H // .....................*..................................................... + ldr q5, [x2], #32 // ..*........................................................................ + ldr q7, [x5, #-16] // ..............*............................................................ + ldr q21, [x2, #-16] // ...*....................................................................... + uzp2 v10.8H, v11.8H, v7.8H // .................*......................................................... + uzp1 v11.8H, v11.8H, v7.8H // ..................*........................................................ + uzp1 v7.8H, v5.8H, v21.8H // ....*...................................................................... + uzp2 v5.8H, v5.8H, v21.8H // .....*..................................................................... + ldr q21, [x1], #32 // .......*................................................................... + ldr q25, [x1, #-16] // ........*.................................................................. + ld1 {v6.8H}, [x3], #16 // ............................*.............................................. + uzp1 v26.8H, v21.8H, v25.8H // ..........*................................................................ 
+ uzp2 v21.8H, v21.8H, v25.8H // ...........*............................................................... + smull v25.4S, v26.4H, v5.4H // ............*.............................................................. + smull2 v5.4S, v26.8H, v5.8H // .............*............................................................. + smull v19.4S, v26.4H, v7.4H // ..........................*................................................ + smull2 v26.4S, v26.8H, v7.8H // ..............................*............................................ + smlal v25.4S, v21.4H, v7.4H // ...............*........................................................... + smlal2 v5.4S, v21.8H, v7.8H // ................*.......................................................... + smlal v19.4S, v21.4H, v6.4H // ...................................*....................................... + smlal2 v26.4S, v21.8H, v6.8H // .................................*......................................... + smlal v25.4S, v23.4H, v10.4H // ...................*....................................................... + smlal2 v5.4S, v23.8H, v10.8H // ....................*...................................................... + smlal v19.4S, v23.4H, v11.4H // ......................................*.................................... + smlal2 v26.4S, v23.8H, v11.8H // ....................................*...................................... + ld1 {v23.8H}, [x6], #16 // ........................*.................................................. + smlal v25.4S, v9.4H, v11.4H // ......................*.................................................... + smlal2 v5.4S, v9.8H, v11.8H // .......................*................................................... + smlal2 v26.4S, v9.8H, v23.8H // .......................................*................................... + smlal v19.4S, v9.4H, v23.4H // .........................................*................................. 
+ ldr q9, [x4], #32 // ...............................*........................................... + uzp1 v11.8H, v25.8H, v5.8H // .........................*................................................. + uzp1 v23.8H, v19.8H, v26.8H // .............................................*............................. + mul v11.8H, v11.8H, v2.8H // ...........................*............................................... + mul v23.8H, v23.8H, v2.8H // ..............................................*............................ + ldr q7, [x5], #32 // ................................*.......................................... + smlal2 v5.4S, v11.8H, v0.8H // .............................*............................................. + smlal v25.4S, v11.4H, v0.4H // ..................................*........................................ + ldr q11, [x2], #32 // .....................................*..................................... + ldr q21, [x2, #-16] // ........................................*.................................. + ldr q6, [x4, #-16] // ...............................................*........................... + uzp1 v17.8H, v11.8H, v21.8H // ...........................................*............................... + ldr q10, [x1], #32 // ................................................*.......................... + ldr q29, [x1, #-16] // .................................................*......................... + uzp2 v11.8H, v11.8H, v21.8H // ............................................*.............................. + uzp1 v13.8H, v9.8H, v6.8H // ...................................................*....................... + uzp1 v3.8H, v10.8H, v29.8H // ....................................................*...................... + uzp2 v10.8H, v10.8H, v29.8H // .....................................................*..................... + smull v12.4S, v3.4H, v11.4H // ......................................................*.................... 
+ smull2 v11.4S, v3.8H, v11.8H // .......................................................*................... + ldr q21, [x5, #-16] // ........................................................*.................. + smlal v12.4S, v10.4H, v17.4H // .........................................................*................. + smlal2 v11.4S, v10.8H, v17.8H // ..........................................................*................ + uzp2 v29.8H, v7.8H, v21.8H // ...........................................................*............... + uzp1 v15.8H, v7.8H, v21.8H // ............................................................*.............. + smlal v12.4S, v13.4H, v29.4H // .............................................................*............. + smlal2 v11.4S, v13.8H, v29.8H // ..............................................................*............ + uzp2 v28.8H, v9.8H, v6.8H // ...............................................................*........... + smlal2 v26.4S, v23.8H, v0.8H // ..................................................*........................ + smlal v12.4S, v28.4H, v15.4H // .................................................................*......... + smlal2 v11.4S, v28.8H, v15.8H // ..................................................................*........ + smlal v19.4S, v23.4H, v0.4H // ................................................................*.......... + uzp2 v27.8H, v25.8H, v5.8H // ..........................................*................................ + smull v23.4S, v3.4H, v17.4H // ......................................................................*.... + uzp1 v9.8H, v12.8H, v11.8H // .....................................................................*..... + uzp2 v19.8H, v19.8H, v26.8H // ....................................................................*...... + mul v14.8H, v9.8H, v2.8H // .......................................................................*... 
+ ld1 {v22.8H}, [x6], #16 // ...................................................................*....... + zip2 v9.8H, v19.8H, v27.8H // ........................................................................*.. + smlal2 v11.4S, v14.8H, v0.8H // ..........................................................................* + ld1 {v4.8H}, [x3], #16 // .........................................................................*. + + // ------------------------------ new position ------------------------------> + // 0 25 50 + // |------------------------|------------------------|------------------------ + // ldr q18, [x4], #32 // *.......................................................................... + // ldr q30, [x5], #32 // ..*........................................................................ + // ldr q8, [x2], #32 // .....*..................................................................... + // ldr q9, [x2, #-16] // .......*................................................................... + // uzp1 v17.8H, v8.8H, v9.8H // ..........*................................................................ + // uzp2 v4.8H, v8.8H, v9.8H // ...........*............................................................... + // ldr q19, [x4, #-16] // .*......................................................................... + // ldr q29, [x1], #32 // ............*.............................................................. + // ldr q12, [x1, #-16] // .............*............................................................. + // uzp1 v13.8H, v18.8H, v19.8H // ...*....................................................................... + // uzp1 v3.8H, v29.8H, v12.8H // ...............*........................................................... + // uzp2 v10.8H, v29.8H, v12.8H // ................*.......................................................... + // smull v12.4S, v3.4H, v4.4H // .................*......................................................... 
+ // smull2 v11.4S, v3.8H, v4.8H // ..................*........................................................ + // ldr q5, [x5, #-16] // ......*.................................................................... + // smlal v12.4S, v10.4H, v17.4H // .....................*..................................................... + // smlal2 v11.4S, v10.8H, v17.8H // ......................*.................................................... + // uzp2 v14.8H, v30.8H, v5.8H // ........*.................................................................. + // uzp1 v15.8H, v30.8H, v5.8H // .........*................................................................. + // smlal v12.4S, v13.4H, v14.4H // .........................*................................................. + // smlal2 v11.4S, v13.8H, v14.8H // ..........................*................................................ + // uzp2 v28.8H, v18.8H, v19.8H // ....*...................................................................... + // smlal v12.4S, v28.4H, v15.4H // ..............................*............................................ + // smlal2 v11.4S, v28.8H, v15.8H // ...............................*........................................... + // ld1 {v22.8H}, [x6], #16 // .............................*............................................. + // uzp1 v1.8H, v12.8H, v11.8H // ...................................*....................................... + // smull v23.4S, v3.4H, v17.4H // ...................*....................................................... + // mul v14.8H, v1.8H, v2.8H // .....................................*..................................... + // ld1 {v4.8H}, [x3], #16 // ..............*............................................................ + // smlal2 v11.4S, v14.8H, v0.8H // ........................................*.................................. 
+ // smull2 v20.4S, v3.8H, v17.8H // ....................*...................................................... + // ldr q18, [x4], #32 // ..................................*........................................ + // ldr q30, [x5], #32 // .......................................*................................... + // smlal2 v20.4S, v10.8H, v4.8H // ........................*.................................................. + // smlal v12.4S, v14.4H, v0.4H // .........................................*................................. + // smlal v23.4S, v10.4H, v4.4H // .......................*................................................... + // smlal2 v20.4S, v13.8H, v15.8H // ............................*.............................................. + // ldr q8, [x2], #32 // ..........................................*................................ + // smlal v23.4S, v13.4H, v15.4H // ...........................*............................................... + // smlal2 v20.4S, v28.8H, v22.8H // ................................*.......................................... + // ldr q9, [x2, #-16] // ...........................................*............................... + // smlal v23.4S, v28.4H, v22.4H // .................................*......................................... + // uzp2 v27.8H, v12.8H, v11.8H // ..................................................................*........ + // uzp1 v17.8H, v8.8H, v9.8H // .............................................*............................. + // uzp2 v4.8H, v8.8H, v9.8H // ................................................*.......................... + // uzp1 v5.8H, v23.8H, v20.8H // ....................................*...................................... + // mul v31.8H, v5.8H, v2.8H // ......................................*.................................... + // ldr q19, [x4, #-16] // ............................................*.............................. 
+ // ldr q29, [x1], #32 // ..............................................*............................ + // ldr q12, [x1, #-16] // ...............................................*........................... + // smlal2 v20.4S, v31.8H, v0.8H // ..............................................................*............ + // uzp1 v13.8H, v18.8H, v19.8H // .................................................*......................... + // uzp1 v3.8H, v29.8H, v12.8H // ..................................................*........................ + // uzp2 v10.8H, v29.8H, v12.8H // ...................................................*....................... + // smull v12.4S, v3.4H, v4.4H // ....................................................*...................... + // smull2 v11.4S, v3.8H, v4.8H // .....................................................*..................... + // ldr q5, [x5, #-16] // ......................................................*.................... + // smlal v12.4S, v10.4H, v17.4H // .......................................................*................... + // smlal2 v11.4S, v10.8H, v17.8H // ........................................................*.................. + // uzp2 v14.8H, v30.8H, v5.8H // .........................................................*................. + // uzp1 v15.8H, v30.8H, v5.8H // ..........................................................*................ + // smlal v12.4S, v13.4H, v14.4H // ...........................................................*............... + // smlal2 v11.4S, v13.8H, v14.8H // ............................................................*.............. + // uzp2 v28.8H, v18.8H, v19.8H // .............................................................*............. + // smlal v23.4S, v31.4H, v0.4H // .................................................................*......... + // smlal v12.4S, v28.4H, v15.4H // ...............................................................*........... 
+ // smlal2 v11.4S, v28.8H, v15.8H // ................................................................*.......... + // ld1 {v22.8H}, [x6], #16 // .......................................................................*... + // uzp2 v19.8H, v23.8H, v20.8H // .....................................................................*..... + // uzp1 v1.8H, v12.8H, v11.8H // ....................................................................*...... + // smull v23.4S, v3.4H, v17.4H // ...................................................................*....... + // mul v14.8H, v1.8H, v2.8H // ......................................................................*.... + // zip2 v9.8H, v19.8H, v27.8H // ........................................................................*.. + // ld1 {v4.8H}, [x3], #16 // ..........................................................................* + // smlal2 v11.4S, v14.8H, v0.8H // .........................................................................*. - sub count, count, #1 -k2_loop_start: - // Instructions: 33 - // Expected cycles: 40 - // Expected IPC: 0.82 - - // Cycle bound: 40.0 - // IPC bound: 0.82 - - // Wall time: 1.67s - // User time: 1.67s - - // ---------- cycle (expected) -----------> - // 0 25 - // |------------------------|-------------- - smull v13.4S, v6.4H, v4.4H // *....................................... - smull2 v10.4S, v6.8H, v4.8H // .*...................................... - smull v21.4S, v6.4H, v5.4H // ..*..................................... - smull2 v3.4S, v6.8H, v5.8H // ...*.................................... - smlal v13.4S, v7.4H, v12.4H // ....*................................... - smlal2 v10.4S, v7.8H, v12.8H // .....*.................................. - smlal v21.4S, v7.4H, v4.4H // ......*................................. - smlal2 v3.4S, v7.8H, v4.8H // .......*................................ - smlal v13.4S, v15.4H, v29.4H // ........*............................... 
- smlal2 v10.4S, v15.8H, v29.8H // .........*.............................. - smlal v21.4S, v15.4H, v30.4H // ..........*............................. - smlal2 v3.4S, v15.8H, v30.8H // ...........*............................ - smlal v13.4S, v16.4H, v31.4H // ............*........................... - smlal2 v10.4S, v16.8H, v31.8H // .............*.......................... - smlal v21.4S, v16.4H, v29.4H // ..............*......................... - smlal2 v3.4S, v16.8H, v29.8H // ...............*........................ - ld2 {v6.8H, v7.8H}, [x1], #32 // ................e....................... - uzp1 v12.8H, v13.8H, v10.8H // ..................*..................... - uzp1 v30.8H, v21.8H, v3.8H // ...................*.................... - mul v12.8H, v12.8H, v2.H[2] // ....................*................... - mul v30.8H, v30.8H, v2.H[2] // .....................*.................. - ld2 {v4.8H, v5.8H}, [x2], #32 // ......................e................. - smlal v13.4S, v12.4H, v0.4H // ........................*............... - smlal2 v10.4S, v12.8H, v0.8H // .........................*.............. - smlal v21.4S, v30.4H, v0.4H // ..........................*............. - smlal2 v3.4S, v30.8H, v0.8H // ...........................*............ - ld1 {v12.8H}, [x3], #16 // ............................e........... - uzp2 v13.8H, v13.8H, v10.8H // ..............................*......... - uzp2 v14.8H, v21.8H, v3.8H // ...............................*........ - ld2 {v15.8H, v16.8H}, [x4], #32 // ................................e....... - st2 {v13.8H, v14.8H}, [x0], #32 // ..................................*..... - ld2 {v29.8H, v30.8H}, [x5], #32 // ....................................e... - ld1 {v31.8H}, [x6], #16 // ......................................e. 
- - // -------------------- cycle (expected) --------------------> - // 0 25 50 - // |------------------------|------------------------|-------- - // ld2 {v3.8h, v4.8h}, [x1], #32 // e.......................'...............~.................. - // ld2 {v5.8h, v6.8h}, [x2], #32 // ......e.................'.....................~............ - // ld1 {v7.8h}, [x3], #16 // ............e...........'...........................~...... - // smull v8.4s, v3.4h, v5.4h // ........................*.................................. - // smull2 v10.4s, v3.8h, v5.8h // ........................'*................................. - // smlal v8.4s, v4.4h, v7.4h // ........................'...*.............................. - // smlal2 v10.4s, v4.8h, v7.8h // ........................'....*............................. - // smull v9.4s, v3.4h, v6.4h // ........................'.*................................ - // smull2 v11.4s, v3.8h, v6.8h // ........................'..*............................... - // smlal v9.4s, v4.4h, v5.4h // ........................'.....*............................ - // smlal2 v11.4s, v4.8h, v5.8h // ........................'......*........................... - // ld2 {v3.8h, v4.8h}, [x4], #32 // ................e.......'...............................~.. - // ld2 {v5.8h, v6.8h}, [x5], #32 // ....................e...'.................................. - // ld1 {v7.8h}, [x6], #16 // ......................e.'.................................. - // smlal v8.4s, v3.4h, v5.4h // ........................'.......*.......................... - // smlal2 v10.4s, v3.8h, v5.8h // ........................'........*......................... - // smlal v8.4s, v4.4h, v7.4h // ........................'...........*...................... - // smlal2 v10.4s, v4.8h, v7.8h // ........................'............*..................... - // smlal v9.4s, v3.4h, v6.4h // ........................'.........*........................ 
- // smlal2 v11.4s, v3.8h, v6.8h // ........................'..........*....................... - // smlal v9.4s, v4.4h, v5.4h // ........................'.............*.................... - // smlal2 v11.4s, v4.8h, v5.8h // ........................'..............*................... - // uzp1 v28.8h, v8.8h, v10.8h // ..~.....................'.................*................ - // mul v28.8h, v28.8h, v2.h[2] // ....~...................'...................*.............. - // smlal v8.4s, v28.4h, v0.4h // ........~...............'.......................*.......... - // smlal2 v10.4s, v28.8h, v0.8h // .........~..............'........................*......... - // uzp2 v26.8h, v8.8h, v10.8h // ..............~.........'.............................*.... - // uzp1 v28.8h, v9.8h, v11.8h // ...~....................'..................*............... - // mul v28.8h, v28.8h, v2.h[2] // .....~..................'....................*............. - // smlal v9.4s, v28.4h, v0.4h // ..........~.............'.........................*........ - // smlal2 v11.4s, v28.8h, v0.8h // ...........~............'..........................*....... - // uzp2 v27.8h, v9.8h, v11.8h // ...............~........'..............................*... - // st2 {v26.8h, v27.8h}, [x0], #32 // ..................~.....'.................................* + sub count, count, #2 +1: + // Instructions: 48 + // Expected cycles: 58 + // Expected IPC: 0.83 + + // Cycle bound: 58.0 + // IPC bound: 0.83 + + // Wall time: 6.39s + // User time: 6.39s + + // -------------- original position --------------> + // 0 25 + // |------------------------|---------------------- + smull2 v20.4S, v3.8H, v17.8H // ..........*..................................... + ldr q18, [x4], #32 // .................e.............................. + ldr q30, [x5], #32 // .....................e.......................... + smlal2 v20.4S, v10.8H, v4.8H // ............*................................... 
+ smlal v12.4S, v14.4H, v0.4H // .........................................*...... + smlal v23.4S, v10.4H, v4.4H // ...........*.................................... + str q9, [x0, #16] // ...............................................l + smlal2 v20.4S, v13.8H, v15.8H // ...........................*.................... + ldr q8, [x2], #32 // ....e........................................... + smlal v23.4S, v13.4H, v15.4H // ..........................*..................... + smlal2 v20.4S, v28.8H, v22.8H // .............................*.................. + zip1 v26.8H, v19.8H, v27.8H // ............................................l... + ldr q9, [x2, #-16] // .....e.......................................... + smlal v23.4S, v28.4H, v22.4H // ............................*................... + uzp2 v27.8H, v12.8H, v11.8H // ...........................................*.... + uzp1 v17.8H, v8.8H, v9.8H // ......e......................................... + uzp2 v4.8H, v8.8H, v9.8H // .......e........................................ + uzp1 v5.8H, v23.8H, v20.8H // ..................................*............. + str q26, [x0], #32 // ..............................................l. + mul v31.8H, v5.8H, v2.8H // ...................................*............ + ldr q19, [x4, #-16] // ..................e............................. + ldr q29, [x1], #32 // e............................................... + ldr q12, [x1, #-16] // .e.............................................. + smlal2 v20.4S, v31.8H, v0.8H // .....................................*.......... + uzp1 v13.8H, v18.8H, v19.8H // ...................e............................ + uzp1 v3.8H, v29.8H, v12.8H // ..e............................................. + uzp2 v10.8H, v29.8H, v12.8H // ...e............................................ + smull v12.4S, v3.4H, v4.4H // .............e.................................. + smull2 v11.4S, v3.8H, v4.8H // ..............e................................. 
+ ldr q5, [x5, #-16] // ......................e......................... + smlal v12.4S, v10.4H, v17.4H // ...............e................................ + smlal2 v11.4S, v10.8H, v17.8H // ................e............................... + uzp2 v14.8H, v30.8H, v5.8H // ........................e....................... + uzp1 v15.8H, v30.8H, v5.8H // .......................e........................ + smlal v12.4S, v13.4H, v14.4H // ..............................e................. + smlal2 v11.4S, v13.8H, v14.8H // ...............................e................ + uzp2 v28.8H, v18.8H, v19.8H // ....................e........................... + smlal v23.4S, v31.4H, v0.4H // ....................................*........... + smlal v12.4S, v28.4H, v15.4H // ................................e............... + smlal2 v11.4S, v28.8H, v15.8H // .................................e.............. + ld1 {v22.8H}, [x6], #16 // .........................e...................... + uzp2 v19.8H, v23.8H, v20.8H // ......................................*......... + uzp1 v1.8H, v12.8H, v11.8H // .......................................e........ + smull v23.4S, v3.4H, v17.4H // .........e...................................... + mul v14.8H, v1.8H, v2.8H // ........................................e....... + zip2 v9.8H, v19.8H, v27.8H // .............................................*.. + ld1 {v4.8H}, [x3], #16 // ........e....................................... + smlal2 v11.4S, v14.8H, v0.8H // ..........................................e..... + + // ------------------------------------------------- new position --------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------------- + // ldr q12, [x1], #32 // ....................e..........................'....................~..........................'.................. 
+ // ldr q13, [x1, #-16] // .....................e.........................'.....................~.........................'.................. + // uzp1 v3.8h, v12.8h, v13.8h // ........................e......................'........................~......................'.................. + // uzp2 v4.8h, v12.8h, v13.8h // .........................e.....................'.........................~.....................'.................. + // ldr q12, [x2], #32 // .......e.......................................'.......~.......................................'.......~.......... + // ldr q13, [x2, #-16] // ...........e...................................'...........~...................................'...........~...... + // uzp1 v5.8h, v12.8h, v13.8h // ..............e................................'..............~................................'..............~... + // uzp2 v6.8h, v12.8h, v13.8h // ...............e...............................'...............~...............................'...............~.. + // ld1 {v7.8h}, [x3], #16 // .............................................e.'.............................................~.'.................. + // smull v8.4s, v3.4h, v5.4h // ..........................................e....'..........................................~....'.................. + // smull2 v10.4s, v3.8h, v5.8h // ...............................................*...............................................~.................. + // smlal v8.4s, v4.4h, v7.4h // ....~..........................................'....*..........................................'....~............. + // smlal2 v10.4s, v4.8h, v7.8h // ..~............................................'..*............................................'..~............... + // smull v9.4s, v3.4h, v6.4h // ..........................e....................'..........................~....................'.................. 
+ // smull2 v11.4s, v3.8h, v6.8h // ...........................e...................'...........................~...................'.................. + // smlal v9.4s, v4.4h, v5.4h // .............................e.................'.............................~.................'.................. + // smlal2 v11.4s, v4.8h, v5.8h // ..............................e................'..............................~................'.................. + // ldr q12, [x4], #32 // e..............................................'~..............................................'~................. + // ldr q13, [x4, #-16] // ...................e...........................'...................~...........................'.................. + // uzp1 v3.8h, v12.8h, v13.8h // .......................e.......................'.......................~.......................'.................. + // uzp2 v4.8h, v12.8h, v13.8h // ...................................e...........'...................................~...........'.................. + // ldr q12, [x5], #32 // .e.............................................'.~.............................................'.~................ + // ldr q13, [x5, #-16] // ............................e..................'............................~..................'.................. + // uzp1 v5.8h, v12.8h, v13.8h // ................................e..............'................................~..............'.................. + // uzp2 v6.8h, v12.8h, v13.8h // ...............................e...............'...............................~...............'.................. + // ld1 {v7.8h}, [x6], #16 // .......................................e.......'.......................................~.......'.................. + // smlal v8.4s, v3.4h, v5.4h // ........~......................................'........*......................................'........~......... 
+ // smlal2 v10.4s, v3.8h, v5.8h // ......~........................................'......*........................................'......~........... + // smlal v8.4s, v4.4h, v7.4h // ............~..................................'............*..................................'............~..... + // smlal2 v10.4s, v4.8h, v7.8h // .........~.....................................'.........*.....................................'.........~........ + // smlal v9.4s, v3.4h, v6.4h // .................................e.............'.................................~.............'.................. + // smlal2 v11.4s, v3.8h, v6.8h // ..................................e............'..................................~............'.................. + // smlal v9.4s, v4.4h, v5.4h // .....................................e.........'.....................................~.........'.................. + // smlal2 v11.4s, v4.8h, v5.8h // ......................................e........'......................................~........'.................. + // uzp1 v28.8h, v8.8h, v10.8h // ................~..............................'................*..............................'................~. + // mul v28.8h, v28.8h, v2.8h // ..................~............................'..................*............................'.................. + // smlal v8.4s, v28.4h, v0.4h // ....................................~..........'....................................*..........'.................. + // smlal2 v10.4s, v28.8h, v0.8h // ......................~........................'......................*........................'.................. + // uzp2 v26.8h, v8.8h, v10.8h // ........................................~......'........................................*......'.................. + // uzp1 v28.8h, v9.8h, v11.8h // .........................................e.....'.........................................~.....'.................. 
+ // mul v28.8h, v28.8h, v2.8h // ...........................................e...'...........................................~...'.................. + // smlal v9.4s, v28.4h, v0.4h // ...~...........................................'...*...........................................'...~.............. + // smlal2 v11.4s, v28.8h, v0.8h // ..............................................e'..............................................~'.................. + // uzp2 v27.8h, v9.8h, v11.8h // .............~.................................'.............*.................................'.............~.... + // zip1 v12.8h, v26.8h, v27.8h // ..........~....................................'..........~....................................'..........l....... + // zip2 v13.8h, v26.8h, v27.8h // ............................................~..'............................................*..'.................. + // str q12, [x0], #32 // .................~.............................'.................~.............................'.................l + // str q13, [x0, #-16] // .....~.........................................'.....~.........................................'.....l............ sub count, count, #1 - cbnz count, k2_loop_start - // Instructions: 27 - // Expected cycles: 34 - // Expected IPC: 0.79 - - // Cycle bound: 34.0 - // IPC bound: 0.79 - - // Wall time: 0.25s - // User time: 0.25s - - // ------- cycle (expected) --------> - // 0 25 - // |------------------------|-------- - smull v13.4S, v6.4H, v5.4H // *................................. - smull2 v10.4S, v6.8H, v5.8H // .*................................ - smull v21.4S, v6.4H, v4.4H // ..*............................... - smull2 v3.4S, v6.8H, v4.8H // ...*.............................. - smlal v13.4S, v7.4H, v4.4H // ....*............................. - smlal2 v10.4S, v7.8H, v4.8H // .....*............................ - smlal v21.4S, v7.4H, v12.4H // ......*........................... 
- smlal2 v3.4S, v7.8H, v12.8H // .......*.......................... - smlal v13.4S, v15.4H, v30.4H // ........*......................... - smlal2 v10.4S, v15.8H, v30.8H // .........*........................ - smlal v21.4S, v15.4H, v29.4H // ..........*....................... - smlal2 v3.4S, v15.8H, v29.8H // ...........*...................... - smlal v13.4S, v16.4H, v29.4H // ............*..................... - smlal2 v10.4S, v16.8H, v29.8H // .............*.................... - smlal v21.4S, v16.4H, v31.4H // ..............*................... - smlal2 v3.4S, v16.8H, v31.8H // ...............*.................. - uzp1 v12.8H, v13.8H, v10.8H // .................*................ - uzp1 v6.8H, v21.8H, v3.8H // ...................*.............. - mul v12.8H, v12.8H, v2.H[2] // ....................*............. - mul v6.8H, v6.8H, v2.H[2] // .....................*............ - smlal v13.4S, v12.4H, v0.4H // ........................*......... - smlal v21.4S, v6.4H, v0.4H // .........................*........ - smlal2 v3.4S, v6.8H, v0.8H // ..........................*....... - smlal2 v10.4S, v12.8H, v0.8H // ...........................*...... - uzp2 v21.8H, v21.8H, v3.8H // ..............................*... - uzp2 v22.8H, v13.8H, v10.8H // ...............................*.. - st2 {v21.8H, v22.8H}, [x0], #32 // .................................* - - // ------- cycle (expected) --------> - // 0 25 - // |------------------------|-------- - // smull v13.4S, v6.4H, v4.4H // ..*............................... - // smull2 v10.4S, v6.8H, v4.8H // ...*.............................. - // smull v21.4S, v6.4H, v5.4H // *................................. - // smull2 v3.4S, v6.8H, v5.8H // .*................................ - // smlal v13.4S, v7.4H, v12.4H // ......*........................... - // smlal2 v10.4S, v7.8H, v12.8H // .......*.......................... - // smlal v21.4S, v7.4H, v4.4H // ....*............................. 
- // smlal2 v3.4S, v7.8H, v4.8H // .....*............................ - // smlal v13.4S, v15.4H, v29.4H // ..........*....................... - // smlal2 v10.4S, v15.8H, v29.8H // ...........*...................... - // smlal v21.4S, v15.4H, v30.4H // ........*......................... - // smlal2 v3.4S, v15.8H, v30.8H // .........*........................ - // smlal v13.4S, v16.4H, v31.4H // ..............*................... - // smlal2 v10.4S, v16.8H, v31.8H // ...............*.................. - // smlal v21.4S, v16.4H, v29.4H // ............*..................... - // smlal2 v3.4S, v16.8H, v29.8H // .............*.................... - // uzp1 v12.8H, v13.8H, v10.8H // ...................*.............. - // uzp1 v30.8H, v21.8H, v3.8H // .................*................ - // mul v12.8H, v12.8H, v2.H[2] // .....................*............ - // mul v30.8H, v30.8H, v2.H[2] // ....................*............. - // smlal v13.4S, v12.4H, v0.4H // .........................*........ - // smlal2 v10.4S, v12.8H, v0.8H // ..........................*....... - // smlal v21.4S, v30.4H, v0.4H // ........................*......... - // smlal2 v3.4S, v30.8H, v0.8H // ...........................*...... - // uzp2 v13.8H, v13.8H, v10.8H // ..............................*... - // uzp2 v14.8H, v21.8H, v3.8H // ...............................*.. - // st2 {v13.8H, v14.8H}, [x0], #32 // .................................* + cbnz count, 1b + // Instructions: 21 + // Expected cycles: 35 + // Expected IPC: 0.60 + + // Cycle bound: 35.0 + // IPC bound: 0.60 + + // Wall time: 0.08s + // User time: 0.08s + + // ----- original position -----> + // 0 25 + // |------------------------|---- + smull2 v5.4S, v3.8H, v17.8H // *............................. + smlal v12.4S, v14.4H, v0.4H // ..*........................... + smlal v23.4S, v10.4H, v4.4H // ...*.......................... + str q9, [x0, #16] // ....*......................... 
+ smlal2 v5.4S, v10.8H, v4.8H // .*............................ + uzp2 v11.8H, v12.8H, v11.8H // ..........*................... + zip1 v9.8H, v19.8H, v27.8H // ........*..................... + smlal v23.4S, v13.4H, v15.4H // ......*....................... + smlal2 v5.4S, v13.8H, v15.8H // .....*........................ + str q9, [x0], #32 // ............*................. + smlal v23.4S, v28.4H, v22.4H // .........*.................... + smlal2 v5.4S, v28.8H, v22.8H // .......*...................... + uzp1 v9.8H, v23.8H, v5.8H // ...........*.................. + mul v9.8H, v9.8H, v2.8H // .............*................ + smlal2 v5.4S, v9.8H, v0.8H // ..............*............... + smlal v23.4S, v9.4H, v0.4H // ...............*.............. + uzp2 v9.8H, v23.8H, v5.8H // ................*............. + zip2 v5.8H, v9.8H, v11.8H // .................*............ + zip1 v9.8H, v9.8H, v11.8H // ...................*.......... + str q5, [x0, #16] // ..................*........... + str q9, [x0], #32 // ....................*......... + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // smull2 v20.4S, v3.8H, v17.8H // *.............................. + // smlal2 v20.4S, v10.8H, v4.8H // ....*.......................... + // smlal v12.4S, v14.4H, v0.4H // .*............................. + // smlal v23.4S, v10.4H, v4.4H // ..*............................ + // str q9, [x0, #16] // ...*........................... + // smlal2 v20.4S, v13.8H, v15.8H // ........*...................... + // smlal v23.4S, v13.4H, v15.4H // .......*....................... + // smlal2 v20.4S, v28.8H, v22.8H // ...........*................... + // zip1 v26.8H, v19.8H, v27.8H // ......*........................ + // smlal v23.4S, v28.4H, v22.4H // ..........*.................... + // uzp2 v27.8H, v12.8H, v11.8H // .....*......................... + // uzp1 v5.8H, v23.8H, v20.8H // ............*.................. 
+ // str q26, [x0], #32 // .........*..................... + // mul v31.8H, v5.8H, v2.8H // .............*................. + // smlal2 v20.4S, v31.8H, v0.8H // ..............*................ + // smlal v23.4S, v31.4H, v0.4H // ...............*............... + // uzp2 v19.8H, v23.8H, v20.8H // ................*.............. + // zip2 v9.8H, v19.8H, v27.8H // .................*............. + // str q9, [x0, #16] // ...................*........... + // zip1 v26.8H, v19.8H, v27.8H // ..................*............ + // str q26, [x0], #32 // ....................*.......... pop_stack @@ -353,17 +530,12 @@ k2_loop_start: #endif /* MLKEM_K == 2 */ #if MLKEM_K == 3 -.global MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt) -.global _MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt) +.global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt) -MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): -_MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): +MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): push_stack - - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - ldrsh modulus, [xtmp] - dup constN.8h, modulus + ldr q_modulus, c_modulus + ldr q_modulus_twisted, c_modulus_twisted // Computed bases of vector entries @@ -375,327 +547,453 @@ _MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): add b2_cache_ptr, b0_cache_ptr, #(2 * 512/2) mov count, #(MLKEM_N / 16) - // Instructions: 52 - // Expected cycles: 66 - // Expected IPC: 0.79 - - // Cycle bound: 66.0 - // IPC bound: 0.79 - - // Wall time: 8.09s - // User time: 8.09s - - // ----------------------- cycle (expected) ------------------------> - // 0 25 50 - // |------------------------|------------------------|--------------- - ld2 {v7.8H, v8.8H}, [x2], #32 // *................................................................. 
- ld2 {v28.8H, v29.8H}, [x1], #32 // ..*............................................................... - ld1 {v10.8H}, [x3], #16 // ....*............................................................. - smull2 v15.4S, v28.8H, v8.8H // ......*........................................................... - smull v9.4S, v28.4H, v8.4H // .......*.......................................................... - ld1 {v31.8H}, [x3], #16 // ........*......................................................... - smlal2 v15.4S, v29.8H, v7.8H // ..........*....................................................... - ld2 {v19.8H, v20.8H}, [x5], #32 // ...........*...................................................... - smlal v9.4S, v29.4H, v7.4H // .............*.................................................... - ld2 {v16.8H, v17.8H}, [x4], #32 // ..............*................................................... - ld2 {v3.8H, v4.8H}, [x7], #32 // ................*................................................. - smlal v9.4S, v16.4H, v20.4H // ..................*............................................... - smlal2 v15.4S, v16.8H, v20.8H // ...................*.............................................. - smull v1.4S, v28.4H, v7.4H // ....................*............................................. - smull2 v22.4S, v28.8H, v7.8H // .....................*............................................ - smlal v9.4S, v17.4H, v19.4H // ......................*........................................... - smlal2 v15.4S, v17.8H, v19.8H // .......................*.......................................... - ld2 {v6.8H, v7.8H}, [x8], #32 // ........................*......................................... - smlal v1.4S, v29.4H, v10.4H // ..........................*....................................... - smlal2 v22.4S, v29.8H, v10.8H // ...........................*...................................... 
- smlal2 v15.4S, v3.8H, v7.8H // ............................*..................................... - smlal v9.4S, v3.4H, v7.4H // .............................*.................................... - smlal v1.4S, v16.4H, v19.4H // ..............................*................................... - smlal2 v22.4S, v16.8H, v19.8H // ...............................*.................................. - ld1 {v5.8H}, [x6], #16 // ................................*................................. - smlal v9.4S, v4.4H, v6.4H // ..................................*............................... - smlal2 v15.4S, v4.8H, v6.8H // ...................................*.............................. - smlal v1.4S, v17.4H, v5.4H // ....................................*............................. - smlal2 v22.4S, v17.8H, v5.8H // .....................................*............................ - ld1 {v30.8H}, [x9], #16 // ......................................*........................... - smlal v1.4S, v3.4H, v6.4H // ........................................*......................... - smlal2 v22.4S, v3.8H, v6.8H // .........................................*........................ - ld2 {v11.8H, v12.8H}, [x1], #32 // ..........................................*....................... - smlal v1.4S, v4.4H, v30.4H // ............................................*..................... - smlal2 v22.4S, v4.8H, v30.8H // .............................................*.................... - uzp1 v21.8H, v9.8H, v15.8H // ..............................................*................... - ld1 {v14.8H}, [x6], #16 // ...............................................*.................. - uzp1 v13.8H, v1.8H, v22.8H // .................................................*................ - mul v10.8H, v21.8H, v2.H[2] // ..................................................*............... - mul v21.8H, v13.8H, v2.H[2] // ...................................................*.............. 
- ld2 {v23.8H, v24.8H}, [x2], #32 // ....................................................*............. - smlal2 v15.4S, v10.8H, v0.8H // ......................................................*........... - smlal2 v22.4S, v21.8H, v0.8H // .......................................................*.......... - smlal v1.4S, v21.4H, v0.4H // ........................................................*......... - smull2 v13.4S, v11.8H, v23.8H // .........................................................*........ - ld2 {v25.8H, v26.8H}, [x5], #32 // ..........................................................*....... - smlal v9.4S, v10.4H, v0.4H // ............................................................*..... - smlal2 v13.4S, v12.8H, v31.8H // .............................................................*.... - smull2 v27.4S, v11.8H, v24.8H // ..............................................................*... - uzp2 v16.8H, v1.8H, v22.8H // ...............................................................*.. - uzp2 v17.8H, v9.8H, v15.8H // ................................................................*. - ld2 {v6.8H, v7.8H}, [x4], #32 // .................................................................* - - // ----------------------- cycle (expected) ------------------------> - // 0 25 50 - // |------------------------|------------------------|--------------- - // ld2 {v23.8H, v24.8H}, [x2], #32 // *................................................................. - // ld2 {v11.8H, v12.8H}, [x1], #32 // ..*............................................................... - // ld2 {v6.8H, v7.8H}, [x4], #32 // ..............*................................................... - // ld1 {v31.8H}, [x3], #16 // ....*............................................................. - // ld2 {v25.8H, v26.8H}, [x5], #32 // ...........*...................................................... 
- // smull2 v13.4S, v11.8H, v23.8H // .....................*............................................ - // ld1 {v14.8H}, [x6], #16 // ................................*................................. - // smlal2 v13.4S, v12.8H, v31.8H // ...........................*...................................... - // smull2 v27.4S, v11.8H, v24.8H // ......*........................................................... - // smlal2 v13.4S, v6.8H, v25.8H // ...............................*.................................. - // smull v20.4S, v11.4H, v24.4H // .......*.......................................................... - // smull v17.4S, v11.4H, v23.4H // ....................*............................................. - // smlal2 v27.4S, v12.8H, v23.8H // ..........*....................................................... - // smlal2 v13.4S, v7.8H, v14.8H // .....................................*............................ - // smlal v20.4S, v12.4H, v23.4H // .............*.................................................... - // smlal v17.4S, v12.4H, v31.4H // ..........................*....................................... - // smlal2 v27.4S, v6.8H, v26.8H // ...................*.............................................. - // ld1 {v5.8H}, [x9], #16 // ......................................*........................... - // smlal v17.4S, v6.4H, v25.4H // ..............................*................................... - // smlal v20.4S, v6.4H, v26.4H // ..................*............................................... - // ld2 {v23.8H, v24.8H}, [x2], #32 // ....................................................*............. - // ld2 {v15.8H, v16.8H}, [x8], #32 // ........................*......................................... - // smlal v17.4S, v7.4H, v14.4H // ....................................*............................. - // ld2 {v8.8H, v9.8H}, [x7], #32 // ................*................................................. 
- // smlal v20.4S, v7.4H, v25.4H // ......................*........................................... - // smlal2 v27.4S, v7.8H, v25.8H // .......................*.......................................... - // smlal2 v13.4S, v8.8H, v15.8H // .........................................*........................ - // smlal v17.4S, v8.4H, v15.4H // ........................................*......................... - // smlal v20.4S, v8.4H, v16.4H // .............................*.................................... - // smlal2 v27.4S, v8.8H, v16.8H // ............................*..................................... - // smlal2 v13.4S, v9.8H, v5.8H // .............................................*.................... - // smlal v17.4S, v9.4H, v5.4H // ............................................*..................... - // smlal v20.4S, v9.4H, v15.4H // ..................................*............................... - // smlal2 v27.4S, v9.8H, v15.8H // ...................................*.............................. - // ld2 {v11.8H, v12.8H}, [x1], #32 // ..........................................*....................... - // uzp1 v31.8H, v17.8H, v13.8H // .................................................*................ - // uzp1 v8.8H, v20.8H, v27.8H // ..............................................*................... - // ld2 {v6.8H, v7.8H}, [x4], #32 // .................................................................* - // mul v18.8H, v31.8H, v2.H[2] // ...................................................*.............. - // mul v14.8H, v8.8H, v2.H[2] // ..................................................*............... - // ld1 {v31.8H}, [x3], #16 // ........*......................................................... - // smlal v17.4S, v18.4H, v0.4H // ........................................................*......... - // smlal2 v27.4S, v14.8H, v0.8H // ......................................................*........... 
- // smlal2 v13.4S, v18.8H, v0.8H // .......................................................*.......... - // smlal v20.4S, v14.4H, v0.4H // ............................................................*..... - // ld2 {v25.8H, v26.8H}, [x5], #32 // ..........................................................*....... - // uzp2 v16.8H, v17.8H, v13.8H // ...............................................................*.. - // smull2 v13.4S, v11.8H, v23.8H // .........................................................*........ - // uzp2 v17.8H, v20.8H, v27.8H // ................................................................*. - // ld1 {v14.8H}, [x6], #16 // ...............................................*.................. - // smlal2 v13.4S, v12.8H, v31.8H // .............................................................*.... - // smull2 v27.4S, v11.8H, v24.8H // ..............................................................*... + // Instructions: 75 + // Expected cycles: 103 + // Expected IPC: 0.73 + + // Cycle bound: 103.0 + // IPC bound: 0.73 + + // Wall time: 0.94s + // User time: 0.94s + + // --------------------------- original position ----------------------------> + // 0 25 50 + // |------------------------|------------------------| + ldr q7, [x2, #16] // *.......................................................................... + ldr q20, [x2], #32 // ..*........................................................................ + ldr q15, [x1, #16] // .*......................................................................... + uzp1 v8.8H, v20.8H, v7.8H // ...............*........................................................... + uzp2 v7.8H, v20.8H, v7.8H // ................*.......................................................... + ld1 {v20.8H}, [x3], #16 // ...*....................................................................... + ldr q30, [x1], #32 // ..............*............................................................ 
+ ldr q11, [x4], #32 // ....*...................................................................... + uzp1 v16.8H, v30.8H, v15.8H // .................*......................................................... + uzp2 v15.8H, v30.8H, v15.8H // ..................*........................................................ + smull v30.4S, v16.4H, v7.4H // ...................*....................................................... + smull2 v7.4S, v16.8H, v7.8H // ....................*...................................................... + smull v9.4S, v16.4H, v8.4H // .....................*..................................................... + smull2 v16.4S, v16.8H, v8.8H // ......................*.................................................... + smlal v30.4S, v15.4H, v8.4H // .......................*................................................... + smlal2 v7.4S, v15.8H, v8.8H // ........................*.................................................. + smlal v9.4S, v15.4H, v20.4H // .........................*................................................. + smlal2 v16.4S, v15.8H, v20.8H // ..........................*................................................ + ldr q20, [x4, #-16] // .....*..................................................................... + ldr q15, [x5], #32 // ......*.................................................................... + uzp1 v8.8H, v11.8H, v20.8H // ...........................*............................................... + uzp2 v20.8H, v11.8H, v20.8H // ............................*.............................................. + ldr q11, [x5, #-16] // .......*................................................................... + ld1 {v27.8H}, [x6], #16 // ........*.................................................................. + uzp1 v10.8H, v15.8H, v11.8H // .............................*............................................. 
+ uzp2 v15.8H, v15.8H, v11.8H // ..............................*............................................ + smlal v9.4S, v8.4H, v10.4H // ...............................*........................................... + smlal2 v16.4S, v8.8H, v10.8H // ................................*.......................................... + smlal v30.4S, v8.4H, v15.4H // .................................*......................................... + smlal2 v7.4S, v8.8H, v15.8H // ..................................*........................................ + smlal v9.4S, v20.4H, v27.4H // ...................................*....................................... + smlal2 v16.4S, v20.8H, v27.8H // ....................................*...................................... + smlal v30.4S, v20.4H, v10.4H // .....................................*..................................... + smlal2 v7.4S, v20.8H, v10.8H // ......................................*.................................... + ldr q20, [x7], #32 // .........*................................................................. + ldr q15, [x7, #-16] // ..........*................................................................ + ldr q8, [x8], #32 // ...........*............................................................... + uzp1 v11.8H, v20.8H, v15.8H // .......................................*................................... + uzp2 v20.8H, v20.8H, v15.8H // ........................................*.................................. + ldr q15, [x8, #-16] // ............*.............................................................. + ld1 {v27.8H}, [x9], #16 // .............*............................................................. + uzp1 v10.8H, v8.8H, v15.8H // .........................................*................................. + uzp2 v15.8H, v8.8H, v15.8H // ..........................................*................................ 
+ smlal v9.4S, v11.4H, v10.4H // ...........................................*............................... + smlal2 v16.4S, v11.8H, v10.8H // ............................................*.............................. + smlal v30.4S, v11.4H, v15.4H // .............................................*............................. + smlal2 v7.4S, v11.8H, v15.8H // ..............................................*............................ + smlal v9.4S, v20.4H, v27.4H // ...............................................*........................... + smlal2 v16.4S, v20.8H, v27.8H // ................................................*.......................... + smlal v30.4S, v20.4H, v10.4H // .................................................*......................... + smlal2 v7.4S, v20.8H, v10.8H // ..................................................*........................ + ldr q15, [x2], #32 // ...............................................................*........... + uzp1 v20.8H, v9.8H, v16.8H // ....................................................*...................... + uzp1 v8.8H, v30.8H, v7.8H // .....................................................*..................... + mul v20.8H, v20.8H, v2.8H // ......................................................*.................... + mul v8.8H, v8.8H, v2.8H // .......................................................*................... + ldr q21, [x4], #32 // .................................................................*......... + smlal v9.4S, v20.4H, v0.4H // ........................................................*.................. + smlal2 v16.4S, v20.8H, v0.8H // .........................................................*................. + smlal v30.4S, v8.4H, v0.4H // ..........................................................*................ + smlal2 v7.4S, v8.8H, v0.8H // ...........................................................*............... 
+ ldr q6, [x4, #-16] // ..................................................................*........ + uzp2 v27.8H, v9.8H, v16.8H // ............................................................*.............. + uzp2 v10.8H, v30.8H, v7.8H // .............................................................*............. + ldr q16, [x2, #-16] // ...................................................*....................... + ldr q30, [x1, #16] // ..............................................................*............ + ld1 {v9.8H}, [x3], #16 // ................................................................*.......... + ldr q1, [x5], #32 // ...................................................................*....... + ldr q12, [x5, #-16] // ....................................................................*...... + ld1 {v24.8H}, [x6], #16 // .....................................................................*..... + ldr q19, [x7], #32 // ......................................................................*.... + ldr q31, [x7, #-16] // .......................................................................*... + ldr q17, [x8], #32 // ........................................................................*.. + ldr q18, [x8, #-16] // .........................................................................*. + ld1 {v25.8H}, [x9], #16 // ..........................................................................* + + // ------------------------------ new position ------------------------------> + // 0 25 50 + // |------------------------|------------------------|------------------------ + // ldr q16, [x2, #16] // *.......................................................................... + // ldr q30, [x1, #16] // ..*........................................................................ + // ldr q15, [x2], #32 // .*......................................................................... 
+ // ld1 {v9.8H}, [x3], #16 // .....*..................................................................... + // ldr q21, [x4], #32 // .......*................................................................... + // ldr q6, [x4, #-16] // ..................*........................................................ + // ldr q1, [x5], #32 // ...................*....................................................... + // ldr q12, [x5, #-16] // ......................*.................................................... + // ld1 {v24.8H}, [x6], #16 // .......................*................................................... + // ldr q19, [x7], #32 // ..................................*........................................ + // ldr q31, [x7, #-16] // ...................................*....................................... + // ldr q17, [x8], #32 // ....................................*...................................... + // ldr q18, [x8, #-16] // .......................................*................................... + // ld1 {v25.8H}, [x9], #16 // ........................................*.................................. + // ldr q20, [x1], #32 // ......*.................................................................... + // uzp1 v7.8H, v15.8H, v16.8H // ...*....................................................................... + // uzp2 v15.8H, v15.8H, v16.8H // ....*...................................................................... + // uzp1 v8.8H, v20.8H, v30.8H // ........*.................................................................. + // uzp2 v20.8H, v20.8H, v30.8H // .........*................................................................. + // smull v30.4S, v8.4H, v15.4H // ..........*................................................................ + // smull2 v15.4S, v8.8H, v15.8H // ...........*............................................................... 
+ // smull v11.4S, v8.4H, v7.4H // ............*.............................................................. + // smull2 v8.4S, v8.8H, v7.8H // .............*............................................................. + // smlal v30.4S, v20.4H, v7.4H // ..............*............................................................ + // smlal2 v15.4S, v20.8H, v7.8H // ...............*........................................................... + // smlal v11.4S, v20.4H, v9.4H // ................*.......................................................... + // smlal2 v8.4S, v20.8H, v9.8H // .................*......................................................... + // uzp1 v7.8H, v21.8H, v6.8H // ....................*...................................................... + // uzp2 v20.8H, v21.8H, v6.8H // .....................*..................................................... + // uzp1 v16.8H, v1.8H, v12.8H // ........................*.................................................. + // uzp2 v9.8H, v1.8H, v12.8H // .........................*................................................. + // smlal v11.4S, v7.4H, v16.4H // ..........................*................................................ + // smlal2 v8.4S, v7.8H, v16.8H // ...........................*............................................... + // smlal v30.4S, v7.4H, v9.4H // ............................*.............................................. + // smlal2 v15.4S, v7.8H, v9.8H // .............................*............................................. + // smlal v11.4S, v20.4H, v24.4H // ..............................*............................................ + // smlal2 v8.4S, v20.8H, v24.8H // ...............................*........................................... + // smlal v30.4S, v20.4H, v16.4H // ................................*.......................................... 
+ // smlal2 v15.4S, v20.8H, v16.8H // .................................*......................................... + // uzp1 v7.8H, v19.8H, v31.8H // .....................................*..................................... + // uzp2 v20.8H, v19.8H, v31.8H // ......................................*.................................... + // uzp1 v16.8H, v17.8H, v18.8H // .........................................*................................. + // uzp2 v9.8H, v17.8H, v18.8H // ..........................................*................................ + // smlal v11.4S, v7.4H, v16.4H // ...........................................*............................... + // smlal2 v8.4S, v7.8H, v16.8H // ............................................*.............................. + // smlal v30.4S, v7.4H, v9.4H // .............................................*............................. + // smlal2 v15.4S, v7.8H, v9.8H // ..............................................*............................ + // smlal v11.4S, v20.4H, v25.4H // ...............................................*........................... + // smlal2 v8.4S, v20.8H, v25.8H // ................................................*.......................... + // smlal v30.4S, v20.4H, v16.4H // .................................................*......................... + // smlal2 v15.4S, v20.8H, v16.8H // ..................................................*........................ + // ldr q16, [x2, #16] // ................................................................*.......... + // uzp1 v7.8H, v11.8H, v8.8H // ....................................................*...................... + // uzp1 v20.8H, v30.8H, v15.8H // .....................................................*..................... + // mul v7.8H, v7.8H, v2.8H // ......................................................*.................... 
+ // mul v20.8H, v20.8H, v2.8H // .......................................................*................... + // smlal v11.4S, v7.4H, v0.4H // .........................................................*................. + // smlal2 v8.4S, v7.8H, v0.8H // ..........................................................*................ + // smlal v30.4S, v20.4H, v0.4H // ...........................................................*............... + // smlal2 v15.4S, v20.8H, v0.8H // ............................................................*.............. + // uzp2 v27.8H, v11.8H, v8.8H // ..............................................................*............ + // uzp2 v10.8H, v30.8H, v15.8H // ...............................................................*........... + // ldr q30, [x1, #16] // .................................................................*......... + // ldr q15, [x2], #32 // ...................................................*....................... + // ld1 {v9.8H}, [x3], #16 // ..................................................................*........ + // ldr q21, [x4], #32 // ........................................................*.................. + // ldr q6, [x4, #-16] // .............................................................*............. + // ldr q1, [x5], #32 // ...................................................................*....... + // ldr q12, [x5, #-16] // ....................................................................*...... + // ld1 {v24.8H}, [x6], #16 // .....................................................................*..... + // ldr q19, [x7], #32 // ......................................................................*.... + // ldr q31, [x7, #-16] // .......................................................................*... + // ldr q17, [x8], #32 // ........................................................................*.. 
+ // ldr q18, [x8, #-16] // .........................................................................*. + // ld1 {v25.8H}, [x9], #16 // ..........................................................................* sub count, count, #2 -k3_loop_start: - // Instructions: 44 - // Expected cycles: 54 - // Expected IPC: 0.81 - - // Cycle bound: 54.0 - // IPC bound: 0.81 - - // Wall time: 4.51s - // User time: 4.51s - - // ----------------- cycle (expected) ------------------> - // 0 25 50 - // |------------------------|------------------------|--- - st2 {v16.8H, v17.8H}, [x0], #32 // l..................................................... - smlal2 v13.4S, v6.8H, v25.8H // ..*................................................... - smull v20.4S, v11.4H, v24.4H // ...*.................................................. - smull v17.4S, v11.4H, v23.4H // ....*................................................. - smlal2 v27.4S, v12.8H, v23.8H // .....*................................................ - smlal2 v13.4S, v7.8H, v14.8H // ......*............................................... - smlal v20.4S, v12.4H, v23.4H // .......*.............................................. - smlal v17.4S, v12.4H, v31.4H // ........*............................................. - smlal2 v27.4S, v6.8H, v26.8H // .........*............................................ - ld1 {v5.8H}, [x9], #16 // ..........*........................................... - smlal v17.4S, v6.4H, v25.4H // ............*......................................... - smlal v20.4S, v6.4H, v26.4H // .............*........................................ - ld2 {v23.8H, v24.8H}, [x2], #32 // ..............e....................................... - ld2 {v15.8H, v16.8H}, [x8], #32 // ................*..................................... - smlal v17.4S, v7.4H, v14.4H // ..................*................................... - ld2 {v8.8H, v9.8H}, [x7], #32 // ...................*.................................. 
- smlal v20.4S, v7.4H, v25.4H // .....................*................................ - smlal2 v27.4S, v7.8H, v25.8H // ......................*............................... - smlal2 v13.4S, v8.8H, v15.8H // .......................*.............................. - smlal v17.4S, v8.4H, v15.4H // ........................*............................. - smlal v20.4S, v8.4H, v16.4H // .........................*............................ - smlal2 v27.4S, v8.8H, v16.8H // ..........................*........................... - smlal2 v13.4S, v9.8H, v5.8H // ...........................*.......................... - smlal v17.4S, v9.4H, v5.4H // ............................*......................... - smlal v20.4S, v9.4H, v15.4H // .............................*........................ - smlal2 v27.4S, v9.8H, v15.8H // ..............................*....................... - ld2 {v11.8H, v12.8H}, [x1], #32 // ...............................e...................... - uzp1 v31.8H, v17.8H, v13.8H // .................................*.................... - uzp1 v8.8H, v20.8H, v27.8H // ..................................*................... - ld2 {v6.8H, v7.8H}, [x4], #32 // ...................................e.................. - mul v18.8H, v31.8H, v2.H[2] // .....................................*................ - mul v14.8H, v8.8H, v2.H[2] // ......................................*............... - ld1 {v31.8H}, [x3], #16 // .......................................e.............. - smlal v17.4S, v18.4H, v0.4H // .........................................*............ - smlal2 v27.4S, v14.8H, v0.8H // ..........................................*........... - smlal2 v13.4S, v18.8H, v0.8H // ...........................................*.......... - smlal v20.4S, v14.4H, v0.4H // ............................................*......... - ld2 {v25.8H, v26.8H}, [x5], #32 // .............................................e........ 
- uzp2 v16.8H, v17.8H, v13.8H // ...............................................*...... - smull2 v13.4S, v11.8H, v23.8H // ................................................e..... - uzp2 v17.8H, v20.8H, v27.8H // .................................................*.... - ld1 {v14.8H}, [x6], #16 // ..................................................e... - smlal2 v13.4S, v12.8H, v31.8H // ....................................................e. - smull2 v27.4S, v11.8H, v24.8H // .....................................................e - - // -------------------------------------- cycle (expected) --------------------------------------> - // 0 25 50 75 - // |------------------------|------------------------|------------------------|------------------- - // ld2 {v3.8h, v4.8h}, [x1], #32 // .................e......................'..............................~....................... - // ld2 {v5.8h, v6.8h}, [x2], #32 // e.......................................'.............~........................................ - // ld1 {v7.8h}, [x3], #16 // .........................e..............'......................................~............... - // smull v8.4s, v3.4h, v5.4h // ........................................'...*.................................................. - // smull2 v10.4s, v3.8h, v5.8h // ..................................e.....'...............................................~...... - // smlal v8.4s, v4.4h, v7.4h // ........................................'.......*.............................................. - // smlal2 v10.4s, v4.8h, v7.8h // ......................................e.'...................................................~.. - // smull v9.4s, v3.4h, v6.4h // ........................................'..*................................................... - // smull2 v11.4s, v3.8h, v6.8h // .......................................e'....................................................~. 
- // smlal v9.4s, v4.4h, v5.4h // ........................................'......*............................................... - // smlal2 v11.4s, v4.8h, v5.8h // ........................................'....*................................................. - // ld2 {v3.8h, v4.8h}, [x4], #32 // .....................e..................'..................................~................... - // ld2 {v5.8h, v6.8h}, [x5], #32 // ...............................e........'............................................~......... - // ld1 {v7.8h}, [x6], #16 // ....................................e...'.................................................~.... - // smlal v8.4s, v3.4h, v5.4h // ........................................'...........*.......................................... - // smlal2 v10.4s, v3.8h, v5.8h // ........................................'.*.................................................... - // smlal v8.4s, v4.4h, v7.4h // ....~...................................'.................*.................................... - // smlal2 v10.4s, v4.8h, v7.8h // ........................................'.....*................................................ - // smlal v9.4s, v3.4h, v6.4h // ........................................'............*......................................... - // smlal2 v11.4s, v3.8h, v6.8h // ........................................'........*............................................. - // smlal v9.4s, v4.4h, v5.4h // .......~................................'....................*................................. - // smlal2 v11.4s, v4.8h, v5.8h // ........~...............................'.....................*................................ - // ld2 {v3.8h, v4.8h}, [x7], #32 // .....~..................................'..................*................................... - // ld2 {v5.8h, v6.8h}, [x8], #32 // ..~.....................................'...............*...................................... 
- // ld1 {v7.8h}, [x9], #16 // ........................................'.........*............................................ - // smlal v8.4s, v3.4h, v5.4h // ..........~.............................'.......................*.............................. - // smlal2 v10.4s, v3.8h, v5.8h // .........~..............................'......................*............................... - // smlal v8.4s, v4.4h, v7.4h // ..............~.........................'...........................*.......................... - // smlal2 v10.4s, v4.8h, v7.8h // .............~..........................'..........................*........................... - // smlal v9.4s, v3.4h, v6.4h // ...........~............................'........................*............................. - // smlal2 v11.4s, v3.8h, v6.8h // ............~...........................'.........................*............................ - // smlal v9.4s, v4.4h, v5.4h // ...............~........................'............................*......................... - // smlal2 v11.4s, v4.8h, v5.8h // ................~.......................'.............................*........................ - // uzp1 v28.8h, v8.8h, v10.8h // ...................~....................'................................*..................... - // mul v28.8h, v28.8h, v2.h[2] // .......................~................'....................................*................. - // smlal v8.4s, v28.4h, v0.4h // ...........................~............'........................................*............. - // smlal2 v10.4s, v28.8h, v0.8h // .............................~..........'..........................................*........... - // uzp2 v26.8h, v8.8h, v10.8h // .................................~......'..............................................*....... - // uzp1 v28.8h, v9.8h, v11.8h // ....................~...................'.................................*.................... 
- // mul v28.8h, v28.8h, v2.h[2] // ........................~...............'.....................................*................ - // smlal v9.4s, v28.4h, v0.4h // ..............................~.........'...........................................*.......... - // smlal2 v11.4s, v28.8h, v0.8h // ............................~...........'.........................................*............ - // uzp2 v27.8h, v9.8h, v11.8h // ...................................~....'................................................*..... - // st2 {v26.8h, v27.8h}, [x0], #32 // ........................................~.....................................................l +1: + // Instructions: 65 + // Expected cycles: 80 + // Expected IPC: 0.81 + + // Cycle bound: 80.0 + // IPC bound: 0.81 + + // Wall time: 11.64s + // User time: 11.64s + + // ---------------------- original position -----------------------> + // 0 25 50 + // |------------------------|------------------------|-------------- + ldr q20, [x1], #32 // *................................................................ + uzp1 v7.8H, v15.8H, v16.8H // ......*.......................................................... + uzp2 v15.8H, v15.8H, v16.8H // .......*......................................................... + uzp1 v8.8H, v20.8H, v30.8H // ..*.............................................................. + uzp2 v20.8H, v20.8H, v30.8H // ...*............................................................. + smull v30.4S, v8.4H, v15.4H // .............*................................................... + smull2 v15.4S, v8.8H, v15.8H // ..............*.................................................. + smull v11.4S, v8.4H, v7.4H // .........*....................................................... + smull2 v8.4S, v8.8H, v7.8H // ..........*...................................................... + smlal v30.4S, v20.4H, v7.4H // ...............*................................................. 
+ smlal2 v15.4S, v20.8H, v7.8H // ................*................................................ + smlal v11.4S, v20.4H, v9.4H // ...........*..................................................... + smlal2 v8.4S, v20.8H, v9.8H // ............*.................................................... + uzp1 v7.8H, v21.8H, v6.8H // ...................*............................................. + uzp2 v20.8H, v21.8H, v6.8H // ....................*............................................ + uzp1 v16.8H, v1.8H, v12.8H // .......................*......................................... + uzp2 v9.8H, v1.8H, v12.8H // ........................*........................................ + smlal v11.4S, v7.4H, v16.4H // ..........................*...................................... + smlal2 v8.4S, v7.8H, v16.8H // ...........................*..................................... + smlal v30.4S, v7.4H, v9.4H // ..............................*.................................. + smlal2 v15.4S, v7.8H, v9.8H // ...............................*................................. + smlal v11.4S, v20.4H, v24.4H // ............................*.................................... + smlal2 v8.4S, v20.8H, v24.8H // .............................*................................... + smlal v30.4S, v20.4H, v16.4H // ................................*................................ + smlal2 v15.4S, v20.8H, v16.8H // .................................*............................... + uzp1 v7.8H, v19.8H, v31.8H // ....................................*............................ + uzp2 v20.8H, v19.8H, v31.8H // .....................................*........................... + uzp1 v16.8H, v17.8H, v18.8H // ........................................*........................ + uzp2 v9.8H, v17.8H, v18.8H // .........................................*....................... + smlal v11.4S, v7.4H, v16.4H // ...........................................*..................... 
+ smlal2 v8.4S, v7.8H, v16.8H // ............................................*.................... + smlal v30.4S, v7.4H, v9.4H // ...............................................*................. + smlal2 v15.4S, v7.8H, v9.8H // ................................................*................ + smlal v11.4S, v20.4H, v25.4H // .............................................*................... + smlal2 v8.4S, v20.8H, v25.8H // ..............................................*.................. + smlal v30.4S, v20.4H, v16.4H // .................................................*............... + smlal2 v15.4S, v20.8H, v16.8H // ..................................................*.............. + ldr q16, [x2, #16] // .....e........................................................... + uzp1 v7.8H, v11.8H, v8.8H // ...................................................*............. + uzp1 v20.8H, v30.8H, v15.8H // ........................................................*........ + mul v7.8H, v7.8H, v2.8H // ....................................................*............ + mul v20.8H, v20.8H, v2.8H // .........................................................*....... + zip2 v9.8H, v27.8H, v10.8H // ..............................................................l.. + zip1 v27.8H, v27.8H, v10.8H // .............................................................l... + smlal v11.4S, v7.4H, v0.4H // .....................................................*........... + smlal2 v8.4S, v7.8H, v0.8H // ......................................................*.......... + smlal v30.4S, v20.4H, v0.4H // ..........................................................*...... + smlal2 v15.4S, v20.8H, v0.8H // ...........................................................*..... + str q27, [x0], #32 // ...............................................................l. + uzp2 v27.8H, v11.8H, v8.8H // .......................................................*......... 
+ str q9, [x0, #-16] // ................................................................l + uzp2 v10.8H, v30.8H, v15.8H // ............................................................*.... + ldr q30, [x1, #16] // .e............................................................... + ldr q15, [x2], #32 // ....e............................................................ + ld1 {v9.8H}, [x3], #16 // ........e........................................................ + ldr q21, [x4], #32 // .................e............................................... + ldr q6, [x4, #-16] // ..................e.............................................. + ldr q1, [x5], #32 // .....................e........................................... + ldr q12, [x5, #-16] // ......................e.......................................... + ld1 {v24.8H}, [x6], #16 // .........................e....................................... + ldr q19, [x7], #32 // ..................................e.............................. + ldr q31, [x7, #-16] // ...................................e............................. + ldr q17, [x8], #32 // ......................................e.......................... + ldr q18, [x8, #-16] // .......................................e......................... + ld1 {v25.8H}, [x9], #16 // ..........................................e...................... + + // ---------------------------------------------------------------- new position -----------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------ + // ldr q12, [x1], #32 // ............................*................................................................~.................................................. 
+ // ldr q13, [x1, #-16] // ...............e............'...................................................~............'.................................................. + // uzp1 v3.8h, v12.8h, v13.8h // ............................'..*.............................................................'..~............................................... + // uzp2 v4.8h, v12.8h, v13.8h // ............................'...*............................................................'...~.............................................. + // ldr q12, [x2], #32 // ................e...........'....................................................~...........'.................................................. + // ldr q13, [x2, #-16] // e...........................'....................................~...........................'....................................~............. + // uzp1 v5.8h, v12.8h, v13.8h // ............................'*...............................................................'~................................................. + // uzp2 v6.8h, v12.8h, v13.8h // ............................'.*..............................................................'.~................................................ + // ld1 {v7.8h}, [x3], #16 // .................e..........'.....................................................~..........'.................................................. + // smull v8.4s, v3.4h, v5.4h // ............................'......*.........................................................'......~........................................... + // smull2 v10.4s, v3.8h, v5.8h // ............................'.......*........................................................'.......~.......................................... + // smlal v8.4s, v4.4h, v7.4h // ............................'..........*.....................................................'..........~....................................... 
+ // smlal2 v10.4s, v4.8h, v7.8h // ............................'...........*....................................................'...........~...................................... + // smull v9.4s, v3.4h, v6.4h // ............................'....*...........................................................'....~............................................. + // smull2 v11.4s, v3.8h, v6.8h // ............................'.....*..........................................................'.....~............................................ + // smlal v9.4s, v4.4h, v5.4h // ............................'........*.......................................................'........~......................................... + // smlal2 v11.4s, v4.8h, v5.8h // ............................'.........*......................................................'.........~........................................ + // ldr q12, [x4], #32 // ..................e.........'......................................................~.........'.................................................. + // ldr q13, [x4, #-16] // ...................e........'.......................................................~........'.................................................. + // uzp1 v3.8h, v12.8h, v13.8h // ............................'............*...................................................'............~..................................... + // uzp2 v4.8h, v12.8h, v13.8h // ............................'.............*..................................................'.............~.................................... + // ldr q12, [x5], #32 // ....................e.......'........................................................~.......'.................................................. + // ldr q13, [x5, #-16] // .....................e......'.........................................................~......'.................................................. 
+ // uzp1 v5.8h, v12.8h, v13.8h // ............................'..............*.................................................'..............~................................... + // uzp2 v6.8h, v12.8h, v13.8h // ............................'...............*................................................'...............~.................................. + // ld1 {v7.8h}, [x6], #16 // ......................e.....'..........................................................~.....'.................................................. + // smlal v8.4s, v3.4h, v5.4h // ............................'................*...............................................'................~................................. + // smlal2 v10.4s, v3.8h, v5.8h // ............................'.................*..............................................'.................~................................ + // smlal v8.4s, v4.4h, v7.4h // ............................'....................*...........................................'....................~............................. + // smlal2 v10.4s, v4.8h, v7.8h // ............................'.....................*..........................................'.....................~............................ + // smlal v9.4s, v3.4h, v6.4h // ............................'..................*.............................................'..................~............................... + // smlal2 v11.4s, v3.8h, v6.8h // ............................'...................*............................................'...................~.............................. + // smlal v9.4s, v4.4h, v5.4h // ............................'......................*.........................................'......................~........................... + // smlal2 v11.4s, v4.8h, v5.8h // ............................'.......................*........................................'.......................~.......................... 
+ // ldr q12, [x7], #32 // .......................e....'...........................................................~....'.................................................. + // ldr q13, [x7, #-16] // ........................e...'............................................................~...'.................................................. + // uzp1 v3.8h, v12.8h, v13.8h // ............................'........................*.......................................'........................~......................... + // uzp2 v4.8h, v12.8h, v13.8h // ............................'.........................*......................................'.........................~........................ + // ldr q12, [x8], #32 // .........................e..'.............................................................~..'.................................................. + // ldr q13, [x8, #-16] // ..........................e.'..............................................................~.'.................................................. + // uzp1 v5.8h, v12.8h, v13.8h // ............................'..........................*.....................................'..........................~....................... + // uzp2 v6.8h, v12.8h, v13.8h // ............................'...........................*....................................'...........................~...................... + // ld1 {v7.8h}, [x9], #16 // ...........................e'...............................................................~'.................................................. + // smlal v8.4s, v3.4h, v5.4h // ............................'............................*...................................'............................~..................... + // smlal2 v10.4s, v3.8h, v5.8h // ............................'.............................*..................................'.............................~.................... 
+ // smlal v8.4s, v4.4h, v7.4h // ............................'................................*...............................'................................~................. + // smlal2 v10.4s, v4.8h, v7.8h // ............................'.................................*..............................'.................................~................ + // smlal v9.4s, v3.4h, v6.4h // ............................'..............................*.................................'..............................~................... + // smlal2 v11.4s, v3.8h, v6.8h // ............................'...............................*................................'...............................~.................. + // smlal v9.4s, v4.4h, v5.4h // ............................'..................................*.............................'..................................~............... + // smlal2 v11.4s, v4.8h, v5.8h // ............................'...................................*............................'...................................~.............. + // uzp1 v28.8h, v8.8h, v10.8h // .~..........................'.....................................*..........................'.....................................~............ + // mul v28.8h, v28.8h, v2.8h // ...~........................'.......................................*........................'.......................................~.......... + // smlal v8.4s, v28.4h, v0.4h // .......~....................'...........................................*....................'...........................................~...... + // smlal2 v10.4s, v28.8h, v0.8h // ........~...................'............................................*...................'............................................~..... + // uzp2 v26.8h, v8.8h, v10.8h // ............~...............'................................................*...............'................................................~. 
+ // uzp1 v28.8h, v9.8h, v11.8h // ..~.........................'......................................*.........................'......................................~........... + // mul v28.8h, v28.8h, v2.8h // ....~.......................'........................................*.......................'........................................~......... + // smlal v9.4s, v28.4h, v0.4h // .........~..................'.............................................*..................'.............................................~.... + // smlal2 v11.4s, v28.8h, v0.8h // ..........~.................'..............................................*.................'..............................................~... + // uzp2 v27.8h, v9.8h, v11.8h // ..............~.............'..................................................*.............'.................................................. + // zip1 v12.8h, v26.8h, v27.8h // ......~.....................'..........................................~.....................'..........................................l....... + // zip2 v13.8h, v26.8h, v27.8h // .....~......................'.........................................~......................'.........................................l........ + // str q12, [x0], #32 // ...........~................'...............................................~................'...............................................l.. 
+ // str q13, [x0, #-16] // .............~..............'.................................................~..............'.................................................l sub count, count, #1 - cbnz count, k3_loop_start - // Instructions: 36 - // Expected cycles: 45 - // Expected IPC: 0.80 - - // Cycle bound: 45.0 - // IPC bound: 0.80 - - // Wall time: 0.30s - // User time: 0.30s - - // ------------- cycle (expected) -------------> - // 0 25 - // |------------------------|------------------- - ld2 {v21.8H, v22.8H}, [x8], #32 // *............................................ - smull v10.4S, v11.4H, v24.4H // ..*.......................................... - smlal2 v13.4S, v6.8H, v25.8H // ...*......................................... - smlal2 v27.4S, v12.8H, v23.8H // ....*........................................ - smull v18.4S, v11.4H, v23.4H // .....*....................................... - smlal v10.4S, v12.4H, v23.4H // ......*...................................... - ld2 {v29.8H, v30.8H}, [x7], #32 // .......*..................................... - smlal2 v27.4S, v6.8H, v26.8H // .........*................................... - smlal v10.4S, v6.4H, v26.4H // ..........*.................................. - smlal v18.4S, v12.4H, v31.4H // ...........*................................. - smlal2 v13.4S, v7.8H, v14.8H // ............*................................ - smlal2 v27.4S, v7.8H, v25.8H // .............*............................... - smlal v10.4S, v7.4H, v25.4H // ..............*.............................. - smlal v18.4S, v6.4H, v25.4H // ...............*............................. - smlal2 v13.4S, v29.8H, v21.8H // ................*............................ - smlal2 v27.4S, v29.8H, v22.8H // .................*........................... - smlal v10.4S, v29.4H, v22.4H // ..................*.......................... - smlal v18.4S, v7.4H, v14.4H // ...................*......................... 
- ld1 {v3.8H}, [x9], #16 // ....................*........................ - smlal v10.4S, v30.4H, v21.4H // ......................*...................... - smlal v18.4S, v29.4H, v21.4H // .......................*..................... - smlal2 v27.4S, v30.8H, v21.8H // ........................*.................... - smlal2 v13.4S, v30.8H, v3.8H // .........................*................... - smlal v18.4S, v30.4H, v3.4H // ...........................*................. - uzp1 v8.8H, v10.8H, v27.8H // ............................*................ - mul v23.8H, v8.8H, v2.H[2] // ..............................*.............. - uzp1 v30.8H, v18.8H, v13.8H // ...............................*............. - mul v19.8H, v30.8H, v2.H[2] // .................................*........... - smlal2 v27.4S, v23.8H, v0.8H // ..................................*.......... - smlal v10.4S, v23.4H, v0.4H // ...................................*......... - smlal v18.4S, v19.4H, v0.4H // .....................................*....... - smlal2 v13.4S, v19.8H, v0.8H // ......................................*...... - st2 {v16.8H, v17.8H}, [x0], #32 // .......................................*..... - uzp2 v12.8H, v10.8H, v27.8H // .........................................*... - uzp2 v11.8H, v18.8H, v13.8H // ..........................................*.. - st2 {v11.8H, v12.8H}, [x0], #32 // ............................................* - - // ------------- cycle (expected) -------------> - // 0 25 - // |------------------------|------------------- - // st2 {v16.8H, v17.8H}, [x0], #32 // .......................................*..... - // smlal2 v13.4S, v6.8H, v25.8H // ...*......................................... - // smull v20.4S, v11.4H, v24.4H // ..*.......................................... - // smull v17.4S, v11.4H, v23.4H // .....*....................................... - // smlal2 v27.4S, v12.8H, v23.8H // ....*........................................ 
- // smlal2 v13.4S, v7.8H, v14.8H // ............*................................ - // smlal v20.4S, v12.4H, v23.4H // ......*...................................... - // smlal v17.4S, v12.4H, v31.4H // ...........*................................. - // smlal2 v27.4S, v6.8H, v26.8H // .........*................................... - // ld1 {v5.8H}, [x9], #16 // ....................*........................ - // smlal v17.4S, v6.4H, v25.4H // ...............*............................. - // smlal v20.4S, v6.4H, v26.4H // ..........*.................................. - // ld2 {v15.8H, v16.8H}, [x8], #32 // *............................................ - // smlal v17.4S, v7.4H, v14.4H // ...................*......................... - // ld2 {v8.8H, v9.8H}, [x7], #32 // .......*..................................... - // smlal v20.4S, v7.4H, v25.4H // ..............*.............................. - // smlal2 v27.4S, v7.8H, v25.8H // .............*............................... - // smlal2 v13.4S, v8.8H, v15.8H // ................*............................ - // smlal v17.4S, v8.4H, v15.4H // .......................*..................... - // smlal v20.4S, v8.4H, v16.4H // ..................*.......................... - // smlal2 v27.4S, v8.8H, v16.8H // .................*........................... - // smlal2 v13.4S, v9.8H, v5.8H // .........................*................... - // smlal v17.4S, v9.4H, v5.4H // ...........................*................. - // smlal v20.4S, v9.4H, v15.4H // ......................*...................... - // smlal2 v27.4S, v9.8H, v15.8H // ........................*.................... - // uzp1 v31.8H, v17.8H, v13.8H // ...............................*............. - // uzp1 v8.8H, v20.8H, v27.8H // ............................*................ - // mul v18.8H, v31.8H, v2.H[2] // .................................*........... - // mul v14.8H, v8.8H, v2.H[2] // ..............................*.............. 
- // smlal v17.4S, v18.4H, v0.4H // .....................................*....... - // smlal2 v27.4S, v14.8H, v0.8H // ..................................*.......... - // smlal2 v13.4S, v18.8H, v0.8H // ......................................*...... - // smlal v20.4S, v14.4H, v0.4H // ...................................*......... - // uzp2 v16.8H, v17.8H, v13.8H // ..........................................*.. - // uzp2 v17.8H, v20.8H, v27.8H // .........................................*... - // st2 {v16.8H, v17.8H}, [x0], #32 // ............................................* + cbnz count, 1b + // Instructions: 55 + // Expected cycles: 61 + // Expected IPC: 0.90 + + // Cycle bound: 61.0 + // IPC bound: 0.90 + + // Wall time: 8.41s + // User time: 8.41s + + // ----------------- original position ------------------> + // 0 25 50 + // |------------------------|------------------------|---- + ldr q7, [x1], #32 // *...................................................... + uzp1 v20.8H, v15.8H, v16.8H // .*..................................................... + uzp2 v15.8H, v15.8H, v16.8H // ..*.................................................... + uzp1 v23.8H, v7.8H, v30.8H // ...*................................................... + uzp2 v11.8H, v7.8H, v30.8H // ....*.................................................. + smull2 v8.4S, v23.8H, v20.8H // ........*.............................................. + smull v5.4S, v23.4H, v20.4H // .......*............................................... + smull2 v30.4S, v23.8H, v15.8H // ......*................................................ + uzp1 v28.8H, v1.8H, v12.8H // ...............*....................................... + smlal2 v8.4S, v11.8H, v9.8H // ............*.......................................... + smlal v5.4S, v11.4H, v9.4H // ...........*........................................... + uzp1 v3.8H, v21.8H, v6.8H // .............*......................................... 
+ smull v16.4S, v23.4H, v15.4H // .....*................................................. + smlal2 v8.4S, v3.8H, v28.8H // ..................*.................................... + smlal v5.4S, v3.4H, v28.4H // .................*..................................... + uzp2 v29.8H, v21.8H, v6.8H // ..............*........................................ + uzp1 v7.8H, v17.8H, v18.8H // ...........................*........................... + smlal2 v8.4S, v29.8H, v24.8H // ......................*................................ + uzp1 v14.8H, v19.8H, v31.8H // .........................*............................. + smlal v16.4S, v11.4H, v20.4H // .........*............................................. + smlal2 v30.4S, v11.8H, v20.8H // ..........*............................................ + smlal2 v8.4S, v14.8H, v7.8H // ..............................*........................ + uzp2 v20.8H, v1.8H, v12.8H // ................*...................................... + uzp2 v21.8H, v19.8H, v31.8H // ..........................*............................ + smlal2 v30.4S, v3.8H, v20.8H // ....................*.................................. + smlal v16.4S, v3.4H, v20.4H // ...................*................................... + smlal v5.4S, v29.4H, v24.4H // .....................*................................. + uzp2 v9.8H, v17.8H, v18.8H // ............................*.......................... + smlal2 v30.4S, v29.8H, v28.8H // ........................*.............................. + smlal v16.4S, v29.4H, v28.4H // .......................*............................... + smlal v5.4S, v14.4H, v7.4H // .............................*......................... + smlal2 v8.4S, v21.8H, v25.8H // ..................................*.................... + smlal2 v30.4S, v14.8H, v9.8H // ................................*...................... + smlal v16.4S, v14.4H, v9.4H // ...............................*....................... 
+ smlal v5.4S, v21.4H, v25.4H // .................................*..................... + zip1 v20.8H, v27.8H, v10.8H // ..........................................*............ + smlal2 v30.4S, v21.8H, v7.8H // ....................................*.................. + smlal v16.4S, v21.4H, v7.4H // ...................................*................... + uzp1 v7.8H, v5.8H, v8.8H // .....................................*................. + str q20, [x0], #32 // ...............................................*....... + mul v15.8H, v7.8H, v2.8H // .......................................*............... + uzp1 v7.8H, v16.8H, v30.8H // ......................................*................ + zip2 v31.8H, v27.8H, v10.8H // .........................................*............. + mul v20.8H, v7.8H, v2.8H // ........................................*.............. + smlal v5.4S, v15.4H, v0.4H // ...........................................*........... + smlal2 v8.4S, v15.8H, v0.8H // ............................................*.......... + str q31, [x0, #-16] // .................................................*..... + smlal2 v30.4S, v20.8H, v0.8H // ..............................................*........ + smlal v16.4S, v20.4H, v0.4H // .............................................*......... + uzp2 v15.8H, v5.8H, v8.8H // ................................................*...... + uzp2 v20.8H, v16.8H, v30.8H // ..................................................*.... + zip1 v7.8H, v15.8H, v20.8H // ....................................................*.. + zip2 v20.8H, v15.8H, v20.8H // ...................................................*... + str q7, [x0], #32 // .....................................................*. 
+ str q20, [x0, #-16] // ......................................................* + + // -------------------- new position --------------------> + // 0 25 50 + // |------------------------|------------------------|---- + // ldr q20, [x1], #32 // *...................................................... + // uzp1 v7.8H, v15.8H, v16.8H // .*..................................................... + // uzp2 v15.8H, v15.8H, v16.8H // ..*.................................................... + // uzp1 v8.8H, v20.8H, v30.8H // ...*................................................... + // uzp2 v20.8H, v20.8H, v30.8H // ....*.................................................. + // smull v30.4S, v8.4H, v15.4H // ............*.......................................... + // smull2 v15.4S, v8.8H, v15.8H // .......*............................................... + // smull v11.4S, v8.4H, v7.4H // ......*................................................ + // smull2 v8.4S, v8.8H, v7.8H // .....*................................................. + // smlal v30.4S, v20.4H, v7.4H // ...................*................................... + // smlal2 v15.4S, v20.8H, v7.8H // ....................*.................................. + // smlal v11.4S, v20.4H, v9.4H // ..........*............................................ + // smlal2 v8.4S, v20.8H, v9.8H // .........*............................................. + // uzp1 v7.8H, v21.8H, v6.8H // ...........*........................................... + // uzp2 v20.8H, v21.8H, v6.8H // ...............*....................................... + // uzp1 v16.8H, v1.8H, v12.8H // ........*.............................................. + // uzp2 v9.8H, v1.8H, v12.8H // ......................*................................ + // smlal v11.4S, v7.4H, v16.4H // ..............*........................................ + // smlal2 v8.4S, v7.8H, v16.8H // .............*......................................... 
+ // smlal v30.4S, v7.4H, v9.4H // .........................*............................. + // smlal2 v15.4S, v7.8H, v9.8H // ........................*.............................. + // smlal v11.4S, v20.4H, v24.4H // ..........................*............................ + // smlal2 v8.4S, v20.8H, v24.8H // .................*..................................... + // smlal v30.4S, v20.4H, v16.4H // .............................*......................... + // smlal2 v15.4S, v20.8H, v16.8H // ............................*.......................... + // uzp1 v7.8H, v19.8H, v31.8H // ..................*.................................... + // uzp2 v20.8H, v19.8H, v31.8H // .......................*............................... + // uzp1 v16.8H, v17.8H, v18.8H // ................*...................................... + // uzp2 v9.8H, v17.8H, v18.8H // ...........................*........................... + // smlal v11.4S, v7.4H, v16.4H // ..............................*........................ + // smlal2 v8.4S, v7.8H, v16.8H // .....................*................................. + // smlal v30.4S, v7.4H, v9.4H // .................................*..................... + // smlal2 v15.4S, v7.8H, v9.8H // ................................*...................... + // smlal v11.4S, v20.4H, v25.4H // ..................................*.................... + // smlal2 v8.4S, v20.8H, v25.8H // ...............................*....................... + // smlal v30.4S, v20.4H, v16.4H // .....................................*................. + // smlal2 v15.4S, v20.8H, v16.8H // ....................................*.................. + // uzp1 v7.8H, v11.8H, v8.8H // ......................................*................ + // uzp1 v20.8H, v30.8H, v15.8H // .........................................*............. + // mul v7.8H, v7.8H, v2.8H // ........................................*.............. 
+ // mul v20.8H, v20.8H, v2.8H // ...........................................*........... + // zip2 v9.8H, v27.8H, v10.8H // ..........................................*............ + // zip1 v27.8H, v27.8H, v10.8H // ...................................*................... + // smlal v11.4S, v7.4H, v0.4H // ............................................*.......... + // smlal2 v8.4S, v7.8H, v0.8H // .............................................*......... + // smlal v30.4S, v20.4H, v0.4H // ................................................*...... + // smlal2 v15.4S, v20.8H, v0.8H // ...............................................*....... + // str q27, [x0], #32 // .......................................*............... + // uzp2 v27.8H, v11.8H, v8.8H // .................................................*..... + // str q9, [x0, #-16] // ..............................................*........ + // uzp2 v10.8H, v30.8H, v15.8H // ..................................................*.... + // zip2 v9.8H, v27.8H, v10.8H // ....................................................*.. + // zip1 v27.8H, v27.8H, v10.8H // ...................................................*... + // str q27, [x0], #32 // .....................................................*. 
+ // str q9, [x0, #-16] // ......................................................* pop_stack @@ -703,17 +1001,12 @@ k3_loop_start: #endif /* MLKEM_K == 3 */ #if MLKEM_K == 4 -.global MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt) -.global _MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt) +.global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt) -MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): -_MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): +MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): push_stack - - ASM_LOAD(xtmp, const_addr) - ld1 {consts.8h}, [xtmp] - ldrsh modulus, [xtmp] - dup constN.8h, modulus + ldr q_modulus, c_modulus + ldr q_modulus_twisted, c_modulus_twisted // Computed bases of vector entries @@ -727,394 +1020,561 @@ _MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_opt): add b3_ptr, b0_ptr, #(3 * 512) add b3_cache_ptr, b0_cache_ptr, #(3 * 512/2) + // Bounds: + + // Each pmull is bound by 2*4096*2^15=2^28, so the final value + // before Montgomery reduction is bound by 2^30. + mov count, #(MLKEM_N / 16) - // Instructions: 18 - // Expected cycles: 27 - // Expected IPC: 0.67 - // - // Cycle bound: 27.0 - // IPC bound: 0.67 - // - // Wall time: 0.03s - // User time: 0.03s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ld2 {v3.8H, v4.8H}, [x1], #32 // *............................. - ld2 {v13.8H, v14.8H}, [x2], #32 // ..*........................... - ld1 {v31.8H}, [x3], #16 // ....*......................... - smull v9.4S, v3.4H, v14.4H // ......*....................... - smull2 v6.4S, v3.8H, v14.8H // .......*...................... - smull2 v29.4S, v3.8H, v13.8H // ........*..................... - smull v11.4S, v3.4H, v13.4H // .........*.................... - smlal v9.4S, v4.4H, v13.4H // ..........*................... - smlal2 v6.4S, v4.8H, v13.8H // ...........*.................. 
- smlal2 v29.4S, v4.8H, v31.8H // ............*................. - smlal v11.4S, v4.4H, v31.4H // .............*................ - ld2 {v27.8H, v28.8H}, [x7], #32 // ..............*............... - ld2 {v18.8H, v19.8H}, [x10], #32 // ................*............. - ld2 {v20.8H, v21.8H}, [x8], #32 // ..................*........... - ld2 {v25.8H, v26.8H}, [x11], #32 // ....................*......... - ld2 {v16.8H, v17.8H}, [x5], #32 // ......................*....... - ld2 {v7.8H, v8.8H}, [x1], #32 // ........................*..... - ld1 {v3.8H}, [x3], #16 // ..........................*... - - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ld2 {v7.8H, v8.8H}, [x1], #32 // *.............................. - // ld1 {v3.8H}, [x3], #16 // ....*.......................... - // ld2 {v4.8H, v5.8H}, [x2], #32 // ..*............................ - // ld2 {v27.8H, v28.8H}, [x7], #32 // ..............*................ - // ld2 {v18.8H, v19.8H}, [x10], #32 // ................*.............. - // smull v9.4S, v7.4H, v5.4H // ......*........................ - // ld2 {v20.8H, v21.8H}, [x8], #32 // ..................*............ - // smlal v9.4S, v8.4H, v4.4H // ..........*.................... - // smull2 v29.4S, v7.8H, v4.8H // ........*...................... - // smull2 v6.4S, v7.8H, v5.8H // .......*....................... - // ld2 {v25.8H, v26.8H}, [x11], #32 // ....................*.......... - // smull v11.4S, v7.4H, v4.4H // .........*..................... - // smlal2 v6.4S, v8.8H, v4.8H // ...........*................... - // smlal2 v29.4S, v8.8H, v3.8H // ............*.................. - // ld2 {v16.8H, v17.8H}, [x5], #32 // ......................*........ - // smlal v11.4S, v8.4H, v3.4H // .............*................. - // ld2 {v7.8H, v8.8H}, [x1], #32 // ........................*...... - // ld1 {v3.8H}, [x3], #16 // ..........................*.... 
+ // Instructions: 114 + // Expected cycles: 153 + // Expected IPC: 0.75 + // + // Cycle bound: 153.0 + // IPC bound: 0.75 + // + // Wall time: 0.69s + // User time: 0.69s + // + // ----------------------------------------------- original position -----------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------------- + ldr q23, [x2, #16] // .*................................................................................................................ + ldr q19, [x2], #32 // *................................................................................................................. + ldr q17, [x5], #32 // ..*............................................................................................................... + uzp2 v13.8H, v19.8H, v23.8H // ..........*....................................................................................................... + uzp1 v19.8H, v19.8H, v23.8H // ...........*...................................................................................................... + ldr q23, [x5, #-16] // ...*.............................................................................................................. + ldr q30, [x1, #16] // .....*............................................................................................................ + uzp2 v9.8H, v17.8H, v23.8H // ....*............................................................................................................. + uzp1 v23.8H, v17.8H, v23.8H // .......*.......................................................................................................... + ldr q17, [x1], #32 // ......*........................................................................................................... 
+ ldr q10, [x7, #16] // .............*.................................................................................................... + uzp1 v12.8H, v17.8H, v30.8H // ........*......................................................................................................... + uzp2 v17.8H, v17.8H, v30.8H // .........*........................................................................................................ + smull2 v30.4S, v12.8H, v13.8H // ............*..................................................................................................... + smull v13.4S, v12.4H, v13.4H // ............................................*..................................................................... + smull2 v22.4S, v12.8H, v19.8H // .....................................*............................................................................ + smull v12.4S, v12.4H, v19.4H // ..........................................*....................................................................... + smlal2 v30.4S, v17.8H, v19.8H // ...............................*.................................................................................. + smlal v13.4S, v17.4H, v19.4H // ...............................................*.................................................................. + ldr q19, [x4], #32 // ....................*............................................................................................. + ldr q16, [x4, #-16] // .....................*............................................................................................ + ld1 {v8.8H}, [x3], #16 // ................................*................................................................................. + uzp1 v26.8H, v19.8H, v16.8H // .......................*.......................................................................................... 
+ uzp2 v19.8H, v19.8H, v16.8H // ........................*......................................................................................... + smlal2 v30.4S, v26.8H, v9.8H // .................................*................................................................................ + smlal v13.4S, v26.4H, v9.4H // ..................................................*............................................................... + smlal2 v22.4S, v17.8H, v8.8H // ........................................*......................................................................... + smlal v12.4S, v17.4H, v8.4H // .................................................*................................................................ + smlal2 v30.4S, v19.8H, v23.8H // ...................................*.............................................................................. + smlal v13.4S, v19.4H, v23.4H // .......................................................*.......................................................... + smlal2 v22.4S, v26.8H, v23.8H // ...........................................*...................................................................... + smlal v12.4S, v26.4H, v23.4H // .....................................................*............................................................ + ldr q23, [x7], #32 // ......................*........................................................................................... + ldr q17, [x8, #16] // ..............*................................................................................................... + uzp1 v9.8H, v23.8H, v10.8H // ..........................*....................................................................................... + uzp2 v23.8H, v23.8H, v10.8H // ....................................*............................................................................. 
+ ldr q10, [x10], #32 // ...............*.................................................................................................. + ldr q16, [x10, #-16] // ................*................................................................................................. + ld1 {v8.8H}, [x12], #16 // .................*................................................................................................ + uzp1 v26.8H, v10.8H, v16.8H // ..................*............................................................................................... + uzp2 v10.8H, v10.8H, v16.8H // ...................*.............................................................................................. + ld1 {v16.8H}, [x6], #16 // .........................*........................................................................................ + ldr q3, [x11, #16] // ...........................*...................................................................................... + smlal2 v22.4S, v19.8H, v16.8H // ..............................................*................................................................... + smlal v12.4S, v19.4H, v16.4H // ........................................................*......................................................... + ldr q19, [x11], #32 // ............................*..................................................................................... + ld1 {v16.8H}, [x9], #16 // .............................*.................................................................................... + uzp1 v4.8H, v19.8H, v3.8H // ..................................*............................................................................... + uzp2 v19.8H, v19.8H, v3.8H // .......................................*.......................................................................... 
+ ldr q3, [x8], #32 // ..............................*................................................................................... + ldr q31, [x2], #32 // ......................................*........................................................................... + uzp1 v6.8H, v3.8H, v17.8H // ...................................................*.............................................................. + uzp2 v17.8H, v3.8H, v17.8H // .........................................................*........................................................ + smlal2 v22.4S, v9.8H, v6.8H // ..........................................................*....................................................... + smlal2 v30.4S, v9.8H, v17.8H // ...........................................................*...................................................... + smlal v13.4S, v9.4H, v17.4H // ............................................................*..................................................... + smlal v12.4S, v9.4H, v6.4H // .............................................................*.................................................... + smlal2 v22.4S, v23.8H, v16.8H // ..............................................................*................................................... + smlal2 v30.4S, v23.8H, v6.8H // ...............................................................*.................................................. + smlal v13.4S, v23.4H, v6.4H // ................................................................*................................................. + smlal v12.4S, v23.4H, v16.4H // .................................................................*................................................ + smlal2 v22.4S, v26.8H, v4.8H // ..................................................................*............................................... 
+ smlal2 v30.4S, v26.8H, v19.8H // ...................................................................*.............................................. + smlal v13.4S, v26.4H, v19.4H // ....................................................................*............................................. + smlal v12.4S, v26.4H, v4.4H // .....................................................................*............................................ + smlal2 v22.4S, v10.8H, v8.8H // ......................................................................*........................................... + smlal2 v30.4S, v10.8H, v4.8H // .......................................................................*.......................................... + smlal v13.4S, v10.4H, v4.4H // ........................................................................*......................................... + smlal v12.4S, v10.4H, v8.4H // .........................................................................*........................................ + ldr q19, [x2, #-16] // .........................................*........................................................................ + uzp1 v23.8H, v13.8H, v30.8H // ...........................................................................*...................................... + uzp1 v17.8H, v12.8H, v22.8H // ....................................................................................*............................. + mul v23.8H, v23.8H, v2.8H // .............................................................................*.................................... + uzp2 v21.8H, v31.8H, v19.8H // ................................................................................*................................. + uzp1 v19.8H, v31.8H, v19.8H // ...................................................................................*.............................. 
+ mul v17.8H, v17.8H, v2.8H // .....................................................................................*............................ + smlal v13.4S, v23.4H, v0.4H // .................................................................................*................................ + smlal2 v30.4S, v23.8H, v0.8H // ..................................................................................*............................... + ldr q23, [x5], #32 // .............................................*.................................................................... + smlal2 v22.4S, v17.8H, v0.8H // ...........................................................................................................*...... + uzp2 v15.8H, v13.8H, v30.8H // ......................................................................................*........................... + smlal v12.4S, v17.4H, v0.4H // ............................................................................................................*..... + ldr q17, [x5, #-16] // ................................................*................................................................. + ldr q13, [x1, #16] // ......................................................*........................................................... + uzp2 v27.8H, v23.8H, v17.8H // ....................................................*............................................................. + uzp1 v28.8H, v23.8H, v17.8H // ............................................................................*..................................... + uzp2 v7.8H, v12.8H, v22.8H // ...............................................................................................................*.. + ldr q23, [x1], #32 // ..........................................................................*....................................... 
+ zip1 v5.8H, v7.8H, v15.8H // .................................................................................................................* + ldr q3, [x7, #16] // ........................................................................................*......................... + uzp1 v31.8H, v23.8H, v13.8H // ..............................................................................*................................... + uzp2 v16.8H, v23.8H, v13.8H // ...............................................................................*.................................. + smull2 v24.4S, v31.8H, v21.8H // .......................................................................................*.......................... + ldr q6, [x8, #16] // .........................................................................................*........................ + ldr q23, [x10], #32 // ..........................................................................................*....................... + smlal2 v24.4S, v16.8H, v19.8H // ..........................................................................................................*....... + ldr q17, [x10, #-16] // ...........................................................................................*...................... + ld1 {v22.8H}, [x12], #16 // ............................................................................................*..................... + uzp1 v30.8H, v23.8H, v17.8H // .............................................................................................*.................... + uzp2 v11.8H, v23.8H, v17.8H // ..............................................................................................*................... + ldr q23, [x4], #32 // ...............................................................................................*.................. 
+ ldr q17, [x4, #-16] // ................................................................................................*................. + ldr q4, [x7], #32 // .................................................................................................*................ + uzp1 v20.8H, v23.8H, v17.8H // ..................................................................................................*............... + uzp2 v26.8H, v23.8H, v17.8H // ...................................................................................................*.............. + uzp1 v9.8H, v4.8H, v3.8H // .....................................................................................................*............ + smlal2 v24.4S, v20.8H, v27.8H // ..............................................................................................................*... + ld1 {v8.8H}, [x6], #16 // ....................................................................................................*............. + ldr q25, [x11, #16] // ......................................................................................................*........... + ldr q29, [x11], #32 // .......................................................................................................*.......... + ld1 {v12.8H}, [x9], #16 // ........................................................................................................*......... + uzp1 v10.8H, v29.8H, v25.8H // ................................................................................................................*. + ldr q14, [x8], #32 // .........................................................................................................*........ + ld1 {v23.8H}, [x3], #16 // .............................................................................................................*.... 
+ + // ------------------------------------------------- new position --------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------------- + // ldr q3, [x2], #32 // .*................................................................................................................ + // ldr q17, [x2, #-16] // *................................................................................................................. + // ldr q21, [x5], #32 // ..*............................................................................................................... + // ldr q19, [x5, #-16] // .....*............................................................................................................ + // uzp2 v27.8H, v21.8H, v19.8H // .......*.......................................................................................................... + // ldr q25, [x1, #16] // ......*........................................................................................................... + // ldr q22, [x1], #32 // .........*........................................................................................................ + // uzp1 v28.8H, v21.8H, v19.8H // ........*......................................................................................................... + // uzp1 v31.8H, v22.8H, v25.8H // ...........*...................................................................................................... + // uzp2 v16.8H, v22.8H, v25.8H // ............*..................................................................................................... + // uzp2 v21.8H, v3.8H, v17.8H // ...*.............................................................................................................. 
+ // uzp1 v19.8H, v3.8H, v17.8H // ....*............................................................................................................. + // smull2 v24.4S, v31.8H, v21.8H // .............*.................................................................................................... + // ldr q3, [x7, #16] // ..........*....................................................................................................... + // ldr q6, [x8, #16] // .................................*................................................................................ + // ldr q8, [x10], #32 // ....................................*............................................................................. + // ldr q26, [x10, #-16] // .....................................*............................................................................ + // ld1 {v22.8H}, [x12], #16 // ......................................*........................................................................... + // uzp1 v30.8H, v8.8H, v26.8H // .......................................*.......................................................................... + // uzp2 v11.8H, v8.8H, v26.8H // ........................................*......................................................................... + // ldr q8, [x4], #32 // ...................*.............................................................................................. + // ldr q26, [x4, #-16] // ....................*............................................................................................. + // ldr q4, [x7], #32 // ................................*................................................................................. + // uzp1 v20.8H, v8.8H, v26.8H // ......................*........................................................................................... 
+ // uzp2 v26.8H, v8.8H, v26.8H // .......................*.......................................................................................... + // ld1 {v8.8H}, [x6], #16 // .........................................*........................................................................ + // uzp1 v9.8H, v4.8H, v3.8H // ..................................*............................................................................... + // ldr q25, [x11, #16] // ..........................................*....................................................................... + // ldr q29, [x11], #32 // .............................................*.................................................................... + // ld1 {v12.8H}, [x9], #16 // ..............................................*................................................................... + // ldr q14, [x8], #32 // .................................................*................................................................ + // smlal2 v24.4S, v16.8H, v19.8H // .................*................................................................................................ + // ld1 {v23.8H}, [x3], #16 // .....................*............................................................................................ + // smlal2 v24.4S, v20.8H, v27.8H // ........................*......................................................................................... + // uzp1 v10.8H, v29.8H, v25.8H // ...............................................*.................................................................. + // smlal2 v24.4S, v26.8H, v28.8H // ............................*..................................................................................... + // uzp2 v4.8H, v4.8H, v3.8H // ...................................*.............................................................................. 
+ // smull2 v13.4S, v31.8H, v19.8H // ...............*.................................................................................................. + // ldr q3, [x2], #32 // ..................................................*............................................................... + // uzp2 v1.8H, v29.8H, v25.8H // ................................................*................................................................. + // smlal2 v13.4S, v16.8H, v23.8H // ..........................*....................................................................................... + // ldr q17, [x2, #-16] // .....................................................................*............................................ + // smull v18.4S, v31.4H, v19.4H // ................*................................................................................................. + // smlal2 v13.4S, v20.8H, v28.8H // ..............................*................................................................................... + // smull v29.4S, v31.4H, v21.4H // ..............*................................................................................................... + // ldr q21, [x5], #32 // ..............................................................................*................................... + // smlal2 v13.4S, v26.8H, v8.8H // ...........................................*...................................................................... + // smlal v29.4S, v16.4H, v19.4H // ..................*............................................................................................... + // ldr q19, [x5, #-16] // ..................................................................................*............................... + // smlal v18.4S, v16.4H, v23.4H // ...........................*...................................................................................... 
+ // smlal v29.4S, v20.4H, v27.4H // .........................*........................................................................................ + // uzp1 v31.8H, v14.8H, v6.8H // ...................................................*.............................................................. + // uzp2 v27.8H, v21.8H, v19.8H // ....................................................................................*............................. + // smlal v18.4S, v20.4H, v28.4H // ...............................*.................................................................................. + // ldr q25, [x1, #16] // ...................................................................................*.............................. + // smlal v29.4S, v26.4H, v28.4H // .............................*.................................................................................... + // smlal v18.4S, v26.4H, v8.4H // ............................................*..................................................................... + // uzp2 v26.8H, v14.8H, v6.8H // ....................................................*............................................................. + // smlal2 v13.4S, v9.8H, v31.8H // .....................................................*............................................................ + // smlal2 v24.4S, v9.8H, v26.8H // ......................................................*........................................................... + // smlal v29.4S, v9.4H, v26.4H // .......................................................*.......................................................... + // smlal v18.4S, v9.4H, v31.4H // ........................................................*......................................................... + // smlal2 v13.4S, v4.8H, v12.8H // .........................................................*........................................................ 
+ // smlal2 v24.4S, v4.8H, v31.8H // ..........................................................*....................................................... + // smlal v29.4S, v4.4H, v31.4H // ...........................................................*...................................................... + // smlal v18.4S, v4.4H, v12.4H // ............................................................*..................................................... + // smlal2 v13.4S, v30.8H, v10.8H // .............................................................*.................................................... + // smlal2 v24.4S, v30.8H, v1.8H // ..............................................................*................................................... + // smlal v29.4S, v30.4H, v1.4H // ...............................................................*.................................................. + // smlal v18.4S, v30.4H, v10.4H // ................................................................*................................................. + // smlal2 v13.4S, v11.8H, v22.8H // .................................................................*................................................ + // smlal2 v24.4S, v11.8H, v10.8H // ..................................................................*............................................... + // smlal v29.4S, v11.4H, v10.4H // ...................................................................*.............................................. + // smlal v18.4S, v11.4H, v22.4H // ....................................................................*............................................. + // ldr q22, [x1], #32 // .......................................................................................*.......................... + // uzp1 v31.8H, v29.8H, v24.8H // ......................................................................*........................................... 
+ // uzp1 v28.8H, v21.8H, v19.8H // .....................................................................................*............................ + // mul v19.8H, v31.8H, v2.8H // ........................................................................*......................................... + // uzp1 v31.8H, v22.8H, v25.8H // ..........................................................................................*....................... + // uzp2 v16.8H, v22.8H, v25.8H // ...........................................................................................*...................... + // uzp2 v21.8H, v3.8H, v17.8H // .........................................................................*........................................ + // smlal v29.4S, v19.4H, v0.4H // ............................................................................*..................................... + // smlal2 v24.4S, v19.8H, v0.8H // .............................................................................*.................................... + // uzp1 v19.8H, v3.8H, v17.8H // ..........................................................................*....................................... + // uzp1 v26.8H, v18.8H, v13.8H // .......................................................................*.......................................... + // mul v23.8H, v26.8H, v2.8H // ...........................................................................*...................................... + // uzp2 v15.8H, v29.8H, v24.8H // ................................................................................*................................. + // smull2 v24.4S, v31.8H, v21.8H // ............................................................................................*..................... + // ldr q3, [x7, #16] // .........................................................................................*........................ 
+ // ldr q6, [x8, #16] // .............................................................................................*.................... + // ldr q8, [x10], #32 // ..............................................................................................*................... + // ldr q26, [x10, #-16] // ................................................................................................*................. + // ld1 {v22.8H}, [x12], #16 // .................................................................................................*................ + // uzp1 v30.8H, v8.8H, v26.8H // ..................................................................................................*............... + // uzp2 v11.8H, v8.8H, v26.8H // ...................................................................................................*.............. + // ldr q8, [x4], #32 // ....................................................................................................*............. + // ldr q26, [x4, #-16] // .....................................................................................................*............ + // ldr q4, [x7], #32 // ......................................................................................................*........... + // uzp1 v20.8H, v8.8H, v26.8H // .......................................................................................................*.......... + // uzp2 v26.8H, v8.8H, v26.8H // ........................................................................................................*......... + // ld1 {v8.8H}, [x6], #16 // ...........................................................................................................*...... + // uzp1 v9.8H, v4.8H, v3.8H // .........................................................................................................*........ 
+ // ldr q25, [x11, #16] // ............................................................................................................*..... + // ldr q29, [x11], #32 // .............................................................................................................*.... + // ld1 {v12.8H}, [x9], #16 // ..............................................................................................................*... + // ldr q14, [x8], #32 // ................................................................................................................*. + // smlal2 v24.4S, v16.8H, v19.8H // ...............................................................................................*.................. + // smlal2 v13.4S, v23.8H, v0.8H // ...............................................................................*.................................. + // smlal v18.4S, v23.4H, v0.4H // .................................................................................*................................ + // ld1 {v23.8H}, [x3], #16 // .................................................................................................................* + // smlal2 v24.4S, v20.8H, v27.8H // ..........................................................................................................*....... + // uzp2 v7.8H, v18.8H, v13.8H // ......................................................................................*........................... + // uzp1 v10.8H, v29.8H, v25.8H // ...............................................................................................................*.. + // zip1 v5.8H, v7.8H, v15.8H // ........................................................................................*......................... 
sub count, count, #2 -k4_loop_start: - // Instructions: 55 - // Expected cycles: 68 - // Expected IPC: 0.81 - // - // Cycle bound: 68.0 - // IPC bound: 0.81 - // - // Wall time: 13.63s - // User time: 13.63s - // - // ------------------------ cycle (expected) -------------------------> - // 0 25 50 - // |------------------------|------------------------|----------------- - ld2 {v13.8H, v14.8H}, [x4], #32 // l................................................................... - ld1 {v15.8H}, [x6], #16 // ..l................................................................. - smlal2 v29.4S, v13.8H, v16.8H // ....l............................................................... - smlal v9.4S, v13.4H, v17.4H // .....l.............................................................. - smlal2 v6.4S, v13.8H, v17.8H // ......l............................................................. - smlal v11.4S, v13.4H, v16.4H // .......l............................................................ - smlal2 v29.4S, v14.8H, v15.8H // ........l........................................................... - smlal v9.4S, v14.4H, v16.4H // .........l.......................................................... - smlal2 v6.4S, v14.8H, v16.8H // ..........l......................................................... - ld1 {v30.8H}, [x9], #16 // ...........l........................................................ - smlal v9.4S, v27.4H, v21.4H // .............l...................................................... - smlal2 v6.4S, v27.8H, v21.8H // ..............l..................................................... - smlal v11.4S, v14.4H, v15.4H // ...............l.................................................... - smlal2 v29.4S, v27.8H, v20.8H // ................l................................................... - smlal v9.4S, v28.4H, v20.4H // .................l.................................................. 
- smlal2 v6.4S, v28.8H, v20.8H // ..................l................................................. - smlal v11.4S, v27.4H, v20.4H // ...................l................................................ - smlal2 v29.4S, v28.8H, v30.8H // ....................l............................................... - smlal v9.4S, v18.4H, v26.4H // .....................l.............................................. - smlal2 v6.4S, v18.8H, v26.8H // ......................l............................................. - smlal v11.4S, v28.4H, v30.4H // .......................l............................................ - smlal2 v29.4S, v18.8H, v25.8H // ........................l........................................... - smlal v9.4S, v19.4H, v25.4H // .........................l.......................................... - smlal2 v6.4S, v19.8H, v25.8H // ..........................l......................................... - ld1 {v23.8H}, [x12], #16 // ...........................l........................................ - smlal v11.4S, v18.4H, v25.4H // .............................l...................................... - uzp1 v14.8H, v9.8H, v6.8H // ..............................l..................................... - smlal2 v29.4S, v19.8H, v23.8H // ...............................l.................................... - mul v27.8H, v14.8H, v2.H[2] // ................................l................................... - smlal v11.4S, v19.4H, v23.4H // .................................l.................................. - ld2 {v4.8H, v5.8H}, [x2], #32 // ..................................*................................. - smlal v9.4S, v27.4H, v0.4H // ....................................l............................... - smlal2 v6.4S, v27.8H, v0.8H // .....................................l.............................. - ld2 {v27.8H, v28.8H}, [x7], #32 // ......................................*............................. 
- uzp1 v13.8H, v11.8H, v29.8H // ........................................l........................... - uzp2 v15.8H, v9.8H, v6.8H // .........................................l.......................... - mul v13.8H, v13.8H, v2.H[2] // ..........................................l......................... - ld2 {v18.8H, v19.8H}, [x10], #32 // ...........................................*........................ - smull v9.4S, v7.4H, v5.4H // .............................................*...................... - smlal v11.4S, v13.4H, v0.4H // ..............................................l..................... - smlal2 v29.4S, v13.8H, v0.8H // ...............................................l.................... - ld2 {v20.8H, v21.8H}, [x8], #32 // ................................................*................... - smlal v9.4S, v8.4H, v4.4H // ..................................................*................. - uzp2 v14.8H, v11.8H, v29.8H // ...................................................l................ - smull2 v29.4S, v7.8H, v4.8H // ....................................................*............... - smull2 v6.4S, v7.8H, v5.8H // .....................................................*.............. - ld2 {v25.8H, v26.8H}, [x11], #32 // ......................................................*............. - smull v11.4S, v7.4H, v4.4H // ........................................................*........... - smlal2 v6.4S, v8.8H, v4.8H // .........................................................*.......... - smlal2 v29.4S, v8.8H, v3.8H // ..........................................................*......... - ld2 {v16.8H, v17.8H}, [x5], #32 // ...........................................................*........ - smlal v11.4S, v8.4H, v3.4H // .............................................................*...... - ld2 {v7.8H, v8.8H}, [x1], #32 // ..............................................................e..... 
- ld1 {v3.8H}, [x3], #16 // ................................................................e... - st2 {v14.8H, v15.8H}, [x0], #32 // ..................................................................l. - - // ------------------------------------------------------------- cycle (expected) -------------------------------------------------------------> - // 0 25 50 75 100 125 - // |------------------------|------------------------|------------------------|------------------------|------------------------|--------------- - // ld2 {v3.8h, v4.8h}, [x1], #32 // e.....'.............................................................~.....'.............................................................~.... - // ld2 {v5.8h, v6.8h}, [x2], #32 // ......'.................................*.................................'.................................~................................ - // ld1 {v7.8h}, [x3], #16 // ..e...'...............................................................~...'...............................................................~.. - // smull v8.4s, v3.4h, v5.4h // ......'.......................................................*...........'.......................................................~.......... - // smull2 v10.4s, v3.8h, v5.8h // ......'...................................................*...............'...................................................~.............. - // smlal v8.4s, v4.4h, v7.4h // ......'............................................................*......'............................................................~..... - // smlal2 v10.4s, v4.8h, v7.8h // ......'.........................................................*.........'.........................................................~........ - // smull v9.4s, v3.4h, v6.4h // ......'............................................*......................'............................................~..................... 
- // smull2 v11.4s, v3.8h, v6.8h // ......'....................................................*..............'....................................................~............. - // smlal v9.4s, v4.4h, v5.4h // ......'.................................................*.................'.................................................~................ - // smlal2 v11.4s, v4.8h, v5.8h // ......'........................................................*..........'........................................................~......... - // ld2 {v3.8h, v4.8h}, [x4], #32 // ......~...................................................................l.................................................................. - // ld2 {v5.8h, v6.8h}, [x5], #32 // ......'..........................................................*........'..........................................................~....... - // ld1 {v7.8h}, [x6], #16 // ......'.~.................................................................'.l................................................................ - // smlal v8.4s, v3.4h, v5.4h // ......'......~............................................................'......l........................................................... - // smlal2 v10.4s, v3.8h, v5.8h // ......'...~...............................................................'...l.............................................................. - // smlal v8.4s, v4.4h, v7.4h // ......'..............~....................................................'..............l................................................... - // smlal2 v10.4s, v4.8h, v7.8h // ......'.......~...........................................................'.......l.......................................................... - // smlal v9.4s, v3.4h, v6.4h // ......'....~..............................................................'....l............................................................. 
- // smlal2 v11.4s, v3.8h, v6.8h // ......'.....~.............................................................'.....l............................................................ - // smlal v9.4s, v4.4h, v5.4h // ......'........~..........................................................'........l......................................................... - // smlal2 v11.4s, v4.8h, v5.8h // ......'.........~.........................................................'.........l........................................................ - // ld2 {v3.8h, v4.8h}, [x7], #32 // ......'.....................................*.............................'.....................................~............................ - // ld2 {v5.8h, v6.8h}, [x8], #32 // ......'...............................................*...................'...............................................~.................. - // ld1 {v7.8h}, [x9], #16 // ......'..........~........................................................'..........l....................................................... - // smlal v8.4s, v3.4h, v5.4h // ......'..................~................................................'..................l............................................... - // smlal2 v10.4s, v3.8h, v5.8h // ......'...............~...................................................'...............l.................................................. - // smlal v8.4s, v4.4h, v7.4h // ......'......................~............................................'......................l........................................... - // smlal2 v10.4s, v4.8h, v7.8h // ......'...................~...............................................'...................l.............................................. - // smlal v9.4s, v3.4h, v6.4h // ......'............~......................................................'............l..................................................... 
- // smlal2 v11.4s, v3.8h, v6.8h // ......'.............~.....................................................'.............l.................................................... - // smlal v9.4s, v4.4h, v5.4h // ......'................~..................................................'................l................................................. - // smlal2 v11.4s, v4.8h, v5.8h // ......'.................~.................................................'.................l................................................ - // ld2 {v3.8h, v4.8h}, [x10], #32 // ......'..........................................*........................'..........................................~....................... - // ld2 {v5.8h, v6.8h}, [x11], #32 // ......'.....................................................*.............'.....................................................~............ - // ld1 {v7.8h}, [x12], #16 // ......'..........................~........................................'..........................l....................................... - // smlal v8.4s, v3.4h, v5.4h // ......'............................~......................................'............................l..................................... - // smlal2 v10.4s, v3.8h, v5.8h // ......'.......................~...........................................'.......................l.......................................... - // smlal v8.4s, v4.4h, v7.4h // ......'................................~..................................'................................l................................. - // smlal2 v10.4s, v4.8h, v7.8h // ......'..............................~....................................'..............................l................................... - // smlal v9.4s, v3.4h, v6.4h // ......'....................~..............................................'....................l............................................. 
- // smlal2 v11.4s, v3.8h, v6.8h // ......'.....................~.............................................'.....................l............................................ - // smlal v9.4s, v4.4h, v5.4h // ......'........................~..........................................'........................l......................................... - // smlal2 v11.4s, v4.8h, v5.8h // ......'.........................~.........................................'.........................l........................................ - // uzp1 v28.8h, v8.8h, v10.8h // ......'.......................................~...........................'.......................................l.......................... - // mul v28.8h, v28.8h, v2.h[2] // ......'.........................................~.........................'.........................................l........................ - // smlal v8.4s, v28.4h, v0.4h // ......'.............................................~.....................'.............................................l.................... - // smlal2 v10.4s, v28.8h, v0.8h // ......'..............................................~....................'..............................................l................... - // uzp2 v26.8h, v8.8h, v10.8h // ......'..................................................~................'..................................................l............... - // uzp1 v28.8h, v9.8h, v11.8h // ......'.............................~.....................................'.............................l.................................... - // mul v28.8h, v28.8h, v2.h[2] // ......'...............................~...................................'...............................l.................................. - // smlal v9.4s, v28.4h, v0.4h // ......'...................................~...............................'...................................l.............................. 
- // smlal2 v11.4s, v28.8h, v0.8h // ......'....................................~..............................'....................................l............................. - // uzp2 v27.8h, v9.8h, v11.8h // ......'........................................~..........................'........................................l......................... - // st2 {v26.8h, v27.8h}, [x0], #32 // ....~.'.................................................................~.'.................................................................l +1: + // Instructions: 82 + // Expected cycles: 102 + // Expected IPC: 0.80 + // + // Cycle bound: 102.0 + // IPC bound: 0.80 + // + // Wall time: 15.93s + // User time: 15.93s + // + // ------------------------------- original position -------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|------ + smlal2 v24.4S, v26.8H, v28.8H // .................................*................................................ + uzp2 v4.8H, v4.8H, v3.8H // .....................................*............................................ + smull2 v13.4S, v31.8H, v19.8H // ..........*....................................................................... + ldr q3, [x2], #32 // ....e............................................................................. + uzp2 v1.8H, v29.8H, v25.8H // ..........................................................*....................... + smlal2 v13.4S, v16.8H, v23.8H // ............*..................................................................... + ldr q17, [x2, #-16] // .....e............................................................................ + smull v18.4S, v31.4H, v19.4H // .........*........................................................................ + smlal2 v13.4S, v20.8H, v28.8H // ...........................*...................................................... 
+ smull v29.4S, v31.4H, v21.4H // .............*.................................................................... + ldr q21, [x5], #32 // .....................e............................................................ + smlal2 v13.4S, v26.8H, v8.8H // .............................*.................................................... + smlal v29.4S, v16.4H, v19.4H // ...............*.................................................................. + ldr q19, [x5, #-16] // ......................e........................................................... + smlal v18.4S, v16.4H, v23.4H // ...........*...................................................................... + smlal v29.4S, v20.4H, v27.4H // ..............................*................................................... + uzp1 v31.8H, v14.8H, v6.8H // ........................................*......................................... + uzp2 v27.8H, v21.8H, v19.8H // ........................e......................................................... + smlal v18.4S, v20.4H, v28.4H // ..........................*....................................................... + ldr q25, [x1, #16] // .e................................................................................ + smlal v29.4S, v26.4H, v28.4H // ................................*................................................. + smlal v18.4S, v26.4H, v8.4H // ............................*..................................................... + uzp2 v26.8H, v14.8H, v6.8H // .........................................*........................................ + smlal2 v13.4S, v9.8H, v31.8H // ............................................*..................................... + smlal2 v24.4S, v9.8H, v26.8H // ................................................*................................. + smlal v29.4S, v9.4H, v26.4H // ...............................................*.................................. 
+ smlal v18.4S, v9.4H, v31.4H // ...........................................*...................................... + smlal2 v13.4S, v4.8H, v12.8H // ..............................................*................................... + smlal2 v24.4S, v4.8H, v31.8H // ..................................................*............................... + smlal v29.4S, v4.4H, v31.4H // .................................................*................................ + smlal v18.4S, v4.4H, v12.4H // .............................................*.................................... + smlal2 v13.4S, v30.8H, v10.8H // .............................................................*.................... + smlal2 v24.4S, v30.8H, v1.8H // .................................................................*................ + smlal v29.4S, v30.4H, v1.4H // ................................................................*................. + smlal v18.4S, v30.4H, v10.4H // ............................................................*..................... + smlal2 v13.4S, v11.8H, v22.8H // ...............................................................*.................. + smlal2 v24.4S, v11.8H, v10.8H // ...................................................................*.............. + smlal v29.4S, v11.4H, v10.4H // ..................................................................*............... + smlal v18.4S, v11.4H, v22.4H // ..............................................................*................... + ldr q22, [x1], #32 // e................................................................................. + uzp1 v31.8H, v29.8H, v24.8H // .........................................................................*........ + uzp1 v28.8H, v21.8H, v19.8H // .......................e.......................................................... + mul v19.8H, v31.8H, v2.8H // ..........................................................................*....... 
+ uzp1 v31.8H, v22.8H, v25.8H // ..e............................................................................... + uzp2 v16.8H, v22.8H, v25.8H // ...e.............................................................................. + uzp2 v21.8H, v3.8H, v17.8H // .......e.......................................................................... + smlal v29.4S, v19.4H, v0.4H // ...........................................................................*...... + smlal2 v24.4S, v19.8H, v0.8H // ............................................................................*..... + uzp1 v19.8H, v3.8H, v17.8H // ......e........................................................................... + uzp1 v26.8H, v18.8H, v13.8H // ....................................................................*............. + zip2 v14.8H, v7.8H, v15.8H // ...............................................................................l.. + mul v23.8H, v26.8H, v2.8H // .....................................................................*............ + uzp2 v15.8H, v29.8H, v24.8H // .............................................................................*.... + smull2 v24.4S, v31.8H, v21.8H // ..............e................................................................... + str q14, [x0, #16] // .................................................................................l + ldr q3, [x7, #16] // ...................................e.............................................. + ldr q6, [x8, #16] // .......................................e.......................................... + ldr q8, [x10], #32 // ...................................................e.............................. + ldr q26, [x10, #-16] // ....................................................e............................. + ld1 {v22.8H}, [x12], #16 // ...........................................................e...................... 
+ uzp1 v30.8H, v8.8H, v26.8H // .....................................................e............................ + uzp2 v11.8H, v8.8H, v26.8H // ......................................................e........................... + ldr q8, [x4], #32 // .................e................................................................ + ldr q26, [x4, #-16] // ..................e............................................................... + ldr q4, [x7], #32 // ..................................e............................................... + uzp1 v20.8H, v8.8H, v26.8H // ...................e.............................................................. + uzp2 v26.8H, v8.8H, v26.8H // ....................e............................................................. + ld1 {v8.8H}, [x6], #16 // .........................e........................................................ + uzp1 v9.8H, v4.8H, v3.8H // ....................................e............................................. + ldr q25, [x11, #16] // ........................................................e......................... + ldr q29, [x11], #32 // .......................................................e.......................... + ld1 {v12.8H}, [x9], #16 // ..........................................e....................................... + ldr q14, [x8], #32 // ......................................e........................................... + smlal2 v24.4S, v16.8H, v19.8H // ................e................................................................. + smlal2 v13.4S, v23.8H, v0.8H // .......................................................................*.......... + smlal v18.4S, v23.4H, v0.4H // ......................................................................*........... + ld1 {v23.8H}, [x3], #16 // ........e......................................................................... 
+ smlal2 v24.4S, v20.8H, v27.8H // ...............................e.................................................. + uzp2 v7.8H, v18.8H, v13.8H // ........................................................................*......... + uzp1 v10.8H, v29.8H, v25.8H // .........................................................e........................ + str q5, [x0], #32 // ................................................................................l. + zip1 v5.8H, v7.8H, v15.8H // ..............................................................................*... + + // ----------------------------------------------------------------------------------------------------------------- new position ------------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---------------- + // ldr q12, [x1], #32 // ....................................e..........................................'......................................~..........................................'......................................~......................................... + // ldr q13, [x1, #-16] // ................e..............................................................'..................~..............................................................'..................~............................................................. + // uzp1 v3.8h, v12.8h, v13.8h // ........................................e......................................'..........................................~......................................'..........................................~..................................... 
+ // uzp2 v4.8h, v12.8h, v13.8h // .........................................e.....................................'...........................................~.....................................'...........................................~.................................... + // ldr q12, [x2], #32 // e..............................................................................'..~..............................................................................'..~............................................................................. + // ldr q13, [x2, #-16] // ...e...........................................................................'.....~...........................................................................'.....~.......................................................................... + // uzp1 v5.8h, v12.8h, v13.8h // .............................................e.................................'...............................................~.................................'...............................................~................................ + // uzp2 v6.8h, v12.8h, v13.8h // ..........................................e....................................'............................................~....................................'............................................~................................... + // ld1 {v7.8h}, [x3], #16 // .........................................................................e.....'...........................................................................~.....'...........................................................................~.... + // smull v8.4s, v3.4h, v5.4h // ....~..........................................................................'......*..........................................................................'......~......................................................................... 
+ // smull2 v10.4s, v3.8h, v5.8h // ...............................................................................'.*...............................................................................'.~.............................................................................. + // smlal v8.4s, v4.4h, v7.4h // ...........~...................................................................'.............*...................................................................'.............~.................................................................. + // smlal2 v10.4s, v4.8h, v7.8h // ..~............................................................................'....*............................................................................'....~........................................................................... + // smull v9.4s, v3.4h, v6.4h // ......~........................................................................'........*........................................................................'........~....................................................................... + // smull2 v11.4s, v3.8h, v6.8h // ..................................................e............................'....................................................~............................'....................................................~........................... + // smlal v9.4s, v4.4h, v5.4h // .........~.....................................................................'...........*.....................................................................'...........~.................................................................... + // smlal2 v11.4s, v4.8h, v5.8h // ......................................................................e........'........................................................................~........'........................................................................~....... 
+ // ldr q12, [x4], #32 // ...........................................................e...................'.............................................................~...................'.............................................................~.................. + // ldr q13, [x4, #-16] // ............................................................e..................'..............................................................~..................'..............................................................~................. + // uzp1 v3.8h, v12.8h, v13.8h // ..............................................................e................'................................................................~................'................................................................~............... + // uzp2 v4.8h, v12.8h, v13.8h // ...............................................................e...............'.................................................................~...............'.................................................................~.............. + // ldr q12, [x5], #32 // .......e.......................................................................'.........~.......................................................................'.........~...................................................................... + // ldr q13, [x5, #-16] // ..........e....................................................................'............~....................................................................'............~................................................................... + // uzp1 v5.8h, v12.8h, v13.8h // ......................................e........................................'........................................~........................................'........................................~....................................... 
+ // uzp2 v6.8h, v12.8h, v13.8h // ..............e................................................................'................~................................................................'................~............................................................... + // ld1 {v7.8h}, [x6], #16 // ................................................................e..............'..................................................................~..............'..................................................................~............. + // smlal v8.4s, v3.4h, v5.4h // ...............~...............................................................'.................*...............................................................'.................~.............................................................. + // smlal2 v10.4s, v3.8h, v5.8h // .....~.........................................................................'.......*.........................................................................'.......~........................................................................ + // smlal v8.4s, v4.4h, v7.4h // ..................~............................................................'....................*............................................................'....................~........................................................... + // smlal2 v10.4s, v4.8h, v7.8h // ........~......................................................................'..........*......................................................................'..........~..................................................................... + // smlal v9.4s, v3.4h, v6.4h // ............~..................................................................'..............*..................................................................'..............~................................................................. 
+ // smlal2 v11.4s, v3.8h, v6.8h // ..........................................................................e....'............................................................................~....'............................................................................~... + // smlal v9.4s, v4.4h, v5.4h // .................~.............................................................'...................*.............................................................'...................~............................................................ + // smlal2 v11.4s, v4.8h, v5.8h // ...............................................................................*.................................................................................~................................................................................ + // ldr q12, [x7], #32 // .............................................................e.................'...............................................................~.................'...............................................................~................ + // ldr q13, [x7, #-16] // ....................................................e..........................'......................................................~..........................'......................................................~......................... + // uzp1 v3.8h, v12.8h, v13.8h // .................................................................e.............'...................................................................~.............'...................................................................~............ + // uzp2 v4.8h, v12.8h, v13.8h // ...............................................................................'*................................................................................'~............................................................................... 
+ // ldr q12, [x8], #32 // .....................................................................e.........'.......................................................................~.........'.......................................................................~........ + // ldr q13, [x8, #-16] // .....................................................e.........................'.......................................................~.........................'.......................................................~........................ + // uzp1 v5.8h, v12.8h, v13.8h // .............~.................................................................'...............*.................................................................'...............~................................................................ + // uzp2 v6.8h, v12.8h, v13.8h // ...................~...........................................................'.....................*...........................................................'.....................~.......................................................... + // ld1 {v7.8h}, [x9], #16 // ....................................................................e..........'......................................................................~..........'......................................................................~......... + // smlal v8.4s, v3.4h, v5.4h // .......................~.......................................................'.........................*.......................................................'.........................~...................................................... + // smlal2 v10.4s, v3.8h, v5.8h // ....................~..........................................................'......................*..........................................................'......................~......................................................... 
+ // smlal v8.4s, v4.4h, v7.4h // ...........................~...................................................'.............................*...................................................'.............................~.................................................. + // smlal2 v10.4s, v4.8h, v7.8h // ........................~......................................................'..........................*......................................................'..........................~..................................................... + // smlal v9.4s, v3.4h, v6.4h // ......................~........................................................'........................*........................................................'........................~....................................................... + // smlal2 v11.4s, v3.8h, v6.8h // .....................~.........................................................'.......................*.........................................................'.......................~........................................................ + // smlal v9.4s, v4.4h, v5.4h // ..........................~....................................................'............................*....................................................'............................~................................................... + // smlal2 v11.4s, v4.8h, v5.8h // .........................~.....................................................'...........................*.....................................................'...........................~.................................................... + // ldr q12, [x10], #32 // ......................................................e........................'........................................................~........................'........................................................~....................... 
+ // ldr q13, [x10, #-16] // .......................................................e.......................'.........................................................~.......................'.........................................................~...................... + // uzp1 v3.8h, v12.8h, v13.8h // .........................................................e.....................'...........................................................~.....................'...........................................................~.................... + // uzp2 v4.8h, v12.8h, v13.8h // ..........................................................e....................'............................................................~....................'............................................................~................... + // ldr q12, [x11], #32 // ...................................................................e...........'.....................................................................~...........'.....................................................................~.......... + // ldr q13, [x11, #-16] // ..................................................................e............'....................................................................~............'....................................................................~........... + // uzp1 v5.8h, v12.8h, v13.8h // ............................................................................e..'..............................................................................~..'..............................................................................~. + // uzp2 v6.8h, v12.8h, v13.8h // .~.............................................................................'...*.............................................................................'...~............................................................................ 
+ // ld1 {v7.8h}, [x12], #16 // ........................................................e......................'..........................................................~......................'..........................................................~..................... + // smlal v8.4s, v3.4h, v5.4h // ...............................~...............................................'.................................*...............................................'.................................~.............................................. + // smlal2 v10.4s, v3.8h, v5.8h // ............................~..................................................'..............................*..................................................'..............................~................................................. + // smlal v8.4s, v4.4h, v7.4h // ...................................~...........................................'.....................................*...........................................'.....................................~.......................................... + // smlal2 v10.4s, v4.8h, v7.8h // ................................~..............................................'..................................*..............................................'..................................~............................................. + // smlal v9.4s, v3.4h, v6.4h // ..............................~................................................'................................*................................................'................................~............................................... + // smlal2 v11.4s, v3.8h, v6.8h // .............................~.................................................'...............................*.................................................'...............................~................................................ 
+ // smlal v9.4s, v4.4h, v5.4h // ..................................~............................................'....................................*............................................'....................................~........................................... + // smlal2 v11.4s, v4.8h, v5.8h // .................................~.............................................'...................................*.............................................'...................................~............................................ + // uzp1 v28.8h, v8.8h, v10.8h // ..............................................~................................'................................................*................................'................................................~............................... + // mul v28.8h, v28.8h, v2.8h // ................................................~..............................'..................................................*..............................'..................................................~............................. + // smlal v8.4s, v28.4h, v0.4h // ........................................................................~......'..........................................................................*......'..........................................................................~..... + // smlal2 v10.4s, v28.8h, v0.8h // .......................................................................~.......'.........................................................................*.......'.........................................................................~...... + // uzp2 v26.8h, v8.8h, v10.8h // ...........................................................................~...'.............................................................................*...'.............................................................................~.. 
+ // uzp1 v28.8h, v9.8h, v11.8h // .....................................~.........................................'.......................................*.........................................'.......................................~........................................ + // mul v28.8h, v28.8h, v2.8h // .......................................~.......................................'.........................................*.......................................'.........................................~...................................... + // smlal v9.4s, v28.4h, v0.4h // ...........................................~...................................'.............................................*...................................'.............................................~.................................. + // smlal2 v11.4s, v28.8h, v0.8h // ............................................~..................................'..............................................*..................................'..............................................~................................. + // uzp2 v27.8h, v9.8h, v11.8h // .................................................~.............................'...................................................*.............................'...................................................~............................ + // zip1 v12.8h, v26.8h, v27.8h // ..............................................................................~'................................................................................*'................................................................................ + // zip2 v13.8h, v26.8h, v27.8h // ...............................................~...............................'.................................................~...............................'.................................................l.............................. 
+ // str q12, [x0], #32 // .............................................................................~.'...............................................................................~.'...............................................................................l + // str q13, [x0, #-16] // ...................................................~...........................'.....................................................~...........................'.....................................................l.......................... sub count, count, #1 - cbnz count, k4_loop_start - // Instructions: 92 - // Expected cycles: 107 - // Expected IPC: 0.86 - // - // Cycle bound: 107.0 - // IPC bound: 0.86 - // - // Wall time: 15.26s - // User time: 15.26s - // - // -------------------------------------------- cycle (expected) --------------------------------------------> - // 0 25 50 75 100 - // |------------------------|------------------------|------------------------|------------------------|------ - ld2 {v14.8H, v15.8H}, [x4], #32 // *.......................................................................................................... - ld1 {v1.8H}, [x6], #16 // ..*........................................................................................................ - smlal2 v29.4S, v14.8H, v16.8H // ....*...................................................................................................... - smlal v11.4S, v14.4H, v16.4H // .....*..................................................................................................... - ld2 {v12.8H, v13.8H}, [x5], #32 // ......*.................................................................................................... - smlal2 v29.4S, v15.8H, v1.8H // ........*.................................................................................................. 
- smlal v11.4S, v15.4H, v1.4H // .........*................................................................................................. - ld1 {v10.8H}, [x9], #16 // ..........*................................................................................................ - smlal2 v29.4S, v27.8H, v20.8H // ............*.............................................................................................. - smlal v11.4S, v27.4H, v20.4H // .............*............................................................................................. - ld2 {v22.8H, v23.8H}, [x2], #32 // ..............*............................................................................................ - smlal2 v29.4S, v28.8H, v10.8H // ................*.......................................................................................... - smlal v11.4S, v28.4H, v10.4H // .................*......................................................................................... - ld1 {v31.8H}, [x12], #16 // ..................*........................................................................................ - smlal2 v29.4S, v18.8H, v25.8H // ....................*...................................................................................... - smlal v11.4S, v18.4H, v25.4H // .....................*..................................................................................... - smull2 v1.4S, v7.8H, v23.8H // ......................*.................................................................................... - smlal v9.4S, v14.4H, v17.4H // .......................*................................................................................... - smlal2 v29.4S, v19.8H, v31.8H // ........................*.................................................................................. - smlal v11.4S, v19.4H, v31.4H // .........................*................................................................................. 
- smull v31.4S, v7.4H, v23.4H // ..........................*................................................................................ - smlal v9.4S, v15.4H, v16.4H // ...........................*............................................................................... - smull v10.4S, v7.4H, v22.4H // ............................*.............................................................................. - smlal2 v1.4S, v8.8H, v22.8H // .............................*............................................................................. - smlal v31.4S, v8.4H, v22.4H // ..............................*............................................................................ - ld2 {v4.8H, v5.8H}, [x4], #32 // ...............................*........................................................................... - ld2 {v23.8H, v24.8H}, [x11], #32 // .................................*......................................................................... - smlal2 v1.4S, v4.8H, v13.8H // ...................................*....................................................................... - smlal v31.4S, v4.4H, v13.4H // ....................................*...................................................................... - ld1 {v30.8H}, [x6], #16 // .....................................*..................................................................... - smlal2 v1.4S, v5.8H, v12.8H // .......................................*................................................................... - smlal v10.4S, v8.4H, v3.4H // ........................................*.................................................................. - smull2 v7.4S, v7.8H, v22.8H // .........................................*................................................................. - uzp1 v22.8H, v11.8H, v29.8H // ..........................................*................................................................ 
- smlal2 v6.4S, v14.8H, v17.8H // ...........................................*............................................................... - smlal v10.4S, v4.4H, v12.4H // ............................................*.............................................................. - smlal2 v7.4S, v8.8H, v3.8H // .............................................*............................................................. - ld2 {v13.8H, v14.8H}, [x7], #32 // ..............................................*............................................................ - mul v17.8H, v22.8H, v2.H[2] // ................................................*.......................................................... - smlal2 v7.4S, v4.8H, v12.8H // .................................................*......................................................... - smlal v10.4S, v5.4H, v30.4H // ..................................................*........................................................ - smlal2 v6.4S, v15.8H, v16.8H // ...................................................*....................................................... - smlal v11.4S, v17.4H, v0.4H // ....................................................*...................................................... - ld2 {v3.8H, v4.8H}, [x8], #32 // .....................................................*..................................................... - smlal2 v7.4S, v5.8H, v30.8H // .......................................................*................................................... - ld1 {v30.8H}, [x9], #16 // ........................................................*.................................................. - smlal v10.4S, v13.4H, v3.4H // ..........................................................*................................................ - smlal2 v7.4S, v13.8H, v3.8H // ...........................................................*............................................... 
- ld2 {v15.8H, v16.8H}, [x10], #32 // ............................................................*.............................................. - smlal v10.4S, v14.4H, v30.4H // ..............................................................*............................................ - smlal v9.4S, v27.4H, v21.4H // ...............................................................*........................................... - smlal2 v7.4S, v14.8H, v30.8H // ................................................................*.......................................... - smlal2 v6.4S, v27.8H, v21.8H // .................................................................*......................................... - smlal v10.4S, v15.4H, v23.4H // ..................................................................*........................................ - ld1 {v27.8H}, [x12], #16 // ...................................................................*....................................... - smlal2 v7.4S, v15.8H, v23.8H // .....................................................................*..................................... - smlal v31.4S, v5.4H, v12.4H // ......................................................................*.................................... - smlal2 v6.4S, v28.8H, v20.8H // .......................................................................*................................... - smlal v10.4S, v16.4H, v27.4H // ........................................................................*.................................. - smlal2 v7.4S, v16.8H, v27.8H // .........................................................................*................................. - smlal v31.4S, v13.4H, v4.4H // ..........................................................................*................................ - smlal2 v6.4S, v18.8H, v26.8H // ...........................................................................*............................... 
- smlal2 v1.4S, v13.8H, v4.8H // ............................................................................*.............................. - smlal v9.4S, v28.4H, v20.4H // .............................................................................*............................. - smlal v31.4S, v14.4H, v3.4H // ..............................................................................*............................ - uzp1 v13.8H, v10.8H, v7.8H // ...............................................................................*........................... - smlal2 v1.4S, v14.8H, v3.8H // ................................................................................*.......................... - smlal v9.4S, v18.4H, v26.4H // .................................................................................*......................... - smlal v31.4S, v15.4H, v24.4H // ..................................................................................*........................ - smlal2 v6.4S, v19.8H, v25.8H // ...................................................................................*....................... - smlal2 v1.4S, v15.8H, v24.8H // ....................................................................................*...................... - smlal v9.4S, v19.4H, v25.4H // .....................................................................................*..................... - smlal v31.4S, v16.4H, v23.4H // ......................................................................................*.................... - mul v13.8H, v13.8H, v2.H[2] // .......................................................................................*................... - smlal2 v1.4S, v16.8H, v23.8H // ........................................................................................*.................. - uzp1 v23.8H, v9.8H, v6.8H // .........................................................................................*................. 
- smlal2 v29.4S, v17.8H, v0.8H // ..........................................................................................*................ - mul v12.8H, v23.8H, v2.H[2] // ...........................................................................................*............... - uzp1 v4.8H, v31.8H, v1.8H // ............................................................................................*.............. - smlal v10.4S, v13.4H, v0.4H // .............................................................................................*............. - mul v23.8H, v4.8H, v2.H[2] // ..............................................................................................*............ - smlal v9.4S, v12.4H, v0.4H // ...............................................................................................*........... - smlal2 v7.4S, v13.8H, v0.8H // ................................................................................................*.......... - smlal2 v6.4S, v12.8H, v0.8H // .................................................................................................*......... - smlal v31.4S, v23.4H, v0.4H // ..................................................................................................*........ - smlal2 v1.4S, v23.8H, v0.8H // ...................................................................................................*....... - uzp2 v12.8H, v10.8H, v7.8H // ....................................................................................................*...... - uzp2 v25.8H, v9.8H, v6.8H // .....................................................................................................*..... - uzp2 v24.8H, v11.8H, v29.8H // ......................................................................................................*.... - uzp2 v13.8H, v31.8H, v1.8H // .......................................................................................................*... 
- st2 {v24.8H, v25.8H}, [x0], #32 // ........................................................................................................*.. - st2 {v12.8H, v13.8H}, [x0], #32 // ..........................................................................................................* - - // -------------------------------------------- cycle (expected) --------------------------------------------> - // 0 25 50 75 100 - // |------------------------|------------------------|------------------------|------------------------|------ - // ld2 {v13.8H, v14.8H}, [x4], #32 // *.......................................................................................................... - // ld1 {v15.8H}, [x6], #16 // ..*........................................................................................................ - // smlal2 v29.4S, v13.8H, v16.8H // ....*...................................................................................................... - // smlal v9.4S, v13.4H, v17.4H // .......................*................................................................................... - // smlal2 v6.4S, v13.8H, v17.8H // ...........................................*............................................................... - // smlal v11.4S, v13.4H, v16.4H // .....*..................................................................................................... - // smlal2 v29.4S, v14.8H, v15.8H // ........*.................................................................................................. - // smlal v9.4S, v14.4H, v16.4H // ...........................*............................................................................... - // smlal2 v6.4S, v14.8H, v16.8H // ...................................................*....................................................... - // ld1 {v30.8H}, [x9], #16 // ..........*................................................................................................ 
- // smlal v9.4S, v27.4H, v21.4H // ...............................................................*........................................... - // smlal2 v6.4S, v27.8H, v21.8H // .................................................................*......................................... - // smlal v11.4S, v14.4H, v15.4H // .........*................................................................................................. - // smlal2 v29.4S, v27.8H, v20.8H // ............*.............................................................................................. - // smlal v9.4S, v28.4H, v20.4H // .............................................................................*............................. - // smlal2 v6.4S, v28.8H, v20.8H // .......................................................................*................................... - // smlal v11.4S, v27.4H, v20.4H // .............*............................................................................................. - // smlal2 v29.4S, v28.8H, v30.8H // ................*.......................................................................................... - // smlal v9.4S, v18.4H, v26.4H // .................................................................................*......................... - // smlal2 v6.4S, v18.8H, v26.8H // ...........................................................................*............................... - // smlal v11.4S, v28.4H, v30.4H // .................*......................................................................................... - // smlal2 v29.4S, v18.8H, v25.8H // ....................*...................................................................................... - // smlal v9.4S, v19.4H, v25.4H // .....................................................................................*..................... 
- // smlal2 v6.4S, v19.8H, v25.8H // ...................................................................................*....................... - // ld1 {v23.8H}, [x12], #16 // ..................*........................................................................................ - // smlal v11.4S, v18.4H, v25.4H // .....................*..................................................................................... - // uzp1 v14.8H, v9.8H, v6.8H // .........................................................................................*................. - // smlal2 v29.4S, v19.8H, v23.8H // ........................*.................................................................................. - // mul v27.8H, v14.8H, v2.H[2] // ...........................................................................................*............... - // smlal v11.4S, v19.4H, v23.4H // .........................*................................................................................. - // ld2 {v4.8H, v5.8H}, [x2], #32 // ..............*............................................................................................ - // smlal v9.4S, v27.4H, v0.4H // ...............................................................................................*........... - // smlal2 v6.4S, v27.8H, v0.8H // .................................................................................................*......... - // ld2 {v27.8H, v28.8H}, [x7], #32 // ..............................................*............................................................ - // uzp1 v13.8H, v11.8H, v29.8H // ..........................................*................................................................ - // uzp2 v15.8H, v9.8H, v6.8H // .....................................................................................................*..... 
- // mul v13.8H, v13.8H, v2.H[2] // ................................................*.......................................................... - // ld2 {v18.8H, v19.8H}, [x10], #32 // ............................................................*.............................................. - // smull v9.4S, v7.4H, v5.4H // ..........................*................................................................................ - // smlal v11.4S, v13.4H, v0.4H // ....................................................*...................................................... - // smlal2 v29.4S, v13.8H, v0.8H // ..........................................................................................*................ - // ld2 {v20.8H, v21.8H}, [x8], #32 // .....................................................*..................................................... - // smlal v9.4S, v8.4H, v4.4H // ..............................*............................................................................ - // uzp2 v14.8H, v11.8H, v29.8H // ......................................................................................................*.... - // smull2 v29.4S, v7.8H, v4.8H // .........................................*................................................................. - // smull2 v6.4S, v7.8H, v5.8H // ......................*.................................................................................... - // ld2 {v25.8H, v26.8H}, [x11], #32 // .................................*......................................................................... - // smull v11.4S, v7.4H, v4.4H // ............................*.............................................................................. - // smlal2 v6.4S, v8.8H, v4.8H // .............................*............................................................................. 
- // smlal2 v29.4S, v8.8H, v3.8H // .............................................*............................................................. - // ld2 {v16.8H, v17.8H}, [x5], #32 // ......*.................................................................................................... - // smlal v11.4S, v8.4H, v3.4H // ........................................*.................................................................. - // st2 {v14.8H, v15.8H}, [x0], #32 // ........................................................................................................*.. - // ld2 {v13.8H, v14.8H}, [x4], #32 // ...............................*........................................................................... - // ld1 {v15.8H}, [x6], #16 // .....................................*..................................................................... - // smlal2 v29.4S, v13.8H, v16.8H // .................................................*......................................................... - // smlal v9.4S, v13.4H, v17.4H // ....................................*...................................................................... - // smlal2 v6.4S, v13.8H, v17.8H // ...................................*....................................................................... - // smlal v11.4S, v13.4H, v16.4H // ............................................*.............................................................. - // smlal2 v29.4S, v14.8H, v15.8H // .......................................................*................................................... - // smlal v9.4S, v14.4H, v16.4H // ......................................................................*.................................... - // smlal2 v6.4S, v14.8H, v16.8H // .......................................*................................................................... 
- // ld1 {v30.8H}, [x9], #16 // ........................................................*.................................................. - // smlal v9.4S, v27.4H, v21.4H // ..........................................................................*................................ - // smlal2 v6.4S, v27.8H, v21.8H // ............................................................................*.............................. - // smlal v11.4S, v14.4H, v15.4H // ..................................................*........................................................ - // smlal2 v29.4S, v27.8H, v20.8H // ...........................................................*............................................... - // smlal v9.4S, v28.4H, v20.4H // ..............................................................................*............................ - // smlal2 v6.4S, v28.8H, v20.8H // ................................................................................*.......................... - // smlal v11.4S, v27.4H, v20.4H // ..........................................................*................................................ - // smlal2 v29.4S, v28.8H, v30.8H // ................................................................*.......................................... - // smlal v9.4S, v18.4H, v26.4H // ..................................................................................*........................ - // smlal2 v6.4S, v18.8H, v26.8H // ....................................................................................*...................... - // smlal v11.4S, v28.4H, v30.4H // ..............................................................*............................................ - // smlal2 v29.4S, v18.8H, v25.8H // .....................................................................*..................................... 
- // smlal v9.4S, v19.4H, v25.4H // ......................................................................................*.................... - // smlal2 v6.4S, v19.8H, v25.8H // ........................................................................................*.................. - // ld1 {v23.8H}, [x12], #16 // ...................................................................*....................................... - // smlal v11.4S, v18.4H, v25.4H // ..................................................................*........................................ - // uzp1 v14.8H, v9.8H, v6.8H // ............................................................................................*.............. - // smlal2 v29.4S, v19.8H, v23.8H // .........................................................................*................................. - // mul v27.8H, v14.8H, v2.H[2] // ..............................................................................................*............ - // smlal v11.4S, v19.4H, v23.4H // ........................................................................*.................................. - // smlal v9.4S, v27.4H, v0.4H // ..................................................................................................*........ - // smlal2 v6.4S, v27.8H, v0.8H // ...................................................................................................*....... - // uzp1 v13.8H, v11.8H, v29.8H // ...............................................................................*........................... - // uzp2 v15.8H, v9.8H, v6.8H // .......................................................................................................*... - // mul v13.8H, v13.8H, v2.H[2] // .......................................................................................*................... 
- // smlal v11.4S, v13.4H, v0.4H // .............................................................................................*............. - // smlal2 v29.4S, v13.8H, v0.8H // ................................................................................................*.......... - // uzp2 v14.8H, v11.8H, v29.8H // ....................................................................................................*...... - // st2 {v14.8H, v15.8H}, [x0], #32 // ..........................................................................................................* + cbnz count, 1b + // Instructions: 50 + // Expected cycles: 56 + // Expected IPC: 0.89 + // + // Cycle bound: 56.0 + // IPC bound: 0.89 + // + // Wall time: 4.16s + // User time: 4.16s + // + // --------------- original position ---------------> + // 0 25 + // |------------------------| + smull2 v17.4S, v31.8H, v19.8H // ..*............................................... + uzp2 v1.8H, v14.8H, v6.8H // ................*................................. + smull v18.4S, v31.4H, v21.4H // .......*.......................................... + smlal2 v24.4S, v26.8H, v28.8H // *................................................. + smlal2 v17.4S, v16.8H, v23.8H // ....*............................................. + smull v21.4S, v31.4H, v19.4H // .....*............................................ + smlal v18.4S, v16.4H, v19.4H // .........*........................................ + uzp2 v31.8H, v4.8H, v3.8H // .*................................................ + uzp1 v3.8H, v14.8H, v6.8H // ............*..................................... + smlal v21.4S, v16.4H, v23.4H // ..........*....................................... + smlal v18.4S, v20.4H, v27.4H // ...........*...................................... + uzp2 v14.8H, v29.8H, v25.8H // ...*.............................................. + smlal2 v17.4S, v20.8H, v28.8H // ......*........................................... 
+ smlal v21.4S, v20.4H, v28.4H // .............*.................................... + smlal v18.4S, v26.4H, v28.4H // ..............*................................... + smlal2 v24.4S, v9.8H, v1.8H // ..................*............................... + smlal2 v17.4S, v26.8H, v8.8H // ........*......................................... + smlal v21.4S, v26.4H, v8.4H // ...............*.................................. + smlal v18.4S, v9.4H, v1.4H // ...................*.............................. + smlal2 v24.4S, v31.8H, v3.8H // ......................*........................... + smlal2 v17.4S, v9.8H, v3.8H // .................*................................ + smlal v21.4S, v9.4H, v3.4H // ....................*............................. + smlal v18.4S, v31.4H, v3.4H // .......................*.......................... + smlal2 v24.4S, v30.8H, v14.8H // ..........................*....................... + smlal2 v17.4S, v31.8H, v12.8H // .....................*............................ + smlal v21.4S, v31.4H, v12.4H // ........................*......................... + smlal v18.4S, v30.4H, v14.4H // ...........................*...................... + smlal2 v24.4S, v11.8H, v10.8H // ..............................*................... + smlal2 v17.4S, v30.8H, v10.8H // .........................*........................ + smlal v21.4S, v30.4H, v10.4H // ............................*..................... + smlal v18.4S, v11.4H, v10.4H // ...............................*.................. + zip2 v19.8H, v7.8H, v15.8H // ......................................*........... + smlal2 v17.4S, v11.8H, v22.8H // .............................*.................... + smlal v21.4S, v11.4H, v22.4H // ................................*................. + uzp1 v23.8H, v18.8H, v24.8H // .................................*................ + str q19, [x0, #16] // .........................................*........ 
+ mul v19.8H, v23.8H, v2.8H // ..................................*............... + uzp1 v23.8H, v21.8H, v17.8H // .....................................*............ + str q5, [x0], #32 // .............................................*.... + mul v26.8H, v23.8H, v2.8H // .......................................*.......... + smlal v18.4S, v19.4H, v0.4H // ...................................*.............. + smlal2 v24.4S, v19.8H, v0.8H // ....................................*............. + smlal v21.4S, v26.4H, v0.4H // ...........................................*...... + smlal2 v17.4S, v26.8H, v0.8H // ..........................................*....... + uzp2 v13.8H, v18.8H, v24.8H // ........................................*......... + uzp2 v19.8H, v21.8H, v17.8H // ............................................*..... + zip1 v23.8H, v19.8H, v13.8H // ..............................................*... + zip2 v19.8H, v19.8H, v13.8H // ...............................................*.. + str q23, [x0], #32 // .................................................* + str q19, [x0, #-16] // ................................................*. + + // ----------------- new position ------------------> + // 0 25 + // |------------------------|------------------------ + // smlal2 v24.4S, v26.8H, v28.8H // ...*.............................................. + // uzp2 v4.8H, v4.8H, v3.8H // .......*.......................................... + // smull2 v13.4S, v31.8H, v19.8H // *................................................. + // uzp2 v1.8H, v29.8H, v25.8H // ...........*...................................... + // smlal2 v13.4S, v16.8H, v23.8H // ....*............................................. + // smull v18.4S, v31.4H, v19.4H // .....*............................................ + // smlal2 v13.4S, v20.8H, v28.8H // ............*..................................... + // smull v29.4S, v31.4H, v21.4H // ..*............................................... 
+ // smlal2 v13.4S, v26.8H, v8.8H // ................*................................. + // smlal v29.4S, v16.4H, v19.4H // ......*........................................... + // smlal v18.4S, v16.4H, v23.4H // .........*........................................ + // smlal v29.4S, v20.4H, v27.4H // ..........*....................................... + // uzp1 v31.8H, v14.8H, v6.8H // ........*......................................... + // smlal v18.4S, v20.4H, v28.4H // .............*.................................... + // smlal v29.4S, v26.4H, v28.4H // ..............*................................... + // smlal v18.4S, v26.4H, v8.4H // .................*................................ + // uzp2 v26.8H, v14.8H, v6.8H // .*................................................ + // smlal2 v13.4S, v9.8H, v31.8H // ....................*............................. + // smlal2 v24.4S, v9.8H, v26.8H // ...............*.................................. + // smlal v29.4S, v9.4H, v26.4H // ..................*............................... + // smlal v18.4S, v9.4H, v31.4H // .....................*............................ + // smlal2 v13.4S, v4.8H, v12.8H // ........................*......................... + // smlal2 v24.4S, v4.8H, v31.8H // ...................*.............................. + // smlal v29.4S, v4.4H, v31.4H // ......................*........................... + // smlal v18.4S, v4.4H, v12.4H // .........................*........................ + // smlal2 v13.4S, v30.8H, v10.8H // ............................*..................... + // smlal2 v24.4S, v30.8H, v1.8H // .......................*.......................... + // smlal v29.4S, v30.4H, v1.4H // ..........................*....................... + // smlal v18.4S, v30.4H, v10.4H // .............................*.................... + // smlal2 v13.4S, v11.8H, v22.8H // ................................*................. 
+ // smlal2 v24.4S, v11.8H, v10.8H // ...........................*...................... + // smlal v29.4S, v11.4H, v10.4H // ..............................*................... + // smlal v18.4S, v11.4H, v22.4H // .................................*................ + // uzp1 v31.8H, v29.8H, v24.8H // ..................................*............... + // mul v19.8H, v31.8H, v2.8H // ....................................*............. + // smlal v29.4S, v19.4H, v0.4H // ........................................*......... + // smlal2 v24.4S, v19.8H, v0.8H // .........................................*........ + // uzp1 v26.8H, v18.8H, v13.8H // .....................................*............ + // zip2 v14.8H, v7.8H, v15.8H // ...............................*.................. + // mul v23.8H, v26.8H, v2.8H // .......................................*.......... + // uzp2 v15.8H, v29.8H, v24.8H // ............................................*..... + // str q14, [x0, #16] // ...................................*.............. + // smlal2 v13.4S, v23.8H, v0.8H // ...........................................*...... + // smlal v18.4S, v23.4H, v0.4H // ..........................................*....... + // uzp2 v7.8H, v18.8H, v13.8H // .............................................*.... + // str q5, [x0], #32 // ......................................*........... + // zip1 v5.8H, v7.8H, v15.8H // ..............................................*... + // zip2 v14.8H, v7.8H, v15.8H // ...............................................*.. + // str q14, [x0, #16] // .................................................* + // str q5, [x0], #32 // ................................................*. 
pop_stack diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/rej_uniform_asm_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/rej_uniform_asm_clean.S index c51e53188..722dc0f49 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/rej_uniform_asm_clean.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/aarch64/src/rej_uniform_asm_clean.S @@ -22,272 +22,6 @@ #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \ defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) -// Needed to provide ASM_LOAD directive -#include "common.i" -#include "params.h" - -.data -table_data: - .byte -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 0 - .byte 0, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 1 - .byte 2, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 2 - .byte 0, 1, 2, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 3 - .byte 4, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 4 - .byte 0, 1, 4, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 5 - .byte 2, 3, 4, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 6 - .byte 0, 1, 2, 3, 4, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 7 - .byte 6, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 8 - .byte 0, 1, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 9 - .byte 2, 3, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 10 - .byte 0, 1, 2, 3, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 11 - .byte 4, 5, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 12 - .byte 0, 1, 4, 5, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 13 - .byte 2, 3, 4, 5, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 14 - .byte 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1 // 15 - .byte 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 16 - .byte 0, 1, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 17 - .byte 2, 3, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, 
-1, -1, -1 // 18 - .byte 0, 1, 2, 3, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 19 - .byte 4, 5, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 20 - .byte 0, 1, 4, 5, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 21 - .byte 2, 3, 4, 5, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 22 - .byte 0, 1, 2, 3, 4, 5, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1 // 23 - .byte 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 24 - .byte 0, 1, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 25 - .byte 2, 3, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 26 - .byte 0, 1, 2, 3, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1 // 27 - .byte 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 28 - .byte 0, 1, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1 // 29 - .byte 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1 // 30 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1 // 31 - .byte 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 32 - .byte 0, 1, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 33 - .byte 2, 3, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 34 - .byte 0, 1, 2, 3, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 35 - .byte 4, 5, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 36 - .byte 0, 1, 4, 5, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 37 - .byte 2, 3, 4, 5, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 38 - .byte 0, 1, 2, 3, 4, 5, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1 // 39 - .byte 6, 7, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 40 - .byte 0, 1, 6, 7, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 41 - .byte 2, 3, 6, 7, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 42 - .byte 0, 1, 2, 3, 6, 7, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1 // 43 - .byte 4, 5, 6, 7, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 44 - .byte 0, 1, 4, 5, 6, 7, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1 // 45 - .byte 2, 3, 4, 5, 6, 7, 10, 11, -1, 
-1, -1, -1, -1, -1, -1, -1 // 46 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, -1, -1, -1, -1, -1, -1 // 47 - .byte 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 48 - .byte 0, 1, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 49 - .byte 2, 3, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 50 - .byte 0, 1, 2, 3, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1 // 51 - .byte 4, 5, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 52 - .byte 0, 1, 4, 5, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1 // 53 - .byte 2, 3, 4, 5, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1 // 54 - .byte 0, 1, 2, 3, 4, 5, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1 // 55 - .byte 6, 7, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 56 - .byte 0, 1, 6, 7, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1 // 57 - .byte 2, 3, 6, 7, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1 // 58 - .byte 0, 1, 2, 3, 6, 7, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1 // 59 - .byte 4, 5, 6, 7, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1 // 60 - .byte 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1 // 61 - .byte 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1 // 62 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1, -1, -1 // 63 - .byte 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 64 - .byte 0, 1, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 65 - .byte 2, 3, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 66 - .byte 0, 1, 2, 3, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 67 - .byte 4, 5, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 68 - .byte 0, 1, 4, 5, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 69 - .byte 2, 3, 4, 5, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 70 - .byte 0, 1, 2, 3, 4, 5, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 71 - .byte 6, 7, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 72 - .byte 0, 1, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 73 - .byte 2, 3, 6, 7, 
12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 74 - .byte 0, 1, 2, 3, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 75 - .byte 4, 5, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 76 - .byte 0, 1, 4, 5, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 77 - .byte 2, 3, 4, 5, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 78 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1 // 79 - .byte 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 80 - .byte 0, 1, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 81 - .byte 2, 3, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 82 - .byte 0, 1, 2, 3, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 83 - .byte 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 84 - .byte 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 85 - .byte 2, 3, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 86 - .byte 0, 1, 2, 3, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1 // 87 - .byte 6, 7, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 88 - .byte 0, 1, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 89 - .byte 2, 3, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 90 - .byte 0, 1, 2, 3, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1 // 91 - .byte 4, 5, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 92 - .byte 0, 1, 4, 5, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1 // 93 - .byte 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1 // 94 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, -1, -1, -1, -1 // 95 - .byte 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 96 - .byte 0, 1, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 97 - .byte 2, 3, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 98 - .byte 0, 1, 2, 3, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 99 - .byte 4, 5, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 100 - .byte 0, 1, 4, 5, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 101 - 
.byte 2, 3, 4, 5, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 102 - .byte 0, 1, 2, 3, 4, 5, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1 // 103 - .byte 6, 7, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 104 - .byte 0, 1, 6, 7, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 105 - .byte 2, 3, 6, 7, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 106 - .byte 0, 1, 2, 3, 6, 7, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1 // 107 - .byte 4, 5, 6, 7, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 108 - .byte 0, 1, 4, 5, 6, 7, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1 // 109 - .byte 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1 // 110 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, -1, -1, -1, -1 // 111 - .byte 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 112 - .byte 0, 1, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 113 - .byte 2, 3, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 114 - .byte 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1 // 115 - .byte 4, 5, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 116 - .byte 0, 1, 4, 5, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1 // 117 - .byte 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1 // 118 - .byte 0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1 // 119 - .byte 6, 7, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1 // 120 - .byte 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1 // 121 - .byte 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1 // 122 - .byte 0, 1, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1 // 123 - .byte 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1 // 124 - .byte 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1 // 125 - .byte 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1 // 126 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1 // 127 - .byte 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 128 - .byte 0, 1, 14, 15, -1, -1, -1, -1, -1, 
-1, -1, -1, -1, -1, -1, -1 // 129 - .byte 2, 3, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 130 - .byte 0, 1, 2, 3, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 131 - .byte 4, 5, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 132 - .byte 0, 1, 4, 5, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 133 - .byte 2, 3, 4, 5, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 134 - .byte 0, 1, 2, 3, 4, 5, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 135 - .byte 6, 7, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 136 - .byte 0, 1, 6, 7, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 137 - .byte 2, 3, 6, 7, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 138 - .byte 0, 1, 2, 3, 6, 7, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 139 - .byte 4, 5, 6, 7, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 140 - .byte 0, 1, 4, 5, 6, 7, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 141 - .byte 2, 3, 4, 5, 6, 7, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 142 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 14, 15, -1, -1, -1, -1, -1, -1 // 143 - .byte 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 144 - .byte 0, 1, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 145 - .byte 2, 3, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 146 - .byte 0, 1, 2, 3, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 147 - .byte 4, 5, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 148 - .byte 0, 1, 4, 5, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 149 - .byte 2, 3, 4, 5, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 150 - .byte 0, 1, 2, 3, 4, 5, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1 // 151 - .byte 6, 7, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 152 - .byte 0, 1, 6, 7, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 153 - .byte 2, 3, 6, 7, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 154 - .byte 0, 1, 2, 3, 6, 7, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1 // 155 - .byte 4, 5, 6, 7, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, 
-1 // 156 - .byte 0, 1, 4, 5, 6, 7, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1 // 157 - .byte 2, 3, 4, 5, 6, 7, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1 // 158 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 14, 15, -1, -1, -1, -1 // 159 - .byte 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 160 - .byte 0, 1, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 161 - .byte 2, 3, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 162 - .byte 0, 1, 2, 3, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 163 - .byte 4, 5, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 164 - .byte 0, 1, 4, 5, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 165 - .byte 2, 3, 4, 5, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 166 - .byte 0, 1, 2, 3, 4, 5, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1 // 167 - .byte 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 168 - .byte 0, 1, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 169 - .byte 2, 3, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 170 - .byte 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1 // 171 - .byte 4, 5, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 172 - .byte 0, 1, 4, 5, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1 // 173 - .byte 2, 3, 4, 5, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1 // 174 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1 // 175 - .byte 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 176 - .byte 0, 1, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 177 - .byte 2, 3, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 178 - .byte 0, 1, 2, 3, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1 // 179 - .byte 4, 5, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 180 - .byte 0, 1, 4, 5, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1 // 181 - .byte 2, 3, 4, 5, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1 // 182 - .byte 0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1 // 183 - .byte 6, 7, 
8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 184 - .byte 0, 1, 6, 7, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1 // 185 - .byte 2, 3, 6, 7, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1 // 186 - .byte 0, 1, 2, 3, 6, 7, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1 // 187 - .byte 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1 // 188 - .byte 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1 // 189 - .byte 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, -1, -1, -1, -1 // 190 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, -1, -1 // 191 - .byte 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 192 - .byte 0, 1, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 193 - .byte 2, 3, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 194 - .byte 0, 1, 2, 3, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 195 - .byte 4, 5, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 196 - .byte 0, 1, 4, 5, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 197 - .byte 2, 3, 4, 5, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 198 - .byte 0, 1, 2, 3, 4, 5, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 199 - .byte 6, 7, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 200 - .byte 0, 1, 6, 7, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 201 - .byte 2, 3, 6, 7, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 202 - .byte 0, 1, 2, 3, 6, 7, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 203 - .byte 4, 5, 6, 7, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 204 - .byte 0, 1, 4, 5, 6, 7, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 205 - .byte 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 206 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, -1, -1, -1, -1 // 207 - .byte 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 208 - .byte 0, 1, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 209 - .byte 2, 3, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 210 - .byte 0, 1, 2, 3, 8, 9, 12, 13, 14, 
15, -1, -1, -1, -1, -1, -1 // 211 - .byte 4, 5, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 212 - .byte 0, 1, 4, 5, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 213 - .byte 2, 3, 4, 5, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 214 - .byte 0, 1, 2, 3, 4, 5, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1 // 215 - .byte 6, 7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 216 - .byte 0, 1, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 217 - .byte 2, 3, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 218 - .byte 0, 1, 2, 3, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1 // 219 - .byte 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 220 - .byte 0, 1, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1 // 221 - .byte 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1, -1, -1 // 222 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, -1, -1 // 223 - .byte 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 224 - .byte 0, 1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 225 - .byte 2, 3, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 226 - .byte 0, 1, 2, 3, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 227 - .byte 4, 5, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 228 - .byte 0, 1, 4, 5, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 229 - .byte 2, 3, 4, 5, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 230 - .byte 0, 1, 2, 3, 4, 5, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1 // 231 - .byte 6, 7, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 232 - .byte 0, 1, 6, 7, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 233 - .byte 2, 3, 6, 7, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 234 - .byte 0, 1, 2, 3, 6, 7, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1 // 235 - .byte 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 236 - .byte 0, 1, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1 // 237 - .byte 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1 // 
238 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, -1, -1 // 239 - .byte 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 // 240 - .byte 0, 1, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 241 - .byte 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 242 - .byte 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1 // 243 - .byte 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 244 - .byte 0, 1, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1 // 245 - .byte 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1 // 246 - .byte 0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1 // 247 - .byte 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1 // 248 - .byte 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1 // 249 - .byte 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1 // 250 - .byte 0, 1, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1 // 251 - .byte 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1 // 252 - .byte 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1 // 253 - .byte 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1 // 254 - .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 // 255 - -bit_table_data: - .short 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 - // We save the output on the stack first, and copy to the actual // output buffer only in the end. This is because the main loop can overwrite // by up to 62 bytes, which we account for here (we use 64 bytes for alignment). 
@@ -306,11 +40,9 @@ bit_table_data: output .req x0 buf .req x1 buflen .req w2 - len .req w3 + table_idx .req x3 - /* Pointers to static data */ - table_idx .req x5 - bit_table .req x6 + len .req w4 /* Temporary output on the stack */ output_tmp .req x7 @@ -381,17 +113,17 @@ bit_table_data: bits_q .req q31 .text +/* Literal pool */ +.p2align 4 +c_bit_table: + .short 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 + .align 4 -.global MLKEM_NAMESPACE(rej_uniform_asm_clean) -.global _MLKEM_NAMESPACE(rej_uniform_asm_clean) -MLKEM_NAMESPACE(rej_uniform_asm_clean): -_MLKEM_NAMESPACE(rej_uniform_asm_clean): +.global MLKEM_ASM_NAMESPACE(rej_uniform_asm_clean) +MLKEM_ASM_NAMESPACE(rej_uniform_asm_clean): push_stack - ASM_LOAD(bit_table, bit_table_data) - ASM_LOAD(table_idx, table_data) - - ldr bits_q, [bit_table] + ldr bits_q, c_bit_table movz tmp, #MLKEM_Q dup mlkem_q.8h, tmp diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/common.h index 3469d2739..8177b0b50 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/common.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/common.h @@ -26,4 +26,17 @@ * since the backend choice may be part of the namespace. */ #include "namespace.h" +/* On Apple platforms, we need to emit leading underscore + * in front of assembly symbols. We thus introduce a separate + * namespace wrapper for ASM symbols. 
*/ +#if !defined(__APPLE__) +#define MLKEM_ASM_NAMESPACE(sym) MLKEM_NAMESPACE(sym) +#define FIPS202_ASM_NAMESPACE(sym) FIPS202_NAMESPACE(sym) +#else +#define _PREFIX_UNDERSCORE(sym) _##sym +#define PREFIX_UNDERSCORE(sym) _PREFIX_UNDERSCORE(sym) +#define MLKEM_ASM_NAMESPACE(sym) PREFIX_UNDERSCORE(MLKEM_NAMESPACE(sym)) +#define FIPS202_ASM_NAMESPACE(sym) PREFIX_UNDERSCORE(FIPS202_NAMESPACE(sym)) +#endif + #endif /* MLKEM_NATIVE_COMMON_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/config.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/config.h index ac27408eb..31040a471 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/config.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/config.h @@ -39,23 +39,19 @@ /****************************************************************************** * Name: MLKEM_NAMESPACE - * _MLKEM_NAMESPACE * * Description: The macros to use to namespace global symbols * from mlkem/. *****************************************************************************/ #define MLKEM_NAMESPACE(sym) MLKEM_DEFAULT_NAMESPACE(sym) -#define _MLKEM_NAMESPACE(sym) _MLKEM_DEFAULT_NAMESPACE(sym) /****************************************************************************** * Name: FIPS202_NAMESPACE - * _FIPS202_NAMESPACE * * Description: The macros to use to namespace global symbols * from mlkem/fips202/. 
*****************************************************************************/ #define FIPS202_NAMESPACE(sym) FIPS202_DEFAULT_NAMESPACE(sym) -#define _FIPS202_NAMESPACE(sym) _FIPS202_DEFAULT_NAMESPACE(sym) /****************************************************************************** * Name: MLKEM_USE_NATIVE diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/zetas.c index f52b2ff5a..8ab46ce0b 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/zetas.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/zetas.c @@ -14,7 +14,7 @@ * Table of zeta values used in the reference NTT and inverse NTT. * See autogenerate_files.py for details. */ -const int16_t zetas[128] = { +ALIGN const int16_t zetas[128] = { -1044, -758, -359, -1517, 1493, 1422, 287, 202, -171, 622, 1577, 182, 962, -1202, -1474, 1468, 573, -1325, 264, 383, -829, 1458, -1602, -130, -681, 1017, 732, 608, -1542, 411, -205, -1571, 1223, diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/common.h index 3469d2739..8177b0b50 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/common.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/common.h @@ -26,4 +26,17 @@ * since the backend choice may be part of the namespace. */ #include "namespace.h" +/* On Apple platforms, we need to emit leading underscore + * in front of assembly symbols. We thus introduce a separate + * namespace wrapper for ASM symbols. 
*/ +#if !defined(__APPLE__) +#define MLKEM_ASM_NAMESPACE(sym) MLKEM_NAMESPACE(sym) +#define FIPS202_ASM_NAMESPACE(sym) FIPS202_NAMESPACE(sym) +#else +#define _PREFIX_UNDERSCORE(sym) _##sym +#define PREFIX_UNDERSCORE(sym) _PREFIX_UNDERSCORE(sym) +#define MLKEM_ASM_NAMESPACE(sym) PREFIX_UNDERSCORE(MLKEM_NAMESPACE(sym)) +#define FIPS202_ASM_NAMESPACE(sym) PREFIX_UNDERSCORE(FIPS202_NAMESPACE(sym)) +#endif + #endif /* MLKEM_NATIVE_COMMON_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/config.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/config.h index ac27408eb..31040a471 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/config.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/config.h @@ -39,23 +39,19 @@ /****************************************************************************** * Name: MLKEM_NAMESPACE - * _MLKEM_NAMESPACE * * Description: The macros to use to namespace global symbols * from mlkem/. *****************************************************************************/ #define MLKEM_NAMESPACE(sym) MLKEM_DEFAULT_NAMESPACE(sym) -#define _MLKEM_NAMESPACE(sym) _MLKEM_DEFAULT_NAMESPACE(sym) /****************************************************************************** * Name: FIPS202_NAMESPACE - * _FIPS202_NAMESPACE * * Description: The macros to use to namespace global symbols * from mlkem/fips202/. 
*****************************************************************************/ #define FIPS202_NAMESPACE(sym) FIPS202_DEFAULT_NAMESPACE(sym) -#define _FIPS202_NAMESPACE(sym) _FIPS202_DEFAULT_NAMESPACE(sym) /****************************************************************************** * Name: MLKEM_USE_NATIVE diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/zetas.c index f52b2ff5a..8ab46ce0b 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/zetas.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/zetas.c @@ -14,7 +14,7 @@ * Table of zeta values used in the reference NTT and inverse NTT. * See autogenerate_files.py for details. */ -const int16_t zetas[128] = { +ALIGN const int16_t zetas[128] = { -1044, -758, -359, -1517, 1493, 1422, 287, 202, -171, 622, 1577, 182, 962, -1202, -1474, 1468, 573, -1325, 264, 383, -829, 1458, -1602, -130, -681, 1017, 732, 608, -1542, 411, -205, -1571, 1223, diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/common.h index 3469d2739..8177b0b50 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/common.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/common.h @@ -26,4 +26,17 @@ * since the backend choice may be part of the namespace. */ #include "namespace.h" +/* On Apple platforms, we need to emit leading underscore + * in front of assembly symbols. We thus introduce a separate + * namespace wrapper for ASM symbols. 
*/ +#if !defined(__APPLE__) +#define MLKEM_ASM_NAMESPACE(sym) MLKEM_NAMESPACE(sym) +#define FIPS202_ASM_NAMESPACE(sym) FIPS202_NAMESPACE(sym) +#else +#define _PREFIX_UNDERSCORE(sym) _##sym +#define PREFIX_UNDERSCORE(sym) _PREFIX_UNDERSCORE(sym) +#define MLKEM_ASM_NAMESPACE(sym) PREFIX_UNDERSCORE(MLKEM_NAMESPACE(sym)) +#define FIPS202_ASM_NAMESPACE(sym) PREFIX_UNDERSCORE(FIPS202_NAMESPACE(sym)) +#endif + #endif /* MLKEM_NATIVE_COMMON_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/config.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/config.h index ac27408eb..31040a471 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/config.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/config.h @@ -39,23 +39,19 @@ /****************************************************************************** * Name: MLKEM_NAMESPACE - * _MLKEM_NAMESPACE * * Description: The macros to use to namespace global symbols * from mlkem/. *****************************************************************************/ #define MLKEM_NAMESPACE(sym) MLKEM_DEFAULT_NAMESPACE(sym) -#define _MLKEM_NAMESPACE(sym) _MLKEM_DEFAULT_NAMESPACE(sym) /****************************************************************************** * Name: FIPS202_NAMESPACE - * _FIPS202_NAMESPACE * * Description: The macros to use to namespace global symbols * from mlkem/fips202/. 
*****************************************************************************/ #define FIPS202_NAMESPACE(sym) FIPS202_DEFAULT_NAMESPACE(sym) -#define _FIPS202_NAMESPACE(sym) _FIPS202_DEFAULT_NAMESPACE(sym) /****************************************************************************** * Name: MLKEM_USE_NATIVE diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/basemul.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/basemul.S index 503fbeb51..c51606b3f 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/basemul.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/basemul.S @@ -113,8 +113,8 @@ vmovdqa %ymm11,(64*\off+48)*2(%rdi) .endm .text -.global MLKEM_NAMESPACE(basemul_avx2) -MLKEM_NAMESPACE(basemul_avx2): +.global MLKEM_ASM_NAMESPACE(basemul_avx2) +MLKEM_ASM_NAMESPACE(basemul_avx2): mov %rsp,%r8 and $-32,%rsp sub $32,%rsp diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/fq.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/fq.S index 50ef190b7..61560fb46 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/fq.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/fq.S @@ -60,8 +60,8 @@ vmovdqa %ymm9,224(%rdi) ret -.global MLKEM_NAMESPACE(reduce_avx2) -MLKEM_NAMESPACE(reduce_avx2): +.global MLKEM_ASM_NAMESPACE(reduce_avx2) +MLKEM_ASM_NAMESPACE(reduce_avx2): #consts vmovdqa _16XQ*2(%rsi),%ymm0 vmovdqa _16XV*2(%rsi),%ymm1 @@ -103,8 +103,8 @@ vmovdqa %ymm10,224(%rdi) ret -.global MLKEM_NAMESPACE(tomont_avx2) -MLKEM_NAMESPACE(tomont_avx2): +.global MLKEM_ASM_NAMESPACE(tomont_avx2) +MLKEM_ASM_NAMESPACE(tomont_avx2): #consts vmovdqa _16XQ*2(%rsi),%ymm0 vmovdqa _16XMONTSQLO*2(%rsi),%ymm1 diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/intt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/intt.S index 4860985ed..fda9ec0d3 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/intt.S +++ 
b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/intt.S @@ -242,8 +242,8 @@ vmovdqa %ymm11,(64*\off+176)*2(%rdi) .endm .text -.global MLKEM_NAMESPACE(invntt_avx2) -MLKEM_NAMESPACE(invntt_avx2): +.global MLKEM_ASM_NAMESPACE(invntt_avx2) +MLKEM_ASM_NAMESPACE(invntt_avx2): vmovdqa _16XQ*2(%rsi),%ymm0 intt_levels0t5 0 diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/ntt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/ntt.S index a0b6f734c..dc3776b37 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/ntt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/ntt.S @@ -206,8 +206,8 @@ vmovdqa %ymm11,(128*\off+112)*2(%rdi) .endm .text -.global MLKEM_NAMESPACE(ntt_avx2) -MLKEM_NAMESPACE(ntt_avx2): +.global MLKEM_ASM_NAMESPACE(ntt_avx2) +MLKEM_ASM_NAMESPACE(ntt_avx2): vmovdqa _16XQ*2(%rsi),%ymm0 level0 0 diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/shuffle.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/shuffle.S index 34f6b30b0..0d23c11b3 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/shuffle.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/x86_64/src/shuffle.S @@ -15,8 +15,8 @@ #include "fq.inc" #include "shuffle.inc" -.global MLKEM_NAMESPACE(nttpack_avx2) -MLKEM_NAMESPACE(nttpack_avx2): +.global MLKEM_ASM_NAMESPACE(nttpack_avx2) +MLKEM_ASM_NAMESPACE(nttpack_avx2): #load vmovdqa (%rdi),%ymm4 vmovdqa 32(%rdi),%ymm5 @@ -102,8 +102,8 @@ vmovdqa %ymm11,224(%rdi) ret -.global MLKEM_NAMESPACE(nttunpack_avx2) -MLKEM_NAMESPACE(nttunpack_avx2): +.global MLKEM_ASM_NAMESPACE(nttunpack_avx2) +MLKEM_ASM_NAMESPACE(nttunpack_avx2): call nttunpack128_avx2 add $256,%rdi call nttunpack128_avx2 @@ -169,8 +169,8 @@ vmovdqu %ymm9,160(%rdi) ret -.global MLKEM_NAMESPACE(ntttobytes_avx2) -MLKEM_NAMESPACE(ntttobytes_avx2): +.global MLKEM_ASM_NAMESPACE(ntttobytes_avx2) +MLKEM_ASM_NAMESPACE(ntttobytes_avx2): #consts vmovdqa _16XQ*2(%rdx),%ymm0 call 
ntttobytes128_avx @@ -245,8 +245,8 @@ vmovdqa %ymm1,224(%rdi) ret -.global MLKEM_NAMESPACE(nttfrombytes_avx2) -MLKEM_NAMESPACE(nttfrombytes_avx2): +.global MLKEM_ASM_NAMESPACE(nttfrombytes_avx2) +MLKEM_ASM_NAMESPACE(nttfrombytes_avx2): #consts vmovdqa _16XMASK*2(%rdx),%ymm0 call nttfrombytes128_avx diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/zetas.c index f52b2ff5a..8ab46ce0b 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/zetas.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/zetas.c @@ -14,7 +14,7 @@ * Table of zeta values used in the reference NTT and inverse NTT. * See autogenerate_files.py for details. */ -const int16_t zetas[128] = { +ALIGN const int16_t zetas[128] = { -1044, -758, -359, -1517, 1493, 1422, 287, 202, -171, 622, 1577, 182, 962, -1202, -1474, 1468, 573, -1325, 264, 383, -829, 1458, -1602, -130, -681, 1017, 732, 608, -1542, 411, -205, -1571, 1223,