From 961d25e9c7e4a1758adb1dbeaa15187de69dd052 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 May 2018 22:54:39 +0200 Subject: [PATCH 01/86] Use the new zrot.c on POWER8 for crot as well fixes #1571 (the old zrot.S assembly does not handle incx=0 correctly) --- kernel/power/KERNEL.POWER8 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 00ff8682a5..1aa0610785 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -133,7 +133,7 @@ ZNRM2KERNEL = ../arm/znrm2.c # SROTKERNEL = srot.c DROTKERNEL = drot.c -#CROTKERNEL = ../arm/zrot.c +CROTKERNEL = zrot.c ZROTKERNEL = zrot.c # SSCALKERNEL = sscal.c From 43e592ceb38a56716279a6514ceca1ec9bdb0865 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Thu, 24 May 2018 20:56:24 +0800 Subject: [PATCH 02/86] Add -lm for Android. Conflicts: exports/Makefile --- exports/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/exports/Makefile b/exports/Makefile index 53d4f75bbc..127b05057b 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -128,6 +128,8 @@ so : ../$(LIBSONAME) ifeq ($(OSNAME), Android) INTERNALNAME = $(LIBPREFIX).so +FEXTRALIB += -lm +EXTRALIB += -lm else INTERNALNAME = $(LIBPREFIX).so.$(MAJOR_VERSION) endif From 908d40be715bfb252972a0a4abf27726a729945f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 29 May 2018 14:27:46 +0200 Subject: [PATCH 03/86] Adapt lapack-test and blas-test to changes in netlib directory layout partial fix for #1574 - the problem with lapack_testing.py looks like an upstream bug --- Makefile | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index c0e5fbcf82..380ba1ce8f 100644 --- a/Makefile +++ b/Makefile @@ -294,9 +294,10 @@ endif lapack-test : (cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out) - $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd 
xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc + $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/EIG xeigtstc xeigtstd xeigtsts xeigtstz + $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/LIN xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc ifneq ($(CROSS), 1) - ( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \ + ( cd $(NETLIB_LAPACK_DIR)/INSTALL; make all; ./testlsame; ./testslamch; ./testdlamch; \ ./testsecond; ./testdsecnd; ./testieee; ./testversion ) (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r ) endif @@ -308,9 +309,9 @@ lapack-runtest: blas-test: - (cd $(NETLIB_LAPACK_DIR)/BLAS && rm -f x* *.out) + (cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && rm -f x* *.out) $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing - (cd $(NETLIB_LAPACK_DIR)/BLAS && cat *.out) + (cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && cat *.out) dummy : From a7dbd4c57d22b580b32f3a97b0b327bf2fedf551 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 May 2018 11:19:33 +0200 Subject: [PATCH 04/86] Fix paths to LIN and EIG tests should fix 1574 --- lapack-netlib/lapack_testing.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lapack-netlib/lapack_testing.py b/lapack-netlib/lapack_testing.py index 3c917482d6..5d07e1e876 100755 --- a/lapack-netlib/lapack_testing.py +++ b/lapack-netlib/lapack_testing.py @@ -257,16 +257,16 @@ def run_summary_test( f, cmdline, short_summary): else: if dtest==16: # LIN TESTS - cmdbase="xlintst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" + cmdbase="LIN/xlintst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" elif dtest==17: # PROTO LIN TESTS - cmdbase="xlintst"+letter+dtypes[0][dtype-1]+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" + cmdbase="LIN/xlintst"+letter+dtypes[0][dtype-1]+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" elif dtest==18: # PROTO LIN TESTS - cmdbase="xlintstrf"+letter+" < 
"+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" + cmdbase="LIN/xlintstrf"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" else: # EIG TESTS - cmdbase="xeigtst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" + cmdbase="EIG/xeigtst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" if (not just_errors and not short_summary): print("Testing "+name+" "+dtests[1][dtest]+"-"+cmdbase, end=' ') # Run the process: either to read the file or run the LAPACK testing From 5fae96fb70cbc1205e50220f77722ac5ff92f0d8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 May 2018 12:43:45 +0200 Subject: [PATCH 05/86] Update version to 0.3.1.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b5789119a4..f49f205137 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 0.dev) +set(OpenBLAS_PATCH_VERSION 1.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From b491b10057196c5735a261608ec110b1bbd134d1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 May 2018 12:44:36 +0200 Subject: [PATCH 06/86] Update version to 0.3.1.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 12734464bb..1b4b8eb637 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.0.dev +VERSION = 0.3.1.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. 
Meanwhile, the soname in shared library From d1b7be14aa9b57ca4df9c00cdb4611974729b3be Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 May 2018 12:52:04 +0200 Subject: [PATCH 07/86] Handle INCX=0,INCY=0 case Fixes #1575 (sswap/dswap failing the swap utest on x86) as suggested by atsampson. --- kernel/x86/swap.S | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/x86/swap.S b/kernel/x86/swap.S index 54b00b33ec..d3cf04942c 100644 --- a/kernel/x86/swap.S +++ b/kernel/x86/swap.S @@ -138,6 +138,14 @@ /* INCX != 1 or INCY != 1 */ .L14: + cmpl $0, %ebx + jne .L141 + cmpl $0, %ecx + jne .L141 +/* INCX == 0 and INCY == 0 */ + jmp .L27 + +.L141 movl %edx, %eax sarl $2, %eax jle .L28 From a91f1587b9be6c9bbc403a79970d3e2a03bf866c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 May 2018 13:26:00 +0200 Subject: [PATCH 08/86] Work around name clash with Windows10's winnt.h fixes #1503 --- driver/level3/Makefile | 48 +++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/driver/level3/Makefile b/driver/level3/Makefile index 3522252065..e320092e37 100644 --- a/driver/level3/Makefile +++ b/driver/level3/Makefile @@ -362,7 +362,7 @@ cgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) cgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) cgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -410,7 +410,7 @@ zgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) zgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) zgemm_cc.$(SUFFIX) : gemm.c level3.c 
../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -458,7 +458,7 @@ xgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) xgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) xgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -558,7 +558,7 @@ cgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) cgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) cgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -606,7 +606,7 @@ zgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) zgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) zgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -654,7 +654,7 @@ xgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) xgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 
-DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) xgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -1821,7 +1821,7 @@ cgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) cgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c - $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) cgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -1869,7 +1869,7 @@ zgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) zgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c - $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) zgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -1917,7 +1917,7 @@ xgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) xgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) xgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -1974,7 +1974,7 @@ cgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) cgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) cgemm3m_thread_cc.$(SUFFIX) : 
gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -2022,7 +2022,7 @@ zgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) zgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) zgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -2070,7 +2070,7 @@ xgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) xgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) xgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -2731,7 +2731,7 @@ cgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) cgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) cgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -2779,7 +2779,7 @@ zgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) zgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX 
-DCR $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) zgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -2827,7 +2827,7 @@ xgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) xgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) xgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -2927,7 +2927,7 @@ cgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) cgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) cgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -2975,7 +2975,7 @@ zgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) zgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) zgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -3023,7 +3023,7 @@ xgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) xgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c 
../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) xgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -4190,7 +4190,7 @@ cgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) cgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c - $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) cgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -4238,7 +4238,7 @@ zgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) zgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c - $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) zgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -4286,7 +4286,7 @@ xgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) xgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) xgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -4343,7 +4343,7 @@ cgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) cgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c 
-DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) cgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -4391,7 +4391,7 @@ zgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) zgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) zgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) @@ -4439,7 +4439,7 @@ xgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) xgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F) xgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) From 2fc748bf7200ca53d66d43107dc2c732685519d0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 May 2018 13:41:12 +0200 Subject: [PATCH 09/86] Restore optimized swap kernel now that we have a proper fix --- kernel/x86/KERNEL.NEHALEM | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/x86/KERNEL.NEHALEM b/kernel/x86/KERNEL.NEHALEM index 835520efb8..65b03ae50e 100644 --- a/kernel/x86/KERNEL.NEHALEM +++ b/kernel/x86/KERNEL.NEHALEM @@ -1,3 +1 @@ include $(KERNELDIR)/KERNEL.PENRYN -SSWAPKERNEL 
= ../arm/swap.c -DSWAPKERNEL = ../arm/swap.c From 7df8c4f76fa7aadd8d1bce1d99fe826a4826d775 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 May 2018 17:23:08 +0200 Subject: [PATCH 10/86] typo fix --- kernel/x86/swap.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86/swap.S b/kernel/x86/swap.S index d3cf04942c..e30c278985 100644 --- a/kernel/x86/swap.S +++ b/kernel/x86/swap.S @@ -145,7 +145,7 @@ /* INCX == 0 and INCY == 0 */ jmp .L27 -.L141 +.L141: movl %edx, %eax sarl $2, %eax jle .L28 From e2a8c35e5a6897e5aebf5e2fb8ba18f94735c89a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 1 Jun 2018 15:08:14 +0200 Subject: [PATCH 11/86] Fixes from netlib PR253 LAPACKE interfaces for Aasen's functions now call ?sytrf_aa and ?hetrf_aa instead of ?sytrf and ?hetrf --- lapack-netlib/LAPACKE/src/lapacke_chetrf_aa_work.c | 6 +++--- lapack-netlib/LAPACKE/src/lapacke_csytrf_aa_work.c | 6 +++--- lapack-netlib/LAPACKE/src/lapacke_dsytrf_aa_work.c | 6 +++--- lapack-netlib/LAPACKE/src/lapacke_ssytrf_aa_work.c | 6 +++--- lapack-netlib/LAPACKE/src/lapacke_zhetrf_aa_work.c | 6 +++--- lapack-netlib/LAPACKE/src/lapacke_zsytrf_aa_work.c | 6 +++--- 6 files changed, 18 insertions(+), 18 deletions(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_chetrf_aa_work.c b/lapack-netlib/LAPACKE/src/lapacke_chetrf_aa_work.c index b4a7595d8d..e4d5387796 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chetrf_aa_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chetrf_aa_work.c @@ -41,7 +41,7 @@ lapack_int LAPACKE_chetrf_aa_work( int matrix_layout, char uplo, lapack_int n, lapack_int info = 0; if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ - LAPACK_chetrf( &uplo, &n, a, &lda, ipiv, work, &lwork, &info ); + LAPACK_chetrf_aa( &uplo, &n, a, &lda, ipiv, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } @@ -56,7 +56,7 @@ lapack_int LAPACKE_chetrf_aa_work( int matrix_layout, char uplo, lapack_int n, } /* Query optimal 
working array(s) size if requested */ if( lwork == -1 ) { - LAPACK_chetrf( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info ); + LAPACK_chetrf_aa( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info ); return (info < 0) ? (info - 1) : info; } /* Allocate memory for temporary array(s) */ @@ -69,7 +69,7 @@ lapack_int LAPACKE_chetrf_aa_work( int matrix_layout, char uplo, lapack_int n, /* Transpose input matrices */ LAPACKE_che_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t ); /* Call LAPACK function and adjust info */ - LAPACK_chetrf( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info ); + LAPACK_chetrf_aa( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } diff --git a/lapack-netlib/LAPACKE/src/lapacke_csytrf_aa_work.c b/lapack-netlib/LAPACKE/src/lapacke_csytrf_aa_work.c index d4f24142ba..f6661c85c6 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_csytrf_aa_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_csytrf_aa_work.c @@ -41,7 +41,7 @@ lapack_int LAPACKE_csytrf_aa_work( int matrix_layout, char uplo, lapack_int n, lapack_int info = 0; if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ - LAPACK_csytrf( &uplo, &n, a, &lda, ipiv, work, &lwork, &info ); + LAPACK_csytrf_aa( &uplo, &n, a, &lda, ipiv, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } @@ -56,7 +56,7 @@ lapack_int LAPACKE_csytrf_aa_work( int matrix_layout, char uplo, lapack_int n, } /* Query optimal working array(s) size if requested */ if( lwork == -1 ) { - LAPACK_csytrf( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info ); + LAPACK_csytrf_aa( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info ); return (info < 0) ? 
(info - 1) : info; } /* Allocate memory for temporary array(s) */ @@ -69,7 +69,7 @@ lapack_int LAPACKE_csytrf_aa_work( int matrix_layout, char uplo, lapack_int n, /* Transpose input matrices */ LAPACKE_csy_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t ); /* Call LAPACK function and adjust info */ - LAPACK_csytrf( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info ); + LAPACK_csytrf_aa( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsytrf_aa_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsytrf_aa_work.c index cbf97b6326..e72bfa6de6 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsytrf_aa_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsytrf_aa_work.c @@ -40,7 +40,7 @@ lapack_int LAPACKE_dsytrf_aa_work( int matrix_layout, char uplo, lapack_int n, lapack_int info = 0; if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ - LAPACK_dsytrf( &uplo, &n, a, &lda, ipiv, work, &lwork, &info ); + LAPACK_dsytrf_aa( &uplo, &n, a, &lda, ipiv, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } @@ -55,7 +55,7 @@ lapack_int LAPACKE_dsytrf_aa_work( int matrix_layout, char uplo, lapack_int n, } /* Query optimal working array(s) size if requested */ if( lwork == -1 ) { - LAPACK_dsytrf( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info ); + LAPACK_dsytrf_aa( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info ); return (info < 0) ? 
(info - 1) : info; } /* Allocate memory for temporary array(s) */ @@ -67,7 +67,7 @@ lapack_int LAPACKE_dsytrf_aa_work( int matrix_layout, char uplo, lapack_int n, /* Transpose input matrices */ LAPACKE_dsy_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t ); /* Call LAPACK function and adjust info */ - LAPACK_dsytrf( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info ); + LAPACK_dsytrf_aa( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssytrf_aa_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssytrf_aa_work.c index d68cb17c18..182946a452 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssytrf_aa_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssytrf_aa_work.c @@ -40,7 +40,7 @@ lapack_int LAPACKE_ssytrf_aa_work( int matrix_layout, char uplo, lapack_int n, lapack_int info = 0; if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ - LAPACK_ssytrf( &uplo, &n, a, &lda, ipiv, work, &lwork, &info ); + LAPACK_ssytrf_aa( &uplo, &n, a, &lda, ipiv, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } @@ -55,7 +55,7 @@ lapack_int LAPACKE_ssytrf_aa_work( int matrix_layout, char uplo, lapack_int n, } /* Query optimal working array(s) size if requested */ if( lwork == -1 ) { - LAPACK_ssytrf( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info ); + LAPACK_ssytrf_aa( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info ); return (info < 0) ? 
(info - 1) : info; } /* Allocate memory for temporary array(s) */ @@ -67,7 +67,7 @@ lapack_int LAPACKE_ssytrf_aa_work( int matrix_layout, char uplo, lapack_int n, /* Transpose input matrices */ LAPACKE_ssy_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t ); /* Call LAPACK function and adjust info */ - LAPACK_ssytrf( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info ); + LAPACK_ssytrf_aa( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhetrf_aa_work.c b/lapack-netlib/LAPACKE/src/lapacke_zhetrf_aa_work.c index 5214217fb8..dbad2d81e1 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhetrf_aa_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhetrf_aa_work.c @@ -41,7 +41,7 @@ lapack_int LAPACKE_zhetrf_aa_work( int matrix_layout, char uplo, lapack_int n, lapack_int info = 0; if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ - LAPACK_zhetrf( &uplo, &n, a, &lda, ipiv, work, &lwork, &info ); + LAPACK_zhetrf_aa( &uplo, &n, a, &lda, ipiv, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } @@ -56,7 +56,7 @@ lapack_int LAPACKE_zhetrf_aa_work( int matrix_layout, char uplo, lapack_int n, } /* Query optimal working array(s) size if requested */ if( lwork == -1 ) { - LAPACK_zhetrf( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info ); + LAPACK_zhetrf_aa( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info ); return (info < 0) ? 
(info - 1) : info; } /* Allocate memory for temporary array(s) */ @@ -69,7 +69,7 @@ lapack_int LAPACKE_zhetrf_aa_work( int matrix_layout, char uplo, lapack_int n, /* Transpose input matrices */ LAPACKE_zhe_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t ); /* Call LAPACK function and adjust info */ - LAPACK_zhetrf( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info ); + LAPACK_zhetrf_aa( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zsytrf_aa_work.c b/lapack-netlib/LAPACKE/src/lapacke_zsytrf_aa_work.c index 29d75319e1..03726c63e0 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zsytrf_aa_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zsytrf_aa_work.c @@ -41,7 +41,7 @@ lapack_int LAPACKE_zsytrf_aa_work( int matrix_layout, char uplo, lapack_int n, lapack_int info = 0; if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ - LAPACK_zsytrf( &uplo, &n, a, &lda, ipiv, work, &lwork, &info ); + LAPACK_zsytrf_aa( &uplo, &n, a, &lda, ipiv, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } @@ -56,7 +56,7 @@ lapack_int LAPACKE_zsytrf_aa_work( int matrix_layout, char uplo, lapack_int n, } /* Query optimal working array(s) size if requested */ if( lwork == -1 ) { - LAPACK_zsytrf( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info ); + LAPACK_zsytrf_aa( &uplo, &n, a, &lda_t, ipiv, work, &lwork, &info ); return (info < 0) ? 
(info - 1) : info; } /* Allocate memory for temporary array(s) */ @@ -69,7 +69,7 @@ lapack_int LAPACKE_zsytrf_aa_work( int matrix_layout, char uplo, lapack_int n, /* Transpose input matrices */ LAPACKE_zsy_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t ); /* Call LAPACK function and adjust info */ - LAPACK_zsytrf( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info ); + LAPACK_zsytrf_aa( &uplo, &n, a_t, &lda_t, ipiv, work, &lwork, &info ); if( info < 0 ) { info = info - 1; } From 677e42d7b0c6b6c40af94268fbb9d9be60f7af0a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 1 Jun 2018 15:12:59 +0200 Subject: [PATCH 12/86] Fixes from netlib PR 253 When minimal workspace is given in ?hesv_aa, ?sysv_aa, ?hesv_aa_2stage, ?sysv_aa_2stage, now no error is given Quick return for ?laqr1 --- lapack-netlib/SRC/cgejsv.f | 4 ++-- lapack-netlib/SRC/chesv_aa.f | 5 ++--- lapack-netlib/SRC/chesv_aa_2stage.f | 15 +++++++++------ lapack-netlib/SRC/chetrf_aa_2stage.f | 6 +++++- lapack-netlib/SRC/chetrs_aa_2stage.f | 1 + lapack-netlib/SRC/cla_syamv.f | 2 +- lapack-netlib/SRC/claqr1.f | 7 +++++++ lapack-netlib/SRC/csysv_aa.f | 3 --- lapack-netlib/SRC/csysv_aa_2stage.f | 15 +++++++++------ lapack-netlib/SRC/csytrf_aa_2stage.f | 6 +++++- lapack-netlib/SRC/csytri2.f | 6 +++--- lapack-netlib/SRC/csytrs_aa_2stage.f | 1 + lapack-netlib/SRC/ctrevc3.f | 18 +++++++++--------- lapack-netlib/SRC/dgelqt.f | 2 +- lapack-netlib/SRC/dla_syamv.f | 2 +- lapack-netlib/SRC/dlaqr1.f | 7 +++++++ lapack-netlib/SRC/dsysv_aa.f | 3 --- lapack-netlib/SRC/dsysv_aa_2stage.f | 13 +++++++------ lapack-netlib/SRC/dsytrf_aa_2stage.f | 8 ++++++-- lapack-netlib/SRC/dsytri2.f | 6 +++--- lapack-netlib/SRC/dsytrs_aa_2stage.f | 1 + lapack-netlib/SRC/dtrevc3.f | 4 ++-- lapack-netlib/SRC/iparmq.f | 4 ++-- lapack-netlib/SRC/sla_syamv.f | 2 +- lapack-netlib/SRC/slaqr1.f | 7 +++++++ lapack-netlib/SRC/ssysv_aa.f | 3 --- lapack-netlib/SRC/ssysv_aa_2stage.f | 13 +++++++------ lapack-netlib/SRC/ssytrf_aa_2stage.f | 6 +++++- 
lapack-netlib/SRC/ssytri2.f | 4 ++-- lapack-netlib/SRC/ssytrs_aa_2stage.f | 1 + lapack-netlib/SRC/strevc3.f | 12 ++++++------ lapack-netlib/SRC/zgejsv.f | 4 ++-- lapack-netlib/SRC/zhesv_aa.f | 5 ++--- lapack-netlib/SRC/zhesv_aa_2stage.f | 13 +++++++------ lapack-netlib/SRC/zhetrf_aa_2stage.f | 6 +++++- lapack-netlib/SRC/zhetrs_aa_2stage.f | 7 ++++--- lapack-netlib/SRC/zla_syamv.f | 2 +- lapack-netlib/SRC/zlaqr1.f | 7 +++++++ lapack-netlib/SRC/zsysv_aa.f | 3 --- lapack-netlib/SRC/zsysv_aa_2stage.f | 13 +++++++------ lapack-netlib/SRC/zsytrf_aa_2stage.f | 6 +++++- lapack-netlib/SRC/zsytri2.f | 2 +- lapack-netlib/SRC/zsytrs_aa_2stage.f | 1 + 43 files changed, 155 insertions(+), 101 deletions(-) diff --git a/lapack-netlib/SRC/cgejsv.f b/lapack-netlib/SRC/cgejsv.f index 8eb43cf507..a7b1c451c2 100644 --- a/lapack-netlib/SRC/cgejsv.f +++ b/lapack-netlib/SRC/cgejsv.f @@ -701,7 +701,7 @@ SUBROUTINE CGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP, LWSVDJ = MAX( 2 * N, 1 ) LWSVDJV = MAX( 2 * N, 1 ) * .. minimal REAL workspace length for CGEQP3, CPOCON, CGESVJ - LRWQP3 = N + LRWQP3 = 2 * N LRWCON = N LRWSVDJ = N IF ( LQUERY ) THEN @@ -939,7 +939,7 @@ SUBROUTINE CGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP, END IF END IF MINWRK = MAX( 2, MINWRK ) - OPTWRK = MAX( 2, OPTWRK ) + OPTWRK = MAX( OPTWRK, MINWRK ) IF ( LWORK .LT. MINWRK .AND. (.NOT.LQUERY) ) INFO = - 17 IF ( LRWORK .LT. MINRWRK .AND. (.NOT.LQUERY) ) INFO = - 19 END IF diff --git a/lapack-netlib/SRC/chesv_aa.f b/lapack-netlib/SRC/chesv_aa.f index 0bf636b486..470f910bc1 100644 --- a/lapack-netlib/SRC/chesv_aa.f +++ b/lapack-netlib/SRC/chesv_aa.f @@ -209,6 +209,8 @@ SUBROUTINE CHESV_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 + ELSE IF( LWORK.LT.MAX( 2*N, 3*N-2 ) .AND. 
.NOT.LQUERY ) THEN + INFO = -10 END IF * IF( INFO.EQ.0 ) THEN @@ -219,9 +221,6 @@ SUBROUTINE CHESV_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, LWKOPT_HETRS = INT( WORK(1) ) LWKOPT = MAX( LWKOPT_HETRF, LWKOPT_HETRS ) WORK( 1 ) = LWKOPT - IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN - INFO = -10 - END IF END IF * IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/chesv_aa_2stage.f b/lapack-netlib/SRC/chesv_aa_2stage.f index 057d9c57a9..05f6b7bb78 100644 --- a/lapack-netlib/SRC/chesv_aa_2stage.f +++ b/lapack-netlib/SRC/chesv_aa_2stage.f @@ -105,6 +105,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N, internally *> used to select NB such that LTB >= (3*NB+1)*N. *> @@ -124,7 +125,7 @@ *> *> \param[out] IPIV2 *> \verbatim -*> IPIV is INTEGER array, dimension (N) +*> IPIV2 is INTEGER array, dimension (N) *> On exit, it contains the details of the interchanges, i.e., *> the row and column k of T were interchanged with the *> row and column IPIV(k). @@ -150,6 +151,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The size of WORK. LWORK >= N, internally used to select NB *> such that LWORK >= N*NB. *> @@ -233,19 +235,18 @@ SUBROUTINE CHESV_AA_2STAGE( UPLO, N, NRHS, A, LDA, TB, LTB, INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 + ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 + ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN + INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL CHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) LWKOPT = INT( WORK(1) ) - IF( LTB.LT.INT( TB(1) ) .AND. .NOT.TQUERY ) THEN - INFO = -7 - ELSE IF( LWORK.LT.LWKOPT .AND. 
.NOT.WQUERY ) THEN - INFO = -13 - END IF END IF * IF( INFO.NE.0 ) THEN @@ -270,6 +271,8 @@ SUBROUTINE CHESV_AA_2STAGE( UPLO, N, NRHS, A, LDA, TB, LTB, END IF * WORK( 1 ) = LWKOPT +* + RETURN * * End of CHESV_AA_2STAGE * diff --git a/lapack-netlib/SRC/chetrf_aa_2stage.f b/lapack-netlib/SRC/chetrf_aa_2stage.f index 0fa2ae3a06..ce34d73cce 100644 --- a/lapack-netlib/SRC/chetrf_aa_2stage.f +++ b/lapack-netlib/SRC/chetrf_aa_2stage.f @@ -93,6 +93,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N, internally *> used to select NB such that LTB >= (3*NB+1)*N. *> @@ -112,7 +113,7 @@ *> *> \param[out] IPIV2 *> \verbatim -*> IPIV is INTEGER array, dimension (N) +*> IPIV2 is INTEGER array, dimension (N) *> On exit, it contains the details of the interchanges, i.e., *> the row and column k of T were interchanged with the *> row and column IPIV(k). @@ -125,6 +126,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The size of WORK. LWORK >= N, internally used to select NB *> such that LWORK >= N*NB. *> @@ -658,6 +660,8 @@ SUBROUTINE CHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, * * Factor the band matrix CALL CGBTRF( N, N, NB, NB, TB, LDTB, IPIV2, INFO ) +* + RETURN * * End of CHETRF_AA_2STAGE * diff --git a/lapack-netlib/SRC/chetrs_aa_2stage.f b/lapack-netlib/SRC/chetrs_aa_2stage.f index 3f85766730..05d09275b3 100644 --- a/lapack-netlib/SRC/chetrs_aa_2stage.f +++ b/lapack-netlib/SRC/chetrs_aa_2stage.f @@ -87,6 +87,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N. 
*> \endverbatim *> diff --git a/lapack-netlib/SRC/cla_syamv.f b/lapack-netlib/SRC/cla_syamv.f index e1d3df9601..695b5e4786 100644 --- a/lapack-netlib/SRC/cla_syamv.f +++ b/lapack-netlib/SRC/cla_syamv.f @@ -241,7 +241,7 @@ SUBROUTINE CLA_SYAMV( UPLO, N, ALPHA, A, LDA, X, INCX, BETA, Y, INFO = 10 END IF IF( INFO.NE.0 )THEN - CALL XERBLA( 'SSYMV ', INFO ) + CALL XERBLA( 'CLA_SYAMV', INFO ) RETURN END IF * diff --git a/lapack-netlib/SRC/claqr1.f b/lapack-netlib/SRC/claqr1.f index b76bedf60a..9779471968 100644 --- a/lapack-netlib/SRC/claqr1.f +++ b/lapack-netlib/SRC/claqr1.f @@ -142,6 +142,13 @@ SUBROUTINE CLAQR1( N, H, LDH, S1, S2, V ) CABS1( CDUM ) = ABS( REAL( CDUM ) ) + ABS( AIMAG( CDUM ) ) * .. * .. Executable Statements .. +* +* Quick return if possible +* + IF( N.NE.2 .AND. N.NE.3 ) THEN + RETURN + END IF +* IF( N.EQ.2 ) THEN S = CABS1( H( 1, 1 )-S2 ) + CABS1( H( 2, 1 ) ) IF( S.EQ.RZERO ) THEN diff --git a/lapack-netlib/SRC/csysv_aa.f b/lapack-netlib/SRC/csysv_aa.f index 9cd669d334..87be734ccb 100644 --- a/lapack-netlib/SRC/csysv_aa.f +++ b/lapack-netlib/SRC/csysv_aa.f @@ -221,9 +221,6 @@ SUBROUTINE CSYSV_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, LWKOPT_SYTRS = INT( WORK(1) ) LWKOPT = MAX( LWKOPT_SYTRF, LWKOPT_SYTRS ) WORK( 1 ) = LWKOPT - IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN - INFO = -10 - END IF END IF * IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/csysv_aa_2stage.f b/lapack-netlib/SRC/csysv_aa_2stage.f index cba57fc3e3..a13349824f 100644 --- a/lapack-netlib/SRC/csysv_aa_2stage.f +++ b/lapack-netlib/SRC/csysv_aa_2stage.f @@ -105,6 +105,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N, internally *> used to select NB such that LTB >= (3*NB+1)*N. 
*> @@ -124,7 +125,7 @@ *> *> \param[out] IPIV2 *> \verbatim -*> IPIV is INTEGER array, dimension (N) +*> IPIV2 is INTEGER array, dimension (N) *> On exit, it contains the details of the interchanges, i.e., *> the row and column k of T were interchanged with the *> row and column IPIV(k). @@ -150,6 +151,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The size of WORK. LWORK >= N, internally used to select NB *> such that LWORK >= N*NB. *> @@ -233,19 +235,18 @@ SUBROUTINE CSYSV_AA_2STAGE( UPLO, N, NRHS, A, LDA, TB, LTB, INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 + ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 + ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN + INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL CSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) LWKOPT = INT( WORK(1) ) - IF( LTB.LT.INT( TB(1) ) .AND. .NOT.TQUERY ) THEN - INFO = -7 - ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.WQUERY ) THEN - INFO = -13 - END IF END IF * IF( INFO.NE.0 ) THEN @@ -270,6 +271,8 @@ SUBROUTINE CSYSV_AA_2STAGE( UPLO, N, NRHS, A, LDA, TB, LTB, END IF * WORK( 1 ) = LWKOPT +* + RETURN * * End of CSYSV_AA_2STAGE * diff --git a/lapack-netlib/SRC/csytrf_aa_2stage.f b/lapack-netlib/SRC/csytrf_aa_2stage.f index 0a6bfbe31f..0d0bd156cc 100644 --- a/lapack-netlib/SRC/csytrf_aa_2stage.f +++ b/lapack-netlib/SRC/csytrf_aa_2stage.f @@ -93,6 +93,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N, internally *> used to select NB such that LTB >= (3*NB+1)*N. *> @@ -112,7 +113,7 @@ *> *> \param[out] IPIV2 *> \verbatim -*> IPIV is INTEGER array, dimension (N) +*> IPIV2 is INTEGER array, dimension (N) *> On exit, it contains the details of the interchanges, i.e., *> the row and column k of T were interchanged with the *> row and column IPIV(k). @@ -125,6 +126,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The size of WORK. 
LWORK >= N, internally used to select NB *> such that LWORK >= N*NB. *> @@ -662,6 +664,8 @@ SUBROUTINE CSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, * * Factor the band matrix CALL CGBTRF( N, N, NB, NB, TB, LDTB, IPIV2, INFO ) +* + RETURN * * End of CSYTRF_AA_2STAGE * diff --git a/lapack-netlib/SRC/csytri2.f b/lapack-netlib/SRC/csytri2.f index 4c6baaa3e6..4bd8e4f994 100644 --- a/lapack-netlib/SRC/csytri2.f +++ b/lapack-netlib/SRC/csytri2.f @@ -96,11 +96,11 @@ *> LWORK is INTEGER *> The dimension of the array WORK. *> WORK is size >= (N+NB+1)*(NB+3) -*> If LDWORK = -1, then a workspace query is assumed; the routine +*> If LWORK = -1, then a workspace query is assumed; the routine *> calculates: *> - the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, -*> - and no error message related to LDWORK is issued by XERBLA. +*> - and no error message related to LWORK is issued by XERBLA. *> \endverbatim *> *> \param[out] INFO @@ -163,7 +163,7 @@ SUBROUTINE CSYTRI2( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) * Get blocksize - NBMAX = ILAENV( 1, 'CSYTRF', UPLO, N, -1, -1, -1 ) + NBMAX = ILAENV( 1, 'CSYTRI2', UPLO, N, -1, -1, -1 ) IF ( NBMAX .GE. N ) THEN MINSIZE = N ELSE diff --git a/lapack-netlib/SRC/csytrs_aa_2stage.f b/lapack-netlib/SRC/csytrs_aa_2stage.f index 03bccda823..d025c08fe8 100644 --- a/lapack-netlib/SRC/csytrs_aa_2stage.f +++ b/lapack-netlib/SRC/csytrs_aa_2stage.f @@ -85,6 +85,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N. *> \endverbatim *> diff --git a/lapack-netlib/SRC/ctrevc3.f b/lapack-netlib/SRC/ctrevc3.f index c06b404770..a134c1a50c 100644 --- a/lapack-netlib/SRC/ctrevc3.f +++ b/lapack-netlib/SRC/ctrevc3.f @@ -27,8 +27,8 @@ * .. * .. Array Arguments .. 
* LOGICAL SELECT( * ) -* REAL RWORK( * ) -* COMPLEX T( LDT, * ), VL( LDVL, * ), VR( LDVR, * ), +* REAL RWORK( * ) +* COMPLEX T( LDT, * ), VL( LDVL, * ), VR( LDVR, * ), * $ WORK( * ) * .. * @@ -258,17 +258,17 @@ SUBROUTINE CTREVC3( SIDE, HOWMNY, SELECT, N, T, LDT, VL, LDVL, VR, * .. * .. Array Arguments .. LOGICAL SELECT( * ) - REAL RWORK( * ) - COMPLEX T( LDT, * ), VL( LDVL, * ), VR( LDVR, * ), + REAL RWORK( * ) + COMPLEX T( LDT, * ), VL( LDVL, * ), VR( LDVR, * ), $ WORK( * ) * .. * * ===================================================================== * * .. Parameters .. - REAL ZERO, ONE + REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) - COMPLEX CZERO, CONE + COMPLEX CZERO, CONE PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ), $ CONE = ( 1.0E+0, 0.0E+0 ) ) INTEGER NBMIN, NBMAX @@ -277,13 +277,13 @@ SUBROUTINE CTREVC3( SIDE, HOWMNY, SELECT, N, T, LDT, VL, LDVL, VR, * .. Local Scalars .. LOGICAL ALLV, BOTHV, LEFTV, LQUERY, OVER, RIGHTV, SOMEV INTEGER I, II, IS, J, K, KI, IV, MAXWRK, NB - REAL OVFL, REMAX, SCALE, SMIN, SMLNUM, ULP, UNFL - COMPLEX CDUM + REAL OVFL, REMAX, SCALE, SMIN, SMLNUM, ULP, UNFL + COMPLEX CDUM * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV, ICAMAX - REAL SLAMCH, SCASUM + REAL SLAMCH, SCASUM EXTERNAL LSAME, ILAENV, ICAMAX, SLAMCH, SCASUM * .. * .. External Subroutines .. diff --git a/lapack-netlib/SRC/dgelqt.f b/lapack-netlib/SRC/dgelqt.f index 2124f3dc38..5b4ee65b51 100644 --- a/lapack-netlib/SRC/dgelqt.f +++ b/lapack-netlib/SRC/dgelqt.f @@ -158,7 +158,7 @@ SUBROUTINE DGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO ) INTEGER I, IB, IINFO, K * .. * .. External Subroutines .. - EXTERNAL DGEQRT2, DGELQT3, DGEQRT3, DLARFB, XERBLA + EXTERNAL DGELQT3, DLARFB, XERBLA * .. * .. Executable Statements .. 
* diff --git a/lapack-netlib/SRC/dla_syamv.f b/lapack-netlib/SRC/dla_syamv.f index 29566a6e9b..bb6dbe2889 100644 --- a/lapack-netlib/SRC/dla_syamv.f +++ b/lapack-netlib/SRC/dla_syamv.f @@ -230,7 +230,7 @@ SUBROUTINE DLA_SYAMV( UPLO, N, ALPHA, A, LDA, X, INCX, BETA, Y, INFO = 10 END IF IF( INFO.NE.0 )THEN - CALL XERBLA( 'DSYMV ', INFO ) + CALL XERBLA( 'DLA_SYAMV', INFO ) RETURN END IF * diff --git a/lapack-netlib/SRC/dlaqr1.f b/lapack-netlib/SRC/dlaqr1.f index 81a462fb39..795b072ab3 100644 --- a/lapack-netlib/SRC/dlaqr1.f +++ b/lapack-netlib/SRC/dlaqr1.f @@ -147,6 +147,13 @@ SUBROUTINE DLAQR1( N, H, LDH, SR1, SI1, SR2, SI2, V ) INTRINSIC ABS * .. * .. Executable Statements .. +* +* Quick return if possible +* + IF( N.NE.2 .AND. N.NE.3 ) THEN + RETURN + END IF +* IF( N.EQ.2 ) THEN S = ABS( H( 1, 1 )-SR2 ) + ABS( SI2 ) + ABS( H( 2, 1 ) ) IF( S.EQ.ZERO ) THEN diff --git a/lapack-netlib/SRC/dsysv_aa.f b/lapack-netlib/SRC/dsysv_aa.f index cbccd5e650..7192928c6d 100644 --- a/lapack-netlib/SRC/dsysv_aa.f +++ b/lapack-netlib/SRC/dsysv_aa.f @@ -221,9 +221,6 @@ SUBROUTINE DSYSV_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, LWKOPT_SYTRS = INT( WORK(1) ) LWKOPT = MAX( LWKOPT_SYTRF, LWKOPT_SYTRS ) WORK( 1 ) = LWKOPT - IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN - INFO = -10 - END IF END IF * IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/dsysv_aa_2stage.f b/lapack-netlib/SRC/dsysv_aa_2stage.f index ac3c77d76b..05e538f0b9 100644 --- a/lapack-netlib/SRC/dsysv_aa_2stage.f +++ b/lapack-netlib/SRC/dsysv_aa_2stage.f @@ -107,6 +107,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N, internally *> used to select NB such that LTB >= (3*NB+1)*N. *> @@ -126,7 +127,7 @@ *> *> \param[out] IPIV2 *> \verbatim -*> IPIV is INTEGER array, dimension (N) +*> IPIV2 is INTEGER array, dimension (N) *> On exit, it contains the details of the interchanges, i.e., *> the row and column k of T were interchanged with the *> row and column IPIV(k). 
@@ -152,6 +153,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The size of WORK. LWORK >= N, internally used to select NB *> such that LWORK >= N*NB. *> @@ -235,19 +237,18 @@ SUBROUTINE DSYSV_AA_2STAGE( UPLO, N, NRHS, A, LDA, TB, LTB, INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 + ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 + ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN + INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL DSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) LWKOPT = INT( WORK(1) ) - IF( LTB.LT.INT( TB(1) ) .AND. .NOT.TQUERY ) THEN - INFO = -7 - ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.WQUERY ) THEN - INFO = -13 - END IF END IF * IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/dsytrf_aa_2stage.f b/lapack-netlib/SRC/dsytrf_aa_2stage.f index f5f06cc1d2..25fc1a2eb0 100644 --- a/lapack-netlib/SRC/dsytrf_aa_2stage.f +++ b/lapack-netlib/SRC/dsytrf_aa_2stage.f @@ -93,6 +93,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N, internally *> used to select NB such that LTB >= (3*NB+1)*N. *> @@ -109,6 +110,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The size of WORK. LWORK >= N, internally used to select NB *> such that LWORK >= N*NB. *> @@ -128,10 +130,10 @@ *> *> \param[out] IPIV2 *> \verbatim -*> IPIV is INTEGER array, dimension (N) +*> IPIV2 is INTEGER array, dimension (N) *> On exit, it contains the details of the interchanges, i.e., *> the row and column k of T were interchanged with the -*> row and column IPIV(k). +*> row and column IPIV2(k). 
*> \endverbatim *> *> \param[out] INFO @@ -641,6 +643,8 @@ SUBROUTINE DSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, * * Factor the band matrix CALL DGBTRF( N, N, NB, NB, TB, LDTB, IPIV2, INFO ) +* + RETURN * * End of DSYTRF_AA_2STAGE * diff --git a/lapack-netlib/SRC/dsytri2.f b/lapack-netlib/SRC/dsytri2.f index 9aa21a854f..23f8b9fa26 100644 --- a/lapack-netlib/SRC/dsytri2.f +++ b/lapack-netlib/SRC/dsytri2.f @@ -96,11 +96,11 @@ *> LWORK is INTEGER *> The dimension of the array WORK. *> WORK is size >= (N+NB+1)*(NB+3) -*> If LDWORK = -1, then a workspace query is assumed; the routine +*> If LWORK = -1, then a workspace query is assumed; the routine *> calculates: *> - the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, -*> - and no error message related to LDWORK is issued by XERBLA. +*> - and no error message related to LWORK is issued by XERBLA. *> \endverbatim *> *> \param[out] INFO @@ -163,7 +163,7 @@ SUBROUTINE DSYTRI2( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) * Get blocksize - NBMAX = ILAENV( 1, 'DSYTRF', UPLO, N, -1, -1, -1 ) + NBMAX = ILAENV( 1, 'DSYTRI2', UPLO, N, -1, -1, -1 ) IF ( NBMAX .GE. N ) THEN MINSIZE = N ELSE diff --git a/lapack-netlib/SRC/dsytrs_aa_2stage.f b/lapack-netlib/SRC/dsytrs_aa_2stage.f index caff5d4adc..bb283cb953 100644 --- a/lapack-netlib/SRC/dsytrs_aa_2stage.f +++ b/lapack-netlib/SRC/dsytrs_aa_2stage.f @@ -85,6 +85,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N. 
*> \endverbatim *> diff --git a/lapack-netlib/SRC/dtrevc3.f b/lapack-netlib/SRC/dtrevc3.f index 745f636d0b..957baf4f09 100644 --- a/lapack-netlib/SRC/dtrevc3.f +++ b/lapack-netlib/SRC/dtrevc3.f @@ -45,9 +45,9 @@ *> The right eigenvector x and the left eigenvector y of T corresponding *> to an eigenvalue w are defined by: *> -*> T*x = w*x, (y**H)*T = w*(y**H) +*> T*x = w*x, (y**T)*T = w*(y**T) *> -*> where y**H denotes the conjugate transpose of y. +*> where y**T denotes the transpose of the vector y. *> The eigenvalues are not input to this routine, but are read directly *> from the diagonal blocks of T. *> diff --git a/lapack-netlib/SRC/iparmq.f b/lapack-netlib/SRC/iparmq.f index e576e0db01..a9212b3e03 100644 --- a/lapack-netlib/SRC/iparmq.f +++ b/lapack-netlib/SRC/iparmq.f @@ -104,13 +104,13 @@ *> *> \param[in] NAME *> \verbatim -*> NAME is character string +*> NAME is CHARACTER string *> Name of the calling subroutine *> \endverbatim *> *> \param[in] OPTS *> \verbatim -*> OPTS is character string +*> OPTS is CHARACTER string *> This is a concatenation of the string arguments to *> TTQRE. *> \endverbatim diff --git a/lapack-netlib/SRC/sla_syamv.f b/lapack-netlib/SRC/sla_syamv.f index d40e7bd95f..4459f4d8bd 100644 --- a/lapack-netlib/SRC/sla_syamv.f +++ b/lapack-netlib/SRC/sla_syamv.f @@ -230,7 +230,7 @@ SUBROUTINE SLA_SYAMV( UPLO, N, ALPHA, A, LDA, X, INCX, BETA, Y, INFO = 10 END IF IF( INFO.NE.0 )THEN - CALL XERBLA( 'SSYMV ', INFO ) + CALL XERBLA( 'SLA_SYAMV', INFO ) RETURN END IF * diff --git a/lapack-netlib/SRC/slaqr1.f b/lapack-netlib/SRC/slaqr1.f index 7d7d851eec..2de33849dd 100644 --- a/lapack-netlib/SRC/slaqr1.f +++ b/lapack-netlib/SRC/slaqr1.f @@ -147,6 +147,13 @@ SUBROUTINE SLAQR1( N, H, LDH, SR1, SI1, SR2, SI2, V ) INTRINSIC ABS * .. * .. Executable Statements .. +* +* Quick return if possible +* + IF( N.NE.2 .AND. 
N.NE.3 ) THEN + RETURN + END IF +* IF( N.EQ.2 ) THEN S = ABS( H( 1, 1 )-SR2 ) + ABS( SI2 ) + ABS( H( 2, 1 ) ) IF( S.EQ.ZERO ) THEN diff --git a/lapack-netlib/SRC/ssysv_aa.f b/lapack-netlib/SRC/ssysv_aa.f index abf52b1437..e470f58830 100644 --- a/lapack-netlib/SRC/ssysv_aa.f +++ b/lapack-netlib/SRC/ssysv_aa.f @@ -220,9 +220,6 @@ SUBROUTINE SSYSV_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, LWKOPT_SYTRS = INT( WORK(1) ) LWKOPT = MAX( LWKOPT_SYTRF, LWKOPT_SYTRS ) WORK( 1 ) = LWKOPT - IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN - INFO = -10 - END IF END IF * IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/ssysv_aa_2stage.f b/lapack-netlib/SRC/ssysv_aa_2stage.f index a738c7415f..43d9371417 100644 --- a/lapack-netlib/SRC/ssysv_aa_2stage.f +++ b/lapack-netlib/SRC/ssysv_aa_2stage.f @@ -106,6 +106,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N, internally *> used to select NB such that LTB >= (3*NB+1)*N. *> @@ -125,7 +126,7 @@ *> *> \param[out] IPIV2 *> \verbatim -*> IPIV is INTEGER array, dimension (N) +*> IPIV2 is INTEGER array, dimension (N) *> On exit, it contains the details of the interchanges, i.e., *> the row and column k of T were interchanged with the *> row and column IPIV(k). @@ -151,6 +152,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The size of WORK. LWORK >= N, internally used to select NB *> such that LWORK >= N*NB. *> @@ -234,19 +236,18 @@ SUBROUTINE SSYSV_AA_2STAGE( UPLO, N, NRHS, A, LDA, TB, LTB, INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 + ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 + ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN + INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL SSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) LWKOPT = INT( WORK(1) ) - IF( LTB.LT.INT( TB(1) ) .AND. .NOT.TQUERY ) THEN - INFO = -7 - ELSE IF( LWORK.LT.LWKOPT .AND. 
.NOT.WQUERY ) THEN - INFO = -13 - END IF END IF * IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/ssytrf_aa_2stage.f b/lapack-netlib/SRC/ssytrf_aa_2stage.f index a929749308..0e0f6edb79 100644 --- a/lapack-netlib/SRC/ssytrf_aa_2stage.f +++ b/lapack-netlib/SRC/ssytrf_aa_2stage.f @@ -93,6 +93,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N, internally *> used to select NB such that LTB >= (3*NB+1)*N. *> @@ -112,7 +113,7 @@ *> *> \param[out] IPIV2 *> \verbatim -*> IPIV is INTEGER array, dimension (N) +*> IPIV2 is INTEGER array, dimension (N) *> On exit, it contains the details of the interchanges, i.e., *> the row and column k of T were interchanged with the *> row and column IPIV(k). @@ -125,6 +126,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The size of WORK. LWORK >= N, internally used to select NB *> such that LWORK >= N*NB. *> @@ -641,6 +643,8 @@ SUBROUTINE SSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, * * Factor the band matrix CALL SGBTRF( N, N, NB, NB, TB, LDTB, IPIV2, INFO ) +* + RETURN * * End of SSYTRF_AA_2STAGE * diff --git a/lapack-netlib/SRC/ssytri2.f b/lapack-netlib/SRC/ssytri2.f index 97b5390058..4b9ea4e7b2 100644 --- a/lapack-netlib/SRC/ssytri2.f +++ b/lapack-netlib/SRC/ssytri2.f @@ -96,11 +96,11 @@ *> LWORK is INTEGER *> The dimension of the array WORK. *> WORK is size >= (N+NB+1)*(NB+3) -*> If LDWORK = -1, then a workspace query is assumed; the routine +*> If LWORK = -1, then a workspace query is assumed; the routine *> calculates: *> - the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, -*> - and no error message related to LDWORK is issued by XERBLA. +*> - and no error message related to LWORK is issued by XERBLA. 
*> \endverbatim *> *> \param[out] INFO diff --git a/lapack-netlib/SRC/ssytrs_aa_2stage.f b/lapack-netlib/SRC/ssytrs_aa_2stage.f index c9c7181f2e..d271b94810 100644 --- a/lapack-netlib/SRC/ssytrs_aa_2stage.f +++ b/lapack-netlib/SRC/ssytrs_aa_2stage.f @@ -85,6 +85,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N. *> \endverbatim *> diff --git a/lapack-netlib/SRC/strevc3.f b/lapack-netlib/SRC/strevc3.f index 0df1189f0f..525978071b 100644 --- a/lapack-netlib/SRC/strevc3.f +++ b/lapack-netlib/SRC/strevc3.f @@ -27,7 +27,7 @@ * .. * .. Array Arguments .. * LOGICAL SELECT( * ) -* REAL T( LDT, * ), VL( LDVL, * ), VR( LDVR, * ), +* REAL T( LDT, * ), VL( LDVL, * ), VR( LDVR, * ), * $ WORK( * ) * .. * @@ -45,9 +45,9 @@ *> The right eigenvector x and the left eigenvector y of T corresponding *> to an eigenvalue w are defined by: *> -*> T*x = w*x, (y**H)*T = w*(y**H) +*> T*x = w*x, (y**T)*T = w*(y**T) *> -*> where y**H denotes the conjugate transpose of y. +*> where y**T denotes the transpose of the vector y. *> The eigenvalues are not input to this routine, but are read directly *> from the diagonal blocks of T. *> @@ -251,14 +251,14 @@ SUBROUTINE STREVC3( SIDE, HOWMNY, SELECT, N, T, LDT, VL, LDVL, * .. * .. Array Arguments .. LOGICAL SELECT( * ) - REAL T( LDT, * ), VL( LDVL, * ), VR( LDVR, * ), + REAL T( LDT, * ), VL( LDVL, * ), VR( LDVR, * ), $ WORK( * ) * .. * * ===================================================================== * * .. Parameters .. - REAL ZERO, ONE + REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) INTEGER NBMIN, NBMAX PARAMETER ( NBMIN = 8, NBMAX = 128 ) @@ -268,7 +268,7 @@ SUBROUTINE STREVC3( SIDE, HOWMNY, SELECT, N, T, LDT, VL, LDVL, $ RIGHTV, SOMEV INTEGER I, IERR, II, IP, IS, J, J1, J2, JNXT, K, KI, $ IV, MAXWRK, NB, KI2 - REAL BETA, BIGNUM, EMAX, OVFL, REC, REMAX, SCALE, + REAL BETA, BIGNUM, EMAX, OVFL, REC, REMAX, SCALE, $ SMIN, SMLNUM, ULP, UNFL, VCRIT, VMAX, WI, WR, $ XNORM * .. 
diff --git a/lapack-netlib/SRC/zgejsv.f b/lapack-netlib/SRC/zgejsv.f index e8418c680f..d553da90b8 100644 --- a/lapack-netlib/SRC/zgejsv.f +++ b/lapack-netlib/SRC/zgejsv.f @@ -704,7 +704,7 @@ SUBROUTINE ZGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP, LWSVDJ = MAX( 2 * N, 1 ) LWSVDJV = MAX( 2 * N, 1 ) * .. minimal REAL workspace length for ZGEQP3, ZPOCON, ZGESVJ - LRWQP3 = N + LRWQP3 = 2 * N LRWCON = N LRWSVDJ = N IF ( LQUERY ) THEN @@ -942,7 +942,7 @@ SUBROUTINE ZGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP, END IF END IF MINWRK = MAX( 2, MINWRK ) - OPTWRK = MAX( 2, OPTWRK ) + OPTWRK = MAX( MINWRK, OPTWRK ) IF ( LWORK .LT. MINWRK .AND. (.NOT.LQUERY) ) INFO = - 17 IF ( LRWORK .LT. MINRWRK .AND. (.NOT.LQUERY) ) INFO = - 19 END IF diff --git a/lapack-netlib/SRC/zhesv_aa.f b/lapack-netlib/SRC/zhesv_aa.f index bbd0fdff42..8511f0e7d6 100644 --- a/lapack-netlib/SRC/zhesv_aa.f +++ b/lapack-netlib/SRC/zhesv_aa.f @@ -209,6 +209,8 @@ SUBROUTINE ZHESV_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 + ELSE IF( LWORK.LT.MAX(2*N, 3*N-2) .AND. .NOT.LQUERY ) THEN + INFO = -10 END IF * IF( INFO.EQ.0 ) THEN @@ -219,9 +221,6 @@ SUBROUTINE ZHESV_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, LWKOPT_HETRS = INT( WORK(1) ) LWKOPT = MAX( LWKOPT_HETRF, LWKOPT_HETRS ) WORK( 1 ) = LWKOPT - IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN - INFO = -10 - END IF END IF * IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/zhesv_aa_2stage.f b/lapack-netlib/SRC/zhesv_aa_2stage.f index a34440029c..ed221dc69f 100644 --- a/lapack-netlib/SRC/zhesv_aa_2stage.f +++ b/lapack-netlib/SRC/zhesv_aa_2stage.f @@ -106,6 +106,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N, internally *> used to select NB such that LTB >= (3*NB+1)*N. 
*> @@ -125,7 +126,7 @@ *> *> \param[out] IPIV2 *> \verbatim -*> IPIV is INTEGER array, dimension (N) +*> IPIV2 is INTEGER array, dimension (N) *> On exit, it contains the details of the interchanges, i.e., *> the row and column k of T were interchanged with the *> row and column IPIV(k). @@ -151,6 +152,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The size of WORK. LWORK >= N, internally used to select NB *> such that LWORK >= N*NB. *> @@ -240,19 +242,18 @@ SUBROUTINE ZHESV_AA_2STAGE( UPLO, N, NRHS, A, LDA, TB, LTB, INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 + ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 + ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN + INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL ZHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) LWKOPT = INT( WORK(1) ) - IF( LTB.LT.INT( TB(1) ) .AND. .NOT.TQUERY ) THEN - INFO = -7 - ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.WQUERY ) THEN - INFO = -13 - END IF END IF * IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/zhetrf_aa_2stage.f b/lapack-netlib/SRC/zhetrf_aa_2stage.f index 4d62198d6e..73c0ebe9a6 100644 --- a/lapack-netlib/SRC/zhetrf_aa_2stage.f +++ b/lapack-netlib/SRC/zhetrf_aa_2stage.f @@ -93,6 +93,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N, internally *> used to select NB such that LTB >= (3*NB+1)*N. *> @@ -112,7 +113,7 @@ *> *> \param[out] IPIV2 *> \verbatim -*> IPIV is INTEGER array, dimension (N) +*> IPIV2 is INTEGER array, dimension (N) *> On exit, it contains the details of the interchanges, i.e., *> the row and column k of T were interchanged with the *> row and column IPIV(k). @@ -125,6 +126,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The size of WORK. LWORK >= N, internally used to select NB *> such that LWORK >= N*NB. 
*> @@ -657,6 +659,8 @@ SUBROUTINE ZHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, * * Factor the band matrix CALL ZGBTRF( N, N, NB, NB, TB, LDTB, IPIV2, INFO ) +* + RETURN * * End of ZHETRF_AA_2STAGE * diff --git a/lapack-netlib/SRC/zhetrs_aa_2stage.f b/lapack-netlib/SRC/zhetrs_aa_2stage.f index 02e17476f3..7fcee1118f 100644 --- a/lapack-netlib/SRC/zhetrs_aa_2stage.f +++ b/lapack-netlib/SRC/zhetrs_aa_2stage.f @@ -69,7 +69,7 @@ *> *> \param[in] A *> \verbatim -*> A is COMPLEX*16array, dimension (LDA,N) +*> A is COMPLEX*16 array, dimension (LDA,N) *> Details of factors computed by ZHETRF_AA_2STAGE. *> \endverbatim *> @@ -81,12 +81,13 @@ *> *> \param[out] TB *> \verbatim -*> TB is COMPLEX*16array, dimension (LTB) +*> TB is COMPLEX*16 array, dimension (LTB) *> Details of factors computed by ZHETRF_AA_2STAGE. *> \endverbatim *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N. *> \endverbatim *> @@ -106,7 +107,7 @@ *> *> \param[in,out] B *> \verbatim -*> B is COMPLEX*16array, dimension (LDB,NRHS) +*> B is COMPLEX*16 array, dimension (LDB,NRHS) *> On entry, the right hand side matrix B. *> On exit, the solution matrix X. *> \endverbatim diff --git a/lapack-netlib/SRC/zla_syamv.f b/lapack-netlib/SRC/zla_syamv.f index 02958bef3f..cfdb3cdc87 100644 --- a/lapack-netlib/SRC/zla_syamv.f +++ b/lapack-netlib/SRC/zla_syamv.f @@ -241,7 +241,7 @@ SUBROUTINE ZLA_SYAMV( UPLO, N, ALPHA, A, LDA, X, INCX, BETA, Y, INFO = 10 END IF IF( INFO.NE.0 )THEN - CALL XERBLA( 'DSYMV ', INFO ) + CALL XERBLA( 'ZLA_SYAMV', INFO ) RETURN END IF * diff --git a/lapack-netlib/SRC/zlaqr1.f b/lapack-netlib/SRC/zlaqr1.f index 03afb87aaa..34341cb10a 100644 --- a/lapack-netlib/SRC/zlaqr1.f +++ b/lapack-netlib/SRC/zlaqr1.f @@ -142,6 +142,13 @@ SUBROUTINE ZLAQR1( N, H, LDH, S1, S2, V ) CABS1( CDUM ) = ABS( DBLE( CDUM ) ) + ABS( DIMAG( CDUM ) ) * .. * .. Executable Statements .. +* +* Quick return if possible +* + IF( N.NE.2 .AND. 
N.NE.3 ) THEN + RETURN + END IF +* IF( N.EQ.2 ) THEN S = CABS1( H( 1, 1 )-S2 ) + CABS1( H( 2, 1 ) ) IF( S.EQ.RZERO ) THEN diff --git a/lapack-netlib/SRC/zsysv_aa.f b/lapack-netlib/SRC/zsysv_aa.f index 10693c7312..325d07c54c 100644 --- a/lapack-netlib/SRC/zsysv_aa.f +++ b/lapack-netlib/SRC/zsysv_aa.f @@ -221,9 +221,6 @@ SUBROUTINE ZSYSV_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, LWKOPT_SYTRS = INT( WORK(1) ) LWKOPT = MAX( LWKOPT_SYTRF, LWKOPT_SYTRS ) WORK( 1 ) = LWKOPT - IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN - INFO = -10 - END IF END IF * IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/zsysv_aa_2stage.f b/lapack-netlib/SRC/zsysv_aa_2stage.f index fcf9bc8702..029ed587d0 100644 --- a/lapack-netlib/SRC/zsysv_aa_2stage.f +++ b/lapack-netlib/SRC/zsysv_aa_2stage.f @@ -105,6 +105,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N, internally *> used to select NB such that LTB >= (3*NB+1)*N. *> @@ -124,7 +125,7 @@ *> *> \param[out] IPIV2 *> \verbatim -*> IPIV is INTEGER array, dimension (N) +*> IPIV2 is INTEGER array, dimension (N) *> On exit, it contains the details of the interchanges, i.e., *> the row and column k of T were interchanged with the *> row and column IPIV(k). @@ -150,6 +151,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The size of WORK. LWORK >= N, internally used to select NB *> such that LWORK >= N*NB. *> @@ -233,19 +235,18 @@ SUBROUTINE ZSYSV_AA_2STAGE( UPLO, N, NRHS, A, LDA, TB, LTB, INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 + ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 + ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN + INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL ZSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) LWKOPT = INT( WORK(1) ) - IF( LTB.LT.INT( TB(1) ) .AND. .NOT.TQUERY ) THEN - INFO = -7 - ELSE IF( LWORK.LT.LWKOPT .AND. 
.NOT.WQUERY ) THEN - INFO = -13 - END IF END IF * IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/zsytrf_aa_2stage.f b/lapack-netlib/SRC/zsytrf_aa_2stage.f index 1f916726e6..d3486c1a72 100644 --- a/lapack-netlib/SRC/zsytrf_aa_2stage.f +++ b/lapack-netlib/SRC/zsytrf_aa_2stage.f @@ -93,6 +93,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N, internally *> used to select NB such that LTB >= (3*NB+1)*N. *> @@ -112,7 +113,7 @@ *> *> \param[out] IPIV2 *> \verbatim -*> IPIV is INTEGER array, dimension (N) +*> IPIV2 is INTEGER array, dimension (N) *> On exit, it contains the details of the interchanges, i.e., *> the row and column k of T were interchanged with the *> row and column IPIV(k). @@ -125,6 +126,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The size of WORK. LWORK >= N, internally used to select NB *> such that LWORK >= N*NB. *> @@ -662,6 +664,8 @@ SUBROUTINE ZSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, * * Factor the band matrix CALL ZGBTRF( N, N, NB, NB, TB, LDTB, IPIV2, INFO ) +* + RETURN * * End of ZSYTRF_AA_2STAGE * diff --git a/lapack-netlib/SRC/zsytri2.f b/lapack-netlib/SRC/zsytri2.f index d5aabd43af..e7303c90b2 100644 --- a/lapack-netlib/SRC/zsytri2.f +++ b/lapack-netlib/SRC/zsytri2.f @@ -163,7 +163,7 @@ SUBROUTINE ZSYTRI2( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) * Get blocksize - NBMAX = ILAENV( 1, 'ZSYTRF', UPLO, N, -1, -1, -1 ) + NBMAX = ILAENV( 1, 'ZSYTRI2', UPLO, N, -1, -1, -1 ) IF ( NBMAX .GE. N ) THEN MINSIZE = N ELSE diff --git a/lapack-netlib/SRC/zsytrs_aa_2stage.f b/lapack-netlib/SRC/zsytrs_aa_2stage.f index c5d8947536..fa15eee907 100644 --- a/lapack-netlib/SRC/zsytrs_aa_2stage.f +++ b/lapack-netlib/SRC/zsytrs_aa_2stage.f @@ -85,6 +85,7 @@ *> *> \param[in] LTB *> \verbatim +*> LTB is INTEGER *> The size of the array TB. LTB >= 4*N. 
*> \endverbatim *> From c5b13d4e10d38eb1bad56aac21bc9ffcf0b577df Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 1 Jun 2018 15:14:45 +0200 Subject: [PATCH 13/86] Fixes from netlib PR 253 --- lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f | 2 +- lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f | 2 +- lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f | 2 +- lapack-netlib/TESTING/LIN/zchksy_aa_2stage.f | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f b/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f index 5698bcf94e..f6d990d1c6 100644 --- a/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f @@ -218,7 +218,7 @@ SUBROUTINE DCHKSY_AA_2STAGE( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, * .. * .. External Subroutines .. EXTERNAL ALAERH, ALAHD, ALASUM, DERRSY, DLACPY, DLARHS, - $ DLATB4, DLATMS, DPOT02, DSYTRF_AA_2STAGE + $ DLATB4, DLATMS, DPOT02, DSYTRF_AA_2STAGE, $ DSYTRS_AA_2STAGE, XLAENV * .. * .. Intrinsic Functions .. diff --git a/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f b/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f index 0be321eb09..8984226542 100644 --- a/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f @@ -204,7 +204,7 @@ SUBROUTINE DDRVSY_AA_2STAGE( * .. External Subroutines .. EXTERNAL ALADHD, ALAERH, ALASVM, XLAENV, DERRVX, $ DGET04, DLACPY, DLARHS, DLATB4, DLATMS, - $ DSYSV_AA_2STAGE, CHET01_AA, DPOT02, + $ DSYSV_AA_2STAGE, DPOT02, $ DSYTRF_AA_2STAGE * .. * .. Scalars in Common .. diff --git a/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f b/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f index d8d9dc0a93..70e8ff6b80 100644 --- a/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f @@ -203,7 +203,7 @@ SUBROUTINE SDRVSY_AA_2STAGE( * .. * .. External Subroutines .. 
EXTERNAL ALADHD, ALAERH, ALASVM, XLAENV, SERRVX, - $ CGET04, SLACPY, SLARHS, SLATB4, SLATMS, + $ SLACPY, SLARHS, SLATB4, SLATMS, $ SSYSV_AA_2STAGE, SSYT01_AA, SPOT02, $ SSYTRF_AA_2STAGE * .. diff --git a/lapack-netlib/TESTING/LIN/zchksy_aa_2stage.f b/lapack-netlib/TESTING/LIN/zchksy_aa_2stage.f index d4d8c29399..87fc47f71e 100644 --- a/lapack-netlib/TESTING/LIN/zchksy_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/zchksy_aa_2stage.f @@ -217,8 +217,8 @@ SUBROUTINE ZCHKSY_AA_2STAGE( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, DOUBLE PRECISION RESULT( NTESTS ) * .. * .. External Subroutines .. - EXTERNAL ALAERH, ALAHD, ALASUM, CERRSY, ZLACPY, ZLARHS, - $ CLATB4, ZLATMS, ZSYT02, ZSYT01, + EXTERNAL ALAERH, ALAHD, ALASUM, ZERRSY, ZLACPY, ZLARHS, + $ ZLATB4, ZLATMS, ZSYT02, ZSYT01, $ ZSYTRF_AA_2STAGE, ZSYTRS_AA_2STAGE, $ XLAENV * .. From a8002e283a5874946bb464a45045d4651081e675 Mon Sep 17 00:00:00 2001 From: Matthew Brett Date: Fri, 1 Jun 2018 23:20:00 +0100 Subject: [PATCH 14/86] Revert "take out unused variables" This reverts commit e5752ff9b322c665a7393d6109c2da7ad6ee2523. The variables i and n are used in the `#if !__GLIBC_PREREQ(2, 7)` branch. Closes gh-1586. --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index ef328b9455..d69e52e97a 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -180,7 +180,7 @@ int get_num_procs(void) { cpu_set_t *cpusetp; size_t size; int ret; -// int i,n; +int i,n; if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); #if !defined(OS_LINUX) From 99c7bba8e404fcf697f00bc986e106892eff47ad Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 3 Jun 2018 07:24:29 +0000 Subject: [PATCH 15/86] Initial support for SkylakeX / AVX512 This patch adds the basic infrastructure for adding the SkylakeX (Intel Skylake server) target. 
The SkylakeX target will use the AVX512 (AVX512VL level) instruction set, which brings 2 basic things: 1) 512 bit wide SIMD (2x width of AVX2) 2) 32 SIMD registers (2x the number on AVX2) This initial patch only contains a trivial transformation of the Haswell SGEMM kernel to AVX512VL; more will follow later but this patch aims to get the infrastructure in place for this "later". Full performance tuning has not been done yet; with more registers and wider SIMD it's in theory possible to retune the kernels but even without that there's an interesting enough performance increase (30-40% range) with just this change. --- Makefile.system | 8 +- TargetList.txt | 1 + cmake/arch.cmake | 3 + cmake/system.cmake | 2 +- cpuid.h | 3 + cpuid_x86.c | 2 + driver/others/dynamic.c | 2 + driver/others/parameter.c | 4 +- getarch.c | 15 + kernel/CMakeLists.txt | 2 +- kernel/Makefile.L3 | 4 + kernel/setparam-ref.c | 16 + kernel/x86/trsm_kernel_LN_2x4_penryn.S | 2 +- kernel/x86/trsm_kernel_LN_4x4_penryn.S | 2 +- kernel/x86/trsm_kernel_LT_2x4_penryn.S | 2 +- kernel/x86/trsm_kernel_LT_4x4_penryn.S | 2 +- kernel/x86/trsm_kernel_RT_2x4_penryn.S | 2 +- kernel/x86/trsm_kernel_RT_4x4_penryn.S | 2 +- kernel/x86/ztrsm_kernel_LN_2x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_LT_1x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_LT_2x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_RT_1x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_RT_2x2_penryn.S | 2 +- kernel/x86_64/KERNEL.SKYLAKEX | 4 + kernel/x86_64/caxpy.c | 2 +- kernel/x86_64/cdot.c | 2 +- kernel/x86_64/cgemv_n_4.c | 2 +- kernel/x86_64/cgemv_t_4.c | 2 +- kernel/x86_64/cscal.c | 2 +- kernel/x86_64/daxpy.c | 2 +- kernel/x86_64/ddot.c | 2 +- kernel/x86_64/dgemv_n_4.c | 2 +- kernel/x86_64/dgemv_t_4.c | 2 +- kernel/x86_64/dscal.c | 2 +- kernel/x86_64/dsymv_L.c | 2 +- kernel/x86_64/dsymv_U.c | 2 +- kernel/x86_64/saxpy.c | 2 +- kernel/x86_64/sdot.c | 2 +- kernel/x86_64/sgemm_kernel_16x4_skylakex.S | 6812 ++++++++++++++++++++ kernel/x86_64/sgemv_n_4.c | 2 +- 
kernel/x86_64/sgemv_t_4.c | 2 +- kernel/x86_64/ssymv_L.c | 2 +- kernel/x86_64/ssymv_U.c | 2 +- kernel/x86_64/symv_L_sse.S | 2 +- kernel/x86_64/symv_L_sse2.S | 2 +- kernel/x86_64/symv_U_sse.S | 2 +- kernel/x86_64/symv_U_sse2.S | 2 +- kernel/x86_64/zaxpy.c | 2 +- kernel/x86_64/zdot.c | 2 +- kernel/x86_64/zgemv_n_4.c | 2 +- kernel/x86_64/zgemv_t_4.c | 2 +- kernel/x86_64/zscal.c | 2 +- kernel/x86_64/zsymv_L_sse.S | 2 +- kernel/x86_64/zsymv_L_sse2.S | 2 +- kernel/x86_64/zsymv_U_sse.S | 2 +- kernel/x86_64/zsymv_U_sse2.S | 2 +- param.h | 119 + 57 files changed, 7034 insertions(+), 47 deletions(-) create mode 100644 kernel/x86_64/KERNEL.SKYLAKEX create mode 100644 kernel/x86_64/sgemm_kernel_16x4_skylakex.S diff --git a/Makefile.system b/Makefile.system index 7bfac1fa80..b005b80c9f 100644 --- a/Makefile.system +++ b/Makefile.system @@ -62,6 +62,9 @@ ifeq ($(BINARY), 32) ifeq ($(TARGET), HASWELL) GETARCH_FLAGS := -DFORCE_NEHALEM endif +ifeq ($(TARGET), SKYLAKEX) +GETARCH_FLAGS := -DFORCE_NEHALEM +endif ifeq ($(TARGET), SANDYBRIDGE) GETARCH_FLAGS := -DFORCE_NEHALEM endif @@ -95,6 +98,9 @@ ifeq ($(BINARY), 32) ifeq ($(TARGET_CORE), HASWELL) GETARCH_FLAGS := -DFORCE_NEHALEM endif +ifeq ($(TARGET_CORE), SKYLAKEX) +GETARCH_FLAGS := -DFORCE_NEHALEM +endif ifeq ($(TARGET_CORE), SANDYBRIDGE) GETARCH_FLAGS := -DFORCE_NEHALEM endif @@ -467,7 +473,7 @@ ifneq ($(NO_AVX), 1) DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR endif ifneq ($(NO_AVX2), 1) -DYNAMIC_CORE += HASWELL ZEN +DYNAMIC_CORE += HASWELL ZEN SKYLAKEX endif endif diff --git a/TargetList.txt b/TargetList.txt index aeeaa9ede3..31e4881c4e 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -20,6 +20,7 @@ DUNNINGTON NEHALEM SANDYBRIDGE HASWELL +SKYLAKEX ATOM b)AMD CPU: diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 798a9ef824..527d2bec6e 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -56,6 +56,9 @@ if (DYNAMIC_ARCH) if (NOT NO_AVX2) set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN) endif () + 
if (NOT NO_AVX512) + set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX) + endif () endif () if (NOT DYNAMIC_CORE) diff --git a/cmake/system.cmake b/cmake/system.cmake index 6458956710..c21fe7c142 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -33,7 +33,7 @@ endif () if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) message(STATUS "Compiling a ${BINARY}-bit binary.") set(NO_AVX 1) - if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE") + if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX") set(TARGET "NEHALEM") endif () if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN") diff --git a/cpuid.h b/cpuid.h index 1dacc49bae..a6bc211f3e 100644 --- a/cpuid.h +++ b/cpuid.h @@ -115,6 +115,7 @@ #define CORE_STEAMROLLER 25 #define CORE_EXCAVATOR 26 #define CORE_ZEN 27 +#define CORE_SKYLAKEX 28 #define HAVE_SSE (1 << 0) #define HAVE_SSE2 (1 << 1) @@ -137,6 +138,7 @@ #define HAVE_AVX (1 << 18) #define HAVE_FMA4 (1 << 19) #define HAVE_FMA3 (1 << 20) +#define HAVE_AVX512VL (1 << 21) #define CACHE_INFO_L1_I 1 #define CACHE_INFO_L1_D 2 @@ -211,5 +213,6 @@ typedef struct { #define CPUTYPE_STEAMROLLER 49 #define CPUTYPE_EXCAVATOR 50 #define CPUTYPE_ZEN 51 +#define CPUTYPE_SKYLAKEX 52 #endif diff --git a/cpuid_x86.c b/cpuid_x86.c index 342c565252..5f49e77157 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -50,6 +50,8 @@ #ifdef NO_AVX #define CPUTYPE_HASWELL CPUTYPE_NEHALEM #define CORE_HASWELL CORE_NEHALEM +#define CPUTYPE_SKYLAKEX CPUTYPE_NEHALEM +#define CORE_SKYLAKEX CORE_NEHALEM #define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM #define CORE_SANDYBRIDGE CORE_NEHALEM #define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index fbf7cd40e4..a0c9794b1c 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -74,6 +74,7 @@ extern gotoblas_t gotoblas_STEAMROLLER; extern gotoblas_t gotoblas_EXCAVATOR; #ifdef 
NO_AVX2 #define gotoblas_HASWELL gotoblas_SANDYBRIDGE +#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE #define gotoblas_ZEN gotoblas_SANDYBRIDGE #else extern gotoblas_t gotoblas_HASWELL; @@ -83,6 +84,7 @@ extern gotoblas_t gotoblas_ZEN; //Use NEHALEM kernels for sandy bridge #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM #define gotoblas_HASWELL gotoblas_NEHALEM +#define gotoblas_SKYLAKEX gotoblas_NEHALEM #define gotoblas_BULLDOZER gotoblas_BARCELONA #define gotoblas_PILEDRIVER gotoblas_BARCELONA #define gotoblas_STEAMROLLER gotoblas_BARCELONA diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 31a48644ff..e7332c0c42 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -167,7 +167,7 @@ int get_L2_size(void){ #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \ defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \ - defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) + defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || defined(SKYLAKEX) cpuid(0x80000006, &eax, &ebx, &ecx, &edx); @@ -251,7 +251,7 @@ int get_L2_size(void){ void blas_set_parameter(void){ int factor; -#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || defined(SKYLAKEX) int size = 16; #else int size = get_L2_size(); diff --git a/getarch.c b/getarch.c index 992fc2b953..fcffe63e22 100644 --- a/getarch.c +++ b/getarch.c @@ -326,6 +326,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
POSSIBILITY OF SUCH DAMAGE. #define CORENAME "HASWELL" #endif +#ifdef FORCE_SKYLAKEX +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "SKYLAKEX" +#define ARCHCONFIG "-DSKYLAKEX " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ + "-DFMA3 -DHAVE_AVX512VL -march=skylake-avx512" +#define LIBNAME "skylakex" +#define CORENAME "SKYLAKEX" +#endif + #ifdef FORCE_ATOM #define FORCE #define FORCE_INTEL diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index c06d1eae88..947114ebef 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -121,7 +121,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) # Makefile.L3 set(USE_TRMM false) - if (ARM OR ARM64 OR "${TARGET_CORE}" STREQUAL "LONGSOON3B" OR "${TARGET_CORE}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic" OR "${TARGET_CORE}" STREQUAL "HASWELL" OR "${CORE}" STREQUAL "haswell" OR "${CORE}" STREQUAL "zen") + if (ARM OR ARM64 OR "${TARGET_CORE}" STREQUAL "LONGSOON3B" OR "${TARGET_CORE}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic" OR "${TARGET_CORE}" STREQUAL "HASWELL" OR "${CORE}" STREQUAL "haswell" OR "${CORE}" STREQUAL "zen" OR "${TARGET_CORE}" STREQUAL "SKYLAKEX" OR "${CORE}" STREQUAL "skylakex") set(USE_TRMM true) endif () diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 0664263967..b37e536efa 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -32,6 +32,10 @@ ifeq ($(CORE), HASWELL) USE_TRMM = 1 endif +ifeq ($(CORE), SKYLAKEX) +USE_TRMM = 1 +endif + ifeq ($(CORE), ZEN) USE_TRMM = 1 endif diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index b6c5b54deb..9030d7c6dd 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -871,6 +871,22 @@ static void init_parameter(void) { #endif #endif +#ifdef 
SKYLAKEX + +#ifdef DEBUG + fprintf(stderr, "SkylakeX\n"); +#endif + + TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; + TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; + TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; + TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; +#endif +#endif + #ifdef OPTERON diff --git a/kernel/x86/trsm_kernel_LN_2x4_penryn.S b/kernel/x86/trsm_kernel_LN_2x4_penryn.S index 0b475afa21..34653d400a 100644 --- a/kernel/x86/trsm_kernel_LN_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_LN_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LN_4x4_penryn.S b/kernel/x86/trsm_kernel_LN_4x4_penryn.S index e98854f34b..492f343447 100644 --- a/kernel/x86/trsm_kernel_LN_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_LN_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LT_2x4_penryn.S b/kernel/x86/trsm_kernel_LT_2x4_penryn.S index 086852cfce..6840c54adf 100644 --- a/kernel/x86/trsm_kernel_LT_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LT_4x4_penryn.S b/kernel/x86/trsm_kernel_LT_4x4_penryn.S index 2dd8ad08b2..361ccf6030 100644 --- 
a/kernel/x86/trsm_kernel_LT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_RT_2x4_penryn.S b/kernel/x86/trsm_kernel_RT_2x4_penryn.S index 154276f6ac..11825429ef 100644 --- a/kernel/x86/trsm_kernel_RT_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_RT_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_RT_4x4_penryn.S b/kernel/x86/trsm_kernel_RT_4x4_penryn.S index acdcd6e22b..4c054f3992 100644 --- a/kernel/x86/trsm_kernel_RT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_RT_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S index da561b5833..e674967365 100644 --- a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S index a11b0286ac..498057697b 100644 --- 
a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S @@ -63,7 +63,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S index 787ab59822..f3072983d0 100644 --- a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S index 9a3b0cbd7d..879ae9c383 100644 --- a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S @@ -63,7 +63,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S index bd7a78b5ae..6c308197b7 100644 --- a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX new file mode 100644 index 0000000000..744831d678 --- /dev/null +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -0,0 +1,4 @@ 
+include $(KERNELDIR)/KERNEL.HASWELL + +SGEMMKERNEL = sgemm_kernel_16x4_skylakex.S + diff --git a/kernel/x86_64/caxpy.c b/kernel/x86_64/caxpy.c index b1ec19bd3d..586d05ac2d 100644 --- a/kernel/x86_64/caxpy.c +++ b/kernel/x86_64/caxpy.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "caxpy_microk_steamroller-2.c" #elif defined(BULLDOZER) #include "caxpy_microk_bulldozer-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined(SKYLAKEX) #include "caxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "caxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/cdot.c b/kernel/x86_64/cdot.c index 5f01f7eebf..93fca0a0d9 100644 --- a/kernel/x86_64/cdot.c +++ b/kernel/x86_64/cdot.c @@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cdot_microk_bulldozer-2.c" #elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR) #include "cdot_microk_steamroller-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "cdot_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "cdot_microk_sandy-2.c" diff --git a/kernel/x86_64/cgemv_n_4.c b/kernel/x86_64/cgemv_n_4.c index 770c955b2a..d81766cd40 100644 --- a/kernel/x86_64/cgemv_n_4.c +++ b/kernel/x86_64/cgemv_n_4.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include <stdio.h> #include "common.h" -#if defined(HASWELL) || defined(ZEN) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "cgemv_n_microk_haswell-4.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "cgemv_n_microk_bulldozer-4.c" diff --git a/kernel/x86_64/cgemv_t_4.c b/kernel/x86_64/cgemv_t_4.c index d75e58fdd9..6bdea67871 100644 --- a/kernel/x86_64/cgemv_t_4.c +++ b/kernel/x86_64/cgemv_t_4.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "cgemv_t_microk_haswell-4.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "cgemv_t_microk_bulldozer-4.c" diff --git a/kernel/x86_64/cscal.c b/kernel/x86_64/cscal.c index 9b9179da04..72af998092 100644 --- a/kernel/x86_64/cscal.c +++ b/kernel/x86_64/cscal.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "cscal_microk_haswell-2.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) #include "cscal_microk_bulldozer-2.c" diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c index 4bde62824f..b4acdccd21 100644 --- a/kernel/x86_64/daxpy.c +++ b/kernel/x86_64/daxpy.c @@ -37,7 +37,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "daxpy_microk_steamroller-2.c" #elif defined(PILEDRIVER) #include "daxpy_microk_piledriver-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "daxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "daxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c index 8162a5d833..0595490288 100644 --- a/kernel/x86_64/ddot.c +++ b/kernel/x86_64/ddot.c @@ -37,7 +37,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "ddot_microk_piledriver-2.c" #elif defined(NEHALEM) #include "ddot_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "ddot_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "ddot_microk_sandy-2.c" diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c index 1b9ca7a605..309fbe7677 100644 --- a/kernel/x86_64/dgemv_n_4.c +++ b/kernel/x86_64/dgemv_n_4.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(NEHALEM) #include "dgemv_n_microk_nehalem-4.c" -#elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) +#elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX) #include "dgemv_n_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c index 6b99d6fdd0..a7478e3a8b 100644 --- a/kernel/x86_64/dgemv_t_4.c +++ b/kernel/x86_64/dgemv_t_4.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) +#if defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX) #include "dgemv_t_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c index 428558617a..2c7b3b17c2 100644 --- a/kernel/x86_64/dscal.c +++ b/kernel/x86_64/dscal.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dscal_microk_bulldozer-2.c" #elif defined(SANDYBRIDGE) #include "dscal_microk_sandy-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "dscal_microk_haswell-2.c" #endif diff --git a/kernel/x86_64/dsymv_L.c b/kernel/x86_64/dsymv_L.c index 3e8db3fa3f..73099462c1 100644 --- a/kernel/x86_64/dsymv_L.c +++ b/kernel/x86_64/dsymv_L.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dsymv_L_microk_bulldozer-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "dsymv_L_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "dsymv_L_microk_sandy-2.c" diff --git a/kernel/x86_64/dsymv_U.c b/kernel/x86_64/dsymv_U.c index 61cb77a64c..431e4bb3fc 100644 --- a/kernel/x86_64/dsymv_U.c +++ b/kernel/x86_64/dsymv_U.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dsymv_U_microk_bulldozer-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "dsymv_U_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "dsymv_U_microk_sandy-2.c" diff --git a/kernel/x86_64/saxpy.c b/kernel/x86_64/saxpy.c index d89fe408a6..d89c4070d7 100644 --- a/kernel/x86_64/saxpy.c +++ b/kernel/x86_64/saxpy.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(NEHALEM) #include "saxpy_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "saxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "saxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/sdot.c b/kernel/x86_64/sdot.c index b6f3c21afe..c3ab2ffe6b 100644 --- a/kernel/x86_64/sdot.c +++ b/kernel/x86_64/sdot.c @@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sdot_microk_steamroller-2.c" #elif defined(NEHALEM) #include "sdot_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "sdot_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "sdot_microk_sandy-2.c" diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex.S b/kernel/x86_64/sgemm_kernel_16x4_skylakex.S new file mode 100644 index 0000000000..1fab892ca7 --- /dev/null +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex.S @@ -0,0 +1,6812 @@ +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + +/********************************************************************* +* 2014/07/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* 2013/10/28 Saar +* Parameter: +* SGEMM_DEFAULT_UNROLL_N 4 +* SGEMM_DEFAULT_UNROLL_M 16 +* SGEMM_DEFAULT_P 768 +* SGEMM_DEFAULT_Q 384 +* A_PR1 512 +* B_PR1 512 +* +* +* 2014/07/28 Saar +* Performance at 9216x9216x9216: +* 1 thread: 102 GFLOPS (SANDYBRIDGE: 59) (MKL: 83) +* 2 threads: 195 GFLOPS (SANDYBRIDGE: 116) (MKL: 155) +* 3 threads: 281 GFLOPS (SANDYBRIDGE: 165) (MKL: 230) +* 4 threads: 366 GFLOPS (SANDYBRIDGE: 223) (MKL: 267) +* +*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define BO2 %rbp +#define SP %rbx + +#define BO1 %rdi +#define CO2 %rdx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#if defined(OS_WINDOWS) +#define L_BUFFER_SIZE 8192 +#else +#define L_BUFFER_SIZE 12288 +#endif + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 
1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#if defined(BULLDOZER) + +#define VFMADD231PS_( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 + +#define VFMADD231SS_( x0,x1,x2 ) vfmaddss x0,x1,x2,x0 + +#else + +#define VFMADD231PS_( y0,y1,y2 ) vfmadd231ps y1,y2,y0 + +#define VFMADD231SS_( x0,x1,x2 ) vfmadd231ss x1,x2,x0 + +#endif + + +#define A_PR1 512 +#define B_PR1 512 + +/******************************************************************************************* +* 6 lines of N +*******************************************************************************************/ + +.macro KERNEL16x6_SUB + vmovups -16 * SIZE(AO), %zmm0 + vbroadcastss -4 * SIZE(BO), %zmm2 + vbroadcastss -3 * SIZE(BO), %zmm3 + prefetcht0 A_PR1(AO) + + VFMADD231PS_( %zmm4,%zmm2,%zmm0 ) + VFMADD231PS_( %zmm6,%zmm3,%zmm0 ) + + vbroadcastss -2 * SIZE(BO), %zmm2 + vbroadcastss -1 * SIZE(BO), %zmm3 + VFMADD231PS_( %zmm8,%zmm2,%zmm0 ) + VFMADD231PS_( %zmm10,%zmm3,%zmm0 ) + + vbroadcastss 0 * SIZE(BO), %zmm2 + vbroadcastss 1 * SIZE(BO), %zmm3 + VFMADD231PS_( %zmm12,%zmm2,%zmm0 ) + VFMADD231PS_( %zmm14,%zmm3,%zmm0 ) + + addq $ 6*SIZE, BO + addq $ 16*SIZE, AO + decq %rax +.endm + +.macro KERNEL16x6_SUB4 + vmovups -16 * SIZE(AO), %zmm0 + vbroadcastss -4 * SIZE(BO), %zmm2 + vbroadcastss -3 * SIZE(BO), %zmm3 + prefetcht0 A_PR1(AO) + + VFMADD231PS_( %zmm4,%zmm2,%zmm0 ) + VFMADD231PS_( %zmm6,%zmm3,%zmm0 ) + + vbroadcastss -2 * SIZE(BO), %zmm7 + vbroadcastss -1 * SIZE(BO), %zmm9 + VFMADD231PS_( %zmm8,%zmm7,%zmm0 ) + VFMADD231PS_( %zmm10,%zmm9,%zmm0 ) + + vbroadcastss 0 * SIZE(BO), %zmm11 + vbroadcastss 1 * SIZE(BO), %zmm13 + VFMADD231PS_( %zmm12,%zmm11,%zmm0 ) + VFMADD231PS_( %zmm14,%zmm13,%zmm0 ) + + addq $ 6*SIZE, BO + addq $ 16*SIZE, AO + decq %rax + vmovups -16 * SIZE(AO), %zmm0 + vbroadcastss -4 * 
SIZE(BO), %zmm16 + vbroadcastss -3 * SIZE(BO), %zmm17 + + VFMADD231PS_( %zmm4,%zmm16,%zmm0 ) + VFMADD231PS_( %zmm6,%zmm17,%zmm0 ) + + vbroadcastss -2 * SIZE(BO), %zmm18 + vbroadcastss -1 * SIZE(BO), %zmm19 + VFMADD231PS_( %zmm8,%zmm18,%zmm0 ) + VFMADD231PS_( %zmm10,%zmm19,%zmm0 ) + + vbroadcastss 0 * SIZE(BO), %zmm20 + vbroadcastss 1 * SIZE(BO), %zmm21 + VFMADD231PS_( %zmm12,%zmm20,%zmm0 ) + VFMADD231PS_( %zmm14,%zmm21,%zmm0 ) + + addq $ 6*SIZE, BO + addq $ 16*SIZE, AO + decq %rax + + vmovups -16 * SIZE(AO), %zmm0 + vbroadcastss -4 * SIZE(BO), %zmm22 + vbroadcastss -3 * SIZE(BO), %zmm23 + + VFMADD231PS_( %zmm4,%zmm22,%zmm0 ) + VFMADD231PS_( %zmm6,%zmm23,%zmm0 ) + + vbroadcastss -2 * SIZE(BO), %zmm24 + vbroadcastss -1 * SIZE(BO), %zmm25 + VFMADD231PS_( %zmm8,%zmm24,%zmm0 ) + VFMADD231PS_( %zmm10,%zmm25,%zmm0 ) + + vbroadcastss 0 * SIZE(BO), %zmm26 + vbroadcastss 1 * SIZE(BO), %zmm27 + VFMADD231PS_( %zmm12,%zmm26,%zmm0 ) + VFMADD231PS_( %zmm14,%zmm27,%zmm0 ) + + addq $ 6*SIZE, BO + addq $ 16*SIZE, AO + decq %rax + vmovups -16 * SIZE(AO), %zmm0 + vbroadcastss -4 * SIZE(BO), %zmm28 + vbroadcastss -3 * SIZE(BO), %zmm29 + + VFMADD231PS_( %zmm4,%zmm28,%zmm0 ) + VFMADD231PS_( %zmm6,%zmm29,%zmm0 ) + + vbroadcastss -2 * SIZE(BO), %zmm30 + vbroadcastss -1 * SIZE(BO), %zmm31 + VFMADD231PS_( %zmm8,%zmm30,%zmm0 ) + VFMADD231PS_( %zmm10,%zmm31,%zmm0 ) + + vbroadcastss 0 * SIZE(BO), %zmm1 + vbroadcastss 1 * SIZE(BO), %zmm5 + VFMADD231PS_( %zmm12,%zmm1,%zmm0 ) + VFMADD231PS_( %zmm14,%zmm5,%zmm0 ) + + addq $ 6*SIZE, BO + addq $ 16*SIZE, AO + decq %rax +.endm + +.macro SAVE16x6 + + vbroadcastss ALPHA, %zmm0 + + vmulps %zmm0 , %zmm4 , %zmm4 + vmulps %zmm0 , %zmm6 , %zmm6 + vmulps %zmm0 , %zmm8 , %zmm8 + vmulps %zmm0 , %zmm10, %zmm10 + vmulps %zmm0 , %zmm12, %zmm12 + vmulps %zmm0 , %zmm14, %zmm14 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %zmm4,%zmm4 + + vaddps (CO1, LDC), %zmm6,%zmm6 + + vaddps (CO1, LDC,2), %zmm8,%zmm8 + + vaddps (CO2), %zmm10,%zmm10 + + vaddps (CO2, LDC), 
%zmm12,%zmm12 + + vaddps (CO2, LDC,2), %zmm14,%zmm14 + +#endif + + vmovups %zmm4 , (CO1) + + vmovups %zmm6 , (CO1, LDC) + + vmovups %zmm8 , (CO1, LDC,2) + + vmovups %zmm10, (CO2) + + vmovups %zmm12, (CO2, LDC) + + vmovups %zmm14, (CO2, LDC,2) + +.endm + + + + +/*******************************************************************************************/ + +.macro KERNEL8x6_SUB + vmovups -16 * SIZE(AO), %ymm0 + vbroadcastss -4 * SIZE(BO), %ymm2 + vbroadcastss -3 * SIZE(BO), %ymm3 + + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + + vbroadcastss -2 * SIZE(BO), %ymm2 + vbroadcastss -1 * SIZE(BO), %ymm3 + VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) + + vbroadcastss 0 * SIZE(BO), %ymm2 + vbroadcastss 1 * SIZE(BO), %ymm3 + VFMADD231PS_( %ymm12,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm14,%ymm3,%ymm0 ) + + addq $ 6*SIZE, BO + addq $ 8*SIZE, AO + decq %rax +.endm + +.macro SAVE8x6 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm10, %ymm10 + vmulps %ymm0 , %ymm12, %ymm12 + vmulps %ymm0 , %ymm14, %ymm14 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps (CO1, LDC,2), %ymm8,%ymm8 + vaddps (CO2), %ymm10,%ymm10 + vaddps (CO2, LDC), %ymm12,%ymm12 + vaddps (CO2, LDC,2), %ymm14,%ymm14 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm8 , (CO1, LDC,2) + vmovups %ymm10, (CO2) + vmovups %ymm12, (CO2, LDC) + vmovups %ymm14, (CO2, LDC,2) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x6_SUB + vmovups -16 * SIZE(AO), %xmm0 + vbroadcastss -4 * SIZE(BO), %xmm2 + vbroadcastss -3 * SIZE(BO), %xmm3 + + VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) + + vbroadcastss -2 * SIZE(BO), %xmm2 + vbroadcastss -1 * SIZE(BO), %xmm3 + VFMADD231PS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231PS_( 
%xmm10,%xmm3,%xmm0 ) + + vbroadcastss 0 * SIZE(BO), %xmm2 + vbroadcastss 1 * SIZE(BO), %xmm3 + VFMADD231PS_( %xmm12,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm14,%xmm3,%xmm0 ) + + addq $ 6*SIZE, BO + addq $ 4*SIZE, AO + decq %rax +.endm + +.macro SAVE4x6 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + vmulps %xmm0 , %xmm6 , %xmm6 + vmulps %xmm0 , %xmm8 , %xmm8 + vmulps %xmm0 , %xmm10, %xmm10 + vmulps %xmm0 , %xmm12, %xmm12 + vmulps %xmm0 , %xmm14, %xmm14 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + vaddps (CO1, LDC,2), %xmm8,%xmm8 + vaddps (CO2), %xmm10,%xmm10 + vaddps (CO2, LDC), %xmm12,%xmm12 + vaddps (CO2, LDC,2), %xmm14,%xmm14 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + vmovups %xmm8 , (CO1, LDC,2) + vmovups %xmm10, (CO2) + vmovups %xmm12, (CO2, LDC) + vmovups %xmm14, (CO2, LDC,2) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x6_SUB + vmovss -16 * SIZE(AO), %xmm0 + vmovss -15 * SIZE(AO), %xmm1 + vmovss -4 * SIZE(BO), %xmm2 + vmovss -3 * SIZE(BO), %xmm3 + + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) + + vmovss -2 * SIZE(BO), %xmm2 + vmovss -1 * SIZE(BO), %xmm3 + VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm9,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm11,%xmm3,%xmm1 ) + + vmovss 0 * SIZE(BO), %xmm2 + vmovss 1 * SIZE(BO), %xmm3 + VFMADD231SS_( %xmm12,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm13,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm14,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm15,%xmm3,%xmm1 ) + + addq $ 6*SIZE, BO + addq $ 2*SIZE, AO + decq %rax +.endm + +.macro SAVE2x6 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm7 , %xmm7 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm9 , %xmm9 + vmulss %xmm0 
, %xmm10, %xmm10 + vmulss %xmm0 , %xmm11, %xmm11 + vmulss %xmm0 , %xmm12, %xmm12 + vmulss %xmm0 , %xmm13, %xmm13 + vmulss %xmm0 , %xmm14, %xmm14 + vmulss %xmm0 , %xmm15, %xmm15 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 + + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + + vaddss (CO1, LDC,2), %xmm8,%xmm8 + vaddss 1 * SIZE(CO1, LDC,2), %xmm9,%xmm9 + + vaddss (CO2), %xmm10,%xmm10 + vaddss 1 * SIZE(CO2), %xmm11,%xmm11 + + vaddss (CO2, LDC), %xmm12,%xmm12 + vaddss 1 * SIZE(CO2, LDC), %xmm13,%xmm13 + + vaddss (CO2, LDC,2), %xmm14,%xmm14 + vaddss 1 * SIZE(CO2, LDC,2), %xmm15,%xmm15 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm7 , 1 * SIZE(CO1, LDC) + + vmovss %xmm8 , (CO1, LDC,2) + vmovss %xmm9 , 1 * SIZE(CO1, LDC,2) + + vmovss %xmm10, (CO2) + vmovss %xmm11, 1 * SIZE(CO2) + + vmovss %xmm12, (CO2, LDC) + vmovss %xmm13, 1 * SIZE(CO2, LDC) + + vmovss %xmm14, (CO2, LDC,2) + vmovss %xmm15, 1 * SIZE(CO2, LDC,2) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x6_SUB + vmovss -16 * SIZE(AO), %xmm0 + vmovss -4 * SIZE(BO), %xmm2 + vmovss -3 * SIZE(BO), %xmm3 + + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + + vmovss -2 * SIZE(BO), %xmm2 + vmovss -1 * SIZE(BO), %xmm3 + VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) + + vmovss 0 * SIZE(BO), %xmm2 + vmovss 1 * SIZE(BO), %xmm3 + VFMADD231SS_( %xmm12,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm14,%xmm3,%xmm0 ) + + addq $ 6*SIZE, BO + addq $ 1*SIZE, AO + decq %rax +.endm + +.macro SAVE1x6 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm10, %xmm10 + vmulss %xmm0 , %xmm12, %xmm12 + vmulss %xmm0 , %xmm14, %xmm14 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss (CO1, LDC), 
%xmm6,%xmm6 + vaddss (CO1, LDC,2), %xmm8,%xmm8 + vaddss (CO2), %xmm10,%xmm10 + vaddss (CO2, LDC), %xmm12,%xmm12 + vaddss (CO2, LDC,2), %xmm14,%xmm14 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm8 , (CO1, LDC,2) + vmovss %xmm10, (CO2) + vmovss %xmm12, (CO2, LDC) + vmovss %xmm14, (CO2, LDC,2) + +.endm + + +/*******************************************************************************************/ + + +/******************************************************************************************* +* 4 lines of N +*******************************************************************************************/ + +.macro KERNEL16x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %zmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %zmm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %zmm3 + VFMADD231PS_( %zmm4,%zmm2,%zmm0 ) + VFMADD231PS_( %zmm6,%zmm3,%zmm0 ) + vbroadcastss -2 * SIZE(BO, BI, SIZE), %zmm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %zmm3 + VFMADD231PS_( %zmm8,%zmm2,%zmm0 ) + VFMADD231PS_( %zmm10,%zmm3,%zmm0 ) + addq $ 4 , BI + addq $ 16, %rax +.endm + +.macro SAVE16x4 + + vbroadcastss ALPHA, %zmm0 + + vmulps %zmm0 , %zmm4 , %zmm4 + vmulps %zmm0 , %zmm6 , %zmm6 + vmulps %zmm0 , %zmm8 , %zmm8 + vmulps %zmm0 , %zmm10, %zmm10 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %zmm4,%zmm4 + + vaddps (CO1, LDC), %zmm6,%zmm6 + + vaddps (CO2), %zmm8,%zmm8 + + vaddps (CO2, LDC), %zmm10,%zmm10 + +#endif + + vmovups %zmm4 , (CO1) + + vmovups %zmm6 , (CO1, LDC) + + vmovups %zmm8 , (CO2) + + vmovups %zmm10, (CO2, LDC) + + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) + prefetcht0 64(CO2) + prefetcht0 64(CO2, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + vbroadcastss -2 * 
SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) + addq $ 4 , BI + addq $ 8 , %rax +.endm + +.macro SAVE8x4 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm10, %ymm10 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps (CO2), %ymm8,%ymm8 + vaddps (CO2, LDC), %ymm10,%ymm10 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm8 , (CO2) + vmovups %ymm10, (CO2, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231PS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm10,%xmm3,%xmm0 ) + addq $ 4 , BI + addq $ 4 , %rax +.endm + +.macro SAVE4x4 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + vmulps %xmm0 , %xmm6 , %xmm6 + vmulps %xmm0 , %xmm8 , %xmm8 + vmulps %xmm0 , %xmm10, %xmm10 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + vaddps (CO2), %xmm8,%xmm8 + vaddps (CO2, LDC), %xmm10,%xmm10 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + vmovups %xmm8 , (CO2) + vmovups %xmm10, (CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x4_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm5,%xmm2,%xmm1 
) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm9,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm11,%xmm3,%xmm1 ) + addq $ 4 , BI + addq $ 2, %rax +.endm + +.macro SAVE2x4 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm7 , %xmm7 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm9 , %xmm9 + vmulss %xmm0 , %xmm10, %xmm10 + vmulss %xmm0 , %xmm11, %xmm11 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 + + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + + vaddss (CO2), %xmm8,%xmm8 + vaddss 1 * SIZE(CO2), %xmm9,%xmm9 + + vaddss (CO2, LDC), %xmm10,%xmm10 + vaddss 1 * SIZE(CO2, LDC), %xmm11,%xmm11 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm7 , 1 * SIZE(CO1, LDC) + + vmovss %xmm8 , (CO2) + vmovss %xmm9 , 1 * SIZE(CO2) + + vmovss %xmm10, (CO2, LDC) + vmovss %xmm11, 1 * SIZE(CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x4_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) + addq $ 4 , BI + addq $ 1, %rax +.endm + +.macro SAVE1x4 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm10, %xmm10 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss 
(CO2), %xmm8,%xmm8 + vaddss (CO2, LDC), %xmm10,%xmm10 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm8 , (CO2) + vmovss %xmm10, (CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +.macro KERNEL16x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %zmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %zmm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %zmm3 + VFMADD231PS_( %zmm4,%zmm2,%zmm0 ) + VFMADD231PS_( %zmm6,%zmm3,%zmm0 ) + addq $ 2 , BI + addq $ 16, %rax +.endm + +.macro SAVE16x2 + + vbroadcastss ALPHA, %zmm0 + + vmulps %zmm0 , %zmm4 , %zmm4 + vmulps %zmm0 , %zmm6 , %zmm6 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %zmm4,%zmm4 + + vaddps (CO1, LDC), %zmm6,%zmm6 + +#endif + + vmovups %zmm4 , (CO1) + + vmovups %zmm6 , (CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + addq $ 2 , BI + addq $ 8 , %rax +.endm + +.macro SAVE8x2 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm6 , %ymm6 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps (CO1, LDC), %ymm6,%ymm6 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm6 , (CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) + 
VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) + addq $ 2 , BI + addq $ 4 , %rax +.endm + +.macro SAVE4x2 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + vmulps %xmm0 , %xmm6 , %xmm6 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x2_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) + addq $ 2 , BI + addq $ 2, %rax +.endm + +.macro SAVE2x2 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm7 , %xmm7 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 + + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm7 , 1 * SIZE(CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x2_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + addq $ 2 , BI + addq $ 1, %rax +.endm + +.macro SAVE1x2 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm6 , %xmm6 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss (CO1, LDC), %xmm6,%xmm6 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm6 , (CO1, LDC) + +.endm + + 
+/*******************************************************************************************/ + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +.macro KERNEL16x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %zmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %zmm2 + VFMADD231PS_( %zmm4,%zmm2,%zmm0 ) + addq $ 1 , BI + addq $ 16, %rax +.endm + +.macro SAVE16x1 + + vbroadcastss ALPHA, %zmm0 + + vmulps %zmm0 , %zmm4 , %zmm4 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %zmm4,%zmm4 + +#endif + + vmovups %zmm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL8x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + addq $ 1 , BI + addq $ 8 , %rax +.endm + +.macro SAVE8x1 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + +#endif + + vmovups %ymm4 , (CO1) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) + addq $ 1 , BI + addq $ 4 , %rax +.endm + +.macro SAVE4x1 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + +#endif + + vmovups %xmm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x1_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) + addq $ 1 , BI + addq $ 2 , %rax +.endm + +.macro 
SAVE2x1 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x1_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + addq $ 1 , BI + addq $ 1 , %rax +.endm + +.macro SAVE1x1 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + +#endif + + vmovss %xmm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + +/************************************************************************************* +* GEMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + 
STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $12, %rdi + divq %rdi // N / 12 + movq %rax, Ndiv6 // N / 12 + movq %rdx, Nmod6 // N % 12 + + movq Ndiv6, J + cmpq $0, J + je .L4_00 + ALIGN_4 + + +/*******************************************************************************************/ + +.L6_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 4 values of B + leaq (B, %rax,4), BO2 + movq BO2, B // next offset of B + movq K, %rax + + ALIGN_4 + + +.L6_02c: + + vmovups (BO1), %xmm0 + vmovsd (BO2), %xmm1 + vmovups %xmm0, (BO) + vmovsd %xmm1, 4*SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L6_02c + + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (CO2, LDC, 1), CO2 // co2 = c + 3 * ldc + leaq (C, LDC, 4), C + leaq (C, LDC, 2), C // c = c + 6 * ldc + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L6_20 + + ALIGN_4 + +.L6_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L6_16 + + ALIGN_4 + +.L6_12: + + KERNEL16x6_SUB4 + + KERNEL16x6_SUB4 + + je .L6_16 + + KERNEL16x6_SUB4 + + KERNEL16x6_SUB4 + + je .L6_16 + + jmp .L6_12 + ALIGN_4 + +.L6_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_19 + + ALIGN_4 + +.L6_17: + + KERNEL16x6_SUB + + jnz .L6_17 + ALIGN_4 + + +.L6_19: + + SAVE16x6 + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L6_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $15, M + jz 
.L6_60 // to next 6 lines of N + + testq $8, M + jz .L6_21pre + ALIGN_4 + +/**************************************************************************/ + +.L6_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_20_6 + + ALIGN_4 + +.L6_20_2: + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L6_20_6 + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L6_20_6 + + jmp .L6_20_2 + ALIGN_4 + +.L6_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_20_9 + + ALIGN_4 + +.L6_20_7: + + KERNEL8x6_SUB + + jnz .L6_20_7 + ALIGN_4 + + +.L6_20_9: + + SAVE8x6 + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L6_21pre: + + testq $4, M + jz .L6_30 + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_26 + + ALIGN_4 + +.L6_22: + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L6_26 + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L6_26 + + jmp .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + ALIGN_4 + +.L6_27: + + KERNEL4x6_SUB + + jnz .L6_27 + ALIGN_4 + + +.L6_29: + + SAVE4x6 + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, 
CO2 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_36 + + ALIGN_4 + +.L6_32: + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L6_36 + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L6_36 + + jmp .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + ALIGN_4 + +.L6_37: + + KERNEL2x6_SUB + + jnz .L6_37 + ALIGN_4 + + +.L6_39: + + SAVE2x6 + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L6_60 // to next 4 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_46 + + ALIGN_4 + +.L6_42: + + prefetcht0 A_PR1(AO) + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L6_46 + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L6_46 + + jmp .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + ALIGN_4 + +.L6_47: + + KERNEL1x6_SUB + + jnz .L6_47 + ALIGN_4 + + +.L6_49: + + SAVE1x6 + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L6_60: + + +/*******************************************************************************************/ + + +.L7_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 4 values of B + leaq (B, %rax,4), BO2 + movq K, %rax + + ALIGN_4 + + +.L7_02c: + + vmovsd 
2*SIZE(BO1), %xmm0 + vmovups (BO2), %xmm1 + vmovsd %xmm0, (BO) + vmovups %xmm1, 2*SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L7_02c + + movq BO2, B // next offset of B + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (CO2, LDC, 1), CO2 // co2 = c + 3 * ldc + leaq (C, LDC, 4), C + leaq (C, LDC, 2), C // c = c + 6 * ldc + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L7_20 + + ALIGN_4 + +.L7_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L7_16 + + ALIGN_4 + +.L7_12: + + KERNEL16x6_SUB4 + + KERNEL16x6_SUB4 + + je .L7_16 + + KERNEL16x6_SUB4 + + KERNEL16x6_SUB4 + + je .L7_16 + + jmp .L7_12 + ALIGN_4 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + ALIGN_4 + +.L7_17: + + KERNEL16x6_SUB + + jnz .L7_17 + ALIGN_4 + + +.L7_19: + + SAVE16x6 + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L7_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L7_20: + // Test rest of M + + testq $15, M + jz .L7_60 // to next 6 lines of N + + testq $8, M + jz .L7_21pre + ALIGN_4 + +/**************************************************************************/ + +.L7_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_20_6 + + ALIGN_4 + +.L7_20_2: + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L7_20_6 + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + 
KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L7_20_6 + + jmp .L7_20_2 + ALIGN_4 + +.L7_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_20_9 + + ALIGN_4 + +.L7_20_7: + + KERNEL8x6_SUB + + jnz .L7_20_7 + ALIGN_4 + + +.L7_20_9: + + SAVE8x6 + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L7_21pre: + + testq $4, M + jz .L7_30 + ALIGN_4 + +.L7_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_26 + + ALIGN_4 + +.L7_22: + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L7_26 + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L7_26 + + jmp .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + ALIGN_4 + +.L7_27: + + KERNEL4x6_SUB + + jnz .L7_27 + ALIGN_4 + + +.L7_29: + + SAVE4x6 + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_36 + + ALIGN_4 + +.L7_32: + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L7_36 + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L7_36 + + jmp .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + ALIGN_4 + +.L7_37: + + 
KERNEL2x6_SUB + + jnz .L7_37 + ALIGN_4 + + +.L7_39: + + SAVE2x6 + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L7_40: + testq $1, M + jz .L7_60 // to next 4 lines of N + + ALIGN_4 + +.L7_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_46 + + ALIGN_4 + +.L7_42: + + prefetcht0 A_PR1(AO) + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L7_46 + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L7_46 + + jmp .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + ALIGN_4 + +.L7_47: + + KERNEL1x6_SUB + + jnz .L7_47 + ALIGN_4 + + +.L7_49: + + SAVE1x6 + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L7_60: + + decq J // j -- + jg .L6_01 // next 12 lines of N + + + + +/*******************************************************************************************/ +.L4_00: + + movq Nmod6, J + sarq $2, J // j = j / 4 + cmpq $ 0, J + je .L2_00 + ALIGN_4 + + +.L4_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L4_01b + ALIGN_4 + + +.L4_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 4*SIZE(BO1), %xmm1 + vmovups 8*SIZE(BO1), %xmm2 + vmovups 12*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 4*SIZE(BO) + vmovups %xmm2, 8*SIZE(BO) + vmovups %xmm3,12*SIZE(BO) + + addq $ 16*SIZE,BO1 + addq $ 16*SIZE,BO + decq %rax + jnz .L4_01a + + +.L4_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L4_02d + ALIGN_4 + +.L4_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L4_02c + +.L4_02d: + + movq BO1, B // next offset of B + +.L4_10: + movq 
C, CO1 + leaq (C, LDC, 2), CO2 + leaq (C, LDC, 4), C // c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L4_20 + + ALIGN_4 + +.L4_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L4_16 + movq %rax, BI // Index for BO + leaq (,BI,4) , BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_12: + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) 
+ KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + jmp .L4_12 + ALIGN_4 + +.L4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # rax = k & 7 ; skip remainder loop if 0 + je .L4_19 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_17: + + KERNEL16x4_SUB + + jl .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE16x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L4_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $15, M + jz .L4_60 // to next 4 lines of N + + testq $8, M + jz .L4_21pre + ALIGN_4 + +/**************************************************************************/ + +.L4_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO 
// first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_20_6 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_2: + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + jmp .L4_20_2 + ALIGN_4 + +.L4_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_20_9 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_7: + + KERNEL8x4_SUB + + jl .L4_20_7 + ALIGN_4 + + +.L4_20_9: + + SAVE8x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax 
= rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L4_21pre: + + testq $4, M + jz .L4_30 + ALIGN_4 + +.L4_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_26 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_22: + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + jmp .L4_22 + ALIGN_4 + +.L4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_29 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of 
values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_27: + + KERNEL4x4_SUB + + jl .L4_27 + ALIGN_4 + + +.L4_29: + + SAVE4x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_36 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + 
KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + jmp .L4_32 + ALIGN_4 + +.L4_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_39 + + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + jl .L4_37 + ALIGN_4 + + +.L4_39: + + SAVE2x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L4_40: + testq $1, M + jz .L4_60 // to next 4 lines of N + + ALIGN_4 + +.L4_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + 
addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L4_46 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + jmp .L4_42 + ALIGN_4 + +.L4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # rax = k & 7 ; skip remainder loop if 0 + je .L4_49 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + jl .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L4_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + decq J // j -- + jg .L4_01 // next 4 lines of N + + + +/*******************************************************************************************/ +.L2_00: + + movq Nmod6, J + andq $3, J // j % 4 + je .L999 + + movq Nmod6, J + andq $2, J // j & 2 + je .L1_0 + +.L2_01: + + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + + vmovsd (BO1), %xmm0 + vmovsd 
2*SIZE(BO1), %xmm1 + vmovsd 4*SIZE(BO1), %xmm2 + vmovsd 6*SIZE(BO1), %xmm3 + + vmovsd %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovsd %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + 
KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 2 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 
SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + 
+/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # rax = k & 7 ; skip remainder loop if 0 + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + 
SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: 
+#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + 
KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + 
movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // 
Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef 
TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + 
KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, 
%rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // 
number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + +#else + +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + 
PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $4, %rdi + divq %rdi // N / 4 + movq %rax, Ndiv6 // N / 4 + movq %rdx, Nmod6 // N % 4 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +/*******************************************************************************************/ + +.L4_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L4_01b + ALIGN_4 + + +.L4_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 4*SIZE(BO1), %xmm1 + vmovups 8*SIZE(BO1), %xmm2 + vmovups 12*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 4*SIZE(BO) + vmovups %xmm2, 8*SIZE(BO) + vmovups %xmm3,12*SIZE(BO) + + addq $ 16*SIZE,BO1 + addq $ 16*SIZE,BO + 
decq %rax + jnz .L4_01a + + +.L4_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L4_02d + ALIGN_4 + +.L4_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L4_02c + +.L4_02d: + + movq BO1, B // next offset of B + +.L4_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (C, LDC, 4), C // c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L4_20 + + ALIGN_4 + +.L4_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L4_16 + movq %rax, BI // Index for BO + leaq (,BI,4) , BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_12: + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, 
SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + jmp .L4_12 + ALIGN_4 + +.L4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # rax = rax % 8 ; remainder of the k loop + je .L4_19 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_17: + + KERNEL16x4_SUB + + jl .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE16x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L4_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $15, M + jz .L4_60 // to next 4 lines of N + + testq
$8, M + jz .L4_21pre + ALIGN_4 + +/**************************************************************************/ + +.L4_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_20_6 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_2: + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + jmp .L4_20_2 + ALIGN_4 + +.L4_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_20_9 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_7: + + KERNEL8x4_SUB + + jl .L4_20_7 + ALIGN_4 + + +.L4_20_9: + + 
SAVE8x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L4_21pre: + + testq $4, M + jz .L4_30 + ALIGN_4 + +.L4_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_26 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_22: + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB 
+ + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + jmp .L4_22 + ALIGN_4 + +.L4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_29 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_27: + + KERNEL4x4_SUB + + jl .L4_27 + ALIGN_4 + + +.L4_29: + + SAVE4x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_36 + movq 
%rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + jmp .L4_32 + ALIGN_4 + +.L4_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_39 + + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + jl .L4_37 + ALIGN_4 + + +.L4_39: + + SAVE2x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L4_40: + testq $1, M + jz .L4_60 // to next 4 lines of N + + ALIGN_4 + +.L4_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq 
(AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L4_46 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + jmp .L4_42 + ALIGN_4 + +.L4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_49 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + jl .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L4_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + decq J // j -- + jg .L4_01 // next 4 lines of N + + + +/*******************************************************************************************/ +.L2_0: + + movq 
Nmod6, J + andq $3, J // j % 4 + je .L999 + + movq Nmod6, J + andq $2, J // j = (N % 4) & 2 + je .L1_0 + +.L2_01: + + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + + vmovsd (BO1), %xmm0 + vmovsd 2*SIZE(BO1), %xmm1 + vmovsd 4*SIZE(BO1), %xmm2 + vmovsd 6*SIZE(BO1), %xmm3 + + vmovsd %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovsd %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO
+ leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 2 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && 
!defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; 
number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # rax = rax % 8 ; remainder of the k loop + je .L2_29 + + movq %rax, BI // Index for BO + leaq
(BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + 
KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in 
BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b 
+ +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: 
+ + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + 
KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq 
$-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax 
+ movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef 
TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq 
$STACKSIZE, %rsp + ret + + EPILOGUE + + +#endif + diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index fd028964be..65305ac59f 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_n_microk_nehalem-4.c" #elif defined(SANDYBRIDGE) #include "sgemv_n_microk_sandy-4.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "sgemv_n_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index f04d461f77..065e5b3852 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_t_microk_bulldozer-4.c" #elif defined(SANDYBRIDGE) #include "sgemv_t_microk_sandy-4.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "sgemv_t_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/ssymv_L.c b/kernel/x86_64/ssymv_L.c index 199d8a5176..73ae001ea8 100644 --- a/kernel/x86_64/ssymv_L.c +++ b/kernel/x86_64/ssymv_L.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "ssymv_L_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "ssymv_L_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "ssymv_L_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "ssymv_L_microk_sandy-2.c" diff --git a/kernel/x86_64/ssymv_U.c b/kernel/x86_64/ssymv_U.c index 691a071f73..f37c251a18 100644 --- a/kernel/x86_64/ssymv_U.c +++ b/kernel/x86_64/ssymv_U.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "ssymv_U_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "ssymv_U_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "ssymv_U_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "ssymv_U_microk_sandy-2.c" diff --git a/kernel/x86_64/symv_L_sse.S b/kernel/x86_64/symv_L_sse.S index 8cae3fc1b8..8a5c44c9ba 100644 --- a/kernel/x86_64/symv_L_sse.S +++ b/kernel/x86_64/symv_L_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_L_sse2.S b/kernel/x86_64/symv_L_sse2.S index d7091624d5..0c40a3435e 100644 --- a/kernel/x86_64/symv_L_sse2.S +++ b/kernel/x86_64/symv_L_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_U_sse.S b/kernel/x86_64/symv_U_sse.S index 3549b98637..7a2eeace59 100644 --- a/kernel/x86_64/symv_U_sse.S +++ b/kernel/x86_64/symv_U_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_U_sse2.S b/kernel/x86_64/symv_U_sse2.S index 882b035a90..0408b577c7 100644 --- a/kernel/x86_64/symv_U_sse2.S +++ b/kernel/x86_64/symv_U_sse2.S @@ -57,7 
+57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zaxpy.c b/kernel/x86_64/zaxpy.c index 8cb1d532f1..53866cf954 100644 --- a/kernel/x86_64/zaxpy.c +++ b/kernel/x86_64/zaxpy.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "zaxpy_microk_bulldozer-2.c" #elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "zaxpy_microk_steamroller-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "zaxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "zaxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index d11c76647c..ef12569c89 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "zdot_microk_bulldozer-2.c" #elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR) #include "zdot_microk_steamroller-2.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "zdot_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "zdot_microk_sandy-2.c" diff --git a/kernel/x86_64/zgemv_n_4.c b/kernel/x86_64/zgemv_n_4.c index f6f88155c6..0fedc496b9 100644 --- a/kernel/x86_64/zgemv_n_4.c +++ b/kernel/x86_64/zgemv_n_4.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(HASWELL) || defined(ZEN) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "zgemv_n_microk_haswell-4.c" #elif defined(SANDYBRIDGE) #include "zgemv_n_microk_sandy-4.c" diff --git a/kernel/x86_64/zgemv_t_4.c b/kernel/x86_64/zgemv_t_4.c index 3e4b7d5dfc..2ab7a671bb 100644 --- a/kernel/x86_64/zgemv_t_4.c +++ b/kernel/x86_64/zgemv_t_4.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "zgemv_t_microk_bulldozer-4.c" -#elif defined(HASWELL) || defined(ZEN) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "zgemv_t_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c index aa5d8fac00..2a6d0e4c79 100644 --- a/kernel/x86_64/zscal.c +++ b/kernel/x86_64/zscal.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "zscal_microk_haswell-2.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) #include "zscal_microk_bulldozer-2.c" diff --git a/kernel/x86_64/zsymv_L_sse.S b/kernel/x86_64/zsymv_L_sse.S index dd95eea174..e44bd75506 100644 --- a/kernel/x86_64/zsymv_L_sse.S +++ b/kernel/x86_64/zsymv_L_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_L_sse2.S b/kernel/x86_64/zsymv_L_sse2.S index 75124cf3ed..e9f330c365 100644 --- a/kernel/x86_64/zsymv_L_sse2.S +++ b/kernel/x86_64/zsymv_L_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) 
|| defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_U_sse.S b/kernel/x86_64/zsymv_U_sse.S index db1a4ff5f8..9f0dead180 100644 --- a/kernel/x86_64/zsymv_U_sse.S +++ b/kernel/x86_64/zsymv_U_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_U_sse2.S b/kernel/x86_64/zsymv_U_sse2.S index 599765a6de..b6106a37d7 100644 --- a/kernel/x86_64/zsymv_U_sse2.S +++ b/kernel/x86_64/zsymv_U_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/param.h b/param.h index 4227d548e8..49a5e85e89 100644 --- a/param.h +++ b/param.h @@ -1613,6 +1613,125 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif +#endif + +#ifdef SKYLAKEX + +#define SNUMOPT 16 +#define DNUMOPT 8 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SYMV_P 8 + +#define SWITCH_RATIO 4 + +#ifdef ARCH_X86 + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#else + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 8 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#define SGEMM_DEFAULT_UNROLL_MN 32 +#define DGEMM_DEFAULT_UNROLL_MN 32 +#endif + +#ifdef ARCH_X86 + +#define SGEMM_DEFAULT_P 512 +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_P 512 +#define DGEMM_DEFAULT_R dgemm_r +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r +#define CGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_R 1024 +#define ZGEMM_DEFAULT_P 512 +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r +#define SGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 256 +#define QGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 192 +#define XGEMM_DEFAULT_Q 128 + +#else + +#define SGEMM_DEFAULT_P 768 +#define DGEMM_DEFAULT_P 512 +#define CGEMM_DEFAULT_P 384 +#define ZGEMM_DEFAULT_P 256 + +#ifdef WINDOWS_ABI +#define SGEMM_DEFAULT_Q 320 +#define DGEMM_DEFAULT_Q 128 +#else +#define SGEMM_DEFAULT_Q 384 +#define DGEMM_DEFAULT_Q 256 
+#endif +#define CGEMM_DEFAULT_Q 192 +#define ZGEMM_DEFAULT_Q 128 + +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R 13824 +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r + +#define QGEMM_DEFAULT_Q 128 +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r +#define XGEMM_DEFAULT_Q 128 + +#define CGEMM3M_DEFAULT_UNROLL_N 8 +#define CGEMM3M_DEFAULT_UNROLL_M 4 +#define ZGEMM3M_DEFAULT_UNROLL_N 8 +#define ZGEMM3M_DEFAULT_UNROLL_M 2 + +#define CGEMM3M_DEFAULT_P 448 +#define ZGEMM3M_DEFAULT_P 224 +#define XGEMM3M_DEFAULT_P 112 +#define CGEMM3M_DEFAULT_Q 224 +#define ZGEMM3M_DEFAULT_Q 224 +#define XGEMM3M_DEFAULT_Q 224 +#define CGEMM3M_DEFAULT_R 12288 +#define ZGEMM3M_DEFAULT_R 12288 +#define XGEMM3M_DEFAULT_R 12288 + +#endif + + #endif From 00235157339dc5fba2b4194bd660c45257e539e1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 3 Jun 2018 13:22:59 +0200 Subject: [PATCH 16/86] Typo fix (misplaced parenthesis) --- kernel/x86/trsm_kernel_LT_4x4_penryn.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86/trsm_kernel_LT_4x4_penryn.S b/kernel/x86/trsm_kernel_LT_4x4_penryn.S index 361ccf6030..e2f731fca8 100644 --- a/kernel/x86/trsm_kernel_LT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL || defined (SKYLAKEX)) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif From f1fb9a474571846ffc140313dbe5b8ba21925b74 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 3 Jun 2018 13:48:27 +0200 Subject: [PATCH 17/86] Propagate NO_AVX512 if needed --- Makefile.system | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.system b/Makefile.system index b005b80c9f..cec4b44e55 100644 --- a/Makefile.system +++ 
b/Makefile.system @@ -147,6 +147,10 @@ ifeq ($(NO_AVX2), 1) GETARCH_FLAGS += -DNO_AVX2 endif +ifeq ($(NO_AVX512), 1) +GETARCH_FLAGS += -DNO_AVX512 +endif + ifeq ($(DEBUG), 1) GETARCH_FLAGS += -g endif From a7d0f49cec68dc3f116feed0320708ae004af4c4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 3 Jun 2018 23:13:25 +0200 Subject: [PATCH 18/86] Add SKYLAKEX to DYNAMIC_CORE list only if AVX512 is available --- Makefile.system | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index cec4b44e55..82e38a6d28 100644 --- a/Makefile.system +++ b/Makefile.system @@ -477,7 +477,12 @@ ifneq ($(NO_AVX), 1) DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR endif ifneq ($(NO_AVX2), 1) -DYNAMIC_CORE += HASWELL ZEN SKYLAKEX +DYNAMIC_CORE += HASWELL ZEN +endif +ifneq ($(NO_AVX512), 1) +ifneq ($(NO_AVX2), 1) +DYNAMIC_CORE += SKYLAKEX +endif endif endif From 5a92b311e05fb938e1fd85dcaf6fbeebc77bd4fb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 3 Jun 2018 23:29:07 +0200 Subject: [PATCH 19/86] Separate Skylake X from Skylake --- cpuid_x86.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index 5f49e77157..d0dbe1d24e 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1301,6 +1301,19 @@ int get_cpuname(void){ else return CPUTYPE_NEHALEM; case 5: + // Skylake X +#ifndef NO_AVX512 + return CPUTYPE_SKYLAKEX; +#else + if(support_avx()) +#ifndef NO_AVX2 + return CPUTYPE_HASWELL; +#else + return CPUTYPE_SANDYBRIDGE; +#endif + else + return CPUTYPE_NEHALEM; +#endif case 14: // Skylake if(support_avx()) @@ -1558,6 +1571,7 @@ static char *cpuname[] = { "STEAMROLLER", "EXCAVATOR", "ZEN", + "SKYLAKEX" }; static char *lowercpuname[] = { @@ -1612,6 +1626,7 @@ static char *lowercpuname[] = { "steamroller", "excavator", "zen", + "skylakex" }; static char *corename[] = { @@ -1643,6 +1658,7 @@ static char *corename[] = { "STEAMROLLER", "EXCAVATOR", "ZEN", + 
"SKYLAKEX" }; static char *corename_lower[] = { @@ -1674,6 +1690,7 @@ static char *corename_lower[] = { "steamroller", "excavator", "zen", + "skylakex" }; @@ -1862,6 +1879,19 @@ int get_coretype(void){ else return CORE_NEHALEM; case 5: + // Skylake X +#ifndef NO_AVX512 + return CORE_SKYLAKEX; +#else + if/support_avx()) +#ifndef NO_AVX2 + return CORE_HASWELL; +#else + return CORE_SANDYBRIDGE; +#endif + else + return CORE_NEHALEM; +#endif case 14: // Skylake if(support_avx()) From 5a51cf4576df2e065e5517b04369ff10a2a83f58 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 3 Jun 2018 23:41:33 +0200 Subject: [PATCH 20/86] Separate Skylake X from Skylake --- driver/others/dynamic.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index a0c9794b1c..5e9a24b8b5 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -79,6 +79,11 @@ extern gotoblas_t gotoblas_EXCAVATOR; #else extern gotoblas_t gotoblas_HASWELL; extern gotoblas_t gotoblas_ZEN; +#ifndef NO_AVX512 +extern gotoblas_t gotoblas_SKYLAKEX; +#else +#define gotoblas_SKYLAKEX gotoblas_HASWELL; +#endif #endif #else //Use NEHALEM kernels for sandy bridge @@ -286,8 +291,21 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. 
} } + if (model == 5) { + // Intel Skylake X +#ifndef NO_AVX512 + return $gotoblas_SKYLAKEX; +#else + if(support_avx()) + return &gotoblas_HASWELL; + else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } + } +#endif //Intel Skylake - if (model == 14 || model == 5) { + if (model == 14) { if(support_avx()) return &gotoblas_HASWELL; else{ @@ -447,7 +465,8 @@ static char *corename[] = { "Haswell", "Steamroller", "Excavator", - "Zen" + "Zen", + "SkylakeX" }; char *gotoblas_corename(void) { @@ -475,7 +494,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_STEAMROLLER) return corename[21]; if (gotoblas == &gotoblas_EXCAVATOR) return corename[22]; if (gotoblas == &gotoblas_ZEN) return corename[23]; - + if (gotoblas == &gotoblas_SKYLAKEX) return corename[24]; return corename[0]; } @@ -505,6 +524,7 @@ static gotoblas_t *force_coretype(char *coretype){ switch (found) { + case 24: return (&gotoblas_SKYLAKEX); case 23: return (&gotoblas_ZEN); case 22: return (&gotoblas_EXCAVATOR); case 21: return (&gotoblas_STEAMROLLER); From 83fec56a3f55fa24b2e541549852bdee03d30a0c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 4 Jun 2018 00:01:11 +0200 Subject: [PATCH 21/86] Disable AVX512 (Skylake X) support if the build system is too old --- c_check | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/c_check b/c_check index a3b3376024..dfe99350aa 100644 --- a/c_check +++ b/c_check @@ -201,6 +201,21 @@ $architecture = zarch if ($data =~ /ARCH_ZARCH/); $binformat = bin32; $binformat = bin64 if ($data =~ /BINARY_64/); +$no_avx512= 0; +if (($architecture eq "x86") || ($architecture eq "x86_64")) { + $code = '"vaddps %zmm1, %zmm0, %zmm0"'; + print $tmpf "void main(void){ __asm__ volatile($code); }\n"; + $args = " -o $tmpf.o -x c $tmpf"; + my @cmd = ("$compiler_name $args"); + system(@cmd) == 0; + if ($? 
!= 0) { + $no_avx512 = 1; + } else { + $no_avx512 = 0; + } + unlink("tmpf.o"); +} + $data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`; $data =~ /globl\s([_\.]*)(.*)/; @@ -288,6 +303,7 @@ print MAKEFILE "CROSS=1\n" if $cross != 0; print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n"; print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1; print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1; +print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1; $os =~ tr/[a-z]/[A-Z]/; $architecture =~ tr/[a-z]/[A-Z]/; From ef626c6824c26415bc074d11325245e72f9e3284 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 4 Jun 2018 00:13:19 +0200 Subject: [PATCH 22/86] typo fix --- driver/others/dynamic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 5e9a24b8b5..2c902d1083 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -294,7 +294,7 @@ static gotoblas_t *get_coretype(void){ if (model == 5) { // Intel Skylake X #ifndef NO_AVX512 - return $gotoblas_SKYLAKEX; + return &gotoblas_SKYLAKEX; #else if(support_avx()) return &gotoblas_HASWELL; From 89372e0993b7d9fe9061797625713519392fa42b Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 3 Jun 2018 22:15:09 +0000 Subject: [PATCH 23/86] Use AVX512 also for DGEMM this required switching to the generic gemm_beta code (which is faster anyway on SKX) for both DGEMM and SGEMM Performance for the not-retuned version is in the 30% range --- kernel/x86_64/KERNEL.SKYLAKEX | 15 + kernel/x86_64/dgemm_kernel_16x2_skylakex.S | 5138 ++++++++++++++++++++ kernel/x86_64/sgemm_kernel_16x4_skylakex.S | 3 +- 3 files changed, 5154 insertions(+), 2 deletions(-) create mode 100644 kernel/x86_64/dgemm_kernel_16x2_skylakex.S diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 744831d678..c273ff8cd1 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -2,3 +2,18 @@ 
include $(KERNELDIR)/KERNEL.HASWELL SGEMMKERNEL = sgemm_kernel_16x4_skylakex.S + +DTRMMKERNEL = ../generic/trmmkernel_16x2.c +DGEMMKERNEL = dgemm_kernel_16x2_skylakex.S +DGEMMINCOPY = ../generic/gemm_ncopy_16.c +DGEMMITCOPY = ../generic/gemm_tcopy_16.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + + +SGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = ../generic/gemm_beta.c \ No newline at end of file diff --git a/kernel/x86_64/dgemm_kernel_16x2_skylakex.S b/kernel/x86_64/dgemm_kernel_16x2_skylakex.S new file mode 100644 index 0000000000..91ac512805 --- /dev/null +++ b/kernel/x86_64/dgemm_kernel_16x2_skylakex.S @@ -0,0 +1,5138 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +/********************************************************************* +* 2013/10/20 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK + +* +* +* 2013/10/20 Saar +* Parameter: +* DGEMM_DEFAULT_UNROLL_N 2 +* DGEMM_DEFAULT_UNROLL_M 16 +* DGEMM_DEFAULT_P 192 +* DGEMM_DEFAULT_Q 128 +* A_PR1 512 +* +* +* Performance without prefetch of B: +* 1 thread: 45.8 GFLOPS (MKL: 45) +* 2 threads: 80.0 GFLOPS (MKL: 91) +* 4 threads: 135.0 GFLOPS (MKL: 135) +*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + 
STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 512*8*4 +#define LB2_OFFSET 512*8*2 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#if defined(BULLDOZER) + +.macro VFMADD231PD_ y0,y1,y2 + vfmaddpd \y0,\y1,\y2,\y0 +.endm + +.macro VFMADD231SD_ x0,x1,x2 + vfmaddsd \x0,\x1,\x2,\x0 +.endm + +#else + +.macro VFMADD231PD_ y0,y1,y2 + vfmadd231pd \y2,\y1,\y0 +.endm + +.macro VFMADD231SD_ x0,x1,x2 + vfmadd231sd \x2,\x1,\x0 +.endm + +#endif + + +#define A_PR1 1024 +#define B_PR1 256 + +/******************************************************************************************* +* 3 lines of N +*******************************************************************************************/ + +.macro KERNEL16x3_SUBN + vbroadcastsd -12 * SIZE(BO), %zmm1 + vbroadcastsd -11 * SIZE(BO), %zmm2 + vbroadcastsd -10 * SIZE(BO), %zmm3 + + vmovaps -16 * SIZE(AO), %zmm0 + VFMADD231PD_ %zmm4,%zmm1,%zmm0 + VFMADD231PD_ %zmm5,%zmm2,%zmm0 + VFMADD231PD_ %zmm6,%zmm3,%zmm0 + + vmovaps -8 * SIZE(AO), %zmm9 + VFMADD231PD_ %zmm10,%zmm1,%zmm9 + VFMADD231PD_ %zmm11,%zmm2,%zmm9 + VFMADD231PD_ %zmm12,%zmm3,%zmm9 + addq $ 3*SIZE , BO + addq $ 16*SIZE, AO +.endm + + +.macro 
KERNEL8x3_SUBN + vbroadcastsd -12 * SIZE(BO), %ymm1 + vmovaps -16 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -11 * SIZE(BO), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -10 * SIZE(BO), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovaps -12 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + prefetcht0 B_PR1(BO) + addq $ 3*SIZE , BO + addq $ 8*SIZE, AO +.endm + +.macro KERNEL4x3_SUBN + vbroadcastsd -12 * SIZE(BO), %ymm1 + vmovaps -16 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -11 * SIZE(BO), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -10 * SIZE(BO), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + addq $ 3*SIZE , BO + addq $ 4*SIZE, AO +.endm + +.macro KERNEL2x3_SUBN + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -10 * SIZE(BO), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -15 * SIZE(AO), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 + addq $ 3*SIZE , BO + addq $ 2*SIZE, AO +.endm + +.macro KERNEL1x3_SUBN + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -10 * SIZE(BO), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + addq $ 3*SIZE , BO + addq $ 1*SIZE, AO +.endm + + + + + + +/******************************************************************************************/ + +.macro KERNEL16x3_1 + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %zmm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %zmm0 + VFMADD231PD_ %zmm4,%zmm1,%zmm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %zmm2 + VFMADD231PD_ %zmm5,%zmm2,%zmm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %zmm3 + VFMADD231PD_ %zmm6,%zmm3,%zmm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %zmm0 + VFMADD231PD_ %zmm10,%zmm1,%zmm0 + 
VFMADD231PD_ %zmm11,%zmm2,%zmm0 + VFMADD231PD_ %zmm12,%zmm3,%zmm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %zmm1 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %zmm2 +.endm + + + + +.macro KERNEL16x3_2 + vmovups -16 * SIZE(AO, %rax, SIZE), %zmm0 + VFMADD231PD_ %zmm4,%zmm1,%zmm0 + VFMADD231PD_ %zmm5,%zmm2,%zmm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %zmm3 + VFMADD231PD_ %zmm6,%zmm3,%zmm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %zmm0 + VFMADD231PD_ %zmm10,%zmm1,%zmm0 + VFMADD231PD_ %zmm11,%zmm2,%zmm0 + VFMADD231PD_ %zmm12,%zmm3,%zmm0 + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %zmm1 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %zmm2 +.endm + +.macro KERNEL16x3_3 + vmovups 0 * SIZE(AO, %rax, SIZE), %zmm0 + VFMADD231PD_ %zmm4,%zmm1,%zmm0 + VFMADD231PD_ %zmm5,%zmm2,%zmm0 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %zmm3 + VFMADD231PD_ %zmm6,%zmm3,%zmm0 + vmovups 8 * SIZE(AO, %rax, SIZE), %zmm0 + VFMADD231PD_ %zmm10,%zmm1,%zmm0 + VFMADD231PD_ %zmm11,%zmm2,%zmm0 + VFMADD231PD_ %zmm12,%zmm3,%zmm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %zmm1 + vbroadcastsd 4 * SIZE(BO, BI, SIZE), %zmm2 +.endm + +.macro KERNEL16x3_4 + vmovups 16 * SIZE(AO, %rax, SIZE), %zmm0 + VFMADD231PD_ %zmm4,%zmm1,%zmm0 + VFMADD231PD_ %zmm5,%zmm2,%zmm0 + vbroadcastsd 5 * SIZE(BO, BI, SIZE), %zmm3 + VFMADD231PD_ %zmm6,%zmm3,%zmm0 + vmovups 24 * SIZE(AO, %rax, SIZE), %zmm0 + VFMADD231PD_ %zmm10,%zmm1,%zmm0 + VFMADD231PD_ %zmm11,%zmm2,%zmm0 + addq $12, BI + VFMADD231PD_ %zmm12,%zmm3,%zmm0 + addq $64, %rax +.endm + +.macro KERNEL16x3_SUB + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %zmm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %zmm0 + VFMADD231PD_ %zmm4,%zmm1,%zmm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %zmm2 + VFMADD231PD_ %zmm5,%zmm2,%zmm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %zmm3 + VFMADD231PD_ %zmm6,%zmm3,%zmm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %zmm0 + VFMADD231PD_ %zmm10,%zmm1,%zmm0 + VFMADD231PD_ %zmm11,%zmm2,%zmm0 + VFMADD231PD_ %zmm12,%zmm3,%zmm0 + addq $3 , BI + addq $16, %rax +.endm + +.macro SAVE16x3 + + vbroadcastsd 
ALPHA, %zmm0 + + vmulpd %zmm0 , %zmm4 , %zmm4 + vmulpd %zmm0 , %zmm10, %zmm10 + + vmulpd %zmm0 , %zmm5 , %zmm5 + vmulpd %zmm0 , %zmm11, %zmm11 + + vmulpd %zmm0 , %zmm6 , %zmm6 + vmulpd %zmm0 , %zmm12, %zmm12 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %zmm4,%zmm4 + vaddpd 8 * SIZE(CO1), %zmm10,%zmm10 + + vaddpd (CO1, LDC), %zmm5,%zmm5 + vaddpd 8 * SIZE(CO1, LDC), %zmm11,%zmm11 + + vaddpd (CO1, LDC, 2), %zmm6,%zmm6 + vaddpd 8 * SIZE(CO1, LDC, 2), %zmm12,%zmm12 + +#endif + + vmovups %zmm4 , (CO1) + vmovups %zmm10, 8 * SIZE(CO1) + + vmovups %zmm5 , (CO1, LDC) + vmovups %zmm11, 8 * SIZE(CO1, LDC) + + vmovups %zmm6 , (CO1, LDC, 2) + vmovups %zmm12, 8 * SIZE(CO1, LDC, 2) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x3_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 +.endm + +.macro KERNEL8x3_2 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 +.endm + +.macro KERNEL8x3_3 + prefetcht0 128+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + 
VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 +.endm + +.macro KERNEL8x3_4 + prefetcht0 192+A_PR1(AO, %rax, SIZE) + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + addq $12, BI + addq $32, %rax +.endm + +.macro KERNEL8x3_SUB + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + addq $3 , BI + addq $8 , %rax +.endm + +.macro SAVE8x3 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm8 , %ymm8 + + vmulpd %ymm0 , %ymm6 , %ymm6 + vmulpd %ymm0 , %ymm9 , %ymm9 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 + + vaddpd (CO1, LDC, 2), %ymm6,%ymm6 + vaddpd 4 * SIZE(CO1, LDC, 2), %ymm9,%ymm9 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm8 , 4 * SIZE(CO1, LDC) + + vmovups %ymm6 , (CO1, LDC, 2) + vmovups %ymm9 , 4 * SIZE(CO1, LDC, 2) + +.endm + + + 
+/*******************************************************************************************/ + +.macro KERNEL4x3_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 +.endm + +.macro KERNEL4x3_2 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 +.endm + +.macro KERNEL4x3_3 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 +.endm + +.macro KERNEL4x3_4 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + addq $12, BI + addq $16, %rax +.endm + +.macro KERNEL4x3_SUB + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + addq $3 , BI + addq $4 , %rax +.endm + +.macro SAVE4x3 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd (CO1, LDC), 
%ymm5,%ymm5 + vaddpd (CO1, LDC, 2), %ymm6,%ymm6 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (CO1, LDC, 2) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x3_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 +.endm + +.macro KERNEL2x3_2 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 +.endm + +.macro KERNEL2x3_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 +.endm + +.macro KERNEL2x3_4 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 5 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ 
%xmm12,%xmm3,%xmm0 + addq $12, BI + addq $8, %rax +.endm + +.macro KERNEL2x3_SUB + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 + addq $3 , BI + addq $2 , %rax +.endm + +.macro SAVE2x3 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm8 , %xmm8 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm10, %xmm10 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm12, %xmm12 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 + vaddsd (CO1, LDC), %xmm5,%xmm5 + vaddsd 1 * SIZE(CO1, LDC), %xmm10,%xmm10 + vaddsd (CO1, LDC, 2), %xmm6,%xmm6 + vaddsd 1 * SIZE(CO1, LDC, 2), %xmm12,%xmm12 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm8 , 1 * SIZE(CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm10, 1 * SIZE(CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + vmovsd %xmm12, 1 * SIZE(CO1, LDC, 2) + +.endm + +/*******************************************************************************************/ + +.macro KERNEL1x3_1 + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 +.endm + +.macro KERNEL1x3_2 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 +.endm + +.macro KERNEL1x3_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), 
%xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 +.endm + +.macro KERNEL1x3_4 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 5 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + addq $12, BI + addq $4, %rax +.endm + +.macro KERNEL1x3_SUB + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + addq $3 , BI + addq $1 , %rax +.endm + +.macro SAVE1x3 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd (CO1, LDC), %xmm5,%xmm5 + vaddsd (CO1, LDC, 2), %xmm6,%xmm6 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +.macro KERNEL16x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + 
VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 +.endm + +.macro KERNEL16x2_2 + prefetcht0 128+A_PR1(AO, %rax, SIZE) + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 192+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 +.endm + +.macro KERNEL16x2_3 + prefetcht0 256+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 320+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 +.endm + +.macro KERNEL16x2_4 + prefetcht0 384+A_PR1(AO, %rax, SIZE) + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 448+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 + 
VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + addq $8, BI + addq $64, %rax +.endm + +.macro KERNEL16x2_SUB + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + addq $2, BI + addq $16, %rax +.endm + +.macro SAVE16x2 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm13, %ymm13 + + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm8 , %ymm8 + vmulpd %ymm0 , %ymm11, %ymm11 + vmulpd %ymm0 , %ymm14, %ymm14 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 + vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 + + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 + vaddpd 8 * SIZE(CO1, LDC), %ymm11,%ymm11 + vaddpd 12 * SIZE(CO1, LDC), %ymm14,%ymm14 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + vmovups %ymm10, 8 * SIZE(CO1) + vmovups %ymm13,12 * SIZE(CO1) + + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm8 , 4 * SIZE(CO1, LDC) + vmovups %ymm11, 8 * SIZE(CO1, LDC) + vmovups %ymm14,12 * SIZE(CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 
* SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 +.endm + +.macro KERNEL8x2_2 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 +.endm + +.macro KERNEL8x2_3 + prefetcht0 128+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 +.endm + +.macro KERNEL8x2_4 + prefetcht0 192+A_PR1(AO, %rax, SIZE) + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + addq $8, BI + addq $32, %rax +.endm + +.macro KERNEL8x2_SUB + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + addq $2, BI + addq $8 , %rax +.endm + +.macro SAVE8x2 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm8 , %ymm8 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 + +#endif + 
+ vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm8 , 4 * SIZE(CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 +.endm + +.macro KERNEL4x2_2 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 +.endm + +.macro KERNEL4x2_3 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 +.endm + +.macro KERNEL4x2_4 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + addq $8, BI + addq $16, %rax +.endm + +.macro KERNEL4x2_SUB + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + addq $2, BI + addq $4 , %rax +.endm + +.macro SAVE4x2 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm5 , %ymm5 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd (CO1, LDC), %ymm5,%ymm5 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 + 
vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 +.endm + +.macro KERNEL2x2_2 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 +.endm + +.macro KERNEL2x2_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 +.endm + +.macro KERNEL2x2_4 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + addq $8, BI + addq $8, %rax +.endm + +.macro KERNEL2x2_SUB + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + addq $2, BI + addq $2, %rax +.endm + +.macro SAVE2x2 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm8 , %xmm8 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm10, %xmm10 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 + vaddsd (CO1, LDC), %xmm5,%xmm5 + vaddsd 1 * SIZE(CO1, LDC), %xmm10,%xmm10 + +#endif + + vmovsd %xmm4 
, (CO1) + vmovsd %xmm8 , 1 * SIZE(CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm10, 1 * SIZE(CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x2_1 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 +.endm + +.macro KERNEL1x2_2 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 +.endm + +.macro KERNEL1x2_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 +.endm + +.macro KERNEL1x2_4 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + addq $8, BI + addq $4, %rax +.endm + +.macro KERNEL1x2_SUB + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + addq $2, BI + addq $1, %rax +.endm + +.macro SAVE1x2 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd (CO1, LDC), %xmm5,%xmm5 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +.macro KERNEL16x1_1 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), 
%ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 +.endm + +.macro KERNEL16x1_2 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 +.endm + +.macro KERNEL16x1_3 + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 +.endm + +.macro KERNEL16x1_4 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + addq $4, BI + addq $64, %rax +.endm + +.macro KERNEL16x1_SUB + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + addq $1, BI + addq $16, %rax +.endm + +.macro SAVE16x1 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + 
vmulpd %ymm0 , %ymm7 , %ymm7 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm13, %ymm13 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 + vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + vmovups %ymm10, 8 * SIZE(CO1) + vmovups %ymm13,12 * SIZE(CO1) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x1_1 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 +.endm + +.macro KERNEL8x1_2 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 +.endm + +.macro KERNEL8x1_3 + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 +.endm + +.macro KERNEL8x1_4 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + addq $4, BI + addq $32, %rax +.endm + +.macro KERNEL8x1_SUB + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + addq $1, BI + addq $8 , %rax +.endm + +.macro SAVE8x1 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + +.endm + + + 
+/*******************************************************************************************/ + +.macro KERNEL4x1_1 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 +.endm + +.macro KERNEL4x1_2 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 +.endm + +.macro KERNEL4x1_3 + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 +.endm + +.macro KERNEL4x1_4 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + addq $4, BI + addq $16, %rax +.endm + +.macro KERNEL4x1_SUB + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + addq $1, BI + addq $4 , %rax +.endm + +.macro SAVE4x1 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + +#endif + + vmovups %ymm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x1_1 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 +.endm + +.macro KERNEL2x1_2 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 +.endm + +.macro KERNEL2x1_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 +.endm + +.macro KERNEL2x1_4 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 
+ vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + addq $4, BI + addq $8, %rax +.endm + +.macro KERNEL2x1_SUB + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + addq $1, BI + addq $2 , %rax +.endm + +.macro SAVE2x1 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm8 , %xmm8 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm8 , 1 * SIZE(CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x1_1 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 +.endm + +.macro KERNEL1x1_2 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 +.endm + +.macro KERNEL1x1_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 +.endm + +.macro KERNEL1x1_4 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + addq $ 4, BI + addq $ 4, %rax +.endm + +.macro KERNEL1x1_SUB + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + addq $ 1, BI + addq $ 1 , %rax +.endm + +.macro SAVE1x1 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + +#endif + + vmovsd %xmm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + 
movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +.L6_01: + // copy to sub buffer + movq K, %rax + salq $1,%rax // K * 2 ; read 2 values + movq B, BO1 + leaq (B,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_01a_2 + ALIGN_4 + +.L6_01a_1: + + prefetcht0 512(BO1) + prefetcht0 512(BO2) + prefetchw 512(BO) + + + vmovups 0 * SIZE(BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm2 + vmovups 4 * SIZE(BO1), %xmm4 + vmovups 6 * SIZE(BO1), %xmm6 + vmovsd 0 * SIZE(BO2), %xmm1 + vmovsd 2 * SIZE(BO2), %xmm3 + vmovsd 4 * SIZE(BO2), %xmm5 + vmovsd 6 * SIZE(BO2), %xmm7 + vmovups %xmm0, 0*SIZE(BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovups %xmm2, 3*SIZE(BO) + vmovsd %xmm3, 5*SIZE(BO) + vmovups %xmm4, 6*SIZE(BO) + vmovsd %xmm5, 8*SIZE(BO) + vmovups %xmm6, 9*SIZE(BO) + vmovsd %xmm7,11*SIZE(BO) + addq $ 8*SIZE,BO1 + addq $ 8*SIZE,BO2 + addq $ 12*SIZE,BO + + vmovups 0 * SIZE(BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm2 + vmovups 
4 * SIZE(BO1), %xmm4 + vmovups 6 * SIZE(BO1), %xmm6 + vmovsd 0 * SIZE(BO2), %xmm1 + vmovsd 2 * SIZE(BO2), %xmm3 + vmovsd 4 * SIZE(BO2), %xmm5 + vmovsd 6 * SIZE(BO2), %xmm7 + vmovups %xmm0, 0*SIZE(BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovups %xmm2, 3*SIZE(BO) + vmovsd %xmm3, 5*SIZE(BO) + vmovups %xmm4, 6*SIZE(BO) + vmovsd %xmm5, 8*SIZE(BO) + vmovups %xmm6, 9*SIZE(BO) + vmovsd %xmm7,11*SIZE(BO) + addq $ 8*SIZE,BO1 + addq $ 8*SIZE,BO2 + addq $ 12*SIZE,BO + + decq %rax + jnz .L6_01a_1 + + + +.L6_01a_2: + + movq K, %rax + andq $7, %rax // K % 8 + jz .L6_02c + ALIGN_4 + + +.L6_02b: + + vmovups 0 * SIZE(BO1), %xmm0 + vmovsd 0 * SIZE(BO2), %xmm2 + vmovups %xmm0, 0*SIZE(BO) + vmovsd %xmm2, 2*SIZE(BO) + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO2 + addq $ 3*SIZE,BO + decq %rax + jnz .L6_02b + +.L6_02c: + + movq K, %rax + salq $1,%rax // K * 2 + leaq (B,%rax, SIZE), BO1 // next offset to BO1 + leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER2, BO // second buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_02c_2 + ALIGN_4 + +.L6_02c_1: + + prefetcht0 512(BO2) + prefetchw 512(BO) + + vmovups 0 * SIZE(BO2), %xmm0 + vmovups 2 * SIZE(BO2), %xmm2 + vmovups 4 * SIZE(BO2), %xmm4 + vmovups 6 * SIZE(BO2), %xmm6 + vmovsd 1 * SIZE(BO1), %xmm1 + vmovsd 3 * SIZE(BO1), %xmm3 + vmovsd 5 * SIZE(BO1), %xmm5 + vmovsd 7 * SIZE(BO1), %xmm7 + vmovsd %xmm1, 0*SIZE(BO) + vmovups %xmm0, 1*SIZE(BO) + vmovsd %xmm3, 3*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovsd %xmm5, 6*SIZE(BO) + vmovups %xmm4, 7*SIZE(BO) + vmovsd %xmm7, 9*SIZE(BO) + vmovups %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + + vmovups 0 * SIZE(BO2), %xmm0 + vmovups 2 * SIZE(BO2), %xmm2 + vmovups 4 * SIZE(BO2), %xmm4 + vmovups 6 * SIZE(BO2), %xmm6 + vmovsd 1 * SIZE(BO1), %xmm1 + vmovsd 3 * SIZE(BO1), %xmm3 + vmovsd 5 * SIZE(BO1), %xmm5 + vmovsd 7 * SIZE(BO1), %xmm7 + vmovsd %xmm1, 0*SIZE(BO) + vmovups %xmm0, 1*SIZE(BO) + vmovsd %xmm3, 3*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovsd %xmm5, 
6*SIZE(BO) + vmovups %xmm4, 7*SIZE(BO) + vmovsd %xmm7, 9*SIZE(BO) + vmovups %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + decq %rax + jnz .L6_02c_1 + + +.L6_02c_2: + + movq K, %rax + andq $7, %rax // K % 8 + jz .L6_03c + ALIGN_4 + +.L6_03b: + + vmovsd 1*SIZE(BO1), %xmm0 + vmovups 0*SIZE(BO2), %xmm1 + vmovsd %xmm0, 0*SIZE(BO) + vmovups %xmm1, 1*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_03b + + +.L6_03c: + + movq BO2, B // next offset of B + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L6_20 + + ALIGN_4 + +.L6_11: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + prefetcht0 (CO1) + prefetcht0 (CO1,LDC,1) + prefetcht0 (CO1,LDC,2) + prefetcht0 64(CO1) + prefetcht0 64(CO1,LDC,1) + prefetcht0 64(CO1,LDC,2) + + vzeroall + + movq K, %rax + + sarq $1, %rax // K / 8 + je .L6_16 + + ALIGN_5 + +.L6_12: +/* + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + prefetcht0 B_PR1+128(BO) +*/ + KERNEL16x3_SUBN + KERNEL16x3_SUBN +/* + KERNEL16x3_SUBN + KERNEL16x3_SUBN + + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN +*/ + dec %rax + jne .L6_12 + +.L6_16: + movq K, %rax + + andq $1, %rax # if (k & 1) + je .L6_19 + + ALIGN_4 + +.L6_17: + + KERNEL16x3_SUBN + + dec %rax + jne .L6_17 + ALIGN_4 + + +.L6_19: + + SAVE16x3 + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L6_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $15, M + jz .L7_10 // to next 3 lines of N + + testq $8, M + jz .L6_21pre + ALIGN_4 + +/**************************************************************************/ + +.L6_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, 
BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L6_20_6 + + ALIGN_4 + +.L6_20_2: + + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + dec %rax + jne .L6_20_2 + ALIGN_4 + +.L6_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_20_9 + + + ALIGN_4 + +.L6_20_7: + + KERNEL8x3_SUBN + + dec %rax + jne .L6_20_7 + ALIGN_4 + + +.L6_20_9: + + SAVE8x3 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L6_21pre: + + testq $4, M + jz .L6_30 + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L6_26 + + ALIGN_4 + +.L6_22: + + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + dec %rax + jne .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + ALIGN_4 + +.L6_27: + + KERNEL4x3_SUBN + + dec %rax + jne .L6_27 + ALIGN_4 + + +.L6_29: + + SAVE4x3 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L6_36 + ALIGN_4 + +.L6_32: + + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + dec %rax + jne .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + ALIGN_4 + +.L6_37: + + KERNEL2x3_SUBN + + dec %rax + jne .L6_37 + ALIGN_4 + + +.L6_39: + + SAVE2x3 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L7_10 // to next 3 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3,%rax + je .L6_46 + + 
ALIGN_4 + +.L6_42: + + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + + dec %rax + jne .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + ALIGN_4 + +.L6_47: + + KERNEL1x3_SUBN + + dec %rax + jne .L6_47 + ALIGN_4 + + +.L6_49: + + SAVE1x3 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + +/***************************************************************************************************************/ + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L7_20 + + ALIGN_4 + +.L7_11: + leaq BUFFER2, BO // second buffer to BO + addq $12 * SIZE, BO + + prefetcht0 (CO1) + prefetcht0 (CO1,LDC,1) + prefetcht0 (CO1,LDC,2) + prefetcht0 64(CO1) + prefetcht0 64(CO1,LDC,1) + prefetcht0 64(CO1,LDC,2) + + vzeroall + + movq K, %rax + + sarq $3, %rax // K / 8 + je .L7_16 + ALIGN_5 + +.L7_12: +/* + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + prefetcht0 B_PR1+128(BO) +*/ + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + dec %rax + jne .L7_12 + ALIGN_4 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + ALIGN_5 + +.L7_17: + + KERNEL16x3_SUBN + + dec %rax + jne .L7_17 + + +.L7_19: + + SAVE16x3 + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L7_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L7_20: + // Test rest of M + + testq $15, M + jz .L7_60 // to next 3 lines of N + + testq $8, M + jz .L7_21pre + ALIGN_4 + +/**************************************************************************/ + +.L7_20_1: + leaq BUFFER2, BO // first buffer to BO + addq 
$12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L7_20_6 + + ALIGN_4 + +.L7_20_2: + + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + + dec %rax + jne .L7_20_2 + ALIGN_4 + +.L7_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_20_9 + + ALIGN_4 + +.L7_20_7: + + KERNEL8x3_SUBN + + dec %rax + jne .L7_20_7 + ALIGN_4 + +.L7_20_9: + + SAVE8x3 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L7_21pre: + + testq $4, M + jz .L7_30 + ALIGN_4 + +.L7_21: + leaq BUFFER2, BO // second buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L7_26 + + ALIGN_4 + +.L7_22: + + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + + dec %rax + jne .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + ALIGN_4 + +.L7_27: + + KERNEL4x3_SUBN + + dec %rax + jne .L7_27 + ALIGN_4 + + +.L7_29: + + SAVE4x3 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER2, BO // second buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L7_36 + + ALIGN_4 + +.L7_32: + + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + + dec %rax + jne .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + ALIGN_4 + +.L7_37: + + KERNEL2x3_SUBN + + dec %rax + jne .L7_37 + ALIGN_4 + + +.L7_39: + + SAVE2x3 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L7_40: + testq $1, M + jz .L7_60 // to next 3 lines of N + + ALIGN_4 + +.L7_41: + leaq BUFFER2, BO // second buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, 
%rax + je .L7_46 + + ALIGN_4 + +.L7_42: + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + + dec %rax + jne .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + ALIGN_4 + +.L7_47: + + KERNEL1x3_SUBN + + dec %rax + jne .L7_47 + ALIGN_4 + + +.L7_49: + + SAVE1x3 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L7_60: + + decq J // j -- + jg .L6_01 + + +.L2_0: + cmpq $0, Nmod6 // N % 6 == 0 + je .L999 + +/************************************************************************************************ +* Loop for Nmod6 / 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + sarq $1, J // j = j / 2 + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 2*SIZE(BO1), %xmm1 + vmovups 4*SIZE(BO1), %xmm2 + vmovups 6*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 2*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovups %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + 
salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 3 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + je .L2_20_6 + + prefetcht0 B_PR1(BO,BI,8) + 
KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + je .L2_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 
+ +.L2_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + 
ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + +.L2_60: + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + + addq $16 
* SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + 
KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1 + 
KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + 
movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 2*SIZE(BO1), %xmm1 + vmovups 4*SIZE(BO1), %xmm2 + vmovups 6*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 2*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovups %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 
* SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI 
= BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 3 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + je .L2_20_6 + + 
prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef 
LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + je .L2_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + 
movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && 
defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq 
KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + 
addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) 
&& !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * 
SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && 
!defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + 
KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + 
KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + + + +#endif diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex.S b/kernel/x86_64/sgemm_kernel_16x4_skylakex.S index 1fab892ca7..ac4421252d 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_skylakex.S +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex.S @@ -159,7 +159,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -16 * SIZE(AO), %zmm0 vbroadcastss -4 * SIZE(BO), %zmm2 vbroadcastss -3 * SIZE(BO), %zmm3 - prefetcht0 A_PR1(AO) +# prefetcht0 A_PR1(AO) VFMADD231PS_( %zmm4,%zmm2,%zmm0 ) VFMADD231PS_( %zmm6,%zmm3,%zmm0 ) @@ -183,7 +183,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vmovups -16 * SIZE(AO), %zmm0 vbroadcastss -4 * SIZE(BO), %zmm2 vbroadcastss -3 * SIZE(BO), %zmm3 - prefetcht0 A_PR1(AO) VFMADD231PS_( %zmm4,%zmm2,%zmm0 ) VFMADD231PS_( %zmm6,%zmm3,%zmm0 ) From ac7b6e3e9aeffe111a0ef23ba74ac2b181b87e30 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 4 Jun 2018 08:23:40 +0200 Subject: [PATCH 24/86] Fix misplaced endif --- driver/others/dynamic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 2c902d1083..ac1186c8ff 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -302,8 +302,8 @@ static gotoblas_t *get_coretype(void){ openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; } +#endif } -#endif //Intel Skylake if (model == 14) { if(support_avx()) From 8be027e4c62460f373980e883c487a30a15b5a5d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 4 Jun 2018 14:36:39 +0200 Subject: [PATCH 25/86] Update dynamic.c --- driver/others/dynamic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index ac1186c8ff..96612cc52f 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -82,7 +82,7 @@ extern gotoblas_t gotoblas_ZEN; #ifndef NO_AVX512 extern gotoblas_t gotoblas_SKYLAKEX; #else -#define gotoblas_SKYLAKEX gotoblas_HASWELL; +#define gotoblas_SKYLAKEX gotoblas_HASWELL #endif #endif #else From dc9fe05ab5845452d684746bb7b7b7ad400c0c31 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 4 Jun 2018 17:10:19 +0200 Subject: [PATCH 26/86] Update cpuid_x86.c --- cpuid_x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index d0dbe1d24e..fc937865cb 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1883,7 +1883,7 @@ int get_coretype(void){ #ifndef NO_AVX512 return CORE_SKYLAKEX; #else - if/support_avx()) + if(support_avx()) #ifndef NO_AVX2 return CORE_HASWELL; #else From 
b7feded85acaf95d68ed4cfd573e60c83fdbca5d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 5 Jun 2018 10:24:05 +0200 Subject: [PATCH 27/86] Propagate NO_AVX512 via CCOMMON_OPT --- Makefile.system | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.system b/Makefile.system index 82e38a6d28..8c875d6f78 100644 --- a/Makefile.system +++ b/Makefile.system @@ -939,6 +939,10 @@ ifeq ($(NO_AVX2), 1) CCOMMON_OPT += -DNO_AVX2 endif +ifeq ($(NO_AVX512), 1) +CCOMMON_OPT += -DNO_AVX512 +endif + ifdef SMP CCOMMON_OPT += -DSMP_SERVER From 38ad05bd0484ea723a42415f986cf0db24e01ca8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 5 Jun 2018 10:26:49 +0200 Subject: [PATCH 28/86] Extend loop range to find SkylakeX in force_coretype --- driver/others/dynamic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 96612cc52f..acb2d8b8c2 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -506,7 +506,7 @@ static gotoblas_t *force_coretype(char *coretype){ char message[128]; //char mname[20]; - for ( i=1 ; i <= 23; i++) + for ( i=1 ; i <= 24; i++) { if (!strncasecmp(coretype,corename[i],20)) { From 354a976a59f1280c5403b8de37587baf53527b67 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 5 Jun 2018 10:31:34 +0200 Subject: [PATCH 29/86] Fix inverted condition in _Atomic declaration fixes #1593 --- common.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/common.h b/common.h index 123e3dee78..980099ee33 100644 --- a/common.h +++ b/common.h @@ -642,6 +642,7 @@ void gotoblas_profile_init(void); void gotoblas_profile_quit(void); #ifdef USE_OPENMP + #ifndef C_MSVC int omp_in_parallel(void); int omp_get_num_procs(void); @@ -649,12 +650,15 @@ int omp_get_num_procs(void); __declspec(dllimport) int __cdecl omp_in_parallel(void); __declspec(dllimport) int __cdecl omp_get_num_procs(void); #endif + #if (__STDC_VERSION__ >= 201112L) +#include +#else #ifndef 
_Atomic #define _Atomic volatile #endif -#include #endif + #else #ifdef __ELF__ int omp_in_parallel (void) __attribute__ ((weak)); From 15a78d6b662569a464de9a00517897b036fe7886 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 5 Jun 2018 15:58:34 +0200 Subject: [PATCH 30/86] export NO_AVX512 setting --- Makefile.system | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile.system b/Makefile.system index 8c875d6f78..eaf3e98891 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1249,6 +1249,7 @@ export MSA_FLAGS export KERNELDIR export FUNCTION_PROFILE export TARGET_CORE +export NO_AVX512 export SGEMM_UNROLL_M export SGEMM_UNROLL_N From e8002536ec90b74148abce1c3de9bca0061dbe32 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 5 Jun 2018 18:23:01 +0200 Subject: [PATCH 31/86] disable quiet_make for the moment --- Makefile.system | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile.system b/Makefile.system index eaf3e98891..5c16e2bee7 100644 --- a/Makefile.system +++ b/Makefile.system @@ -155,9 +155,9 @@ ifeq ($(DEBUG), 1) GETARCH_FLAGS += -g endif -ifeq ($(QUIET_MAKE), 1) -MAKE += -s -endif +#ifeq ($(QUIET_MAKE), 1) +#MAKE += -s +#endif ifndef NO_PARALLEL_MAKE NO_PARALLEL_MAKE=0 From f6021c798dea23685af3eedcb63c4a388c78f226 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 5 Jun 2018 19:09:38 +0200 Subject: [PATCH 32/86] Re-enable QUIET_MAKE --- Makefile.system | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile.system b/Makefile.system index 5c16e2bee7..eaf3e98891 100644 --- a/Makefile.system +++ b/Makefile.system @@ -155,9 +155,9 @@ ifeq ($(DEBUG), 1) GETARCH_FLAGS += -g endif -#ifeq ($(QUIET_MAKE), 1) -#MAKE += -s -#endif +ifeq ($(QUIET_MAKE), 1) +MAKE += -s +endif ifndef NO_PARALLEL_MAKE NO_PARALLEL_MAKE=0 From 7fb62aed7e2a08fb8fc62054a164d3479511ce82 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 5 Jun 2018 23:29:33 +0200 Subject: [PATCH 33/86] Check build system 
support for AVX512 instructions --- cmake/system_check.cmake | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index d47c38cdd7..f054852bf6 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -66,3 +66,12 @@ else() set(BINARY32 1) endif() +if (X86_64 OR X86) + file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "void main(void){ __asm__ volatile(\"vaddps %zmm1, %zmm0, %zmm0\"); }") +execute_process(COMMAND ${CMAKE_C_COMPILER} -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp RESULT_VARIABLE NO_AVX512) +if (NO_AVX512 EQUAL 1) +set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") +endif() + file(REMOVE "avx512.tmp" "avx512.o") +endif() + From 06d43760e4ca2cc7007e54d88938eff9e95e0579 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 6 Jun 2018 09:18:10 +0200 Subject: [PATCH 34/86] Restore _Atomic define before stdatomic.h for old gcc see #1593 --- common.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/common.h b/common.h index 123e3dee78..ecf07316d2 100644 --- a/common.h +++ b/common.h @@ -649,12 +649,21 @@ int omp_get_num_procs(void); __declspec(dllimport) int __cdecl omp_in_parallel(void); __declspec(dllimport) int __cdecl omp_get_num_procs(void); #endif + #if (__STDC_VERSION__ >= 201112L) +#if defined(C_GCC) && ( __GNUC__ < 7) +// workaround for GCC bug 65467 #ifndef _Atomic #define _Atomic volatile #endif +#endif #include +#else +#ifndef _Atomic +#define _Atomic volatile #endif + + #else #ifdef __ELF__ int omp_in_parallel (void) __attribute__ ((weak)); From 83da278093e32f1e089a12d880c7ec65dfbb1457 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 6 Jun 2018 09:27:49 +0200 Subject: [PATCH 35/86] Update common.h --- common.h | 1 + 1 file changed, 1 insertion(+) diff --git a/common.h b/common.h index cd1c4c0d1a..663f37e7b3 100644 --- a/common.h +++ b/common.h @@ -663,6 +663,7 @@ __declspec(dllimport) int __cdecl omp_get_num_procs(void); #ifndef 
_Atomic #define _Atomic volatile #endif +#endif #else #ifdef __ELF__ From 9b87b642624b398ebacee525edbc879cf3f950ea Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 6 Jun 2018 16:49:00 +0200 Subject: [PATCH 36/86] Improve AVX512 testcase clang 3.4 managed to accept the original test code, only to fail on the actual Skylake asm later --- c_check | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/c_check b/c_check index dfe99350aa..cc64c16c62 100644 --- a/c_check +++ b/c_check @@ -203,8 +203,8 @@ $binformat = bin64 if ($data =~ /BINARY_64/); $no_avx512= 0; if (($architecture eq "x86") || ($architecture eq "x86_64")) { - $code = '"vaddps %zmm1, %zmm0, %zmm0"'; - print $tmpf "void main(void){ __asm__ volatile($code); }\n"; + $code = '"vbroadcastss -4 * 4(%rsi), %zmm2"'; + print $tmpf "int main(void){ __asm__ volatile($code); }\n"; $args = " -o $tmpf.o -x c $tmpf"; my @cmd = ("$compiler_name $args"); system(@cmd) == 0; From e4718b1fee0f8dcd0c892063d619477bd5ed31ce Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 6 Jun 2018 16:51:30 +0200 Subject: [PATCH 37/86] Better AVX512 test case --- cmake/system_check.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index f054852bf6..a565fc0d54 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -67,7 +67,7 @@ else() endif() if (X86_64 OR X86) - file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "void main(void){ __asm__ volatile(\"vaddps %zmm1, %zmm0, %zmm0\"); }") + file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "int main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }") execute_process(COMMAND ${CMAKE_C_COMPILER} -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp RESULT_VARIABLE NO_AVX512) if (NO_AVX512 EQUAL 1) set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") From ed7c4a043b3093dfe8ddb3d6d3e3d6fd6af43d4a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 7 Jun 2018 
10:18:26 +0200 Subject: [PATCH 38/86] Use usleep instead of sched_yield by default sched_yield only burns cpu cycles, fixes #900, see also #923, #1560 --- common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common.h b/common.h index 663f37e7b3..b7181e6702 100644 --- a/common.h +++ b/common.h @@ -356,7 +356,7 @@ typedef int blasint; */ #ifndef YIELDING -#define YIELDING sched_yield() +#define YIELDING usleep(10) #endif /*** From e8880c1699816483090aa5574cf9b3322943831f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 7 Jun 2018 10:26:55 +0200 Subject: [PATCH 39/86] Use a single thread for small input size copies daxpy improvement from #27, see #1560 --- interface/zaxpy.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/interface/zaxpy.c b/interface/zaxpy.c index fbb830ffba..529e78e79a 100644 --- a/interface/zaxpy.c +++ b/interface/zaxpy.c @@ -41,7 +41,11 @@ #ifdef FUNCTION_PROFILE #include "functable.h" #endif - +#if defined(Z13) +#define MULTI_THREAD_MINIMAL 200000 +#else +#define MULTI_THREAD_MINIMAL 10000 +#endif #ifndef CBLAS void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ @@ -69,7 +73,7 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in #endif #ifndef CBLAS - PRINT_DEBUG_CNAME; + PRINT_DEBUG_NAME; #else PRINT_DEBUG_CNAME; #endif @@ -93,6 +97,11 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in if (incx == 0 || incy == 0) nthreads = 1; + //Work around the low performance issue with small imput size & + //multithreads. + if (n <= MULTI_THREAD_MINIMAL) { + nthreads = 1; + } if (nthreads == 1) { #endif From 66316b9f4c8c7c48eed8b29e86f64581c02d45b0 Mon Sep 17 00:00:00 2001 From: Craig Donner Date: Thu, 7 Jun 2018 14:54:42 +0100 Subject: [PATCH 40/86] Improve performance of GEMM for small matrices when SMP is defined. 
Always checking num_cpu_avail() regardless of whether threading will actually be used adds noticeable overhead for small matrices. Most other uses of num_cpu_avail() do so only if threading will be used, so do the same here. --- interface/gemm.c | 27 ++++++--------------------- interface/trsm.c | 3 ++- 2 files changed, 8 insertions(+), 22 deletions(-) diff --git a/interface/gemm.c b/interface/gemm.c index 8baf3fbec1..a3bac59849 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -44,6 +44,7 @@ #endif #ifndef COMPLEX +#define SMP_THRESHOLD_MIN 65536.0 #ifdef XDOUBLE #define ERROR_NAME "QGEMM " #elif defined(DOUBLE) @@ -52,6 +53,7 @@ #define ERROR_NAME "SGEMM " #endif #else +#define SMP_THRESHOLD_MIN 8192.0 #ifndef GEMM3M #ifdef XDOUBLE #define ERROR_NAME "XGEMM " @@ -121,8 +123,6 @@ void NAME(char *TRANSA, char *TRANSB, FLOAT *sa, *sb; #ifdef SMP - int nthreads_max; - int nthreads_avail; double MNK; #ifndef COMPLEX #ifdef XDOUBLE @@ -245,8 +245,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS XFLOAT *sa, *sb; #ifdef SMP - int nthreads_max; - int nthreads_avail; double MNK; #ifndef COMPLEX #ifdef XDOUBLE @@ -411,25 +409,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS mode |= (transa << BLAS_TRANSA_SHIFT); mode |= (transb << BLAS_TRANSB_SHIFT); - nthreads_max = num_cpu_avail(3); - nthreads_avail = nthreads_max; - -#ifndef COMPLEX MNK = (double) args.m * (double) args.n * (double) args.k; - if ( MNK <= (65536.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) - nthreads_max = 1; -#else - MNK = (double) args.m * (double) args.n * (double) args.k; - if ( MNK <= (8192.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) - nthreads_max = 1; -#endif - args.common = NULL; - - if ( nthreads_max > nthreads_avail ) - args.nthreads = nthreads_avail; + if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) ) + args.nthreads = 1; else - args.nthreads = nthreads_max; - + args.nthreads = num_cpu_avail(3); + 
args.common = NULL; if (args.nthreads == 1) { #endif diff --git a/interface/trsm.c b/interface/trsm.c index 60c49795d4..5c2750e791 100644 --- a/interface/trsm.c +++ b/interface/trsm.c @@ -366,12 +366,13 @@ void CNAME(enum CBLAS_ORDER order, mode |= (trans << BLAS_TRANSA_SHIFT); mode |= (side << BLAS_RSIDE_SHIFT); - args.nthreads = num_cpu_avail(3); if ( args.m < 2*GEMM_MULTITHREAD_THRESHOLD ) args.nthreads = 1; else if ( args.n < 2*GEMM_MULTITHREAD_THRESHOLD ) args.nthreads = 1; + else + args.nthreads = num_cpu_avail(3); if (args.nthreads == 1) { From 6c2d90ba7724b05e7fb97c7ec33324499e4a1a79 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 9 Jun 2018 16:29:17 +0200 Subject: [PATCH 41/86] Move some DYNAMIC_ARCH targets to new DYNAMIC_OLDER option --- CMakeLists.txt | 1 + Makefile | 3 +++ Makefile.install | 2 +- Makefile.rule | 5 +++++ Makefile.system | 17 ++++++++++++++++- 5 files changed, 26 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f49f205137..66c3d8afa6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,6 +20,7 @@ option(BUILD_WITHOUT_LAPACK "Without LAPACK and LAPACKE (Only BLAS or CBLAS)" ON endif() option(BUILD_WITHOUT_CBLAS "Without CBLAS" OFF) option(DYNAMIC_ARCH "Build with DYNAMIC_ARCH" OFF) +option(DYNAMIC_OLDER "Support older cpus with DYNAMIC_ARCH" OFF) option(BUILD_RELAPACK "Build with ReLAPACK (recursive LAPACK" OFF) ####### if(BUILD_WITHOUT_LAPACK) diff --git a/Makefile b/Makefile index 380ba1ce8f..56b4426f8a 100644 --- a/Makefile +++ b/Makefile @@ -153,6 +153,9 @@ ifeq ($(DYNAMIC_ARCH), 1) do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ done @echo DYNAMIC_ARCH=1 >> Makefile.conf_last +ifeq ($(DYNAMIC_OLDER), 1) + @echo DYNAMIC_OLDER=1 >> Makefile.conf_last +endif endif ifdef USE_THREAD @echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last diff --git a/Makefile.install b/Makefile.install index 21c3c9e227..c51c8a021f 100644 --- a/Makefile.install +++ b/Makefile.install @@ 
-98,7 +98,7 @@ endif @echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" - @echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" + @echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" diff --git a/Makefile.rule b/Makefile.rule index 1b4b8eb637..5c03d01959 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -17,6 +17,11 @@ VERSION = 0.3.1.dev # If you want to support multiple architecture in one binary # DYNAMIC_ARCH = 1 +# If you want the full list of x86_64 architectures supported in DYNAMIC_ARCH +# mode (including individual optimizied codes for PENRYN, DUNNINGTON, OPTERON, +# OPTERON_SSE3, ATOM and NANO rather than fallbacks to older architectures) +# DYNAMIC_OLDER = 1 + # C compiler including binary type(32bit / 64bit). Default is gcc. # Don't use Intel Compiler or PGI, it won't generate right codes as I expect. 
# CC = gcc diff --git a/Makefile.system b/Makefile.system index eaf3e98891..62ba0e4667 100644 --- a/Makefile.system +++ b/Makefile.system @@ -472,7 +472,18 @@ DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ endif ifeq ($(ARCH), x86_64) -DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO +DYNAMIC_CORE = PRESCOTT CORE2 +ifeq ($(DYNAMIC_OLDER), 1) +DYNAMIC_CORE += PENRYN DUNNINGTON +endif +DYNAMIC_CORE += NEHALEM +ifeq ($(DYNAMIC_OLDER), 1) +DYNAMIC_CORE += OPTERON OPTERON_SSE3 +endif +DYNAMIC_CORE += BARCELONA +ifeq ($(DYNAMIC_OLDER), 1) +DYNAMIC_CORE += BOBCAT ATOM NANO +endif ifneq ($(NO_AVX), 1) DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR endif @@ -917,6 +928,10 @@ ifeq ($(DYNAMIC_ARCH), 1) CCOMMON_OPT += -DDYNAMIC_ARCH endif +ifeq ($(DYNAMIC_OLDER), 1) +CCOMMON_OPT += -DDYNAMIC_OLDER +endif + ifeq ($(NO_LAPACK), 1) CCOMMON_OPT += -DNO_LAPACK #Disable LAPACK C interface From 1cbd8f3ae47ffb89523fa247e81ffea07c6505a4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 9 Jun 2018 16:30:46 +0200 Subject: [PATCH 42/86] Move some DYNAMIC_ARCH targets to new DYNAMIC_OLDER option --- cmake/arch.cmake | 13 ++++++++++++- cmake/openblas.pc.in | 2 +- cmake/system.cmake | 3 +++ 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 527d2bec6e..52fb64eaae 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -49,7 +49,18 @@ if (DYNAMIC_ARCH) endif () if (X86_64) - set(DYNAMIC_CORE PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO) + set(DYNAMIC_CORE PRESCOTT CORE2) + if (DYNAMIC_OLDER) + set (DYNAMIC_CORE ${DYNAMIC_CORE} PENRYN DUNNINGTON) + endif () + set (DYNAMIC_CORE ${DYNAMIC_CORE} NEHALEM) + if (DYNAMIC_OLDER) + set (DYNAMIC_CORE ${DYNAMIC_CORE} OPTERON OPTERON_SSE3) + endif () + set (DYNAMIC_CORE ${DYNAMIC_CORE} BARCELONA) + if (DYNAMIC_OLDER) + set (DYNAMIC_CORE ${DYNAMIC_CORE} BOBCAT 
ATOM NANO) + endif () if (NOT NO_AVX) set(DYNAMIC_CORE ${DYNAMIC_CORE} SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR) endif () diff --git a/cmake/openblas.pc.in b/cmake/openblas.pc.in index 35973b09bd..ca88a6d5fc 100644 --- a/cmake/openblas.pc.in +++ b/cmake/openblas.pc.in @@ -1,7 +1,7 @@ libdir=@CMAKE_INSTALL_FULL_LIBDIR@ includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ -openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@ +openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@ Name: OpenBLAS Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version Version: @OPENBLAS_VERSION@ diff --git a/cmake/system.cmake b/cmake/system.cmake index c21fe7c142..48e8f75bcc 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -163,6 +163,9 @@ endif () if (DYNAMIC_ARCH) set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") + if (DYNAMIC_OLDER) + set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER") + endif () endif () if (NO_LAPACK) From 63f7395fb49091295463785f6c1056f61dd64a7d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 9 Jun 2018 16:31:38 +0200 Subject: [PATCH 43/86] Move some DYNAMIC_ARCH targets to new DYNAMIC_OLDER option --- driver/others/dynamic.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index acb2d8b8c2..4271c0a0d7 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -56,16 +56,27 @@ EXTERN gotoblas_t gotoblas_BANIAS; EXTERN gotoblas_t gotoblas_ATHLON; extern gotoblas_t gotoblas_PRESCOTT; +extern gotoblas_t gotoblas_CORE2; +extern gotoblas_t gotoblas_NEHALEM; 
+extern gotoblas_t gotoblas_BARCELONA; +#ifdef DYNAMIC_OLDER extern gotoblas_t gotoblas_ATOM; extern gotoblas_t gotoblas_NANO; -extern gotoblas_t gotoblas_CORE2; extern gotoblas_t gotoblas_PENRYN; extern gotoblas_t gotoblas_DUNNINGTON; -extern gotoblas_t gotoblas_NEHALEM; extern gotoblas_t gotoblas_OPTERON; extern gotoblas_t gotoblas_OPTERON_SSE3; -extern gotoblas_t gotoblas_BARCELONA; extern gotoblas_t gotoblas_BOBCAT; +#else +#define gotoblas_ATOM gotoblas_NEHALEM +#define gotoblas_NANO gotoblas_NEHALEM +#define gotoblas_PENRYN gotoblas_CORE2 +#define gotoblas_DUNNINGTON gotoblas_CORE2 +#define gotoblas_OPTERON gotoblas_CORE2 +#define gotoblas_OPTERON_SSE3 gotoblas_CORE2 +#define gotoblas_BOBCAT gotoblas_CORE2 +#endif + #ifndef NO_AVX extern gotoblas_t gotoblas_SANDYBRIDGE; extern gotoblas_t gotoblas_BULLDOZER; From e9cd11768c20707eff31912db1bafc837c0224d2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 9 Jun 2018 17:54:36 +0200 Subject: [PATCH 44/86] Enable parallel make on MS Windows by default fixes #874 --- getarch.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/getarch.c b/getarch.c index fcffe63e22..31f41d62cf 100644 --- a/getarch.c +++ b/getarch.c @@ -1196,9 +1196,7 @@ int main(int argc, char *argv[]){ #elif NO_PARALLEL_MAKE==1 printf("MAKE += -j 1\n"); #else -#ifndef OS_WINDOWS printf("MAKE += -j %d\n", get_num_cores()); -#endif #endif break; From 0bea6bb9e7e2468bc9d42f5ffdf27f772f2984af Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 10 Jun 2018 09:24:37 +0200 Subject: [PATCH 45/86] Create OpenBLASConfig.cmake from cmake as well --- CMakeLists.txt | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index f49f205137..e1c3089106 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,6 +20,7 @@ option(BUILD_WITHOUT_LAPACK "Without LAPACK and LAPACKE (Only BLAS or CBLAS)" ON endif() option(BUILD_WITHOUT_CBLAS "Without CBLAS" OFF) option(DYNAMIC_ARCH "Build with DYNAMIC_ARCH" OFF) 
+option(DYNAMIC_OLDER "Support older cpus with DYNAMIC_ARCH" OFF) option(BUILD_RELAPACK "Build with ReLAPACK (recursive LAPACK" OFF) ####### if(BUILD_WITHOUT_LAPACK) @@ -208,6 +209,7 @@ set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES # Install libraries install(TARGETS ${OpenBLAS_LIBNAME} + EXPORT "OpenBLASTargets" RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) @@ -267,3 +269,21 @@ if(PKG_CONFIG_FOUND) configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas.pc @ONLY) install (FILES ${PROJECT_BINARY_DIR}/openblas.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/) endif() + + +# GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share". +set(PN OpenBLAS) +set(CMAKECONFIG_INSTALL_DIR "share/cmake/${PN}") +configure_package_config_file(cmake/${PN}Config.cmake.in + "${CMAKE_CURRENT_BINARY_DIR}/${PN}Config.cmake" + INSTALL_DESTINATION ${CMAKECONFIG_INSTALL_DIR}) +write_basic_package_version_file(${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake + VERSION ${${PN}_VERSION} + COMPATIBILITY AnyNewerVersion) +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}Config.cmake + ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake + DESTINATION ${CMAKECONFIG_INSTALL_DIR}) +install(EXPORT "${PN}Targets" + NAMESPACE "${PN}::" + DESTINATION ${CMAKECONFIG_INSTALL_DIR}) + From 02634b549b678dc38c85ce4c77ebb532e8d9e471 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 10 Jun 2018 09:25:46 +0200 Subject: [PATCH 46/86] Add template for OpenBLASConfig.cmake --- cmake/OpenBLASConfig.cmake.in | 79 +++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 cmake/OpenBLASConfig.cmake.in diff --git a/cmake/OpenBLASConfig.cmake.in b/cmake/OpenBLASConfig.cmake.in new file mode 100644 index 0000000000..87a1621b4a --- /dev/null +++ b/cmake/OpenBLASConfig.cmake.in @@ -0,0 +1,79 @@ +# OpenBLASConfig.cmake +# -------------------- +# 
+# OpenBLAS cmake module. +# This module sets the following variables in your project:: +# +# OpenBLAS_FOUND - true if OpenBLAS and all required components found on the system +# OpenBLAS_VERSION - OpenBLAS version in format Major.Minor.Release +# OpenBLAS_INCLUDE_DIRS - Directory where OpenBLAS header is located. +# OpenBLAS_INCLUDE_DIR - same as DIRS +# OpenBLAS_LIBRARIES - OpenBLAS library to link against. +# OpenBLAS_LIBRARY - same as LIBRARIES +# +# +# Available components:: +# +## shared - search for only shared library +## static - search for only static library +# serial - search for unthreaded library +# pthread - search for native pthread threaded library +# openmp - search for OpenMP threaded library +# +# +# Exported targets:: +# +# If OpenBLAS is found, this module defines the following :prop_tgt:`IMPORTED` +## target. Target is shared _or_ static, so, for both, use separate, not +## overlapping, installations. :: +# +# OpenBLAS::OpenBLAS - the main OpenBLAS library #with header & defs attached. 
+# +# +# Suggested usage:: +# +# find_package(OpenBLAS) +# find_package(OpenBLAS 0.2.20 EXACT CONFIG REQUIRED COMPONENTS pthread) +# +# +# The following variables can be set to guide the search for this package:: +# +# OpenBLAS_DIR - CMake variable, set to directory containing this Config file +# CMAKE_PREFIX_PATH - CMake variable, set to root directory of this package +# PATH - environment variable, set to bin directory of this package +# CMAKE_DISABLE_FIND_PACKAGE_OpenBLAS - CMake variable, disables +# find_package(OpenBLAS) when not REQUIRED, perhaps to force internal build + +@PACKAGE_INIT@ + +set(PN OpenBLAS) + +# need to check that the @USE_*@ evaluate to something cmake can perform boolean logic upon +if(@USE_OPENMP@) + set(${PN}_openmp_FOUND 1) +elseif(@USE_THREAD@) + set(${PN}_pthread_FOUND 1) +else() + set(${PN}_serial_FOUND 1) +endif() + +check_required_components(${PN}) + +#----------------------------------------------------------------------------- +# Don't include targets if this file is being picked up by another +# project which has already built this as a subproject +#----------------------------------------------------------------------------- +if(NOT TARGET ${PN}::OpenBLAS) + include("${CMAKE_CURRENT_LIST_DIR}/${PN}Targets.cmake") + + get_property(_loc TARGET ${PN}::OpenBLAS PROPERTY LOCATION) + set(${PN}_LIBRARY ${_loc}) + get_property(_ill TARGET ${PN}::OpenBLAS PROPERTY INTERFACE_LINK_LIBRARIES) + set(${PN}_LIBRARIES ${_ill}) + + get_property(_id TARGET ${PN}::OpenBLAS PROPERTY INCLUDE_DIRECTORIES) + set(${PN}_INCLUDE_DIR ${_id}) + get_property(_iid TARGET ${PN}::OpenBLAS PROPERTY INTERFACE_INCLUDE_DIRECTORIES) + set(${PN}_INCLUDE_DIRS ${_iid}) +endif() + From e65f451409e2150bf299a2cdd906bec4ffff7915 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 10 Jun 2018 15:09:43 +0200 Subject: [PATCH 47/86] include CMakePackageConfigHelpers --- CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt 
index e1c3089106..a2421ac54b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,6 +12,9 @@ set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${Open # Adhere to GNU filesystem layout conventions include(GNUInstallDirs) +include(CMakePackageConfigHelpers) + + set(OpenBLAS_LIBNAME openblas) ####### From c2545b0fd6978e1fb09c2dc86b825846e0034228 Mon Sep 17 00:00:00 2001 From: Craig Donner Date: Mon, 11 Jun 2018 10:13:09 +0100 Subject: [PATCH 48/86] Fixed a few more unnecessary calls to num_cpu_avail. I don't have as many benchmarks for these as for gemm, but it should still make a difference for small matrices. --- interface/axpy.c | 14 ++++++-------- interface/scal.c | 5 +++-- interface/zaxpy.c | 14 ++++++-------- interface/zscal.c | 4 ++-- interface/zswap.c | 4 ++-- kernel/arm64/casum_thunderx2t99.c | 9 +++------ kernel/arm64/copy_thunderx2t99.c | 9 +++------ kernel/arm64/dasum_thunderx2t99.c | 9 +++------ kernel/arm64/dot_thunderx2t99.c | 11 ++++------- kernel/arm64/dznrm2_thunderx2t99.c | 4 ++-- kernel/arm64/dznrm2_thunderx2t99_fast.c | 4 ++-- kernel/arm64/iamax_thunderx2t99.c | 9 +++------ kernel/arm64/izamax_thunderx2t99.c | 9 +++------ kernel/arm64/sasum_thunderx2t99.c | 9 +++------ kernel/arm64/scnrm2_thunderx2t99.c | 4 ++-- kernel/arm64/zasum_thunderx2t99.c | 9 +++------ kernel/arm64/zdot_thunderx2t99.c | 9 +++------ kernel/x86_64/ddot.c | 15 ++++++--------- 18 files changed, 59 insertions(+), 92 deletions(-) diff --git a/interface/axpy.c b/interface/axpy.c index f0d95b395d..39edea6af2 100644 --- a/interface/axpy.c +++ b/interface/axpy.c @@ -40,11 +40,11 @@ #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" -#endif +#endif #if defined(Z13) #define MULTI_THREAD_MINIMAL 200000 #else -#define MULTI_THREAD_MINIMAL 10000 +#define MULTI_THREAD_MINIMAL 10000 #endif #ifndef CBLAS @@ -83,17 +83,15 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc if (incy < 0) y -= (n - 1) * incy; #ifdef SMP 
- nthreads = num_cpu_avail(1); - //disable multi-thread when incx==0 or incy==0 //In that case, the threads would be dependent. - if (incx == 0 || incy == 0) - nthreads = 1; - + // //Temporarily work-around the low performance issue with small imput size & //multithreads. - if (n <= MULTI_THREAD_MINIMAL) + if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { #endif diff --git a/interface/scal.c b/interface/scal.c index 3f468a2a33..6d07b16502 100644 --- a/interface/scal.c +++ b/interface/scal.c @@ -76,10 +76,11 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx){ #ifdef SMP - nthreads = num_cpu_avail(1); - if (n <= 1048576 ) nthreads = 1; + else + nthreads = num_cpu_avail(1); + if (nthreads == 1) { #endif diff --git a/interface/zaxpy.c b/interface/zaxpy.c index 529e78e79a..1a0259c969 100644 --- a/interface/zaxpy.c +++ b/interface/zaxpy.c @@ -90,18 +90,16 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in if (incy < 0) y -= (n - 1) * incy * 2; #ifdef SMP - nthreads = num_cpu_avail(1); - //disable multi-thread when incx==0 or incy==0 //In that case, the threads would be dependent. - if (incx == 0 || incy == 0) - nthreads = 1; - - //Work around the low performance issue with small imput size & + // + //Temporarily work-around the low performance issue with small imput size & //multithreads. 
- if (n <= MULTI_THREAD_MINIMAL) { + if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL) nthreads = 1; - } + else + nthreads = num_cpu_avail(1); + if (nthreads == 1) { #endif diff --git a/interface/zscal.c b/interface/zscal.c index 633b6ecf5d..bfaddc2608 100644 --- a/interface/zscal.c +++ b/interface/zscal.c @@ -90,10 +90,10 @@ void CNAME(blasint n, FLOAT alpha_r, void *vx, blasint incx){ FUNCTION_PROFILE_START(); #ifdef SMP - nthreads = num_cpu_avail(1); - if ( n <= 1048576 ) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { #endif diff --git a/interface/zswap.c b/interface/zswap.c index 5308cbe908..e33bbafbac 100644 --- a/interface/zswap.c +++ b/interface/zswap.c @@ -79,12 +79,12 @@ FLOAT *y = (FLOAT*)vy; if (incy < 0) y -= (n - 1) * incy * 2; #ifdef SMP - nthreads = num_cpu_avail(1); - //disable multi-thread when incx==0 or incy==0 //In that case, the threads would be dependent. if (incx == 0 || incy == 0) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { #endif diff --git a/kernel/arm64/casum_thunderx2t99.c b/kernel/arm64/casum_thunderx2t99.c index cd5d936c5b..c6dbb3f772 100644 --- a/kernel/arm64/casum_thunderx2t99.c +++ b/kernel/arm64/casum_thunderx2t99.c @@ -233,13 +233,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT asum = 0.0; #if defined(SMP) - nthreads = num_cpu_avail(1); - - if (inc_x == 0) - nthreads = 1; - - if (n <= 10000) + if (inc_x == 0 || n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { asum = casum_compute(n, x, inc_x); diff --git a/kernel/arm64/copy_thunderx2t99.c b/kernel/arm64/copy_thunderx2t99.c index bd67b48b05..e318761391 100644 --- a/kernel/arm64/copy_thunderx2t99.c +++ b/kernel/arm64/copy_thunderx2t99.c @@ -183,13 +183,10 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if (n <= 0) return 0; #if defined(SMP) - nthreads = num_cpu_avail(1); - - if (inc_x == 0) - nthreads = 1; - - if (n <= 10000) + if (inc_x == 0 
|| n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { do_copy(n, x, inc_x, y, inc_y); diff --git a/kernel/arm64/dasum_thunderx2t99.c b/kernel/arm64/dasum_thunderx2t99.c index ba12fc7763..a212c9534b 100644 --- a/kernel/arm64/dasum_thunderx2t99.c +++ b/kernel/arm64/dasum_thunderx2t99.c @@ -228,13 +228,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT asum = 0.0; #if defined(SMP) - nthreads = num_cpu_avail(1); - - if (inc_x == 0) - nthreads = 1; - - if (n <= 10000) + if (inc_x == 0 || n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { asum = dasum_compute(n, x, inc_x); diff --git a/kernel/arm64/dot_thunderx2t99.c b/kernel/arm64/dot_thunderx2t99.c index 8eeb94f360..3940acdddc 100644 --- a/kernel/arm64/dot_thunderx2t99.c +++ b/kernel/arm64/dot_thunderx2t99.c @@ -199,7 +199,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. " faddp "DOTF", v0.2d \n" #endif /* !defined(DSDOT) */ -#else /* !defined(DOUBLE) */ +#else /* !defined(DOUBLE) */ #define KERNEL_F1 \ " ldr "TMPX", ["X"] \n" \ " ldr "TMPY", ["Y"] \n" \ @@ -384,13 +384,10 @@ RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y RETURN_TYPE dot = 0.0; #if defined(SMP) - nthreads = num_cpu_avail(1); - - if (inc_x == 0 || inc_y == 0) - nthreads = 1; - - if (n <= 10000) + if (inc_x == 0 || inc_y == 0 || n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { dot = dot_compute(n, x, inc_x, y, inc_y); diff --git a/kernel/arm64/dznrm2_thunderx2t99.c b/kernel/arm64/dznrm2_thunderx2t99.c index 2aea9b4a9a..b94f0cffcd 100644 --- a/kernel/arm64/dznrm2_thunderx2t99.c +++ b/kernel/arm64/dznrm2_thunderx2t99.c @@ -328,10 +328,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return 0.0; #if defined(SMP) - nthreads = num_cpu_avail(1); - if (n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { nrm2_compute(n, x, 
inc_x, &ssq, &scale); diff --git a/kernel/arm64/dznrm2_thunderx2t99_fast.c b/kernel/arm64/dznrm2_thunderx2t99_fast.c index 8b04a3eb6e..8405b388bc 100644 --- a/kernel/arm64/dznrm2_thunderx2t99_fast.c +++ b/kernel/arm64/dznrm2_thunderx2t99_fast.c @@ -235,10 +235,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return 0.0; #if defined(SMP) - nthreads = num_cpu_avail(1); - if (n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { nrm2 = nrm2_compute(n, x, inc_x); diff --git a/kernel/arm64/iamax_thunderx2t99.c b/kernel/arm64/iamax_thunderx2t99.c index a11b184192..e3bec4a204 100644 --- a/kernel/arm64/iamax_thunderx2t99.c +++ b/kernel/arm64/iamax_thunderx2t99.c @@ -321,13 +321,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG max_index = 0; #if defined(SMP) - nthreads = num_cpu_avail(1); - - if (inc_x == 0) - nthreads = 1; - - if (n <= 10000) + if (inc_x == 0 || n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { max_index = iamax_compute(n, x, inc_x); diff --git a/kernel/arm64/izamax_thunderx2t99.c b/kernel/arm64/izamax_thunderx2t99.c index 8d70b0515e..b2e2828f0f 100644 --- a/kernel/arm64/izamax_thunderx2t99.c +++ b/kernel/arm64/izamax_thunderx2t99.c @@ -330,13 +330,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG max_index = 0; #if defined(SMP) - nthreads = num_cpu_avail(1); - - if (inc_x == 0) - nthreads = 1; - - if (n <= 10000) + if (inc_x == 0 || n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { max_index = izamax_compute(n, x, inc_x); diff --git a/kernel/arm64/sasum_thunderx2t99.c b/kernel/arm64/sasum_thunderx2t99.c index 28fc34c625..014c667bac 100644 --- a/kernel/arm64/sasum_thunderx2t99.c +++ b/kernel/arm64/sasum_thunderx2t99.c @@ -230,13 +230,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT asum = 0.0; #if defined(SMP) - nthreads = num_cpu_avail(1); - - if (inc_x == 0) - nthreads = 1; 
- - if (n <= 10000) + if (inc_x == 0 || n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { asum = sasum_compute(n, x, inc_x); diff --git a/kernel/arm64/scnrm2_thunderx2t99.c b/kernel/arm64/scnrm2_thunderx2t99.c index b8df4962bd..f96de441ec 100644 --- a/kernel/arm64/scnrm2_thunderx2t99.c +++ b/kernel/arm64/scnrm2_thunderx2t99.c @@ -318,10 +318,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return 0.0; #if defined(SMP) - nthreads = num_cpu_avail(1); - if (n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { nrm2_double = nrm2_compute(n, x, inc_x); diff --git a/kernel/arm64/zasum_thunderx2t99.c b/kernel/arm64/zasum_thunderx2t99.c index 140e5a7410..1d303a9a30 100644 --- a/kernel/arm64/zasum_thunderx2t99.c +++ b/kernel/arm64/zasum_thunderx2t99.c @@ -230,13 +230,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT asum = 0.0; #if defined(SMP) - nthreads = num_cpu_avail(1); - - if (inc_x == 0) - nthreads = 1; - - if (n <= 10000) + if (inc_x == 0 || n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { asum = zasum_compute(n, x, inc_x); diff --git a/kernel/arm64/zdot_thunderx2t99.c b/kernel/arm64/zdot_thunderx2t99.c index 70d6830773..6185bc7d99 100644 --- a/kernel/arm64/zdot_thunderx2t99.c +++ b/kernel/arm64/zdot_thunderx2t99.c @@ -317,13 +317,10 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA CIMAG(zdot) = 0.0; #if defined(SMP) - nthreads = num_cpu_avail(1); - - if (inc_x == 0 || inc_y == 0) - nthreads = 1; - - if (n <= 10000) + if (inc_x == 0 || inc_y == 0 || n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { zdot_compute(n, x, inc_x, y, inc_y, &zdot); diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c index 0595490288..0dc9cd3da3 100644 --- a/kernel/x86_64/ddot.c +++ b/kernel/x86_64/ddot.c @@ -29,13 +29,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) +#if defined(BULLDOZER) #include "ddot_microk_bulldozer-2.c" #elif defined(STEAMROLLER) || defined(EXCAVATOR) #include "ddot_microk_steamroller-2.c" #elif defined(PILEDRIVER) #include "ddot_microk_piledriver-2.c" -#elif defined(NEHALEM) +#elif defined(NEHALEM) #include "ddot_microk_nehalem-2.c" #elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) #include "ddot_microk_haswell-2.c" @@ -110,7 +110,7 @@ static FLOAT dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLON FLOAT temp1 = 0.0; FLOAT temp2 = 0.0; - BLASLONG n1 = n & -4; + BLASLONG n1 = n & -4; while(i < n1) { @@ -169,13 +169,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) FLOAT dot = 0.0; #if defined(SMP) - nthreads = num_cpu_avail(1); - - if (inc_x == 0 || inc_y == 0) - nthreads = 1; - - if (n <= 10000) + if (inc_x == 0 || inc_y == 0 || n <= 10000) nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { dot = dot_compute(n, x, inc_x, y, inc_y); From 6f71c0fce45c86c55d12b6e12e69b9ccb8ec2f28 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 11 Jun 2018 13:26:19 +0200 Subject: [PATCH 49/86] =?UTF-8?q?Return=20a=20somewhat=20sane=20default=20?= =?UTF-8?q?value=20for=20L2=20cache=20size=20if=20cpuid=20retur=E2=80=A6?= =?UTF-8?q?=20(#1611)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Return a somewhat sane default value for L2 cache size if cpuid returned something unexpected Fixes #1610, the KVM hypervisor on Google Chromebooks returning zero for CPUID 0x80000006, causing DYNAMIC_ARCH builds of OpenBLAS to hang --- kernel/setparam-ref.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 9030d7c6dd..f654de1106 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -647,7 +647,9 @@ static int get_l2_size_old(void){ return 6144; } } - 
return 0; +// return 0; +fprintf (stderr,"OpenBLAS WARNING - could not determine the L2 cache size on this system, assuming 256k\n"); +return 256; } #endif @@ -660,6 +662,10 @@ static __inline__ int get_l2_size(void){ l2 = BITMASK(ecx, 16, 0xffff); #ifndef ARCH_X86 + if (l2 <= 0) { + fprintf (stderr,"OpenBLAS WARNING - could not determine the L2 cache size on this system, assuming 256k\n"); + return 256; + } return l2; #else From de8fff671d6081bf543b55c95655fe5f6b5e4007 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 11 Jun 2018 17:05:27 +0200 Subject: [PATCH 50/86] Revert "Use usleep instead of sched_yield by default" --- common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common.h b/common.h index b7181e6702..663f37e7b3 100644 --- a/common.h +++ b/common.h @@ -356,7 +356,7 @@ typedef int blasint; */ #ifndef YIELDING -#define YIELDING usleep(10) +#define YIELDING sched_yield() #endif /*** From fcb77ab129821690fac4e532640c5cfa786c3a79 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 14 Jun 2018 16:57:58 +0200 Subject: [PATCH 51/86] Update OSX deployment target to 10.8 fixes #1580 --- Makefile.system | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 62ba0e4667..5dffd8d2e2 100644 --- a/Makefile.system +++ b/Makefile.system @@ -248,7 +248,7 @@ endif ifeq ($(OSNAME), Darwin) ifndef MACOSX_DEPLOYMENT_TARGET -export MACOSX_DEPLOYMENT_TARGET=10.6 +export MACOSX_DEPLOYMENT_TARGET=10.8 endif MD5SUM = md5 -r endif From bf40f806efa55c7a7c7ec57535919598eaeb569d Mon Sep 17 00:00:00 2001 From: Craig Donner Date: Thu, 14 Jun 2018 12:18:04 +0100 Subject: [PATCH 52/86] Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. 
Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 
192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 
40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than its neighbors; we must be hitting some optimal cache size or something. --- driver/others/memory.c | 199 +++++++++-------------------------------- 1 file changed, 43 insertions(+), 156 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index d69e52e97a..85f790615c 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -13,9 +13,9 @@ modification, are permitted provided that the following conditions are notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the OpenBLAS project nor the names of - its contributors may be used to endorse or promote products - derived from this software without specific prior written + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" @@ -139,6 +139,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FIXED_PAGESIZE 4096 #endif +#ifndef BUFFERS_PER_THREAD +#ifdef USE_OPENMP +#define BUFFERS_PER_THREAD (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER) +#else +#define BUFFERS_PER_THREAD NUM_BUFFERS +#endif +#endif + #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) #if defined(_MSC_VER) && !defined(__clang__) @@ -213,7 +221,7 @@ int i,n; ret = sched_getaffinity(0,size,cpusetp); if (ret!=0) return nums; ret = CPU_COUNT_S(size,cpusetp); - if (ret > 0 && ret < nums) nums = ret; + if (ret > 0 && ret < nums) nums = ret; CPU_FREE(cpusetp); return nums; #endif @@ -415,8 +423,15 @@ struct release_t { int hugetlb_allocated = 0; -static struct release_t release_info[NUM_BUFFERS]; -static int release_pos = 0; +#if defined(OS_WINDOWS) +#define THREAD_LOCAL __declspec(thread) +#define UNLIKELY_TO_BE_ZERO(x) (x) +#else +#define THREAD_LOCAL __thread +#define UNLIKELY_TO_BE_ZERO(x) (__builtin_expect(x, 0)) +#endif +static struct release_t THREAD_LOCAL release_info[BUFFERS_PER_THREAD]; +static int THREAD_LOCAL release_pos = 0; #if defined(OS_LINUX) && !defined(NO_WARMUP) static int hot_alloc = 0; @@ -459,15 +474,9 @@ static void *alloc_mmap(void *address){ } if (map_address != (void *)-1) { -#if defined(SMP) && !defined(USE_OPENMP) - LOCK_COMMAND(&alloc_lock); -#endif release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; release_pos ++; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#endif } #ifdef OS_LINUX @@ -611,15 +620,9 @@ static void *alloc_mmap(void *address){ #endif if (map_address != (void *)-1) { -#if defined(SMP) && !defined(USE_OPENMP) - LOCK_COMMAND(&alloc_lock); -#endif release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; release_pos ++; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#endif } return map_address; @@ -872,7 +875,7 @@ static void *alloc_hugetlb(void *address){ tp.PrivilegeCount = 1; tp.Privileges[0].Attributes 
= SE_PRIVILEGE_ENABLED; - + if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) { CloseHandle(hToken); return (void*)-1; @@ -961,20 +964,17 @@ static BLASULONG base_address = 0UL; static BLASULONG base_address = BASE_ADDRESS; #endif -static volatile struct { - BLASULONG lock; +struct memory_t { void *addr; -#if defined(WHEREAMI) && !defined(USE_OPENMP) - int pos; -#endif int used; #ifndef __64BIT__ char dummy[48]; #else char dummy[40]; #endif +}; -} memory[NUM_BUFFERS]; +static struct memory_t THREAD_LOCAL memory[BUFFERS_PER_THREAD]; static int memory_initialized = 0; @@ -987,9 +987,6 @@ static int memory_initialized = 0; void *blas_memory_alloc(int procpos){ int position; -#if defined(WHEREAMI) && !defined(USE_OPENMP) - int mypos; -#endif void *map_address; @@ -1020,102 +1017,48 @@ void *blas_memory_alloc(int procpos){ }; void *(**func)(void *address); -#if defined(USE_OPENMP) - if (!memory_initialized) { -#endif - - LOCK_COMMAND(&alloc_lock); + if (UNLIKELY_TO_BE_ZERO(memory_initialized)) { - if (!memory_initialized) { + /* Only allow a single thread to initialize memory system */ + LOCK_COMMAND(&alloc_lock); -#if defined(WHEREAMI) && !defined(USE_OPENMP) - for (position = 0; position < NUM_BUFFERS; position ++){ - memory[position].addr = (void *)0; - memory[position].pos = -1; - memory[position].used = 0; - memory[position].lock = 0; - } -#endif + if (!memory_initialized) { #ifdef DYNAMIC_ARCH - gotoblas_dynamic_init(); + gotoblas_dynamic_init(); #endif #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY) - gotoblas_affinity_init(); + gotoblas_affinity_init(); #endif #ifdef SMP - if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number(); + if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number(); #endif #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) #ifndef DYNAMIC_ARCH - blas_set_parameter(); + blas_set_parameter(); #endif #endif - 
memory_initialized = 1; + memory_initialized = 1; + } + UNLOCK_COMMAND(&alloc_lock); } - UNLOCK_COMMAND(&alloc_lock); -#if defined(USE_OPENMP) - } -#endif #ifdef DEBUG printf("Alloc Start ...\n"); -#endif - -#if defined(WHEREAMI) && !defined(USE_OPENMP) - - mypos = WhereAmI(); - - position = mypos; - while (position >= NUM_BUFFERS) position >>= 1; - - do { - if (!memory[position].used && (memory[position].pos == mypos)) { -#if defined(SMP) && !defined(USE_OPENMP) - LOCK_COMMAND(&alloc_lock); -#else - blas_lock(&memory[position].lock); -#endif - if (!memory[position].used) goto allocation; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#else - blas_unlock(&memory[position].lock); -#endif - } - - position ++; - - } while (position < NUM_BUFFERS); - - #endif position = 0; do { -#if defined(SMP) && !defined(USE_OPENMP) - LOCK_COMMAND(&alloc_lock); -#else - if (!memory[position].used) { - blas_lock(&memory[position].lock); -#endif if (!memory[position].used) goto allocation; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#else - blas_unlock(&memory[position].lock); - } -#endif - position ++; - } while (position < NUM_BUFFERS); + } while (position < BUFFERS_PER_THREAD); goto error; @@ -1126,11 +1069,6 @@ void *blas_memory_alloc(int procpos){ #endif memory[position].used = 1; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#else - blas_unlock(&memory[position].lock); -#endif if (!memory[position].addr) { do { @@ -1148,14 +1086,14 @@ void *blas_memory_alloc(int procpos){ #ifdef ALLOC_DEVICEDRIVER if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { - fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n"); + fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation failed.\n"); } #endif #ifdef ALLOC_HUGETLBFILE if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) { #ifndef OS_WINDOWS - fprintf(stderr, "OpenBLAS Warning ... 
HugeTLB(File) allocation was failed.\n"); + fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation failed.\n"); #endif } #endif @@ -1176,44 +1114,13 @@ void *blas_memory_alloc(int procpos){ } while ((BLASLONG)map_address == -1); -#if defined(SMP) && !defined(USE_OPENMP) - LOCK_COMMAND(&alloc_lock); -#endif memory[position].addr = map_address; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#endif #ifdef DEBUG printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position); #endif } -#if defined(WHEREAMI) && !defined(USE_OPENMP) - - if (memory[position].pos == -1) memory[position].pos = mypos; - -#endif - -#ifdef DYNAMIC_ARCH - - if (memory_initialized == 1) { - - LOCK_COMMAND(&alloc_lock); - - if (memory_initialized == 1) { - - if (!gotoblas) gotoblas_dynamic_init(); - - memory_initialized = 2; - } - - UNLOCK_COMMAND(&alloc_lock); - - } -#endif - - #ifdef DEBUG printf("Mapped : %p %3d\n\n", (void *)memory[position].addr, position); @@ -1222,7 +1129,7 @@ void *blas_memory_alloc(int procpos){ return (void *)memory[position].addr; error: - printf("BLAS : Program is Terminated. 
Because you tried to allocate too many memory regions.\n"); + printf("OpenBLAS : Program will terminate because you tried to allocate too many memory regions.\n"); return NULL; } @@ -1236,10 +1143,7 @@ void blas_memory_free(void *free_area){ #endif position = 0; -#if defined(SMP) && !defined(USE_OPENMP) - LOCK_COMMAND(&alloc_lock); -#endif - while ((position < NUM_BUFFERS) && (memory[position].addr != free_area)) + while ((position < BUFFERS_PER_THREAD) && (memory[position].addr != free_area)) position++; if (memory[position].addr != free_area) goto error; @@ -1248,13 +1152,7 @@ void blas_memory_free(void *free_area){ printf(" Position : %d\n", position); #endif - // arm: ensure all writes are finished before other thread takes this memory - WMB; - memory[position].used = 0; -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#endif #ifdef DEBUG printf("Unmap Succeeded.\n\n"); @@ -1266,11 +1164,8 @@ void blas_memory_free(void *free_area){ printf("BLAS : Bad memory unallocation! 
: %4d %p\n", position, free_area); #ifdef DEBUG - for (position = 0; position < NUM_BUFFERS; position++) + for (position = 0; position < BUFFERS_PER_THREAD; position++) printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used); -#endif -#if defined(SMP) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); #endif return; } @@ -1293,8 +1188,6 @@ void blas_shutdown(void){ BLASFUNC(blas_thread_shutdown)(); #endif - LOCK_COMMAND(&alloc_lock); - for (pos = 0; pos < release_pos; pos ++) { release_info[pos].func(&release_info[pos]); } @@ -1305,17 +1198,11 @@ void blas_shutdown(void){ base_address = BASE_ADDRESS; #endif - for (pos = 0; pos < NUM_BUFFERS; pos ++){ + for (pos = 0; pos < BUFFERS_PER_THREAD; pos ++){ memory[pos].addr = (void *)0; memory[pos].used = 0; -#if defined(WHEREAMI) && !defined(USE_OPENMP) - memory[pos].pos = -1; -#endif - memory[pos].lock = 0; } - UNLOCK_COMMAND(&alloc_lock); - return; } From 47bf0dba8f7a9cbd559e2f9cabe0bf2c7d3ee7a8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 15 Jun 2018 11:25:05 +0200 Subject: [PATCH 53/86] Add build-time option for OMP scheduler; document MULTITHREAD_THRESHOLD range (#1620) * Allow choosing the OpenMP scheduler and add range hint for GEMM_MULTITHREAD_THRESHOLD * Amended description of GEMM_MULTITHREAD_THRESHOLD to reflect #742 making it track floating point operations rather than matrix size --- Makefile.rule | 15 +++++++++++++-- driver/others/blas_server_omp.c | 6 +++++- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/Makefile.rule b/Makefile.rule index 5c03d01959..649aabe709 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -60,6 +60,14 @@ VERSION = 0.3.1.dev # This flag is always set for POWER8. 
Don't modify the flag # USE_OPENMP = 1 +# The OpenMP scheduler to use - by default this is "static" and you +# will normally not want to change this unless you know that your main +# workload will involve tasks that have highly unbalanced running times +# for individual threads. Changing away from "static" may also adversely +# affect memory access locality in NUMA systems. Setting to "runtime" will +# allow you to select the scheduler from the environment variable OMP_SCHEDULE +# CCOMMON_OPT += -DOMP_SCHED=dynamic + # You can define maximum number of threads. Basically it should be # less than actual number of cores. If you don't specify one, it's # automatically detected by the the script. @@ -156,8 +164,11 @@ NO_AFFINITY = 1 # CONSISTENT_FPCSR = 1 # If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute -# with single thread. You can use this flag to avoid the overhead of multi-threading -# in small matrix sizes. The default value is 4. +# with single thread. (Actually in recent versions this is a factor proportional to the +# number of floating point operations necessary for the given problem size, no longer +# an individual dimension). You can use this setting to avoid the overhead of multi- +# threading in small matrix sizes. The default value is 4, but values as high as 50 have +# been reported to be optimal for certain workloads (50 is the recommended value for Julia). # GEMM_MULTITHREAD_THRESHOLD = 4 # If you need santy check by comparing reference BLAS. 
It'll be very diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index fccdb43205..4255852c81 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -48,6 +48,10 @@ #else +#ifndef OMP_SCHED +#define OMP_SCHED static +#endif + int blas_server_avail = 0; static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER]; @@ -331,7 +335,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ break; } -#pragma omp parallel for schedule(static) +#pragma omp parallel for schedule(OMP_SCHED) for (i = 0; i < num; i ++) { #ifndef USE_SIMPLE_THREADED_LEVEL3 From 9e162146a93a58a06515bc53f07e37b8924e0d67 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 17 Jun 2018 15:32:03 +0000 Subject: [PATCH 54/86] Only initialize the part of the jobs array that will get used The jobs array is getting initialized in O(compiled cpus^2) complexity. Distros and people with bigger systems will use pretty high values (128 or 256 or more) for this value, leading to interesting bubbles in performance. Baseline (single threaded performance) gets roughly 13 - 15 multiplications per cycle in the interesting range (threading kicks in at 65x65 mult by 65x65). The hardware is capable of 32 multiplications per cycle theoretically. 
Matrix SGEMM cycles MPC DGEMM cycles MPC 48 x 48 10703.9 10.6 0.0% 17990.6 6.3 0.0% 64 x 64 20778.4 12.8 0.0% 40629.2 6.5 0.0% 65 x 65 26869.9 10.3 0.0% 52545.7 5.3 0.0% 80 x 80 38104.5 13.5 0.0% 72492.7 7.1 0.0% 96 x 96 61626.4 14.4 0.0% 113983.8 7.8 0.0% 112 x 112 91803.8 15.3 0.0% 180987.3 7.8 0.0% 128 x 128 133161.4 15.8 0.0% 258374.3 8.1 0.0% When threading is turned on TARGET=SKYLAKEX F_COMPILER=GFORTRAN SHARED=1 DYNAMIC_THREADS=1 USE_OPENMP=0 NUM_THREADS=128 Matrix SGEMM cycles MPC DGEMM cycles MPC 48 x 48 10725.9 10.5 -0.2% 18134.9 6.2 -0.8% 64 x 64 20500.6 12.9 1.3% 40929.1 6.5 -0.7% 65 x 65 2040832.1 0.1 -7495.2% 2097633.6 0.1 -3892.0% 80 x 80 2063129.1 0.2 -5314.4% 2119925.2 0.2 -2824.3% 96 x 96 2070374.5 0.4 -3259.6% 2173604.4 0.4 -1806.9% 112 x 112 2111721.5 0.7 -2169.6% 2263330.8 0.6 -1170.0% 128 x 128 2276181.5 0.9 -1609.3% 2377228.9 0.9 -820.1% There is a deep deep cliff once you hit 65x65 With this patch Matrix SGEMM cycles MPC DGEMM cycles MPC 48 x 48 10630.0 10.6 0.7% 18112.8 6.2 -0.7% 64 x 64 20374.8 13.0 1.9% 40487.0 6.5 0.4% 65 x 65 141955.2 1.9 -428.3% 146708.8 1.9 -179.2% 80 x 80 178921.1 2.9 -369.6% 186032.7 2.8 -156.6% 96 x 96 205436.2 4.3 -233.4% 224513.1 3.9 -97.0% 112 x 112 244408.2 5.8 -162.7% 262158.7 5.4 -47.1% 128 x 128 321334.5 6.5 -141.3% 333829.0 6.3 -29.2% The cliff is very significantly reduced. 
(more to follow) --- driver/level3/level3_thread.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 4ab1ee8cc0..018813b8cf 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -658,8 +658,8 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG } /* Clear synchronization flags */ - for (i = 0; i < MAX_CPU_NUMBER; i++) { - for (j = 0; j < MAX_CPU_NUMBER; j++) { + for (i = 0; i < nthreads; i++) { + for (j = 0; j < nthreads; j++) { for (k = 0; k < DIVIDE_RATE; k++) { job[i].working[j][CACHE_LINE_SIZE * k] = 0; } From d148ec4ea18e672dacb1270d4a5308ccaaae18bc Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 17 Jun 2018 15:39:15 +0000 Subject: [PATCH 55/86] Don't use _Atomic for jobs sometimes... The use of _Atomic leads to really bad code generation in the compiler (on x86, you get 2 "mfence" memory barriers around each access with gcc8, despite x86 being ordered and cache coherent). But there's a fallback in the code that just uses volatile which is more than plenty in practice. If we're nervous about cross thread synchronization for these variables, we should make the YIELD function be a compiler/memory barrier instead. 
performance before (after last commit) Matrix SGEMM cycles MPC DGEMM cycles MPC 48 x 48 10630.0 10.6 0.7% 18112.8 6.2 -0.7% 64 x 64 20374.8 13.0 1.9% 40487.0 6.5 0.4% 65 x 65 141955.2 1.9 -428.3% 146708.8 1.9 -179.2% 80 x 80 178921.1 2.9 -369.6% 186032.7 2.8 -156.6% 96 x 96 205436.2 4.3 -233.4% 224513.1 3.9 -97.0% 112 x 112 244408.2 5.8 -162.7% 262158.7 5.4 -47.1% 128 x 128 321334.5 6.5 -141.3% 333829.0 6.3 -29.2% Performance with this patch (roughly a 2x improvement): Matrix SGEMM cycles MPC DGEMM cycles MPC 48 x 48 10756.0 10.5 -0.5% 18296.7 6.1 -1.7% 64 x 64 20490.0 12.9 1.4% 40615.0 6.5 0.0% 65 x 65 83528.3 3.3 -210.9% 96319.0 2.9 -83.3% 80 x 80 101453.5 5.1 -166.3% 128021.7 4.0 -76.6% 96 x 96 149795.1 5.9 -143.1% 168059.4 5.3 -47.4% 112 x 112 191481.2 7.3 -105.8% 204165.0 6.9 -14.6% 128 x 128 265019.2 7.9 -99.0% 272006.4 7.7 -5.3% --- driver/level3/level3_thread.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 018813b8cf..7e75f69d10 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -91,11 +91,7 @@ #endif typedef struct { -#if __STDC_VERSION__ >= 201112L -_Atomic -#else volatile -#endif BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; } job_t; From 5c6f008365ee3c6d42f8630d27259f130a688468 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 17 Jun 2018 15:47:50 +0000 Subject: [PATCH 56/86] Tune param.h for SkylakeX param.h defines a per-platform SWITCH_RATIO, which is used as a measure for how fine grained the blocks for gemm need to be split up. Many platforms define this to 4. The reality is that the gemm low level implementation for SkylakeX likes bigger blocks due to the nature of SIMD... 
by tuning the SWITCH_RATIO to 32 the threading performance improves significantly: Before Matrix SGEMM cycles MPC DGEMM cycles MPC 48 x 48 10756.0 10.5 -0.5% 18296.7 6.1 -1.7% 64 x 64 20490.0 12.9 1.4% 40615.0 6.5 0.0% 65 x 65 83528.3 3.3 -210.9% 96319.0 2.9 -83.3% 80 x 80 101453.5 5.1 -166.3% 128021.7 4.0 -76.6% 96 x 96 149795.1 5.9 -143.1% 168059.4 5.3 -47.4% 112 x 112 191481.2 7.3 -105.8% 204165.0 6.9 -14.6% 128 x 128 265019.2 7.9 -99.0% 272006.4 7.7 -5.3% After Matrix SGEMM cycles MPC DGEMM cycles MPC 48 x 48 10666.3 10.6 0.4% 18236.9 6.2 -1.4% 64 x 64 20410.1 13.0 1.8% 39925.8 6.6 1.7% 65 x 65 34983.0 7.9 -30.2% 51494.6 5.4 2.0% 80 x 80 39769.1 13.0 -4.4% 63805.2 8.1 12.0% 96 x 96 45169.6 19.7 26.7% 80065.8 11.1 29.8% 112 x 112 57026.1 24.7 38.7% 99535.5 14.2 44.1% 128 x 128 64789.8 32.5 51.3% 117407.2 17.9 54.6% With this change, threading starts to be a win already at 96x96 --- param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/param.h b/param.h index 49a5e85e89..3573fffbb9 100644 --- a/param.h +++ b/param.h @@ -1626,7 +1626,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SYMV_P 8 -#define SWITCH_RATIO 4 +#define SWITCH_RATIO 32 #ifdef ARCH_X86 From 6eb4b9ae7c7cc58af00ac21b52fed8810d7e5710 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 17 Jun 2018 17:05:04 +0000 Subject: [PATCH 57/86] Tune HASWELL SWITCH_RATIO as well Similar to the SKYLAKEX patch, 32 seems to work best (much better than 4 or 16) Before (4) Matrix SGEMM cycles MPC DGEMM cycles MPC 48 x 48 15554.3 7.2 0.2% 30353.8 3.7 0.3% 64 x 64 30346.8 8.7 1.6% 63495.0 4.1 -0.1% 65 x 65 81668.1 3.4 -123.3% 82705.2 3.3 -21.2% 80 x 80 105045.9 4.9 -95.5% 115226.0 4.5 -2.2% 96 x 96 152461.2 5.8 -74.3% 148156.3 6.0 16.4% 112 x 112 188505.2 7.5 -42.2% 171187.3 8.2 36.4% 128 x 128 257884.0 8.1 -39.5% 224764.8 9.3 46.0% Intermediate (16) Matrix SGEMM cycles MPC DGEMM cycles MPC 48 x 48 15565.7 7.2 0.2% 30378.9 3.7 0.2% 64 x 64 30430.2 8.7 1.3% 63046.4 4.2 0.6% 65 x 65 27306.0 10.1 25.3% 38879.2 7.1 43.0% 80 x 80 51008.7 10.1 5.1% 61007.6 8.4 45.9% 96 x 96 70856.7 12.5 19.0% 83403.1 10.6 53.0% 112 x 112 84769.9 16.6 36.0% 99920.1 14.1 62.9% 128 x 128 84213.2 25.0 54.5% 113024.2 18.6 72.8% After (32) Matrix SGEMM cycles MPC DGEMM cycles MPC 48 x 48 15537.3 7.2 0.3% 30537.0 3.6 -0.3% 64 x 64 30352.7 8.7 1.6% 62597.8 4.2 1.3% 65 x 65 36857.0 7.5 -0.8% 56167.6 4.9 17.7% 80 x 80 42552.6 12.1 20.8% 69536.7 7.4 38.3% 96 x 96 52101.5 17.1 40.5% 91016.1 9.7 48.7% 112 x 112 63853.7 22.1 51.8% 110507.4 12.7 58.9% 128 x 128 73966.1 28.4 60.0% 163146.4 12.9 60.8% --- param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/param.h b/param.h index 3573fffbb9..cfa4bba5ca 100644 --- a/param.h +++ b/param.h @@ -1507,7 +1507,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SYMV_P 8 -#define SWITCH_RATIO 4 +#define SWITCH_RATIO 32 #ifdef ARCH_X86 From 73de17664dfdf2934a2fdc6dd9442107e6c85035 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 17 Jun 2018 17:50:43 +0000 Subject: [PATCH 58/86] Add missing barriers in gemm scheduler a few places in the gemm scheduler code were missing barriers; the code likely worked OK due to heavy use of volatile / _Atomic but there's no reason to get this incorrect --- driver/level3/level3_thread.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 7e75f69d10..aeb5e6ed4e 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -347,7 +347,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Make sure if no one is using workspace */ START_RPCC(); for (i = 0; i < args -> nthreads; i++) - while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; + while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;}; STOP_RPCC(waiting1); #if defined(FUSED_GEMM) && !defined(TIMING) @@ -409,7 +409,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Wait until other region of B is initialized */ START_RPCC(); - while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; + while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;}; STOP_RPCC(waiting2); /* Apply kernel with local region of A and part of other region of B */ @@ -427,6 +427,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Clear synchronization flag if this thread is done with other region of B */ if (m_to - m_from == min_i) { job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; + WMB; } } } while (current != mypos); @@ -488,7 +489,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, START_RPCC(); 
for (i = 0; i < args -> nthreads; i++) { for (js = 0; js < DIVIDE_RATE; js++) { - while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;}; + while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;MB;}; } } STOP_RPCC(waiting3); From 7e39ffe1135ee6ca1dc119f6eea9566668fd0916 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 17 Jun 2018 17:53:15 +0000 Subject: [PATCH 59/86] On x86-64, make MB/WMB compiler barriers While on x86(64) one does not normally need full memory barriers, it's good practice to at least use compiler barriers for places where on other architectures memory barriers are used; this prevents the compiler from over-optimizing. --- common_x86_64.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/common_x86_64.h b/common_x86_64.h index 7461aaf601..3236778b83 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -60,8 +60,13 @@ #endif */ +#ifdef __GNUC__ +#define MB __asm__ __volatile__("": : :"memory") +#define WMB __asm__ __volatile__("": : :"memory") +#else #define MB #define WMB +#endif static void __inline blas_lock(volatile BLASULONG *address){ From 2ddc96c9e5a86e3fd12954b3efc269f0cc8d07d8 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 17 Jun 2018 18:06:24 +0000 Subject: [PATCH 60/86] make WMB / MB safer on x86-64 make it so that if (foo) RMB; else MB; is always done correctly and without syntax surprises --- common_x86_64.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/common_x86_64.h b/common_x86_64.h index 3236778b83..62e138e342 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -61,11 +61,11 @@ */ #ifdef __GNUC__ -#define MB __asm__ __volatile__("": : :"memory") -#define WMB __asm__ __volatile__("": : :"memory") +#define MB do { __asm__ __volatile__("": : :"memory"); } while (0) +#define WMB do { __asm__ __volatile__("": : :"memory"); } while (0) #else -#define MB -#define WMB +#define MB do {} while (0) +#define WMB do {} while (0) #endif static void __inline
blas_lock(volatile BLASULONG *address){ From 2d8cc7193ace18c28ea05ef39e13bb28437b6d89 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 17 Jun 2018 23:38:14 +0200 Subject: [PATCH 61/86] Support upcoming Intel Cannon Lake CPUs as Skylake X (#1621) * Support upcoming Cannon Lake as Skylake X --- cpuid_x86.c | 17 +++++++++++++++++ driver/others/dynamic.c | 17 +++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index fc937865cb..89eb809b0d 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1339,6 +1339,23 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; + case 6: + switch (model) { + case 6: // Cannon Lake +#ifndef NO_AVX512 + return CPUTYPE_SKYLAKEX; +#else + if(support_avx()) +#ifndef NO_AVX2 + return CPUTYPE_HASWELL; +#else + return CPUTYPE_SANDYBRIDGE; +#endif + else + return CPUTYPE_NEHALEM; +#endif + } + break; case 9: case 8: switch (model) { diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 4271c0a0d7..bacd3b7fa7 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -338,6 +338,23 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; } return NULL; + case 6: + if (model == 6) { + // Cannon Lake +#ifndef NO_AVX512 + return &gotoblas_SKYLAKEX; +#else + if(support_avx()) +#ifndef NO_AVX2 + return &gotoblas_HASWELL; +#else + return &gotoblas_SANDYBRIDGE; +#endif + else + return &gotoblas_NEHALEM; +#endif + } + return NULL; case 9: case 8: if (model == 14 ) { // Kaby Lake From 1f9e4f319327dd53d1243edb3a812c5a2366a938 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 19 Jun 2018 20:46:36 +0200 Subject: [PATCH 62/86] Handle special case of gfortran+clang+OpenMP --- ctest/Makefile | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/ctest/Makefile b/ctest/Makefile index 6eda438635..569a5dda32 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -102,7 +102,13 @@ clean :: rm -f x* FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) -CEXTRALIB = +ifeq
($(USE_OPENMP), 1) +ifeq ($(F_COMPILER), GFORTRAN) +ifeq ($(C_COMPILER), CLANG) +CEXTRALIB = -lomp +endif +endif +endif # Single real xscblat1: $(stestl1o) c_sblat1.o $(TOPDIR)/$(LIBNAME) From 6a5ab083b7e78458861b197b8e98b2506345d6d7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 19 Jun 2018 20:47:33 +0200 Subject: [PATCH 63/86] Handle special case of gfortran+clang+OpenMP --- test/Makefile | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/test/Makefile b/test/Makefile index 65fb6f4387..074411b051 100644 --- a/test/Makefile +++ b/test/Makefile @@ -122,8 +122,13 @@ endif FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) -CEXTRALIB = - +ifeq ($(USE_OPENMP), 1) +ifeq ($(F_COMPILER), GFORTRAN) +ifeq ($(C_COMPILER), CLANG) +CEXTRALIB = -lomp +endif +endif +endif sblat1 : sblat1.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o sblat1 sblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) From 10b70c904d9e3b610d35f1efe8d89888da4011bb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 19 Jun 2018 20:53:19 +0200 Subject: [PATCH 64/86] Handle erroneous user settings NOFORTRAN=0 and NO_FORTRAN --- Makefile | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Makefile b/Makefile index 56b4426f8a..728567f80c 100644 --- a/Makefile +++ b/Makefile @@ -21,6 +21,15 @@ ifeq ($(BUILD_RELAPACK), 1) RELA = re_lapack endif +ifeq ($(NOFORTRAN), 0) +undefine NOFORTRAN +endif + +ifeq ($(NO_FORTRAN), 1) +undefine NO_FORTRAN +NOFORTRAN=1 +endif + LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench From 9369d3e6e5207c6974af162e67d4060ed625c322 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 19 Jun 2018 23:28:06 +0200 Subject: [PATCH 65/86] Modify NOFORTRAN tests to always check the value; fix rewriting of NO_FORTRAN --- Makefile | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/Makefile b/Makefile index 
728567f80c..4760be0bec 100644 --- a/Makefile +++ b/Makefile @@ -21,13 +21,15 @@ ifeq ($(BUILD_RELAPACK), 1) RELA = re_lapack endif -ifeq ($(NOFORTRAN), 0) -undefine NOFORTRAN -endif - ifeq ($(NO_FORTRAN), 1) -undefine NO_FORTRAN -NOFORTRAN=1 +define NOFORTRAN +1 +endef +define NO_LAPACK +1 +endef +export NOFORTRAN +export NO_LAPACK endif LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) @@ -56,7 +58,7 @@ endif endif @echo " C compiler ... $(C_COMPILER) (command line : $(CC))" -ifndef NOFORTRAN +ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) @echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))" endif ifneq ($(OSNAME), AIX) @@ -117,7 +119,7 @@ endif endif tests : -ifndef NOFORTRAN +ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) touch $(LIBNAME) ifndef NO_FBLAS $(MAKE) -C test all @@ -219,7 +221,7 @@ netlib : else netlib : lapack_prebuild -ifndef NOFORTRAN +ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib @$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib endif @@ -240,7 +242,7 @@ prof_lapack : lapack_prebuild @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof lapack_prebuild : -ifndef NOFORTRAN +ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) -@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc -@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc @@ -283,21 +285,21 @@ endif endif large.tgz : -ifndef NOFORTRAN +ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) if [ ! -a $< ]; then -wget http://www.netlib.org/lapack/timing/large.tgz; fi endif timing.tgz : -ifndef NOFORTRAN +ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) if [ ! 
-a $< ]; then -wget http://www.netlib.org/lapack/timing/timing.tgz; fi endif lapack-timing : large.tgz timing.tgz -ifndef NOFORTRAN +ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) (cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING) (cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz ) $(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING From 952541e840bddbcdcdfce81aefc09edf7fbfb84f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 20 Jun 2018 13:20:30 +0200 Subject: [PATCH 66/86] Need to use filter-out to handle NOFORTRAN not set --- Makefile | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 4760be0bec..49dab6484a 100644 --- a/Makefile +++ b/Makefile @@ -58,7 +58,7 @@ endif endif @echo " C compiler ... $(C_COMPILER) (command line : $(CC))" -ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) +ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) @echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))" endif ifneq ($(OSNAME), AIX) @@ -119,7 +119,7 @@ endif endif tests : -ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) +ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) touch $(LIBNAME) ifndef NO_FBLAS $(MAKE) -C test all @@ -221,7 +221,7 @@ netlib : else netlib : lapack_prebuild -ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) +ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib @$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib endif @@ -242,7 +242,10 @@ prof_lapack : lapack_prebuild @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof lapack_prebuild : -ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) + $(info filter value of NOFORTRAN is:) + $(info x$(filter-out $(NOFORTRAN), 1 2)x) + +ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) -@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc -@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc From 
0c5b7b400b3973d214ce24c566be4446743eacf7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 20 Jun 2018 15:16:19 +0200 Subject: [PATCH 67/86] Add -march=skylake-avx512 to flags if target is skylake x --- Makefile.x86_64 | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 1ba63278a5..677c05d93a 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -8,6 +8,13 @@ endif endif endif +ifeq ($(CORE), SKYLAKEX) +ifndef NO_AVX512 +CCOMMON_OPT += -march=skylake-avx512 +FCOMMON_OPT += -march=skylake-avx512 +endif +endif + ifeq ($(OSNAME), Interix) ARFLAGS = -m x64 endif From 05978528c3f3c61fb370e1fae0ac3013faaa595e Mon Sep 17 00:00:00 2001 From: Craig Donner Date: Wed, 20 Jun 2018 17:03:18 +0100 Subject: [PATCH 68/86] Avoid declaring arrays of size 0 when making large stack allocations. --- common_stackalloc.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/common_stackalloc.h b/common_stackalloc.h index 71fb1a4777..ec0fa1611b 100644 --- a/common_stackalloc.h +++ b/common_stackalloc.h @@ -47,14 +47,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - large enough to support all architectures and kernel * Chosing a too small SIZE will lead to a stack smashing. 
*/ -#define STACK_ALLOC(SIZE, TYPE, BUFFER) \ - /* make it volatile because some function (ex: dgemv_n.S) */ \ - /* do not restore all register */ \ - volatile int stack_alloc_size = SIZE; \ - if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) \ - stack_alloc_size = 0; \ - STACK_ALLOC_PROTECT_SET \ - TYPE stack_buffer[stack_alloc_size] __attribute__((aligned(0x20))); \ +#define STACK_ALLOC(SIZE, TYPE, BUFFER) \ + /* make it volatile because some function (ex: dgemv_n.S) */ \ + /* do not restore all register */ \ + volatile int stack_alloc_size = SIZE; \ + if (stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) stack_alloc_size = 0; \ + STACK_ALLOC_PROTECT_SET \ + /* Avoid declaring an array of length 0 */ \ + TYPE stack_buffer[stack_alloc_size ? stack_alloc_size : 1] \ + __attribute__((aligned(0x20))); \ BUFFER = stack_alloc_size ? stack_buffer : (TYPE *)blas_memory_alloc(1); #else //Original OpenBLAS/GotoBLAS codes. From a399d004257b2f43e8211341f924f3a73171b98c Mon Sep 17 00:00:00 2001 From: oon3m0oo Date: Wed, 20 Jun 2018 21:04:03 +0100 Subject: [PATCH 69/86] Further improvements to memory.c. 
(#1625) - Compiler TLS is now only used when the compiler supports it - If compiler TLS is unsupported, we use platform-specific TLS - Only one variable (an index) is now in TLS - We only access TLS once per alloc, and never when freeing - Allocation / release info is now stored within the allocation itself, by over-allocating; this saves having external structures do the bookkeeping, and reduces some of the redundant data that was being stored (such as addresses) - We never hit the alloc lock when not using SMP or when using OpenMP (that was my fault) - Now that there are fewer tracking structures I think this is a bit easier to read than before --- driver/others/memory.c | 397 +++++++++++++++++++++++++---------------- 1 file changed, 242 insertions(+), 155 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 85f790615c..ed20cf5cd5 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -326,6 +326,8 @@ int goto_get_num_procs (void) { return blas_cpu_number; } +static void blas_memory_init(); + void openblas_fork_handler() { // This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is @@ -337,7 +339,7 @@ void openblas_fork_handler() // implementation of OpenMP. #if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER) int err; - err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL); + err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, blas_memory_init); if(err != 0) openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler.
You may meet hang after fork.\n"); #endif @@ -415,23 +417,104 @@ int openblas_get_num_threads(void) { #endif } -struct release_t { - void *address; - void (*func)(struct release_t *); - long attr; -}; - int hugetlb_allocated = 0; #if defined(OS_WINDOWS) #define THREAD_LOCAL __declspec(thread) -#define UNLIKELY_TO_BE_ZERO(x) (x) +#define LIKELY_ONE(x) (x) #else #define THREAD_LOCAL __thread -#define UNLIKELY_TO_BE_ZERO(x) (__builtin_expect(x, 0)) +#define LIKELY_ONE(x) (__builtin_expect(x, 1)) +#endif + +/* Stores information about the allocation and how to release it */ +struct alloc_t { + /* Whether this allocation is being used */ + int used; + /* Any special attributes needed when releasing this allocation */ + int attr; + /* Function that can properly release this memory */ + void (*release_func)(struct alloc_t *); + /* Pad to 64-byte alignment */ + char pad[64 - 2 * sizeof(int) - sizeof(void(*))]; +}; + +/* Convenience macros for storing release funcs */ +#define STORE_RELEASE_FUNC(address, func) \ + if (address != (void *)-1) { \ + struct alloc_t *alloc_info = (struct alloc_t *)address; \ + alloc_info->release_func = func; \ + } + +#define STORE_RELEASE_FUNC_WITH_ATTR(address, func, attr) \ + if (address != (void *)-1) { \ + struct alloc_t *alloc_info = (struct alloc_t *)address; \ + alloc_info->release_func = func; \ + alloc_info->attr = attr; \ + } + +/* The number of bytes that will be allocated for each buffer. When allocating + memory, we store an alloc_t followed by the actual buffer memory. This means + that each allocation always has its associated alloc_t, without the need + for an auxiliary tracking structure. 
*/ +static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t); + +/* Clang supports TLS from version 2.8 */ +#if defined(__clang__) && __clang_major__ > 2 || \ + (__clang_minor__ == 2 || __clang_minor__ == 8) +#define HAS_COMPILER_TLS #endif -static struct release_t THREAD_LOCAL release_info[BUFFERS_PER_THREAD]; -static int THREAD_LOCAL release_pos = 0; + +/* GCC supports TLS from version 4.1 */ +#if !defined(__clang__) && defined(__GNUC__) && \ + (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)) +#define HAS_COMPILER_TLS +#endif + +/* MSVC supports TLS from version 2005 */ +#if defined(_MSC_VER) && _MSC_VER >= 1400 +#define HAS_COMPILER_TLS +#endif + +/* Versions of XCode before 8 did not properly support TLS */ +#if defined(__apple_build_version__) && __apple_build_version__ < 8000042 +#undef HAS_COMPILER_TLS +#endif + +/* Android NDK's before version 12b did not support TLS */ +#if defined(__ANDROID__) && defined(__clang__) +#if __has_include(<android/ndk-version.h>) +#include <android/ndk-version.h> +#endif +#if defined(__ANDROID__) && defined(__clang__) && defined(__NDK_MAJOR__) && \ + defined(__NDK_MINOR__) && \ + ((__NDK_MAJOR__ < 12) || ((__NDK_MAJOR__ == 12) && (__NDK_MINOR__ < 1))) +#undef HAS_COMPILER_TLS +#endif +#endif + +/* Holds pointers to allocated memory */ +#if defined(SMP) && !defined(USE_OPENMP) +/* This is the number of threads that can be spawned by the server, which is the + server plus the number of threads in the thread pool */ +# define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER +static int next_memory_table_pos = 0; +# if defined(HAS_COMPILER_TLS) +/* Use compiler generated thread-local-storage */ +static int THREAD_LOCAL local_memory_table_pos = 0; +# else +/* Use system-dependent thread-local-storage */ +# if defined(OS_WINDOWS) +static DWORD local_storage_key; +# else +static pthread_key_t local_storage_key; +# endif /* defined(OS_WINDOWS) */ +# endif /* defined(HAS_COMPILER_TLS) */ +#else +/* There is only one allocating thread when
in single-threaded mode and when using OpenMP */ +# define MAX_ALLOCATING_THREADS 1 +#endif /* defined(SMP) && !defined(USE_OPENMP) */ +static struct alloc_t * local_memory_table[MAX_ALLOCATING_THREADS][BUFFERS_PER_THREAD]; #if defined(OS_LINUX) && !defined(NO_WARMUP) static int hot_alloc = 0; @@ -447,11 +530,41 @@ static pthread_spinlock_t alloc_lock = 0; static BLASULONG alloc_lock = 0UL; #endif +/* Returns a pointer to the start of the per-thread memory allocation data */ +static __inline struct alloc_t ** get_memory_table() { +#if defined(SMP) && !defined(USE_OPENMP) +# if !defined(HAS_COMPILER_TLS) +# if defined(OS_WINDOWS) + int local_memory_table_pos = (int)TlsGetValue(local_storage_key); +# else + int local_memory_table_pos = (int)pthread_getspecific(local_storage_key); +# endif /* defined(OS_WINDOWS) */ +# endif /* !defined(HAS_COMPILER_TLS) */ + if (!local_memory_table_pos) { + LOCK_COMMAND(&alloc_lock); + local_memory_table_pos = next_memory_table_pos++; + UNLOCK_COMMAND(&alloc_lock); + if (next_memory_table_pos > MAX_ALLOCATING_THREADS) + printf("OpenBLAS : Program will terminate because you tried to start too many threads.\n"); +# if !defined(HAS_COMPILER_TLS) +# if defined(OS_WINDOWS) + TlsSetValue(local_storage_key, (void*)local_memory_table_pos); +# else + pthread_setspecific(local_storage_key, (void*)local_memory_table_pos); +# endif /* defined(OS_WINDOWS) */ +# endif /* !defined(HAS_COMPILER_TLS) */ + } + return local_memory_table[local_memory_table_pos]; +#else + return local_memory_table[0]; +#endif /* defined(SMP) && !defined(USE_OPENMP) */ +} + #ifdef ALLOC_MMAP -static void alloc_mmap_free(struct release_t *release){ +static void alloc_mmap_free(struct alloc_t *alloc_info){ - if (munmap(release -> address, BUFFER_SIZE)) { + if (munmap(alloc_info, allocation_block_size)) { printf("OpenBLAS : munmap failed\n"); } } @@ -465,22 +578,18 @@ static void *alloc_mmap(void *address){ if (address){ map_address = mmap(address, - BUFFER_SIZE, +
allocation_block_size, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0); } else { map_address = mmap(address, - BUFFER_SIZE, + allocation_block_size, MMAP_ACCESS, MMAP_POLICY, -1, 0); } - if (map_address != (void *)-1) { - release_info[release_pos].address = map_address; - release_info[release_pos].func = alloc_mmap_free; - release_pos ++; - } + STORE_RELEASE_FUNC(map_address, alloc_mmap_free); #ifdef OS_LINUX - my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); + my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0); #endif return map_address; @@ -533,25 +642,25 @@ static void *alloc_mmap(void *address){ if (address){ /* Just give up use advanced operation */ - map_address = mmap(address, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0); + map_address = mmap(address, allocation_block_size, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0); #ifdef OS_LINUX - my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); + my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0); #endif } else { #if defined(OS_LINUX) && !defined(NO_WARMUP) if (hot_alloc == 0) { - map_address = mmap(NULL, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY, -1, 0); + map_address = mmap(NULL, allocation_block_size, MMAP_ACCESS, MMAP_POLICY, -1, 0); #ifdef OS_LINUX - my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); + my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0); #endif } else { #endif - map_address = mmap(NULL, BUFFER_SIZE * SCALING, + map_address = mmap(NULL, allocation_block_size * SCALING, MMAP_ACCESS, MMAP_POLICY, -1, 0); if (map_address != (void *)-1) { @@ -559,7 +668,7 @@ static void *alloc_mmap(void *address){ #ifdef OS_LINUX #ifdef DEBUG int ret=0; - ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); + ret=my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0); if(ret==-1){ int errsv=errno; perror("OpenBLAS alloc_mmap:"); @@ -567,7 +676,7 @@ 
static void *alloc_mmap(void *address){ } #else - my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); + my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0); #endif #endif @@ -575,7 +684,7 @@ static void *alloc_mmap(void *address){ allocsize = DGEMM_P * DGEMM_Q * sizeof(double); start = (BLASULONG)map_address; - current = (SCALING - 1) * BUFFER_SIZE; + current = (SCALING - 1) * allocation_block_size; while(current > 0) { *(BLASLONG *)start = (BLASLONG)start + PAGESIZE; @@ -590,7 +699,7 @@ static void *alloc_mmap(void *address){ best = (BLASULONG)-1; best_address = map_address; - while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * BUFFER_SIZE)) { + while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * allocation_block_size)) { current = run_bench(start, allocsize); @@ -606,7 +715,7 @@ static void *alloc_mmap(void *address){ if ((BLASULONG)best_address > (BLASULONG)map_address) munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address); - munmap((void *)((BLASULONG)best_address + BUFFER_SIZE), (SCALING - 1) * BUFFER_SIZE + (BLASULONG)map_address - (BLASULONG)best_address); + munmap((void *)((BLASULONG)best_address + allocation_block_size), (SCALING - 1) * allocation_block_size + (BLASULONG)map_address - (BLASULONG)best_address); map_address = best_address; @@ -619,11 +728,7 @@ static void *alloc_mmap(void *address){ } #endif - if (map_address != (void *)-1) { - release_info[release_pos].address = map_address; - release_info[release_pos].func = alloc_mmap_free; - release_pos ++; - } + STORE_RELEASE_FUNC(map_address, alloc_mmap_free); return map_address; } @@ -635,9 +740,9 @@ static void *alloc_mmap(void *address){ #ifdef ALLOC_MALLOC -static void alloc_malloc_free(struct release_t *release){ +static void alloc_malloc_free(struct alloc_t *alloc_info){ - free(release -> address); + free(alloc_info); } @@ -645,15 +750,11 @@ static void *alloc_malloc(void *address){ void 
*map_address; - map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE); + map_address = (void *)malloc(allocation_block_size + FIXED_PAGESIZE); if (map_address == (void *)NULL) map_address = (void *)-1; - if (map_address != (void *)-1) { - release_info[release_pos].address = map_address; - release_info[release_pos].func = alloc_malloc_free; - release_pos ++; - } + STORE_RELEASE_FUNC(map_address, alloc_malloc_free); return map_address; @@ -670,24 +771,20 @@ void *qfree (void *address); #define QCOMMS 0x2 #define QFAST 0x4 -static void alloc_qalloc_free(struct release_t *release){ +static void alloc_qalloc_free(struct alloc_t *alloc_info){ - qfree(release -> address); + qfree(alloc_info); } static void *alloc_qalloc(void *address){ void *map_address; - map_address = (void *)qalloc(QCOMMS | QFAST, BUFFER_SIZE + FIXED_PAGESIZE); + map_address = (void *)qalloc(QCOMMS | QFAST, allocation_block_size + FIXED_PAGESIZE); if (map_address == (void *)NULL) map_address = (void *)-1; - if (map_address != (void *)-1) { - release_info[release_pos].address = map_address; - release_info[release_pos].func = alloc_qalloc_free; - release_pos ++; - } + STORE_RELEASE_FUNC(map_address, alloc_qalloc_free); return (void *)(((BLASULONG)map_address + FIXED_PAGESIZE - 1) & ~(FIXED_PAGESIZE - 1)); } @@ -696,9 +793,9 @@ static void *alloc_qalloc(void *address){ #ifdef ALLOC_WINDOWS -static void alloc_windows_free(struct release_t *release){ +static void alloc_windows_free(struct alloc_t *alloc_info){ - VirtualFree(release -> address, BUFFER_SIZE, MEM_DECOMMIT); + VirtualFree(alloc_info, allocation_block_size, MEM_DECOMMIT); } @@ -706,17 +803,13 @@ static void *alloc_windows(void *address){ void *map_address; map_address = VirtualAlloc(address, - BUFFER_SIZE, + allocation_block_size, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); if (map_address == (void *)NULL) map_address = (void *)-1; - if (map_address != (void *)-1) { - release_info[release_pos].address = map_address; - 
release_info[release_pos].func = alloc_windows_free; - release_pos ++; - } + STORE_RELEASE_FUNC(map_address, alloc_windows_free); return map_address; } @@ -728,13 +821,14 @@ static void *alloc_windows(void *address){ #define DEVICEDRIVER_NAME "/dev/mapper" #endif -static void alloc_devicedirver_free(struct release_t *release){ +static void alloc_devicedirver_free(struct alloc_t *alloc_info){ - if (munmap(release -> address, BUFFER_SIZE)) { + int attr = alloc_info -> attr; + if (munmap(alloc_info, allocation_block_size)) { printf("OpenBLAS : Bugphysarea unmap failed.\n"); } - if (close(release -> attr)) { + if (close(attr)) { printf("OpenBLAS : Bugphysarea close failed.\n"); } @@ -751,17 +845,12 @@ static void *alloc_devicedirver(void *address){ } - map_address = mmap(address, BUFFER_SIZE, + map_address = mmap(address, allocation_block_size, PROT_READ | PROT_WRITE, MAP_FILE | MAP_SHARED, fd, 0); - if (map_address != (void *)-1) { - release_info[release_pos].address = map_address; - release_info[release_pos].attr = fd; - release_info[release_pos].func = alloc_devicedirver_free; - release_pos ++; - } + STORE_RELEASE_FUNC_WITH_ATTR(map_address, alloc_devicedirver_free, fd); return map_address; } @@ -770,9 +859,9 @@ static void *alloc_devicedirver(void *address){ #ifdef ALLOC_SHM -static void alloc_shm_free(struct release_t *release){ +static void alloc_shm_free(struct alloc_t *alloc_info){ - if (shmdt(release -> address)) { + if (shmdt(alloc_info)) { printf("OpenBLAS : Shared memory unmap failed.\n"); } } @@ -781,22 +870,21 @@ static void *alloc_shm(void *address){ void *map_address; int shmid; - shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,IPC_CREAT | 0600); + shmid = shmget(IPC_PRIVATE, allocation_block_size,IPC_CREAT | 0600); map_address = (void *)shmat(shmid, address, 0); if (map_address != (void *)-1){ #ifdef OS_LINUX - my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); + my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0); #endif
shmctl(shmid, IPC_RMID, 0); - release_info[release_pos].address = map_address; - release_info[release_pos].attr = shmid; - release_info[release_pos].func = alloc_shm_free; - release_pos ++; + struct alloc_t *alloc_info = (struct alloc_t *)map_address; + alloc_info->release_func = alloc_shm_free; + alloc_info->attr = shmid; } return map_address; @@ -804,23 +892,23 @@ static void *alloc_shm(void *address){ #if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS -static void alloc_hugetlb_free(struct release_t *release){ +static void alloc_hugetlb_free(struct alloc_t *alloc_info){ #if defined(OS_LINUX) || defined(OS_AIX) - if (shmdt(release -> address)) { + if (shmdt(alloc_info)) { printf("OpenBLAS : Hugepage unmap failed.\n"); } #endif #ifdef __sun__ - munmap(release -> address, BUFFER_SIZE); + munmap(alloc_info, allocation_block_size); #endif #ifdef OS_WINDOWS - VirtualFree(release -> address, BUFFER_SIZE, MEM_LARGE_PAGES | MEM_DECOMMIT); + VirtualFree(alloc_info, allocation_block_size, MEM_LARGE_PAGES | MEM_DECOMMIT); #endif @@ -833,7 +921,7 @@ static void *alloc_hugetlb(void *address){ #if defined(OS_LINUX) || defined(OS_AIX) int shmid; - shmid = shmget(IPC_PRIVATE, BUFFER_SIZE, + shmid = shmget(IPC_PRIVATE, allocation_block_size, #ifdef OS_LINUX SHM_HUGETLB | #endif @@ -846,7 +934,7 @@ static void *alloc_hugetlb(void *address){ map_address = (void *)shmat(shmid, address, SHM_RND); #ifdef OS_LINUX - my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); + my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0); #endif if (map_address != (void *)-1){ @@ -863,7 +951,7 @@ static void *alloc_hugetlb(void *address){ mha.mha_pagesize = HUGE_PAGESIZE; memcntl(NULL, 0, MC_HAT_ADVISE, (char *)&mha, 0, 0); - map_address = (BLASULONG)memalign(HUGE_PAGESIZE, BUFFER_SIZE); + map_address = (BLASULONG)memalign(HUGE_PAGESIZE, allocation_block_size); #endif #ifdef OS_WINDOWS @@ -887,7 +975,7 @@ static void *alloc_hugetlb(void 
*address){ } map_address = (void *)VirtualAlloc(address, - BUFFER_SIZE, + allocation_block_size, MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); @@ -898,11 +986,7 @@ static void *alloc_hugetlb(void *address){ #endif - if (map_address != (void *)-1){ - release_info[release_pos].address = map_address; - release_info[release_pos].func = alloc_hugetlb_free; - release_pos ++; - } + STORE_RELEASE_FUNC(map_address, alloc_hugetlb_free); return map_address; } @@ -914,13 +998,14 @@ static void *alloc_hugetlb(void *address){ static int hugetlb_pid = 0; -static void alloc_hugetlbfile_free(struct release_t *release){ +static void alloc_hugetlbfile_free(struct alloc_t *alloc_info){ - if (munmap(release -> address, BUFFER_SIZE)) { + int attr = alloc_info -> attr; + if (munmap(alloc_info, allocation_block_size)) { printf("OpenBLAS : HugeTLBfs unmap failed.\n"); } - if (close(release -> attr)) { + if (close(attr)) { printf("OpenBLAS : HugeTLBfs close failed.\n"); } } @@ -941,17 +1026,12 @@ static void *alloc_hugetlbfile(void *address){ unlink(filename); - map_address = mmap(address, BUFFER_SIZE, + map_address = mmap(address, allocation_block_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - if (map_address != (void *)-1) { - release_info[release_pos].address = map_address; - release_info[release_pos].attr = fd; - release_info[release_pos].func = alloc_hugetlbfile_free; - release_pos ++; - } + STORE_RELEASE_FUNC_WITH_ATTR(map_address, alloc_hugetlbfile_free, fd); return map_address; } @@ -964,19 +1044,11 @@ static BLASULONG base_address = 0UL; static BLASULONG base_address = BASE_ADDRESS; #endif -struct memory_t { - void *addr; - int used; -#ifndef __64BIT__ - char dummy[48]; +#if __STDC_VERSION__ >= 201112L +static _Atomic int memory_initialized = 0; #else - char dummy[40]; +static volatile int memory_initialized = 0; #endif -}; - -static struct memory_t THREAD_LOCAL memory[BUFFERS_PER_THREAD]; - -static int memory_initialized = 0; /* Memory allocation routine */ /* 
procpos ... indicates where it comes from */ @@ -984,6 +1056,20 @@ static int memory_initialized = 0; /* 1 : Level 2 functions */ /* 2 : Thread */ +static void blas_memory_init(){ +#if defined(SMP) && !defined(USE_OPENMP) + next_memory_table_pos = 0; +# if !defined(HAS_COMPILER_TLS) +# if defined(OS_WINDOWS) + local_storage_key = ::TlsAlloc(); +# else + pthread_key_create(&local_storage_key, NULL); +# endif /* defined(OS_WINDOWS) */ +# endif /* defined(HAS_COMPILER_TLS) */ +#endif /* defined(SMP) && !defined(USE_OPENMP) */ + memset(local_memory_table, 0, sizeof(local_memory_table)); +} + void *blas_memory_alloc(int procpos){ int position; @@ -1016,14 +1102,17 @@ void *blas_memory_alloc(int procpos){ NULL, }; void *(**func)(void *address); + struct alloc_t * alloc_info; + struct alloc_t ** alloc_table; - if (UNLIKELY_TO_BE_ZERO(memory_initialized)) { - + if (!LIKELY_ONE(memory_initialized)) { +#if defined(SMP) && !defined(USE_OPENMP) /* Only allow a single thread to initialize memory system */ LOCK_COMMAND(&alloc_lock); if (!memory_initialized) { - +#endif + blas_memory_init(); #ifdef DYNAMIC_ARCH gotoblas_dynamic_init(); #endif @@ -1044,8 +1133,10 @@ void *blas_memory_alloc(int procpos){ memory_initialized = 1; +#if defined(SMP) && !defined(USE_OPENMP) } UNLOCK_COMMAND(&alloc_lock); +#endif } #ifdef DEBUG @@ -1053,9 +1144,9 @@ void *blas_memory_alloc(int procpos){ #endif position = 0; - + alloc_table = get_memory_table(); do { - if (!memory[position].used) goto allocation; + if (!alloc_table[position] || !alloc_table[position]->used) goto allocation; position ++; } while (position < BUFFERS_PER_THREAD); @@ -1068,9 +1159,8 @@ void *blas_memory_alloc(int procpos){ printf(" Position -> %d\n", position); #endif - memory[position].used = 1; - - if (!memory[position].addr) { + alloc_info = alloc_table[position]; + if (!alloc_info) { do { #ifdef DEBUG printf("Allocation Start : %lx\n", base_address); @@ -1082,7 +1172,7 @@ void *blas_memory_alloc(int procpos){ while ((func 
!= NULL) && (map_address == (void *) -1)) { - map_address = (*func)((void *)base_address); + map_address = (*func)((void *)base_address); #ifdef ALLOC_DEVICEDRIVER if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { @@ -1110,23 +1200,24 @@ void *blas_memory_alloc(int procpos){ #endif if (((BLASLONG) map_address) == -1) base_address = 0UL; - if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE; + if (base_address) base_address += allocation_block_size + FIXED_PAGESIZE; } while ((BLASLONG)map_address == -1); - memory[position].addr = map_address; + alloc_table[position] = alloc_info = map_address; #ifdef DEBUG - printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position); + printf(" Mapping Succeeded. %p(%d)\n", (void *)alloc_info, position); #endif } #ifdef DEBUG - printf("Mapped : %p %3d\n\n", - (void *)memory[position].addr, position); + printf("Mapped : %p %3d\n\n", (void *)alloc_info, position); #endif - return (void *)memory[position].addr; + alloc_info->used = 1; + + return (void *)(((char *)alloc_info) + sizeof(struct alloc_t)); error: printf("OpenBLAS : Program will terminate because you tried to allocate too many memory regions.\n"); @@ -1134,25 +1225,19 @@ void *blas_memory_alloc(int procpos){ return NULL; } -void blas_memory_free(void *free_area){ - - int position; - +void blas_memory_free(void *buffer){ #ifdef DEBUG - printf("Unmapped Start : %p ...\n", free_area); + int position; + struct alloc_t ** alloc_table; #endif - - position = 0; - while ((position < BUFFERS_PER_THREAD) && (memory[position].addr != free_area)) - position++; - - if (memory[position].addr != free_area) goto error; + /* Since we passed an offset pointer to the caller, get back to the actual allocation */ + struct alloc_t *alloc_info = (void *)(((char *)buffer) - sizeof(struct alloc_t)); #ifdef DEBUG - printf(" Position : %d\n", position); + printf("Unmapped Start : %p ...\n", alloc_info); #endif - memory[position].used = 0; + 
alloc_info->used = 0; #ifdef DEBUG printf("Unmap Succeeded.\n\n"); @@ -1160,12 +1245,13 @@ void blas_memory_free(void *free_area){ return; - error: - printf("BLAS : Bad memory unallocation! : %4d %p\n", position, free_area); - #ifdef DEBUG - for (position = 0; position < BUFFERS_PER_THREAD; position++) - printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used); + alloc_table = get_memory_table(); + for (position = 0; position < BUFFERS_PER_THREAD; position++){ + if (alloc_table[position]) { + printf("%4ld %p : %d\n", position, alloc_table[position], alloc_table[position]->used); + } + } #endif return; } @@ -1182,14 +1268,20 @@ void blas_memory_free_nolock(void * map_address) { void blas_shutdown(void){ - int pos; + int pos, thread; #ifdef SMP BLASFUNC(blas_thread_shutdown)(); #endif - for (pos = 0; pos < release_pos; pos ++) { - release_info[pos].func(&release_info[pos]); + for (thread = 0; thread < MAX_ALLOCATING_THREADS; thread ++){ + for (pos = 0; pos < BUFFERS_PER_THREAD; pos ++){ + struct alloc_t *alloc_info = local_memory_table[thread][pos]; + if (alloc_info) { + alloc_info->release_func(alloc_info); + alloc_info = (void *)0; + } + } } #ifdef SEEK_ADDRESS @@ -1198,11 +1290,6 @@ void blas_shutdown(void){ base_address = BASE_ADDRESS; #endif - for (pos = 0; pos < BUFFERS_PER_THREAD; pos ++){ - memory[pos].addr = (void *)0; - memory[pos].used = 0; - } - return; } @@ -1226,7 +1313,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, size_t size; BLASULONG buffer; - size = BUFFER_SIZE - PAGESIZE; + size = allocation_block_size - PAGESIZE; buffer = (BLASULONG)sa + GEMM_OFFSET_A; #if defined(OS_LINUX) && !defined(NO_WARMUP) @@ -1247,7 +1334,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, UNLOCK_COMMAND(&init_lock); #endif - size = MIN((BUFFER_SIZE - PAGESIZE), L2_SIZE); + size = MIN((allocation_block_size - PAGESIZE), L2_SIZE); buffer = (BLASULONG)sa + GEMM_OFFSET_A; while 
(size > 0) { From 28c28ed275df2fd812bcdc75fdc04cdb6d9580b3 Mon Sep 17 00:00:00 2001 From: Craig Donner Date: Thu, 21 Jun 2018 11:13:57 +0100 Subject: [PATCH 70/86] Fix data races reported by TSAN. --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index ed20cf5cd5..7eff16ce39 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -543,9 +543,9 @@ static __inline struct alloc_t ** get_memory_table() { if (!local_memory_table_pos) { LOCK_COMMAND(&alloc_lock); local_memory_table_pos = next_memory_table_pos++; - UNLOCK_COMMAND(&alloc_lock); if (next_memory_table_pos > MAX_ALLOCATING_THREADS) printf("OpenBLAS : Program will terminate because you tried to start too many threads.\n"); + UNLOCK_COMMAND(&alloc_lock); # if !defined(HAS_COMPILER_TLS) # if defined(OS_WINDOWS) ::TlsSetValue(local_storage_key, (void*)local_memory_table_pos); From 2aa0a5804e381f89a53fdbef9bd51e8af23c8940 Mon Sep 17 00:00:00 2001 From: oon3m0oo Date: Thu, 21 Jun 2018 17:47:45 +0100 Subject: [PATCH 71/86] Use BLAS rather than CBLAS in test_fork.c (#1626) This is handy for people not using lapack. 
--- utest/CMakeLists.txt | 2 -- utest/Makefile | 2 -- utest/test_fork.c | 22 +++++++++++++--------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index 77a42d84f3..1b426afe7e 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -25,7 +25,6 @@ endif () # known to hang with the native Windows and Android threads # FIXME needs checking if this works on any of the other platforms -if (NOT NO_CBLAS) if (NOT USE_OPENMP) if (OS_CYGWIN_NT OR OS_LINUX) set(OpenBLAS_utest_src @@ -34,7 +33,6 @@ set(OpenBLAS_utest_src ) endif() endif() -endif() if (NOT NO_LAPACK) set(OpenBLAS_utest_src diff --git a/utest/Makefile b/utest/Makefile index e071540dc1..e40b3c6db5 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -17,13 +17,11 @@ endif #this does not work with OpenMP nor with native Windows or Android threads # FIXME TBD if this works on OSX, SunOS, POWER and zarch -ifneq ($(NO_CBLAS), 1) ifndef USE_OPENMP ifeq ($(OSNAME), $(filter $(OSNAME),Linux CYGWIN_NT)) OBJS += test_fork.o endif endif -endif all : run_test diff --git a/utest/test_fork.c b/utest/test_fork.c index 9e0244305b..9fc51287c4 100644 --- a/utest/test_fork.c +++ b/utest/test_fork.c @@ -13,9 +13,9 @@ modification, are permitted provided that the following conditions are notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the OpenBLAS project nor the names of - its contributors may be used to endorse or promote products - derived from this software without specific prior written + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" @@ -48,11 +48,13 @@ void* xmalloc(size_t n) } } -void check_dgemm(double *a, double *b, double *result, double *expected, int n) +void check_dgemm(double *a, double *b, double *result, double *expected, blasint n) { + char trans1 = 'T'; + char trans2 = 'N'; + double zerod = 0, oned = 1; int i; - cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, - 1.0, a, n, b, n, 0.0, result, n); + BLASFUNC(dgemm)(&trans1, &trans2, &n, &n, &n, &oned, a, &n, b, &n, &zerod, result, &n); for(i = 0; i < n * n; ++i) { ASSERT_DBL_NEAR_TOL(expected[i], result[i], DOUBLE_EPS); } @@ -60,7 +62,7 @@ void check_dgemm(double *a, double *b, double *result, double *expected, int n) CTEST(fork, safety) { - int n = 1000; + blasint n = 1000; int i; double *a, *b, *c, *d; @@ -84,8 +86,10 @@ CTEST(fork, safety) // Compute a DGEMM product in the parent process prior to forking to // ensure that the OpenBLAS thread pool is initialized. - cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, - 1.0, a, n, b, n, 0.0, c, n); + char trans1 = 'T'; + char trans2 = 'N'; + double zerod = 0, oned = 1; + BLASFUNC(dgemm)(&trans1, &trans2, &n, &n, &n, &oned, a, &n, b, &n, &zerod, c, &n); fork_pid = fork(); if (fork_pid == -1) { From 9cf22b7d9129e186a1ee941fbab8e45328c50b61 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 23 Jun 2018 13:27:30 +0200 Subject: [PATCH 72/86] Build cblas_iXamin interfaces --- interface/Makefile | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/interface/Makefile b/interface/Makefile index 9b2b93b835..20ec74e9ee 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -260,7 +260,7 @@ HPLOBJS = dgemm.$(SUFFIX) dtrsm.$(SUFFIX) \ idamax.$(SUFFIX) daxpy.$(SUFFIX) dcopy.$(SUFFIX) dscal.$(SUFFIX) CSBLAS1OBJS = \ - cblas_isamax.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \ + cblas_isamax.$(SUFFIX) cblas_isamin.$(SUFFIX) 
cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \ cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) @@ -277,7 +277,7 @@ CSBLAS3OBJS = \ cblas_sgeadd.$(SUFFIX) CDBLAS1OBJS = \ - cblas_idamax.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ + cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) @@ -294,7 +294,7 @@ CDBLAS3OBJS += \ cblas_dgeadd.$(SUFFIX) CCBLAS1OBJS = \ - cblas_icamax.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \ + cblas_icamax.$(SUFFIX) cblas_icamin.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \ cblas_ccopy.$(SUFFIX) \ cblas_cdotc.$(SUFFIX) cblas_cdotu.$(SUFFIX) \ cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \ @@ -320,7 +320,7 @@ CCBLAS3OBJS = \ CZBLAS1OBJS = \ - cblas_izamax.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ + cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ cblas_zcopy.$(SUFFIX) \ cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \ cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \ @@ -1359,6 +1359,18 @@ cblas_icamax.$(SUFFIX) cblas_icamax.$(PSUFFIX) : imax.c cblas_izamax.$(SUFFIX) cblas_izamax.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) +cblas_isamin.$(SUFFIX) cblas_isamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_idamin.$(SUFFIX) cblas_idamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_icamin.$(SUFFIX) cblas_icamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) 
-DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_izamin.$(SUFFIX) cblas_izamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + cblas_ismax.$(SUFFIX) cblas_ismax.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F) From eb71d61c7cb6640e66a5239d1113de8a8c1477df Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 23 Jun 2018 13:31:09 +0200 Subject: [PATCH 73/86] Expose CBLAS interface to BLAS extensions iXamin --- cblas.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cblas.h b/cblas.h index 89f78c1338..6461f42091 100644 --- a/cblas.h +++ b/cblas.h @@ -82,6 +82,11 @@ CBLAS_INDEX cblas_idamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE CBLAS_INDEX cblas_icamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_izamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); +CBLAS_INDEX cblas_isamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); +CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); +CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); +CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); + void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); From 0b2b83d9ed91e5e9234e41b1d41b0a7f21f5234c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 23 Jun 2018 19:41:32 +0200 
Subject: [PATCH 74/86] Add support for a user-defined list of dynamic targets --- Makefile.system | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 62ba0e4667..4712d95258 100644 --- a/Makefile.system +++ b/Makefile.system @@ -248,7 +248,7 @@ endif ifeq ($(OSNAME), Darwin) ifndef MACOSX_DEPLOYMENT_TARGET -export MACOSX_DEPLOYMENT_TARGET=10.6 +export MACOSX_DEPLOYMENT_TARGET=10.8 endif MD5SUM = md5 -r endif @@ -497,6 +497,14 @@ endif endif endif +ifdef DYNAMIC_LIST +override DYNAMIC_CORE = PRESCOTT $(DYNAMIC_LIST) +XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_PRESCOTT +XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore)) +CCOMMON_OPT += $(XCCOMMON_OPT) +#CCOMMON_OPT += -DDYNAMIC_LIST='$(DYNAMIC_LIST)' +endif + # If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty ifndef DYNAMIC_CORE override DYNAMIC_ARCH= From 1833a6707157abe966f39dcac90530c2461117d9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 23 Jun 2018 19:42:15 +0200 Subject: [PATCH 75/86] Add support for a user-defined list of dynamic targets --- driver/others/dynamic.c | 139 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 139 insertions(+) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 4271c0a0d7..d5ed6d1645 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -49,6 +49,127 @@ #define EXTERN #endif +#ifdef DYNAMIC_LIST +extern gotoblas_t gotoblas_PRESCOTT; + +#ifdef DYN_ATHLON +extern gotoblas_t gotoblas_ATHLON; +#else +#define gotoblas_ATHLON gotoblas_PRESCOTT +#endif +#ifdef DYN_KATMAI +extern gotoblas_t gotoblas_KATMAI; +#else +#define gotoblas_KATMAI gotoblas_PRESCOTT +#endif +#ifdef DYN_BANIAS +extern gotoblas_t gotoblas_BANIAS; +#else +#define gotoblas_BANIAS gotoblas_PRESCOTT +#endif +#ifdef DYN_COPPERMINE +extern gotoblas_t gotoblas_COPPERMINE; +#else +#define gotoblas_COPPERMINE gotoblas_PRESCOTT +#endif +#ifdef DYN_NORTHWOOD +extern gotoblas_t 
gotoblas_NORTHWOOD; +#else +#define gotoblas_NORTHWOOD gotoblas_PRESCOTT +#endif +#ifdef DYN_CORE2 +extern gotoblas_t gotoblas_CORE2; +#else +#define gotoblas_CORE2 gotoblas_PRESCOTT +#endif +#ifdef DYN_NEHALEM +extern gotoblas_t gotoblas_NEHALEM; +#else +#define gotoblas_NEHALEM gotoblas_PRESCOTT +#endif +#ifdef DYN_BARCELONA +extern gotoblas_t gotoblas_BARCELONA; +#else +#define gotoblas_BARCELONA gotoblas_PRESCOTT +#endif +#ifdef DYN_ATOM +extern gotoblas_t gotoblas_ATOM; +#else +#define gotoblas_ATOM gotoblas_PRESCOTT +#endif +#ifdef DYN_NANO +extern gotoblas_t gotoblas_NANO; +#else +#define gotoblas_NANO gotoblas_PRESCOTT +#endif +#ifdef DYN_PENRYN +extern gotoblas_t gotoblas_PENRYN; +#else +#define gotoblas_PENRYN gotoblas_PRESCOTT +#endif +#ifdef DYN_DUNNINGTON +extern gotoblas_t gotoblas_DUNNINGTON; +#else +#define gotoblas_DUNNINGTON gotoblas_PRESCOTT +#endif +#ifdef DYN_OPTERON +extern gotoblas_t gotoblas_OPTERON; +#else +#define gotoblas_OPTERON gotoblas_PRESCOTT +#endif +#ifdef DYN_OPTERON_SSE3 +extern gotoblas_t gotoblas_OPTERON_SSE3; +#else +#define gotoblas_OPTERON_SSE3 gotoblas_PRESCOTT +#endif +#ifdef DYN_BOBCAT +extern gotoblas_t gotoblas_BOBCAT; +#else +#define gotoblas_BOBCAT gotoblas_PRESCOTT +#endif +#ifdef DYN_SANDYBRIDGE +extern gotoblas_t gotoblas_SANDYBRIDGE; +#else +#define gotoblas_SANDYBRIDGE gotoblas_PRESCOTT +#endif +#ifdef DYN_BULLDOZER +extern gotoblas_t gotoblas_BULLDOZER; +#else +#define gotoblas_BULLDOZER gotoblas_PRESCOTT +#endif +#ifdef DYN_PILEDRIVER +extern gotoblas_t gotoblas_PILEDRIVER; +#else +#define gotoblas_PILEDRIVER gotoblas_PRESCOTT +#endif +#ifdef DYN_STEAMROLLER +extern gotoblas_t gotoblas_STEAMROLLER; +#else +#define gotoblas_STEAMROLLER gotoblas_PRESCOTT +#endif +#ifdef DYN_EXCAVATOR +extern gotoblas_t gotoblas_EXCAVATOR; +#else +#define gotoblas_EXCAVATOR gotoblas_PRESCOTT +#endif +#ifdef DYN_HASWELL +extern gotoblas_t gotoblas_HASWELL; +#else +#define gotoblas_HASWELL gotoblas_PRESCOTT +#endif +#ifdef DYN_ZEN 
+extern gotoblas_t gotoblas_ZEN; +#else +#define gotoblas_ZEN gotoblas_PRESCOTT +#endif +#ifdef DYN_SKYLAKEX +extern gotoblas_t gotoblas_SKYLAKEX; +#else +#define gotoblas_SKYLAKEX gotoblas_PRESCOTT +#endif + + +#else // not DYNAMIC_LIST EXTERN gotoblas_t gotoblas_KATMAI; EXTERN gotoblas_t gotoblas_COPPERMINE; EXTERN gotoblas_t gotoblas_NORTHWOOD; @@ -108,6 +229,7 @@ extern gotoblas_t gotoblas_SKYLAKEX; #define gotoblas_ZEN gotoblas_BARCELONA #endif +#endif // DYNAMIC_LIST #define VENDOR_INTEL 1 #define VENDOR_AMD 2 @@ -338,6 +460,23 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; } return NULL; + case 6: + if (model == 6) { + // Cannon Lake +#ifndef NO_AVX512 + return &gotoblas_SKYLAKEX; +#else + if(support_avx()) +#ifndef NO_AVX2 + return &gotoblas_HASWELL; +#else + return &gotoblas_SANDYBRIDGE; +#endif + else + return &gotoblas_NEHALEM; +#endif + } + return NULL; case 9: case 8: if (model == 14 ) { // Kaby Lake From 01440685379f11f158c5f612cf15fc279eb16c88 Mon Sep 17 00:00:00 2001 From: Craig Donner Date: Mon, 25 Jun 2018 13:53:11 +0100 Subject: [PATCH 76/86] Rewrite &= -> = and simplify the initial blocking phase.
--- driver/level3/level3_thread.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index aeb5e6ed4e..ee3e3b9a93 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -344,12 +344,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) { - /* Make sure if no one is using workspace */ - START_RPCC(); - for (i = 0; i < args -> nthreads; i++) - while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;}; - STOP_RPCC(waiting1); - #if defined(FUSED_GEMM) && !defined(TIMING) /* Fused operation to copy region of B into workspace and apply kernel */ @@ -387,10 +381,15 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, } #endif - /* Set flag so other threads can access local region of B */ - for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) + for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) { + /* Make sure if no one is using workspace */ + START_RPCC(); + while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;}; + STOP_RPCC(waiting1); + /* Set flag so other threads can access local region of B */ job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; - WMB; + WMB; + } } /* Get regions of B from other threads and apply kernel */ @@ -426,13 +425,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Clear synchronization flag if this thread is done with other region of B */ if (m_to - m_from == min_i) { - job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; WMB; } } } while (current != mypos); - /* Iterate through steps of m + /* Iterate through 
steps of m * Note: First step has already been finished */ for(is = m_from + min_i; is < m_to; is += min_i){ min_i = m_to - is; @@ -462,14 +461,14 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], c, ldc, is, js); STOP_RPCC(kernel); - + #ifdef TIMING ops += 2 * min_i * MIN(range_n[current + 1] - js, div_n) * min_l; #endif - + /* Clear synchronization flag if this thread is done with region of B */ if (is + min_i >= m_to) { - job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; WMB; } } From 750162a05f8c6d0d9530955f78e8e6bb138d8df9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 25 Jun 2018 21:02:31 +0200 Subject: [PATCH 77/86] Try gradual fallback for cores not in the dynamic core list --- driver/others/dynamic.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index d5ed6d1645..13794207c7 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -89,11 +89,15 @@ extern gotoblas_t gotoblas_NEHALEM; #endif #ifdef DYN_BARCELONA extern gotoblas_t gotoblas_BARCELONA; +#elif defined(DYN_NEHALEM) +#define gotoblas_BARCELONA gotoblas_NEHALEM #else #define gotoblas_BARCELONA gotoblas_PRESCOTT #endif #ifdef DYN_ATOM extern gotoblas_t gotoblas_ATOM; +#elif defined(DYN_NEHALEM) +#define gotoblas_ATOM gotoblas_NEHALEM #else #define gotoblas_ATOM gotoblas_PRESCOTT #endif @@ -124,46 +128,82 @@ extern gotoblas_t gotoblas_OPTERON_SSE3; #endif #ifdef DYN_BOBCAT extern gotoblas_t gotoblas_BOBCAT; +#elif defined(DYN_NEHALEM) +#define gotoblas_BOBCAT gotoblas_NEHALEM #else #define gotoblas_BOBCAT gotoblas_PRESCOTT #endif #ifdef DYN_SANDYBRIDGE extern gotoblas_t gotoblas_SANDYBRIDGE; +#elif defined(DYN_NEHALEM) +#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM #else #define gotoblas_SANDYBRIDGE
gotoblas_PRESCOTT #endif #ifdef DYN_BULLDOZER extern gotoblas_t gotoblas_BULLDOZER; +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_BULLDOZER gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_BULLDOZER gotoblas_NEHALEM #else #define gotoblas_BULLDOZER gotoblas_PRESCOTT #endif #ifdef DYN_PILEDRIVER extern gotoblas_t gotoblas_PILEDRIVER; +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_PILEDRIVER gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_PILEDRIVER gotoblas_NEHALEM #else #define gotoblas_PILEDRIVER gotoblas_PRESCOTT #endif #ifdef DYN_STEAMROLLER extern gotoblas_t gotoblas_STEAMROLLER; +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_STEAMROLLER gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_STEAMROLLER gotoblas_NEHALEM #else #define gotoblas_STEAMROLLER gotoblas_PRESCOTT #endif #ifdef DYN_EXCAVATOR extern gotoblas_t gotoblas_EXCAVATOR; +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_EXCAVATOR gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_EXCAVATOR gotoblas_NEHALEM #else #define gotoblas_EXCAVATOR gotoblas_PRESCOTT #endif #ifdef DYN_HASWELL extern gotoblas_t gotoblas_HASWELL; +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_HASWELL gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_HASWELL gotoblas_NEHALEM #else #define gotoblas_HASWELL gotoblas_PRESCOTT #endif #ifdef DYN_ZEN extern gotoblas_t gotoblas_ZEN; +#elif defined(DYN_HASWELL) +#define gotoblas_ZEN gotoblas_HASWELL +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_ZEN gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_ZEN gotoblas_NEHALEM #else #define gotoblas_ZEN gotoblas_PRESCOTT #endif #ifdef DYN_SKYLAKEX extern gotoblas_t gotoblas_SKYLAKEX; +#elif defined(DYN_HASWELL) +#define gotoblas_SKYLAKEX gotoblas_HASWELL +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_SKYLAKEX gotoblas_NEHALEM #else #define 
gotoblas_SKYLAKEX gotoblas_PRESCOTT #endif From 092175cfec7d49d40904aeff1d8121acb4ed1452 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 26 Jun 2018 08:09:52 +0200 Subject: [PATCH 78/86] Revert changes to NOFORTRAN handling from 952541e --- Makefile | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/Makefile b/Makefile index 49dab6484a..56b4426f8a 100644 --- a/Makefile +++ b/Makefile @@ -21,17 +21,6 @@ ifeq ($(BUILD_RELAPACK), 1) RELA = re_lapack endif -ifeq ($(NO_FORTRAN), 1) -define NOFORTRAN -1 -endef -define NO_LAPACK -1 -endef -export NOFORTRAN -export NO_LAPACK -endif - LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench @@ -58,7 +47,7 @@ endif endif @echo " C compiler ... $(C_COMPILER) (command line : $(CC))" -ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) +ifndef NOFORTRAN @echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))" endif ifneq ($(OSNAME), AIX) @@ -119,7 +108,7 @@ endif endif tests : -ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) +ifndef NOFORTRAN touch $(LIBNAME) ifndef NO_FBLAS $(MAKE) -C test all @@ -221,7 +210,7 @@ netlib : else netlib : lapack_prebuild -ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) +ifndef NOFORTRAN @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib @$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib endif @@ -242,10 +231,7 @@ prof_lapack : lapack_prebuild @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof lapack_prebuild : - $(info filter value of NOFORTRAN is:) - $(info x$(filter-out $(NOFORTRAN), 1 2)x) - -ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2)) +ifndef NOFORTRAN -@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc -@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc @@ -288,21 +274,21 @@ endif endif large.tgz : -ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) 
+ifndef NOFORTRAN if [ ! -a $< ]; then -wget http://www.netlib.org/lapack/timing/large.tgz; fi endif timing.tgz : -ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) +ifndef NOFORTRAN if [ ! -a $< ]; then -wget http://www.netlib.org/lapack/timing/timing.tgz; fi endif lapack-timing : large.tgz timing.tgz -ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2)) +ifndef NOFORTRAN (cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING) (cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz ) $(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING From e322a951febc933e0bae192dcb117e447df24050 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 26 Jun 2018 20:44:13 +0200 Subject: [PATCH 79/86] Remove premature exit for INC_X or INC_Y zero --- kernel/arm/cdot_vfp.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/arm/cdot_vfp.S b/kernel/arm/cdot_vfp.S index e5a6e4d35a..fd86a37b0d 100644 --- a/kernel/arm/cdot_vfp.S +++ b/kernel/arm/cdot_vfp.S @@ -215,11 +215,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp N, #0 ble cdot_kernel_L999 - cmp INC_X, #0 - beq cdot_kernel_L999 +# cmp INC_X, #0 +# beq cdot_kernel_L999 - cmp INC_Y, #0 - beq cdot_kernel_L999 +# cmp INC_Y, #0 +# beq cdot_kernel_L999 cmp INC_X, #1 bne cdot_kernel_S_BEGIN From 545b82efd30e4e0a33cb57bb7c6fb12601a6d3d9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 26 Jun 2018 20:45:00 +0200 Subject: [PATCH 80/86] Remove premature exit for INC_X or INC_Y zero --- kernel/arm/ddot_vfp.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/arm/ddot_vfp.S b/kernel/arm/ddot_vfp.S index fb294d8b46..cc2e485b73 100644 --- a/kernel/arm/ddot_vfp.S +++ b/kernel/arm/ddot_vfp.S @@ -164,11 +164,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
cmp N, #0 ble ddot_kernel_L999 - cmp INC_X, #0 - beq ddot_kernel_L999 +# cmp INC_X, #0 +# beq ddot_kernel_L999 - cmp INC_Y, #0 - beq ddot_kernel_L999 +# cmp INC_Y, #0 +# beq ddot_kernel_L999 cmp INC_X, #1 bne ddot_kernel_S_BEGIN From e344db269b5b45d08ff4ce60801de0ece0965866 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 26 Jun 2018 20:45:57 +0200 Subject: [PATCH 81/86] Remove premature exit for INC_X or INC_Y zero --- kernel/arm/sdot_vfp.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/arm/sdot_vfp.S b/kernel/arm/sdot_vfp.S index 5f4f424bfc..544846258d 100644 --- a/kernel/arm/sdot_vfp.S +++ b/kernel/arm/sdot_vfp.S @@ -253,11 +253,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp N, #0 ble sdot_kernel_L999 - cmp INC_X, #0 - beq sdot_kernel_L999 +# cmp INC_X, #0 +# beq sdot_kernel_L999 - cmp INC_Y, #0 - beq sdot_kernel_L999 +# cmp INC_Y, #0 +# beq sdot_kernel_L999 cmp INC_X, #1 bne sdot_kernel_S_BEGIN From b83e4c60c73e80269e84b46590005d622d05e6d1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 26 Jun 2018 20:46:42 +0200 Subject: [PATCH 82/86] Remove premature exit for INC_X or INC_Y zero --- kernel/arm/zdot_vfp.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/arm/zdot_vfp.S b/kernel/arm/zdot_vfp.S index 43f2c0c0bf..c0cd92d3cc 100644 --- a/kernel/arm/zdot_vfp.S +++ b/kernel/arm/zdot_vfp.S @@ -218,11 +218,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
cmp N, #0 ble zdot_kernel_L999 - cmp INC_X, #0 - beq zdot_kernel_L999 +# cmp INC_X, #0 +# beq zdot_kernel_L999 - cmp INC_Y, #0 - beq zdot_kernel_L999 +# cmp INC_Y, #0 +# beq zdot_kernel_L999 cmp INC_X, #1 bne zdot_kernel_S_BEGIN From f0a8dc2eec86a20a1486034a999c36709e699266 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Jun 2018 11:34:48 +0200 Subject: [PATCH 83/86] Disable the AVX512 DGEMM kernel for now due to #1643 --- kernel/x86_64/KERNEL.SKYLAKEX | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index c273ff8cd1..2deb41b085 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -4,16 +4,16 @@ SGEMMKERNEL = sgemm_kernel_16x4_skylakex.S DTRMMKERNEL = ../generic/trmmkernel_16x2.c -DGEMMKERNEL = dgemm_kernel_16x2_skylakex.S -DGEMMINCOPY = ../generic/gemm_ncopy_16.c -DGEMMITCOPY = ../generic/gemm_tcopy_16.c -DGEMMONCOPY = ../generic/gemm_ncopy_2.c -DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) -DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +#DGEMMKERNEL = dgemm_kernel_16x2_skylakex.S +#DGEMMINCOPY = ../generic/gemm_ncopy_16.c +#DGEMMITCOPY = ../generic/gemm_tcopy_16.c +#DGEMMONCOPY = ../generic/gemm_ncopy_2.c +#DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +#DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +#DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +#DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +#DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) SGEMM_BETA = ../generic/gemm_beta.c -DGEMM_BETA = ../generic/gemm_beta.c \ No newline at end of file +DGEMM_BETA = ../generic/gemm_beta.c From 6e54b0a027437303e425382c7e5611c1e860632f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Jun 2018 17:31:06 +0200 Subject: [PATCH 84/86] Disable the 16x2 DTRMM kernel on 
SkylakeX as well --- kernel/x86_64/KERNEL.SKYLAKEX | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 2deb41b085..1256f4c3c8 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -3,7 +3,7 @@ include $(KERNELDIR)/KERNEL.HASWELL SGEMMKERNEL = sgemm_kernel_16x4_skylakex.S -DTRMMKERNEL = ../generic/trmmkernel_16x2.c +#DTRMMKERNEL = ../generic/trmmkernel_16x2.c #DGEMMKERNEL = dgemm_kernel_16x2_skylakex.S #DGEMMINCOPY = ../generic/gemm_ncopy_16.c #DGEMMITCOPY = ../generic/gemm_tcopy_16.c From f5243e8e1fc585147e8b6e1553232f5f868eff1d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Jun 2018 23:47:44 +0200 Subject: [PATCH 85/86] Add compiler option to avx512 test and hide test output --- c_check | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/c_check b/c_check index cc64c16c62..3831d7aa33 100644 --- a/c_check +++ b/c_check @@ -205,8 +205,8 @@ $no_avx512= 0; if (($architecture eq "x86") || ($architecture eq "x86_64")) { $code = '"vbroadcastss -4 * 4(%rsi), %zmm2"'; print $tmpf "int main(void){ __asm__ volatile($code); }\n"; - $args = " -o $tmpf.o -x c $tmpf"; - my @cmd = ("$compiler_name $args"); + $args = " -march=skylake-avx512 -o $tmpf.o -x c $tmpf"; + my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null"); system(@cmd) == 0; if ($? 
!= 0) { $no_avx512 = 1; From 4e9c34018e06615ea2c0c64551691e297682e7a3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Jun 2018 23:57:50 +0200 Subject: [PATCH 86/86] Fix apparent off-by-one error in calculation of MAX_ALLOCATING_THREADS fixes #1641 --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 7eff16ce39..98bcfb216a 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -497,7 +497,7 @@ static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t); #if defined(SMP) && !defined(USE_OPENMP) /* This is the number of threads than can be spawned by the server, which is the server plus the number of threads in the thread pool */ -# define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER +# define MAX_ALLOCATING_THREADS (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER + 1) /* parenthesized so the +1 stays bound to the product at every use site */ static int next_memory_table_pos = 0; # if defined(HAS_COMPILER_TLS) /* Use compiler generated thread-local-storage */