diff --git a/config/mic/bli_config.h b/config/mic/bli_config.h index 688ed75ec0ac26907636b2f088307a4997e4b4ef..e0138c3a68d93da04c575053ac969b6ae587ea7b 100644 --- a/config/mic/bli_config.h +++ b/config/mic/bli_config.h @@ -97,7 +97,7 @@ // It is sometimes useful to define the various memory alignments in terms // of some other characteristics of the system, such as the cache line size // and the page size. -#define BLIS_CACHE_LINE_SIZE 64 +#define BLIS_CACHE_LINE_SIZE 256 #define BLIS_PAGE_SIZE 4096 // Alignment size needed by the instruction set for aligned SIMD/vector diff --git a/config/mic/bli_kernel.h b/config/mic/bli_kernel.h index 7114adf52de9ab9c7a6d534a39687d400cf81568..c2a8637edeb0464efb31abdcede8f807b0c62ccb 100644 --- a/config/mic/bli_kernel.h +++ b/config/mic/bli_kernel.h @@ -54,35 +54,42 @@ // (b) NR (for triangular operations such as trmm and trsm). // -#define BLIS_DEFAULT_MC_S 256 -#define BLIS_DEFAULT_KC_S 256 -#define BLIS_DEFAULT_NC_S 8192 +#define BLIS_DEFAULT_MC_S 240 +#define BLIS_DEFAULT_KC_S 240 +#define BLIS_DEFAULT_NC_S 9600 #define BLIS_DEFAULT_MC_D 120 #define BLIS_DEFAULT_KC_D 240 #define BLIS_DEFAULT_NC_D 14400 -#define BLIS_DEFAULT_MC_C 128 -#define BLIS_DEFAULT_KC_C 256 -#define BLIS_DEFAULT_NC_C 4096 +#define BLIS_DEFAULT_4M_MC_C BLIS_DEFAULT_MC_S +#define BLIS_DEFAULT_4M_KC_C BLIS_DEFAULT_KC_S -#define BLIS_DEFAULT_MC_Z 64 -#define BLIS_DEFAULT_KC_Z 256 -#define BLIS_DEFAULT_NC_Z 2048 +#define BLIS_DEFAULT_3M_MC_C BLIS_DEFAULT_MC_S +#define BLIS_DEFAULT_3M_KC_C BLIS_DEFAULT_KC_S +/* +#define BLIS_DEFAULT_MC_C 120 +#define BLIS_DEFAULT_KC_C 240 +#define BLIS_DEFAULT_NC_C 9600 + +#define BLIS_DEFAULT_MC_Z 120 +#define BLIS_DEFAULT_KC_Z 240 +#define BLIS_DEFAULT_NC_Z 9600 +*/ // -- Register blocksizes -- -#define BLIS_DEFAULT_MR_S 8 -#define BLIS_DEFAULT_NR_S 4 +#define BLIS_DEFAULT_MR_S 30 +#define BLIS_DEFAULT_NR_S 16 #define BLIS_DEFAULT_MR_D 30 #define BLIS_DEFAULT_NR_D 8 -#define BLIS_DEFAULT_MR_C 8 -#define BLIS_DEFAULT_NR_C 4 
+//#define BLIS_DEFAULT_MR_C 8 +//#define BLIS_DEFAULT_NR_C 4 -#define BLIS_DEFAULT_MR_Z 8 -#define BLIS_DEFAULT_NR_Z 4 +//#define BLIS_DEFAULT_MR_Z 8 +//#define BLIS_DEFAULT_NR_Z 4 // NOTE: If the micro-kernel, which is typically unrolled to a factor // of f, handles leftover edge cases (ie: when k % f > 0) then these @@ -123,8 +130,8 @@ // leading dimensions used within the packed micro-panels are equal to // or greater than their corresponding register blocksizes above. -//#define BLIS_EXTEND_MR_S 0 -//#define BLIS_EXTEND_NR_S 0 +#define BLIS_EXTEND_MR_S 2 +#define BLIS_EXTEND_NR_S 0 #define BLIS_EXTEND_MR_D 2 #define BLIS_EXTEND_NR_D 0 @@ -153,12 +160,11 @@ // -- gemm -- #define BLIS_DGEMM_UKERNEL bli_dgemm_opt_30x8 +#define BLIS_SGEMM_UKERNEL bli_sgemm_opt_30x16 // -- trsm-related -- - - // -- LEVEL-1M KERNEL DEFINITIONS ---------------------------------------------- // -- packm -- diff --git a/kernels/mic/3/bli_gemm_opt_30x8.c b/kernels/mic/3/bli_dgemm_opt_30x8.c similarity index 97% rename from kernels/mic/3/bli_gemm_opt_30x8.c rename to kernels/mic/3/bli_dgemm_opt_30x8.c index 9a08c14b26668602fb4353f080d3b4b6d197828b..d0380883b182bf9211c7b82c1a7cb746b4069af6 100644 --- a/kernels/mic/3/bli_gemm_opt_30x8.c +++ b/kernels/mic/3/bli_dgemm_opt_30x8.c @@ -40,25 +40,6 @@ #define B_L1_PREFETCH_DIST 2 #define L2_PREFETCH_DIST 16 // Must be greater than 10, because of the way the loop is constructed. -void bli_sgemm_opt_30x8( - dim_t k, - float* restrict alpha, - float* restrict a, - float* restrict b, - float* restrict beta, - float* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* data - ) -{ - BLIS_SGEMM_UKERNEL_REF( k, - alpha, - a, - b, - beta, - c, rs_c, cs_c, - data ); -} - //Alternate code path uused if C is not row-major #define UPDATE_C_ROW_SCATTERED(REG1, NUM, BASE_DEST) \ { \ @@ -268,7 +249,7 @@ void bli_sgemm_opt_30x8( } //This is an array used for the scattter/gather instructions. 
-int offsets[16] __attribute__((aligned(0x1000))) = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; +extern int offsets[16]; //#define MONITORS diff --git a/kernels/mic/3/bli_sgemm_opt_30x16.c b/kernels/mic/3/bli_sgemm_opt_30x16.c new file mode 100644 index 0000000000000000000000000000000000000000..88614177717de00aa910d8bed220c56e8bafc62a --- /dev/null +++ b/kernels/mic/3/bli_sgemm_opt_30x16.c @@ -0,0 +1,575 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2012, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY
+   OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+   OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <assert.h>
+
+
+#define A_L1_PREFETCH_DIST 4 // L1 prefetch distance for A; the ONE_ITER_* macros multiply it by 256 bytes.
+#define B_L1_PREFETCH_DIST 2 // L1 prefetch distance for B; the ONE_ITER_* macros multiply it by 16*4 bytes.
+#define L2_PREFETCH_DIST 16 // Must be greater than 10, because of the way the loop is constructed (the kernel runs L2_PREFETCH_DIST-10 iterations in LOOPMAIN2).
+
+//Alternate code path used if C is not row-major: updates one 16-element row of C at BASE_DEST via gather/scatter, then advances BASE_DEST by r11. Expects r12 -> alpha, r13 -> beta, zmm30 = per-lane element offsets, ebx = lane mask; clobbers zmm31 and k3.
+#define UPDATE_C_ROW_SCATTERED(REG1, NUM, BASE_DEST) \
+{ \
+        __asm kmov k3, ebx /*load the lane mask from ebx*/ \
+        __asm GATHER##NUM: /*NUM keeps the label unique for each expansion*/ \
+            __asm vgatherdps zmm31{k3}, [BASE_DEST + zmm30 * 4] /*zmm31 = current row of C*/ \
+            __asm jknzd k3, GATHER##NUM /*repeat while k3 still has bits set*/ \
+        \
+        __asm vmulps REG1, REG1, 0[r12]{1to16} /*scale by alpha*/ \
+        __asm vfmadd132ps zmm31, REG1, 0[r13]{1to16} /*scale by beta, add in result*/\
+        __asm kmov k3, ebx /*k3 is zero once the gather loop exits; reload it for the scatter*/ \
+        \
+        __asm SCATTER##NUM: \
+            __asm vscatterdps [BASE_DEST + zmm30 * 4]{k3}, zmm31 /*write the updated row back*/ \
+            __asm jknzd k3, SCATTER##NUM /*repeat while k3 still has bits set*/ \
+        __asm add BASE_DEST, r11 /*step BASE_DEST by r11 (the kernel sets r11 = rs_c*4 bytes)*/ \
+}
+
+
+//One iteration of the k_r loop.
+//Each iteration, we prefetch A into L1 and into L2 (and B likewise); the fma results for rows 0-29 accumulate in zmm0-zmm29. COUNTER is decremented and compared with 0 so the caller can branch on the flags; C_ADDR is not used by this variant.
+#define ONE_ITER_MAIN_LOOP(C_ADDR, COUNTER) \
+{\
+        __asm vbroadcastf32x4   zmm30, 0[r15] /*zmm30 = first 4 floats of A, used below via the {aaaa}..{dddd} swizzles*/ \
+        __asm vmovaps zmm31, 0[rbx] /*zmm31 = 16 floats of B*/ \
+        \
+        __asm vfmadd231ps zmm0, zmm31, zmm30{aaaa} \
+        __asm vfmadd231ps zmm4, zmm31,  4*4[r15]{1to16} \
+        __asm vprefetch0 A_L1_PREFETCH_DIST*256[r15] /*L1 prefetch of A. NOTE(review): A advances 128 bytes per iteration (r12 = 32*4 in the kernel) while these offsets step in 256-byte units over 4 lines - confirm this is intended*/ \
+        __asm vfmadd231ps zmm5, zmm31,  5*4[r15]{1to16} \
+        __asm vprefetch0 A_L1_PREFETCH_DIST*256+64[r15] \
+        __asm vfmadd231ps zmm6, zmm31,  6*4[r15]{1to16} \
+        __asm vprefetch0 A_L1_PREFETCH_DIST*256+128[r15]\
+        __asm vfmadd231ps zmm7, zmm31,  7*4[r15]{1to16} \
+        __asm vprefetch0 A_L1_PREFETCH_DIST*256+192[r15]\
+        __asm vfmadd231ps zmm8, zmm31,  8*4[r15]{1to16} \
+        \
+        __asm vprefetch1 0[r15 + r14] /*L2 prefetch of A at byte offset r14*/ \
+        __asm vfmadd231ps zmm9, zmm31,  9*4[r15]{1to16} \
+        __asm vfmadd231ps zmm1, zmm31, zmm30{bbbb} \
+        __asm vfmadd231ps zmm2, zmm31, zmm30{cccc} \
+        __asm vfmadd231ps zmm3, zmm31, zmm30{dddd} \
+        __asm vfmadd231ps zmm10, zmm31, 10*4[r15]{1to16} \
+        \
+        __asm vprefetch1 64[r15 + r14] \
+        __asm vfmadd231ps zmm11, zmm31, 11*4[r15]{1to16} \
+        __asm vfmadd231ps zmm12, zmm31, 12*4[r15]{1to16} \
+        __asm vfmadd231ps zmm13, zmm31, 13*4[r15]{1to16} \
+        __asm vfmadd231ps zmm14, zmm31, 14*4[r15]{1to16} \
+        __asm vfmadd231ps zmm15, zmm31, 15*4[r15]{1to16} \
+        \
+        __asm vprefetch1 2*64[r15 + r14] \
+        __asm vfmadd231ps zmm16, zmm31, 16*4[r15]{1to16} \
+        __asm vfmadd231ps zmm17, zmm31, 17*4[r15]{1to16} \
+        __asm vfmadd231ps zmm18, zmm31, 18*4[r15]{1to16} \
+        __asm vfmadd231ps zmm19, zmm31, 19*4[r15]{1to16} \
+        __asm vfmadd231ps zmm20, zmm31, 20*4[r15]{1to16} \
+        \
+        __asm vprefetch1 3*64[r15 + r14] \
+        __asm vfmadd231ps zmm21, zmm31, 21*4[r15]{1to16} \
+        __asm add r15, r12 /*advance A; the remaining rows use negative offsets from the new r15*/ \
+        __asm vfmadd231ps zmm22, zmm31, -10*4[r15]{1to16}\
+        __asm vfmadd231ps zmm23, zmm31, -9*4[r15]{1to16} \
+        __asm vfmadd231ps zmm24, zmm31, -8*4[r15]{1to16} \
+        __asm dec COUNTER \
+        __asm vfmadd231ps zmm25, zmm31, -7*4[r15]{1to16} \
+        \
+        \
+        __asm vprefetch1 0[rbx + r13] /*L2 prefetch of B at byte offset r13*/ \
+        __asm vfmadd231ps zmm26, zmm31, -6*4[r15]{1to16} \
+        __asm vprefetch0 B_L1_PREFETCH_DIST*16*4[rbx] /*L1 prefetch of B*/ \
+        __asm vfmadd231ps zmm27, zmm31, -5*4[r15]{1to16} \
+        __asm add rbx, r9 /*advance B*/ \
+        __asm vfmadd231ps zmm28, zmm31, -4*4[r15]{1to16} \
+        __asm cmp COUNTER, 0 /*sets the flags tested by the jne that follows the macro*/ \
+        __asm vfmadd231ps zmm29, zmm31, -3*4[r15]{1to16} \
+}
+
+//One iteration of the k_r loop; the loop counter is hard-wired to r8.
+//Same as ONE_ITER_MAIN_LOOP, but additionally, we prefetch one line of C into the L2 cache and advance C_ADDR by r11.
+//Current placement of this prefetch instruction is somewhat arbitrary.
+#define ONE_ITER_PC_L2(C_ADDR) \
+{\
+        __asm vbroadcastf32x4   zmm30, 0[r15] \
+        __asm vmovaps zmm31, 0[rbx] \
+        \
+        __asm vfmadd231ps zmm0, zmm31, zmm30{aaaa} \
+        __asm vfmadd231ps zmm4, zmm31,  4*4[r15]{1to16} \
+        __asm vprefetch0 A_L1_PREFETCH_DIST*256[r15] \
+        __asm vfmadd231ps zmm5, zmm31,  5*4[r15]{1to16} \
+        __asm vprefetch0 A_L1_PREFETCH_DIST*256+64[r15] \
+        __asm vfmadd231ps zmm6, zmm31,  6*4[r15]{1to16} \
+        __asm vprefetch0 A_L1_PREFETCH_DIST*256+128[r15]\
+        __asm vfmadd231ps zmm7, zmm31,  7*4[r15]{1to16} \
+        __asm vprefetch0 A_L1_PREFETCH_DIST*256+192[r15]\
+        __asm vfmadd231ps zmm8, zmm31,  8*4[r15]{1to16} \
+        \
+        __asm vprefetch1 0[r15 + r14] \
+        __asm vfmadd231ps zmm9, zmm31,  9*4[r15]{1to16} \
+        __asm vfmadd231ps zmm1, zmm31, zmm30{bbbb} \
+        __asm vfmadd231ps zmm2, zmm31, zmm30{cccc} \
+        __asm vfmadd231ps zmm3, zmm31, zmm30{dddd} \
+        __asm vfmadd231ps zmm10, zmm31, 10*4[r15]{1to16} \
+        \
+        __asm vprefetch1 64[r15 + r14] \
+        __asm vfmadd231ps zmm11, zmm31, 11*4[r15]{1to16} \
+        __asm vprefetch1 0[C_ADDR] /*the extra L2 prefetch of C*/ \
+        __asm vfmadd231ps zmm12, zmm31, 12*4[r15]{1to16} \
+        __asm vfmadd231ps zmm13, zmm31, 13*4[r15]{1to16} \
+        __asm vfmadd231ps zmm14, zmm31, 14*4[r15]{1to16} \
+        __asm vfmadd231ps zmm15, zmm31, 15*4[r15]{1to16} \
+        \
+        __asm vprefetch1 2*64[r15 + r14] \
+        __asm vfmadd231ps zmm16, zmm31, 16*4[r15]{1to16} \
+        __asm vfmadd231ps zmm17, zmm31, 17*4[r15]{1to16} \
+        __asm vfmadd231ps zmm18, zmm31, 18*4[r15]{1to16} \
+        __asm vfmadd231ps zmm19, zmm31, 19*4[r15]{1to16} \
+        __asm vfmadd231ps zmm20, zmm31, 20*4[r15]{1to16} \
+        \
+        __asm vprefetch1 3*64[r15 + r14] \
+        __asm vfmadd231ps zmm21, zmm31, 21*4[r15]{1to16} \
+        __asm add r15, r12 \
+        __asm vfmadd231ps zmm22, zmm31, -10*4[r15]{1to16}\
+        __asm vfmadd231ps zmm23, zmm31, -9*4[r15]{1to16} \
+        __asm add C_ADDR, r11 /*step the C prefetch pointer by r11*/ \
+        __asm vfmadd231ps zmm24, zmm31, -8*4[r15]{1to16} \
+        __asm dec r8 \
+        __asm vfmadd231ps zmm25, zmm31, -7*4[r15]{1to16} \
+        \
+        \
+        __asm vprefetch1 0[rbx + r13] \
+        __asm vfmadd231ps zmm26, zmm31, -6*4[r15]{1to16} \
+        __asm vprefetch0 B_L1_PREFETCH_DIST*16*4[rbx] \
+        __asm vfmadd231ps zmm27, zmm31, -5*4[r15]{1to16} \
+        __asm add rbx, r9 \
+        __asm vfmadd231ps zmm28, zmm31, -4*4[r15]{1to16} \
+        __asm cmp r8, 0 /*sets the flags tested by the jne that follows the macro*/ \
+        __asm vfmadd231ps zmm29, zmm31, -3*4[r15]{1to16} \
+\
+}
+
+//One iteration of the k_r loop; the loop counter is hard-wired to r8.
+//Same as ONE_ITER_MAIN_LOOP, but additionally, we prefetch 3 cache lines of C into the L1 cache, advancing C_ADDR by r11 after each one.
+//Current placement of these prefetch instructions is somewhat arbitrary.
+#define ONE_ITER_PC_L1(C_ADDR) \
+{\
+        __asm vbroadcastf32x4   zmm30, 0[r15] \
+        __asm vmovaps zmm31, 0[rbx] \
+        \
+        __asm vfmadd231ps zmm0, zmm31, zmm30{aaaa} \
+        __asm vfmadd231ps zmm4, zmm31,  4*4[r15]{1to16} \
+        __asm vprefetch0 A_L1_PREFETCH_DIST*256[r15] \
+        __asm vfmadd231ps zmm5, zmm31,  5*4[r15]{1to16} \
+        __asm vprefetch0 A_L1_PREFETCH_DIST*256+64[r15] \
+        __asm vfmadd231ps zmm6, zmm31,  6*4[r15]{1to16} \
+        __asm vprefetch0 A_L1_PREFETCH_DIST*256+128[r15]\
+        __asm vfmadd231ps zmm7, zmm31,  7*4[r15]{1to16} \
+        __asm vprefetch0 A_L1_PREFETCH_DIST*256+192[r15]\
+        __asm vfmadd231ps zmm8, zmm31,  8*4[r15]{1to16} \
+        \
+        __asm vprefetch1 0[r15 + r14] \
+        __asm vfmadd231ps zmm9, zmm31,  9*4[r15]{1to16} \
+        __asm vprefetch0 0[C_ADDR] /*first of the three L1 prefetches of C*/ \
+        __asm vfmadd231ps zmm1, zmm31, zmm30{bbbb} \
+        __asm add C_ADDR, r11 \
+        __asm vfmadd231ps zmm2, zmm31, zmm30{cccc} \
+        __asm vfmadd231ps zmm3, zmm31, zmm30{dddd} \
+        __asm vfmadd231ps zmm10, zmm31, 10*4[r15]{1to16} \
+        \
+        __asm vprefetch1 64[r15 + r14] \
+        __asm vfmadd231ps zmm11, zmm31, 11*4[r15]{1to16} \
+        __asm vprefetch0 0[C_ADDR] /*second L1 prefetch of C*/ \
+        __asm vfmadd231ps zmm12, zmm31, 12*4[r15]{1to16} \
+        __asm add C_ADDR, r11 \
+        __asm vfmadd231ps zmm13, zmm31, 13*4[r15]{1to16} \
+        __asm vfmadd231ps zmm14, zmm31, 14*4[r15]{1to16} \
+        __asm vfmadd231ps zmm15, zmm31, 15*4[r15]{1to16} \
+        \
+        __asm vprefetch1 2*64[r15 + r14] \
+        __asm vfmadd231ps zmm16, zmm31, 16*4[r15]{1to16} \
+        __asm vprefetch0 0[C_ADDR] /*third L1 prefetch of C*/ \
+        __asm vfmadd231ps zmm17, zmm31, 17*4[r15]{1to16} \
+        __asm add C_ADDR, r11 \
+        __asm vfmadd231ps zmm18, zmm31, 18*4[r15]{1to16} \
+        __asm vfmadd231ps zmm19, zmm31, 19*4[r15]{1to16} \
+        __asm vfmadd231ps zmm20, zmm31, 20*4[r15]{1to16} \
+        \
+        __asm vprefetch1 3*64[r15 + r14] \
+        __asm vfmadd231ps zmm21, zmm31, 21*4[r15]{1to16} \
+        __asm add r15, r12 \
+        __asm vfmadd231ps zmm22, zmm31, -10*4[r15]{1to16}\
+        __asm vfmadd231ps zmm23, zmm31, -9*4[r15]{1to16} \
+        __asm vfmadd231ps zmm24, zmm31, -8*4[r15]{1to16} \
+        __asm dec r8 \
+        __asm vfmadd231ps zmm25, zmm31, -7*4[r15]{1to16} \
+        \
+        \
+        __asm vprefetch1 0[rbx + r13] \
+        __asm vfmadd231ps zmm26, zmm31, -6*4[r15]{1to16} \
+        __asm vprefetch0 B_L1_PREFETCH_DIST*16*4[rbx] \
+        __asm vfmadd231ps zmm27, zmm31, -5*4[r15]{1to16} \
+        __asm add rbx, r9 \
+        __asm vfmadd231ps zmm28, zmm31, -4*4[r15]{1to16} \
+        __asm cmp r8, 0 /*sets the flags tested by the jne that follows the macro*/ \
+        __asm vfmadd231ps zmm29, zmm31, -3*4[r15]{1to16} \
+\
+}
+
+//This is an array used for the scatter/gather instructions.
+int offsets[16] __attribute__((aligned(0x1000))) = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; //Lane indices 0..15; bli_dgemm_opt_30x8.c now refers to this definition through `extern int offsets[16];`.
+
+//Single-precision 30x16 gemm micro-kernel: C := beta*C + alpha*(A*B), with the 30 rows of the block accumulated in zmm0-zmm29 over k iterations.
+//#define MONITORS   //uncomment to read rdtsc around setup / main loop / update and print the differences
+//#define LOOPMON    //uncomment to print "looptime" (see the NOTE at that printf)
+void bli_sgemm_opt_30x16(
+                    dim_t              k,          //number of iterations of the k_r loop
+                    float* restrict    alpha,      //pointer to the scalar applied to A*B
+                    float* restrict    a,          //A operand: advanced by 32 floats per iteration, elements 0-29 are read
+                    float* restrict    b,          //B operand: advanced by 16 floats per iteration, loaded with vmovaps
+                    float* restrict    beta,       //pointer to the scalar applied to the existing C
+                    float* restrict    c, inc_t rs_c, inc_t cs_c, //C block and its row/column strides in elements; cs_c == 1 selects the vector update path
+                    auxinfo_t*         data        //supplies a_next / b_next, used only as prefetch targets
+                  )
+{
+    float * a_next = bli_auxinfo_next_a( data );
+    float * b_next = bli_auxinfo_next_b( data );
+
+    int * offsetPtr = &offsets[0];
+
+#ifdef MONITORS
+    int toph, topl, both, botl, midl, midh, mid2l, mid2h;
+#endif
+#ifdef LOOPMON
+    int tlooph, tloopl, blooph, bloopl;
+#endif
+
+    __asm
+    {
+#ifdef MONITORS
+        rdtsc
+        mov topl, eax
+        mov toph, edx
+#endif
+        vpxord  zmm0, zmm0, zmm0
+        vmovaps zmm1, zmm0  //clear out registers
+        vmovaps zmm2, zmm0
+        mov rsi, k    //loop index
+        vmovaps zmm3, zmm0
+
+        mov r11, rs_c           //load row stride
+        vmovaps zmm4, zmm0
+        sal r11, 2              //scale row stride (elements -> bytes, 4 bytes per float)
+        vmovaps zmm5, zmm0
+        mov r15, a              //load address of a
+        vmovaps zmm6, zmm0
+        mov rbx, b              //load address of b
+        vmovaps zmm7, zmm0
+
+        vmovaps zmm8, zmm0
+        lea r10, [r11 + 2*r11 + 0] //r10 has 3 * r11
+        vmovaps zmm9, zmm0
+        vmovaps zmm10, zmm0
+        mov rdi, r11
+        vmovaps zmm11, zmm0
+        sal rdi, 2              //rdi has 4*r11
+
+        vmovaps zmm12, zmm0
+        mov rcx, c              //load address of c for prefetching
+        vmovaps zmm13, zmm0
+        vmovaps zmm14, zmm0
+        mov r8, k
+        vmovaps zmm15, zmm0
+
+        vmovaps zmm16, zmm0
+        vmovaps zmm17, zmm0
+        mov r13, L2_PREFETCH_DIST*4*16 //L2 prefetch offset for B in bytes (16 floats per iteration)
+        vmovaps zmm18, zmm0
+        mov r14, L2_PREFETCH_DIST*4*32 //L2 prefetch offset for A in bytes (32 floats per iteration)
+        vmovaps zmm19, zmm0
+        vmovaps zmm20, zmm0
+        vmovaps zmm21, zmm0
+        vmovaps zmm22, zmm0
+
+        vmovaps zmm23, zmm0
+        sub r8, 30 + L2_PREFETCH_DIST //Sets the flags tested by the jle below: the branch is taken when k <= 30 + L2_PREFETCH_DIST (46 with the current setting).
+        vmovaps zmm24, zmm0
+        mov r8, 30              //r8 = counter for the first 30 iterations (ONE_ITER_PC_L2 counts on r8)
+        vmovaps zmm25, zmm0
+        mov r9, 16*4            //amount to increment b* by each iteration
+        vmovaps zmm26, zmm0
+        mov r12, 32*4           //amount to increment a* by each iteration
+        vmovaps zmm27, zmm0
+        vmovaps zmm28, zmm0
+        vmovaps zmm29, zmm0
+
+#ifdef MONITORS
+        rdtsc
+        mov midl, eax
+        mov midh, edx
+#endif
+        jle CONSIDER_UNDER_40
+        sub rsi, 30 + L2_PREFETCH_DIST //rsi = iterations left for LOOPMAIN
+
+        //First 30 iterations: each one also prefetches C into L2 and steps rcx by r11
+        LOOPREFECHCL2:
+        ONE_ITER_PC_L2(rcx)
+        jne LOOPREFECHCL2
+        mov rcx, c              //rewind the C prefetch pointer for the L1 prefetches in the last 10 iterations
+
+        //Main Loop.
+        LOOPMAIN:
+        ONE_ITER_MAIN_LOOP(rcx, rsi)
+        jne LOOPMAIN
+
+        //Penultimate L2_PREFETCH_DIST-10 iterations (6 with the current setting).
+        //Break these off from the main loop to avoid L2-prefetching past the current A and B: r14/r13 are recomputed so that r15+r14 / rbx+r13 start at a_next / b_next.
+        mov r14, a_next
+        mov r13, b_next
+        sub r14, r15
+        sub r13, rbx
+
+        mov rsi, L2_PREFETCH_DIST-10
+        LOOPMAIN2:
+        ONE_ITER_MAIN_LOOP(rcx, rsi)
+        jne LOOPMAIN2
+
+
+        //Last 10 iterations: each one prefetches C into L1 three times (ONE_ITER_PC_L1 counts on r8)
+        mov r8, 10
+        LOOPREFETCHCL1:
+        ONE_ITER_PC_L1(rcx)
+        jne LOOPREFETCHCL1
+
+
+        jmp POSTACCUM
+
+        //Alternate main loop, with no prefetching of C
+        //Used when k <= 30 + L2_PREFETCH_DIST iterations
+        CONSIDER_UNDER_40:
+        mov rsi, k              //NOTE(review): the loop body runs before the counter is tested, so this path assumes k >= 1 - confirm with callers
+        LOOP_UNDER_40:
+        ONE_ITER_MAIN_LOOP(rcx, rsi)
+        jne LOOP_UNDER_40
+
+
+
+        POSTACCUM:
+
+#ifdef MONITORS
+        rdtsc
+        mov mid2l, eax
+        mov mid2h, edx
+#endif
+
+        mov r9, c               //load address of c for update
+        mov r12, alpha          //load address of alpha
+
+        // Check whether cs_c == 1 (each row of C contiguous). If not, jump to the slow scattered update
+        mov r14, cs_c
+        dec r14
+        jne SCATTEREDUPDATE
+
+        mov r14, beta
+        vbroadcastss zmm31, 0[r14] //zmm31 = beta in every lane
+
+
+        vmulps zmm0, zmm0, 0[r12]{1to16} //scale the accumulated row by alpha
+        vmulps zmm1, zmm1, 0[r12]{1to16}
+        vmulps zmm2, zmm2, 0[r12]{1to16}
+        vmulps zmm3, zmm3, 0[r12]{1to16}
+        vfmadd231ps zmm0, zmm31, [r9+0] //add beta * existing row of C. NOTE(review): this and the vmovaps stores presumably need each C row 64-byte aligned - confirm
+        vfmadd231ps zmm1, zmm31, [r9+r11+0]
+        vfmadd231ps zmm2, zmm31, [r9+2*r11+0]
+        vfmadd231ps zmm3, zmm31, [r9+r10+0]
+        vmovaps [r9+0], zmm0 //store the updated row
+        vmovaps [r9+r11+0], zmm1
+        vmovaps [r9+2*r11+0], zmm2
+        vmovaps [r9+r10+0], zmm3
+        add r9, rdi //advance 4 rows of C (rdi = 4*r11)
+
+        vmulps zmm4, zmm4, 0[r12]{1to16}
+        vmulps zmm5, zmm5, 0[r12]{1to16}
+        vmulps zmm6, zmm6, 0[r12]{1to16}
+        vmulps zmm7, zmm7, 0[r12]{1to16}
+        vfmadd231ps zmm4, zmm31, [r9+0]
+        vfmadd231ps zmm5, zmm31, [r9+r11+0]
+        vfmadd231ps zmm6, zmm31, [r9+2*r11+0]
+        vfmadd231ps zmm7, zmm31, [r9+r10+0]
+        vmovaps [r9+0], zmm4
+        vmovaps [r9+r11+0], zmm5
+        vmovaps [r9+2*r11+0], zmm6
+        vmovaps [r9+r10+0], zmm7
+        add r9, rdi
+
+        vmulps zmm8, zmm8, 0[r12]{1to16}
+        vmulps zmm9, zmm9, 0[r12]{1to16}
+        vmulps zmm10, zmm10, 0[r12]{1to16}
+        vmulps zmm11, zmm11, 0[r12]{1to16}
+        vfmadd231ps zmm8, zmm31, [r9+0]
+        vfmadd231ps zmm9, zmm31, [r9+r11+0]
+        vfmadd231ps zmm10, zmm31, [r9+2*r11+0]
+        vfmadd231ps zmm11, zmm31, [r9+r10+0]
+        vmovaps [r9+0], zmm8
+        vmovaps [r9+r11+0], zmm9
+        vmovaps [r9+2*r11+0], zmm10
+        vmovaps [r9+r10+0], zmm11
+        add r9, rdi
+
+        vmulps zmm12, zmm12, 0[r12]{1to16}
+        vmulps zmm13, zmm13, 0[r12]{1to16}
+        vmulps zmm14, zmm14, 0[r12]{1to16}
+        vmulps zmm15, zmm15, 0[r12]{1to16}
+        vfmadd231ps zmm12, zmm31, [r9+0]
+        vfmadd231ps zmm13, zmm31, [r9+r11+0]
+        vfmadd231ps zmm14, zmm31, [r9+2*r11+0]
+        vfmadd231ps zmm15, zmm31, [r9+r10+0]
+        vmovaps [r9+0], zmm12
+        vmovaps [r9+r11+0], zmm13
+        vmovaps [r9+2*r11+0], zmm14
+        vmovaps [r9+r10+0], zmm15
+        add r9, rdi
+
+        vmulps zmm16, zmm16, 0[r12]{1to16}
+        vmulps zmm17, zmm17, 0[r12]{1to16}
+        vmulps zmm18, zmm18, 0[r12]{1to16}
+        vmulps zmm19, zmm19, 0[r12]{1to16}
+        vfmadd231ps zmm16, zmm31, [r9+0]
+        vfmadd231ps zmm17, zmm31, [r9+r11+0]
+        vfmadd231ps zmm18, zmm31, [r9+2*r11+0]
+        vfmadd231ps zmm19, zmm31, [r9+r10+0]
+        vmovaps [r9+0], zmm16
+        vmovaps [r9+r11+0], zmm17
+        vmovaps [r9+2*r11+0], zmm18
+        vmovaps [r9+r10+0], zmm19
+        add r9, rdi
+
+        vmulps zmm20, zmm20, 0[r12]{1to16}
+        vmulps zmm21, zmm21, 0[r12]{1to16}
+        vmulps zmm22, zmm22, 0[r12]{1to16}
+        vmulps zmm23, zmm23, 0[r12]{1to16}
+        vfmadd231ps zmm20, zmm31, [r9+0]
+        vfmadd231ps zmm21, zmm31, [r9+r11+0]
+        vfmadd231ps zmm22, zmm31, [r9+2*r11+0]
+        vfmadd231ps zmm23, zmm31, [r9+r10+0]
+        vmovaps [r9+0], zmm20
+        vmovaps [r9+r11+0], zmm21
+        vmovaps [r9+2*r11+0], zmm22
+        vmovaps [r9+r10+0], zmm23
+        add r9, rdi
+
+        vmulps zmm24, zmm24, 0[r12]{1to16}
+        vmulps zmm25, zmm25, 0[r12]{1to16}
+        vmulps zmm26, zmm26, 0[r12]{1to16}
+        vmulps zmm27, zmm27, 0[r12]{1to16}
+        vfmadd231ps zmm24, zmm31, [r9+0]
+        vfmadd231ps zmm25, zmm31, [r9+r11+0]
+        vfmadd231ps zmm26, zmm31, [r9+2*r11+0]
+        vfmadd231ps zmm27, zmm31, [r9+r10+0]
+        vmovaps [r9+0], zmm24
+        vmovaps [r9+r11+0], zmm25
+        vmovaps [r9+2*r11+0], zmm26
+        vmovaps [r9+r10+0], zmm27
+        add r9, rdi
+
+        vmulps zmm28, zmm28, 0[r12]{1to16} //last group: only rows 28 and 29 remain
+        vmulps zmm29, zmm29, 0[r12]{1to16}
+        vfmadd231ps zmm28, zmm31, [r9+0]
+        vfmadd231ps zmm29, zmm31, [r9+r11+0]
+        vmovaps [r9+0], zmm28
+        vmovaps [r9+r11+0], zmm29
+
+        jmp END
+
+        SCATTEREDUPDATE:
+
+        mov r10, offsetPtr
+        vmovaps zmm31, 0[r10] //zmm31 = contents of offsets[] (0..15)
+        vpbroadcastd zmm30, cs_c //broadcasts a 32-bit value; assumes cs_c fits in 32 bits - TODO confirm against inc_t
+        mov r13, beta //UPDATE_C_ROW_SCATTERED reads beta through r13
+        vpmulld zmm30, zmm31, zmm30 //zmm30 = lane index * cs_c, the per-lane element offsets
+
+        mov ebx, 0xFFFF //16-bit lane mask that UPDATE_C_ROW_SCATTERED copies into k3
+        UPDATE_C_ROW_SCATTERED(zmm0, 0, r9)
+        UPDATE_C_ROW_SCATTERED(zmm1, 1, r9)
+        UPDATE_C_ROW_SCATTERED(zmm2, 2, r9)
+        UPDATE_C_ROW_SCATTERED(zmm3, 3, r9)
+        UPDATE_C_ROW_SCATTERED(zmm4, 4, r9)
+        UPDATE_C_ROW_SCATTERED(zmm5, 5, r9)
+        UPDATE_C_ROW_SCATTERED(zmm6, 6, r9)
+        UPDATE_C_ROW_SCATTERED(zmm7, 7, r9)
+        UPDATE_C_ROW_SCATTERED(zmm8, 8, r9)
+        UPDATE_C_ROW_SCATTERED(zmm9, 9, r9)
+        UPDATE_C_ROW_SCATTERED(zmm10, 10, r9)
+        UPDATE_C_ROW_SCATTERED(zmm11, 11, r9)
+        UPDATE_C_ROW_SCATTERED(zmm12, 12, r9)
+        UPDATE_C_ROW_SCATTERED(zmm13, 13, r9)
+        UPDATE_C_ROW_SCATTERED(zmm14, 14, r9)
+        UPDATE_C_ROW_SCATTERED(zmm15, 15, r9)
+        UPDATE_C_ROW_SCATTERED(zmm16, 16, r9)
+        UPDATE_C_ROW_SCATTERED(zmm17, 17, r9)
+        UPDATE_C_ROW_SCATTERED(zmm18, 18, r9)
+        UPDATE_C_ROW_SCATTERED(zmm19, 19, r9)
+        UPDATE_C_ROW_SCATTERED(zmm20, 20, r9)
+        UPDATE_C_ROW_SCATTERED(zmm21, 21, r9)
+        UPDATE_C_ROW_SCATTERED(zmm22, 22, r9)
+        UPDATE_C_ROW_SCATTERED(zmm23, 23, r9)
+        UPDATE_C_ROW_SCATTERED(zmm24, 24, r9)
+        UPDATE_C_ROW_SCATTERED(zmm25, 25, r9)
+        UPDATE_C_ROW_SCATTERED(zmm26, 26, r9)
+        UPDATE_C_ROW_SCATTERED(zmm27, 27, r9)
+        UPDATE_C_ROW_SCATTERED(zmm28, 28, r9)
+        UPDATE_C_ROW_SCATTERED(zmm29, 29, r9)
+
+        END:
+#ifdef MONITORS
+        rdtsc
+        mov botl, eax
+        mov both, edx
+#endif
+    }
+
+#ifdef LOOPMON
+    printf("looptime = \t%d\n", bloopl - tloopl); //NOTE(review): nothing in this function writes tloopl or bloopl, so the printed value is indeterminate
+#endif
+#ifdef MONITORS
+    dim_t top = ((dim_t)toph << 32) | topl;
+    dim_t mid = ((dim_t)midh << 32) | midl;
+    dim_t mid2 = ((dim_t)mid2h << 32) | mid2l;
+    dim_t bot = ((dim_t)both << 32) | botl;
+    printf("setup =\t%u\tmain loop =\t%u\tcleanup=\t%u\ttotal=\t%u\n", mid - top, mid2 - mid, bot - mid2, bot - top); //NOTE(review): %u is paired with dim_t differences - confirm the format matches dim_t's width
+#endif
+}
+