diff --git a/Golden_Repo/b/BLIS/BLIS-0.8.1-GCCcore-11.2.0.eb b/Golden_Repo/b/BLIS/BLIS-0.8.1-GCCcore-11.2.0.eb new file mode 100644 index 0000000000000000000000000000000000000000..96688e4b680bb270f05746a434dc164af7009464 --- /dev/null +++ b/Golden_Repo/b/BLIS/BLIS-0.8.1-GCCcore-11.2.0.eb @@ -0,0 +1,41 @@ +easyblock = 'ConfigureMake' + +name = 'BLIS' +version = '0.8.1' + +homepage = 'https://github.com/flame/blis/' +description = """BLIS is a portable software framework for instantiating high-performance +BLAS-like dense linear algebra libraries.""" + +toolchain = {'name': 'GCCcore', 'version': '11.2.0'} + +source_urls = ['https://github.com/flame/blis/archive/'] +sources = ['%(version)s.tar.gz'] +patches = [ + '%(name)s-%(version)s_fix_dgemm-fpe-signalling-on-broadwell.patch', +] +checksums = [ + '729694128719801e82fae7b5f2489ab73e4a467f46271beff09588c9265a697b', # 0.8.1.tar.gz + # BLIS-0.8.1_fix_dgemm-fpe-signalling-on-broadwell.patch + '345fa39933e9d1442d2eb1e4ed9129df3fe4aefecf4d104e5d4f25b3bca24d0d', +] + +builddependencies = [ + ('binutils', '2.37'), + ('Python', '3.9.6'), + ('Perl', '5.34.0'), +] + +configopts = '--enable-cblas --enable-threading=openmp --enable-shared CC="$CC" auto' + +runtest = 'check' + +sanity_check_paths = { + 'files': ['include/blis/cblas.h', 'include/blis/blis.h', + 'lib/libblis.a', 'lib/libblis.%s' % SHLIB_EXT], + 'dirs': [], +} + +modextrapaths = {'CPATH': 'include/blis'} + +moduleclass = 'numlib' diff --git a/Golden_Repo/b/BLIS/BLIS-0.8.1_fix_dgemm-fpe-signalling-on-broadwell.patch b/Golden_Repo/b/BLIS/BLIS-0.8.1_fix_dgemm-fpe-signalling-on-broadwell.patch new file mode 100644 index 0000000000000000000000000000000000000000..ad6dee6c3de33262118ba540ffb855f2d4decf3d --- /dev/null +++ b/Golden_Repo/b/BLIS/BLIS-0.8.1_fix_dgemm-fpe-signalling-on-broadwell.patch @@ -0,0 +1,2219 @@ +Taken from https://github.com/flame/blis/pull/544 +Fixes a problem with DGEMM causing FPR signalling on Broadwell +See https://github.com/flame/blis/issues/486 + +Åke Sandgren, 20210916 + +commit 5191c43faccf45975f577c60b9089abee25722c9 +Author: Devin Matthews <damatthews@smu.edu> +Date: Thu Sep 16 10:16:17 2021 -0500 + + Fix more copy-paste errors in the haswell gemmsup code. + + Fixes #486. + +diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c +index 4c6094b1..21dd3b89 100644 +--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c ++++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c +@@ -101,7 +101,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4 + begin_asm() + + //vzeroall() // zero all xmm/ymm registers. +- ++ + mov(var(a), r14) // load address of a. + mov(var(rs_a), r8) // load rs_a + //mov(var(cs_a), r9) // load cs_a +@@ -119,7 +119,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4 + + lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b + lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a +- ++ + + mov(var(c), r12) // load address of c + mov(var(rs_c), rdi) // load rs_c +@@ -172,19 +172,19 @@ void bli_dgemmsup_rd_haswell_asm_6x4 + prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c + #endif + lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a +- + +- +- ++ ++ ++ + mov(var(k_iter16), rsi) // i = k_iter16; + test(rsi, rsi) // check i via logical AND. + je(.DCONSIDKITER4) // if i == 0, jump to code that + // contains the k_iter4 loop. +- +- ++ ++ + label(.DLOOPKITER16) // MAIN LOOP +- +- ++ ++ + // ---------------------------------- iteration 0 + + #if 0 +@@ -219,7 +219,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4 + vfmadd231pd(ymm1, ymm3, ymm14) + vfmadd231pd(ymm2, ymm3, ymm15) + +- ++ + // ---------------------------------- iteration 1 + + vmovupd(mem(rax ), ymm0) +@@ -250,7 +250,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4 + + + // ---------------------------------- iteration 2 +- ++ + #if 0 + prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a + prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a +@@ -312,27 +312,27 @@ void bli_dgemmsup_rd_haswell_asm_6x4 + vfmadd231pd(ymm1, ymm3, ymm14) + vfmadd231pd(ymm2, ymm3, ymm15) + +- ++ + + dec(rsi) // i -= 1; + jne(.DLOOPKITER16) // iterate again if i != 0. +- +- +- +- +- +- ++ ++ ++ ++ ++ ++ + label(.DCONSIDKITER4) +- ++ + mov(var(k_iter4), rsi) // i = k_iter4; + test(rsi, rsi) // check i via logical AND. + je(.DCONSIDKLEFT1) // if i == 0, jump to code that + // considers k_left1 loop. + // else, we prepare to enter k_iter4 loop. +- +- ++ ++ + label(.DLOOPKITER4) // EDGE LOOP (ymm) +- ++ + #if 0 + prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a + prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a +@@ -343,7 +343,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4 + vmovupd(mem(rax, r8, 1), ymm1) + vmovupd(mem(rax, r8, 2), ymm2) + add(imm(4*8), rax) // a += 4*cs_b = 4*8; +- ++ + vmovupd(mem(rbx ), ymm3) + vfmadd231pd(ymm0, ymm3, ymm4) + vfmadd231pd(ymm1, ymm3, ymm5) +@@ -365,21 +365,21 @@ void bli_dgemmsup_rd_haswell_asm_6x4 + vfmadd231pd(ymm1, ymm3, ymm14) + vfmadd231pd(ymm2, ymm3, ymm15) + +- ++ + dec(rsi) // i -= 1; + jne(.DLOOPKITER4) // iterate again if i != 0. +- +- +- ++ ++ ++ + + label(.DCONSIDKLEFT1) +- ++ + mov(var(k_left1), rsi) // i = k_left1; + test(rsi, rsi) // check i via logical AND. + je(.DPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left1 loop. +- +- ++ ++ + + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) +@@ -387,12 +387,12 @@ void bli_dgemmsup_rd_haswell_asm_6x4 + // using the xmm registers would zero out the + // high bits of the destination registers, + // which would destory intermediate results. +- ++ + vmovsd(mem(rax ), xmm0) + vmovsd(mem(rax, r8, 1), xmm1) + vmovsd(mem(rax, r8, 2), xmm2) + add(imm(1*8), rax) // a += 1*cs_a = 1*8; +- ++ + vmovsd(mem(rbx ), xmm3) + vfmadd231pd(ymm0, ymm3, ymm4) + vfmadd231pd(ymm1, ymm3, ymm5) +@@ -414,12 +414,12 @@ void bli_dgemmsup_rd_haswell_asm_6x4 + vfmadd231pd(ymm1, ymm3, ymm14) + vfmadd231pd(ymm2, ymm3, ymm15) + +- ++ + dec(rsi) // i -= 1; + jne(.DLOOPKLEFT1) // iterate again if i != 0. +- +- +- ++ ++ ++ + + + +@@ -427,11 +427,11 @@ void bli_dgemmsup_rd_haswell_asm_6x4 + label(.DPOSTACCUM) + + +- +- // ymm4 ymm7 ymm10 ymm13 ++ ++ // ymm4 ymm7 ymm10 ymm13 + // ymm5 ymm8 ymm11 ymm14 + // ymm6 ymm9 ymm12 ymm15 +- ++ + vhaddpd( ymm7, ymm4, ymm0 ) + vextractf128(imm(1), ymm0, xmm1 ) + vaddpd( xmm0, xmm1, xmm0 ) +@@ -469,7 +469,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4 + // xmm6[0:3] = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15) + + +- ++ + //mov(var(rs_c), rdi) // load rs_c + //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) + +@@ -477,73 +477,73 @@ void bli_dgemmsup_rd_haswell_asm_6x4 + mov(var(beta), rbx) // load address of beta + vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate + vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate +- ++ + vmulpd(ymm0, ymm4, ymm4) // scale by alpha + vmulpd(ymm0, ymm5, ymm5) + vmulpd(ymm0, ymm6, ymm6) +- +- +- +- +- +- ++ ++ ++ ++ ++ ++ + //mov(var(cs_c), rsi) // load cs_c + //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) +- +- +- ++ ++ ++ + // now avoid loading C if beta == 0 +- ++ + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomisd(xmm0, xmm3) // set ZF if beta == 0. + je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case +- + +- ++ ++ + label(.DROWSTORED) +- +- ++ ++ + vfmadd231pd(mem(rcx), ymm3, ymm4) + vmovupd(ymm4, mem(rcx)) + add(rdi, rcx) +- ++ + vfmadd231pd(mem(rcx), ymm3, ymm5) + vmovupd(ymm5, mem(rcx)) + add(rdi, rcx) +- ++ + vfmadd231pd(mem(rcx), ymm3, ymm6) + vmovupd(ymm6, mem(rcx)) + //add(rdi, rcx) +- +- +- ++ ++ ++ + jmp(.DDONE) // jump to end. +- +- +- +- ++ ++ ++ ++ + label(.DBETAZERO) +- + +- ++ ++ + label(.DROWSTORBZ) +- +- ++ ++ + vmovupd(ymm4, mem(rcx)) + add(rdi, rcx) +- ++ + vmovupd(ymm5, mem(rcx)) + add(rdi, rcx) +- ++ + vmovupd(ymm6, mem(rcx)) + //add(rdi, rcx) +- +- +- +- ++ ++ ++ ++ + label(.DDONE) +- +- ++ ++ + + + lea(mem(r12, rdi, 2), r12) // +@@ -560,7 +560,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4 + + label(.DRETURN) + +- ++ + + end_asm( + : // output operands (none) +@@ -629,7 +629,7 @@ void bli_dgemmsup_rd_haswell_asm_2x4 + // ------------------------------------------------------------------------- + + begin_asm() +- ++ + //vzeroall() // zero all xmm/ymm registers. + + mov(var(a), rax) // load address of a. +@@ -649,7 +649,7 @@ void bli_dgemmsup_rd_haswell_asm_2x4 + + lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b + //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a +- ++ + + mov(var(c), rcx) // load address of c + mov(var(rs_c), rdi) // load rs_c +@@ -682,7 +682,7 @@ void bli_dgemmsup_rd_haswell_asm_2x4 + //lea(mem(r14), rax) // rax = a; + //lea(mem(rdx), rbx) // rbx = b; + +- ++ + #if 1 + //mov(var(rs_c), rdi) // load rs_c + //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) +@@ -690,18 +690,18 @@ void bli_dgemmsup_rd_haswell_asm_2x4 + prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c + #endif + +- +- +- ++ ++ ++ + mov(var(k_iter16), rsi) // i = k_iter16; + test(rsi, rsi) // check i via logical AND. + je(.DCONSIDKITER4) // if i == 0, jump to code that + // contains the k_iter4 loop. +- +- ++ ++ + label(.DLOOPKITER16) // MAIN LOOP +- +- ++ ++ + // ---------------------------------- iteration 0 + + #if 0 +@@ -730,7 +730,7 @@ void bli_dgemmsup_rd_haswell_asm_2x4 + vfmadd231pd(ymm0, ymm3, ymm13) + vfmadd231pd(ymm1, ymm3, ymm14) + +- ++ + // ---------------------------------- iteration 1 + + vmovupd(mem(rax ), ymm0) +@@ -756,7 +756,7 @@ void bli_dgemmsup_rd_haswell_asm_2x4 + + + // ---------------------------------- iteration 2 +- ++ + #if 0 + prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a + prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a +@@ -807,27 +807,27 @@ void bli_dgemmsup_rd_haswell_asm_2x4 + vfmadd231pd(ymm0, ymm3, ymm13) + vfmadd231pd(ymm1, ymm3, ymm14) + +- ++ + + dec(rsi) // i -= 1; + jne(.DLOOPKITER16) // iterate again if i != 0. +- +- +- +- +- +- ++ ++ ++ ++ ++ ++ + label(.DCONSIDKITER4) +- ++ + mov(var(k_iter4), rsi) // i = k_iter4; + test(rsi, rsi) // check i via logical AND. + je(.DCONSIDKLEFT1) // if i == 0, jump to code that + // considers k_left1 loop. + // else, we prepare to enter k_iter4 loop. +- +- ++ ++ + label(.DLOOPKITER4) // EDGE LOOP (ymm) +- ++ + #if 0 + prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a + prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a +@@ -836,7 +836,7 @@ void bli_dgemmsup_rd_haswell_asm_2x4 + vmovupd(mem(rax ), ymm0) + vmovupd(mem(rax, r8, 1), ymm1) + add(imm(4*8), rax) // a += 4*cs_b = 4*8; +- ++ + vmovupd(mem(rbx ), ymm3) + vfmadd231pd(ymm0, ymm3, ymm4) + vfmadd231pd(ymm1, ymm3, ymm5) +@@ -854,21 +854,21 @@ void bli_dgemmsup_rd_haswell_asm_2x4 + vfmadd231pd(ymm0, ymm3, ymm13) + vfmadd231pd(ymm1, ymm3, ymm14) + +- ++ + dec(rsi) // i -= 1; + jne(.DLOOPKITER4) // iterate again if i != 0. +- +- +- ++ ++ ++ + + label(.DCONSIDKLEFT1) +- ++ + mov(var(k_left1), rsi) // i = k_left1; + test(rsi, rsi) // check i via logical AND. + je(.DPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left1 loop. +- +- ++ ++ + + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) +@@ -876,11 +876,11 @@ void bli_dgemmsup_rd_haswell_asm_2x4 + // using the xmm registers would zero out the + // high bits of the destination registers, + // which would destory intermediate results. +- ++ + vmovsd(mem(rax ), xmm0) + vmovsd(mem(rax, r8, 1), xmm1) + add(imm(1*8), rax) // a += 1*cs_a = 1*8; +- ++ + vmovsd(mem(rbx ), xmm3) + vfmadd231pd(ymm0, ymm3, ymm4) + vfmadd231pd(ymm1, ymm3, ymm5) +@@ -898,12 +898,12 @@ void bli_dgemmsup_rd_haswell_asm_2x4 + vfmadd231pd(ymm0, ymm3, ymm13) + vfmadd231pd(ymm1, ymm3, ymm14) + +- ++ + dec(rsi) // i -= 1; + jne(.DLOOPKLEFT1) // iterate again if i != 0. +- +- +- ++ ++ ++ + + + +@@ -911,10 +911,10 @@ void bli_dgemmsup_rd_haswell_asm_2x4 + label(.DPOSTACCUM) + + +- +- // ymm4 ymm7 ymm10 ymm13 ++ ++ // ymm4 ymm7 ymm10 ymm13 + // ymm5 ymm8 ymm11 ymm14 +- ++ + vhaddpd( ymm7, ymm4, ymm0 ) + vextractf128(imm(1), ymm0, xmm1 ) + vaddpd( xmm0, xmm1, xmm0 ) +@@ -943,75 +943,75 @@ void bli_dgemmsup_rd_haswell_asm_2x4 + + //mov(var(rs_c), rdi) // load rs_c + //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) +- ++ + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate + vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate +- ++ + vmulpd(ymm0, ymm4, ymm4) // scale by alpha + vmulpd(ymm0, ymm5, ymm5) +- +- +- +- +- +- ++ ++ ++ ++ ++ ++ + //mov(var(cs_c), rsi) // load cs_c + //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) +- +- +- ++ ++ ++ + // now avoid loading C if beta == 0 +- ++ + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomisd(xmm0, xmm3) // set ZF if beta == 0. + je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case +- + +- ++ ++ + label(.DROWSTORED) +- +- ++ ++ + vfmadd231pd(mem(rcx), ymm3, ymm4) + vmovupd(ymm4, mem(rcx)) + add(rdi, rcx) +- ++ + vfmadd231pd(mem(rcx), ymm3, ymm5) + vmovupd(ymm5, mem(rcx)) + //add(rdi, rcx) +- +- +- ++ ++ ++ + jmp(.DDONE) // jump to end. +- +- +- +- ++ ++ ++ ++ + label(.DBETAZERO) +- + +- ++ ++ + label(.DROWSTORBZ) +- +- ++ ++ + vmovupd(ymm4, mem(rcx)) + add(rdi, rcx) +- ++ + vmovupd(ymm5, mem(rcx)) + //add(rdi, rcx) +- +- +- +- ++ ++ ++ ++ + label(.DDONE) + + + + + label(.DRETURN) +- +- ++ ++ + + end_asm( + : // output operands (none) +@@ -1079,7 +1079,7 @@ void bli_dgemmsup_rd_haswell_asm_1x4 + // ------------------------------------------------------------------------- + + begin_asm() +- ++ + //vzeroall() // zero all xmm/ymm registers. + + mov(var(a), rax) // load address of a. +@@ -1099,7 +1099,7 @@ void bli_dgemmsup_rd_haswell_asm_1x4 + + lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b + //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a +- ++ + + mov(var(c), rcx) // load address of c + mov(var(rs_c), rdi) // load rs_c +@@ -1128,26 +1128,26 @@ void bli_dgemmsup_rd_haswell_asm_1x4 + //lea(mem(r14), rax) // rax = a; + //lea(mem(rdx), rbx) // rbx = b; + +- ++ + #if 1 + //mov(var(rs_c), rdi) // load rs_c + //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) + prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c +- prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c ++ //prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c + #endif + +- +- +- ++ ++ ++ + mov(var(k_iter16), rsi) // i = k_iter16; + test(rsi, rsi) // check i via logical AND. + je(.DCONSIDKITER4) // if i == 0, jump to code that + // contains the k_iter4 loop. +- +- ++ ++ + label(.DLOOPKITER16) // MAIN LOOP +- +- ++ ++ + // ---------------------------------- iteration 0 + + #if 0 +@@ -1170,7 +1170,7 @@ void bli_dgemmsup_rd_haswell_asm_1x4 + add(imm(4*8), rbx) // b += 4*rs_b = 4*8; + vfmadd231pd(ymm0, ymm3, ymm13) + +- ++ + // ---------------------------------- iteration 1 + + vmovupd(mem(rax ), ymm0) +@@ -1191,7 +1191,7 @@ void bli_dgemmsup_rd_haswell_asm_1x4 + + + // ---------------------------------- iteration 2 +- ++ + #if 0 + prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a + #endif +@@ -1231,27 +1231,27 @@ void bli_dgemmsup_rd_haswell_asm_1x4 + add(imm(4*8), rbx) // b += 4*rs_b = 4*8; + vfmadd231pd(ymm0, ymm3, ymm13) + +- ++ + + dec(rsi) // i -= 1; + jne(.DLOOPKITER16) // iterate again if i != 0. +- +- +- +- +- +- ++ ++ ++ ++ ++ ++ + label(.DCONSIDKITER4) +- ++ + mov(var(k_iter4), rsi) // i = k_iter4; + test(rsi, rsi) // check i via logical AND. + je(.DCONSIDKLEFT1) // if i == 0, jump to code that + // considers k_left1 loop. + // else, we prepare to enter k_iter4 loop. +- +- ++ ++ + label(.DLOOPKITER4) // EDGE LOOP (ymm) +- ++ + #if 0 + prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a + prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a +@@ -1259,7 +1259,7 @@ void bli_dgemmsup_rd_haswell_asm_1x4 + + vmovupd(mem(rax ), ymm0) + add(imm(4*8), rax) // a += 4*cs_b = 4*8; +- ++ + vmovupd(mem(rbx ), ymm3) + vfmadd231pd(ymm0, ymm3, ymm4) + +@@ -1273,21 +1273,21 @@ void bli_dgemmsup_rd_haswell_asm_1x4 + add(imm(4*8), rbx) // b += 4*rs_b = 4*8; + vfmadd231pd(ymm0, ymm3, ymm13) + +- ++ + dec(rsi) // i -= 1; + jne(.DLOOPKITER4) // iterate again if i != 0. +- +- +- ++ ++ ++ + + label(.DCONSIDKLEFT1) +- ++ + mov(var(k_left1), rsi) // i = k_left1; + test(rsi, rsi) // check i via logical AND. + je(.DPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left1 loop. +- +- ++ ++ + + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) +@@ -1295,10 +1295,10 @@ void bli_dgemmsup_rd_haswell_asm_1x4 + // using the xmm registers would zero out the + // high bits of the destination registers, + // which would destory intermediate results. +- ++ + vmovsd(mem(rax ), xmm0) + add(imm(1*8), rax) // a += 1*cs_a = 1*8; +- ++ + vmovsd(mem(rbx ), xmm3) + vfmadd231pd(ymm0, ymm3, ymm4) + +@@ -1312,12 +1312,12 @@ void bli_dgemmsup_rd_haswell_asm_1x4 + add(imm(1*8), rbx) // b += 1*rs_b = 1*8; + vfmadd231pd(ymm0, ymm3, ymm13) + +- ++ + dec(rsi) // i -= 1; + jne(.DLOOPKLEFT1) // iterate again if i != 0. +- +- +- ++ ++ ++ + + + +@@ -1325,9 +1325,9 @@ void bli_dgemmsup_rd_haswell_asm_1x4 + label(.DPOSTACCUM) + + +- +- // ymm4 ymm7 ymm10 ymm13 +- ++ ++ // ymm4 ymm7 ymm10 ymm13 ++ + vhaddpd( ymm7, ymm4, ymm0 ) + vextractf128(imm(1), ymm0, xmm1 ) + vaddpd( xmm0, xmm1, xmm0 ) +@@ -1339,15 +1339,15 @@ void bli_dgemmsup_rd_haswell_asm_1x4 + vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) + + +- vhaddpd( ymm8, ymm5, ymm0 ) +- vextractf128(imm(1), ymm0, xmm1 ) +- vaddpd( xmm0, xmm1, xmm0 ) ++ //vhaddpd( ymm8, ymm5, ymm0 ) ++ //vextractf128(imm(1), ymm0, xmm1 ) ++ //vaddpd( xmm0, xmm1, xmm0 ) + +- vhaddpd( ymm14, ymm11, ymm2 ) +- vextractf128(imm(1), ymm2, xmm1 ) +- vaddpd( xmm2, xmm1, xmm2 ) ++ //vhaddpd( ymm14, ymm11, ymm2 ) ++ //vextractf128(imm(1), ymm2, xmm1 ) ++ //vaddpd( xmm2, xmm1, xmm2 ) + +- vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) ++ //vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) + + // xmm4[0:3] = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) + +@@ -1355,67 +1355,67 @@ void bli_dgemmsup_rd_haswell_asm_1x4 + + //mov(var(rs_c), rdi) // load rs_c + //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) +- ++ + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate + vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate +- ++ + vmulpd(ymm0, ymm4, ymm4) // scale by alpha +- +- +- +- +- +- ++ ++ ++ ++ ++ ++ + //mov(var(cs_c), rsi) // load cs_c + //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) +- +- +- ++ ++ ++ + // now avoid loading C if beta == 0 +- ++ + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomisd(xmm0, xmm3) // set ZF if beta == 0. + je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case +- + +- ++ ++ + label(.DROWSTORED) +- +- ++ ++ + vfmadd231pd(mem(rcx), ymm3, ymm4) + vmovupd(ymm4, mem(rcx)) + //add(rdi, rcx) +- +- +- ++ ++ ++ + jmp(.DDONE) // jump to end. +- +- +- +- ++ ++ ++ ++ + label(.DBETAZERO) +- + +- ++ ++ + label(.DROWSTORBZ) +- +- ++ ++ + vmovupd(ymm4, mem(rcx)) + //add(rdi, rcx) +- +- +- +- ++ ++ ++ ++ + label(.DDONE) + + + + + label(.DRETURN) +- +- ++ ++ + + end_asm( + : // output operands (none) +commit e3dc1954ffb5eee2a8b41fce85ba589f75770eea +Author: Devin Matthews <damatthews@smu.edu> +Date: Thu Sep 16 10:59:37 2021 -0500 + + Fix problem where uninitialized registers are included in vhaddpd in the Mx1 gemmsup kernels for haswell. + + The fix is to use the same (valid) source register twice in the horizontal addition. + +diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c +index 6e3c1a0e..457ef9f2 100644 +--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c ++++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c +@@ -99,9 +99,9 @@ void bli_dgemmsup_rd_haswell_asm_6x1 + // ------------------------------------------------------------------------- + + begin_asm() +- ++ + //vzeroall() // zero all xmm/ymm registers. +- ++ + mov(var(a), rax) // load address of a. + mov(var(rs_a), r8) // load rs_a + //mov(var(cs_a), r9) // load cs_a +@@ -119,7 +119,7 @@ void bli_dgemmsup_rd_haswell_asm_6x1 + + //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b + //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a +- ++ + + mov(var(c), rcx) // load address of c + mov(var(rs_c), rdi) // load rs_c +@@ -163,19 +163,19 @@ void bli_dgemmsup_rd_haswell_asm_6x1 + prefetch(0, mem(r10, rdi, 1, 1*8)) // prefetch c + 4*rs_c + prefetch(0, mem(r10, rdi, 2, 1*8)) // prefetch c + 5*rs_c + #endif +- + +- +- ++ ++ ++ + mov(var(k_iter16), rsi) // i = k_iter16; + test(rsi, rsi) // check i via logical AND. + je(.DCONSIDKITER4) // if i == 0, jump to code that + // contains the k_iter4 loop. +- +- ++ ++ + label(.DLOOPKITER16) // MAIN LOOP +- +- ++ ++ + // ---------------------------------- iteration 0 + + #if 0 +@@ -206,7 +206,7 @@ void bli_dgemmsup_rd_haswell_asm_6x1 + add(imm(4*8), rax) // a += 4*cs_a = 4*8; + vfmadd231pd(ymm0, ymm3, ymm14) + +- ++ + // ---------------------------------- iteration 1 + + vmovupd(mem(rbx ), ymm0) +@@ -233,7 +233,7 @@ void bli_dgemmsup_rd_haswell_asm_6x1 + + + // ---------------------------------- iteration 2 +- ++ + #if 0 + prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a + prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a +@@ -287,27 +287,27 @@ void bli_dgemmsup_rd_haswell_asm_6x1 + add(imm(4*8), rax) // a += 4*cs_a = 4*8; + vfmadd231pd(ymm0, ymm3, ymm14) + +- ++ + + dec(rsi) // i -= 1; + jne(.DLOOPKITER16) // iterate again if i != 0. +- +- +- +- +- +- ++ ++ ++ ++ ++ ++ + label(.DCONSIDKITER4) +- ++ + mov(var(k_iter4), rsi) // i = k_iter4; + test(rsi, rsi) // check i via logical AND. + je(.DCONSIDKLEFT1) // if i == 0, jump to code that + // considers k_left1 loop. + // else, we prepare to enter k_iter4 loop. +- +- ++ ++ + label(.DLOOPKITER4) // EDGE LOOP (ymm) +- ++ + #if 0 + prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a + prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a +@@ -336,21 +336,21 @@ void bli_dgemmsup_rd_haswell_asm_6x1 + add(imm(4*8), rax) // a += 4*cs_a = 4*8; + vfmadd231pd(ymm0, ymm3, ymm14) + +- ++ + dec(rsi) // i -= 1; + jne(.DLOOPKITER4) // iterate again if i != 0. +- +- +- ++ ++ ++ + + label(.DCONSIDKLEFT1) +- ++ + mov(var(k_left1), rsi) // i = k_left1; + test(rsi, rsi) // check i via logical AND. + je(.DPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left1 loop. +- +- ++ ++ + + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) +@@ -358,7 +358,7 @@ void bli_dgemmsup_rd_haswell_asm_6x1 + // using the xmm registers would zero out the + // high bits of the destination registers, + // which would destory intermediate results. +- ++ + vmovsd(mem(rbx ), xmm0) + add(imm(1*8), rbx) // b += 1*rs_b = 1*8; + +@@ -381,12 +381,12 @@ void bli_dgemmsup_rd_haswell_asm_6x1 + add(imm(1*8), rax) // a += 1*cs_a = 1*8; + vfmadd231pd(ymm0, ymm3, ymm14) + +- ++ + dec(rsi) // i -= 1; + jne(.DLOOPKLEFT1) // iterate again if i != 0. +- +- +- ++ ++ ++ + + + +@@ -399,28 +399,28 @@ void bli_dgemmsup_rd_haswell_asm_6x1 + // ymm10 + // ymm12 + // ymm14 +- +- vhaddpd( ymm5, ymm4, ymm0 ) ++ ++ vhaddpd( ymm4, ymm4, ymm0 ) + vextractf128(imm(1), ymm0, xmm1 ) + vaddpd( xmm0, xmm1, xmm4 ) + +- vhaddpd( ymm7, ymm6, ymm0 ) ++ vhaddpd( ymm6, ymm6, ymm0 ) + vextractf128(imm(1), ymm0, xmm1 ) + vaddpd( xmm0, xmm1, xmm6 ) + +- vhaddpd( ymm9, ymm8, ymm0 ) ++ vhaddpd( ymm8, ymm8, ymm0 ) + vextractf128(imm(1), ymm0, xmm1 ) + vaddpd( xmm0, xmm1, xmm8 ) + +- vhaddpd( ymm11, ymm10, ymm0 ) ++ vhaddpd( ymm10, ymm10, ymm0 ) + vextractf128(imm(1), ymm0, xmm1 ) + vaddpd( xmm0, xmm1, xmm10 ) + +- vhaddpd( ymm13, ymm12, ymm0 ) ++ vhaddpd( ymm12, ymm12, ymm0 ) + vextractf128(imm(1), ymm0, xmm1 ) + vaddpd( xmm0, xmm1, xmm12 ) + +- vhaddpd( ymm15, ymm14, ymm0 ) ++ vhaddpd( ymm14, ymm14, ymm0 ) + vextractf128(imm(1), ymm0, xmm1 ) + vaddpd( xmm0, xmm1, xmm14 ) + +@@ -435,114 +435,114 @@ void bli_dgemmsup_rd_haswell_asm_6x1 + + //mov(var(rs_c), rdi) // load rs_c + //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(double) +- ++ + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate + vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate +- ++ + vmulpd(xmm0, xmm4, xmm4) // scale by alpha + vmulpd(xmm0, xmm6, xmm6) + vmulpd(xmm0, xmm8, xmm8) + vmulpd(xmm0, xmm10, xmm10) + vmulpd(xmm0, xmm12, xmm12) + vmulpd(xmm0, xmm14, xmm14) +- +- +- +- +- +- ++ ++ ++ ++ ++ ++ + //mov(var(cs_c), rsi) // load cs_c + //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) +- +- +- ++ ++ ++ + // now avoid loading C if beta == 0 +- ++ + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomisd(xmm0, xmm3) // set ZF if beta == 0. + je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case +- + +- ++ ++ + label(.DROWSTORED) +- + +- vmovsd(mem(rcx), xmm0) ++ ++ vmovsd(mem(rcx), xmm0) + vfmadd231pd(xmm0, xmm3, xmm4) + vmovsd(xmm4, mem(rcx)) + add(rdi, rcx) +- +- vmovsd(mem(rcx), xmm0) ++ ++ vmovsd(mem(rcx), xmm0) + vfmadd231pd(xmm0, xmm3, xmm6) + vmovsd(xmm6, mem(rcx)) + add(rdi, rcx) +- +- vmovsd(mem(rcx), xmm0) ++ ++ vmovsd(mem(rcx), xmm0) + vfmadd231pd(xmm0, xmm3, xmm8) + vmovsd(xmm8, mem(rcx)) + add(rdi, rcx) +- +- vmovsd(mem(rcx), xmm0) ++ ++ vmovsd(mem(rcx), xmm0) + vfmadd231pd(xmm0, xmm3, xmm10) + vmovsd(xmm10, mem(rcx)) + add(rdi, rcx) +- +- vmovsd(mem(rcx), xmm0) ++ ++ vmovsd(mem(rcx), xmm0) + vfmadd231pd(xmm0, xmm3, xmm12) + vmovsd(xmm12, mem(rcx)) + add(rdi, rcx) +- +- vmovsd(mem(rcx), xmm0) ++ ++ vmovsd(mem(rcx), xmm0) + vfmadd231pd(xmm0, xmm3, xmm14) + vmovsd(xmm14, mem(rcx)) + //add(rdi, rcx) +- +- +- ++ ++ ++ + jmp(.DDONE) // jump to end. +- +- +- +- ++ ++ ++ ++ + label(.DBETAZERO) +- + +- ++ ++ + label(.DROWSTORBZ) +- +- ++ ++ + vmovsd(xmm4, mem(rcx)) + add(rdi, rcx) +- ++ + vmovsd(xmm6, mem(rcx)) + add(rdi, rcx) +- ++ + vmovsd(xmm8, mem(rcx)) + add(rdi, rcx) +- ++ + vmovsd(xmm10, mem(rcx)) + add(rdi, rcx) +- ++ + vmovsd(xmm12, mem(rcx)) + add(rdi, rcx) +- ++ + vmovsd(xmm14, mem(rcx)) + //add(rdi, rcx) +- + +- +- +- ++ ++ ++ ++ + label(.DDONE) +- ++ + + + + label(.DRETURN) + +- ++ + + end_asm( + : // output operands (none) +@@ -613,9 +613,9 @@ void bli_dgemmsup_rd_haswell_asm_3x1 + // ------------------------------------------------------------------------- + + begin_asm() +- ++ + //vzeroall() // zero all xmm/ymm registers. +- ++ + mov(var(a), rax) // load address of a. + mov(var(rs_a), r8) // load rs_a + //mov(var(cs_a), r9) // load cs_a +@@ -633,7 +633,7 @@ void bli_dgemmsup_rd_haswell_asm_3x1 + + //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b + //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a +- ++ + + mov(var(c), rcx) // load address of c + mov(var(rs_c), rdi) // load rs_c +@@ -671,19 +671,19 @@ void bli_dgemmsup_rd_haswell_asm_3x1 + prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c + prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c + #endif +- + +- +- ++ ++ ++ + mov(var(k_iter16), rsi) // i = k_iter16; + test(rsi, rsi) // check i via logical AND. + je(.DCONSIDKITER4) // if i == 0, jump to code that + // contains the k_iter4 loop. +- +- ++ ++ + label(.DLOOPKITER16) // MAIN LOOP +- +- ++ ++ + // ---------------------------------- iteration 0 + + #if 0 +@@ -705,7 +705,7 @@ void bli_dgemmsup_rd_haswell_asm_3x1 + add(imm(4*8), rax) // a += 4*cs_a = 4*8; + vfmadd231pd(ymm0, ymm3, ymm8) + +- ++ + // ---------------------------------- iteration 1 + + vmovupd(mem(rbx ), ymm0) +@@ -723,7 +723,7 @@ void bli_dgemmsup_rd_haswell_asm_3x1 + + + // ---------------------------------- iteration 2 +- ++ + #if 0 + prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a + prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a +@@ -759,27 +759,27 @@ void bli_dgemmsup_rd_haswell_asm_3x1 + add(imm(4*8), rax) // a += 4*cs_a = 4*8; + vfmadd231pd(ymm0, ymm3, ymm8) + +- ++ + + dec(rsi) // i -= 1; + jne(.DLOOPKITER16) // iterate again if i != 0. +- +- +- +- +- +- ++ ++ ++ ++ ++ ++ + label(.DCONSIDKITER4) +- ++ + mov(var(k_iter4), rsi) // i = k_iter4; + test(rsi, rsi) // check i via logical AND. + je(.DCONSIDKLEFT1) // if i == 0, jump to code that + // considers k_left1 loop. + // else, we prepare to enter k_iter4 loop. +- +- ++ ++ + label(.DLOOPKITER4) // EDGE LOOP (ymm) +- ++ + #if 0 + prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a + prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a +@@ -799,21 +799,21 @@ void bli_dgemmsup_rd_haswell_asm_3x1 + add(imm(4*8), rax) // a += 4*cs_a = 4*8; + vfmadd231pd(ymm0, ymm3, ymm8) + +- ++ + dec(rsi) // i -= 1; + jne(.DLOOPKITER4) // iterate again if i != 0. +- +- +- ++ ++ ++ + + label(.DCONSIDKLEFT1) +- ++ + mov(var(k_left1), rsi) // i = k_left1; + test(rsi, rsi) // check i via logical AND. + je(.DPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left1 loop. +- +- ++ ++ + + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) +@@ -821,7 +821,7 @@ void bli_dgemmsup_rd_haswell_asm_3x1 + // using the xmm registers would zero out the + // high bits of the destination registers, + // which would destory intermediate results. +- ++ + vmovsd(mem(rbx ), xmm0) + add(imm(1*8), rbx) // b += 1*rs_b = 1*8; + +@@ -835,12 +835,12 @@ void bli_dgemmsup_rd_haswell_asm_3x1 + add(imm(1*8), rax) // a += 1*cs_a = 1*8; + vfmadd231pd(ymm0, ymm3, ymm8) + +- ++ + dec(rsi) // i -= 1; + jne(.DLOOPKLEFT1) // iterate again if i != 0. +- +- +- ++ ++ ++ + + + +@@ -850,16 +850,16 @@ void bli_dgemmsup_rd_haswell_asm_3x1 + // ymm4 + // ymm6 + // ymm8 +- +- vhaddpd( ymm5, ymm4, ymm0 ) ++ ++ vhaddpd( ymm4, ymm4, ymm0 ) + vextractf128(imm(1), ymm0, xmm1 ) + vaddpd( xmm0, xmm1, xmm4 ) + +- vhaddpd( ymm7, ymm6, ymm0 ) ++ vhaddpd( ymm6, ymm6, ymm0 ) + vextractf128(imm(1), ymm0, xmm1 ) + vaddpd( xmm0, xmm1, xmm6 ) + +- vhaddpd( ymm9, ymm8, ymm0 ) ++ vhaddpd( ymm8, ymm8, ymm0 ) + vextractf128(imm(1), ymm0, xmm1 ) + vaddpd( xmm0, xmm1, xmm8 ) + +@@ -871,87 +871,87 @@ void bli_dgemmsup_rd_haswell_asm_3x1 + + //mov(var(rs_c), rdi) // load rs_c + //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(double) +- ++ + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate + vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate +- ++ + vmulpd(xmm0, xmm4, xmm4) // scale by alpha + vmulpd(xmm0, xmm6, xmm6) + vmulpd(xmm0, xmm8, xmm8) +- +- +- +- +- +- ++ ++ ++ ++ ++ ++ + //mov(var(cs_c), rsi) // load cs_c + //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) +- +- +- ++ ++ ++ + // now avoid loading C if beta == 0 +- ++ + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomisd(xmm0, xmm3) // set ZF if beta == 0. + je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case +- + +- ++ ++ + label(.DROWSTORED) +- + +- vmovsd(mem(rcx), xmm0) ++ ++ vmovsd(mem(rcx), xmm0) + vfmadd231pd(xmm0, xmm3, xmm4) + vmovsd(xmm4, mem(rcx)) + add(rdi, rcx) +- +- vmovsd(mem(rcx), xmm0) ++ ++ vmovsd(mem(rcx), xmm0) + vfmadd231pd(xmm0, xmm3, xmm6) + vmovsd(xmm6, mem(rcx)) + add(rdi, rcx) +- +- vmovsd(mem(rcx), xmm0) ++ ++ vmovsd(mem(rcx), xmm0) + vfmadd231pd(xmm0, xmm3, xmm8) + vmovsd(xmm8, mem(rcx)) + //add(rdi, rcx) +- +- +- ++ ++ ++ + jmp(.DDONE) // jump to end. +- +- +- +- ++ ++ ++ ++ + label(.DBETAZERO) +- + +- ++ ++ + label(.DROWSTORBZ) +- +- ++ ++ + vmovsd(xmm4, mem(rcx)) + add(rdi, rcx) +- ++ + vmovsd(xmm6, mem(rcx)) + add(rdi, rcx) +- ++ + vmovsd(xmm8, mem(rcx)) + //add(rdi, rcx) +- + +- +- +- ++ ++ ++ ++ + label(.DDONE) +- ++ + + + + label(.DRETURN) + +- ++ + + end_asm( + : // output operands (none) +@@ -1022,9 +1022,9 @@ void bli_dgemmsup_rd_haswell_asm_2x1 + // ------------------------------------------------------------------------- + + begin_asm() +- ++ + //vzeroall() // zero all xmm/ymm registers. +- ++ + mov(var(a), rax) // load address of a. + mov(var(rs_a), r8) // load rs_a + //mov(var(cs_a), r9) // load cs_a +@@ -1042,7 +1042,7 @@ void bli_dgemmsup_rd_haswell_asm_2x1 + + //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b + //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a +- ++ + + mov(var(c), rcx) // load address of c + mov(var(rs_c), rdi) // load rs_c +@@ -1078,19 +1078,19 @@ void bli_dgemmsup_rd_haswell_asm_2x1 + prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c + prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c + #endif +- + +- +- ++ ++ ++ + mov(var(k_iter16), rsi) // i = k_iter16; + test(rsi, rsi) // check i via logical AND. + je(.DCONSIDKITER4) // if i == 0, jump to code that + // contains the k_iter4 loop. +- +- ++ ++ + label(.DLOOPKITER16) // MAIN LOOP +- +- ++ ++ + // ---------------------------------- iteration 0 + + #if 0 +@@ -1109,7 +1109,7 @@ void bli_dgemmsup_rd_haswell_asm_2x1 + add(imm(4*8), rax) // a += 4*cs_a = 4*8; + vfmadd231pd(ymm0, ymm3, ymm6) + +- ++ + // ---------------------------------- iteration 1 + + vmovupd(mem(rbx ), ymm0) +@@ -1124,7 +1124,7 @@ void bli_dgemmsup_rd_haswell_asm_2x1 + + + // ---------------------------------- iteration 2 +- ++ + #if 0 + prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a + prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a +@@ -1154,27 +1154,27 @@ void bli_dgemmsup_rd_haswell_asm_2x1 + add(imm(4*8), rax) // a += 4*cs_a = 4*8; + vfmadd231pd(ymm0, ymm3, ymm6) + +- ++ + + dec(rsi) // i -= 1; + jne(.DLOOPKITER16) // iterate again if i != 0. +- +- +- +- +- +- ++ ++ ++ ++ ++ ++ + label(.DCONSIDKITER4) +- ++ + mov(var(k_iter4), rsi) // i = k_iter4; + test(rsi, rsi) // check i via logical AND. + je(.DCONSIDKLEFT1) // if i == 0, jump to code that + // considers k_left1 loop. + // else, we prepare to enter k_iter4 loop. +- +- ++ ++ + label(.DLOOPKITER4) // EDGE LOOP (ymm) +- ++ + #if 0 + prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a + prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a +@@ -1191,21 +1191,21 @@ void bli_dgemmsup_rd_haswell_asm_2x1 + add(imm(4*8), rax) // a += 4*cs_a = 4*8; + vfmadd231pd(ymm0, ymm3, ymm6) + +- ++ + dec(rsi) // i -= 1; + jne(.DLOOPKITER4) // iterate again if i != 0. +- +- +- ++ ++ ++ + + label(.DCONSIDKLEFT1) +- ++ + mov(var(k_left1), rsi) // i = k_left1; + test(rsi, rsi) // check i via logical AND. + je(.DPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left1 loop. +- +- ++ ++ + + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) +@@ -1213,7 +1213,7 @@ void bli_dgemmsup_rd_haswell_asm_2x1 + // using the xmm registers would zero out the + // high bits of the destination registers, + // which would destory intermediate results. +- ++ + vmovsd(mem(rbx ), xmm0) + add(imm(1*8), rbx) // b += 1*rs_b = 1*8; + +@@ -1224,12 +1224,12 @@ void bli_dgemmsup_rd_haswell_asm_2x1 + add(imm(1*8), rax) // a += 1*cs_a = 1*8; + vfmadd231pd(ymm0, ymm3, ymm6) + +- ++ + dec(rsi) // i -= 1; + jne(.DLOOPKLEFT1) // iterate again if i != 0. +- +- +- ++ ++ ++ + + + +@@ -1238,12 +1238,12 @@ void bli_dgemmsup_rd_haswell_asm_2x1 + + // ymm4 + // ymm6 +- +- vhaddpd( ymm5, ymm4, ymm0 ) ++ ++ vhaddpd( ymm4, ymm4, ymm0 ) + vextractf128(imm(1), ymm0, xmm1 ) + vaddpd( xmm0, xmm1, xmm4 ) + +- vhaddpd( ymm7, ymm6, ymm0 ) ++ vhaddpd( ymm6, ymm6, ymm0 ) + vextractf128(imm(1), ymm0, xmm1 ) + vaddpd( xmm0, xmm1, xmm6 ) + +@@ -1254,78 +1254,78 @@ void bli_dgemmsup_rd_haswell_asm_2x1 + + //mov(var(rs_c), rdi) // load rs_c + //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(double) +- ++ + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate + vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate +- ++ + vmulpd(xmm0, xmm4, xmm4) // scale by alpha + vmulpd(xmm0, xmm6, xmm6) +- +- +- +- +- +- ++ ++ ++ ++ ++ ++ + //mov(var(cs_c), rsi) // load cs_c + //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) +- +- +- ++ ++ ++ + // now avoid loading C if beta == 0 +- ++ + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomisd(xmm0, xmm3) // set ZF if beta == 0. + je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case +- + +- ++ ++ + label(.DROWSTORED) +- + +- vmovsd(mem(rcx), xmm0) ++ ++ vmovsd(mem(rcx), xmm0) + vfmadd231pd(xmm0, xmm3, xmm4) + vmovsd(xmm4, mem(rcx)) + add(rdi, rcx) +- +- vmovsd(mem(rcx), xmm0) ++ ++ vmovsd(mem(rcx), xmm0) + vfmadd231pd(xmm0, xmm3, xmm6) + vmovsd(xmm6, mem(rcx)) + //add(rdi, rcx) +- +- +- ++ ++ ++ + jmp(.DDONE) // jump to end. +- +- +- +- ++ ++ ++ ++ + label(.DBETAZERO) +- + +- ++ ++ + label(.DROWSTORBZ) +- +- ++ ++ + vmovsd(xmm4, mem(rcx)) + add(rdi, rcx) +- ++ + vmovsd(xmm6, mem(rcx)) + //add(rdi, rcx) +- + +- +- +- ++ ++ ++ ++ + label(.DDONE) +- ++ + + + + label(.DRETURN) + +- ++ + + end_asm( + : // output operands (none) +@@ -1396,9 +1396,9 @@ void bli_dgemmsup_rd_haswell_asm_1x1 + // ------------------------------------------------------------------------- + + begin_asm() +- ++ + //vzeroall() // zero all xmm/ymm registers. +- ++ + mov(var(a), rax) // load address of a. + mov(var(rs_a), r8) // load rs_a + //mov(var(cs_a), r9) // load cs_a +@@ -1416,7 +1416,7 @@ void bli_dgemmsup_rd_haswell_asm_1x1 + + //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b + //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a +- ++ + + mov(var(c), rcx) // load address of c + mov(var(rs_c), rdi) // load rs_c +@@ -1450,19 +1450,19 @@ void bli_dgemmsup_rd_haswell_asm_1x1 + //lea(mem(r10, rdi, 1), r10) // rdx = c + 3*rs_c; + prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c + #endif +- + +- +- ++ ++ ++ + mov(var(k_iter16), rsi) // i = k_iter16; + test(rsi, rsi) // check i via logical AND. + je(.DCONSIDKITER4) // if i == 0, jump to code that + // contains the k_iter4 loop. +- +- ++ ++ + label(.DLOOPKITER16) // MAIN LOOP +- +- ++ ++ + // ---------------------------------- iteration 0 + + #if 0 +@@ -1478,7 +1478,7 @@ void bli_dgemmsup_rd_haswell_asm_1x1 + add(imm(4*8), rax) // a += 4*cs_a = 4*8; + vfmadd231pd(ymm0, ymm3, ymm4) + +- ++ + // ---------------------------------- iteration 1 + + vmovupd(mem(rbx ), ymm0) +@@ -1490,7 +1490,7 @@ void bli_dgemmsup_rd_haswell_asm_1x1 + + + // ---------------------------------- iteration 2 +- ++ + #if 0 + prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a + prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a +@@ -1514,27 +1514,27 @@ void bli_dgemmsup_rd_haswell_asm_1x1 + add(imm(4*8), rax) // a += 4*cs_a = 4*8; + vfmadd231pd(ymm0, ymm3, ymm4) + +- ++ + + dec(rsi) // i -= 1; + jne(.DLOOPKITER16) // iterate again if i != 0. +- +- +- +- +- +- ++ ++ ++ ++ ++ ++ + label(.DCONSIDKITER4) +- ++ + mov(var(k_iter4), rsi) // i = k_iter4; + test(rsi, rsi) // check i via logical AND. + je(.DCONSIDKLEFT1) // if i == 0, jump to code that + // considers k_left1 loop. + // else, we prepare to enter k_iter4 loop. +- +- ++ ++ + label(.DLOOPKITER4) // EDGE LOOP (ymm) +- ++ + #if 0 + prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a + prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a +@@ -1548,21 +1548,21 @@ void bli_dgemmsup_rd_haswell_asm_1x1 + add(imm(4*8), rax) // a += 4*cs_a = 4*8; + vfmadd231pd(ymm0, ymm3, ymm4) + +- ++ + dec(rsi) // i -= 1; + jne(.DLOOPKITER4) // iterate again if i != 0. +- +- +- ++ ++ ++ + + label(.DCONSIDKLEFT1) +- ++ + mov(var(k_left1), rsi) // i = k_left1; + test(rsi, rsi) // check i via logical AND. + je(.DPOSTACCUM) // if i == 0, we're done; jump to end. + // else, we prepare to enter k_left1 loop. +- +- ++ ++ + + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) +@@ -1570,7 +1570,7 @@ void bli_dgemmsup_rd_haswell_asm_1x1 + // using the xmm registers would zero out the + // high bits of the destination registers, + // which would destory intermediate results. +- ++ + vmovsd(mem(rbx ), xmm0) + add(imm(1*8), rbx) // b += 1*rs_b = 1*8; + +@@ -1578,12 +1578,12 @@ void bli_dgemmsup_rd_haswell_asm_1x1 + add(imm(1*8), rax) // a += 1*cs_a = 1*8; + vfmadd231pd(ymm0, ymm3, ymm4) + +- ++ + dec(rsi) // i -= 1; + jne(.DLOOPKLEFT1) // iterate again if i != 0. +- +- +- ++ ++ ++ + + + +@@ -1591,8 +1591,8 @@ void bli_dgemmsup_rd_haswell_asm_1x1 + label(.DPOSTACCUM) + + // ymm4 +- +- vhaddpd( ymm5, ymm4, ymm0 ) ++ ++ vhaddpd( ymm4, ymm4, ymm0 ) + vextractf128(imm(1), ymm0, xmm1 ) + vaddpd( xmm0, xmm1, xmm4 ) + +@@ -1602,69 +1602,69 @@ void bli_dgemmsup_rd_haswell_asm_1x1 + + //mov(var(rs_c), rdi) // load rs_c + //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(double) +- ++ + mov(var(alpha), rax) // load address of alpha + mov(var(beta), rbx) // load address of beta + vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate + vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate +- ++ + vmulpd(xmm0, xmm4, xmm4) // scale by alpha +- +- +- +- +- +- ++ ++ ++ ++ ++ ++ + //mov(var(cs_c), rsi) // load cs_c + //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) +- +- +- ++ ++ ++ + // now avoid loading C if beta == 0 +- ++ + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. + vucomisd(xmm0, xmm3) // set ZF if beta == 0. + je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case +- + +- ++ ++ + label(.DROWSTORED) +- + +- vmovsd(mem(rcx), xmm0) ++ ++ vmovsd(mem(rcx), xmm0) + vfmadd231pd(xmm0, xmm3, xmm4) + vmovsd(xmm4, mem(rcx)) + //add(rdi, rcx) +- +- +- ++ ++ ++ + jmp(.DDONE) // jump to end. +- +- +- +- ++ ++ ++ ++ + label(.DBETAZERO) +- + +- ++ ++ + label(.DROWSTORBZ) +- +- ++ ++ + vmovsd(xmm4, mem(rcx)) + //add(rdi, rcx) +- + +- +- +- ++ ++ ++ ++ + label(.DDONE) +- ++ + + + + label(.DRETURN) + +- ++ + + end_asm( + : // output operands (none) +diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c +index 21dd3b89..516bfced 100644 +--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c ++++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c +@@ -1338,17 +1338,6 @@ void bli_dgemmsup_rd_haswell_asm_1x4 + + vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) + +- +- //vhaddpd( ymm8, ymm5, ymm0 ) +- //vextractf128(imm(1), ymm0, xmm1 ) +- //vaddpd( xmm0, xmm1, xmm0 ) +- +- //vhaddpd( ymm14, ymm11, ymm2 ) +- //vextractf128(imm(1), ymm2, xmm1 ) +- //vaddpd( xmm2, xmm1, xmm2 ) +- +- //vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) +- + // xmm4[0:3] = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) + +