From ab8c87801bf0f2b7e6a1de93848c0467a8efa265 Mon Sep 17 00:00:00 2001
From: Bine Brank <b.brank@fz-juelich.de>
Date: Fri, 29 Jan 2021 12:14:17 +0100
Subject: [PATCH] add outer-loop vectorisation

---
 linear-algebra/blas/gemver/gemver.c      |  69 ++++++++++++++++++++++-
 linear-algebra/blas/gesummv/gesummv.c    |  31 +++++++++-
 linear-algebra/blas/symm/symm.c          |  48 ++++++++++++++--
 linear-algebra/blas/trmm/trmm.c          |  28 +++++++++
 linear-algebra/kernels/2mm/2mm.c         |  47 +++++++++++++++
 linear-algebra/kernels/3mm/3mm.c         |  61 ++++++++++++++++++++
 linear-algebra/kernels/doitgen/doitgen.c |  32 ++++++++++-
 linear-algebra/kernels/mvt/mvt.c         |  34 +++++++++++
 utilities/.run-all.pl.swp                | Bin 0 -> 12288 bytes
 utilities/makefile-gen.pl                |   2 +-
 utilities/run-all.pl                     |   2 -
 11 files changed, 343 insertions(+), 11 deletions(-)
 create mode 100644 utilities/.run-all.pl.swp

diff --git a/linear-algebra/blas/gemver/gemver.c b/linear-algebra/blas/gemver/gemver.c
index 65620c0..b930ef2 100644
--- a/linear-algebra/blas/gemver/gemver.c
+++ b/linear-algebra/blas/gemver/gemver.c
@@ -104,7 +104,73 @@ void kernel_gemver(int n,
 		   DATA_TYPE POLYBENCH_1D(z,N,n))
 {
   int i, j;
-#ifdef USEINTRINSICS
+
+#ifdef VECTORIZE_OUTER
+#ifdef __ARM_FEATURE_SVE
+#pragma scop
+  int slice = svcntd();
+  svfloat64_t betav = svdup_f64(beta);
+  svfloat64_t alphav = svdup_f64(alpha);
+  svuint64_t indv = svindex_u64(0, _PB_N);
+  for (i = 0; i < _PB_N; i++){
+      j = 0;
+      svfloat64_t u1v = svdup_f64(u1[i]);
+      svfloat64_t u2v = svdup_f64(u2[i]);
+      svbool_t pg = svwhilelt_b64(j, _PB_N);
+      do {
+          svfloat64_t a = svld1(pg, &A[i][j]);
+          svfloat64_t v1v = svld1(pg, &v1[j]);
+          svfloat64_t v2v = svld1(pg, &v2[j]);
+          svfloat64_t temp = svmul_z(pg, u2v, v2v);
+          a = svmla_z(pg, a, v1v, u1v);
+          svst1_f64(pg, &A[i][j], svadd_z(pg, a, temp));
+          j += svcntd();
+          pg = svwhilelt_b64(j, _PB_N);
+      } while (svptest_any(svptrue_b64(), pg));
+  }
+
+  for (i = 0; i < _PB_N; i += slice) {
+    svbool_t pg = svwhilelt_b64(i, _PB_N);
+    svfloat64_t xv = svld1(pg, &x[i]);
+    for (j = 0; j < _PB_N; j++) {
+      svfloat64_t av = svld1(pg, &A[j][i]);
+      svfloat64_t yv = svdup_f64(y[j]);
+      svfloat64_t temp = svmul_z(pg, betav, av);
+      xv = svmla_z(pg, xv, temp, yv);
+    }
+    svst1_f64(pg, &x[i], xv);
+  }
+
+  i = 0;
+  svbool_t pg = svwhilelt_b64(i, _PB_N);
+  do {
+      svfloat64_t xv = svld1(pg, &x[i]);
+      svfloat64_t zv = svld1(pg, &z[i]);
+      svfloat64_t temp = svadd_z(pg, xv, zv);
+      svst1_f64(pg, &x[i], temp);
+      i += svcntd();
+      pg = svwhilelt_b64(i, _PB_N);
+  } while (svptest_any(svptrue_b64(), pg));
+
+  for (i = 0; i < _PB_N; i += slice) {
+    svbool_t pg = svwhilelt_b64(i, _PB_N);
+    svfloat64_t wv = svld1(pg, &w[i]);
+    for (j = 0; j < _PB_N; j++) {
+      svfloat64_t av = svld1_gather_index(pg, &A[i][j], indv);
+      svfloat64_t xv = svdup_f64(x[j]);
+      svfloat64_t temp = svmul_z(pg, alphav, av);
+      wv = svmla_z(pg, wv, temp, xv);
+    }
+    svst1_f64(pg, &w[i], wv);
+  }
+
+#pragma endscop
+#else
+#error -DVECTORIZE_OUTER used but no support for SVE
+#endif
+
+
+#elif defined(USEINTRINSICS)
 #ifdef __ARM_FEATURE_SVE
 #pragma scop
   for (i = 0; i < _PB_N; i++){
@@ -166,7 +232,6 @@ void kernel_gemver(int n,
       } while (svptest_any(svptrue_b64(), pg));
   }
 #pragma endscop
-#pragma endscop
 #else
 #error -DUSEINTRISICS used but no support for SVE
 #endif
diff --git a/linear-algebra/blas/gesummv/gesummv.c b/linear-algebra/blas/gesummv/gesummv.c
index 54a2b60..9a04787 100644
--- a/linear-algebra/blas/gesummv/gesummv.c
+++ b/linear-algebra/blas/gesummv/gesummv.c
@@ -88,7 +88,36 @@ void kernel_gesummv(int n,
 {
   int i, j;
 
-#ifdef USEINTRINSICS
+#ifdef VECTORIZE_OUTER
+#ifdef __ARM_FEATURE_SVE
+#pragma scop
+  int slice = svcntd();
+  svfloat64_t betav = svdup_f64(beta);
+  svfloat64_t alphav = svdup_f64(alpha);
+  svuint64_t indv = svindex_u64(0, _PB_N);
+  for (i = 0; i < _PB_N; i += slice)
+    {
+        svbool_t pg = svwhilelt_b64(i, _PB_N);
+        svfloat64_t tmpv = svdup_f64(0.0);
+        svfloat64_t yv = svdup_f64(0.0);
+        for (j = 0; j < _PB_N; j++)
+        {
+            svfloat64_t xv = svdup_f64(x[j]);
+            svfloat64_t av = svld1_gather_index(pg, &A[i][j], indv);
+            svfloat64_t bv = svld1_gather_index(pg, &B[i][j], indv);
+            tmpv = svmla_z(pg, tmpv, av, xv);
+            yv = svmla_z(pg, yv, bv, xv);
+        }
+        svfloat64_t tv = svmul_z(pg, betav, yv);
+        yv = svmla_z(pg, tv, alphav, tmpv);
+        svst1_f64(pg, &y[i], yv);
+    }
+#pragma endscop
+#else
+#error -DVECTORIZE_OUTER used but no support for SVE
+#endif
+
+#elif defined(USEINTRINSICS)
 #ifdef __ARM_FEATURE_SVE
 #pragma scop
   for (i = 0; i < _PB_N; i++)
diff --git a/linear-algebra/blas/symm/symm.c b/linear-algebra/blas/symm/symm.c
index 956aeeb..4744294 100644
--- a/linear-algebra/blas/symm/symm.c
+++ b/linear-algebra/blas/symm/symm.c
@@ -20,6 +20,9 @@
 #include <sys/prctl.h>
 #endif
 
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif
 
 /* Include benchmark-specific header. */
 #include "symm.h"
@@ -87,14 +90,50 @@ void kernel_symm(int m, int n,
   int i, j, k;
   DATA_TYPE temp2;
 
-//BLAS PARAMS
-//SIDE = 'L'
-//UPLO = 'L'
+// BLAS PARAMS
+// SIDE = 'L'
+// UPLO = 'L'
 // =>  Form  C := alpha*A*B + beta*C
 // A is MxM
 // B is MxN
 // C is MxN
-//note that due to Fortran array layout, the code below more closely resembles upper triangular case in BLAS
+// note that due to Fortran array layout, the code below more closely resembles upper triangular case in BLAS
+
+#if defined(VECTORIZE_OUTER)
+#ifdef __ARM_FEATURE_SVE
+#pragma scop
+  int slice = svcntd();
+  svfloat64_t betav = svdup_f64(beta);
+  svfloat64_t alphav = svdup_f64(alpha);
+  svuint64_t indv = svindex_u64(0, _PB_N);
+   for (i = 0; i < _PB_M; i++)
+      for (j = 0; j < _PB_N; j += slice )
+      {
+        svfloat64_t temp2v = svdup_f64(0.0);
+        svbool_t pg = svwhilelt_b64(j, _PB_N);
+        svfloat64_t bijv = svld1(pg, &B[i][j]);
+        for (k = 0; k < i; k++) {
+            svfloat64_t aikv = svdup_f64(A[i][k]);
+            svfloat64_t ckjv = svld1(pg, &C[k][j]);
+            svfloat64_t tempv = svmul_z(pg, alphav, bijv);
+            svst1_f64(pg, &C[k][j], svmla_z(pg, ckjv, tempv, aikv));
+
+            svfloat64_t bkjv = svld1(pg, &B[k][j]);
+            temp2v = svmla_z(pg, temp2v, bkjv, aikv);
+        }
+        svfloat64_t cijv = svld1(pg, &C[i][j]);
+        svfloat64_t bc = svmul_z(pg, betav, cijv);
+        svfloat64_t aiiv = svdup_f64(A[i][i]);
+        svfloat64_t aba = svmul_z(pg, alphav, svmul_z(pg, bijv, aiiv));
+        svfloat64_t tempv = svadd_z(pg, bc, aba);
+        svst1_f64(pg, &C[i][j], svmla_z(pg, tempv, alphav, temp2v));
+     }
+#pragma endscop
+#else
+#error -DVECTORIZE_OUTER used but no support for SVE
+#endif
+
+#else
 #pragma scop
    for (i = 0; i < _PB_M; i++)
       for (j = 0; j < _PB_N; j++ )
@@ -107,6 +146,7 @@ void kernel_symm(int m, int n,
         C[i][j] = beta * C[i][j] + alpha*B[i][j] * A[i][i] + alpha * temp2;
      }
 #pragma endscop
+#endif
 
 }
 
diff --git a/linear-algebra/blas/trmm/trmm.c b/linear-algebra/blas/trmm/trmm.c
index dccab73..acbc236 100644
--- a/linear-algebra/blas/trmm/trmm.c
+++ b/linear-algebra/blas/trmm/trmm.c
@@ -20,6 +20,9 @@
 #include <sys/prctl.h>
 #endif
 
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif /* __ARM_FEATURE_SVE */ 
 
 /* Include benchmark-specific header. */
 #include "trmm.h"
@@ -88,6 +91,30 @@ void kernel_trmm(int m, int n,
 // => Form  B := alpha*A**T*B.
 // A is MxM
 // B is MxN
+#ifdef VECTORIZE_OUTER
+#ifdef __ARM_FEATURE_SVE
+#pragma scop
+  int slice = svcntd();
+  svfloat64_t alphav = svdup_f64(alpha);
+  svuint64_t indv = svindex_u64(0, _PB_N);
+  for (i = 0; i < _PB_M; i++) {
+     for (j = 0; j < _PB_N; j += slice) {
+         svbool_t pg = svwhilelt_b64(j, _PB_N);
+         svfloat64_t bijv = svld1(pg, &B[i][j]);
+         for (k = i+1; k < _PB_M; k++) {
+             svfloat64_t akiv = svdup_f64(A[k][i]);
+             svfloat64_t bkjv = svld1(pg, &B[k][j]);
+             bijv = svmla_z(pg, bijv, akiv, bkjv);
+         }
+         bijv = svmul_z(pg, alphav, bijv);
+         svst1_f64(pg, &B[i][j], bijv);
+     }
+  }
+#pragma endscop
+#else
+#error -DVECTORIZE_OUTER used but no support for SVE
+#endif
+#else
 #pragma scop
   for (i = 0; i < _PB_M; i++)
      for (j = 0; j < _PB_N; j++) {
@@ -96,6 +123,7 @@ void kernel_trmm(int m, int n,
         B[i][j] = alpha * B[i][j];
      }
 #pragma endscop
+#endif
 
 }
 
diff --git a/linear-algebra/kernels/2mm/2mm.c b/linear-algebra/kernels/2mm/2mm.c
index 562c2f5..a0f5bd4 100644
--- a/linear-algebra/kernels/2mm/2mm.c
+++ b/linear-algebra/kernels/2mm/2mm.c
@@ -20,6 +20,9 @@
 #include <sys/prctl.h>
 #endif
 
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif /* __ARM_FEATURE_SVE */ 
 
 /* Include benchmark-specific header. */
 #include "2mm.h"
@@ -89,7 +92,50 @@ void kernel_2mm(int ni, int nj, int nk, int nl,
 		DATA_TYPE POLYBENCH_2D(D,NI,NL,ni,nl))
 {
   int i, j, k;
+#ifdef VECTORIZE_OUTER
+#ifdef __ARM_FEATURE_SVE
+#pragma scop
+  int slice = svcntd();
+  svfloat64_t betav = svdup_f64(beta);
+  svfloat64_t alphav = svdup_f64(alpha);
+  /* D := alpha*A*B*C + beta*D */
+  for (i = 0; i < _PB_NI; i++) {
+      for (j = 0; j < _PB_NJ; j += slice)
+      {
+          svbool_t pg = svwhilelt_b64(j, _PB_NJ);
+          svfloat64_t tmpijv = svdup_f64(0.0);
+          //tmp[i][j] = SCALAR_VAL(0.0);
+          for (k = 0; k < _PB_NK; ++k) {
+            svfloat64_t aikv = svdup_f64(A[i][k]);
+            svfloat64_t bkjv = svld1(pg, &B[k][j]);
+            svfloat64_t tmp = svmul_z(pg, alphav, aikv);
+            tmpijv = svmla_z(pg, tmpijv, tmp, bkjv);
+          }
+          svst1_f64(pg, &tmp[i][j], tmpijv);
+      }
+  }
+  for (i = 0; i < _PB_NI; i++) {
+      for (j = 0; j < _PB_NL; j += slice)
+      {
+          svbool_t pg = svwhilelt_b64(j, _PB_NL);
+          svfloat64_t dijv = svld1(pg, &D[i][j]);
+          dijv = svmul_z(pg, dijv, betav);
+          for (k = 0; k < _PB_NJ; ++k) {
+            svfloat64_t tmpikv = svdup_f64(tmp[i][k]);
+            svfloat64_t ckjv = svld1(pg, &C[k][j]);
+            dijv = svmla_z(pg, dijv, tmpikv, ckjv);
+          }
+          svst1_f64(pg, &D[i][j], dijv);
+      }
+  }
+
+#pragma endscop
+#else
+#error -DVECTORIZE_OUTER used but no support for SVE
+#endif
+
 
+#else
 #pragma scop
   /* D := alpha*A*B*C + beta*D */
   for (i = 0; i < _PB_NI; i++)
@@ -107,6 +153,7 @@ void kernel_2mm(int ni, int nj, int nk, int nl,
 	  D[i][j] += tmp[i][k] * C[k][j];
       }
 #pragma endscop
+#endif
 
 }
 
diff --git a/linear-algebra/kernels/3mm/3mm.c b/linear-algebra/kernels/3mm/3mm.c
index b339912..6b93c53 100644
--- a/linear-algebra/kernels/3mm/3mm.c
+++ b/linear-algebra/kernels/3mm/3mm.c
@@ -20,6 +20,9 @@
 #include <sys/prctl.h>
 #endif
 
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif /* __ARM_FEATURE_SVE */ 
 
 /* Include benchmark-specific header. */
 #include "3mm.h"
@@ -85,7 +88,64 @@ void kernel_3mm(int ni, int nj, int nk, int nl, int nm,
 		DATA_TYPE POLYBENCH_2D(G,NI,NL,ni,nl))
 {
   int i, j, k;
+#ifdef VECTORIZE_OUTER
+#ifdef __ARM_FEATURE_SVE
+#pragma scop
+  int slice = svcntd();
+  /* E := A*B */
+  for (i = 0; i < _PB_NI; i++)
+  {
+      for (j = 0; j < _PB_NJ; j += slice)
+      {
+          svbool_t pg = svwhilelt_b64(j, _PB_NJ);
+          svfloat64_t eijv = svdup_f64(0.0);
+          for (k = 0; k < _PB_NK; ++k) {
+              svfloat64_t aikv = svdup_f64(A[i][k]);
+              svfloat64_t bkjv = svld1(pg, &B[k][j]);
+              eijv = svmla_z(pg, eijv, aikv, bkjv);
+          }
+          svst1_f64(pg, &E[i][j], eijv);
+      }
+  }
+  /* F := C*D */
+  for (i = 0; i < _PB_NJ; i++)
+  {
+      for (j = 0; j < _PB_NL; j += slice)
+      {
+          svbool_t pg = svwhilelt_b64(j, _PB_NL);
+          svfloat64_t fijv = svdup_f64(0.0);
+          for (k = 0; k < _PB_NM; ++k)
+          {
+              svfloat64_t cikv = svdup_f64(C[i][k]);
+              svfloat64_t dkjv = svld1(pg, &D[k][j]);
+              fijv = svmla_z(pg, fijv, cikv, dkjv);
+          }
+          svst1_f64(pg, &F[i][j], fijv);
+      }
+  }
+  /* G := E*F */
+  for (i = 0; i < _PB_NI; i++)
+  {
+      for (j = 0; j < _PB_NL; j += slice)
+      {
+          svbool_t pg = svwhilelt_b64(j, _PB_NL);
+          svfloat64_t gijv = svdup_f64(0.0);
+          for (k = 0; k < _PB_NJ; ++k)
+          {
+              svfloat64_t eikv = svdup_f64(E[i][k]);
+              svfloat64_t fkjv = svld1(pg, &F[k][j]);
+              gijv = svmla_z(pg, gijv, eikv, fkjv);
+          }
+          svst1_f64(pg, &G[i][j], gijv);
+      }
+  }
 
+#pragma endscop
+#else
+#error -DVECTORIZE_OUTER used but no support for SVE
+#endif
+
+#else
 #pragma scop
   /* E := A*B */
   for (i = 0; i < _PB_NI; i++)
@@ -112,6 +172,7 @@ void kernel_3mm(int ni, int nj, int nk, int nl, int nm,
 	  G[i][j] += E[i][k] * F[k][j];
       }
 #pragma endscop
+#endif
 
 }
 
diff --git a/linear-algebra/kernels/doitgen/doitgen.c b/linear-algebra/kernels/doitgen/doitgen.c
index ed6fcdc..d3667d2 100644
--- a/linear-algebra/kernels/doitgen/doitgen.c
+++ b/linear-algebra/kernels/doitgen/doitgen.c
@@ -20,6 +20,9 @@
 #include <sys/prctl.h>
 #endif
 
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif /* __ARM_FEATURE_SVE */ 
 
 /* Include benchmark-specific header. */
 #include "doitgen.h"
@@ -74,7 +77,33 @@ void kernel_doitgen(int nr, int nq, int np,
 		    DATA_TYPE POLYBENCH_1D(sum,NP,np))
 {
   int r, q, p, s;
-
+#ifdef VECTORIZE_OUTER
+#ifdef __ARM_FEATURE_SVE
+#pragma scop
+  int slice = svcntd();
+  for (r = 0; r < _PB_NR; r++) {
+      for (q = 0; q < _PB_NQ; q++)  {
+          for (p = 0; p < _PB_NP; p += slice)  {
+              svfloat64_t sumv = svdup_f64(0.0);
+              svbool_t pg = svwhilelt_b64(p, _PB_NP);
+              //sum[p] = SCALAR_VAL(0.0);
+              for (s = 0; s < _PB_NP; s++) {
+                  svfloat64_t c4sp = svld1(pg, &C4[s][p]);
+                  svfloat64_t arqsv = svdup_f64(A[r][q][s]);
+                  sumv = svmla_z(pg, sumv, arqsv, c4sp);
+              }
+              svst1_f64(pg, &sum[p], sumv);
+          }
+          for (p = 0; p < _PB_NP; p++)
+              A[r][q][p] = sum[p];
+      }
+  }
+#pragma endscop
+#else
+#error -DVECTORIZE_OUTER used but no support for SVE
+#endif
+
+#else
 #pragma scop
   for (r = 0; r < _PB_NR; r++)
     for (q = 0; q < _PB_NQ; q++)  {
@@ -87,6 +116,7 @@ void kernel_doitgen(int nr, int nq, int np,
 	A[r][q][p] = sum[p];
     }
 #pragma endscop
+#endif
 
 }
 
diff --git a/linear-algebra/kernels/mvt/mvt.c b/linear-algebra/kernels/mvt/mvt.c
index 88fa97d..c8ac137 100644
--- a/linear-algebra/kernels/mvt/mvt.c
+++ b/linear-algebra/kernels/mvt/mvt.c
@@ -20,6 +20,9 @@
 #include <sys/prctl.h>
 #endif
 
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif /* __ARM_FEATURE_SVE */ 
 
 /* Include benchmark-specific header. */
 #include "mvt.h"
@@ -89,7 +92,37 @@ void kernel_mvt(int n,
 		DATA_TYPE POLYBENCH_2D(A,N,N,n,n))
 {
   int i, j;
+#ifdef VECTORIZE_OUTER
+#ifdef __ARM_FEATURE_SVE
+#pragma scop
+  int slice = svcntd();
+  svuint64_t indv = svindex_u64(0, _PB_N);
+  for (i = 0; i < _PB_N; i += slice) {
+      svbool_t pg = svwhilelt_b64(i, _PB_N);
+      svfloat64_t xv = svld1(pg, &x1[i]);
+    for (j = 0; j < _PB_N; j++) {
+      svfloat64_t y1v = svdup_f64(y_1[j]);
+      svfloat64_t av = svld1_gather_index(pg, &A[i][j], indv);
+      xv = svmla_z(pg, xv, av, y1v);
+    }
+    svst1_f64(pg, &x1[i], xv);
+  }
+  for (i = 0; i < _PB_N; i += slice) {
+      svbool_t pg = svwhilelt_b64(i, _PB_N);
+      svfloat64_t xv = svld1(pg, &x2[i]);
+    for (j = 0; j < _PB_N; j++) {
+      svfloat64_t y2v = svdup_f64(y_2[j]);
+      svfloat64_t av = svld1(pg, &A[j][i]);
+      xv = svmla_z(pg, xv, av, y2v);
+    }
+    svst1_f64(pg, &x2[i], xv);
+  }
+#pragma endscop
+#else
+#error -DVECTORIZE_OUTER used but no support for SVE
+#endif
 
+#else
 #pragma scop
   for (i = 0; i < _PB_N; i++)
     for (j = 0; j < _PB_N; j++)
@@ -98,6 +131,7 @@ void kernel_mvt(int n,
     for (j = 0; j < _PB_N; j++)
       x2[i] = x2[i] + A[j][i] * y_2[j];
 #pragma endscop
+#endif
 
 }
 
diff --git a/utilities/.run-all.pl.swp b/utilities/.run-all.pl.swp
new file mode 100644
index 0000000000000000000000000000000000000000..dcd1ff98da5998a0f8ea923613dbd1df017d20aa
GIT binary patch
literal 12288
zcmYc?2=nw+u+TGNU|?VnU|?APRyg64P=jF?9|J>DQDR=UAxHv0&MHkTPtDX#G_t_2
zt`1_LenEatWm0Nha)xeld~sQ-erZW&PG(7FYO#J%X`XIkPL5td4t|qHWk*9`Gz3Tr
zfzpyRT?<|YV<STYkUnK4MFnA@P!MwzkA}c#2#kinXb6mkz-S1JhQMeDjE2By2#k;r
zC@ElMsAphcV1oKL8A>yv(NOLvH5vk=Aut*OqaiRF0;3@?8UmvsFd71*Aut*OqaiRF
z0;3@?1VbP(g@IuX69a<_3uOKu*8l&&&%p4WpMhZ|KLbN7KLbNFKLdjaKLdjiKLdjy
zKLdj-KLdjdKLdjRKLf)LJ_d&Gd<+Z^`4|{B@G&sd@i8#e@-Z-E@i8!P@G&s_;bma>
z#mm6(lb3<v2QLG|Q(gv!6TA!z>v<U%*77nitl?!~n8nM$(8SBYki^Tt;Kj?pu#AU+
zp@oNmp@N5jp^S%tp_GS#p@fHlp_qq(!Hb81L5_!k;S)Cl!zFG8hArF-4AZ$87^ZPE
zFnDn@FnDq^FgS2CFeq~~F#P3WVA#XOz%Y}GfuVwnfx(Z9fx(xHfx(E2fq{dIfq|Wi
zf#D%11H&3l28P+33=A_k85mqR85nFi85nFh85s6)Ffi=pU|`t8!N8Eu!N6e4!N8!w
z!N9=9!NBl{oq=IDI|IWqb_Rx4b_RxAb_NDLb_NC>b_ND+b_RyeYzz!X*ccdAvoSDK
zvN14tvoSF6vN14RU}a#~!OFnU#LB?n%gVqY#LB>Ml7)dGj)j3Cl7)dGf`x&>orQtH
zl?4(n&zKn)o-z-?#5L;V!4d+s3<?Sg$vOGOsVSL73ND^O)(jvKu&@Fc6clCVm85AX
z<(HPEW#*(>#gs6BWeICyAXXuY>8iShX2mfjs%B|1C8{Y1MxK(6f=YI3QC@0}j)F=_
zW^SsEf=Y2_6^Ket%gHY&)=^MN%g-+XJ6J(q!O+0KK+_uJ8?ZkVlpzvG9)TJcQvx<F
zrUYzUOo<BEG!P9lEhbM1$)MayWQ%MSlJj#5N=s7Xb5nEkiz+pgpvHpS3vwT_8$eni
zj#aQ#fY_s;ub=_8UqK7(WCJij*93Nw0>UmZpP*GB>p>P7BDr2ABQ-H4wJ1J0Kd&S;
zuSCJNP9ax6Gc6@GO+m%QF~l+0HAGJ<CQnbxULz(&OEV@<A4xOF{pk7?Qu3>F@{<#D
zG#ng*+{1KK^fhgvmRQ@^)ne7HV5^{{Qj%Ddo?7CPS)>mQF})0=U{+Ru##cyYZmJb5
z#vuv73YHR3B1TyO<UCNyK=ZS*f=YgANkM5zd~r!pW?nkP&me_X3K}sfdLVxy`BGV-
zAhD=8RRN+$0h}SgNeGgHG?YL(^Gi#hj#5I3D@a7zDioI{6_*sj9BrVZplfKYpj?!i
zn_rfyke6DXlbM&QkercNl$czSS_Ds{U>|EJflWa+7A6jH5!4r06vX5q3nGO^f=Xgp
zYEfc(YJ75jZf;^;N&+HUK*~@IR<Kn_P{C9OaRf|4PYKmBh%8i)6Q)W*&sIT6!N|@|
zLB&5b#Ld&kRY@T;O+f?3Q^-qIP*>NqW>5wJ1(guTAa~c0co)wgq{V-r;EqoMtqaUe
zEXvj^&QQ>Ur5IfmXMf)SPaoHKXMbN`M?V+63MFd>M2vy_hURln(o0TJK%_ovg@VeG
zjQl(twnMe+6;#3!9fqa4D!#6c!J$E}dL_99Ap2AD^HQNn8Mjjrx+)Yv0R&0f_zkya
zNXchN%P&&MRLIOzs5aEoGc*8~#^Cf>l&hedMw}~AqaPaCm_dUYlJE=*%3lzUp1Que
zo=Qq)5vqX@zkrm1rQiuXFSVisoB$Lxbm4ITH$xLCC&5*M47aUQkBiaM(*mUw1qFrj
zjLe)=1r1Q-f@&>Wg`(8N6cEeBGe}dR8f063L24dU1XNpqOB#jzB88O9R0XA!%%arf
zlKi4dSSnD+%P&z#%P-AK(SwyJAj2VE1o;AyE+LLhPAsto#Y1Xha)yFRa$<>shC^~<
zNosn2QD$l}$PFO<U{ws@oS}eJ?RbMLF>s72C@92~fYLXZ0V=LzO5DL^drS$uF4M?%
z(o|3YI}GGCm_{hAo|~GIlUfN48mJ(QQZFt^%}dVADOT6PrZpw8Br!KLFEcM4yF93v
zN-WY%%t=p8DoWHZ&d(`JEy7_Inp#K-0IAKbR6q(F1zQCTbu_g}If)>J4A2HcYDH#=
zh9M*^fCDd0L&-C*EHNiDMFHgE;?xp7P;h8kGl0tyXsxB7Xse*)>*?ni4;4^SP*a1~
zX<*r4Uq>Gw+|s_TE}o&jxaEC7r4w#x*NBiHM^rVilmIOsKuHNyM@AdRS~GxB2wLd{
zmp6jSLwy1lH-w5q3sX>r1(i48A^@rIRIpV@Pc2C<Pl4o0WmG8@uoDbop~0sN;wsqL
zDj0xVizg0BixbmR6$(;|aug8ju@xYhL>H9VppgS^5M}1T>zZiYqEt}Dn_LpB5DiM~
zHF^1EsmZa(aR_mQqOF24*bzmkg{7HAsR~L+E=Q^nAc+gw;L<C|QL<(rP6s$^K{SAZ
zACav=A*lxn%F^Og1?Tb<g~D=;<nk0~Flt&efW#r5ux4OT2JzfLtzoO+oYJC#)FO}|
zgR(++QD#X=YMw$;r9w!4Zhl&6Zl*$HX?CWLf`OsFp{atAfuSjbGJ~>$hD&O3W_n&~
ziUO!K&rQq*HA{+1i&7O*%TkLf6(Bx@g(P?Ze6d1eT1je=LT-LaW?E))Vo7FxUa=;F
zvO-v9ab`&|R3p4>(@{uH%*iQM$W6>n)lo>yOHnAvNX=77tw>ESElDi~@f6^dDg%SE
RqJC*{k$zHUo<1ms836fF&kFzm

literal 0
HcmV?d00001

diff --git a/utilities/makefile-gen.pl b/utilities/makefile-gen.pl
index 152c3bf..dfea220 100644
--- a/utilities/makefile-gen.pl
+++ b/utilities/makefile-gen.pl
@@ -151,7 +151,7 @@ open FILE, '>'.$TARGET_DIR.'/config.mk';
 
 print FILE << "EOF";
 asdfCFLAGS=/scratch/gem5_utils/libm5.a -march=armv8-a+sve -O3 -ffp-contract=fast -static -lpthread -DUSEM5OPS -I/scratch/gem5_utils -DPOLYBENCH_NO_FLUSH_CACHE -DREPEATKERNEL
-CFLAGS=-march=armv8-a+sve -O3 -ffp-contract=fast -static -lpthread -DPOLYBENCH_DUMP_ARRAYS -DSVE_OPTIMIZED
+CFLAGS=-march=armv8-a+sve -O3 -ffp-contract=fast -static -lpthread
 
 NOVEC_GCC=-fno-tree-vectorize
 NOVEC_CLANG=-fno-vectorize
diff --git a/utilities/run-all.pl b/utilities/run-all.pl
index fe624e2..5991301 100644
--- a/utilities/run-all.pl
+++ b/utilities/run-all.pl
@@ -58,8 +58,6 @@ foreach $cat (@categories) {
         my $targetDir = $target.'/'.$dir;
 
         my $command = "cd $targetDir;
-        #make clean;
-        #make;
         rm -f $kernel-$COMPILE_COMMAND-$MEASURE.tmp;
 for i in {1..10}
 do
-- 
GitLab