diff --git a/linear-algebra/blas/gemver/gemver.c b/linear-algebra/blas/gemver/gemver.c
index 65620c0703ecfda0b57280a7501acde82c56dd91..b930ef2ce945186cb83a2ae6250fac5d7d943a4e 100644
--- a/linear-algebra/blas/gemver/gemver.c
+++ b/linear-algebra/blas/gemver/gemver.c
@@ -104,7 +104,73 @@ void kernel_gemver(int n,
 		 DATA_TYPE POLYBENCH_1D(z,N,n))
 {
   int i, j;
-#ifdef USEINTRINSICS
+
+#ifdef VECTORIZE_OUTER
+#ifdef __ARM_FEATURE_SVE
+#pragma scop
+  int slice = svcntd();
+  svfloat64_t betav = svdup_f64(beta);
+  svfloat64_t alphav = svdup_f64(alpha);
+  svuint64_t indv = svindex_u64(0, _PB_N);
+  for (i = 0; i < _PB_N; i++){
+    j = 0;
+    svfloat64_t u1v = svdup_f64(u1[i]);
+    svfloat64_t u2v = svdup_f64(u2[i]);
+    svbool_t pg = svwhilelt_b64(j, _PB_N);
+    do {
+      svfloat64_t a = svld1(pg, &A[i][j]);
+      svfloat64_t v1v = svld1(pg, &v1[j]);
+      svfloat64_t v2v = svld1(pg, &v2[j]);
+      svfloat64_t temp = svmul_z(pg, u2v, v2v);
+      a = svmla_z(pg, a, v1v, u1v);
+      svst1_f64(pg, &A[i][j], svadd_z(pg, a, temp));
+      j += svcntd();
+      pg = svwhilelt_b64(j, _PB_N);
+    } while (svptest_any(svptrue_b64(), pg));
+  }
+
+  for (i = 0; i < _PB_N; i += slice) {
+    svbool_t pg = svwhilelt_b64(i, _PB_N);
+    svfloat64_t xv = svld1(pg, &x[i]);
+    for (j = 0; j < _PB_N; j++) {
+      svfloat64_t av = svld1(pg, &A[j][i]);
+      svfloat64_t yv = svdup_f64(y[j]);
+      svfloat64_t temp = svmul_z(pg, betav, av);
+      xv = svmla_z(pg, xv, temp, yv);
+    }
+    svst1_f64(pg, &x[i], xv);
+  }
+
+  i = 0;
+  svbool_t pg = svwhilelt_b64(i, _PB_N);
+  do {
+    svfloat64_t xv = svld1(pg, &x[i]);
+    svfloat64_t zv = svld1(pg, &z[i]);
+    svfloat64_t temp = svadd_z(pg, xv, zv);
+    svst1_f64(pg, &x[i], temp);
+    i += svcntd();
+    pg = svwhilelt_b64(i, _PB_N);
+  } while (svptest_any(svptrue_b64(), pg));
+
+  for (i = 0; i < _PB_N; i += slice) {
+    svbool_t pg = svwhilelt_b64(i, _PB_N);
+    svfloat64_t wv = svld1(pg, &w[i]);
+    for (j = 0; j < _PB_N; j++) {
+      svfloat64_t av = svld1_gather_index(pg, &A[i][j], indv);
+      svfloat64_t xv = svdup_f64(x[j]);
+      svfloat64_t temp = svmul_z(pg, alphav, av);
+      wv = svmla_z(pg, wv, temp, xv);
+    }
+    svst1_f64(pg, &w[i], wv);
+  }
+
+#pragma endscop
+#else
+#error -DVECTORIZE_OUTER used but no support for SVE
+#endif
+
+
+#elif defined(USEINTRINSICS)
 #ifdef __ARM_FEATURE_SVE
 #pragma scop
 for (i = 0; i < _PB_N; i++){
@@ -166,7 +232,6 @@ void kernel_gemver(int n,
   } while (svptest_any(svptrue_b64(), pg));
 }
 #pragma endscop
-#pragma endscop
 #else
 #error -DUSEINTRISICS used but no support for SVE
 #endif
diff --git a/linear-algebra/blas/gesummv/gesummv.c b/linear-algebra/blas/gesummv/gesummv.c
index 54a2b607785a335e7ccd632fd5291bc358c2abc4..9a047873733b23120c856826b3b3a800b934f2ed 100644
--- a/linear-algebra/blas/gesummv/gesummv.c
+++ b/linear-algebra/blas/gesummv/gesummv.c
@@ -88,7 +88,36 @@ void kernel_gesummv(int n,
 {
   int i, j;
 
-#ifdef USEINTRINSICS
+#ifdef VECTORIZE_OUTER
+#ifdef __ARM_FEATURE_SVE
+#pragma scop
+  int slice = svcntd();
+  svfloat64_t betav = svdup_f64(beta);
+  svfloat64_t alphav = svdup_f64(alpha);
+  svuint64_t indv = svindex_u64(0, _PB_N);
+  for (i = 0; i < _PB_N; i += slice)
+  {
+    svbool_t pg = svwhilelt_b64(i, _PB_N);
+    svfloat64_t tmpv = svdup_f64(0.0);
+    svfloat64_t yv = svdup_f64(0.0);
+    for (j = 0; j < _PB_N; j++)
+    {
+      svfloat64_t xv = svdup_f64(x[j]);
+      svfloat64_t av = svld1_gather_index(pg, &A[i][j], indv);
+      svfloat64_t bv = svld1_gather_index(pg, &B[i][j], indv);
+      tmpv = svmla_z(pg, tmpv, av, xv);
+      yv = svmla_z(pg, yv, bv, xv);
+    }
+    svfloat64_t tv = svmul_z(pg, betav, yv);
+    yv = svmla_z(pg, tv, alphav, tmpv);
+    svst1_f64(pg, &y[i], yv);
+  }
+#pragma endscop
+#else
+#error -DVECTORIZE_OUTER used but no support for SVE
+#endif
+
+#elif defined(USEINTRINSICS)
 #ifdef __ARM_FEATURE_SVE
 #pragma scop
 for (i = 0; i < _PB_N; i++)
diff --git a/linear-algebra/blas/symm/symm.c b/linear-algebra/blas/symm/symm.c
index 956aeeb488ecc23e648db67bf6f7bd947f41e557..474429453dde806223d1a628272451c567ea05b3 100644
--- a/linear-algebra/blas/symm/symm.c
+++ b/linear-algebra/blas/symm/symm.c
@@ -20,6 +20,9 @@
 #include <sys/prctl.h>
 #endif
 
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif
 /* Include benchmark-specific header. */
 #include "symm.h"
 
@@ -87,14 +90,50 @@ void kernel_symm(int m, int n,
   int i, j, k;
   DATA_TYPE temp2;
-//BLAS PARAMS
-//SIDE = 'L'
-//UPLO = 'L'
+// BLAS PARAMS
+// SIDE = 'L'
+// UPLO = 'L'
 // => Form C := alpha*A*B + beta*C
 // A is MxM
 // B is MxN
 // C is MxN
-//note that due to Fortran array layout, the code below more closely resembles upper triangular case in BLAS
+// note that due to Fortran array layout, the code below more closely resembles upper triangular case in BLAS
+
+#if defined(VECTORIZE_OUTER)
+#ifdef __ARM_FEATURE_SVE
+#pragma scop
+  int slice = svcntd();
+  svfloat64_t betav = svdup_f64(beta);
+  svfloat64_t alphav = svdup_f64(alpha);
+  svuint64_t indv = svindex_u64(0, _PB_N);
+  for (i = 0; i < _PB_M; i++)
+    for (j = 0; j < _PB_N; j += slice)
+    {
+      svfloat64_t temp2v = svdup_f64(0.0);
+      svbool_t pg = svwhilelt_b64(j, _PB_N);
+      svfloat64_t bijv = svld1(pg, &B[i][j]);
+      for (k = 0; k < i; k++) {
+        svfloat64_t aikv = svdup_f64(A[i][k]);
+        svfloat64_t ckjv = svld1(pg, &C[k][j]);
+        svfloat64_t tempv = svmul_z(pg, alphav, bijv);
+        svst1_f64(pg, &C[k][j], svmla_z(pg, ckjv, tempv, aikv));
+
+        svfloat64_t bkjv = svld1(pg, &B[k][j]);
+        temp2v = svmla_z(pg, temp2v, bkjv, aikv);
+      }
+      svfloat64_t cijv = svld1(pg, &C[i][j]);
+      svfloat64_t bc = svmul_z(pg, betav, cijv);
+      svfloat64_t aiiv = svdup_f64(A[i][i]);
+      svfloat64_t aba = svmul_z(pg, alphav, svmul_z(pg, bijv, aiiv));
+      svfloat64_t tempv = svadd_z(pg, bc, aba);
+      svst1_f64(pg, &C[i][j], svmla_z(pg, tempv, alphav, temp2v));
+    }
+#pragma endscop
+#else
+#error -DVECTORIZE_OUTER used but no support for SVE
+#endif
+
+#else
 #pragma scop
   for (i = 0; i < _PB_M; i++)
    for (j = 0; j < _PB_N; j++ )
    {
@@ -107,6 +146,7 @@ void kernel_symm(int m, int n,
       C[i][j] = beta * C[i][j] + alpha*B[i][j] * A[i][i] + alpha * temp2;
    }
 #pragma endscop
+#endif
 }
 
diff --git a/linear-algebra/blas/trmm/trmm.c b/linear-algebra/blas/trmm/trmm.c
index dccab734a8b3237a81a0d6e30589bea408b89b75..acbc23634ede5f2eb52a749233644f7e21437b45 100644
--- a/linear-algebra/blas/trmm/trmm.c
+++ b/linear-algebra/blas/trmm/trmm.c
@@ -20,6 +20,9 @@
 #include <sys/prctl.h>
 #endif
 
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif /* __ARM_FEATURE_SVE */
 /* Include benchmark-specific header. */
 #include "trmm.h"
 
@@ -88,6 +91,30 @@ void kernel_trmm(int m, int n,
 // => Form B := alpha*A**T*B.
 // A is MxM
 // B is MxN
+#ifdef VECTORIZE_OUTER
+#ifdef __ARM_FEATURE_SVE
+#pragma scop
+  int slice = svcntd();
+  svfloat64_t alphav = svdup_f64(alpha);
+  svuint64_t indv = svindex_u64(0, _PB_N);
+  for (i = 0; i < _PB_M; i++) {
+    for (j = 0; j < _PB_N; j += slice) {
+      svbool_t pg = svwhilelt_b64(j, _PB_N);
+      svfloat64_t bijv = svld1(pg, &B[i][j]);
+      for (k = i+1; k < _PB_M; k++) {
+        svfloat64_t akiv = svdup_f64(A[k][i]);
+        svfloat64_t bkjv = svld1(pg, &B[k][j]);
+        bijv = svmla_z(pg, bijv, akiv, bkjv);
+      }
+      bijv = svmul_z(pg, alphav, bijv);
+      svst1_f64(pg, &B[i][j], bijv);
+    }
+  }
+#pragma endscop
+#else
+#error -DVECTORIZE_OUTER used but no support for SVE
+#endif
+#else
 #pragma scop
   for (i = 0; i < _PB_M; i++)
      for (j = 0; j < _PB_N; j++) {
@@ -96,6 +123,7 @@ void kernel_trmm(int m, int n,
 	B[i][j] = alpha * B[i][j];
      }
 #pragma endscop
+#endif
 }
 
diff --git a/linear-algebra/kernels/2mm/2mm.c b/linear-algebra/kernels/2mm/2mm.c
index 562c2f50190c05d03ab2bc0ae9d21007c4028d76..a0f5bd4e12f83cb347b83b332469d64b02502764 100644
--- a/linear-algebra/kernels/2mm/2mm.c
+++ b/linear-algebra/kernels/2mm/2mm.c
@@ -20,6 +20,9 @@
 #include <sys/prctl.h>
 #endif
 
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif /* __ARM_FEATURE_SVE */
 /* Include benchmark-specific header. */
 #include "2mm.h"
 
@@ -89,7 +92,50 @@ void kernel_2mm(int ni, int nj, int nk, int nl,
 		DATA_TYPE POLYBENCH_2D(D,NI,NL,ni,nl))
 {
   int i, j, k;
+#ifdef VECTORIZE_OUTER
+#ifdef __ARM_FEATURE_SVE
+#pragma scop
+  int slice = svcntd();
+  svfloat64_t betav = svdup_f64(beta);
+  svfloat64_t alphav = svdup_f64(alpha);
+  /* D := alpha*A*B*C + beta*D */
+  for (i = 0; i < _PB_NI; i++) {
+    for (j = 0; j < _PB_NJ; j += slice)
+    {
+      svbool_t pg = svwhilelt_b64(j, _PB_NJ);
+      svfloat64_t tmpijv = svdup_f64(0.0);
+      //tmp[i][j] = SCALAR_VAL(0.0);
+      for (k = 0; k < _PB_NK; ++k) {
+        svfloat64_t aikv = svdup_f64(A[i][k]);
+        svfloat64_t bkjv = svld1(pg, &B[k][j]);
+        svfloat64_t aalphav = svmul_z(pg, alphav, aikv);
+        tmpijv = svmla_z(pg, tmpijv, aalphav, bkjv);
+      }
+      svst1_f64(pg, &tmp[i][j], tmpijv);
+    }
+  }
+  for (i = 0; i < _PB_NI; i++) {
+    for (j = 0; j < _PB_NL; j += slice)
+    {
+      svbool_t pg = svwhilelt_b64(j, _PB_NL);
+      svfloat64_t dijv = svld1(pg, &D[i][j]);
+      dijv = svmul_z(pg, dijv, betav);
+      for (k = 0; k < _PB_NJ; ++k) {
+        svfloat64_t tmpikv = svdup_f64(tmp[i][k]);
+        svfloat64_t ckjv = svld1(pg, &C[k][j]);
+        dijv = svmla_z(pg, dijv, tmpikv, ckjv);
+      }
+      svst1_f64(pg, &D[i][j], dijv);
+    }
+  }
+
+#pragma endscop
+#else
+#error -DVECTORIZE_OUTER used but no support for SVE
+#endif
+
+#else
 
 #pragma scop
   /* D := alpha*A*B*C + beta*D */
   for (i = 0; i < _PB_NI; i++)
@@ -107,6 +153,7 @@ void kernel_2mm(int ni, int nj, int nk, int nl,
       D[i][j] += tmp[i][k] * C[k][j];
     }
 #pragma endscop
+#endif
 }
 
diff --git a/linear-algebra/kernels/3mm/3mm.c b/linear-algebra/kernels/3mm/3mm.c
index b339912c729e250108762aba63f62d1371697f4a..6b93c536c1ab867ded10b5f1d1a55a387f0a0a9b 100644
--- a/linear-algebra/kernels/3mm/3mm.c
+++ b/linear-algebra/kernels/3mm/3mm.c
@@ -20,6 +20,9 @@
 #include <sys/prctl.h>
 #endif
 
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif /* __ARM_FEATURE_SVE */
 /* Include benchmark-specific header. */
 #include "3mm.h"
 
@@ -85,7 +88,64 @@ void kernel_3mm(int ni, int nj, int nk, int nl, int nm,
 		DATA_TYPE POLYBENCH_2D(G,NI,NL,ni,nl))
 {
   int i, j, k;
+#ifdef VECTORIZE_OUTER
+#ifdef __ARM_FEATURE_SVE
+#pragma scop
+  int slice = svcntd();
+  /* E := A*B */
+  for (i = 0; i < _PB_NI; i++)
+  {
+    for (j = 0; j < _PB_NJ; j += slice)
+    {
+      svbool_t pg = svwhilelt_b64(j, _PB_NJ);
+      svfloat64_t eijv = svdup_f64(0.0);
+      for (k = 0; k < _PB_NK; ++k) {
+        svfloat64_t aikv = svdup_f64(A[i][k]);
+        svfloat64_t bkjv = svld1(pg, &B[k][j]);
+        eijv = svmla_z(pg, eijv, aikv, bkjv);
+      }
+      svst1_f64(pg, &E[i][j], eijv);
+    }
+  }
+  /* F := C*D */
+  for (i = 0; i < _PB_NJ; i++)
+  {
+    for (j = 0; j < _PB_NL; j += slice)
+    {
+      svbool_t pg = svwhilelt_b64(j, _PB_NL);
+      svfloat64_t fijv = svdup_f64(0.0);
+      for (k = 0; k < _PB_NM; ++k)
+      {
+        svfloat64_t cikv = svdup_f64(C[i][k]);
+        svfloat64_t dkjv = svld1(pg, &D[k][j]);
+        fijv = svmla_z(pg, fijv, cikv, dkjv);
+      }
+      svst1_f64(pg, &F[i][j], fijv);
+    }
+  }
+  /* G := E*F */
+  for (i = 0; i < _PB_NI; i++)
+  {
+    for (j = 0; j < _PB_NL; j += slice)
+    {
+      svbool_t pg = svwhilelt_b64(j, _PB_NL);
+      svfloat64_t gijv = svdup_f64(0.0);
+      for (k = 0; k < _PB_NJ; ++k)
+      {
+        svfloat64_t eikv = svdup_f64(E[i][k]);
+        svfloat64_t fkjv = svld1(pg, &F[k][j]);
+        gijv = svmla_z(pg, gijv, eikv, fkjv);
+      }
+      svst1_f64(pg, &G[i][j], gijv);
+    }
+  }
+#pragma endscop
+#else
+#error -DVECTORIZE_OUTER used but no support for SVE
+#endif
+
+#else
 
 #pragma scop
   /* E := A*B */
   for (i = 0; i < _PB_NI; i++)
@@ -112,6 +172,7 @@ void kernel_3mm(int ni, int nj, int nk, int nl, int nm,
       G[i][j] += E[i][k] * F[k][j];
     }
 #pragma endscop
+#endif
 }
 
diff --git a/linear-algebra/kernels/doitgen/doitgen.c b/linear-algebra/kernels/doitgen/doitgen.c
index ed6fcdc126d438372e09080e79b2a16c7b6e37bb..d3667d2f013c949a0a55c0ee66bcf4cbd49a0bfa 100644
--- a/linear-algebra/kernels/doitgen/doitgen.c
+++ b/linear-algebra/kernels/doitgen/doitgen.c
@@ -20,6 +20,9 @@
 #include <sys/prctl.h>
 #endif
 
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif /* __ARM_FEATURE_SVE */
 /* Include benchmark-specific header. */
 #include "doitgen.h"
 
@@ -74,7 +77,33 @@ void kernel_doitgen(int nr, int nq, int np,
 		    DATA_TYPE POLYBENCH_1D(sum,NP,np))
 {
   int r, q, p, s;
-
+#ifdef VECTORIZE_OUTER
+#ifdef __ARM_FEATURE_SVE
+#pragma scop
+  int slice = svcntd();
+  for (r = 0; r < _PB_NR; r++) {
+    for (q = 0; q < _PB_NQ; q++) {
+      for (p = 0; p < _PB_NP; p += slice) {
+        svfloat64_t sumv = svdup_f64(0.0);
+        svbool_t pg = svwhilelt_b64(p, _PB_NP);
+        //sum[p] = SCALAR_VAL(0.0);
+        for (s = 0; s < _PB_NP; s++) {
+          svfloat64_t c4sp = svld1(pg, &C4[s][p]);
+          svfloat64_t arqsv = svdup_f64(A[r][q][s]);
+          sumv = svmla_z(pg, sumv, arqsv, c4sp);
+        }
+        svst1_f64(pg, &sum[p], sumv);
+      }
+      for (p = 0; p < _PB_NP; p++)
+        A[r][q][p] = sum[p];
+    }
+  }
+#pragma endscop
+#else
+#error -DVECTORIZE_OUTER used but no support for SVE
+#endif
+
+#else
 #pragma scop
   for (r = 0; r < _PB_NR; r++)
     for (q = 0; q < _PB_NQ; q++)  {
@@ -87,6 +116,7 @@ void kernel_doitgen(int nr, int nq, int np,
       A[r][q][p] = sum[p];
     }
 #pragma endscop
+#endif
 }
 
diff --git a/linear-algebra/kernels/mvt/mvt.c b/linear-algebra/kernels/mvt/mvt.c
index 88fa97deaa75398ca2cdc9477664733764b7a394..c8ac137de259ee7c05ac1b96386aae9d706909d0 100644
--- a/linear-algebra/kernels/mvt/mvt.c
+++ b/linear-algebra/kernels/mvt/mvt.c
@@ -20,6 +20,9 @@
 #include <sys/prctl.h>
 #endif
 
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif /* __ARM_FEATURE_SVE */
 /* Include benchmark-specific header. */
 #include "mvt.h"
 
@@ -89,7 +92,37 @@ void kernel_mvt(int n,
 	       DATA_TYPE POLYBENCH_2D(A,N,N,n,n))
 {
   int i, j;
+#ifdef VECTORIZE_OUTER
+#ifdef __ARM_FEATURE_SVE
+#pragma scop
+  int slice = svcntd();
+  svuint64_t indv = svindex_u64(0, _PB_N);
+  for (i = 0; i < _PB_N; i += slice) {
+    svbool_t pg = svwhilelt_b64(i, _PB_N);
+    svfloat64_t xv = svld1(pg, &x1[i]);
+    for (j = 0; j < _PB_N; j++) {
+      svfloat64_t y1v = svdup_f64(y_1[j]);
+      svfloat64_t av = svld1_gather_index(pg, &A[i][j], indv);
+      xv = svmla_z(pg, xv, av, y1v);
+    }
+    svst1_f64(pg, &x1[i], xv);
+  }
+  for (i = 0; i < _PB_N; i += slice) {
+    svbool_t pg = svwhilelt_b64(i, _PB_N);
+    svfloat64_t xv = svld1(pg, &x2[i]);
+    for (j = 0; j < _PB_N; j++) {
+      svfloat64_t y2v = svdup_f64(y_2[j]);
+      svfloat64_t av = svld1(pg, &A[j][i]);
+      xv = svmla_z(pg, xv, av, y2v);
+    }
+    svst1_f64(pg, &x2[i], xv);
+  }
+#pragma endscop
+#else
+#error -DVECTORIZE_OUTER used but no support for SVE
+#endif
+#else
 #pragma scop
   for (i = 0; i < _PB_N; i++)
     for (j = 0; j < _PB_N; j++)
       x1[i] = x1[i] + A[i][j] * y_1[j];
@@ -98,6 +131,7 @@ void kernel_mvt(int n,
     for (j = 0; j < _PB_N; j++)
       x2[i] = x2[i] + A[j][i] * y_2[j];
 #pragma endscop
+#endif
 }
 
diff --git a/utilities/makefile-gen.pl b/utilities/makefile-gen.pl
index 152c3bf1de2a4a34fa80621496c0172e22334c10..dfea22001e5818e9501d9345083576b2902dbd38 100644
--- a/utilities/makefile-gen.pl
+++ b/utilities/makefile-gen.pl
@@ -151,7 +151,7 @@ open FILE, '>'.$TARGET_DIR.'/config.mk';
 print FILE << "EOF";
 
 asdfCFLAGS=/scratch/gem5_utils/libm5.a -march=armv8-a+sve -O3 -ffp-contract=fast -static -lpthread -DUSEM5OPS -I/scratch/gem5_utils -DPOLYBENCH_NO_FLUSH_CACHE -DREPEATKERNEL
-CFLAGS=-march=armv8-a+sve -O3 -ffp-contract=fast -static -lpthread -DPOLYBENCH_DUMP_ARRAYS -DSVE_OPTIMIZED
+CFLAGS=-march=armv8-a+sve -O3 -ffp-contract=fast -static -lpthread
 NOVEC_GCC=-fno-tree-vectorize
 NOVEC_CLANG=-fno-vectorize
 
diff --git a/utilities/run-all.pl b/utilities/run-all.pl
index fe624e2286b085e7088d4f7f25ac394352c671e4..599130146224771f995198caba225ec3737bf439 100644
--- a/utilities/run-all.pl
+++ b/utilities/run-all.pl
@@ -58,8 +58,6 @@ foreach $cat (@categories) {
 	my $targetDir = $target.'/'.$dir;
 	my $command = "cd $targetDir;
-                       #make clean;
-                       #make;
                        rm -f $kernel-$COMPILE_COMMAND-$MEASURE.tmp;
                        for i in {1..10}
                        do
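
---
Implementation notes (editor's sketches, not part of the patch):

Every VECTORIZE_OUTER block above is built on the same predicated SVE loop:
svwhilelt_b64 builds a governing predicate that goes partially false on the
final iteration, so no scalar epilogue is needed. A minimal standalone
sketch of the pattern (hypothetical helper name, not taken from the
benchmarks):

    #include <arm_sve.h>

    /* y[i] += a * x[i] for 0 <= i < n, double precision. */
    void daxpy_sve(long n, double a, const double *x, double *y)
    {
      svfloat64_t av = svdup_f64(a);
      for (long i = 0; i < n; i += svcntd()) {
        svbool_t pg = svwhilelt_b64(i, n);  /* masks off the tail lanes */
        svfloat64_t xv = svld1(pg, &x[i]);
        svfloat64_t yv = svld1(pg, &y[i]);
        yv = svmla_z(pg, yv, av, xv);       /* yv += a * xv on active lanes */
        svst1_f64(pg, &y[i], yv);           /* inactive lanes are not stored */
      }
    }

These paths compile only under -DVECTORIZE_OUTER on an SVE target, e.g. the
-march=armv8-a+sve flags already emitted by makefile-gen.pl.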
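
The gemver, gesummv and mvt hunks vectorize over the row index i while j
stays scalar, so each vector register holds the column slice
A[i..i+VL-1][j] of a row-major matrix. svindex_u64(0, N) builds element
offsets 0, N, 2N, ... (one row apart), and svld1_gather_index turns them
into a strided gather. A hypothetical standalone sketch of the same trick
on a flattened n x n matrix:

    #include <arm_sve.h>

    /* w[i] += sum_j A[i][j] * x[j], with A stored row-major as A[i*n + j]. */
    void matvec_rowslice_sve(long n, const double *A,
                             const double *x, double *w)
    {
      svuint64_t indv = svindex_u64(0, (uint64_t)n);  /* 0, n, 2n, ... */
      for (long i = 0; i < n; i += svcntd()) {
        svbool_t pg = svwhilelt_b64(i, n);
        svfloat64_t wv = svld1(pg, &w[i]);
        for (long j = 0; j < n; j++) {
          /* gather A[i][j], A[i+1][j], ...: one column element per lane */
          svfloat64_t av = svld1_gather_index(pg, &A[i * n + j], indv);
          wv = svmla_z(pg, wv, av, svdup_f64(x[j]));
        }
        svst1_f64(pg, &w[i], wv);
      }
    }

Gathers are typically much slower than contiguous loads; the transposed
products (e.g. x2[i] += A[j][i] * y_2[j] in mvt) are already contiguous in
i, which is why those loops use a plain svld1 instead.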
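
gemver also mixes in a second idiom: a do/while that recomputes the
predicate at the bottom and keeps iterating while any lane is still active.
The two forms are interchangeable; a hypothetical sketch for contrast:

    #include <arm_sve.h>

    /* x[i] *= a for 0 <= i < n. Safe even for n == 0: the first iteration
       is fully predicated off, so the masked load and store do nothing. */
    void dscal_sve(long n, double a, double *x)
    {
      long i = 0;
      svbool_t pg = svwhilelt_b64(i, n);
      do {
        svfloat64_t xv = svld1(pg, &x[i]);
        svst1_f64(pg, &x[i], svmul_z(pg, xv, svdup_f64(a)));
        i += svcntd();
        pg = svwhilelt_b64(i, n);
      } while (svptest_any(svptrue_b64(), pg));
    }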