From 600625d2544d00b1813f17aeeefea31decbf5f88 Mon Sep 17 00:00:00 2001
From: Romain Dolbeau <romain.dolbeau@european-processor-initiative.eu>
Date: Mon, 4 May 2020 07:44:45 -0400
Subject: [PATCH] Implements LMUL=2 using wide-add to expand the indices; not
 yet supported by vehave so untested.

---
 openmp-rvv/src/Makefile.clang.openmp      |  2 +-
 openmp-rvv/src/SELLMatrix.hpp             |  4 ++++
 openmp-rvv/src/SparseMatrix_functions.hpp | 23 +++++++++++++++++++++++
 3 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/openmp-rvv/src/Makefile.clang.openmp b/openmp-rvv/src/Makefile.clang.openmp
index dc0f5e9..0672a53 100644
--- a/openmp-rvv/src/Makefile.clang.openmp
+++ b/openmp-rvv/src/Makefile.clang.openmp
@@ -18,7 +18,7 @@ MINIFE_MATRIX_TYPE = -DMINIFE_SELL_MATRIX
 
 #-----------------------------------------------------------------------
 
-CFLAGS = -O2 -mepi -fno-vectorize -fopenmp #-static 
+CFLAGS = -O2 -mepi -fno-vectorize -fopenmp #-DWIDE_LMUL #-static 
 CXXFLAGS = $(CFLAGS)
 
 LLVMDIR=/opt/llvm-EPI-development-toolchain-native
diff --git a/openmp-rvv/src/SELLMatrix.hpp b/openmp-rvv/src/SELLMatrix.hpp
index 744ba72..e0e4134 100644
--- a/openmp-rvv/src/SELLMatrix.hpp
+++ b/openmp-rvv/src/SELLMatrix.hpp
@@ -96,7 +96,11 @@ SELLMatrix {
     num_cols_per_row = ncols_per_row;
 
     // make size of the block equal to a reasonable RVV size
+#ifndef WIDE_LMUL
     num_rows_per_block = __builtin_epi_vsetvl(8, __epi_e64, __epi_m1); // FIXME: heuristic instead of 8?
+#else
+    num_rows_per_block = __builtin_epi_vsetvl(8, __epi_e64, __epi_m2); // FIXME: heuristic instead of 8?
+#endif
 
     num_blocks = (nrows + num_rows_per_block - 1) / num_rows_per_block;
   }
diff --git a/openmp-rvv/src/SparseMatrix_functions.hpp b/openmp-rvv/src/SparseMatrix_functions.hpp
index c633af1..8402e8e 100644
--- a/openmp-rvv/src/SparseMatrix_functions.hpp
+++ b/openmp-rvv/src/SparseMatrix_functions.hpp
@@ -675,7 +675,9 @@ void operator()(MatrixType& A,
   int num_blocks = A.num_blocks;
   int num_rows_per_block = A.num_rows_per_block;
 
+#pragma omp parallel for
   for(int block_id=0; block_id < num_blocks; block_id++) {
+#ifndef WIDE_LMUL
 	  __epi_1xf64 sum = __builtin_epi_vfmv_v_f_1xf64(0.0, num_rows_per_block);
 	  int block_offset = block_id * num_rows_per_block * row_len;
 	  int stride = num_rows_per_block;
@@ -696,6 +698,27 @@ void operator()(MatrixType& A,
 		  block_offset += stride;
 	  }
 	  __builtin_epi_vstore_1xf64(&ycoefs[block_id * num_rows_per_block], sum, stride);
+#else
+	  __epi_2xf64 sum = __builtin_epi_vfmv_v_f_2xf64(0.0, num_rows_per_block);
+	  int block_offset = block_id * num_rows_per_block * row_len;
+	  int stride = num_rows_per_block;
+	  if (block_id == num_blocks-1  && n%num_rows_per_block!=0){
+		  stride = n%num_rows_per_block;
+	  }
+	  /* SVE uses a mask here, we can just alter VL, any VL <= num_rows_per_block (from vsetvli) should be legit */
+	  for(int i=0; i<row_len; i++){
+		  __epi_2xf64 acofs = __builtin_epi_vload_2xf64(&Acoefs[block_offset], stride);
+		  __epi_2xi32 indices32 = __builtin_epi_vload_2xi32(&Acols[block_offset], stride); // in element
+		  __epi_2xi64 indices = __builtin_epi_vwadd_2xi64(indices32, indices32, stride); // in 2*element, but we need sizeof(double)*element
+		  indices = __builtin_epi_vsll_2xi64(indices, __builtin_epi_vmv_v_x_2xi64(2, stride), stride); // time 4
+		  __epi_2xf64 xcofs = __builtin_epi_vload_indexed_2xf64(&xcoefs[0], indices, stride); // ... indices in bytes, no element in EPI/V
+
+		  sum = __builtin_epi_vfmacc_2xf64(sum, acofs, xcofs, stride);
+
+		  block_offset += stride;
+	  }
+	  __builtin_epi_vstore_2xf64(&ycoefs[block_id * num_rows_per_block], sum, stride);
+#endif
   }
 
 
-- 
GitLab