Commit 600625d2 authored by Romain Dolbeau

Implements LMUL=2 using a widening add to expand the indices; not yet supported by vehave, so untested.

parent 3ced7971
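For reference, a scalar sketch (illustration only, not part of the commit) of the per-element index expansion the new path performs: each 32-bit column index has to become a 64-bit byte offset, i.e. col * sizeof(double) = col * 8, which the kernel below builds as a widening add (col + col) followed by a shift left by 2.

#include <stdint.h>

/* Scalar equivalent of the vwadd + vsll pair in the LMUL=2 kernel below:
   the widening add doubles the index while promoting it to 64 bits,
   and the shift by 2 completes the x8 scaling to a byte offset. */
static inline int64_t col_to_byte_offset(int32_t col) {
    int64_t widened = (int64_t)col + (int64_t)col; /* 2 * col, now 64-bit */
    return widened << 2;                           /* 8 * col == col * sizeof(double) */
}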
......@@ -18,7 +18,7 @@ MINIFE_MATRIX_TYPE = -DMINIFE_SELL_MATRIX
#-----------------------------------------------------------------------
CFLAGS = -O2 -mepi -fno-vectorize -fopenmp #-static
CFLAGS = -O2 -mepi -fno-vectorize -fopenmp #-DWIDE_LMUL #-static
CXXFLAGS = $(CFLAGS)
LLVMDIR=/opt/llvm-EPI-development-toolchain-native
......
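As committed, -DWIDE_LMUL stays inside the trailing comment of the new CFLAGS line, so the LMUL=2 path is compiled out by default; enabling it appears to only require moving the define out of the comment, e.g.

CFLAGS = -O2 -mepi -fno-vectorize -fopenmp -DWIDE_LMUL #-static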
......@@ -96,7 +96,11 @@ SELLMatrix {
num_cols_per_row = ncols_per_row;
// make size of the block equal to a reasonable RVV size
#ifndef WIDE_LMUL
num_rows_per_block = __builtin_epi_vsetvl(8, __epi_e64, __epi_m1); // FIXME: heuristic instead of 8?
#else
num_rows_per_block = __builtin_epi_vsetvl(8, __epi_e64, __epi_m2); // FIXME: heuristic instead of 8?
#endif
num_blocks = (nrows + num_rows_per_block - 1) / num_rows_per_block;
}
......
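The constructor change above sizes a block from what vsetvl grants with e64/m2 instead of e64/m1, so on a given implementation a block can hold up to twice as many rows; the block count stays a plain ceiling division. A small sketch of the resulting geometry, with made-up numbers standing in for the vsetvl result:

/* Sketch of the block geometry implied by the constructor above; the values
   are hypothetical stand-ins, not measurements from any EPI implementation. */
void sell_block_geometry_example(void) {
    int nrows = 1003;                /* hypothetical matrix row count */
    int num_rows_per_block = 8;      /* hypothetical vsetvl(8, __epi_e64, __epi_m2) grant */
    int num_blocks = (nrows + num_rows_per_block - 1) / num_rows_per_block;  /* ceil(1003/8) = 126 */
    int tail_rows = nrows % num_rows_per_block;                              /* 3 rows in the ragged last block */
    (void)num_blocks; (void)tail_rows;
}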
......@@ -675,7 +675,9 @@ void operator()(MatrixType& A,
int num_blocks = A.num_blocks;
int num_rows_per_block = A.num_rows_per_block;
#pragma omp parallel for
for(int block_id=0; block_id < num_blocks; block_id++) {
#ifndef WIDE_LMUL
__epi_1xf64 sum = __builtin_epi_vfmv_v_f_1xf64(0.0, num_rows_per_block);
int block_offset = block_id * num_rows_per_block * row_len;
int stride = num_rows_per_block;
......@@ -696,6 +698,27 @@ void operator()(MatrixType& A,
block_offset += stride;
}
__builtin_epi_vstore_1xf64(&ycoefs[block_id * num_rows_per_block], sum, stride);
#else
__epi_2xf64 sum = __builtin_epi_vfmv_v_f_2xf64(0.0, num_rows_per_block);
int block_offset = block_id * num_rows_per_block * row_len;
int stride = num_rows_per_block;
if (block_id == num_blocks-1 && n%num_rows_per_block!=0){
stride = n%num_rows_per_block;
}
/* SVE would use a mask here; with RVV we can simply lower VL, since any VL <= num_rows_per_block (from vsetvli) is legal */
for(int i=0; i<row_len; i++){
__epi_2xf64 acofs = __builtin_epi_vload_2xf64(&Acoefs[block_offset], stride);
__epi_2xi32 indices32 = __builtin_epi_vload_2xi32(&Acols[block_offset], stride); // column indices, in elements
__epi_2xi64 indices = __builtin_epi_vwadd_2xi64(indices32, indices32, stride); // widen to 64-bit and double: now 2*element, but we need sizeof(double)*element
indices = __builtin_epi_vsll_2xi64(indices, __builtin_epi_vmv_v_x_2xi64(2, stride), stride); // times 4, so x8 overall: byte offsets
__epi_2xf64 xcofs = __builtin_epi_vload_indexed_2xf64(&xcoefs[0], indices, stride); // indexed load takes byte offsets, not element indices, in EPI/RVV
sum = __builtin_epi_vfmacc_2xf64(sum, acofs, xcofs, stride);
block_offset += stride;
}
__builtin_epi_vstore_2xf64(&ycoefs[block_id * num_rows_per_block], sum, stride);
#endif
}
......
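The new branch keeps the design choice of the existing LMUL=1 path: where an SVE port would predicate the ragged last block with a mask, here the vector length passed to every builtin is simply reduced to n % num_rows_per_block, which is legal as long as it does not exceed the vsetvl grant. As a reading aid only (my reconstruction of the layout, not code from the commit), one SELL block computes the equivalent of the scalar loop below, with the block stored slice by slice so that the active rows of a slice are contiguous:

/* Scalar reference for one SELL block; `rows` is num_rows_per_block,
   or n % num_rows_per_block for the ragged tail. */
static void sell_block_scalar(const double *Acoefs, const int *Acols,
                              const double *xcoefs, double *ycoefs,
                              int block_id, int num_rows_per_block,
                              int row_len, int rows)
{
    int base = block_id * num_rows_per_block * row_len;
    for (int lane = 0; lane < rows; lane++) {
        double sum = 0.0;
        for (int i = 0; i < row_len; i++) {
            int e = base + i * rows + lane;        /* slice i, row `lane` of the block */
            sum += Acoefs[e] * xcoefs[Acols[e]];   /* the gather that the indexed load performs */
        }
        ycoefs[block_id * num_rows_per_block + lane] = sum;
    }
}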