From 600625d2544d00b1813f17aeeefea31decbf5f88 Mon Sep 17 00:00:00 2001 From: Romain Dolbeau <romain.dolbeau@european-processor-initiative.eu> Date: Mon, 4 May 2020 07:44:45 -0400 Subject: [PATCH] Implements LMUL=2 using wide-add to expand the indices; not yet supported by vehave so untested. --- openmp-rvv/src/Makefile.clang.openmp | 2 +- openmp-rvv/src/SELLMatrix.hpp | 4 ++++ openmp-rvv/src/SparseMatrix_functions.hpp | 23 +++++++++++++++++++++++ 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/openmp-rvv/src/Makefile.clang.openmp b/openmp-rvv/src/Makefile.clang.openmp index dc0f5e9..0672a53 100644 --- a/openmp-rvv/src/Makefile.clang.openmp +++ b/openmp-rvv/src/Makefile.clang.openmp @@ -18,7 +18,7 @@ MINIFE_MATRIX_TYPE = -DMINIFE_SELL_MATRIX #----------------------------------------------------------------------- -CFLAGS = -O2 -mepi -fno-vectorize -fopenmp #-static +CFLAGS = -O2 -mepi -fno-vectorize -fopenmp #-DWIDE_LMUL #-static CXXFLAGS = $(CFLAGS) LLVMDIR=/opt/llvm-EPI-development-toolchain-native diff --git a/openmp-rvv/src/SELLMatrix.hpp b/openmp-rvv/src/SELLMatrix.hpp index 744ba72..e0e4134 100644 --- a/openmp-rvv/src/SELLMatrix.hpp +++ b/openmp-rvv/src/SELLMatrix.hpp @@ -96,7 +96,11 @@ SELLMatrix { num_cols_per_row = ncols_per_row; // make size of the block equal to a reasonable RVV size +#ifndef WIDE_LMUL num_rows_per_block = __builtin_epi_vsetvl(8, __epi_e64, __epi_m1); // FIXME: heuristic instead of 8? +#else + num_rows_per_block = __builtin_epi_vsetvl(8, __epi_e64, __epi_m2); // FIXME: heuristic instead of 8? +#endif num_blocks = (nrows + num_rows_per_block - 1) / num_rows_per_block; } diff --git a/openmp-rvv/src/SparseMatrix_functions.hpp b/openmp-rvv/src/SparseMatrix_functions.hpp index c633af1..8402e8e 100644 --- a/openmp-rvv/src/SparseMatrix_functions.hpp +++ b/openmp-rvv/src/SparseMatrix_functions.hpp @@ -675,7 +675,9 @@ void operator()(MatrixType& A, int num_blocks = A.num_blocks; int num_rows_per_block = A.num_rows_per_block; +#pragma omp parallel for for(int block_id=0; block_id < num_blocks; block_id++) { +#ifndef WIDE_LMUL __epi_1xf64 sum = __builtin_epi_vfmv_v_f_1xf64(0.0, num_rows_per_block); int block_offset = block_id * num_rows_per_block * row_len; int stride = num_rows_per_block; @@ -696,6 +698,27 @@ void operator()(MatrixType& A, block_offset += stride; } __builtin_epi_vstore_1xf64(&ycoefs[block_id * num_rows_per_block], sum, stride); +#else + __epi_2xf64 sum = __builtin_epi_vfmv_v_f_2xf64(0.0, num_rows_per_block); + int block_offset = block_id * num_rows_per_block * row_len; + int stride = num_rows_per_block; + if (block_id == num_blocks-1 && n%num_rows_per_block!=0){ + stride = n%num_rows_per_block; + } + /* SVE uses a mask here, we can just alter VL, any VL <= num_rows_per_block (from vsetvli) should be legit */ + for(int i=0; i<row_len; i++){ + __epi_2xf64 acofs = __builtin_epi_vload_2xf64(&Acoefs[block_offset], stride); + __epi_2xi32 indices32 = __builtin_epi_vload_2xi32(&Acols[block_offset], stride); // in element + __epi_2xi64 indices = __builtin_epi_vwadd_2xi64(indices32, indices32, stride); // in 2*element, but we need sizeof(double)*element + indices = __builtin_epi_vsll_2xi64(indices, __builtin_epi_vmv_v_x_2xi64(2, stride), stride); // time 4 + __epi_2xf64 xcofs = __builtin_epi_vload_indexed_2xf64(&xcoefs[0], indices, stride); // ... indices in bytes, no element in EPI/V + + sum = __builtin_epi_vfmacc_2xf64(sum, acofs, xcofs, stride); + + block_offset += stride; + } + __builtin_epi_vstore_2xf64(&ycoefs[block_id * num_rows_per_block], sum, stride); +#endif } -- GitLab