Commit 600625d2 authored by Romain Dolbeau

Implements LMUL=2 using a widening add to expand the indices; not yet supported by vehave, so untested.

parent 3ced7971
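For reference, a scalar sketch (illustration only, not part of the commit) of the per-element index expansion the new path performs: each 32-bit column index has to become a 64-bit byte offset, i.e. col * sizeof(double) = col * 8, which the kernel below builds as a widening add (col + col) followed by a shift left by 2.

#include <stdint.h>

/* Scalar equivalent of the vwadd + vsll pair in the LMUL=2 kernel below:
   the widening add doubles the index while promoting it to 64 bits,
   and the shift by 2 completes the x8 scaling to a byte offset. */
static inline int64_t col_to_byte_offset(int32_t col) {
    int64_t widened = (int64_t)col + (int64_t)col; /* 2 * col, now 64-bit */
    return widened << 2;                           /* 8 * col == col * sizeof(double) */
}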
......@@ -18,7 +18,7 @@ MINIFE_MATRIX_TYPE = -DMINIFE_SELL_MATRIX
#-----------------------------------------------------------------------
CFLAGS = -O2 -mepi -fno-vectorize -fopenmp #-static
CFLAGS = -O2 -mepi -fno-vectorize -fopenmp #-DWIDE_LMUL #-static
CXXFLAGS = $(CFLAGS)
LLVMDIR=/opt/llvm-EPI-development-toolchain-native
......
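As committed, -DWIDE_LMUL stays inside the trailing comment of the new CFLAGS line, so the LMUL=2 path is compiled out by default; enabling it appears to only require moving the define out of the comment, e.g.

CFLAGS = -O2 -mepi -fno-vectorize -fopenmp -DWIDE_LMUL #-static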
......@@ -96,7 +96,11 @@ SELLMatrix {
num_cols_per_row = ncols_per_row;
// make size of the block equal to a reasonable RVV size
#ifndef WIDE_LMUL
num_rows_per_block = __builtin_epi_vsetvl(8, __epi_e64, __epi_m1); // FIXME: heuristic instead of 8?
#else
num_rows_per_block = __builtin_epi_vsetvl(8, __epi_e64, __epi_m2); // FIXME: heuristic instead of 8?
#endif
num_blocks = (nrows + num_rows_per_block - 1) / num_rows_per_block;
}
......
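The constructor change above sizes a block from what vsetvl grants with e64/m2 instead of e64/m1, so on a given implementation a block can hold up to twice as many rows; the block count stays a plain ceiling division. A small sketch of the resulting geometry, with made-up numbers standing in for the vsetvl result:

/* Sketch of the block geometry implied by the constructor above; the values
   are hypothetical stand-ins, not measurements from any EPI implementation. */
void sell_block_geometry_example(void) {
    int nrows = 1003;                /* hypothetical matrix row count */
    int num_rows_per_block = 8;      /* hypothetical vsetvl(8, __epi_e64, __epi_m2) grant */
    int num_blocks = (nrows + num_rows_per_block - 1) / num_rows_per_block;  /* ceil(1003/8) = 126 */
    int tail_rows = nrows % num_rows_per_block;                              /* 3 rows in the ragged last block */
    (void)num_blocks; (void)tail_rows;
}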
......@@ -675,7 +675,9 @@ void operator()(MatrixType& A,
int num_blocks = A.num_blocks;
int num_rows_per_block = A.num_rows_per_block;
#pragma omp parallel for
for(int block_id=0; block_id < num_blocks; block_id++) {
#ifndef WIDE_LMUL
__epi_1xf64 sum = __builtin_epi_vfmv_v_f_1xf64(0.0, num_rows_per_block);
int block_offset = block_id * num_rows_per_block * row_len;
int stride = num_rows_per_block;
......@@ -696,6 +698,27 @@ void operator()(MatrixType& A,
block_offset += stride;
}
__builtin_epi_vstore_1xf64(&ycoefs[block_id * num_rows_per_block], sum, stride);
#else
__epi_2xf64 sum = __builtin_epi_vfmv_v_f_2xf64(0.0, num_rows_per_block);
int block_offset = block_id * num_rows_per_block * row_len;
int stride = num_rows_per_block;
if (block_id == num_blocks-1 && n%num_rows_per_block!=0){
stride = n%num_rows_per_block;
}
/* SVE would use a mask here; with RVV we can simply lower VL, since any VL <= num_rows_per_block (from vsetvli) is legal */
for(int i=0; i<row_len; i++){
__epi_2xf64 acofs = __builtin_epi_vload_2xf64(&Acoefs[block_offset], stride);
__epi_2xi32 indices32 = __builtin_epi_vload_2xi32(&Acols[block_offset], stride); // column indices, in elements
__epi_2xi64 indices = __builtin_epi_vwadd_2xi64(indices32, indices32, stride); // widen to 64-bit and double: now 2*element, but we need sizeof(double)*element
indices = __builtin_epi_vsll_2xi64(indices, __builtin_epi_vmv_v_x_2xi64(2, stride), stride); // times 4, so x8 overall: byte offsets
__epi_2xf64 xcofs = __builtin_epi_vload_indexed_2xf64(&xcoefs[0], indices, stride); // indexed load takes byte offsets, not element indices, in EPI/RVV
sum = __builtin_epi_vfmacc_2xf64(sum, acofs, xcofs, stride);
block_offset += stride;
}
__builtin_epi_vstore_2xf64(&ycoefs[block_id * num_rows_per_block], sum, stride);
#endif
}
......
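The new branch keeps the design choice of the existing LMUL=1 path: where an SVE port would predicate the ragged last block with a mask, here the vector length passed to every builtin is simply reduced to n % num_rows_per_block, which is legal as long as it does not exceed the vsetvl grant. As a reading aid only (my reconstruction of the layout, not code from the commit), one SELL block computes the equivalent of the scalar loop below, with the block stored slice by slice so that the active rows of a slice are contiguous:

/* Scalar reference for one SELL block; `rows` is num_rows_per_block,
   or n % num_rows_per_block for the ragged tail. */
static void sell_block_scalar(const double *Acoefs, const int *Acols,
                              const double *xcoefs, double *ycoefs,
                              int block_id, int num_rows_per_block,
                              int row_len, int rows)
{
    int base = block_id * num_rows_per_block * row_len;
    for (int lane = 0; lane < rows; lane++) {
        double sum = 0.0;
        for (int i = 0; i < row_len; i++) {
            int e = base + i * rows + lane;        /* slice i, row `lane` of the block */
            sum += Acoefs[e] * xcoefs[Acols[e]];   /* the gather that the indexed load performs */
        }
        ycoefs[block_id * num_rows_per_block + lane] = sum;
    }
}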