Skip to content
Snippets Groups Projects
Commit 00f232f8 authored by Tyler Smith's avatar Tyler Smith
Browse files

Added single-precision micro-kernel for Knights Corner aka MIC aka Xeon Phi

parent 3fc60e49
No related branches found
No related tags found
No related merge requests found
...@@ -97,7 +97,7 @@ ...@@ -97,7 +97,7 @@
// It is sometimes useful to define the various memory alignments in terms // It is sometimes useful to define the various memory alignments in terms
// of some other characteristics of the system, such as the cache line size // of some other characteristics of the system, such as the cache line size
// and the page size. // and the page size.
#define BLIS_CACHE_LINE_SIZE 64 #define BLIS_CACHE_LINE_SIZE 256
#define BLIS_PAGE_SIZE 4096 #define BLIS_PAGE_SIZE 4096
// Alignment size needed by the instruction set for aligned SIMD/vector // Alignment size needed by the instruction set for aligned SIMD/vector
......
...@@ -54,35 +54,42 @@ ...@@ -54,35 +54,42 @@
// (b) NR (for triangular operations such as trmm and trsm). // (b) NR (for triangular operations such as trmm and trsm).
// //
#define BLIS_DEFAULT_MC_S 256 #define BLIS_DEFAULT_MC_S 240
#define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_KC_S 240
#define BLIS_DEFAULT_NC_S 8192 #define BLIS_DEFAULT_NC_S 9600
#define BLIS_DEFAULT_MC_D 120 #define BLIS_DEFAULT_MC_D 120
#define BLIS_DEFAULT_KC_D 240 #define BLIS_DEFAULT_KC_D 240
#define BLIS_DEFAULT_NC_D 14400 #define BLIS_DEFAULT_NC_D 14400
#define BLIS_DEFAULT_MC_C 128 #define BLIS_DEFAULT_4M_MC_C BLIS_DEFAULT_MC_S
#define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_4M_KC_C BLIS_DEFAULT_KC_S
#define BLIS_DEFAULT_NC_C 4096
#define BLIS_DEFAULT_MC_Z 64 #define BLIS_DEFAULT_3M_MC_C BLIS_DEFAULT_MC_S
#define BLIS_DEFAULT_KC_Z 256 #define BLIS_DEFAULT_3M_KC_C BLIS_DEFAULT_KC_S
#define BLIS_DEFAULT_NC_Z 2048
/*
#define BLIS_DEFAULT_MC_C 120
#define BLIS_DEFAULT_KC_C 240
#define BLIS_DEFAULT_NC_C 9600
#define BLIS_DEFAULT_MC_Z 120
#define BLIS_DEFAULT_KC_Z 240
#define BLIS_DEFAULT_NC_Z 9600
*/
// -- Register blocksizes -- // -- Register blocksizes --
#define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_MR_S 30
#define BLIS_DEFAULT_NR_S 4 #define BLIS_DEFAULT_NR_S 16
#define BLIS_DEFAULT_MR_D 30 #define BLIS_DEFAULT_MR_D 30
#define BLIS_DEFAULT_NR_D 8 #define BLIS_DEFAULT_NR_D 8
#define BLIS_DEFAULT_MR_C 8 //#define BLIS_DEFAULT_MR_C 8
#define BLIS_DEFAULT_NR_C 4 //#define BLIS_DEFAULT_NR_C 4
#define BLIS_DEFAULT_MR_Z 8 //#define BLIS_DEFAULT_MR_Z 8
#define BLIS_DEFAULT_NR_Z 4 //#define BLIS_DEFAULT_NR_Z 4
// NOTE: If the micro-kernel, which is typically unrolled to a factor // NOTE: If the micro-kernel, which is typically unrolled to a factor
// of f, handles leftover edge cases (ie: when k % f > 0) then these // of f, handles leftover edge cases (ie: when k % f > 0) then these
...@@ -123,8 +130,8 @@ ...@@ -123,8 +130,8 @@
// leading dimensions used within the packed micro-panels are equal to // leading dimensions used within the packed micro-panels are equal to
// or greater than their corresponding register blocksizes above. // or greater than their corresponding register blocksizes above.
//#define BLIS_EXTEND_MR_S 0 #define BLIS_EXTEND_MR_S 2
//#define BLIS_EXTEND_NR_S 0 #define BLIS_EXTEND_NR_S 0
#define BLIS_EXTEND_MR_D 2 #define BLIS_EXTEND_MR_D 2
#define BLIS_EXTEND_NR_D 0 #define BLIS_EXTEND_NR_D 0
...@@ -153,12 +160,11 @@ ...@@ -153,12 +160,11 @@
// -- gemm -- // -- gemm --
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_30x8 #define BLIS_DGEMM_UKERNEL bli_dgemm_opt_30x8
#define BLIS_SGEMM_UKERNEL bli_sgemm_opt_30x16
// -- trsm-related -- // -- trsm-related --
// -- LEVEL-1M KERNEL DEFINITIONS ---------------------------------------------- // -- LEVEL-1M KERNEL DEFINITIONS ----------------------------------------------
// -- packm -- // -- packm --
......
...@@ -40,25 +40,6 @@ ...@@ -40,25 +40,6 @@
#define B_L1_PREFETCH_DIST 2 #define B_L1_PREFETCH_DIST 2
#define L2_PREFETCH_DIST 16 // Must be greater than 10, because of the way the loop is constructed. #define L2_PREFETCH_DIST 16 // Must be greater than 10, because of the way the loop is constructed.
/*
 * bli_sgemm_opt_30x8
 *
 * Single-precision gemm micro-kernel entry point. This routine performs no
 * computation of its own: it hands every argument, unchanged and in the same
 * order, to BLIS_SGEMM_UKERNEL_REF.
 *
 * NOTE(review): the parameter roles below are inferred from their names only
 * -- confirm against the BLIS micro-kernel interface documentation.
 *   k            presumably the inner (shared) dimension of the update
 *   alpha, beta  presumably pointers to the scalars applied to A*B and C
 *   a, b         presumably the packed micro-panels of A and B
 *   c            presumably the output tile, with strides rs_c and cs_c
 *   data         auxiliary information forwarded untouched
 */
void bli_sgemm_opt_30x8(
    dim_t            k,
    float* restrict  alpha,
    float* restrict  a,
    float* restrict  b,
    float* restrict  beta,
    float* restrict  c, inc_t rs_c, inc_t cs_c,
    auxinfo_t*       data
)
{
    /* Pure pass-through: argument order matches the parameter list above. */
    BLIS_SGEMM_UKERNEL_REF( k, alpha, a, b, beta, c, rs_c, cs_c, data );
}
//Alternate code path used if C is not row-major //Alternate code path used if C is not row-major
#define UPDATE_C_ROW_SCATTERED(REG1, NUM, BASE_DEST) \ #define UPDATE_C_ROW_SCATTERED(REG1, NUM, BASE_DEST) \
{ \ { \
...@@ -268,7 +249,7 @@ void bli_sgemm_opt_30x8( ...@@ -268,7 +249,7 @@ void bli_sgemm_opt_30x8(
} }
//This is an array used for the scatter/gather instructions. //This is an array used for the scatter/gather instructions.
int offsets[16] __attribute__((aligned(0x1000))) = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; extern int offsets[16];
//#define MONITORS //#define MONITORS
......
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment