diff --git a/.travis.yml b/.travis.yml index dbe3c41d8162238be99f241942af3c4cd0c9f979..bbae9a7d9f8581c57c983517b23bcdd984f9a129 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,9 +1,6 @@ language: c sudo: required dist: trusty -env: - global: - secure: "Ty3PM1xGhXwxfJG6YyY9bUZyXzw98ekHxQEqU9VnrMXTZb28IxfocPCXHjL34r9HTGosO5Pmierhal1Cs3ZKE5ZAJqJhCfck+kwlH21Uay5CNYglDtSmy2qxtbbDG4AxpEZ1UKlIZr1pNh/x+pRemSmnMEnQp/E7QJqdkhm4+aMX2bWKyLPtrdL+B9QXLVT2nT6/Fw3i05aBhpcFJpSPfvYX2KoCZYdJOSKcKci4T8nAfP/c0olkz+jAkBZxZFgO9Ptrt/lvHtVPrkh5o29GvHg2i/4vucbsMltoxlV31/2eYpdr17Ngtt41MMVn2fHV4lVhLmENc04nlm084fBtg73T6b8hNy5JlcA44xI/UrPJsQAJ+0A0ds9BbBQKPxOmaF/O8WGXhwiwdKT6DGS9lj05f3S+yZfeNE3pQhLEcvwXLO5SW3VvKXMj0t/lZyG+XCkvFjD7KEPQV4g+BZc2zzD9TwDx3ydn8Uzd6zZlq1erQUzCnODP24wuwfrNP8nqxFYG0VtI8oZW62IC9U2hcnAF5QNXXW3yDYD65k3BHbigfI28gu9iO9G8RxOglR27J7Whdqkqw3AMRaqyHt2tdbz7tM2dLZ0EatT5m8esjC+LP4EshW9C59jP2U9vJ/94YEgOfwiqk8+e6fL/7dJvOumbwu1RclRI9DS88PPYb3Q=" matrix: include: # full testsuite (all tests except for mixed datatype) @@ -80,4 +77,4 @@ script: - $CC --version - make -j 2 - if [ "$TEST" != "0" ]; then travis_wait 30 $DIST_PATH/travis/do_testsuite.sh; fi -- if [ $SDE -eq 1 ] && [ "$TRAVIS_PULL_REQUEST" = "false" ] ; then travis_wait 30 $DIST_PATH/travis/do_sde.sh; fi +- if [ "$SDE" = "1" ]; then travis_wait 30 $DIST_PATH/travis/do_sde.sh; fi diff --git a/README.md b/README.md index 317c80d00034614daaf8e5205dddcc7a2c128fac..60ac20b2c232e0ac637a18ebd9f3da7ae71e11d9 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ Contents -------- * **[Introduction](#introduction)** +* **[Education and Learning](#education-and-learning)** * **[What's New](#whats-new)** * **[What People Are Saying About BLIS](#what-people-are-saying-about-blis)** * **[Key Features](#key-features)** @@ -76,6 +77,17 @@ and [collaborators](http://shpc.ices.utexas.edu/collaborators.html), [publications](http://shpc.ices.utexas.edu/publications.html), and [other educational projects](http://www.ulaff.net/) (such as MOOCs). 
+Education and Learning +---------------------- + +Want to understand what's under the hood? +Many of the same concepts and principles employed when developing BLIS are +introduced and taught in a basic pedagogical setting as part of +[LAFF-On Programming for High Performance (LAFF-On-PfHP)](http://www.ulaff.net/), +one of several massive open online courses (MOOCs) in the +[Linear Algebra: Foundations to Frontiers](http://www.ulaff.net/) series, +all of which are available for free via the [edX platform](http://www.edx.org/). + What's New ---------- diff --git a/build/config.mk.in b/build/config.mk.in index 0516ec97baebef73f1f93051b69db01859ca6b38..34f1931a4cf9f7c42d8bfcf6c34d4011f69579c0 100644 --- a/build/config.mk.in +++ b/build/config.mk.in @@ -89,6 +89,10 @@ endif CC_VENDOR := @CC_VENDOR@ CC := @CC@ +# Important C compiler ranges. +GCC_OT_4_9_0 := @gcc_older_than_4_9_0@ +GCC_OT_6_1_0 := @gcc_older_than_6_1_0@ + # The C++ compiler. NOTE: A C++ is typically not needed. CXX := @CXX@ diff --git a/config/amd64/make_defs.mk b/config/amd64/make_defs.mk index 70c0b692b494864b2e3a00fd19a73adb6b9ae4e8..df7cd20b79277ff2e140e9d2150f4e256277e57a 100644 --- a/config/amd64/make_defs.mk +++ b/config/amd64/make_defs.mk @@ -75,10 +75,14 @@ endif # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) -CRVECFLAGS := $(CKVECFLAGS) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else +ifeq ($(CC_VENDOR),clang) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := $(CKVECFLAGS) endif +endif # Store all of the variables here to new variables containing the # configuration name. diff --git a/config/arm32/make_defs.mk b/config/arm32/make_defs.mk index b592851e527569b4a1373a66d24df565bf3f1e41..0b517a1efde1bef71316fc5d48bd3b4547d94c7b 100644 --- a/config/arm32/make_defs.mk +++ b/config/arm32/make_defs.mk @@ -70,7 +70,15 @@ endif # Flags specific to reference kernels. 
CROPTFLAGS := $(CKOPTFLAGS) +ifeq ($(CC_VENDOR),gcc) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else +ifeq ($(CC_VENDOR),clang) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else CRVECFLAGS := $(CKVECFLAGS) +endif +endif # Store all of the variables here to new variables containing the # configuration name. diff --git a/config/arm64/make_defs.mk b/config/arm64/make_defs.mk index ac1cd697398f4b393ab2a2f5b82a8aa0fad65f8f..5ffb0815ad9db2ee095ee5cee56415cc28babc1f 100644 --- a/config/arm64/make_defs.mk +++ b/config/arm64/make_defs.mk @@ -70,7 +70,15 @@ endif # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) +ifeq ($(CC_VENDOR),gcc) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else +ifeq ($(CC_VENDOR),clang) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else CRVECFLAGS := $(CKVECFLAGS) +endif +endif # Store all of the variables here to new variables containing the # configuration name. diff --git a/config/bgq/make_defs.mk b/config/bgq/make_defs.mk index a577a9a32cbfb316424f87304fe657c18bb11315..97ea5a5ac6c3f5e350f7ff78af0b4856f59d49f4 100644 --- a/config/bgq/make_defs.mk +++ b/config/bgq/make_defs.mk @@ -79,7 +79,15 @@ endif # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) +ifeq ($(CC_VENDOR),gcc) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else +ifeq ($(CC_VENDOR),clang) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else CRVECFLAGS := $(CKVECFLAGS) +endif +endif # Override the default value for LDFLAGS. ifeq ($(CC_VENDOR),ibm) diff --git a/config/bulldozer/make_defs.mk b/config/bulldozer/make_defs.mk index dec89a4c3e49fc40761cd5e0f7a94587fb0e8dd4..8f71da3bfa5b950f37765cd49d9e25fa1cf05f7c 100644 --- a/config/bulldozer/make_defs.mk +++ b/config/bulldozer/make_defs.mk @@ -75,10 +75,14 @@ endif # Flags specific to reference kernels. 
CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) -CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else +ifeq ($(CC_VENDOR),clang) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := $(CKVECFLAGS) endif +endif # Store all of the variables here to new variables containing the # configuration name. diff --git a/config/cortexa15/make_defs.mk b/config/cortexa15/make_defs.mk index ee4d301f4baaff9a4404b45db03fbfa5eedf79ce..0cbf304db2bad10bd7e624568d91326ed4bc4065 100644 --- a/config/cortexa15/make_defs.mk +++ b/config/cortexa15/make_defs.mk @@ -70,7 +70,15 @@ endif # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) +ifeq ($(CC_VENDOR),gcc) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else +ifeq ($(CC_VENDOR),clang) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else CRVECFLAGS := $(CKVECFLAGS) +endif +endif # Store all of the variables here to new variables containing the # configuration name. diff --git a/config/cortexa53/make_defs.mk b/config/cortexa53/make_defs.mk index 9f723bcde3110e8063595240bd1a3c8f86b19e21..3e116cd6eb06504253d1e62f97b91eed5e2c7d69 100644 --- a/config/cortexa53/make_defs.mk +++ b/config/cortexa53/make_defs.mk @@ -70,7 +70,15 @@ endif # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) +ifeq ($(CC_VENDOR),gcc) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else +ifeq ($(CC_VENDOR),clang) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else CRVECFLAGS := $(CKVECFLAGS) +endif +endif # Store all of the variables here to new variables containing the # configuration name. 
diff --git a/config/cortexa57/make_defs.mk b/config/cortexa57/make_defs.mk index 23bcf51e6e7c37dabe6c430558945eb86e69ae79..864872bc2730d51bf526d0e6ee1411624fb98f41 100644 --- a/config/cortexa57/make_defs.mk +++ b/config/cortexa57/make_defs.mk @@ -70,7 +70,15 @@ endif # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) +ifeq ($(CC_VENDOR),gcc) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else +ifeq ($(CC_VENDOR),clang) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else CRVECFLAGS := $(CKVECFLAGS) +endif +endif # Store all of the variables here to new variables containing the # configuration name. diff --git a/config/cortexa9/make_defs.mk b/config/cortexa9/make_defs.mk index 2adc40e307cd7f337d34a9031c3124c1f6764d02..310b75b95b1c2c23fa46c89b00e654cc26de241c 100644 --- a/config/cortexa9/make_defs.mk +++ b/config/cortexa9/make_defs.mk @@ -70,7 +70,15 @@ endif # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) +ifeq ($(CC_VENDOR),gcc) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else +ifeq ($(CC_VENDOR),clang) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else CRVECFLAGS := $(CKVECFLAGS) +endif +endif # Store all of the variables here to new variables containing the # configuration name. diff --git a/config/excavator/make_defs.mk b/config/excavator/make_defs.mk index deb85c79bdd33fd3306587310043696e236e50c0..ed73d5dc8641d85dbda3d6cc9270f1b5f01bb3c9 100644 --- a/config/excavator/make_defs.mk +++ b/config/excavator/make_defs.mk @@ -75,10 +75,14 @@ endif # Flags specific to reference kernels. 
CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) -CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else +ifeq ($(CC_VENDOR),clang) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := $(CKVECFLAGS) endif +endif # Store all of the variables here to new variables containing the # configuration name. diff --git a/config/generic/make_defs.mk b/config/generic/make_defs.mk index 3388291da017677dc636fde06b1e7b200f505d5c..7f934de38e1d1b669349e35720182359a3304ba6 100644 --- a/config/generic/make_defs.mk +++ b/config/generic/make_defs.mk @@ -79,10 +79,14 @@ endif # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) -CRVECFLAGS := $(CKVECFLAGS) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else +ifeq ($(CC_VENDOR),clang) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := $(CKVECFLAGS) endif +endif # Store all of the variables here to new variables containing the # configuration name. 
diff --git a/config/haswell/bli_cntx_init_haswell.c b/config/haswell/bli_cntx_init_haswell.c index ea839e4ee64bea6ab7e339aea199e742ab4a8541..7f222415a7f88787ae4f922ec52d43da71aed5b4 100644 --- a/config/haswell/bli_cntx_init_haswell.c +++ b/config/haswell/bli_cntx_init_haswell.c @@ -123,12 +123,18 @@ void bli_cntx_init_haswell( cntx_t* cntx ) #if 1 bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 6, 3, 3 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); + //bli_blksz_init_easy( &blkszs[ BLIS_MC ], 1008, 1008, 1008, 1008 ); + //bli_blksz_init_easy( &blkszs[ BLIS_MC ], 168, 72, 72, 36 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 168, 72, 75, 192 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 ); #else bli_blksz_init_easy( &blkszs[ BLIS_MR ], 16, 8, 8, 4 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], 6, 6, 3, 3 ); -#endif - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 ); + //bli_blksz_init_easy( &blkszs[ BLIS_MC ], 1024, 1024, 1024, 1024 ); + //bli_blksz_init_easy( &blkszs[ BLIS_MC ], 112, 64, 56, 32 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 112, 72, 56, 44 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 ); +#endif bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 ); bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, 8, 8 ); bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, 8, 8 ); diff --git a/config/haswell/make_defs.mk b/config/haswell/make_defs.mk index f08d5a937e42ea1376a690d954c03a74fc82cbfd..6752dde295871c6f5117f49806ab15f357f563d8 100644 --- a/config/haswell/make_defs.mk +++ b/config/haswell/make_defs.mk @@ -63,13 +63,17 @@ endif # Flags specific to optimized kernels. CKOPTFLAGS := $(COPTFLAGS) ifeq ($(CC_VENDOR),gcc) +CKVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=haswell +ifeq ($(GCC_OT_4_9_0),yes) +# If gcc is older than 4.9.0, we must use a different label for -march. 
CKVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=core-avx2 +endif else ifeq ($(CC_VENDOR),icc) CKVECFLAGS := -xCORE-AVX2 else ifeq ($(CC_VENDOR),clang) -CKVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=core-avx2 +CKVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=haswell else $(error gcc, icc, or clang is required for this configuration.) endif @@ -79,10 +83,14 @@ endif # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) -CRVECFLAGS := $(CKVECFLAGS) #-funsafe-math-optimizations +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else +ifeq ($(CC_VENDOR),clang) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := $(CKVECFLAGS) endif +endif # Store all of the variables here to new variables containing the # configuration name. diff --git a/config/intel64/make_defs.mk b/config/intel64/make_defs.mk index af462fdc3f2da32005c6e9b604d68303bf10e61f..f74fb4d70a33a4a7441ac12938ae5edb93c5cc10 100644 --- a/config/intel64/make_defs.mk +++ b/config/intel64/make_defs.mk @@ -79,10 +79,14 @@ endif # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) -CRVECFLAGS := $(CKVECFLAGS) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else +ifeq ($(CC_VENDOR),clang) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := $(CKVECFLAGS) endif +endif # Store all of the variables here to new variables containing the # configuration name. diff --git a/config/knc/make_defs.mk b/config/knc/make_defs.mk index be3c9019d8b7d1f1e1b6655d8aacf67498a02206..d58521969f31777ffd29ecf8c5c694eadcc6d886 100644 --- a/config/knc/make_defs.mk +++ b/config/knc/make_defs.mk @@ -71,10 +71,14 @@ endif # Flags specific to reference kernels. 
CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) -CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else +ifeq ($(CC_VENDOR),clang) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := $(CKVECFLAGS) endif +endif # Override the default value for LDFLAGS. LDFLAGS := -mmic diff --git a/config/knl/make_defs.mk b/config/knl/make_defs.mk index b08cf1e4d5ff348dafd512febf825b0a3b0c8998..aa74df31c55759d7a000d90e84df651d4f9ddc8e 100644 --- a/config/knl/make_defs.mk +++ b/config/knl/make_defs.mk @@ -99,13 +99,13 @@ endif # Note: We use AVX2 for reference kernels instead of AVX-512. CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) -CRVECFLAGS := -march=knl -mno-avx512f -mno-avx512pf -mno-avx512er -mno-avx512cd -funsafe-math-optimizations +CRVECFLAGS := -march=knl -mno-avx512f -mno-avx512pf -mno-avx512er -mno-avx512cd -funsafe-math-optimizations -ffp-contract=fast else ifeq ($(CC_VENDOR),icc) CRVECFLAGS := -xMIC-AVX512 else ifeq ($(CC_VENDOR),clang) -CRVECFLAGS := -march=knl -mno-avx512f -mno-avx512pf -mno-avx512er -mno-avx512cd +CRVECFLAGS := -march=knl -mno-avx512f -mno-avx512pf -mno-avx512er -mno-avx512cd -funsafe-math-optimizations -ffp-contract=fast else $(error gcc, icc, or clang is required for this configuration.) endif diff --git a/config/penryn/make_defs.mk b/config/penryn/make_defs.mk index 41d2d939fcd65f4a663d355b0ee4fe6839d6280e..573382ea256e39e301802bfc203a3100d791c441 100644 --- a/config/penryn/make_defs.mk +++ b/config/penryn/make_defs.mk @@ -79,10 +79,14 @@ endif # Flags specific to reference kernels. 
CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) -CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else +ifeq ($(CC_VENDOR),clang) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := $(CKVECFLAGS) endif +endif # Store all of the variables here to new variables containing the # configuration name. diff --git a/config/piledriver/make_defs.mk b/config/piledriver/make_defs.mk index bb23fbecea7eda7be93efb049de1baf1049bd859..8cf3ac5d988321a94aff5df84e6685214d91277c 100644 --- a/config/piledriver/make_defs.mk +++ b/config/piledriver/make_defs.mk @@ -75,10 +75,14 @@ endif # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) -CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else +ifeq ($(CC_VENDOR),clang) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := $(CKVECFLAGS) endif +endif # Store all of the variables here to new variables containing the # configuration name. diff --git a/config/power7/make_defs.mk b/config/power7/make_defs.mk index 18f111bf68f2461b0ce844de8454ac7d45ef3d38..9633b4f18396b84a94d3c384ceacb93bfa3d4e68 100644 --- a/config/power7/make_defs.mk +++ b/config/power7/make_defs.mk @@ -70,7 +70,15 @@ endif # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) +ifeq ($(CC_VENDOR),gcc) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else +ifeq ($(CC_VENDOR),clang) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else CRVECFLAGS := $(CKVECFLAGS) +endif +endif # Store all of the variables here to new variables containing the # configuration name. 
diff --git a/config/power9/make_defs.mk b/config/power9/make_defs.mk index 3d66f607956d02f6db76da801720feb63ed38cc5..b2c78b16a7981e53b8f52dccbaca0a00c105aabd 100644 --- a/config/power9/make_defs.mk +++ b/config/power9/make_defs.mk @@ -70,7 +70,15 @@ endif # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) +ifeq ($(CC_VENDOR),gcc) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else +ifeq ($(CC_VENDOR),clang) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else CRVECFLAGS := $(CKVECFLAGS) +endif +endif # Store all of the variables here to new variables containing the # configuration name. diff --git a/config/sandybridge/make_defs.mk b/config/sandybridge/make_defs.mk index ba18e4f3286248d97c84322387e25251246ce534..896cb8993ceb266050b8d1e63cecf05a766638f6 100644 --- a/config/sandybridge/make_defs.mk +++ b/config/sandybridge/make_defs.mk @@ -63,13 +63,17 @@ endif # Flags specific to optimized kernels. CKOPTFLAGS := $(COPTFLAGS) ifeq ($(CC_VENDOR),gcc) +CKVECFLAGS := -mavx -mfpmath=sse -march=sandybridge +ifeq ($(GCC_OT_4_9_0),yes) +# If gcc is older than 4.9.0, we must use a different label for -march. CKVECFLAGS := -mavx -mfpmath=sse -march=corei7-avx +endif else ifeq ($(CC_VENDOR),icc) CKVECFLAGS := -xAVX else ifeq ($(CC_VENDOR),clang) -CKVECFLAGS := -mavx -mfpmath=sse -march=corei7-avx +CKVECFLAGS := -mavx -mfpmath=sse -march=sandybridge else $(error gcc, icc, or clang is required for this configuration.) endif @@ -79,10 +83,14 @@ endif # Flags specific to reference kernels. 
CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) -CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else +ifeq ($(CC_VENDOR),clang) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := $(CKVECFLAGS) endif +endif # Store all of the variables here to new variables containing the # configuration name. diff --git a/config/skx/make_defs.mk b/config/skx/make_defs.mk index 27bea5ef55ea90563412f14b81414307a6784a2a..920b42d98a06a105f28f32a9e6f5229220642d0b 100644 --- a/config/skx/make_defs.mk +++ b/config/skx/make_defs.mk @@ -89,13 +89,13 @@ endif # to overcome the AVX-512 frequency drop". (Issue #187) CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) -CRVECFLAGS := -march=skylake-avx512 -mno-avx512f -mno-avx512vl -mno-avx512bw -mno-avx512dq -mno-avx512cd -funsafe-math-optimizations +CRVECFLAGS := -march=skylake-avx512 -mno-avx512f -mno-avx512vl -mno-avx512bw -mno-avx512dq -mno-avx512cd -funsafe-math-optimizations -ffp-contract=fast else ifeq ($(CC_VENDOR),icc) CRVECFLAGS := -xCORE-AVX2 else ifeq ($(CC_VENDOR),clang) -CRVECFLAGS := -march=skylake-avx512 -mno-avx512f -mno-avx512vl -mno-avx512bw -mno-avx512dq -mno-avx512cd +CRVECFLAGS := -march=skylake-avx512 -mno-avx512f -mno-avx512vl -mno-avx512bw -mno-avx512dq -mno-avx512cd -funsafe-math-optimizations -ffp-contract=fast else $(error gcc, icc, or clang is required for this configuration.) endif diff --git a/config/steamroller/make_defs.mk b/config/steamroller/make_defs.mk index a5b6707041da393715df1f1127f9feea1396ac50..89c76890355a2f976f5e7372af5447d34583f7bb 100644 --- a/config/steamroller/make_defs.mk +++ b/config/steamroller/make_defs.mk @@ -75,10 +75,14 @@ endif # Flags specific to reference kernels. 
CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) -CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else +ifeq ($(CC_VENDOR),clang) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := $(CKVECFLAGS) endif +endif # Store all of the variables here to new variables containing the # configuration name. diff --git a/config/thunderx2/make_defs.mk b/config/thunderx2/make_defs.mk index 3227fe242bad2597a93c3695eeac0c49fb362035..820919d9c80cfbabd388df8c51df6d7ef11603bb 100644 --- a/config/thunderx2/make_defs.mk +++ b/config/thunderx2/make_defs.mk @@ -70,7 +70,15 @@ endif # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) +ifeq ($(CC_VENDOR),gcc) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else +ifeq ($(CC_VENDOR),clang) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else CRVECFLAGS := $(CKVECFLAGS) +endif +endif # Store all of the variables here to new variables containing the # configuration name. diff --git a/config/x86_64/make_defs.mk b/config/x86_64/make_defs.mk index 4d038ff04b2a38164ce556379bc66a3437b55c12..520cd42ac4993ca70f1629ac678cf942129b000d 100644 --- a/config/x86_64/make_defs.mk +++ b/config/x86_64/make_defs.mk @@ -79,10 +79,14 @@ endif # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) -CRVECFLAGS := $(CKVECFLAGS) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else +ifeq ($(CC_VENDOR),clang) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := $(CKVECFLAGS) endif +endif # Store all of the variables here to new variables containing the # configuration name. 
diff --git a/config/zen/make_defs.mk b/config/zen/make_defs.mk index 0397f60b7cf3cd9cef988ef6b227adb01a9a7e5d..1b9db53713b2d72c852c3b652eb2e71896fc0626 100644 --- a/config/zen/make_defs.mk +++ b/config/zen/make_defs.mk @@ -63,11 +63,15 @@ endif # Flags specific to optimized kernels. CKOPTFLAGS := $(COPTFLAGS) ifeq ($(CC_VENDOR),gcc) -# gcc 6.0 (clang 4.0) or later: -#CKVECFLAGS := -mavx2 -mfpmath=sse -mfma -march=znver1 -# gcc 4.9 (clang 3.5) or later: -# possibly add zen-specific instructions: -mclzero -madx -mrdseed -mmwaitx -msha -mxsavec -mxsaves -mclflushopt -mpopcnt +CKVECFLAGS := -mavx2 -mfpmath=sse -mfma -march=znver1 +ifeq ($(GCC_OT_6_1_0),yes) +# If gcc is older than 6.1.0, we must use -march=bdver4 and then remove the +# Bulldozer instruction sets that were omitted from Zen. +# Additionally, if gcc is 4.9 (clang 3.5?) or newer, we may want to add +# Zen-specific instructions back into the mix: +# -mclzero -madx -mrdseed -mmwaitx -msha -mxsavec -mxsaves -mclflushopt -mpopcnt CKVECFLAGS := -mavx2 -mfpmath=sse -mfma -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp +endif else ifeq ($(CC_VENDOR),clang) CKVECFLAGS := -mavx2 -mfpmath=sse -mfma -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp @@ -79,10 +83,14 @@ endif # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) -CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +else +ifeq ($(CC_VENDOR),clang) +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := $(CKVECFLAGS) endif +endif # Store all of the variables here to new variables containing the # configuration name. 
diff --git a/configure b/configure
index 19489d2e47d41c69a9bb534bd7dd8d265c02c717..bf3f94527913741826806916111b8b758c917d78 100755
--- a/configure
+++ b/configure
@@ -1506,6 +1506,79 @@ check_compiler()
 	fi
 }
 
+check_compiler_version_ranges()
+{
+	local cc
+
+	cc="${found_cc}"
+
+	#
+	# We check for various compiler version ranges that may cause us
+	# issues in properly supporting those compiler versions within the
+	# BLIS build system.
+	#
+	# range:    gcc < 4.9.0 (ie: 4.8.5 or older)
+	# variable: gcc_older_than_4_9_0
+	# comments:
+	#   These older versions of gcc may support microarchitectures such as
+	#   sandybridge, but the '-march=' flag uses a different label syntax.
+	#   In newer versions, '-march=sandybridge' is the preferred syntax [1].
+	#   However, in older versions, the syntax for the same compiler option
+	#   is '-march=corei7-avx' [2].
+	#
+	#   [1] https://gcc.gnu.org/onlinedocs/gcc-4.9.0/gcc/i386-and-x86-64-Options.html#i386-and-x86-64-Options
+	#   [2] https://gcc.gnu.org/onlinedocs/gcc-4.8.5/gcc/i386-and-x86-64-Options.html#i386-and-x86-64-Options
+	#
+	# range:    gcc < 6.1 (ie: 5.5 or older)
+	# variable: gcc_older_than_6_1_0
+	# comments:
+	#   These older versions of gcc do not explicitly support the Zen (Zen1)
+	#   microarchitecture; the newest microarchitectural value understood by
+	#   these versions is '-march=bdver4' [3]. However, support for them can
+	#   be attained in a roundabout way by starting with the instruction sets
+	#   enabled by '-march=bdver4' and then disabling the instruction sets
+	#   that were removed in the transition from Excavator to Zen, namely:
+	#   FMA4, TBM, XOP, and LWP. Newer versions of gcc support Zen via the
+	#   '-march=znver1' option [4].
+	#
+	#   [3] https://gcc.gnu.org/onlinedocs/gcc-5.5.0/gcc/x86-Options.html#x86-Options
+	#   [4] https://gcc.gnu.org/onlinedocs/gcc-6.1.0/gcc/x86-Options.html#x86-Options
+	#
+
+	gcc_older_than_4_9_0='no'
+	gcc_older_than_6_1_0='no'
+
+	echo "${script_name}: checking ${cc} ${cc_version} against known consequential version ranges."
+
+	# gcc
+	if [ "x${cc_vendor}" = "xgcc" ]; then
+
+		# Check for gcc < 4.9.0 (ie: 4.8.5 or older). Major versions below 4
+		# are older still, so they must set the flag too (not only 4.x, x < 9).
+		if [ "${cc_major}" -lt 4 ] ||
+		   { [ "${cc_major}" -eq 4 ] && [ "${cc_minor}" -lt 9 ]; }; then
+			echo "${script_name}: note: found ${cc} version older than 4.9.0."
+			gcc_older_than_4_9_0='yes'
+		fi
+
+		# Check for gcc < 6.1.0 (ie: 5.5 or older).
+		if [ "${cc_major}" -lt 6 ]; then
+			echo "${script_name}: note: found ${cc} version older than 6.1."
+			gcc_older_than_6_1_0='yes'
+		fi
+	fi
+
+	# icc
+	if [ "x${cc_vendor}" = "xicc" ]; then
+		:
+	fi
+
+	# clang
+	if [ "x${cc_vendor}" = "xclang" ]; then
+		:
+	fi
+}
+
 check_assembler()
 {
 	local cc asm_dir cflags asm_fp
@@ -2222,9 +2295,11 @@ main()
 
 	# Check the compiler's version. Certain versions of certain compilers
 	# will preclude building certain sub-configurations, which are added
-	# to a blacklist.
+	# to a blacklist. We also make note of certain version ranges that
+	# will be useful to know about later.
 	get_compiler_version
 	check_compiler
+	check_compiler_version_ranges
 
 	# Now check the assembler's ability to assemble code. Older versions
 	# of binutils may not be aware of certain instruction sets.
Those @@ -3017,6 +3092,8 @@ main() | sed -e "s/@is_win@/${is_win}/g" \ | sed -e "s/@dist_path@/${dist_path_esc}/g" \ | sed -e "s/@CC_VENDOR@/${cc_vendor}/g" \ + | sed -e "s/@gcc_older_than_4_9_0@/${gcc_older_than_4_9_0}/g" \ + | sed -e "s/@gcc_older_than_6_1_0@/${gcc_older_than_6_1_0}/g" \ | sed -e "s/@CC@/${cc_esc}/g" \ | sed -e "s/@CXX@/${cxx_esc}/g" \ | sed -e "s/@RANLIB@/${ranlib_esc}/g" \ diff --git a/docs/Performance.md b/docs/Performance.md index e51028c49a2cf0e137bef112bba0417854dad333..55d0370dafb2ec880aeb0a47f2cf5fbb4aca67d2 100644 --- a/docs/Performance.md +++ b/docs/Performance.md @@ -127,7 +127,9 @@ size of interest so that we can better assist you. * single-core: 17.6 GFLOPS (double-precision), 35.2 GFLOPS (single-precision) * multicore: 17.6 GFLOPS/core (double-precision), 35.2 GFLOPS/core (single-precision) * Operating system: Ubuntu 16.04 (Linux kernel 4.15.0) +* Page size: unknown * Compiler: gcc 7.3.0 +* Driver source code directory: `test/3` * Results gathered: 14 February 2019 * Implementations tested: * BLIS 075143df (0.5.1-39) @@ -187,7 +189,9 @@ size of interest so that we can better assist you. * single-core: 64 GFLOPS (double-precision), 128 GFLOPS (single-precision) * multicore: 64 GFLOPS/core (double-precision), 128 GFLOPS/core (single-precision) * Operating system: Ubuntu 18.04 (Linux kernel 4.15.0) +* Page size: 4096 bytes * Compiler: gcc 7.3.0 +* Driver source code directory: `test/3` * Results gathered: 6 March 2019, 27 March 2019 * Implementations tested: * BLIS 9f1dbe5 (0.5.1-54) @@ -204,7 +208,14 @@ size of interest so that we can better assist you. * Multithreaded (52 core) execution requested via `export OPENBLAS_NUM_THREADS=52` * Eigen 3.3.90 * Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (March 27, 2019) - * Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal). 
+ * Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal): + ``` + # These lines added after line 67. + check_cxx_compiler_flag("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE) + if(COMPILER_SUPPORTS_MARCH_NATIVE) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") + endif() + ``` * configured and built BLAS library via `mkdir build; cd build; cmake ..; make blas` * The `gemm` implementation was pulled in at compile-time via Eigen headers; other operations were linked to Eigen's BLAS library. * Single-threaded (1 core) execution requested via `export OMP_NUM_THREADS=1` @@ -259,7 +270,9 @@ size of interest so that we can better assist you. * single-core: 56 GFLOPS (double-precision), 112 GFLOPS (single-precision) * multicore: 49.6 GFLOPS/core (double-precision), 99.2 GFLOPS/core (single-precision) * Operating system: Cray Linux Environment 6 (Linux kernel 4.4.103) +* Page size: 4096 bytes * Compiler: gcc 6.3.0 +* Driver source code directory: `test/3` * Results gathered: 25-26 February 2019, 27 March 2019 * Implementations tested: * BLIS 075143df (0.5.1-39) @@ -276,7 +289,14 @@ size of interest so that we can better assist you. * Multithreaded (24 core) execution requested via `export OPENBLAS_NUM_THREADS=24` * Eigen 3.3.90 * Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (March 27, 2019) - * Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal). + * Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal): + ``` + # These lines added after line 67. 
+ check_cxx_compiler_flag("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE) + if(COMPILER_SUPPORTS_MARCH_NATIVE) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") + endif() + ``` * configured and built BLAS library via `mkdir build; cd build; cmake ..; make blas` * The `gemm` implementation was pulled in at compile-time via Eigen headers; other operations were linked to Eigen's BLAS library. * Single-threaded (1 core) execution requested via `export OMP_NUM_THREADS=1` @@ -329,7 +349,9 @@ size of interest so that we can better assist you. * single-core: 24 GFLOPS (double-precision), 48 GFLOPS (single-precision) * multicore: 20.4 GFLOPS/core (double-precision), 40.8 GFLOPS/core (single-precision) * Operating system: Ubuntu 18.04 (Linux kernel 4.15.0) +* Page size: 4096 bytes * Compiler: gcc 7.3.0 +* Driver source code directory: `test/3` * Results gathered: 6 March 2019, 19 March 2019, 27 March 2019 * Implementations tested: * BLIS 9f1dbe5 (0.5.1-54) @@ -346,7 +368,14 @@ size of interest so that we can better assist you. * Multithreaded (64 core) execution requested via `export OPENBLAS_NUM_THREADS=64` * Eigen 3.3.90 * Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (March 27, 2019) - * Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal). + * Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal): + ``` + # These lines added after line 67. + check_cxx_compiler_flag("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE) + if(COMPILER_SUPPORTS_MARCH_NATIVE) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") + endif() + ``` * configured and built BLAS library via `mkdir build; cd build; cmake ..; make blas` * The `gemm` implementation was pulled in at compile-time via Eigen headers; other operations were linked to Eigen's BLAS library. 
* Single-threaded (1 core) execution requested via `export OMP_NUM_THREADS=1` diff --git a/docs/PerformanceSmall.md b/docs/PerformanceSmall.md index 52f15487d09e68fd87b6d29f5d3d8946580beb85..2c46130bde4fd9e20c8618ab461415d17ce25c95 100644 --- a/docs/PerformanceSmall.md +++ b/docs/PerformanceSmall.md @@ -1,16 +1,19 @@ # Contents -* **[Contents](Performance.md#contents)** -* **[Introduction](Performance.md#introduction)** -* **[General information](Performance.md#general-information)** -* **[Level-3 performance](Performance.md#level-3-performance)** - * **[Kaby Lake](Performance.md#kaby-lake)** - * **[Experiment details](Performance.md#kaby-lake-experiment-details)** - * **[Results](Performance.md#kaby-lake-results)** - * **[Epyc](Performance.md#epyc)** - * **[Experiment details](Performance.md#epyc-experiment-details)** - * **[Results](Performance.md#epyc-results)** -* **[Feedback](Performance.md#feedback)** +* **[Contents](PerformanceSmall.md#contents)** +* **[Introduction](PerformanceSmall.md#introduction)** +* **[General information](PerformanceSmall.md#general-information)** +* **[Level-3 performance](PerformanceSmall.md#level-3-performance)** + * **[Kaby Lake](PerformanceSmall.md#kaby-lake)** + * **[Experiment details](PerformanceSmall.md#kaby-lake-experiment-details)** + * **[Results](PerformanceSmall.md#kaby-lake-results)** + * **[Haswell](PerformanceSmall.md#haswell)** + * **[Experiment details](PerformanceSmall.md#haswell-experiment-details)** + * **[Results](PerformanceSmall.md#haswell-results)** + * **[Epyc](PerformanceSmall.md#epyc)** + * **[Experiment details](PerformanceSmall.md#epyc-experiment-details)** + * **[Results](PerformanceSmall.md#epyc-results)** +* **[Feedback](PerformanceSmall.md#feedback)** # Introduction @@ -110,25 +113,37 @@ size of interest so that we can better assist you. 
* Max FMA vector IPC: 2 * Peak performance: * single-core: 57.6 GFLOPS (double-precision), 115.2 GFLOPS (single-precision) -* Operating system: Gentoo Linux (Linux kernel 5.0.7) -* Compiler: gcc 7.3.0 -* Results gathered: 31 May 2019, 3 June 2019 +* Operating system: Gentoo Linux (Linux kernel 5.2.4) +* Page size: 4096 bytes +* Compiler: gcc 8.3.0 +* Driver source code directory: `test/sup` +* Results gathered: 23-28 August 2019 * Implementations tested: - * BLIS 6bf449c (0.5.2-42) + * BLIS 4a0a6e8 (0.6.0-28) * configured with `./configure --enable-cblas auto` * sub-configuration exercised: `haswell` - * OpenBLAS 0.3.6 + * OpenBLAS 0.3.7 * configured `Makefile.rule` with `BINARY=64 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=0` (single-threaded) - * BLASFEO 75a3dd8 + * BLASFEO 01f6b7f * configured `Makefile.rule` with: `BLAS_API=1 FORTRAN_BLAS_API=1 CBLAS_API=1`. * Eigen 3.3.90 - * Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (30 May 2019) - * Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal). - * configured and built BLAS library via `mkdir build; cd build; cmake ..; make blas` + * Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (28 August 2019) + * Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal): + ``` + # These lines added after line 67. + check_cxx_compiler_flag("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE) + if(COMPILER_SUPPORTS_MARCH_NATIVE) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") + endif() + ``` + * configured and built BLAS library via `mkdir build; cd build; CC=gcc cmake ..; make blas` + * installed headers via `cmake . 
-DCMAKE_INSTALL_PREFIX=$HOME/flame/eigen; make install` * The `gemm` implementation was pulled in at compile-time via Eigen headers; other operations were linked to Eigen's BLAS library. * Requested threading via `export OMP_NUM_THREADS=1` (single-threaded) - * MKL 2018 update 4 + * MKL 2019 update 4 * Requested threading via `export MKL_NUM_THREADS=1` (single-threaded) + * libxsmm 77a295c (1.6.5-6679) + * compiled with `make AVX=2`; linked with [netlib BLAS](http://www.netlib.org/blas/) 3.6.0 as the fallback library to better show where libxsmm stops handling the computation internally. * Affinity: * N/A. * Frequency throttling (via `cpupower`): @@ -137,8 +152,7 @@ size of interest so that we can better assist you. * Hardware limits: 800MHz - 3.8GHz * Adjusted minimum: 3.7GHz * Comments: - * For both row- and column-stored matrices, BLIS's new small/skinny matrix implementation is competitive with (or exceeds the performance of) the next highest-performing solution (typically MKL), except for a few cases of where the _k_ dimension is very small. It is likely the case that this shape scenario begs a different kernel approach, since the BLIS microkernel is inherently designed to iterate over many _k_ dimension iterations (which leads them to incur considerable overhead for small values of _k_). - * For the classic case of `dgemm_nn` on square matrices, BLIS is the fastest implementation for the problem size range of approximately 80 to 180. BLIS is also competitive in this general range for other transpose parameter combinations (`nt`, `tn`, and `tt`). + * libxsmm is highly competitive for very small problems, but quickly gives up once the "large" dimension exceeds about 180-240 (or 64 in the case where all operands are square). Also, libxsmm's `gemm` cannot handle a transposition on matrix A and similarly dispatches the fallback implementation for those cases. 
libxsmm also does not export CBLAS interfaces, and therefore only appears on the graphs for column-stored matrices. ### Kaby Lake results @@ -156,6 +170,73 @@ size of interest so that we can better assist you. --- +## Haswell + +### Haswell experiment details + +* Location: TACC (Lonestar5) +* Processor model: Intel Xeon E5-2690 v3 (Haswell) +* Core topology: two sockets, 12 cores per socket, 24 cores total +* SMT status: enabled, but not utilized +* Max clock rate: 3.5GHz (single-core), 3.1GHz (multicore) +* Max vector register length: 256 bits (AVX2) +* Max FMA vector IPC: 2 +* Peak performance: + * single-core: 56 GFLOPS (double-precision), 112 GFLOPS (single-precision) +* Operating system: Cray Linux Environment 6 (Linux kernel 4.4.103) +* Page size: 4096 bytes +* Compiler: gcc 7.3.0 +* Driver source code directory: `test/sup` +* Results gathered: 23-28 August 2019 +* Implementations tested: + * BLIS 4a0a6e8 (0.6.0-28) + * configured with `./configure --enable-cblas auto` + * sub-configuration exercised: `haswell` + * OpenBLAS 0.3.7 + * configured `Makefile.rule` with `BINARY=64 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=0` (single-threaded) + * BLASFEO 01f6b7f + * configured `Makefile.rule` with: `BLAS_API=1 FORTRAN_BLAS_API=1 CBLAS_API=1`. + * Eigen 3.3.90 + * Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (28 August 2019) + * Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal): + ``` + # These lines added after line 67. + check_cxx_compiler_flag("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE) + if(COMPILER_SUPPORTS_MARCH_NATIVE) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") + endif() + ``` + * configured and built BLAS library via `mkdir build; cd build; CC=gcc cmake ..; make blas` + * installed headers via `cmake . 
-DCMAKE_INSTALL_PREFIX=$HOME/flame/eigen; make install` + * The `gemm` implementation was pulled in at compile-time via Eigen headers; other operations were linked to Eigen's BLAS library. + * Requested threading via `export OMP_NUM_THREADS=1` (single-threaded) + * MKL 2019 update 4 + * Requested threading via `export MKL_NUM_THREADS=1` (single-threaded) + * libxsmm 77a295c (1.6.5-6679) + * compiled with `make AVX=2`; linked with [netlib BLAS](http://www.netlib.org/blas/) 3.6.0 as the fallback library to better show where libxsmm stops handling the computation internally. +* Affinity: + * N/A. +* Frequency throttling (via `cpupower`): + * No changes made. +* Comments: + * libxsmm is highly competitive for very small problems, but quickly gives up once the "large" dimension exceeds about 180-240 (or 64 in the case where all operands are square). Also, libxsmm's `gemm` cannot handle a transposition on matrix A and similarly dispatches the fallback implementation for those cases. libxsmm also does not export CBLAS interfaces, and therefore only appears on the graphs for column-stored matrices. + +### Haswell results + +#### pdf + +* [Haswell row-stored](graphs/sup/dgemm_rrr_has_nt1.pdf) +* [Haswell column-stored](graphs/sup/dgemm_ccc_has_nt1.pdf) + +#### png (inline) + +* **Haswell row-stored** + +* **Haswell column-stored** + + +--- + ## Epyc ### Epyc experiment details @@ -171,24 +252,36 @@ size of interest so that we can better assist you. 
* Peak performance: * single-core: 24 GFLOPS (double-precision), 48 GFLOPS (single-precision) * Operating system: Ubuntu 18.04 (Linux kernel 4.15.0) -* Compiler: gcc 7.3.0 -* Results gathered: 31 May 2019, 3 June 2019 +* Page size: 4096 bytes +* Compiler: gcc 7.4.0 +* Driver source code directory: `test/sup` +* Results gathered: 23-28 August 2019 * Implementations tested: - * BLIS 6bf449c (0.5.2-42) + * BLIS 4a0a6e8 (0.6.0-28) * configured with `./configure --enable-cblas auto` * sub-configuration exercised: `zen` - * OpenBLAS 0.3.6 + * OpenBLAS 0.3.7 * configured `Makefile.rule` with `BINARY=64 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=0` (single-threaded) - * BLASFEO 75a3dd8 + * BLASFEO 01f6b7f * configured `Makefile.rule` with: `BLAS_API=1 FORTRAN_BLAS_API=1 CBLAS_API=1`. * Eigen 3.3.90 - * Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (30 May 2019) - * Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal). - * configured and built BLAS library via `mkdir build; cd build; cmake ..; make blas` + * Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (28 August 2019) + * Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal): + ``` + # These lines added after line 67. + check_cxx_compiler_flag("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE) + if(COMPILER_SUPPORTS_MARCH_NATIVE) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") + endif() + ``` + * configured and built BLAS library via `mkdir build; cd build; CC=gcc cmake ..; make blas` + * installed headers via `cmake . -DCMAKE_INSTALL_PREFIX=$HOME/flame/eigen; make install` * The `gemm` implementation was pulled in at compile-time via Eigen headers; other operations were linked to Eigen's BLAS library. 
* Requested threading via `export OMP_NUM_THREADS=1` (single-threaded) * MKL 2019 update 4 * Requested threading via `export MKL_NUM_THREADS=1` (single-threaded) + * libxsmm 77a295c (1.6.5-6679) + * compiled with `make AVX=2`; linked with [netlib BLAS](http://www.netlib.org/blas/) 3.6.0 as the fallback library to better show where libxsmm stops handling the computation internally. * Affinity: * N/A. * Frequency throttling (via `cpupower`): @@ -197,8 +290,7 @@ size of interest so that we can better assist you. * Hardware limits: 1.2GHz - 2.0GHz * Adjusted minimum: 2.0GHz * Comments: - * As with Kaby Lake, BLIS's new small/skinny matrix implementation is competitive with (or exceeds the performance of) the next highest-performing solution, except for a few cases of where the _k_ dimension is very small. - * For the classic case of `dgemm_nn` on square matrices, BLIS is the fastest implementation for the problem size range of approximately 12 to 256. BLIS is also competitive in this general range for other transpose parameter combinations (`nt`, `tn`, and `tt`). + * libxsmm is highly competitive for very small problems, but quickly gives up once the "large" dimension exceeds about 180-240 (or 64 in the case where all operands are square). Also, libxsmm's `gemm` cannot handle a transposition on matrix A and similarly dispatches the fallback implementation for those cases. libxsmm also does not export CBLAS interfaces, and therefore only appears on the graphs for column-stored matrices. 
### Epyc results diff --git a/docs/graphs/sup/dgemm_ccc_epyc_nt1.pdf b/docs/graphs/sup/dgemm_ccc_epyc_nt1.pdf index 0f6e07e9a0935c847435ce46d3e71bc36e091748..8bba344a7eeb65568cf6cf400a5534c1f8be07af 100644 Binary files a/docs/graphs/sup/dgemm_ccc_epyc_nt1.pdf and b/docs/graphs/sup/dgemm_ccc_epyc_nt1.pdf differ diff --git a/docs/graphs/sup/dgemm_ccc_epyc_nt1.png b/docs/graphs/sup/dgemm_ccc_epyc_nt1.png index 71eeb46c7e7fa8fd6b719a0a9e2211904217a778..bf409bf10580b38d2dea36033c8a81f46d7d4805 100644 Binary files a/docs/graphs/sup/dgemm_ccc_epyc_nt1.png and b/docs/graphs/sup/dgemm_ccc_epyc_nt1.png differ diff --git a/docs/graphs/sup/dgemm_ccc_has_nt1.pdf b/docs/graphs/sup/dgemm_ccc_has_nt1.pdf new file mode 100644 index 0000000000000000000000000000000000000000..2614c65a54b491c9ce2d1b0f6d3390e3e4be7f13 Binary files /dev/null and b/docs/graphs/sup/dgemm_ccc_has_nt1.pdf differ diff --git a/docs/graphs/sup/dgemm_ccc_has_nt1.png b/docs/graphs/sup/dgemm_ccc_has_nt1.png new file mode 100644 index 0000000000000000000000000000000000000000..34ea1eee47b4ec88030096754850b433cc30bfe6 Binary files /dev/null and b/docs/graphs/sup/dgemm_ccc_has_nt1.png differ diff --git a/docs/graphs/sup/dgemm_ccc_kbl_nt1.pdf b/docs/graphs/sup/dgemm_ccc_kbl_nt1.pdf index 9ff9de2023b519fe700d62ed661c4891d5c7ce62..43cdcc6872dea36bd8ba905b12b42192f12828fe 100644 Binary files a/docs/graphs/sup/dgemm_ccc_kbl_nt1.pdf and b/docs/graphs/sup/dgemm_ccc_kbl_nt1.pdf differ diff --git a/docs/graphs/sup/dgemm_ccc_kbl_nt1.png b/docs/graphs/sup/dgemm_ccc_kbl_nt1.png index 4f99f8f7fca1e6048f515c01ed5219e09ed57a01..fdf45868ae1fa43ed2e5eef678a8c11f1860b14d 100644 Binary files a/docs/graphs/sup/dgemm_ccc_kbl_nt1.png and b/docs/graphs/sup/dgemm_ccc_kbl_nt1.png differ diff --git a/docs/graphs/sup/dgemm_rrr_epyc_nt1.pdf b/docs/graphs/sup/dgemm_rrr_epyc_nt1.pdf index f010da9aa60c235e877e65b436114799670d5bc7..f09c9efc9fb8aa47b72cb5ee0871370843184bff 100644 Binary files a/docs/graphs/sup/dgemm_rrr_epyc_nt1.pdf and 
b/docs/graphs/sup/dgemm_rrr_epyc_nt1.pdf differ diff --git a/docs/graphs/sup/dgemm_rrr_epyc_nt1.png b/docs/graphs/sup/dgemm_rrr_epyc_nt1.png index 306bd40b0f7fbc3c7dcca8244346298efa43634e..8add499d80e5c80f95d66be7000130931d903f93 100644 Binary files a/docs/graphs/sup/dgemm_rrr_epyc_nt1.png and b/docs/graphs/sup/dgemm_rrr_epyc_nt1.png differ diff --git a/docs/graphs/sup/dgemm_rrr_has_nt1.pdf b/docs/graphs/sup/dgemm_rrr_has_nt1.pdf new file mode 100644 index 0000000000000000000000000000000000000000..e1dc609a04d9f2ef93c72a8a51f127ebf03008e1 Binary files /dev/null and b/docs/graphs/sup/dgemm_rrr_has_nt1.pdf differ diff --git a/docs/graphs/sup/dgemm_rrr_has_nt1.png b/docs/graphs/sup/dgemm_rrr_has_nt1.png new file mode 100644 index 0000000000000000000000000000000000000000..c8b47b85ac7b728b283173b43284c7ea9463927c Binary files /dev/null and b/docs/graphs/sup/dgemm_rrr_has_nt1.png differ diff --git a/docs/graphs/sup/dgemm_rrr_kbl_nt1.pdf b/docs/graphs/sup/dgemm_rrr_kbl_nt1.pdf index d104363d113df3a2ce24e47a1cd8a65251d78c78..10b674a22d3af6a5e130e8a3b8703130f431d2b2 100644 Binary files a/docs/graphs/sup/dgemm_rrr_kbl_nt1.pdf and b/docs/graphs/sup/dgemm_rrr_kbl_nt1.pdf differ diff --git a/docs/graphs/sup/dgemm_rrr_kbl_nt1.png b/docs/graphs/sup/dgemm_rrr_kbl_nt1.png index dbea1b96dc4ea91257555b129ae545fc10f1e059..310b2aad67a9d198a3c793834a01257a20da1fc6 100644 Binary files a/docs/graphs/sup/dgemm_rrr_kbl_nt1.png and b/docs/graphs/sup/dgemm_rrr_kbl_nt1.png differ diff --git a/frame/3/bli_l3_thrinfo.c b/frame/3/bli_l3_thrinfo.c index 1d876d50f1b45487607b299323ed0346ff31890a..4f073cb20ade98cbce44b96eeb036a031d1150bf 100644 --- a/frame/3/bli_l3_thrinfo.c +++ b/frame/3/bli_l3_thrinfo.c @@ -99,35 +99,84 @@ void bli_l3_thrinfo_print_gemm_paths thrinfo_t** threads ) { + // In order to query the number of threads, we query the only thread we + // know exists: thread 0. 
dim_t n_threads = bli_thread_num_threads( threads[0] ); - dim_t gl_id; - - thrinfo_t* jc_info = threads[0]; - thrinfo_t* pc_info = bli_thrinfo_sub_node( jc_info ); - thrinfo_t* pb_info = bli_thrinfo_sub_node( pc_info ); - thrinfo_t* ic_info = bli_thrinfo_sub_node( pb_info ); - thrinfo_t* pa_info = bli_thrinfo_sub_node( ic_info ); - thrinfo_t* jr_info = bli_thrinfo_sub_node( pa_info ); - thrinfo_t* ir_info = bli_thrinfo_sub_node( jr_info ); - - dim_t jc_way = bli_thread_n_way( jc_info ); - dim_t pc_way = bli_thread_n_way( pc_info ); - dim_t pb_way = bli_thread_n_way( pb_info ); - dim_t ic_way = bli_thread_n_way( ic_info ); - dim_t pa_way = bli_thread_n_way( pa_info ); - dim_t jr_way = bli_thread_n_way( jr_info ); - dim_t ir_way = bli_thread_n_way( ir_info ); - - dim_t jc_nt = bli_thread_num_threads( jc_info ); - dim_t pc_nt = bli_thread_num_threads( pc_info ); - dim_t pb_nt = bli_thread_num_threads( pb_info ); - dim_t ic_nt = bli_thread_num_threads( ic_info ); - dim_t pa_nt = bli_thread_num_threads( pa_info ); - dim_t jr_nt = bli_thread_num_threads( jr_info ); - dim_t ir_nt = bli_thread_num_threads( ir_info ); + + // For the purposes of printing the "header" information that is common + // to the various instances of a thrinfo_t (ie: across all threads), we + // choose the last thread in case the problem is so small that there is + // only an "edge" case, which will always be assigned to the last thread + // (at least for higher levels of partitioning). + thrinfo_t* jc_info = threads[n_threads-1]; + thrinfo_t* pc_info = NULL; + thrinfo_t* pb_info = NULL; + thrinfo_t* ic_info = NULL; + thrinfo_t* pa_info = NULL; + thrinfo_t* jr_info = NULL; + thrinfo_t* ir_info = NULL; + + // Initialize the n_ways and n_threads fields of each thrinfo_t "level" + // to -1. More than likely, these will all be overwritten with meaningful + // values, but in case some thrinfo_t trees are not fully built (see + // next comment), these will be the placeholder values. 
+ dim_t jc_way = -1, pc_way = -1, pb_way = -1, ic_way = -1, + pa_way = -1, jr_way = -1, ir_way = -1; + + dim_t jc_nt = -1, pc_nt = -1, pb_nt = -1, ic_nt = -1, + pa_nt = -1, jr_nt = -1, ir_nt = -1; + + // NOTE: We must check each thrinfo_t pointer for NULLness. Certain threads + // may not fully build their thrinfo_t structures--specifically when the + // dimension being parallelized is not large enough for each thread to have + // even one unit of work (where a unit is usually a single micropanel's + // width, MR or NR). + + if ( !jc_info ) goto print_header; + + jc_way = bli_thread_n_way( jc_info ); + jc_nt = bli_thread_num_threads( jc_info ); + pc_info = bli_thrinfo_sub_node( jc_info ); + + if ( !pc_info ) goto print_header; + + pc_way = bli_thread_n_way( pc_info ); + pc_nt = bli_thread_num_threads( pc_info ); + pb_info = bli_thrinfo_sub_node( pc_info ); + + if ( !pb_info ) goto print_header; + + pb_way = bli_thread_n_way( pb_info ); + pb_nt = bli_thread_num_threads( pb_info ); + ic_info = bli_thrinfo_sub_node( pb_info ); + + if ( !ic_info ) goto print_header; + + ic_way = bli_thread_n_way( ic_info ); + ic_nt = bli_thread_num_threads( ic_info ); + pa_info = bli_thrinfo_sub_node( ic_info ); + + if ( !pa_info ) goto print_header; + + pa_way = bli_thread_n_way( pa_info ); + pa_nt = bli_thread_num_threads( pa_info ); + jr_info = bli_thrinfo_sub_node( pa_info ); + + if ( !jr_info ) goto print_header; + + jr_way = bli_thread_n_way( jr_info ); + jr_nt = bli_thread_num_threads( jr_info ); + ir_info = bli_thrinfo_sub_node( jr_info ); + + if ( !ir_info ) goto print_header; + + ir_way = bli_thread_n_way( ir_info ); + ir_nt = bli_thread_num_threads( ir_info ); + + print_header: printf( " jc kc pb ic pa jr ir\n" ); - printf( "xx_nt: %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n", + printf( "xx_nt: %4ld %4ld %4ld %4ld %4ld %4ld %4ld\n", ( unsigned long )jc_nt, ( unsigned long )pc_nt, ( unsigned long )pb_nt, @@ -135,7 +184,7 @@ void bli_l3_thrinfo_print_gemm_paths ( unsigned long 
)pa_nt, ( unsigned long )jr_nt, ( unsigned long )ir_nt ); - printf( "xx_way: %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n", + printf( "xx_way: %4ld %4ld %4ld %4ld %4ld %4ld %4ld\n", ( unsigned long )jc_way, ( unsigned long )pc_way, ( unsigned long )pb_way, @@ -145,116 +194,59 @@ void bli_l3_thrinfo_print_gemm_paths ( unsigned long )ir_way ); printf( "============================================\n" ); - dim_t jc_comm_id; - dim_t pc_comm_id; - dim_t pb_comm_id; - dim_t ic_comm_id; - dim_t pa_comm_id; - dim_t jr_comm_id; - dim_t ir_comm_id; - - dim_t jc_work_id; - dim_t pc_work_id; - dim_t pb_work_id; - dim_t ic_work_id; - dim_t pa_work_id; - dim_t jr_work_id; - dim_t ir_work_id; - - for ( gl_id = 0; gl_id < n_threads; ++gl_id ) + for ( dim_t gl_id = 0; gl_id < n_threads; ++gl_id ) { jc_info = threads[gl_id]; - // NOTE: We must check each thrinfo_t pointer for NULLness. Certain threads - // may not fully build their thrinfo_t structures--specifically when the - // dimension being parallelized is not large enough for each thread to have - // even one unit of work (where as unit is usually a single micropanel's - // width, MR or NR). 
- if ( !jc_info ) - { - jc_comm_id = pc_comm_id = pb_comm_id = ic_comm_id = pa_comm_id = jr_comm_id = ir_comm_id = -1; - jc_work_id = pc_work_id = pb_work_id = ic_work_id = pa_work_id = jr_work_id = ir_work_id = -1; - } - else - { - jc_comm_id = bli_thread_ocomm_id( jc_info ); - jc_work_id = bli_thread_work_id( jc_info ); - pc_info = bli_thrinfo_sub_node( jc_info ); + dim_t jc_comm_id = -1, pc_comm_id = -1, pb_comm_id = -1, ic_comm_id = -1, + pa_comm_id = -1, jr_comm_id = -1, ir_comm_id = -1; - if ( !pc_info ) - { - pc_comm_id = pb_comm_id = ic_comm_id = pa_comm_id = jr_comm_id = ir_comm_id = -1; - pc_work_id = pb_work_id = ic_work_id = pa_work_id = jr_work_id = ir_work_id = -1; - } - else - { - pc_comm_id = bli_thread_ocomm_id( pc_info ); - pc_work_id = bli_thread_work_id( pc_info ); - pb_info = bli_thrinfo_sub_node( pc_info ); + dim_t jc_work_id = -1, pc_work_id = -1, pb_work_id = -1, ic_work_id = -1, + pa_work_id = -1, jr_work_id = -1, ir_work_id = -1; - if ( !pb_info ) - { - pb_comm_id = ic_comm_id = pa_comm_id = jr_comm_id = ir_comm_id = -1; - pb_work_id = ic_work_id = pa_work_id = jr_work_id = ir_work_id = -1; - } - else - { - pb_comm_id = bli_thread_ocomm_id( pb_info ); - pb_work_id = bli_thread_work_id( pb_info ); - ic_info = bli_thrinfo_sub_node( pb_info ); + if ( !jc_info ) goto print_thrinfo; - if ( !ic_info ) - { - ic_comm_id = pa_comm_id = jr_comm_id = ir_comm_id = -1; - ic_work_id = pa_work_id = jr_work_id = ir_work_id = -1; - } - else - { - ic_comm_id = bli_thread_ocomm_id( ic_info ); - ic_work_id = bli_thread_work_id( ic_info ); - pa_info = bli_thrinfo_sub_node( ic_info ); + jc_comm_id = bli_thread_ocomm_id( jc_info ); + jc_work_id = bli_thread_work_id( jc_info ); + pc_info = bli_thrinfo_sub_node( jc_info ); - if ( !pa_info ) - { - pa_comm_id = jr_comm_id = ir_comm_id = -1; - pa_work_id = jr_work_id = ir_work_id = -1; - } - else - { - pa_comm_id = bli_thread_ocomm_id( pa_info ); - pa_work_id = bli_thread_work_id( pa_info ); - jr_info = 
bli_thrinfo_sub_node( pa_info ); + if ( !pc_info ) goto print_thrinfo; - if ( !jr_info ) - { - jr_comm_id = ir_comm_id = -1; - jr_work_id = ir_work_id = -1; - } - else - { - jr_comm_id = bli_thread_ocomm_id( jr_info ); - jr_work_id = bli_thread_work_id( jr_info ); - ir_info = bli_thrinfo_sub_node( jr_info ); + pc_comm_id = bli_thread_ocomm_id( pc_info ); + pc_work_id = bli_thread_work_id( pc_info ); + pb_info = bli_thrinfo_sub_node( pc_info ); - if ( !ir_info ) - { - ir_comm_id = -1; - ir_work_id = -1; - } - else - { - ir_comm_id = bli_thread_ocomm_id( ir_info ); - ir_work_id = bli_thread_work_id( ir_info ); - } - } - } - } - } - } - } + if ( !pb_info ) goto print_thrinfo; + + pb_comm_id = bli_thread_ocomm_id( pb_info ); + pb_work_id = bli_thread_work_id( pb_info ); + ic_info = bli_thrinfo_sub_node( pb_info ); + + if ( !ic_info ) goto print_thrinfo; + + ic_comm_id = bli_thread_ocomm_id( ic_info ); + ic_work_id = bli_thread_work_id( ic_info ); + pa_info = bli_thrinfo_sub_node( ic_info ); + + if ( !pa_info ) goto print_thrinfo; + + pa_comm_id = bli_thread_ocomm_id( pa_info ); + pa_work_id = bli_thread_work_id( pa_info ); + jr_info = bli_thrinfo_sub_node( pa_info ); + + if ( !jr_info ) goto print_thrinfo; + + jr_comm_id = bli_thread_ocomm_id( jr_info ); + jr_work_id = bli_thread_work_id( jr_info ); + ir_info = bli_thrinfo_sub_node( jr_info ); + + if ( !ir_info ) goto print_thrinfo; + + ir_comm_id = bli_thread_ocomm_id( ir_info ); + ir_work_id = bli_thread_work_id( ir_info ); + + print_thrinfo: - //printf( " gl jc pb kc pa ic jr \n" ); - //printf( " gl jc kc pb ic pa jr \n" ); printf( "comm ids: %4ld %4ld %4ld %4ld %4ld %4ld %4ld\n", ( long )jc_comm_id, ( long )pc_comm_id, @@ -285,44 +277,105 @@ void bli_l3_thrinfo_print_trsm_paths thrinfo_t** threads ) { + // In order to query the number of threads, we query the only thread we + // know exists: thread 0. 
dim_t n_threads = bli_thread_num_threads( threads[0] ); - dim_t gl_id; - - thrinfo_t* jc_info = threads[0]; - thrinfo_t* pc_info = bli_thrinfo_sub_node( jc_info ); - thrinfo_t* pb_info = bli_thrinfo_sub_node( pc_info ); - thrinfo_t* ic_info = bli_thrinfo_sub_node( pb_info ); - - thrinfo_t* pa_info = bli_thrinfo_sub_node( ic_info ); - thrinfo_t* jr_info = bli_thrinfo_sub_node( pa_info ); - thrinfo_t* ir_info = bli_thrinfo_sub_node( jr_info ); - thrinfo_t* pa_info0 = bli_thrinfo_sub_prenode( ic_info ); - thrinfo_t* jr_info0 = ( pa_info0 ? bli_thrinfo_sub_node( pa_info0 ) : NULL ); - thrinfo_t* ir_info0 = ( jr_info0 ? bli_thrinfo_sub_node( jr_info0 ) : NULL ); - - dim_t jc_way = bli_thread_n_way( jc_info ); - dim_t pc_way = bli_thread_n_way( pc_info ); - dim_t pb_way = bli_thread_n_way( pb_info ); - dim_t ic_way = bli_thread_n_way( ic_info ); - - dim_t pa_way = bli_thread_n_way( pa_info ); - dim_t jr_way = bli_thread_n_way( jr_info ); - dim_t ir_way = bli_thread_n_way( ir_info ); - dim_t pa_way0 = ( pa_info0 ? bli_thread_n_way( pa_info0 ) : -1 ); - dim_t jr_way0 = ( jr_info0 ? bli_thread_n_way( jr_info0 ) : -1 ); - dim_t ir_way0 = ( ir_info0 ? bli_thread_n_way( ir_info0 ) : -1 ); - - dim_t jc_nt = bli_thread_num_threads( jc_info ); - dim_t pc_nt = bli_thread_num_threads( pc_info ); - dim_t pb_nt = bli_thread_num_threads( pb_info ); - dim_t ic_nt = bli_thread_num_threads( ic_info ); - - dim_t pa_nt = bli_thread_num_threads( pa_info ); - dim_t jr_nt = bli_thread_num_threads( jr_info ); - dim_t ir_nt = bli_thread_num_threads( ir_info ); - dim_t pa_nt0 = ( pa_info0 ? bli_thread_num_threads( pa_info0 ) : -1 ); - dim_t jr_nt0 = ( jr_info0 ? bli_thread_num_threads( jr_info0 ) : -1 ); - dim_t ir_nt0 = ( ir_info0 ? 
bli_thread_num_threads( ir_info0 ) : -1 ); + + // For the purposes of printing the "header" information that is common + // to the various instances of a thrinfo_t (ie: across all threads), we + // choose the last thread in case the problem is so small that there is + // only an "edge" case, which will always be assigned to the last thread + // (at least for higher levels of partitioning). + thrinfo_t* jc_info = threads[n_threads-1]; + thrinfo_t* pc_info = NULL; + thrinfo_t* pb_info = NULL; + thrinfo_t* ic_info = NULL; + thrinfo_t* pa_info = NULL; thrinfo_t* pa_info0 = NULL; + thrinfo_t* jr_info = NULL; thrinfo_t* jr_info0 = NULL; + thrinfo_t* ir_info = NULL; thrinfo_t* ir_info0 = NULL; + + // Initialize the n_ways and n_threads fields of each thrinfo_t "level" + // to -1. More than likely, these will all be overwritten with meaningful + // values, but in case some thrinfo_t trees are not fully built (see + // next comment), these will be the placeholder values. + dim_t jc_way = -1, pc_way = -1, pb_way = -1, ic_way = -1, + pa_way = -1, jr_way = -1, ir_way = -1, + pa_way0 = -1, jr_way0 = -1, ir_way0 = -1; + + dim_t jc_nt = -1, pc_nt = -1, pb_nt = -1, ic_nt = -1, + pa_nt = -1, jr_nt = -1, ir_nt = -1, + pa_nt0 = -1, jr_nt0 = -1, ir_nt0 = -1; + + // NOTE: We must check each thrinfo_t pointer for NULLness. Certain threads + // may not fully build their thrinfo_t structures--specifically when the + // dimension being parallelized is not large enough for each thread to have + // even one unit of work (where a unit is usually a single micropanel's + // width, MR or NR). 
+ + if ( !jc_info ) goto print_header; + + jc_way = bli_thread_n_way( jc_info ); + jc_nt = bli_thread_num_threads( jc_info ); + pc_info = bli_thrinfo_sub_node( jc_info ); + + if ( !pc_info ) goto print_header; + + pc_way = bli_thread_n_way( pc_info ); + pc_nt = bli_thread_num_threads( pc_info ); + pb_info = bli_thrinfo_sub_node( pc_info ); + + if ( !pb_info ) goto print_header; + + pb_way = bli_thread_n_way( pb_info ); + pb_nt = bli_thread_num_threads( pb_info ); + ic_info = bli_thrinfo_sub_node( pb_info ); + + if ( !ic_info ) goto print_header; + + ic_way = bli_thread_n_way( ic_info ); + ic_nt = bli_thread_num_threads( ic_info ); + pa_info = bli_thrinfo_sub_node( ic_info ); + pa_info0 = bli_thrinfo_sub_prenode( ic_info ); + + // check_header_prenode: + + if ( !pa_info0 ) goto check_header_node; + + pa_way0 = bli_thread_n_way( pa_info0 ); + pa_nt0 = bli_thread_num_threads( pa_info0 ); + jr_info0 = bli_thrinfo_sub_node( pa_info0 ); + + if ( !jr_info0 ) goto check_header_node; + + jr_way0 = bli_thread_n_way( jr_info0 ); + jr_nt0 = bli_thread_num_threads( jr_info0 ); + ir_info0 = bli_thrinfo_sub_node( jr_info0 ); + + if ( !ir_info0 ) goto check_header_node; + + ir_way0 = bli_thread_n_way( ir_info0 ); + ir_nt0 = bli_thread_num_threads( ir_info0 ); + + check_header_node: + + if ( !pa_info ) goto print_header; + + pa_way = bli_thread_n_way( pa_info ); + pa_nt = bli_thread_num_threads( pa_info ); + jr_info = bli_thrinfo_sub_node( pa_info ); + + if ( !jr_info ) goto print_header; + + jr_way = bli_thread_n_way( jr_info ); + jr_nt = bli_thread_num_threads( jr_info ); + ir_info = bli_thrinfo_sub_node( jr_info ); + + if ( !ir_info ) goto print_header; + + ir_way = bli_thread_n_way( ir_info ); + ir_nt = bli_thread_num_threads( ir_info ); + + print_header: printf( " jc kc pb ic pa jr ir\n" ); printf( "xx_nt: %4ld %4ld %4ld %4ld %2ld|%2ld %2ld|%2ld %2ld|%2ld\n", @@ -343,26 +396,105 @@ void bli_l3_thrinfo_print_trsm_paths ( long )ir_way0, ( long )ir_way ); printf( 
"==================================================\n" ); - dim_t jc_comm_id; - dim_t pc_comm_id; - dim_t pb_comm_id; - dim_t ic_comm_id; - dim_t pa_comm_id0, pa_comm_id; - dim_t jr_comm_id0, jr_comm_id; - dim_t ir_comm_id0, ir_comm_id; - - dim_t jc_work_id; - dim_t pc_work_id; - dim_t pb_work_id; - dim_t ic_work_id; - dim_t pa_work_id0, pa_work_id; - dim_t jr_work_id0, jr_work_id; - dim_t ir_work_id0, ir_work_id; - - for ( gl_id = 0; gl_id < n_threads; ++gl_id ) + + for ( dim_t gl_id = 0; gl_id < n_threads; ++gl_id ) { jc_info = threads[gl_id]; +#if 1 + // NOTE: This cpp branch contains code that is safe to execute + // for small problems that are parallelized enough that one or + // more threads gets no work. + + dim_t jc_comm_id = -1, pc_comm_id = -1, pb_comm_id = -1, ic_comm_id = -1, + pa_comm_id = -1, jr_comm_id = -1, ir_comm_id = -1, + pa_comm_id0 = -1, jr_comm_id0 = -1, ir_comm_id0 = -1; + + dim_t jc_work_id = -1, pc_work_id = -1, pb_work_id = -1, ic_work_id = -1, + pa_work_id = -1, jr_work_id = -1, ir_work_id = -1, + pa_work_id0 = -1, jr_work_id0 = -1, ir_work_id0 = -1; + + if ( !jc_info ) goto print_thrinfo; + + jc_comm_id = bli_thread_ocomm_id( jc_info ); + jc_work_id = bli_thread_work_id( jc_info ); + pc_info = bli_thrinfo_sub_node( jc_info ); + + if ( !pc_info ) goto print_thrinfo; + + pc_comm_id = bli_thread_ocomm_id( pc_info ); + pc_work_id = bli_thread_work_id( pc_info ); + pb_info = bli_thrinfo_sub_node( pc_info ); + + if ( !pb_info ) goto print_thrinfo; + + pb_comm_id = bli_thread_ocomm_id( pb_info ); + pb_work_id = bli_thread_work_id( pb_info ); + ic_info = bli_thrinfo_sub_node( pb_info ); + + if ( !ic_info ) goto print_thrinfo; + + ic_comm_id = bli_thread_ocomm_id( ic_info ); + ic_work_id = bli_thread_work_id( ic_info ); + pa_info = bli_thrinfo_sub_node( ic_info ); + pa_info0 = bli_thrinfo_sub_prenode( ic_info ); + + // check_thrinfo_prenode: + + if ( !pa_info0 ) goto check_thrinfo_node; + + pa_comm_id0 = bli_thread_ocomm_id( pa_info0 ); + 
pa_work_id0 = bli_thread_work_id( pa_info0 ); + jr_info0 = bli_thrinfo_sub_node( pa_info0 ); + + if ( !jr_info0 ) goto check_thrinfo_node; + + jr_comm_id0 = bli_thread_ocomm_id( jr_info0 ); + jr_work_id0 = bli_thread_work_id( jr_info0 ); + ir_info0 = bli_thrinfo_sub_node( jr_info0 ); + + if ( !ir_info0 ) goto check_thrinfo_node; + + ir_comm_id0 = bli_thread_ocomm_id( ir_info0 ); + ir_work_id0 = bli_thread_work_id( ir_info0 ); + + check_thrinfo_node: + + if ( !pa_info ) goto print_thrinfo; + + pa_comm_id = bli_thread_ocomm_id( pa_info ); + pa_work_id = bli_thread_work_id( pa_info ); + jr_info = bli_thrinfo_sub_node( pa_info ); + + if ( !jr_info ) goto print_thrinfo; + + jr_comm_id = bli_thread_ocomm_id( jr_info ); + jr_work_id = bli_thread_work_id( jr_info ); + ir_info = bli_thrinfo_sub_node( jr_info ); + + if ( !ir_info ) goto print_thrinfo; + + ir_comm_id = bli_thread_ocomm_id( ir_info ); + ir_work_id = bli_thread_work_id( ir_info ); + + print_thrinfo: +#else + dim_t jc_comm_id; + dim_t pc_comm_id; + dim_t pb_comm_id; + dim_t ic_comm_id; + dim_t pa_comm_id0, pa_comm_id; + dim_t jr_comm_id0, jr_comm_id; + dim_t ir_comm_id0, ir_comm_id; + + dim_t jc_work_id; + dim_t pc_work_id; + dim_t pb_work_id; + dim_t ic_work_id; + dim_t pa_work_id0, pa_work_id; + dim_t jr_work_id0, jr_work_id; + dim_t ir_work_id0, ir_work_id; + // NOTE: We must check each thrinfo_t pointer for NULLness. 
Certain threads // may not fully build their thrinfo_t structures--specifically when the // dimension being parallelized is not large enough for each thread to have @@ -488,6 +620,7 @@ void bli_l3_thrinfo_print_trsm_paths } } } +#endif printf( "comm ids: %4ld %4ld %4ld %4ld %2ld|%2ld %2ld|%2ld %2ld|%2ld\n", ( long )jc_comm_id, diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h index 818534b1e95c6f70cd5f8f666cd6e925df71ac62..fae7b5f6ed951d757a22b57bd2bd76588bc980e5 100644 --- a/frame/base/bli_cntx.h +++ b/frame/base/bli_cntx.h @@ -648,6 +648,22 @@ static void bli_cntx_set_blksz( bszid_t bs_id, blksz_t* blksz, bszid_t mult_id, bmults[ bs_id ] = mult_id; } +static void bli_cntx_set_blksz_def_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) +{ + blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); + blksz_t* blksz = &blkszs[ bs_id ]; + + bli_blksz_set_def( bs, dt, blksz ); +} + +static void bli_cntx_set_blksz_max_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) +{ + blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); + blksz_t* blksz = &blkszs[ bs_id ]; + + bli_blksz_set_max( bs, dt, blksz ); +} + static void bli_cntx_set_l3_vir_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c index fce31bcfa24dd5a77a1a245b1f439494dc9de7ec..c54592377c9bbba0a88421040a17aa53baa283b6 100644 --- a/frame/base/bli_gks.c +++ b/frame/base/bli_gks.c @@ -141,7 +141,7 @@ void bli_gks_init( void ) bli_cntx_init_cortexa57_ind ); #endif #ifdef BLIS_CONFIG_CORTEXA53 - bli_gks_register_cntx( BLIS_ARCH_CORTEXA57, bli_cntx_init_cortexa53, + bli_gks_register_cntx( BLIS_ARCH_CORTEXA53, bli_cntx_init_cortexa53, bli_cntx_init_cortexa53_ref, bli_cntx_init_cortexa53_ind ); #endif diff --git a/ref_kernels/ind/bli_gemmtrsm4m1_ref.c b/ref_kernels/ind/bli_gemmtrsm4m1_ref.c index 1b2205c8d711d95f9720221b337435ce7c785018..0988c457da04a4f63dcad8a1b5978373fdc536ec 100644 --- 
a/ref_kernels/ind/bli_gemmtrsm4m1_ref.c +++ b/ref_kernels/ind/bli_gemmtrsm4m1_ref.c @@ -84,6 +84,14 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ ctype_r* restrict one_r = PASTEMAC(chr,1); \ ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \ +\ + /* A hack to avoid a 'restrict' warning triggered by passing in the + same address (one_r) for both alpha and beta when calling the last + of the four matrix products. We now use one_r for alpha and this + new local variable, onel, for beta. (See issue #328.) */ \ + ctype_r onel; \ + ctype_r* restrict onel_r = &onel; \ + PASTEMAC(chr,set1s)( onel ); \ \ ctype_r alpha_r = PASTEMAC(ch,real)( *alpha ); \ ctype_r alpha_i = PASTEMAC(ch,imag)( *alpha ); \ @@ -187,7 +195,7 @@ PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: bx111p_i", k+m, n, \ one_r, \ a1x_i, \ bx1_i, \ - one_r, \ + onel_r, \ b11_r, rs_b, cs_b, \ data, \ cntx \ diff --git a/test/1m4m/Makefile b/test/1m4m/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..74c0804cac56363e5d3669b0728295ff6a960c79 --- /dev/null +++ b/test/1m4m/Makefile @@ -0,0 +1,515 @@ +#!/bin/bash +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2014, The University of Texas at Austin +# Copyright (C) 2018, Advanced Micro Devices, Inc. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# - Neither the name(s) of the copyright holder(s) nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + +# +# Makefile +# +# Field G. Van Zee +# +# Makefile for standalone BLIS test drivers. +# + +# +# --- Makefile PHONY target definitions ---------------------------------------- +# + +.PHONY: all \ + clean cleanx + + + +# +# --- Determine makefile fragment location ------------------------------------- +# + +# Comments: +# - DIST_PATH is assumed to not exist if BLIS_INSTALL_PATH is given. +# - We must use recursively expanded assignment for LIB_PATH and INC_PATH in +# the second case because CONFIG_NAME is not yet set. +ifneq ($(strip $(BLIS_INSTALL_PATH)),) +LIB_PATH := $(BLIS_INSTALL_PATH)/lib +INC_PATH := $(BLIS_INSTALL_PATH)/include/blis +SHARE_PATH := $(BLIS_INSTALL_PATH)/share/blis +else +DIST_PATH := ../.. +LIB_PATH = ../../lib/$(CONFIG_NAME) +INC_PATH = ../../include/$(CONFIG_NAME) +SHARE_PATH := ../.. +endif + + + +# +# --- Include common makefile definitions -------------------------------------- +# + +# Include the common makefile fragment. 
+-include $(SHARE_PATH)/common.mk + + + +# +# --- BLAS implementations ----------------------------------------------------- +# + +# BLAS library path(s). This is where the BLAS libraries reside. +HOME_LIB_PATH := $(HOME)/flame/lib + +# OpenBLAS +OPENBLAS_LIB := $(HOME_LIB_PATH)/libopenblas.a +OPENBLASP_LIB := $(HOME_LIB_PATH)/libopenblasp.a + +# ATLAS +#ATLAS_LIB := $(HOME_LIB_PATH)/libf77blas.a \ +# $(HOME_LIB_PATH)/libatlas.a + +# Eigen +EIGEN_INC := $(HOME)/flame/eigen/include/eigen3 +EIGEN_LIB := $(HOME_LIB_PATH)/libeigen_blas_static.a +EIGENP_LIB := $(EIGEN_LIB) + +# MKL +MKL_LIB_PATH := $(HOME)/intel/mkl/lib/intel64 +MKL_LIB := -L$(MKL_LIB_PATH) \ + -lmkl_intel_lp64 \ + -lmkl_core \ + -lmkl_sequential \ + -lpthread -lm -ldl +#MKLP_LIB := -L$(MKL_LIB_PATH) \ +# -lmkl_intel_thread \ +# -lmkl_core \ +# -lmkl_intel_ilp64 \ +# -L$(ICC_LIB_PATH) \ +# -liomp5 +MKLP_LIB := -L$(MKL_LIB_PATH) \ + -lmkl_intel_lp64 \ + -lmkl_core \ + -lmkl_gnu_thread \ + -lpthread -lm -ldl -fopenmp + #-L$(ICC_LIB_PATH) \ + #-lgomp + +VENDOR_LIB := $(MKL_LIB) +VENDORP_LIB := $(MKLP_LIB) + + +# +# --- Problem size definitions ------------------------------------------------- +# + +# Single core (single-threaded) +PS_BEGIN := 48 +PS_MAX := 2400 +PS_INC := 48 + +# Single-socket (multithreaded) +P1_BEGIN := 96 +P1_MAX := 4800 +P1_INC := 96 + +# Dual-socket (multithreaded) +P2_BEGIN := 144 +P2_MAX := 7200 +P2_INC := 144 + + +# +# --- General build definitions ------------------------------------------------ +# + +TEST_SRC_PATH := . +TEST_OBJ_PATH := . + +# Gather all local object files. +TEST_OBJS := $(sort $(patsubst $(TEST_SRC_PATH)/%.c, \ + $(TEST_OBJ_PATH)/%.o, \ + $(wildcard $(TEST_SRC_PATH)/*.c))) + +# Override the value of CINCFLAGS so that the value of CFLAGS returned by +# get-user-cflags-for() is not cluttered up with include paths needed only +# while building BLIS. +CINCFLAGS := -I$(INC_PATH) + +# Use the "framework" CFLAGS for the configuration family. 
+CFLAGS := $(call get-user-cflags-for,$(CONFIG_NAME)) + +# Add local header paths to CFLAGS. +CFLAGS += -I$(TEST_SRC_PATH) + +# Locate the libblis library to which we will link. +#LIBBLIS_LINK := $(LIB_PATH)/$(LIBBLIS_L) + +# Define a set of CFLAGS for use with C++ and Eigen. +CXXFLAGS := $(subst -std=c99,-std=c++11,$(CFLAGS)) +CXXFLAGS += -I$(EIGEN_INC) + +# Create a copy of CXXFLAGS without -fopenmp in order to disable multithreading. +CXXFLAGS_ST := -march=native $(subst -fopenmp,,$(CXXFLAGS)) +CXXFLAGS_MT := -march=native $(CXXFLAGS) + + +# Which library? +BLI_DEF := -DBLIS +BLA_DEF := -DBLAS +EIG_DEF := -DEIGEN + +# Complex implementation type +D3MHW := -DIND=BLIS_3MH +D3M1 := -DIND=BLIS_3M1 +D4MHW := -DIND=BLIS_4MH +D4M1B := -DIND=BLIS_4M1B +D4M1A := -DIND=BLIS_4M1A +D1M := -DIND=BLIS_1M +DNAT := -DIND=BLIS_NAT + +# Implementation string +#STR_3MHW := -DSTR=\"3mhw\" +#STR_3M1 := -DSTR=\"3m1\" +#STR_4MHW := -DSTR=\"4mhw\" +#STR_4M1B := -DSTR=\"4m1b\" +STR_4M1A := -DSTR=\"4m1a_blis\" +STR_1M := -DSTR=\"1m_blis\" +STR_NAT := -DSTR=\"asm_blis\" +STR_OBL := -DSTR=\"openblas\" +STR_EIG := -DSTR=\"eigen\" +STR_VEN := -DSTR=\"vendor\" + +# Single or multithreaded string +STR_ST := -DTHR_STR=\"st\" +STR_1S := -DTHR_STR=\"1s\" +STR_2S := -DTHR_STR=\"2s\" + +# Problem size specification +PDEF_ST := -DP_BEGIN=$(PS_BEGIN) -DP_INC=$(PS_INC) -DP_MAX=$(PS_MAX) +PDEF_1S := -DP_BEGIN=$(P1_BEGIN) -DP_INC=$(P1_INC) -DP_MAX=$(P1_MAX) +PDEF_2S := -DP_BEGIN=$(P2_BEGIN) -DP_INC=$(P2_INC) -DP_MAX=$(P2_MAX) + + + +# +# --- Targets/rules ------------------------------------------------------------ +# + +all: all-st all-1s all-2s +blis: blis-st blis-1s blis-2s +openblas: openblas-st openblas-1s openblas-2s +eigen: eigen-st eigen-1s eigen-2s +vendor: vendor-st vendor-1s vendor-2s +mkl: vendor +armpl: vendor + +all-st: blis-st openblas-st mkl-st +all-1s: blis-1s openblas-1s mkl-1s +all-2s: blis-2s openblas-2s mkl-2s + +blis-st: blis-nat-st blis-1m-st blis-4m1a-st +blis-1s: blis-nat-1s 
blis-1m-1s blis-4m1a-1s +blis-2s: blis-nat-2s blis-1m-2s blis-4m1a-2s + +#blis-ind: blis-ind-st blis-ind-mt +blis-nat: blis-nat-st blis-nat-1s blis-nat-2s +blis-1m: blis-1m-st blis-1m-1s blis-1m-2s +blis-4m1a: blis-4m1a-st blis-4m1a-1s blis-4m1a-2s + +# Define the datatypes, operations, and implementations. +DTS := s d c z +OPS := gemm +BIMPLS := asm_blis 4m1a_blis 1m_blis openblas vendor +EIMPLS := eigen + +# Define functions to construct object filenames from the datatypes and +# operations given an implementation. We define one function for single- +# threaded, single-socket, and dual-socket filenames. +get-st-objs = $(foreach dt,$(DTS),$(foreach op,$(OPS),test_$(dt)$(op)_$(PS_MAX)_$(1)_st.o)) +get-1s-objs = $(foreach dt,$(DTS),$(foreach op,$(OPS),test_$(dt)$(op)_$(P1_MAX)_$(1)_1s.o)) +get-2s-objs = $(foreach dt,$(DTS),$(foreach op,$(OPS),test_$(dt)$(op)_$(P2_MAX)_$(1)_2s.o)) + +# Construct object and binary names for single-threaded, single-socket, and +# dual-socket files for BLIS, OpenBLAS, and a vendor library (e.g. MKL). 
+BLIS_1M_ST_OBJS := $(call get-st-objs,1m_blis)
+BLIS_1M_ST_BINS := $(patsubst %.o,%.x,$(BLIS_1M_ST_OBJS))
+BLIS_1M_1S_OBJS := $(call get-1s-objs,1m_blis)
+BLIS_1M_1S_BINS := $(patsubst %.o,%.x,$(BLIS_1M_1S_OBJS))
+BLIS_1M_2S_OBJS := $(call get-2s-objs,1m_blis)
+BLIS_1M_2S_BINS := $(patsubst %.o,%.x,$(BLIS_1M_2S_OBJS))
+
+BLIS_4M1A_ST_OBJS := $(call get-st-objs,4m1a_blis)
+BLIS_4M1A_ST_BINS := $(patsubst %.o,%.x,$(BLIS_4M1A_ST_OBJS))
+BLIS_4M1A_1S_OBJS := $(call get-1s-objs,4m1a_blis)
+BLIS_4M1A_1S_BINS := $(patsubst %.o,%.x,$(BLIS_4M1A_1S_OBJS))
+BLIS_4M1A_2S_OBJS := $(call get-2s-objs,4m1a_blis)
+BLIS_4M1A_2S_BINS := $(patsubst %.o,%.x,$(BLIS_4M1A_2S_OBJS))
+
+BLIS_NAT_ST_OBJS := $(call get-st-objs,asm_blis)
+BLIS_NAT_ST_BINS := $(patsubst %.o,%.x,$(BLIS_NAT_ST_OBJS))
+BLIS_NAT_1S_OBJS := $(call get-1s-objs,asm_blis)
+BLIS_NAT_1S_BINS := $(patsubst %.o,%.x,$(BLIS_NAT_1S_OBJS))
+BLIS_NAT_2S_OBJS := $(call get-2s-objs,asm_blis)
+BLIS_NAT_2S_BINS := $(patsubst %.o,%.x,$(BLIS_NAT_2S_OBJS))
+
+OPENBLAS_ST_OBJS := $(call get-st-objs,openblas)
+OPENBLAS_ST_BINS := $(patsubst %.o,%.x,$(OPENBLAS_ST_OBJS))
+OPENBLAS_1S_OBJS := $(call get-1s-objs,openblas)
+OPENBLAS_1S_BINS := $(patsubst %.o,%.x,$(OPENBLAS_1S_OBJS))
+OPENBLAS_2S_OBJS := $(call get-2s-objs,openblas)
+OPENBLAS_2S_BINS := $(patsubst %.o,%.x,$(OPENBLAS_2S_OBJS))
+
+EIGEN_ST_OBJS := $(call get-st-objs,eigen)
+EIGEN_ST_BINS := $(patsubst %.o,%.x,$(EIGEN_ST_OBJS))
+EIGEN_1S_OBJS := $(call get-1s-objs,eigen)
+EIGEN_1S_BINS := $(patsubst %.o,%.x,$(EIGEN_1S_OBJS))
+EIGEN_2S_OBJS := $(call get-2s-objs,eigen)
+EIGEN_2S_BINS := $(patsubst %.o,%.x,$(EIGEN_2S_OBJS))
+
+VENDOR_ST_OBJS := $(call get-st-objs,vendor)
+VENDOR_ST_BINS := $(patsubst %.o,%.x,$(VENDOR_ST_OBJS))
+VENDOR_1S_OBJS := $(call get-1s-objs,vendor)
+VENDOR_1S_BINS := $(patsubst %.o,%.x,$(VENDOR_1S_OBJS))
+VENDOR_2S_OBJS := $(call get-2s-objs,vendor)
+VENDOR_2S_BINS := $(patsubst %.o,%.x,$(VENDOR_2S_OBJS))
+
+# Define some targets associated with the above object/binary files.
+# These targets are aliases for groups of binaries, not files that get
+# created, so they are declared phony: a stray file that happens to share one
+# of these names must not make the alias look up to date.
+.PHONY: blis-nat-st blis-nat-1s blis-nat-2s \
+        blis-1m-st blis-1m-1s blis-1m-2s \
+        blis-4m1a-st blis-4m1a-1s blis-4m1a-2s \
+        openblas-st openblas-1s openblas-2s \
+        eigen-st eigen-1s eigen-2s \
+        vendor-st vendor-1s vendor-2s \
+        mkl-st mkl-1s mkl-2s \
+        armpl-st armpl-1s armpl-2s
+
+blis-nat-st: $(BLIS_NAT_ST_BINS)
+blis-nat-1s: $(BLIS_NAT_1S_BINS)
+blis-nat-2s: $(BLIS_NAT_2S_BINS)
+
+blis-1m-st: $(BLIS_1M_ST_BINS)
+blis-1m-1s: $(BLIS_1M_1S_BINS)
+blis-1m-2s: $(BLIS_1M_2S_BINS)
+
+blis-4m1a-st: $(BLIS_4M1A_ST_BINS)
+blis-4m1a-1s: $(BLIS_4M1A_1S_BINS)
+blis-4m1a-2s: $(BLIS_4M1A_2S_BINS)
+
+openblas-st: $(OPENBLAS_ST_BINS)
+openblas-1s: $(OPENBLAS_1S_BINS)
+openblas-2s: $(OPENBLAS_2S_BINS)
+
+eigen-st: $(EIGEN_ST_BINS)
+eigen-1s: $(EIGEN_1S_BINS)
+eigen-2s: $(EIGEN_2S_BINS)
+
+vendor-st: $(VENDOR_ST_BINS)
+vendor-1s: $(VENDOR_1S_BINS)
+vendor-2s: $(VENDOR_2S_BINS)
+
+# The mkl-* and armpl-* names are conveniences that forward to the generic
+# vendor-* targets.
+mkl-st: vendor-st
+mkl-1s: vendor-1s
+mkl-2s: vendor-2s
+
+armpl-st: vendor-st
+armpl-1s: vendor-1s
+armpl-2s: vendor-2s
+
+# Mark the object files as intermediate so that make will remove them
+# automatically after building the binaries on which they depend.
+.INTERMEDIATE: $(BLIS_NAT_ST_OBJS) $(BLIS_NAT_1S_OBJS) $(BLIS_NAT_2S_OBJS)
+.INTERMEDIATE: $(BLIS_1M_ST_OBJS) $(BLIS_1M_1S_OBJS) $(BLIS_1M_2S_OBJS)
+.INTERMEDIATE: $(BLIS_4M1A_ST_OBJS) $(BLIS_4M1A_1S_OBJS) $(BLIS_4M1A_2S_OBJS)
+.INTERMEDIATE: $(OPENBLAS_ST_OBJS) $(OPENBLAS_1S_OBJS) $(OPENBLAS_2S_OBJS)
+.INTERMEDIATE: $(EIGEN_ST_OBJS) $(EIGEN_1S_OBJS) $(EIGEN_2S_OBJS)
+.INTERMEDIATE: $(VENDOR_ST_OBJS) $(VENDOR_1S_OBJS) $(VENDOR_2S_OBJS)
+
+
+# --Object file rules --
+
+#$(TEST_OBJ_PATH)/%.o: $(TEST_SRC_PATH)/%.c
+#	$(CC) $(CFLAGS) -c $< -o $@
+
+# A function to return the datatype cpp macro def from the datatype
+# character.
+get-dt-cpp = $(strip \
+              $(if $(findstring s,$(1)),-DDT=BLIS_FLOAT -DIS_FLOAT,\
+              $(if $(findstring d,$(1)),-DDT=BLIS_DOUBLE -DIS_DOUBLE,\
+              $(if $(findstring c,$(1)),-DDT=BLIS_SCOMPLEX -DIS_SCOMPLEX,\
+              -DDT=BLIS_DCOMPLEX -DIS_DCOMPLEX))))
+
+# A function to return the induced-method cpp macro def (-DIND=...) from the
+# implementation name in $(1). The 1m and 4m1a implementations select their
+# own induced method; every other name falls through to the native setting.
+# The values come from the D1M/D4M1A/DNAT variables defined alongside the
+# other "Complex implementation type" macros in this file.
+get-in-cpp = $(strip \
+              $(if $(findstring 1m_blis,$(1)),$(D1M),\
+              $(if $(findstring 4m1a_blis,$(1)),$(D4M1A),\
+              $(DNAT))))
+
+# A function to return other cpp macros that help the test driver
+# identify the implementation: the -DSTR=... label together with the
+# interface selector (BLI_DEF, BLA_DEF, or EIG_DEF). $(1) is the
+# implementation name and $(2) is the operation name.
+# - The three BLIS variants use BLI_DEF; openblas uses BLA_DEF.
+# - eigen uses EIG_DEF only when the operation is gemm; for any other
+#   operation it keeps the eigen label but is driven through BLA_DEF.
+# - Any name not matched above is treated as the vendor library.
+get-bl-cpp = $(strip \
+              $(if $(findstring 1m_blis,$(1)),$(STR_1M) $(BLI_DEF),\
+              $(if $(findstring 4m1a_blis,$(1)),$(STR_4M1A) $(BLI_DEF),\
+              $(if $(findstring asm_blis,$(1)),$(STR_NAT) $(BLI_DEF),\
+              $(if $(findstring openblas,$(1)),$(STR_OBL) $(BLA_DEF),\
+              $(if $(and $(findstring eigen,$(1)),\
+                         $(findstring gemm,$(2))),\
+                                 $(STR_EIG) $(EIG_DEF),\
+              $(if $(findstring eigen,$(1)),\
+                                 $(STR_EIG) $(BLA_DEF),\
+                                 $(STR_VEN) $(BLA_DEF))))))))
+
+
+# Rules for BLIS and BLAS libraries.
+# Each template below expands to one explicit rule that compiles the test
+# driver for an operation into an object file specialized for one datatype
+# and one implementation:
+#   $(1) - datatype character
+#   $(2) - operation name
+#   $(3) - implementation name
+# The source prerequisite is named from the template's own operation
+# argument, $(2), so the templates do not depend on what the caller happens
+# to call its loop variable. The automatic variables are written as $$< and
+# $$@ so that they survive the expansion done by $(call) and are only
+# resolved when the generated rule runs.
+define make-st-rule
+test_$(1)$(2)_$(PS_MAX)_$(3)_st.o: test_$(2).c Makefile
+	$(CC) $(CFLAGS) $(PDEF_ST) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(call get-in-cpp,$(3)) $(STR_ST) -c $$< -o $$@
+endef
+
+define make-1s-rule
+test_$(1)$(2)_$(P1_MAX)_$(3)_1s.o: test_$(2).c Makefile
+	$(CC) $(CFLAGS) $(PDEF_1S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(call get-in-cpp,$(3)) $(STR_1S) -c $$< -o $$@
+endef
+
+define make-2s-rule
+test_$(1)$(2)_$(P2_MAX)_$(3)_2s.o: test_$(2).c Makefile
+	$(CC) $(CFLAGS) $(PDEF_2S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(call get-in-cpp,$(3)) $(STR_2S) -c $$< -o $$@
+endef
+
+# Instantiate the templates for every combination of datatype, operation,
+# and BLIS/BLAS implementation.
+$(foreach dt,$(DTS), \
+$(foreach op,$(OPS), \
+$(foreach im,$(BIMPLS),$(eval $(call make-st-rule,$(dt),$(op),$(im))))))
+
+$(foreach dt,$(DTS), \
+$(foreach op,$(OPS), \
+$(foreach im,$(BIMPLS),$(eval $(call make-1s-rule,$(dt),$(op),$(im))))))
+
+$(foreach dt,$(DTS), \
+$(foreach op,$(OPS), \
+$(foreach im,$(BIMPLS),$(eval $(call make-2s-rule,$(dt),$(op),$(im))))))
+
+# Rules for Eigen.
+# These templates take the same arguments as the BLIS/BLAS ones ($(1) is the
+# datatype character, $(2) the operation name, $(3) the implementation name)
+# but compile the driver with $(CXX): CXXFLAGS_ST for the single-threaded
+# objects and CXXFLAGS_MT for the 1s/2s objects. They always pass $(DNAT)
+# rather than deriving the induced-method macro from the implementation name.
+# The source prerequisite is named from the template's own operation
+# argument, $(2), so the templates do not depend on what the caller happens
+# to call its loop variable.
+define make-eigst-rule
+test_$(1)$(2)_$(PS_MAX)_$(3)_st.o: test_$(2).c Makefile
+	$(CXX) $(CXXFLAGS_ST) $(PDEF_ST) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_ST) -c $$< -o $$@
+endef
+
+define make-eig1s-rule
+test_$(1)$(2)_$(P1_MAX)_$(3)_1s.o: test_$(2).c Makefile
+	$(CXX) $(CXXFLAGS_MT) $(PDEF_1S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_1S) -c $$< -o $$@
+endef
+
+define make-eig2s-rule
+test_$(1)$(2)_$(P2_MAX)_$(3)_2s.o: test_$(2).c Makefile
+	$(CXX) $(CXXFLAGS_MT) $(PDEF_2S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_2S) -c $$< -o $$@
+endef
+
+# Instantiate the templates for every combination of datatype, operation,
+# and Eigen implementation.
+$(foreach dt,$(DTS), \
+$(foreach op,$(OPS), \
+$(foreach im,$(EIMPLS),$(eval $(call make-eigst-rule,$(dt),$(op),$(im))))))
+
+$(foreach dt,$(DTS), \
+$(foreach op,$(OPS), \
+$(foreach im,$(EIMPLS),$(eval $(call make-eig1s-rule,$(dt),$(op),$(im))))))
+
+$(foreach dt,$(DTS), \
+$(foreach op,$(OPS), \
+$(foreach im,$(EIMPLS),$(eval $(call make-eig2s-rule,$(dt),$(op),$(im))))))
+
+
+# -- Executable file rules --
+
+# NOTE: For the BLAS test drivers, we place the BLAS libraries before BLIS
+# on the link command line in case BLIS was configured with the BLAS
+# compatibility layer. This prevents BLIS from inadvertently getting called
+# for the BLAS routines we are trying to test with.
+ +test_%_$(PS_MAX)_1m_blis_st.x: test_%_$(PS_MAX)_1m_blis_st.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + +test_%_$(P1_MAX)_1m_blis_1s.x: test_%_$(P1_MAX)_1m_blis_1s.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + +test_%_$(P2_MAX)_1m_blis_2s.x: test_%_$(P2_MAX)_1m_blis_2s.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + + +test_%_$(PS_MAX)_4m1a_blis_st.x: test_%_$(PS_MAX)_4m1a_blis_st.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + +test_%_$(P1_MAX)_4m1a_blis_1s.x: test_%_$(P1_MAX)_4m1a_blis_1s.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + +test_%_$(P2_MAX)_4m1a_blis_2s.x: test_%_$(P2_MAX)_4m1a_blis_2s.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + + +test_%_$(PS_MAX)_asm_blis_st.x: test_%_$(PS_MAX)_asm_blis_st.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + +test_%_$(P1_MAX)_asm_blis_1s.x: test_%_$(P1_MAX)_asm_blis_1s.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + +test_%_$(P2_MAX)_asm_blis_2s.x: test_%_$(P2_MAX)_asm_blis_2s.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + + +test_%_$(PS_MAX)_openblas_st.x: test_%_$(PS_MAX)_openblas_st.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(OPENBLAS_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + +test_%_$(P1_MAX)_openblas_1s.x: test_%_$(P1_MAX)_openblas_1s.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(OPENBLASP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + +test_%_$(P2_MAX)_openblas_2s.x: test_%_$(P2_MAX)_openblas_2s.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(OPENBLASP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + + +test_%_$(PS_MAX)_eigen_st.x: test_%_$(PS_MAX)_eigen_st.o $(LIBBLIS_LINK) + $(CXX) $(strip $< $(EIGEN_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + +test_%_$(P1_MAX)_eigen_1s.x: test_%_$(P1_MAX)_eigen_1s.o $(LIBBLIS_LINK) + $(CXX) $(strip $< $(EIGENP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + +test_%_$(P2_MAX)_eigen_2s.x: 
test_%_$(P2_MAX)_eigen_2s.o $(LIBBLIS_LINK) + $(CXX) $(strip $< $(EIGENP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + + +test_%_$(PS_MAX)_vendor_st.x: test_%_$(PS_MAX)_vendor_st.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(VENDOR_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + +test_%_$(P1_MAX)_vendor_1s.x: test_%_$(P1_MAX)_vendor_1s.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(VENDORP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + +test_%_$(P2_MAX)_vendor_2s.x: test_%_$(P2_MAX)_vendor_2s.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(VENDORP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + + +# -- Clean rules -- + +clean: cleanx + +cleanx: + - $(RM_F) *.o *.x + diff --git a/test/1m4m/runme.sh b/test/1m4m/runme.sh new file mode 100755 index 0000000000000000000000000000000000000000..d79d539259cffa4e8d4ebaa97fca572a2b897034 --- /dev/null +++ b/test/1m4m/runme.sh @@ -0,0 +1,242 @@ +#!/bin/bash + +# File pefixes. +exec_root="test" +out_root="output" +delay=0.1 + +#sys="blis" +#sys="stampede2" +sys="lonestar5" +#sys="ul252" +#sys="ul264" + +# Bind threads to processors. +#export OMP_PROC_BIND=true +#export GOMP_CPU_AFFINITY="0 2 4 6 8 10 12 14 16 18 20 22 1 3 5 7 9 11 13 15 17 19 21 23" +#export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103" + +if [ ${sys} = "blis" ]; then + + export GOMP_CPU_AFFINITY="0 1 2 3" + + threads="jc1ic1jr1_2400 + jc2ic3jr2_6000 + jc4ic3jr2_8000" + +elif [ ${sys} = "stampede2" ]; then + + echo "Need to set GOMP_CPU_AFFINITY." + exit 1 + + threads="jc1ic1jr1_2400 + jc4ic6jr1_6000 + jc4ic12jr1_8000" + +elif [ ${sys} = "lonestar5" ]; then + + export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23" + + # A hack to use libiomp5 with gcc. 
+ #export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/apps/intel/16.0.1.150/compilers_and_libraries_2016.1.150/linux/compiler/lib/intel64" + + #threads="jc1ic1jr1_2400 + # jc2ic3jr2_4800 + # jc4ic3jr2_9600" + threads="jc1ic1jr1_2400 + jc4ic3jr2_7200" + threads="jc4ic3jr2_7200" + +elif [ ${sys} = "ul252" ]; then + + export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/field/intel/mkl/lib/intel64" + export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51" + + threads="jc1ic1jr1_2400 + jc2ic13jr1_6000 + jc4ic13jr1_8000" + +elif [ ${sys} = "ul264" ]; then + + export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/field/intel/mkl/lib/intel64" + export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63" + + threads="jc1ic1jr1_2400 + jc1ic8jr4_6000 + jc2ic8jr4_8000" + +fi + +# Datatypes to test. +test_dts="s d c z" + +# Operations to test. +#test_ops="gemm hemm herk trmm trsm" +test_ops="gemm" + +# Implementations to test. +#impls="blis" +#impls="other" +#impls="eigen" +impls="all" + +if [ "${impls}" = "blis" ]; then + + test_impls="asm_blis" + +elif [ "${impls}" = "eigen" ]; then + + test_impls="eigen" + +elif [ "${impls}" = "other" ]; then + + test_impls="openblas vendor" + +elif [ "${impls}" = "eigen" ]; then + + test_impls="eigen" + +else + + test_impls="openblas vendor asm_blis 4m1a_blis 1m_blis" + #test_impls="openblas" + #test_impls="asm_blis 4m1a_blis 1m_blis" + #test_impls="asm_blis 1m_blis" +fi + +# Save a copy of GOMP_CPU_AFFINITY so that if we have to unset it, we can +# restore the value. +GOMP_CPU_AFFINITYsave=${GOMP_CPU_AFFINITY} + + +# First perform real test cases. +for th in ${threads}; do + + # Start with one way of parallelism in each loop. 
We will now begin + # parsing the 'th' variable to update one or more of these threading + # parameters. + jc_nt=1; pc_nt=1; ic_nt=1; jr_nt=1; ir_nt=1 + + # Strip everything before and after the underscore so that what remains + # is the problem size and threading parameter string, respectively. + psize=${th##*_}; thinfo=${th%%_*} + + # Identify each threading parameter and insert a space before it. + thsep=$(echo -e ${thinfo} | sed -e "s/\([jip][cr]\)/ \1/g" ) + + nt=1 + + for loopnum in ${thsep}; do + + # Given the current string, which identifies a loop and the + # number of ways of parallelism for that loop, strip out + # the ways and loop separately to identify each. + loop=$(echo -e ${loopnum} | sed -e "s/[0-9]//g" ) + num=$(echo -e ${loopnum} | sed -e "s/[a-z]//g" ) + + # Construct a string that we can evaluate to set the number + # of ways of parallelism for the current loop. + loop_nt_eq_num="${loop}_nt=${num}" + + # Update the total number of threads. + nt=$(expr ${nt} \* ${num}) + + # Evaluate the string to assign the ways to the variable. + eval ${loop_nt_eq_num} + + done + + echo "Switching to: jc${jc_nt} pc${pc_nt} ic${ic_nt} jr${jr_nt} ir${ir_nt} (nt = ${nt}) p_max${psize}" + + + for dt in ${test_dts}; do + + for im in ${test_impls}; do + + if [ "${dt}" = "s" -o "${dt}" = "d" ] && \ + [ "${im}" = "1m_blis" -o "${im}" = "4m1a_blis" ]; then + continue + fi + + for op in ${test_ops}; do + + # Eigen does not support multithreading for hemm, herk, trmm, + # or trsm. So if we're getting ready to execute an Eigen driver + # for one of these operations and nt > 1, we skip this test. + if [ "${im}" = "eigen" ] && \ + [ "${op}" != "gemm" ] && \ + [ "${nt}" != "1" ]; then + continue; + fi + + # Find the threading suffix by probing the executable. + binname=$(ls ${exec_root}_${dt}${op}_${psize}_${im}_*.x) + suf_ext=${binname##*_} + suf=${suf_ext%%.*} + + #echo "found file: ${binname} with suffix ${suf}" + + # Set the number of threads according to th. 
+ if [ "${suf}" = "1s" ] || [ "${suf}" = "2s" ]; then + + # Set the threading parameters based on the implementation + # that we are preparing to run. + if [ "${im}" = "asm_blis" ]; then + unset OMP_NUM_THREADS + export BLIS_JC_NT=${jc_nt} + export BLIS_PC_NT=${pc_nt} + export BLIS_IC_NT=${ic_nt} + export BLIS_JR_NT=${jr_nt} + export BLIS_IR_NT=${ir_nt} + elif [ "${im}" = "openblas" ]; then + unset OMP_NUM_THREADS + export OPENBLAS_NUM_THREADS=${nt} + elif [ "${im}" = "eigen" ]; then + export OMP_NUM_THREADS=${nt} + elif [ "${im}" = "vendor" ]; then + unset OMP_NUM_THREADS + export MKL_NUM_THREADS=${nt} + fi + export nt_use=${nt} + + # Multithreaded OpenBLAS seems to have a problem running + # properly if GOMP_CPU_AFFINITY is set. So we temporarily + # unset it here if we are about to execute OpenBLAS, but + # otherwise restore it. + if [ ${im} = "openblas" ]; then + unset GOMP_CPU_AFFINITY + else + export GOMP_CPU_AFFINITY="${GOMP_CPU_AFFINITYsave}" + fi + else + + export BLIS_JC_NT=1 + export BLIS_PC_NT=1 + export BLIS_IC_NT=1 + export BLIS_JR_NT=1 + export BLIS_IR_NT=1 + export OMP_NUM_THREADS=1 + export OPENBLAS_NUM_THREADS=1 + export MKL_NUM_THREADS=1 + export nt_use=1 + fi + + # Construct the name of the test executable. + exec_name="${exec_root}_${dt}${op}_${psize}_${im}_${suf}.x" + + # Construct the name of the output file. + out_file="${out_root}_${suf}_${dt}${op}_${im}.m" + + #echo "Running (nt = ${nt_use}) ./${exec_name} > ${out_file}" + echo "Running ./${exec_name} > ${out_file}" + + # Run executable. + ./${exec_name} > ${out_file} + + sleep ${delay} + + done + done + done +done + diff --git a/test/1m4m/test_gemm.c b/test/1m4m/test_gemm.c new file mode 100644 index 0000000000000000000000000000000000000000..a58e6e58935773bd3f077ed185b3d14a90dbabb2 --- /dev/null +++ b/test/1m4m/test_gemm.c @@ -0,0 +1,425 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include <unistd.h> +#ifdef EIGEN + #define BLIS_DISABLE_BLAS_DEFS + #include "blis.h" + #include <Eigen/Core> + #include <Eigen/src/misc/blas.h> + using namespace Eigen; +#else + #include "blis.h" +#endif + +#define COL_STORAGE +//#define ROW_STORAGE + +//#define PRINT + +int main( int argc, char** argv ) +{ + obj_t a, b, c; + obj_t c_save; + obj_t alpha, beta; + dim_t m, n, k; + dim_t p; + dim_t p_begin, p_max, p_inc; + int m_input, n_input, k_input; + ind_t ind; + num_t dt; + char dt_ch; + int r, n_repeats; + trans_t transa; + trans_t transb; + f77_char f77_transa; + f77_char f77_transb; + + double dtime; + double dtime_save; + double gflops; + + //bli_init(); + + bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING ); + + n_repeats = 3; + + dt = DT; + + ind = IND; + +#if 1 + p_begin = P_BEGIN; + p_max = P_MAX; + p_inc = P_INC; + + m_input = -1; + n_input = -1; + k_input = -1; +#else + p_begin = 40; + p_max = 2000; + p_inc = 40; + + m_input = -1; + n_input = -1; + k_input = -1; +#endif + + + // Supress compiler warnings about unused variable 'ind'. + ( void )ind; + +#if 0 + + cntx_t* cntx; + + ind_t ind_mod = ind; + + // A hack to use 3m1 as 1mpb (with 1m as 1mbp). + if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M; + + // Initialize a context for the current induced method and datatype. + cntx = bli_gks_query_ind_cntx( ind_mod, dt ); + + // Set k to the kc blocksize for the current datatype. + k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); + +#elif 0 + + #ifdef BLIS + if ( ind == BLIS_4M1A ) k_input = 128; + else if ( ind == BLIS_1M ) k_input = 128; + else k_input = 256; + #else + k_input = 192; + #endif + +#endif + + // Choose the char corresponding to the requested datatype. 
+ if ( bli_is_float( dt ) ) dt_ch = 's'; + else if ( bli_is_double( dt ) ) dt_ch = 'd'; + else if ( bli_is_scomplex( dt ) ) dt_ch = 'c'; + else dt_ch = 'z'; + + transa = BLIS_NO_TRANSPOSE; + transb = BLIS_NO_TRANSPOSE; + + bli_param_map_blis_to_netlib_trans( transa, &f77_transa ); + bli_param_map_blis_to_netlib_trans( transb, &f77_transb ); + + // Begin with initializing the last entry to zero so that + // matlab allocates space for the entire array once up-front. + for ( p = p_begin; p + p_inc <= p_max; p += p_inc ) ; + + printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, STR ); + printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n", + ( unsigned long )(p - p_begin)/p_inc + 1, + ( unsigned long )0, + ( unsigned long )0, + ( unsigned long )0, 0.0 ); + + + //for ( p = p_begin; p <= p_max; p += p_inc ) + for ( p = p_max; p_begin <= p; p -= p_inc ) + { + + if ( m_input < 0 ) m = p / ( dim_t )abs(m_input); + else m = ( dim_t ) m_input; + if ( n_input < 0 ) n = p / ( dim_t )abs(n_input); + else n = ( dim_t ) n_input; + if ( k_input < 0 ) k = p / ( dim_t )abs(k_input); + else k = ( dim_t ) k_input; + + bli_obj_create( dt, 1, 1, 0, 0, &alpha ); + bli_obj_create( dt, 1, 1, 0, 0, &beta ); + + #ifdef COL_STORAGE + bli_obj_create( dt, m, k, 0, 0, &a ); + bli_obj_create( dt, k, n, 0, 0, &b ); + bli_obj_create( dt, m, n, 0, 0, &c ); + bli_obj_create( dt, m, n, 0, 0, &c_save ); + #else + bli_obj_create( dt, m, k, k, 1, &a ); + bli_obj_create( dt, k, n, n, 1, &b ); + bli_obj_create( dt, m, n, n, 1, &c ); + bli_obj_create( dt, m, n, n, 1, &c_save ); + #endif + + bli_randm( &a ); + bli_randm( &b ); + bli_randm( &c ); + + bli_obj_set_conjtrans( transa, &a ); + bli_obj_set_conjtrans( transb, &b ); + + bli_setsc( (1.0/1.0), 0.0, &alpha ); + bli_setsc( (1.0/1.0), 0.0, &beta ); + + bli_copym( &c, &c_save ); + +#ifdef BLIS + bli_ind_disable_all_dt( dt ); + bli_ind_enable_dt( ind, dt ); +#endif + +#ifdef EIGEN + double alpha_r, alpha_i; + + bli_getsc( &alpha, &alpha_r, &alpha_i ); + + void* ap 
= bli_obj_buffer_at_off( &a ); + void* bp = bli_obj_buffer_at_off( &b ); + void* cp = bli_obj_buffer_at_off( &c ); + + #ifdef COL_STORAGE + const int os_a = bli_obj_col_stride( &a ); + const int os_b = bli_obj_col_stride( &b ); + const int os_c = bli_obj_col_stride( &c ); + #else + const int os_a = bli_obj_row_stride( &a ); + const int os_b = bli_obj_row_stride( &b ); + const int os_c = bli_obj_row_stride( &c ); + #endif + + Stride<Dynamic,1> stride_a( os_a, 1 ); + Stride<Dynamic,1> stride_b( os_b, 1 ); + Stride<Dynamic,1> stride_c( os_c, 1 ); + + #ifdef COL_STORAGE + #if defined(IS_FLOAT) + typedef Matrix<float, Dynamic, Dynamic, ColMajor> MatrixXf_; + #elif defined (IS_DOUBLE) + typedef Matrix<double, Dynamic, Dynamic, ColMajor> MatrixXd_; + #elif defined (IS_SCOMPLEX) + typedef Matrix<std::complex<float>, Dynamic, Dynamic, ColMajor> MatrixXcf_; + #elif defined (IS_DCOMPLEX) + typedef Matrix<std::complex<double>, Dynamic, Dynamic, ColMajor> MatrixXcd_; + #endif + #else + #if defined(IS_FLOAT) + typedef Matrix<float, Dynamic, Dynamic, RowMajor> MatrixXf_; + #elif defined (IS_DOUBLE) + typedef Matrix<double, Dynamic, Dynamic, RowMajor> MatrixXd_; + #elif defined (IS_SCOMPLEX) + typedef Matrix<std::complex<float>, Dynamic, Dynamic, RowMajor> MatrixXcf_; + #elif defined (IS_DCOMPLEX) + typedef Matrix<std::complex<double>, Dynamic, Dynamic, RowMajor> MatrixXcd_; + #endif + #endif + #if defined(IS_FLOAT) + Map<MatrixXf_, 0, Stride<Dynamic,1> > A( ( float* )ap, m, k, stride_a ); + Map<MatrixXf_, 0, Stride<Dynamic,1> > B( ( float* )bp, k, n, stride_b ); + Map<MatrixXf_, 0, Stride<Dynamic,1> > C( ( float* )cp, m, n, stride_c ); + #elif defined (IS_DOUBLE) + Map<MatrixXd_, 0, Stride<Dynamic,1> > A( ( double* )ap, m, k, stride_a ); + Map<MatrixXd_, 0, Stride<Dynamic,1> > B( ( double* )bp, k, n, stride_b ); + Map<MatrixXd_, 0, Stride<Dynamic,1> > C( ( double* )cp, m, n, stride_c ); + #elif defined (IS_SCOMPLEX) + Map<MatrixXcf_, 0, Stride<Dynamic,1> > A( ( 
std::complex<float>* )ap, m, k, stride_a ); + Map<MatrixXcf_, 0, Stride<Dynamic,1> > B( ( std::complex<float>* )bp, k, n, stride_b ); + Map<MatrixXcf_, 0, Stride<Dynamic,1> > C( ( std::complex<float>* )cp, m, n, stride_c ); + #elif defined (IS_DCOMPLEX) + Map<MatrixXcd_, 0, Stride<Dynamic,1> > A( ( std::complex<double>* )ap, m, k, stride_a ); + Map<MatrixXcd_, 0, Stride<Dynamic,1> > B( ( std::complex<double>* )bp, k, n, stride_b ); + Map<MatrixXcd_, 0, Stride<Dynamic,1> > C( ( std::complex<double>* )cp, m, n, stride_c ); + #endif +#endif + + dtime_save = DBL_MAX; + + for ( r = 0; r < n_repeats; ++r ) + { + bli_copym( &c_save, &c ); + + dtime = bli_clock(); + +#ifdef PRINT + bli_printm( "a", &a, "%4.1f", "" ); + bli_printm( "b", &b, "%4.1f", "" ); + bli_printm( "c", &c, "%4.1f", "" ); +#endif + +#if defined(BLIS) + + bli_gemm( &alpha, + &a, + &b, + &beta, + &c ); + +#elif defined(EIGEN) + + C.noalias() += alpha_r * A * B; + +#else // if defined(BLAS) + + if ( bli_is_float( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int nn = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldb = bli_obj_col_stride( &b ); + f77_int ldc = bli_obj_col_stride( &c ); + float* alphap = ( float* )bli_obj_buffer( &alpha ); + float* ap = ( float* )bli_obj_buffer( &a ); + float* bp = ( float* )bli_obj_buffer( &b ); + float* betap = ( float* )bli_obj_buffer( &beta ); + float* cp = ( float* )bli_obj_buffer( &c ); + + sgemm_( &f77_transa, + &f77_transb, + &mm, + &nn, + &kk, + alphap, + ap, &lda, + bp, &ldb, + betap, + cp, &ldc ); + } + else if ( bli_is_double( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int nn = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldb = bli_obj_col_stride( &b ); + f77_int ldc = bli_obj_col_stride( &c ); + double* alphap = ( double* )bli_obj_buffer( &alpha ); + double* ap = ( double* )bli_obj_buffer( &a ); 
+ double* bp = ( double* )bli_obj_buffer( &b ); + double* betap = ( double* )bli_obj_buffer( &beta ); + double* cp = ( double* )bli_obj_buffer( &c ); + + dgemm_( &f77_transa, + &f77_transb, + &mm, + &nn, + &kk, + alphap, + ap, &lda, + bp, &ldb, + betap, + cp, &ldc ); + } + else if ( bli_is_scomplex( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int nn = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldb = bli_obj_col_stride( &b ); + f77_int ldc = bli_obj_col_stride( &c ); + scomplex* alphap = ( scomplex* )bli_obj_buffer( &alpha ); + scomplex* ap = ( scomplex* )bli_obj_buffer( &a ); + scomplex* bp = ( scomplex* )bli_obj_buffer( &b ); + scomplex* betap = ( scomplex* )bli_obj_buffer( &beta ); + scomplex* cp = ( scomplex* )bli_obj_buffer( &c ); + + cgemm_( &f77_transa, + &f77_transb, + &mm, + &nn, + &kk, + alphap, + ap, &lda, + bp, &ldb, + betap, + cp, &ldc ); + } + else if ( bli_is_dcomplex( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int nn = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldb = bli_obj_col_stride( &b ); + f77_int ldc = bli_obj_col_stride( &c ); + dcomplex* alphap = ( dcomplex* )bli_obj_buffer( &alpha ); + dcomplex* ap = ( dcomplex* )bli_obj_buffer( &a ); + dcomplex* bp = ( dcomplex* )bli_obj_buffer( &b ); + dcomplex* betap = ( dcomplex* )bli_obj_buffer( &beta ); + dcomplex* cp = ( dcomplex* )bli_obj_buffer( &c ); + + zgemm_( &f77_transa, + &f77_transb, + &mm, + &nn, + &kk, + alphap, + ap, &lda, + bp, &ldb, + betap, + cp, &ldc ); + } +#endif + +#ifdef PRINT + bli_printm( "c after", &c, "%4.1f", "" ); + exit(1); +#endif + + dtime_save = bli_clock_min_diff( dtime_save, dtime ); + } + + gflops = ( 2.0 * m * k * n ) / ( dtime_save * 1.0e9 ); + + if ( bli_is_complex( dt ) ) gflops *= 4.0; + + printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, STR ); + printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f 
];\n", + ( unsigned long )(p - p_begin)/p_inc + 1, + ( unsigned long )m, + ( unsigned long )k, + ( unsigned long )n, gflops ); + //fflush( stdout ); + + bli_obj_free( &alpha ); + bli_obj_free( &beta ); + + bli_obj_free( &a ); + bli_obj_free( &b ); + bli_obj_free( &c ); + bli_obj_free( &c_save ); + } + + //bli_finalize(); + + return 0; +} + diff --git a/test/3/Makefile b/test/3/Makefile index 972b4d93dfb87e325198b008cd4a147d8c9bf4d8..38d915721c712bc41c3f5f5e4f384e58573fe5df 100644 --- a/test/3/Makefile +++ b/test/3/Makefile @@ -135,14 +135,14 @@ PS_MAX := 2400 PS_INC := 48 # Single-socket (multithreaded) -P1_BEGIN := 120 -P1_MAX := 6000 -P1_INC := 120 +P1_BEGIN := 96 +P1_MAX := 4800 +P1_INC := 96 # Dual-socket (multithreaded) -P2_BEGIN := 160 -P2_MAX := 8000 -P2_INC := 160 +P2_BEGIN := 144 +P2_MAX := 7200 +P2_INC := 144 # diff --git a/test/3/test_gemm.c b/test/3/test_gemm.c index e7b6a94359814023ab4bec29f9994dcee9d82312..5ff4c0c0fa58da8f8dc63b92b1d7f6cb9f216f75 100644 --- a/test/3/test_gemm.c +++ b/test/3/test_gemm.c @@ -43,8 +43,8 @@ #include "blis.h" #endif -//#define COL_STORAGE -#define ROW_STORAGE +#define COL_STORAGE +//#define ROW_STORAGE //#define PRINT @@ -141,13 +141,14 @@ int main( int argc, char** argv ) printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, STR ); printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )0, ( unsigned long )0, ( unsigned long )0, 0.0 ); - for ( p = p_begin; p <= p_max; p += p_inc ) + //for ( p = p_begin; p <= p_max; p += p_inc ) + for ( p = p_max; p_begin <= p; p -= p_inc ) { if ( m_input < 0 ) m = p / ( dim_t )abs(m_input); @@ -396,7 +397,7 @@ int main( int argc, char** argv ) printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, STR ); printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )m, ( unsigned long )k, ( unsigned 
long )n, gflops ); diff --git a/test/3/test_hemm.c b/test/3/test_hemm.c index 73746ae4bdef382ab6e745a0b024078f294c4991..e69a1ec5742e21ae7b0ac14ac1cc90af6acb4697 100644 --- a/test/3/test_hemm.c +++ b/test/3/test_hemm.c @@ -119,12 +119,13 @@ int main( int argc, char** argv ) printf( "data_%s_%chemm_%s", THR_STR, dt_ch, STR ); printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )0, ( unsigned long )0, 0.0 ); - for ( p = p_begin; p <= p_max; p += p_inc ) + //for ( p = p_begin; p <= p_max; p += p_inc ) + for ( p = p_max; p_begin <= p; p -= p_inc ) { if ( m_input < 0 ) m = p / ( dim_t )abs(m_input); @@ -317,7 +318,7 @@ int main( int argc, char** argv ) printf( "data_%s_%chemm_%s", THR_STR, dt_ch, STR ); printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )m, ( unsigned long )n, gflops ); diff --git a/test/3/test_herk.c b/test/3/test_herk.c index f51cf8c298d08fd8cb92c1ccc23f83ad749c454b..b963f944b3590878d6b1933fa8a68b8885bdc2f1 100644 --- a/test/3/test_herk.c +++ b/test/3/test_herk.c @@ -121,12 +121,13 @@ int main( int argc, char** argv ) printf( "data_%s_%cherk_%s", THR_STR, dt_ch, STR ); printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )0, ( unsigned long )0, 0.0 ); - for ( p = p_begin; p <= p_max; p += p_inc ) + //for ( p = p_begin; p <= p_max; p += p_inc ) + for ( p = p_max; p_begin <= p; p -= p_inc ) { if ( m_input < 0 ) m = p / ( dim_t )abs(m_input); @@ -297,7 +298,7 @@ int main( int argc, char** argv ) printf( "data_%s_%cherk_%s", THR_STR, dt_ch, STR ); printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )m, ( unsigned long )k, gflops ); diff 
--git a/test/3/test_trmm.c b/test/3/test_trmm.c index 1915b508ad3a1c66fbf8d0f3db33cb2e4c6b2811..2fa7fe52dd0592b2c762165f73878157801bc822 100644 --- a/test/3/test_trmm.c +++ b/test/3/test_trmm.c @@ -136,12 +136,13 @@ int main( int argc, char** argv ) printf( "data_%s_%ctrmm_%s", THR_STR, dt_ch, STR ); printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )0, ( unsigned long )0, 0.0 ); - for ( p = p_begin; p <= p_max; p += p_inc ) + //for ( p = p_begin; p <= p_max; p += p_inc ) + for ( p = p_max; p_begin <= p; p -= p_inc ) { if ( m_input < 0 ) m = p / ( dim_t )abs(m_input); @@ -311,7 +312,7 @@ int main( int argc, char** argv ) printf( "data_%s_%ctrmm_%s", THR_STR, dt_ch, STR ); printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )m, ( unsigned long )n, gflops ); diff --git a/test/3/test_trsm.c b/test/3/test_trsm.c index b474a52a7e6c0d725e85544dd128676ff2b09217..2e5ff0a53314bab2ace9a2a0c18ddc4e26be01a4 100644 --- a/test/3/test_trsm.c +++ b/test/3/test_trsm.c @@ -136,12 +136,13 @@ int main( int argc, char** argv ) printf( "data_%s_%ctrsm_%s", THR_STR, dt_ch, STR ); printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )0, ( unsigned long )0, 0.0 ); - for ( p = p_begin; p <= p_max; p += p_inc ) + //for ( p = p_begin; p <= p_max; p += p_inc ) + for ( p = p_max; p_begin <= p; p -= p_inc ) { if ( m_input < 0 ) m = p / ( dim_t )abs(m_input); @@ -315,7 +316,7 @@ int main( int argc, char** argv ) printf( "data_%s_%ctrsm_%s", THR_STR, dt_ch, STR ); printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )m, ( unsigned long )n, gflops ); diff --git 
a/test/mixeddt/Makefile b/test/mixeddt/Makefile index 87568825af99cdf7434866e49790e487644264e1..20e5378ffbed2f310de1a22861a67dff75f46893 100644 --- a/test/mixeddt/Makefile +++ b/test/mixeddt/Makefile @@ -140,11 +140,11 @@ STR_MT := -DTHR_STR=\"mt\" # Problem size specification PDEF_ST := -DP_BEGIN=40 \ - -DP_END=2000 \ + -DP_MAX=2000 \ -DP_INC=40 PDEF_MT := -DP_BEGIN=160 \ - -DP_END=8000 \ + -DP_MAX=8000 \ -DP_INC=160 # Enumerate possible datatypes and computation precisions. diff --git a/test/mixeddt/test_gemm.c b/test/mixeddt/test_gemm.c index ea45a7c1410fef1e814ef2e83d8e1f9e785981d0..12437e41137d075f9e5526ba27482c856a8b01b0 100644 --- a/test/mixeddt/test_gemm.c +++ b/test/mixeddt/test_gemm.c @@ -77,7 +77,7 @@ int main( int argc, char** argv ) prec_t comp_prec = bli_dt_prec( dtx ); dim_t p_begin = P_BEGIN; - dim_t p_end = P_END; + dim_t p_max = P_MAX; dim_t p_inc = P_INC; int m_input = -1; @@ -122,12 +122,12 @@ int main( int argc, char** argv ) // Begin with initializing the last entry to zero so that // matlab allocates space for the entire array once up-front. 
- for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ; + for ( p = p_begin; p + p_inc <= p_max; p += p_inc ) ; //printf( "data_%s_%c%c%c%cgemm_%s", THR_STR, dtc_ch, dta_ch, dtb_ch, dtx_ch, STR ); printf( "data_gemm_%s", STR ); printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )0, ( unsigned long )0, ( unsigned long )0, 0.0 ); @@ -143,7 +143,8 @@ int main( int argc, char** argv ) else if ( c_complex && a_complex && b_complex ) flopsmul = 8.0; - for ( p = p_begin; p <= p_end; p += p_inc ) + //for ( p = p_begin; p <= p_max; p += p_inc ) + for ( p = p_max; p_begin <= p; p -= p_inc ) { if ( m_input < 0 ) m = p * ( dim_t )abs(m_input); @@ -220,7 +221,7 @@ int main( int argc, char** argv ) //printf( "data_%s_%c%c%c%cgemm_%s", THR_STR, dtc_ch, dta_ch, dtb_ch, dtx_ch, STR ); printf( "data_gemm_%s", STR ); printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )m, ( unsigned long )k, ( unsigned long )n, gflops ); diff --git a/test/sup/Makefile b/test/sup/Makefile index d2b3c7170a04bb515d801529648c8a0a80169c8b..2cd0627472843682f7883d38afcc3bcf67bd3f9a 100644 --- a/test/sup/Makefile +++ b/test/sup/Makefile @@ -96,6 +96,9 @@ endif HOME_LIB_PATH := $(HOME)/flame/lib MKL_LIB_PATH := $(HOME)/intel/mkl/lib/intel64 +# netlib BLAS +NETLIB_LIB := $(HOME_LIB_PATH)/libblas.a + # OpenBLAS OPENBLAS_LIB := $(HOME_LIB_PATH)/libopenblas.a OPENBLASP_LIB := $(HOME_LIB_PATH)/libopenblasp.a @@ -103,6 +106,10 @@ OPENBLASP_LIB := $(HOME_LIB_PATH)/libopenblasp.a # BLASFEO BLASFEO_LIB := $(HOME_LIB_PATH)/libblasfeo.a +# libxsmm +LIBXSMM_LIB := $(HOME_LIB_PATH)/libxsmm.a -ldl \ + $(NETLIB_LIB) -lgfortran + # ATLAS ATLAS_LIB := $(HOME_LIB_PATH)/libf77blas.a \ $(HOME_LIB_PATH)/libatlas.a @@ -210,15 +217,22 @@ TRANS := n_n \ t_n \ t_t +# While BLIS supports all combinations of row and 
column storage for matrices +# C, A, and B, the alternatives mostly only support CBLAS APIs, which inherently +# support only "all row-storage" or "all column-storage". Thus, we disable the +# building of those other drivers so that compilation/linking completes sooner. +#STORS := r_r_r \ +# r_r_c \ +# r_c_r \ +# r_c_c \ +# c_r_r \ +# c_r_c \ +# c_c_r \ +# c_c_c STORS := r_r_r \ - r_r_c \ - r_c_r \ - r_c_c \ - c_r_r \ - c_r_c \ - c_c_r \ c_c_c + SHAPES := l_l_s \ l_s_l \ s_l_l \ @@ -306,14 +320,18 @@ get-imp-defs = $(strip $(subst blissup,-DSTR=\"$(1)\" -DBLIS -DSUP, \ $(subst eigen,-DSTR=\"$(1)\" -DEIGEN, \ $(subst openblas,-DSTR=\"$(1)\" -DCBLAS, \ $(subst blasfeo,-DSTR=\"$(1)\" -DCBLAS, \ - $(subst vendor,-DSTR=\"$(1)\" -DCBLAS,$(1)))))))) + $(subst libxsmm,-DSTR=\"$(1)\" -DBLAS -DXSMM, \ + $(subst vendor,-DSTR=\"$(1)\" -DCBLAS,$(1))))))))) TRANS0 = $(call stripu,$(TRANS)) STORS0 = $(call stripu,$(STORS)) # Limit BLAS and Eigen to only using all row-stored, or all column-stored matrices. +# Also, limit libxsmm to using all column-stored matrices since it does not offer +# CBLAS interfaces. 
BSTORS0 = rrr ccc ESTORS0 = rrr ccc +XSTORS0 = ccc # @@ -339,6 +357,9 @@ OPENBLAS_ST_BINS := $(patsubst %.o,%.x,$(OPENBLAS_ST_OBJS)) BLASFEO_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),blasfeo) BLASFEO_ST_BINS := $(patsubst %.o,%.x,$(BLASFEO_ST_OBJS)) +LIBXSMM_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(XSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),libxsmm) +LIBXSMM_ST_BINS := $(patsubst %.o,%.x,$(LIBXSMM_ST_OBJS)) + VENDOR_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),vendor) VENDOR_ST_BINS := $(patsubst %.o,%.x,$(VENDOR_ST_OBJS)) @@ -351,6 +372,7 @@ VENDOR_ST_BINS := $(patsubst %.o,%.x,$(VENDOR_ST_OBJS)) $(EIGEN_ST_OBJS) \ $(OPENBLAS_ST_OBJS) \ $(BLASFEO_ST_OBJS) \ + $(LIBXSMM_ST_OBJS) \ $(VENDOR_ST_OBJS) @@ -365,9 +387,11 @@ blislpab: blislpab-st eigen: eigen-st openblas: openblas-st blasfeo: blasfeo-st +libxsmm: libxsmm-st vendor: vendor-st -st: blissup-st blislpab-st eigen-st openblas-st blasfeo-st vendor-st +st: blissup-st blislpab-st \ + eigen-st openblas-st blasfeo-st libxsmm-st vendor-st blis: blissup-st blislpab-st blissup-st: $(BLISSUP_ST_BINS) @@ -375,13 +399,14 @@ blislpab-st: $(BLISLPAB_ST_BINS) eigen-st: $(EIGEN_ST_BINS) openblas-st: $(OPENBLAS_ST_BINS) blasfeo-st: $(BLASFEO_ST_BINS) +libxsmm-st: $(LIBXSMM_ST_BINS) vendor-st: $(VENDOR_ST_BINS) # --Object file rules -- # Define the implementations for which we will instantiate compilation rules. 
-BIMPLS := blissup blislpab openblas blasfeo vendor +BIMPLS := blissup blislpab openblas blasfeo libxsmm vendor EIMPLS := eigen # 1 2 3 4 567 8 @@ -447,6 +472,9 @@ test_%_openblas_st.x: test_%_openblas_st.o $(LIBBLIS_LINK) test_%_blasfeo_st.x: test_%_blasfeo_st.o $(LIBBLIS_LINK) $(CC) $(strip $< $(BLASFEO_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) +test_%_libxsmm_st.x: test_%_libxsmm_st.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(LIBXSMM_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + test_%_vendor_st.x: test_%_vendor_st.o $(LIBBLIS_LINK) $(CC) $(strip $< $(VENDOR_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) @@ -456,5 +484,5 @@ test_%_vendor_st.x: test_%_vendor_st.o $(LIBBLIS_LINK) clean: cleanx cleanx: - - $(RM_F) *.x + - $(RM_F) *.x *.o diff --git a/test/sup/octave/plot_l3sup_perf.m b/test/sup/octave/plot_l3sup_perf.m index b027bf7a43015d1118ca2484cc75a118885bf366..bf291087891c6ce9b923d5e9f41fdb1defebfc59 100644 --- a/test/sup/octave/plot_l3sup_perf.m +++ b/test/sup/octave/plot_l3sup_perf.m @@ -4,20 +4,21 @@ function r_val = plot_l3sup_perf( opname, ... data_eigen, ... data_open, ... data_bfeo, ... + data_xsmm, ... data_vend, vend_str, ... nth, ... rows, cols, ... cfreq, ... dfps, ... theid, impl ) -if ... %mod(theid-1,cols) == 2 || ... - ... %mod(theid-1,cols) == 3 || ... - ... %mod(theid-1,cols) == 4 || ... - 0 == 1 ... %theid >= 19 - show_plot = 0; -else +%if ... %mod(theid-1,cols) == 2 || ... +% ... %mod(theid-1,cols) == 3 || ... +% ... %mod(theid-1,cols) == 4 || ... +% 0 == 1 ... 
%theid >= 19 +% show_plot = 0; +%else show_plot = 1; -end +%end %legend_plot_id = 11; legend_plot_id = 1*cols + 1*5; @@ -33,6 +34,7 @@ color_blislpab = 'k'; lines_blislpab = ':'; markr_blislpab = ''; color_eigen = 'm'; lines_eigen = '-.'; markr_eigen = 'o'; color_open = 'r'; lines_open = '--'; markr_open = 'o'; color_bfeo = 'c'; lines_bfeo = '-'; markr_bfeo = 'o'; +color_xsmm = 'g'; lines_xsmm = '-'; markr_xsmm = 'o'; color_vend = 'b'; lines_vend = '-.'; markr_vend = '.'; % Compute the peak performance in terms of the number of double flops @@ -57,6 +59,7 @@ blislpab_legend = sprintf( 'BLIS conv' ); eigen_legend = sprintf( 'Eigen' ); open_legend = sprintf( 'OpenBLAS' ); bfeo_legend = sprintf( 'BLASFEO' ); +xsmm_legend = sprintf( 'libxsmm' ); %vend_legend = sprintf( 'MKL' ); %vend_legend = sprintf( 'ARMPL' ); vend_legend = vend_str; @@ -96,30 +99,53 @@ for psize_col = 1:3 break; end end -x_end = data_blissup( size( data_blissup, 1 ), psize_col ); - x_axis( :, 1 ) = data_blissup( :, psize_col ); +% Compute the number of data points we have in the x-axis. Note that +% we only use half the data points for the m = n = k column of graphs. +if mod(theid-1,cols) == 6 + np = size( data_blissup, 1 ) / 2; +else + np = size( data_blissup, 1 ); +end + +has_xsmm = 1; +if data_xsmm( 1, flopscol ) == 0.0 + has_xsmm = 0; +end + +% Grab the last x-axis value. +x_end = data_blissup( np, psize_col ); + %data_peak( 1, 1:2 ) = [ 0 max_perf_core ]; %data_peak( 2, 1:2 ) = [ x_end max_perf_core ]; if show_plot == 1 -blissup_ln = line( x_axis( :, 1 ), data_blissup( :, flopscol ) / nth, ... +blissup_ln = line( x_axis( 1:np, 1 ), data_blissup( 1:np, flopscol ) / nth, ... 'Color',color_blissup, 'LineStyle',lines_blissup, ... 'LineWidth',linesize ); -blislpab_ln = line( x_axis( :, 1 ), data_blislpab( :, flopscol ) / nth, ... +blislpab_ln = line( x_axis( 1:np, 1 ), data_blislpab( 1:np, flopscol ) / nth, ... 'Color',color_blislpab, 'LineStyle',lines_blislpab, ... 
'LineWidth',linesize ); -eigen_ln = line( x_axis( :, 1 ), data_eigen( :, flopscol ) / nth, ... +eigen_ln = line( x_axis( 1:np, 1 ), data_eigen( 1:np, flopscol ) / nth, ... 'Color',color_eigen, 'LineStyle',lines_eigen, ... 'LineWidth',linesize ); -open_ln = line( x_axis( :, 1 ), data_open( :, flopscol ) / nth, ... +open_ln = line( x_axis( 1:np, 1 ), data_open( 1:np, flopscol ) / nth, ... 'Color',color_open, 'LineStyle',lines_open, ... 'LineWidth',linesize ); -bfeo_ln = line( x_axis( :, 1 ), data_bfeo( :, flopscol ) / nth, ... +bfeo_ln = line( x_axis( 1:np, 1 ), data_bfeo( 1:np, flopscol ) / nth, ... 'Color',color_bfeo, 'LineStyle',lines_bfeo, ... 'LineWidth',linesize ); -vend_ln = line( x_axis( :, 1 ), data_vend( :, flopscol ) / nth, ... +if has_xsmm == 1 +xsmm_ln = line( x_axis( 1:np, 1 ), data_xsmm( 1:np, flopscol ) / nth, ... + 'Color',color_xsmm, 'LineStyle',lines_xsmm, ... + 'LineWidth',linesize ); +else +xsmm_ln = line( nan, nan, ... + 'Color',color_xsmm, 'LineStyle',lines_xsmm, ... + 'LineWidth',linesize ); +end +vend_ln = line( x_axis( 1:np, 1 ), data_vend( 1:np, flopscol ) / nth, ... 'Color',color_vend, 'LineStyle',lines_vend, ... 'LineWidth',linesize ); else @@ -139,6 +165,9 @@ open_ln = line( nan, nan, ... bfeo_ln = line( nan, nan, ... 'Color',color_bfeo, 'LineStyle',lines_bfeo, ... 'LineWidth',linesize ); +xsmm_ln = line( nan, nan, ... + 'Color',color_xsmm, 'LineStyle',lines_xsmm, ... + 'LineWidth',linesize ); vend_ln = line( nan, nan, ... 'Color',color_vend, 'LineStyle',lines_vend, ... 'LineWidth',linesize ); @@ -169,40 +198,72 @@ elseif 500 <= x_end && x_end < 1000 end if show_plot == 1 || theid == legend_plot_id -if rows == 4 && cols == 7 if nth == 1 && theid == legend_plot_id - leg = legend( ... - [ ... - blissup_ln ... - blislpab_ln ... - eigen_ln ... - open_ln ... - bfeo_ln ... - vend_ln ... - ], ... - blissup_legend, ... - blislpab_legend, ... - eigen_legend, ... - open_legend, ... - bfeo_legend, ... - vend_legend, ... 
- 'Location', legend_loc ); + if has_xsmm == 1 + leg = legend( ... + [ ... + blissup_ln ... + blislpab_ln ... + eigen_ln ... + open_ln ... + bfeo_ln ... + xsmm_ln ... + vend_ln ... + ], ... + blissup_legend, ... + blislpab_legend, ... + eigen_legend, ... + open_legend, ... + bfeo_legend, ... + xsmm_legend, ... + vend_legend, ... + 'Location', legend_loc ); + set( leg,'Box','off' ); + set( leg,'Color','none' ); + set( leg,'Units','inches' ); + if impl == 'octave' + set( leg,'FontSize',fontsize ); + set( leg,'Position',[11.92 6.54 1.15 0.7 ] ); % (1,4tl) + else + set( leg,'FontSize',fontsize-3 ); + set( leg,'Position',[18.20 10.20 1.15 0.7 ] ); % (1,4tl) + end + else + leg = legend( ... + [ ... + blissup_ln ... + blislpab_ln ... + eigen_ln ... + open_ln ... + bfeo_ln ... + vend_ln ... + ], ... + blissup_legend, ... + blislpab_legend, ... + eigen_legend, ... + open_legend, ... + bfeo_legend, ... + vend_legend, ... + 'Location', legend_loc ); + set( leg,'Box','off' ); + set( leg,'Color','none' ); + set( leg,'Units','inches' ); + if impl == 'octave' + set( leg,'FontSize',fontsize ); + set( leg,'Position',[11.92 6.54 1.15 0.7 ] ); % (1,4tl) + else + set( leg,'FontSize',fontsize-1 ); + set( leg,'Position',[18.24 10.15 1.15 0.7 ] ); % (1,4tl) + end + end set( leg,'Box','off' ); set( leg,'Color','none' ); set( leg,'Units','inches' ); % xpos ypos %set( leg,'Position',[11.32 6.36 1.15 0.7 ] ); % (1,4tl) - if impl == 'octave' - set( leg,'FontSize',fontsize ); - set( leg,'Position',[11.92 6.54 1.15 0.7 ] ); % (1,4tl) - else - set( leg,'FontSize',fontsize-1 ); - set( leg,'Position',[18.24 10.15 1.15 0.7 ] ); % (1,4tl) - end elseif nth > 1 && theid == legend_plot_id end end -end set( ax1,'FontSize',fontsize ); set( ax1,'TitleFontSizeMultiplier',1.0 ); % default is 1.1. 
diff --git a/test/sup/octave/plot_panel_trxsh.m b/test/sup/octave/plot_panel_trxsh.m index e5d282bc8c8b01592b03b1ba4aa4822150d8bae3..ebc216e3b9ae97101c0b295fae21284f3f7bf4d3 100644 --- a/test/sup/octave/plot_panel_trxsh.m +++ b/test/sup/octave/plot_panel_trxsh.m @@ -23,6 +23,7 @@ filetemp_blislpab = '%s/output_%s_%s_blislpab.m'; filetemp_eigen = '%s/output_%s_%s_eigen.m'; filetemp_open = '%s/output_%s_%s_openblas.m'; filetemp_bfeo = '%s/output_%s_%s_blasfeo.m'; +filetemp_xsmm = '%s/output_%s_%s_libxsmm.m'; filetemp_vend = '%s/output_%s_%s_vendor.m'; % Create a variable name "template" for the variables contained in the @@ -83,15 +84,10 @@ for opi = 1:n_opsupnames % Load the data files. %str = sprintf( ' Loading %s', file_blissup ); disp(str); run( file_blissup ) - %str = sprintf( ' Loading %s', file_blislpab ); disp(str); run( file_blislpab ) - %str = sprintf( ' Loading %s', file_eigen ); disp(str); run( file_eigen ) - %str = sprintf( ' Loading %s', file_open ); disp(str); run( file_open ) - %str = sprintf( ' Loading %s', file_open ); disp(str); run( file_bfeo ) - %str = sprintf( ' Loading %s', file_vend ); disp(str); run( file_vend ) % Construct variable names for the variables in the data files. @@ -111,11 +107,25 @@ for opi = 1:n_opsupnames data_bfeo = eval( var_bfeo ); % e.g. data_st_dgemm_blasfeo( :, : ); data_vend = eval( var_vend ); % e.g. data_st_dgemm_vendor( :, : ); + if stor_str == 'ccc' + % Only read xsmm data for the column storage case, since that's the + % only format that libxsmm supports. + file_xsmm = sprintf( filetemp_xsmm, dirpath, thr_str, opsupname ); + run( file_xsmm ) + var_xsmm = sprintf( vartemp, thr_str, opname, 'libxsmm' ); + data_xsmm = eval( var_xsmm ); % e.g. data_st_dgemm_libxsmm( :, : ); + else + % Set the data variable to zeros using the same dimensions as the other + % variables. + data_xsmm = zeros( size( data_blissup, 1 ), ... 
+ size( data_blissup, 2 ) ); + end %str = sprintf( ' Reading %s', var_blissup ); disp(str); %str = sprintf( ' Reading %s', var_blislpab ); disp(str); %str = sprintf( ' Reading %s', var_eigen ); disp(str); %str = sprintf( ' Reading %s', var_open ); disp(str); %str = sprintf( ' Reading %s', var_bfeo ); disp(str); + %str = sprintf( ' Reading %s', var_xsmm ); disp(str); %str = sprintf( ' Reading %s', var_vend ); disp(str); % Plot one result in an m x n grid of plots, via the subplot() @@ -127,6 +137,7 @@ for opi = 1:n_opsupnames data_eigen, ... data_open, ... data_bfeo, ... + data_xsmm, ... data_vend, vend_str, ... nth, ... 4, 7, ... @@ -140,6 +151,7 @@ for opi = 1:n_opsupnames clear data_eigen; clear data_open; clear data_bfeo; + clear data_xsmm; clear data_vend; end diff --git a/test/sup/octave/runme.m b/test/sup/octave/runme.m index 5fd894c2b21e0ac9d9f60629b9987e774ef5e219..a9e053c3ecf3282dd418df89928b1db96406dcea 100644 --- a/test/sup/octave/runme.m +++ b/test/sup/octave/runme.m @@ -1,8 +1,12 @@ +% haswell +plot_panel_trxsh(3.25,16,1,'st','d','ccc',[ 6 8 4 ],'../results/haswell/20190823/4_800_4_mt201','has','MKL','matlab'); close; clear all; +plot_panel_trxsh(3.25,16,1,'st','d','rrr',[ 6 8 4 ],'../results/haswell/20190823/4_800_4_mt201','has','MKL','matlab'); close; clear all; + % kabylake -plot_panel_trxsh(3.6,16,1,'st','d','rrr',[ 6 8 4 ],'../results/kabylake/20190531/4_800_4_mt201_last400','kbl','MKL','matlab'); close; clear all; -plot_panel_trxsh(3.6,16,1,'st','d','ccc',[ 6 8 4 ],'../results/kabylake/20190531/4_800_4_mt201_last400','kbl','MKL','matlab'); close; clear all; +plot_panel_trxsh(3.80,16,1,'st','d','rrr',[ 6 8 4 ],'../results/kabylake/20190823/4_800_4_mt201','kbl','MKL','matlab'); close; clear all; +plot_panel_trxsh(3.80,16,1,'st','d','ccc',[ 6 8 4 ],'../results/kabylake/20190823/4_800_4_mt201','kbl','MKL','matlab'); close; clear all; % epyc -plot_panel_trxsh(3.0,8,1,'st','d','rrr',[ 6 8 4 
],'../results/epyc/20190531/4_800_4_mt256_last400','epyc','MKL','matlab'); close; clear all; -plot_panel_trxsh(3.0,8,1,'st','d','ccc',[ 6 8 4 ],'../results/epyc/20190531/4_800_4_mt256_last400','epyc','MKL','matlab'); close; clear all; +plot_panel_trxsh(3.00, 8,1,'st','d','rrr',[ 6 8 4 ],'../results/epyc/20190826/4_800_4_mt256','epyc','MKL','matlab'); close; clear all; +plot_panel_trxsh(3.00, 8,1,'st','d','ccc',[ 6 8 4 ],'../results/epyc/20190826/4_800_4_mt256','epyc','MKL','matlab'); close; clear all; diff --git a/test/sup/runme.sh b/test/sup/runme.sh index 9646e3ccc55c6e950ab4e23b2f4ab90b4ae12667..48dacfa3a6790b378a994ce5cace39fe03761485 100755 --- a/test/sup/runme.sh +++ b/test/sup/runme.sh @@ -37,12 +37,13 @@ sns="8" sks="4" # Implementations to test. -impls="vendor blissup blislpab openblas eigen" -#impls="vendor openblas eigen" -#impls="blislpab blissup" -#mpls="openblas eigen vendor" -#mpls="eigen" +impls="vendor blissup blislpab openblas eigen libxsmm blasfeo" +#impls="vendor" #impls="blissup" +#impls="blislpab" +#impls="openblas" +#impls="eigen" +#impls="libxsmm" #impls="blasfeo" # Example: test_dgemm_nn_rrc_m6npkp_blissup_st.x @@ -75,6 +76,13 @@ for th in ${threads}; do continue; fi + # Further limit execution of libxsmm to + # ccc storage cases. + if [ "${im:0:7}" = "libxsmm" ] && \ + [ "${st}" != "ccc" ]; then + continue; + fi + # Extract the shape chars for m, n, k. 
chm=${sh:0:1} chn=${sh:1:1} diff --git a/test/sup/test_gemm.c b/test/sup/test_gemm.c index 311e8552afed3aa72dde7b404ca1045ca8ffccd7..7f611b554de77b39949dd05277e7031ebc337168 100644 --- a/test/sup/test_gemm.c +++ b/test/sup/test_gemm.c @@ -152,13 +152,14 @@ int main( int argc, char** argv ) printf( "data_%s_%cgemm_%c%c_%s", THR_STR, dt_ch, transal, transbl, STR ); printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )0, ( unsigned long )0, ( unsigned long )0, 0.0 ); - for ( p = p_begin; p <= p_max; p += p_inc ) + //for ( p = p_begin; p <= p_max; p += p_inc ) + for ( p = p_max; p_begin <= p; p -= p_inc ) { obj_t a, b, c; obj_t c_save; @@ -195,7 +196,7 @@ int main( int argc, char** argv ) bli_obj_set_conjtrans( transa, &a ); bli_obj_set_conjtrans( transb, &b ); - bli_setsc( (2.0/1.0), 0.0, &alpha ); + bli_setsc( (1.0/1.0), 0.0, &alpha ); bli_setsc( (1.0/1.0), 0.0, &beta ); bli_copym( &c, &c_save ); @@ -317,7 +318,11 @@ int main( int argc, char** argv ) float* betap = ( float* )bli_obj_buffer( &beta ); float* cp = ( float* )bli_obj_buffer( &c ); + #ifdef XSMM + libxsmm_sgemm( &f77_transa, + #else sgemm_( &f77_transa, + #endif &f77_transb, &mm, &nn, @@ -342,7 +347,11 @@ int main( int argc, char** argv ) double* betap = ( double* )bli_obj_buffer( &beta ); double* cp = ( double* )bli_obj_buffer( &c ); + #ifdef XSMM + libxsmm_dgemm( &f77_transa, + #else dgemm_( &f77_transa, + #endif &f77_transb, &mm, &nn, @@ -367,7 +376,11 @@ int main( int argc, char** argv ) scomplex* betap = ( scomplex* )bli_obj_buffer( &beta ); scomplex* cp = ( scomplex* )bli_obj_buffer( &c ); + #ifdef XSMM + libxsmm_cgemm( &f77_transa, + #else cgemm_( &f77_transa, + #endif &f77_transb, &mm, &nn, @@ -392,7 +405,11 @@ int main( int argc, char** argv ) dcomplex* betap = ( dcomplex* )bli_obj_buffer( &beta ); dcomplex* cp = ( dcomplex* )bli_obj_buffer( &c ); + #ifdef XSMM + libxsmm_zgemm( 
&f77_transa, + #else zgemm_( &f77_transa, + #endif &f77_transb, &mm, &nn, @@ -545,7 +562,7 @@ int main( int argc, char** argv ) printf( "data_%s_%cgemm_%c%c_%s", THR_STR, dt_ch, transal, transbl, STR ); printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )m, ( unsigned long )n, ( unsigned long )k, gflops ); diff --git a/test/test_axpyv.c b/test/test_axpyv.c index 268e3ea0de56015e1d1ca03f8b6698d0f931d4bf..54a4f61340339c69bc8e1c1fd00f54c0e8dfc9da 100644 --- a/test/test_axpyv.c +++ b/test/test_axpyv.c @@ -96,10 +96,11 @@ int main( int argc, char** argv ) printf( "data_axpyv_%s", BLAS ); #endif printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )0, 0.0 ); - for ( p = p_begin; p <= p_end; p += p_inc ) + //for ( p = p_begin; p <= p_end; p += p_inc ) + for ( p = p_end; p_begin <= p; p -= p_inc ) { if ( n_input < 0 ) n = p * ( dim_t )abs(n_input); @@ -188,7 +189,7 @@ int main( int argc, char** argv ) printf( "data_axpyv_%s", BLAS ); #endif printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )n, gflops ); bli_obj_free( &alpha ); diff --git a/test/test_dotv.c b/test/test_dotv.c index ea0f7e4c580c8b768a80f32297fb1eb44e8ae787..d5bebea5a60679a465cc27fcc957f3be95020867 100644 --- a/test/test_dotv.c +++ b/test/test_dotv.c @@ -93,10 +93,11 @@ int main( int argc, char** argv ) printf( "data_dotv_%s", BLAS ); #endif printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )0, 0.0 ); - for ( p = p_begin; p <= p_end; p += p_inc ) + //for ( p = p_begin; p <= p_end; p += p_inc ) + for ( p = p_end; p_begin <= p; p -= p_inc ) { if ( n_input < 0 ) n = p * ( dim_t )abs(n_input); @@ -172,7 
+173,7 @@ int main( int argc, char** argv ) printf( "data_dotv_%s", BLAS ); #endif printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )n, gflops ); bli_obj_free( &x ); diff --git a/test/test_gemm.c b/test/test_gemm.c index 5d6b6aa9af17e3bb8e28a18227af2bcf1bca162a..042e42c8bd2f0445781c2f41f1520d3265ccc5cd 100644 --- a/test/test_gemm.c +++ b/test/test_gemm.c @@ -105,12 +105,13 @@ int main( int argc, char** argv ) printf( "data_gemm_%s", BLAS ); #endif printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )0, ( unsigned long )0, ( unsigned long )0, 0.0 ); - for ( p = p_begin; p <= p_end; p += p_inc ) + //for ( p = p_begin; p <= p_end; p += p_inc ) + for ( p = p_end; p_begin <= p; p -= p_inc ) { if ( m_input < 0 ) m = p * ( dim_t )abs(m_input); else m = ( dim_t ) m_input; @@ -287,7 +288,7 @@ int main( int argc, char** argv ) printf( "data_gemm_%s", BLAS ); #endif printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )m, ( unsigned long )k, ( unsigned long )n, gflops ); diff --git a/test/test_gemv.c b/test/test_gemv.c index 7d15c3249a67182ab1f4a714e163af724bbf1fa9..5e72d8655726ae365a7617660c10f1247ed52a3b 100644 --- a/test/test_gemv.c +++ b/test/test_gemv.c @@ -88,11 +88,12 @@ int main( int argc, char** argv ) printf( "data_gemv_%s", BLAS ); #endif printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )0, ( unsigned long )0, 0.0 ); - for ( p = p_begin; p <= p_end; p += p_inc ) + //for ( p = p_begin; p <= p_end; p += p_inc ) + for ( p = p_end; p_begin <= p; p -= p_inc ) { if ( m_input < 0 ) m = p * ( dim_t )abs(m_input); @@ -183,7 +184,7 @@ int main( int 
argc, char** argv ) printf( "data_gemv_%s", BLAS ); #endif printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )m, ( unsigned long )n, gflops ); diff --git a/test/test_ger.c b/test/test_ger.c index e3497703e60066f0d11d39d2fd92c715f30fd63d..4e584fb9d5bc43e8aa4077c7ed3400826daf7228 100644 --- a/test/test_ger.c +++ b/test/test_ger.c @@ -88,11 +88,12 @@ int main( int argc, char** argv ) printf( "data_ger_%s", BLAS ); #endif printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )0, ( unsigned long )0, 0.0 ); - for ( p = p_begin; p <= p_end; p += p_inc ) + //for ( p = p_begin; p <= p_end; p += p_inc ) + for ( p = p_end; p_begin <= p; p -= p_inc ) { if ( m_input < 0 ) m = p * ( dim_t )abs(m_input); @@ -176,7 +177,7 @@ int main( int argc, char** argv ) printf( "data_ger_%s", BLAS ); #endif printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )m, ( unsigned long )n, gflops ); diff --git a/test/test_hemm.c b/test/test_hemm.c index 40068c5f95dbedcb705e2105277e066950f3cf63..4f20aaca98c3fa46fbfdfd6d06e703e7d1a05c1d 100644 --- a/test/test_hemm.c +++ b/test/test_hemm.c @@ -106,11 +106,12 @@ int main( int argc, char** argv ) printf( "data_hemm_%s", BLAS ); #endif printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )0, ( unsigned long )0, 0.0 ); - for ( p = p_begin; p <= p_end; p += p_inc ) + //for ( p = p_begin; p <= p_end; p += p_inc ) + for ( p = p_end; p_begin <= p; p -= p_inc ) { if ( m_input < 0 ) m = p * ( dim_t )abs(m_input); else m = ( dim_t ) m_input; @@ -298,7 +299,7 @@ int main( int argc, char** argv ) printf( "data_hemm_%s", BLAS ); #endif printf( "( %2lu, 
1:3 ) = [ %4lu %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )m, ( unsigned long )n, gflops ); diff --git a/test/test_hemv.c b/test/test_hemv.c index 0250d31b8d15be0192cd5a90c34b7ce8a834e868..48227927d3fd2dfe0d6ad8d88aded40fc09496c6 100644 --- a/test/test_hemv.c +++ b/test/test_hemv.c @@ -93,10 +93,11 @@ int main( int argc, char** argv ) printf( "data_hemv_%s", BLAS ); #endif printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )0, 0.0 ); - for ( p = p_begin; p <= p_end; p += p_inc ) + //for ( p = p_begin; p <= p_end; p += p_inc ) + for ( p = p_end; p_begin <= p; p -= p_inc ) { if ( m_input < 0 ) m = p * ( dim_t )abs(m_input); @@ -190,7 +191,7 @@ int main( int argc, char** argv ) printf( "data_hemv_%s", BLAS ); #endif printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )m, gflops ); bli_obj_free( &alpha ); diff --git a/test/test_her.c b/test/test_her.c index 026b91261b3708f02a1150c054f80b58a2f9ad3c..606eb5a21d0b35e8acc1c903b07bb5d10af03d2c 100644 --- a/test/test_her.c +++ b/test/test_her.c @@ -94,10 +94,11 @@ int main( int argc, char** argv ) printf( "data_her_%s", BLAS ); #endif printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )0, 0.0 ); - for ( p = p_begin; p <= p_end; p += p_inc ) + //for ( p = p_begin; p <= p_end; p += p_inc ) + for ( p = p_end; p_begin <= p; p -= p_inc ) { if ( m_input < 0 ) m = p * ( dim_t )abs(m_input); @@ -184,7 +185,7 @@ int main( int argc, char** argv ) printf( "data_her_%s", BLAS ); #endif printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )m, gflops ); 
bli_obj_free( &alpha ); diff --git a/test/test_her2.c b/test/test_her2.c index 7428dde4ec4e0589771fb07726b310469b09b049..5814eee1076eee6633b1bc89b5099dfa93e8729a 100644 --- a/test/test_her2.c +++ b/test/test_her2.c @@ -93,10 +93,11 @@ int main( int argc, char** argv ) printf( "data_her2_%s", BLAS ); #endif printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )0, 0.0 ); - for ( p = p_begin; p <= p_end; p += p_inc ) + //for ( p = p_begin; p <= p_end; p += p_inc ) + for ( p = p_end; p_begin <= p; p -= p_inc ) { if ( m_input < 0 ) m = p * ( dim_t )abs(m_input); @@ -186,7 +187,7 @@ int main( int argc, char** argv ) printf( "data_her2_%s", BLAS ); #endif printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )m, gflops ); bli_obj_free( &alpha ); diff --git a/test/test_her2k.c b/test/test_her2k.c index a73e849554d5abf1c2dd6157590cfdbc094808f9..489b453f74e706e5d84281a92e75d5b5f45079e3 100644 --- a/test/test_her2k.c +++ b/test/test_her2k.c @@ -105,11 +105,12 @@ int main( int argc, char** argv ) printf( "data_her2k_%s", BLAS ); #endif printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )0, ( unsigned long )0, 0.0 ); - for ( p = p_begin; p <= p_end; p += p_inc ) + //for ( p = p_begin; p <= p_end; p += p_inc ) + for ( p = p_end; p_begin <= p; p -= p_inc ) { if ( m_input < 0 ) m = p * ( dim_t )abs(m_input); else m = ( dim_t ) m_input; @@ -287,7 +288,7 @@ int main( int argc, char** argv ) printf( "data_her2k_%s", BLAS ); #endif printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )m, ( unsigned long )k, gflops ); diff --git a/test/test_herk.c b/test/test_herk.c index 
db8f826c9f16b7e7d91e085df1b1012b70c70ceb..8f2adfa3450b9dcb407908af95761166e073b239 100644 --- a/test/test_herk.c +++ b/test/test_herk.c @@ -105,11 +105,12 @@ int main( int argc, char** argv ) printf( "data_herk_%s", BLAS ); #endif printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )0, ( unsigned long )0, 0.0 ); - for ( p = p_begin; p <= p_end; p += p_inc ) + //for ( p = p_begin; p <= p_end; p += p_inc ) + for ( p = p_end; p_begin <= p; p -= p_inc ) { if ( m_input < 0 ) m = p * ( dim_t )abs(m_input); else m = ( dim_t ) m_input; @@ -265,7 +266,7 @@ int main( int argc, char** argv ) printf( "data_herk_%s", BLAS ); #endif printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )m, ( unsigned long )k, gflops ); diff --git a/test/test_trmm.c b/test/test_trmm.c index 214ea32beb50adc2aca3b05dd392efff2362c505..ae867e4620e5abe46d9fc2c21c8095eff2f033cb 100644 --- a/test/test_trmm.c +++ b/test/test_trmm.c @@ -116,11 +116,12 @@ int main( int argc, char** argv ) printf( "data_trmm_%s", BLAS ); #endif printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )0, ( unsigned long )0, 0.0 ); - for ( p = p_begin; p <= p_end; p += p_inc ) + //for ( p = p_begin; p <= p_end; p += p_inc ) + for ( p = p_end; p_begin <= p; p -= p_inc ) { if ( m_input < 0 ) m = p * ( dim_t )abs(m_input); else m = ( dim_t ) m_input; @@ -282,7 +283,7 @@ int main( int argc, char** argv ) printf( "data_trmm_%s", BLAS ); #endif printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )m, ( unsigned long )n, gflops ); diff --git a/test/test_trmv.c b/test/test_trmv.c index 
bd737de9f24dd53cdaec4f7bc31d247349199236..1fa33f3a89dcd0aebdf3b3be8b9a3d77e61f60ac 100644 --- a/test/test_trmv.c +++ b/test/test_trmv.c @@ -90,10 +90,11 @@ int main( int argc, char** argv ) printf( "data_trmv_%s", BLAS ); #endif printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )0, 0.0 ); - for ( p = p_begin; p <= p_end; p += p_inc ) + //for ( p = p_begin; p <= p_end; p += p_inc ) + for ( p = p_end; p_begin <= p; p -= p_inc ) { if ( m_input < 0 ) m = p * ( dim_t )abs(m_input); @@ -176,7 +177,7 @@ int main( int argc, char** argv ) printf( "data_trmv_%s", BLAS ); #endif printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )m, gflops ); bli_obj_free( &alpha ); diff --git a/test/test_trsm.c b/test/test_trsm.c index e5796bad34d77e37b12b51d51bbc4f680e79984b..5be9c965a799e4e90473f70b138af2b56512cc85 100644 --- a/test/test_trsm.c +++ b/test/test_trsm.c @@ -116,11 +116,12 @@ int main( int argc, char** argv ) printf( "data_trsm_%s", BLAS ); #endif printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )0, ( unsigned long )0, 0.0 ); - for ( p = p_begin; p <= p_end; p += p_inc ) + //for ( p = p_begin; p <= p_end; p += p_inc ) + for ( p = p_end; p_begin <= p; p -= p_inc ) { if ( m_input < 0 ) m = p * ( dim_t )abs(m_input); else m = ( dim_t ) m_input; @@ -285,7 +286,7 @@ int main( int argc, char** argv ) printf( "data_trsm_%s", BLAS ); #endif printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )m, ( unsigned long )n, gflops ); diff --git a/test/test_trsv.c b/test/test_trsv.c index 048fe3950d6050a4a8710a02d5d476ec6c16e416..10586a81fa9278bda6fd8ac9e1133e05a6d0d2bd 100644 --- 
a/test/test_trsv.c +++ b/test/test_trsv.c @@ -90,10 +90,11 @@ int main( int argc, char** argv ) printf( "data_trv_%s", BLAS ); #endif printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )0, 0.0 ); - for ( p = p_begin; p <= p_end; p += p_inc ) + //for ( p = p_begin; p <= p_end; p += p_inc ) + for ( p = p_end; p_begin <= p; p -= p_inc ) { if ( m_input < 0 ) m = p * ( dim_t )abs(m_input); @@ -183,7 +184,7 @@ int main( int argc, char** argv ) printf( "data_trsv_%s", BLAS ); #endif printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, + ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )m, gflops ); bli_obj_free( &alpha ); diff --git a/testsuite/src/test_addm.c b/testsuite/src/test_addm.c index 545f9387bd8ff342a8674daa71d51fa448a86423..821c8b55e3d3affedd440834eab786fc8d546312 100644 --- a/testsuite/src/test_addm.c +++ b/testsuite/src/test_addm.c @@ -275,7 +275,7 @@ void libblis_test_addm_check // // is functioning correctly if // - // normfv(y) - sqrt( absqsc( beta + conjx(alpha) ) * m * n ) + // normfm(y) - sqrt( absqsc( beta + conjx(alpha) ) * m * n ) // // is negligible. // diff --git a/testsuite/src/test_axpbyv.c b/testsuite/src/test_axpbyv.c index 24ed4a5ce117fc06de5abf1cf747cbd9ef066ee1..8def7b32d6aaa7fbc4507f839ed4b1c1c5631634 100644 --- a/testsuite/src/test_axpbyv.c +++ b/testsuite/src/test_axpbyv.c @@ -296,7 +296,7 @@ void libblis_test_axpbyv_check // // is functioning correctly if // - // normf( y - ( beta * y_orig + alpha * conjx(x) ) ) + // normfv( y - ( beta * y_orig + alpha * conjx(x) ) ) // // is negligible. 
// diff --git a/testsuite/src/test_axpy2v.c b/testsuite/src/test_axpy2v.c index a834aa6a369dc4d86a10a62e47f919a9eda3747f..c23443ab072406a99b18bf4f89d9a1fc170e9626 100644 --- a/testsuite/src/test_axpy2v.c +++ b/testsuite/src/test_axpy2v.c @@ -314,7 +314,7 @@ void libblis_test_axpy2v_check // // is functioning correctly if // - // normf( z - v ) + // normfv( z - v ) // // is negligible, where v contains z as computed by two calls to axpyv. // diff --git a/testsuite/src/test_axpyf.c b/testsuite/src/test_axpyf.c index 3bd18ca3ef32c6eab08ad065920fba372f03e84f..155e442b0d1e4d7d151de3d03a82625c4c58758e 100644 --- a/testsuite/src/test_axpyf.c +++ b/testsuite/src/test_axpyf.c @@ -319,7 +319,7 @@ void libblis_test_axpyf_check // // is functioning correctly if // - // normf( y - v ) + // normfv( y - v ) // // is negligible, where v contains y as computed by repeated calls to // axpyv. diff --git a/testsuite/src/test_axpym.c b/testsuite/src/test_axpym.c index c79866104eda4877b7945a6c46725888a4494223..0138d822ad456fcfeccd2f3c864ef555709940e1 100644 --- a/testsuite/src/test_axpym.c +++ b/testsuite/src/test_axpym.c @@ -289,7 +289,7 @@ void libblis_test_axpym_check // // is functioning correctly if // - // normf( y - ( y_orig + alpha * conjx(x) ) ) + // normfm( y - ( y_orig + alpha * conjx(x) ) ) // // is negligible. // diff --git a/testsuite/src/test_axpyv.c b/testsuite/src/test_axpyv.c index ff0326fb8c442994cca0785ea93d78ff3560fcbd..89b505f4c1955b074675d22f73467a5419606985 100644 --- a/testsuite/src/test_axpyv.c +++ b/testsuite/src/test_axpyv.c @@ -286,7 +286,7 @@ void libblis_test_axpyv_check // // is functioning correctly if // - // normf( y - ( y_orig + alpha * conjx(x) ) ) + // normfv( y - ( y_orig + alpha * conjx(x) ) ) // // is negligible. 
// diff --git a/testsuite/src/test_dotaxpyv.c b/testsuite/src/test_dotaxpyv.c index cf5563ec95bd2b78919985f2183a498a3f404e77..b9b8d99c33a32c00fe7e08f19198ffefdedb1341 100644 --- a/testsuite/src/test_dotaxpyv.c +++ b/testsuite/src/test_dotaxpyv.c @@ -345,7 +345,7 @@ void libblis_test_dotaxpyv_check // // and // - // normf( z - z_temp ) + // normfv( z - z_temp ) // // are negligible, where rho_temp and z_temp contain rho and z as // computed by dotv and axpyv, respectively. diff --git a/testsuite/src/test_dotv.c b/testsuite/src/test_dotv.c index ff9cd2b59c26d436a3e6c239a0a58e4101417161..0b0404af3c3c1e6bb18f4e1a2e35e3ab34bb5eda 100644 --- a/testsuite/src/test_dotv.c +++ b/testsuite/src/test_dotv.c @@ -278,7 +278,7 @@ void libblis_test_dotv_check // // is functioning correctly if // - // sqrtsc( rho.real ) - normf( x ) + // sqrtsc( rho.real ) - normfv( x ) // // and // diff --git a/testsuite/src/test_dotxaxpyf.c b/testsuite/src/test_dotxaxpyf.c index e85edff171e5950b802346ff609779b6ab4b32d3..80638d11059fa41fd045ca0371216f20eba4a2ee 100644 --- a/testsuite/src/test_dotxaxpyf.c +++ b/testsuite/src/test_dotxaxpyf.c @@ -366,11 +366,11 @@ void libblis_test_dotxaxpyf_check // // is functioning correctly if // - // normf( y - v ) + // normfv( y - v ) // // and // - // normf( z - q ) + // normfv( z - q ) // // are negligible, where v and q contain y and z as computed by repeated // calls to dotxv and axpyv, respectively. diff --git a/testsuite/src/test_dotxf.c b/testsuite/src/test_dotxf.c index d73fd0609e73729228ec298f62a73b6fdf75564e..cac443ac6b3bc6c7ebbf4d16f5d0d437c3d4aaaf 100644 --- a/testsuite/src/test_dotxf.c +++ b/testsuite/src/test_dotxf.c @@ -324,7 +324,7 @@ void libblis_test_dotxf_check // // is functioning correctly if // - // normf( y - v ) + // normfv( y - v ) // // is negligible, where v contains y as computed by repeated calls to // dotxv. 
diff --git a/testsuite/src/test_dotxv.c b/testsuite/src/test_dotxv.c index 76a47a08dccaa9815df96eae1aa3177b9111c5bc..64ab90e02f509e859f9d33baea73c4e502b90a19 100644 --- a/testsuite/src/test_dotxv.c +++ b/testsuite/src/test_dotxv.c @@ -304,7 +304,7 @@ void libblis_test_dotxv_check // // is functioning correctly if // - // sqrtsc( rho.real ) - sqrtsc( alpha ) * normf( x ) + // sqrtsc( rho.real ) - sqrtsc( alpha ) * normfv( x ) // // and // diff --git a/testsuite/src/test_gemm.c b/testsuite/src/test_gemm.c index 80cc010a1942440fb26e36090bb7dcaf0444017a..6dae4301ead38ead4630cc6533a5cce95c8d6763 100644 --- a/testsuite/src/test_gemm.c +++ b/testsuite/src/test_gemm.c @@ -625,7 +625,7 @@ void libblis_test_gemm_check // // is functioning correctly if // - // normf( v - z ) + // normfv( v - z ) // // is negligible, where // diff --git a/testsuite/src/test_gemm_ukr.c b/testsuite/src/test_gemm_ukr.c index 616532491d7ab2884177efde92ab82009b55e26e..2017c70dcaf9cacb82f117be2185ce54dbb60c06 100644 --- a/testsuite/src/test_gemm_ukr.c +++ b/testsuite/src/test_gemm_ukr.c @@ -390,7 +390,7 @@ void libblis_test_gemm_ukr_check // // is functioning correctly if // - // normf( v - z ) + // normfv( v - z ) // // is negligible, where // diff --git a/testsuite/src/test_gemmtrsm_ukr.c b/testsuite/src/test_gemmtrsm_ukr.c index 6d2f028d235c3cc18de4563e5b938fd6a8c2a7ed..20ceac1c612fc47b75da9cb21633efe141788434 100644 --- a/testsuite/src/test_gemmtrsm_ukr.c +++ b/testsuite/src/test_gemmtrsm_ukr.c @@ -465,7 +465,7 @@ void libblis_test_gemmtrsm_ukr_check // // is functioning correctly if // - // normf( v - z ) + // normfv( v - z ) // // is negligible, where // diff --git a/testsuite/src/test_gemv.c b/testsuite/src/test_gemv.c index aa10764b0ad80d9f94917f900ae7c6eeb581ca13..022fd2b561b471a5a13725bbedf002e5e59572be 100644 --- a/testsuite/src/test_gemv.c +++ b/testsuite/src/test_gemv.c @@ -324,7 +324,7 @@ void libblis_test_gemv_check // // is functioning correctly if // - // normf( y - z ) + // normfv( 
y - z ) // // is negligible, where // diff --git a/testsuite/src/test_ger.c b/testsuite/src/test_ger.c index c611c4661459ac0ba5d21dd0d046f45a796612df..672077ec1945f5e89a2157ba3055958b55cee0fc 100644 --- a/testsuite/src/test_ger.c +++ b/testsuite/src/test_ger.c @@ -303,7 +303,7 @@ void libblis_test_ger_check // // is functioning correctly if // - // normf( v - w ) + // normfv( v - w ) // // is negligible, where // diff --git a/testsuite/src/test_hemm.c b/testsuite/src/test_hemm.c index 15a684c3d49d593b13370304d7fbfeca011e2f63..12afa369882bef0c0237aaa2fb315441cb799ff4 100644 --- a/testsuite/src/test_hemm.c +++ b/testsuite/src/test_hemm.c @@ -338,7 +338,7 @@ void libblis_test_hemm_check // // is functioning correctly if // - // normf( v - z ) + // normfv( v - z ) // // is negligible, where // diff --git a/testsuite/src/test_hemv.c b/testsuite/src/test_hemv.c index 17204102e3a90d5268d0a83902758ab2f100215d..0976e0bc856b5643b7ed7ad4b91e07b4944a2d2d 100644 --- a/testsuite/src/test_hemv.c +++ b/testsuite/src/test_hemv.c @@ -322,7 +322,7 @@ void libblis_test_hemv_check // // is functioning correctly if // - // normf( y - v ) + // normfv( y - v ) // // is negligible, where // diff --git a/testsuite/src/test_her.c b/testsuite/src/test_her.c index c5ca4b14d364f27178d0f33426ed4747cbc5a8ea..c545e00a774b7019f0be57c35a6ea8c0e0eeedf6 100644 --- a/testsuite/src/test_her.c +++ b/testsuite/src/test_her.c @@ -301,7 +301,7 @@ void libblis_test_her_check // // is functioning correctly if // - // normf( v - w ) + // normfv( v - w ) // // is negligible, where // diff --git a/testsuite/src/test_her2.c b/testsuite/src/test_her2.c index 896497b4eaa22df161f123acadd1ac0fd33d4e86..d6a8686ade0e2b669b04ee1b958db64b0ae9a289 100644 --- a/testsuite/src/test_her2.c +++ b/testsuite/src/test_her2.c @@ -311,7 +311,7 @@ void libblis_test_her2_check // // is functioning correctly if // - // normf( v - w ) + // normfv( v - w ) // // is negligible, where // diff --git a/testsuite/src/test_her2k.c 
b/testsuite/src/test_her2k.c index 2b692b021cdd778feb10cd7724d6ad8c81afd508..9aed2d968cad19fbf684685bfcedc7a6f1359af1 100644 --- a/testsuite/src/test_her2k.c +++ b/testsuite/src/test_her2k.c @@ -336,7 +336,7 @@ void libblis_test_her2k_check // // is functioning correctly if // - // normf( v - z ) + // normfv( v - z ) // // is negligible, where // diff --git a/testsuite/src/test_herk.c b/testsuite/src/test_herk.c index 5b9e1c353fd3e0651858ddad51910a2a9f95645e..eda56d2aa34b2afd2f30990e373870875698ec70 100644 --- a/testsuite/src/test_herk.c +++ b/testsuite/src/test_herk.c @@ -323,7 +323,7 @@ void libblis_test_herk_check // // is functioning correctly if // - // normf( v - z ) + // normfv( v - z ) // // is negligible, where // diff --git a/testsuite/src/test_normfm.c b/testsuite/src/test_normfm.c index e8882ed54e8997e3194731153419131c51ea2920..7d80f7c172be5202cf9f25788ee5f524fa6ee011 100644 --- a/testsuite/src/test_normfm.c +++ b/testsuite/src/test_normfm.c @@ -259,7 +259,7 @@ void libblis_test_normfm_check // // Under these conditions, we assume that the implementation for // - // norm := normf( x ) + // norm := normfm( x ) // // is functioning correctly if // diff --git a/testsuite/src/test_normfv.c b/testsuite/src/test_normfv.c index 1622a2e8971dfd30c1e44352694a23c6fb8dd8d6..83210f16883c7d0442cc292c45edc2741660e1e6 100644 --- a/testsuite/src/test_normfv.c +++ b/testsuite/src/test_normfv.c @@ -256,7 +256,7 @@ void libblis_test_normfv_check // // Under these conditions, we assume that the implementation for // - // norm := normf( x ) + // norm := normfv( x ) // // is functioning correctly if // diff --git a/testsuite/src/test_scal2m.c b/testsuite/src/test_scal2m.c index d6f29f996cf3dd2a397ef3721095659b47060a62..7ed1ec49b4cfcd2e72b396e728022862ddfb61c0 100644 --- a/testsuite/src/test_scal2m.c +++ b/testsuite/src/test_scal2m.c @@ -288,7 +288,7 @@ void libblis_test_scal2m_check // // is functioning correctly if // - // normf( y - alpha * conjx(x) ) + // normfm( y - alpha 
* conjx(x) ) // // is negligible. // diff --git a/testsuite/src/test_scal2v.c b/testsuite/src/test_scal2v.c index 7a28479dbdc5eaf7edc9ccce42a472c69358d4a0..b5b2a3d6533e0ddf4012e5039d6ebdca97fd9391 100644 --- a/testsuite/src/test_scal2v.c +++ b/testsuite/src/test_scal2v.c @@ -285,7 +285,7 @@ void libblis_test_scal2v_check // // is functioning correctly if // - // normf( y - alpha * conjx(x) ) + // normfv( y - alpha * conjx(x) ) // // is negligible. // diff --git a/testsuite/src/test_scalm.c b/testsuite/src/test_scalm.c index 3e9d5069f6ae0d9ffb01ba715181bf9ab2de1b5f..284e23ab667f2069212c1029ade5351ed2d7f58d 100644 --- a/testsuite/src/test_scalm.c +++ b/testsuite/src/test_scalm.c @@ -280,7 +280,7 @@ void libblis_test_scalm_check // // is functioning correctly if // - // normf( y + -conjbeta(beta) * y_orig ) + // normfm( y + -conjbeta(beta) * y_orig ) // // is negligible. // diff --git a/testsuite/src/test_scalv.c b/testsuite/src/test_scalv.c index ef3b980caeddc8ac973a48840f2209124c7d35e4..61b3f5fbe07da790b4cfedf1f5787b3e523040b0 100644 --- a/testsuite/src/test_scalv.c +++ b/testsuite/src/test_scalv.c @@ -276,7 +276,7 @@ void libblis_test_scalv_check // // is functioning correctly if // - // normf( y + -conjbeta(beta) * y_orig ) + // normfv( y + -conjbeta(beta) * y_orig ) // // is negligible. // diff --git a/testsuite/src/test_subm.c b/testsuite/src/test_subm.c index d28eb280089718a91f7dd02533979b2b729cb9fb..8c6a83831b0fcccf216357c724c8b2a19443f742 100644 --- a/testsuite/src/test_subm.c +++ b/testsuite/src/test_subm.c @@ -275,7 +275,7 @@ void libblis_test_subm_check // // is functioning correctly if // - // normfv(y) - sqrt( absqsc( beta - conjx(alpha) ) * m * n ) + // normfm(y) - sqrt( absqsc( beta - conjx(alpha) ) * m * n ) // // is negligible. 
// diff --git a/testsuite/src/test_symm.c b/testsuite/src/test_symm.c index 690594a39e053b556891c682462ccf20ab67c39b..e36147251af2ebda967672492b26b5b187607601 100644 --- a/testsuite/src/test_symm.c +++ b/testsuite/src/test_symm.c @@ -338,7 +338,7 @@ void libblis_test_symm_check // // is functioning correctly if // - // normf( v - z ) + // normfv( v - z ) // // is negligible, where // diff --git a/testsuite/src/test_symv.c b/testsuite/src/test_symv.c index c654685dfdfd080e1d27331ddb85ef15e173c946..a1f9141429e3605759ab0442fb5d27e4dfb37316 100644 --- a/testsuite/src/test_symv.c +++ b/testsuite/src/test_symv.c @@ -322,7 +322,7 @@ void libblis_test_symv_check // // is functioning correctly if // - // normf( y - v ) + // normfv( y - v ) // // is negligible, where // diff --git a/testsuite/src/test_syr.c b/testsuite/src/test_syr.c index efdc67b842722265f4d4ffb9ab43d8009679eaed..f328d061b4ed63d695a45b991733cb69a37ec74f 100644 --- a/testsuite/src/test_syr.c +++ b/testsuite/src/test_syr.c @@ -301,7 +301,7 @@ void libblis_test_syr_check // // is functioning correctly if // - // normf( v - w ) + // normfv( v - w ) // // is negligible, where // diff --git a/testsuite/src/test_syr2.c b/testsuite/src/test_syr2.c index e87cd13e5259512d049dac8335ed4034d84e49f4..e79bfeca6e35bfd983395487db5938ea5d555b79 100644 --- a/testsuite/src/test_syr2.c +++ b/testsuite/src/test_syr2.c @@ -313,7 +313,7 @@ void libblis_test_syr2_check // // is functioning correctly if // - // normf( v - w ) + // normfv( v - w ) // // is negligible, where // diff --git a/testsuite/src/test_syr2k.c b/testsuite/src/test_syr2k.c index 0283135b46f9b54b6fa43d37b9c1c35488aaea33..e1346692dd716c0d60abe26652e733d992b62eff 100644 --- a/testsuite/src/test_syr2k.c +++ b/testsuite/src/test_syr2k.c @@ -335,7 +335,7 @@ void libblis_test_syr2k_check // // is functioning correctly if // - // normf( v - z ) + // normfv( v - z ) // // is negligible, where // diff --git a/testsuite/src/test_syrk.c b/testsuite/src/test_syrk.c index 
86c5864e33fc6e6e3336800b0d8c56aec9303dba..d6ca4b3bdc10a9f0af6419a9ae2b685894fc0fad 100644 --- a/testsuite/src/test_syrk.c +++ b/testsuite/src/test_syrk.c @@ -324,7 +324,7 @@ void libblis_test_syrk_check // // is functioning correctly if // - // normf( v - z ) + // normfv( v - z ) // // is negligible, where // diff --git a/testsuite/src/test_trmm.c b/testsuite/src/test_trmm.c index 1341d2b71258ca3733eeeb9243f96ddaed497824..be6bb941e92694639c4fdeb52fbd5f8ea0b5418e 100644 --- a/testsuite/src/test_trmm.c +++ b/testsuite/src/test_trmm.c @@ -320,7 +320,7 @@ void libblis_test_trmm_check // // is functioning correctly if // - // normf( v - z ) + // normfv( v - z ) // // is negligible, where // diff --git a/testsuite/src/test_trmm3.c b/testsuite/src/test_trmm3.c index 5b9392f5f26e8e024753c53ca6fd351748814cec..ba9431a0b4e43af8d89906e22cc2326c1467c2ab 100644 --- a/testsuite/src/test_trmm3.c +++ b/testsuite/src/test_trmm3.c @@ -339,7 +339,7 @@ void libblis_test_trmm3_check // // is functioning correctly if // - // normf( v - z ) + // normfv( v - z ) // // is negligible, where // diff --git a/testsuite/src/test_trmv.c b/testsuite/src/test_trmv.c index cd1b130cf1690496cd510a08fc86eaf003c9d962..b4b2f386d1a81f3076e5479a9a3ef02aa2810fb8 100644 --- a/testsuite/src/test_trmv.c +++ b/testsuite/src/test_trmv.c @@ -304,7 +304,7 @@ void libblis_test_trmv_check // // is functioning correctly if // - // normf( y - x ) + // normfv( y - x ) // // is negligible, where // diff --git a/testsuite/src/test_trsm.c b/testsuite/src/test_trsm.c index 23f17b0876e182014efc1121b733670ab79b2167..fa50bf790e7d08f4e6f8df3a19910f1b9b21ca71 100644 --- a/testsuite/src/test_trsm.c +++ b/testsuite/src/test_trsm.c @@ -327,7 +327,7 @@ void libblis_test_trsm_check // // is functioning correctly if // - // normf( v - z ) + // normfv( v - z ) // // is negligible, where // diff --git a/testsuite/src/test_trsm_ukr.c b/testsuite/src/test_trsm_ukr.c index 
5476e1daf24fb4169d84c894246e52aad19e96ae..7d3df41c9ab23622dac962a23777da68a8d9de62 100644 --- a/testsuite/src/test_trsm_ukr.c +++ b/testsuite/src/test_trsm_ukr.c @@ -401,7 +401,7 @@ void libblis_test_trsm_ukr_check // // is functioning correctly if // - // normf( v - z ) + // normfv( v - z ) // // is negligible, where // diff --git a/testsuite/src/test_trsv.c b/testsuite/src/test_trsv.c index cb3138c9201c36d9a2ddd668a8d49aa8b1770c1e..b05f7ab975084eb775630e233e62a336aa06e5ec 100644 --- a/testsuite/src/test_trsv.c +++ b/testsuite/src/test_trsv.c @@ -305,7 +305,7 @@ void libblis_test_trsv_check // // is functioning correctly if // - // normf( y - x_orig ) + // normfv( y - x_orig ) // // is negligible, where // diff --git a/testsuite/src/test_xpbym.c b/testsuite/src/test_xpbym.c index b7acc654ef7b787374a6afccf7e3eb3f3278475a..2340b4e11f3c2fa8cbe23095f721b44ce0d99e6f 100644 --- a/testsuite/src/test_xpbym.c +++ b/testsuite/src/test_xpbym.c @@ -288,7 +288,7 @@ void libblis_test_xpbym_check // // is functioning correctly if // - // normf( y - ( beta * y_orig + conjx(x) ) ) + // normfm( y - ( beta * y_orig + conjx(x) ) ) // // is negligible. // diff --git a/testsuite/src/test_xpbyv.c b/testsuite/src/test_xpbyv.c index fa0abdb828e56a21f4c2391aed8290ec14c7dabb..75ad98f6fe2183dfed79b90d2c62395295ae19dd 100644 --- a/testsuite/src/test_xpbyv.c +++ b/testsuite/src/test_xpbyv.c @@ -283,7 +283,7 @@ void libblis_test_xpbyv_check // // is functioning correctly if // - // normf( y - ( beta * y_orig + conjx(x) ) ) + // normfv( y - ( beta * y_orig + conjx(x) ) ) // // is negligible. 
// diff --git a/travis/do_sde.sh b/travis/do_sde.sh index 6ec9febe5d27a4c6660ca36f548acd93ef64dd11..9bf601034def281b5fd3d54e7901b903cd751d05 100755 --- a/travis/do_sde.sh +++ b/travis/do_sde.sh @@ -7,9 +7,12 @@ SDE_VERSION=sde-external-8.16.0-2018-01-30-lin SDE_TARBALL=$SDE_VERSION.tar.bz2 SDE=$SDE_VERSION/sde64 -set +x -curl -s -X POST https://content.dropboxapi.com/2/files/download -H "Authorization: Bearer $DROPBOX_TOKEN" -H "Dropbox-API-Arg: {\"path\": \"/$SDE_TARBALL\"}" > $SDE_TARBALL -set -x +curl --verbose --form accept_license=1 --form form_id=intel_licensed_dls_step_1 \ + --output /dev/null --cookie-jar jar.txt \ + --location https://software.intel.com/protected-download/267266/144917 +curl --verbose --cookie jar.txt --output $SDE_TARBALL \ + https://software.intel.com/system/files/managed/2a/1a/$SDE_TARBALL + tar xvf $SDE_TARBALL make -j2 testsuite-bin