diff --git a/.travis.yml b/.travis.yml
index dbe3c41d8162238be99f241942af3c4cd0c9f979..bbae9a7d9f8581c57c983517b23bcdd984f9a129 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,9 +1,6 @@
 language: c
 sudo: required
 dist: trusty
-env:
-  global:
-      secure: "Ty3PM1xGhXwxfJG6YyY9bUZyXzw98ekHxQEqU9VnrMXTZb28IxfocPCXHjL34r9HTGosO5Pmierhal1Cs3ZKE5ZAJqJhCfck+kwlH21Uay5CNYglDtSmy2qxtbbDG4AxpEZ1UKlIZr1pNh/x+pRemSmnMEnQp/E7QJqdkhm4+aMX2bWKyLPtrdL+B9QXLVT2nT6/Fw3i05aBhpcFJpSPfvYX2KoCZYdJOSKcKci4T8nAfP/c0olkz+jAkBZxZFgO9Ptrt/lvHtVPrkh5o29GvHg2i/4vucbsMltoxlV31/2eYpdr17Ngtt41MMVn2fHV4lVhLmENc04nlm084fBtg73T6b8hNy5JlcA44xI/UrPJsQAJ+0A0ds9BbBQKPxOmaF/O8WGXhwiwdKT6DGS9lj05f3S+yZfeNE3pQhLEcvwXLO5SW3VvKXMj0t/lZyG+XCkvFjD7KEPQV4g+BZc2zzD9TwDx3ydn8Uzd6zZlq1erQUzCnODP24wuwfrNP8nqxFYG0VtI8oZW62IC9U2hcnAF5QNXXW3yDYD65k3BHbigfI28gu9iO9G8RxOglR27J7Whdqkqw3AMRaqyHt2tdbz7tM2dLZ0EatT5m8esjC+LP4EshW9C59jP2U9vJ/94YEgOfwiqk8+e6fL/7dJvOumbwu1RclRI9DS88PPYb3Q="
 matrix:
   include:
   # full testsuite (all tests except for mixed datatype)
@@ -80,4 +77,4 @@ script:
 - $CC --version
 - make -j 2
 - if [ "$TEST" != "0" ]; then travis_wait 30 $DIST_PATH/travis/do_testsuite.sh; fi
-- if [ $SDE -eq 1 ] && [ "$TRAVIS_PULL_REQUEST" = "false" ] ; then travis_wait 30 $DIST_PATH/travis/do_sde.sh; fi
+- if [ "$SDE" = "1" ]; then travis_wait 30 $DIST_PATH/travis/do_sde.sh; fi
diff --git a/README.md b/README.md
index 317c80d00034614daaf8e5205dddcc7a2c128fac..60ac20b2c232e0ac637a18ebd9f3da7ae71e11d9 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,7 @@ Contents
 --------
 
 * **[Introduction](#introduction)**
+* **[Education and Learning](#education-and-learning)**
 * **[What's New](#whats-new)**
 * **[What People Are Saying About BLIS](#what-people-are-saying-about-blis)**
 * **[Key Features](#key-features)**
@@ -76,6 +77,17 @@ and [collaborators](http://shpc.ices.utexas.edu/collaborators.html),
 [publications](http://shpc.ices.utexas.edu/publications.html),
 and [other educational projects](http://www.ulaff.net/) (such as MOOCs).
 
+Education and Learning
+----------------------
+
+Want to understand what's under the hood?
+Many of the same concepts and principles employed when developing BLIS are
+introduced and taught in a basic pedagogical setting as part of
+[LAFF-On Programming for High Performance (LAFF-On-PfHP)](http://www.ulaff.net/),
+one of several massive open online courses (MOOCs) in the
+[Linear Algebra: Foundations to Frontiers](http://www.ulaff.net/) series,
+all of which are available for free via the [edX platform](http://www.edx.org/).
+
 What's New
 ----------
 
diff --git a/build/config.mk.in b/build/config.mk.in
index 0516ec97baebef73f1f93051b69db01859ca6b38..34f1931a4cf9f7c42d8bfcf6c34d4011f69579c0 100644
--- a/build/config.mk.in
+++ b/build/config.mk.in
@@ -89,6 +89,10 @@ endif
 CC_VENDOR         := @CC_VENDOR@
 CC                := @CC@
 
+# Flags (yes/no, set by configure) for important C compiler version ranges.
+GCC_OT_4_9_0      := @gcc_older_than_4_9_0@
+GCC_OT_6_1_0      := @gcc_older_than_6_1_0@
+
 # The C++ compiler. NOTE: A C++ is typically not needed.
 CXX               := @CXX@
 
diff --git a/config/amd64/make_defs.mk b/config/amd64/make_defs.mk
index 70c0b692b494864b2e3a00fd19a73adb6b9ae4e8..df7cd20b79277ff2e140e9d2150f4e256277e57a 100644
--- a/config/amd64/make_defs.mk
+++ b/config/amd64/make_defs.mk
@@ -75,10 +75,14 @@ endif
 # Flags specific to reference kernels.
 CROPTFLAGS     := $(CKOPTFLAGS)
 ifeq ($(CC_VENDOR),gcc)
-CRVECFLAGS     := $(CKVECFLAGS)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
+ifeq ($(CC_VENDOR),clang)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
 else
 CRVECFLAGS     := $(CKVECFLAGS)
 endif
+endif
 
 # Store all of the variables here to new variables containing the
 # configuration name.
diff --git a/config/arm32/make_defs.mk b/config/arm32/make_defs.mk
index b592851e527569b4a1373a66d24df565bf3f1e41..0b517a1efde1bef71316fc5d48bd3b4547d94c7b 100644
--- a/config/arm32/make_defs.mk
+++ b/config/arm32/make_defs.mk
@@ -70,7 +70,15 @@ endif
 
 # Flags specific to reference kernels.
 CROPTFLAGS     := $(CKOPTFLAGS)
+ifeq ($(CC_VENDOR),gcc)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
+ifeq ($(CC_VENDOR),clang)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
 CRVECFLAGS     := $(CKVECFLAGS)
+endif
+endif
 
 # Store all of the variables here to new variables containing the
 # configuration name.
diff --git a/config/arm64/make_defs.mk b/config/arm64/make_defs.mk
index ac1cd697398f4b393ab2a2f5b82a8aa0fad65f8f..5ffb0815ad9db2ee095ee5cee56415cc28babc1f 100644
--- a/config/arm64/make_defs.mk
+++ b/config/arm64/make_defs.mk
@@ -70,7 +70,15 @@ endif
 
 # Flags specific to reference kernels.
 CROPTFLAGS     := $(CKOPTFLAGS)
+ifeq ($(CC_VENDOR),gcc)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
+ifeq ($(CC_VENDOR),clang)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
 CRVECFLAGS     := $(CKVECFLAGS)
+endif
+endif
 
 # Store all of the variables here to new variables containing the
 # configuration name.
diff --git a/config/bgq/make_defs.mk b/config/bgq/make_defs.mk
index a577a9a32cbfb316424f87304fe657c18bb11315..97ea5a5ac6c3f5e350f7ff78af0b4856f59d49f4 100644
--- a/config/bgq/make_defs.mk
+++ b/config/bgq/make_defs.mk
@@ -79,7 +79,15 @@ endif
 
 # Flags specific to reference kernels.
 CROPTFLAGS     := $(CKOPTFLAGS)
+ifeq ($(CC_VENDOR),gcc)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
+ifeq ($(CC_VENDOR),clang)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
 CRVECFLAGS     := $(CKVECFLAGS)
+endif
+endif
 
 # Override the default value for LDFLAGS.
 ifeq ($(CC_VENDOR),ibm)
diff --git a/config/bulldozer/make_defs.mk b/config/bulldozer/make_defs.mk
index dec89a4c3e49fc40761cd5e0f7a94587fb0e8dd4..8f71da3bfa5b950f37765cd49d9e25fa1cf05f7c 100644
--- a/config/bulldozer/make_defs.mk
+++ b/config/bulldozer/make_defs.mk
@@ -75,10 +75,14 @@ endif
 # Flags specific to reference kernels.
 CROPTFLAGS     := $(CKOPTFLAGS)
 ifeq ($(CC_VENDOR),gcc)
-CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
+ifeq ($(CC_VENDOR),clang)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
 else
 CRVECFLAGS     := $(CKVECFLAGS)
 endif
+endif
 
 # Store all of the variables here to new variables containing the
 # configuration name.
diff --git a/config/cortexa15/make_defs.mk b/config/cortexa15/make_defs.mk
index ee4d301f4baaff9a4404b45db03fbfa5eedf79ce..0cbf304db2bad10bd7e624568d91326ed4bc4065 100644
--- a/config/cortexa15/make_defs.mk
+++ b/config/cortexa15/make_defs.mk
@@ -70,7 +70,15 @@ endif
 
 # Flags specific to reference kernels.
 CROPTFLAGS     := $(CKOPTFLAGS)
+ifeq ($(CC_VENDOR),gcc)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
+ifeq ($(CC_VENDOR),clang)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
 CRVECFLAGS     := $(CKVECFLAGS)
+endif
+endif
 
 # Store all of the variables here to new variables containing the
 # configuration name.
diff --git a/config/cortexa53/make_defs.mk b/config/cortexa53/make_defs.mk
index 9f723bcde3110e8063595240bd1a3c8f86b19e21..3e116cd6eb06504253d1e62f97b91eed5e2c7d69 100644
--- a/config/cortexa53/make_defs.mk
+++ b/config/cortexa53/make_defs.mk
@@ -70,7 +70,15 @@ endif
 
 # Flags specific to reference kernels.
 CROPTFLAGS     := $(CKOPTFLAGS)
+ifeq ($(CC_VENDOR),gcc)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
+ifeq ($(CC_VENDOR),clang)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
 CRVECFLAGS     := $(CKVECFLAGS)
+endif
+endif
 
 # Store all of the variables here to new variables containing the
 # configuration name.
diff --git a/config/cortexa57/make_defs.mk b/config/cortexa57/make_defs.mk
index 23bcf51e6e7c37dabe6c430558945eb86e69ae79..864872bc2730d51bf526d0e6ee1411624fb98f41 100644
--- a/config/cortexa57/make_defs.mk
+++ b/config/cortexa57/make_defs.mk
@@ -70,7 +70,15 @@ endif
 
 # Flags specific to reference kernels.
 CROPTFLAGS     := $(CKOPTFLAGS)
+ifeq ($(CC_VENDOR),gcc)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
+ifeq ($(CC_VENDOR),clang)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
 CRVECFLAGS     := $(CKVECFLAGS)
+endif
+endif
 
 # Store all of the variables here to new variables containing the
 # configuration name.
diff --git a/config/cortexa9/make_defs.mk b/config/cortexa9/make_defs.mk
index 2adc40e307cd7f337d34a9031c3124c1f6764d02..310b75b95b1c2c23fa46c89b00e654cc26de241c 100644
--- a/config/cortexa9/make_defs.mk
+++ b/config/cortexa9/make_defs.mk
@@ -70,7 +70,15 @@ endif
 
 # Flags specific to reference kernels.
 CROPTFLAGS     := $(CKOPTFLAGS)
+ifeq ($(CC_VENDOR),gcc)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
+ifeq ($(CC_VENDOR),clang)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
 CRVECFLAGS     := $(CKVECFLAGS)
+endif
+endif
 
 # Store all of the variables here to new variables containing the
 # configuration name.
diff --git a/config/excavator/make_defs.mk b/config/excavator/make_defs.mk
index deb85c79bdd33fd3306587310043696e236e50c0..ed73d5dc8641d85dbda3d6cc9270f1b5f01bb3c9 100644
--- a/config/excavator/make_defs.mk
+++ b/config/excavator/make_defs.mk
@@ -75,10 +75,14 @@ endif
 # Flags specific to reference kernels.
 CROPTFLAGS     := $(CKOPTFLAGS)
 ifeq ($(CC_VENDOR),gcc)
-CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
+ifeq ($(CC_VENDOR),clang)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
 else
 CRVECFLAGS     := $(CKVECFLAGS)
 endif
+endif
 
 # Store all of the variables here to new variables containing the
 # configuration name.
diff --git a/config/generic/make_defs.mk b/config/generic/make_defs.mk
index 3388291da017677dc636fde06b1e7b200f505d5c..7f934de38e1d1b669349e35720182359a3304ba6 100644
--- a/config/generic/make_defs.mk
+++ b/config/generic/make_defs.mk
@@ -79,10 +79,14 @@ endif
 # Flags specific to reference kernels.
 CROPTFLAGS     := $(CKOPTFLAGS)
 ifeq ($(CC_VENDOR),gcc)
-CRVECFLAGS     := $(CKVECFLAGS)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
+ifeq ($(CC_VENDOR),clang)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
 else
 CRVECFLAGS     := $(CKVECFLAGS)
 endif
+endif
 
 # Store all of the variables here to new variables containing the
 # configuration name.
diff --git a/config/haswell/bli_cntx_init_haswell.c b/config/haswell/bli_cntx_init_haswell.c
index ea839e4ee64bea6ab7e339aea199e742ab4a8541..7f222415a7f88787ae4f922ec52d43da71aed5b4 100644
--- a/config/haswell/bli_cntx_init_haswell.c
+++ b/config/haswell/bli_cntx_init_haswell.c
@@ -123,12 +123,18 @@ void bli_cntx_init_haswell( cntx_t* cntx )
 #if 1
 	bli_blksz_init_easy( &blkszs[ BLIS_MR ],     6,     6,     3,     3 );
 	bli_blksz_init_easy( &blkszs[ BLIS_NR ],    16,     8,     8,     4 );
+	//bli_blksz_init_easy( &blkszs[ BLIS_MC ],  1008,  1008,  1008,  1008 );
+	//bli_blksz_init_easy( &blkszs[ BLIS_MC ],   168,    72,    72,    36 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MC ],   168,    72,    75,   192 );
+	bli_blksz_init_easy( &blkszs[ BLIS_KC ],   256,   256,   256,   256 );
 #else
 	bli_blksz_init_easy( &blkszs[ BLIS_MR ],    16,     8,     8,     4 );
 	bli_blksz_init_easy( &blkszs[ BLIS_NR ],     6,     6,     3,     3 );
-#endif
-	bli_blksz_init_easy( &blkszs[ BLIS_MC ],   144,    72,   144,    72 );
+	//bli_blksz_init_easy( &blkszs[ BLIS_MC ],  1024,  1024,  1024,  1024 );
+	//bli_blksz_init_easy( &blkszs[ BLIS_MC ],   112,    64,    56,    32 );
+	bli_blksz_init_easy( &blkszs[ BLIS_MC ],   112,    72,    56,    44 );
 	bli_blksz_init_easy( &blkszs[ BLIS_KC ],   256,   256,   256,   256 );
+#endif
 	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  4080,  4080,  4080,  4080 );
 	bli_blksz_init_easy( &blkszs[ BLIS_AF ],     8,     8,     8,     8 );
 	bli_blksz_init_easy( &blkszs[ BLIS_DF ],     8,     8,     8,     8 );
diff --git a/config/haswell/make_defs.mk b/config/haswell/make_defs.mk
index f08d5a937e42ea1376a690d954c03a74fc82cbfd..6752dde295871c6f5117f49806ab15f357f563d8 100644
--- a/config/haswell/make_defs.mk
+++ b/config/haswell/make_defs.mk
@@ -63,13 +63,17 @@ endif
 # Flags specific to optimized kernels.
 CKOPTFLAGS     := $(COPTFLAGS)
 ifeq ($(CC_VENDOR),gcc)
+CKVECFLAGS     := -mavx2 -mfma -mfpmath=sse -march=haswell
+ifeq ($(GCC_OT_4_9_0),yes)
+# If gcc is older than 4.9.0, we must use a different label for -march.
 CKVECFLAGS     := -mavx2 -mfma -mfpmath=sse -march=core-avx2
+endif
 else
 ifeq ($(CC_VENDOR),icc)
 CKVECFLAGS     := -xCORE-AVX2
 else
 ifeq ($(CC_VENDOR),clang)
-CKVECFLAGS     := -mavx2 -mfma -mfpmath=sse -march=core-avx2
+CKVECFLAGS     := -mavx2 -mfma -mfpmath=sse -march=haswell
 else
 $(error gcc, icc, or clang is required for this configuration.)
 endif
@@ -79,10 +83,14 @@ endif
 # Flags specific to reference kernels.
 CROPTFLAGS     := $(CKOPTFLAGS)
 ifeq ($(CC_VENDOR),gcc)
-CRVECFLAGS     := $(CKVECFLAGS) #-funsafe-math-optimizations
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
+ifeq ($(CC_VENDOR),clang)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
 else
 CRVECFLAGS     := $(CKVECFLAGS)
 endif
+endif
 
 # Store all of the variables here to new variables containing the
 # configuration name.
diff --git a/config/intel64/make_defs.mk b/config/intel64/make_defs.mk
index af462fdc3f2da32005c6e9b604d68303bf10e61f..f74fb4d70a33a4a7441ac12938ae5edb93c5cc10 100644
--- a/config/intel64/make_defs.mk
+++ b/config/intel64/make_defs.mk
@@ -79,10 +79,14 @@ endif
 # Flags specific to reference kernels.
 CROPTFLAGS     := $(CKOPTFLAGS)
 ifeq ($(CC_VENDOR),gcc)
-CRVECFLAGS     := $(CKVECFLAGS)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
+ifeq ($(CC_VENDOR),clang)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
 else
 CRVECFLAGS     := $(CKVECFLAGS)
 endif
+endif
 
 # Store all of the variables here to new variables containing the
 # configuration name.
diff --git a/config/knc/make_defs.mk b/config/knc/make_defs.mk
index be3c9019d8b7d1f1e1b6655d8aacf67498a02206..d58521969f31777ffd29ecf8c5c694eadcc6d886 100644
--- a/config/knc/make_defs.mk
+++ b/config/knc/make_defs.mk
@@ -71,10 +71,14 @@ endif
 # Flags specific to reference kernels.
 CROPTFLAGS     := $(CKOPTFLAGS)
 ifeq ($(CC_VENDOR),gcc)
-CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
+ifeq ($(CC_VENDOR),clang)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
 else
 CRVECFLAGS     := $(CKVECFLAGS)
 endif
+endif
 
 # Override the default value for LDFLAGS.
 LDFLAGS        := -mmic
diff --git a/config/knl/make_defs.mk b/config/knl/make_defs.mk
index b08cf1e4d5ff348dafd512febf825b0a3b0c8998..aa74df31c55759d7a000d90e84df651d4f9ddc8e 100644
--- a/config/knl/make_defs.mk
+++ b/config/knl/make_defs.mk
@@ -99,13 +99,13 @@ endif
 # Note: We use AVX2 for reference kernels instead of AVX-512.
 CROPTFLAGS     := $(CKOPTFLAGS)
 ifeq ($(CC_VENDOR),gcc)
-CRVECFLAGS     := -march=knl -mno-avx512f -mno-avx512pf -mno-avx512er -mno-avx512cd -funsafe-math-optimizations
+CRVECFLAGS     := -march=knl -mno-avx512f -mno-avx512pf -mno-avx512er -mno-avx512cd -funsafe-math-optimizations -ffp-contract=fast
 else
 ifeq ($(CC_VENDOR),icc)
 CRVECFLAGS     := -xMIC-AVX512
 else
 ifeq ($(CC_VENDOR),clang)
-CRVECFLAGS     := -march=knl -mno-avx512f -mno-avx512pf -mno-avx512er -mno-avx512cd
+CRVECFLAGS     := -march=knl -mno-avx512f -mno-avx512pf -mno-avx512er -mno-avx512cd -funsafe-math-optimizations -ffp-contract=fast
 else
 $(error gcc, icc, or clang is required for this configuration.)
 endif
diff --git a/config/penryn/make_defs.mk b/config/penryn/make_defs.mk
index 41d2d939fcd65f4a663d355b0ee4fe6839d6280e..573382ea256e39e301802bfc203a3100d791c441 100644
--- a/config/penryn/make_defs.mk
+++ b/config/penryn/make_defs.mk
@@ -79,10 +79,14 @@ endif
 # Flags specific to reference kernels.
 CROPTFLAGS     := $(CKOPTFLAGS)
 ifeq ($(CC_VENDOR),gcc)
-CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
+ifeq ($(CC_VENDOR),clang)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
 else
 CRVECFLAGS     := $(CKVECFLAGS)
 endif
+endif
 
 # Store all of the variables here to new variables containing the
 # configuration name.
diff --git a/config/piledriver/make_defs.mk b/config/piledriver/make_defs.mk
index bb23fbecea7eda7be93efb049de1baf1049bd859..8cf3ac5d988321a94aff5df84e6685214d91277c 100644
--- a/config/piledriver/make_defs.mk
+++ b/config/piledriver/make_defs.mk
@@ -75,10 +75,14 @@ endif
 # Flags specific to reference kernels.
 CROPTFLAGS     := $(CKOPTFLAGS)
 ifeq ($(CC_VENDOR),gcc)
-CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
+ifeq ($(CC_VENDOR),clang)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
 else
 CRVECFLAGS     := $(CKVECFLAGS)
 endif
+endif
 
 # Store all of the variables here to new variables containing the
 # configuration name.
diff --git a/config/power7/make_defs.mk b/config/power7/make_defs.mk
index 18f111bf68f2461b0ce844de8454ac7d45ef3d38..9633b4f18396b84a94d3c384ceacb93bfa3d4e68 100644
--- a/config/power7/make_defs.mk
+++ b/config/power7/make_defs.mk
@@ -70,7 +70,15 @@ endif
 
 # Flags specific to reference kernels.
 CROPTFLAGS     := $(CKOPTFLAGS)
+ifeq ($(CC_VENDOR),gcc)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
+ifeq ($(CC_VENDOR),clang)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
 CRVECFLAGS     := $(CKVECFLAGS)
+endif
+endif
 
 # Store all of the variables here to new variables containing the
 # configuration name.
diff --git a/config/power9/make_defs.mk b/config/power9/make_defs.mk
index 3d66f607956d02f6db76da801720feb63ed38cc5..b2c78b16a7981e53b8f52dccbaca0a00c105aabd 100644
--- a/config/power9/make_defs.mk
+++ b/config/power9/make_defs.mk
@@ -70,7 +70,15 @@ endif
 
 # Flags specific to reference kernels.
 CROPTFLAGS     := $(CKOPTFLAGS)
+ifeq ($(CC_VENDOR),gcc)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
+ifeq ($(CC_VENDOR),clang)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
 CRVECFLAGS     := $(CKVECFLAGS)
+endif
+endif
 
 # Store all of the variables here to new variables containing the
 # configuration name.
diff --git a/config/sandybridge/make_defs.mk b/config/sandybridge/make_defs.mk
index ba18e4f3286248d97c84322387e25251246ce534..896cb8993ceb266050b8d1e63cecf05a766638f6 100644
--- a/config/sandybridge/make_defs.mk
+++ b/config/sandybridge/make_defs.mk
@@ -63,13 +63,17 @@ endif
 # Flags specific to optimized kernels.
 CKOPTFLAGS     := $(COPTFLAGS)
 ifeq ($(CC_VENDOR),gcc)
+CKVECFLAGS     := -mavx -mfpmath=sse -march=sandybridge
+ifeq ($(GCC_OT_4_9_0),yes)
+# If gcc is older than 4.9.0, we must use a different label for -march.
 CKVECFLAGS     := -mavx -mfpmath=sse -march=corei7-avx
+endif
 else
 ifeq ($(CC_VENDOR),icc)
 CKVECFLAGS     := -xAVX
 else
 ifeq ($(CC_VENDOR),clang)
-CKVECFLAGS     := -mavx -mfpmath=sse -march=corei7-avx
+CKVECFLAGS     := -mavx -mfpmath=sse -march=sandybridge
 else
 $(error gcc, icc, or clang is required for this configuration.)
 endif
@@ -79,10 +83,14 @@ endif
 # Flags specific to reference kernels.
 CROPTFLAGS     := $(CKOPTFLAGS)
 ifeq ($(CC_VENDOR),gcc)
-CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
+ifeq ($(CC_VENDOR),clang)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
 else
 CRVECFLAGS     := $(CKVECFLAGS)
 endif
+endif
 
 # Store all of the variables here to new variables containing the
 # configuration name.
diff --git a/config/skx/make_defs.mk b/config/skx/make_defs.mk
index 27bea5ef55ea90563412f14b81414307a6784a2a..920b42d98a06a105f28f32a9e6f5229220642d0b 100644
--- a/config/skx/make_defs.mk
+++ b/config/skx/make_defs.mk
@@ -89,13 +89,13 @@ endif
 # to overcome the AVX-512 frequency drop". (Issue #187)
 CROPTFLAGS     := $(CKOPTFLAGS)
 ifeq ($(CC_VENDOR),gcc)
-CRVECFLAGS     := -march=skylake-avx512 -mno-avx512f -mno-avx512vl -mno-avx512bw -mno-avx512dq -mno-avx512cd -funsafe-math-optimizations
+CRVECFLAGS     := -march=skylake-avx512 -mno-avx512f -mno-avx512vl -mno-avx512bw -mno-avx512dq -mno-avx512cd -funsafe-math-optimizations -ffp-contract=fast
 else
 ifeq ($(CC_VENDOR),icc)
 CRVECFLAGS     := -xCORE-AVX2
 else
 ifeq ($(CC_VENDOR),clang)
-CRVECFLAGS     := -march=skylake-avx512 -mno-avx512f -mno-avx512vl -mno-avx512bw -mno-avx512dq -mno-avx512cd
+CRVECFLAGS     := -march=skylake-avx512 -mno-avx512f -mno-avx512vl -mno-avx512bw -mno-avx512dq -mno-avx512cd -funsafe-math-optimizations -ffp-contract=fast
 else
 $(error gcc, icc, or clang is required for this configuration.)
 endif
diff --git a/config/steamroller/make_defs.mk b/config/steamroller/make_defs.mk
index a5b6707041da393715df1f1127f9feea1396ac50..89c76890355a2f976f5e7372af5447d34583f7bb 100644
--- a/config/steamroller/make_defs.mk
+++ b/config/steamroller/make_defs.mk
@@ -75,10 +75,14 @@ endif
 # Flags specific to reference kernels.
 CROPTFLAGS     := $(CKOPTFLAGS)
 ifeq ($(CC_VENDOR),gcc)
-CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
+ifeq ($(CC_VENDOR),clang)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
 else
 CRVECFLAGS     := $(CKVECFLAGS)
 endif
+endif
 
 # Store all of the variables here to new variables containing the
 # configuration name.
diff --git a/config/thunderx2/make_defs.mk b/config/thunderx2/make_defs.mk
index 3227fe242bad2597a93c3695eeac0c49fb362035..820919d9c80cfbabd388df8c51df6d7ef11603bb 100644
--- a/config/thunderx2/make_defs.mk
+++ b/config/thunderx2/make_defs.mk
@@ -70,7 +70,15 @@ endif
 
 # Flags specific to reference kernels.
 CROPTFLAGS     := $(CKOPTFLAGS)
+ifeq ($(CC_VENDOR),gcc)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
+ifeq ($(CC_VENDOR),clang)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
 CRVECFLAGS     := $(CKVECFLAGS)
+endif
+endif
 
 # Store all of the variables here to new variables containing the
 # configuration name.
diff --git a/config/x86_64/make_defs.mk b/config/x86_64/make_defs.mk
index 4d038ff04b2a38164ce556379bc66a3437b55c12..520cd42ac4993ca70f1629ac678cf942129b000d 100644
--- a/config/x86_64/make_defs.mk
+++ b/config/x86_64/make_defs.mk
@@ -79,10 +79,14 @@ endif
 # Flags specific to reference kernels.
 CROPTFLAGS     := $(CKOPTFLAGS)
 ifeq ($(CC_VENDOR),gcc)
-CRVECFLAGS     := $(CKVECFLAGS)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
+ifeq ($(CC_VENDOR),clang)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
 else
 CRVECFLAGS     := $(CKVECFLAGS)
 endif
+endif
 
 # Store all of the variables here to new variables containing the
 # configuration name.
diff --git a/config/zen/make_defs.mk b/config/zen/make_defs.mk
index 0397f60b7cf3cd9cef988ef6b227adb01a9a7e5d..1b9db53713b2d72c852c3b652eb2e71896fc0626 100644
--- a/config/zen/make_defs.mk
+++ b/config/zen/make_defs.mk
@@ -63,11 +63,15 @@ endif
 # Flags specific to optimized kernels.
 CKOPTFLAGS     := $(COPTFLAGS)
 ifeq ($(CC_VENDOR),gcc)
-# gcc 6.0 (clang 4.0) or later:
-#CKVECFLAGS     := -mavx2 -mfpmath=sse -mfma -march=znver1
-# gcc 4.9 (clang 3.5) or later:
-# possibly add zen-specific instructions: -mclzero -madx -mrdseed -mmwaitx -msha -mxsavec -mxsaves -mclflushopt -mpopcnt
+CKVECFLAGS     := -mavx2 -mfpmath=sse -mfma -march=znver1
+ifeq ($(GCC_OT_6_1_0),yes)
+# If gcc is older than 6.1.0, we must use -march=bdver4 and then remove the
+# Bulldozer instruction sets that were omitted from Zen.
+# Additionally, if gcc is 4.9 (clang 3.5?) or newer, we may want to add
+# Zen-specific instructions back into the mix:
+# -mclzero -madx -mrdseed -mmwaitx -msha -mxsavec -mxsaves -mclflushopt -mpopcnt
 CKVECFLAGS     := -mavx2 -mfpmath=sse -mfma -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
+endif
 else
 ifeq ($(CC_VENDOR),clang)
 CKVECFLAGS     := -mavx2 -mfpmath=sse -mfma -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
@@ -79,10 +83,14 @@ endif
 # Flags specific to reference kernels.
 CROPTFLAGS     := $(CKOPTFLAGS)
 ifeq ($(CC_VENDOR),gcc)
-CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+else
+ifeq ($(CC_VENDOR),clang)
+CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
 else
 CRVECFLAGS     := $(CKVECFLAGS)
 endif
+endif
 
 # Store all of the variables here to new variables containing the
 # configuration name.
diff --git a/configure b/configure
index 19489d2e47d41c69a9bb534bd7dd8d265c02c717..bf3f94527913741826806916111b8b758c917d78 100755
--- a/configure
+++ b/configure
@@ -1506,6 +1506,79 @@ check_compiler()
 	fi
 }
 
+check_compiler_version_ranges()
+{
+	# Flag consequential version ranges of the detected C compiler.
+	#
+	# Reads the globals found_cc, cc_vendor, cc_version, cc_major, cc_minor,
+	# and script_name. Sets the globals gcc_older_than_4_9_0 and
+	# gcc_older_than_6_1_0 to 'yes' or 'no'; both stay 'no' for non-gcc.
+	#
+	# range: gcc < 4.9.0 (ie: 4.8.5 or older)
+	# variable: gcc_older_than_4_9_0
+	# comments:
+	#   These older versions of gcc may support microarchitectures such as
+	#   sandybridge, but the '-march=' flag uses a different label syntax.
+	#   In newer versions, '-march=sandybridge' is the preferred syntax [1].
+	#   However, in older versions, the syntax for the same compiler option
+	#   is '-march=corei7-avx' [2].
+	#
+	#   [1] https://gcc.gnu.org/onlinedocs/gcc-4.9.0/gcc/i386-and-x86-64-Options.html#i386-and-x86-64-Options
+	#   [2] https://gcc.gnu.org/onlinedocs/gcc-4.8.5/gcc/i386-and-x86-64-Options.html#i386-and-x86-64-Options
+	#
+	# range: gcc < 6.1.0 (ie: 5.5 or older)
+	# variable: gcc_older_than_6_1_0
+	# comments:
+	#   These older versions of gcc do not explicitly support the Zen (Zen1)
+	#   microarchitecture; the newest microarchitectural value understood by
+	#   these versions is '-march=bdver4' [3]. However, support for them can
+	#   be attained in a roundabout way by starting with the instruction sets
+	#   enabled by '-march=bdver4' and then disabling the instruction sets
+	#   that were removed in the transition from Excavator to Zen, namely:
+	#   FMA4, TBM, XOP, and LWP. Newer versions of gcc support Zen via the
+	#   '-march=znver1' option [4].
+	#
+	#   [3] https://gcc.gnu.org/onlinedocs/gcc-5.5.0/gcc/x86-Options.html#x86-Options
+	#   [4] https://gcc.gnu.org/onlinedocs/gcc-6.1.0/gcc/x86-Options.html#x86-Options
+	#
+
+	local cc
+	cc="${found_cc}"
+
+	gcc_older_than_4_9_0='no'
+	gcc_older_than_6_1_0='no'
+
+	echo "${script_name}: checking ${cc} ${cc_version} against known consequential version ranges."
+
+	# gcc
+	if [ "x${cc_vendor}" = "xgcc" ]; then
+
+		# Check for gcc < 4.9.0 (ie: 4.8.5 or older). Any major version below
+		# 4 is also older than 4.9.0, so the major version is tested first.
+		if [ ${cc_major} -lt 4 ] || \
+		   { [ ${cc_major} -eq 4 ] && [ ${cc_minor} -lt 9 ]; }; then
+			echo "${script_name}: note: found ${cc} version older than 4.9.0."
+			gcc_older_than_4_9_0='yes'
+		fi
+
+		# Check for gcc < 6.1.0 (ie: 5.5 or older).
+		if [ ${cc_major} -lt 6 ]; then
+			echo "${script_name}: note: found ${cc} version older than 6.1."
+			gcc_older_than_6_1_0='yes'
+		fi
+	fi
+
+	# icc
+	if [ "x${cc_vendor}" = "xicc" ]; then
+		:
+	fi
+
+	# clang
+	if [ "x${cc_vendor}" = "xclang" ]; then
+		:
+	fi
+}
+
 check_assembler()
 {
 	local cc asm_dir cflags asm_fp
@@ -2222,9 +2295,11 @@ main()
 
 	# Check the compiler's version. Certain versions of certain compilers
 	# will preclude building certain sub-configurations, which are added
-	# to a blacklist.
+	# to a blacklist. We also make note of certain version ranges that
+	# will be useful to know about later.
 	get_compiler_version
 	check_compiler
+	check_compiler_version_ranges
 
 	# Now check the assembler's ability to assemble code. Older versions
 	# of binutils may not be aware of certain instruction sets. Those
@@ -3017,6 +3092,8 @@ main()
 		| sed -e "s/@is_win@/${is_win}/g" \
 		| sed -e "s/@dist_path@/${dist_path_esc}/g" \
 		| sed -e "s/@CC_VENDOR@/${cc_vendor}/g" \
+		| sed -e "s/@gcc_older_than_4_9_0@/${gcc_older_than_4_9_0}/g" \
+		| sed -e "s/@gcc_older_than_6_1_0@/${gcc_older_than_6_1_0}/g" \
 		| sed -e "s/@CC@/${cc_esc}/g" \
 		| sed -e "s/@CXX@/${cxx_esc}/g" \
 		| sed -e "s/@RANLIB@/${ranlib_esc}/g" \
diff --git a/docs/Performance.md b/docs/Performance.md
index e51028c49a2cf0e137bef112bba0417854dad333..55d0370dafb2ec880aeb0a47f2cf5fbb4aca67d2 100644
--- a/docs/Performance.md
+++ b/docs/Performance.md
@@ -127,7 +127,9 @@ size of interest so that we can better assist you.
   * single-core: 17.6 GFLOPS (double-precision), 35.2 GFLOPS (single-precision)
   * multicore: 17.6 GFLOPS/core (double-precision), 35.2 GFLOPS/core (single-precision)
 * Operating system: Ubuntu 16.04 (Linux kernel 4.15.0)
+* Page size: unknown
 * Compiler: gcc 7.3.0
+* Driver source code directory: `test/3`
 * Results gathered: 14 February 2019
 * Implementations tested:
   * BLIS 075143df (0.5.1-39)
@@ -187,7 +189,9 @@ size of interest so that we can better assist you.
   * single-core: 64 GFLOPS (double-precision), 128 GFLOPS (single-precision)
   * multicore: 64 GFLOPS/core (double-precision), 128 GFLOPS/core (single-precision)
 * Operating system: Ubuntu 18.04 (Linux kernel 4.15.0)
+* Page size: 4096 bytes
 * Compiler: gcc 7.3.0
+* Driver source code directory: `test/3`
 * Results gathered: 6 March 2019, 27 March 2019
 * Implementations tested:
   * BLIS 9f1dbe5 (0.5.1-54)
@@ -204,7 +208,14 @@ size of interest so that we can better assist you.
     * Multithreaded (52 core) execution requested via `export OPENBLAS_NUM_THREADS=52`
   * Eigen 3.3.90
     * Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (March 27, 2019)
-    * Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal).
+    * Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal):
+         ```
+         # These lines added after line 67.
+         check_cxx_compiler_flag("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE)
+         if(COMPILER_SUPPORTS_MARCH_NATIVE)
+           set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
+         endif()
+         ```
     * configured and built BLAS library via `mkdir build; cd build; cmake ..; make blas`
     * The `gemm` implementation was pulled in at compile-time via Eigen headers; other operations were linked to Eigen's BLAS library.
     * Single-threaded (1 core) execution requested via `export OMP_NUM_THREADS=1`
@@ -259,7 +270,9 @@ size of interest so that we can better assist you.
   * single-core: 56 GFLOPS (double-precision), 112 GFLOPS (single-precision)
   * multicore: 49.6 GFLOPS/core (double-precision), 99.2 GFLOPS/core (single-precision)
 * Operating system: Cray Linux Environment 6 (Linux kernel 4.4.103)
+* Page size: 4096 bytes
 * Compiler: gcc 6.3.0
+* Driver source code directory: `test/3`
 * Results gathered: 25-26 February 2019, 27 March 2019
 * Implementations tested:
   * BLIS 075143df (0.5.1-39)
@@ -276,7 +289,14 @@ size of interest so that we can better assist you.
     * Multithreaded (24 core) execution requested via `export OPENBLAS_NUM_THREADS=24`
   * Eigen 3.3.90
     * Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (March 27, 2019)
-    * Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal).
+    * Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal):
+         ```
+         # These lines added after line 67.
+         check_cxx_compiler_flag("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE)
+         if(COMPILER_SUPPORTS_MARCH_NATIVE)
+           set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
+         endif()
+         ```
     * configured and built BLAS library via `mkdir build; cd build; cmake ..; make blas`
     * The `gemm` implementation was pulled in at compile-time via Eigen headers; other operations were linked to Eigen's BLAS library.
     * Single-threaded (1 core) execution requested via `export OMP_NUM_THREADS=1`
@@ -329,7 +349,9 @@ size of interest so that we can better assist you.
   * single-core: 24 GFLOPS (double-precision), 48 GFLOPS (single-precision)
   * multicore: 20.4 GFLOPS/core (double-precision), 40.8 GFLOPS/core (single-precision)
 * Operating system: Ubuntu 18.04 (Linux kernel 4.15.0)
+* Page size: 4096 bytes
 * Compiler: gcc 7.3.0
+* Driver source code directory: `test/3`
 * Results gathered: 6 March 2019, 19 March 2019, 27 March 2019
 * Implementations tested:
   * BLIS 9f1dbe5 (0.5.1-54)
@@ -346,7 +368,14 @@ size of interest so that we can better assist you.
     * Multithreaded (64 core) execution requested via `export OPENBLAS_NUM_THREADS=64`
   * Eigen 3.3.90
     * Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (March 27, 2019)
-    * Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal).
+    * Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal):
+         ```
+         # These lines added after line 67.
+         check_cxx_compiler_flag("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE)
+         if(COMPILER_SUPPORTS_MARCH_NATIVE)
+           set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
+         endif()
+         ```
     * configured and built BLAS library via `mkdir build; cd build; cmake ..; make blas`
     * The `gemm` implementation was pulled in at compile-time via Eigen headers; other operations were linked to Eigen's BLAS library.
     * Single-threaded (1 core) execution requested via `export OMP_NUM_THREADS=1`
diff --git a/docs/PerformanceSmall.md b/docs/PerformanceSmall.md
index 52f15487d09e68fd87b6d29f5d3d8946580beb85..2c46130bde4fd9e20c8618ab461415d17ce25c95 100644
--- a/docs/PerformanceSmall.md
+++ b/docs/PerformanceSmall.md
@@ -1,16 +1,19 @@
 # Contents
 
-* **[Contents](Performance.md#contents)**
-* **[Introduction](Performance.md#introduction)**
-* **[General information](Performance.md#general-information)**
-* **[Level-3 performance](Performance.md#level-3-performance)**
-  * **[Kaby Lake](Performance.md#kaby-lake)**
-    * **[Experiment details](Performance.md#kaby-lake-experiment-details)**
-    * **[Results](Performance.md#kaby-lake-results)**
-  * **[Epyc](Performance.md#epyc)**
-    * **[Experiment details](Performance.md#epyc-experiment-details)**
-    * **[Results](Performance.md#epyc-results)**
-* **[Feedback](Performance.md#feedback)**
+* **[Contents](PerformanceSmall.md#contents)**
+* **[Introduction](PerformanceSmall.md#introduction)**
+* **[General information](PerformanceSmall.md#general-information)**
+* **[Level-3 performance](PerformanceSmall.md#level-3-performance)**
+  * **[Kaby Lake](PerformanceSmall.md#kaby-lake)**
+    * **[Experiment details](PerformanceSmall.md#kaby-lake-experiment-details)**
+    * **[Results](PerformanceSmall.md#kaby-lake-results)**
+  * **[Haswell](PerformanceSmall.md#haswell)**
+    * **[Experiment details](PerformanceSmall.md#haswell-experiment-details)**
+    * **[Results](PerformanceSmall.md#haswell-results)**
+  * **[Epyc](PerformanceSmall.md#epyc)**
+    * **[Experiment details](PerformanceSmall.md#epyc-experiment-details)**
+    * **[Results](PerformanceSmall.md#epyc-results)**
+* **[Feedback](PerformanceSmall.md#feedback)**
 
 # Introduction
 
@@ -110,25 +113,37 @@ size of interest so that we can better assist you.
 * Max FMA vector IPC: 2
 * Peak performance:
   * single-core: 57.6 GFLOPS (double-precision), 115.2 GFLOPS (single-precision)
-* Operating system: Gentoo Linux (Linux kernel 5.0.7)
-* Compiler: gcc 7.3.0
-* Results gathered: 31 May 2019, 3 June 2019
+* Operating system: Gentoo Linux (Linux kernel 5.2.4)
+* Page size: 4096 bytes
+* Compiler: gcc 8.3.0
+* Driver source code directory: `test/sup`
+* Results gathered: 23-28 August 2019
 * Implementations tested:
-  * BLIS 6bf449c (0.5.2-42)
+  * BLIS 4a0a6e8 (0.6.0-28)
     * configured with `./configure --enable-cblas auto`
     * sub-configuration exercised: `haswell`
-  * OpenBLAS 0.3.6
+  * OpenBLAS 0.3.7
     * configured `Makefile.rule` with `BINARY=64 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=0` (single-threaded)
-  * BLASFEO 75a3dd8
+  * BLASFEO 01f6b7f
     * configured `Makefile.rule` with: `BLAS_API=1 FORTRAN_BLAS_API=1 CBLAS_API=1`.
   * Eigen 3.3.90
-    * Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (30 May 2019)
-    * Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal).
-    * configured and built BLAS library via `mkdir build; cd build; cmake ..; make blas`
+    * Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (28 August 2019)
+    * Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal):
+         ```
+         # These lines added after line 67.
+         check_cxx_compiler_flag("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE)
+         if(COMPILER_SUPPORTS_MARCH_NATIVE)
+           set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
+         endif()
+         ```
+    * configured and built BLAS library via `mkdir build; cd build; CC=gcc cmake ..; make blas`
+    * installed headers via `cmake . -DCMAKE_INSTALL_PREFIX=$HOME/flame/eigen; make install`
     * The `gemm` implementation was pulled in at compile-time via Eigen headers; other operations were linked to Eigen's BLAS library.
     * Requested threading via `export OMP_NUM_THREADS=1` (single-threaded)
-  * MKL 2018 update 4
+  * MKL 2019 update 4
     * Requested threading via `export MKL_NUM_THREADS=1` (single-threaded)
+  * libxsmm 77a295c (1.6.5-6679)
+    * compiled with `make AVX=2`; linked with [netlib BLAS](http://www.netlib.org/blas/) 3.6.0 as the fallback library to better show where libxsmm stops handling the computation internally.
 * Affinity:
   * N/A.
 * Frequency throttling (via `cpupower`):
@@ -137,8 +152,7 @@ size of interest so that we can better assist you.
   * Hardware limits: 800MHz - 3.8GHz
   * Adjusted minimum: 3.7GHz
 * Comments:
-  * For both row- and column-stored matrices, BLIS's new small/skinny matrix implementation is competitive with (or exceeds the performance of) the next highest-performing solution (typically MKL), except for a few cases of where the _k_ dimension is very small. It is likely the case that this shape scenario begs a different kernel approach, since the BLIS microkernel is inherently designed to iterate over many _k_ dimension iterations (which leads them to incur considerable overhead for small values of _k_).
-  * For the classic case of `dgemm_nn` on square matrices, BLIS is the fastest implementation for the problem size range of approximately 80 to 180. BLIS is also competitive in this general range for other transpose parameter combinations (`nt`, `tn`, and `tt`).
+  * libxsmm is highly competitive for very small problems, but quickly gives up once the "large" dimension exceeds about 180-240 (or 64 in the case where all operands are square). Also, libxsmm's `gemm` cannot handle a transposition on matrix A and similarly dispatches the fallback implementation for those cases. libxsmm also does not export CBLAS interfaces, and therefore only appears on the graphs for column-stored matrices.
 
 ### Kaby Lake results
 
@@ -156,6 +170,73 @@ size of interest so that we can better assist you.
 
 ---
 
+## Haswell
+
+### Haswell experiment details
+
+* Location: TACC (Lonestar5)
+* Processor model: Intel Xeon E5-2690 v3 (Haswell)
+* Core topology: two sockets, 12 cores per socket, 24 cores total
+* SMT status: enabled, but not utilized
+* Max clock rate: 3.5GHz (single-core), 3.1GHz (multicore)
+* Max vector register length: 256 bits (AVX2)
+* Max FMA vector IPC: 2
+* Peak performance:
+  * single-core: 56 GFLOPS (double-precision), 112 GFLOPS (single-precision)
+* Operating system: Cray Linux Environment 6 (Linux kernel 4.4.103)
+* Page size: 4096 bytes
+* Compiler: gcc 7.3.0
+* Driver source code directory: `test/sup`
+* Results gathered: 23-28 August 2019
+* Implementations tested:
+  * BLIS 4a0a6e8 (0.6.0-28)
+    * configured with `./configure --enable-cblas auto`
+    * sub-configuration exercised: `haswell`
+  * OpenBLAS 0.3.7
+    * configured `Makefile.rule` with `BINARY=64 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=0` (single-threaded)
+  * BLASFEO 01f6b7f
+    * configured `Makefile.rule` with: `BLAS_API=1 FORTRAN_BLAS_API=1 CBLAS_API=1`.
+  * Eigen 3.3.90
+    * Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (28 August 2019)
+    * Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal):
+         ```
+         # These lines added after line 67.
+         check_cxx_compiler_flag("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE)
+         if(COMPILER_SUPPORTS_MARCH_NATIVE)
+           set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
+         endif()
+         ```
+    * configured and built BLAS library via `mkdir build; cd build; CC=gcc cmake ..; make blas`
+    * installed headers via `cmake . -DCMAKE_INSTALL_PREFIX=$HOME/flame/eigen; make install`
+    * The `gemm` implementation was pulled in at compile-time via Eigen headers; other operations were linked to Eigen's BLAS library.
+    * Requested threading via `export OMP_NUM_THREADS=1` (single-threaded)
+  * MKL 2019 update 4
+    * Requested threading via `export MKL_NUM_THREADS=1` (single-threaded)
+  * libxsmm 77a295c (1.6.5-6679)
+    * compiled with `make AVX=2`; linked with [netlib BLAS](http://www.netlib.org/blas/) 3.6.0 as the fallback library to better show where libxsmm stops handling the computation internally.
+* Affinity:
+  * N/A.
+* Frequency throttling (via `cpupower`):
+  * No changes made.
+* Comments:
+  * libxsmm is highly competitive for very small problems, but quickly gives up once the "large" dimension exceeds about 180-240 (or 64 in the case where all operands are square). Also, libxsmm's `gemm` cannot handle a transposition on matrix A and similarly dispatches the fallback implementation for those cases. libxsmm also does not export CBLAS interfaces, and therefore only appears on the graphs for column-stored matrices.
+
+### Haswell results
+
+#### pdf
+
+* [Haswell row-stored](graphs/sup/dgemm_rrr_has_nt1.pdf)
+* [Haswell column-stored](graphs/sup/dgemm_ccc_has_nt1.pdf)
+
+#### png (inline)
+
+* **Haswell row-stored**
+![row-stored](graphs/sup/dgemm_rrr_has_nt1.png)
+* **Haswell column-stored**
+![column-stored](graphs/sup/dgemm_ccc_has_nt1.png)
+
+---
+
 ## Epyc
 
 ### Epyc experiment details
@@ -171,24 +252,36 @@ size of interest so that we can better assist you.
 * Peak performance:
   * single-core: 24 GFLOPS (double-precision), 48 GFLOPS (single-precision)
 * Operating system: Ubuntu 18.04 (Linux kernel 4.15.0)
-* Compiler: gcc 7.3.0
-* Results gathered: 31 May 2019, 3 June 2019
+* Page size: 4096 bytes
+* Compiler: gcc 7.4.0
+* Driver source code directory: `test/sup`
+* Results gathered: 23-28 August 2019
 * Implementations tested:
-  * BLIS 6bf449c (0.5.2-42)
+  * BLIS 4a0a6e8 (0.6.0-28)
     * configured with `./configure --enable-cblas auto`
     * sub-configuration exercised: `zen`
-  * OpenBLAS 0.3.6
+  * OpenBLAS 0.3.7
     * configured `Makefile.rule` with `BINARY=64 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=0` (single-threaded)
-  * BLASFEO 75a3dd8
+  * BLASFEO 01f6b7f
     * configured `Makefile.rule` with: `BLAS_API=1 FORTRAN_BLAS_API=1 CBLAS_API=1`.
   * Eigen 3.3.90
-    * Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (30 May 2019)
-    * Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal).
-    * configured and built BLAS library via `mkdir build; cd build; cmake ..; make blas`
+    * Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (28 August 2019)
+    * Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal):
+         ```
+         # These lines added after line 67.
+         check_cxx_compiler_flag("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE)
+         if(COMPILER_SUPPORTS_MARCH_NATIVE)
+           set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
+         endif()
+         ```
+    * configured and built BLAS library via `mkdir build; cd build; CC=gcc cmake ..; make blas`
+    * installed headers via `cmake . -DCMAKE_INSTALL_PREFIX=$HOME/flame/eigen; make install`
     * The `gemm` implementation was pulled in at compile-time via Eigen headers; other operations were linked to Eigen's BLAS library.
     * Requested threading via `export OMP_NUM_THREADS=1` (single-threaded)
   * MKL 2019 update 4
     * Requested threading via `export MKL_NUM_THREADS=1` (single-threaded)
+  * libxsmm 77a295c (1.6.5-6679)
+    * compiled with `make AVX=2`; linked with [netlib BLAS](http://www.netlib.org/blas/) 3.6.0 as the fallback library to better show where libxsmm stops handling the computation internally.
 * Affinity:
   * N/A.
 * Frequency throttling (via `cpupower`):
@@ -197,8 +290,7 @@ size of interest so that we can better assist you.
   * Hardware limits: 1.2GHz - 2.0GHz
   * Adjusted minimum: 2.0GHz
 * Comments:
-  * As with Kaby Lake, BLIS's new small/skinny matrix implementation is competitive with (or exceeds the performance of) the next highest-performing solution, except for a few cases of where the _k_ dimension is very small.
-  * For the classic case of `dgemm_nn` on square matrices, BLIS is the fastest implementation for the problem size range of approximately 12 to 256. BLIS is also competitive in this general range for other transpose parameter combinations (`nt`, `tn`, and `tt`).
+  * libxsmm is highly competitive for very small problems, but quickly gives up once the "large" dimension exceeds about 180-240 (or 64 in the case where all operands are square). Also, libxsmm's `gemm` cannot handle a transposition on matrix A and similarly dispatches the fallback implementation for those cases. libxsmm also does not export CBLAS interfaces, and therefore only appears on the graphs for column-stored matrices.
 
 ### Epyc results
 
diff --git a/docs/graphs/sup/dgemm_ccc_epyc_nt1.pdf b/docs/graphs/sup/dgemm_ccc_epyc_nt1.pdf
index 0f6e07e9a0935c847435ce46d3e71bc36e091748..8bba344a7eeb65568cf6cf400a5534c1f8be07af 100644
Binary files a/docs/graphs/sup/dgemm_ccc_epyc_nt1.pdf and b/docs/graphs/sup/dgemm_ccc_epyc_nt1.pdf differ
diff --git a/docs/graphs/sup/dgemm_ccc_epyc_nt1.png b/docs/graphs/sup/dgemm_ccc_epyc_nt1.png
index 71eeb46c7e7fa8fd6b719a0a9e2211904217a778..bf409bf10580b38d2dea36033c8a81f46d7d4805 100644
Binary files a/docs/graphs/sup/dgemm_ccc_epyc_nt1.png and b/docs/graphs/sup/dgemm_ccc_epyc_nt1.png differ
diff --git a/docs/graphs/sup/dgemm_ccc_has_nt1.pdf b/docs/graphs/sup/dgemm_ccc_has_nt1.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..2614c65a54b491c9ce2d1b0f6d3390e3e4be7f13
Binary files /dev/null and b/docs/graphs/sup/dgemm_ccc_has_nt1.pdf differ
diff --git a/docs/graphs/sup/dgemm_ccc_has_nt1.png b/docs/graphs/sup/dgemm_ccc_has_nt1.png
new file mode 100644
index 0000000000000000000000000000000000000000..34ea1eee47b4ec88030096754850b433cc30bfe6
Binary files /dev/null and b/docs/graphs/sup/dgemm_ccc_has_nt1.png differ
diff --git a/docs/graphs/sup/dgemm_ccc_kbl_nt1.pdf b/docs/graphs/sup/dgemm_ccc_kbl_nt1.pdf
index 9ff9de2023b519fe700d62ed661c4891d5c7ce62..43cdcc6872dea36bd8ba905b12b42192f12828fe 100644
Binary files a/docs/graphs/sup/dgemm_ccc_kbl_nt1.pdf and b/docs/graphs/sup/dgemm_ccc_kbl_nt1.pdf differ
diff --git a/docs/graphs/sup/dgemm_ccc_kbl_nt1.png b/docs/graphs/sup/dgemm_ccc_kbl_nt1.png
index 4f99f8f7fca1e6048f515c01ed5219e09ed57a01..fdf45868ae1fa43ed2e5eef678a8c11f1860b14d 100644
Binary files a/docs/graphs/sup/dgemm_ccc_kbl_nt1.png and b/docs/graphs/sup/dgemm_ccc_kbl_nt1.png differ
diff --git a/docs/graphs/sup/dgemm_rrr_epyc_nt1.pdf b/docs/graphs/sup/dgemm_rrr_epyc_nt1.pdf
index f010da9aa60c235e877e65b436114799670d5bc7..f09c9efc9fb8aa47b72cb5ee0871370843184bff 100644
Binary files a/docs/graphs/sup/dgemm_rrr_epyc_nt1.pdf and b/docs/graphs/sup/dgemm_rrr_epyc_nt1.pdf differ
diff --git a/docs/graphs/sup/dgemm_rrr_epyc_nt1.png b/docs/graphs/sup/dgemm_rrr_epyc_nt1.png
index 306bd40b0f7fbc3c7dcca8244346298efa43634e..8add499d80e5c80f95d66be7000130931d903f93 100644
Binary files a/docs/graphs/sup/dgemm_rrr_epyc_nt1.png and b/docs/graphs/sup/dgemm_rrr_epyc_nt1.png differ
diff --git a/docs/graphs/sup/dgemm_rrr_has_nt1.pdf b/docs/graphs/sup/dgemm_rrr_has_nt1.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..e1dc609a04d9f2ef93c72a8a51f127ebf03008e1
Binary files /dev/null and b/docs/graphs/sup/dgemm_rrr_has_nt1.pdf differ
diff --git a/docs/graphs/sup/dgemm_rrr_has_nt1.png b/docs/graphs/sup/dgemm_rrr_has_nt1.png
new file mode 100644
index 0000000000000000000000000000000000000000..c8b47b85ac7b728b283173b43284c7ea9463927c
Binary files /dev/null and b/docs/graphs/sup/dgemm_rrr_has_nt1.png differ
diff --git a/docs/graphs/sup/dgemm_rrr_kbl_nt1.pdf b/docs/graphs/sup/dgemm_rrr_kbl_nt1.pdf
index d104363d113df3a2ce24e47a1cd8a65251d78c78..10b674a22d3af6a5e130e8a3b8703130f431d2b2 100644
Binary files a/docs/graphs/sup/dgemm_rrr_kbl_nt1.pdf and b/docs/graphs/sup/dgemm_rrr_kbl_nt1.pdf differ
diff --git a/docs/graphs/sup/dgemm_rrr_kbl_nt1.png b/docs/graphs/sup/dgemm_rrr_kbl_nt1.png
index dbea1b96dc4ea91257555b129ae545fc10f1e059..310b2aad67a9d198a3c793834a01257a20da1fc6 100644
Binary files a/docs/graphs/sup/dgemm_rrr_kbl_nt1.png and b/docs/graphs/sup/dgemm_rrr_kbl_nt1.png differ
diff --git a/frame/3/bli_l3_thrinfo.c b/frame/3/bli_l3_thrinfo.c
index 1d876d50f1b45487607b299323ed0346ff31890a..4f073cb20ade98cbce44b96eeb036a031d1150bf 100644
--- a/frame/3/bli_l3_thrinfo.c
+++ b/frame/3/bli_l3_thrinfo.c
@@ -99,35 +99,84 @@ void bli_l3_thrinfo_print_gemm_paths
        thrinfo_t** threads
      )
 {
+	// In order to query the number of threads, we query the only thread we
+	// know exists: thread 0.
 	dim_t n_threads = bli_thread_num_threads( threads[0] );
-	dim_t gl_id;
-
-	thrinfo_t* jc_info  = threads[0];
-	thrinfo_t* pc_info  = bli_thrinfo_sub_node( jc_info );
-	thrinfo_t* pb_info  = bli_thrinfo_sub_node( pc_info );
-	thrinfo_t* ic_info  = bli_thrinfo_sub_node( pb_info );
-	thrinfo_t* pa_info  = bli_thrinfo_sub_node( ic_info );
-	thrinfo_t* jr_info  = bli_thrinfo_sub_node( pa_info );
-	thrinfo_t* ir_info  = bli_thrinfo_sub_node( jr_info );
-
-	dim_t jc_way = bli_thread_n_way( jc_info );
-	dim_t pc_way = bli_thread_n_way( pc_info );
-	dim_t pb_way = bli_thread_n_way( pb_info );
-	dim_t ic_way = bli_thread_n_way( ic_info );
-	dim_t pa_way = bli_thread_n_way( pa_info );
-	dim_t jr_way = bli_thread_n_way( jr_info );
-	dim_t ir_way = bli_thread_n_way( ir_info );
-
-	dim_t jc_nt = bli_thread_num_threads( jc_info );
-	dim_t pc_nt = bli_thread_num_threads( pc_info );
-	dim_t pb_nt = bli_thread_num_threads( pb_info );
-	dim_t ic_nt = bli_thread_num_threads( ic_info );
-	dim_t pa_nt = bli_thread_num_threads( pa_info );
-	dim_t jr_nt = bli_thread_num_threads( jr_info );
-	dim_t ir_nt = bli_thread_num_threads( ir_info );
+
+	// For the purposes of printing the "header" information that is common
+	// to the various instances of a thrinfo_t (i.e., across all threads), we
+	// choose the last thread in case the problem is so small that there is
+	// only an "edge" case, which will always be assigned to the last thread
+	// (at least for higher levels of partitioning).
+	thrinfo_t* jc_info  = threads[n_threads-1];
+	thrinfo_t* pc_info  = NULL;
+	thrinfo_t* pb_info  = NULL;
+	thrinfo_t* ic_info  = NULL;
+	thrinfo_t* pa_info  = NULL;
+	thrinfo_t* jr_info  = NULL;
+	thrinfo_t* ir_info  = NULL;
+
+	// Initialize the n_ways and n_threads fields of each thrinfo_t "level"
+	// to -1. More than likely, these will all be overwritten with meaningful
+	// values, but in case some thrinfo_t trees are not fully built (see
+	// next comment), these will be the placeholder values.
+	dim_t jc_way = -1, pc_way = -1, pb_way = -1, ic_way = -1,
+	                   pa_way = -1, jr_way = -1, ir_way = -1;
+
+	dim_t jc_nt = -1,  pc_nt = -1,  pb_nt = -1,  ic_nt = -1,
+	                   pa_nt = -1,  jr_nt = -1,  ir_nt = -1;
+
+	// NOTE: We must check each thrinfo_t pointer for NULLness. Certain threads
+	// may not fully build their thrinfo_t structures--specifically when the
+	// dimension being parallelized is not large enough for each thread to have
+	// even one unit of work (where a unit is usually a single micropanel's
+	// width, MR or NR).
+
+	if ( !jc_info ) goto print_header;
+
+	jc_way  = bli_thread_n_way( jc_info );
+	jc_nt   = bli_thread_num_threads( jc_info );
+	pc_info = bli_thrinfo_sub_node( jc_info );
+
+	if ( !pc_info ) goto print_header;
+
+	pc_way  = bli_thread_n_way( pc_info );
+	pc_nt   = bli_thread_num_threads( pc_info );
+	pb_info = bli_thrinfo_sub_node( pc_info );
+
+	if ( !pb_info ) goto print_header;
+
+	pb_way  = bli_thread_n_way( pb_info );
+	pb_nt   = bli_thread_num_threads( pb_info );
+	ic_info = bli_thrinfo_sub_node( pb_info );
+
+	if ( !ic_info ) goto print_header;
+
+	ic_way  = bli_thread_n_way( ic_info );
+	ic_nt   = bli_thread_num_threads( ic_info );
+	pa_info = bli_thrinfo_sub_node( ic_info );
+
+	if ( !pa_info ) goto print_header;
+
+	pa_way  = bli_thread_n_way( pa_info );
+	pa_nt   = bli_thread_num_threads( pa_info );
+	jr_info = bli_thrinfo_sub_node( pa_info );
+
+	if ( !jr_info ) goto print_header;
+
+	jr_way  = bli_thread_n_way( jr_info );
+	jr_nt   = bli_thread_num_threads( jr_info );
+	ir_info = bli_thrinfo_sub_node( jr_info );
+
+	if ( !ir_info ) goto print_header;
+
+	ir_way  = bli_thread_n_way( ir_info );
+	ir_nt   = bli_thread_num_threads( ir_info );
+
+	print_header:
 
 	printf( "            jc   kc   pb   ic   pa   jr   ir\n" );
-	printf( "xx_nt:    %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n",
+	printf( "xx_nt:    %4ld %4ld %4ld %4ld %4ld %4ld %4ld\n",
 	( unsigned long )jc_nt,
 	( unsigned long )pc_nt,
 	( unsigned long )pb_nt,
@@ -135,7 +184,7 @@ void bli_l3_thrinfo_print_gemm_paths
 	( unsigned long )pa_nt,
 	( unsigned long )jr_nt,
 	( unsigned long )ir_nt );
-	printf( "xx_way:   %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n",
+	printf( "xx_way:   %4ld %4ld %4ld %4ld %4ld %4ld %4ld\n",
     ( unsigned long )jc_way,
 	( unsigned long )pc_way,
 	( unsigned long )pb_way,
@@ -145,116 +194,59 @@ void bli_l3_thrinfo_print_gemm_paths
 	( unsigned long )ir_way );
 	printf( "============================================\n" );
 
-	dim_t jc_comm_id;
-	dim_t pc_comm_id;
-	dim_t pb_comm_id;
-	dim_t ic_comm_id;
-	dim_t pa_comm_id;
-	dim_t jr_comm_id;
-	dim_t ir_comm_id;
-
-	dim_t jc_work_id;
-	dim_t pc_work_id;
-	dim_t pb_work_id;
-	dim_t ic_work_id;
-	dim_t pa_work_id;
-	dim_t jr_work_id;
-	dim_t ir_work_id;
-
-	for ( gl_id = 0; gl_id < n_threads; ++gl_id )
+	for ( dim_t gl_id = 0; gl_id < n_threads; ++gl_id )
 	{
 		jc_info = threads[gl_id];
 
-		// NOTE: We must check each thrinfo_t pointer for NULLness. Certain threads
-		// may not fully build their thrinfo_t structures--specifically when the
-		// dimension being parallelized is not large enough for each thread to have
-		// even one unit of work (where as unit is usually a single micropanel's
-		// width, MR or NR).
-		if ( !jc_info )
-		{
-			jc_comm_id = pc_comm_id = pb_comm_id = ic_comm_id = pa_comm_id = jr_comm_id = ir_comm_id = -1;
-			jc_work_id = pc_work_id = pb_work_id = ic_work_id = pa_work_id = jr_work_id = ir_work_id = -1;
-		}
-		else
-		{
-			jc_comm_id = bli_thread_ocomm_id( jc_info );
-			jc_work_id = bli_thread_work_id( jc_info );
-			pc_info = bli_thrinfo_sub_node( jc_info );
+		dim_t jc_comm_id = -1, pc_comm_id = -1, pb_comm_id = -1, ic_comm_id = -1,
+		                       pa_comm_id = -1, jr_comm_id = -1, ir_comm_id = -1;
 
-			if ( !pc_info )
-			{
-				pc_comm_id = pb_comm_id = ic_comm_id = pa_comm_id = jr_comm_id = ir_comm_id = -1;
-				pc_work_id = pb_work_id = ic_work_id = pa_work_id = jr_work_id = ir_work_id = -1;
-			}
-			else
-			{
-				pc_comm_id = bli_thread_ocomm_id( pc_info );
-				pc_work_id = bli_thread_work_id( pc_info );
-				pb_info = bli_thrinfo_sub_node( pc_info );
+		dim_t jc_work_id = -1, pc_work_id = -1, pb_work_id = -1, ic_work_id = -1,
+		                       pa_work_id = -1, jr_work_id = -1, ir_work_id = -1;
 
-				if ( !pb_info )
-				{
-					pb_comm_id = ic_comm_id = pa_comm_id = jr_comm_id = ir_comm_id = -1;
-					pb_work_id = ic_work_id = pa_work_id = jr_work_id = ir_work_id = -1;
-				}
-				else
-				{
-					pb_comm_id = bli_thread_ocomm_id( pb_info );
-					pb_work_id = bli_thread_work_id( pb_info );
-					ic_info = bli_thrinfo_sub_node( pb_info );
+		if ( !jc_info ) goto print_thrinfo;
 
-					if ( !ic_info )
-					{
-						ic_comm_id = pa_comm_id = jr_comm_id = ir_comm_id = -1;
-						ic_work_id = pa_work_id = jr_work_id = ir_work_id = -1;
-					}
-					else
-					{
-						ic_comm_id = bli_thread_ocomm_id( ic_info );
-						ic_work_id = bli_thread_work_id( ic_info );
-						pa_info = bli_thrinfo_sub_node( ic_info );
+		jc_comm_id = bli_thread_ocomm_id( jc_info );
+		jc_work_id = bli_thread_work_id( jc_info );
+		pc_info    = bli_thrinfo_sub_node( jc_info );
 
-						if ( !pa_info )
-						{
-							pa_comm_id = jr_comm_id = ir_comm_id = -1;
-							pa_work_id = jr_work_id = ir_work_id = -1;
-						}
-						else
-						{
-							pa_comm_id = bli_thread_ocomm_id( pa_info );
-							pa_work_id = bli_thread_work_id( pa_info );
-							jr_info = bli_thrinfo_sub_node( pa_info );
+		if ( !pc_info ) goto print_thrinfo;
 
-							if ( !jr_info )
-							{
-								jr_comm_id = ir_comm_id = -1;
-								jr_work_id = ir_work_id = -1;
-							}
-							else
-							{
-								jr_comm_id = bli_thread_ocomm_id( jr_info );
-								jr_work_id = bli_thread_work_id( jr_info );
-								ir_info = bli_thrinfo_sub_node( jr_info );
+		pc_comm_id = bli_thread_ocomm_id( pc_info );
+		pc_work_id = bli_thread_work_id( pc_info );
+		pb_info    = bli_thrinfo_sub_node( pc_info );
 
-								if ( !ir_info )
-								{
-									ir_comm_id = -1;
-									ir_work_id = -1;
-								}
-								else
-								{
-									ir_comm_id = bli_thread_ocomm_id( ir_info );
-									ir_work_id = bli_thread_work_id( ir_info );
-								}
-							}
-						}
-					}
-				}
-			}
-		}
+		if ( !pb_info ) goto print_thrinfo;
+
+		pb_comm_id = bli_thread_ocomm_id( pb_info );
+		pb_work_id = bli_thread_work_id( pb_info );
+		ic_info    = bli_thrinfo_sub_node( pb_info );
+
+		if ( !ic_info ) goto print_thrinfo;
+
+		ic_comm_id = bli_thread_ocomm_id( ic_info );
+		ic_work_id = bli_thread_work_id( ic_info );
+		pa_info    = bli_thrinfo_sub_node( ic_info );
+
+		if ( !pa_info ) goto print_thrinfo;
+
+		pa_comm_id = bli_thread_ocomm_id( pa_info );
+		pa_work_id = bli_thread_work_id( pa_info );
+		jr_info    = bli_thrinfo_sub_node( pa_info );
+
+		if ( !jr_info ) goto print_thrinfo;
+
+		jr_comm_id = bli_thread_ocomm_id( jr_info );
+		jr_work_id = bli_thread_work_id( jr_info );
+		ir_info    = bli_thrinfo_sub_node( jr_info );
+
+		if ( !ir_info ) goto print_thrinfo;
+
+		ir_comm_id = bli_thread_ocomm_id( ir_info );
+		ir_work_id = bli_thread_work_id( ir_info );
+
+		print_thrinfo:
 
-		//printf( "            gl   jc   pb   kc   pa   ic   jr  \n" );
-		//printf( "            gl   jc   kc   pb   ic   pa   jr  \n" );
 		printf( "comm ids: %4ld %4ld %4ld %4ld %4ld %4ld %4ld\n",
 		( long )jc_comm_id,
 		( long )pc_comm_id,
@@ -285,44 +277,105 @@ void bli_l3_thrinfo_print_trsm_paths
        thrinfo_t** threads
      )
 {
+	// In order to query the number of threads, we query the only thread we
+	// know exists: thread 0.
 	dim_t n_threads = bli_thread_num_threads( threads[0] );
-	dim_t gl_id;
-
-	thrinfo_t* jc_info  = threads[0];
-	thrinfo_t* pc_info  = bli_thrinfo_sub_node( jc_info );
-	thrinfo_t* pb_info  = bli_thrinfo_sub_node( pc_info );
-	thrinfo_t* ic_info  = bli_thrinfo_sub_node( pb_info );
-
-	thrinfo_t* pa_info  = bli_thrinfo_sub_node( ic_info );
-	thrinfo_t* jr_info  = bli_thrinfo_sub_node( pa_info );
-	thrinfo_t* ir_info  = bli_thrinfo_sub_node( jr_info );
-	thrinfo_t* pa_info0 = bli_thrinfo_sub_prenode( ic_info );
-	thrinfo_t* jr_info0 = ( pa_info0 ? bli_thrinfo_sub_node( pa_info0 ) : NULL );
-	thrinfo_t* ir_info0 = ( jr_info0 ? bli_thrinfo_sub_node( jr_info0 ) : NULL );
-
-	dim_t jc_way  = bli_thread_n_way( jc_info );
-	dim_t pc_way  = bli_thread_n_way( pc_info );
-	dim_t pb_way  = bli_thread_n_way( pb_info );
-	dim_t ic_way  = bli_thread_n_way( ic_info );
-
-	dim_t pa_way  = bli_thread_n_way( pa_info );
-	dim_t jr_way  = bli_thread_n_way( jr_info );
-	dim_t ir_way  = bli_thread_n_way( ir_info );
-	dim_t pa_way0 = ( pa_info0 ? bli_thread_n_way( pa_info0 ) : -1 );
-	dim_t jr_way0 = ( jr_info0 ? bli_thread_n_way( jr_info0 ) : -1 );
-	dim_t ir_way0 = ( ir_info0 ? bli_thread_n_way( ir_info0 ) : -1 );
-
-	dim_t jc_nt  = bli_thread_num_threads( jc_info );
-	dim_t pc_nt  = bli_thread_num_threads( pc_info );
-	dim_t pb_nt  = bli_thread_num_threads( pb_info );
-	dim_t ic_nt  = bli_thread_num_threads( ic_info );
-
-	dim_t pa_nt  = bli_thread_num_threads( pa_info );
-	dim_t jr_nt  = bli_thread_num_threads( jr_info );
-	dim_t ir_nt  = bli_thread_num_threads( ir_info );
-	dim_t pa_nt0 = ( pa_info0 ? bli_thread_num_threads( pa_info0 ) : -1 );
-	dim_t jr_nt0 = ( jr_info0 ? bli_thread_num_threads( jr_info0 ) : -1 );
-	dim_t ir_nt0 = ( ir_info0 ? bli_thread_num_threads( ir_info0 ) : -1 );
+
+	// For the purposes of printing the "header" information that is common
+	// to the various instances of a thrinfo_t (ie: across all threads), we
+	// choose the last thread in case the problem is so small that there is
+	// only an "edge" case, which will always be assigned to the last thread
+	// (at least for higher levels of partitioning).
+	thrinfo_t* jc_info  = threads[n_threads-1];
+	thrinfo_t* pc_info  = NULL;
+	thrinfo_t* pb_info  = NULL;
+	thrinfo_t* ic_info  = NULL;
+	thrinfo_t* pa_info  = NULL; thrinfo_t* pa_info0 = NULL;
+	thrinfo_t* jr_info  = NULL; thrinfo_t* jr_info0 = NULL;
+	thrinfo_t* ir_info  = NULL; thrinfo_t* ir_info0 = NULL;
+
+	// Initialize the n_ways and n_threads fields of each thrinfo_t "level"
+	// to -1. More than likely, these will all be overwritten with meaningful
+	// values, but in case some thrinfo_t trees are not fully built (see
+	// next comment), these will be the placeholder values.
+	dim_t jc_way = -1, pc_way  = -1, pb_way  = -1, ic_way  = -1,
+	                   pa_way  = -1, jr_way  = -1, ir_way  = -1,
+	                   pa_way0 = -1, jr_way0 = -1, ir_way0 = -1;
+
+	dim_t jc_nt = -1,  pc_nt   = -1, pb_nt   = -1, ic_nt   = -1,
+	                   pa_nt   = -1, jr_nt   = -1, ir_nt   = -1,
+	                   pa_nt0  = -1, jr_nt0  = -1, ir_nt0  = -1;
+
+	// NOTE: We must check each thrinfo_t pointer for NULLness. Certain threads
+	// may not fully build their thrinfo_t structures--specifically when the
+	// dimension being parallelized is not large enough for each thread to have
+	// even one unit of work (where a unit is usually a single micropanel's
+	// width, MR or NR).
+
+	if ( !jc_info ) goto print_header;
+
+	jc_way   = bli_thread_n_way( jc_info );
+	jc_nt    = bli_thread_num_threads( jc_info );
+	pc_info  = bli_thrinfo_sub_node( jc_info );
+
+	if ( !pc_info ) goto print_header;
+
+	pc_way   = bli_thread_n_way( pc_info );
+	pc_nt    = bli_thread_num_threads( pc_info );
+	pb_info  = bli_thrinfo_sub_node( pc_info );
+
+	if ( !pb_info ) goto print_header;
+
+	pb_way   = bli_thread_n_way( pb_info );
+	pb_nt    = bli_thread_num_threads( pb_info );
+	ic_info  = bli_thrinfo_sub_node( pb_info );
+
+	if ( !ic_info ) goto print_header;
+
+	ic_way   = bli_thread_n_way( ic_info );
+	ic_nt    = bli_thread_num_threads( ic_info );
+	pa_info  = bli_thrinfo_sub_node( ic_info );
+	pa_info0 = bli_thrinfo_sub_prenode( ic_info );
+
+	// check_header_prenode:
+
+	if ( !pa_info0 ) goto check_header_node;
+
+	pa_way0  = bli_thread_n_way( pa_info0 );
+	pa_nt0   = bli_thread_num_threads( pa_info0 );
+	jr_info0 = bli_thrinfo_sub_node( pa_info0 );
+
+	if ( !jr_info0 ) goto check_header_node;
+
+	jr_way0  = bli_thread_n_way( jr_info0 );
+	jr_nt0   = bli_thread_num_threads( jr_info0 );
+	ir_info0 = bli_thrinfo_sub_node( jr_info0 );
+
+	if ( !ir_info0 ) goto check_header_node;
+
+	ir_way0  = bli_thread_n_way( ir_info0 );
+	ir_nt0   = bli_thread_num_threads( ir_info0 );
+
+	check_header_node:
+
+	if ( !pa_info ) goto print_header;
+
+	pa_way  = bli_thread_n_way( pa_info );
+	pa_nt   = bli_thread_num_threads( pa_info );
+	jr_info = bli_thrinfo_sub_node( pa_info );
+
+	if ( !jr_info ) goto print_header;
+
+	jr_way  = bli_thread_n_way( jr_info );
+	jr_nt   = bli_thread_num_threads( jr_info );
+	ir_info = bli_thrinfo_sub_node( jr_info );
+
+	if ( !ir_info ) goto print_header;
+
+	ir_way  = bli_thread_n_way( ir_info );
+	ir_nt   = bli_thread_num_threads( ir_info );
+
+	print_header:
 
 	printf( "            jc   kc   pb   ic     pa     jr     ir\n" );
 	printf( "xx_nt:    %4ld %4ld %4ld %4ld  %2ld|%2ld  %2ld|%2ld  %2ld|%2ld\n",
@@ -343,26 +396,105 @@ void bli_l3_thrinfo_print_trsm_paths
 	( long )ir_way0, ( long )ir_way );
 	printf( "==================================================\n" );
 
-	dim_t jc_comm_id;
-	dim_t pc_comm_id;
-	dim_t pb_comm_id;
-	dim_t ic_comm_id;
-	dim_t pa_comm_id0, pa_comm_id;
-	dim_t jr_comm_id0, jr_comm_id;
-	dim_t ir_comm_id0, ir_comm_id;
-
-	dim_t jc_work_id;
-	dim_t pc_work_id;
-	dim_t pb_work_id;
-	dim_t ic_work_id;
-	dim_t pa_work_id0, pa_work_id;
-	dim_t jr_work_id0, jr_work_id;
-	dim_t ir_work_id0, ir_work_id;
-
-	for ( gl_id = 0; gl_id < n_threads; ++gl_id )
+
+	for ( dim_t gl_id = 0; gl_id < n_threads; ++gl_id )
 	{
 		jc_info = threads[gl_id];
 
+#if 1
+		// NOTE: This cpp branch contains code that is safe to execute
+		// for small problems that are parallelized enough that one or
+		// more threads gets no work.
+
+		dim_t jc_comm_id = -1, pc_comm_id  = -1, pb_comm_id  = -1, ic_comm_id  = -1,
+		                       pa_comm_id  = -1, jr_comm_id  = -1, ir_comm_id  = -1,
+		                       pa_comm_id0 = -1, jr_comm_id0 = -1, ir_comm_id0 = -1;
+
+		dim_t jc_work_id = -1, pc_work_id  = -1, pb_work_id  = -1, ic_work_id  = -1,
+		                       pa_work_id  = -1, jr_work_id  = -1, ir_work_id  = -1,
+		                       pa_work_id0 = -1, jr_work_id0 = -1, ir_work_id0 = -1;
+
+		if ( !jc_info ) goto print_thrinfo;
+
+		jc_comm_id = bli_thread_ocomm_id( jc_info );
+		jc_work_id = bli_thread_work_id( jc_info );
+		pc_info    = bli_thrinfo_sub_node( jc_info );
+
+		if ( !pc_info ) goto print_thrinfo;
+
+		pc_comm_id = bli_thread_ocomm_id( pc_info );
+		pc_work_id = bli_thread_work_id( pc_info );
+		pb_info    = bli_thrinfo_sub_node( pc_info );
+
+		if ( !pb_info ) goto print_thrinfo;
+
+		pb_comm_id = bli_thread_ocomm_id( pb_info );
+		pb_work_id = bli_thread_work_id( pb_info );
+		ic_info    = bli_thrinfo_sub_node( pb_info );
+
+		if ( !ic_info ) goto print_thrinfo;
+
+		ic_comm_id = bli_thread_ocomm_id( ic_info );
+		ic_work_id = bli_thread_work_id( ic_info );
+		pa_info    = bli_thrinfo_sub_node( ic_info );
+		pa_info0   = bli_thrinfo_sub_prenode( ic_info );
+
+		// check_thrinfo_prenode:
+
+		if ( !pa_info0 ) goto check_thrinfo_node;
+
+		pa_comm_id0 = bli_thread_ocomm_id( pa_info0 );
+		pa_work_id0 = bli_thread_work_id( pa_info0 );
+		jr_info0    = bli_thrinfo_sub_node( pa_info0 );
+
+		if ( !jr_info0 ) goto check_thrinfo_node;
+
+		jr_comm_id0 = bli_thread_ocomm_id( jr_info0 );
+		jr_work_id0 = bli_thread_work_id( jr_info0 );
+		ir_info0    = bli_thrinfo_sub_node( jr_info0 );
+
+		if ( !ir_info0 ) goto check_thrinfo_node;
+
+		ir_comm_id0 = bli_thread_ocomm_id( ir_info0 );
+		ir_work_id0 = bli_thread_work_id( ir_info0 );
+
+		check_thrinfo_node:
+
+		if ( !pa_info ) goto print_thrinfo;
+
+		pa_comm_id = bli_thread_ocomm_id( pa_info );
+		pa_work_id = bli_thread_work_id( pa_info );
+		jr_info    = bli_thrinfo_sub_node( pa_info );
+
+		if ( !jr_info ) goto print_thrinfo;
+
+		jr_comm_id = bli_thread_ocomm_id( jr_info );
+		jr_work_id = bli_thread_work_id( jr_info );
+		ir_info    = bli_thrinfo_sub_node( jr_info );
+
+		if ( !ir_info ) goto print_thrinfo;
+
+		ir_comm_id = bli_thread_ocomm_id( ir_info );
+		ir_work_id = bli_thread_work_id( ir_info );
+
+		print_thrinfo:
+#else
+		dim_t jc_comm_id;
+		dim_t pc_comm_id;
+		dim_t pb_comm_id;
+		dim_t ic_comm_id;
+		dim_t pa_comm_id0, pa_comm_id;
+		dim_t jr_comm_id0, jr_comm_id;
+		dim_t ir_comm_id0, ir_comm_id;
+
+		dim_t jc_work_id;
+		dim_t pc_work_id;
+		dim_t pb_work_id;
+		dim_t ic_work_id;
+		dim_t pa_work_id0, pa_work_id;
+		dim_t jr_work_id0, jr_work_id;
+		dim_t ir_work_id0, ir_work_id;
+
 		// NOTE: We must check each thrinfo_t pointer for NULLness. Certain threads
 		// may not fully build their thrinfo_t structures--specifically when the
 		// dimension being parallelized is not large enough for each thread to have
@@ -488,6 +620,7 @@ void bli_l3_thrinfo_print_trsm_paths
 				}
 			}
 		}
+#endif
 
 		printf( "comm ids: %4ld %4ld %4ld %4ld  %2ld|%2ld  %2ld|%2ld  %2ld|%2ld\n",
 		( long )jc_comm_id,
diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h
index 818534b1e95c6f70cd5f8f666cd6e925df71ac62..fae7b5f6ed951d757a22b57bd2bd76588bc980e5 100644
--- a/frame/base/bli_cntx.h
+++ b/frame/base/bli_cntx.h
@@ -648,6 +648,22 @@ static void bli_cntx_set_blksz( bszid_t bs_id, blksz_t* blksz, bszid_t mult_id,
 	bmults[ bs_id ] = mult_id;
 }
 
+static void bli_cntx_set_blksz_def_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx )
+{
+	// Locate the blksz_t entry for bs_id within the context's blocksize
+	// array, then pass bs to bli_blksz_set_def() for datatype dt.
+	blksz_t* entry = bli_cntx_blkszs_buf( cntx ) + bs_id;
+	bli_blksz_set_def( bs, dt, entry );
+}
+
+static void bli_cntx_set_blksz_max_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx )
+{
+	// Locate the blksz_t entry for bs_id within the context's blocksize
+	// array, then pass bs to bli_blksz_set_max() for datatype dt.
+	blksz_t* entry = bli_cntx_blkszs_buf( cntx ) + bs_id;
+	bli_blksz_set_max( bs, dt, entry );
+}
+
 static void bli_cntx_set_l3_vir_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx )
 {
 	func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx );
diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c
index fce31bcfa24dd5a77a1a245b1f439494dc9de7ec..c54592377c9bbba0a88421040a17aa53baa283b6 100644
--- a/frame/base/bli_gks.c
+++ b/frame/base/bli_gks.c
@@ -141,7 +141,7 @@ void bli_gks_init( void )
 		                                              bli_cntx_init_cortexa57_ind );
 #endif
 #ifdef BLIS_CONFIG_CORTEXA53
-		bli_gks_register_cntx( BLIS_ARCH_CORTEXA57,   bli_cntx_init_cortexa53,
+		bli_gks_register_cntx( BLIS_ARCH_CORTEXA53,   bli_cntx_init_cortexa53,
 		                                              bli_cntx_init_cortexa53_ref,
 		                                              bli_cntx_init_cortexa53_ind );
 #endif
diff --git a/ref_kernels/ind/bli_gemmtrsm4m1_ref.c b/ref_kernels/ind/bli_gemmtrsm4m1_ref.c
index 1b2205c8d711d95f9720221b337435ce7c785018..0988c457da04a4f63dcad8a1b5978373fdc536ec 100644
--- a/ref_kernels/ind/bli_gemmtrsm4m1_ref.c
+++ b/ref_kernels/ind/bli_gemmtrsm4m1_ref.c
@@ -84,6 +84,14 @@ void PASTEMAC3(ch,opname,arch,suf) \
 \
 	ctype_r* restrict one_r       = PASTEMAC(chr,1); \
 	ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \
+\
+	/* A hack to avoid a 'restrict' warning triggered by passing in the
+	   same address (one_r) for both alpha and beta when calling the last
+	   of the four matrix products. We now use one_r for alpha and this
+	   new local variable, onel, for beta. (See issue #328.) */ \
+	ctype_r           onel; \
+	ctype_r* restrict onel_r      = &onel; \
+	PASTEMAC(chr,set1s)( onel ); \
 \
 	ctype_r           alpha_r     = PASTEMAC(ch,real)( *alpha ); \
 	ctype_r           alpha_i     = PASTEMAC(ch,imag)( *alpha ); \
@@ -187,7 +195,7 @@ PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: bx111p_i", k+m, n, \
 	  one_r, \
 	  a1x_i, \
 	  bx1_i, \
-	  one_r, \
+	  onel_r, \
 	  b11_r, rs_b, cs_b, \
 	  data, \
 	  cntx  \
diff --git a/test/1m4m/Makefile b/test/1m4m/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..74c0804cac56363e5d3669b0728295ff6a960c79
--- /dev/null
+++ b/test/1m4m/Makefile
@@ -0,0 +1,515 @@
+#!/bin/bash
+#
+#  BLIS    
+#  An object-based framework for developing high-performance BLAS-like
+#  libraries.
+#
+#  Copyright (C) 2014, The University of Texas at Austin
+#  Copyright (C) 2018, Advanced Micro Devices, Inc.
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are
+#  met:
+#   - Redistributions of source code must retain the above copyright
+#     notice, this list of conditions and the following disclaimer.
+#   - Redistributions in binary form must reproduce the above copyright
+#     notice, this list of conditions and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution.
+#   - Neither the name(s) of the copyright holder(s) nor the names of its
+#     contributors may be used to endorse or promote products derived
+#     from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#
+
+#
+# Makefile
+#
+# Field G. Van Zee
+#
+# Makefile for standalone BLIS test drivers.
+#
+
+#
+# --- Makefile PHONY target definitions ----------------------------------------
+#
+
+.PHONY: all \
+        clean cleanx
+
+
+
+#
+# --- Determine makefile fragment location -------------------------------------
+#
+
+# Comments:
+# - DIST_PATH is assumed to not exist if BLIS_INSTALL_PATH is given.
+# - We must use recursively expanded assignment for LIB_PATH and INC_PATH in
+#   the second case because CONFIG_NAME is not yet set.
+ifneq ($(strip $(BLIS_INSTALL_PATH)),)
+LIB_PATH   := $(BLIS_INSTALL_PATH)/lib
+INC_PATH   := $(BLIS_INSTALL_PATH)/include/blis
+SHARE_PATH := $(BLIS_INSTALL_PATH)/share/blis
+else
+DIST_PATH  := ../..
+LIB_PATH    = ../../lib/$(CONFIG_NAME)
+INC_PATH    = ../../include/$(CONFIG_NAME)
+SHARE_PATH := ../..
+endif
+
+
+
+#
+# --- Include common makefile definitions --------------------------------------
+#
+
+# Include the common makefile fragment.
+-include $(SHARE_PATH)/common.mk
+
+
+
+#
+# --- BLAS implementations -----------------------------------------------------
+#
+
+# BLAS library path(s). This is where the BLAS libraries reside.
+HOME_LIB_PATH  := $(HOME)/flame/lib
+
+# OpenBLAS
+OPENBLAS_LIB   := $(HOME_LIB_PATH)/libopenblas.a
+OPENBLASP_LIB  := $(HOME_LIB_PATH)/libopenblasp.a
+
+# ATLAS
+#ATLAS_LIB      := $(HOME_LIB_PATH)/libf77blas.a \
+#                  $(HOME_LIB_PATH)/libatlas.a
+
+# Eigen
+EIGEN_INC      := $(HOME)/flame/eigen/include/eigen3
+EIGEN_LIB      := $(HOME_LIB_PATH)/libeigen_blas_static.a
+EIGENP_LIB     := $(EIGEN_LIB)
+
+# MKL
+MKL_LIB_PATH   := $(HOME)/intel/mkl/lib/intel64
+MKL_LIB        := -L$(MKL_LIB_PATH) \
+                  -lmkl_intel_lp64 \
+                  -lmkl_core \
+                  -lmkl_sequential \
+                  -lpthread -lm -ldl
+#MKLP_LIB       := -L$(MKL_LIB_PATH) \
+#                  -lmkl_intel_thread \
+#                  -lmkl_core \
+#                  -lmkl_intel_ilp64 \
+#                  -L$(ICC_LIB_PATH) \
+#                  -liomp5
+MKLP_LIB       := -L$(MKL_LIB_PATH) \
+                  -lmkl_intel_lp64 \
+                  -lmkl_core \
+                  -lmkl_gnu_thread \
+                  -lpthread -lm -ldl -fopenmp
+                  #-L$(ICC_LIB_PATH) \
+                  #-lgomp
+
+VENDOR_LIB     := $(MKL_LIB)
+VENDORP_LIB    := $(MKLP_LIB)
+
+
+#
+# --- Problem size definitions -------------------------------------------------
+#
+
+# Single core (single-threaded)
+PS_BEGIN := 48
+PS_MAX   := 2400
+PS_INC   := 48
+
+# Single-socket (multithreaded)
+P1_BEGIN := 96
+P1_MAX   := 4800
+P1_INC   := 96
+
+# Dual-socket (multithreaded)
+P2_BEGIN := 144
+P2_MAX   := 7200
+P2_INC   := 144
+
+
+#
+# --- General build definitions ------------------------------------------------
+#
+
+TEST_SRC_PATH  := .
+TEST_OBJ_PATH  := .
+
+# Gather all local object files.
+TEST_OBJS      := $(sort $(patsubst $(TEST_SRC_PATH)/%.c, \
+                                    $(TEST_OBJ_PATH)/%.o, \
+                                    $(wildcard $(TEST_SRC_PATH)/*.c)))
+
+# Override the value of CINCFLAGS so that the value of CFLAGS returned by
+# get-user-cflags-for() is not cluttered up with include paths needed only
+# while building BLIS.
+CINCFLAGS      := -I$(INC_PATH)
+
+# Use the "framework" CFLAGS for the configuration family.
+CFLAGS         := $(call get-user-cflags-for,$(CONFIG_NAME))
+
+# Add local header paths to CFLAGS.
+CFLAGS         += -I$(TEST_SRC_PATH)
+
+# Locate the libblis library to which we will link.
+#LIBBLIS_LINK   := $(LIB_PATH)/$(LIBBLIS_L)
+
+# Define a set of CFLAGS for use with C++ and Eigen.
+CXXFLAGS       := $(subst -std=c99,-std=c++11,$(CFLAGS))
+CXXFLAGS       += -I$(EIGEN_INC)
+
+# Create a copy of CXXFLAGS without -fopenmp in order to disable multithreading.
+CXXFLAGS_ST    := -march=native $(subst -fopenmp,,$(CXXFLAGS))
+CXXFLAGS_MT    := -march=native $(CXXFLAGS)
+
+
+# Which library?
+BLI_DEF  := -DBLIS
+BLA_DEF  := -DBLAS
+EIG_DEF  := -DEIGEN
+
+# Complex implementation type
+D3MHW    := -DIND=BLIS_3MH
+D3M1     := -DIND=BLIS_3M1
+D4MHW    := -DIND=BLIS_4MH
+D4M1B    := -DIND=BLIS_4M1B
+D4M1A    := -DIND=BLIS_4M1A
+D1M      := -DIND=BLIS_1M
+DNAT     := -DIND=BLIS_NAT
+
+# Implementation string
+#STR_3MHW := -DSTR=\"3mhw\"
+#STR_3M1  := -DSTR=\"3m1\"
+#STR_4MHW := -DSTR=\"4mhw\"
+#STR_4M1B := -DSTR=\"4m1b\"
+STR_4M1A := -DSTR=\"4m1a_blis\"
+STR_1M   := -DSTR=\"1m_blis\"
+STR_NAT  := -DSTR=\"asm_blis\"
+STR_OBL  := -DSTR=\"openblas\"
+STR_EIG  := -DSTR=\"eigen\"
+STR_VEN  := -DSTR=\"vendor\"
+
+# Single or multithreaded string
+STR_ST   := -DTHR_STR=\"st\"
+STR_1S   := -DTHR_STR=\"1s\"
+STR_2S   := -DTHR_STR=\"2s\"
+
+# Problem size specification
+PDEF_ST  := -DP_BEGIN=$(PS_BEGIN)  -DP_INC=$(PS_INC)  -DP_MAX=$(PS_MAX)
+PDEF_1S  := -DP_BEGIN=$(P1_BEGIN) -DP_INC=$(P1_INC) -DP_MAX=$(P1_MAX)
+PDEF_2S  := -DP_BEGIN=$(P2_BEGIN) -DP_INC=$(P2_INC) -DP_MAX=$(P2_MAX)
+
+
+
+#
+# --- Targets/rules ------------------------------------------------------------
+#
+
+all:        all-st all-1s all-2s
+blis:       blis-st blis-1s blis-2s
+openblas:   openblas-st openblas-1s openblas-2s
+eigen:      eigen-st eigen-1s eigen-2s
+vendor:     vendor-st vendor-1s vendor-2s
+mkl:        vendor
+armpl:      vendor
+
+all-st:     blis-st openblas-st mkl-st
+all-1s:     blis-1s openblas-1s mkl-1s
+all-2s:     blis-2s openblas-2s mkl-2s
+
+blis-st:    blis-nat-st blis-1m-st blis-4m1a-st
+blis-1s:    blis-nat-1s blis-1m-1s blis-4m1a-1s
+blis-2s:    blis-nat-2s blis-1m-2s blis-4m1a-2s
+
+#blis-ind:   blis-ind-st blis-ind-mt
+blis-nat:   blis-nat-st  blis-nat-1s  blis-nat-2s
+blis-1m:    blis-1m-st   blis-1m-1s   blis-1m-2s
+blis-4m1a:  blis-4m1a-st blis-4m1a-1s blis-4m1a-2s
+
+# Define the datatypes, operations, and implementations.
+DTS    := s d c z
+OPS    := gemm
+BIMPLS := asm_blis 4m1a_blis 1m_blis openblas vendor
+EIMPLS := eigen
+
+# Define functions to construct object filenames from the datatypes and
+# operations given an implementation. We define one function for single-
+# threaded, single-socket, and dual-socket filenames.
+get-st-objs = $(foreach dt,$(DTS),$(foreach op,$(OPS),test_$(dt)$(op)_$(PS_MAX)_$(1)_st.o))
+get-1s-objs = $(foreach dt,$(DTS),$(foreach op,$(OPS),test_$(dt)$(op)_$(P1_MAX)_$(1)_1s.o))
+get-2s-objs = $(foreach dt,$(DTS),$(foreach op,$(OPS),test_$(dt)$(op)_$(P2_MAX)_$(1)_2s.o))
+
+# Construct object and binary names for single-threaded, single-socket, and
+# dual-socket files for BLIS, OpenBLAS, and a vendor library (e.g. MKL).
+BLIS_1M_ST_OBJS := $(call get-st-objs,1m_blis)
+BLIS_1M_ST_BINS := $(patsubst %.o,%.x,$(BLIS_1M_ST_OBJS))
+BLIS_1M_1S_OBJS := $(call get-1s-objs,1m_blis)
+BLIS_1M_1S_BINS := $(patsubst %.o,%.x,$(BLIS_1M_1S_OBJS))
+BLIS_1M_2S_OBJS := $(call get-2s-objs,1m_blis)
+BLIS_1M_2S_BINS := $(patsubst %.o,%.x,$(BLIS_1M_2S_OBJS))
+
+BLIS_4M1A_ST_OBJS := $(call get-st-objs,4m1a_blis)
+BLIS_4M1A_ST_BINS := $(patsubst %.o,%.x,$(BLIS_4M1A_ST_OBJS))
+BLIS_4M1A_1S_OBJS := $(call get-1s-objs,4m1a_blis)
+BLIS_4M1A_1S_BINS := $(patsubst %.o,%.x,$(BLIS_4M1A_1S_OBJS))
+BLIS_4M1A_2S_OBJS := $(call get-2s-objs,4m1a_blis)
+BLIS_4M1A_2S_BINS := $(patsubst %.o,%.x,$(BLIS_4M1A_2S_OBJS))
+
+BLIS_NAT_ST_OBJS := $(call get-st-objs,asm_blis)
+BLIS_NAT_ST_BINS := $(patsubst %.o,%.x,$(BLIS_NAT_ST_OBJS))
+BLIS_NAT_1S_OBJS := $(call get-1s-objs,asm_blis)
+BLIS_NAT_1S_BINS := $(patsubst %.o,%.x,$(BLIS_NAT_1S_OBJS))
+BLIS_NAT_2S_OBJS := $(call get-2s-objs,asm_blis)
+BLIS_NAT_2S_BINS := $(patsubst %.o,%.x,$(BLIS_NAT_2S_OBJS))
+
+OPENBLAS_ST_OBJS := $(call get-st-objs,openblas)
+OPENBLAS_ST_BINS := $(patsubst %.o,%.x,$(OPENBLAS_ST_OBJS))
+OPENBLAS_1S_OBJS := $(call get-1s-objs,openblas)
+OPENBLAS_1S_BINS := $(patsubst %.o,%.x,$(OPENBLAS_1S_OBJS))
+OPENBLAS_2S_OBJS := $(call get-2s-objs,openblas)
+OPENBLAS_2S_BINS := $(patsubst %.o,%.x,$(OPENBLAS_2S_OBJS))
+
+EIGEN_ST_OBJS    := $(call get-st-objs,eigen)
+EIGEN_ST_BINS    := $(patsubst %.o,%.x,$(EIGEN_ST_OBJS))
+EIGEN_1S_OBJS    := $(call get-1s-objs,eigen)
+EIGEN_1S_BINS    := $(patsubst %.o,%.x,$(EIGEN_1S_OBJS))
+EIGEN_2S_OBJS    := $(call get-2s-objs,eigen)
+EIGEN_2S_BINS    := $(patsubst %.o,%.x,$(EIGEN_2S_OBJS))
+
+VENDOR_ST_OBJS   := $(call get-st-objs,vendor)
+VENDOR_ST_BINS   := $(patsubst %.o,%.x,$(VENDOR_ST_OBJS))
+VENDOR_1S_OBJS   := $(call get-1s-objs,vendor)
+VENDOR_1S_BINS   := $(patsubst %.o,%.x,$(VENDOR_1S_OBJS))
+VENDOR_2S_OBJS   := $(call get-2s-objs,vendor)
+VENDOR_2S_BINS   := $(patsubst %.o,%.x,$(VENDOR_2S_OBJS))
+
+# Define some targets associated with the above object/binary files.
+blis-nat-st: $(BLIS_NAT_ST_BINS)
+blis-nat-1s: $(BLIS_NAT_1S_BINS)
+blis-nat-2s: $(BLIS_NAT_2S_BINS)
+
+blis-1m-st: $(BLIS_1M_ST_BINS)
+blis-1m-1s: $(BLIS_1M_1S_BINS)
+blis-1m-2s: $(BLIS_1M_2S_BINS)
+
+blis-4m1a-st: $(BLIS_4M1A_ST_BINS)
+blis-4m1a-1s: $(BLIS_4M1A_1S_BINS)
+blis-4m1a-2s: $(BLIS_4M1A_2S_BINS)
+
+openblas-st: $(OPENBLAS_ST_BINS)
+openblas-1s: $(OPENBLAS_1S_BINS)
+openblas-2s: $(OPENBLAS_2S_BINS)
+
+eigen-st: $(EIGEN_ST_BINS)
+eigen-1s: $(EIGEN_1S_BINS)
+eigen-2s: $(EIGEN_2S_BINS)
+
+vendor-st: $(VENDOR_ST_BINS)
+vendor-1s: $(VENDOR_1S_BINS)
+vendor-2s: $(VENDOR_2S_BINS)
+
+mkl-st: vendor-st
+mkl-1s: vendor-1s
+mkl-2s: vendor-2s
+
+armpl-st: vendor-st
+armpl-1s: vendor-1s
+armpl-2s: vendor-2s
+
+# Mark the object files as intermediate so that make will remove them
+# automatically after building the binaries on which they depend.
+.INTERMEDIATE: $(BLIS_NAT_ST_OBJS) $(BLIS_NAT_1S_OBJS) $(BLIS_NAT_2S_OBJS)
+.INTERMEDIATE: $(BLIS_1M_ST_OBJS)  $(BLIS_1M_1S_OBJS)  $(BLIS_1M_2S_OBJS)
+.INTERMEDIATE: $(BLIS_4M1A_ST_OBJS) $(BLIS_4M1A_1S_OBJS) $(BLIS_4M1A_2S_OBJS)
+.INTERMEDIATE: $(OPENBLAS_ST_OBJS) $(OPENBLAS_1S_OBJS) $(OPENBLAS_2S_OBJS)
+.INTERMEDIATE: $(EIGEN_ST_OBJS)    $(EIGEN_1S_OBJS)    $(EIGEN_2S_OBJS)
+.INTERMEDIATE: $(VENDOR_ST_OBJS)   $(VENDOR_1S_OBJS)   $(VENDOR_2S_OBJS)
+
+
+# --Object file rules --
+
+#$(TEST_OBJ_PATH)/%.o: $(TEST_SRC_PATH)/%.c
+#	$(CC) $(CFLAGS) -c $< -o $@
+
+# A function to return the datatype cpp macro def from the datatype
+# character.
+get-dt-cpp = $(strip \
+             $(if $(findstring s,$(1)),-DDT=BLIS_FLOAT    -DIS_FLOAT,\
+             $(if $(findstring d,$(1)),-DDT=BLIS_DOUBLE   -DIS_DOUBLE,\
+             $(if $(findstring c,$(1)),-DDT=BLIS_SCOMPLEX -DIS_SCOMPLEX,\
+                                       -DDT=BLIS_DCOMPLEX -DIS_DCOMPLEX))))
+
+get-in-cpp = $(strip \
+             $(if $(findstring   1m_blis,$(1)),-DIND=BLIS_1M,\
+             $(if $(findstring 4m1a_blis,$(1)),-DIND=BLIS_4M1A,\
+                                               -DIND=BLIS_NAT)))
+
+# A function to return other cpp macros that help the test driver
+# identify the implementation.
+#get-bl-cpp = $(strip \
+#             $(if $(findstring     blis,$(1)),$(STR_NAT) $(BLI_DEF),\
+#             $(if $(findstring openblas,$(1)),$(STR_OBL) $(BLA_DEF),\
+#             $(if $(findstring    eigen,$(1)),$(STR_EIG) $(EIG_DEF),\
+#                                              $(STR_VEN) $(BLA_DEF)))))
+
+get-bl-cpp = $(strip \
+             $(if $(findstring   1m_blis,$(1)),$(STR_1M) $(BLI_DEF),\
+             $(if $(findstring 4m1a_blis,$(1)),$(STR_4M1A) $(BLI_DEF),\
+             $(if $(findstring  asm_blis,$(1)),$(STR_NAT) $(BLI_DEF),\
+             $(if $(findstring  openblas,$(1)),$(STR_OBL) $(BLA_DEF),\
+             $(if $(and $(findstring eigen,$(1)),\
+                        $(findstring  gemm,$(2))),\
+                                              $(STR_EIG) $(EIG_DEF),\
+             $(if       $(findstring eigen,$(1)),\
+                                              $(STR_EIG) $(BLA_DEF),\
+                                              $(STR_VEN) $(BLA_DEF))))))))
+
+
+# Rules for BLIS and BLAS libraries. Template args: 1 = datatype char, 2 = operation, 3 = implementation.
+define make-st-rule
+test_$(1)$(2)_$(PS_MAX)_$(3)_st.o: test_$(2).c Makefile
+	$(CC) $(CFLAGS) $(PDEF_ST) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(call get-in-cpp,$(3)) $(STR_ST) -c $$< -o $$@
+endef
+
+define make-1s-rule
+test_$(1)$(2)_$(P1_MAX)_$(3)_1s.o: test_$(2).c Makefile
+	$(CC) $(CFLAGS) $(PDEF_1S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(call get-in-cpp,$(3)) $(STR_1S) -c $$< -o $$@
+endef
+
+define make-2s-rule
+test_$(1)$(2)_$(P2_MAX)_$(3)_2s.o: test_$(2).c Makefile
+	$(CC) $(CFLAGS) $(PDEF_2S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(call get-in-cpp,$(3)) $(STR_2S) -c $$< -o $$@
+endef
+
+$(foreach dt,$(DTS), \
+$(foreach op,$(OPS), \
+$(foreach im,$(BIMPLS),$(eval $(call make-st-rule,$(dt),$(op),$(im))))))
+
+$(foreach dt,$(DTS), \
+$(foreach op,$(OPS), \
+$(foreach im,$(BIMPLS),$(eval $(call make-1s-rule,$(dt),$(op),$(im))))))
+
+$(foreach dt,$(DTS), \
+$(foreach op,$(OPS), \
+$(foreach im,$(BIMPLS),$(eval $(call make-2s-rule,$(dt),$(op),$(im))))))
+
+# Rules for Eigen (built with CXX). Template args: 1 = datatype char, 2 = operation, 3 = implementation.
+define make-eigst-rule
+test_$(1)$(2)_$(PS_MAX)_$(3)_st.o: test_$(2).c Makefile
+	$(CXX) $(CXXFLAGS_ST) $(PDEF_ST) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_ST) -c $$< -o $$@
+endef
+
+define make-eig1s-rule
+test_$(1)$(2)_$(P1_MAX)_$(3)_1s.o: test_$(2).c Makefile
+	$(CXX) $(CXXFLAGS_MT) $(PDEF_1S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_1S) -c $$< -o $$@
+endef
+
+define make-eig2s-rule
+test_$(1)$(2)_$(P2_MAX)_$(3)_2s.o: test_$(2).c Makefile
+	$(CXX) $(CXXFLAGS_MT) $(PDEF_2S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_2S) -c $$< -o $$@
+endef
+
+$(foreach dt,$(DTS), \
+$(foreach op,$(OPS), \
+$(foreach im,$(EIMPLS),$(eval $(call make-eigst-rule,$(dt),$(op),$(im))))))
+
+$(foreach dt,$(DTS), \
+$(foreach op,$(OPS), \
+$(foreach im,$(EIMPLS),$(eval $(call make-eig1s-rule,$(dt),$(op),$(im))))))
+
+$(foreach dt,$(DTS), \
+$(foreach op,$(OPS), \
+$(foreach im,$(EIMPLS),$(eval $(call make-eig2s-rule,$(dt),$(op),$(im))))))
+
+
+# -- Executable file rules --
+
+# NOTE: For the BLAS test drivers, we place the BLAS libraries before BLIS
+# on the link command line in case BLIS was configured with the BLAS
+# compatibility layer. This prevents BLIS from inadvertently getting called
+# for the BLAS routines we are trying to test with.
+
+test_%_$(PS_MAX)_1m_blis_st.x: test_%_$(PS_MAX)_1m_blis_st.o $(LIBBLIS_LINK)
+	$(CC) $(strip $<                    $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
+
+test_%_$(P1_MAX)_1m_blis_1s.x: test_%_$(P1_MAX)_1m_blis_1s.o $(LIBBLIS_LINK)
+	$(CC) $(strip $<                    $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
+
+test_%_$(P2_MAX)_1m_blis_2s.x: test_%_$(P2_MAX)_1m_blis_2s.o $(LIBBLIS_LINK)
+	$(CC) $(strip $<                    $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
+
+
+test_%_$(PS_MAX)_4m1a_blis_st.x: test_%_$(PS_MAX)_4m1a_blis_st.o $(LIBBLIS_LINK)
+	$(CC) $(strip $<                    $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
+
+test_%_$(P1_MAX)_4m1a_blis_1s.x: test_%_$(P1_MAX)_4m1a_blis_1s.o $(LIBBLIS_LINK)
+	$(CC) $(strip $<                    $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
+
+test_%_$(P2_MAX)_4m1a_blis_2s.x: test_%_$(P2_MAX)_4m1a_blis_2s.o $(LIBBLIS_LINK)
+	$(CC) $(strip $<                    $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
+
+
+test_%_$(PS_MAX)_asm_blis_st.x: test_%_$(PS_MAX)_asm_blis_st.o $(LIBBLIS_LINK)
+	$(CC) $(strip $<                    $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
+
+test_%_$(P1_MAX)_asm_blis_1s.x: test_%_$(P1_MAX)_asm_blis_1s.o $(LIBBLIS_LINK)
+	$(CC) $(strip $<                    $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
+
+test_%_$(P2_MAX)_asm_blis_2s.x: test_%_$(P2_MAX)_asm_blis_2s.o $(LIBBLIS_LINK)
+	$(CC) $(strip $<                    $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
+
+
+test_%_$(PS_MAX)_openblas_st.x: test_%_$(PS_MAX)_openblas_st.o $(LIBBLIS_LINK)
+	$(CC) $(strip $<   $(OPENBLAS_LIB)  $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
+
+test_%_$(P1_MAX)_openblas_1s.x: test_%_$(P1_MAX)_openblas_1s.o $(LIBBLIS_LINK)
+	$(CC) $(strip $<   $(OPENBLASP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
+
+test_%_$(P2_MAX)_openblas_2s.x: test_%_$(P2_MAX)_openblas_2s.o $(LIBBLIS_LINK)
+	$(CC) $(strip $<   $(OPENBLASP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
+
+
+test_%_$(PS_MAX)_eigen_st.x:    test_%_$(PS_MAX)_eigen_st.o    $(LIBBLIS_LINK)
+	$(CXX) $(strip $<  $(EIGEN_LIB)     $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
+
+test_%_$(P1_MAX)_eigen_1s.x:    test_%_$(P1_MAX)_eigen_1s.o    $(LIBBLIS_LINK)
+	$(CXX) $(strip $<  $(EIGENP_LIB)    $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
+
+test_%_$(P2_MAX)_eigen_2s.x:    test_%_$(P2_MAX)_eigen_2s.o    $(LIBBLIS_LINK)
+	$(CXX) $(strip $<  $(EIGENP_LIB)    $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
+
+
+test_%_$(PS_MAX)_vendor_st.x:   test_%_$(PS_MAX)_vendor_st.o   $(LIBBLIS_LINK)
+	$(CC) $(strip $<   $(VENDOR_LIB)    $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
+
+test_%_$(P1_MAX)_vendor_1s.x:   test_%_$(P1_MAX)_vendor_1s.o   $(LIBBLIS_LINK)
+	$(CC) $(strip $<   $(VENDORP_LIB)   $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
+
+test_%_$(P2_MAX)_vendor_2s.x:   test_%_$(P2_MAX)_vendor_2s.o   $(LIBBLIS_LINK)
+	$(CC) $(strip $<   $(VENDORP_LIB)   $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
+
+
+# -- Clean rules --
+
+clean: cleanx
+
+cleanx:
+	- $(RM_F) *.o *.x
+
diff --git a/test/1m4m/runme.sh b/test/1m4m/runme.sh
new file mode 100755
index 0000000000000000000000000000000000000000..d79d539259cffa4e8d4ebaa97fca572a2b897034
--- /dev/null
+++ b/test/1m4m/runme.sh
@@ -0,0 +1,242 @@
+#!/bin/bash
+
+# File prefixes.
+exec_root="test"
+out_root="output"
+delay=0.1
+
+#sys="blis"
+#sys="stampede2"
+sys="lonestar5"
+#sys="ul252"
+#sys="ul264"
+
+# Bind threads to processors.
+#export OMP_PROC_BIND=true
+#export GOMP_CPU_AFFINITY="0 2 4 6 8 10 12 14 16 18 20 22 1 3 5 7 9 11 13 15 17 19 21 23"
+#export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103"
+
+if [ "${sys}" = "blis" ]; then
+
+	export GOMP_CPU_AFFINITY="0 1 2 3"
+
+	threads="jc1ic1jr1_2400
+	         jc2ic3jr2_6000
+	         jc4ic3jr2_8000"
+
+elif [ "${sys}" = "stampede2" ]; then
+
+	echo "Need to set GOMP_CPU_AFFINITY."
+	exit 1  # Abort here; the threads assignment below is not reached.
+
+	threads="jc1ic1jr1_2400
+	         jc4ic6jr1_6000
+	         jc4ic12jr1_8000"
+
+elif [ "${sys}" = "lonestar5" ]; then
+
+	export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23"
+
+	# A hack to use libiomp5 with gcc.
+	#export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/apps/intel/16.0.1.150/compilers_and_libraries_2016.1.150/linux/compiler/lib/intel64"
+
+	#threads="jc1ic1jr1_2400
+	#         jc2ic3jr2_4800
+	#         jc4ic3jr2_9600"
+	threads="jc1ic1jr1_2400
+	         jc4ic3jr2_7200"
+	threads="jc4ic3jr2_7200"  # Overrides the two-entry list assigned just above.
+
+elif [ "${sys}" = "ul252" ]; then
+
+	export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/field/intel/mkl/lib/intel64"
+	export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51"
+
+	threads="jc1ic1jr1_2400
+	         jc2ic13jr1_6000
+	         jc4ic13jr1_8000"
+
+elif [ "${sys}" = "ul264" ]; then
+
+	export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/field/intel/mkl/lib/intel64"
+	export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63"
+
+	threads="jc1ic1jr1_2400
+	         jc1ic8jr4_6000
+	         jc2ic8jr4_8000"
+
+fi
+
+# Datatypes to test.
+test_dts="s d c z"
+
+# Operations to test.
+#test_ops="gemm hemm herk trmm trsm"
+test_ops="gemm"
+
+# Implementations to test.
+#impls="blis"
+#impls="other"
+#impls="eigen"
+impls="all"
+
+if [ "${impls}" = "blis" ]; then
+
+	test_impls="asm_blis"
+
+elif [ "${impls}" = "eigen" ]; then
+
+	test_impls="eigen"
+
+elif [ "${impls}" = "other" ]; then
+
+	test_impls="openblas vendor"
+
+else
+
+	# impls="all" (or any unrecognized value): run the OpenBLAS, vendor, and
+	# BLIS (asm_blis, 4m1a_blis, 1m_blis) drivers. Eigen is not part of this
+	# set; select it explicitly with impls="eigen".
+
+	test_impls="openblas vendor asm_blis 4m1a_blis 1m_blis"
+	#test_impls="openblas"
+	#test_impls="asm_blis 4m1a_blis 1m_blis"
+	#test_impls="asm_blis 1m_blis"
+fi
+
+# Save a copy of GOMP_CPU_AFFINITY so that if we have to unset it, we can
+# restore the value.
+GOMP_CPU_AFFINITYsave=${GOMP_CPU_AFFINITY}
+
+
+# First perform real test cases.
+for th in ${threads}; do
+
+	# Start with one way of parallelism in each loop. We will now begin
+	# parsing the 'th' variable to update one or more of these threading
+	# parameters.
+	jc_nt=1; pc_nt=1; ic_nt=1; jr_nt=1; ir_nt=1
+
+	# Strip everything before and after the underscore so that what remains
+	# is the problem size and threading parameter string, respectively.
+	psize=${th##*_}; thinfo=${th%%_*}
+
+	# Identify each threading parameter and insert a space before it.
+	thsep=$(echo -e ${thinfo} | sed -e "s/\([jip][cr]\)/ \1/g" )
+
+	nt=1
+
+	for loopnum in ${thsep}; do
+
+		# Given the current string, which identifies a loop and the
+		# number of ways of parallelism for that loop, strip out
+		# the ways and loop separately to identify each.
+		loop=$(echo -e ${loopnum} | sed -e "s/[0-9]//g" )
+		num=$(echo -e ${loopnum} | sed -e "s/[a-z]//g" )
+
+		# Construct a string that we can evaluate to set the number
+		# of ways of parallelism for the current loop.
+		loop_nt_eq_num="${loop}_nt=${num}"
+
+		# Update the total number of threads.
+		nt=$(expr ${nt} \* ${num})
+
+		# Evaluate the string to assign the ways to the variable.
+		eval ${loop_nt_eq_num}
+
+	done
+
+	echo "Switching to: jc${jc_nt} pc${pc_nt} ic${ic_nt} jr${jr_nt} ir${ir_nt} (nt = ${nt}) p_max${psize}"
+
+
+	for dt in ${test_dts}; do
+
+		for im in ${test_impls}; do
+
+			if [ "${dt}" = "s"       -o "${dt}" = "d"         ] && \
+			   [ "${im}" = "1m_blis" -o "${im}" = "4m1a_blis" ]; then
+				continue
+			fi
+
+			for op in ${test_ops}; do
+
+				# Eigen does not support multithreading for hemm, herk, trmm,
+				# or trsm. So if we're getting ready to execute an Eigen driver
+				# for one of these operations and nt > 1, we skip this test.
+				if [ "${im}"  = "eigen" ] && \
+				   [ "${op}" != "gemm"  ] && \
+				   [ "${nt}" != "1"     ]; then
+					continue;
+				fi
+
+				# Find the threading suffix by probing the executable.
+				binname=$(ls ${exec_root}_${dt}${op}_${psize}_${im}_*.x)
+				suf_ext=${binname##*_}
+				suf=${suf_ext%%.*}
+
+				#echo "found file: ${binname} with suffix ${suf}"
+
+				# Set the number of threads according to th.
+				if [ "${suf}" = "1s" ] || [ "${suf}" = "2s" ]; then
+
+					# Set the threading parameters based on the implementation we
+					# are about to run. Every *_blis driver reads the BLIS_*_NT ways.
+					if   [[ "${im}" = *_blis ]]; then
+						unset  OMP_NUM_THREADS
+						export BLIS_JC_NT=${jc_nt}
+						export BLIS_PC_NT=${pc_nt}
+						export BLIS_IC_NT=${ic_nt}
+						export BLIS_JR_NT=${jr_nt}
+						export BLIS_IR_NT=${ir_nt}
+					elif [ "${im}" = "openblas" ]; then
+						unset  OMP_NUM_THREADS
+						export OPENBLAS_NUM_THREADS=${nt}
+					elif [ "${im}" = "eigen" ]; then
+						export OMP_NUM_THREADS=${nt}
+					elif [ "${im}" = "vendor" ]; then
+						unset  OMP_NUM_THREADS
+						export MKL_NUM_THREADS=${nt}
+					fi
+					export nt_use=${nt}
+
+					# Multithreaded OpenBLAS seems to have a problem running
+					# properly if GOMP_CPU_AFFINITY is set. So we temporarily
+					# unset it here if we are about to execute OpenBLAS, but
+					# otherwise restore it.
+					if [ "${im}" = "openblas" ]; then
+						unset GOMP_CPU_AFFINITY
+					else
+						export GOMP_CPU_AFFINITY="${GOMP_CPU_AFFINITYsave}"
+					fi
+				else
+
+					export BLIS_JC_NT=1
+					export BLIS_PC_NT=1
+					export BLIS_IC_NT=1
+					export BLIS_JR_NT=1
+					export BLIS_IR_NT=1
+					export OMP_NUM_THREADS=1
+					export OPENBLAS_NUM_THREADS=1
+					export MKL_NUM_THREADS=1
+					export nt_use=1
+				fi
+
+				# Construct the name of the test executable.
+				exec_name="${exec_root}_${dt}${op}_${psize}_${im}_${suf}.x"
+
+				# Construct the name of the output file.
+				out_file="${out_root}_${suf}_${dt}${op}_${im}.m"
+
+				#echo "Running (nt = ${nt_use}) ./${exec_name} > ${out_file}"
+				echo "Running ./${exec_name} > ${out_file}"
+
+				# Run executable.
+				./${exec_name} > ${out_file}
+
+				sleep ${delay}
+
+			done
+		done
+	done
+done
+
diff --git a/test/1m4m/test_gemm.c b/test/1m4m/test_gemm.c
new file mode 100644
index 0000000000000000000000000000000000000000..a58e6e58935773bd3f077ed185b3d14a90dbabb2
--- /dev/null
+++ b/test/1m4m/test_gemm.c
@@ -0,0 +1,425 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <unistd.h>
+#ifdef EIGEN
+  #define BLIS_DISABLE_BLAS_DEFS
+  #include "blis.h"
+  #include <Eigen/Core>
+  #include <Eigen/src/misc/blas.h>
+  using namespace Eigen;
+#else
+  #include "blis.h"
+#endif
+
+#define COL_STORAGE
+//#define ROW_STORAGE
+
+//#define PRINT
+
+int main( int argc, char** argv )
+{
+	obj_t    a, b, c;
+	obj_t    c_save;
+	obj_t    alpha, beta;
+	dim_t    m, n, k;
+	dim_t    p;
+	dim_t    p_begin, p_max, p_inc;
+	int      m_input, n_input, k_input;
+	ind_t    ind;
+	num_t    dt;
+	char     dt_ch;
+	int      r, n_repeats;
+	trans_t  transa;
+	trans_t  transb;
+	f77_char f77_transa;
+	f77_char f77_transb;
+
+	double   dtime;
+	double   dtime_save;
+	double   gflops;
+
+	//bli_init();
+
+	bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
+
+	n_repeats = 3;
+
+	dt      = DT;
+
+	ind     = IND;
+
+#if 1
+	p_begin = P_BEGIN;
+	p_max   = P_MAX;
+	p_inc   = P_INC;
+
+	m_input = -1;
+	n_input = -1;
+	k_input = -1;
+#else
+	p_begin = 40;
+	p_max   = 2000;
+	p_inc   = 40;
+
+	m_input = -1;
+	n_input = -1;
+	k_input = -1;
+#endif
+
+
+	// Suppress compiler warnings about unused variable 'ind'.
+	( void )ind;
+
+#if 0
+
+	cntx_t* cntx;
+
+	ind_t ind_mod = ind;
+
+	// A hack to use 3m1 as 1mpb (with 1m as 1mbp).
+	if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M;
+
+	// Initialize a context for the current induced method and datatype.
+	cntx = bli_gks_query_ind_cntx( ind_mod, dt );
+
+	// Set k to the kc blocksize for the current datatype.
+	k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx );
+
+#elif 0
+
+	#ifdef BLIS
+	if      ( ind == BLIS_4M1A ) k_input = 128;
+	else if ( ind == BLIS_1M )   k_input = 128;
+	else                         k_input = 256;
+	#else
+	k_input = 192;
+	#endif
+
+#endif
+
+	// Choose the char corresponding to the requested datatype.
+	if      ( bli_is_float( dt ) )    dt_ch = 's';
+	else if ( bli_is_double( dt ) )   dt_ch = 'd';
+	else if ( bli_is_scomplex( dt ) ) dt_ch = 'c';
+	else                              dt_ch = 'z';
+
+	transa = BLIS_NO_TRANSPOSE;
+	transb = BLIS_NO_TRANSPOSE;
+
+	bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
+	bli_param_map_blis_to_netlib_trans( transb, &f77_transb );
+
+	// Begin with initializing the last entry to zero so that
+	// matlab allocates space for the entire array once up-front.
+	for ( p = p_begin; p + p_inc <= p_max; p += p_inc ) ;
+
+	printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, STR );
+	printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
+	        ( unsigned long )(p - p_begin)/p_inc + 1,
+	        ( unsigned long )0,
+	        ( unsigned long )0,
+	        ( unsigned long )0, 0.0 );
+
+
+	//for ( p = p_begin; p <= p_max; p += p_inc )
+	for ( p = p_max; p_begin <= p; p -= p_inc )
+	{
+
+		if ( m_input < 0 ) m = p / ( dim_t )abs(m_input);
+		else               m =     ( dim_t )    m_input;
+		if ( n_input < 0 ) n = p / ( dim_t )abs(n_input);
+		else               n =     ( dim_t )    n_input;
+		if ( k_input < 0 ) k = p / ( dim_t )abs(k_input);
+		else               k =     ( dim_t )    k_input;
+
+		bli_obj_create( dt, 1, 1, 0, 0, &alpha );
+		bli_obj_create( dt, 1, 1, 0, 0, &beta );
+
+	#ifdef COL_STORAGE
+		bli_obj_create( dt, m, k, 0, 0, &a );
+		bli_obj_create( dt, k, n, 0, 0, &b );
+		bli_obj_create( dt, m, n, 0, 0, &c );
+		bli_obj_create( dt, m, n, 0, 0, &c_save );
+	#else
+		bli_obj_create( dt, m, k, k, 1, &a );
+		bli_obj_create( dt, k, n, n, 1, &b );
+		bli_obj_create( dt, m, n, n, 1, &c );
+		bli_obj_create( dt, m, n, n, 1, &c_save );
+	#endif
+
+		bli_randm( &a );
+		bli_randm( &b );
+		bli_randm( &c );
+
+		bli_obj_set_conjtrans( transa, &a );
+		bli_obj_set_conjtrans( transb, &b );
+
+		bli_setsc(  (1.0/1.0), 0.0, &alpha );
+		bli_setsc(  (1.0/1.0), 0.0, &beta );
+
+		bli_copym( &c, &c_save );
+	
+#ifdef BLIS
+		bli_ind_disable_all_dt( dt );
+		bli_ind_enable_dt( ind, dt );
+#endif
+
+#ifdef EIGEN
+		double alpha_r, alpha_i;
+
+		bli_getsc( &alpha, &alpha_r, &alpha_i );
+
+		void* ap = bli_obj_buffer_at_off( &a );
+		void* bp = bli_obj_buffer_at_off( &b );
+		void* cp = bli_obj_buffer_at_off( &c );
+
+	#ifdef COL_STORAGE
+		const int os_a = bli_obj_col_stride( &a );
+		const int os_b = bli_obj_col_stride( &b );
+		const int os_c = bli_obj_col_stride( &c );
+	#else
+		const int os_a = bli_obj_row_stride( &a );
+		const int os_b = bli_obj_row_stride( &b );
+		const int os_c = bli_obj_row_stride( &c );
+	#endif
+
+		Stride<Dynamic,1> stride_a( os_a, 1 );
+		Stride<Dynamic,1> stride_b( os_b, 1 );
+		Stride<Dynamic,1> stride_c( os_c, 1 );
+
+	#ifdef COL_STORAGE
+		#if defined(IS_FLOAT)
+		typedef Matrix<float,                Dynamic, Dynamic, ColMajor> MatrixXf_;
+		#elif defined (IS_DOUBLE)
+		typedef Matrix<double,               Dynamic, Dynamic, ColMajor> MatrixXd_;
+		#elif defined (IS_SCOMPLEX)
+		typedef Matrix<std::complex<float>,  Dynamic, Dynamic, ColMajor> MatrixXcf_;
+		#elif defined (IS_DCOMPLEX)
+		typedef Matrix<std::complex<double>, Dynamic, Dynamic, ColMajor> MatrixXcd_;
+		#endif
+	#else
+		#if defined(IS_FLOAT)
+		typedef Matrix<float,                Dynamic, Dynamic, RowMajor> MatrixXf_;
+		#elif defined (IS_DOUBLE)
+		typedef Matrix<double,               Dynamic, Dynamic, RowMajor> MatrixXd_;
+		#elif defined (IS_SCOMPLEX)
+		typedef Matrix<std::complex<float>,  Dynamic, Dynamic, RowMajor> MatrixXcf_;
+		#elif defined (IS_DCOMPLEX)
+		typedef Matrix<std::complex<double>, Dynamic, Dynamic, RowMajor> MatrixXcd_;
+		#endif
+	#endif
+	#if defined(IS_FLOAT)
+		Map<MatrixXf_,  0, Stride<Dynamic,1> > A( ( float*  )ap, m, k, stride_a );
+		Map<MatrixXf_,  0, Stride<Dynamic,1> > B( ( float*  )bp, k, n, stride_b );
+		Map<MatrixXf_,  0, Stride<Dynamic,1> > C( ( float*  )cp, m, n, stride_c );
+	#elif defined (IS_DOUBLE)
+		Map<MatrixXd_,  0, Stride<Dynamic,1> > A( ( double* )ap, m, k, stride_a );
+		Map<MatrixXd_,  0, Stride<Dynamic,1> > B( ( double* )bp, k, n, stride_b );
+		Map<MatrixXd_,  0, Stride<Dynamic,1> > C( ( double* )cp, m, n, stride_c );
+	#elif defined (IS_SCOMPLEX)
+		Map<MatrixXcf_, 0, Stride<Dynamic,1> > A( ( std::complex<float>*  )ap, m, k, stride_a );
+		Map<MatrixXcf_, 0, Stride<Dynamic,1> > B( ( std::complex<float>*  )bp, k, n, stride_b );
+		Map<MatrixXcf_, 0, Stride<Dynamic,1> > C( ( std::complex<float>*  )cp, m, n, stride_c );
+	#elif defined (IS_DCOMPLEX)
+		Map<MatrixXcd_, 0, Stride<Dynamic,1> > A( ( std::complex<double>* )ap, m, k, stride_a );
+		Map<MatrixXcd_, 0, Stride<Dynamic,1> > B( ( std::complex<double>* )bp, k, n, stride_b );
+		Map<MatrixXcd_, 0, Stride<Dynamic,1> > C( ( std::complex<double>* )cp, m, n, stride_c );
+	#endif
+#endif
+
+		dtime_save = DBL_MAX;
+
+		for ( r = 0; r < n_repeats; ++r )
+		{
+			bli_copym( &c_save, &c );
+
+			dtime = bli_clock();
+
+#ifdef PRINT
+			bli_printm( "a", &a, "%4.1f", "" );
+			bli_printm( "b", &b, "%4.1f", "" );
+			bli_printm( "c", &c, "%4.1f", "" );
+#endif
+
+#if defined(BLIS)
+
+			bli_gemm( &alpha,
+			          &a,
+			          &b,
+			          &beta,
+			          &c );
+
+#elif defined(EIGEN)
+
+			C.noalias() += alpha_r * A * B;
+
+#else // if defined(BLAS)
+
+			if ( bli_is_float( dt ) )
+			{
+				f77_int   mm     = bli_obj_length( &c );
+				f77_int   kk     = bli_obj_width_after_trans( &a );
+				f77_int   nn     = bli_obj_width( &c );
+				f77_int   lda    = bli_obj_col_stride( &a );
+				f77_int   ldb    = bli_obj_col_stride( &b );
+				f77_int   ldc    = bli_obj_col_stride( &c );
+				float*    alphap = ( float* )bli_obj_buffer( &alpha );
+				float*    ap     = ( float* )bli_obj_buffer( &a );
+				float*    bp     = ( float* )bli_obj_buffer( &b );
+				float*    betap  = ( float* )bli_obj_buffer( &beta );
+				float*    cp     = ( float* )bli_obj_buffer( &c );
+
+				sgemm_( &f77_transa,
+						&f77_transb,
+						&mm,
+						&nn,
+						&kk,
+						alphap,
+						ap, &lda,
+						bp, &ldb,
+						betap,
+						cp, &ldc );
+			}
+			else if ( bli_is_double( dt ) )
+			{
+				f77_int   mm     = bli_obj_length( &c );
+				f77_int   kk     = bli_obj_width_after_trans( &a );
+				f77_int   nn     = bli_obj_width( &c );
+				f77_int   lda    = bli_obj_col_stride( &a );
+				f77_int   ldb    = bli_obj_col_stride( &b );
+				f77_int   ldc    = bli_obj_col_stride( &c );
+				double*   alphap = ( double* )bli_obj_buffer( &alpha );
+				double*   ap     = ( double* )bli_obj_buffer( &a );
+				double*   bp     = ( double* )bli_obj_buffer( &b );
+				double*   betap  = ( double* )bli_obj_buffer( &beta );
+				double*   cp     = ( double* )bli_obj_buffer( &c );
+
+				dgemm_( &f77_transa,
+						&f77_transb,
+						&mm,
+						&nn,
+						&kk,
+						alphap,
+						ap, &lda,
+						bp, &ldb,
+						betap,
+						cp, &ldc );
+			}
+			else if ( bli_is_scomplex( dt ) )
+			{
+				f77_int   mm     = bli_obj_length( &c );
+				f77_int   kk     = bli_obj_width_after_trans( &a );
+				f77_int   nn     = bli_obj_width( &c );
+				f77_int   lda    = bli_obj_col_stride( &a );
+				f77_int   ldb    = bli_obj_col_stride( &b );
+				f77_int   ldc    = bli_obj_col_stride( &c );
+				scomplex* alphap = ( scomplex* )bli_obj_buffer( &alpha );
+				scomplex* ap     = ( scomplex* )bli_obj_buffer( &a );
+				scomplex* bp     = ( scomplex* )bli_obj_buffer( &b );
+				scomplex* betap  = ( scomplex* )bli_obj_buffer( &beta );
+				scomplex* cp     = ( scomplex* )bli_obj_buffer( &c );
+
+				cgemm_( &f77_transa,
+						&f77_transb,
+						&mm,
+						&nn,
+						&kk,
+						alphap,
+						ap, &lda,
+						bp, &ldb,
+						betap,
+						cp, &ldc );
+			}
+			else if ( bli_is_dcomplex( dt ) )
+			{
+				f77_int   mm     = bli_obj_length( &c );
+				f77_int   kk     = bli_obj_width_after_trans( &a );
+				f77_int   nn     = bli_obj_width( &c );
+				f77_int   lda    = bli_obj_col_stride( &a );
+				f77_int   ldb    = bli_obj_col_stride( &b );
+				f77_int   ldc    = bli_obj_col_stride( &c );
+				dcomplex* alphap = ( dcomplex* )bli_obj_buffer( &alpha );
+				dcomplex* ap     = ( dcomplex* )bli_obj_buffer( &a );
+				dcomplex* bp     = ( dcomplex* )bli_obj_buffer( &b );
+				dcomplex* betap  = ( dcomplex* )bli_obj_buffer( &beta );
+				dcomplex* cp     = ( dcomplex* )bli_obj_buffer( &c );
+
+				zgemm_( &f77_transa,
+						&f77_transb,
+						&mm,
+						&nn,
+						&kk,
+						alphap,
+						ap, &lda,
+						bp, &ldb,
+						betap,
+						cp, &ldc );
+			}
+#endif
+
+#ifdef PRINT
+			bli_printm( "c after", &c, "%4.1f", "" );
+			exit(1);
+#endif
+
+			dtime_save = bli_clock_min_diff( dtime_save, dtime );
+		}
+
+		gflops = ( 2.0 * m * k * n ) / ( dtime_save * 1.0e9 );
+
+		if ( bli_is_complex( dt ) ) gflops *= 4.0;
+
+		printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, STR );
+		printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
+		        ( unsigned long )(p - p_begin)/p_inc + 1,
+		        ( unsigned long )m,
+		        ( unsigned long )k,
+		        ( unsigned long )n, gflops );
+		//fflush( stdout );
+
+		bli_obj_free( &alpha );
+		bli_obj_free( &beta );
+
+		bli_obj_free( &a );
+		bli_obj_free( &b );
+		bli_obj_free( &c );
+		bli_obj_free( &c_save );
+	}
+
+	//bli_finalize();
+
+	return 0;
+}
+
diff --git a/test/3/Makefile b/test/3/Makefile
index 972b4d93dfb87e325198b008cd4a147d8c9bf4d8..38d915721c712bc41c3f5f5e4f384e58573fe5df 100644
--- a/test/3/Makefile
+++ b/test/3/Makefile
@@ -135,14 +135,14 @@ PS_MAX   := 2400
 PS_INC   := 48
 
 # Single-socket (multithreaded)
-P1_BEGIN := 120
-P1_MAX   := 6000
-P1_INC   := 120
+P1_BEGIN := 96
+P1_MAX   := 4800
+P1_INC   := 96
 
 # Dual-socket (multithreaded)
-P2_BEGIN := 160
-P2_MAX   := 8000
-P2_INC   := 160
+P2_BEGIN := 144
+P2_MAX   := 7200
+P2_INC   := 144
 
 
 #
diff --git a/test/3/test_gemm.c b/test/3/test_gemm.c
index e7b6a94359814023ab4bec29f9994dcee9d82312..5ff4c0c0fa58da8f8dc63b92b1d7f6cb9f216f75 100644
--- a/test/3/test_gemm.c
+++ b/test/3/test_gemm.c
@@ -43,8 +43,8 @@
   #include "blis.h"
 #endif
 
-//#define COL_STORAGE
-#define ROW_STORAGE
+#define COL_STORAGE
+//#define ROW_STORAGE
 
 //#define PRINT
 
@@ -141,13 +141,14 @@ int main( int argc, char** argv )
 
 	printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, STR );
 	printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
-	        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+	        ( unsigned long )(p - p_begin)/p_inc + 1,
 	        ( unsigned long )0,
 	        ( unsigned long )0,
 	        ( unsigned long )0, 0.0 );
 
 
-	for ( p = p_begin; p <= p_max; p += p_inc )
+	//for ( p = p_begin; p <= p_max; p += p_inc )
+	for ( p = p_max; p_begin <= p; p -= p_inc )
 	{
 
 		if ( m_input < 0 ) m = p / ( dim_t )abs(m_input);
@@ -396,7 +397,7 @@ int main( int argc, char** argv )
 
 		printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, STR );
 		printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
-		        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+		        ( unsigned long )(p - p_begin)/p_inc + 1,
 		        ( unsigned long )m,
 		        ( unsigned long )k,
 		        ( unsigned long )n, gflops );
diff --git a/test/3/test_hemm.c b/test/3/test_hemm.c
index 73746ae4bdef382ab6e745a0b024078f294c4991..e69a1ec5742e21ae7b0ac14ac1cc90af6acb4697 100644
--- a/test/3/test_hemm.c
+++ b/test/3/test_hemm.c
@@ -119,12 +119,13 @@ int main( int argc, char** argv )
 
 	printf( "data_%s_%chemm_%s", THR_STR, dt_ch, STR );
 	printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
-	        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+	        ( unsigned long )(p - p_begin)/p_inc + 1,
 	        ( unsigned long )0,
 	        ( unsigned long )0, 0.0 );
 
 
-	for ( p = p_begin; p <= p_max; p += p_inc )
+	//for ( p = p_begin; p <= p_max; p += p_inc )
+	for ( p = p_max; p_begin <= p; p -= p_inc )
 	{
 
 		if ( m_input < 0 ) m = p / ( dim_t )abs(m_input);
@@ -317,7 +318,7 @@ int main( int argc, char** argv )
 
 		printf( "data_%s_%chemm_%s", THR_STR, dt_ch, STR );
 		printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
-		        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+		        ( unsigned long )(p - p_begin)/p_inc + 1,
 		        ( unsigned long )m,
 		        ( unsigned long )n, gflops );
 
diff --git a/test/3/test_herk.c b/test/3/test_herk.c
index f51cf8c298d08fd8cb92c1ccc23f83ad749c454b..b963f944b3590878d6b1933fa8a68b8885bdc2f1 100644
--- a/test/3/test_herk.c
+++ b/test/3/test_herk.c
@@ -121,12 +121,13 @@ int main( int argc, char** argv )
 
 	printf( "data_%s_%cherk_%s", THR_STR, dt_ch, STR );
 	printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
-	        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+	        ( unsigned long )(p - p_begin)/p_inc + 1,
 	        ( unsigned long )0,
 	        ( unsigned long )0, 0.0 );
 
 
-	for ( p = p_begin; p <= p_max; p += p_inc )
+	//for ( p = p_begin; p <= p_max; p += p_inc )
+	for ( p = p_max; p_begin <= p; p -= p_inc )
 	{
 
 		if ( m_input < 0 ) m = p / ( dim_t )abs(m_input);
@@ -297,7 +298,7 @@ int main( int argc, char** argv )
 
 		printf( "data_%s_%cherk_%s", THR_STR, dt_ch, STR );
 		printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
-		        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+		        ( unsigned long )(p - p_begin)/p_inc + 1,
 		        ( unsigned long )m,
 		        ( unsigned long )k, gflops );
 
diff --git a/test/3/test_trmm.c b/test/3/test_trmm.c
index 1915b508ad3a1c66fbf8d0f3db33cb2e4c6b2811..2fa7fe52dd0592b2c762165f73878157801bc822 100644
--- a/test/3/test_trmm.c
+++ b/test/3/test_trmm.c
@@ -136,12 +136,13 @@ int main( int argc, char** argv )
 
 	printf( "data_%s_%ctrmm_%s", THR_STR, dt_ch, STR );
 	printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
-	        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+	        ( unsigned long )(p - p_begin)/p_inc + 1,
 	        ( unsigned long )0,
 	        ( unsigned long )0, 0.0 );
 
 
-	for ( p = p_begin; p <= p_max; p += p_inc )
+	//for ( p = p_begin; p <= p_max; p += p_inc )
+	for ( p = p_max; p_begin <= p; p -= p_inc )
 	{
 
 		if ( m_input < 0 ) m = p / ( dim_t )abs(m_input);
@@ -311,7 +312,7 @@ int main( int argc, char** argv )
 
 		printf( "data_%s_%ctrmm_%s", THR_STR, dt_ch, STR );
 		printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
-		        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+		        ( unsigned long )(p - p_begin)/p_inc + 1,
 		        ( unsigned long )m,
 		        ( unsigned long )n, gflops );
 
diff --git a/test/3/test_trsm.c b/test/3/test_trsm.c
index b474a52a7e6c0d725e85544dd128676ff2b09217..2e5ff0a53314bab2ace9a2a0c18ddc4e26be01a4 100644
--- a/test/3/test_trsm.c
+++ b/test/3/test_trsm.c
@@ -136,12 +136,13 @@ int main( int argc, char** argv )
 
 	printf( "data_%s_%ctrsm_%s", THR_STR, dt_ch, STR );
 	printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
-	        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+	        ( unsigned long )(p - p_begin)/p_inc + 1,
 	        ( unsigned long )0,
 	        ( unsigned long )0, 0.0 );
 
 
-	for ( p = p_begin; p <= p_max; p += p_inc )
+	//for ( p = p_begin; p <= p_max; p += p_inc )
+	for ( p = p_max; p_begin <= p; p -= p_inc )
 	{
 
 		if ( m_input < 0 ) m = p / ( dim_t )abs(m_input);
@@ -315,7 +316,7 @@ int main( int argc, char** argv )
 
 		printf( "data_%s_%ctrsm_%s", THR_STR, dt_ch, STR );
 		printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
-		        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+		        ( unsigned long )(p - p_begin)/p_inc + 1,
 		        ( unsigned long )m,
 		        ( unsigned long )n, gflops );
 
diff --git a/test/mixeddt/Makefile b/test/mixeddt/Makefile
index 87568825af99cdf7434866e49790e487644264e1..20e5378ffbed2f310de1a22861a67dff75f46893 100644
--- a/test/mixeddt/Makefile
+++ b/test/mixeddt/Makefile
@@ -140,11 +140,11 @@ STR_MT   := -DTHR_STR=\"mt\"
 
 # Problem size specification
 PDEF_ST  := -DP_BEGIN=40 \
-            -DP_END=2000 \
+            -DP_MAX=2000 \
             -DP_INC=40
 
 PDEF_MT  := -DP_BEGIN=160 \
-            -DP_END=8000 \
+            -DP_MAX=8000 \
             -DP_INC=160
 
 # Enumerate possible datatypes and computation precisions.
diff --git a/test/mixeddt/test_gemm.c b/test/mixeddt/test_gemm.c
index ea45a7c1410fef1e814ef2e83d8e1f9e785981d0..12437e41137d075f9e5526ba27482c856a8b01b0 100644
--- a/test/mixeddt/test_gemm.c
+++ b/test/mixeddt/test_gemm.c
@@ -77,7 +77,7 @@ int main( int argc, char** argv )
 	prec_t comp_prec = bli_dt_prec( dtx );
 
 	dim_t p_begin = P_BEGIN;
-	dim_t p_end   = P_END;
+	dim_t p_max   = P_MAX;
 	dim_t p_inc   = P_INC;
 
 	int m_input   = -1;
@@ -122,12 +122,12 @@ int main( int argc, char** argv )
 
 	// Begin with initializing the last entry to zero so that
 	// matlab allocates space for the entire array once up-front.
-	for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ;
+	for ( p = p_begin; p + p_inc <= p_max; p += p_inc ) ;
 
 	//printf( "data_%s_%c%c%c%cgemm_%s",      THR_STR, dtc_ch, dta_ch, dtb_ch, dtx_ch, STR );
 	printf( "data_gemm_%s", STR );
 	printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
-	        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+	        ( unsigned long )(p - p_begin)/p_inc + 1,
 	        ( unsigned long )0,
 	        ( unsigned long )0,
 	        ( unsigned long )0, 0.0 );
@@ -143,7 +143,8 @@ int main( int argc, char** argv )
 	else if ( c_complex && a_complex && b_complex ) flopsmul = 8.0;
 
 
-	for ( p = p_begin; p <= p_end; p += p_inc )
+	//for ( p = p_begin; p <= p_max; p += p_inc )
+	for ( p = p_max; p_begin <= p; p -= p_inc )
 	{
 
 		if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
@@ -220,7 +221,7 @@ int main( int argc, char** argv )
 		//printf( "data_%s_%c%c%c%cgemm_%s",      THR_STR, dtc_ch, dta_ch, dtb_ch, dtx_ch, STR );
 		printf( "data_gemm_%s", STR );
 		printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
-		        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+		        ( unsigned long )(p - p_begin)/p_inc + 1,
 		        ( unsigned long )m,
 		        ( unsigned long )k,
 		        ( unsigned long )n, gflops );
diff --git a/test/sup/Makefile b/test/sup/Makefile
index d2b3c7170a04bb515d801529648c8a0a80169c8b..2cd0627472843682f7883d38afcc3bcf67bd3f9a 100644
--- a/test/sup/Makefile
+++ b/test/sup/Makefile
@@ -96,6 +96,9 @@ endif
 HOME_LIB_PATH  := $(HOME)/flame/lib
 MKL_LIB_PATH   := $(HOME)/intel/mkl/lib/intel64
 
+# netlib BLAS
+NETLIB_LIB     := $(HOME_LIB_PATH)/libblas.a
+
 # OpenBLAS
 OPENBLAS_LIB   := $(HOME_LIB_PATH)/libopenblas.a
 OPENBLASP_LIB  := $(HOME_LIB_PATH)/libopenblasp.a
@@ -103,6 +106,10 @@ OPENBLASP_LIB  := $(HOME_LIB_PATH)/libopenblasp.a
 # BLASFEO
 BLASFEO_LIB    := $(HOME_LIB_PATH)/libblasfeo.a
 
+# libxsmm
+LIBXSMM_LIB    := $(HOME_LIB_PATH)/libxsmm.a -ldl \
+                  $(NETLIB_LIB) -lgfortran
+
 # ATLAS
 ATLAS_LIB      := $(HOME_LIB_PATH)/libf77blas.a \
                   $(HOME_LIB_PATH)/libatlas.a
@@ -210,15 +217,22 @@ TRANS := n_n \
          t_n \
          t_t
 
+# While BLIS supports all combinations of row and column storage for matrices
+# C, A, and B, the alternatives mostly only support CBLAS APIs, which inherently
+# support only "all row-storage" or "all column-storage". Thus, we disable the
+# building of those other drivers so that compilation/linking completes sooner.
+#STORS := r_r_r \
+#         r_r_c \
+#         r_c_r \
+#         r_c_c \
+#         c_r_r \
+#         c_r_c \
+#         c_c_r \
+#         c_c_c
 STORS := r_r_r \
-         r_r_c \
-         r_c_r \
-         r_c_c \
-         c_r_r \
-         c_r_c \
-         c_c_r \
          c_c_c
 
+
 SHAPES := l_l_s \
           l_s_l \
           s_l_l \
@@ -306,14 +320,18 @@ get-imp-defs = $(strip $(subst  blissup,-DSTR=\"$(1)\" -DBLIS -DSUP, \
                        $(subst    eigen,-DSTR=\"$(1)\" -DEIGEN, \
                        $(subst openblas,-DSTR=\"$(1)\" -DCBLAS, \
                        $(subst  blasfeo,-DSTR=\"$(1)\" -DCBLAS, \
-                       $(subst   vendor,-DSTR=\"$(1)\" -DCBLAS,$(1))))))))
+                       $(subst  libxsmm,-DSTR=\"$(1)\" -DBLAS -DXSMM, \
+                       $(subst   vendor,-DSTR=\"$(1)\" -DCBLAS,$(1)))))))))
 
 TRANS0  = $(call stripu,$(TRANS))
 STORS0  = $(call stripu,$(STORS))
 
 # Limit BLAS and Eigen to only using all row-stored, or all column-stored matrices.
+# Also, limit libxsmm to using all column-stored matrices since it does not offer
+# CBLAS interfaces.
 BSTORS0 = rrr ccc
 ESTORS0 = rrr ccc
+XSTORS0 = ccc
 
 
 #
@@ -339,6 +357,9 @@ OPENBLAS_ST_BINS := $(patsubst %.o,%.x,$(OPENBLAS_ST_OBJS))
 BLASFEO_ST_OBJS  := $(call get-st-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),blasfeo)
 BLASFEO_ST_BINS  := $(patsubst %.o,%.x,$(BLASFEO_ST_OBJS))
 
+LIBXSMM_ST_OBJS  := $(call get-st-objs,$(DTS),$(TRANS0),$(XSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),libxsmm)
+LIBXSMM_ST_BINS  := $(patsubst %.o,%.x,$(LIBXSMM_ST_OBJS))
+
 VENDOR_ST_OBJS   := $(call get-st-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),vendor)
 VENDOR_ST_BINS   := $(patsubst %.o,%.x,$(VENDOR_ST_OBJS))
 
@@ -351,6 +372,7 @@ VENDOR_ST_BINS   := $(patsubst %.o,%.x,$(VENDOR_ST_OBJS))
                $(EIGEN_ST_OBJS) \
                $(OPENBLAS_ST_OBJS) \
                $(BLASFEO_ST_OBJS) \
+               $(LIBXSMM_ST_OBJS) \
                $(VENDOR_ST_OBJS)
 
 
@@ -365,9 +387,11 @@ blislpab:    blislpab-st
 eigen:       eigen-st
 openblas:    openblas-st
 blasfeo:     blasfeo-st
+libxsmm:     libxsmm-st
 vendor:      vendor-st
 
-st:          blissup-st blislpab-st eigen-st openblas-st blasfeo-st vendor-st
+st:          blissup-st blislpab-st \
+             eigen-st openblas-st blasfeo-st libxsmm-st vendor-st
 blis:        blissup-st blislpab-st
 
 blissup-st:  $(BLISSUP_ST_BINS)
@@ -375,13 +399,14 @@ blislpab-st: $(BLISLPAB_ST_BINS)
 eigen-st:    $(EIGEN_ST_BINS)
 openblas-st: $(OPENBLAS_ST_BINS)
 blasfeo-st:  $(BLASFEO_ST_BINS)
+libxsmm-st:  $(LIBXSMM_ST_BINS)
 vendor-st:   $(VENDOR_ST_BINS)
 
 
 # --Object file rules --
 
 # Define the implementations for which we will instantiate compilation rules.
-BIMPLS := blissup blislpab openblas blasfeo vendor
+BIMPLS := blissup blislpab openblas blasfeo libxsmm vendor
 EIMPLS := eigen
 
 #      1     2  3   4 567  8
@@ -447,6 +472,9 @@ test_%_openblas_st.x: test_%_openblas_st.o $(LIBBLIS_LINK)
 test_%_blasfeo_st.x:  test_%_blasfeo_st.o  $(LIBBLIS_LINK)
 	$(CC) $(strip $<  $(BLASFEO_LIB)       $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
 
+test_%_libxsmm_st.x:  test_%_libxsmm_st.o  $(LIBBLIS_LINK)
+	$(CC) $(strip $<  $(LIBXSMM_LIB)       $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
+
 test_%_vendor_st.x:   test_%_vendor_st.o   $(LIBBLIS_LINK)
 	$(CC) $(strip $<  $(VENDOR_LIB)        $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
 
@@ -456,5 +484,5 @@ test_%_vendor_st.x:   test_%_vendor_st.o   $(LIBBLIS_LINK)
 clean: cleanx
 
 cleanx:
-	- $(RM_F) *.x
+	- $(RM_F) *.x *.o
 
diff --git a/test/sup/octave/plot_l3sup_perf.m b/test/sup/octave/plot_l3sup_perf.m
index b027bf7a43015d1118ca2484cc75a118885bf366..bf291087891c6ce9b923d5e9f41fdb1defebfc59 100644
--- a/test/sup/octave/plot_l3sup_perf.m
+++ b/test/sup/octave/plot_l3sup_perf.m
@@ -4,20 +4,21 @@ function r_val = plot_l3sup_perf( opname, ...
                                   data_eigen, ...
                                   data_open, ...
                                   data_bfeo, ...
+                                  data_xsmm, ...
                                   data_vend, vend_str, ...
                                   nth, ...
                                   rows, cols, ...
                                   cfreq, ...
                                   dfps, ...
                                   theid, impl )
-if ... %mod(theid-1,cols) == 2 || ...
-   ... %mod(theid-1,cols) == 3 || ...
-   ... %mod(theid-1,cols) == 4 || ...
-   0 == 1 ... %theid >= 19
-	show_plot = 0;
-else
+%if ... %mod(theid-1,cols) == 2 || ...
+%   ... %mod(theid-1,cols) == 3 || ...
+%   ... %mod(theid-1,cols) == 4 || ...
+%   0 == 1 ... %theid >= 19
+%	show_plot = 0;
+%else
 	show_plot = 1;
-end
+%end
 
 %legend_plot_id = 11;
 legend_plot_id = 1*cols + 1*5;
@@ -33,6 +34,7 @@ color_blislpab = 'k'; lines_blislpab = ':';  markr_blislpab = '';
 color_eigen    = 'm'; lines_eigen    = '-.'; markr_eigen    = 'o';
 color_open     = 'r'; lines_open     = '--'; markr_open     = 'o';
 color_bfeo     = 'c'; lines_bfeo     = '-';  markr_bfeo     = 'o';
+color_xsmm     = 'g'; lines_xsmm     = '-';  markr_xsmm     = 'o';
 color_vend     = 'b'; lines_vend     = '-.'; markr_vend     = '.';
 
 % Compute the peak performance in terms of the number of double flops
@@ -57,6 +59,7 @@ blislpab_legend = sprintf( 'BLIS conv' );
 eigen_legend    = sprintf( 'Eigen' );
 open_legend     = sprintf( 'OpenBLAS' );
 bfeo_legend     = sprintf( 'BLASFEO' );
+xsmm_legend     = sprintf( 'libxsmm' );
 %vend_legend     = sprintf( 'MKL' );
 %vend_legend     = sprintf( 'ARMPL' );
 vend_legend     = vend_str;
@@ -96,30 +99,53 @@ for psize_col = 1:3
 		break;
 	end
 end
-x_end = data_blissup( size( data_blissup, 1 ), psize_col );
-
 x_axis( :, 1 ) = data_blissup( :, psize_col );
 
+% Compute the number of data points we have in the x-axis. Note that we only
+% use half the data points (rounded down) for the m = n = k column of graphs.
+if mod(theid-1,cols) == 6
+	np = floor( size( data_blissup, 1 ) / 2 ); % keep np a valid integer subscript
+else
+	np = size( data_blissup, 1 );
+end
+
+has_xsmm = 1; % Assume libxsmm results are present until shown otherwise.
+if data_xsmm( 1, flopscol ) == 0.0 % A first-row flops value of exactly zero marks missing data.
+	has_xsmm = 0;
+end
+
+% Grab the last x-axis value.
+x_end = data_blissup( np, psize_col );
+
 %data_peak( 1, 1:2 ) = [     0 max_perf_core ];
 %data_peak( 2, 1:2 ) = [ x_end max_perf_core ];
 
 if show_plot == 1
-blissup_ln  = line( x_axis( :, 1 ), data_blissup( :, flopscol ) / nth, ...
+blissup_ln  = line( x_axis( 1:np, 1 ), data_blissup( 1:np, flopscol ) / nth, ...
                     'Color',color_blissup, 'LineStyle',lines_blissup, ...
                     'LineWidth',linesize );
-blislpab_ln = line( x_axis( :, 1 ), data_blislpab( :, flopscol ) / nth, ...
+blislpab_ln = line( x_axis( 1:np, 1 ), data_blislpab( 1:np, flopscol ) / nth, ...
                     'Color',color_blislpab, 'LineStyle',lines_blislpab, ...
                     'LineWidth',linesize );
-eigen_ln    = line( x_axis( :, 1 ), data_eigen( :, flopscol ) / nth, ...
+eigen_ln    = line( x_axis( 1:np, 1 ), data_eigen( 1:np, flopscol ) / nth, ...
                     'Color',color_eigen, 'LineStyle',lines_eigen, ...
                     'LineWidth',linesize );
-open_ln     = line( x_axis( :, 1 ), data_open( :, flopscol ) / nth, ...
+open_ln     = line( x_axis( 1:np, 1 ), data_open( 1:np, flopscol ) / nth, ...
                     'Color',color_open, 'LineStyle',lines_open, ...
                     'LineWidth',linesize );
-bfeo_ln     = line( x_axis( :, 1 ), data_bfeo( :, flopscol ) / nth, ...
+bfeo_ln     = line( x_axis( 1:np, 1 ), data_bfeo( 1:np, flopscol ) / nth, ...
                     'Color',color_bfeo, 'LineStyle',lines_bfeo, ...
                     'LineWidth',linesize );
-vend_ln     = line( x_axis( :, 1 ), data_vend( :, flopscol ) / nth, ...
+if has_xsmm == 1
+xsmm_ln     = line( x_axis( 1:np, 1 ), data_xsmm( 1:np, flopscol ) / nth, ...
+                    'Color',color_xsmm, 'LineStyle',lines_xsmm, ...
+                    'LineWidth',linesize );
+else
+xsmm_ln     = line( nan, nan, ...
+                    'Color',color_xsmm, 'LineStyle',lines_xsmm, ...
+                    'LineWidth',linesize );
+end
+vend_ln     = line( x_axis( 1:np, 1 ), data_vend( 1:np, flopscol ) / nth, ...
                     'Color',color_vend, 'LineStyle',lines_vend, ...
                     'LineWidth',linesize );
 else
@@ -139,6 +165,9 @@ open_ln     = line( nan, nan, ...
 bfeo_ln     = line( nan, nan, ...
                     'Color',color_bfeo, 'LineStyle',lines_bfeo, ...
                     'LineWidth',linesize );
+xsmm_ln     = line( nan, nan, ...
+                    'Color',color_xsmm, 'LineStyle',lines_xsmm, ...
+                    'LineWidth',linesize );
 vend_ln     = line( nan, nan, ...
                     'Color',color_vend, 'LineStyle',lines_vend, ...
                     'LineWidth',linesize );
@@ -169,40 +198,72 @@ elseif 500 <= x_end && x_end < 1000
 end
 
 if show_plot == 1 || theid == legend_plot_id
-if rows == 4 && cols == 7
 	if nth == 1 && theid == legend_plot_id
-		leg = legend( ...
-		[ ...
-		  blissup_ln ...
-		  blislpab_ln ...
-		  eigen_ln ...
-		  open_ln ...
-		  bfeo_ln ...
-		  vend_ln ...
-		], ...
-		blissup_legend, ...
-		blislpab_legend, ...
-		eigen_legend, ...
-		open_legend, ...
-		bfeo_legend, ...
-		vend_legend, ...
-		'Location', legend_loc );
+		if has_xsmm == 1
+			leg = legend( ...
+			[ ...
+			  blissup_ln ...
+			  blislpab_ln ...
+			  eigen_ln ...
+			  open_ln ...
+			  bfeo_ln ...
+			  xsmm_ln ...
+			  vend_ln ...
+			], ...
+			blissup_legend, ...
+			blislpab_legend, ...
+			eigen_legend, ...
+			open_legend, ...
+			bfeo_legend, ...
+			xsmm_legend, ...
+			vend_legend, ...
+			'Location', legend_loc );
+			set( leg,'Box','off' );
+			set( leg,'Color','none' );
+			set( leg,'Units','inches' );
+			if strcmp( impl, 'octave' ) % strcmp is safe for names of any length.
+			set( leg,'FontSize',fontsize );
+			set( leg,'Position',[11.92 6.54 1.15 0.7 ] ); % (1,4tl)
+			else % Any other implementation string (e.g. 'matlab').
+			set( leg,'FontSize',fontsize-3 );
+			set( leg,'Position',[18.20 10.20 1.15 0.7 ] ); % (1,4tl)
+			end
+		else
+			leg = legend( ...
+			[ ...
+			  blissup_ln ...
+			  blislpab_ln ...
+			  eigen_ln ...
+			  open_ln ...
+			  bfeo_ln ...
+			  vend_ln ...
+			], ...
+			blissup_legend, ...
+			blislpab_legend, ...
+			eigen_legend, ...
+			open_legend, ...
+			bfeo_legend, ...
+			vend_legend, ...
+			'Location', legend_loc );
+			set( leg,'Box','off' );
+			set( leg,'Color','none' );
+			set( leg,'Units','inches' );
+			if strcmp( impl, 'octave' ) % strcmp is safe for names of any length.
+			set( leg,'FontSize',fontsize );
+			set( leg,'Position',[11.92 6.54 1.15 0.7 ] ); % (1,4tl)
+			else % Any other implementation string (e.g. 'matlab').
+			set( leg,'FontSize',fontsize-1 );
+			set( leg,'Position',[18.24 10.15 1.15 0.7 ] ); % (1,4tl)
+			end
+		end
 		set( leg,'Box','off' );
 		set( leg,'Color','none' );
 		set( leg,'Units','inches' );
 		%                    xpos ypos
 		%set( leg,'Position',[11.32 6.36 1.15 0.7 ] ); % (1,4tl)
-		if impl == 'octave'
-		set( leg,'FontSize',fontsize );
-		set( leg,'Position',[11.92 6.54 1.15 0.7 ] ); % (1,4tl)
-		else
-		set( leg,'FontSize',fontsize-1 );
-		set( leg,'Position',[18.24 10.15 1.15 0.7 ] ); % (1,4tl)
-		end
 	elseif nth > 1 && theid == legend_plot_id
 	end
 end
-end
 
 set( ax1,'FontSize',fontsize );
 set( ax1,'TitleFontSizeMultiplier',1.0 ); % default is 1.1.
diff --git a/test/sup/octave/plot_panel_trxsh.m b/test/sup/octave/plot_panel_trxsh.m
index e5d282bc8c8b01592b03b1ba4aa4822150d8bae3..ebc216e3b9ae97101c0b295fae21284f3f7bf4d3 100644
--- a/test/sup/octave/plot_panel_trxsh.m
+++ b/test/sup/octave/plot_panel_trxsh.m
@@ -23,6 +23,7 @@ filetemp_blislpab = '%s/output_%s_%s_blislpab.m';
 filetemp_eigen    = '%s/output_%s_%s_eigen.m';
 filetemp_open     = '%s/output_%s_%s_openblas.m';
 filetemp_bfeo     = '%s/output_%s_%s_blasfeo.m';
+filetemp_xsmm     = '%s/output_%s_%s_libxsmm.m';
 filetemp_vend     = '%s/output_%s_%s_vendor.m';
 
 % Create a variable name "template" for the variables contained in the
@@ -83,15 +84,10 @@ for opi = 1:n_opsupnames
 	% Load the data files.
 	%str = sprintf( '  Loading %s', file_blissup ); disp(str);
 	run( file_blissup )
-	%str = sprintf( '  Loading %s', file_blislpab ); disp(str);
 	run( file_blislpab )
-	%str = sprintf( '  Loading %s', file_eigen ); disp(str);
 	run( file_eigen )
-	%str = sprintf( '  Loading %s', file_open ); disp(str);
 	run( file_open )
-	%str = sprintf( '  Loading %s', file_open ); disp(str);
 	run( file_bfeo )
-	%str = sprintf( '  Loading %s', file_vend ); disp(str);
 	run( file_vend )
 
 	% Construct variable names for the variables in the data files.
@@ -111,11 +107,25 @@ for opi = 1:n_opsupnames
 	data_bfeo = eval( var_bfeo );         % e.g. data_st_dgemm_blasfeo( :, : );
 	data_vend = eval( var_vend );         % e.g. data_st_dgemm_vendor( :, : );
 
+	if strcmp( stor_str, 'ccc' )
+		% Only read xsmm data for the column storage case, since that's the
+		% only format that libxsmm supports.
+		file_xsmm = sprintf( filetemp_xsmm,     dirpath, thr_str, opsupname );
+		run( file_xsmm )
+		var_xsmm  = sprintf( vartemp, thr_str, opname, 'libxsmm' );
+		data_xsmm = eval( var_xsmm );     % e.g. data_st_dgemm_libxsmm( :, : );
+	else
+		% No libxsmm results exist for this storage combination, so pass an
+		% all-zero placeholder with the same dimensions as the blissup data.
+		data_xsmm = zeros( size( data_blissup, 1 ), ...
+		                   size( data_blissup, 2 ) );
+	end
 	%str = sprintf( '  Reading %s', var_blissup ); disp(str);
 	%str = sprintf( '  Reading %s', var_blislpab ); disp(str);
 	%str = sprintf( '  Reading %s', var_eigen ); disp(str);
 	%str = sprintf( '  Reading %s', var_open ); disp(str);
 	%str = sprintf( '  Reading %s', var_bfeo ); disp(str);
+	%str = sprintf( '  Reading %s', var_xsmm ); disp(str);
 	%str = sprintf( '  Reading %s', var_vend ); disp(str);
 
 	% Plot one result in an m x n grid of plots, via the subplot()
@@ -127,6 +137,7 @@ for opi = 1:n_opsupnames
 	                 data_eigen, ...
 	                 data_open, ...
 	                 data_bfeo, ...
+	                 data_xsmm, ...
 	                 data_vend, vend_str, ...
 	                 nth, ...
 	                 4, 7, ...
@@ -140,6 +151,7 @@ for opi = 1:n_opsupnames
 	clear data_eigen;
 	clear data_open;
 	clear data_bfeo;
+	clear data_xsmm;
 	clear data_vend;
 
 	end
diff --git a/test/sup/octave/runme.m b/test/sup/octave/runme.m
index 5fd894c2b21e0ac9d9f60629b9987e774ef5e219..a9e053c3ecf3282dd418df89928b1db96406dcea 100644
--- a/test/sup/octave/runme.m
+++ b/test/sup/octave/runme.m
@@ -1,8 +1,12 @@
 
+% haswell
+plot_panel_trxsh(3.25,16,1,'st','d','ccc',[ 6 8 4 ],'../results/haswell/20190823/4_800_4_mt201','has','MKL','matlab'); close; clear all;
+plot_panel_trxsh(3.25,16,1,'st','d','rrr',[ 6 8 4 ],'../results/haswell/20190823/4_800_4_mt201','has','MKL','matlab'); close; clear all;
+
 % kabylake
-plot_panel_trxsh(3.6,16,1,'st','d','rrr',[ 6 8 4 ],'../results/kabylake/20190531/4_800_4_mt201_last400','kbl','MKL','matlab'); close; clear all;
-plot_panel_trxsh(3.6,16,1,'st','d','ccc',[ 6 8 4 ],'../results/kabylake/20190531/4_800_4_mt201_last400','kbl','MKL','matlab'); close; clear all;
+plot_panel_trxsh(3.80,16,1,'st','d','rrr',[ 6 8 4 ],'../results/kabylake/20190823/4_800_4_mt201','kbl','MKL','matlab'); close; clear all;
+plot_panel_trxsh(3.80,16,1,'st','d','ccc',[ 6 8 4 ],'../results/kabylake/20190823/4_800_4_mt201','kbl','MKL','matlab'); close; clear all;
 
 % epyc
-plot_panel_trxsh(3.0,8,1,'st','d','rrr',[ 6 8 4 ],'../results/epyc/20190531/4_800_4_mt256_last400','epyc','MKL','matlab'); close; clear all;
-plot_panel_trxsh(3.0,8,1,'st','d','ccc',[ 6 8 4 ],'../results/epyc/20190531/4_800_4_mt256_last400','epyc','MKL','matlab'); close; clear all;
+plot_panel_trxsh(3.00, 8,1,'st','d','rrr',[ 6 8 4 ],'../results/epyc/20190826/4_800_4_mt256','epyc','MKL','matlab'); close; clear all;
+plot_panel_trxsh(3.00, 8,1,'st','d','ccc',[ 6 8 4 ],'../results/epyc/20190826/4_800_4_mt256','epyc','MKL','matlab'); close; clear all;
diff --git a/test/sup/runme.sh b/test/sup/runme.sh
index 9646e3ccc55c6e950ab4e23b2f4ab90b4ae12667..48dacfa3a6790b378a994ce5cace39fe03761485 100755
--- a/test/sup/runme.sh
+++ b/test/sup/runme.sh
@@ -37,12 +37,13 @@ sns="8"
 sks="4"
 
 # Implementations to test.
-impls="vendor blissup blislpab openblas eigen"
-#impls="vendor openblas eigen"
-#impls="blislpab blissup"
-#mpls="openblas eigen vendor"
-#mpls="eigen"
+impls="vendor blissup blislpab openblas eigen libxsmm blasfeo"
+#impls="vendor"
 #impls="blissup"
+#impls="blislpab"
+#impls="openblas"
+#impls="eigen"
+#impls="libxsmm"
 #impls="blasfeo"
 
 # Example: test_dgemm_nn_rrc_m6npkp_blissup_st.x
@@ -75,6 +76,13 @@ for th in ${threads}; do
 											continue;
 										fi
 
+										# Further limit execution of libxsmm to
+										# ccc storage cases.
+										if [ "${im:0:7}" = "libxsmm" ] && \
+										   [ "${st}" != "ccc" ]; then
+											continue;
+										fi
+
 										# Extract the shape chars for m, n, k.
 										chm=${sh:0:1}
 										chn=${sh:1:1}
diff --git a/test/sup/test_gemm.c b/test/sup/test_gemm.c
index 311e8552afed3aa72dde7b404ca1045ca8ffccd7..7f611b554de77b39949dd05277e7031ebc337168 100644
--- a/test/sup/test_gemm.c
+++ b/test/sup/test_gemm.c
@@ -152,13 +152,14 @@ int main( int argc, char** argv )
 	printf( "data_%s_%cgemm_%c%c_%s", THR_STR, dt_ch,
 	                                  transal, transbl, STR );
 	printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
-	        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+	        ( unsigned long )(p - p_begin)/p_inc + 1,
 	        ( unsigned long )0,
 	        ( unsigned long )0,
 	        ( unsigned long )0, 0.0 );
 
 
-	for ( p = p_begin; p <= p_max; p += p_inc )
+	//for ( p = p_begin; p <= p_max; p += p_inc )
+	for ( p = p_max; p_begin <= p; p -= p_inc )
 	{
 		obj_t  a, b, c;
 		obj_t  c_save;
@@ -195,7 +196,7 @@ int main( int argc, char** argv )
 		bli_obj_set_conjtrans( transa, &a );
 		bli_obj_set_conjtrans( transb, &b );
 
-		bli_setsc(  (2.0/1.0), 0.0, &alpha );
+		bli_setsc(  (1.0/1.0), 0.0, &alpha );
 		bli_setsc(  (1.0/1.0), 0.0, &beta );
 
 		bli_copym( &c, &c_save );
@@ -317,7 +318,11 @@ int main( int argc, char** argv )
 				float*    betap  = ( float* )bli_obj_buffer( &beta );
 				float*    cp     = ( float* )bli_obj_buffer( &c );
 
+				#ifdef XSMM
+				libxsmm_sgemm( &f77_transa,
+				#else
 				sgemm_( &f77_transa,
+				#endif
 				        &f77_transb,
 				        &mm,
 				        &nn,
@@ -342,7 +347,11 @@ int main( int argc, char** argv )
 				double*   betap  = ( double* )bli_obj_buffer( &beta );
 				double*   cp     = ( double* )bli_obj_buffer( &c );
 
+				#ifdef XSMM
+				libxsmm_dgemm( &f77_transa,
+				#else
 				dgemm_( &f77_transa,
+				#endif
 				        &f77_transb,
 				        &mm,
 				        &nn,
@@ -367,7 +376,11 @@ int main( int argc, char** argv )
 				scomplex* betap  = ( scomplex* )bli_obj_buffer( &beta );
 				scomplex* cp     = ( scomplex* )bli_obj_buffer( &c );
 
+				#ifdef XSMM
+				libxsmm_cgemm( &f77_transa,
+				#else
 				cgemm_( &f77_transa,
+				#endif
 				        &f77_transb,
 				        &mm,
 				        &nn,
@@ -392,7 +405,11 @@ int main( int argc, char** argv )
 				dcomplex* betap  = ( dcomplex* )bli_obj_buffer( &beta );
 				dcomplex* cp     = ( dcomplex* )bli_obj_buffer( &c );
 
+				#ifdef XSMM
+				libxsmm_zgemm( &f77_transa,
+				#else
 				zgemm_( &f77_transa,
+				#endif
 				        &f77_transb,
 				        &mm,
 				        &nn,
@@ -545,7 +562,7 @@ int main( int argc, char** argv )
 		printf( "data_%s_%cgemm_%c%c_%s", THR_STR, dt_ch,
 		                                  transal, transbl, STR );
 		printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
-		        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+		        ( unsigned long )(p - p_begin)/p_inc + 1,
 		        ( unsigned long )m,
 		        ( unsigned long )n,
 		        ( unsigned long )k, gflops );
diff --git a/test/test_axpyv.c b/test/test_axpyv.c
index 268e3ea0de56015e1d1ca03f8b6698d0f931d4bf..54a4f61340339c69bc8e1c1fd00f54c0e8dfc9da 100644
--- a/test/test_axpyv.c
+++ b/test/test_axpyv.c
@@ -96,10 +96,11 @@ int main( int argc, char** argv )
 	printf( "data_axpyv_%s", BLAS );
 #endif
 	printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n",
-	        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+	        ( unsigned long )(p - p_begin)/p_inc + 1,
 	        ( unsigned long )0, 0.0 );
 
-	for ( p = p_begin; p <= p_end; p += p_inc )
+	//for ( p = p_begin; p <= p_end; p += p_inc )
+	for ( p = p_end; p_begin <= p; p -= p_inc )
 	{
 
 		if ( n_input < 0 ) n = p * ( dim_t )abs(n_input);
@@ -188,7 +189,7 @@ int main( int argc, char** argv )
 		printf( "data_axpyv_%s", BLAS );
 #endif
 		printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n",
-				( unsigned long )(p - p_begin + 1)/p_inc + 1,
+				( unsigned long )(p - p_begin)/p_inc + 1,
 				( unsigned long )n, gflops );
 
 		bli_obj_free( &alpha );
diff --git a/test/test_dotv.c b/test/test_dotv.c
index ea0f7e4c580c8b768a80f32297fb1eb44e8ae787..d5bebea5a60679a465cc27fcc957f3be95020867 100644
--- a/test/test_dotv.c
+++ b/test/test_dotv.c
@@ -93,10 +93,11 @@ int main( int argc, char** argv )
 	printf( "data_dotv_%s", BLAS );
 #endif
 	printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n",
-	        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+	        ( unsigned long )(p - p_begin)/p_inc + 1,
 	        ( unsigned long )0, 0.0 );
 
-	for ( p = p_begin; p <= p_end; p += p_inc )
+	//for ( p = p_begin; p <= p_end; p += p_inc )
+	for ( p = p_end; p_begin <= p; p -= p_inc )
 	{
 
 		if ( n_input < 0 ) n = p * ( dim_t )abs(n_input);
@@ -172,7 +173,7 @@ int main( int argc, char** argv )
 		printf( "data_dotv_%s", BLAS );
 #endif
 		printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n",
-		        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+		        ( unsigned long )(p - p_begin)/p_inc + 1,
 		        ( unsigned long )n, gflops );
 
 		bli_obj_free( &x );
diff --git a/test/test_gemm.c b/test/test_gemm.c
index 5d6b6aa9af17e3bb8e28a18227af2bcf1bca162a..042e42c8bd2f0445781c2f41f1520d3265ccc5cd 100644
--- a/test/test_gemm.c
+++ b/test/test_gemm.c
@@ -105,12 +105,13 @@ int main( int argc, char** argv )
 	printf( "data_gemm_%s", BLAS );
 #endif
 	printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
-	        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+	        ( unsigned long )(p - p_begin)/p_inc + 1,
 	        ( unsigned long )0,
 	        ( unsigned long )0,
 	        ( unsigned long )0, 0.0 );
 
-	for ( p = p_begin; p <= p_end; p += p_inc )
+	//for ( p = p_begin; p <= p_end; p += p_inc )
+	for ( p = p_end; p_begin <= p; p -= p_inc )
 	{
 		if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
 		else               m =     ( dim_t )    m_input;
@@ -287,7 +288,7 @@ int main( int argc, char** argv )
 		printf( "data_gemm_%s", BLAS );
 #endif
 		printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
-		        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+		        ( unsigned long )(p - p_begin)/p_inc + 1,
 		        ( unsigned long )m,
 		        ( unsigned long )k,
 		        ( unsigned long )n, gflops );
diff --git a/test/test_gemv.c b/test/test_gemv.c
index 7d15c3249a67182ab1f4a714e163af724bbf1fa9..5e72d8655726ae365a7617660c10f1247ed52a3b 100644
--- a/test/test_gemv.c
+++ b/test/test_gemv.c
@@ -88,11 +88,12 @@ int main( int argc, char** argv )
 	printf( "data_gemv_%s", BLAS );
 #endif
 	printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
-	        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+	        ( unsigned long )(p - p_begin)/p_inc + 1,
 	        ( unsigned long )0,
 	        ( unsigned long )0, 0.0 );
 
-	for ( p = p_begin; p <= p_end; p += p_inc )
+	//for ( p = p_begin; p <= p_end; p += p_inc )
+	for ( p = p_end; p_begin <= p; p -= p_inc )
 	{
 
 		if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
@@ -183,7 +184,7 @@ int main( int argc, char** argv )
 		printf( "data_gemv_%s", BLAS );
 #endif
 		printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
-		        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+		        ( unsigned long )(p - p_begin)/p_inc + 1,
 		        ( unsigned long )m,
 		        ( unsigned long )n, gflops );
 
diff --git a/test/test_ger.c b/test/test_ger.c
index e3497703e60066f0d11d39d2fd92c715f30fd63d..4e584fb9d5bc43e8aa4077c7ed3400826daf7228 100644
--- a/test/test_ger.c
+++ b/test/test_ger.c
@@ -88,11 +88,12 @@ int main( int argc, char** argv )
 	printf( "data_ger_%s", BLAS );
 #endif
 	printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
-	        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+	        ( unsigned long )(p - p_begin)/p_inc + 1,
 	        ( unsigned long )0,
 	        ( unsigned long )0, 0.0 );
 
-	for ( p = p_begin; p <= p_end; p += p_inc )
+	//for ( p = p_begin; p <= p_end; p += p_inc )
+	for ( p = p_end; p_begin <= p; p -= p_inc )
 	{
 
 		if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
@@ -176,7 +177,7 @@ int main( int argc, char** argv )
 		printf( "data_ger_%s", BLAS );
 #endif
 		printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
-		        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+		        ( unsigned long )(p - p_begin)/p_inc + 1,
 		        ( unsigned long )m,
 		        ( unsigned long )n, gflops );
 
diff --git a/test/test_hemm.c b/test/test_hemm.c
index 40068c5f95dbedcb705e2105277e066950f3cf63..4f20aaca98c3fa46fbfdfd6d06e703e7d1a05c1d 100644
--- a/test/test_hemm.c
+++ b/test/test_hemm.c
@@ -106,11 +106,12 @@ int main( int argc, char** argv )
 	printf( "data_hemm_%s", BLAS );
 #endif
 	printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
-	        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+	        ( unsigned long )(p - p_begin)/p_inc + 1,
 	        ( unsigned long )0,
 	        ( unsigned long )0, 0.0 );
 
-	for ( p = p_begin; p <= p_end; p += p_inc )
+	//for ( p = p_begin; p <= p_end; p += p_inc )
+	for ( p = p_end; p_begin <= p; p -= p_inc )
 	{
 		if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
 		else               m =     ( dim_t )    m_input;
@@ -298,7 +299,7 @@ int main( int argc, char** argv )
 		printf( "data_hemm_%s", BLAS );
 #endif
 		printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
-		        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+		        ( unsigned long )(p - p_begin)/p_inc + 1,
 		        ( unsigned long )m,
 		        ( unsigned long )n, gflops );
 
diff --git a/test/test_hemv.c b/test/test_hemv.c
index 0250d31b8d15be0192cd5a90c34b7ce8a834e868..48227927d3fd2dfe0d6ad8d88aded40fc09496c6 100644
--- a/test/test_hemv.c
+++ b/test/test_hemv.c
@@ -93,10 +93,11 @@ int main( int argc, char** argv )
 	printf( "data_hemv_%s", BLAS );
 #endif
 	printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n",
-	        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+	        ( unsigned long )(p - p_begin)/p_inc + 1,
 	        ( unsigned long )0, 0.0 );
 
-	for ( p = p_begin; p <= p_end; p += p_inc )
+	//for ( p = p_begin; p <= p_end; p += p_inc )
+	for ( p = p_end; p_begin <= p; p -= p_inc )
 	{
 
 		if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
@@ -190,7 +191,7 @@ int main( int argc, char** argv )
 		printf( "data_hemv_%s", BLAS );
 #endif
 		printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n",
-		        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+		        ( unsigned long )(p - p_begin)/p_inc + 1,
 		        ( unsigned long )m, gflops );
 
 		bli_obj_free( &alpha );
diff --git a/test/test_her.c b/test/test_her.c
index 026b91261b3708f02a1150c054f80b58a2f9ad3c..606eb5a21d0b35e8acc1c903b07bb5d10af03d2c 100644
--- a/test/test_her.c
+++ b/test/test_her.c
@@ -94,10 +94,11 @@ int main( int argc, char** argv )
 	printf( "data_her_%s", BLAS );
 #endif
 	printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n",
-	        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+	        ( unsigned long )(p - p_begin)/p_inc + 1,
 	        ( unsigned long )0, 0.0 );
 
-	for ( p = p_begin; p <= p_end; p += p_inc )
+	//for ( p = p_begin; p <= p_end; p += p_inc )
+	for ( p = p_end; p_begin <= p; p -= p_inc )
 	{
 
 		if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
@@ -184,7 +185,7 @@ int main( int argc, char** argv )
 		printf( "data_her_%s", BLAS );
 #endif
 		printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n",
-		        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+		        ( unsigned long )(p - p_begin)/p_inc + 1,
 		        ( unsigned long )m, gflops );
 
 		bli_obj_free( &alpha );
diff --git a/test/test_her2.c b/test/test_her2.c
index 7428dde4ec4e0589771fb07726b310469b09b049..5814eee1076eee6633b1bc89b5099dfa93e8729a 100644
--- a/test/test_her2.c
+++ b/test/test_her2.c
@@ -93,10 +93,11 @@ int main( int argc, char** argv )
 	printf( "data_her2_%s", BLAS );
 #endif
 	printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n",
-	        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+	        ( unsigned long )(p - p_begin)/p_inc + 1,
 	        ( unsigned long )0, 0.0 );
 
-	for ( p = p_begin; p <= p_end; p += p_inc )
+	//for ( p = p_begin; p <= p_end; p += p_inc )
+	for ( p = p_end; p_begin <= p; p -= p_inc )
 	{
 
 		if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
@@ -186,7 +187,7 @@ int main( int argc, char** argv )
 		printf( "data_her2_%s", BLAS );
 #endif
 		printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n",
-		        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+		        ( unsigned long )(p - p_begin)/p_inc + 1,
 		        ( unsigned long )m, gflops );
 
 		bli_obj_free( &alpha );
diff --git a/test/test_her2k.c b/test/test_her2k.c
index a73e849554d5abf1c2dd6157590cfdbc094808f9..489b453f74e706e5d84281a92e75d5b5f45079e3 100644
--- a/test/test_her2k.c
+++ b/test/test_her2k.c
@@ -105,11 +105,12 @@ int main( int argc, char** argv )
 	printf( "data_her2k_%s", BLAS );
 #endif
 	printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
-	        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+	        ( unsigned long )(p - p_begin)/p_inc + 1,
 	        ( unsigned long )0,
 	        ( unsigned long )0, 0.0 );
 
-	for ( p = p_begin; p <= p_end; p += p_inc )
+	//for ( p = p_begin; p <= p_end; p += p_inc )
+	for ( p = p_end; p_begin <= p; p -= p_inc )
 	{
 		if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
 		else               m =     ( dim_t )    m_input;
@@ -287,7 +288,7 @@ int main( int argc, char** argv )
 		printf( "data_her2k_%s", BLAS );
 #endif
 		printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
-		        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+		        ( unsigned long )(p - p_begin)/p_inc + 1,
 		        ( unsigned long )m,
 		        ( unsigned long )k, gflops );
 
diff --git a/test/test_herk.c b/test/test_herk.c
index db8f826c9f16b7e7d91e085df1b1012b70c70ceb..8f2adfa3450b9dcb407908af95761166e073b239 100644
--- a/test/test_herk.c
+++ b/test/test_herk.c
@@ -105,11 +105,12 @@ int main( int argc, char** argv )
 	printf( "data_herk_%s", BLAS );
 #endif
 	printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
-	        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+	        ( unsigned long )(p - p_begin)/p_inc + 1,
 	        ( unsigned long )0,
 	        ( unsigned long )0, 0.0 );
 
-	for ( p = p_begin; p <= p_end; p += p_inc )
+	//for ( p = p_begin; p <= p_end; p += p_inc )
+	for ( p = p_end; p_begin <= p; p -= p_inc )
 	{
 		if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
 		else               m =     ( dim_t )    m_input;
@@ -265,7 +266,7 @@ int main( int argc, char** argv )
 		printf( "data_herk_%s", BLAS );
 #endif
 		printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
-		        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+		        ( unsigned long )(p - p_begin)/p_inc + 1,
 		        ( unsigned long )m,
 		        ( unsigned long )k, gflops );
 
diff --git a/test/test_trmm.c b/test/test_trmm.c
index 214ea32beb50adc2aca3b05dd392efff2362c505..ae867e4620e5abe46d9fc2c21c8095eff2f033cb 100644
--- a/test/test_trmm.c
+++ b/test/test_trmm.c
@@ -116,11 +116,12 @@ int main( int argc, char** argv )
 	printf( "data_trmm_%s", BLAS );
 #endif
 	printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
-	        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+	        ( unsigned long )(p - p_begin)/p_inc + 1,
 	        ( unsigned long )0,
 	        ( unsigned long )0, 0.0 );
 
-	for ( p = p_begin; p <= p_end; p += p_inc )
+	//for ( p = p_begin; p <= p_end; p += p_inc )
+	for ( p = p_end; p_begin <= p; p -= p_inc )
 	{
 		if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
 		else               m =     ( dim_t )    m_input;
@@ -282,7 +283,7 @@ int main( int argc, char** argv )
 		printf( "data_trmm_%s", BLAS );
 #endif
 		printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
-		        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+		        ( unsigned long )(p - p_begin)/p_inc + 1,
 		        ( unsigned long )m,
 		        ( unsigned long )n, gflops );
 
diff --git a/test/test_trmv.c b/test/test_trmv.c
index bd737de9f24dd53cdaec4f7bc31d247349199236..1fa33f3a89dcd0aebdf3b3be8b9a3d77e61f60ac 100644
--- a/test/test_trmv.c
+++ b/test/test_trmv.c
@@ -90,10 +90,11 @@ int main( int argc, char** argv )
 	printf( "data_trmv_%s", BLAS );
 #endif
 	printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n",
-	        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+	        ( unsigned long )(p - p_begin)/p_inc + 1,
 	        ( unsigned long )0, 0.0 );
 
-	for ( p = p_begin; p <= p_end; p += p_inc )
+	//for ( p = p_begin; p <= p_end; p += p_inc )
+	for ( p = p_end; p_begin <= p; p -= p_inc )
 	{
 
 		if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
@@ -176,7 +177,7 @@ int main( int argc, char** argv )
 		printf( "data_trmv_%s", BLAS );
 #endif
 		printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n",
-		        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+		        ( unsigned long )(p - p_begin)/p_inc + 1,
 		        ( unsigned long )m, gflops );
 
 		bli_obj_free( &alpha );
diff --git a/test/test_trsm.c b/test/test_trsm.c
index e5796bad34d77e37b12b51d51bbc4f680e79984b..5be9c965a799e4e90473f70b138af2b56512cc85 100644
--- a/test/test_trsm.c
+++ b/test/test_trsm.c
@@ -116,11 +116,12 @@ int main( int argc, char** argv )
 	printf( "data_trsm_%s", BLAS );
 #endif
 	printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
-	        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+	        ( unsigned long )(p - p_begin)/p_inc + 1,
 	        ( unsigned long )0,
 	        ( unsigned long )0, 0.0 );
 
-	for ( p = p_begin; p <= p_end; p += p_inc )
+	//for ( p = p_begin; p <= p_end; p += p_inc )
+	for ( p = p_end; p_begin <= p; p -= p_inc )
 	{
 		if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
 		else               m =     ( dim_t )    m_input;
@@ -285,7 +286,7 @@ int main( int argc, char** argv )
 		printf( "data_trsm_%s", BLAS );
 #endif
 		printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
-		        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+		        ( unsigned long )(p - p_begin)/p_inc + 1,
 		        ( unsigned long )m,
 		        ( unsigned long )n, gflops );
 
diff --git a/test/test_trsv.c b/test/test_trsv.c
index 048fe3950d6050a4a8710a02d5d476ec6c16e416..10586a81fa9278bda6fd8ac9e1133e05a6d0d2bd 100644
--- a/test/test_trsv.c
+++ b/test/test_trsv.c
@@ -90,10 +90,11 @@ int main( int argc, char** argv )
 	printf( "data_trv_%s", BLAS );
 #endif
 	printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n",
-	        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+	        ( unsigned long )(p - p_begin)/p_inc + 1,
 	        ( unsigned long )0, 0.0 );
 
-	for ( p = p_begin; p <= p_end; p += p_inc )
+	//for ( p = p_begin; p <= p_end; p += p_inc )
+	for ( p = p_end; p_begin <= p; p -= p_inc )
 	{
 
 		if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
@@ -183,7 +184,7 @@ int main( int argc, char** argv )
 		printf( "data_trsv_%s", BLAS );
 #endif
 		printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n",
-		        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
+		        ( unsigned long )(p - p_begin)/p_inc + 1,
 		        ( unsigned long )m, gflops );
 
 		bli_obj_free( &alpha );
diff --git a/testsuite/src/test_addm.c b/testsuite/src/test_addm.c
index 545f9387bd8ff342a8674daa71d51fa448a86423..821c8b55e3d3affedd440834eab786fc8d546312 100644
--- a/testsuite/src/test_addm.c
+++ b/testsuite/src/test_addm.c
@@ -275,7 +275,7 @@ void libblis_test_addm_check
 	//
 	// is functioning correctly if
 	//
-	//   normfv(y) - sqrt( absqsc( beta + conjx(alpha) ) * m * n )
+	//   normfm(y) - sqrt( absqsc( beta + conjx(alpha) ) * m * n )
 	//
 	// is negligible.
 	//
diff --git a/testsuite/src/test_axpbyv.c b/testsuite/src/test_axpbyv.c
index 24ed4a5ce117fc06de5abf1cf747cbd9ef066ee1..8def7b32d6aaa7fbc4507f839ed4b1c1c5631634 100644
--- a/testsuite/src/test_axpbyv.c
+++ b/testsuite/src/test_axpbyv.c
@@ -296,7 +296,7 @@ void libblis_test_axpbyv_check
 	//
 	// is functioning correctly if
 	//
-	//   normf( y - ( beta * y_orig + alpha * conjx(x) ) )
+	//   normfv( y - ( beta * y_orig + alpha * conjx(x) ) )
 	//
 	// is negligible.
 	//
diff --git a/testsuite/src/test_axpy2v.c b/testsuite/src/test_axpy2v.c
index a834aa6a369dc4d86a10a62e47f919a9eda3747f..c23443ab072406a99b18bf4f89d9a1fc170e9626 100644
--- a/testsuite/src/test_axpy2v.c
+++ b/testsuite/src/test_axpy2v.c
@@ -314,7 +314,7 @@ void libblis_test_axpy2v_check
 	//
 	// is functioning correctly if
 	//
-	//   normf( z - v )
+	//   normfv( z - v )
 	//
 	// is negligible, where v contains z as computed by two calls to axpyv.
 	//
diff --git a/testsuite/src/test_axpyf.c b/testsuite/src/test_axpyf.c
index 3bd18ca3ef32c6eab08ad065920fba372f03e84f..155e442b0d1e4d7d151de3d03a82625c4c58758e 100644
--- a/testsuite/src/test_axpyf.c
+++ b/testsuite/src/test_axpyf.c
@@ -319,7 +319,7 @@ void libblis_test_axpyf_check
 	//
 	// is functioning correctly if
 	//
-	//   normf( y - v )
+	//   normfv( y - v )
 	//
 	// is negligible, where v contains y as computed by repeated calls to
 	// axpyv.
diff --git a/testsuite/src/test_axpym.c b/testsuite/src/test_axpym.c
index c79866104eda4877b7945a6c46725888a4494223..0138d822ad456fcfeccd2f3c864ef555709940e1 100644
--- a/testsuite/src/test_axpym.c
+++ b/testsuite/src/test_axpym.c
@@ -289,7 +289,7 @@ void libblis_test_axpym_check
 	//
 	// is functioning correctly if
 	//
-	//   normf( y - ( y_orig + alpha * conjx(x) ) )
+	//   normfm( y - ( y_orig + alpha * conjx(x) ) )
 	//
 	// is negligible.
 	//
diff --git a/testsuite/src/test_axpyv.c b/testsuite/src/test_axpyv.c
index ff0326fb8c442994cca0785ea93d78ff3560fcbd..89b505f4c1955b074675d22f73467a5419606985 100644
--- a/testsuite/src/test_axpyv.c
+++ b/testsuite/src/test_axpyv.c
@@ -286,7 +286,7 @@ void libblis_test_axpyv_check
 	//
 	// is functioning correctly if
 	//
-	//   normf( y - ( y_orig + alpha * conjx(x) ) )
+	//   normfv( y - ( y_orig + alpha * conjx(x) ) )
 	//
 	// is negligible.
 	//
diff --git a/testsuite/src/test_dotaxpyv.c b/testsuite/src/test_dotaxpyv.c
index cf5563ec95bd2b78919985f2183a498a3f404e77..b9b8d99c33a32c00fe7e08f19198ffefdedb1341 100644
--- a/testsuite/src/test_dotaxpyv.c
+++ b/testsuite/src/test_dotaxpyv.c
@@ -345,7 +345,7 @@ void libblis_test_dotaxpyv_check
 	//
 	// and
 	//
-	//   normf( z - z_temp )
+	//   normfv( z - z_temp )
 	//
 	// are negligible, where rho_temp and z_temp contain rho and z as
 	// computed by dotv and axpyv, respectively.
diff --git a/testsuite/src/test_dotv.c b/testsuite/src/test_dotv.c
index ff9cd2b59c26d436a3e6c239a0a58e4101417161..0b0404af3c3c1e6bb18f4e1a2e35e3ab34bb5eda 100644
--- a/testsuite/src/test_dotv.c
+++ b/testsuite/src/test_dotv.c
@@ -278,7 +278,7 @@ void libblis_test_dotv_check
 	//
 	// is functioning correctly if
 	//
-	//   sqrtsc( rho.real ) - normf( x )
+	//   sqrtsc( rho.real ) - normfv( x )
 	//
 	// and
 	//
diff --git a/testsuite/src/test_dotxaxpyf.c b/testsuite/src/test_dotxaxpyf.c
index e85edff171e5950b802346ff609779b6ab4b32d3..80638d11059fa41fd045ca0371216f20eba4a2ee 100644
--- a/testsuite/src/test_dotxaxpyf.c
+++ b/testsuite/src/test_dotxaxpyf.c
@@ -366,11 +366,11 @@ void libblis_test_dotxaxpyf_check
 	//
 	// is functioning correctly if
 	//
-	//   normf( y - v )
+	//   normfv( y - v )
 	//
 	// and
 	//
-	//   normf( z - q )
+	//   normfv( z - q )
 	//
 	// are negligible, where v and q contain y and z as computed by repeated
 	// calls to dotxv and axpyv, respectively.
diff --git a/testsuite/src/test_dotxf.c b/testsuite/src/test_dotxf.c
index d73fd0609e73729228ec298f62a73b6fdf75564e..cac443ac6b3bc6c7ebbf4d16f5d0d437c3d4aaaf 100644
--- a/testsuite/src/test_dotxf.c
+++ b/testsuite/src/test_dotxf.c
@@ -324,7 +324,7 @@ void libblis_test_dotxf_check
 	//
 	// is functioning correctly if
 	//
-	//   normf( y - v )
+	//   normfv( y - v )
 	//
 	// is negligible, where v contains y as computed by repeated calls to
 	// dotxv.
diff --git a/testsuite/src/test_dotxv.c b/testsuite/src/test_dotxv.c
index 76a47a08dccaa9815df96eae1aa3177b9111c5bc..64ab90e02f509e859f9d33baea73c4e502b90a19 100644
--- a/testsuite/src/test_dotxv.c
+++ b/testsuite/src/test_dotxv.c
@@ -304,7 +304,7 @@ void libblis_test_dotxv_check
 	//
 	// is functioning correctly if
 	//
-	//   sqrtsc( rho.real ) - sqrtsc( alpha ) * normf( x )
+	//   sqrtsc( rho.real ) - sqrtsc( alpha ) * normfv( x )
 	//
 	// and
 	//
diff --git a/testsuite/src/test_gemm.c b/testsuite/src/test_gemm.c
index 80cc010a1942440fb26e36090bb7dcaf0444017a..6dae4301ead38ead4630cc6533a5cce95c8d6763 100644
--- a/testsuite/src/test_gemm.c
+++ b/testsuite/src/test_gemm.c
@@ -625,7 +625,7 @@ void libblis_test_gemm_check
 	//
 	// is functioning correctly if
 	//
-	//   normf( v - z )
+	//   normfv( v - z )
 	//
 	// is negligible, where
 	//
diff --git a/testsuite/src/test_gemm_ukr.c b/testsuite/src/test_gemm_ukr.c
index 616532491d7ab2884177efde92ab82009b55e26e..2017c70dcaf9cacb82f117be2185ce54dbb60c06 100644
--- a/testsuite/src/test_gemm_ukr.c
+++ b/testsuite/src/test_gemm_ukr.c
@@ -390,7 +390,7 @@ void libblis_test_gemm_ukr_check
 	//
 	// is functioning correctly if
 	//
-	//   normf( v - z )
+	//   normfv( v - z )
 	//
 	// is negligible, where
 	//
diff --git a/testsuite/src/test_gemmtrsm_ukr.c b/testsuite/src/test_gemmtrsm_ukr.c
index 6d2f028d235c3cc18de4563e5b938fd6a8c2a7ed..20ceac1c612fc47b75da9cb21633efe141788434 100644
--- a/testsuite/src/test_gemmtrsm_ukr.c
+++ b/testsuite/src/test_gemmtrsm_ukr.c
@@ -465,7 +465,7 @@ void libblis_test_gemmtrsm_ukr_check
 	//
 	// is functioning correctly if
 	//
-	//   normf( v - z )
+	//   normfv( v - z )
 	//
 	// is negligible, where
 	//
diff --git a/testsuite/src/test_gemv.c b/testsuite/src/test_gemv.c
index aa10764b0ad80d9f94917f900ae7c6eeb581ca13..022fd2b561b471a5a13725bbedf002e5e59572be 100644
--- a/testsuite/src/test_gemv.c
+++ b/testsuite/src/test_gemv.c
@@ -324,7 +324,7 @@ void libblis_test_gemv_check
 	//
 	// is functioning correctly if
 	//
-	//   normf( y - z )
+	//   normfv( y - z )
 	//
 	// is negligible, where
 	//
diff --git a/testsuite/src/test_ger.c b/testsuite/src/test_ger.c
index c611c4661459ac0ba5d21dd0d046f45a796612df..672077ec1945f5e89a2157ba3055958b55cee0fc 100644
--- a/testsuite/src/test_ger.c
+++ b/testsuite/src/test_ger.c
@@ -303,7 +303,7 @@ void libblis_test_ger_check
 	//
 	// is functioning correctly if
 	//
-	//   normf( v - w )
+	//   normfv( v - w )
 	//
 	// is negligible, where
 	//
diff --git a/testsuite/src/test_hemm.c b/testsuite/src/test_hemm.c
index 15a684c3d49d593b13370304d7fbfeca011e2f63..12afa369882bef0c0237aaa2fb315441cb799ff4 100644
--- a/testsuite/src/test_hemm.c
+++ b/testsuite/src/test_hemm.c
@@ -338,7 +338,7 @@ void libblis_test_hemm_check
 	//
 	// is functioning correctly if
 	//
-	//   normf( v - z )
+	//   normfv( v - z )
 	//
 	// is negligible, where
 	//
diff --git a/testsuite/src/test_hemv.c b/testsuite/src/test_hemv.c
index 17204102e3a90d5268d0a83902758ab2f100215d..0976e0bc856b5643b7ed7ad4b91e07b4944a2d2d 100644
--- a/testsuite/src/test_hemv.c
+++ b/testsuite/src/test_hemv.c
@@ -322,7 +322,7 @@ void libblis_test_hemv_check
 	//
 	// is functioning correctly if
 	//
-	//   normf( y - v )
+	//   normfv( y - v )
 	//
 	// is negligible, where
 	//
diff --git a/testsuite/src/test_her.c b/testsuite/src/test_her.c
index c5ca4b14d364f27178d0f33426ed4747cbc5a8ea..c545e00a774b7019f0be57c35a6ea8c0e0eeedf6 100644
--- a/testsuite/src/test_her.c
+++ b/testsuite/src/test_her.c
@@ -301,7 +301,7 @@ void libblis_test_her_check
 	//
 	// is functioning correctly if
 	//
-	//   normf( v - w )
+	//   normfv( v - w )
 	//
 	// is negligible, where
 	//
diff --git a/testsuite/src/test_her2.c b/testsuite/src/test_her2.c
index 896497b4eaa22df161f123acadd1ac0fd33d4e86..d6a8686ade0e2b669b04ee1b958db64b0ae9a289 100644
--- a/testsuite/src/test_her2.c
+++ b/testsuite/src/test_her2.c
@@ -311,7 +311,7 @@ void libblis_test_her2_check
 	//
 	// is functioning correctly if
 	//
-	//   normf( v - w )
+	//   normfv( v - w )
 	//
 	// is negligible, where
 	//
diff --git a/testsuite/src/test_her2k.c b/testsuite/src/test_her2k.c
index 2b692b021cdd778feb10cd7724d6ad8c81afd508..9aed2d968cad19fbf684685bfcedc7a6f1359af1 100644
--- a/testsuite/src/test_her2k.c
+++ b/testsuite/src/test_her2k.c
@@ -336,7 +336,7 @@ void libblis_test_her2k_check
 	//
 	// is functioning correctly if
 	//
-	//   normf( v - z )
+	//   normfv( v - z )
 	//
 	// is negligible, where
 	//
diff --git a/testsuite/src/test_herk.c b/testsuite/src/test_herk.c
index 5b9e1c353fd3e0651858ddad51910a2a9f95645e..eda56d2aa34b2afd2f30990e373870875698ec70 100644
--- a/testsuite/src/test_herk.c
+++ b/testsuite/src/test_herk.c
@@ -323,7 +323,7 @@ void libblis_test_herk_check
 	//
 	// is functioning correctly if
 	//
-	//   normf( v - z )
+	//   normfv( v - z )
 	//
 	// is negligible, where
 	//
diff --git a/testsuite/src/test_normfm.c b/testsuite/src/test_normfm.c
index e8882ed54e8997e3194731153419131c51ea2920..7d80f7c172be5202cf9f25788ee5f524fa6ee011 100644
--- a/testsuite/src/test_normfm.c
+++ b/testsuite/src/test_normfm.c
@@ -259,7 +259,7 @@ void libblis_test_normfm_check
 	//
 	// Under these conditions, we assume that the implementation for
 	//
-	//   norm := normf( x )
+	//   norm := normfm( x )
 	//
 	// is functioning correctly if
 	//
diff --git a/testsuite/src/test_normfv.c b/testsuite/src/test_normfv.c
index 1622a2e8971dfd30c1e44352694a23c6fb8dd8d6..83210f16883c7d0442cc292c45edc2741660e1e6 100644
--- a/testsuite/src/test_normfv.c
+++ b/testsuite/src/test_normfv.c
@@ -256,7 +256,7 @@ void libblis_test_normfv_check
 	//
 	// Under these conditions, we assume that the implementation for
 	//
-	//   norm := normf( x )
+	//   norm := normfv( x )
 	//
 	// is functioning correctly if
 	//
diff --git a/testsuite/src/test_scal2m.c b/testsuite/src/test_scal2m.c
index d6f29f996cf3dd2a397ef3721095659b47060a62..7ed1ec49b4cfcd2e72b396e728022862ddfb61c0 100644
--- a/testsuite/src/test_scal2m.c
+++ b/testsuite/src/test_scal2m.c
@@ -288,7 +288,7 @@ void libblis_test_scal2m_check
 	//
 	// is functioning correctly if
 	//
-	//   normf( y - alpha * conjx(x) )
+	//   normfm( y - alpha * conjx(x) )
 	//
 	// is negligible.
 	//
diff --git a/testsuite/src/test_scal2v.c b/testsuite/src/test_scal2v.c
index 7a28479dbdc5eaf7edc9ccce42a472c69358d4a0..b5b2a3d6533e0ddf4012e5039d6ebdca97fd9391 100644
--- a/testsuite/src/test_scal2v.c
+++ b/testsuite/src/test_scal2v.c
@@ -285,7 +285,7 @@ void libblis_test_scal2v_check
 	//
 	// is functioning correctly if
 	//
-	//   normf( y - alpha * conjx(x) )
+	//   normfv( y - alpha * conjx(x) )
 	//
 	// is negligible.
 	//
diff --git a/testsuite/src/test_scalm.c b/testsuite/src/test_scalm.c
index 3e9d5069f6ae0d9ffb01ba715181bf9ab2de1b5f..284e23ab667f2069212c1029ade5351ed2d7f58d 100644
--- a/testsuite/src/test_scalm.c
+++ b/testsuite/src/test_scalm.c
@@ -280,7 +280,7 @@ void libblis_test_scalm_check
 	//
 	// is functioning correctly if
 	//
-	//   normf( y + -conjbeta(beta) * y_orig )
+	//   normfm( y + -conjbeta(beta) * y_orig )
 	//
 	// is negligible.
 	//
diff --git a/testsuite/src/test_scalv.c b/testsuite/src/test_scalv.c
index ef3b980caeddc8ac973a48840f2209124c7d35e4..61b3f5fbe07da790b4cfedf1f5787b3e523040b0 100644
--- a/testsuite/src/test_scalv.c
+++ b/testsuite/src/test_scalv.c
@@ -276,7 +276,7 @@ void libblis_test_scalv_check
 	//
 	// is functioning correctly if
 	//
-	//   normf( y + -conjbeta(beta) * y_orig )
+	//   normfv( y + -conjbeta(beta) * y_orig )
 	//
 	// is negligible.
 	//
diff --git a/testsuite/src/test_subm.c b/testsuite/src/test_subm.c
index d28eb280089718a91f7dd02533979b2b729cb9fb..8c6a83831b0fcccf216357c724c8b2a19443f742 100644
--- a/testsuite/src/test_subm.c
+++ b/testsuite/src/test_subm.c
@@ -275,7 +275,7 @@ void libblis_test_subm_check
 	//
 	// is functioning correctly if
 	//
-	//   normfv(y) - sqrt( absqsc( beta - conjx(alpha) ) * m * n )
+	//   normfm(y) - sqrt( absqsc( beta - conjx(alpha) ) * m * n )
 	//
 	// is negligible.
 	//
diff --git a/testsuite/src/test_symm.c b/testsuite/src/test_symm.c
index 690594a39e053b556891c682462ccf20ab67c39b..e36147251af2ebda967672492b26b5b187607601 100644
--- a/testsuite/src/test_symm.c
+++ b/testsuite/src/test_symm.c
@@ -338,7 +338,7 @@ void libblis_test_symm_check
 	//
 	// is functioning correctly if
 	//
-	//   normf( v - z )
+	//   normfv( v - z )
 	//
 	// is negligible, where
 	//
diff --git a/testsuite/src/test_symv.c b/testsuite/src/test_symv.c
index c654685dfdfd080e1d27331ddb85ef15e173c946..a1f9141429e3605759ab0442fb5d27e4dfb37316 100644
--- a/testsuite/src/test_symv.c
+++ b/testsuite/src/test_symv.c
@@ -322,7 +322,7 @@ void libblis_test_symv_check
 	//
 	// is functioning correctly if
 	//
-	//   normf( y - v )
+	//   normfv( y - v )
 	//
 	// is negligible, where
 	//
diff --git a/testsuite/src/test_syr.c b/testsuite/src/test_syr.c
index efdc67b842722265f4d4ffb9ab43d8009679eaed..f328d061b4ed63d695a45b991733cb69a37ec74f 100644
--- a/testsuite/src/test_syr.c
+++ b/testsuite/src/test_syr.c
@@ -301,7 +301,7 @@ void libblis_test_syr_check
 	//
 	// is functioning correctly if
 	//
-	//   normf( v - w )
+	//   normfv( v - w )
 	//
 	// is negligible, where
 	//
diff --git a/testsuite/src/test_syr2.c b/testsuite/src/test_syr2.c
index e87cd13e5259512d049dac8335ed4034d84e49f4..e79bfeca6e35bfd983395487db5938ea5d555b79 100644
--- a/testsuite/src/test_syr2.c
+++ b/testsuite/src/test_syr2.c
@@ -313,7 +313,7 @@ void libblis_test_syr2_check
 	//
 	// is functioning correctly if
 	//
-	//   normf( v - w )
+	//   normfv( v - w )
 	//
 	// is negligible, where
 	//
diff --git a/testsuite/src/test_syr2k.c b/testsuite/src/test_syr2k.c
index 0283135b46f9b54b6fa43d37b9c1c35488aaea33..e1346692dd716c0d60abe26652e733d992b62eff 100644
--- a/testsuite/src/test_syr2k.c
+++ b/testsuite/src/test_syr2k.c
@@ -335,7 +335,7 @@ void libblis_test_syr2k_check
 	//
 	// is functioning correctly if
 	//
-	//   normf( v - z )
+	//   normfv( v - z )
 	//
 	// is negligible, where
 	//
diff --git a/testsuite/src/test_syrk.c b/testsuite/src/test_syrk.c
index 86c5864e33fc6e6e3336800b0d8c56aec9303dba..d6ca4b3bdc10a9f0af6419a9ae2b685894fc0fad 100644
--- a/testsuite/src/test_syrk.c
+++ b/testsuite/src/test_syrk.c
@@ -324,7 +324,7 @@ void libblis_test_syrk_check
 	//
 	// is functioning correctly if
 	//
-	//   normf( v - z )
+	//   normfv( v - z )
 	//
 	// is negligible, where
 	//
diff --git a/testsuite/src/test_trmm.c b/testsuite/src/test_trmm.c
index 1341d2b71258ca3733eeeb9243f96ddaed497824..be6bb941e92694639c4fdeb52fbd5f8ea0b5418e 100644
--- a/testsuite/src/test_trmm.c
+++ b/testsuite/src/test_trmm.c
@@ -320,7 +320,7 @@ void libblis_test_trmm_check
 	//
 	// is functioning correctly if
 	//
-	//   normf( v - z )
+	//   normfv( v - z )
 	//
 	// is negligible, where
 	//
diff --git a/testsuite/src/test_trmm3.c b/testsuite/src/test_trmm3.c
index 5b9392f5f26e8e024753c53ca6fd351748814cec..ba9431a0b4e43af8d89906e22cc2326c1467c2ab 100644
--- a/testsuite/src/test_trmm3.c
+++ b/testsuite/src/test_trmm3.c
@@ -339,7 +339,7 @@ void libblis_test_trmm3_check
 	//
 	// is functioning correctly if
 	//
-	//   normf( v - z )
+	//   normfv( v - z )
 	//
 	// is negligible, where
 	//
diff --git a/testsuite/src/test_trmv.c b/testsuite/src/test_trmv.c
index cd1b130cf1690496cd510a08fc86eaf003c9d962..b4b2f386d1a81f3076e5479a9a3ef02aa2810fb8 100644
--- a/testsuite/src/test_trmv.c
+++ b/testsuite/src/test_trmv.c
@@ -304,7 +304,7 @@ void libblis_test_trmv_check
 	//
 	// is functioning correctly if
 	//
-	//   normf( y - x )
+	//   normfv( y - x )
 	//
 	// is negligible, where
 	//
diff --git a/testsuite/src/test_trsm.c b/testsuite/src/test_trsm.c
index 23f17b0876e182014efc1121b733670ab79b2167..fa50bf790e7d08f4e6f8df3a19910f1b9b21ca71 100644
--- a/testsuite/src/test_trsm.c
+++ b/testsuite/src/test_trsm.c
@@ -327,7 +327,7 @@ void libblis_test_trsm_check
 	//
 	// is functioning correctly if
 	//
-	//   normf( v - z )
+	//   normfv( v - z )
 	//
 	// is negligible, where
 	//
diff --git a/testsuite/src/test_trsm_ukr.c b/testsuite/src/test_trsm_ukr.c
index 5476e1daf24fb4169d84c894246e52aad19e96ae..7d3df41c9ab23622dac962a23777da68a8d9de62 100644
--- a/testsuite/src/test_trsm_ukr.c
+++ b/testsuite/src/test_trsm_ukr.c
@@ -401,7 +401,7 @@ void libblis_test_trsm_ukr_check
 	//
 	// is functioning correctly if
 	//
-	//   normf( v - z )
+	//   normfv( v - z )
 	//
 	// is negligible, where
 	//
diff --git a/testsuite/src/test_trsv.c b/testsuite/src/test_trsv.c
index cb3138c9201c36d9a2ddd668a8d49aa8b1770c1e..b05f7ab975084eb775630e233e62a336aa06e5ec 100644
--- a/testsuite/src/test_trsv.c
+++ b/testsuite/src/test_trsv.c
@@ -305,7 +305,7 @@ void libblis_test_trsv_check
 	//
 	// is functioning correctly if
 	//
-	//   normf( y - x_orig )
+	//   normfv( y - x_orig )
 	//
 	// is negligible, where
 	//
diff --git a/testsuite/src/test_xpbym.c b/testsuite/src/test_xpbym.c
index b7acc654ef7b787374a6afccf7e3eb3f3278475a..2340b4e11f3c2fa8cbe23095f721b44ce0d99e6f 100644
--- a/testsuite/src/test_xpbym.c
+++ b/testsuite/src/test_xpbym.c
@@ -288,7 +288,7 @@ void libblis_test_xpbym_check
 	//
 	// is functioning correctly if
 	//
-	//   normf( y - ( beta * y_orig + conjx(x) ) )
+	//   normfm( y - ( beta * y_orig + conjx(x) ) )
 	//
 	// is negligible.
 	//
diff --git a/testsuite/src/test_xpbyv.c b/testsuite/src/test_xpbyv.c
index fa0abdb828e56a21f4c2391aed8290ec14c7dabb..75ad98f6fe2183dfed79b90d2c62395295ae19dd 100644
--- a/testsuite/src/test_xpbyv.c
+++ b/testsuite/src/test_xpbyv.c
@@ -283,7 +283,7 @@ void libblis_test_xpbyv_check
 	//
 	// is functioning correctly if
 	//
-	//   normf( y - ( beta * y_orig + conjx(x) ) )
+	//   normfv( y - ( beta * y_orig + conjx(x) ) )
 	//
 	// is negligible.
 	//
diff --git a/travis/do_sde.sh b/travis/do_sde.sh
index 6ec9febe5d27a4c6660ca36f548acd93ef64dd11..9bf601034def281b5fd3d54e7901b903cd751d05 100755
--- a/travis/do_sde.sh
+++ b/travis/do_sde.sh
@@ -7,9 +7,12 @@ SDE_VERSION=sde-external-8.16.0-2018-01-30-lin
 SDE_TARBALL=$SDE_VERSION.tar.bz2
 SDE=$SDE_VERSION/sde64
 
-set +x
-curl -s -X POST https://content.dropboxapi.com/2/files/download -H "Authorization: Bearer $DROPBOX_TOKEN" -H "Dropbox-API-Arg: {\"path\": \"/$SDE_TARBALL\"}" > $SDE_TARBALL
-set -x
+curl --verbose --form accept_license=1 --form form_id=intel_licensed_dls_step_1 \
+     --output /dev/null --cookie-jar jar.txt \
+     --location https://software.intel.com/protected-download/267266/144917
+curl --verbose --cookie jar.txt --output $SDE_TARBALL \
+     https://software.intel.com/system/files/managed/2a/1a/$SDE_TARBALL
+
 tar xvf $SDE_TARBALL
 
 make -j2 testsuite-bin