From aa2912bc36273e1020d04e97dec3c77f81e179b6 Mon Sep 17 00:00:00 2001
From: Romain Dolbeau <romain.dolbeau@sipearl.com>
Date: Sun, 3 May 2020 15:51:49 +0200
Subject: [PATCH] avx512 version of SELL format, based on the SVE version

---
 openmp-avx512/basic/Box.hpp                   |   22 +
 openmp-avx512/basic/BoxIterator.hpp           |  143 +
 openmp-avx512/basic/BoxPartition.cpp          |  477 +
 openmp-avx512/basic/BoxPartition.hpp          |   76 +
 openmp-avx512/basic/CSRMatrix.hpp             |  139 +
 openmp-avx512/basic/ComputeNodeType.hpp       |   29 +
 openmp-avx512/basic/DotOp.hpp                 |   35 +
 openmp-avx512/basic/ELLMatrix.hpp             |  144 +
 openmp-avx512/basic/FEComputeElem.hpp         |   29 +
 openmp-avx512/basic/FusedMatvecDotOp.hpp      |   59 +
 openmp-avx512/basic/GetNodesCoords.hpp        |   24 +
 openmp-avx512/basic/Hex8_box_utils.hpp        |  174 +
 openmp-avx512/basic/Lock.hpp                  |  103 +
 openmp-avx512/basic/LockingMatrix.hpp         |   74 +
 openmp-avx512/basic/LockingVector.hpp         |   77 +
 openmp-avx512/basic/MatrixCopyOp.hpp          |   33 +
 openmp-avx512/basic/MatrixInitOp.hpp          |  183 +
 openmp-avx512/basic/MatvecOp.hpp              |   99 +
 openmp-avx512/basic/MemInitOp.hpp             |   14 +
 openmp-avx512/basic/NoOpMemoryModel.hpp       |   27 +
 openmp-avx512/basic/SerialComputeNode.hpp     |   25 +
 .../basic/SparseMatrix_functions.hpp          |  621 ++
 openmp-avx512/basic/SumInLinSys.hpp           |   33 +
 openmp-avx512/basic/TBBNode.cpp               |    8 +
 openmp-avx512/basic/TBBNode.hpp               |   76 +
 openmp-avx512/basic/TPINode.hpp               |  113 +
 openmp-avx512/basic/TypeTraits.hpp            |  137 +
 openmp-avx512/basic/Vector.hpp                |   83 +
 openmp-avx512/basic/Vector_functions.hpp      |  249 +
 openmp-avx512/basic/WaxpbyOp.hpp              |   43 +
 openmp-avx512/basic/analytic_soln.hpp         |  117 +
 openmp-avx512/basic/assemble_FE_data.hpp      |   85 +
 openmp-avx512/basic/box_utils.hpp             |  199 +
 openmp-avx512/basic/cg_solve.hpp              |  273 +
 openmp-avx512/basic/compute_matrix_stats.hpp  |  114 +
 openmp-avx512/basic/driver.hpp                |  403 +
 openmp-avx512/basic/exchange_externals.hpp    |  270 +
 .../basic/generate_matrix_structure.hpp       |  150 +
 openmp-avx512/basic/get_common_files          |   11 +
 .../basic/gold_files/1x1x2_A.mtx.1.0          |  113 +
 .../basic/gold_files/1x1x2_A.mtx.2.0          |   33 +
 .../basic/gold_files/1x1x2_A.mtx.2.1          |   80 +
 .../basic/gold_files/1x1x2_b.vec.1.0          |   13 +
 .../basic/gold_files/1x1x2_b.vec.2.0          |    5 +
 .../basic/gold_files/1x1x2_b.vec.2.1          |    8 +
 .../basic/gold_files/1x1x2_x.vec.1.0          |   13 +
 .../basic/gold_files/1x1x2_x.vec.2.0          |    5 +
 .../basic/gold_files/1x1x2_x.vec.2.1          |    8 +
 openmp-avx512/basic/imbalance.hpp             |  271 +
 openmp-avx512/basic/main.cpp                  |  247 +
 openmp-avx512/basic/make_local_matrix.hpp     |  440 +
 openmp-avx512/basic/make_targets              |   52 +
 openmp-avx512/basic/makefile                  |   35 +
 openmp-avx512/basic/makefile.cuda.gnu.serial  |   36 +
 .../basic/makefile.cuda.tbb.gnu.serial        |   37 +
 openmp-avx512/basic/makefile.debug            |   35 +
 openmp-avx512/basic/makefile.gnu.purify       |   25 +
 openmp-avx512/basic/makefile.gnu.quantify     |   24 +
 openmp-avx512/basic/makefile.gnu.serial       |   21 +
 openmp-avx512/basic/makefile.intel.serial     |   19 +
 openmp-avx512/basic/makefile.redstorm         |   21 +
 openmp-avx512/basic/makefile.tbb              |   28 +
 openmp-avx512/basic/makefile.tbb.gnu.serial   |   28 +
 openmp-avx512/basic/makefile.tpi              |   28 +
 openmp-avx512/basic/makefile.tpi.gnu.serial   |   28 +
 openmp-avx512/basic/optional/README           |    3 +
 .../basic/optional/ThreadPool/CMakeLists.txt  |   53 +
 .../basic/optional/ThreadPool/Makefile.am     |  199 +
 .../ThreadPool/Makefile.export.threadpool.in  |    9 +
 .../basic/optional/ThreadPool/Makefile.in     |  777 ++
 .../optional/ThreadPool/ThreadPool_config.h   |    3 +
 .../basic/optional/ThreadPool/aclocal.m4      |  932 ++
 .../basic/optional/ThreadPool/bootstrap       |    9 +
 .../ThreadPool/cmake/Dependencies.cmake       |   11 +
 .../ThreadPool/cmake/ThreadPool_config.h.in   |    2 +
 .../optional/ThreadPool/config/acx_pthread.m4 |  224 +
 .../optional/ThreadPool/config/config.guess   | 1500 ++++
 .../optional/ThreadPool/config/config.sub     | 1616 ++++
 .../basic/optional/ThreadPool/config/depcomp  |  584 ++
 .../ThreadPool/config/generate-makeoptions.pl |   86 +
 .../optional/ThreadPool/config/install-sh     |  507 ++
 .../basic/optional/ThreadPool/config/missing  |  367 +
 .../config/replace-install-prefix.pl          |   89 +
 .../ThreadPool/config/string-replace.pl       |   43 +
 .../ThreadPool/config/strip_dup_incl_paths.pl |   44 +
 .../ThreadPool/config/strip_dup_libs.pl       |   69 +
 .../ThreadPool/config/tac_arg_check_mpi.m4    |   68 +
 .../ThreadPool/config/tac_arg_config_mpi.m4   |  188 +
 .../config/tac_arg_enable_export-makefiles.m4 |   76 +
 .../config/tac_arg_enable_feature.m4          |   40 +
 .../tac_arg_enable_feature_sub_check.m4       |   54 +
 .../ThreadPool/config/tac_arg_with_ar.m4      |   39 +
 .../ThreadPool/config/tac_arg_with_flags.m4   |   31 +
 .../ThreadPool/config/tac_arg_with_incdirs.m4 |   24 +
 .../ThreadPool/config/tac_arg_with_libdirs.m4 |   24 +
 .../ThreadPool/config/tac_arg_with_libs.m4    |   30 +
 .../ThreadPool/config/tac_arg_with_perl.m4    |   34 +
 .../ThreadPool/config/token-replace.pl        |   43 +
 .../basic/optional/ThreadPool/configure       | 7804 +++++++++++++++++
 .../basic/optional/ThreadPool/configure.ac    |  240 +
 .../optional/ThreadPool/src/CMakeLists.txt    |   70 +
 .../basic/optional/ThreadPool/src/Makefile.am |  140 +
 .../basic/optional/ThreadPool/src/Makefile.in |  680 ++
 .../basic/optional/ThreadPool/src/TPI.c       | 1016 +++
 .../basic/optional/ThreadPool/src/TPI.h       |  253 +
 .../basic/optional/ThreadPool/src/TPI.hpp     |  135 +
 .../optional/ThreadPool/src/TPI_Walltime.c    |   44 +
 .../ThreadPool/src/ThreadPool_config.h.in     |   71 +
 .../optional/ThreadPool/test/CMakeLists.txt   |   86 +
 .../optional/ThreadPool/test/Makefile.am      |   55 +
 .../optional/ThreadPool/test/Makefile.in      |  730 ++
 .../basic/optional/ThreadPool/test/build_gnu  |   79 +
 .../optional/ThreadPool/test/build_intel      |   82 +
 .../basic/optional/ThreadPool/test/build_pgi  |   39 +
 .../ThreadPool/test/hhpccg/BoxPartitionIB.c   |  562 ++
 .../ThreadPool/test/hhpccg/BoxPartitionIB.h   |   88 +
 .../ThreadPool/test/hhpccg/CGSolver.c         |  311 +
 .../ThreadPool/test/hhpccg/CGSolver.h         |   40 +
 .../ThreadPool/test/hhpccg/CMakeLists.txt     |   83 +
 .../ThreadPool/test/hhpccg/dcrs_matrix.c      |  314 +
 .../ThreadPool/test/hhpccg/dcrs_matrix.h      |   41 +
 .../optional/ThreadPool/test/hhpccg/main.c    |  422 +
 .../ThreadPool/test/hhpccg/tpi_vector.c       |  277 +
 .../ThreadPool/test/hhpccg/tpi_vector.h       |   30 +
 .../ThreadPool/test/hpccg/BoxPartition.c      |  487 +
 .../ThreadPool/test/hpccg/BoxPartition.h      |   64 +
 .../optional/ThreadPool/test/hpccg/CGSolver.c |  248 +
 .../optional/ThreadPool/test/hpccg/CGSolver.h |   32 +
 .../ThreadPool/test/hpccg/CMakeLists.txt      |   83 +
 .../optional/ThreadPool/test/hpccg/main.c     |  340 +
 .../ThreadPool/test/hpccg/tpi_vector.c        |  273 +
 .../ThreadPool/test/hpccg/tpi_vector.h        |   31 +
 .../optional/ThreadPool/test/test_c_dnax.c    |  414 +
 .../optional/ThreadPool/test/test_mpi_sum.c   |  764 ++
 .../optional/ThreadPool/test/test_pthreads.c  |  279 +
 .../optional/ThreadPool/test/test_tpi.cpp     |  123 +
 .../optional/ThreadPool/test/test_tpi_unit.c  |  505 ++
 .../basic/optional/copy_from_trilinos         |   25 +
 .../basic/optional/cuda/CudaCall.hpp          |   21 +
 .../basic/optional/cuda/CudaMemoryModel.hpp   |  152 +
 .../basic/optional/cuda/CudaNode.cpp          |   96 +
 .../basic/optional/cuda/CudaNode.cuh          |   66 +
 .../basic/optional/cuda/CudaNode.hpp          |   57 +
 .../basic/optional/cuda/CudaNodeImpl.hpp      |   15 +
 openmp-avx512/basic/optional/cuda/Matrix.cu   |   22 +
 openmp-avx512/basic/optional/cuda/Vector.cu   |   19 +
 .../optional/cuda/cutil_inline_runtime.h      |   63 +
 openmp-avx512/basic/optional/make_targets     |   54 +
 openmp-avx512/basic/perform_element_loop.hpp  |  110 +
 .../basic/perform_element_loop_TBB_pipe.hpp   |  382 +
 .../perform_element_loop_TBB_pllfor1.hpp      |  126 +
 .../perform_element_loop_TBB_pllfor2.hpp      |  162 +
 openmp-avx512/basic/run_one_test              |   60 +
 openmp-avx512/basic/run_tests                 |   22 +
 openmp-avx512/basic/sharedmem.cuh             |  153 +
 .../basic/simple_mesh_description.hpp         |  239 +
 openmp-avx512/basic/time_kernels.hpp          |  140 +
 openmp-avx512/basic/utest.cpp                 |   68 +
 openmp-avx512/basic/utest_case.hpp            |   55 +
 openmp-avx512/basic/utest_cases.hpp           | 1232 +++
 openmp-avx512/basic/verify_solution.hpp       |  170 +
 openmp-avx512/fem/ElemData.hpp                |   64 +
 openmp-avx512/fem/Hex8.hpp                    |  417 +
 openmp-avx512/fem/Hex8_ElemData.hpp           |   86 +
 openmp-avx512/fem/Hex8_enums.hpp              |   52 +
 openmp-avx512/fem/analytic_soln.hpp           |  116 +
 openmp-avx512/fem/gauss_pts.hpp               |   67 +
 openmp-avx512/fem/matrix_algebra_3x3.hpp      |  166 +
 openmp-avx512/fem/verify_solution.hpp         |  179 +
 openmp-avx512/src/CSRMatrix.hpp               |  146 +
 openmp-avx512/src/ELLMatrix.hpp               |  139 +
 openmp-avx512/src/GetNodesCoords.hpp          |   51 +
 openmp-avx512/src/Hex8_box_utils.hpp          |  173 +
 openmp-avx512/src/Makefile                    |   42 +
 openmp-avx512/src/Makefile.cray.xc30          |   35 +
 openmp-avx512/src/Makefile.gnu.openmp         |   32 +
 openmp-avx512/src/Makefile.intel.openmp       |   34 +
 openmp-avx512/src/MatrixCopyOp.hpp            |   60 +
 openmp-avx512/src/MatrixInitOp.hpp            |  320 +
 openmp-avx512/src/README.md                   |    6 +
 openmp-avx512/src/SELLMatrix.hpp              |  160 +
 openmp-avx512/src/SparseMatrix_functions.hpp  |  837 ++
 openmp-avx512/src/Vector.hpp                  |   68 +
 openmp-avx512/src/Vector_functions.hpp        |  308 +
 openmp-avx512/src/YAML_Doc.cpp                |  102 +
 openmp-avx512/src/YAML_Doc.hpp                |  115 +
 openmp-avx512/src/YAML_Element.cpp            |  148 +
 openmp-avx512/src/YAML_Element.hpp            |   79 +
 openmp-avx512/src/assemble_FE_data.hpp        |   78 +
 openmp-avx512/src/cg_solve.hpp                |  215 +
 openmp-avx512/src/driver.hpp                  |  410 +
 openmp-avx512/src/exchange_externals.hpp      |  270 +
 openmp-avx512/src/generate_info_header        |   88 +
 .../src/generate_matrix_structure.hpp         |  165 +
 openmp-avx512/src/get_common_files            |   15 +
 openmp-avx512/src/main.cpp                    |  276 +
 openmp-avx512/src/make_local_matrix.hpp       |  447 +
 openmp-avx512/src/make_targets                |   45 +
 openmp-avx512/src/perform_element_loop.hpp    |   95 +
 openmp-avx512/src/simple_mesh_description.hpp |  248 +
 openmp-avx512/src/time_kernels.hpp            |  139 +
 openmp-avx512/utils/Box.hpp                   |   55 +
 openmp-avx512/utils/BoxIterator.hpp           |  142 +
 openmp-avx512/utils/BoxPartition.cpp          |  503 ++
 openmp-avx512/utils/BoxPartition.hpp          |  103 +
 openmp-avx512/utils/Parameters.hpp            |   64 +
 openmp-avx512/utils/TypeTraits.hpp            |  136 +
 openmp-avx512/utils/box_utils.hpp             |  320 +
 openmp-avx512/utils/compute_matrix_stats.hpp  |  116 +
 openmp-avx512/utils/imbalance.hpp             |  298 +
 openmp-avx512/utils/miniFE_no_info.hpp        |   39 +
 openmp-avx512/utils/miniFE_version.h          |   35 +
 openmp-avx512/utils/mytimer.cpp               |  132 +
 openmp-avx512/utils/mytimer.hpp               |   52 +
 openmp-avx512/utils/outstream.hpp             |   45 +
 openmp-avx512/utils/param_utils.cpp           |   58 +
 openmp-avx512/utils/param_utils.hpp           |  160 +
 openmp-avx512/utils/utils.cpp                 |  136 +
 openmp-avx512/utils/utils.hpp                 |  204 +
 219 files changed, 44337 insertions(+)
 create mode 100644 openmp-avx512/basic/Box.hpp
 create mode 100644 openmp-avx512/basic/BoxIterator.hpp
 create mode 100644 openmp-avx512/basic/BoxPartition.cpp
 create mode 100644 openmp-avx512/basic/BoxPartition.hpp
 create mode 100644 openmp-avx512/basic/CSRMatrix.hpp
 create mode 100644 openmp-avx512/basic/ComputeNodeType.hpp
 create mode 100644 openmp-avx512/basic/DotOp.hpp
 create mode 100644 openmp-avx512/basic/ELLMatrix.hpp
 create mode 100644 openmp-avx512/basic/FEComputeElem.hpp
 create mode 100644 openmp-avx512/basic/FusedMatvecDotOp.hpp
 create mode 100644 openmp-avx512/basic/GetNodesCoords.hpp
 create mode 100644 openmp-avx512/basic/Hex8_box_utils.hpp
 create mode 100644 openmp-avx512/basic/Lock.hpp
 create mode 100644 openmp-avx512/basic/LockingMatrix.hpp
 create mode 100644 openmp-avx512/basic/LockingVector.hpp
 create mode 100644 openmp-avx512/basic/MatrixCopyOp.hpp
 create mode 100644 openmp-avx512/basic/MatrixInitOp.hpp
 create mode 100644 openmp-avx512/basic/MatvecOp.hpp
 create mode 100644 openmp-avx512/basic/MemInitOp.hpp
 create mode 100644 openmp-avx512/basic/NoOpMemoryModel.hpp
 create mode 100644 openmp-avx512/basic/SerialComputeNode.hpp
 create mode 100644 openmp-avx512/basic/SparseMatrix_functions.hpp
 create mode 100644 openmp-avx512/basic/SumInLinSys.hpp
 create mode 100644 openmp-avx512/basic/TBBNode.cpp
 create mode 100644 openmp-avx512/basic/TBBNode.hpp
 create mode 100644 openmp-avx512/basic/TPINode.hpp
 create mode 100644 openmp-avx512/basic/TypeTraits.hpp
 create mode 100644 openmp-avx512/basic/Vector.hpp
 create mode 100644 openmp-avx512/basic/Vector_functions.hpp
 create mode 100644 openmp-avx512/basic/WaxpbyOp.hpp
 create mode 100644 openmp-avx512/basic/analytic_soln.hpp
 create mode 100644 openmp-avx512/basic/assemble_FE_data.hpp
 create mode 100644 openmp-avx512/basic/box_utils.hpp
 create mode 100644 openmp-avx512/basic/cg_solve.hpp
 create mode 100644 openmp-avx512/basic/compute_matrix_stats.hpp
 create mode 100644 openmp-avx512/basic/driver.hpp
 create mode 100644 openmp-avx512/basic/exchange_externals.hpp
 create mode 100644 openmp-avx512/basic/generate_matrix_structure.hpp
 create mode 100755 openmp-avx512/basic/get_common_files
 create mode 100644 openmp-avx512/basic/gold_files/1x1x2_A.mtx.1.0
 create mode 100644 openmp-avx512/basic/gold_files/1x1x2_A.mtx.2.0
 create mode 100644 openmp-avx512/basic/gold_files/1x1x2_A.mtx.2.1
 create mode 100644 openmp-avx512/basic/gold_files/1x1x2_b.vec.1.0
 create mode 100644 openmp-avx512/basic/gold_files/1x1x2_b.vec.2.0
 create mode 100644 openmp-avx512/basic/gold_files/1x1x2_b.vec.2.1
 create mode 100644 openmp-avx512/basic/gold_files/1x1x2_x.vec.1.0
 create mode 100644 openmp-avx512/basic/gold_files/1x1x2_x.vec.2.0
 create mode 100644 openmp-avx512/basic/gold_files/1x1x2_x.vec.2.1
 create mode 100644 openmp-avx512/basic/imbalance.hpp
 create mode 100644 openmp-avx512/basic/main.cpp
 create mode 100644 openmp-avx512/basic/make_local_matrix.hpp
 create mode 100644 openmp-avx512/basic/make_targets
 create mode 100644 openmp-avx512/basic/makefile
 create mode 100644 openmp-avx512/basic/makefile.cuda.gnu.serial
 create mode 100644 openmp-avx512/basic/makefile.cuda.tbb.gnu.serial
 create mode 100644 openmp-avx512/basic/makefile.debug
 create mode 100644 openmp-avx512/basic/makefile.gnu.purify
 create mode 100644 openmp-avx512/basic/makefile.gnu.quantify
 create mode 100644 openmp-avx512/basic/makefile.gnu.serial
 create mode 100644 openmp-avx512/basic/makefile.intel.serial
 create mode 100644 openmp-avx512/basic/makefile.redstorm
 create mode 100644 openmp-avx512/basic/makefile.tbb
 create mode 100644 openmp-avx512/basic/makefile.tbb.gnu.serial
 create mode 100644 openmp-avx512/basic/makefile.tpi
 create mode 100644 openmp-avx512/basic/makefile.tpi.gnu.serial
 create mode 100644 openmp-avx512/basic/optional/README
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/CMakeLists.txt
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/Makefile.am
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/Makefile.export.threadpool.in
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/Makefile.in
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/ThreadPool_config.h
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/aclocal.m4
 create mode 100755 openmp-avx512/basic/optional/ThreadPool/bootstrap
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/cmake/Dependencies.cmake
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/cmake/ThreadPool_config.h.in
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/config/acx_pthread.m4
 create mode 100755 openmp-avx512/basic/optional/ThreadPool/config/config.guess
 create mode 100755 openmp-avx512/basic/optional/ThreadPool/config/config.sub
 create mode 100755 openmp-avx512/basic/optional/ThreadPool/config/depcomp
 create mode 100755 openmp-avx512/basic/optional/ThreadPool/config/generate-makeoptions.pl
 create mode 100755 openmp-avx512/basic/optional/ThreadPool/config/install-sh
 create mode 100755 openmp-avx512/basic/optional/ThreadPool/config/missing
 create mode 100755 openmp-avx512/basic/optional/ThreadPool/config/replace-install-prefix.pl
 create mode 100755 openmp-avx512/basic/optional/ThreadPool/config/string-replace.pl
 create mode 100755 openmp-avx512/basic/optional/ThreadPool/config/strip_dup_incl_paths.pl
 create mode 100755 openmp-avx512/basic/optional/ThreadPool/config/strip_dup_libs.pl
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/config/tac_arg_check_mpi.m4
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/config/tac_arg_config_mpi.m4
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/config/tac_arg_enable_export-makefiles.m4
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/config/tac_arg_enable_feature.m4
 create mode 100755 openmp-avx512/basic/optional/ThreadPool/config/tac_arg_enable_feature_sub_check.m4
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/config/tac_arg_with_ar.m4
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/config/tac_arg_with_flags.m4
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/config/tac_arg_with_incdirs.m4
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/config/tac_arg_with_libdirs.m4
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/config/tac_arg_with_libs.m4
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/config/tac_arg_with_perl.m4
 create mode 100755 openmp-avx512/basic/optional/ThreadPool/config/token-replace.pl
 create mode 100755 openmp-avx512/basic/optional/ThreadPool/configure
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/configure.ac
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/src/CMakeLists.txt
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/src/Makefile.am
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/src/Makefile.in
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/src/TPI.c
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/src/TPI.h
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/src/TPI.hpp
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/src/TPI_Walltime.c
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/src/ThreadPool_config.h.in
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/test/CMakeLists.txt
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/test/Makefile.am
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/test/Makefile.in
 create mode 100755 openmp-avx512/basic/optional/ThreadPool/test/build_gnu
 create mode 100755 openmp-avx512/basic/optional/ThreadPool/test/build_intel
 create mode 100755 openmp-avx512/basic/optional/ThreadPool/test/build_pgi
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/test/hhpccg/BoxPartitionIB.c
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/test/hhpccg/BoxPartitionIB.h
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/test/hhpccg/CGSolver.c
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/test/hhpccg/CGSolver.h
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/test/hhpccg/CMakeLists.txt
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/test/hhpccg/dcrs_matrix.c
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/test/hhpccg/dcrs_matrix.h
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/test/hhpccg/main.c
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/test/hhpccg/tpi_vector.c
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/test/hhpccg/tpi_vector.h
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/test/hpccg/BoxPartition.c
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/test/hpccg/BoxPartition.h
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/test/hpccg/CGSolver.c
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/test/hpccg/CGSolver.h
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/test/hpccg/CMakeLists.txt
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/test/hpccg/main.c
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/test/hpccg/tpi_vector.c
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/test/hpccg/tpi_vector.h
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/test/test_c_dnax.c
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/test/test_mpi_sum.c
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/test/test_pthreads.c
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/test/test_tpi.cpp
 create mode 100644 openmp-avx512/basic/optional/ThreadPool/test/test_tpi_unit.c
 create mode 100755 openmp-avx512/basic/optional/copy_from_trilinos
 create mode 100644 openmp-avx512/basic/optional/cuda/CudaCall.hpp
 create mode 100644 openmp-avx512/basic/optional/cuda/CudaMemoryModel.hpp
 create mode 100644 openmp-avx512/basic/optional/cuda/CudaNode.cpp
 create mode 100644 openmp-avx512/basic/optional/cuda/CudaNode.cuh
 create mode 100644 openmp-avx512/basic/optional/cuda/CudaNode.hpp
 create mode 100644 openmp-avx512/basic/optional/cuda/CudaNodeImpl.hpp
 create mode 100644 openmp-avx512/basic/optional/cuda/Matrix.cu
 create mode 100644 openmp-avx512/basic/optional/cuda/Vector.cu
 create mode 100644 openmp-avx512/basic/optional/cuda/cutil_inline_runtime.h
 create mode 100644 openmp-avx512/basic/optional/make_targets
 create mode 100644 openmp-avx512/basic/perform_element_loop.hpp
 create mode 100644 openmp-avx512/basic/perform_element_loop_TBB_pipe.hpp
 create mode 100644 openmp-avx512/basic/perform_element_loop_TBB_pllfor1.hpp
 create mode 100644 openmp-avx512/basic/perform_element_loop_TBB_pllfor2.hpp
 create mode 100755 openmp-avx512/basic/run_one_test
 create mode 100755 openmp-avx512/basic/run_tests
 create mode 100644 openmp-avx512/basic/sharedmem.cuh
 create mode 100644 openmp-avx512/basic/simple_mesh_description.hpp
 create mode 100644 openmp-avx512/basic/time_kernels.hpp
 create mode 100644 openmp-avx512/basic/utest.cpp
 create mode 100644 openmp-avx512/basic/utest_case.hpp
 create mode 100644 openmp-avx512/basic/utest_cases.hpp
 create mode 100644 openmp-avx512/basic/verify_solution.hpp
 create mode 100644 openmp-avx512/fem/ElemData.hpp
 create mode 100644 openmp-avx512/fem/Hex8.hpp
 create mode 100644 openmp-avx512/fem/Hex8_ElemData.hpp
 create mode 100644 openmp-avx512/fem/Hex8_enums.hpp
 create mode 100644 openmp-avx512/fem/analytic_soln.hpp
 create mode 100644 openmp-avx512/fem/gauss_pts.hpp
 create mode 100644 openmp-avx512/fem/matrix_algebra_3x3.hpp
 create mode 100644 openmp-avx512/fem/verify_solution.hpp
 create mode 100644 openmp-avx512/src/CSRMatrix.hpp
 create mode 100644 openmp-avx512/src/ELLMatrix.hpp
 create mode 100644 openmp-avx512/src/GetNodesCoords.hpp
 create mode 100644 openmp-avx512/src/Hex8_box_utils.hpp
 create mode 100644 openmp-avx512/src/Makefile
 create mode 100644 openmp-avx512/src/Makefile.cray.xc30
 create mode 100644 openmp-avx512/src/Makefile.gnu.openmp
 create mode 100644 openmp-avx512/src/Makefile.intel.openmp
 create mode 100644 openmp-avx512/src/MatrixCopyOp.hpp
 create mode 100644 openmp-avx512/src/MatrixInitOp.hpp
 create mode 100644 openmp-avx512/src/README.md
 create mode 100644 openmp-avx512/src/SELLMatrix.hpp
 create mode 100644 openmp-avx512/src/SparseMatrix_functions.hpp
 create mode 100644 openmp-avx512/src/Vector.hpp
 create mode 100644 openmp-avx512/src/Vector_functions.hpp
 create mode 100644 openmp-avx512/src/YAML_Doc.cpp
 create mode 100644 openmp-avx512/src/YAML_Doc.hpp
 create mode 100644 openmp-avx512/src/YAML_Element.cpp
 create mode 100644 openmp-avx512/src/YAML_Element.hpp
 create mode 100644 openmp-avx512/src/assemble_FE_data.hpp
 create mode 100644 openmp-avx512/src/cg_solve.hpp
 create mode 100644 openmp-avx512/src/driver.hpp
 create mode 100644 openmp-avx512/src/exchange_externals.hpp
 create mode 100755 openmp-avx512/src/generate_info_header
 create mode 100644 openmp-avx512/src/generate_matrix_structure.hpp
 create mode 100755 openmp-avx512/src/get_common_files
 create mode 100644 openmp-avx512/src/main.cpp
 create mode 100644 openmp-avx512/src/make_local_matrix.hpp
 create mode 100644 openmp-avx512/src/make_targets
 create mode 100644 openmp-avx512/src/perform_element_loop.hpp
 create mode 100644 openmp-avx512/src/simple_mesh_description.hpp
 create mode 100644 openmp-avx512/src/time_kernels.hpp
 create mode 100644 openmp-avx512/utils/Box.hpp
 create mode 100644 openmp-avx512/utils/BoxIterator.hpp
 create mode 100644 openmp-avx512/utils/BoxPartition.cpp
 create mode 100644 openmp-avx512/utils/BoxPartition.hpp
 create mode 100644 openmp-avx512/utils/Parameters.hpp
 create mode 100644 openmp-avx512/utils/TypeTraits.hpp
 create mode 100644 openmp-avx512/utils/box_utils.hpp
 create mode 100644 openmp-avx512/utils/compute_matrix_stats.hpp
 create mode 100644 openmp-avx512/utils/imbalance.hpp
 create mode 100644 openmp-avx512/utils/miniFE_no_info.hpp
 create mode 100644 openmp-avx512/utils/miniFE_version.h
 create mode 100644 openmp-avx512/utils/mytimer.cpp
 create mode 100644 openmp-avx512/utils/mytimer.hpp
 create mode 100644 openmp-avx512/utils/outstream.hpp
 create mode 100644 openmp-avx512/utils/param_utils.cpp
 create mode 100644 openmp-avx512/utils/param_utils.hpp
 create mode 100644 openmp-avx512/utils/utils.cpp
 create mode 100644 openmp-avx512/utils/utils.hpp

diff --git a/openmp-avx512/basic/Box.hpp b/openmp-avx512/basic/Box.hpp
new file mode 100644
index 0000000..62046e4
--- /dev/null
+++ b/openmp-avx512/basic/Box.hpp
@@ -0,0 +1,22 @@
+#ifndef _Box_hpp_
+#define _Box_hpp_
+
+/**
+  * a 'Box' is 3 pairs of ints, where each pair specifies a lower
+  * and upper bound for one of the 3 spatial dimensions.
+  *
+  * This struct stores the 3 pairs as a simple array of 6 ints,
+  * but defines the bracket operator so that it can be referenced
+  * using 2-dimensional array notation like this:
+  * int xmin = box[0][0]; int xmax = box[0][1];
+  * int ymin = box[1][0]; int ymax = box[1][1];
+  * int zmin = box[2][0]; int zmax = box[2][1];
+ */
+struct Box {
+  int ranges[6];  // laid out as {xmin, xmax, ymin, ymax, zmin, zmax}
+  int* operator[](int xyz) { return ranges + 2 * xyz; }  // {min,max} pair of axis xyz
+  const int* operator[](int xyz) const { return ranges + 2 * xyz; }
+};
+
+#endif
+
diff --git a/openmp-avx512/basic/BoxIterator.hpp b/openmp-avx512/basic/BoxIterator.hpp
new file mode 100644
index 0000000..f644119
--- /dev/null
+++ b/openmp-avx512/basic/BoxIterator.hpp
@@ -0,0 +1,143 @@
+#ifndef _BoxTraverser_hpp_
+#define _BoxTraverser_hpp_
+
+//@HEADER
+// ************************************************************************
+// 
+//               miniFE: simple finite-element assembly and linear-solve
+//                 Copyright (2006) Sandia Corporation
+// 
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+// 
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//  
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//  
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+
+namespace miniFE {
+
+/** Class for traversing a 3-dimensional 'box' of indices.
+
+  //One way to traverse a 'box[3][2]' is to use a triply-nested for-loop:
+  for(int z=box[2][0]; z<box[2][1]; ++z) {
+    for(int y=box[1][0]; y<box[1][1]; ++y) {
+      for(int x=box[0][0]; x<box[0][1]; ++x) {
+        ...
+      }
+    }
+  }
+
+  //Another way is to use this BoxIterator class, like so:
+  //BoxIterator iter = BoxIterator::begin(box);
+  //BoxIterator end = BoxIterator::end(box);
+  for(; iter != end; ++iter) {
+    int x = iter.x;
+    int y = iter.y;
+    int z = iter.z;
+    ...
+  }
+*/
+class BoxIterator {
+public:
+  // Copy, assignment and destruction are compiler-generated (plain int state).
+
+  /** Iterator at the first index of 'box'; equals end(box) when any axis
+    * of 'box' is empty, so that a begin/end loop visits nothing. */
+  static BoxIterator begin(const Box& box)
+  {
+    return BoxIterator(box);
+  }
+
+  /** Past-the-end iterator: x, y and z all hold their upper bounds. */
+  static BoxIterator end(const Box& box)
+  {
+    return BoxIterator(box, true/*at_end==true*/);
+  }
+
+  /** Pre-increment: advance x first, carrying into y and then z whenever an
+    * axis reaches its upper bound. Once z runs out the iterator is pinned to
+    * the end() state, so incrementing end() leaves it unchanged. */
+  BoxIterator& operator++()
+  {
+    ++x;
+    if (x >= box_[0][1]) {
+      x = box_[0][0];
+      ++y;
+      if (y >= box_[1][1]) {
+        y = box_[1][0];
+        ++z;
+        if (z >= box_[2][1]) {
+          z = box_[2][1];
+          y = box_[1][1];
+          x = box_[0][1];
+        }
+      }
+    }
+    return *this;
+  }
+
+  /** Post-increment: returns the position held before advancing. */
+  BoxIterator operator++(int)
+  {
+    BoxIterator temp = *this;
+    ++(*this);
+    return temp;
+  }
+
+  /** Positions are compared by (x,y,z) only; the bounds are not compared,
+    * so comparing iterators made from different boxes is not meaningful. */
+  bool operator==(const BoxIterator& rhs) const
+  {
+    return x == rhs.x && y == rhs.y && z == rhs.z;
+  }
+
+  bool operator!=(const BoxIterator& rhs) const
+  {
+    return !(this->operator==(rhs));
+  }
+
+  /* Current position, read directly by callers (iter.x, iter.y, iter.z). */
+  int x;
+  int y;
+  int z;
+
+private:
+  /** Start at the lower corner of 'box', or at the end() state when asked
+    * to (at_end) or when some axis has lower bound >= upper bound. Without
+    * the emptiness test, begin() != end() for a box that is empty along only
+    * some axes and the loop would yield out-of-range indices. */
+  BoxIterator(const Box& box, bool at_end = false)
+   : x(box[0][0]), y(box[1][0]), z(box[2][0]), box_(box)
+  {
+    const bool is_empty = box[0][0] >= box[0][1] ||
+                          box[1][0] >= box[1][1] ||
+                          box[2][0] >= box[2][1];
+    if (at_end || is_empty) {
+      x = box[0][1];
+      y = box[1][1];
+      z = box[2][1];
+    }
+  }
+
+  Box box_; /* private copy of the bounds being traversed */
+};//class BoxIterator
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/basic/BoxPartition.cpp b/openmp-avx512/basic/BoxPartition.cpp
new file mode 100644
index 0000000..2a4e5a7
--- /dev/null
+++ b/openmp-avx512/basic/BoxPartition.cpp
@@ -0,0 +1,477 @@
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <Box.hpp>
+#include <BoxPartition.hpp>
+
+/*--------------------------------------------------------------------*/
+
+/* Linear index of local cell (x,y,z) inside the ghost-padded box, or -1. */
+static int box_map_local_entry( const Box& box ,
+                                const int ghost ,
+                                int local_x ,
+                                int local_y ,
+                                int local_z )
+{
+  /* Padded extents: the owned span plus 'ghost' layers on each side. */
+  const int span_x = 2 * ghost + box[0][1] - box[0][0] ;
+  const int span_y = 2 * ghost + box[1][1] - box[1][0] ;
+  const int span_z = 2 * ghost + box[2][1] - box[2][0] ;
+
+  /* Shift so that the first ghost layer sits at coordinate zero. */
+  const int px = local_x + ghost ;
+  const int py = local_y + ghost ;
+  const int pz = local_z + ghost ;
+
+  const bool inside = 0 <= px && px < span_x &&
+                      0 <= py && py < span_y &&
+                      0 <= pz && pz < span_z ;
+
+  return inside ? pz * span_y * span_x + py * span_x + px : -1 ;
+}
+
+int box_map_local( const Box& box_local,
+                   const int ghost ,
+                   const int box_local_map[] ,
+                   const int local_x ,
+                   const int local_y ,
+                   const int local_z )
+{
+  /* Translate (x,y,z) to its slot in the ghost-padded box, then look the
+   * slot up in 'box_local_map'; a cell outside the padded box yields -1. */
+  const int slot =
+    box_map_local_entry(box_local,ghost,local_x,local_y,local_z);
+
+  if ( slot < 0 ) { return slot ; }
+  return box_local_map[ slot ];
+}
+
+/*--------------------------------------------------------------------*/
+/* Recursively split a box into (up-ip) sub-boxes */
+
+void box_partition( int ip , int up , int axis ,
+                    const Box& box,
+                    Box* p_box )
+{
+  const int np = up - ip ;
+
+  /* Base case: a single partition receives the whole box. */
+  if ( 1 == np ) {
+    for ( int d = 0 ; d < 3 ; ++d ) {
+      p_box[ip][d][0] = box[d][0] ;
+      p_box[ip][d][1] = box[d][1] ;
+    }
+    return ;
+  }
+
+  /* Split the partition count; the upper half takes the odd one out. */
+  const int np_low = np / 2 ;
+  const int np_upp = np - np_low ;
+
+  /* Split the cells along 'axis' in proportion to the partition counts. */
+  const int n = box[ axis ][1] - box[ axis ][0] ;
+  const int n_upp = (int) (((double) n) * ( ((double)np_upp) / ((double)np)));
+  const int n_low = n - n_upp ;
+  const int next_axis = ( axis + 2 ) % 3 ;
+
+  if ( np_low ) { /* Lower part serves partitions [ip,ip+np_low) */
+    Box lower ;
+    for ( int d = 0 ; d < 3 ; ++d )
+      { lower[d][0] = box[d][0] ; lower[d][1] = box[d][1] ; }
+    lower[ axis ][1] = lower[ axis ][0] + n_low ;
+
+    box_partition( ip, ip + np_low, next_axis, lower, p_box );
+  }
+
+  if ( np_upp ) { /* Upper part serves [ip+np_low,ip+np_low+np_upp) */
+    Box upper ;
+    for ( int d = 0 ; d < 3 ; ++d )
+      { upper[d][0] = box[d][0] ; upper[d][1] = box[d][1] ; }
+    upper[ axis ][0] += n_low ;
+    upper[ axis ][1]  = upper[ axis ][0] + n_upp ;
+
+    box_partition( ip + np_low, ip + np_low + np_upp, next_axis, upper, p_box );
+  }
+}
+
+/*--------------------------------------------------------------------*/
+
+static int box_disjoint( const Box& a , const Box& b)
+{ /* Disjoint iff the half-open spans miss each other on some axis. */
+  for ( int d = 0 ; d < 3 ; ++d )
+    { if ( a[d][1] <= b[d][0] || b[d][1] <= a[d][0] ) { return 1 ; } }
+  return 0 ;
+}
+
+static void resize_int( int ** a , int * allocLen , int newLen )
+{ /* Grow *a to hold >= newLen ints; capacity is a power of two >= 32. */
+  int k = 32;
+  while ( k < newLen ) { k <<= 1 ; }
+  if ( NULL != *a && k <= *allocLen ) { return ; } /* already large enough */
+  int * const grown = (int*)realloc( *a , sizeof(int) * k ); /* NULL *a acts as malloc */
+  if ( NULL == grown ) { abort(); } /* out of memory: never store through NULL */
+  *a = grown ; *allocLen = k ;      /* publish only after the allocation succeeded */
+}
+
+static void box_partition_maps( 
+  const int np ,            /* [in]  number of partition boxes in pbox */
+  const int my_p ,          /* [in]  index of this partition in pbox */
+  const Box* pbox,          /* [in]  the partition boxes */
+  const int ghost ,         /* [in]  ghost layer width around my box */
+  int ** map_local_id ,     /* [out] padded-cell entry -> local ordinal, -1 if unowned */
+  int ** map_recv_pc ,      /* [out] np+1 prefix counts of local ordinals per owner */
+  int ** map_send_pc ,      /* [out] np+1 prefix counts into map_send_id */
+  int ** map_send_id )      /* [out] local ordinals to send; NULL if nothing is sent */
+{
+  const Box& my_box = pbox[my_p] ;
+  /* Origin and extents of the box owned by this partition. */
+  const int my_ix = my_box[0][0] ;
+  const int my_iy = my_box[1][0] ;
+  const int my_iz = my_box[2][0] ;
+  const int my_nx = my_box[0][1] - my_box[0][0] ;
+  const int my_ny = my_box[1][1] - my_box[1][0] ;
+  const int my_nz = my_box[2][1] - my_box[2][0] ;
+  /* Extents of the box plus 'ghost' layers on both sides of each axis. */
+  const int my_use_nx = 2 * ghost + my_nx ;
+  const int my_use_ny = 2 * ghost + my_ny ;
+  const int my_use_nz = 2 * ghost + my_nz ;
+
+  const int id_length = my_use_nx * my_use_ny * my_use_nz ;
+
+  int * local_id  = (int *) malloc( id_length * sizeof(int) );
+  int * recv_pc   = (int *) malloc( ( np + 1 ) * sizeof(int) );
+  int * send_pc   = (int *) malloc( ( np + 1 ) * sizeof(int) );
+  /* send_id starts empty and is grown on demand by resize_int(). */
+  int * send_id  = NULL ;
+  int   send_id_size = 0 ;
+
+  int iLocal , iSend ;
+  int i ;
+  /* Bounds of the ghost-padded region that this partition reads. */
+  Box my_use_box;
+
+  my_use_box[0][0] = my_box[0][0] - ghost ;
+  my_use_box[0][1] = my_box[0][1] + ghost ;
+  my_use_box[1][0] = my_box[1][0] - ghost ;
+  my_use_box[1][1] = my_box[1][1] + ghost ;
+  my_use_box[2][0] = my_box[2][0] - ghost ;
+  my_use_box[2][1] = my_box[2][1] + ghost ;
+  /* Cells that no visited partition owns keep the ordinal -1. */
+  for ( i = 0 ; i < id_length ; ++i ) { local_id[i] = -1 ; }
+
+  iSend = 0 ;
+  iLocal = 0 ;
+
+  /* The vector space is partitioned by processors */
+  /* Slot i describes partition (i + my_p) % np, so slot 0 is my own box. */
+  for ( i = 0 ; i < np ; ++i ) {
+    const int ip = ( i + my_p ) % np ;
+    recv_pc[i] = iLocal ;
+    send_pc[i] = iSend ;
+    /* Only partitions whose box intersects my padded region are scanned. */
+    if ( ! box_disjoint( my_use_box , pbox[ip] ) ) {
+      const int p_ix = pbox[ip][0][0] ;
+      const int p_iy = pbox[ip][1][0] ;
+      const int p_iz = pbox[ip][2][0] ;
+      const int p_ex = pbox[ip][0][1] ;
+      const int p_ey = pbox[ip][1][1] ;
+      const int p_ez = pbox[ip][2][1] ;
+
+      int local_x , local_y , local_z ;
+
+      /* Run the span of global cells that my processor uses */
+
+      for ( local_z = -ghost ; local_z < my_nz + ghost ; ++local_z ) {
+      for ( local_y = -ghost ; local_y < my_ny + ghost ; ++local_y ) {
+      for ( local_x = -ghost ; local_x < my_nx + ghost ; ++local_x ) {
+
+        const int global_z = local_z + my_iz ;
+        const int global_y = local_y + my_iy ;
+        const int global_x = local_x + my_ix ;
+
+        const int entry = 
+          box_map_local_entry(my_box,ghost,local_x,local_y,local_z);
+        /* The loop bounds match the padded box, so a miss is a logic error. */
+        if ( entry < 0 ) { abort(); }
+
+        if ( p_iz <= global_z && global_z < p_ez &&
+             p_iy <= global_y && global_y < p_ey &&
+             p_ix <= global_x && global_x < p_ex ) {
+
+          /* This ordinal is owned by processor 'ip' */
+
+          local_id[ entry ] = iLocal++ ;
+
+#if defined(DEBUG_PRINT)
+if ( my_p != ip ) {
+  fprintf(stdout,"  (%d,%d,%d) : P%d recv at local %d from P%d\n",
+                  global_x,global_y,global_z,my_p,local_id[entry],ip);
+  fflush(stdout);
+}
+#endif
+        }
+
+        /* If in my ownership and used by the other processor */
+        if ( my_p != ip &&
+             /* In my ownership: */
+             ( 0 <= local_z && local_z < my_nz &&
+               0 <= local_y && local_y < my_ny &&
+               0 <= local_x && local_x < my_nx ) &&
+             /* In other processors usage: */
+             ( p_iz - ghost <= global_z && global_z < p_ez + ghost &&
+               p_iy - ghost <= global_y && global_y < p_ey + ghost &&
+               p_ix - ghost <= global_x && global_x < p_ex + ghost ) ) {
+          /* Record the local ordinal given to this owned cell in slot 0. */
+          resize_int( & send_id , & send_id_size , (iSend + 1) );
+          send_id[ iSend ] = local_id[ entry ] ;
+          ++iSend ;
+
+#if defined(DEBUG_PRINT)
+{
+  fprintf(stdout,"  (%d,%d,%d) : P%d send at local %d to P%d\n",
+                  global_x,global_y,global_z,my_p,local_id[entry],ip);
+  fflush(stdout);
+}
+#endif
+        }
+      }
+    }
+    }
+    }
+  }
+  recv_pc[np] = iLocal ; /* total number of local ordinals assigned */
+  send_pc[np] = iSend ;  /* total number of entries in send_id */
+  /* Publish the arrays through the output parameters. */
+  *map_local_id  = local_id ;
+  *map_recv_pc   = recv_pc ;
+  *map_send_pc   = send_pc ;
+  *map_send_id   = send_id ;
+}
+
+void box_partition_rcb( const int np , 
+                        const int my_p ,
+                        const Box& root_box,
+                        const int ghost ,
+                        Box** pbox,
+                        int ** map_local_id ,
+                        int ** map_recv_pc ,
+                        int ** map_send_pc ,
+                        int ** map_send_id )
+{
+  *pbox = new Box[ np ]; /* array of np boxes, allocated with new[] */
+  /* Split root_box into np boxes; the recursion starts on axis 2. */
+  box_partition( 0 , np , 2 , root_box , *pbox );
+  /* Build the local-ordinal and send/receive maps for partition my_p. */
+  box_partition_maps( np , my_p , *pbox , ghost ,
+                      map_local_id , map_recv_pc , 
+                      map_send_pc , map_send_id );
+}
+
+/*--------------------------------------------------------------------*/
+
+#ifdef UNIT_TEST
+
+static int box_contain( const Box& a , const Box& b )
+{ /* Non-zero when the span of 'b' lies within that of 'a' on every axis. */
+  for ( int d = 0 ; d < 3 ; ++d )
+    { if ( b[d][0] < a[d][0] || a[d][1] < b[d][1] ) { return 0 ; } }
+  return 1 ;
+}
+
+static void box_print( FILE * fp , const Box& a )
+{ /* Write the three half-open index ranges of 'a' (no trailing newline). */
+  const int x_lo = a[0][0] , x_hi = a[0][1] ;
+  const int y_lo = a[1][0] , y_hi = a[1][1] ;
+  const int z_lo = a[2][0] , z_hi = a[2][1] ;
+  fprintf(fp,"{ [ %d , %d ) , [ %d , %d ) , [ %d , %d ) }",x_lo,x_hi,y_lo,y_hi,z_lo,z_hi);
+}
+
+static void test_box( const Box& box , const int np )
+{ /* Partition 'box' into np boxes; abort unless they tile it exactly. */
+  const int ncell_box = ( box[0][1] - box[0][0] ) * /* extents, so that a */
+                        ( box[1][1] - box[1][0] ) * /* box with a nonzero */
+                        ( box[2][1] - box[2][0] );  /* origin also counts */
+  int ncell_total = 0 ;
+  int ncell_min = ncell_box ;
+  int ncell_max = 0 ;
+  std::vector<Box> pbox(np);
+  int i , j ;
+
+  box_partition( 0 , np , 2 , box , &pbox[0] );
+
+  for ( i = 0 ; i < np ; ++i ) {
+    const int ncell = ( pbox[i][0][1] - pbox[i][0][0] ) *
+                      ( pbox[i][1][1] - pbox[i][1][0] ) *
+                      ( pbox[i][2][1] - pbox[i][2][0] );
+    /* Every sub-box must stay inside the root box. */
+    if ( ! box_contain( box , pbox[i] ) ) {
+      fprintf(stdout,"  OUT OF BOUNDS pbox[%d/%d] = ",i,np);
+      box_print(stdout,pbox[i]);
+      fprintf(stdout,"\n");
+      abort();
+    }
+    /* Every pair of sub-boxes must be disjoint. */
+    for ( j = i + 1 ; j < np ; ++j ) {
+      if ( ! box_disjoint( pbox[i] , pbox[j] ) ) {
+        fprintf(stdout,"  NOT DISJOINT pbox[%d/%d] = ",i,np);
+        box_print(stdout, pbox[i]);
+        fprintf(stdout,"\n");
+        fprintf(stdout,"               pbox[%d/%d] = ",j,np);
+        box_print(stdout, pbox[j]);
+        fprintf(stdout,"\n");
+        abort();
+      }
+    }
+    ncell_total += ncell ;
+
+    if ( ncell_max < ncell ) { ncell_max = ncell ; }
+    if ( ncell < ncell_min ) { ncell_min = ncell ; }
+  }
+  /* Contained + disjoint + matching cell count means an exact tiling. */
+  if ( ncell_total != ncell_box ) {
+    fprintf(stdout,"  WRONG CELL COUNT NP = %d\n",np);
+    abort();
+  }
+  fprintf(stdout,"NP = %d, total = %d, avg = %d, min = %d, max = %d\n",
+          np,ncell_box,ncell_box/np,ncell_min,ncell_max);
+}
+
+/*--------------------------------------------------------------------*/
+
+static void test_maps( const Box& root_box , const int np )
+{ /* Build the maps of all np partitions and cross-check send against recv. */
+  const int ghost = 1 ;
+  const int nx_global = root_box[0][1] - root_box[0][0] ;
+  const int ny_global = root_box[1][1] - root_box[1][0] ;
+  int ieq , i , j ; /* NOTE(review): this 'ieq' is shadowed by the const inside the loops */
+  std::vector<Box> pbox(np);
+  int **local_values ;
+  int **map_local_id ;
+  int **map_recv_pc ;
+  int **map_send_pc ;
+  int **map_send_id ;
+
+  box_partition( 0 , np , 2 , root_box , &pbox[0] );
+  /* One slot per partition; each slot is filled in the loop below. */
+  local_values = (int **) malloc( sizeof(int*) * np );
+  map_local_id = (int **) malloc( sizeof(int*) * np );
+  map_recv_pc  = (int **) malloc( sizeof(int*) * np );
+  map_send_pc  = (int **) malloc( sizeof(int*) * np );
+  map_send_id  = (int **) malloc( sizeof(int*) * np );
+
+  /* Set each local value to the global equation number */
+
+  for ( ieq = i = 0 ; i < np ; ++i ) {
+    const Box& mybox = pbox[i] ;
+    const int nx = mybox[0][1] - mybox[0][0] ;
+    const int ny = mybox[1][1] - mybox[1][0] ;
+    const int nz = mybox[2][1] - mybox[2][0] ;
+    int ix , iy , iz ;
+
+    /* Generate the partition maps for this rank */
+    box_partition_maps( np , i , &pbox[0] , ghost ,
+                        & map_local_id[i] , & map_recv_pc[i] , 
+                        & map_send_pc[i] , & map_send_id[i] );
+    /* map_recv_pc[i][np] is the total number of local ordinals of rank i. */
+    local_values[i] = (int *) malloc( sizeof(int) * map_recv_pc[i][np] );
+
+    for ( iz = -ghost ; iz < nz + ghost ; ++iz ) {
+    for ( iy = -ghost ; iy < ny + ghost ; ++iy ) {
+    for ( ix = -ghost ; ix < nx + ghost ; ++ix ) {
+      const int ieq = box_map_local(mybox,ghost,map_local_id[i],ix,iy,iz);
+      /* Negative means the padded cell has no local ordinal on this rank. */
+      if ( 0 <= ieq ) {
+        const int ix_global = ix + mybox[0][0] ;
+        const int iy_global = iy + mybox[1][0] ;
+        const int iz_global = iz + mybox[2][0] ;
+
+        if ( root_box[0][0] <= ix_global && ix_global < root_box[0][1] &&
+             root_box[1][0] <= iy_global && iy_global < root_box[1][1] &&
+             root_box[2][0] <= iz_global && iz_global < root_box[2][1] ) {
+          /* Global equation number: x fastest, then y, then z. */
+          local_values[i][ ieq ] = ix_global +
+                                   iy_global * nx_global +
+                                   iz_global * nx_global * ny_global ;
+        }
+        else {
+          local_values[i][ ieq ] = -1 ;
+        }
+      }
+    }
+    }
+    }
+  }
+
+  /* Pair-wise compare the local values */
+  /* i  == receiving processor rank */
+  /* ip == sending   processor rank */
+  /* j  == receiving processor data entry for message from 'ip' */
+  /* jp == sending   processor data entry for message to   'i' */
+
+  for ( i = 0 ; i < np ; ++i ) {
+    for ( j = 1 ; j < np ; ++j ) {
+      const int ip = ( i + j ) % np ;
+      const int jp = ( i + np - ip ) % np ;
+      const int nrecv = map_recv_pc[i] [j+1]  - map_recv_pc[i] [j] ;
+      const int nsend = map_send_pc[ip][jp+1] - map_send_pc[ip][jp] ;
+      int k ;
+      if ( nrecv != nsend ) { /* both sides must agree on the message length */
+        fprintf(stderr,"P%d recv %d from P%d\n",i,nrecv,ip);
+        fprintf(stderr,"P%d send %d to   P%d\n",ip,nsend,i);
+        abort();
+      }
+      for ( k = 0 ; k < nrecv ; ++k ) {
+        const int irecv = map_recv_pc[i][j] + k ;
+        const int isend = map_send_pc[ip][jp] + k ;
+        const int val_irecv = local_values[i][irecv] ;
+        const int val_isend = local_values[ip][ map_send_id[ip][isend] ] ;
+        if ( val_irecv != val_isend ) { /* k-th entry must be the same global cell */
+          fprintf(stderr,"P%d recv[%d] = %d , from P%d\n",i,k,val_irecv,ip);
+          fprintf(stderr,"P%d send[%d] = %d , to   P%d\n",ip,k,val_isend,i);
+          abort();
+        }
+      }
+    }
+  }
+  /* Release the per-rank arrays first, then the tables that held them. */
+  for ( i = 0 ; i < np ; ++i ) {
+    free( map_local_id[i] );
+    free( map_recv_pc[i] );
+    free( map_send_pc[i] );
+    free( map_send_id[i] );
+    free( local_values[i] );
+  }
+  free( map_send_id );
+  free( map_send_pc );
+  free( map_recv_pc );
+  free( map_local_id );
+  free( local_values );
+}
+
+/*--------------------------------------------------------------------*/
+
+int main( int argc , char * argv[] )
+{
+  int np_max = 256 ;
+  Box box = { 0 , 64 , 0 , 64 , 0 , 64 };
+  int np = 0 ;
+  /* Two arguments run one case; otherwise sweep np = 1..np_max on 64^3. */
+  switch( argc ) {
+  case 3:
+    if ( 1 != sscanf(argv[1],"%d",&np) || /* reject input that does not parse */
+         3 != sscanf(argv[2],"%dx%dx%d",& box[0][1] , & box[1][1] , & box[2][1] ) )
+      { fprintf(stderr,"usage: %s [np NXxNYxNZ]\n",argv[0]); return 1 ; }
+    if ( 0 < np ) { test_box( box , np ); test_maps( box , np ); }
+    break ;
+  default:
+    for ( np = 1 ; np <= np_max ; ++np ) {
+      test_box( box , np );
+      test_maps( box , np );
+    }
+    break ;
+  }
+  return 0 ;
+}
+
+#endif
+
+
diff --git a/openmp-avx512/basic/BoxPartition.hpp b/openmp-avx512/basic/BoxPartition.hpp
new file mode 100644
index 0000000..4359a16
--- /dev/null
+++ b/openmp-avx512/basic/BoxPartition.hpp
@@ -0,0 +1,76 @@
+#ifndef _BoxPartition_hpp_
+#define _BoxPartition_hpp_
+
+#include <Box.hpp>
+
+/** \brief Recursively split a box into (up-ip) sub-boxes
+ */
+void box_partition( int ip , int up , int axis ,
+                    const Box& box ,
+                    Box* p_box );
+
+/** \brief  Partition a { [ix,jx) X [iy,jy) X [iz,jz) } box.
+ *
+ *  Use recursive coordinate bisection to partition a box
+ *  into np disjoint sub-boxes.  Allocate (via new[]) and
+ *  populate the sub-boxes, and allocate (via malloc) the map
+ *  from local (x,y,z) to a local ordinal and the send-recv
+ *  message maps used to update the ghost cells.
+ *
+ *  usage:
+ *
+ *  my_nx = pbox[my_p][0][1] - pbox[my_p][0][0] ;
+ *  my_ny = pbox[my_p][1][1] - pbox[my_p][1][0] ;
+ *  my_nz = pbox[my_p][2][1] - pbox[my_p][2][0] ;
+ *
+ *  for ( x = -ghost ; x < my_nx + ghost ; ++x ) {
+ *  for ( y = -ghost ; y < my_ny + ghost ; ++y ) {
+ *  for ( z = -ghost ; z < my_nz + ghost ; ++z ) {
+ *    const int x_global = x + pbox[my_p][0][0] ;
+ *    const int y_global = y + pbox[my_p][1][0] ;
+ *    const int z_global = z + pbox[my_p][2][0] ;
+ *
+ *    const int local_ordinal =
+ *      box_map_local( pbox[my_p], ghost, map_local_id, x, y, z );
+ *
+ *    if ( 0 <= local_ordinal ) {
+ *    }
+ *  }}}
+ *  
+ *  for ( i = 1 ; i < np ; ++i ) {
+ *    const int recv_processor = ( my_p + i ) % np ;
+ *    const int recv_ordinal_begin = map_recv_pc[i];
+ *    const int recv_ordinal_end   = map_recv_pc[i+1];
+ *  }
+ *
+ *  for ( i = 1 ; i < np ; ++i ) {
+ *    const int send_processor = ( my_p + i ) % np ;
+ *    const int send_map_begin = map_send_pc[i];
+ *    const int send_map_end   = map_send_pc[i+1];
+ *    for ( j = send_map_begin ; j < send_map_end ; ++j ) {
+ *      send_ordinal = map_send_id[j] ;
+ *    }
+ *  }
+ */
+void box_partition_rcb( 
+  const int np            /**< [in]  Number of partitions */ ,
+  const int my_p          /**< [in]  My partition rank    */ ,
+  const Box& root_box     /**< [in]  3D Box to partition  */ ,
+  const int ghost         /**< [in]  Ghost cell boundary  */ ,
+  Box** pbox              /**< [out] Partition's 3D boxes (array of np) */ ,
+  int ** map_local_id     /**< [out] Map local cells */ ,
+  int ** map_recv_pc      /**< [out] Receive spans per processor */ ,
+  int ** map_send_pc      /**< [out] Send prefix counts per processor */ ,
+  int ** map_send_id      /**< [out] Send message ordinals */ );
+
+/** \brief  Map a local (x,y,z) to a local ordinal.
+ */
+int box_map_local( const Box& box_local ,
+                   const int ghost ,
+                   const int map_local_id[] ,
+                   const int local_x ,
+                   const int local_y ,
+                   const int local_z );
+
+#endif
+
diff --git a/openmp-avx512/basic/CSRMatrix.hpp b/openmp-avx512/basic/CSRMatrix.hpp
new file mode 100644
index 0000000..9cfeaee
--- /dev/null
+++ b/openmp-avx512/basic/CSRMatrix.hpp
@@ -0,0 +1,139 @@
+#ifndef _CSRMatrix_hpp_
+#define _CSRMatrix_hpp_
+
+//@HEADER
+// ************************************************************************
+// 
+//               miniFE: simple finite-element assembly and linear-solve
+//                 Copyright (2006) Sandia Corporation
+// 
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+// 
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//  
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//  
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+
+#include <cstddef>
+#include <vector>
+#include <algorithm>
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+namespace miniFE {
+
+template<typename Scalar,
+         typename LocalOrdinal,
+         typename GlobalOrdinal,
+         typename ComputeNode>
+struct
+CSRMatrix {
+  CSRMatrix(ComputeNode& comp_node)
+   : has_local_indices(false),
+     rows(), row_offsets(), row_offsets_external(),
+     packed_cols(), packed_coefs(),
+     num_cols(0),
+     compute_node(comp_node)
+#ifdef HAVE_MPI
+     ,external_index(), external_local_index(), elements_to_send(),
+      neighbors(), recv_length(), send_length(), send_buffer(), request()
+#endif
+  {
+  }
+
+  ~CSRMatrix()
+  {}
+
+  typedef Scalar        ScalarType;
+  typedef LocalOrdinal  LocalOrdinalType;
+  typedef GlobalOrdinal GlobalOrdinalType;
+  typedef ComputeNode   ComputeNodeType;
+  // Compressed-row storage: row i owns packed entries [row_offsets[i], row_offsets[i+1]).
+  bool                       has_local_indices;
+  std::vector<GlobalOrdinal> rows;
+  std::vector<LocalOrdinal>  row_offsets;
+  std::vector<LocalOrdinal>  row_offsets_external;
+  std::vector<GlobalOrdinal> packed_cols;
+  std::vector<Scalar>        packed_coefs;
+  LocalOrdinal               num_cols;
+  ComputeNode&               compute_node;
+
+#ifdef HAVE_MPI
+  std::vector<GlobalOrdinal> external_index;
+  std::vector<GlobalOrdinal>  external_local_index;
+  std::vector<GlobalOrdinal> elements_to_send;
+  std::vector<int>           neighbors;
+  std::vector<LocalOrdinal>  recv_length;
+  std::vector<LocalOrdinal>  send_length;
+  std::vector<Scalar>        send_buffer;
+  std::vector<MPI_Request>   request;
+#endif
+  // Stored entry count: the last row offset, or 0 before any offsets exist.
+  size_t num_nonzeros() const
+  {
+    return row_offsets.empty() ? 0 : row_offsets.back();
+  }
+  // Size the row arrays for nrows rows and reserve nrows*ncols_per_row entries.
+  void reserve_space(unsigned nrows, unsigned ncols_per_row)
+  {
+    rows.resize(nrows);
+    row_offsets.resize(nrows+1);
+    packed_cols.reserve(static_cast<size_t>(nrows) * ncols_per_row);
+    packed_coefs.reserve(static_cast<size_t>(nrows) * ncols_per_row);
+  }
+  // Find global 'row'; if it is not local, row_length is 0 and cols/coefs are untouched.
+  void get_row_pointers(GlobalOrdinalType row, size_t& row_length,
+                        GlobalOrdinalType*& cols,
+                        ScalarType*& coefs)
+  {
+    ptrdiff_t local_row = -1;
+    //first see if we can get the local-row index using fast direct lookup:
+    if (rows.size() >= 1) {
+      ptrdiff_t idx = row - rows[0];
+      if (idx >= 0 && static_cast<size_t>(idx) < rows.size() && rows[idx] == row) {
+        local_row = idx;
+      }
+    }
+ 
+    //if we didn't get the local-row index using direct lookup, try a
+    //more expensive binary-search:
+    if (local_row == -1) {
+      typename std::vector<GlobalOrdinal>::iterator row_iter =
+          std::lower_bound(rows.begin(), rows.end(), row);
+  
+      //if we still haven't found row, it's not local so jump out:
+      if (row_iter == rows.end() || *row_iter != row) {
+        row_length = 0;
+        return;
+      }
+  
+      local_row = row_iter - rows.begin();
+    }
+
+    LocalOrdinalType offset = row_offsets[local_row];
+    row_length = row_offsets[local_row+1] - offset;
+    cols = &packed_cols[offset];
+    coefs = &packed_coefs[offset];
+  }
+};
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/basic/ComputeNodeType.hpp b/openmp-avx512/basic/ComputeNodeType.hpp
new file mode 100644
index 0000000..e59f3eb
--- /dev/null
+++ b/openmp-avx512/basic/ComputeNodeType.hpp
@@ -0,0 +1,29 @@
+#ifndef _ComputeNodeType_hpp_
+#define _ComputeNodeType_hpp_
+
+#if defined(MINIFE_HAVE_TBB)
+
+#include <tbb/task_scheduler_init.h>
+#include <TBBNode.hpp>
+typedef TBBNode ComputeNodeType;
+
+#elif defined(MINIFE_HAVE_TPI)
+
+#include <TPI.h>
+#include <TPINode.hpp>
+typedef TPINode ComputeNodeType;
+
+#elif defined(MINIFE_HAVE_CUDA)
+
+#include <CudaNode.hpp>
+typedef CUDANode ComputeNodeType;
+
+#else
+
+#include <SerialComputeNode.hpp>
+typedef SerialComputeNode ComputeNodeType;
+
+#endif
+
+#endif
+
diff --git a/openmp-avx512/basic/DotOp.hpp b/openmp-avx512/basic/DotOp.hpp
new file mode 100644
index 0000000..6471949
--- /dev/null
+++ b/openmp-avx512/basic/DotOp.hpp
@@ -0,0 +1,35 @@
+#ifndef DOTOP_HPP_
+#define DOTOP_HPP_
+
+#ifndef KERNEL_PREFIX
+#define KERNEL_PREFIX
+#endif
+// Reduction functor whose i-th contribution is x[i]*y[i]; terms combine by '+'.
+template <class Scalar>
+struct DotOp {
+  typedef Scalar ReductionType;
+  const Scalar* x;      // first operand; null until the caller sets it
+  const Scalar* y;      // second operand; null until the caller sets it
+  size_t n;             // presumably the length of x and y (not read here)
+  ReductionType result; // starts at identity()
+
+  // Null operands, zero length and an identity() result.
+  inline DotOp() : x(0), y(0), n(0), result(identity()) {}
+
+  static inline KERNEL_PREFIX ReductionType identity()
+  {
+    return 0.0;
+  }
+
+  inline KERNEL_PREFIX ReductionType reduce(ReductionType u, ReductionType v) const
+  {
+    return u+v;
+  }
+
+  inline KERNEL_PREFIX Scalar generate(int i) const
+  {
+    return x[i]*y[i];
+  }
+};
+
+#endif
diff --git a/openmp-avx512/basic/ELLMatrix.hpp b/openmp-avx512/basic/ELLMatrix.hpp
new file mode 100644
index 0000000..97b662f
--- /dev/null
+++ b/openmp-avx512/basic/ELLMatrix.hpp
@@ -0,0 +1,144 @@
+#ifndef _ELLMatrix_hpp_
+#define _ELLMatrix_hpp_
+
+//@HEADER
+// ************************************************************************
+// 
+//               miniFE: simple finite-element assembly and linear-solve
+//                 Copyright (2006) Sandia Corporation
+// 
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+// 
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//  
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//  
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+
+#include <cstddef>
+#include <vector>
+#include <algorithm>
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+namespace miniFE {
+
+template<typename Scalar,
+         typename LocalOrdinal,
+         typename GlobalOrdinal,
+         typename ComputeNode>
+struct
+ELLMatrix {
+  ELLMatrix(ComputeNode& comp_node)
+   : has_local_indices(false),
+     rows(),
+     cols(), coefs(),
+     num_cols(0),
+     num_cols_per_row(0),
+     compute_node(comp_node)
+#ifdef HAVE_MPI
+     ,external_index(), external_local_index(), elements_to_send(),
+      neighbors(), recv_length(), send_length(), send_buffer(), request()
+#endif
+  {
+  }
+
+  ~ELLMatrix()
+  {}
+
+  typedef Scalar        ScalarType;
+  typedef LocalOrdinal  LocalOrdinalType;
+  typedef GlobalOrdinal GlobalOrdinalType;
+  typedef ComputeNode   ComputeNodeType;
+  // Fixed-width row storage: row i owns entries [i*num_cols_per_row, (i+1)*num_cols_per_row).
+  bool                       has_local_indices;
+  std::vector<GlobalOrdinal> rows;
+  std::vector<GlobalOrdinal> cols;
+  std::vector<Scalar>        coefs;
+  LocalOrdinal               num_cols;
+  LocalOrdinal               num_cols_per_row;
+  ComputeNode&               compute_node;
+
+#ifdef HAVE_MPI
+  std::vector<GlobalOrdinal> external_index;
+  std::vector<GlobalOrdinal>  external_local_index;
+  std::vector<GlobalOrdinal> elements_to_send;
+  std::vector<int>           neighbors;
+  std::vector<LocalOrdinal>  recv_length;
+  std::vector<LocalOrdinal>  send_length;
+  std::vector<Scalar>        send_buffer;
+  std::vector<MPI_Request>   request;
+#endif
+  // Number of stored slots (padding included): rows times slots per row.
+  size_t num_nonzeros() const
+  {
+    return rows.size()*num_cols_per_row;
+  }
+  // Size every array for nrows rows of ncols_per_row slots each.
+  void reserve_space(unsigned nrows, unsigned ncols_per_row)
+  {
+    rows.resize(nrows);
+    cols.resize(nrows * ncols_per_row);
+    coefs.resize(nrows * ncols_per_row);
+    num_cols_per_row = ncols_per_row;
+  }
+  // Find global 'row'; if it is not local, row_length is 0 and the pointers are untouched.
+  void get_row_pointers(GlobalOrdinalType row, size_t& row_length,
+                        GlobalOrdinalType*& cols_ptr,
+                        ScalarType*& coefs_ptr)
+  {
+    ptrdiff_t local_row = -1;
+    //first see if we can get the local-row index using fast direct lookup:
+    if (rows.size() >= 1) {
+      ptrdiff_t idx = row - rows[0];
+      if (idx >= 0 && static_cast<size_t>(idx) < rows.size() && rows[idx] == row) {
+        local_row = idx;
+      }
+    }
+ 
+    //if we didn't get the local-row index using direct lookup, try a
+    //more expensive binary-search:
+    if (local_row == -1) {
+      typename std::vector<GlobalOrdinal>::iterator row_iter =
+          std::lower_bound(rows.begin(), rows.end(), row);
+  
+      //if we still haven't found row, it's not local so jump out:
+      if (row_iter == rows.end() || *row_iter != row) {
+        row_length = 0;
+        return;
+      }
+  
+      local_row = row_iter - rows.begin();
+    }
+    if (num_cols_per_row <= 0) { row_length = 0; return; } //no slots: nothing to index
+    cols_ptr = &cols[local_row*num_cols_per_row];
+    coefs_ptr = &coefs[local_row*num_cols_per_row];
+    //trailing slots whose column index is 0 are not counted in row_length:
+    int idx = num_cols_per_row-1;
+    while(idx>=0) {
+      if (cols_ptr[idx] != 0) break;
+      --idx;
+    }
+    row_length = idx+1;
+  }
+};
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/basic/FEComputeElem.hpp b/openmp-avx512/basic/FEComputeElem.hpp
new file mode 100644
index 0000000..03aa8a2
--- /dev/null
+++ b/openmp-avx512/basic/FEComputeElem.hpp
@@ -0,0 +1,29 @@
+#ifndef FECOMPUTEELEM_HPP_
+#define FECOMPUTEELEM_HPP_
+
+#include <Hex8.hpp>
+
+#ifndef KERNEL_PREFIX 
+#define KERNEL_PREFIX
+#endif
+
+// Per-element kernel: calls the Hex8 diffusion-matrix and source-vector routines for element i.
+template<typename GlobalOrdinal,typename Scalar>
+struct FEComputeElem {
+  Scalar* elem_node_coords;       // nnodes*dim values per element
+  Scalar* elem_diffusion_matrix;  // nnodes*nnodes values per element
+  Scalar* elem_source_vector;     // nnodes values per element
+  inline KERNEL_PREFIX void operator()(int i)
+  {
+    unsigned nodes_per_elem = miniFE::Hex8::numNodesPerElem;
+    unsigned spatial_dim    = miniFE::Hex8::spatialDim;
+    // Element i owns one contiguous slice of each packed array.
+    Scalar* xyz = elem_node_coords      + i*nodes_per_elem*spatial_dim;
+    Scalar* mat = elem_diffusion_matrix + i*nodes_per_elem*nodes_per_elem;
+    Scalar* rhs = elem_source_vector    + i*nodes_per_elem;
+    miniFE::Hex8::diffusionMatrix(xyz, mat);
+    miniFE::Hex8::sourceVector(xyz, rhs);
+  }
+};
+
+#endif
diff --git a/openmp-avx512/basic/FusedMatvecDotOp.hpp b/openmp-avx512/basic/FusedMatvecDotOp.hpp
new file mode 100644
index 0000000..e4b59e4
--- /dev/null
+++ b/openmp-avx512/basic/FusedMatvecDotOp.hpp
@@ -0,0 +1,59 @@
+#ifndef FUSEDMATVECDOTOP_HPP_
+#define FUSEDMATVECDOTOP_HPP_
+
+#ifndef KERNEL_PREFIX
+#define KERNEL_PREFIX
+#endif
+// Fused kernel: generate(row) sets y[row] = beta*y[row] + (A*x)[row], returns x[row]*y[row].
+template <typename MatrixType,
+          typename VectorType>
+struct FusedMatvecDotOp {
+
+  typedef typename VectorType::GlobalOrdinalType GlobalOrdinalType;
+  typedef typename VectorType::LocalOrdinalType LocalOrdinalType;
+  typedef typename VectorType::ScalarType ScalarType;
+  typedef ScalarType ReductionType;
+
+  size_t n;
+  // Compressed-row arrays: row r owns entries [Arowoffsets[r], Arowoffsets[r+1]).
+  const LocalOrdinalType*  Arowoffsets;
+  const GlobalOrdinalType* Acols;
+  const ScalarType*        Acoefs;
+
+  const ScalarType* x;
+        ScalarType* y;
+  ScalarType beta;
+
+  ReductionType result;
+  // Null pointers, zero n and beta, identity() result; the caller fills the fields in.
+  inline FusedMatvecDotOp()
+   : n(0), Arowoffsets(0), Acols(0), Acoefs(0), x(0), y(0), beta(0), result(identity())
+  {}
+
+  static inline KERNEL_PREFIX ReductionType identity()
+  {
+    return 0.0;
+  }
+
+  inline KERNEL_PREFIX ReductionType reduce(ReductionType u, ReductionType v) const
+  {
+    return u+v;
+  }
+
+  inline KERNEL_PREFIX ScalarType generate(int row)
+  {
+    //we count on the caller (ComputeNode) to pass in 'row'
+    //in range 0..n-1
+  
+    ScalarType sum = beta*y[row];
+
+    for(LocalOrdinalType i=Arowoffsets[row]; i<Arowoffsets[row+1]; ++i) {
+      sum += Acoefs[i]*x[Acols[i]];
+    }
+
+    y[row] = sum;
+    return x[row]*sum;
+  }
+};
+
+#endif
diff --git a/openmp-avx512/basic/GetNodesCoords.hpp b/openmp-avx512/basic/GetNodesCoords.hpp
new file mode 100644
index 0000000..01ed26a
--- /dev/null
+++ b/openmp-avx512/basic/GetNodesCoords.hpp
@@ -0,0 +1,24 @@
+#ifndef _GETNODESCOORDS_HPP_
+#define _GETNODESCOORDS_HPP_
+
+#include <Hex8_enums.hpp>
+#include <simple_mesh_description.hpp>
+
+// Per-element kernel: calls get_elem_nodes_and_coords for the i-th listed element.
+template<typename GlobalOrdinal,typename Scalar>
+struct GetNodesCoords {
+  const miniFE::simple_mesh_description<GlobalOrdinal>* mesh;
+  GlobalOrdinal* elemIDs;        // element ids, indexed by i
+  GlobalOrdinal* node_ordinals;  // nnodes slots per element; presumably filled by the helper
+  Scalar* elem_node_coords;      // nnodes*spatialDim slots per element; presumably filled too
+  inline void operator()(int i)
+  {
+    unsigned nodes_per_elem = miniFE::Hex8::numNodesPerElem;
+    GlobalOrdinal elem = elemIDs[i];
+    GlobalOrdinal* ids_slice = node_ordinals + i*nodes_per_elem;
+    Scalar* xyz_slice = elem_node_coords + i*nodes_per_elem*miniFE::Hex8::spatialDim;
+    get_elem_nodes_and_coords(*mesh, elem, ids_slice, xyz_slice);
+  }
+};
+
+#endif
diff --git a/openmp-avx512/basic/Hex8_box_utils.hpp b/openmp-avx512/basic/Hex8_box_utils.hpp
new file mode 100644
index 0000000..c1662ec
--- /dev/null
+++ b/openmp-avx512/basic/Hex8_box_utils.hpp
@@ -0,0 +1,174 @@
+#ifndef _Hex8_box_utils_hpp_
+#define _Hex8_box_utils_hpp_
+
+//@HEADER
+// ************************************************************************
+// 
+//               miniFE: simple finite-element assembly and linear-solve
+//                 Copyright (2006) Sandia Corporation
+// 
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+// 
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//  
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//  
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+
+#include <stdexcept>
+
+#include <box_utils.hpp>
+#include <ElemData.hpp>
+#include <simple_mesh_description.hpp>
+#include <Hex8.hpp>
+
+namespace miniFE {
+
+
+//Given box dimensions nx, ny and a starting node (local-node-0 for a hex8),
+//write the 8 nodes of the hex8 to elem_node_ids in exodus ordering.
+template<typename GlobalOrdinal>
+void get_hex8_node_ids(int nx, int ny,
+                       GlobalOrdinal node0, GlobalOrdinal* elem_node_ids)
+{
+  const GlobalOrdinal row = nx;
+  const GlobalOrdinal layer = row*ny; //widened: nx*ny in 'int' can overflow
+  elem_node_ids[0] = node0;
+  elem_node_ids[1] = node0 + 1;
+  elem_node_ids[2] = node0 + row + 1;
+  elem_node_ids[3] = node0 + row;
+  elem_node_ids[4] = node0 +       layer;
+  elem_node_ids[5] = node0 + 1   + layer;
+  elem_node_ids[6] = node0 + row + layer + 1;
+  elem_node_ids[7] = node0 + row + layer;
+}
+
+//Writes the 24 corner coordinates of an axis-aligned hex8 to
+//elem_node_coords, as 8 consecutive (x,y,z) triples.
+//elem_node_coords must have room for 24 values.
+//Input: x,y,z are the coordinates of local-node 0 for a Hex8.
+//'hx', 'hy', 'hz' are the lengths of the sides of the element
+//in each direction.
+template<typename Scalar>
+void get_hex8_node_coords_3d(Scalar x, Scalar y, Scalar z,
+                             Scalar hx, Scalar hy, Scalar hz,
+                             Scalar* elem_node_coords)
+{
+  //Coordinates of the element faces opposite local-node 0.
+  const Scalar x_far = x + hx;
+  const Scalar y_far = y + hy;
+  const Scalar z_far = z + hz;
+
+  //One row per local node, in output order: the four nodes at z first,
+  //then the four nodes at z+hz, each group visiting
+  //(x,y), (x+hx,y), (x+hx,y+hy), (x,y+hy).
+  const Scalar corners[8][3] = {
+    { x,     y,     z     },
+    { x_far, y,     z     },
+    { x_far, y_far, z     },
+    { x,     y_far, z     },
+
+    { x,     y,     z_far },
+    { x_far, y,     z_far },
+    { x_far, y_far, z_far },
+    { x,     y_far, z_far }
+  };
+
+  //Flatten the table into the caller's array, one coordinate at a time.
+  Scalar* out = elem_node_coords;
+  for(int node=0; node<8; ++node) {
+    for(int dim=0; dim<3; ++dim) {
+      *out = corners[node][dim];
+      ++out;
+    }
+  }
+
+}
+
+//Computes the hex8 node ids of element 'elemID' (mapped to rows through mesh.map_id_to_row)
+//into node_ords and its node coordinates into node_coords. Throws std::runtime_error if elemID < 0.
+template<typename GlobalOrdinal, typename Scalar>
+void get_elem_nodes_and_coords(const simple_mesh_description<GlobalOrdinal>& mesh,
+                               GlobalOrdinal elemID,
+                               GlobalOrdinal* node_ords, Scalar* node_coords)
+{
+  int global_nodes_x = mesh.global_box[0][1]+1;
+  int global_nodes_y = mesh.global_box[1][1]+1;
+  int global_nodes_z = mesh.global_box[2][1]+1;
+
+  if (elemID < 0) {
+    //I don't think this can happen, but check for the sake of paranoia...
+    throw std::runtime_error("get_elem_nodes_and_coords ERROR, negative elemID");
+  }
+
+  int elem_int_x, elem_int_y, elem_int_z;
+  get_int_coords(elemID, global_nodes_x-1, global_nodes_y-1, global_nodes_z-1,
+             elem_int_x, elem_int_y, elem_int_z);
+  GlobalOrdinal nodeID = get_id<GlobalOrdinal>(global_nodes_x, global_nodes_y, global_nodes_z, elem_int_x, elem_int_y, elem_int_z);
+
+#ifdef MINIFE_DEBUG
+  std::cout<<"\nelemID: "<<elemID<<", nodeID: "<<nodeID<<std::endl;
+#endif
+  get_hex8_node_ids(global_nodes_x, global_nodes_y, nodeID, node_ords);
+
+  //Map node-IDs to rows because each processor may have a non-contiguous block of
+  //node-ids, but needs a contiguous block of row-numbers:
+#ifdef MINIFE_DEBUG
+  std::cout<<"elem "<<elemID<<" nodes: ";
+#endif
+  for(int i=0; i<Hex8::numNodesPerElem; ++i) {
+#ifdef MINIFE_DEBUG
+    std::cout<<node_ords[i]<<" ";
+#endif
+    node_ords[i] = mesh.map_id_to_row(node_ords[i]);
+  }
+#ifdef MINIFE_DEBUG
+  std::cout << std::endl;
+#endif
+
+  int global_elems_x = mesh.global_box[0][1];
+  int global_elems_y = mesh.global_box[1][1];
+  int global_elems_z = mesh.global_box[2][1];
+  Scalar ix,iy,iz;
+  get_coords<GlobalOrdinal,Scalar>(nodeID, global_nodes_x,global_nodes_y,global_nodes_z,
+                            ix,iy,iz);
+  Scalar hx = 1.0/global_elems_x;
+  Scalar hy = 1.0/global_elems_y;
+  Scalar hz = 1.0/global_elems_z;
+  get_hex8_node_coords_3d(ix, iy, iz, hx, hy, hz, node_coords);
+#ifdef MINIFE_DEBUG
+  for(int i=0; i<Hex8::numNodesPerElem; ++i) {
+    const Scalar* c = node_coords + 3*i; //explicit indices: chained offset++ has no fixed order before C++17
+    std::cout << "("<<c[0]<<","<<c[1]<<","<<c[2]<<")";
+  }
+  std::cout << std::endl;
+#endif
+}
+
+//Convenience overload: fills the id and coordinate arrays held by 'elem_data'.
+template<typename GlobalOrdinal, typename Scalar>
+void get_elem_nodes_and_coords(const simple_mesh_description<GlobalOrdinal>& mesh,
+                               GlobalOrdinal elemID, ElemData<GlobalOrdinal,Scalar>& elem_data)
+{
+  GlobalOrdinal* const ids = elem_data.elem_node_ids;
+  get_elem_nodes_and_coords(mesh, elemID, ids, elem_data.elem_node_coords);
+}
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/basic/Lock.hpp b/openmp-avx512/basic/Lock.hpp
new file mode 100644
index 0000000..16be86f
--- /dev/null
+++ b/openmp-avx512/basic/Lock.hpp
@@ -0,0 +1,103 @@
+#ifndef _Lock_hpp_
+#define _Lock_hpp_
+
+//@HEADER
+// ************************************************************************
+// 
+//               miniFE: simple finite-element assembly and linear-solve
+//                 Copyright (2006) Sandia Corporation
+// 
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+// 
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//  
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//  
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+
+#ifdef MINIFE_HAVE_TBB
+
+#include <iostream>
+#include <tbb/atomic.h>
+
+namespace miniFE {
+
+static tbb::atomic<size_t> miniFE_num_matrix_conflicts;
+static tbb::atomic<size_t> miniFE_num_vector_conflicts;
+
+//We have two lock classes, LockM and LockV. The only reason for
+//this is so that they can separately track the number of conflicts
+//for matrix accesses versus vector accesses (by incrementing the
+//above counters).
+//The LockingMatrix class uses LockM, LockingVector uses LockV.
+
+//Scoped spin lock on one row flag (0 = free, 1 = held), released by the
+//destructor. Contended acquisitions bump miniFE_num_matrix_conflicts.
+template<typename T>
+class LockM {
+public:
+   LockM(tbb::atomic<T>& row)
+       : locked_row_(row)
+   {
+     //Acquire by compare-and-swap 0 -> 1. Incrementing and then waiting for
+     //the value 1 can hang: with two waiters the count never returns to 1.
+     if (locked_row_.compare_and_swap(1, 0) != 0) {
+       while(locked_row_.compare_and_swap(1, 0) != 0) { /*spin*/ }
+       ++miniFE_num_matrix_conflicts;
+     }
+   }
+   ~LockM()
+   { locked_row_ = 0; }
+
+private:
+   tbb::atomic<T>& locked_row_;
+   LockM(const LockM&);
+   LockM& operator=(const LockM&);
+};
+
+//Scoped spin lock on one row flag (0 = free, 1 = held), released by the
+//destructor. Contended acquisitions bump miniFE_num_vector_conflicts.
+template<typename T>
+class LockV {
+public:
+   LockV(tbb::atomic<T>& row)
+       : locked_row_(row)
+   {
+     //Acquire by compare-and-swap 0 -> 1. Incrementing and then waiting for
+     //the value 1 can hang: with two waiters the count never returns to 1.
+     if (locked_row_.compare_and_swap(1, 0) != 0) {
+       while(locked_row_.compare_and_swap(1, 0) != 0) { /*spin*/ }
+       ++miniFE_num_vector_conflicts;
+     }
+   }
+   ~LockV()
+   { locked_row_ = 0; }
+
+private:
+   tbb::atomic<T>& locked_row_;
+   LockV(const LockV&);
+   LockV& operator=(const LockV&);
+};
+
+}//namespace miniFE
+
+#else
+#error "ERROR, this file shouldn't be compiled if MINIFE_HAVE_TBB isn't defined."
+#endif
+
+#endif
+
diff --git a/openmp-avx512/basic/LockingMatrix.hpp b/openmp-avx512/basic/LockingMatrix.hpp
new file mode 100644
index 0000000..c278274
--- /dev/null
+++ b/openmp-avx512/basic/LockingMatrix.hpp
@@ -0,0 +1,74 @@
+#ifndef _LockingMatrix_hpp_
+#define _LockingMatrix_hpp_
+
+//@HEADER
+// ************************************************************************
+// 
+//               miniFE: simple finite-element assembly and linear-solve
+//                 Copyright (2006) Sandia Corporation
+// 
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+// 
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//  
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//  
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+
+#include <vector>
+
+#include <Lock.hpp>
+
+namespace miniFE {
+
+//Wraps a matrix so that sum_in() calls on the same row are serialized through a per-row LockM.
+template<typename MatrixType>
+class LockingMatrix {
+public:
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal;
+  typedef typename MatrixType::ScalarType Scalar;
+  LockingMatrix(MatrixType& A) : A_(A), myFirstRow_(0), myLastRow_(0), numMyRows_(0), row_locks_()
+  {
+    if (A_.rows.size() > 0) {
+      myFirstRow_ = A_.rows[0];
+      myLastRow_ = A_.rows[A_.rows.size()-1];
+      numMyRows_ = myLastRow_-myFirstRow_+1; //an empty matrix keeps 0 rows (and no locks)
+    }
+    row_locks_.resize(numMyRows_);
+  }
+  //Sums 'values' into 'row' under that row's lock; rows outside [myFirstRow_, myLastRow_] are ignored.
+  void sum_in(GlobalOrdinal row, size_t row_len, const GlobalOrdinal* col_indices, const Scalar* values)
+  {
+    //Keep the difference in GlobalOrdinal: an 'int' could truncate it.
+    const GlobalOrdinal local_row = row - myFirstRow_;
+    if (local_row >= 0 && static_cast<size_t>(local_row) < numMyRows_) {
+      LockM<int> lock(row_locks_[local_row]);
+      sum_into_row(row, row_len, col_indices, values, A_);
+    }
+  }
+private:
+  MatrixType& A_;
+  GlobalOrdinal myFirstRow_;
+  GlobalOrdinal myLastRow_;
+  size_t numMyRows_;
+  std::vector<tbb::atomic<int> > row_locks_;
+};
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/basic/LockingVector.hpp b/openmp-avx512/basic/LockingVector.hpp
new file mode 100644
index 0000000..60f7598
--- /dev/null
+++ b/openmp-avx512/basic/LockingVector.hpp
@@ -0,0 +1,77 @@
+#ifndef _LockingVector_hpp_
+#define _LockingVector_hpp_
+
+//@HEADER
+// ************************************************************************
+// 
+//               miniFE: simple finite-element assembly and linear-solve
+//                 Copyright (2006) Sandia Corporation
+// 
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+// 
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//  
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//  
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+
+#include <vector>
+
+#include <Lock.hpp>
+
+namespace miniFE {
+
+//Wraps a vector so that sum_in() updates of the same entry are serialized through a per-entry LockV.
+template<typename VectorType>
+class LockingVector {
+public:
+  typedef typename VectorType::GlobalOrdinalType GlobalOrdinal;
+  typedef typename VectorType::ScalarType Scalar;
+  LockingVector(VectorType& x) : x_(x), myFirstRow_(0), myLastRow_(0), numMyRows_(0), row_locks_()
+  {
+    if (x_.local_size > 0) {
+      myFirstRow_ = x_.startIndex;
+      myLastRow_ = myFirstRow_ + x_.local_size - 1;
+      numMyRows_ = myLastRow_-myFirstRow_+1; //an empty vector keeps 0 rows (and no locks)
+    }
+    row_locks_.resize(numMyRows_);
+  }
+  //Sums each values[i] into entry indices[i] under that entry's lock; entries outside the local range are ignored.
+  void sum_in(size_t num_indices, const GlobalOrdinal* indices, const Scalar* values)
+  {
+    for(size_t i=0; i<num_indices; ++i) {
+      GlobalOrdinal row = indices[i];
+      const GlobalOrdinal local_row = row - myFirstRow_; //kept in GlobalOrdinal: 'int' could truncate
+      if (local_row >= 0 && static_cast<size_t>(local_row) < numMyRows_) {
+        LockV<int> lock(row_locks_[local_row]);
+        sum_into_vector(1, &row, &values[i], x_);
+      }
+    }
+  }
+
+private:
+  VectorType& x_;
+  GlobalOrdinal myFirstRow_;
+  GlobalOrdinal myLastRow_;
+  size_t numMyRows_;
+  std::vector<tbb::atomic<int> > row_locks_;
+};
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/basic/MatrixCopyOp.hpp b/openmp-avx512/basic/MatrixCopyOp.hpp
new file mode 100644
index 0000000..f6c300a
--- /dev/null
+++ b/openmp-avx512/basic/MatrixCopyOp.hpp
@@ -0,0 +1,33 @@
+#ifndef _MatrixCopyOp_hpp_
+#define _MatrixCopyOp_hpp_
+
+//Functor: copies row i's id, offset and its [offset[i],offset[i+1]) entries from src_* to dest_*.
+template<typename MatrixType>
+struct MatrixCopyOp {
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinalType;
+  typedef typename MatrixType::LocalOrdinalType  LocalOrdinalType;
+  typedef typename MatrixType::ScalarType        ScalarType;
+  const GlobalOrdinalType* src_rows;
+  const LocalOrdinalType*  src_rowoffsets;
+  const GlobalOrdinalType* src_cols;
+  const ScalarType*        src_coefs;
+  GlobalOrdinalType* dest_rows;
+  LocalOrdinalType*  dest_rowoffsets;
+  GlobalOrdinalType* dest_cols;
+  ScalarType*        dest_coefs;
+  int n;
+
+  inline void operator()(int i)
+  {
+    dest_rows[i]       = src_rows[i];
+    dest_rowoffsets[i] = src_rowoffsets[i];
+    int entry = src_rowoffsets[i];
+    for(; entry < src_rowoffsets[i+1]; ++entry) {
+      dest_cols[entry]  = src_cols[entry];
+      dest_coefs[entry] = src_coefs[entry];
+    }
+  }
+};
+
+#endif
+
diff --git a/openmp-avx512/basic/MatrixInitOp.hpp b/openmp-avx512/basic/MatrixInitOp.hpp
new file mode 100644
index 0000000..0ab9048
--- /dev/null
+++ b/openmp-avx512/basic/MatrixInitOp.hpp
@@ -0,0 +1,183 @@
+#ifndef _MatrixInitOp_hpp_
+#define _MatrixInitOp_hpp_
+
+#include <simple_mesh_description.hpp>
+#include <box_utils.hpp>
+#include <ComputeNodeType.hpp>
+
+#include <CSRMatrix.hpp>
+#include <ELLMatrix.hpp>
+
+#include <algorithm>
+
+//Sorts list[0..list_len) ascending, skipping std::sort when the list is
+//already in order. The scan runs forward from index 1 so that an unsigned
+//GlobalOrdinal cannot wrap around when list_len is 0.
+template<typename GlobalOrdinal>
+void sort_if_needed(GlobalOrdinal* list,
+                    GlobalOrdinal list_len)
+{
+  bool need_to_sort = false;
+  for(GlobalOrdinal i=1; i<list_len; ++i) {
+    if (list[i] < list[i-1]) {
+      need_to_sort = true;
+      break;
+    }
+  }
+  if (need_to_sort) std::sort(list,list+list_len);
+}
+
+//Primary template: deliberately empty. Usable behavior is supplied only by
+//explicit specializations for concrete matrix types.
+template<class MatrixType> struct MatrixInitOp {};
+
+//MatrixInitOp for CSRMatrix: operator()(i) writes row i's id, offset and sorted column ids; coefs are zeroed.
+template<>
+struct MatrixInitOp<miniFE::CSRMatrix<MINIFE_SCALAR,MINIFE_LOCAL_ORDINAL,MINIFE_GLOBAL_ORDINAL,ComputeNodeType> > {
+  MatrixInitOp(const std::vector<MINIFE_GLOBAL_ORDINAL>& rows_vec,
+               const std::vector<MINIFE_LOCAL_ORDINAL>& row_offsets_vec,
+               const std::vector<int>& row_coords_vec,
+               int global_nx, int global_ny, int global_nz,
+               MINIFE_GLOBAL_ORDINAL global_n_rows,
+               const miniFE::simple_mesh_description<MINIFE_GLOBAL_ORDINAL>& input_mesh,
+               miniFE::CSRMatrix<MINIFE_SCALAR,MINIFE_LOCAL_ORDINAL,MINIFE_GLOBAL_ORDINAL,ComputeNodeType>& matrix)
+   : rows(&rows_vec[0]),
+     row_offsets(&row_offsets_vec[0]),
+     row_coords(&row_coords_vec[0]),
+     global_nodes_x(global_nx),
+     global_nodes_y(global_ny),
+     global_nodes_z(global_nz),
+     global_nrows(global_n_rows),
+     dest_rows(&matrix.rows[0]),
+     dest_rowoffsets(&matrix.row_offsets[0]),
+     dest_cols(0),
+     dest_coefs(0),
+     n(matrix.rows.size()),
+     mesh(&input_mesh)
+  {
+    matrix.packed_cols.resize(row_offsets_vec[n]);
+    matrix.packed_coefs.resize(row_offsets_vec[n]);
+    //Fetch the storage pointers only after resize(): it may reallocate and leave earlier pointers dangling.
+    dest_cols  = matrix.packed_cols.empty()  ? 0 : &matrix.packed_cols[0];
+    dest_coefs = matrix.packed_coefs.empty() ? 0 : &matrix.packed_coefs[0];
+    dest_rowoffsets[n] = row_offsets_vec[n];
+  }
+
+  typedef MINIFE_GLOBAL_ORDINAL GlobalOrdinalType;
+  typedef MINIFE_LOCAL_ORDINAL LocalOrdinalType;
+  typedef MINIFE_SCALAR ScalarType;
+
+  const GlobalOrdinalType* rows;        //source row ids
+  const LocalOrdinalType*  row_offsets; //source row offsets; entry n is read by the constructor
+  const int*               row_coords;  //three ints (ix,iy,iz) per row
+  int global_nodes_x;
+  int global_nodes_y;
+  int global_nodes_z;
+  GlobalOrdinalType global_nrows;
+  GlobalOrdinalType* dest_rows;
+  LocalOrdinalType*  dest_rowoffsets;
+  GlobalOrdinalType* dest_cols;
+  ScalarType*        dest_coefs;
+  int n;
+  const miniFE::simple_mesh_description<GlobalOrdinalType>* mesh;
+
+  inline void operator()(int i)
+  {
+    dest_rows[i] = rows[i];
+    int offset = row_offsets[i];
+    dest_rowoffsets[i] = offset;
+    int ix = row_coords[i*3];
+    int iy = row_coords[i*3+1];
+    int iz = row_coords[i*3+2];
+    GlobalOrdinalType nnz = 0;
+    for(int sz=-1; sz<=1; ++sz)
+      for(int sy=-1; sy<=1; ++sy)
+        for(int sx=-1; sx<=1; ++sx) {
+          GlobalOrdinalType col_id =
+              miniFE::get_id<GlobalOrdinalType>(global_nodes_x, global_nodes_y, global_nodes_z,
+                                   ix+sx, iy+sy, iz+sz);
+          if (col_id >= 0 && col_id < global_nrows) {
+            GlobalOrdinalType col = mesh->map_id_to_row(col_id);
+            dest_cols[offset+nnz] = col;
+            dest_coefs[offset+nnz] = 0;
+            ++nnz;
+          }
+        }
+
+    sort_if_needed(&dest_cols[offset], nnz);
+  }
+};
+
+//MatrixInitOp for ELLMatrix: operator()(i) fills row i's id and its leading
+//(sorted) column indices, with the matching coefficients set to zero.
+template<>
+struct MatrixInitOp<miniFE::ELLMatrix<MINIFE_SCALAR,MINIFE_LOCAL_ORDINAL,MINIFE_GLOBAL_ORDINAL,ComputeNodeType> > {
+  MatrixInitOp(const std::vector<MINIFE_GLOBAL_ORDINAL>& rows_vec,
+               const std::vector<MINIFE_LOCAL_ORDINAL>& /*row_offsets_vec*/,
+               const std::vector<int>& row_coords_vec,
+               int global_nx, int global_ny, int global_nz,
+               MINIFE_GLOBAL_ORDINAL global_n_rows,
+               const miniFE::simple_mesh_description<MINIFE_GLOBAL_ORDINAL>& input_mesh,
+               miniFE::ELLMatrix<MINIFE_SCALAR,MINIFE_LOCAL_ORDINAL,MINIFE_GLOBAL_ORDINAL,ComputeNodeType>& matrix)
+   : rows(&rows_vec[0]),
+     row_coords(&row_coords_vec[0]),
+     global_nodes_x(global_nx),
+     global_nodes_y(global_ny),
+     global_nodes_z(global_nz),
+     global_nrows(global_n_rows),
+     dest_rows(&matrix.rows[0]),
+     dest_cols(&matrix.cols[0]),
+     dest_coefs(&matrix.coefs[0]),
+     n(matrix.rows.size()),
+     ncols_per_row(matrix.num_cols_per_row),
+     mesh(&input_mesh)
+  {
+  }
+
+  typedef MINIFE_GLOBAL_ORDINAL GlobalOrdinalType;
+  typedef MINIFE_LOCAL_ORDINAL LocalOrdinalType;
+  typedef MINIFE_SCALAR ScalarType;
+
+  const GlobalOrdinalType* rows;       //source row ids
+  const int*               row_coords; //three ints (ix,iy,iz) per row
+  int global_nodes_x;
+  int global_nodes_y;
+  int global_nodes_z;
+  GlobalOrdinalType global_nrows;
+  GlobalOrdinalType* dest_rows;
+  GlobalOrdinalType* dest_cols;
+  ScalarType*        dest_coefs;
+  int n;
+  int ncols_per_row;                   //slots reserved for each row in dest_cols/dest_coefs
+  const miniFE::simple_mesh_description<GlobalOrdinalType>* mesh;
+
+  inline void operator()(int i)
+  {
+    dest_rows[i] = rows[i];
+    //Widen before multiplying: i*ncols_per_row in 'int' overflows for large matrices.
+    const size_t offset = static_cast<size_t>(i)*ncols_per_row;
+    int ix = row_coords[i*3];
+    int iy = row_coords[i*3+1];
+    int iz = row_coords[i*3+2];
+    GlobalOrdinalType nnz = 0;
+    //Visit the 3x3x3 block of grid positions around (ix,iy,iz).
+    for(int sz=-1; sz<=1; ++sz)
+      for(int sy=-1; sy<=1; ++sy)
+        for(int sx=-1; sx<=1; ++sx) {
+          GlobalOrdinalType col_id =
+              miniFE::get_id<GlobalOrdinalType>(global_nodes_x, global_nodes_y, global_nodes_z,
+                                   ix+sx, iy+sy, iz+sz);
+          if (col_id >= 0 && col_id < global_nrows) {
+            GlobalOrdinalType col = mesh->map_id_to_row(col_id);
+            dest_cols[offset+nnz] = col;
+            dest_coefs[offset+nnz] = 0;
+            ++nnz;
+          }
+        }
+
+    sort_if_needed(&dest_cols[offset], nnz);
+  }
+};
+
+#endif
+
diff --git a/openmp-avx512/basic/MatvecOp.hpp b/openmp-avx512/basic/MatvecOp.hpp
new file mode 100644
index 0000000..9c5c8e4
--- /dev/null
+++ b/openmp-avx512/basic/MatvecOp.hpp
@@ -0,0 +1,99 @@
+#ifndef _MatvecOp_hpp_
+#define _MatvecOp_hpp_
+
+#ifndef KERNEL_PREFIX
+#define KERNEL_PREFIX
+#endif
+
+#include <CSRMatrix.hpp>
+#include <ELLMatrix.hpp>
+#include <ComputeNodeType.hpp>
+
+//Primary template: deliberately empty. Usable behavior is supplied only by
+//explicit specializations for concrete matrix types.
+template<class MatrixType> struct MatvecOp {};
+
+//MatvecOp for CSRMatrix: operator()(row) computes
+//  y[row] = beta*y[row] + sum_k Acoefs[k]*x[Acols[k]]
+//over row's entries. x, y and beta are not set by the constructor.
+template<>
+struct MatvecOp<miniFE::CSRMatrix<MINIFE_SCALAR,MINIFE_LOCAL_ORDINAL,MINIFE_GLOBAL_ORDINAL, ComputeNodeType> > {
+  typedef MINIFE_GLOBAL_ORDINAL GlobalOrdinalType;
+  typedef MINIFE_LOCAL_ORDINAL LocalOrdinalType;
+  typedef MINIFE_SCALAR ScalarType;
+
+  MatvecOp(miniFE::CSRMatrix<MINIFE_SCALAR,MINIFE_LOCAL_ORDINAL,MINIFE_GLOBAL_ORDINAL, ComputeNodeType>& A)
+  : n(A.rows.size()),
+    Arowoffsets(&A.row_offsets[0]),
+    Acols(&A.packed_cols[0]),
+    Acoefs(&A.packed_coefs[0])
+  {}
+
+  size_t n;                             //number of rows of A
+  const LocalOrdinalType*  Arowoffsets; //row k spans [Arowoffsets[k], Arowoffsets[k+1])
+  const GlobalOrdinalType* Acols;
+  const ScalarType*        Acoefs;
+
+  const ScalarType* x;
+        ScalarType* y;
+  ScalarType beta;
+
+  inline KERNEL_PREFIX void operator()(int row)
+  {
+    //'row' is trusted to lie in 0..n-1; the caller (ComputeNode) must ensure it.
+    const LocalOrdinalType row_begin = Arowoffsets[row];
+    const LocalOrdinalType row_end   = Arowoffsets[row+1];
+
+    ScalarType acc = beta*y[row];
+    for(LocalOrdinalType k=row_begin; k<row_end; ++k) {
+      acc += Acoefs[k]*x[Acols[k]];
+    }
+    y[row] = acc;
+  }
+
+};//struct MatvecOp
+
+//MatvecOp for ELLMatrix: operator()(row) computes y[row] = beta*y[row] + sum_i coef(row,i)*x[col(row,i)],
+//skipping zero coefficients. x, y and beta are not set by the constructor.
+template<>
+struct MatvecOp<miniFE::ELLMatrix<MINIFE_SCALAR,MINIFE_LOCAL_ORDINAL,MINIFE_GLOBAL_ORDINAL, ComputeNodeType> > {
+  MatvecOp(miniFE::ELLMatrix<MINIFE_SCALAR,MINIFE_LOCAL_ORDINAL,MINIFE_GLOBAL_ORDINAL, ComputeNodeType>& A)
+  : n(A.rows.size()),
+    Acols(&A.cols[0]),
+    Acoefs(&A.coefs[0]),
+    ncols_per_row(A.num_cols_per_row)
+  {
+  }
+
+  size_t n;
+  typedef MINIFE_GLOBAL_ORDINAL GlobalOrdinalType;
+  typedef MINIFE_LOCAL_ORDINAL LocalOrdinalType;
+  typedef MINIFE_SCALAR ScalarType;
+
+  const GlobalOrdinalType* Acols;
+  const ScalarType*        Acoefs;
+  int ncols_per_row;
+
+  const ScalarType* x;
+        ScalarType* y;
+  ScalarType beta;
+
+  inline KERNEL_PREFIX void operator()(int row)
+  {
+    //we count on the caller (ComputeNode) to pass in 'row' in range 0..n-1
+    //Widen before multiplying: row*ncols_per_row in 'int' overflows for large matrices.
+    const size_t row_begin = static_cast<size_t>(row)*ncols_per_row;
+    ScalarType sum = beta*y[row];
+
+    for(LocalOrdinalType i=0; i<ncols_per_row; ++i) {
+      GlobalOrdinalType col = Acols[row_begin + i];
+      ScalarType coef      = Acoefs[row_begin + i];
+      if (coef != 0) sum += coef*x[col];
+    }
+    y[row] = sum;
+  }
+
+};//struct MatvecOp
+
+#endif
+
diff --git a/openmp-avx512/basic/MemInitOp.hpp b/openmp-avx512/basic/MemInitOp.hpp
new file mode 100644
index 0000000..f7bd579
--- /dev/null
+++ b/openmp-avx512/basic/MemInitOp.hpp
@@ -0,0 +1,14 @@
+#ifndef MEMINITOP_HPP_
+#define MEMINITOP_HPP_
+
+//Functor that stores 0 into ptr[i]; operator() itself never reads 'n'.
+template <typename Scalar>
+struct MemInitOp {
+  Scalar* ptr; //array being cleared
+  size_t  n;   //presumably the element count of 'ptr' (set by the caller)
+
+  //Clear the single element at index 'i'.
+  inline void operator()(size_t i) { *(ptr + i) = 0; }
+};
+
+#endif
diff --git a/openmp-avx512/basic/NoOpMemoryModel.hpp b/openmp-avx512/basic/NoOpMemoryModel.hpp
new file mode 100644
index 0000000..92d1eb1
--- /dev/null
+++ b/openmp-avx512/basic/NoOpMemoryModel.hpp
@@ -0,0 +1,27 @@
+#ifndef _NoOpMemoryModel_hpp_
+#define _NoOpMemoryModel_hpp_
+
+//Memory model in which a "buffer" is the host pointer itself: nothing is
+//allocated and every copy is a no-op.
+class NoOpMemoryModel {
+  public:
+    NoOpMemoryModel() {}
+    virtual ~NoOpMemoryModel() {}
+
+    //Returns host_ptr itself with constness cast away; buf_size is unused.
+    template<class T>
+    T* get_buffer(const T* host_ptr, size_t buf_size)
+    {
+      T* same_memory = const_cast<T*>(host_ptr);
+      return same_memory;
+    }
+
+    //The three operations below intentionally do nothing: there is no
+    //separate buffer to free or to copy to/from.
+    template<class T> void destroy_buffer(T*& device_ptr) {}
+    template<class T> void copy_to_buffer(const T* host_ptr, size_t buf_size, T* device_ptr) {}
+    template<class T> void copy_from_buffer(T* host_ptr, size_t buf_size, const T* device_ptr) {}
+};
+
+#endif
+
diff --git a/openmp-avx512/basic/SerialComputeNode.hpp b/openmp-avx512/basic/SerialComputeNode.hpp
new file mode 100644
index 0000000..1f45ed8
--- /dev/null
+++ b/openmp-avx512/basic/SerialComputeNode.hpp
@@ -0,0 +1,25 @@
+#ifndef SERIALCOMPUTENODE_HPP_
+#define SERIALCOMPUTENODE_HPP_
+
+#include <NoOpMemoryModel.hpp>
+
+//Compute node that runs every "parallel" operation serially, in index order.
+class SerialComputeNode : public NoOpMemoryModel {
+  public:
+    //Calls wd(i) for i = 0 .. length-1; 'wd' is taken by value.
+    template <class WDP>
+    void parallel_for(unsigned int length, WDP wd) {
+      int i = 0;
+      while(i < length) { wd(i); ++i; }
+    }
+
+    //Folds wd.generate(i) into wd.result with wd.reduce, starting from wd.identity().
+    template <class WDP>
+    void parallel_reduce(unsigned int length, WDP &wd) {
+      wd.result = wd.identity();
+      int i = 0;
+      while(i < length) { wd.result = wd.reduce(wd.result, wd.generate(i)); ++i; }
+    }
+};
+
+#endif
diff --git a/openmp-avx512/basic/SparseMatrix_functions.hpp b/openmp-avx512/basic/SparseMatrix_functions.hpp
new file mode 100644
index 0000000..f4f6e3e
--- /dev/null
+++ b/openmp-avx512/basic/SparseMatrix_functions.hpp
@@ -0,0 +1,621 @@
+#ifndef _SparseMatrix_functions_hpp_
+#define _SparseMatrix_functions_hpp_
+
+//@HEADER
+// ************************************************************************
+// 
+//               miniFE: simple finite-element assembly and linear-solve
+//                 Copyright (2006) Sandia Corporation
+// 
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+// 
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//  
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//  
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+
+#include <cstddef>
+#include <vector>
+#include <set>
+#include <algorithm>
+#include <sstream>
+#include <fstream>
+
+#include <Vector.hpp>
+#include <Vector_functions.hpp>
+#include <ElemData.hpp>
+#include <FusedMatvecDotOp.hpp>
+#include <MatvecOp.hpp>
+#include <MatrixInitOp.hpp>
+#include <MatrixCopyOp.hpp>
+#include <exchange_externals.hpp>
+#include <mytimer.hpp>
+
+#ifdef MINIFE_HAVE_TBB
+#include <LockingMatrix.hpp>
+#endif
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+namespace miniFE {
+
+template<typename MatrixType>
+void init_matrix(MatrixType& M,
+                 const std::vector<typename MatrixType::GlobalOrdinalType>& rows,
+                 const std::vector<typename MatrixType::LocalOrdinalType>& row_offsets,
+                 const std::vector<int>& row_coords,
+                 int global_nodes_x,
+                 int global_nodes_y,
+                 int global_nodes_z,
+                 typename MatrixType::GlobalOrdinalType global_nrows,
+                 const simple_mesh_description<typename MatrixType::GlobalOrdinalType>& mesh)
+{
+  //Initialization functor (see MatrixInitOp.hpp); it is invoked once per index in [0, n).
+  MatrixInitOp<MatrixType> init_op(rows, row_offsets, row_coords,
+                                   global_nodes_x, global_nodes_y, global_nodes_z,
+                                   global_nrows, mesh, M);
+#ifdef MINIFE_HAVE_CUDA
+  //CUDA builds: run the functor in a plain host loop instead of through parallel_for.
+  for(size_t idx=0; idx<init_op.n; ++idx) {
+    init_op(idx);
+  }
+#else
+  M.compute_node.parallel_for(init_op.n, init_op);
+#endif
+}
+
+template<typename T,
+         typename U>
+void sort_with_companions(ptrdiff_t len, T* array, U* companions)
+{
+  // Stable in-place insertion sort (ascending); companions[k] travels with array[k].
+  for (ptrdiff_t i=1; i < len; i++) {
+    // The key is held as T: the former ptrdiff_t temporary could truncate or
+    // round keys of any other type (e.g. long long on 32-bit, or double).
+    const T key = array[i];
+    const U companion = companions[i];
+    ptrdiff_t j = i;
+    while ((j > 0) && (array[j-1] > key))
+    {
+      array[j] = array[j-1];
+      companions[j] = companions[j-1];
+      j = j - 1;
+    }
+    array[j] = key;
+    companions[j] = companion;
+  }
+}
+
+template<typename MatrixType>
+void write_matrix(const std::string& filename,
+                  MatrixType& mat)
+{
+  typedef typename MatrixType::LocalOrdinalType LocalOrdinalType;
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinalType;
+  typedef typename MatrixType::ScalarType ScalarType;
+
+  int numprocs = 1, myproc = 0;
+#ifdef HAVE_MPI
+  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+#endif
+  //Each process writes its own file, "<filename>.<numprocs>.<myproc>".
+  std::ostringstream osstr;
+  osstr << filename << "." << numprocs << "." << myproc;
+  std::string full_name = osstr.str();
+  std::ofstream ofs(full_name.c_str());
+
+  size_t nrows = mat.rows.size();
+  size_t nnz = mat.num_nonzeros();
+  //Processes take turns, one per pass (barrier in MPI builds); only process 0 writes the "nrows nnz" header.
+  for(int p=0; p<numprocs; ++p) {
+    if (p == myproc) {
+      if (p == 0) {
+        ofs << nrows << " " << nnz << '\n';
+      }
+      for(size_t i=0; i<nrows; ++i) {
+        size_t row_len = 0;
+        GlobalOrdinalType* cols = NULL;
+        ScalarType* coefs = NULL;
+        mat.get_row_pointers(mat.rows[i], row_len, cols, coefs);
+        //One "row col coef" line per stored entry; '\n' rather than std::endl avoids a flush per entry.
+        for(size_t j=0; j<row_len; ++j) {
+          ofs << mat.rows[i] << " " << cols[j] << " " << coefs[j] << '\n';
+        }
+      }
+    }
+#ifdef HAVE_MPI
+    MPI_Barrier(MPI_COMM_WORLD);
+#endif
+  }
+}
+
+template<typename GlobalOrdinal,typename Scalar>
+void
+sum_into_row(int row_len,
+             GlobalOrdinal* row_indices,
+             Scalar* row_coefs,
+             int num_inputs,
+             const GlobalOrdinal* input_indices,
+             const Scalar* input_coefs)
+{
+  for(int i=0; i<num_inputs; ++i) {  //int index: a size_t index turned a negative count into a huge bound
+    GlobalOrdinal* loc = std::lower_bound(row_indices, row_indices+row_len,
+                                          input_indices[i]);
+    //row_indices must be sorted ascending (binary search); inputs whose
+    //column is absent from the row are skipped.
+    if (loc-row_indices < row_len && *loc == input_indices[i]) {
+      row_coefs[loc-row_indices] += input_coefs[i];
+    }
+  }
+}
+
+template<typename MatrixType>
+void
+sum_into_row(typename MatrixType::GlobalOrdinalType row,
+             size_t num_indices,
+             const typename MatrixType::GlobalOrdinalType* col_inds,
+             const typename MatrixType::ScalarType* coefs,
+             MatrixType& mat)
+{
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal;
+  typedef typename MatrixType::ScalarType Scalar;
+  //Ask the matrix where row 'row' keeps its column indices and coefficients.
+  size_t stored_len = 0;
+  GlobalOrdinal* stored_cols = NULL;
+  Scalar* stored_coefs = NULL;
+  mat.get_row_pointers(row, stored_len, stored_cols, stored_coefs);
+  //Only a row with stored entries can receive contributions.
+  if (stored_len != 0) {
+    sum_into_row(stored_len, stored_cols, stored_coefs, num_indices, col_inds, coefs);
+  }
+}
+
+template<typename MatrixType>
+void
+sum_in_symm_elem_matrix(size_t num,
+                   const typename MatrixType::GlobalOrdinalType* indices,
+                   const typename MatrixType::ScalarType* coefs,
+                   MatrixType& mat)
+{
+  typedef typename MatrixType::ScalarType Scalar;
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal;
+
+//indices is length num (which should be nodes-per-elem)
+//coefs is the upper triangle of the element diffusion matrix, stored row
+//by row starting at each row's diagonal entry, so it should be length
+//num*(num+1)/2
+
+  //position within coefs of the current row's diagonal entry
+  int row_offset = 0;
+  for(size_t i=0; i<num; ++i) {
+    GlobalOrdinal row = indices[i];
+
+    const Scalar* row_coefs = &coefs[row_offset];
+    const GlobalOrdinal* row_col_inds = &indices[i];
+    size_t row_len = num - i;
+    row_offset += row_len;
+    //storage of global row 'row' in mat; rows with no stored entries are skipped
+    size_t mat_row_len = 0;
+    GlobalOrdinal* mat_row_cols = NULL;
+    Scalar* mat_row_coefs = NULL;
+
+    mat.get_row_pointers(row, mat_row_len, mat_row_cols, mat_row_coefs);
+    if (mat_row_len == 0) continue;
+    //diagonal and upper part: element entries (i,i) .. (i,num-1)
+    sum_into_row(mat_row_len, mat_row_cols, mat_row_coefs,
+                 row_len, row_col_inds, row_coefs);
+    //lower part: element entries (i,0) .. (i,i-1), read from the upper triangle by symmetry
+    int offset = i;
+    for(size_t j=0; j<i; ++j) {
+      Scalar coef = coefs[offset];
+      //coefs[offset] is packed entry (j,i); advance to entry (j+1,i) below
+      sum_into_row(mat_row_len, mat_row_cols, mat_row_coefs,
+                   1, &indices[j], &coef);
+      offset += num - (j+1);
+    }
+  }
+}
+
+template<typename MatrixType>
+void
+sum_in_elem_matrix(size_t num,
+                   const typename MatrixType::GlobalOrdinalType* indices,
+                   const typename MatrixType::ScalarType* coefs,
+                   MatrixType& mat)
+{
+  //coefs is a dense num x num element matrix stored row by row;
+  //element row r is summed into matrix row indices[r].
+  for(size_t r=0; r!=num; ++r) {
+    const typename MatrixType::ScalarType* elem_row = coefs + r*num;
+    sum_into_row(indices[r], num,
+                 indices, elem_row, mat);
+  }
+}
+
+template<typename GlobalOrdinal, typename Scalar,
+         typename MatrixType, typename VectorType>
+void
+sum_into_global_linear_system(ElemData<GlobalOrdinal,Scalar>& elem_data,
+                              MatrixType& A, VectorType& b)
+{ //Adds one element's contributions to the linear system: diffusion matrix into A, source vector into b.
+  sum_in_symm_elem_matrix(elem_data.nodes_per_elem, elem_data.elem_node_ids,
+                     elem_data.elem_diffusion_matrix, A); //consumed as a packed upper triangle
+  sum_into_vector(elem_data.nodes_per_elem, elem_data.elem_node_ids,
+                  elem_data.elem_source_vector, b);
+}
+
+#ifdef MINIFE_HAVE_TBB
+template<typename MatrixType>
+void
+sum_in_elem_matrix(size_t num,
+                   const typename MatrixType::GlobalOrdinalType* indices,
+                   const typename MatrixType::ScalarType* coefs,
+                   LockingMatrix<MatrixType>& mat)
+{
+  //LockingMatrix variant: coefs is a dense num x num element matrix stored
+  //row by row; element row r goes through mat.sum_in into row indices[r].
+  for(size_t r=0; r!=num; ++r) {
+    const typename MatrixType::ScalarType* elem_row = coefs + r*num;
+    mat.sum_in(indices[r], num, indices, elem_row);
+  }
+}
+
+template<typename GlobalOrdinal, typename Scalar,
+         typename MatrixType, typename VectorType>
+void
+sum_into_global_linear_system(ElemData<GlobalOrdinal,Scalar>& elem_data,
+                              LockingMatrix<MatrixType>& A, LockingVector<VectorType>& b)
+{ //Locking overload: adds one element's diffusion matrix into A and its source vector into b.
+  sum_in_elem_matrix(elem_data.nodes_per_elem, elem_data.elem_node_ids,
+                     elem_data.elem_diffusion_matrix, A); //consumed as a dense num x num matrix, unlike the non-locking overload
+  sum_into_vector(elem_data.nodes_per_elem, elem_data.elem_node_ids,
+                  elem_data.elem_source_vector, b);
+}
+#endif
+
+template<typename MatrixType>
+void
+add_to_diagonal(typename MatrixType::ScalarType value, MatrixType& mat)
+{ //adds 'value' to the stored (row,row) entry of every row listed in mat.rows
+  for(size_t k=0; k!=mat.rows.size(); ++k) {
+    sum_into_row(mat.rows[k], 1, &mat.rows[k], &value, mat);
+  }
+}
+
+template<typename MatrixType>
+double
+parallel_memory_overhead_MB(const MatrixType& A)
+{
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal;
+  typedef typename MatrixType::LocalOrdinalType LocalOrdinal;
+  double mem_MB = 0;
+  //Returns the size in MB (2^20 bytes) of A's communication arrays, summed over all processes; 0 without MPI.
+#ifdef HAVE_MPI
+  double invMB = 1.0/(1024*1024);
+  mem_MB = invMB*A.external_index.size()*sizeof(GlobalOrdinal);
+  mem_MB += invMB*A.external_local_index.size()*sizeof(GlobalOrdinal); //NOTE(review): element type assumed GlobalOrdinal - confirm
+  mem_MB += invMB*A.elements_to_send.size()*sizeof(GlobalOrdinal);
+  mem_MB += invMB*A.neighbors.size()*sizeof(int);
+  mem_MB += invMB*A.recv_length.size()*sizeof(LocalOrdinal);
+  mem_MB += invMB*A.send_length.size()*sizeof(LocalOrdinal);
+  //Sum the per-process figures so that every process returns the global total.
+  double tmp = mem_MB;
+  MPI_Allreduce(&tmp, &mem_MB, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+#endif
+
+  return mem_MB;
+}
+
+template<typename MatrixType>
+void rearrange_matrix_local_external(MatrixType& A)
+{
+  //This function will rearrange A so that local entries are contiguous at the front
+  //of A's memory, and external entries are contiguous at the back of A's memory.
+  //
+  //A.row_offsets will describe where the local entries occur, and
+  //A.row_offsets_external will describe where the external entries occur.
+  //An entry counts as local when its column index is < nrows (see lower_bound below).
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal;
+  typedef typename MatrixType::LocalOrdinalType LocalOrdinal;
+  typedef typename MatrixType::ScalarType Scalar;
+
+  size_t nrows = A.rows.size();
+  std::vector<LocalOrdinal> tmp_row_offsets(nrows*2);
+  std::vector<LocalOrdinal> tmp_row_offsets_external(nrows*2);
+  //Both temporaries hold a [begin,end) pair per row at indices 2*i and 2*i+1.
+  LocalOrdinal num_local_nz = 0;
+  LocalOrdinal num_extern_nz = 0;
+
+  //First sort within each row of A, so that local entries come
+  //before external entries within each row.
+  //tmp_row_offsets describe the locations of the local entries, and
+  //tmp_row_offsets_external describe the locations of the external entries.
+  //
+  for(size_t i=0; i<nrows; ++i) {
+    GlobalOrdinal* row_begin = &A.packed_cols[A.row_offsets[i]];
+    GlobalOrdinal* row_end = &A.packed_cols[A.row_offsets[i+1]];
+
+    Scalar* coef_row_begin = &A.packed_coefs[A.row_offsets[i]];
+    //Defaults: the whole row is local and the external range is empty; refined below.
+    tmp_row_offsets[i*2] = A.row_offsets[i];
+    tmp_row_offsets[i*2+1] = A.row_offsets[i+1];
+    tmp_row_offsets_external[i*2] = A.row_offsets[i+1];
+    tmp_row_offsets_external[i*2+1] = A.row_offsets[i+1];
+
+    ptrdiff_t row_len = row_end - row_begin;
+    //Ascending sort by column index, with the coefficients carried along.
+    sort_with_companions(row_len, row_begin, coef_row_begin);
+    //The first column index >= nrows marks where the external part of the row starts.
+    GlobalOrdinal* row_iter = std::lower_bound(row_begin, row_end, nrows);
+
+    LocalOrdinal offset = A.row_offsets[i] + row_iter-row_begin;
+    tmp_row_offsets[i*2+1] = offset;
+    tmp_row_offsets_external[i*2] = offset;
+
+    num_local_nz += tmp_row_offsets[i*2+1]-tmp_row_offsets[i*2];
+    num_extern_nz += tmp_row_offsets_external[i*2+1]-tmp_row_offsets_external[i*2];
+  }
+
+  //Next, copy the external entries into separate arrays.
+
+  std::vector<GlobalOrdinal> ext_cols(num_extern_nz);
+  std::vector<Scalar> ext_coefs(num_extern_nz);
+  std::vector<LocalOrdinal> ext_offsets(nrows+1);
+  LocalOrdinal offset = 0;
+  for(size_t i=0; i<nrows; ++i) {
+    ext_offsets[i] = offset;
+    for(LocalOrdinal j=tmp_row_offsets_external[i*2];
+                     j<tmp_row_offsets_external[i*2+1]; ++j) {
+      ext_cols[offset] = A.packed_cols[j];
+      ext_coefs[offset++] = A.packed_coefs[j];
+    }
+  }
+  ext_offsets[nrows] = offset;
+
+  //Now slide all local entries down to the beginning of A's packed arrays
+  //(done in place: the write position 'offset' never passes the read position j).
+  A.row_offsets.resize(nrows+1);
+  offset = 0;
+  for(size_t i=0; i<nrows; ++i) {
+    A.row_offsets[i] = offset;
+    for(LocalOrdinal j=tmp_row_offsets[i*2]; j<tmp_row_offsets[i*2+1]; ++j) {
+      A.packed_cols[offset] = A.packed_cols[j];
+      A.packed_coefs[offset++] = A.packed_coefs[j];
+    }
+  }
+  A.row_offsets[nrows] = offset;
+
+  //Finally, copy the external entries back into A.packed_cols and
+  //A.packed_coefs, starting at the end of the local entries.
+
+  for(LocalOrdinal i=offset; i<offset+ext_cols.size(); ++i) {
+    A.packed_cols[i] = ext_cols[i-offset];
+    A.packed_coefs[i] = ext_coefs[i-offset];
+  }
+  //External offsets index the same packed arrays, so shift them past the local entries.
+  A.row_offsets_external.resize(nrows+1);
+  for(size_t i=0; i<=nrows; ++i) A.row_offsets_external[i] = ext_offsets[i] + offset;
+}
+
+//------------------------------------------------------------------------
+template<typename MatrixType>
+void
+zero_row_and_put_1_on_diagonal(MatrixType& A, typename MatrixType::GlobalOrdinalType row)
+{
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal;
+  typedef typename MatrixType::LocalOrdinalType LocalOrdinal;
+  typedef typename MatrixType::ScalarType Scalar;
+  //Look up the storage of the requested row.
+  size_t num_entries = 0;
+  GlobalOrdinal* col_ids = NULL;
+  Scalar* values = NULL;
+  A.get_row_pointers(row, num_entries, col_ids, values);
+  //Overwrite every stored entry: 1 on the diagonal, 0 everywhere else.
+  for(size_t k=0; k<num_entries; ++k) {
+    const bool on_diagonal = (col_ids[k] == row);
+    values[k] = on_diagonal ? 1 : 0;
+  }
+}
+
+//------------------------------------------------------------------------
+template<typename MatrixType,
+         typename VectorType>
+void
+impose_dirichlet(typename MatrixType::ScalarType prescribed_value,
+                    MatrixType& A,
+                    VectorType& b,
+                    int global_nx,
+                    int global_ny,
+                    int global_nz,
+                    const std::set<typename MatrixType::GlobalOrdinalType>& bc_rows)
+{
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal;
+  typedef typename MatrixType::LocalOrdinalType LocalOrdinal;
+  typedef typename MatrixType::ScalarType Scalar;
+  //Range of global row ids held in A.rows; [0,-1] (empty) when A has no rows.
+  GlobalOrdinal first_local_row = A.rows.size()>0 ? A.rows[0] : 0;
+  GlobalOrdinal last_local_row  = A.rows.size()>0 ? A.rows[A.rows.size()-1] : -1;
+  //Pass 1: for each BC row in that range, set b to the prescribed value and make the matrix row an identity row.
+  typename std::set<GlobalOrdinal>::const_iterator
+    bc_iter = bc_rows.begin(), bc_end = bc_rows.end();
+  for(; bc_iter!=bc_end; ++bc_iter) {
+    GlobalOrdinal row = *bc_iter;
+    if (row >= first_local_row && row <= last_local_row) {
+      size_t local_row = row - first_local_row; //assumes A.rows is a contiguous ascending id range - TODO confirm
+      b.coefs[local_row] = prescribed_value;
+      zero_row_and_put_1_on_diagonal(A, row);
+    }
+  }
+  //Pass 2: in every non-BC row, zero the coefficients in BC columns and move their effect to b.
+  for(size_t i=0; i<A.rows.size(); ++i) {
+    GlobalOrdinal row = A.rows[i];
+
+    if (bc_rows.find(row) != bc_rows.end()) continue;
+
+    size_t row_length = 0;
+    GlobalOrdinal* cols = NULL;
+    Scalar* coefs = NULL;
+    A.get_row_pointers(row, row_length, cols, coefs);
+    //sum collects the coefficients that multiply prescribed unknowns in this row.
+    Scalar sum = 0;
+    for(size_t j=0; j<row_length; ++j) {
+      if (bc_rows.find(cols[j]) != bc_rows.end()) {
+        sum += coefs[j];
+        coefs[j] = 0;
+      }
+    }
+
+    b.coefs[i] -= sum*prescribed_value;
+  }
+}
+
+static timer_type exchtime = 0; //time accumulated in exchange_externals by matvec_and_dot; 'static' in a header gives one copy per translation unit
+
+//------------------------------------------------------------------------
+//Compute matrix vector product y = A*x and return dot(x,y), where:
+//
+// A - input matrix
+// x - input vector
+// y - result vector
+//
+template<typename MatrixType,
+         typename VectorType>
+typename TypeTraits<typename VectorType::ScalarType>::magnitude_type
+matvec_and_dot(MatrixType& A,
+               VectorType& x,
+               VectorType& y)
+{
+  timer_type t0 = mytimer();
+  exchange_externals(A, x);
+  exchtime += mytimer()-t0;
+  //The time spent in exchange_externals is added to the file-level counter exchtime.
+  typedef typename TypeTraits<typename VectorType::ScalarType>::magnitude_type magnitude;
+  typedef typename MatrixType::ScalarType ScalarType;
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinalType;
+  typedef typename MatrixType::LocalOrdinalType LocalOrdinalType;
+  typedef typename MatrixType::ComputeNodeType ComputeNodeType;
+
+  ComputeNodeType& comp_node = A.compute_node;
+  //Reduction functor (see FusedMatvecDotOp.hpp); its fields are filled in by hand below.
+  FusedMatvecDotOp<MatrixType,VectorType> mvdotop;
+
+  mvdotop.n = A.rows.size();
+  mvdotop.Arowoffsets = comp_node.get_buffer(&A.row_offsets[0], A.row_offsets.size());
+  mvdotop.Acols       = comp_node.get_buffer(&A.packed_cols[0], A.packed_cols.size());
+  mvdotop.Acoefs      = comp_node.get_buffer(&A.packed_coefs[0], A.packed_coefs.size());
+  mvdotop.x = comp_node.get_buffer(&x.coefs[0], x.coefs.size());
+  mvdotop.y = comp_node.get_buffer(&y.coefs[0], y.coefs.size());
+  mvdotop.beta = 0;
+  //Run the functor over all n rows; the reduced value is left in mvdotop.result.
+  comp_node.parallel_reduce(mvdotop.n, mvdotop);
+  //MPI builds sum the per-process results so that every process returns the global value.
+#ifdef HAVE_MPI
+  magnitude local_dot = mvdotop.result, global_dot = 0;
+  MPI_Datatype mpi_dtype = TypeTraits<magnitude>::mpi_type();  
+  MPI_Allreduce(&local_dot, &global_dot, 1, mpi_dtype, MPI_SUM, MPI_COMM_WORLD);
+  return global_dot;
+#else
+  return mvdotop.result;
+#endif
+}
+
+//------------------------------------------------------------------------
+//Compute matrix vector product y = A*x where:
+//
+// A - input matrix
+// x - input vector
+// y - result vector
+//
+template<typename MatrixType,
+         typename VectorType>
+struct matvec_std {
+void operator()(MatrixType& A,
+            VectorType& x,
+            VectorType& y)
+{
+  typedef typename MatrixType::ScalarType ScalarType;
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinalType;
+  typedef typename MatrixType::LocalOrdinalType LocalOrdinalType;
+  typedef typename MatrixType::ComputeNodeType ComputeNodeType;
+
+  //Exchange x's external entries first (see exchange_externals.hpp).
+  exchange_externals(A, x);
+
+  //Set up the matvec functor; beta = 0 presumably means y is overwritten (confirm in MatvecOp.hpp).
+  ComputeNodeType& node = A.compute_node;
+  MatvecOp<MatrixType> row_op(A);
+  row_op.beta = 0;
+  row_op.x = node.get_buffer(&x.coefs[0], x.coefs.size());
+  row_op.y = node.get_buffer(&y.coefs[0], y.coefs.size());
+  //Apply the functor to each of its row_op.n indices.
+  node.parallel_for(row_op.n, row_op);
+}
+};
+
+template<typename MatrixType,
+         typename VectorType>
+void matvec(MatrixType& A, VectorType& x, VectorType& y)
+{
+  //Convenience wrapper: runs a temporary matvec_std functor on (A, x, y).
+  matvec_std<MatrixType,VectorType>()(A, x, y);
+}
+
+template<typename MatrixType,
+         typename VectorType>
+struct matvec_overlap {
+void operator()(MatrixType& A,
+                    VectorType& x,
+                    VectorType& y)
+{
+#ifdef HAVE_MPI
+  begin_exchange_externals(A, x);
+#endif
+  //The exchange started above is only finished after the first pass below, so the two overlap.
+  typedef typename MatrixType::ScalarType ScalarType;
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinalType;
+  typedef typename MatrixType::LocalOrdinalType LocalOrdinalType;
+  typedef typename MatrixType::ComputeNodeType ComputeNodeType;
+
+  ComputeNodeType& comp_node = A.compute_node;
+
+  MatvecOp<MatrixType> mvop(A);
+
+  mvop.x = comp_node.get_buffer(&x.coefs[0], x.coefs.size());
+  mvop.y = comp_node.get_buffer(&y.coefs[0], y.coefs.size());
+  mvop.beta = 0;
+  //First pass with the functor as constructed from A and beta = 0.
+  comp_node.parallel_for(mvop.n, mvop);
+
+#ifdef HAVE_MPI
+  finish_exchange_externals(A.neighbors.size());
+  //Second pass over A.row_offsets_external with beta = 1 (presumably accumulating into y; confirm in MatvecOp.hpp).
+  mvop.Arowoffsets = comp_node.get_buffer(&A.row_offsets_external[0], A.row_offsets_external.size());
+  mvop.beta = 1;
+
+  comp_node.parallel_for(A.rows.size(), mvop);
+#endif
+}
+};
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/basic/SumInLinSys.hpp b/openmp-avx512/basic/SumInLinSys.hpp
new file mode 100644
index 0000000..d5f6471
--- /dev/null
+++ b/openmp-avx512/basic/SumInLinSys.hpp
@@ -0,0 +1,33 @@
+#ifndef _SUMINLINSYS_HPP_
+#define _SUMINLINSYS_HPP_
+
+#include <Hex8_enums.hpp>
+#include <LockingMatrix.hpp>
+#include <LockingVector.hpp>
+
+template<typename GlobalOrdinal,typename Scalar,
+         typename MatrixType, typename VectorType>
+struct SumInLinSys {
+  GlobalOrdinal* node_ordinals;          //node ids, nnodes per element
+  Scalar* elem_diffusion_matrix;         //dense nnodes x nnodes block per element
+  Scalar* elem_source_vector;            //nnodes entries per element
+  miniFE::LockingMatrix<MatrixType>* A;
+  miniFE::LockingVector<VectorType>* b;
+
+//Sums element i's diffusion matrix into A and its source vector into b, one row at a time.
+inline void operator()(int i)
+{
+  size_t nnodes = miniFE::Hex8::numNodesPerElem;
+  GlobalOrdinal* elem_nodes = node_ordinals+i*nnodes;
+  Scalar* elem_matrix = elem_diffusion_matrix+i*nnodes*nnodes;
+  Scalar* elem_rhs = elem_source_vector+i*nnodes;
+  for(size_t r=0; r!=nnodes; ++r) {
+    GlobalOrdinal row = elem_nodes[r];
+    A->sum_in(row, nnodes, elem_nodes, elem_matrix + r*nnodes);
+    b->sum_in(1, &row, elem_rhs + r);
+  }
+}
+
+};
+
+#endif
diff --git a/openmp-avx512/basic/TBBNode.cpp b/openmp-avx512/basic/TBBNode.cpp
new file mode 100644
index 0000000..20078fd
--- /dev/null
+++ b/openmp-avx512/basic/TBBNode.cpp
@@ -0,0 +1,8 @@
+#ifdef MINIFE_HAVE_TBB
+
+#include "TBBNode.hpp"
+
+tbb::task_scheduler_init TBBNode::tsi_(tbb::task_scheduler_init::deferred); // constructed 'deferred'; TBBNode's constructor calls initialize()
+
+#endif
+
diff --git a/openmp-avx512/basic/TBBNode.hpp b/openmp-avx512/basic/TBBNode.hpp
new file mode 100644
index 0000000..6b1fe89
--- /dev/null
+++ b/openmp-avx512/basic/TBBNode.hpp
@@ -0,0 +1,76 @@
+#ifndef TBBNODE_HPP_
+#define TBBNODE_HPP_
+
+#include <tbb/blocked_range.h>
+#include <tbb/parallel_for.h>
+#include <tbb/parallel_reduce.h>
+#include <tbb/task_scheduler_init.h>
+#include <stdlib.h>
+
+#include <NoOpMemoryModel.hpp>
+
+#include <iostream> // debug
+
+template <class WDPin>
+struct BlockedRangeWDP {  // tbb::parallel_for body: applies its own copy of the functor to every index of a range
+  mutable WDPin wd;       // mutable so the const operator() below can call a non-const wd(i)
+  BlockedRangeWDP(WDPin &in) : wd(in) {}
+  inline void operator()(tbb::blocked_range<int> &rng) const
+  {
+    for(int idx=rng.begin(), last=rng.end(); idx<last; ++idx) {
+      wd(idx);
+    }
+  }
+};
+
+template <class WDPin>
+struct BlockedRangeWDPReducer {  // tbb::parallel_reduce body wrapping a copy of the reduction functor
+  WDPin wd;
+  BlockedRangeWDPReducer(WDPin &in) : wd(in) {}  // copies in, so in.result becomes this body's starting value
+  BlockedRangeWDPReducer(BlockedRangeWDPReducer &in, tbb::split) : wd(in.wd)
+  {
+    wd.result = wd.identity();  // a split-off body starts from the identity
+  }
+  void operator()(tbb::blocked_range<int> &rng)
+  { 
+    for(int i=rng.begin(); i<rng.end(); ++i) {
+      wd.result = wd.reduce(wd.result, wd.generate(i));
+    }
+  }
+  inline void join( const BlockedRangeWDPReducer<WDPin> &other ) {  // folds another body's partial result into this one
+    wd.result = wd.reduce( wd.result, other.wd.result );
+  }
+};
+
+class TBBNode : public NoOpMemoryModel {
+  public:
+    // numThreads >= 1 requests that many threads; anything else lets TBB choose (automatic).
+    TBBNode(int numThreads=0) {
+      if (numThreads >= 1) {
+        tsi_.initialize(numThreads);
+      }
+      else {
+        tsi_.initialize(tbb::task_scheduler_init::automatic);
+      }
+    }
+    // NOTE(review): tsi_ is static, so every TBBNode constructed calls initialize() on the same object - confirm only one is ever created.
+    ~TBBNode() {}
+    // Calls wd(i) for i in [0, length) through tbb::parallel_for; wd is taken by value.
+    template <class WDP>
+    void parallel_for(int length, WDP wd) {
+      BlockedRangeWDP<WDP> tbb_wd(wd);
+      tbb::parallel_for(tbb::blocked_range<int>(0,length), tbb_wd, tbb::auto_partitioner()); 
+    }
+    // Reduces wd.generate(i) over [0, length) with wd.reduce; wd.result is not reset to the identity beforehand.
+    template <class WDP>
+    void parallel_reduce(int length, WDP &wd) {
+      BlockedRangeWDPReducer<WDP> tbb_wd(wd);
+      tbb::parallel_reduce(tbb::blocked_range<int>(0,length), tbb_wd, tbb::auto_partitioner());
+      wd.result = tbb_wd.wd.result;  // have to put result from final tbb_wd into orginal wd
+    }
+
+  private:
+    static tbb::task_scheduler_init tsi_;
+};
+
+#endif
diff --git a/openmp-avx512/basic/TPINode.hpp b/openmp-avx512/basic/TPINode.hpp
new file mode 100644
index 0000000..66ec84f
--- /dev/null
+++ b/openmp-avx512/basic/TPINode.hpp
@@ -0,0 +1,113 @@
+#ifndef TPINODE_HPP_
+#define TPINODE_HPP_
+
+#include <TPI.h>
+
+#include <NoOpMemoryModel.hpp>
+
+#include <iostream> // debug
+
+inline
+void tpi_work_span(TPI_Work* work, int n,
+                   size_t& ibeg, size_t& iend)
+{
+  // ceil(n / count): length of the index slice given to each of the work->count work items
+  const int slice = ( n + work->count - 1 ) / work->count ;
+  // this work item's slice is [rank*slice, (rank+1)*slice), with the end clipped to n
+  iend = slice * ( work->rank + 1 );
+  ibeg = slice * ( work->rank );
+
+  if ( iend > n ) { iend = n; }
+}
+
+template<class WDP>
+void tpi_execute(TPI_Work * work)
+{
+  // work->info points at the caller's functor; constness is cast away so a non-const operator() can run.
+  WDP& functor = *const_cast<WDP*>(static_cast<const WDP*>(work->info));
+  const size_t total = functor.n;
+  size_t first = 0, last = total;
+  tpi_work_span(work, total, first, last);
+  for(size_t idx=first; idx<last; ++idx) {
+    functor(idx);
+  }
+}
+
+template<class WDP>
+void tpi_reduction_work(TPI_Work * work)
+{
+  const WDP* wdp = static_cast<const WDP*>(work->info);
+  size_t n = wdp->n;
+  size_t ibeg = 0, iend = n;
+  tpi_work_span(work, n, ibeg, iend);
+  // Start from the identity, not wdp->result: a non-identity value left in
+  // the functor's result would otherwise be folded in once per work item.
+  typedef typename WDP::ReductionType ReductionType;
+  ReductionType tmpres = wdp->identity();
+  for(size_t i=ibeg; i<iend; ++i) {
+    tmpres = wdp->reduce(tmpres, wdp->generate(i));
+  }
+  // Store this work item's partial result where tpi_reduction_join reads it.
+  *(static_cast<ReductionType*>(work->reduce)) = tmpres;
+}
+
+template<class WDP>
+void tpi_reduction_join(TPI_Work * work, const void* src)
+{
+  typedef typename WDP::ReductionType ReductionType;
+  // Fold the partial result at src into this work item's partial result.
+  const WDP* functor = static_cast<const WDP*>(work->info);
+  ReductionType* accum = static_cast<ReductionType*>(work->reduce);
+  const ReductionType* incoming = static_cast<const ReductionType*>(src);
+
+  *accum = functor->reduce(*accum, *incoming);
+}
+
+template<class WDP>
+void tpi_reduction_init(TPI_Work * work)
+{
+  typedef typename WDP::ReductionType ReductionType;
+  // Seed this work item's partial result with the functor's identity value.
+  const WDP* functor = static_cast<const WDP*>(work->info);
+  ReductionType* accum = static_cast<ReductionType*>(work->reduce);
+  *accum = functor->identity();
+}
+
+class TPINode : public NoOpMemoryModel {
+  public:
+    // TPI_Init is called only when numThreads >= 1; the destructor mirrors that with TPI_Finalize.
+    TPINode(int numThreads=0)
+     : numThreads_(numThreads)
+    {
+      if (numThreads >= 1) {
+        TPI_Init(numThreads);
+      }
+    }
+
+    ~TPINode()
+    {
+      if (numThreads_ >= 1) {
+        TPI_Finalize();
+      }
+    }
+    // 'length' is unused here: tpi_execute takes the iteration count from wd.n.
+    template <class WDP>
+    void parallel_for(int length, WDP & wd ) {
+      TPI_Run_threads(tpi_execute<WDP>, &wd, 0 );
+    }
+    // 'length' is unused here as well; the reduced value is copied into wd.result.
+    template <class WDP>
+    void parallel_reduce(int length, WDP & wd ) {
+      typedef typename WDP::ReductionType ReductionType;
+      ReductionType result = 0;
+      TPI_Run_threads_reduce(tpi_reduction_work<WDP>, &wd,
+                             tpi_reduction_join<WDP>,
+                             tpi_reduction_init<WDP>, sizeof(result), &result);
+      wd.result = result;
+    }
+
+  private:
+    int numThreads_;
+};
+
+#endif
+
diff --git a/openmp-avx512/basic/TypeTraits.hpp b/openmp-avx512/basic/TypeTraits.hpp
new file mode 100644
index 0000000..3ac472c
--- /dev/null
+++ b/openmp-avx512/basic/TypeTraits.hpp
@@ -0,0 +1,137 @@
+#ifndef _TypeTraits_hpp_
+#define _TypeTraits_hpp_
+
+//@HEADER
+// ************************************************************************
+// 
+//               miniFE: simple finite-element assembly and linear-solve
+//                 Copyright (2006) Sandia Corporation
+// 
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+// 
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//  
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//  
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+
+#include <complex>
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+namespace miniFE {
+//TypeTraits<T>: each specialization provides magnitude_type, name(), and (MPI builds only) mpi_type().
+template<typename T> struct TypeTraits {}; //primary template is empty, so an unsupported T fails at first use of a trait
+
+template<>
+struct TypeTraits<float> {
+  typedef float magnitude_type;
+
+  static const char* name() {return "float";}
+
+#ifdef HAVE_MPI
+  static MPI_Datatype mpi_type() {return MPI_FLOAT;}
+#endif
+};
+
+template<>
+struct TypeTraits<double> {
+  typedef double magnitude_type;
+
+  static const char* name() {return "double";}
+
+#ifdef HAVE_MPI
+  static MPI_Datatype mpi_type() {return MPI_DOUBLE;}
+#endif
+};
+
+template<>
+struct TypeTraits<int> {
+  typedef int magnitude_type;
+
+  static const char* name() {return "int";}
+
+#ifdef HAVE_MPI
+  static MPI_Datatype mpi_type() {return MPI_INT;}
+#endif
+};
+
+template<>
+struct TypeTraits<long int> {
+  typedef long int magnitude_type;
+
+  static const char* name() {return "long int";}
+
+#ifdef HAVE_MPI
+  static MPI_Datatype mpi_type() {return MPI_LONG;}
+#endif
+};
+
+#ifndef MINIFE_NO_LONG_LONG
+
+template<>
+struct TypeTraits<long long> {
+  typedef long long magnitude_type;
+
+  static const char* name() {return "long long";}
+
+#ifdef HAVE_MPI
+  static MPI_Datatype mpi_type() {return MPI_LONG_LONG;}
+#endif
+};
+
+#endif
+
+template<>
+struct TypeTraits<unsigned> {
+  typedef unsigned magnitude_type;
+
+  static const char* name() {return "unsigned";}
+
+#ifdef HAVE_MPI
+  static MPI_Datatype mpi_type() {return MPI_UNSIGNED;}
+#endif
+};
+//For the two complex specializations below, magnitude_type is the underlying real type.
+template<>
+struct TypeTraits<std::complex<float> > {
+  typedef float magnitude_type;
+
+  static const char* name() {return "std::complex<float>";}
+
+#ifdef HAVE_MPI
+  static MPI_Datatype mpi_type() {return MPI_COMPLEX;}
+#endif
+};
+//NOTE(review): MPI_COMPLEX / MPI_DOUBLE_COMPLEX are the Fortran complex datatypes; confirm they suit std::complex on the target MPI.
+template<>
+struct TypeTraits<std::complex<double> > {
+  typedef double magnitude_type;
+
+  static const char* name() {return "std::complex<double>";}
+
+#ifdef HAVE_MPI
+  static MPI_Datatype mpi_type() {return MPI_DOUBLE_COMPLEX;}
+#endif
+};
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/basic/Vector.hpp b/openmp-avx512/basic/Vector.hpp
new file mode 100644
index 0000000..4290ae4
--- /dev/null
+++ b/openmp-avx512/basic/Vector.hpp
@@ -0,0 +1,83 @@
+#ifndef _Vector_hpp_
+#define _Vector_hpp_
+
+//@HEADER
+// ************************************************************************
+// 
+//               miniFE: simple finite-element assembly and linear-solve
+//                 Copyright (2006) Sandia Corporation
+// 
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+// 
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//  
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//  
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+
+#include <vector>
+
+#include <MemInitOp.hpp>
+
+namespace miniFE {
+
+
+//Vector holding a contiguous run of local_size coefficients whose first entry
+//has global index startIndex, plus the compute-node used to run kernels on it.
+template<typename Scalar, typename LocalOrdinal,
+         typename GlobalOrdinal, typename ComputeNode>
+struct Vector {
+  typedef ComputeNode ComputeNodeType;
+  typedef Scalar ScalarType;
+  typedef LocalOrdinal LocalOrdinalType;
+  typedef GlobalOrdinal GlobalOrdinalType;
+
+  //Allocates local_sz coefficients and runs the MemInitOp kernel over them.
+  Vector(GlobalOrdinal startIdx, LocalOrdinal local_sz, ComputeNode& cn)
+   : startIndex(startIdx),
+     local_size(local_sz),
+     coefs(local_sz),
+     compute_node(cn)
+  {
+    MemInitOp<Scalar> mem_init;
+    //&coefs[0] is undefined for an empty vector; pass a null pointer instead.
+    mem_init.ptr = coefs.empty() ? 0 : &coefs[0];
+    mem_init.n = local_size;
+#ifdef MINIFE_HAVE_CUDA
+//we don't want to run this mem-init kernel on cuda, we want
+//to just run it locally on the host.
+    for(size_t i=0; i<mem_init.n; ++i) {
+      mem_init(i);
+    }
+#else
+    cn.parallel_for(local_size, mem_init);
+#endif
+  }
+
+  ~Vector() {}
+
+  GlobalOrdinal startIndex;   //global index of coefs[0]
+  LocalOrdinal local_size;    //number of locally stored entries
+  std::vector<Scalar> coefs;  //the locally stored coefficients
+  ComputeNode& compute_node;  //not owned; must outlive this Vector
+};
+
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/basic/Vector_functions.hpp b/openmp-avx512/basic/Vector_functions.hpp
new file mode 100644
index 0000000..f82866e
--- /dev/null
+++ b/openmp-avx512/basic/Vector_functions.hpp
@@ -0,0 +1,249 @@
+#ifndef _Vector_functions_hpp_
+#define _Vector_functions_hpp_
+
+//@HEADER
+// ************************************************************************
+// 
+//               miniFE: simple finite-element assembly and linear-solve
+//                 Copyright (2006) Sandia Corporation
+// 
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+// 
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//  
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//  
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+
+#include <vector>
+#include <sstream>
+#include <fstream>
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+#ifdef MINIFE_HAVE_TBB
+#include <LockingVector.hpp>
+#endif
+
+#include <TypeTraits.hpp>
+#include <Vector.hpp>
+#include <WaxpbyOp.hpp>
+#include <DotOp.hpp>
+
+
+namespace miniFE {
+
+
+template<typename VectorType>
+void write_vector(const std::string& filename,
+                  const VectorType& vec)
+{ //Writes vec to the text file "<filename>.<numprocs>.<myproc>", one "index value" pair per line.
+  int numprocs = 1, myproc = 0;  //serial defaults, overwritten in MPI builds
+#ifdef HAVE_MPI
+  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+#endif
+  //Every process opens its own output file, named after its rank.
+  std::ostringstream osstr;
+  osstr << filename << "." << numprocs << "." << myproc;
+  std::string full_name = osstr.str();
+  std::ofstream ofs(full_name.c_str());
+
+  typedef typename VectorType::ScalarType ScalarType;
+  //Processes take turns: on iteration p only process p writes, then all meet at the barrier.
+  const std::vector<ScalarType>& coefs = vec.coefs;
+  for(int p=0; p<numprocs; ++p) {
+    if (p == myproc) {
+      if (p == 0) {
+        ofs << vec.local_size << std::endl;
+      }
+      //Only process 0 wrote the size line above; each entry is printed with its global index.
+      typename VectorType::GlobalOrdinalType first = vec.startIndex;
+      for(size_t i=0; i<vec.local_size; ++i) {
+        ofs << first+i << " " << coefs[i] << std::endl;
+      }
+    }
+#ifdef HAVE_MPI
+    MPI_Barrier(MPI_COMM_WORLD);
+#endif
+  }
+}
+
+template<typename VectorType>
+void sum_into_vector(size_t num_indices,
+                     const typename VectorType::GlobalOrdinalType* indices,
+                     const typename VectorType::ScalarType* coefs,
+                     VectorType& vec)
+{
+  //Adds coefs[k] into the entry of 'vec' whose global index is indices[k];
+  //pairs whose index is not stored in this vector are silently skipped.
+  typedef typename VectorType::GlobalOrdinalType GlobalOrdinal;
+
+  //Inclusive range [lo, hi] of global indices held in vec.coefs.
+  const GlobalOrdinal lo = vec.startIndex;
+  const GlobalOrdinal hi = lo + vec.local_size - 1;
+
+  for(size_t k=0; k<num_indices; ++k) {
+    const GlobalOrdinal gid = indices[k];
+    const bool owned = !(gid < lo || gid > hi);
+    if (owned) vec.coefs[static_cast<size_t>(gid - lo)] += coefs[k];
+  }
+}
+
+#ifdef MINIFE_HAVE_TBB
+template<typename VectorType>
+void sum_into_vector(size_t num_indices,
+                     const typename VectorType::GlobalOrdinalType* indices,
+                     const typename VectorType::ScalarType* coefs,
+                     LockingVector<VectorType>& vec)
+{ //Overload for LockingVector: forwards the whole request to vec.sum_in().
+  vec.sum_in(num_indices, indices, coefs);
+}
+#endif
+
+//------------------------------------------------------------
+//Compute the update of a vector with the sum of two scaled vectors where:
+//
+// w = alpha*x + beta*y
+//
+// x,y - input vectors
+//
+// alpha,beta - scalars applied to x and y respectively
+//
+// w - output vector
+//
+template<typename VectorType>
+void
+  waxpby(typename VectorType::ScalarType alpha, const VectorType& x,
+         typename VectorType::ScalarType beta, const VectorType& y,
+         VectorType& w)
+{
+  typedef typename VectorType::ScalarType ScalarType;
+  typedef typename VectorType::ComputeNodeType ComputeNodeType;
+  //The kernel is launched on the compute-node that x refers to.
+  ComputeNodeType& compute_node = x.compute_node;
+
+  WaxpbyOp<ScalarType> waxpbyop;
+  //Fill in the functor: buffers for w, x and y, the two scalars, and the length.
+  waxpbyop.w = compute_node.get_buffer(&w.coefs[0], w.coefs.size());
+  waxpbyop.x = compute_node.get_buffer(&x.coefs[0], x.coefs.size());
+  waxpbyop.y = compute_node.get_buffer(&y.coefs[0], y.coefs.size());
+  waxpbyop.alpha = alpha;
+  waxpbyop.beta  = beta;
+  waxpbyop.n = x.local_size;  //the length of x decides how many entries are updated
+
+#ifdef MINIFE_DEBUG
+  if (y.local_size < x.local_size || w.local_size < x.local_size) {
+    std::cerr << "miniFE::waxpby ERROR, y and w must be at least as long as x." << std::endl;
+    return;
+  }
+#endif
+  //Apply the functor once for each of the n entries.
+  compute_node.parallel_for(waxpbyop.n, waxpbyop);
+}
+
+//Like waxpby above, except operates on two sets of arguments.
+//In other words, performs two waxpby operations in one loop.
+template<typename VectorType>
+void
+  fused_waxpby(typename VectorType::ScalarType alpha, const VectorType& x,
+         typename VectorType::ScalarType beta, const VectorType& y,
+         VectorType& w,
+         typename VectorType::ScalarType alpha2, const VectorType& x2,
+         typename VectorType::ScalarType beta2, const VectorType& y2,
+         VectorType& w2)
+{
+  typedef typename VectorType::ScalarType ScalarType;
+  typedef typename VectorType::ComputeNodeType ComputeNodeType;
+  ComputeNodeType& compute_node = x.compute_node;  //kernel runs on x's compute-node
+  FusedWaxpbyOp<ScalarType> waxpbyop;
+
+  waxpbyop.w = compute_node.get_buffer(&w.coefs[0], w.coefs.size());  //first update: w = alpha*x + beta*y
+  waxpbyop.x = compute_node.get_buffer(&x.coefs[0], x.coefs.size());
+  waxpbyop.y = compute_node.get_buffer(&y.coefs[0], y.coefs.size());
+  waxpbyop.alpha = alpha;
+  waxpbyop.beta  = beta;
+  waxpbyop.w2 = compute_node.get_buffer(&w2.coefs[0], w2.coefs.size());  //second update: w2 = alpha2*x2 + beta2*y2
+  waxpbyop.x2 = compute_node.get_buffer(&x2.coefs[0], x2.coefs.size());
+  waxpbyop.y2 = compute_node.get_buffer(&y2.coefs[0], y2.coefs.size());
+  waxpbyop.alpha2 = alpha2;
+  waxpbyop.beta2  = beta2;
+  waxpbyop.n = x.local_size;  //both updates run over x.local_size entries
+
+#ifdef MINIFE_DEBUG
+  //All six operands are indexed over the same range, so none may be shorter than x.
+  if (y.local_size < x.local_size || w.local_size < x.local_size || x2.local_size < x.local_size ||
+      y2.local_size < x.local_size || w2.local_size < x.local_size) {
+    std::cerr << "miniFE::fused_waxpby ERROR, y, w, x2, y2 and w2 must be at least as long as x." << std::endl;
+    return;
+  }
+#endif
+
+  compute_node.parallel_for(waxpbyop.n, waxpbyop);
+}
+
+//-----------------------------------------------------------
+//Compute the dot product of two vectors where:
+//
+// x,y - input vectors
+//
+// result - return-value
+//
+template<typename Vector>
+typename TypeTraits<typename Vector::ScalarType>::magnitude_type
+  dot(const Vector& x,
+      const Vector& y)
+{
+  size_t n = x.local_size;
+  //In debug builds a too-short y is reported and the reduction is clipped to its length.
+#ifdef MINIFE_DEBUG
+  if (y.local_size < n) {
+    std::cerr << "miniFE::dot ERROR, y must be at least as long as x."<<std::endl;
+    n = y.local_size;
+  }
+#endif
+
+  typedef typename Vector::ScalarType Scalar;
+  typedef typename TypeTraits<typename Vector::ScalarType>::magnitude_type magnitude;
+
+  typedef typename Vector::ComputeNodeType ComputeNodeType;
+
+  ComputeNodeType& compute_node = x.compute_node;
+
+  DotOp<Scalar> dotop;
+  dotop.x = compute_node.get_buffer(&x.coefs[0], x.coefs.size());
+  dotop.y = compute_node.get_buffer(&y.coefs[0], y.coefs.size());
+  dotop.n = n;  //same (possibly clipped) length that parallel_reduce is given below
+
+  compute_node.parallel_reduce(n, dotop);
+  //With MPI, sum the per-process partial results so that every rank returns the global value.
+#ifdef HAVE_MPI
+  magnitude local_dot = dotop.result, global_dot = 0;
+  MPI_Datatype mpi_dtype = TypeTraits<magnitude>::mpi_type();  
+  MPI_Allreduce(&local_dot, &global_dot, 1, mpi_dtype, MPI_SUM, MPI_COMM_WORLD);
+  return global_dot;
+#else
+  return dotop.result;
+#endif
+}
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/basic/WaxpbyOp.hpp b/openmp-avx512/basic/WaxpbyOp.hpp
new file mode 100644
index 0000000..6eaaa6e
--- /dev/null
+++ b/openmp-avx512/basic/WaxpbyOp.hpp
@@ -0,0 +1,43 @@
+#ifndef WAXPBYOP_HPP_
+#define WAXPBYOP_HPP_
+
+#ifndef KERNEL_PREFIX 
+#define KERNEL_PREFIX
+#endif
+
+template <class Scalar>
+struct WaxpbyOp {  //per-index kernel computing w[i] = alpha*x[i] + beta*y[i]
+      Scalar* w;   //output array
+  const Scalar* x; //first input array
+  const Scalar* y; //second input array
+  Scalar alpha, beta;  //scale factors applied to x and y respectively
+  size_t n;  //number of entries; not read by operator() itself
+  KERNEL_PREFIX void operator()(size_t i) const
+  {
+    //here we count on the caller (ComputeNode) to pass in 'i'
+    //that is in the range 0..n-1
+    w[i] = alpha*x[i] + beta*y[i];
+  }
+};
+
+template <class Scalar>
+struct FusedWaxpbyOp {  //per-index kernel performing two independent waxpby updates
+      Scalar* w;   //output of the first update
+  const Scalar* x; //first input of the first update
+  const Scalar* y; //second input of the first update
+  Scalar alpha, beta;  //scale factors applied to x and y
+      Scalar* w2;   //output of the second update
+  const Scalar* x2; //first input of the second update
+  const Scalar* y2; //second input of the second update
+  Scalar alpha2, beta2;  //scale factors applied to x2 and y2
+  size_t n;  //number of entries; not read by operator() itself
+  KERNEL_PREFIX void operator()(size_t i) const
+  {
+    //here we count on the caller (ComputeNode) to pass in 'i'
+    //that is in the range 0..n-1
+    w[i] = alpha*x[i] + beta*y[i];
+    w2[i] = alpha2*x2[i] + beta2*y2[i];
+  }
+};
+
+#endif
diff --git a/openmp-avx512/basic/analytic_soln.hpp b/openmp-avx512/basic/analytic_soln.hpp
new file mode 100644
index 0000000..8dcdfad
--- /dev/null
+++ b/openmp-avx512/basic/analytic_soln.hpp
@@ -0,0 +1,117 @@
+#ifndef _analytic_soln_hpp_
+#define _analytic_soln_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+//               miniFE: simple finite-element assembly and linear-solve
+//                 Copyright (2006) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+
+#include <cmath>
+
+#ifndef MINIFE_SCALAR  /* default scalar type when the build does not choose one */
+#define MINIFE_SCALAR double
+#endif
+
+namespace miniFE {
+
+typedef MINIFE_SCALAR Scalar;  //scalar type used by the analytic-solution functions below
+
+// The 'soln' function below computes the analytic solution for
+// steady state temperature in a brick-shaped domain (formally called
+// a rectangular parallelepiped). The inputs to the function are
+// the x,y,z coordinates of the point at which temperature is to be
+// computed, and the number of terms p,q in the series expansion.
+//
+// The equations used for the temperature solution are equations 9 and 10
+// in section 6.2 of Carslaw & Jaeger, "Conduction of Heat in Solids".
+//
+// The paralellepiped being used is defined by this domain:
+// 0 <= x <= 1.0
+// 0 <= y <= 1.0
+// 0 <= z <= 1.0
+//
+// With boundary conditions prescribing the temperature to be 1.0 on
+// the x==1.0 face, and 0.0 on all other faces.
+//
+// Thus, in the equations from Carslaw & Jaeger, the following constants
+// are used:
+//
+// a == b == c == 1.0  (the extents of the domain)
+// v1 == 0.0           (temperature at x == 0.0)
+// v2 == 1.0           (temperature at x == 1.0)
+//
+
+const Scalar PI = 3.141592653589793238462;  //pi
+const Scalar PI_SQR = PI*PI;                //pi squared
+const Scalar term0 = 16.0/(PI_SQR);         //factor 16/pi^2 that soln() applies to its series sum
+
+inline Scalar fcn_l(int p, int q)
+{ //Returns sqrt((2p+1)^2*pi^2 + (2q+1)^2*pi^2), the coefficient 'l' used by soln().
+  return std::sqrt((2*p+1)*(2*p+1)*PI_SQR + (2*q+1)*(2*q+1)*PI_SQR);
+}
+
+inline Scalar fcn(int n, Scalar u)
+{ //Returns (2n+1)*pi*u, the argument soln() passes to std::sin.
+  return (2*n+1)*PI*u;
+}
+
+inline Scalar soln(Scalar x, Scalar y, Scalar z, int max_p, int max_q)
+{ //Evaluates the double series at (x,y,z) using terms p=0..max_p, q=0..max_q; returns term0*sum.
+  Scalar sum = 0;
+  for(int p=0; p<=max_p; ++p) {
+    const Scalar p21y = fcn(p, y);
+    const Scalar sin_py = std::sin(p21y)/(2*p+1);
+    for(int q=0; q<=max_q; ++q) {
+      const Scalar q21z = fcn(q, z);
+      const Scalar sin_qz = std::sin(q21z)/(2*q+1);
+
+      const Scalar l = fcn_l(p, q);
+
+      const Scalar sinh1 = std::sinh(l*x);
+      const Scalar sinh2 = std::sinh(l);
+      //Term (p,q) of the series: sinh(l*x)/sinh(l) * sin((2p+1)*pi*y)/(2p+1) * sin((2q+1)*pi*z)/(2q+1).
+      const Scalar tmp = (sinh1*sin_py)*(sin_qz/sinh2);
+
+      //if the scalar l gets too big, sinh(l) becomes inf.
+      //if that happens, tmp is a NaN.
+      //crude check for NaN:
+      //if tmp != tmp, tmp is NaN
+      if (tmp == tmp) {
+        sum += tmp;
+      }
+      else {
+        //if we got a NaN, break out of this inner loop and go to
+        //the next iteration of the outer loop.
+        break;
+      }
+    }
+  }
+  return term0*sum;
+}
+
+}//namespace miniFE
+
+#endif /* _analytic_soln_hpp_ */
diff --git a/openmp-avx512/basic/assemble_FE_data.hpp b/openmp-avx512/basic/assemble_FE_data.hpp
new file mode 100644
index 0000000..f34b14a
--- /dev/null
+++ b/openmp-avx512/basic/assemble_FE_data.hpp
@@ -0,0 +1,85 @@
+#ifndef _assemble_FE_data_hpp_
+#define _assemble_FE_data_hpp_
+
+//@HEADER
+// ************************************************************************
+// 
+//               miniFE: simple finite-element assembly and linear-solve
+//                 Copyright (2006) Sandia Corporation
+// 
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+// 
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//  
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//  
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+
+#include <box_utils.hpp>
+#include <simple_mesh_description.hpp>
+
+#ifdef MINIFE_HAVE_TBB
+//#include <perform_element_loop_TBB_pipe.hpp>
+#include <perform_element_loop_TBB_pllfor1.hpp>
+//#include <perform_element_loop_TBB_pllfor2.hpp>
+#else
+#include <perform_element_loop.hpp>
+#endif
+
+namespace miniFE {
+
+template<typename MatrixType,
+         typename VectorType>
+void
+assemble_FE_data(const simple_mesh_description<typename MatrixType::GlobalOrdinalType>& mesh,
+                 MatrixType& A,
+                 VectorType& b,
+                 Parameters& params)
+{ //Runs perform_element_loop over this process's element box, grown by one ghost layer, to fill A and b.
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal;
+  //Upper bound of the global box in each dimension.
+  int global_elems_x = mesh.global_box[0][1];
+  int global_elems_y = mesh.global_box[1][1];
+  int global_elems_z = mesh.global_box[2][1];
+  //Work on a copy so the mesh's own local box is left untouched.
+  Box local_elem_box;
+  copy_box(mesh.local_box, local_elem_box);
+  //Nothing to assemble when the local box contains no ids.
+  if (get_num_ids<GlobalOrdinal>(local_elem_box) < 1) {
+    return;
+  }
+
+  //
+  //We want the element-loop to loop over our (processor-local) domain plus a
+  //ghost layer, so we can assemble the complete linear-system without doing
+  //any communication.
+  //
+  int ghost = 1;
+  if (local_elem_box[0][0] > 0) local_elem_box[0][0] -= ghost;
+  if (local_elem_box[1][0] > 0) local_elem_box[1][0] -= ghost;
+  if (local_elem_box[2][0] > 0) local_elem_box[2][0] -= ghost;
+  if (local_elem_box[0][1] < global_elems_x) local_elem_box[0][1] += ghost;
+  if (local_elem_box[1][1] < global_elems_y) local_elem_box[1][1] += ghost;
+  if (local_elem_box[2][1] < global_elems_z) local_elem_box[2][1] += ghost;
+  //The box is only grown on sides that do not already touch the global boundary.
+  perform_element_loop(mesh, local_elem_box, A, b, params);
+}
+                      
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/basic/box_utils.hpp b/openmp-avx512/basic/box_utils.hpp
new file mode 100644
index 0000000..ee10975
--- /dev/null
+++ b/openmp-avx512/basic/box_utils.hpp
@@ -0,0 +1,199 @@
+#ifndef _box_utils_hpp_
+#define _box_utils_hpp_
+
+//@HEADER
+// ************************************************************************
+// 
+//               miniFE: simple finite-element assembly and linear-solve
+//                 Copyright (2006) Sandia Corporation
+// 
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+// 
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//  
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//  
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+
+#include <vector>
+#include <map>
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+#include <TypeTraits.hpp>
+#include <Box.hpp>
+
+namespace miniFE {
+
+inline void copy_box(const Box& from_box, Box& to_box)
+{
+  //Copy both bounds ([0] and [1]) of each of the three dimensions.
+  for(int dim=0; dim<3; ++dim) {
+    for(int bound=0; bound<2; ++bound) to_box[dim][bound] = from_box[dim][bound];
+  }
+}
+
+template<typename GlobalOrdinal>
+void get_int_coords(GlobalOrdinal ID, int nx, int ny, int nz,
+                int& x, int& y, int& z)
+{ //Splits the 0-based ID = x + nx*y + nx*ny*z into x, y and z (nz is not used).
+  z = ID/(static_cast<GlobalOrdinal>(nx)*ny);  //widen first: nx*ny may overflow int
+  y = (ID%(static_cast<GlobalOrdinal>(nx)*ny))/nx;
+  x = ID%nx;
+}
+
+template<typename GlobalOrdinal,typename Scalar>
+void get_coords(GlobalOrdinal ID, int nx, int ny, int nz,
+                Scalar& x, Scalar& y, Scalar& z)
+{
+  const int xdiv = nx>1 ? nx-1 : 1;
+  const int ydiv = ny>1 ? ny-1 : 1;
+  const int zdiv = nz>1 ? nz-1 : 1;
+  const GlobalOrdinal nxy = static_cast<GlobalOrdinal>(nx)*ny;  //widened: nx*ny may not fit in int
+//This code assumes that ID is 0-based.
+//
+//compute coordinates that lie on (or in) the unit cube.
+//that's why we divide each index by its number of intervals (n-1, or 1 when n<=1):
+  z = (1.0*(ID/nxy))/zdiv;
+  y = 1.0*((ID%nxy)/nx)/ydiv;
+  x = 1.0*(ID%nx)/xdiv;
+}
+
+template<typename GlobalOrdinal>
+GlobalOrdinal get_num_ids(const Box& box)
+{ //Returns the number of ids in 'box': the product of its three extents.
+  const int nx = box[0][1] - box[0][0];
+  const int ny = box[1][1] - box[1][0];
+  const int nz = box[2][1] - box[2][0];
+  GlobalOrdinal tmp = nx;  //widen before multiplying so nx*ny cannot overflow int
+  tmp *= ny;
+  return tmp*nz;
+}
+
+template<typename GlobalOrdinal>
+GlobalOrdinal get_id(int nx, int ny, int nz,
+                     int x, int y, int z)
+{ //Returns the 0-based id of grid point (x,y,z) in an nx*ny*nz grid, or -1 if it lies outside.
+  if (x<0 || y<0 || z<0) return -1;
+  if (x>=nx || y>=ny || z>=nz) return -1;
+
+  //form x + nx*y + nx*ny*z, widening nx to GlobalOrdinal first so that no
+  //product is evaluated in (possibly too narrow) int arithmetic:
+  const GlobalOrdinal wide_nx = nx;
+  GlobalOrdinal tmp = wide_nx*ny;
+  tmp *= z;
+  return tmp + x + wide_nx*y;
+}
+
+template<typename GlobalOrdinal>
+void get_ids(int nx, int ny, int nz,
+             const Box& box,
+             GlobalOrdinal* ids)
+{ //Fills ids[] with get_id() of every point of 'box', x varying fastest, then y, then z.
+  unsigned offset = 0;  //next position to write in ids[]
+  for(int z=box[2][0]; z<box[2][1]; ++z) {
+    for(int y=box[1][0]; y<box[1][1]; ++y) {
+      for(int x=box[0][0]; x<box[0][1]; ++x) {
+        ids[offset++] = get_id<GlobalOrdinal>(nx, ny, nz, x, y, z);
+      }
+    }
+  }
+}
+
+template<typename GlobalOrdinal>
+void create_map_id_to_row(int global_nx, int global_ny, int global_nz,
+                     const Box& box,
+                     std::map<GlobalOrdinal,GlobalOrdinal>& id_to_row)
+{ //Inserts into id_to_row one (id, row) pair for the first id of every run of consecutive ids.
+  GlobalOrdinal num_my_ids = get_num_ids<GlobalOrdinal>(box);
+  GlobalOrdinal my_first_row = 0;
+  //With MPI, my_first_row becomes the total number of ids held by lower-ranked processes.
+#ifdef HAVE_MPI
+  int numprocs = 1, myproc = 0;
+  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+
+  typename std::vector<GlobalOrdinal> tmp_buffer(numprocs, 0);
+  tmp_buffer[myproc] = num_my_ids;
+  typename std::vector<GlobalOrdinal> global_offsets(numprocs);
+  MPI_Datatype mpi_dtype = TypeTraits<GlobalOrdinal>::mpi_type();
+  MPI_Allreduce(&tmp_buffer[0], &global_offsets[0], numprocs, mpi_dtype,
+                MPI_SUM, MPI_COMM_WORLD);
+  GlobalOrdinal offset = 0;
+  for(int i=0; i<numprocs; ++i) {
+    GlobalOrdinal tmp = global_offsets[i];
+    global_offsets[i] = offset;
+    offset += tmp;
+  }
+  //The loop above replaced each per-process count by the sum of the counts before it.
+  my_first_row = global_offsets[myproc];
+#endif
+  //NOTE(review): &all_my_ids[0] is taken even when num_my_ids is 0 - confirm 'box' is never empty here.
+  typename std::vector<GlobalOrdinal> all_my_ids(num_my_ids);
+  get_ids(global_nx, global_ny, global_nz, box, &all_my_ids[0]);
+
+  typename std::vector<GlobalOrdinal> ids;
+  typename std::vector<GlobalOrdinal> rows;
+  //The first local id always starts a run.
+  if (all_my_ids.size() > 0) {
+    ids.push_back(all_my_ids[0]);
+    rows.push_back(my_first_row);
+  }
+  //An id that is not its predecessor+1 starts a new run; its row is my_first_row plus its position.
+  for(size_t i=1; i<all_my_ids.size(); ++i) {
+    if (all_my_ids[i] != all_my_ids[i-1]+1) {
+      ids.push_back(all_my_ids[i]);
+      rows.push_back(my_first_row+i);
+    }
+  }
+  //With MPI, gather the run starts of all processes so every process ends up with the same lists.
+#ifdef HAVE_MPI
+  int len = ids.size();
+  std::vector<int> lengths(numprocs);
+  MPI_Allgather(&len, 1, MPI_INT, &lengths[0], 1, MPI_INT, MPI_COMM_WORLD);
+  //displs[i] = number of entries contributed by processes before i; displ ends as the total.
+  std::vector<int> displs(lengths);
+  int displ = 0;
+  for(int i=0; i<numprocs; ++i) {
+    int tmp = lengths[i];
+    displs[i] = displ;
+    displ += tmp;
+  }
+
+  typename std::vector<GlobalOrdinal> global_ids(displ);
+  typename std::vector<GlobalOrdinal> global_rows(displ);
+  //NOTE(review): &ids[0] / &rows[0] are taken even when this process contributes no entries - verify.
+  MPI_Allgatherv(&ids[0], len, mpi_dtype, &global_ids[0],
+                 &lengths[0], &displs[0], mpi_dtype, MPI_COMM_WORLD);
+  MPI_Allgatherv(&rows[0], len, mpi_dtype, &global_rows[0],
+                 &lengths[0], &displs[0], mpi_dtype, MPI_COMM_WORLD);
+
+  ids = global_ids;
+  rows = global_rows;
+#endif
+  //std::map::insert leaves an existing entry untouched if an id is already present.
+  for(size_t i=0; i<ids.size(); ++i) {
+    id_to_row.insert(std::make_pair(ids[i], rows[i]));
+  }
+}
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/basic/cg_solve.hpp b/openmp-avx512/basic/cg_solve.hpp
new file mode 100644
index 0000000..ccbd9b7
--- /dev/null
+++ b/openmp-avx512/basic/cg_solve.hpp
@@ -0,0 +1,273 @@
+#ifndef _cg_solve_hpp_
+#define _cg_solve_hpp_
+
+//@HEADER
+// ************************************************************************
+// 
+//               miniFE: simple finite-element assembly and linear-solve
+//                 Copyright (2006) Sandia Corporation
+// 
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+// 
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//  
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//  
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+
+#include <cmath>
+#include <limits>
+
+#include <Vector_functions.hpp>
+#include <mytimer.hpp>
+
+#include <outstream.hpp>
+
+namespace miniFE {
+
+template<typename Scalar>
+void print_vec(const std::vector<Scalar>& vec, const std::string& name)
+{ //Prints every entry of vec to std::cout as "<name>[<i>]: <value>", one entry per line.
+  for(size_t i=0; i<vec.size(); ++i) {
+    std::cout << name << "["<<i<<"]: " << vec[i] << std::endl;
+  }
+}
+
+template<typename VectorType>
+bool breakdown(typename VectorType::ScalarType inner,
+               const VectorType& v,
+               const VectorType& w)
+{
+  typedef typename VectorType::ScalarType Scalar;
+  typedef typename TypeTraits<Scalar>::magnitude_type magnitude;
+
+//This is code that was copied from Aztec, and originally written
+//by my hero, Ray Tuminaro.
+//
+//Assuming that inner = <v,w> (inner product of v and w),
+//v and w are considered orthogonal (and true is returned) if
+//  |inner| <= 100 * ||v||_2 * ||w||_2 * epsilon
+//where epsilon is std::numeric_limits<magnitude>::epsilon().
+  magnitude vnorm = std::sqrt(dot(v,v));
+  magnitude wnorm = std::sqrt(dot(w,w));
+  return std::abs(inner) <= 100*vnorm*wnorm*std::numeric_limits<magnitude>::epsilon();
+}
+
+//Solves A*x = b with the conjugate-gradient method. Iteration stops after max_iter
+//steps or once the residual norm (returned in normr) is no larger than tolerance.
+template<typename OperatorType, typename VectorType, typename Matvec>
+void
+cg_solve(OperatorType& A,
+         const VectorType& b,
+         VectorType& x,
+         Matvec matvec,
+         typename OperatorType::LocalOrdinalType max_iter,
+         typename TypeTraits<typename OperatorType::ScalarType>::magnitude_type& tolerance,
+         typename OperatorType::LocalOrdinalType& num_iters,
+         typename TypeTraits<typename OperatorType::ScalarType>::magnitude_type& normr,
+         timer_type* my_cg_times)
+{
+  typedef typename OperatorType::ScalarType ScalarType;
+  typedef typename OperatorType::GlobalOrdinalType GlobalOrdinalType;
+  typedef typename OperatorType::LocalOrdinalType LocalOrdinalType;
+  typedef typename TypeTraits<ScalarType>::magnitude_type magnitude_type;
+
+  timer_type t0 = 0, tWAXPY = 0, tDOT = 0, tMATVEC = 0, tMATVECDOT = 0;
+  timer_type total_time = mytimer();
+
+  int myproc = 0;
+#ifdef HAVE_MPI
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+#endif
+
+  if (!A.has_local_indices) {
+    std::cerr << "miniFE::cg_solve ERROR, A.has_local_indices is false, needs to be true. This probably means "
+       << "miniFE::make_local_matrix(A) was not called prior to calling miniFE::cg_solve."
+       << std::endl;
+    return;
+  }
+
+  size_t nrows = A.rows.size();
+  LocalOrdinalType ncols = A.num_cols;
+
+  VectorType r(b.startIndex, nrows, b.compute_node);
+  VectorType p(0, ncols, b.compute_node);
+  VectorType Ap(b.startIndex, nrows, b.compute_node);
+
+  normr = 0;
+  magnitude_type rtrans = 0;
+  magnitude_type oldrtrans = 0;
+
+  LocalOrdinalType print_freq = max_iter/10;
+  if (print_freq>50) print_freq = 50;
+  if (print_freq<1)  print_freq = 1;
+
+  ScalarType one = 1.0;
+  ScalarType zero = 0.0;
+
+  typedef typename VectorType::ComputeNodeType ComputeNodeType;
+  ComputeNodeType& compute_node = x.compute_node;
+
+  //The following lines that create and initialize buffers are no-ops in many
+  //cases, but perform actual allocations and copies if an off-cpu device such
+  //as a GPU is being used by compute_node.
+
+  //Do any required allocations for buffers that will be needed during CG:
+  ScalarType* d_x = compute_node.get_buffer(&x.coefs[0], x.coefs.size());
+  ScalarType* d_p = compute_node.get_buffer(&p.coefs[0], p.coefs.size());
+  ScalarType* d_b = compute_node.get_buffer(&b.coefs[0], b.coefs.size());
+  ScalarType* d_Ap = compute_node.get_buffer(&Ap.coefs[0], Ap.coefs.size());
+  ScalarType* d_r  = compute_node.get_buffer(&r.coefs[0], r.coefs.size());
+#ifdef MINIFE_CSR_MATRIX
+  LocalOrdinalType* d_Arowoff = compute_node.get_buffer(&A.row_offsets[0], A.row_offsets.size());
+  GlobalOrdinalType* d_Acols   = compute_node.get_buffer(&A.packed_cols[0], A.packed_cols.size());
+  ScalarType* d_Acoefs  = compute_node.get_buffer(&A.packed_coefs[0], A.packed_coefs.size());
+#endif
+#ifdef MINIFE_ELL_MATRIX
+  GlobalOrdinalType* d_Acols   = compute_node.get_buffer(&A.cols[0], A.cols.size());
+  ScalarType* d_Acoefs  = compute_node.get_buffer(&A.coefs[0], A.coefs.size());
+#endif
+
+  //Copy data to buffers that need to be initialized from input data:
+  compute_node.copy_to_buffer(&x.coefs[0], x.coefs.size(), d_x);
+  compute_node.copy_to_buffer(&b.coefs[0], b.coefs.size(), d_b);
+#ifdef MINIFE_CSR_MATRIX
+  compute_node.copy_to_buffer(&A.row_offsets[0], A.row_offsets.size(), d_Arowoff);
+  compute_node.copy_to_buffer(&A.packed_cols[0], A.packed_cols.size(), d_Acols);
+  compute_node.copy_to_buffer(&A.packed_coefs[0], A.packed_coefs.size(), d_Acoefs);
+#endif
+#ifdef MINIFE_ELL_MATRIX
+  compute_node.copy_to_buffer(&A.cols[0], A.cols.size(), d_Acols);
+  compute_node.copy_to_buffer(&A.coefs[0], A.coefs.size(), d_Acoefs);
+#endif
+
+  TICK(); waxpby(one, x, zero, x, p); TOCK(tWAXPY);
+
+  compute_node.copy_from_buffer(&p.coefs[0], p.coefs.size(), d_p);
+//  print_vec(p.coefs, "p");
+
+  TICK();
+  matvec(A, p, Ap);
+  TOCK(tMATVEC);
+
+  TICK(); waxpby(one, b, -one, Ap, r); TOCK(tWAXPY);
+
+//  if (b.coefs.size() == r.coefs.size()) std::cout << "b.size == r.size" << std::endl;
+//  else std::cout << "b.size != r.size" << std::endl;
+//  if (b.coefs == r.coefs) std::cout << "b == r" << std::endl;
+//  else std::cout << "b != r" << std::endl;
+//  compute_node.copy_from_buffer(&r.coefs[0], r.coefs.size(), d_r);
+//  print_vec(b.coefs, "b");
+//  print_vec(r.coefs, "r");
+
+  TICK(); rtrans = dot(r, r); TOCK(tDOT);
+
+//std::cout << "rtrans="<<rtrans<<std::endl;
+
+  normr = std::sqrt(rtrans);
+
+  if (myproc == 0) {
+    std::cout << "Initial Residual = "<< normr << std::endl;
+  }
+
+  magnitude_type brkdown_tol = std::numeric_limits<magnitude_type>::epsilon();
+
+#ifdef MINIFE_DEBUG
+  std::ostream& os = outstream();
+  os << "brkdown_tol = " << brkdown_tol << std::endl;
+#endif
+
+
+  for(LocalOrdinalType k=1; k <= max_iter && normr > tolerance; ++k) {
+    if (k == 1) {
+      TICK(); waxpby(one, r, zero, r, p); TOCK(tWAXPY);
+    }
+    else {
+      oldrtrans = rtrans;
+      TICK(); rtrans = dot(r, r); TOCK(tDOT);
+      magnitude_type beta = rtrans/oldrtrans;
+      TICK(); waxpby(one, r, beta, p, p); TOCK(tWAXPY);
+    }
+
+    normr = std::sqrt(rtrans);
+
+    if (myproc == 0 && (k%print_freq==0 || k==max_iter)) {
+      std::cout << "Iteration = "<<k<<"   Residual = "<<normr<<std::endl;
+    }
+
+    magnitude_type alpha = 0;
+    magnitude_type p_ap_dot = 0;
+
+#ifdef MINIFE_FUSED
+    TICK();
+    p_ap_dot = matvec_and_dot(A, p, Ap);
+    TOCK(tMATVECDOT);
+#else
+    TICK(); matvec(A, p, Ap); TOCK(tMATVEC);
+
+    TICK(); p_ap_dot = dot(Ap, p); TOCK(tDOT);
+#endif
+
+#ifdef MINIFE_DEBUG
+    os << "iter " << k << ", p_ap_dot = " << p_ap_dot;
+    os.flush();
+#endif
+    if (p_ap_dot < brkdown_tol) {
+      if (p_ap_dot < 0 || breakdown(p_ap_dot, Ap, p)) {
+        std::cerr << "miniFE::cg_solve ERROR, numerical breakdown!"<<std::endl;
+#ifdef MINIFE_DEBUG
+        os << "ERROR, numerical breakdown!"<<std::endl;
+#endif
+        //update all timers (including the fused matvec+dot one) before jumping out.
+        my_cg_times[WAXPY] = tWAXPY; my_cg_times[DOT] = tDOT;
+        my_cg_times[MATVEC] = tMATVEC;
+        my_cg_times[MATVECDOT] = tMATVECDOT;
+        my_cg_times[TOTAL] = mytimer() - total_time;
+        return;
+      }
+      else brkdown_tol = 0.1 * p_ap_dot;
+    }
+    alpha = rtrans/p_ap_dot;
+#ifdef MINIFE_DEBUG
+    os << ", rtrans = " << rtrans << ", alpha = " << alpha << std::endl;
+#endif
+
+#ifdef MINIFE_FUSED
+    TICK();
+    fused_waxpby(one, x, alpha, p, x, one, r, -alpha, Ap, r);
+    TOCK(tWAXPY);
+#else
+    TICK(); waxpby(one, x, alpha, p, x);
+            waxpby(one, r, -alpha, Ap, r); TOCK(tWAXPY);
+#endif
+
+    num_iters = k;
+  }
+
+  compute_node.copy_from_buffer(&x.coefs[0], x.coefs.size(), d_x);
+
+  my_cg_times[WAXPY] = tWAXPY;
+  my_cg_times[DOT] = tDOT;
+  my_cg_times[MATVEC] = tMATVEC;
+  my_cg_times[MATVECDOT] = tMATVECDOT;
+  my_cg_times[TOTAL] = mytimer() - total_time;
+}
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/basic/compute_matrix_stats.hpp b/openmp-avx512/basic/compute_matrix_stats.hpp
new file mode 100644
index 0000000..f035eec
--- /dev/null
+++ b/openmp-avx512/basic/compute_matrix_stats.hpp
@@ -0,0 +1,114 @@
+#ifndef _compute_matrix_stats_hpp_
+#define _compute_matrix_stats_hpp_
+
+//@HEADER
+// ************************************************************************
+// 
+//               miniFE: simple finite-element assembly and linear-solve
+//                 Copyright (2006) Sandia Corporation
+// 
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+// 
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//  
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//  
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+
+#include <cstddef>
+#include <cmath>
+#include <cstdlib>
+#include <iostream>
+#include <sstream>
+#include <iomanip>
+
+#include <outstream.hpp>
+#include <utils.hpp>
+#include <YAML_Doc.hpp>
+
+namespace miniFE {
+
+template<typename MatrixType>
+size_t
+compute_matrix_stats(const MatrixType& A, int myproc, int numprocs, YAML_Doc& ydoc)
+{
+  //Collects row and nonzero counts for A via get_global_min_max, writes them
+  //(with a storage estimate) under "Matrix attributes" in ydoc when myproc is 0,
+  //and returns the rounded-up global nonzero total to every caller.
+  typedef typename MatrixType::GlobalOrdinalType GO;
+  typedef typename MatrixType::LocalOrdinalType LO;
+  typedef typename MatrixType::ScalarType SC;
+
+  //Row counts: local value in; total, min and max come back by reference.
+  GO rows_here = A.rows.size();
+  GO rows_total = 0, rows_min = 0, rows_max = 0;
+  int proc_of_min = 0, proc_of_max = 0;
+  get_global_min_max(rows_here, rows_total, rows_min, proc_of_min,
+                     rows_max, proc_of_max);
+
+  //Nonzero counts go through the same helper, carried as doubles.
+  double nnz_here = A.num_nonzeros();
+  double nnz_total = 0, nnz_min = 0, nnz_max = 0;
+  get_global_min_max(nnz_here, nnz_total, nnz_min, proc_of_min,
+                     nnz_max, proc_of_max);
+
+  double rows_avg = rows_total;
+  rows_avg /= numprocs;
+  double nnz_avg = nnz_total;
+  nnz_avg /= numprocs;
+
+  double overhead_MB = parallel_memory_overhead_MB(A);
+
+  size_t nnz_total_up = static_cast<size_t>(std::ceil(nnz_total));
+  size_t nnz_min_up = static_cast<size_t>(std::ceil(nnz_min));
+  size_t nnz_max_up = static_cast<size_t>(std::ceil(nnz_max));
+
+  //Only proc 0 records anything in the YAML document.
+  if (myproc != 0) {
+    return nnz_total_up;
+  }
+
+  //Estimated storage for the whole matrix, in GB:
+  //  sizeof(GO)*rows_total    for A.rows
+  //+ sizeof(LO)*rows_total    for A.row_offsets
+  //+ sizeof(GO)*nnz_total_up  for A.packed_cols
+  //+ sizeof(SC)*nnz_total_up  for A.packed_coefs
+  const double per_GB = 1.0/(1024*1024*1024);
+  double matrix_GB = per_GB*rows_total*sizeof(GO);
+  matrix_GB += per_GB*rows_total*sizeof(LO);
+  matrix_GB += per_GB*nnz_total_up*sizeof(GO);
+  matrix_GB += per_GB*nnz_total_up*sizeof(SC);
+
+  ydoc.add("Matrix attributes","");
+  ydoc.get("Matrix attributes")->add("Global Nrows",rows_total);
+  ydoc.get("Matrix attributes")->add("Global NNZ",nnz_total_up);
+  ydoc.get("Matrix attributes")->add("Global Memory (GB)",matrix_GB);
+  ydoc.get("Matrix attributes")->add("Pll Memory Overhead (MB)",overhead_MB);
+  ydoc.get("Matrix attributes")->add("Rows per proc MIN",rows_min);
+  ydoc.get("Matrix attributes")->add("Rows per proc MAX",rows_max);
+  ydoc.get("Matrix attributes")->add("Rows per proc AVG",rows_avg);
+  ydoc.get("Matrix attributes")->add("NNZ per proc MIN",nnz_min_up);
+  ydoc.get("Matrix attributes")->add("NNZ per proc MAX",nnz_max_up);
+  ydoc.get("Matrix attributes")->add("NNZ per proc AVG",nnz_avg);
+
+  return nnz_total_up;
+}
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/basic/driver.hpp b/openmp-avx512/basic/driver.hpp
new file mode 100644
index 0000000..d3966eb
--- /dev/null
+++ b/openmp-avx512/basic/driver.hpp
@@ -0,0 +1,403 @@
+#ifndef _driver_hpp_
+#define _driver_hpp_
+
+//@HEADER
+// ************************************************************************
+// 
+//               miniFE: simple finite-element assembly and linear-solve
+//                 Copyright (2006) Sandia Corporation
+// 
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+// 
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//  
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//  
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+
+#include <cstddef>
+#include <cmath>
+#include <cstdlib>
+#include <iostream>
+#include <sstream>
+#include <iomanip>
+
+#include <box_utils.hpp>
+#include <Vector.hpp>
+
+#ifdef MINIFE_CSR_MATRIX
+#include <CSRMatrix.hpp>
+#elif defined(MINIFE_ELL_MATRIX)
+#include <ELLMatrix.hpp>
+#else
+#include <CSRMatrix.hpp>
+#endif
+
+#include <simple_mesh_description.hpp>
+
+#include <SparseMatrix_functions.hpp>
+
+#include <generate_matrix_structure.hpp>
+#include <assemble_FE_data.hpp>
+
+#include <verify_solution.hpp>
+
+#include <compute_matrix_stats.hpp>
+#include <make_local_matrix.hpp>
+#include <imbalance.hpp>
+#include <cg_solve.hpp>
+#if MINIFE_KERNELS != 0
+#include <time_kernels.hpp>
+#endif
+#include <outstream.hpp>
+#include <utils.hpp>
+#include <mytimer.hpp>
+#include <YAML_Doc.hpp>
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+#define RUN_TIMED_FUNCTION(msg, fn, time_inc, time_total) \
+{ /* Prints msg on proc 0, runs fn, stores its elapsed */ \
+  /* time in time_inc and adds it to time_total; the   */ \
+  /* expansion expects myproc to be in scope.          */ \
+  if (0 == myproc) {                                      \
+    std::cout << std::setw(30) << msg << std::flush;      \
+  }                                                       \
+  timer_type rtf_begin = mytimer();                       \
+  fn;                                                     \
+  time_inc = mytimer() - rtf_begin;                       \
+  time_total += time_inc;                                 \
+  if (0 == myproc) {                                      \
+    std::cout << time_inc << "s, total time: " << time_total << std::endl; \
+  }                                                       \
+}
+
+//This program assembles finite-element matrices into a global matrix and
+//vector, then solves the linear-system using Conjugate Gradients.
+//Each finite-element is a hexahedron with 8 vertex-nodes.
+//
+//Notes:
+//- In finite-element terms, the box dimensions are in elements, not nodes.
+//  In other words, a 2x2x2 box describes 8 elements, each of which has 8 nodes,
+//  so it is a 3x3x3 node domain (27 nodes).
+//  The assembled linear system will have 1 equation for each finite element node.
+//
+//- The coordinate origin is at the corner of the global box where x=0,
+//  y=0, z=0, and the box extends along the positive x-axis, positive y-axis,
+//  and the positive z-axis.
+//
+//- Some aspects of matrix-structure generation and finite-element assembly
+//  are convenient to do using global node identifiers.
+//  A global identifier for each node is obtained from coordinates plus
+//  global box dimensions. See the function 'get_id' in box_utils.hpp.
+//
+//- Each node corresponds to a row in the matrix. The RCB partitioning method
+//  we use to split the global box among processors results in some
+//  processors owning non-contiguous blocks of global node identifiers.
+//  Since it is convenient for matrices and vectors to store contiguously-
+//  numbered blocks of rows, we map global node identifiers to a separate
+//  space of row numbers such that each processor's nodes correspond to a
+//  contiguous block of row numbers.
+//
+
+namespace miniFE {
+
+//driver: builds the mesh description and matrix structure, assembles FE data,
+//imposes Dirichlet BCs, makes matrix indices local, then runs cg_solve (or
+//time_kernels when MINIFE_KERNELS != 0) and records results in ydoc on proc 0.
+template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename ComputeNodeType>
+void
+driver(const Box& global_box, Box& my_box, ComputeNodeType& compute_node,
+       Parameters& params, YAML_Doc& ydoc)
+{
+  int global_nx = global_box[0][1]; //upper extents of the global box (in elements)
+  int global_ny = global_box[1][1];
+  int global_nz = global_box[2][1];
+
+  int numprocs = 1, myproc = 0;
+#ifdef HAVE_MPI
+  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+#endif
+
+  if (params.load_imbalance > 0) {
+    add_imbalance<GlobalOrdinal>(global_box, my_box, params.load_imbalance, ydoc);
+  }
+
+  float largest_imbalance = 0, std_dev = 0;
+  compute_imbalance<GlobalOrdinal>(global_box, my_box, largest_imbalance,
+                                   std_dev, ydoc, true);
+
+
+  //Create a representation of the mesh:
+  //Note that 'simple_mesh_description' is a virtual or conceptual
+  //mesh that doesn't actually store mesh data.
+
+  if (myproc==0) {
+    std::cout.width(30);
+    std::cout << "creating/filling mesh...";
+    std::cout.flush();
+  }
+
+  timer_type t_start = mytimer();
+  timer_type t0 = mytimer();
+
+  simple_mesh_description<GlobalOrdinal> mesh(global_box, my_box);
+
+  timer_type mesh_fill = mytimer() - t0;
+  timer_type t_total = mytimer() - t_start;
+
+  if (myproc==0) {
+    std::cout << mesh_fill << "s, total time: " << t_total << std::endl;
+  }
+
+  //next we will generate the matrix structure.
+
+  //Declare matrix object:
+
+#ifdef MINIFE_CSR_MATRIX
+  typedef CSRMatrix<Scalar,LocalOrdinal,GlobalOrdinal,ComputeNodeType> MatrixType;
+#elif defined(MINIFE_ELL_MATRIX)
+  typedef ELLMatrix<Scalar,LocalOrdinal,GlobalOrdinal,ComputeNodeType> MatrixType;
+#else
+  typedef CSRMatrix<Scalar,LocalOrdinal,GlobalOrdinal,ComputeNodeType> MatrixType;
+#endif
+
+  MatrixType A(compute_node);
+  //NOTE: the int status returned by generate_matrix_structure is discarded below.
+  timer_type gen_structure;
+  RUN_TIMED_FUNCTION("generating matrix structure...",
+                     generate_matrix_structure(mesh, A),
+                     gen_structure, t_total);
+
+  GlobalOrdinal local_nrows = A.rows.size();
+  GlobalOrdinal my_first_row = local_nrows > 0 ? A.rows[0] : -1;
+
+  Vector<Scalar,LocalOrdinal,GlobalOrdinal,ComputeNodeType> b(my_first_row, local_nrows,compute_node);
+  Vector<Scalar,LocalOrdinal,GlobalOrdinal,ComputeNodeType> x(my_first_row, local_nrows,compute_node);
+
+  //Assemble finite-element sub-matrices and sub-vectors into the global
+  //linear system:
+
+  timer_type fe_assembly;
+  RUN_TIMED_FUNCTION("assembling FE data...",
+                     assemble_FE_data(mesh, A, b, params),
+                     fe_assembly, t_total);
+
+  if (myproc == 0) {
+    ydoc.add("Matrix structure generation","");
+    ydoc.get("Matrix structure generation")->add("Mat-struc-gen Time",gen_structure);
+    ydoc.add("FE assembly","");
+    ydoc.get("FE assembly")->add("FE assembly Time",fe_assembly);
+  }
+
+#ifdef MINIFE_DEBUG
+  write_matrix("A_prebc.mtx", A);
+  write_vector("b_prebc.vec", b);
+#endif
+
+  //Now apply dirichlet boundary-conditions
+  //(Apply the 0-valued surfaces first, then the 1-valued surface last.)
+  //Both calls add to t_total; dirbc_time ends up holding only the second call's time.
+  timer_type dirbc_time;
+  RUN_TIMED_FUNCTION("imposing Dirichlet BC...",
+            impose_dirichlet(0.0, A, b, global_nx+1, global_ny+1, global_nz+1, mesh.bc_rows_0), dirbc_time, t_total);
+  RUN_TIMED_FUNCTION("imposing Dirichlet BC...",
+            impose_dirichlet(1.0, A, b, global_nx+1, global_ny+1, global_nz+1, mesh.bc_rows_1), dirbc_time, t_total);
+
+#ifdef MINIFE_DEBUG
+  write_matrix("A.mtx", A);
+  write_vector("b.vec", b);
+#endif
+
+  //Transform global indices to local, set up communication information:
+
+  timer_type make_local_time;
+  RUN_TIMED_FUNCTION("making matrix indices local...",
+                     make_local_matrix(A),
+                     make_local_time, t_total);
+
+#ifdef MINIFE_DEBUG
+  write_matrix("A_local.mtx", A);
+  write_vector("b_local.vec", b);
+#endif
+
+  size_t global_nnz = compute_matrix_stats(A, myproc, numprocs, ydoc);
+
+  //Prepare to perform conjugate gradient solve:
+
+  LocalOrdinal max_iters = 50;
+  LocalOrdinal num_iters = 0;
+  typedef typename TypeTraits<Scalar>::magnitude_type magnitude;
+  magnitude rnorm = 0;
+  magnitude tol = std::numeric_limits<magnitude>::epsilon();
+  //Zero-filled so timers the solver never writes are still defined when reported below.
+  timer_type cg_times[NUM_TIMERS] = {};
+
+  typedef Vector<Scalar,LocalOrdinal,GlobalOrdinal,ComputeNodeType> VectorType;
+
+  t_total = mytimer() - t_start;
+
+  bool matvec_with_comm_overlap = params.mv_overlap_comm_comp==1;
+
+#if MINIFE_KERNELS != 0
+  if (myproc==0) {
+    std::cout.width(30);
+    std::cout << "Starting kernel timing loops ..." << std::endl;
+  }
+
+  max_iters = 500;
+  x.coefs[0] = 0.9;
+  if (matvec_with_comm_overlap) {
+    time_kernels(A, b, x, matvec_overlap<MatrixType,VectorType>(), max_iters, rnorm, cg_times);
+  }
+  else {
+    time_kernels(A, b, x, matvec_std<MatrixType,VectorType>(), max_iters, rnorm, cg_times);
+  }
+  num_iters = max_iters;
+  std::string title("Kernel timings");
+#else
+  if (myproc==0) {
+    std::cout << "Starting CG solver ... " << std::endl;
+  }
+
+  if (matvec_with_comm_overlap) {
+#ifdef MINIFE_CSR_MATRIX
+    rearrange_matrix_local_external(A);
+    cg_solve(A, b, x, matvec_overlap<MatrixType,VectorType>(), max_iters, tol,
+           num_iters, rnorm, cg_times);
+#else
+    std::cout << "ERROR, matvec with overlapping comm/comp only works with CSR matrix."<<std::endl;
+#endif
+  }
+  else {
+    cg_solve(A, b, x, matvec_std<MatrixType,VectorType>(), max_iters, tol,
+           num_iters, rnorm, cg_times);
+    if (myproc == 0) {
+      std::cout << "Final Resid Norm: " << rnorm << std::endl;
+    }
+
+#ifdef MINIFE_DEBUG
+    if (myproc == 0) {
+      std::cout << "verifying solution..." << std::endl;
+    }
+    verify_solution(mesh, x);
+#endif
+  }
+
+#ifdef MINIFE_DEBUG
+  write_vector("x.vec", x);
+#endif
+  std::string title("CG solve");
+#endif
+
+  if (myproc == 0) {
+    ydoc.get("Global Run Parameters")->add("ScalarType",TypeTraits<Scalar>::name());
+    ydoc.get("Global Run Parameters")->add("GlobalOrdinalType",TypeTraits<GlobalOrdinal>::name());
+    ydoc.get("Global Run Parameters")->add("LocalOrdinalType",TypeTraits<LocalOrdinal>::name());
+    ydoc.add(title,"");
+    ydoc.get(title)->add("Iterations",num_iters);
+    ydoc.get(title)->add("Final Resid Norm",rnorm);
+
+    GlobalOrdinal global_nrows = global_nx;
+    global_nrows = global_nrows*global_ny*global_nz; //multiply in GlobalOrdinal, not int
+
+    //flops-per-mv, flops-per-dot, flops-per-waxpy:
+    double mv_flops = global_nnz*2.0;
+    double dot_flops = global_nrows*2.0;
+    double waxpy_flops = global_nrows*3.0;
+
+#if MINIFE_KERNELS == 0
+//if MINIFE_KERNELS == 0 then we did a CG solve, and in that case
+//there were num_iters+1 matvecs, num_iters*2 dots, and num_iters*3+2 waxpys.
+    mv_flops *= (num_iters+1);
+    dot_flops *= (2*num_iters);
+    waxpy_flops *= (3*num_iters+2);
+#else
+//if MINIFE_KERNELS then we did one of each operation per iteration.
+    mv_flops *= num_iters;
+    dot_flops *= num_iters;
+    waxpy_flops *= num_iters;
+#endif
+
+    double total_flops = mv_flops + dot_flops + waxpy_flops;
+
+    double mv_mflops = -1;
+    if (cg_times[MATVEC] > 1.e-4)
+      mv_mflops = 1.e-6 * (mv_flops/cg_times[MATVEC]);
+
+    double dot_mflops = -1;
+    if (cg_times[DOT] > 1.e-4)
+      dot_mflops = 1.e-6 * (dot_flops/cg_times[DOT]);
+
+    double waxpy_mflops = -1;
+    if (cg_times[WAXPY] > 1.e-4)
+      waxpy_mflops = 1.e-6 *  (waxpy_flops/cg_times[WAXPY]);
+
+    double total_mflops = -1;
+    if (cg_times[TOTAL] > 1.e-4)
+      total_mflops = 1.e-6 * (total_flops/cg_times[TOTAL]);
+
+    ydoc.get(title)->add("WAXPY Time",cg_times[WAXPY]);
+    ydoc.get(title)->add("WAXPY Flops",waxpy_flops);
+    if (waxpy_mflops >= 0)
+      ydoc.get(title)->add("WAXPY Mflops",waxpy_mflops);
+    else
+      ydoc.get(title)->add("WAXPY Mflops","inf");
+
+    ydoc.get(title)->add("DOT Time",cg_times[DOT]);
+    ydoc.get(title)->add("DOT Flops",dot_flops);
+    if (dot_mflops >= 0)
+      ydoc.get(title)->add("DOT Mflops",dot_mflops);
+    else
+      ydoc.get(title)->add("DOT Mflops","inf");
+
+    ydoc.get(title)->add("MATVEC Time",cg_times[MATVEC]);
+    ydoc.get(title)->add("MATVEC Flops",mv_flops);
+    if (mv_mflops >= 0)
+      ydoc.get(title)->add("MATVEC Mflops",mv_mflops);
+    else
+      ydoc.get(title)->add("MATVEC Mflops","inf");
+
+#ifdef MINIFE_FUSED
+    //Rate the fused kernel with its own timer: cg_solve times it into MATVECDOT, not MATVEC.
+    double mvdot_mflops = cg_times[MATVECDOT] > 1.e-4 ? 1.e-6 * (mv_flops/cg_times[MATVECDOT]) : -1;
+    ydoc.get(title)->add("MATVECDOT Time",cg_times[MATVECDOT]);
+    ydoc.get(title)->add("MATVECDOT Flops",mv_flops);
+    if (mvdot_mflops >= 0) ydoc.get(title)->add("MATVECDOT Mflops",mvdot_mflops);
+    else ydoc.get(title)->add("MATVECDOT Mflops","inf");
+#endif
+
+#if MINIFE_KERNELS == 0
+    ydoc.get(title)->add("Total","");
+    ydoc.get(title)->get("Total")->add("Total CG Time",cg_times[TOTAL]);
+    ydoc.get(title)->get("Total")->add("Total CG Flops",total_flops);
+    if (total_mflops >= 0)
+      ydoc.get(title)->get("Total")->add("Total CG Mflops",total_mflops);
+    else
+      ydoc.get(title)->get("Total")->add("Total CG Mflops","inf");
+    ydoc.get(title)->add("Time per iteration",cg_times[TOTAL]/num_iters);
+#endif
+  }
+}
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/basic/exchange_externals.hpp b/openmp-avx512/basic/exchange_externals.hpp
new file mode 100644
index 0000000..167ba1b
--- /dev/null
+++ b/openmp-avx512/basic/exchange_externals.hpp
@@ -0,0 +1,270 @@
+#ifndef _exchange_externals_hpp_
+#define _exchange_externals_hpp_
+
+//@HEADER
+// ************************************************************************
+// 
+//               miniFE: simple finite-element assembly and linear-solve
+//                 Copyright (2006) Sandia Corporation
+// 
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+// 
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//  
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//  
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+
+#include <cstdlib>
+#include <iostream>
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+#include <outstream.hpp>
+
+#include <TypeTraits.hpp>
+
+namespace miniFE {
+
+//exchange_externals: copies the x entries listed in A.elements_to_send to the neighbor procs and
+//receives their entries into x.coefs from index A.rows.size() on; no-op without MPI or with <2 procs.
+template<typename MatrixType, typename VectorType>
+void
+exchange_externals(MatrixType& A, VectorType& x)
+{
+#ifdef HAVE_MPI
+#ifdef MINIFE_DEBUG
+  std::ostream& os = outstream();
+  os << "entering exchange_externals\n";
+#endif
+
+  int numprocs = 1;
+  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+
+  if (numprocs < 2) return;
+
+  typedef typename MatrixType::ScalarType Scalar;
+  typedef typename MatrixType::LocalOrdinalType LocalOrdinal;
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal;
+
+  // Extract Matrix pieces
+
+  int local_nrow = A.rows.size();
+  int num_neighbors = A.neighbors.size();
+  const std::vector<LocalOrdinal>& recv_length = A.recv_length;
+  const std::vector<LocalOrdinal>& send_length = A.send_length;
+  const std::vector<int>& neighbors = A.neighbors;
+  const std::vector<GlobalOrdinal>& elements_to_send = A.elements_to_send;
+
+  std::vector<Scalar>& send_buffer = A.send_buffer;
+
+  //
+  // first post receives, these are immediate receives
+  // Do not wait for result to come, will do that at the
+  // wait call below.
+  //
+
+  int MPI_MY_TAG = 99;
+
+  std::vector<MPI_Request>& request = A.request; //not resized here; presumably sized elsewhere - confirm
+
+  //
+  // Externals are at end of locals
+  //
+
+  std::vector<Scalar>& x_coefs = x.coefs;
+  Scalar* x_external = x_coefs.empty() ? 0 : &x_coefs[0] + local_nrow; //avoids operator[] at size()
+
+  MPI_Datatype mpi_dtype = TypeTraits<Scalar>::mpi_type();
+
+  // Post receives first
+  for(int i=0; i<num_neighbors; ++i) {
+    int n_recv = recv_length[i];
+    MPI_Irecv(x_external, n_recv, mpi_dtype, neighbors[i], MPI_MY_TAG,
+              MPI_COMM_WORLD, &request[i]);
+    x_external += n_recv;
+  }
+
+#ifdef MINIFE_DEBUG
+  os << "launched recvs\n";
+#endif
+
+  //
+  // Fill up send buffer
+  //
+
+  size_t total_to_be_sent = elements_to_send.size();
+#ifdef MINIFE_DEBUG
+  os << "total_to_be_sent: " << total_to_be_sent << std::endl;
+#endif
+
+  for(size_t i=0; i<total_to_be_sent; ++i) {
+#ifdef MINIFE_DEBUG
+    //expensive index range-check (valid indices are 0 .. size()-1):
+    if (elements_to_send[i] < 0 || static_cast<size_t>(elements_to_send[i]) >= x.coefs.size()) {
+      os << "error, out-of-range. x.coefs.size()=="<<x.coefs.size()<<", elements_to_send[i]=="<<elements_to_send[i]<<std::endl;
+    }
+#endif
+    send_buffer[i] = x.coefs[elements_to_send[i]];
+  }
+
+  //
+  // Send to each neighbor
+  //
+
+  Scalar* s_buffer = send_buffer.empty() ? 0 : &send_buffer[0]; //no operator[] on an empty buffer
+
+  for(int i=0; i<num_neighbors; ++i) {
+    int n_send = send_length[i];
+    MPI_Send(s_buffer, n_send, mpi_dtype, neighbors[i], MPI_MY_TAG,
+             MPI_COMM_WORLD);
+    s_buffer += n_send;
+  }
+
+#ifdef MINIFE_DEBUG
+  os << "send to " << num_neighbors << std::endl;
+#endif
+
+  //
+  // Complete the reads issued above
+  //
+
+  MPI_Status status;
+  for(int i=0; i<num_neighbors; ++i) {
+    if (MPI_Wait(&request[i], &status) != MPI_SUCCESS) {
+      std::cerr << "MPI_Wait error\n"<<std::endl;
+      MPI_Abort(MPI_COMM_WORLD, -1);
+    }
+  }
+
+#ifdef MINIFE_DEBUG
+  os << "leaving exchange_externals"<<std::endl;
+#endif
+
+//endif HAVE_MPI
+#endif
+}
+
+#ifdef HAVE_MPI
+static std::vector<MPI_Request> exch_ext_requests; //receive requests posted by begin_exchange_externals and waited on by finish_exchange_externals
+#endif //NOTE(review): 'static' here gives every translation unit that includes this header its own copy
+
+//begin_exchange_externals: posts the receives for x's external entries (requests are kept in
+//exch_ext_requests) and sends this proc's entries; finish_exchange_externals completes the receives.
+template<typename MatrixType, typename VectorType>
+void
+begin_exchange_externals(MatrixType& A, VectorType& x)
+{
+#ifdef HAVE_MPI
+
+  int numprocs = 1;
+  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+
+  //Nothing is exchanged in a run with fewer than two procs.
+  if (numprocs < 2) return;
+
+  typedef typename MatrixType::ScalarType Scalar;
+  typedef typename MatrixType::LocalOrdinalType LocalOrdinal;
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal;
+
+  // Extract Matrix pieces
+
+  int local_nrow = A.rows.size();
+  int num_neighbors = A.neighbors.size();
+  const std::vector<LocalOrdinal>& recv_length = A.recv_length;
+  const std::vector<LocalOrdinal>& send_length = A.send_length;
+  const std::vector<int>& neighbors = A.neighbors;
+  const std::vector<GlobalOrdinal>& elements_to_send = A.elements_to_send;
+
+  std::vector<Scalar> send_buffer(elements_to_send.size(), 0);
+
+  //
+  // first post receives, these are immediate receives
+  // Do not wait for result to come, that is done by the
+  // MPI_Wait calls in finish_exchange_externals.
+  //
+
+  int MPI_MY_TAG = 99;
+
+  exch_ext_requests.resize(num_neighbors);
+
+  //
+  // Externals are at end of locals
+  //
+
+  std::vector<Scalar>& x_coefs = x.coefs;
+  Scalar* x_external = x_coefs.empty() ? 0 : &x_coefs[0] + local_nrow; //avoids operator[] at size()
+
+  MPI_Datatype mpi_dtype = TypeTraits<Scalar>::mpi_type();
+
+  // Post receives first
+  for(int i=0; i<num_neighbors; ++i) {
+    int n_recv = recv_length[i];
+    MPI_Irecv(x_external, n_recv, mpi_dtype, neighbors[i], MPI_MY_TAG,
+              MPI_COMM_WORLD, &exch_ext_requests[i]);
+    x_external += n_recv;
+  }
+
+  //
+  // Fill up send buffer
+  //
+
+  size_t total_to_be_sent = elements_to_send.size();
+  for(size_t i=0; i<total_to_be_sent; ++i) send_buffer[i] = x.coefs[elements_to_send[i]];
+
+  //
+  // Send to each neighbor
+  //
+
+  Scalar* s_buffer = send_buffer.empty() ? 0 : &send_buffer[0]; //no operator[] on an empty buffer
+
+  for(int i=0; i<num_neighbors; ++i) {
+    int n_send = send_length[i];
+    MPI_Send(s_buffer, n_send, mpi_dtype, neighbors[i], MPI_MY_TAG,
+             MPI_COMM_WORLD);
+    s_buffer += n_send;
+  }
+#endif
+}
+
+inline
+void
+finish_exchange_externals(int num_neighbors)
+{
+#ifdef HAVE_MPI
+  // Waits, in order, on the first num_neighbors requests held in
+  // exch_ext_requests (filled by begin_exchange_externals) and aborts
+  // the MPI job if a wait does not return MPI_SUCCESS.
+  MPI_Status wait_status;
+  int pending = 0;
+  while (pending < num_neighbors) {
+    const int rc = MPI_Wait(&exch_ext_requests[pending], &wait_status);
+    if (rc != MPI_SUCCESS) {
+      std::cerr << "MPI_Wait error\n"<<std::endl;
+      MPI_Abort(MPI_COMM_WORLD, -1);
+    }
+    ++pending;
+  }
+#endif //HAVE_MPI
+}
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/basic/generate_matrix_structure.hpp b/openmp-avx512/basic/generate_matrix_structure.hpp
new file mode 100644
index 0000000..2413d62
--- /dev/null
+++ b/openmp-avx512/basic/generate_matrix_structure.hpp
@@ -0,0 +1,150 @@
+#ifndef _generate_matrix_structure_hpp_
+#define _generate_matrix_structure_hpp_
+
+//@HEADER
+// ************************************************************************
+// 
+//               miniFE: simple finite-element assembly and linear-solve
+//                 Copyright (2006) Sandia Corporation
+// 
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+// 
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//  
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//  
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+
+#include <sstream>
+#include <stdexcept>
+#include <map>
+#include <algorithm>
+
+#include <simple_mesh_description.hpp>
+#include <SparseMatrix_functions.hpp>
+#include <box_utils.hpp>
+#include <utils.hpp>
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+namespace miniFE {
+
+//generate_matrix_structure: numbers the nodes of the local box as matrix rows, counts the in-range
+//entries of each row's 27-point stencil, and hands the layout to init_matrix. Returns 1 if any proc threw, else 0.
+template<typename MatrixType>
+int
+generate_matrix_structure(const simple_mesh_description<typename MatrixType::GlobalOrdinalType>& mesh,
+                          MatrixType& A)
+{
+  int myproc = 0;
+#ifdef HAVE_MPI
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+#endif
+
+  int threw_exc = 0;
+  try {
+
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal;
+  typedef typename MatrixType::LocalOrdinalType LocalOrdinal;
+
+  int global_nodes_x = mesh.global_box[0][1]+1;
+  int global_nodes_y = mesh.global_box[1][1]+1;
+  int global_nodes_z = mesh.global_box[2][1]+1;
+  Box box;
+  copy_box(mesh.local_box, box);
+
+  //num-owned-nodes in each dimension is num-elems+1
+  //only if num-elems > 0 in that dimension *and*
+  //we are at the high end of the global range in that dimension:
+  if (box[0][1] > box[0][0] && box[0][1] == mesh.global_box[0][1]) ++box[0][1];
+  if (box[1][1] > box[1][0] && box[1][1] == mesh.global_box[1][1]) ++box[1][1];
+  if (box[2][1] > box[2][0] && box[2][1] == mesh.global_box[2][1]) ++box[2][1];
+
+  GlobalOrdinal global_nrows = global_nodes_x;
+  global_nrows = global_nrows*global_nodes_y*global_nodes_z; //multiply in GlobalOrdinal, not int
+
+  GlobalOrdinal nrows = get_num_ids<GlobalOrdinal>(box);
+  try {
+    A.reserve_space(nrows, 27);
+  }
+  catch(std::exception& exc) {
+    std::ostringstream osstr;
+    osstr << "One of A.rows.resize, A.row_offsets.resize, A.packed_cols.reserve or A.packed_coefs.reserve: nrows=" <<nrows<<": ";
+    osstr << exc.what();
+    std::string str1 = osstr.str();
+    throw std::runtime_error(str1);
+  }
+
+  std::vector<GlobalOrdinal> rows(nrows);
+  std::vector<LocalOrdinal> row_offsets(nrows+1);
+  std::vector<int> row_coords(nrows*3);
+
+  unsigned roffset = 0;
+  GlobalOrdinal nnz = 0;
+
+  for(int iz=box[2][0]; iz<box[2][1]; ++iz) {
+   for(int iy=box[1][0]; iy<box[1][1]; ++iy) {
+    for(int ix=box[0][0]; ix<box[0][1]; ++ix) {
+      GlobalOrdinal row_id =
+          get_id<GlobalOrdinal>(global_nodes_x, global_nodes_y, global_nodes_z,
+                                ix, iy, iz);
+      rows[roffset] = mesh.map_id_to_row(row_id);
+      row_coords[roffset*3] = ix;
+      row_coords[roffset*3+1] = iy;
+      row_coords[roffset*3+2] = iz;
+      row_offsets[roffset++] = nnz; //this row's entries start at the running count
+      for(int sz=-1; sz<=1; ++sz) {
+       for(int sy=-1; sy<=1; ++sy) {
+        for(int sx=-1; sx<=1; ++sx) {
+          GlobalOrdinal col_id =
+              get_id<GlobalOrdinal>(global_nodes_x, global_nodes_y, global_nodes_z,
+                                   ix+sx, iy+sy, iz+sz);
+          if (col_id >= 0 && col_id < global_nrows) {
+            ++nnz;
+          }
+        }
+       }
+      }
+    }
+   }
+  }
+  row_offsets[roffset] = nnz;
+  init_matrix(A, rows, row_offsets, row_coords,
+              global_nodes_x, global_nodes_y, global_nodes_z, global_nrows, mesh);
+  }
+  catch(const std::exception& exc) {
+    std::cout << "proc " << myproc << " threw an exception in generate_matrix_structure: " << exc.what() << std::endl;
+    threw_exc = 1;
+  }
+  catch(...) {
+    std::cout << "proc " << myproc << " threw an exception in generate_matrix_structure, probably due to running out of memory." << std::endl;
+    threw_exc = 1;
+  }
+#ifdef HAVE_MPI
+  int global_throw = 0;
+  MPI_Allreduce(&threw_exc, &global_throw, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+  threw_exc = global_throw;
+#endif
+  return threw_exc ? 1 : 0;
+}
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/basic/get_common_files b/openmp-avx512/basic/get_common_files
new file mode 100755
index 0000000..dec46a7
--- /dev/null
+++ b/openmp-avx512/basic/get_common_files
@@ -0,0 +1,11 @@
+#!/bin/bash
+# Copies the shared YAML sources and generate_info_header from ${dir} into the current directory.
+dir=../../common
+
+for f in YAML_Doc.cpp YAML_Doc.hpp \
+         YAML_Element.cpp YAML_Element.hpp \
+         generate_info_header
+do
+  cp ${dir}/${f} .
+done
+
diff --git a/openmp-avx512/basic/gold_files/1x1x2_A.mtx.1.0 b/openmp-avx512/basic/gold_files/1x1x2_A.mtx.1.0
new file mode 100644
index 0000000..d337780
--- /dev/null
+++ b/openmp-avx512/basic/gold_files/1x1x2_A.mtx.1.0
@@ -0,0 +1,113 @@
+12 112
+0 0 1
+0 1 0
+0 2 0
+0 3 0
+0 4 0
+0 5 0
+0 6 0
+0 7 0
+1 0 0
+1 1 1.16667
+1 2 0
+1 3 2.18961e-10
+1 4 0
+1 5 2.18961e-10
+1 6 0
+1 7 0.333333
+2 0 0
+2 1 0
+2 2 1
+2 3 0
+2 4 0
+2 5 0
+2 6 0
+2 7 0
+3 0 0
+3 1 2.18961e-10
+3 2 0
+3 3 2.5
+3 4 0
+3 5 -1
+3 6 0
+3 7 2.18961e-10
+4 0 0
+4 1 0
+4 2 0
+4 3 0
+4 4 1
+4 5 0
+4 6 0
+4 7 0
+4 8 0
+4 9 0
+4 10 0
+4 11 0
+5 0 0
+5 1 2.18961e-10
+5 2 0
+5 3 -1
+5 4 0
+5 5 2.5
+5 6 0
+5 7 2.18961e-10
+5 8 0
+5 9 0
+5 10 0
+5 11 0
+6 0 0
+6 1 0
+6 2 0
+6 3 0
+6 4 0
+6 5 0
+6 6 1
+6 7 0
+6 8 0
+6 9 0
+6 10 0
+6 11 0
+7 0 0
+7 1 0.333333
+7 2 0
+7 3 2.18961e-10
+7 4 0
+7 5 2.18961e-10
+7 6 0
+7 7 1.16667
+7 8 0
+7 9 0
+7 10 0
+7 11 0
+8 4 0
+8 5 0
+8 6 0
+8 7 0
+8 8 1
+8 9 0
+8 10 0
+8 11 0
+9 4 0
+9 5 0
+9 6 0
+9 7 0
+9 8 0
+9 9 0.5
+9 10 0
+9 11 0
+10 4 0
+10 5 0
+10 6 0
+10 7 0
+10 8 0
+10 9 0
+10 10 1
+10 11 0
+11 4 0
+11 5 0
+11 6 0
+11 7 0
+11 8 0
+11 9 0
+11 10 0
+11 11 0.5
diff --git a/openmp-avx512/basic/gold_files/1x1x2_A.mtx.2.0 b/openmp-avx512/basic/gold_files/1x1x2_A.mtx.2.0
new file mode 100644
index 0000000..363c0bd
--- /dev/null
+++ b/openmp-avx512/basic/gold_files/1x1x2_A.mtx.2.0
@@ -0,0 +1,33 @@
+4 32
+0 0 1
+0 1 0
+0 2 0
+0 3 0
+0 4 0
+0 5 0
+0 6 0
+0 7 0
+1 0 0
+1 1 1.16667
+1 2 0
+1 3 2.18961e-10
+1 4 0
+1 5 2.18961e-10
+1 6 0
+1 7 0.333333
+2 0 0
+2 1 0
+2 2 1
+2 3 0
+2 4 0
+2 5 0
+2 6 0
+2 7 0
+3 0 0
+3 1 2.18961e-10
+3 2 0
+3 3 2.5
+3 4 0
+3 5 -1
+3 6 0
+3 7 2.18961e-10
diff --git a/openmp-avx512/basic/gold_files/1x1x2_A.mtx.2.1 b/openmp-avx512/basic/gold_files/1x1x2_A.mtx.2.1
new file mode 100644
index 0000000..3b435ca
--- /dev/null
+++ b/openmp-avx512/basic/gold_files/1x1x2_A.mtx.2.1
@@ -0,0 +1,80 @@
+4 0 0
+4 1 0
+4 2 0
+4 3 0
+4 4 1
+4 5 0
+4 6 0
+4 7 0
+4 8 0
+4 9 0
+4 10 0
+4 11 0
+5 0 0
+5 1 2.18961e-10
+5 2 0
+5 3 -1
+5 4 0
+5 5 2.5
+5 6 0
+5 7 2.18961e-10
+5 8 0
+5 9 0
+5 10 0
+5 11 0
+6 0 0
+6 1 0
+6 2 0
+6 3 0
+6 4 0
+6 5 0
+6 6 1
+6 7 0
+6 8 0
+6 9 0
+6 10 0
+6 11 0
+7 0 0
+7 1 0.333333
+7 2 0
+7 3 2.18961e-10
+7 4 0
+7 5 2.18961e-10
+7 6 0
+7 7 1.16667
+7 8 0
+7 9 0
+7 10 0
+7 11 0
+8 4 0
+8 5 0
+8 6 0
+8 7 0
+8 8 1
+8 9 0
+8 10 0
+8 11 0
+9 4 0
+9 5 0
+9 6 0
+9 7 0
+9 8 0
+9 9 0.5
+9 10 0
+9 11 0
+10 4 0
+10 5 0
+10 6 0
+10 7 0
+10 8 0
+10 9 0
+10 10 1
+10 11 0
+11 4 0
+11 5 0
+11 6 0
+11 7 0
+11 8 0
+11 9 0
+11 10 0
+11 11 0.5
diff --git a/openmp-avx512/basic/gold_files/1x1x2_b.vec.1.0 b/openmp-avx512/basic/gold_files/1x1x2_b.vec.1.0
new file mode 100644
index 0000000..b0b890a
--- /dev/null
+++ b/openmp-avx512/basic/gold_files/1x1x2_b.vec.1.0
@@ -0,0 +1,13 @@
+12
+0 1
+1 1.25
+2 1
+3 1.41667
+4 1
+5 1.25
+6 1
+7 1.25
+8 1
+9 0
+10 1
+11 0
diff --git a/openmp-avx512/basic/gold_files/1x1x2_b.vec.2.0 b/openmp-avx512/basic/gold_files/1x1x2_b.vec.2.0
new file mode 100644
index 0000000..15d91a4
--- /dev/null
+++ b/openmp-avx512/basic/gold_files/1x1x2_b.vec.2.0
@@ -0,0 +1,5 @@
+4
+0 1
+1 1.25
+2 1
+3 1.41667
diff --git a/openmp-avx512/basic/gold_files/1x1x2_b.vec.2.1 b/openmp-avx512/basic/gold_files/1x1x2_b.vec.2.1
new file mode 100644
index 0000000..a8349a9
--- /dev/null
+++ b/openmp-avx512/basic/gold_files/1x1x2_b.vec.2.1
@@ -0,0 +1,8 @@
+4 1
+5 1.25
+6 1
+7 1.25
+8 1
+9 0
+10 1
+11 0
diff --git a/openmp-avx512/basic/gold_files/1x1x2_x.vec.1.0 b/openmp-avx512/basic/gold_files/1x1x2_x.vec.1.0
new file mode 100644
index 0000000..78dcba9
--- /dev/null
+++ b/openmp-avx512/basic/gold_files/1x1x2_x.vec.1.0
@@ -0,0 +1,13 @@
+12
+0 1
+1 0.833333
+2 1
+3 0.912698
+4 1
+5 0.865079
+6 1
+7 0.833333
+8 1
+9 0
+10 1
+11 0
diff --git a/openmp-avx512/basic/gold_files/1x1x2_x.vec.2.0 b/openmp-avx512/basic/gold_files/1x1x2_x.vec.2.0
new file mode 100644
index 0000000..024797b
--- /dev/null
+++ b/openmp-avx512/basic/gold_files/1x1x2_x.vec.2.0
@@ -0,0 +1,5 @@
+4
+0 1
+1 0.833333
+2 1
+3 0.912698
diff --git a/openmp-avx512/basic/gold_files/1x1x2_x.vec.2.1 b/openmp-avx512/basic/gold_files/1x1x2_x.vec.2.1
new file mode 100644
index 0000000..f774883
--- /dev/null
+++ b/openmp-avx512/basic/gold_files/1x1x2_x.vec.2.1
@@ -0,0 +1,8 @@
+4 1
+5 0.865079
+6 1
+7 0.833333
+8 1
+9 0
+10 1
+11 0
diff --git a/openmp-avx512/basic/imbalance.hpp b/openmp-avx512/basic/imbalance.hpp
new file mode 100644
index 0000000..f801efc
--- /dev/null
+++ b/openmp-avx512/basic/imbalance.hpp
@@ -0,0 +1,271 @@
+#ifndef _imbalance_hpp_
+#define _imbalance_hpp_
+
+#include <cmath>
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+#include <box_utils.hpp>
+#include <utils.hpp>
+#include <YAML_Doc.hpp>
+
+namespace miniFE {
+
+const int X = 0;     // axis selectors: first index into a Box (box[X], box[Y], box[Z])
+const int Y = 1;
+const int Z = 2;
+const int NONE = 3;  // "no axis" marker returned when no face can be grown or shrunk
+// End selectors: second index into a Box (box[axis][LOWER], box[axis][UPPER]).
+const int LOWER = 0;
+const int UPPER = 1;
+
+template<typename GlobalOrdinal>
+void
+compute_imbalance(const Box& global_box,
+                  const Box& local_box,
+                  float& largest_imbalance,
+                  float& std_dev,
+                  YAML_Doc& doc,
+                  bool record_in_doc)
+{
+  // Reports how unevenly rows are spread over the processes. largest_imbalance
+  // receives the larger of percentage_difference(min, avg) and
+  // percentage_difference(max, avg); std_dev receives the value computed by
+  // compute_std_dev_as_percentage. global_box is not read here.
+  int numprocs = 1, myproc = 0;
+#ifdef HAVE_MPI
+  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+#endif
+
+  GlobalOrdinal rows_here = get_num_ids<GlobalOrdinal>(local_box);
+  GlobalOrdinal rows_total = 0, fewest_rows = 0, most_rows = 0;
+  int fewest_proc = myproc, most_proc = myproc;
+  get_global_min_max(rows_here, rows_total, fewest_rows, fewest_proc,
+                     most_rows, most_proc);
+
+  float avg_nrows = static_cast<float>(rows_total) / numprocs;
+  const float below_avg = percentage_difference<float>(fewest_rows, avg_nrows);
+  const float above_avg = percentage_difference<float>(most_rows, avg_nrows);
+  largest_imbalance = (above_avg > below_avg) ? above_avg : below_avg;
+
+  std_dev = compute_std_dev_as_percentage<float>(rows_here, avg_nrows);
+
+  // Only process 0 writes to the report, and only when asked to.
+  if (record_in_doc && myproc == 0) {
+    doc.add("Rows-per-proc Load Imbalance","");
+    doc.get("Rows-per-proc Load Imbalance")->add("Largest (from avg, %)",largest_imbalance);
+    doc.get("Rows-per-proc Load Imbalance")->add("Std Dev (%)",std_dev);
+  }
+}
+
+// Picks the face of local_box that add_imbalance should push outward.
+//
+// Faces are examined in a fixed priority order: Z upper, Z lower, Y upper,
+// Y lower, X upper, X lower. The first face lying strictly inside global_box
+// (upper bound below the global upper bound, or lower bound above the global
+// lower bound) is selected.
+//
+// Returns (axis, end) with axis one of X/Y/Z and end one of LOWER/UPPER, or
+// (NONE, UPPER) when no face qualifies.
+// global_box and local_box are only read.
+//
+// Declared inline because this is a non-template function defined in a header:
+// without it, including the header from more than one translation unit
+// produces duplicate external definitions at link time.
+inline std::pair<int,int>
+decide_how_to_grow(const Box& global_box, const Box& local_box)
+{
+  // Axes in decreasing priority.
+  const int axis_priority[3] = { Z, Y, X };
+
+  for (int k = 0; k < 3; ++k) {
+    const int axis = axis_priority[k];
+
+    // Is there room above the upper face?
+    if (local_box[axis][UPPER] < global_box[axis][UPPER]) {
+      return std::pair<int,int>(axis, UPPER);
+    }
+
+    // Is there room below the lower face?
+    if (local_box[axis][LOWER] > global_box[axis][LOWER]) {
+      return std::pair<int,int>(axis, LOWER);
+    }
+  }
+
+  // No face lies strictly inside global_box.
+  return std::pair<int,int>(NONE, UPPER);
+}
+
+// Picks the face of local_box that add_imbalance should pull inward.
+//
+// Faces are examined in the order Z upper, Z lower, Y upper, Y lower, X upper,
+// X lower. A face is eligible when it lies strictly inside global_box and the
+// box extent along that axis (upper minus lower) is greater than 2.
+//
+// Returns (axis, end) for the first eligible face, or (NONE, UPPER) when none
+// is eligible.
+// global_box and local_box are only read.
+//
+// Declared inline because this is a non-template function defined in a header:
+// without it, including the header from more than one translation unit
+// produces duplicate external definitions at link time.
+inline std::pair<int,int>
+decide_how_to_shrink(const Box& global_box, const Box& local_box)
+{
+  // Axes in decreasing priority.
+  const int axis_priority[3] = { Z, Y, X };
+
+  for (int k = 0; k < 3; ++k) {
+    const int axis = axis_priority[k];
+
+    // Neither face of an axis is eligible unless the extent exceeds 2.
+    const int extent = local_box[axis][UPPER] - local_box[axis][LOWER];
+    if (extent <= 2) continue;
+
+    if (local_box[axis][UPPER] < global_box[axis][UPPER]) {
+      return std::pair<int,int>(axis, UPPER);
+    }
+    if (local_box[axis][LOWER] > global_box[axis][LOWER]) {
+      return std::pair<int,int>(axis, LOWER);
+    }
+  }
+
+  // No eligible face.
+  return std::pair<int,int>(NONE, UPPER);
+}
+// Moves box faces between processes until compute_imbalance reports at least 'imbalance' or no move is possible.
+template<typename GlobalOrdinal>
+void
+add_imbalance(const Box& global_box,
+              Box& local_box,
+              float imbalance,
+              YAML_Doc& doc)
+{
+  int numprocs = 1, myproc = 0;
+#ifdef HAVE_MPI
+  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+#endif
+  // Nothing to do for a single process.
+  if (numprocs == 1) {
+    return;
+  }
+
+  float cur_imbalance = 0, cur_std_dev = 0;
+  compute_imbalance<GlobalOrdinal>(global_box, local_box,
+                                  cur_imbalance, cur_std_dev, doc, false);
+  // Each pass performs at most one grow and one shrink, then re-measures.
+  while (cur_imbalance < imbalance) {
+    GlobalOrdinal local_nrows = get_num_ids<GlobalOrdinal>(local_box);
+    GlobalOrdinal min_nrows = 0, max_nrows = 0, global_nrows = 0;
+    int min_proc = myproc, max_proc = myproc;
+    get_global_min_max(local_nrows, global_nrows, min_nrows, min_proc,
+                       max_nrows, max_proc);
+    // (axis, end) choices; only max_proc / min_proc replace the NONE defaults before the broadcast.
+    std::pair<int,int> grow(NONE,UPPER);
+    int grow_axis_val = -1;
+    std::pair<int,int> shrink(NONE,UPPER);
+    int shrink_axis_val = -1;
+
+    if (myproc == max_proc) {
+      grow = decide_how_to_grow(global_box, local_box);
+      if (grow.first != NONE) {
+        grow_axis_val = local_box[grow.first][grow.second];
+      }
+    }
+    if (myproc == min_proc) {
+      shrink = decide_how_to_shrink(global_box, local_box);
+      if (shrink.first != NONE) {
+        shrink_axis_val = local_box[shrink.first][shrink.second];
+      }
+    }
+    // Pack each choice with this process's box bounds, ordered X, Y, Z with lower then upper.
+    int grow_info[8] = {grow.first, grow.second,
+                        local_box[X][0], local_box[X][1],
+                        local_box[Y][0], local_box[Y][1],
+                        local_box[Z][0], local_box[Z][1]};
+
+    int shrink_info[8] = {shrink.first, shrink.second,
+                        local_box[X][0], local_box[X][1],
+                        local_box[Y][0], local_box[Y][1],
+                        local_box[Z][0], local_box[Z][1]};
+#ifdef HAVE_MPI
+    MPI_Bcast(&grow_info[0], 8, MPI_INT, max_proc, MPI_COMM_WORLD);
+    MPI_Bcast(&shrink_info[0], 8, MPI_INT, min_proc, MPI_COMM_WORLD);
+#endif
+    // After the broadcasts every process holds max_proc's grow choice and min_proc's shrink choice.
+    int grow_axis = grow_info[0];
+    int grow_end = grow_info[1];
+    int shrink_axis = shrink_info[0];
+    int shrink_end = shrink_info[1];
+    int grow_incr = 1;
+    if (grow_end == LOWER) grow_incr = -1;
+    int shrink_incr = -1;
+    if (shrink_end == LOWER) shrink_incr = 1;
+    if (grow_axis != NONE) grow_axis_val = grow_info[2+grow_axis*2+grow_end];
+    if (shrink_axis != NONE) shrink_axis_val = shrink_info[2+shrink_axis*2+shrink_end];
+    // Stop when neither a grow nor a shrink direction was found.
+    if (grow_axis == NONE && shrink_axis == NONE) break;
+    // Veto the grow if it would take a layer from a box whose extent along that axis is already below 2.
+    bool grow_status = grow_axis==NONE ? false : true;
+    if (grow_axis != NONE) {
+      if ((grow_incr ==  1 && local_box[grow_axis][0] == grow_axis_val) ||
+          (grow_incr == -1 && local_box[grow_axis][1] == grow_axis_val)) {
+        if (local_box[grow_axis][1] - local_box[grow_axis][0] < 2) {
+          grow_status = false;
+        }
+      }
+    }
+    // Same check for the shrink: veto if an affected box has extent below 2 along that axis.
+    bool shrink_status = shrink_axis==NONE ? false : true;
+    if (shrink_axis != NONE) {
+      if ((shrink_incr ==  1 && local_box[shrink_axis][0] == shrink_axis_val) ||
+          (shrink_incr == -1 && local_box[shrink_axis][1] == shrink_axis_val)) {
+        if (local_box[shrink_axis][1] - local_box[shrink_axis][0] < 2) {
+          shrink_status = false;
+        }
+      }
+    }
+    // Combine the vetoes: a move only goes ahead if no process objected.
+#ifdef HAVE_MPI
+    int statusints[2] = { grow_status ? 0 : 1, shrink_status ? 0 : 1 };
+    int globalstatus[2] = { 0, 0 };
+    MPI_Allreduce(&statusints, &globalstatus, 2, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+    grow_status = globalstatus[0]>0 ? false : true;
+    shrink_status = globalstatus[1]>0 ? false : true;
+#endif
+
+    if (grow_status == false && shrink_status == false) break;
+    // Apply the grow: every bound of this box lying on the chosen plane moves by grow_incr.
+    if (grow_status && grow_axis != NONE) {
+      if (local_box[grow_axis][0] == grow_axis_val) {
+        local_box[grow_axis][0] += grow_incr;
+      }
+
+      if (local_box[grow_axis][1] == grow_axis_val) {
+        local_box[grow_axis][1] += grow_incr;
+      }
+    }
+    // Apply the shrink the same way with shrink_incr.
+    if (shrink_status && shrink_axis != NONE) {
+      if (local_box[shrink_axis][0] == shrink_axis_val) {
+        local_box[shrink_axis][0] += shrink_incr;
+      }
+
+      if (local_box[shrink_axis][1] == shrink_axis_val) {
+        local_box[shrink_axis][1] += shrink_incr;
+      }
+    }
+    // Re-measure so the loop condition sees the updated boxes.
+    compute_imbalance<GlobalOrdinal>(global_box, local_box,
+                                    cur_imbalance, cur_std_dev, doc, false);
+  }
+}
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/basic/main.cpp b/openmp-avx512/basic/main.cpp
new file mode 100644
index 0000000..ed3753f
--- /dev/null
+++ b/openmp-avx512/basic/main.cpp
@@ -0,0 +1,247 @@
+
+//@HEADER
+// ************************************************************************
+// 
+//               miniFE: simple finite-element assembly and linear-solve
+//                 Copyright (2006) Sandia Corporation
+// 
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+// 
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//  
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//  
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+#include <iostream>
+#include <ctime>
+#include <cstdlib>
+#include <vector>
+
+#include <miniFE_version.h>
+
+#include <outstream.hpp>
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+//--------------------------------------------------------------------
+#include <ComputeNodeType.hpp>
+//--------------------------------------------------------------------
+
+#include <Box.hpp>
+#include <BoxPartition.hpp>
+#include <box_utils.hpp>
+#include <Parameters.hpp>
+#include <utils.hpp>
+#include <driver.hpp>
+#include <YAML_Doc.hpp>
+
+#if MINIFE_INFO != 0
+#include <miniFE_info.hpp>
+#else
+#include <miniFE_no_info.hpp>
+#endif
+
+//The following macros should be specified as compile-macros in the
+//makefile. They are defaulted here just in case...
+#ifndef MINIFE_SCALAR
+#define MINIFE_SCALAR double
+#endif
+#ifndef MINIFE_LOCAL_ORDINAL
+#define MINIFE_LOCAL_ORDINAL int
+#endif
+#ifndef MINIFE_GLOBAL_ORDINAL
+#define MINIFE_GLOBAL_ORDINAL int
+#endif
+
+// ************************************************************************
+
+void add_params_to_yaml(YAML_Doc& doc, miniFE::Parameters& params);
+void add_configuration_to_yaml(YAML_Doc& doc, int numprocs, int numthreads);
+void add_timestring_to_yaml(YAML_Doc& doc);
+
+inline void print_box(int myproc, const char* name, const Box& box,
+                      const char* name2, const Box& box2)
+{
+  // One line per call: "proc <rank> <name> (lo,hi)  (lo,hi)  (lo,hi) <name2> ..."
+  std::cout << "proc " << myproc << " " << name;
+  for (int axis = 0; axis < 3; ++axis)
+    std::cout << " (" << box[axis][0] << "," << box[axis][1] << ") ";
+  std::cout << name2;
+  for (int axis = 0; axis < 3; ++axis)
+    std::cout << " (" << box2[axis][0] << "," << box2[axis][1] << ") ";
+  std::cout << std::endl;
+}
+
+//
+//We will create a 'box' of size nx X ny X nz, partition it among processors,
+//then call miniFE::driver which will use the partitioned box as the domain
+//from which to assemble finite-element matrices into a global matrix and
+//vector, then solve the linear-system using Conjugate Gradients.
+//
+
+int main(int argc, char** argv) {
+  miniFE::Parameters params;
+  miniFE::get_parameters(argc, argv, params);
+
+  int numprocs = 1, myproc = 0;
+  miniFE::initialize_mpi(argc, argv, numprocs, myproc);
+  // Timing starts after MPI initialization and ends just before the report is printed.
+  miniFE::timer_type start_time = miniFE::mytimer();
+
+#ifdef MINIFE_DEBUG
+  outstream(numprocs, myproc);
+#endif
+
+  //make sure each processor has the same parameters:
+  miniFE::broadcast_parameters(params);
+
+  // The global box runs from 0 to nx, ny and nz along the three axes; one local box is kept per process.
+  Box global_box = { 0, params.nx, 0, params.ny, 0, params.nz };
+  std::vector<Box> local_boxes(numprocs);
+
+  box_partition(0, numprocs, 2, global_box, &local_boxes[0]);
+
+  Box& my_box = local_boxes[myproc];
+
+//print_box(myproc, "global-box", global_box, "local-box", my_box);
+  // Run identifier handed to YAML_Doc: miniFE.<nx>x<ny>x<nz>[.P<numprocs>][xT<numthreads>].[<name>.]
+  std::ostringstream osstr;
+  osstr << "miniFE." << params.nx << "x" << params.ny << "x" << params.nz;
+#ifdef HAVE_MPI
+  osstr << ".P"<<numprocs;
+#endif
+#if defined(MINIFE_HAVE_TPI) || defined(MINIFE_HAVE_TBB)
+  osstr << "xT"<<params.numthreads;
+#endif
+  osstr << ".";
+  if (params.name != "") osstr << params.name << ".";
+
+  YAML_Doc doc("miniFE", MINIFE_VERSION, ".", osstr.str());
+  if (myproc == 0) {
+    add_params_to_yaml(doc, params);
+    add_configuration_to_yaml(doc, numprocs, params.numthreads);
+    add_timestring_to_yaml(doc);
+  }
+  // Select the compute-node object matching the build configuration.
+#if defined(MINIFE_HAVE_TBB)
+  TBBNode compute_node(params.numthreads);
+#ifdef MINIFE_HAVE_CUDA
+  CUDANode::singleton(0,8,512);
+#endif
+#elif defined(MINIFE_HAVE_TPI)
+  TPINode compute_node(params.numthreads);
+#elif defined(MINIFE_HAVE_CUDA)
+  CUDANode compute_node(0,2,64);
+#else
+  SerialComputeNode compute_node;
+#endif
+
+  //Most of the program is performed in the 'driver' function, which is
+  //templated on < Scalar, LocalOrdinal, GlobalOrdinal, NodeType >.
+  //To run miniFE with float instead of double, or 'long long' instead of int,
+  //etc., change these template-parameters by changing the macro definitions in
+  //the makefile or on the make command-line.
+
+  miniFE::driver< MINIFE_SCALAR, MINIFE_LOCAL_ORDINAL, MINIFE_GLOBAL_ORDINAL,
+                  ComputeNodeType>(global_box, my_box, compute_node, params, doc);
+
+  miniFE::timer_type total_time = miniFE::mytimer() - start_time;
+  // Only process 0 records the total time and prints the YAML report.
+  if (myproc == 0) {
+    doc.add("Total Program Time",total_time);
+    std::cout << doc.generateYAML() << std::endl;
+  }
+
+  miniFE::finalize_mpi();
+
+  return 0;
+}
+
+void add_params_to_yaml(YAML_Doc& doc, miniFE::Parameters& params)
+{
+  // Creates the "Global Run Parameters" section and fills it with the nx/ny/nz
+  // dimensions, the load_imbalance value and the mv_overlap_comm_comp flag
+  // (written as a human-readable string).
+  doc.add("Global Run Parameters","");
+  doc.get("Global Run Parameters")->add("dimensions","");
+  doc.get("Global Run Parameters")->get("dimensions")->add("nx",params.nx);
+  doc.get("Global Run Parameters")->get("dimensions")->add("ny",params.ny);
+  doc.get("Global Run Parameters")->get("dimensions")->add("nz",params.nz);
+  doc.get("Global Run Parameters")->add("load_imbalance", params.load_imbalance);
+
+  // Any value other than 1 is reported as "0 (no)".
+  const bool overlap = (params.mv_overlap_comm_comp == 1);
+  std::string val(overlap ? "1 (yes)" : "0 (no)");
+  doc.get("Global Run Parameters")->add("mv_overlap_comm_comp", val);
+}
+// Records the process count, threading model, platform and build settings in the YAML report.
+void add_configuration_to_yaml(YAML_Doc& doc, int numprocs, int numthreads)
+{
+  doc.get("Global Run Parameters")->add("number of processors", numprocs);  // section created by add_params_to_yaml
+  std::string threading("none");
+  // Later blocks overwrite earlier ones, so CUDA takes precedence over TBB, and TBB over TPI.
+#ifdef MINIFE_HAVE_TPI
+  threading = "TPI";
+#endif
+#ifdef MINIFE_HAVE_TBB
+  threading = "TBB";
+#endif
+#ifdef MINIFE_HAVE_CUDA
+  threading = "CUDA";
+#endif
+  if (threading != "none") {
+    doc.get("Global Run Parameters")->add("(per proc) numthreads",numthreads);
+  }
+  // Platform and build strings come from the MINIFE_* macros.
+  doc.add("Platform","");
+  doc.get("Platform")->add("hostname",MINIFE_HOSTNAME);
+  doc.get("Platform")->add("kernel name",MINIFE_KERNEL_NAME);
+  doc.get("Platform")->add("kernel release",MINIFE_KERNEL_RELEASE);
+  doc.get("Platform")->add("processor",MINIFE_PROCESSOR);
+
+  doc.add("Build","");
+  doc.get("Build")->add("CXX",MINIFE_CXX);
+  doc.get("Build")->add("compiler version",MINIFE_CXX_VERSION);
+  doc.get("Build")->add("CXXFLAGS",MINIFE_CXXFLAGS);
+  std::string using_mpi("no");
+#ifdef HAVE_MPI
+  using_mpi = "yes";
+#endif
+  doc.get("Build")->add("using MPI",using_mpi);
+  doc.get("Build")->add("Threading",threading.c_str());
+}
+
+void add_timestring_to_yaml(YAML_Doc& doc)
+{
+  // Stamp the report with the local wall-clock time at the moment of the call,
+  // written as "YYYY-MM-DD, HH-MM-SS" (every field after the year is
+  // zero-padded to two digits).
+  std::time_t now;
+  std::time(&now);
+  const struct tm* local = std::localtime(&now);
+
+  // The conversions below are purely numeric; 32 bytes comfortably holds the
+  // 20-character stamp plus its terminator.
+  char stamp[32];
+  std::strftime(stamp, sizeof(stamp), "%Y-%m-%d, %H-%M-%S", local);
+
+  std::string timestring(stamp);
+  doc.add("Run Date/Time",timestring);
+}
+
diff --git a/openmp-avx512/basic/make_local_matrix.hpp b/openmp-avx512/basic/make_local_matrix.hpp
new file mode 100644
index 0000000..99c2cf7
--- /dev/null
+++ b/openmp-avx512/basic/make_local_matrix.hpp
@@ -0,0 +1,440 @@
+#ifndef _make_local_matrix_hpp_
+#define _make_local_matrix_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <map>
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+namespace miniFE {
+// Converts A's column indices from global to process-local numbering and fills A's neighbor/send/recv lists.
+template<typename MatrixType>
+void
+make_local_matrix(MatrixType& A)
+{
+#ifdef HAVE_MPI
+  int numprocs = 1, myproc = 0;
+  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+
+  if (numprocs < 2) {
+    A.num_cols = A.rows.size();
+    A.has_local_indices = true;
+    return;
+  }
+
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal;
+  typedef typename MatrixType::LocalOrdinalType LocalOrdinal;
+  typedef typename MatrixType::ScalarType Scalar;
+
+  std::map<GlobalOrdinal,GlobalOrdinal> externals;
+  LocalOrdinal num_external = 0;
+
+  //Extract Matrix pieces
+
+  size_t local_nrow = A.rows.size();
+  GlobalOrdinal start_row = local_nrow>0 ? A.rows[0] : -1;
+  GlobalOrdinal stop_row  = local_nrow>0 ? A.rows[local_nrow-1] : -1;
+
+  // We need to convert the index values for the rows on this processor
+  // to a local index space. We need to:
+  // - Determine if each index reaches to a local value or external value
+  // - If local, subtract start_row from index value to get local index
+  // - If external, find out if it is already accounted for.
+  //   - If so, then do nothing,
+  //   - otherwise
+  //     - add it to the list of external indices,
+  //     - find out which processor owns the value.
+  //     - Set up communication for sparse MV operation
+
+  ///////////////////////////////////////////
+  // Scan the indices and transform to local
+  ///////////////////////////////////////////
+
+  std::vector<GlobalOrdinal>& external_index = A.external_index;
+
+  for(size_t i=0; i<A.rows.size(); ++i) {
+    GlobalOrdinal* Acols = NULL;
+    Scalar* Acoefs = NULL;
+    size_t row_len = 0;
+    A.get_row_pointers(A.rows[i], row_len, Acols, Acoefs);
+
+    for(size_t j=0; j<row_len; ++j) {
+      GlobalOrdinal cur_ind = Acols[j];
+      if (start_row <= cur_ind && cur_ind <= stop_row) {
+        Acols[j] -= start_row;
+      }
+      else { // Must find out if we have already set up this point
+        if (externals.find(cur_ind) == externals.end()) {
+          externals[cur_ind] = num_external++;
+          external_index.push_back(cur_ind);
+        }
+        // Mark index as external by adding 1 and negating it
+        Acols[j] = -(Acols[j] + 1);
+      }
+    }
+  }
+
+  ////////////////////////////////////////////////////////////////////////
+  // Go through list of externals to find out which processors must be accessed.
+  ////////////////////////////////////////////////////////////////////////
+
+  std::vector<GlobalOrdinal> tmp_buffer(numprocs, 0); // Temp buffer space needed below
+
+  // Build list of global index offset
+
+  std::vector<GlobalOrdinal> global_index_offsets(numprocs, 0);
+
+  tmp_buffer[myproc] = start_row; // This is my start row
+
+  // This call sends the start_row of each ith processor to the ith
+  // entry of global_index_offsets on all processors.
+  // Thus, each processor knows the range of indices owned by all
+  // other processors.
+  // Note: There might be a better algorithm for doing this, but this
+  //       will work...
+
+  MPI_Datatype mpi_dtype = TypeTraits<GlobalOrdinal>::mpi_type();
+  MPI_Allreduce(&tmp_buffer[0], &global_index_offsets[0], numprocs, mpi_dtype,
+                MPI_SUM, MPI_COMM_WORLD);
+
+  // Go through list of externals and find the processor that owns each
+  std::vector<int> external_processor(num_external);
+
+  for(LocalOrdinal i=0; i<num_external; ++i) {
+    GlobalOrdinal cur_ind = external_index[i];
+    for(int j=numprocs-1; j>=0; --j) {
+      if (global_index_offsets[j] <= cur_ind && global_index_offsets[j] >= 0) {
+        external_processor[i] = j;
+        break;
+      }
+    }
+  }
+
+  /////////////////////////////////////////////////////////////////////////
+  // Sift through the external elements. For each newly encountered external
+  // point assign it the next index in the sequence. Then look for other
+  // external elements who are updated by the same node and assign them the next
+  // set of index numbers in the sequence (ie. elements updated by the same node
+  // have consecutive indices).
+  /////////////////////////////////////////////////////////////////////////
+
+  size_t count = local_nrow;
+  std::vector<GlobalOrdinal>& external_local_index = A.external_local_index;
+  external_local_index.assign(num_external, -1);
+
+  for(LocalOrdinal i=0; i<num_external; ++i) {
+    if (external_local_index[i] == -1) {
+      external_local_index[i] = count++;
+
+      for(LocalOrdinal j=i+1; j<num_external; ++j) {
+        if (external_processor[j] == external_processor[i])
+          external_local_index[j] = count++;
+      }
+    }
+  }
+
+  for(size_t i=0; i<local_nrow; ++i) {
+    GlobalOrdinal* Acols = NULL;
+    Scalar* Acoefs = NULL;
+    size_t row_len = 0;
+    A.get_row_pointers(A.rows[i], row_len, Acols, Acoefs);
+
+    for(size_t j=0; j<row_len; ++j) {
+      if (Acols[j] < 0) { // Change index values of externals
+        GlobalOrdinal cur_ind = -Acols[j] - 1;
+        Acols[j] = external_local_index[externals[cur_ind]];
+      }
+    }
+  }
+
+  std::vector<int> new_external_processor(num_external, 0);
+
+  for(int i=0; i<num_external; ++i) {
+    new_external_processor[external_local_index[i]-local_nrow] =
+      external_processor[i];
+  }
+
+  ////////////////////////////////////////////////////////////////////////
+  ///
+  // Count the number of neighbors from which we receive information to update
+  // our external elements. Additionally, fill the array tmp_neighbors in the
+  // following way:
+  //      tmp_neighbors[i] = 0   ==>  No external elements are updated by
+  //                              processor i.
+  //      tmp_neighbors[i] = x   ==>  (x-1)/numprocs elements are updated from
+  //                              processor i.
+  ///
+  ////////////////////////////////////////////////////////////////////////
+
+  std::vector<GlobalOrdinal> tmp_neighbors(numprocs, 0);
+
+  int num_recv_neighbors = 0;
+  int length             = 1;
+
+  for(LocalOrdinal i=0; i<num_external; ++i) {
+    if (tmp_neighbors[new_external_processor[i]] == 0) {
+      ++num_recv_neighbors;
+      tmp_neighbors[new_external_processor[i]] = 1;
+    }
+    tmp_neighbors[new_external_processor[i]] += numprocs;
+  }
+
+  /// sum over all processor all the tmp_neighbors arrays ///
+
+  MPI_Allreduce(&tmp_neighbors[0], &tmp_buffer[0], numprocs, mpi_dtype,
+                MPI_SUM, MPI_COMM_WORLD);
+
+  // decode the combined 'tmp_neighbors' (stored in tmp_buffer)
+  // array from all the processors
+
+  GlobalOrdinal num_send_neighbors = tmp_buffer[myproc] % numprocs;
+
+  /// decode 'tmp_buffer[myproc] to deduce total number of elements
+  //  we must send
+
+  GlobalOrdinal total_to_be_sent = (tmp_buffer[myproc] - num_send_neighbors) / numprocs;
+
+  ///////////////////////////////////////////////////////////////////////
+  ///
+  // Make a list of the neighbors that will send information to update our
+  // external elements (in the order that we will receive this information).
+  ///
+  ///////////////////////////////////////////////////////////////////////
+  // A process without externals has no first entry to seed the list with, so recv_list must stay empty.
+  std::vector<int> recv_list;
+  if (num_external > 0) recv_list.push_back(new_external_processor[0]);
+  for(LocalOrdinal i=1; i<num_external; ++i) {
+    if (new_external_processor[i-1] != new_external_processor[i]) {
+      recv_list.push_back(new_external_processor[i]);
+    }
+  }
+
+  //
+  // Send a 0 length message to each of our recv neighbors
+  //
+
+  std::vector<int> send_list(num_send_neighbors, 0);
+
+  //
+  // first post receives, these are immediate receives
+  // Do not wait for result to come, will do that at the
+  // wait call below.
+  //
+  int MPI_MY_TAG = 99;
+
+  std::vector<MPI_Request> request(num_send_neighbors);
+  for(int i=0; i<num_send_neighbors; ++i) {
+    MPI_Irecv(&tmp_buffer[i], 1, mpi_dtype, MPI_ANY_SOURCE, MPI_MY_TAG,
+              MPI_COMM_WORLD, &request[i]);
+  }
+
+  // send messages
+
+  for(int i=0; i<num_recv_neighbors; ++i) {
+    MPI_Send(&tmp_buffer[i], 1, mpi_dtype, recv_list[i], MPI_MY_TAG,
+             MPI_COMM_WORLD);
+  }
+
+  ///
+  // Receive message from each send neighbor to construct 'send_list'.
+  ///
+
+  MPI_Status status;
+  for(int i=0; i<num_send_neighbors; ++i) {
+    if (MPI_Wait(&request[i], &status) != MPI_SUCCESS) {
+      std::cerr << "MPI_Wait error\n"<<std::endl;
+      MPI_Abort(MPI_COMM_WORLD, -1);
+    }
+    send_list[i] = status.MPI_SOURCE;
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  ///
+  // Compare the two lists. In most cases they should be the same.
+  // However, if they are not then add new entries to the recv list
+  // that are in the send list (but not already in the recv list).
+  ///
+  //////////////////////////////////////////////////////////////////////
+
+  for(int j=0; j<num_send_neighbors; ++j) {
+    int found = 0;
+    for(int i=0; i<num_recv_neighbors; ++i) {
+      if (recv_list[i] == send_list[j]) found = 1;
+    }
+
+    if (found == 0) {
+      recv_list.push_back(send_list[j]);
+      ++num_recv_neighbors;
+    }
+  }
+
+  num_send_neighbors = num_recv_neighbors;
+  request.resize(num_send_neighbors);
+
+  A.elements_to_send.assign(total_to_be_sent, 0);
+  A.send_buffer.assign(total_to_be_sent, 0);
+
+  //
+  // Create 'new_external' which explicitly put the external elements in the
+  // order given by 'external_local_index'
+  //
+
+  std::vector<GlobalOrdinal> new_external(num_external);
+  for(LocalOrdinal i=0; i<num_external; ++i) {
+    new_external[external_local_index[i] - local_nrow] = external_index[i];
+  }
+
+  /////////////////////////////////////////////////////////////////////////
+  //
+  // Send each processor the global index list of the external elements in the
+  // order that I will want to receive them when updating my external elements.
+  //
+  /////////////////////////////////////////////////////////////////////////
+
+  std::vector<int> lengths(num_recv_neighbors);
+
+  ++MPI_MY_TAG;
+
+  // First post receives
+
+  for(int i=0; i<num_recv_neighbors; ++i) {
+    int partner = recv_list[i];
+    MPI_Irecv(&lengths[i], 1, MPI_INT, partner, MPI_MY_TAG, MPI_COMM_WORLD,
+              &request[i]);
+  }
+
+  std::vector<int>& neighbors = A.neighbors;
+  std::vector<int>& recv_length = A.recv_length;
+  std::vector<int>& send_length = A.send_length;
+
+  neighbors.resize(num_recv_neighbors, 0);
+  A.request.resize(num_recv_neighbors);
+  recv_length.resize(num_recv_neighbors, 0);
+  send_length.resize(num_recv_neighbors, 0);
+
+  LocalOrdinal j = 0;
+  for(int i=0; i<num_recv_neighbors; ++i) {
+    int start = j;
+    int newlength = 0;
+
+    //go through list of external elements until updating
+    //processor changes
+
+    while((j < num_external) &&
+          (new_external_processor[j] == recv_list[i])) {
+      ++newlength;
+      ++j;
+      if (j == num_external) break;
+    }
+
+    recv_length[i] = newlength;
+    neighbors[i] = recv_list[i];
+
+    length = j - start;
+    MPI_Send(&length, 1, MPI_INT, recv_list[i], MPI_MY_TAG, MPI_COMM_WORLD);
+  }
+
+  // Complete the receives of the number of externals
+
+  for(int i=0; i<num_recv_neighbors; ++i) {
+    if (MPI_Wait(&request[i], &status) != MPI_SUCCESS) {
+      std::cerr << "MPI_Wait error\n"<<std::endl;
+      MPI_Abort(MPI_COMM_WORLD, -1);
+    }
+    send_length[i] = lengths[i];
+  }
+
+  ////////////////////////////////////////////////////////////////////////
+  // Build "elements_to_send" list. These are the x elements I own
+  // that need to be sent to other processors.
+  ////////////////////////////////////////////////////////////////////////
+
+  ++MPI_MY_TAG;
+
+  j = 0;
+  for(int i=0; i<num_recv_neighbors; ++i) {
+    MPI_Irecv(&A.elements_to_send[j], send_length[i], mpi_dtype, neighbors[i],
+              MPI_MY_TAG, MPI_COMM_WORLD, &request[i]);
+    j += send_length[i];
+  }
+
+  j = 0;
+  for(int i=0; i<num_recv_neighbors; ++i) {
+    LocalOrdinal start = j;
+    LocalOrdinal newlength = 0;
+
+    // Go through list of external elements
+    // until updating processor changes. This is redundant, but
+    // saves us from recording this information.
+
+    while((j < num_external) &&
+          (new_external_processor[j] == recv_list[i])) {
+      ++newlength;
+      ++j;
+      if (j == num_external) break;
+    }
+    MPI_Send(&new_external[start], j-start, mpi_dtype, recv_list[i],
+             MPI_MY_TAG, MPI_COMM_WORLD);
+  }
+
+  // receive from each neighbor the global index list of external elements
+
+  for(int i=0; i<num_recv_neighbors; ++i) {
+    if (MPI_Wait(&request[i], &status) != MPI_SUCCESS) {
+      std::cerr << "MPI_Wait error\n"<<std::endl;
+      MPI_Abort(MPI_COMM_WORLD, -1);
+    }
+  }
+
+  /// replace global indices by local indices ///
+
+  for(GlobalOrdinal i=0; i<total_to_be_sent; ++i) {
+    A.elements_to_send[i] -= start_row;
+  }
+
+  //////////////////
+  // Finish up !!
+  //////////////////
+
+  A.num_cols = local_nrow + num_external;
+
+#else
+  A.num_cols = A.rows.size();
+#endif
+
+  A.has_local_indices = true;
+}
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/basic/make_targets b/openmp-avx512/basic/make_targets
new file mode 100644
index 0000000..4f9d92e
--- /dev/null
+++ b/openmp-avx512/basic/make_targets
@@ -0,0 +1,52 @@
+
+OBJS = \
+	BoxPartition.o \
+	YAML_Doc.o \
+	YAML_Element.o \
+	TBBNode.o
+
+UTIL_OBJS = \
+	param_utils.o \
+	utils.o \
+	mytimer.o
+
+MAIN_OBJ = \
+	main.o
+
+UTEST_OBJS = \
+	utest.o
+
+MINIFE_INFO = 1
+MINIFE_KERNELS = 0
+
+vpath %.cpp ../utils
+
+all:common_files generate_info miniFE.x
+# Link the optional objects too, as miniFE.x does; they are already built as prerequisites.
+utest.x:common_files $(OBJS) $(OPTIONAL_OBJS) $(UTIL_OBJS) $(UTEST_OBJS) *.hpp
+	$(CXX) $(CXXFLAGS) $(CPPFLAGS) $(UTEST_OBJS) $(OBJS) $(UTIL_OBJS) $(OPTIONAL_OBJS) -o utest.x $(LDFLAGS) $(OPTIONAL_LIBS) $(LIBS)
+
+miniFE.x:common_files $(MAIN_OBJ) $(OBJS) $(UTIL_OBJS) $(OPTIONAL_OBJS) *.hpp generate_info
+	$(INSTRUMENT) $(CXX) $(CXXFLAGS) $(CPPFLAGS) $(MAIN_OBJ) $(OBJS) $(UTIL_OBJS) $(OPTIONAL_OBJS) -o miniFE.x $(LDFLAGS) $(OPTIONAL_LIBS) $(LIBS)
+
+common_files:
+	./get_common_files
+
+generate_info:
+	./generate_info_header "$(CXX)" "$(CXXFLAGS)" "miniFE" "MINIFE"
+
+verify:all
+	./run_tests
+
+%.o:%.cpp *.hpp
+	$(CXX) $(CXXFLAGS) $(CPPFLAGS) -DMINIFE_INFO=$(MINIFE_INFO) -DMINIFE_KERNELS=$(MINIFE_KERNELS) -c $<
+
+%.o:%.c *.h
+	$(CC) $(CFLAGS) $(CPPFLAGS) -c $<
+
+clean:
+	rm -f *.o *.a *.x *.linkinfo miniFE_info.hpp
+
+realclean: clean
+	rm -f gmon.out gprof.* *~ *.yaml *.TVD.* *.mtx* *.vec* run_utest_* minife_debug*
+
diff --git a/openmp-avx512/basic/makefile b/openmp-avx512/basic/makefile
new file mode 100644
index 0000000..47c1bd5
--- /dev/null
+++ b/openmp-avx512/basic/makefile
@@ -0,0 +1,35 @@
+#-----------------------------------------------------------------------
+
+MINIFE_TYPES =  \
+        -DMINIFE_SCALAR=double   \
+        -DMINIFE_LOCAL_ORDINAL=int      \
+        -DMINIFE_GLOBAL_ORDINAL=int
+
+MINIFE_MATRIX_TYPE = -DMINIFE_CSR_MATRIX
+# MINIFE_MATRIX_TYPE = -DMINIFE_ELL_MATRIX
+
+#-----------------------------------------------------------------------
+
+CFLAGS = -O3
+CXXFLAGS = -O3
+
+# For debugging, the macro MINIFE_DEBUG will cause miniFE to dump a log file
+# from each proc containing various information.
+# This macro will also enable a somewhat expensive range-check on indices in
+# the exchange_externals function.
+
+# CPPFLAGS = -I. -I../utils -I../fem $(MINIFE_TYPES) -DMINIFE_DEBUG -DHAVE_MPI -DMPICH_IGNORE_CXX_SEEK
+CPPFLAGS = -I. -I../utils -I../fem $(MINIFE_TYPES) $(MINIFE_MATRIX_TYPE) -DHAVE_MPI -DMPICH_IGNORE_CXX_SEEK
+
+LDFLAGS=
+LIBS=
+
+# The MPICH_IGNORE_CXX_SEEK macro is required for some mpich versions,
+# such as the one on my cygwin machine.
+
+CXX=mpicxx
+CC=mpicc
+
+include make_targets
+include ./optional/make_targets
+
diff --git a/openmp-avx512/basic/makefile.cuda.gnu.serial b/openmp-avx512/basic/makefile.cuda.gnu.serial
new file mode 100644
index 0000000..9ba0ce1
--- /dev/null
+++ b/openmp-avx512/basic/makefile.cuda.gnu.serial
@@ -0,0 +1,36 @@
+#-----------------------------------------------------------------------
+#  DEFINES, INCLUDES, OBJECTS, and LIBRARIES
+#  for the CUDA option.
+
+CUDA_DIR=/usr/local/cuda/3.0/cuda
+
+DEVICE_EMULATION=--device-emulation
+DEVICE_EMULATION=
+CUDAFLAGS = -arch=sm_13 -O3 $(DEVICE_EMULATION)
+
+OPTIONAL_DEFS = -DMINIFE_HAVE_CUDA
+OPTIONAL_INCS = -I$(CUDA_DIR)/include -I./optional/cuda
+OPTIONAL_OBJS = CudaNode.o CudaVector.o CudaMatrix.o
+OPTIONAL_LIBS = -L$(CUDA_DIR)/lib64 -lcublas -lcudart
+
+#-----------------------------------------------------------------------
+
+MINIFE_TYPES =	\
+	-DMINIFE_SCALAR=float	\
+	-DMINIFE_LOCAL_ORDINAL=int	\
+	-DMINIFE_GLOBAL_ORDINAL=int
+
+#-----------------------------------------------------------------------
+
+CFLAGS = -O3
+CXXFLAGS = -O3
+CPPFLAGS = -I. -I../utils -I../fem $(MINIFE_TYPES) $(OPTIONAL_INCS) $(OPTIONAL_DEFS)
+LDFLAGS =
+LIBS =
+
+CXX=g++
+CC=gcc
+
+include make_targets
+include ./optional/make_targets
+
diff --git a/openmp-avx512/basic/makefile.cuda.tbb.gnu.serial b/openmp-avx512/basic/makefile.cuda.tbb.gnu.serial
new file mode 100644
index 0000000..f375512
--- /dev/null
+++ b/openmp-avx512/basic/makefile.cuda.tbb.gnu.serial
@@ -0,0 +1,37 @@
+#-----------------------------------------------------------------------
+#  DEFINES, INCLUDES, OBJECTS, and LIBRARIES
+#  for the CUDA option *AND* for the TBB option.
+
+TBB_DIR=/sierra/Sntools/extras/compilers/intel/Compiler/11.1/064/tbb
+CUDA_DIR=/usr/local/cuda/3.0/cuda
+
+DEVICE_EMULATION=--device-emulation
+DEVICE_EMULATION=
+CUDAFLAGS = -arch=sm_13 -O3 $(DEVICE_EMULATION)
+
+OPTIONAL_DEFS = -DMINIFE_HAVE_TBB -DMINIFE_HAVE_CUDA
+OPTIONAL_INCS = -I$(CUDA_DIR)/include -I./optional/cuda -I$(TBB_DIR)/include
+OPTIONAL_OBJS = CudaNode.o CudaVector.o CudaMatrix.o
+OPTIONAL_LIBS = -L$(TBB_DIR)/intel64/cc4.1.0_libc2.4_kernel2.6.16.21/lib -ltbb -ltbbmalloc -L$(CUDA_DIR)/lib64 -lcublas -lcudart
+
+#-----------------------------------------------------------------------
+
+MINIFE_TYPES =	\
+	-DMINIFE_SCALAR=float	\
+	-DMINIFE_LOCAL_ORDINAL=int	\
+	-DMINIFE_GLOBAL_ORDINAL=int
+
+#-----------------------------------------------------------------------
+
+CFLAGS = -O3
+CXXFLAGS = -O3
+CPPFLAGS = -I. -I../utils -I../fem $(MINIFE_TYPES) $(OPTIONAL_INCS) $(OPTIONAL_DEFS)
+LDFLAGS =
+LIBS =
+
+CXX=g++
+CC=gcc
+
+include make_targets
+include ./optional/make_targets
+
diff --git a/openmp-avx512/basic/makefile.debug b/openmp-avx512/basic/makefile.debug
new file mode 100644
index 0000000..c6d4efb
--- /dev/null
+++ b/openmp-avx512/basic/makefile.debug
@@ -0,0 +1,35 @@
+#-----------------------------------------------------------------------
+
+MINIFE_TYPES =  \
+        -DMINIFE_SCALAR=double   \
+        -DMINIFE_LOCAL_ORDINAL=int      \
+        -DMINIFE_GLOBAL_ORDINAL=int
+
+MINIFE_MATRIX_TYPE = -DMINIFE_CSR_MATRIX
+# MINIFE_MATRIX_TYPE = -DMINIFE_ELL_MATRIX
+
+#-----------------------------------------------------------------------
+
+CFLAGS = -g
+CXXFLAGS = -g
+
+# For debugging, the macro MINIFE_DEBUG will cause miniFE to dump a log file
+# from each proc containing various information.
+# This macro will also enable a somewhat expensive range-check on indices in
+# the exchange_externals function.
+
+CPPFLAGS = -I. -I../utils -I../fem $(MINIFE_TYPES) $(MINIFE_MATRIX_TYPE) -DMINIFE_DEBUG -DHAVE_MPI -DMPICH_IGNORE_CXX_SEEK
+# CPPFLAGS = -I. -I../utils -I../fem $(MINIFE_TYPES) $(MINIFE_MATRIX_TYPE) -DHAVE_MPI -DMPICH_IGNORE_CXX_SEEK
+
+LDFLAGS=
+LIBS=
+
+# The MPICH_IGNORE_CXX_SEEK macro is required for some mpich versions,
+# such as the one on my cygwin machine.
+
+CXX=mpicxx
+CC=mpicc
+
+include make_targets
+include ./optional/make_targets
+
diff --git a/openmp-avx512/basic/makefile.gnu.purify b/openmp-avx512/basic/makefile.gnu.purify
new file mode 100644
index 0000000..e667ed4
--- /dev/null
+++ b/openmp-avx512/basic/makefile.gnu.purify
@@ -0,0 +1,25 @@
+#-----------------------------------------------------------------------
+
+MINIFE_TYPES =  \
+	-DMINIFE_SCALAR=double   \
+	-DMINIFE_LOCAL_ORDINAL=int      \
+	-DMINIFE_GLOBAL_ORDINAL=int
+
+#-----------------------------------------------------------------------
+
+
+CFLAGS = -g
+CXXFLAGS = -g
+CPPFLAGS = -I. -I../utils -I../fem $(MINIFE_TYPES)
+
+CXX=g++
+CC=gcc
+
+# Change 'purify' to 'quantify' to do performance profiling instead of memory checking, or
+# comment this out to do no instrumentation:
+
+INSTRUMENT=/usr/local/rational/rational7/releases/PurifyPlus.7.0.0.0-012/i386_linux2/bin/purify -always-use-cache-dir -cache-dir=/var/scratch2/william/purify-cache
+
+include make_targets
+include ./optional/make_targets
+
diff --git a/openmp-avx512/basic/makefile.gnu.quantify b/openmp-avx512/basic/makefile.gnu.quantify
new file mode 100644
index 0000000..3637812
--- /dev/null
+++ b/openmp-avx512/basic/makefile.gnu.quantify
@@ -0,0 +1,24 @@
+#-----------------------------------------------------------------------
+
+MINIFE_TYPES =  \
+	-DMINIFE_SCALAR=double   \
+	-DMINIFE_LOCAL_ORDINAL=int      \
+	-DMINIFE_GLOBAL_ORDINAL=int
+
+#-----------------------------------------------------------------------
+
+CFLAGS = -g
+CXXFLAGS = -g
+CPPFLAGS = -I. -I../utils -I../fem $(MINIFE_TYPES)
+
+CXX=g++
+CC=gcc
+
+# Change 'quantify' to 'purify' to do memory checking instead of performance profiling, or
+# comment this out to do no instrumentation:
+
+INSTRUMENT=/usr/local/rational/rational7/releases/PurifyPlus.7.0.0.0-012/i386_linux2/bin/quantify -always-use-cache-dir -cache-dir=/var/scratch2/william/quantify-cache
+
+include make_targets
+include ./optional/make_targets
+
diff --git a/openmp-avx512/basic/makefile.gnu.serial b/openmp-avx512/basic/makefile.gnu.serial
new file mode 100644
index 0000000..e40efac
--- /dev/null
+++ b/openmp-avx512/basic/makefile.gnu.serial
@@ -0,0 +1,21 @@
+#-----------------------------------------------------------------------
+
+MINIFE_TYPES =  \
+	-DMINIFE_SCALAR=double   \
+	-DMINIFE_LOCAL_ORDINAL=int      \
+	-DMINIFE_GLOBAL_ORDINAL=int
+
+#-----------------------------------------------------------------------
+
+CFLAGS = -g
+CXXFLAGS = -g
+CPPFLAGS = -I. -I../utils -I../fem $(MINIFE_TYPES)
+LDFLAGS =
+LIBS=-lm
+
+CXX=g++
+CC=gcc
+
+include make_targets
+include ./optional/make_targets
+
diff --git a/openmp-avx512/basic/makefile.intel.serial b/openmp-avx512/basic/makefile.intel.serial
new file mode 100644
index 0000000..ca5d30f
--- /dev/null
+++ b/openmp-avx512/basic/makefile.intel.serial
@@ -0,0 +1,19 @@
+#-----------------------------------------------------------------------
+
+MINIFE_TYPES =  \
+	-DMINIFE_SCALAR=double   \
+	-DMINIFE_LOCAL_ORDINAL=int      \
+	-DMINIFE_GLOBAL_ORDINAL=int
+
+#-----------------------------------------------------------------------
+
+CFLAGS = -g
+CXXFLAGS = -g
+CPPFLAGS = -I. -I../utils -I../fem $(MINIFE_TYPES)
+
+CXX=icpc
+CC=icc
+
+include make_targets
+include ./optional/make_targets
+
diff --git a/openmp-avx512/basic/makefile.redstorm b/openmp-avx512/basic/makefile.redstorm
new file mode 100644
index 0000000..526aca1
--- /dev/null
+++ b/openmp-avx512/basic/makefile.redstorm
@@ -0,0 +1,21 @@
+#-----------------------------------------------------------------------
+
+MINIFE_TYPES =  \
+	-DMINIFE_SCALAR=double   \
+	-DMINIFE_LOCAL_ORDINAL=int      \
+	-DMINIFE_GLOBAL_ORDINAL=int
+
+#-----------------------------------------------------------------------
+
+CFLAGS = -O3
+CXXFLAGS = -O3
+CPPFLAGS = -I. -I../utils -I../fem $(MINIFE_TYPES) -DHAVE_MPI -DMPICH_IGNORE_CXX_SEEK  -DREDSTORM
+
+# The MPICH_IGNORE_CXX_SEEK macro is required for some mpich versions,
+# such as the one on my cygwin machine.
+
+CXX=CC
+CC=cc
+
+include make_targets
+include ./optional/make_targets
diff --git a/openmp-avx512/basic/makefile.tbb b/openmp-avx512/basic/makefile.tbb
new file mode 100644
index 0000000..c0d5dfe
--- /dev/null
+++ b/openmp-avx512/basic/makefile.tbb
@@ -0,0 +1,28 @@
+TBB_BASE_DIR=/home/william/packages/tbb21_20080605oss
+
+OPTIONAL_DEFS=-DMINIFE_HAVE_TBB
+OPTIONAL_INCS=-I$(TBB_BASE_DIR)/include
+OPTIONAL_LIBS=-L$(TBB_BASE_DIR)/em64t/cc3.4.3_libc2.3.4_kernel2.6.9/lib	\
+	-ltbb -ltbbmalloc -lpthread
+
+#-----------------------------------------------------------------------
+
+MINIFE_TYPES =  \
+	-DMINIFE_SCALAR=double   \
+	-DMINIFE_LOCAL_ORDINAL=int      \
+	-DMINIFE_GLOBAL_ORDINAL=int
+
+#-----------------------------------------------------------------------
+
+CFLAGS = -O3
+CXXFLAGS = -O3
+CPPFLAGS = -I. -I../utils -I../fem $(MINIFE_TYPES) $(OPTIONAL_INCS) $(OPTIONAL_DEFS) -DHAVE_MPI
+LDFLAGS =
+LIBS =
+
+CXX=mpicxx
+CC=mpicc
+
+include make_targets
+include ./optional/make_targets
+
diff --git a/openmp-avx512/basic/makefile.tbb.gnu.serial b/openmp-avx512/basic/makefile.tbb.gnu.serial
new file mode 100644
index 0000000..c5a9722
--- /dev/null
+++ b/openmp-avx512/basic/makefile.tbb.gnu.serial
@@ -0,0 +1,28 @@
+TBB_BASE_DIR=/home/william/packages/tbb21_20080605oss
+
+OPTIONAL_DEFS=-DMINIFE_HAVE_TBB
+OPTIONAL_INCS=-I$(TBB_BASE_DIR)/include
+OPTIONAL_LIBS=-L$(TBB_BASE_DIR)/em64t/cc3.4.3_libc2.3.4_kernel2.6.9/lib	\
+	-ltbb -ltbbmalloc -lpthread
+
+#-----------------------------------------------------------------------
+
+MINIFE_TYPES =  \
+	-DMINIFE_SCALAR=float   \
+	-DMINIFE_LOCAL_ORDINAL=int      \
+	-DMINIFE_GLOBAL_ORDINAL=int
+
+#-----------------------------------------------------------------------
+
+CFLAGS = -O3
+CXXFLAGS = -O3
+CPPFLAGS = -I. -I../utils -I../fem $(MINIFE_TYPES) $(OPTIONAL_INCS) $(OPTIONAL_DEFS)
+LDFLAGS =
+LIBS =
+
+CXX=g++
+CC=gcc
+
+include make_targets
+include ./optional/make_targets
+
diff --git a/openmp-avx512/basic/makefile.tpi b/openmp-avx512/basic/makefile.tpi
new file mode 100644
index 0000000..b3ba4a1
--- /dev/null
+++ b/openmp-avx512/basic/makefile.tpi
@@ -0,0 +1,28 @@
+#-----------------------------------------------------------------------
+
+OPTIONAL_DEFS = -DMINIFE_HAVE_TPI -DHAVE_PTHREAD
+OPTIONAL_INCS = -I./optional/ThreadPool -I./optional/ThreadPool/src
+OPTIONAL_OBJS = TPI.o
+OPTIONAL_LIBS = -lpthread
+
+#-----------------------------------------------------------------------
+
+MINIFE_TYPES =  \
+	-DMINIFE_SCALAR=double   \
+	-DMINIFE_LOCAL_ORDINAL=int      \
+	-DMINIFE_GLOBAL_ORDINAL=int
+
+#-----------------------------------------------------------------------
+
+CFLAGS = -O3
+CXXFLAGS = -O3
+CPPFLAGS = -I. -I../utils -I../fem $(MINIFE_TYPES) -DHAVE_MPI $(OPTIONAL_INCS) $(OPTIONAL_DEFS)
+LDFLAGS =
+LIBS =
+
+CXX=mpicxx
+CC=mpicc
+
+include make_targets
+include ./optional/make_targets
+
diff --git a/openmp-avx512/basic/makefile.tpi.gnu.serial b/openmp-avx512/basic/makefile.tpi.gnu.serial
new file mode 100644
index 0000000..a5788dc
--- /dev/null
+++ b/openmp-avx512/basic/makefile.tpi.gnu.serial
@@ -0,0 +1,28 @@
+#-----------------------------------------------------------------------
+
+OPTIONAL_DEFS = -DMINIFE_HAVE_TPI -DHAVE_PTHREAD
+OPTIONAL_INCS = -I./optional/ThreadPool
+OPTIONAL_OBJS = TPI.o
+OPTIONAL_LIBS = -lpthread
+
+#-----------------------------------------------------------------------
+
+MINIFE_TYPES =  \
+	-DMINIFE_SCALAR=double   \
+	-DMINIFE_LOCAL_ORDINAL=int      \
+	-DMINIFE_GLOBAL_ORDINAL=int
+
+#-----------------------------------------------------------------------
+
+CFLAGS = -O3
+CXXFLAGS = -O3
+CPPFLAGS = -I. -I../utils -I../fem $(MINIFE_TYPES) $(OPTIONAL_INCS) $(OPTIONAL_DEFS)
+LDFLAGS =
+LIBS =
+
+CXX=g++
+CC=gcc
+
+include make_targets
+include ./optional/make_targets
+
diff --git a/openmp-avx512/basic/optional/README b/openmp-avx512/basic/optional/README
new file mode 100644
index 0000000..e5975dc
--- /dev/null
+++ b/openmp-avx512/basic/optional/README
@@ -0,0 +1,3 @@
+
+ThreadPool/  is extracted from  Trilinos/packages/ThreadPool/src/
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/CMakeLists.txt b/openmp-avx512/basic/optional/ThreadPool/CMakeLists.txt
new file mode 100644
index 0000000..e5f7729
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/CMakeLists.txt
@@ -0,0 +1,53 @@
+
+INCLUDE(PackageMacros)
+INCLUDE(AddOptionAndDefine)
+
+#
+# A) Define the package
+#
+
+PACKAGE(ThreadPool)
+
+#
+# B) Set up package-specific options
+#
+
+# Pthread is a required dependency, so this conditional should always be true;
+# hence the assert after it.
+SET(HAVE_PTHREAD FALSE)
+IF(TPL_ENABLE_Pthread)
+  SET(HAVE_PTHREAD TRUE)
+ENDIF()
+IF (${PROJECT_NAME}_VERBOSE_CONFIGURE)
+  PRINT_VAR(HAVE_PTHREAD)
+ENDIF()
+ASSERT_DEFINED(HAVE_PTHREAD)
+
+IF(${PACKAGE_NAME}_ENABLE_MPI)
+  SET(HAVE_MPI TRUE)
+ENDIF()
+
+#
+# C) Add the libraries, tests, and examples
+#
+
+ADD_SUBDIRECTORY(src)
+
+IF(HAVE_PTHREAD)
+  IF (${PROJECT_NAME}_VERBOSE_CONFIGURE)
+    MESSAGE(STATUS "ADDING THREADPOOL TESTS")
+  ENDIF()
+  PACKAGE_ADD_TEST_DIRECTORIES(test test/hpccg test/hhpccg)
+ENDIF()
+
+#
+# Exclude files for source package.
+#
+
+PACKAGE_ARCH_EXCLUDE_AUTOTOOLS_FILES()
+
+#
+# D) Do standard postprocessing
+#
+
+PACKAGE_POSTPROCESS()
diff --git a/openmp-avx512/basic/optional/ThreadPool/Makefile.am b/openmp-avx512/basic/optional/ThreadPool/Makefile.am
new file mode 100644
index 0000000..eac6f19
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/Makefile.am
@@ -0,0 +1,199 @@
+# @HEADER
+# ************************************************************************
+# 
+#                          ThreadPool Package
+#                 Copyright (2008) Sandia Corporation
+# 
+# Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+# license for use of this work by or on behalf of the U.S. Government.
+# 
+# This library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation; either version 2.1 of the
+# License, or (at your option) any later version.
+#  
+# This library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#  
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+# USA
+# Questions? Contact Carter Edwards (hcedwar@sandia.gov) 
+# 
+# ************************************************************************
+# @HEADER
+
+## #######################################################################
+## Options to automake (rarely used - don't worry about it)
+## #######################################################################
+AUTOMAKE_OPTIONS = foreign
+
+## #######################################################################
+## Aclocal command (rarely used - don't worry about it)
+## #######################################################################
+ACLOCAL_AMFLAGS = -I config
+
+#
+#       I believe that by switching to AUX_DIR(../../config) one 
+#       could get rid of these.
+#
+## #######################################################################
+## Additional files to be included in distribution for 'make dist'
+## #######################################################################
+#np# For a typical package, there is no reason to distribute these files
+#np# because users should not have to bootstrap.  We distribute them with
+#np# new package so that the files can be used in creating the 
+#np# configure script for other packages. 
+EXTRA_DIST = \
+config/generate-makeoptions.pl \
+config/replace-install-prefix.pl config/string-replace.pl \
+config/strip_dup_incl_paths.pl   config/strip_dup_libs.pl \
+config/token-replace.pl
+
+## #######################################################################
+## Tools in the auxiliary directory
+## #######################################################################
+AUX_DIST = config/install-sh config/missing config/mkinstalldirs 
+#
+#  Again, I hope that AUX_DIR(../../config) eliminates these
+#  config/install-sh config/missing config/mkinstalldirs 
+
+## #######################################################################
+## Files to be deleted by 'make maintainer-clean'
+## #######################################################################
+MAINTAINERCLEANFILES = Makefile.in aclocal.m4 autom4te.cache/* \
+	configure config.status config.log \
+	src/common/config-h.in src/common/stamp-h.in \
+	$(AUX_DIST) 
+
+#The following line helps the test harness recover from build errors.
+                                                                                
+all-local:
+	@echo "Trilinos package ThreadPool built successfully"
+
+## #######################################################################
+## Export Makefile Installation
+## #######################################################################
+if USING_EXPORT_MAKEFILES
+
+install-exec-hook:
+	mkdir -p $(DESTDIR)$(includedir)
+	cp $(top_builddir)/Makefile.export.threadpool $(DESTDIR)$(includedir)/.
+	$(PERL_EXE) $(top_srcdir)/config/replace-install-prefix.pl \
+		--exec-prefix=$(exec_prefix) \
+		--my-export-makefile=Makefile.export.threadpool \
+		--my-abs-top-srcdir=@abs_top_srcdir@ \
+		--my-abs-incl-dirs=@abs_top_builddir@/src:@abs_top_srcdir@/src \
+		--my-abs-lib-dirs=@abs_top_builddir@/src
+	$(PERL_EXE) $(top_srcdir)/config/generate-makeoptions.pl $(top_builddir)/src/Makefile \
+		THREADPOOL > $(DESTDIR)$(includedir)/Makefile.export.threadpool.macros
+
+# Mirror install-exec-hook, which installs under DESTDIR, so staged installs are cleaned up.
+uninstall-hook:
+	rm -f $(DESTDIR)$(includedir)/Makefile.export.threadpool $(DESTDIR)$(includedir)/Makefile.export.threadpool.macros
+
+else
+
+install-exec-hook:
+
+uninstall-hook:
+
+endif
+
+## #######################################################################
+## Subdirectories to be make'd recursively
+## #######################################################################
+#We now build tests and examples through separate make targets, rather than
+#during "make".  We still need to conditionally include the test and example
+#in SUBDIRS, even though BUILD_TESTS and BUILD_EXAMPLES will never be
+#defined, so that the tests and examples are included in the distribution
+#tarball. 
+
+if SUB_TEST
+TEST_SUBDIR=test
+endif
+
+#if SUB_EXAMPLE
+#EXAMPLE_SUBDIR=example
+#endif
+
+#  #np# - The following make targets must be defined for all packages.
+#  #np# - If the package does not have tests or examples, replace the
+#  #np# - corresponding rules with something like:
+#  #np# - @echo "new_package does not have any tests yet"
+if BUILD_TESTS
+tests:
+	@echo ""
+	@echo "Now building ThreadPool tests."
+	@echo ""
+	cd $(top_builddir)/test && $(MAKE)
+	@echo ""
+	@echo "Finished building ThreadPool tests."
+	@echo ""
+else
+tests:
+	@echo "ThreadPool tests were disabled at configure time"
+endif
+
+examples:
+	@echo "ThreadPool does not have any examples yet"
+
+install-examples:
+	@echo "ThreadPool does not have any examples yet"
+
+clean-tests:
+	cd $(top_builddir)/test && $(MAKE) clean
+
+clean-examples:
+	@echo "ThreadPool does not have any examples yet"
+
+everything:
+	$(MAKE) && $(MAKE) examples && $(MAKE) tests
+
+clean-everything:
+	$(MAKE) clean-examples && $(MAKE) clean-tests && $(MAKE) clean
+
+install-everything:
+	$(MAKE) install && $(MAKE) install-examples
+
+SUBDIRS = src $(TEST_SUBDIR)
+
+## #######################################################################
+## The below targets allow you to use the new
+## testharness to run the test suite as make targets
+## #######################################################################
+
+TRILINOS_HOME_DIR=@abs_top_srcdir@/../..
+TRILINOS_BUILD_DIR=@abs_top_builddir@/../..
+TRILINOS_TEST_CATEGORY=INSTALL
+
+runtests-serial :
+	$(PERL_EXE) $(TRILINOS_HOME_DIR)/commonTools/test/utilities/runtests \
+  --trilinos-dir=$(TRILINOS_HOME_DIR) \
+  --comm=serial \
+  --build-dir=$(TRILINOS_BUILD_DIR) \
+  --category=$(TRILINOS_TEST_CATEGORY) \
+  --output-dir=@abs_top_builddir@/test/runtests-results \
+  --verbosity=1 \
+  --packages=ThreadPool
+
+runtests-mpi :
+	$(PERL_EXE) $(TRILINOS_HOME_DIR)/commonTools/test/utilities/runtests \
+  --trilinos-dir=$(TRILINOS_HOME_DIR) \
+  --comm=mpi \
+  --mpi-go=$(TRILINOS_MPI_GO) \
+  --build-dir=$(TRILINOS_BUILD_DIR) \
+  --category=$(TRILINOS_TEST_CATEGORY) \
+  --output-dir=@abs_top_builddir@/test/runtests-results \
+  --verbosity=1 \
+  --packages=ThreadPool
+
+if HAVE_MPI
+THREADPOOL_CHECK_COMM=mpi
+else
+THREADPOOL_CHECK_COMM=serial
+endif
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/Makefile.export.threadpool.in b/openmp-avx512/basic/optional/ThreadPool/Makefile.export.threadpool.in
new file mode 100644
index 0000000..66bfda9
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/Makefile.export.threadpool.in
@@ -0,0 +1,9 @@
+_THREADPOOL_INCLUDES = -I@abs_top_srcdir@/include -I@abs_top_builddir@/include
+
+_THREADPOOL_LIBS     = @LDFLAGS@ -L@abs_top_builddir@/src -ltpi $(LIBS)
+
+@USING_GNUMAKE_TRUE@THREADPOOL_INCLUDES  = $(shell @PERL_EXE@ @abs_top_srcdir@/config/strip_dup_incl_paths.pl $(_THREADPOOL_INCLUDES))
+@USING_GNUMAKE_TRUE@THREADPOOL_LIBS      = $(shell @PERL_EXE@ @abs_top_srcdir@/config/strip_dup_libs.pl $(_THREADPOOL_LIBS))
+
+@USING_GNUMAKE_FALSE@THREADPOOL_INCLUDES = $(_THREADPOOL_INCLUDES)
+@USING_GNUMAKE_FALSE@THREADPOOL_LIBS     = $(_THREADPOOL_LIBS)
diff --git a/openmp-avx512/basic/optional/ThreadPool/Makefile.in b/openmp-avx512/basic/optional/ThreadPool/Makefile.in
new file mode 100644
index 0000000..3e4abfd
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/Makefile.in
@@ -0,0 +1,777 @@
+# Makefile.in generated by automake 1.10 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
+# 2003, 2004, 2005, 2006  Free Software Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+# @HEADER
+# ************************************************************************
+# 
+#                          ThreadPool Package
+#                 Copyright (2008) Sandia Corporation
+# 
+# Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+# license for use of this work by or on behalf of the U.S. Government.
+# 
+# This library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation; either version 2.1 of the
+# License, or (at your option) any later version.
+#  
+# This library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#  
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+# USA
+# Questions? Contact Carter Edwards (hcedwar@sandia.gov) 
+# 
+# ************************************************************************
+# @HEADER
+VPATH = @srcdir@
+pkgdatadir = $(datadir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+target_triplet = @target@
+subdir = .
+DIST_COMMON = $(am__configure_deps) $(srcdir)/Makefile.am \
+	$(srcdir)/Makefile.export.threadpool.in $(srcdir)/Makefile.in \
+	$(top_srcdir)/configure config/config.guess config/config.sub \
+	config/depcomp config/install-sh config/missing
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/config/acx_pthread.m4 \
+	$(top_srcdir)/config/tac_arg_check_mpi.m4 \
+	$(top_srcdir)/config/tac_arg_config_mpi.m4 \
+	$(top_srcdir)/config/tac_arg_enable_export-makefiles.m4 \
+	$(top_srcdir)/config/tac_arg_enable_feature.m4 \
+	$(top_srcdir)/config/tac_arg_enable_feature_sub_check.m4 \
+	$(top_srcdir)/config/tac_arg_with_ar.m4 \
+	$(top_srcdir)/config/tac_arg_with_flags.m4 \
+	$(top_srcdir)/config/tac_arg_with_incdirs.m4 \
+	$(top_srcdir)/config/tac_arg_with_libdirs.m4 \
+	$(top_srcdir)/config/tac_arg_with_libs.m4 \
+	$(top_srcdir)/config/tac_arg_with_perl.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+am__CONFIG_DISTCLEAN_FILES = config.status config.cache config.log \
+ configure.lineno config.status.lineno
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/src/ThreadPool_config.h
+CONFIG_CLEAN_FILES = Makefile.export.threadpool
+SOURCES =
+DIST_SOURCES =
+RECURSIVE_TARGETS = all-recursive check-recursive dvi-recursive \
+	html-recursive info-recursive install-data-recursive \
+	install-dvi-recursive install-exec-recursive \
+	install-html-recursive install-info-recursive \
+	install-pdf-recursive install-ps-recursive install-recursive \
+	installcheck-recursive installdirs-recursive pdf-recursive \
+	ps-recursive uninstall-recursive
+RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive	\
+  distclean-recursive maintainer-clean-recursive
+ETAGS = etags
+CTAGS = ctags
+DIST_SUBDIRS = src test
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+distdir = $(PACKAGE)-$(VERSION)
+top_distdir = $(distdir)
+am__remove_distdir = \
+  { test ! -d $(distdir) \
+    || { find $(distdir) -type d ! -perm -200 -exec chmod u+w {} ';' \
+         && rm -fr $(distdir); }; }
+DIST_ARCHIVES = $(distdir).tar.gz
+GZIP_ENV = --best
+distuninstallcheck_listfiles = find . -type f -print
+distcleancheck_listfiles = find . -type f -print
+ACLOCAL = @ACLOCAL@
+ALTERNATE_AR = @ALTERNATE_AR@
+AMTAR = @AMTAR@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CPPFLAGS = @CPPFLAGS@
+CXX = @CXX@
+CXXCPP = @CXXCPP@
+CXXDEPMODE = @CXXDEPMODE@
+CXXFLAGS = @CXXFLAGS@
+CYGPATH_W = @CYGPATH_W@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+GREP = @GREP@
+HAVE_PERL = @HAVE_PERL@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBS = @LIBS@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MKDIR_P = @MKDIR_P@
+MPI_CC_EXISTS = @MPI_CC_EXISTS@
+MPI_CXX = @MPI_CXX@
+MPI_CXX_EXISTS = @MPI_CXX_EXISTS@
+MPI_F77_EXISTS = @MPI_F77_EXISTS@
+MPI_TEMP_CXX = @MPI_TEMP_CXX@
+OBJEXT = @OBJEXT@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+PERL_EXE = @PERL_EXE@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SET_MAKE = @SET_MAKE@
+SHELL = @SHELL@
+STRIP = @STRIP@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_aux_dir = @ac_aux_dir@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_CXX = @ac_ct_CXX@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target = @target@
+target_alias = @target_alias@
+target_cpu = @target_cpu@
+target_os = @target_os@
+target_vendor = @target_vendor@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+AUTOMAKE_OPTIONS = foreign
+ACLOCAL_AMFLAGS = -I config
+
+#
+#       I believe that by switching to AUX_DIR(../../config) one 
+#       could get rid of these.
+#
+#np# For a typical package, there is no reason to distribute these files
+#np# because users should not have to bootstrap.  We distribute them with
+#np# new package so that the files can be used in creating the 
+#np# configure script for other packages. 
+EXTRA_DIST = \
+config/generate-makeoptions.pl \
+config/replace-install-prefix.pl config/string-replace.pl \
+config/strip_dup_incl_paths.pl   config/strip_dup_libs.pl \
+config/token-replace.pl
+
+AUX_DIST = config/install-sh config/missing config/mkinstalldirs 
+#
+#  Again, I hope that AUX_DIR(../../config) eliminates these
+#  config/install-sh config/missing config/mkinstalldirs 
+MAINTAINERCLEANFILES = Makefile.in aclocal.m4 autom4te.cache/* \
+	configure config.status config.log \
+	src/common/config-h.in src/common/stamp-h.in \
+	$(AUX_DIST) 
+
+
+#We now build tests and examples through separate make targets, rather than
+#during "make".  We still need to conditionally include the test and example
+#in SUBDIRS, even though BUILD_TESTS and BUILD_EXAMPLES will never be
+#defined, so that the tests and examples are included in the distribution
+#tarball. 
+@SUB_TEST_TRUE@TEST_SUBDIR = test
+SUBDIRS = src $(TEST_SUBDIR)
+TRILINOS_HOME_DIR = @abs_top_srcdir@/../..
+TRILINOS_BUILD_DIR = @abs_top_builddir@/../..
+TRILINOS_TEST_CATEGORY = INSTALL
+@HAVE_MPI_FALSE@THREADPOOL_CHECK_COMM = serial
+@HAVE_MPI_TRUE@THREADPOOL_CHECK_COMM = mpi
+all: all-recursive
+
+.SUFFIXES:
+am--refresh:
+	@:
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am  $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      echo ' cd $(srcdir) && $(AUTOMAKE) --foreign '; \
+	      cd $(srcdir) && $(AUTOMAKE) --foreign  \
+		&& exit 0; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign  Makefile'; \
+	cd $(top_srcdir) && \
+	  $(AUTOMAKE) --foreign  Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    echo ' $(SHELL) ./config.status'; \
+	    $(SHELL) ./config.status;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $@ $(am__depfiles_maybe);; \
+	esac;
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	$(SHELL) ./config.status --recheck
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(srcdir) && $(AUTOCONF)
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(srcdir) && $(ACLOCAL) $(ACLOCAL_AMFLAGS)
+Makefile.export.threadpool: $(top_builddir)/config.status $(srcdir)/Makefile.export.threadpool.in
+	cd $(top_builddir) && $(SHELL) ./config.status $@
+
+# This directory's subdirectories are mostly independent; you can cd
+# into them and run `make' without going through this Makefile.
+# To change the values of `make' variables: instead of editing Makefiles,
+# (1) if the variable is set in `config.status', edit `config.status'
+#     (which will cause the Makefiles to be regenerated when you run `make');
+# (2) otherwise, pass the desired values on the `make' command line.
+$(RECURSIVE_TARGETS):
+	@failcom='exit 1'; \
+	for f in x $$MAKEFLAGS; do \
+	  case $$f in \
+	    *=* | --[!k]*);; \
+	    *k*) failcom='fail=yes';; \
+	  esac; \
+	done; \
+	dot_seen=no; \
+	target=`echo $@ | sed s/-recursive//`; \
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  echo "Making $$target in $$subdir"; \
+	  if test "$$subdir" = "."; then \
+	    dot_seen=yes; \
+	    local_target="$$target-am"; \
+	  else \
+	    local_target="$$target"; \
+	  fi; \
+	  (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
+	  || eval $$failcom; \
+	done; \
+	if test "$$dot_seen" = "no"; then \
+	  $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \
+	fi; test -z "$$fail"
+
+$(RECURSIVE_CLEAN_TARGETS):
+	@failcom='exit 1'; \
+	for f in x $$MAKEFLAGS; do \
+	  case $$f in \
+	    *=* | --[!k]*);; \
+	    *k*) failcom='fail=yes';; \
+	  esac; \
+	done; \
+	dot_seen=no; \
+	case "$@" in \
+	  distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \
+	  *) list='$(SUBDIRS)' ;; \
+	esac; \
+	rev=''; for subdir in $$list; do \
+	  if test "$$subdir" = "."; then :; else \
+	    rev="$$subdir $$rev"; \
+	  fi; \
+	done; \
+	rev="$$rev ."; \
+	target=`echo $@ | sed s/-recursive//`; \
+	for subdir in $$rev; do \
+	  echo "Making $$target in $$subdir"; \
+	  if test "$$subdir" = "."; then \
+	    local_target="$$target-am"; \
+	  else \
+	    local_target="$$target"; \
+	  fi; \
+	  (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
+	  || eval $$failcom; \
+	done && test -z "$$fail"
+tags-recursive:
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  test "$$subdir" = . || (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \
+	done
+ctags-recursive:
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  test "$$subdir" = . || (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) ctags); \
+	done
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '    { files[$$0] = 1; } \
+	       END { for (i in files) print i; }'`; \
+	mkid -fID $$unique
+tags: TAGS
+
+TAGS: tags-recursive $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	tags=; \
+	here=`pwd`; \
+	if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \
+	  include_option=--etags-include; \
+	  empty_fix=.; \
+	else \
+	  include_option=--include; \
+	  empty_fix=; \
+	fi; \
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  if test "$$subdir" = .; then :; else \
+	    test ! -f $$subdir/TAGS || \
+	      tags="$$tags $$include_option=$$here/$$subdir/TAGS"; \
+	  fi; \
+	done; \
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '    { files[$$0] = 1; } \
+	       END { for (i in files) print i; }'`; \
+	if test -z "$(ETAGS_ARGS)$$tags$$unique"; then :; else \
+	  test -n "$$unique" || unique=$$empty_fix; \
+	  $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	    $$tags $$unique; \
+	fi
+ctags: CTAGS
+CTAGS: ctags-recursive $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	tags=; \
+	here=`pwd`; \
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '    { files[$$0] = 1; } \
+	       END { for (i in files) print i; }'`; \
+	test -z "$(CTAGS_ARGS)$$tags$$unique" \
+	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+	     $$tags $$unique
+
+GTAGS:
+	here=`$(am__cd) $(top_builddir) && pwd` \
+	  && cd $(top_srcdir) \
+	  && gtags -i $(GTAGS_ARGS) $$here
+
+distclean-tags:
+	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(DISTFILES)
+	$(am__remove_distdir)
+	test -d $(distdir) || mkdir $(distdir)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -pR $(srcdir)/$$file $(distdir)$$dir || exit 1; \
+	    fi; \
+	    cp -pR $$d/$$file $(distdir)$$dir || exit 1; \
+	  else \
+	    test -f $(distdir)/$$file \
+	    || cp -p $$d/$$file $(distdir)/$$file \
+	    || exit 1; \
+	  fi; \
+	done
+	list='$(DIST_SUBDIRS)'; for subdir in $$list; do \
+	  if test "$$subdir" = .; then :; else \
+	    test -d "$(distdir)/$$subdir" \
+	    || $(MKDIR_P) "$(distdir)/$$subdir" \
+	    || exit 1; \
+	    distdir=`$(am__cd) $(distdir) && pwd`; \
+	    top_distdir=`$(am__cd) $(top_distdir) && pwd`; \
+	    (cd $$subdir && \
+	      $(MAKE) $(AM_MAKEFLAGS) \
+	        top_distdir="$$top_distdir" \
+	        distdir="$$distdir/$$subdir" \
+		am__remove_distdir=: \
+		am__skip_length_check=: \
+	        distdir) \
+	      || exit 1; \
+	  fi; \
+	done
+	-find $(distdir) -type d ! -perm -777 -exec chmod a+rwx {} \; -o \
+	  ! -type d ! -perm -444 -links 1 -exec chmod a+r {} \; -o \
+	  ! -type d ! -perm -400 -exec chmod a+r {} \; -o \
+	  ! -type d ! -perm -444 -exec $(install_sh) -c -m a+r {} {} \; \
+	|| chmod -R a+r $(distdir)
+dist-gzip: distdir
+	tardir=$(distdir) && $(am__tar) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).tar.gz
+	$(am__remove_distdir)
+
+dist-bzip2: distdir
+	tardir=$(distdir) && $(am__tar) | bzip2 -9 -c >$(distdir).tar.bz2
+	$(am__remove_distdir)
+
+dist-tarZ: distdir
+	tardir=$(distdir) && $(am__tar) | compress -c >$(distdir).tar.Z
+	$(am__remove_distdir)
+
+dist-shar: distdir
+	shar $(distdir) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).shar.gz
+	$(am__remove_distdir)
+
+dist-zip: distdir
+	-rm -f $(distdir).zip
+	zip -rq $(distdir).zip $(distdir)
+	$(am__remove_distdir)
+
+dist dist-all: distdir
+	tardir=$(distdir) && $(am__tar) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).tar.gz
+	$(am__remove_distdir)
+
+# This target untars the dist file and tries a VPATH configuration.  Then
+# it guarantees that the distribution is self-contained by making another
+# tarfile.
+distcheck: dist
+	case '$(DIST_ARCHIVES)' in \
+	*.tar.gz*) \
+	  GZIP=$(GZIP_ENV) gunzip -c $(distdir).tar.gz | $(am__untar) ;;\
+	*.tar.bz2*) \
+	  bunzip2 -c $(distdir).tar.bz2 | $(am__untar) ;;\
+	*.tar.Z*) \
+	  uncompress -c $(distdir).tar.Z | $(am__untar) ;;\
+	*.shar.gz*) \
+	  GZIP=$(GZIP_ENV) gunzip -c $(distdir).shar.gz | unshar ;;\
+	*.zip*) \
+	  unzip $(distdir).zip ;;\
+	esac
+	chmod -R a-w $(distdir); chmod a+w $(distdir)
+	mkdir $(distdir)/_build
+	mkdir $(distdir)/_inst
+	chmod a-w $(distdir)
+	dc_install_base=`$(am__cd) $(distdir)/_inst && pwd | sed -e 's,^[^:\\/]:[\\/],/,'` \
+	  && dc_destdir="$${TMPDIR-/tmp}/am-dc-$$$$/" \
+	  && cd $(distdir)/_build \
+	  && ../configure --srcdir=.. --prefix="$$dc_install_base" \
+	    $(DISTCHECK_CONFIGURE_FLAGS) \
+	  && $(MAKE) $(AM_MAKEFLAGS) \
+	  && $(MAKE) $(AM_MAKEFLAGS) dvi \
+	  && $(MAKE) $(AM_MAKEFLAGS) check \
+	  && $(MAKE) $(AM_MAKEFLAGS) install \
+	  && $(MAKE) $(AM_MAKEFLAGS) installcheck \
+	  && $(MAKE) $(AM_MAKEFLAGS) uninstall \
+	  && $(MAKE) $(AM_MAKEFLAGS) distuninstallcheck_dir="$$dc_install_base" \
+	        distuninstallcheck \
+	  && chmod -R a-w "$$dc_install_base" \
+	  && ({ \
+	       (cd ../.. && umask 077 && mkdir "$$dc_destdir") \
+	       && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" install \
+	       && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" uninstall \
+	       && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" \
+	            distuninstallcheck_dir="$$dc_destdir" distuninstallcheck; \
+	      } || { rm -rf "$$dc_destdir"; exit 1; }) \
+	  && rm -rf "$$dc_destdir" \
+	  && $(MAKE) $(AM_MAKEFLAGS) dist \
+	  && rm -rf $(DIST_ARCHIVES) \
+	  && $(MAKE) $(AM_MAKEFLAGS) distcleancheck
+	$(am__remove_distdir)
+	@(echo "$(distdir) archives ready for distribution: "; \
+	  list='$(DIST_ARCHIVES)'; for i in $$list; do echo $$i; done) | \
+	  sed -e 1h -e 1s/./=/g -e 1p -e 1x -e '$$p' -e '$$x'
+distuninstallcheck:
+	@cd $(distuninstallcheck_dir) \
+	&& test `$(distuninstallcheck_listfiles) | wc -l` -le 1 \
+	   || { echo "ERROR: files left after uninstall:" ; \
+	        if test -n "$(DESTDIR)"; then \
+	          echo "  (check DESTDIR support)"; \
+	        fi ; \
+	        $(distuninstallcheck_listfiles) ; \
+	        exit 1; } >&2
+distcleancheck: distclean
+	@if test '$(srcdir)' = . ; then \
+	  echo "ERROR: distcleancheck can only run from a VPATH build" ; \
+	  exit 1 ; \
+	fi
+	@test `$(distcleancheck_listfiles) | wc -l` -eq 0 \
+	  || { echo "ERROR: files left in build directory after distclean:" ; \
+	       $(distcleancheck_listfiles) ; \
+	       exit 1; } >&2
+check-am: all-am
+check: check-recursive
+all-am: Makefile all-local
+installdirs: installdirs-recursive
+installdirs-am:
+install: install-recursive
+install-exec: install-exec-recursive
+install-data: install-data-recursive
+uninstall: uninstall-recursive
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-recursive
+install-strip:
+	$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	  install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	  `test -z '$(STRIP)' || \
+	    echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+	-test -z "$(MAINTAINERCLEANFILES)" || rm -f $(MAINTAINERCLEANFILES)
+clean: clean-recursive
+
+clean-am: clean-generic mostlyclean-am
+
+distclean: distclean-recursive
+	-rm -f $(am__CONFIG_DISTCLEAN_FILES)
+	-rm -f Makefile
+distclean-am: clean-am distclean-generic distclean-tags
+
+dvi: dvi-recursive
+
+dvi-am:
+
+html: html-recursive
+
+info: info-recursive
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-recursive
+
+install-exec-am:
+	@$(NORMAL_INSTALL)
+	$(MAKE) $(AM_MAKEFLAGS) install-exec-hook
+
+install-html: install-html-recursive
+
+install-info: install-info-recursive
+
+install-man:
+
+install-pdf: install-pdf-recursive
+
+install-ps: install-ps-recursive
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-recursive
+	-rm -f $(am__CONFIG_DISTCLEAN_FILES)
+	-rm -rf $(top_srcdir)/autom4te.cache
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-recursive
+
+mostlyclean-am: mostlyclean-generic
+
+pdf: pdf-recursive
+
+pdf-am:
+
+ps: ps-recursive
+
+ps-am:
+
+uninstall-am:
+	@$(NORMAL_INSTALL)
+	$(MAKE) $(AM_MAKEFLAGS) uninstall-hook
+
+.MAKE: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) install-am \
+	install-exec-am install-strip uninstall-am
+
+.PHONY: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) CTAGS GTAGS \
+	all all-am all-local am--refresh check check-am clean \
+	clean-generic ctags ctags-recursive dist dist-all dist-bzip2 \
+	dist-gzip dist-shar dist-tarZ dist-zip distcheck distclean \
+	distclean-generic distclean-tags distcleancheck distdir \
+	distuninstallcheck dvi dvi-am html html-am info info-am \
+	install install-am install-data install-data-am install-dvi \
+	install-dvi-am install-exec install-exec-am install-exec-hook \
+	install-html install-html-am install-info install-info-am \
+	install-man install-pdf install-pdf-am install-ps \
+	install-ps-am install-strip installcheck installcheck-am \
+	installdirs installdirs-am maintainer-clean \
+	maintainer-clean-generic mostlyclean mostlyclean-generic pdf \
+	pdf-am ps ps-am tags tags-recursive uninstall uninstall-am \
+	uninstall-hook
+
+
+#The following line helps the test harness recover from build errors.
+
+all-local:
+	@echo "Trilinos package ThreadPool built successfully"
+
+@USING_EXPORT_MAKEFILES_TRUE@install-exec-hook:
+@USING_EXPORT_MAKEFILES_TRUE@	mkdir -p $(DESTDIR)$(includedir)
+@USING_EXPORT_MAKEFILES_TRUE@	cp $(top_builddir)/Makefile.export.threadpool $(DESTDIR)$(includedir)/.
+@USING_EXPORT_MAKEFILES_TRUE@	$(PERL_EXE) $(top_srcdir)/config/replace-install-prefix.pl \
+@USING_EXPORT_MAKEFILES_TRUE@		--exec-prefix=$(exec_prefix) \
+@USING_EXPORT_MAKEFILES_TRUE@		--my-export-makefile=Makefile.export.threadpool \
+@USING_EXPORT_MAKEFILES_TRUE@		--my-abs-top-srcdir=@abs_top_srcdir@ \
+@USING_EXPORT_MAKEFILES_TRUE@		--my-abs-incl-dirs=@abs_top_builddir@/src:@abs_top_srcdir@/src \
+@USING_EXPORT_MAKEFILES_TRUE@		--my-abs-lib-dirs=@abs_top_builddir@/src
+@USING_EXPORT_MAKEFILES_TRUE@	$(PERL_EXE) $(top_srcdir)/config/generate-makeoptions.pl $(top_builddir)/src/Makefile \
+@USING_EXPORT_MAKEFILES_TRUE@		THREADPOOL > $(DESTDIR)$(includedir)/Makefile.export.threadpool.macros
+
+@USING_EXPORT_MAKEFILES_TRUE@uninstall-hook:
+@USING_EXPORT_MAKEFILES_TRUE@	rm -f $(DESTDIR)$(includedir)/Makefile.export.threadpool
+@USING_EXPORT_MAKEFILES_TRUE@	rm -f $(DESTDIR)$(includedir)/Makefile.export.threadpool.macros
+
+@USING_EXPORT_MAKEFILES_FALSE@install-exec-hook:
+
+@USING_EXPORT_MAKEFILES_FALSE@uninstall-hook:
+
+#if SUB_EXAMPLE
+#EXAMPLE_SUBDIR=example
+#endif
+
+#  #np# - The following make targets must be defined for all packages.
+#  #np# - If the package does not have tests or examples, replace the
+#  #np# - corresponding rules with something like:
+#  #np# - @echo "new_package does not have any tests yet"
+@BUILD_TESTS_TRUE@tests:
+@BUILD_TESTS_TRUE@	@echo ""
+@BUILD_TESTS_TRUE@	@echo "Now building ThreadPool tests."
+@BUILD_TESTS_TRUE@	@echo ""
+@BUILD_TESTS_TRUE@	cd $(top_builddir)/test && $(MAKE)
+@BUILD_TESTS_TRUE@	@echo ""
+@BUILD_TESTS_TRUE@	@echo "Finished building ThreadPool tests."
+@BUILD_TESTS_TRUE@	@echo ""
+@BUILD_TESTS_FALSE@tests:
+@BUILD_TESTS_FALSE@	@echo "ThreadPool tests were disabled at configure time"
+
+examples:
+	@echo "ThreadPool does not have any examples yet"
+
+install-examples:
+	@echo "ThreadPool does not have any examples yet"
+
+clean-tests:
+	cd $(top_builddir)/test && $(MAKE) clean
+
+clean-examples:
+	@echo "ThreadPool does not have any examples yet"
+
+everything:
+	$(MAKE) && $(MAKE) examples && $(MAKE) tests
+
+clean-everything:
+	$(MAKE) clean-examples && $(MAKE) clean-tests && $(MAKE) clean
+
+install-everything:
+	$(MAKE) install && $(MAKE) install-examples
+
+runtests-serial :
+	$(PERL_EXE) $(TRILINOS_HOME_DIR)/commonTools/test/utilities/runtests \
+  --trilinos-dir=$(TRILINOS_HOME_DIR) \
+  --comm=serial \
+  --build-dir=$(TRILINOS_BUILD_DIR) \
+  --category=$(TRILINOS_TEST_CATEGORY) \
+  --output-dir=@abs_top_builddir@/test/runtests-results \
+  --verbosity=1 \
+  --packages=ThreadPool
+
+runtests-mpi :
+	$(PERL_EXE) $(TRILINOS_HOME_DIR)/commonTools/test/utilities/runtests \
+  --trilinos-dir=$(TRILINOS_HOME_DIR) \
+  --comm=mpi \
+  --mpi-go=$(TRILINOS_MPI_GO) \
+  --build-dir=$(TRILINOS_BUILD_DIR) \
+  --category=$(TRILINOS_TEST_CATEGORY) \
+  --output-dir=@abs_top_builddir@/test/runtests-results \
+  --verbosity=1 \
+  --packages=ThreadPool
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff --git a/openmp-avx512/basic/optional/ThreadPool/ThreadPool_config.h b/openmp-avx512/basic/optional/ThreadPool/ThreadPool_config.h
new file mode 100644
index 0000000..b941069
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/ThreadPool_config.h
@@ -0,0 +1,3 @@
+#ifndef HAVE_PTHREAD
+#define HAVE_PTHREAD
+#endif
diff --git a/openmp-avx512/basic/optional/ThreadPool/aclocal.m4 b/openmp-avx512/basic/optional/ThreadPool/aclocal.m4
new file mode 100644
index 0000000..e1f57a9
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/aclocal.m4
@@ -0,0 +1,932 @@
+# generated automatically by aclocal 1.10 -*- Autoconf -*-
+
+# Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
+# 2005, 2006  Free Software Foundation, Inc.
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+m4_if(m4_PACKAGE_VERSION, [2.61],,
+[m4_fatal([this file was generated for autoconf 2.61.
+You have another version of autoconf.  If you want to use that,
+you should regenerate the build system entirely.], [63])])
+
+# Copyright (C) 2002, 2003, 2005, 2006  Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# AM_AUTOMAKE_VERSION(VERSION)
+# ----------------------------
+# Automake X.Y traces this macro to ensure aclocal.m4 has been
+# generated from the m4 files accompanying Automake X.Y.
+# (This private macro should not be called outside this file.)
+AC_DEFUN([AM_AUTOMAKE_VERSION],
+[am__api_version='1.10'
+dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to
+dnl require some minimum version.  Point them to the right macro.
+m4_if([$1], [1.10], [],
+      [AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl
+])
+
+# _AM_AUTOCONF_VERSION(VERSION)
+# -----------------------------
+# aclocal traces this macro to find the Autoconf version.
+# This is a private macro too.  Using m4_define simplifies
+# the logic in aclocal, which can simply ignore this definition.
+m4_define([_AM_AUTOCONF_VERSION], [])
+
+# AM_SET_CURRENT_AUTOMAKE_VERSION
+# -------------------------------
+# Call AM_AUTOMAKE_VERSION and _AM_AUTOCONF_VERSION so they can be traced.
+# This function is AC_REQUIREd by AM_INIT_AUTOMAKE.
+AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION],
+[AM_AUTOMAKE_VERSION([1.10])dnl
+_AM_AUTOCONF_VERSION(m4_PACKAGE_VERSION)])
+
+# AM_AUX_DIR_EXPAND                                         -*- Autoconf -*-
+
+# Copyright (C) 2001, 2003, 2005  Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# For projects using AC_CONFIG_AUX_DIR([foo]), Autoconf sets
+# $ac_aux_dir to `$srcdir/foo'.  In other projects, it is set to
+# `$srcdir', `$srcdir/..', or `$srcdir/../..'.
+#
+# Of course, Automake must honor this variable whenever it calls a
+# tool from the auxiliary directory.  The problem is that $srcdir (and
+# therefore $ac_aux_dir as well) can be either absolute or relative,
+# depending on how configure is run.  This is pretty annoying, since
+# it makes $ac_aux_dir quite unusable in subdirectories: in the top
+# source directory, any form will work fine, but in subdirectories a
+# relative path needs to be adjusted first.
+#
+# $ac_aux_dir/missing
+#    fails when called from a subdirectory if $ac_aux_dir is relative
+# $top_srcdir/$ac_aux_dir/missing
+#    fails if $ac_aux_dir is absolute,
+#    fails when called from a subdirectory in a VPATH build with
+#          a relative $ac_aux_dir
+#
+# The reason of the latter failure is that $top_srcdir and $ac_aux_dir
+# are both prefixed by $srcdir.  In an in-source build this is usually
+# harmless because $srcdir is `.', but things will break when you
+# start a VPATH build or use an absolute $srcdir.
+#
+# So we could use something similar to $top_srcdir/$ac_aux_dir/missing,
+# iff we strip the leading $srcdir from $ac_aux_dir.  That would be:
+#   am_aux_dir='\$(top_srcdir)/'`expr "$ac_aux_dir" : "$srcdir//*\(.*\)"`
+# and then we would define $MISSING as
+#   MISSING="\${SHELL} $am_aux_dir/missing"
+# This will work as long as MISSING is not called from configure, because
+# unfortunately $(top_srcdir) has no meaning in configure.
+# However there are other variables, like CC, which are often used in
+# configure, and could therefore not use this "fixed" $ac_aux_dir.
+#
+# Another solution, used here, is to always expand $ac_aux_dir to an
+# absolute PATH.  The drawback is that using absolute paths prevents a
+# configured tree from being moved without reconfiguration.
+
+AC_DEFUN([AM_AUX_DIR_EXPAND],
+[dnl Rely on autoconf to set up CDPATH properly.
+AC_PREREQ([2.50])dnl
+# Expand $ac_aux_dir to an absolute path; quoted so paths with spaces work.
+am_aux_dir=`cd "$ac_aux_dir" && pwd`
+])
+
+# AM_CONDITIONAL                                            -*- Autoconf -*-
+
+# Copyright (C) 1997, 2000, 2001, 2003, 2004, 2005, 2006
+# Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# serial 8
+
+# AM_CONDITIONAL(NAME, SHELL-CONDITION)
+# -------------------------------------
+# Define a conditional.
+AC_DEFUN([AM_CONDITIONAL],
+[AC_PREREQ(2.52)dnl
+ ifelse([$1], [TRUE],  [AC_FATAL([$0: invalid condition: $1])],
+	[$1], [FALSE], [AC_FATAL([$0: invalid condition: $1])])dnl
+AC_SUBST([$1_TRUE])dnl
+AC_SUBST([$1_FALSE])dnl
+_AM_SUBST_NOTMAKE([$1_TRUE])dnl
+_AM_SUBST_NOTMAKE([$1_FALSE])dnl
+if $2; then
+  $1_TRUE=
+  $1_FALSE='#'
+else
+  $1_TRUE='#'
+  $1_FALSE=
+fi
+AC_CONFIG_COMMANDS_PRE(
+[if test -z "${$1_TRUE}" && test -z "${$1_FALSE}"; then
+  AC_MSG_ERROR([[conditional "$1" was never defined.
+Usually this means the macro was only invoked conditionally.]])
+fi])])
+
+# Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006
+# Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# serial 9
+
+# There are a few dirty hacks below to avoid letting `AC_PROG_CC' be
+# written in clear, in which case automake, when reading aclocal.m4,
+# will think it sees a *use*, and therefore will trigger all its
+# C support machinery.  Also note that it means that autoscan, seeing
+# CC etc. in the Makefile, will ask for an AC_PROG_CC use...
+
+
+# _AM_DEPENDENCIES(NAME)
+# ----------------------
+# See how the compiler implements dependency checking.
+# NAME is "CC", "CXX", "GCJ", or "OBJC".
+# We try a few techniques and use that to set a single cache variable.
+#
+# We don't AC_REQUIRE the corresponding AC_PROG_CC since the latter was
+# modified to invoke _AM_DEPENDENCIES(CC); we would have a circular
+# dependency, and given that the user is not expected to run this macro,
+# just rely on AC_PROG_CC.
+AC_DEFUN([_AM_DEPENDENCIES],
+[AC_REQUIRE([AM_SET_DEPDIR])dnl
+AC_REQUIRE([AM_OUTPUT_DEPENDENCY_COMMANDS])dnl
+AC_REQUIRE([AM_MAKE_INCLUDE])dnl
+AC_REQUIRE([AM_DEP_TRACK])dnl
+
+ifelse([$1], CC,   [depcc="$CC"   am_compiler_list=],
+       [$1], CXX,  [depcc="$CXX"  am_compiler_list=],
+       [$1], OBJC, [depcc="$OBJC" am_compiler_list='gcc3 gcc'],
+       [$1], UPC,  [depcc="$UPC"  am_compiler_list=],
+       [$1], GCJ,  [depcc="$GCJ"  am_compiler_list='gcc3 gcc'],
+                   [depcc="$$1"   am_compiler_list=])
+
+AC_CACHE_CHECK([dependency style of $depcc],
+               [am_cv_$1_dependencies_compiler_type],
+[if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then
+  # We make a subdir and do the tests there.  Otherwise we can end up
+  # making bogus files that we don't know about and never remove.  For
+  # instance it was reported that on HP-UX the gcc test will end up
+  # making a dummy file named `D' -- because `-MD' means `put the output
+  # in D'.
+  mkdir conftest.dir
+  # Copy depcomp to subdir because otherwise we won't find it if we're
+  # using a relative directory.
+  cp "$am_depcomp" conftest.dir
+  cd conftest.dir
+  # We will build objects and dependencies in a subdirectory because
+  # it helps to detect inapplicable dependency modes.  For instance
+  # both Tru64's cc and ICC support -MD to output dependencies as a
+  # side effect of compilation, but ICC will put the dependencies in
+  # the current directory while Tru64 will put them in the object
+  # directory.
+  mkdir sub
+
+  am_cv_$1_dependencies_compiler_type=none
+  if test "$am_compiler_list" = ""; then
+     am_compiler_list=`sed -n ['s/^#*\([a-zA-Z0-9]*\))$/\1/p'] < ./depcomp`
+  fi
+  for depmode in $am_compiler_list; do
+    # Setup a source with many dependencies, because some compilers
+    # like to wrap large dependency lists on column 80 (with \), and
+    # we should not choose a depcomp mode which is confused by this.
+    #
+    # We need to recreate these files for each test, as the compiler may
+    # overwrite some of them when testing with obscure command lines.
+    # This happens at least with the AIX C compiler.
+    : > sub/conftest.c
+    for i in 1 2 3 4 5 6; do
+      echo '#include "conftst'$i'.h"' >> sub/conftest.c
+      # Using `: > sub/conftst$i.h' creates only sub/conftst1.h with
+      # Solaris 8's {/usr,}/bin/sh.
+      touch sub/conftst$i.h
+    done
+    echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf
+
+    case $depmode in
+    nosideeffect)
+      # after this tag, mechanisms are not by side-effect, so they'll
+      # only be used when explicitly requested
+      if test "x$enable_dependency_tracking" = xyes; then
+	continue
+      else
+	break
+      fi
+      ;;
+    none) break ;;
+    esac
+    # We check with `-c' and `-o' for the sake of the "dashmstdout"
+    # mode.  It turns out that the SunPro C++ compiler does not properly
+    # handle `-M -o', and we need to detect this.
+    if depmode=$depmode \
+       source=sub/conftest.c object=sub/conftest.${OBJEXT-o} \
+       depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \
+       $SHELL ./depcomp $depcc -c -o sub/conftest.${OBJEXT-o} sub/conftest.c \
+         >/dev/null 2>conftest.err &&
+       grep sub/conftst1.h sub/conftest.Po > /dev/null 2>&1 &&
+       grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 &&
+       grep sub/conftest.${OBJEXT-o} sub/conftest.Po > /dev/null 2>&1 &&
+       ${MAKE-make} -s -f confmf > /dev/null 2>&1; then
+      # icc doesn't choke on unknown options, it will just issue warnings
+      # or remarks (even with -Werror).  So we grep stderr for any message
+      # that says an option was ignored or not supported.
+      # When given -MP, icc 7.0 and 7.1 complain thusly:
+      #   icc: Command line warning: ignoring option '-M'; no argument required
+      # The diagnosis changed in icc 8.0:
+      #   icc: Command line remark: option '-MP' not supported
+      if (grep 'ignoring option' conftest.err ||
+          grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else
+        am_cv_$1_dependencies_compiler_type=$depmode
+        break
+      fi
+    fi
+  done
+
+  cd ..
+  rm -rf conftest.dir
+else
+  am_cv_$1_dependencies_compiler_type=none
+fi
+])
+AC_SUBST([$1DEPMODE], [depmode=$am_cv_$1_dependencies_compiler_type])
+AM_CONDITIONAL([am__fastdep$1], [
+  test "x$enable_dependency_tracking" != xno \
+  && test "$am_cv_$1_dependencies_compiler_type" = gcc3])
+])
+
+
+# AM_SET_DEPDIR
+# -------------
+# Choose a directory name for dependency files.
+# This macro is AC_REQUIREd in _AM_DEPENDENCIES
+AC_DEFUN([AM_SET_DEPDIR],
+[AC_REQUIRE([AM_SET_LEADING_DOT])dnl
+AC_SUBST([DEPDIR], ["${am__leading_dot}deps"])dnl
+])
+
+
+# AM_DEP_TRACK
+# ------------
+AC_DEFUN([AM_DEP_TRACK],
+[AC_ARG_ENABLE(dependency-tracking,
+[  --disable-dependency-tracking  speeds up one-time build
+  --enable-dependency-tracking   do not reject slow dependency extractors])
+if test "x$enable_dependency_tracking" != xno; then
+  am_depcomp="$ac_aux_dir/depcomp"
+  AMDEPBACKSLASH='\'
+fi
+AM_CONDITIONAL([AMDEP], [test "x$enable_dependency_tracking" != xno])
+AC_SUBST([AMDEPBACKSLASH])dnl
+_AM_SUBST_NOTMAKE([AMDEPBACKSLASH])dnl
+])
+
+# Generate code to set up dependency tracking.              -*- Autoconf -*-
+
+# Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005
+# Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+#serial 3
+
+# _AM_OUTPUT_DEPENDENCY_COMMANDS
+# ------------------------------
+AC_DEFUN([_AM_OUTPUT_DEPENDENCY_COMMANDS],
+[for mf in $CONFIG_FILES; do
+  # Strip MF so we end up with the name of the file.
+  mf=`echo "$mf" | sed -e 's/:.*$//'`
+  # Check whether this is an Automake generated Makefile or not.
+  # We used to match only the files named `Makefile.in', but
+  # some people rename them; so instead we look at the file content.
+  # Grep'ing the first line is not enough: some people post-process
+  # each Makefile.in and add a new line on top of each file to say so.
+  # Grep'ing the whole file is not good either: AIX grep has a line
+  # limit of 2048, but all sed's we know understand at least 4000.
+  if sed 10q "$mf" | grep '^#.*generated by automake' > /dev/null 2>&1; then
+    dirpart=`AS_DIRNAME("$mf")`
+  else
+    continue
+  fi
+  # Extract the definitions of DEPDIR, am__include, and am__quote from the
+  # Makefile without running `make'; skip it if DEPDIR or am__include is empty.
+  DEPDIR=`sed -n 's/^DEPDIR = //p' < "$mf"`
+  test -z "$DEPDIR" && continue
+  am__include=`sed -n 's/^am__include = //p' < "$mf"`
+  test -z "$am__include" && continue
+  am__quote=`sed -n 's/^am__quote = //p' < "$mf"`
+  # When using ansi2knr, U may be empty or an underscore; expand it
+  U=`sed -n 's/^U = //p' < "$mf"`
+  # Find all dependency output files, they are included files with
+  # $(DEPDIR) in their names.  We invoke sed twice because it is the
+  # simplest approach to changing $(DEPDIR) to its actual value in the
+  # expansion.
+  for file in `sed -n "
+    s/^$am__include $am__quote\(.*(DEPDIR).*\)$am__quote"'$/\1/p' <"$mf" | \
+       sed -e 's/\$(DEPDIR)/'"$DEPDIR"'/g' -e 's/\$U/'"$U"'/g'`; do
+    # Make sure the directory exists.
+    test -f "$dirpart/$file" && continue
+    fdir=`AS_DIRNAME(["$file"])`
+    AS_MKDIR_P([$dirpart/$fdir])
+    # echo "creating $dirpart/$file"
+    echo '# dummy' > "$dirpart/$file"
+  done
+done
+])# _AM_OUTPUT_DEPENDENCY_COMMANDS
+
+
+# AM_OUTPUT_DEPENDENCY_COMMANDS
+# -----------------------------
+# This macro should only be invoked once -- use via AC_REQUIRE.
+#
+# This code is only required when automatic dependency tracking
+# is enabled.  FIXME.  This creates each `.P' file that we will
+# need in order to bootstrap the dependency handling code.
+AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS],
+[AC_CONFIG_COMMANDS([depfiles],
+     [test x"$AMDEP_TRUE" != x"" || _AM_OUTPUT_DEPENDENCY_COMMANDS],
+     [AMDEP_TRUE="$AMDEP_TRUE" ac_aux_dir="$ac_aux_dir"])
+])
+
+# Copyright (C) 1996, 1997, 2000, 2001, 2003, 2005
+# Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# serial 8
+
+# AM_CONFIG_HEADER is obsolete.  It has been replaced by AC_CONFIG_HEADERS.
+AU_DEFUN([AM_CONFIG_HEADER], [AC_CONFIG_HEADERS($@)])
+
+# Do all the work for Automake.                             -*- Autoconf -*-
+
+# Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
+# 2005, 2006 Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# serial 12
+
+# This macro actually does too much.  Some checks are only needed if
+# your package does certain things.  But this isn't really a big deal.
+
+# AM_INIT_AUTOMAKE(PACKAGE, VERSION, [NO-DEFINE])
+# AM_INIT_AUTOMAKE([OPTIONS])
+# -----------------------------------------------
+# The call with PACKAGE and VERSION arguments is the old style
+# call (pre autoconf-2.50), which is being phased out.  PACKAGE
+# and VERSION should now be passed to AC_INIT and removed from
+# the call to AM_INIT_AUTOMAKE.
+# We support both call styles for the transition.  After
+# the next Automake release, Autoconf can make the AC_INIT
+# arguments mandatory, and then we can depend on a new Autoconf
+# release and drop the old call support.
+AC_DEFUN([AM_INIT_AUTOMAKE],
+[AC_PREREQ([2.60])dnl
+dnl Autoconf wants to disallow AM_ names.  We explicitly allow
+dnl the ones we care about.
+m4_pattern_allow([^AM_[A-Z]+FLAGS$])dnl
+AC_REQUIRE([AM_SET_CURRENT_AUTOMAKE_VERSION])dnl
+AC_REQUIRE([AC_PROG_INSTALL])dnl
+if test "`cd $srcdir && pwd`" != "`pwd`"; then
+  # Use -I$(srcdir) only when $(srcdir) != ., so that make's output
+  # is not polluted with repeated "-I."
+  AC_SUBST([am__isrc], [' -I$(srcdir)'])_AM_SUBST_NOTMAKE([am__isrc])dnl
+  # test to see if srcdir already configured
+  if test -f $srcdir/config.status; then
+    AC_MSG_ERROR([source directory already configured; run "make distclean" there first])
+  fi
+fi
+
+# test whether we have cygpath
+if test -z "$CYGPATH_W"; then
+  if (cygpath --version) >/dev/null 2>/dev/null; then
+    CYGPATH_W='cygpath -w'
+  else
+    CYGPATH_W=echo
+  fi
+fi
+AC_SUBST([CYGPATH_W])
+
+# Define the identity of the package.
+dnl Distinguish between old-style and new-style calls.
+m4_ifval([$2],
+[m4_ifval([$3], [_AM_SET_OPTION([no-define])])dnl
+ AC_SUBST([PACKAGE], [$1])dnl
+ AC_SUBST([VERSION], [$2])],
+[_AM_SET_OPTIONS([$1])dnl
+dnl Diagnose old-style AC_INIT with new-style AM_AUTOMAKE_INIT.
+m4_if(m4_ifdef([AC_PACKAGE_NAME], 1)m4_ifdef([AC_PACKAGE_VERSION], 1), 11,,
+  [m4_fatal([AC_INIT should be called with package and version arguments])])dnl
+ AC_SUBST([PACKAGE], ['AC_PACKAGE_TARNAME'])dnl
+ AC_SUBST([VERSION], ['AC_PACKAGE_VERSION'])])dnl
+
+_AM_IF_OPTION([no-define],,
+[AC_DEFINE_UNQUOTED(PACKAGE, "$PACKAGE", [Name of package])
+ AC_DEFINE_UNQUOTED(VERSION, "$VERSION", [Version number of package])])dnl
+
+# Some tools Automake needs.
+AC_REQUIRE([AM_SANITY_CHECK])dnl
+AC_REQUIRE([AC_ARG_PROGRAM])dnl
+AM_MISSING_PROG(ACLOCAL, aclocal-${am__api_version})
+AM_MISSING_PROG(AUTOCONF, autoconf)
+AM_MISSING_PROG(AUTOMAKE, automake-${am__api_version})
+AM_MISSING_PROG(AUTOHEADER, autoheader)
+AM_MISSING_PROG(MAKEINFO, makeinfo)
+AM_PROG_INSTALL_SH
+AM_PROG_INSTALL_STRIP
+AC_REQUIRE([AM_PROG_MKDIR_P])dnl
+# We need awk for the "check" target.  The system "awk" is bad on
+# some platforms.
+AC_REQUIRE([AC_PROG_AWK])dnl
+AC_REQUIRE([AC_PROG_MAKE_SET])dnl
+AC_REQUIRE([AM_SET_LEADING_DOT])dnl
+_AM_IF_OPTION([tar-ustar], [_AM_PROG_TAR([ustar])],
+              [_AM_IF_OPTION([tar-pax], [_AM_PROG_TAR([pax])],
+	      		     [_AM_PROG_TAR([v7])])])
+_AM_IF_OPTION([no-dependencies],,
+[AC_PROVIDE_IFELSE([AC_PROG_CC],
+                  [_AM_DEPENDENCIES(CC)],
+                  [define([AC_PROG_CC],
+                          defn([AC_PROG_CC])[_AM_DEPENDENCIES(CC)])])dnl
+AC_PROVIDE_IFELSE([AC_PROG_CXX],
+                  [_AM_DEPENDENCIES(CXX)],
+                  [define([AC_PROG_CXX],
+                          defn([AC_PROG_CXX])[_AM_DEPENDENCIES(CXX)])])dnl
+AC_PROVIDE_IFELSE([AC_PROG_OBJC],
+                  [_AM_DEPENDENCIES(OBJC)],
+                  [define([AC_PROG_OBJC],
+                          defn([AC_PROG_OBJC])[_AM_DEPENDENCIES(OBJC)])])dnl
+])
+])
+
+
+# When config.status generates a header, we must update the stamp-h file.
+# This file resides in the same directory as the config header
+# that is generated.  The stamp files are numbered to have different names.
+
+# Autoconf calls _AC_AM_CONFIG_HEADER_HOOK (when defined) in the
+# loop where config.status creates the headers, so we can generate
+# our stamp files there.
+AC_DEFUN([_AC_AM_CONFIG_HEADER_HOOK],
+[# Compute $1's index in $config_headers.
+_am_stamp_count=1
+for _am_header in $config_headers :; do
+  case $_am_header in
+    $1 | $1:* )
+      break ;;
+    * )
+      _am_stamp_count=`expr $_am_stamp_count + 1` ;;
+  esac
+done
+echo "timestamp for $1" >`AS_DIRNAME([$1])`/stamp-h[]$_am_stamp_count])
+
+# Copyright (C) 2001, 2003, 2005  Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# AM_PROG_INSTALL_SH
+# ------------------
+# Define and substitute $install_sh; if unset, use install-sh from $am_aux_dir.
+AC_DEFUN([AM_PROG_INSTALL_SH],
+[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
+install_sh=${install_sh-"\$(SHELL) $am_aux_dir/install-sh"}
+AC_SUBST(install_sh)])
+
+# Copyright (C) 2003, 2005  Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# serial 2
+
+# Set am__leading_dot to `.' if a directory named .tst can be created, i.e. the
+# file system accepts a leading dot (MS-DOS doesn't), and to `_' otherwise.
+AC_DEFUN([AM_SET_LEADING_DOT],
+[rm -rf .tst 2>/dev/null
+mkdir .tst 2>/dev/null
+if test -d .tst; then
+  am__leading_dot=.
+else
+  am__leading_dot=_
+fi
+rmdir .tst 2>/dev/null
+AC_SUBST([am__leading_dot])])
+
+# Add --enable-maintainer-mode option to configure.         -*- Autoconf -*-
+# From Jim Meyering
+
+# Copyright (C) 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2005
+# Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# serial 4
+
+AC_DEFUN([AM_MAINTAINER_MODE],
+[AC_MSG_CHECKING([whether to enable maintainer-specific portions of Makefiles])
+  dnl maintainer-mode is disabled by default
+  AC_ARG_ENABLE(maintainer-mode,
+[  --enable-maintainer-mode  enable make rules and dependencies not useful
+			  (and sometimes confusing) to the casual installer],
+      USE_MAINTAINER_MODE=$enableval,
+      USE_MAINTAINER_MODE=no)
+  AC_MSG_RESULT([$USE_MAINTAINER_MODE])
+  AM_CONDITIONAL(MAINTAINER_MODE, [test $USE_MAINTAINER_MODE = yes])
+  MAINT=$MAINTAINER_MODE_TRUE
+  AC_SUBST(MAINT)dnl
+]
+)
+
+AU_DEFUN([jm_MAINTAINER_MODE], [AM_MAINTAINER_MODE])
+
+# Check to see how 'make' treats includes.	            -*- Autoconf -*-
+
+# Copyright (C) 2001, 2002, 2003, 2005  Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# serial 3
+
+# AM_MAKE_INCLUDE()
+# -----------------
+# Check to see how make treats includes.
+AC_DEFUN([AM_MAKE_INCLUDE],
+[am_make=${MAKE-make}
+cat > confinc << 'END'
+am__doit:
+	@echo done
+.PHONY: am__doit
+END
+# If we don't find an include directive, just comment out the code.
+AC_MSG_CHECKING([for style of include used by $am_make])
+am__include="#"
+am__quote=
+_am_result=none
+# First try GNU make style include.
+echo "include confinc" > confmf
+# We grep out `Entering directory' and `Leaving directory'
+# messages which can occur if `w' ends up in MAKEFLAGS.
+# In particular we don't look at `^make:' because GNU make might
+# be invoked under some other name (usually "gmake"), in which
+# case it prints its new name instead of `make'.
+if test "`$am_make -s -f confmf 2> /dev/null | grep -v 'ing directory'`" = "done"; then
+   am__include=include
+   am__quote=
+   _am_result=GNU
+fi
+# Now try BSD make style include.
+if test "$am__include" = "#"; then
+   echo '.include "confinc"' > confmf
+   if test "`$am_make -s -f confmf 2> /dev/null`" = "done"; then
+      am__include=.include
+      am__quote="\""
+      _am_result=BSD
+   fi
+fi
+AC_SUBST([am__include])
+AC_SUBST([am__quote])
+AC_MSG_RESULT([$_am_result])
+rm -f confinc confmf
+])
+
+# Fake the existence of programs that GNU maintainers use.  -*- Autoconf -*-
+
+# Copyright (C) 1997, 1999, 2000, 2001, 2003, 2004, 2005
+# Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# serial 5
+
+# AM_MISSING_PROG(NAME, PROGRAM)
+# ------------------------------
+AC_DEFUN([AM_MISSING_PROG],
+[AC_REQUIRE([AM_MISSING_HAS_RUN])
+$1=${$1-"${am_missing_run}$2"}
+AC_SUBST($1)])
+
+
+# AM_MISSING_HAS_RUN
+# ------------------
+# Define MISSING if not defined so far and test if it supports --run.
+# If it does, set am_missing_run to use it, otherwise, to nothing.
+AC_DEFUN([AM_MISSING_HAS_RUN],
+[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
+AC_REQUIRE_AUX_FILE([missing])dnl
+test x"${MISSING+set}" = xset || MISSING="\${SHELL} $am_aux_dir/missing"
+# Use eval to expand $SHELL
+if eval "$MISSING --run true"; then
+  am_missing_run="$MISSING --run "
+else
+  am_missing_run=
+  AC_MSG_WARN([`missing' script is too old or missing])
+fi
+])
+
+# Copyright (C) 2003, 2004, 2005, 2006  Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# AM_PROG_MKDIR_P
+# ---------------
+# Check for `mkdir -p'.
+AC_DEFUN([AM_PROG_MKDIR_P],
+[AC_PREREQ([2.60])dnl
+AC_REQUIRE([AC_PROG_MKDIR_P])dnl
+dnl Automake 1.8 to 1.9.6 used to define mkdir_p.  We now use MKDIR_P,
+dnl while keeping a definition of mkdir_p for backward compatibility.
+dnl @MKDIR_P@ is magic: AC_OUTPUT adjusts its value for each Makefile.
+dnl However we cannot define mkdir_p as $(MKDIR_P) for the sake of
+dnl Makefile.ins that do not define MKDIR_P, so we do our own
+dnl adjustment using top_builddir (which is defined more often than
+dnl MKDIR_P).
+AC_SUBST([mkdir_p], ["$MKDIR_P"])dnl
+case $mkdir_p in
+  [[\\/$]]* | ?:[[\\/]]*) ;;
+  */*) mkdir_p="\$(top_builddir)/$mkdir_p" ;;
+esac
+])
+
+# Helper functions for option handling.                     -*- Autoconf -*-
+
+# Copyright (C) 2001, 2002, 2003, 2005  Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# serial 3
+
+# _AM_MANGLE_OPTION(NAME)
+# -----------------------
+AC_DEFUN([_AM_MANGLE_OPTION],
+[[_AM_OPTION_]m4_bpatsubst($1, [[^a-zA-Z0-9_]], [_])])
+
+# _AM_SET_OPTION(NAME)
+# ------------------------------
+# Set option NAME.  Presently that only means defining a flag for this option.
+AC_DEFUN([_AM_SET_OPTION],
+[m4_define(_AM_MANGLE_OPTION([$1]), 1)])
+
+# _AM_SET_OPTIONS(OPTIONS)
+# ----------------------------------
+# OPTIONS is a space-separated list of Automake options.
+AC_DEFUN([_AM_SET_OPTIONS],
+[AC_FOREACH([_AM_Option], [$1], [_AM_SET_OPTION(_AM_Option)])])
+
+# _AM_IF_OPTION(OPTION, IF-SET, [IF-NOT-SET])
+# -------------------------------------------
+# Execute IF-SET if OPTION is set, IF-NOT-SET otherwise.
+AC_DEFUN([_AM_IF_OPTION],
+[m4_ifset(_AM_MANGLE_OPTION([$1]), [$2], [$3])])
+
+# Copyright (C) 2001, 2003, 2005  Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# AM_RUN_LOG(COMMAND)
+# -------------------
+# Run COMMAND, save the exit status in ac_status, and log it.
+# (This has been adapted from Autoconf's _AC_RUN_LOG macro.)
+AC_DEFUN([AM_RUN_LOG],
+[{ echo "$as_me:$LINENO: $1" >&AS_MESSAGE_LOG_FD
+   ($1) >&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD
+   ac_status=$?
+   echo "$as_me:$LINENO: \$? = $ac_status" >&AS_MESSAGE_LOG_FD
+   (exit $ac_status); }])
+
+# Check to make sure that the build environment is sane.    -*- Autoconf -*-
+
+# Copyright (C) 1996, 1997, 2000, 2001, 2003, 2005
+# Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# serial 4
+
+# AM_SANITY_CHECK
+# ---------------
+AC_DEFUN([AM_SANITY_CHECK],
+[AC_MSG_CHECKING([whether build environment is sane])
+# Just in case
+sleep 1
+echo timestamp > conftest.file
+# Do `set' in a subshell so we don't clobber the current shell's
+# arguments.  Must try -L first in case configure is actually a
+# symlink; some systems play weird games with the mod time of symlinks
+# (eg FreeBSD returns the mod time of the symlink's containing
+# directory).
+if (
+   set X `ls -Lt $srcdir/configure conftest.file 2> /dev/null`
+   if test "$[*]" = "X"; then
+      # -L didn't work.
+      set X `ls -t $srcdir/configure conftest.file`
+   fi
+   rm -f conftest.file
+   if test "$[*]" != "X $srcdir/configure conftest.file" \
+      && test "$[*]" != "X conftest.file $srcdir/configure"; then
+
+      # If neither matched, then we have a broken ls.  This can happen
+      # if, for instance, CONFIG_SHELL is bash and it inherits a
+      # broken ls alias from the environment.  This has actually
+      # happened.  Such a system could not be considered "sane".
+      AC_MSG_ERROR([ls -t appears to fail.  Make sure there is not a broken
+alias in your environment])
+   fi
+
+   test "$[2]" = conftest.file
+   )
+then
+   # Ok.
+   :
+else
+   AC_MSG_ERROR([newly created file is older than distributed files!
+Check your system clock])
+fi
+AC_MSG_RESULT(yes)])
+
+# Copyright (C) 2001, 2003, 2005  Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# AM_PROG_INSTALL_STRIP
+# ---------------------
+# One issue with vendor `install' (even GNU) is that you can't
+# specify the program used to strip binaries.  This is especially
+# annoying in cross-compiling environments, where the build's strip
+# is unlikely to handle the host's binaries.
+# Fortunately install-sh will honor a STRIPPROG variable, so we
+# always use install-sh in `make install-strip', and initialize
+# STRIPPROG with the value of the STRIP variable (set by the user).
+AC_DEFUN([AM_PROG_INSTALL_STRIP],
+[AC_REQUIRE([AM_PROG_INSTALL_SH])dnl
+# Installed binaries are usually stripped using `strip' when the user
+# runs `make install-strip'.  However `strip' might not be the right
+# tool to use in cross-compilation environments, therefore Automake
+# will honor the `STRIP' environment variable to overrule this program.
+dnl Don't test for $cross_compiling = yes, because it might be `maybe'.
+if test "$cross_compiling" != no; then
+  AC_CHECK_TOOL([STRIP], [strip], :)
+fi
+INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s"
+AC_SUBST([INSTALL_STRIP_PROGRAM])])
+
+# Copyright (C) 2006  Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# _AM_SUBST_NOTMAKE(VARIABLE)
+# ---------------------------
+# Prevent Automake from outputting VARIABLE = @VARIABLE@ in Makefile.in.
+# This macro is traced by Automake.
+AC_DEFUN([_AM_SUBST_NOTMAKE])
+
+# Check how to create a tarball.                            -*- Autoconf -*-
+
+# Copyright (C) 2004, 2005  Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# serial 2
+
+# _AM_PROG_TAR(FORMAT)
+# --------------------
+# Check how to create a tarball in format FORMAT.
+# FORMAT should be one of `v7', `ustar', or `pax'.
+#
+# Substitute a variable $(am__tar) that is a command
+# writing to stdout a FORMAT-tarball containing the directory
+# $tardir.
+#     tardir=directory && $(am__tar) > result.tar
+#
+# Substitute a variable $(am__untar) that extracts such
+# a tarball read from stdin.
+#     $(am__untar) < result.tar
+AC_DEFUN([_AM_PROG_TAR],
+[# Always define AMTAR for backward compatibility.
+AM_MISSING_PROG([AMTAR], [tar])
+m4_if([$1], [v7],
+     [am__tar='${AMTAR} chof - "$$tardir"'; am__untar='${AMTAR} xf -'],
+     [m4_case([$1], [ustar],, [pax],,
+              [m4_fatal([Unknown tar format])])
+AC_MSG_CHECKING([how to create a $1 tar archive])
+# Loop over all known methods to create a tar archive until one works.
+_am_tools='gnutar m4_if([$1], [ustar], [plaintar]) pax cpio none'
+_am_tools=${am_cv_prog_tar_$1-$_am_tools}
+# Do not fold the above two lines into one, because Tru64 sh and
+# Solaris sh will not grok spaces in the rhs of `-'.
+for _am_tool in $_am_tools
+do
+  case $_am_tool in
+  gnutar)
+    for _am_tar in tar gnutar gtar;
+    do
+      AM_RUN_LOG([$_am_tar --version]) && break
+    done
+    am__tar="$_am_tar --format=m4_if([$1], [pax], [posix], [$1]) -chf - "'"$$tardir"'
+    am__tar_="$_am_tar --format=m4_if([$1], [pax], [posix], [$1]) -chf - "'"$tardir"'
+    am__untar="$_am_tar -xf -"
+    ;;
+  plaintar)
+    # Must skip GNU tar: if it does not support --format= it doesn't create
+    # ustar tarball either.
+    (tar --version) >/dev/null 2>&1 && continue
+    am__tar='tar chf - "$$tardir"'
+    am__tar_='tar chf - "$tardir"'
+    am__untar='tar xf -'
+    ;;
+  pax)
+    am__tar='pax -L -x $1 -w "$$tardir"'
+    am__tar_='pax -L -x $1 -w "$tardir"'
+    am__untar='pax -r'
+    ;;
+  cpio)
+    am__tar='find "$$tardir" -print | cpio -o -H $1 -L'
+    am__tar_='find "$tardir" -print | cpio -o -H $1 -L'
+    am__untar='cpio -i -H $1 -d'
+    ;;
+  none)
+    am__tar=false
+    am__tar_=false
+    am__untar=false
+    ;;
+  esac
+
+  # If the value was cached, stop now.  We just wanted to have am__tar
+  # and am__untar set.
+  test -n "${am_cv_prog_tar_$1}" && break
+
+  # tar/untar a dummy directory, and stop if the command works
+  rm -rf conftest.dir
+  mkdir conftest.dir
+  echo GrepMe > conftest.dir/file
+  AM_RUN_LOG([tardir=conftest.dir && eval $am__tar_ >conftest.tar])
+  rm -rf conftest.dir
+  if test -s conftest.tar; then
+    AM_RUN_LOG([$am__untar <conftest.tar])
+    grep GrepMe conftest.dir/file >/dev/null 2>&1 && break
+  fi
+done
+rm -rf conftest.dir
+
+AC_CACHE_VAL([am_cv_prog_tar_$1], [am_cv_prog_tar_$1=$_am_tool])
+AC_MSG_RESULT([$am_cv_prog_tar_$1])])
+AC_SUBST([am__tar])
+AC_SUBST([am__untar])
+]) # _AM_PROG_TAR
+
+m4_include([config/acx_pthread.m4])
+m4_include([config/tac_arg_check_mpi.m4])
+m4_include([config/tac_arg_config_mpi.m4])
+m4_include([config/tac_arg_enable_export-makefiles.m4])
+m4_include([config/tac_arg_enable_feature.m4])
+m4_include([config/tac_arg_enable_feature_sub_check.m4])
+m4_include([config/tac_arg_with_ar.m4])
+m4_include([config/tac_arg_with_flags.m4])
+m4_include([config/tac_arg_with_incdirs.m4])
+m4_include([config/tac_arg_with_libdirs.m4])
+m4_include([config/tac_arg_with_libs.m4])
+m4_include([config/tac_arg_with_perl.m4])
diff --git a/openmp-avx512/basic/optional/ThreadPool/bootstrap b/openmp-avx512/basic/optional/ThreadPool/bootstrap
new file mode 100755
index 0000000..8706e9e
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/bootstrap
@@ -0,0 +1,9 @@
+#! /bin/sh
+#np# This file does not need to be edited, other than removing this line.
+set -x
+# Regenerate aclocal.m4, also searching ./config for macro files
+aclocal -I config 
+# autoheader is smart and doesn't change anything unless it's necessary
+autoheader 
+automake --foreign --add-missing --copy
+autoconf
diff --git a/openmp-avx512/basic/optional/ThreadPool/cmake/Dependencies.cmake b/openmp-avx512/basic/optional/ThreadPool/cmake/Dependencies.cmake
new file mode 100644
index 0000000..746d066
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/cmake/Dependencies.cmake
@@ -0,0 +1,11 @@
+SET(LIB_REQUIRED_DEP_PACKAGES)
+SET(LIB_OPTIONAL_DEP_PACKAGES)
+SET(TEST_REQUIRED_DEP_PACKAGES)
+SET(TEST_OPTIONAL_DEP_PACKAGES)
+SET(LIB_REQUIRED_DEP_TPLS)
+SET(LIB_OPTIONAL_DEP_TPLS Pthread MPI)
+SET(TEST_REQUIRED_DEP_TPLS)
+SET(TEST_OPTIONAL_DEP_TPLS)
+
+TPL_TENTATIVELY_ENABLE(Pthread)
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/cmake/ThreadPool_config.h.in b/openmp-avx512/basic/optional/ThreadPool/cmake/ThreadPool_config.h.in
new file mode 100644
index 0000000..55614b9
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/cmake/ThreadPool_config.h.in
@@ -0,0 +1,2 @@
+#cmakedefine HAVE_MPI
+#cmakedefine HAVE_PTHREAD
diff --git a/openmp-avx512/basic/optional/ThreadPool/config/acx_pthread.m4 b/openmp-avx512/basic/optional/ThreadPool/config/acx_pthread.m4
new file mode 100644
index 0000000..3bd3ec2
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/config/acx_pthread.m4
@@ -0,0 +1,224 @@
+dnl @synopsis ACX_PTHREAD([ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]])
+dnl
+dnl This macro figures out how to build C programs using POSIX
+dnl threads.  It sets the PTHREAD_LIBS output variable to the threads
+dnl library and linker flags, and the PTHREAD_CFLAGS output variable
+dnl to any special C compiler flags that are needed.  (The user can also
+dnl force certain compiler flags/libs to be tested by setting these
+dnl environment variables.)
+dnl
+dnl Also sets PTHREAD_CC to any special C compiler that is needed for
+dnl multi-threaded programs (defaults to the value of CC otherwise).
+dnl (This is necessary on AIX to use the special cc_r compiler alias.)
+dnl
+dnl If you are only building threads programs, you may wish to
+dnl use these variables in your default LIBS, CFLAGS, and CC:
+dnl
+dnl        LIBS="$PTHREAD_LIBS $LIBS"
+dnl        CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+dnl        CC="$PTHREAD_CC"
+dnl
+dnl In addition, if the PTHREAD_CREATE_JOINABLE thread-attribute
+dnl constant has a nonstandard name, defines PTHREAD_CREATE_JOINABLE
+dnl to that name (e.g. PTHREAD_CREATE_UNDETACHED on AIX).
+dnl
+dnl ACTION-IF-FOUND is a list of shell commands to run if a threads
+dnl library is found, and ACTION-IF-NOT-FOUND is a list of commands
+dnl to run if it is not found.  If ACTION-IF-FOUND is not specified,
+dnl the default action will define HAVE_PTHREAD.
+dnl
+dnl Please let the authors know if this macro fails on any platform,
+dnl or if you have any other suggestions or comments.  This macro was
+dnl based on work by SGJ on autoconf scripts for FFTW (www.fftw.org)
+dnl (with help from M. Frigo), as well as ac_pthread and hb_pthread
+dnl macros posted by AFC to the autoconf macro repository.  We are also
+dnl grateful for the helpful feedback of numerous users.
+dnl
+dnl @version $Id$
+dnl @author Steven G. Johnson <stevenj@alum.mit.edu> and Alejandro Forero Cuervo <bachue@bachue.com>
+
+AC_DEFUN([ACX_PTHREAD], [
+AC_REQUIRE([AC_CANONICAL_HOST])
+acx_pthread_ok=no
+
+# First, check if the POSIX threads header, pthread.h, is available.
+# If it isn't, don't bother looking for the threads libraries.
+AC_CHECK_HEADER(pthread.h, , acx_pthread_ok=noheader)
+
+# We must check for the threads library under a number of different
+# names; the ordering is very important because some systems
+# (e.g. DEC) have both -lpthread and -lpthreads, where one of the
+# libraries is broken (non-POSIX).
+
+# First of all, check if the user has set any of the PTHREAD_LIBS,
+# etcetera environment variables, and if threads linking works using
+# them:
+if test x"$PTHREAD_LIBS$PTHREAD_CFLAGS" != x; then
+        save_CFLAGS="$CFLAGS"
+        CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+        save_LIBS="$LIBS"
+        LIBS="$PTHREAD_LIBS $LIBS"
+        AC_MSG_CHECKING([for pthread_join in LIBS=$PTHREAD_LIBS with CFLAGS=$PTHREAD_CFLAGS])
+        AC_TRY_LINK_FUNC(pthread_join, acx_pthread_ok=yes)
+        AC_MSG_RESULT($acx_pthread_ok)
+        if test x"$acx_pthread_ok" = xno; then
+                PTHREAD_LIBS=""
+                PTHREAD_CFLAGS=""
+        fi
+        LIBS="$save_LIBS"
+        CFLAGS="$save_CFLAGS"
+fi
+
+# Create a list of thread flags to try.  Items starting with a "-" are
+# C compiler flags, and other items are library names, except for "none"
+# which indicates that we try without any flags at all.
+
+acx_pthread_flags="pthreads none -Kthread -kthread lthread -pthread -pthreads -mthreads pthread --thread-safe -mt"
+
+# The ordering *is* (sometimes) important.  Some notes on the
+# individual items follow:
+
+# pthreads: AIX (must check this before -lpthread)
+# none: in case threads are in libc; should be tried before -Kthread and
+#       other compiler flags to prevent continual compiler warnings
+# -Kthread: Sequent (threads in libc, but -Kthread needed for pthread.h)
+# -kthread: FreeBSD kernel threads (preferred to -pthread since SMP-able)
+# lthread: LinuxThreads port on FreeBSD (also preferred to -pthread)
+# -pthread: Linux/gcc (kernel threads), BSD/gcc (userland threads)
+# -pthreads: Solaris/gcc
+# -mthreads: Mingw32/gcc, Lynx/gcc
+# -mt: Sun Workshop C (may only link SunOS threads [-lthread], but it
+#      doesn't hurt to check since this sometimes defines pthreads too;
+#      also defines -D_REENTRANT)
+# pthread: Linux, etcetera
+# --thread-safe: KAI C++
+
+case "${host_cpu}-${host_os}" in
+        *solaris*)
+
+        # On Solaris (at least, for some versions), libc contains stubbed
+        # (non-functional) versions of the pthreads routines, so link-based
+        # tests will erroneously succeed.  (We need to link with -pthread or
+        # -lpthread.)  (The stubs are missing pthread_cleanup_push, or rather
+        # a function called by this macro, so we could check for that, but
+        # who knows whether they'll stub that too in a future libc.)  So,
+        # we'll just look for -pthreads and -lpthread first:
+
+        acx_pthread_flags="-pthread -pthreads pthread -mt $acx_pthread_flags"
+        ;;
+esac
+
+if test x"$acx_pthread_ok" = xno; then
+for flag in $acx_pthread_flags; do
+
+        case $flag in
+                none)
+                AC_MSG_CHECKING([whether pthreads work without any flags])
+                ;;
+
+                -*)
+                AC_MSG_CHECKING([whether pthreads work with $flag])
+                PTHREAD_CFLAGS="$flag"
+                ;;
+
+                *)
+                AC_MSG_CHECKING([for the pthreads library -l$flag])
+                PTHREAD_LIBS="-l$flag"
+                ;;
+        esac
+
+        save_LIBS="$LIBS"
+        save_CFLAGS="$CFLAGS"
+        LIBS="$PTHREAD_LIBS $LIBS"
+        CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+
+        # Check for various functions.  We must include pthread.h,
+        # since some functions may be macros.  (On the Sequent, we
+        # need a special flag -Kthread to make this header compile.)
+        # We check for pthread_join because it is in -lpthread on IRIX
+        # while pthread_create is in libc.  We check for pthread_attr_init
+        # due to DEC craziness with -lpthreads.  We check for
+        # pthread_cleanup_push because it is one of the few pthread
+        # functions on Solaris that doesn't have a non-functional libc stub.
+        # We try pthread_create on general principles.
+        AC_TRY_LINK([#include <pthread.h>],
+                    [pthread_t th; pthread_join(th, 0);
+                     pthread_attr_init(0); pthread_cleanup_push(0, 0);
+                     pthread_create(0,0,0,0); pthread_cleanup_pop(0); ],
+                    [acx_pthread_ok=yes])
+
+        LIBS="$save_LIBS"
+        CFLAGS="$save_CFLAGS"
+
+        AC_MSG_RESULT($acx_pthread_ok)
+        if test "x$acx_pthread_ok" = xyes; then
+                break;
+        fi
+
+        PTHREAD_LIBS=""
+        PTHREAD_CFLAGS=""
+done
+fi
+
+# Various other checks:
+if test "x$acx_pthread_ok" = xyes; then
+        save_LIBS="$LIBS"
+        LIBS="$PTHREAD_LIBS $LIBS"
+        save_CFLAGS="$CFLAGS"
+        CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+
+        # Detect AIX lossage: threads are created detached by default and JOINABLE
+        # is named UNDETACHED.  The define must expand the shell variable $ok.
+        AC_MSG_CHECKING([for joinable pthread attribute])
+        AC_TRY_LINK([#include <pthread.h>],
+                    [int attr=PTHREAD_CREATE_JOINABLE;],
+                    ok=PTHREAD_CREATE_JOINABLE, ok=unknown)
+        if test x"$ok" = xunknown; then
+                AC_TRY_LINK([#include <pthread.h>],
+                            [int attr=PTHREAD_CREATE_UNDETACHED;],
+                            ok=PTHREAD_CREATE_UNDETACHED, ok=unknown)
+        fi
+        if test x"$ok" != xPTHREAD_CREATE_JOINABLE; then
+                AC_DEFINE_UNQUOTED(PTHREAD_CREATE_JOINABLE, $ok,
+                          [Define to the necessary symbol if this constant
+                           uses a non-standard name on your system.])
+        fi
+        AC_MSG_RESULT(${ok})
+        if test x"$ok" = xunknown; then
+                AC_MSG_WARN([we do not know how to create joinable pthreads])
+        fi
+
+        AC_MSG_CHECKING([if more special flags are required for pthreads])
+        flag=no
+        case "${host_cpu}-${host_os}" in
+                *-aix* | *-freebsd*)     flag="-D_THREAD_SAFE";;
+                *solaris* | alpha*-osf*) flag="-D_REENTRANT";;
+        esac
+        AC_MSG_RESULT(${flag})
+        if test "x$flag" != xno; then
+                PTHREAD_CFLAGS="$flag $PTHREAD_CFLAGS"
+        fi
+
+        LIBS="$save_LIBS"
+        CFLAGS="$save_CFLAGS"
+
+        # More AIX lossage: must compile with cc_r
+        AC_CHECK_PROG(PTHREAD_CC, cc_r, cc_r, ${CC})
+else
+        PTHREAD_CC="$CC"
+fi
+
+AC_SUBST(PTHREAD_LIBS)
+AC_SUBST(PTHREAD_CFLAGS)
+AC_SUBST(PTHREAD_CC)
+
+# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND:
+if test x"$acx_pthread_ok" = xyes; then
+        ifelse([$1],,AC_DEFINE(HAVE_PTHREAD,1,[Define if you have POSIX threads libraries and header files.]),[$1])
+        :
+else
+        acx_pthread_ok=no
+        $2
+fi
+
+])dnl ACX_PTHREAD
diff --git a/openmp-avx512/basic/optional/ThreadPool/config/config.guess b/openmp-avx512/basic/optional/ThreadPool/config/config.guess
new file mode 100755
index 0000000..396482d
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/config/config.guess
@@ -0,0 +1,1500 @@
+#! /bin/sh
+# Attempt to guess a canonical system name.
+#   Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
+#   2000, 2001, 2002, 2003, 2004, 2005, 2006 Free Software Foundation,
+#   Inc.
+
+timestamp='2006-07-02'
+
+# This file is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+# 02110-1301, USA.
+#
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+
+# Originally written by Per Bothner <per@bothner.com>.
+# Please send patches to <config-patches@gnu.org>.  Submit a context
+# diff and a properly formatted ChangeLog entry.
+#
+# This script attempts to guess a canonical system name similar to
+# config.sub.  If it succeeds, it prints the system name on stdout, and
+# exits with 0.  Otherwise, it exits with 1.
+#
+# The plan is that this can be called by configure scripts if you
+# don't specify an explicit build system type.
+
+me=`echo "$0" | sed -e 's,.*/,,'`
+
+usage="\
+Usage: $0 [OPTION]
+
+Output the configuration name of the system \`$me' is run on.
+
+Operation modes:
+  -h, --help         print this help, then exit
+  -t, --time-stamp   print date of last modification, then exit
+  -v, --version      print version number, then exit
+
+Report bugs and patches to <config-patches@gnu.org>."
+
+version="\
+GNU config.guess ($timestamp)
+
+Originally written by Per Bothner.
+Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005
+Free Software Foundation, Inc.
+
+This is free software; see the source for copying conditions.  There is NO
+warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
+
+help="
+Try \`$me --help' for more information."
+
+# Parse command line.  Every arm either exits or breaks, so at most one argument is examined.
+while test $# -gt 0 ; do
+  case $1 in
+    --time-stamp | --time* | -t )
+       echo "$timestamp" ; exit ;;
+    --version | -v )
+       echo "$version" ; exit ;;
+    --help | --h* | -h )
+       echo "$usage"; exit ;;
+    -- )     # Stop option processing
+       shift; break ;;
+    - )	# Use stdin as input.  NOTE(review): "-" is not shifted, so the operand check below rejects it.
+       break ;;
+    -* )     # Any other dash-prefixed word is reported on stderr as an invalid option
+       echo "$me: invalid option $1$help" >&2
+       exit 1 ;;
+    * )      # First non-option word ends option parsing
+       break ;;
+  esac
+done
+# No operands are accepted: anything still left in the argument list is an error.
+if test $# != 0; then
+  echo "$me: too many arguments$help" >&2
+  exit 1
+fi
+
+trap 'exit 1' 1 2 15
+
+# CC_FOR_BUILD -- compiler used by this script. Note that the use of a
+# compiler to aid in system detection is discouraged as it requires
+# temporary files to be created and, as you can see below, it is a
+# headache to deal with in a portable fashion.
+
+# Historically, `CC_FOR_BUILD' used to be named `HOST_CC'. We still
+# use `HOST_CC' if defined, but it is deprecated.
+
+# Portable tmp directory creation inspired by the Autoconf team.
+
+set_cc_for_build='
+trap "exitcode=\$?; (rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null) && exit \$exitcode" 0 ;
+trap "rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null; exit 1" 1 2 13 15 ;
+: ${TMPDIR=/tmp} ;
+ { tmp=`(umask 077 && mktemp -d "$TMPDIR/cgXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" ; } ||
+ { test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir $tmp) ; } ||
+ { tmp=$TMPDIR/cg-$$ && (umask 077 && mkdir $tmp) && echo "Warning: creating insecure temp directory" >&2 ; } ||
+ { echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; } ;
+dummy=$tmp/dummy ;
+tmpfiles="$dummy.c $dummy.o $dummy.rel $dummy" ;
+case $CC_FOR_BUILD,$HOST_CC,$CC in
+ ,,)    echo "int x;" > $dummy.c ;
+	for c in cc gcc c89 c99 ; do
+	  if ($c -c -o $dummy.o $dummy.c) >/dev/null 2>&1 ; then
+	     CC_FOR_BUILD="$c"; break ;
+	  fi ;
+	done ;
+	if test x"$CC_FOR_BUILD" = x ; then
+	  CC_FOR_BUILD=no_compiler_found ;
+	fi
+	;;
+ ,,*)   CC_FOR_BUILD=$CC ;;
+ ,*,*)  CC_FOR_BUILD=$HOST_CC ;;
+esac ; set_cc_for_build= ;'
+
+# This is needed to find uname on a Pyramid OSx when run in the BSD universe.
+# (ghazi@noc.rutgers.edu 1994-08-24)
+if (test -f /.attbin/uname) >/dev/null 2>&1 ; then
+	PATH=$PATH:/.attbin ; export PATH  # appended last, so a uname already on PATH is still found first
+fi
+# Capture the four uname fields that the case statement below matches on; a failing uname yields "unknown".
+UNAME_MACHINE=`(uname -m) 2>/dev/null` || UNAME_MACHINE=unknown
+UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown
+UNAME_SYSTEM=`(uname -s) 2>/dev/null`  || UNAME_SYSTEM=unknown
+UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown
+
+# Note: order is significant - the case branches are not exclusive.
+
+case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
+    *:NetBSD:*:*)
+	# NetBSD (nbsd) targets should (where applicable) match one or
+	# more of the tuples: *-*-netbsdelf*, *-*-netbsdaout*,
+	# *-*-netbsdecoff* and *-*-netbsd*.  For targets that recently
+	# switched to ELF, *-*-netbsd* would select the old
+	# object file format.  This provides both forward
+	# compatibility and a consistent mechanism for selecting the
+	# object file format.
+	#
+	# Note: NetBSD doesn't particularly care about the vendor
+	# portion of the name.  We always set it to "unknown".
+	sysctl="sysctl -n hw.machine_arch"
+	UNAME_MACHINE_ARCH=`(/sbin/$sysctl 2>/dev/null || \
+	    /usr/sbin/$sysctl 2>/dev/null || echo unknown)`
+	case "${UNAME_MACHINE_ARCH}" in
+	    armeb) machine=armeb-unknown ;;
+	    arm*) machine=arm-unknown ;;
+	    sh3el) machine=shl-unknown ;;
+	    sh3eb) machine=sh-unknown ;;
+	    *) machine=${UNAME_MACHINE_ARCH}-unknown ;;
+	esac
+	# The Operating System including object format, if it has switched
+	# to ELF recently, or will in the future.
+	case "${UNAME_MACHINE_ARCH}" in
+	    arm*|i386|m68k|ns32k|sh3*|sparc|vax)
+		eval $set_cc_for_build
+		if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \
+			| grep __ELF__ >/dev/null
+		then
+		    # Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout).
+		    # Return netbsd for either.  FIX?
+		    os=netbsd
+		else
+		    os=netbsdelf
+		fi
+		;;
+	    *)
+	        os=netbsd
+		;;
+	esac
+	# The OS release
+	# Debian GNU/NetBSD machines have a different userland, and
+	# thus, need a distinct triplet. However, they do not need
+	# kernel version information, so it can be replaced with a
+	# suitable tag, in the style of linux-gnu.
+	case "${UNAME_VERSION}" in
+	    Debian*)
+		release='-gnu'
+		;;
+	    *)
+		release=`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'`
+		;;
+	esac
+	# Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM:
+	# contains redundant information, the shorter form:
+	# CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used.
+	echo "${machine}-${os}${release}"
+	exit ;;
+    *:OpenBSD:*:*)
+	UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'`
+	echo ${UNAME_MACHINE_ARCH}-unknown-openbsd${UNAME_RELEASE}
+	exit ;;
+    *:ekkoBSD:*:*)
+	echo ${UNAME_MACHINE}-unknown-ekkobsd${UNAME_RELEASE}
+	exit ;;
+    *:SolidBSD:*:*)
+	echo ${UNAME_MACHINE}-unknown-solidbsd${UNAME_RELEASE}
+	exit ;;
+    macppc:MirBSD:*:*)
+	echo powerpc-unknown-mirbsd${UNAME_RELEASE}
+	exit ;;
+    *:MirBSD:*:*)
+	echo ${UNAME_MACHINE}-unknown-mirbsd${UNAME_RELEASE}
+	exit ;;
+    alpha:OSF1:*:*)
+	case $UNAME_RELEASE in
+	*4.0)
+		UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'`
+		;;
+	*5.*)
+	        UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $4}'`
+		;;
+	esac
+	# According to Compaq, /usr/sbin/psrinfo has been available on
+	# OSF/1 and Tru64 systems produced since 1995.  I hope that
+	# covers most systems running today.  This code pipes the CPU
+	# types through head -n 1, so we only detect the type of CPU 0.
+	ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^  The alpha \(.*\) processor.*$/\1/p' | head -n 1`
+	case "$ALPHA_CPU_TYPE" in
+	    "EV4 (21064)")
+		UNAME_MACHINE="alpha" ;;
+	    "EV4.5 (21064)")
+		UNAME_MACHINE="alpha" ;;
+	    "LCA4 (21066/21068)")
+		UNAME_MACHINE="alpha" ;;
+	    "EV5 (21164)")
+		UNAME_MACHINE="alphaev5" ;;
+	    "EV5.6 (21164A)")
+		UNAME_MACHINE="alphaev56" ;;
+	    "EV5.6 (21164PC)")
+		UNAME_MACHINE="alphapca56" ;;
+	    "EV5.7 (21164PC)")
+		UNAME_MACHINE="alphapca57" ;;
+	    "EV6 (21264)")
+		UNAME_MACHINE="alphaev6" ;;
+	    "EV6.7 (21264A)")
+		UNAME_MACHINE="alphaev67" ;;
+	    "EV6.8CB (21264C)")
+		UNAME_MACHINE="alphaev68" ;;
+	    "EV6.8AL (21264B)")
+		UNAME_MACHINE="alphaev68" ;;
+	    "EV6.8CX (21264D)")
+		UNAME_MACHINE="alphaev68" ;;
+	    "EV6.9A (21264/EV69A)")
+		UNAME_MACHINE="alphaev69" ;;
+	    "EV7 (21364)")
+		UNAME_MACHINE="alphaev7" ;;
+	    "EV7.9 (21364A)")
+		UNAME_MACHINE="alphaev79" ;;
+	esac
+	# A Pn.n version is a patched version.
+	# A Vn.n version is a released version.
+	# A Tn.n version is a released field test version.
+	# A Xn.n version is an unreleased experimental baselevel.
+	# 1.2 uses "1.2" for uname -r.
+	echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'`
+	exit ;;
+    Alpha\ *:Windows_NT*:*)
+	# How do we know it's Interix rather than the generic POSIX subsystem?
+	# Should we change UNAME_MACHINE based on the output of uname instead
+	# of the specific Alpha model?
+	echo alpha-pc-interix
+	exit ;;
+    21064:Windows_NT:50:3)
+	echo alpha-dec-winnt3.5
+	exit ;;
+    Amiga*:UNIX_System_V:4.0:*)
+	echo m68k-unknown-sysv4
+	exit ;;
+    *:[Aa]miga[Oo][Ss]:*:*)
+	echo ${UNAME_MACHINE}-unknown-amigaos
+	exit ;;
+    *:[Mm]orph[Oo][Ss]:*:*)
+	echo ${UNAME_MACHINE}-unknown-morphos
+	exit ;;
+    *:OS/390:*:*)
+	echo i370-ibm-openedition
+	exit ;;
+    *:z/VM:*:*)
+	echo s390-ibm-zvmoe
+	exit ;;
+    *:OS400:*:*)
+        echo powerpc-ibm-os400
+	exit ;;
+    arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*)
+	echo arm-acorn-riscix${UNAME_RELEASE}
+	exit ;;
+    arm:riscos:*:*|arm:RISCOS:*:*)
+	echo arm-unknown-riscos
+	exit ;;
+    SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*)
+	echo hppa1.1-hitachi-hiuxmpp
+	exit ;;
+    Pyramid*:OSx*:*:* | MIS*:OSx*:*:* | MIS*:SMP_DC-OSx*:*:*)
+	# akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE.
+	if test "`(/bin/universe) 2>/dev/null`" = att ; then
+		echo pyramid-pyramid-sysv3
+	else
+		echo pyramid-pyramid-bsd
+	fi
+	exit ;;
+    NILE*:*:*:dcosx)
+	echo pyramid-pyramid-svr4
+	exit ;;
+    DRS?6000:unix:4.0:6*)
+	echo sparc-icl-nx6
+	exit ;;
+    DRS?6000:UNIX_SV:4.2*:7* | DRS?6000:isis:4.2*:7*)
+	case `/usr/bin/uname -p` in
+	    sparc) echo sparc-icl-nx7; exit ;;
+	esac ;;
+    sun4H:SunOS:5.*:*)
+	echo sparc-hal-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+	exit ;;
+    sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*)
+	echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+	exit ;;
+    i86pc:SunOS:5.*:*)
+	echo i386-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+	exit ;;
+    sun4*:SunOS:6*:*)
+	# According to config.sub, this is the proper way to canonicalize
+	# SunOS6.  Hard to guess exactly what SunOS6 will be like, but
+	# it's likely to be more like Solaris than SunOS4.
+	echo sparc-sun-solaris3`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+	exit ;;
+    sun4*:SunOS:*:*)
+	case "`/usr/bin/arch -k`" in
+	    Series*|S4*)
+		UNAME_RELEASE=`uname -v`
+		;;
+	esac
+	# Japanese Language versions have a version number like `4.1.3-JL'.
+	echo sparc-sun-sunos`echo ${UNAME_RELEASE}|sed -e 's/-/_/'`
+	exit ;;
+    sun3*:SunOS:*:*)
+	echo m68k-sun-sunos${UNAME_RELEASE}
+	exit ;;
+    sun*:*:4.2BSD:*)
+	UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null`
+	test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3
+	case "`/bin/arch`" in
+	    sun3)
+		echo m68k-sun-sunos${UNAME_RELEASE}
+		;;
+	    sun4)
+		echo sparc-sun-sunos${UNAME_RELEASE}
+		;;
+	esac
+	exit ;;
+    aushp:SunOS:*:*)
+	echo sparc-auspex-sunos${UNAME_RELEASE}
+	exit ;;
+    # The situation for MiNT is a little confusing.  The machine name
+    # can be virtually everything (everything which is not
+    # "atarist" or "atariste" at least should have a processor
+    # > m68000).  The system name ranges from "MiNT" over "FreeMiNT"
+    # to the lowercase version "mint" (or "freemint").  Finally
+    # the system name "TOS" denotes a system which is actually not
+    # MiNT.  But MiNT is downward compatible to TOS, so this should
+    # be no problem.
+    atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*)
+        echo m68k-atari-mint${UNAME_RELEASE}
+	exit ;;
+    atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*)
+	echo m68k-atari-mint${UNAME_RELEASE}
+        exit ;;
+    *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*)
+        echo m68k-atari-mint${UNAME_RELEASE}
+	exit ;;
+    milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*)
+        echo m68k-milan-mint${UNAME_RELEASE}
+        exit ;;
+    hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*)
+        echo m68k-hades-mint${UNAME_RELEASE}
+        exit ;;
+    *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*)
+        echo m68k-unknown-mint${UNAME_RELEASE}
+        exit ;;
+    m68k:machten:*:*)
+	echo m68k-apple-machten${UNAME_RELEASE}
+	exit ;;
+    powerpc:machten:*:*)
+	echo powerpc-apple-machten${UNAME_RELEASE}
+	exit ;;
+    RISC*:Mach:*:*)
+	echo mips-dec-mach_bsd4.3
+	exit ;;
+    RISC*:ULTRIX:*:*)
+	echo mips-dec-ultrix${UNAME_RELEASE}
+	exit ;;
+    VAX*:ULTRIX*:*:*)
+	echo vax-dec-ultrix${UNAME_RELEASE}
+	exit ;;
+    2020:CLIX:*:* | 2430:CLIX:*:*)
+	echo clipper-intergraph-clix${UNAME_RELEASE}
+	exit ;;
+    mips:*:*:UMIPS | mips:*:*:RISCos)
+	eval $set_cc_for_build
+	sed 's/^	//' << EOF >$dummy.c
+#ifdef __cplusplus
+#include <stdio.h>  /* for printf() prototype */
+	int main (int argc, char *argv[]) {
+#else
+	int main (argc, argv) int argc; char *argv[]; {
+#endif
+	#if defined (host_mips) && defined (MIPSEB)
+	#if defined (SYSTYPE_SYSV)
+	  printf ("mips-mips-riscos%ssysv\n", argv[1]); exit (0);
+	#endif
+	#if defined (SYSTYPE_SVR4)
+	  printf ("mips-mips-riscos%ssvr4\n", argv[1]); exit (0);
+	#endif
+	#if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD)
+	  printf ("mips-mips-riscos%sbsd\n", argv[1]); exit (0);
+	#endif
+	#endif
+	  exit (-1);
+	}
+EOF
+	$CC_FOR_BUILD -o $dummy $dummy.c &&
+	  dummyarg=`echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` &&
+	  SYSTEM_NAME=`$dummy $dummyarg` &&
+	    { echo "$SYSTEM_NAME"; exit; }
+	echo mips-mips-riscos${UNAME_RELEASE}
+	exit ;;
+    Motorola:PowerMAX_OS:*:*)
+	echo powerpc-motorola-powermax
+	exit ;;
+    Motorola:*:4.3:PL8-*)
+	echo powerpc-harris-powermax
+	exit ;;
+    Night_Hawk:*:*:PowerMAX_OS | Synergy:PowerMAX_OS:*:*)
+	echo powerpc-harris-powermax
+	exit ;;
+    Night_Hawk:Power_UNIX:*:*)
+	echo powerpc-harris-powerunix
+	exit ;;
+    m88k:CX/UX:7*:*)
+	echo m88k-harris-cxux7
+	exit ;;
+    m88k:*:4*:R4*)
+	echo m88k-motorola-sysv4
+	exit ;;
+    m88k:*:3*:R3*)
+	echo m88k-motorola-sysv3
+	exit ;;
+    AViiON:dgux:*:*)
+        # DG/UX returns AViiON for all architectures
+        UNAME_PROCESSOR=`/usr/bin/uname -p`
+	if [ $UNAME_PROCESSOR = mc88100 ] || [ $UNAME_PROCESSOR = mc88110 ]
+	then
+	    if [ ${TARGET_BINARY_INTERFACE}x = m88kdguxelfx ] || \
+	       [ ${TARGET_BINARY_INTERFACE}x = x ]
+	    then
+		echo m88k-dg-dgux${UNAME_RELEASE}
+	    else
+		echo m88k-dg-dguxbcs${UNAME_RELEASE}
+	    fi
+	else
+	    echo i586-dg-dgux${UNAME_RELEASE}
+	fi
+ 	exit ;;
+    M88*:DolphinOS:*:*)	# DolphinOS (SVR3)
+	echo m88k-dolphin-sysv3
+	exit ;;
+    M88*:*:R3*:*)
+	# Delta 88k system running SVR3
+	echo m88k-motorola-sysv3
+	exit ;;
+    XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3)
+	echo m88k-tektronix-sysv3
+	exit ;;
+    Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD)
+	echo m68k-tektronix-bsd
+	exit ;;
+    *:IRIX*:*:*)
+	echo mips-sgi-irix`echo ${UNAME_RELEASE}|sed -e 's/-/_/g'`
+	exit ;;
+    ????????:AIX?:[12].1:2)   # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX.
+	echo romp-ibm-aix     # uname -m gives an 8 hex-code CPU id
+	exit ;;               # Note that: echo "'`uname -s`'" gives 'AIX '
+    i*86:AIX:*:*)
+	echo i386-ibm-aix
+	exit ;;
+    ia64:AIX:*:*)
+	if [ -x /usr/bin/oslevel ] ; then
+		IBM_REV=`/usr/bin/oslevel`
+	else
+		IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE}
+	fi
+	echo ${UNAME_MACHINE}-ibm-aix${IBM_REV}
+	exit ;;
+    *:AIX:2:3)
+	if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then
+		eval $set_cc_for_build
+		sed 's/^		//' << EOF >$dummy.c
+		#include <sys/systemcfg.h>
+
+		main()
+			{
+			if (!__power_pc())
+				exit(1);
+			puts("powerpc-ibm-aix3.2.5");
+			exit(0);
+			}
+EOF
+		if $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy`
+		then
+			echo "$SYSTEM_NAME"
+		else
+			echo rs6000-ibm-aix3.2.5
+		fi
+	elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then
+		echo rs6000-ibm-aix3.2.4
+	else
+		echo rs6000-ibm-aix3.2
+	fi
+	exit ;;
+    *:AIX:*:[45])
+	IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'`
+	if /usr/sbin/lsattr -El ${IBM_CPU_ID} | grep ' POWER' >/dev/null 2>&1; then
+		IBM_ARCH=rs6000
+	else
+		IBM_ARCH=powerpc
+	fi
+	if [ -x /usr/bin/oslevel ] ; then
+		IBM_REV=`/usr/bin/oslevel`
+	else
+		IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE}
+	fi
+	echo ${IBM_ARCH}-ibm-aix${IBM_REV}
+	exit ;;
+    *:AIX:*:*)
+	echo rs6000-ibm-aix
+	exit ;;
+    ibmrt:4.4BSD:*|romp-ibm:BSD:*)
+	echo romp-ibm-bsd4.4
+	exit ;;
+    ibmrt:*BSD:*|romp-ibm:BSD:*)            # covers RT/PC BSD and
+	echo romp-ibm-bsd${UNAME_RELEASE}   # 4.3 with uname added to
+	exit ;;                             # report: romp-ibm BSD 4.3
+    *:BOSX:*:*)
+	echo rs6000-bull-bosx
+	exit ;;
+    DPX/2?00:B.O.S.:*:*)
+	echo m68k-bull-sysv3
+	exit ;;
+    9000/[34]??:4.3bsd:1.*:*)
+	echo m68k-hp-bsd
+	exit ;;
+    hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*)
+	echo m68k-hp-bsd4.4
+	exit ;;
+    9000/[34678]??:HP-UX:*:*)
+	HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'`
+	case "${UNAME_MACHINE}" in
+	    9000/31? )            HP_ARCH=m68000 ;;
+	    9000/[34]?? )         HP_ARCH=m68k ;;
+	    9000/[678][0-9][0-9])
+		if [ -x /usr/bin/getconf ]; then
+		    sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null`
+                    sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null`
+                    case "${sc_cpu_version}" in
+                      523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0
+                      528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1
+                      532)                      # CPU_PA_RISC2_0
+                        case "${sc_kernel_bits}" in
+                          32) HP_ARCH="hppa2.0n" ;;
+                          64) HP_ARCH="hppa2.0w" ;;
+			  '') HP_ARCH="hppa2.0" ;;   # HP-UX 10.20
+                        esac ;;
+                    esac
+		fi
+		if [ "${HP_ARCH}" = "" ]; then
+		    eval $set_cc_for_build
+		    sed 's/^              //' << EOF >$dummy.c
+
+              #define _HPUX_SOURCE
+              #include <stdlib.h>
+              #include <unistd.h>
+
+              int main ()
+              {
+              #if defined(_SC_KERNEL_BITS)
+                  long bits = sysconf(_SC_KERNEL_BITS);
+              #endif
+                  long cpu  = sysconf (_SC_CPU_VERSION);
+
+                  switch (cpu)
+              	{
+              	case CPU_PA_RISC1_0: puts ("hppa1.0"); break;
+              	case CPU_PA_RISC1_1: puts ("hppa1.1"); break;
+              	case CPU_PA_RISC2_0:
+              #if defined(_SC_KERNEL_BITS)
+              	    switch (bits)
+              		{
+              		case 64: puts ("hppa2.0w"); break;
+              		case 32: puts ("hppa2.0n"); break;
+              		default: puts ("hppa2.0"); break;
+              		} break;
+              #else  /* !defined(_SC_KERNEL_BITS) */
+              	    puts ("hppa2.0"); break;
+              #endif
+              	default: puts ("hppa1.0"); break;
+              	}
+                  exit (0);
+              }
+EOF
+		    (CCOPTS= $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy`
+		    test -z "$HP_ARCH" && HP_ARCH=hppa
+		fi ;;
+	esac
+	if [ ${HP_ARCH} = "hppa2.0w" ]
+	then
+	    eval $set_cc_for_build
+
+	    # hppa2.0w-hp-hpux* has a 64-bit kernel and a compiler generating
+	    # 32-bit code.  hppa64-hp-hpux* has the same kernel and a compiler
+	    # generating 64-bit code.  GNU and HP use different nomenclature:
+	    #
+	    # $ CC_FOR_BUILD=cc ./config.guess
+	    # => hppa2.0w-hp-hpux11.23
+	    # $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess
+	    # => hppa64-hp-hpux11.23
+
+	    if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) |
+		grep __LP64__ >/dev/null
+	    then
+		HP_ARCH="hppa2.0w"
+	    else
+		HP_ARCH="hppa64"
+	    fi
+	fi
+	echo ${HP_ARCH}-hp-hpux${HPUX_REV}
+	exit ;;
+    ia64:HP-UX:*:*)
+	HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'`
+	echo ia64-hp-hpux${HPUX_REV}
+	exit ;;
+    3050*:HI-UX:*:*)
+	eval $set_cc_for_build
+	sed 's/^	//' << EOF >$dummy.c
+	#include <unistd.h>
+	int
+	main ()
+	{
+	  long cpu = sysconf (_SC_CPU_VERSION);
+	  /* The order matters, because CPU_IS_HP_MC68K erroneously returns
+	     true for CPU_PA_RISC1_0.  CPU_IS_PA_RISC returns correct
+	     results, however.  */
+	  if (CPU_IS_PA_RISC (cpu))
+	    {
+	      switch (cpu)
+		{
+		  case CPU_PA_RISC1_0: puts ("hppa1.0-hitachi-hiuxwe2"); break;
+		  case CPU_PA_RISC1_1: puts ("hppa1.1-hitachi-hiuxwe2"); break;
+		  case CPU_PA_RISC2_0: puts ("hppa2.0-hitachi-hiuxwe2"); break;
+		  default: puts ("hppa-hitachi-hiuxwe2"); break;
+		}
+	    }
+	  else if (CPU_IS_HP_MC68K (cpu))
+	    puts ("m68k-hitachi-hiuxwe2");
+	  else puts ("unknown-hitachi-hiuxwe2");
+	  exit (0);
+	}
+EOF
+	$CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` &&
+		{ echo "$SYSTEM_NAME"; exit; }
+	echo unknown-hitachi-hiuxwe2
+	exit ;;
+    9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:* )
+	echo hppa1.1-hp-bsd
+	exit ;;
+    9000/8??:4.3bsd:*:*)
+	echo hppa1.0-hp-bsd
+	exit ;;
+    *9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*)
+	echo hppa1.0-hp-mpeix
+	exit ;;
+    hp7??:OSF1:*:* | hp8?[79]:OSF1:*:* )
+	echo hppa1.1-hp-osf
+	exit ;;
+    hp8??:OSF1:*:*)
+	echo hppa1.0-hp-osf
+	exit ;;
+    i*86:OSF1:*:*)
+	if [ -x /usr/sbin/sysversion ] ; then
+	    echo ${UNAME_MACHINE}-unknown-osf1mk
+	else
+	    echo ${UNAME_MACHINE}-unknown-osf1
+	fi
+	exit ;;
+    parisc*:Lites*:*:*)
+	echo hppa1.1-hp-lites
+	exit ;;
+    C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*)
+	echo c1-convex-bsd
+        exit ;;
+    C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*)
+	if getsysinfo -f scalar_acc
+	then echo c32-convex-bsd
+	else echo c2-convex-bsd
+	fi
+        exit ;;
+    C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*)
+	echo c34-convex-bsd
+        exit ;;
+    C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*)
+	echo c38-convex-bsd
+        exit ;;
+    C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*)
+	echo c4-convex-bsd
+        exit ;;
+    CRAY*Y-MP:*:*:*)
+	echo ymp-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
+	exit ;;
+    CRAY*[A-Z]90:*:*:*)
+	echo ${UNAME_MACHINE}-cray-unicos${UNAME_RELEASE} \
+	| sed -e 's/CRAY.*\([A-Z]90\)/\1/' \
+	      -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ \
+	      -e 's/\.[^.]*$/.X/'
+	exit ;;
+    CRAY*TS:*:*:*)
+	echo t90-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
+	exit ;;
+    CRAY*T3E:*:*:*)
+	echo alphaev5-cray-unicosmk${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
+	exit ;;
+    CRAY*SV1:*:*:*)
+	echo sv1-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
+	exit ;;
+    *:UNICOS/mp:*:*)
+	echo craynv-cray-unicosmp${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
+	exit ;;
+    F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*)
+	FUJITSU_PROC=`uname -m | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'`
+        FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
+        FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'`
+        echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
+        exit ;;
+    5000:UNIX_System_V:4.*:*)
+        FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
+        FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'`
+        echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
+	exit ;;
+    i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*)
+	echo ${UNAME_MACHINE}-pc-bsdi${UNAME_RELEASE}
+	exit ;;
+    sparc*:BSD/OS:*:*)
+	echo sparc-unknown-bsdi${UNAME_RELEASE}
+	exit ;;
+    *:BSD/OS:*:*)
+	echo ${UNAME_MACHINE}-unknown-bsdi${UNAME_RELEASE}
+	exit ;;
+    *:FreeBSD:*:*)
+	case ${UNAME_MACHINE} in
+	    pc98)
+		echo i386-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
+	    amd64)
+		echo x86_64-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
+	    *)
+		echo ${UNAME_MACHINE}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
+	esac
+	exit ;;
+    i*:CYGWIN*:*)
+	echo ${UNAME_MACHINE}-pc-cygwin
+	exit ;;
+    i*:MINGW*:*)
+	echo ${UNAME_MACHINE}-pc-mingw32
+	exit ;;
+    i*:windows32*:*)
+    	# uname -m includes "-pc" on this system.
+    	echo ${UNAME_MACHINE}-mingw32
+	exit ;;
+    i*:PW*:*)
+	echo ${UNAME_MACHINE}-pc-pw32
+	exit ;;
+    x86:Interix*:[3456]*)
+	echo i586-pc-interix${UNAME_RELEASE}
+	exit ;;
+    EM64T:Interix*:[3456]*)
+	echo x86_64-unknown-interix${UNAME_RELEASE}
+	exit ;;
+    [345]86:Windows_95:* | [345]86:Windows_98:* | [345]86:Windows_NT:*)
+	echo i${UNAME_MACHINE}-pc-mks
+	exit ;;
+    i*:Windows_NT*:* | Pentium*:Windows_NT*:*)
+	# How do we know it's Interix rather than the generic POSIX subsystem?
+	# It also conflicts with pre-2.0 versions of AT&T UWIN. Should we
+	# change UNAME_MACHINE based on the output of uname instead of i386?
+	echo i586-pc-interix
+	exit ;;
+    i*:UWIN*:*)
+	echo ${UNAME_MACHINE}-pc-uwin
+	exit ;;
+    amd64:CYGWIN*:*:* | x86_64:CYGWIN*:*:*)
+	echo x86_64-unknown-cygwin
+	exit ;;
+    p*:CYGWIN*:*)
+	echo powerpcle-unknown-cygwin
+	exit ;;
+    prep*:SunOS:5.*:*)
+	echo powerpcle-unknown-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+	exit ;;
+    *:GNU:*:*)
+	# the GNU system
+	echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'`
+	exit ;;
+    *:GNU/*:*:*)
+	# other systems with GNU libc and userland
+	echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-gnu
+	exit ;;
+    i*86:Minix:*:*)
+	echo ${UNAME_MACHINE}-pc-minix
+	exit ;;
+    arm*:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    avr32*:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    cris:Linux:*:*)
+	echo cris-axis-linux-gnu
+	exit ;;
+    crisv32:Linux:*:*)
+	echo crisv32-axis-linux-gnu
+	exit ;;
+    frv:Linux:*:*)
+    	echo frv-unknown-linux-gnu
+	exit ;;
+    ia64:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    m32r*:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    m68*:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    mips:Linux:*:*)
+	eval $set_cc_for_build
+	sed 's/^	//' << EOF >$dummy.c
+	#undef CPU
+	#undef mips
+	#undef mipsel
+	#if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL)
+	CPU=mipsel
+	#else
+	#if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB)
+	CPU=mips
+	#else
+	CPU=
+	#endif
+	#endif
+EOF
+	eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n '
+	    /^CPU/{
+		s: ::g
+		p
+	    }'`"
+	test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; }
+	;;
+    mips64:Linux:*:*)
+	eval $set_cc_for_build
+	sed 's/^	//' << EOF >$dummy.c
+	#undef CPU
+	#undef mips64
+	#undef mips64el
+	#if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL)
+	CPU=mips64el
+	#else
+	#if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB)
+	CPU=mips64
+	#else
+	CPU=
+	#endif
+	#endif
+EOF
+	eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n '
+	    /^CPU/{
+		s: ::g
+		p
+	    }'`"
+	test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; }
+	;;
+    or32:Linux:*:*)
+	echo or32-unknown-linux-gnu
+	exit ;;
+    ppc:Linux:*:*)
+	echo powerpc-unknown-linux-gnu
+	exit ;;
+    ppc64:Linux:*:*)
+	echo powerpc64-unknown-linux-gnu
+	exit ;;
+    alpha:Linux:*:*)
+	case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in
+	  EV5)   UNAME_MACHINE=alphaev5 ;;
+	  EV56)  UNAME_MACHINE=alphaev56 ;;
+	  PCA56) UNAME_MACHINE=alphapca56 ;;
+	  PCA57) UNAME_MACHINE=alphapca56 ;;
+	  EV6)   UNAME_MACHINE=alphaev6 ;;
+	  EV67)  UNAME_MACHINE=alphaev67 ;;
+	  EV68*) UNAME_MACHINE=alphaev68 ;;
+        esac
+	objdump --private-headers /bin/sh | grep ld.so.1 >/dev/null
+	if test "$?" = 0 ; then LIBC="libc1" ; else LIBC="" ; fi
+	echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC}
+	exit ;;
+    parisc:Linux:*:* | hppa:Linux:*:*)
+	# Look for CPU level
+	case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in
+	  PA7*) echo hppa1.1-unknown-linux-gnu ;;
+	  PA8*) echo hppa2.0-unknown-linux-gnu ;;
+	  *)    echo hppa-unknown-linux-gnu ;;
+	esac
+	exit ;;
+    parisc64:Linux:*:* | hppa64:Linux:*:*)
+	echo hppa64-unknown-linux-gnu
+	exit ;;
+    s390:Linux:*:* | s390x:Linux:*:*)
+	echo ${UNAME_MACHINE}-ibm-linux
+	exit ;;
+    sh64*:Linux:*:*)
+    	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    sh*:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    sparc:Linux:*:* | sparc64:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    vax:Linux:*:*)
+	echo ${UNAME_MACHINE}-dec-linux-gnu
+	exit ;;
+    x86_64:Linux:*:*)
+	echo x86_64-unknown-linux-gnu
+	exit ;;
+    i*86:Linux:*:*)
+	# The BFD linker knows what the default object file format is, so
+	# first see if it will tell us. cd to the root directory to prevent
+	# problems with other programs or directories called `ld' in the path.
+	# Set LC_ALL=C to ensure ld outputs messages in English.
+	ld_supported_targets=`cd /; LC_ALL=C ld --help 2>&1 \
+			 | sed -ne '/supported targets:/!d
+				    s/[ 	][ 	]*/ /g
+				    s/.*supported targets: *//
+				    s/ .*//
+				    p'`
+        case "$ld_supported_targets" in
+	  elf32-i386)
+		TENTATIVE="${UNAME_MACHINE}-pc-linux-gnu"
+		;;
+	  a.out-i386-linux)
+		echo "${UNAME_MACHINE}-pc-linux-gnuaout"
+		exit ;;
+	  coff-i386)
+		echo "${UNAME_MACHINE}-pc-linux-gnucoff"
+		exit ;;
+	  "")
+		# Either a pre-BFD a.out linker (linux-gnuoldld) or
+		# one that does not give us useful --help.
+		echo "${UNAME_MACHINE}-pc-linux-gnuoldld"
+		exit ;;
+	esac
+	# Determine whether the default compiler is a.out or elf
+	eval $set_cc_for_build
+	sed 's/^	//' << EOF >$dummy.c
+	#include <features.h>
+	#ifdef __ELF__
+	# ifdef __GLIBC__
+	#  if __GLIBC__ >= 2
+	LIBC=gnu
+	#  else
+	LIBC=gnulibc1
+	#  endif
+	# else
+	LIBC=gnulibc1
+	# endif
+	#else
+	#if defined(__INTEL_COMPILER) || defined(__PGI) || defined(__SUNPRO_C) || defined(__SUNPRO_CC)
+	LIBC=gnu
+	#else
+	LIBC=gnuaout
+	#endif
+	#endif
+	#ifdef __dietlibc__
+	LIBC=dietlibc
+	#endif
+EOF
+	eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n '
+	    /^LIBC/{
+		s: ::g
+		p
+	    }'`"
+	test x"${LIBC}" != x && {
+		echo "${UNAME_MACHINE}-pc-linux-${LIBC}"
+		exit
+	}
+	test x"${TENTATIVE}" != x && { echo "${TENTATIVE}"; exit; }
+	;;
+    i*86:DYNIX/ptx:4*:*)
+	# ptx 4.0 does uname -s correctly, with DYNIX/ptx in there.
+	# earlier versions are messed up and put the nodename in both
+	# sysname and nodename.
+	echo i386-sequent-sysv4
+	exit ;;
+    i*86:UNIX_SV:4.2MP:2.*)
+        # Unixware is an offshoot of SVR4, but it has its own version
+        # number series starting with 2...
+        # I am not positive that other SVR4 systems won't match this,
+	# I just have to hope.  -- rms.
+        # Use sysv4.2uw... so that sysv4* matches it.
+	echo ${UNAME_MACHINE}-pc-sysv4.2uw${UNAME_VERSION}
+	exit ;;
+    i*86:OS/2:*:*)
+	# If we were able to find `uname', then EMX Unix compatibility
+	# is probably installed.
+	echo ${UNAME_MACHINE}-pc-os2-emx
+	exit ;;
+    i*86:XTS-300:*:STOP)
+	echo ${UNAME_MACHINE}-unknown-stop
+	exit ;;
+    i*86:atheos:*:*)
+	echo ${UNAME_MACHINE}-unknown-atheos
+	exit ;;
+    i*86:syllable:*:*)
+	echo ${UNAME_MACHINE}-pc-syllable
+	exit ;;
+    i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.0*:*)
+	echo i386-unknown-lynxos${UNAME_RELEASE}
+	exit ;;
+    i*86:*DOS:*:*)
+	echo ${UNAME_MACHINE}-pc-msdosdjgpp
+	exit ;;
+    i*86:*:4.*:* | i*86:SYSTEM_V:4.*:*)
+	UNAME_REL=`echo ${UNAME_RELEASE} | sed 's/\/MP$//'`
+	if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then
+		echo ${UNAME_MACHINE}-univel-sysv${UNAME_REL}
+	else
+		echo ${UNAME_MACHINE}-pc-sysv${UNAME_REL}
+	fi
+	exit ;;
+    i*86:*:5:[678]*)
+    	# UnixWare 7.x, OpenUNIX and OpenServer 6.
+	case `/bin/uname -X | grep "^Machine"` in
+	    *486*)	     UNAME_MACHINE=i486 ;;
+	    *Pentium)	     UNAME_MACHINE=i586 ;;
+	    *Pent*|*Celeron) UNAME_MACHINE=i686 ;;
+	esac
+	echo ${UNAME_MACHINE}-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION}
+	exit ;;
+    i*86:*:3.2:*)
+	if test -f /usr/options/cb.name; then
+		UNAME_REL=`sed -n 's/.*Version //p' </usr/options/cb.name`
+		echo ${UNAME_MACHINE}-pc-isc$UNAME_REL
+	elif /bin/uname -X 2>/dev/null >/dev/null ; then
+		UNAME_REL=`(/bin/uname -X|grep Release|sed -e 's/.*= //')`
+		(/bin/uname -X|grep i80486 >/dev/null) && UNAME_MACHINE=i486
+		(/bin/uname -X|grep '^Machine.*Pentium' >/dev/null) \
+			&& UNAME_MACHINE=i586
+		(/bin/uname -X|grep '^Machine.*Pent *II' >/dev/null) \
+			&& UNAME_MACHINE=i686
+		(/bin/uname -X|grep '^Machine.*Pentium Pro' >/dev/null) \
+			&& UNAME_MACHINE=i686
+		echo ${UNAME_MACHINE}-pc-sco$UNAME_REL
+	else
+		echo ${UNAME_MACHINE}-pc-sysv32
+	fi
+	exit ;;
+    pc:*:*:*)
+	# Left here for compatibility:
+        # uname -m prints for DJGPP always 'pc', but it prints nothing about
+        # the processor, so we play safe by assuming i386.
+	echo i386-pc-msdosdjgpp
+        exit ;;
+    Intel:Mach:3*:*)
+	echo i386-pc-mach3
+	exit ;;
+    paragon:*:*:*)
+	echo i860-intel-osf1
+	exit ;;
+    i860:*:4.*:*) # i860-SVR4
+	if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then
+	  echo i860-stardent-sysv${UNAME_RELEASE} # Stardent Vistra i860-SVR4
+	else # Add other i860-SVR4 vendors below as they are discovered.
+	  echo i860-unknown-sysv${UNAME_RELEASE}  # Unknown i860-SVR4
+	fi
+	exit ;;
+    mini*:CTIX:SYS*5:*)
+	# "miniframe"
+	echo m68010-convergent-sysv
+	exit ;;
+    mc68k:UNIX:SYSTEM5:3.51m)
+	echo m68k-convergent-sysv
+	exit ;;
+    M680?0:D-NIX:5.3:*)
+	echo m68k-diab-dnix
+	exit ;;
+    M68*:*:R3V[5678]*:*)
+	test -r /sysV68 && { echo 'm68k-motorola-sysv'; exit; } ;;
+    3[345]??:*:4.0:3.0 | 3[34]??A:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 3[34]??/*:*:4.0:3.0 | 4400:*:4.0:3.0 | 4850:*:4.0:3.0 | SKA40:*:4.0:3.0 | SDS2:*:4.0:3.0 | SHG2:*:4.0:3.0 | S7501*:*:4.0:3.0)
+	OS_REL=''
+	test -r /etc/.relid \
+	&& OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid`
+	/bin/uname -p 2>/dev/null | grep 86 >/dev/null \
+	  && { echo i486-ncr-sysv4.3${OS_REL}; exit; }
+	/bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \
+	  && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;;
+    3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*)
+        /bin/uname -p 2>/dev/null | grep 86 >/dev/null \
+          && { echo i486-ncr-sysv4; exit; } ;;
+    m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*)
+	echo m68k-unknown-lynxos${UNAME_RELEASE}
+	exit ;;
+    mc68030:UNIX_System_V:4.*:*)
+	echo m68k-atari-sysv4
+	exit ;;
+    TSUNAMI:LynxOS:2.*:*)
+	echo sparc-unknown-lynxos${UNAME_RELEASE}
+	exit ;;
+    rs6000:LynxOS:2.*:*)
+	echo rs6000-unknown-lynxos${UNAME_RELEASE}
+	exit ;;
+    PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.0*:*)
+	echo powerpc-unknown-lynxos${UNAME_RELEASE}
+	exit ;;
+    SM[BE]S:UNIX_SV:*:*)
+	echo mips-dde-sysv${UNAME_RELEASE}
+	exit ;;
+    RM*:ReliantUNIX-*:*:*)
+	echo mips-sni-sysv4
+	exit ;;
+    RM*:SINIX-*:*:*)
+	echo mips-sni-sysv4
+	exit ;;
+    *:SINIX-*:*:*)
+	if uname -p 2>/dev/null >/dev/null ; then
+		UNAME_MACHINE=`(uname -p) 2>/dev/null`
+		echo ${UNAME_MACHINE}-sni-sysv4
+	else
+		echo ns32k-sni-sysv
+	fi
+	exit ;;
+    PENTIUM:*:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort
+                      # says <Richard.M.Bartel@ccMail.Census.GOV>
+        echo i586-unisys-sysv4
+        exit ;;
+    *:UNIX_System_V:4*:FTX*)
+	# From Gerald Hewes <hewes@openmarket.com>.
+	# How about differentiating between stratus architectures? -djm
+	echo hppa1.1-stratus-sysv4
+	exit ;;
+    *:*:*:FTX*)
+	# From seanf@swdc.stratus.com.
+	echo i860-stratus-sysv4
+	exit ;;
+    i*86:VOS:*:*)
+	# From Paul.Green@stratus.com.
+	echo ${UNAME_MACHINE}-stratus-vos
+	exit ;;
+    *:VOS:*:*)
+	# From Paul.Green@stratus.com.
+	echo hppa1.1-stratus-vos
+	exit ;;
+    mc68*:A/UX:*:*)
+	echo m68k-apple-aux${UNAME_RELEASE}
+	exit ;;
+    news*:NEWS-OS:6*:*)
+	echo mips-sony-newsos6
+	exit ;;
+    R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*)
+	if [ -d /usr/nec ]; then
+	        echo mips-nec-sysv${UNAME_RELEASE}
+	else
+	        echo mips-unknown-sysv${UNAME_RELEASE}
+	fi
+        exit ;;
+    BeBox:BeOS:*:*)	# BeOS running on hardware made by Be, PPC only.
+	echo powerpc-be-beos
+	exit ;;
+    BeMac:BeOS:*:*)	# BeOS running on Mac or Mac clone, PPC only.
+	echo powerpc-apple-beos
+	exit ;;
+    BePC:BeOS:*:*)	# BeOS running on Intel PC compatible.
+	echo i586-pc-beos
+	exit ;;
+    SX-4:SUPER-UX:*:*)
+	echo sx4-nec-superux${UNAME_RELEASE}
+	exit ;;
+    SX-5:SUPER-UX:*:*)
+	echo sx5-nec-superux${UNAME_RELEASE}
+	exit ;;
+    SX-6:SUPER-UX:*:*)
+	echo sx6-nec-superux${UNAME_RELEASE}
+	exit ;;
+    Power*:Rhapsody:*:*)
+	echo powerpc-apple-rhapsody${UNAME_RELEASE}
+	exit ;;
+    *:Rhapsody:*:*)
+	echo ${UNAME_MACHINE}-apple-rhapsody${UNAME_RELEASE}
+	exit ;;
+    *:Darwin:*:*)
+	UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown
+	case $UNAME_PROCESSOR in
+	    unknown) UNAME_PROCESSOR=powerpc ;;
+	esac
+	echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE}
+	exit ;;
+    *:procnto*:*:* | *:QNX:[0123456789]*:*)
+	UNAME_PROCESSOR=`uname -p`
+	if test "$UNAME_PROCESSOR" = "x86"; then
+		UNAME_PROCESSOR=i386
+		UNAME_MACHINE=pc
+	fi
+	echo ${UNAME_PROCESSOR}-${UNAME_MACHINE}-nto-qnx${UNAME_RELEASE}
+	exit ;;
+    *:QNX:*:4*)
+	echo i386-pc-qnx
+	exit ;;
+    NSE-?:NONSTOP_KERNEL:*:*)
+	echo nse-tandem-nsk${UNAME_RELEASE}
+	exit ;;
+    NSR-?:NONSTOP_KERNEL:*:*)
+	echo nsr-tandem-nsk${UNAME_RELEASE}
+	exit ;;
+    *:NonStop-UX:*:*)
+	echo mips-compaq-nonstopux
+	exit ;;
+    BS2000:POSIX*:*:*)
+	echo bs2000-siemens-sysv
+	exit ;;
+    DS/*:UNIX_System_V:*:*)
+	echo ${UNAME_MACHINE}-${UNAME_SYSTEM}-${UNAME_RELEASE}
+	exit ;;
+    *:Plan9:*:*)
+	# "uname -m" is not consistent, so use $cputype instead. 386
+	# is converted to i386 for consistency with other x86
+	# operating systems.
+	if test "$cputype" = "386"; then
+	    UNAME_MACHINE=i386
+	else
+	    UNAME_MACHINE="$cputype"
+	fi
+	echo ${UNAME_MACHINE}-unknown-plan9
+	exit ;;
+    *:TOPS-10:*:*)
+	echo pdp10-unknown-tops10
+	exit ;;
+    *:TENEX:*:*)
+	echo pdp10-unknown-tenex
+	exit ;;
+    KS10:TOPS-20:*:* | KL10:TOPS-20:*:* | TYPE4:TOPS-20:*:*)
+	echo pdp10-dec-tops20
+	exit ;;
+    XKL-1:TOPS-20:*:* | TYPE5:TOPS-20:*:*)
+	echo pdp10-xkl-tops20
+	exit ;;
+    *:TOPS-20:*:*)
+	echo pdp10-unknown-tops20
+	exit ;;
+    *:ITS:*:*)
+	echo pdp10-unknown-its
+	exit ;;
+    SEI:*:*:SEIUX)
+        echo mips-sei-seiux${UNAME_RELEASE}
+	exit ;;
+    *:DragonFly:*:*)
+	echo ${UNAME_MACHINE}-unknown-dragonfly`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`
+	exit ;;
+    *:*VMS:*:*)
+    	UNAME_MACHINE=`(uname -p) 2>/dev/null`
+	case "${UNAME_MACHINE}" in
+	    A*) echo alpha-dec-vms ; exit ;;
+	    I*) echo ia64-dec-vms ; exit ;;
+	    V*) echo vax-dec-vms ; exit ;;
+	esac ;;
+    *:XENIX:*:SysV)
+	echo i386-pc-xenix
+	exit ;;
+    i*86:skyos:*:*)
+	echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE}` | sed -e 's/ .*$//'
+	exit ;;
+    i*86:rdos:*:*)
+	echo ${UNAME_MACHINE}-pc-rdos
+	exit ;;
+esac
+
+#echo '(No uname command or uname output not recognized.)' 1>&2
+#echo "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" 1>&2
+
+eval $set_cc_for_build
+cat >$dummy.c <<EOF
+#ifdef _SEQUENT_
+# include <sys/types.h>
+# include <sys/utsname.h>
+#endif
+main ()
+{
+#if defined (sony)
+#if defined (MIPSEB)
+  /* BFD wants "bsd" instead of "newsos".  Perhaps BFD should be changed,
+     I don't know....  */
+  printf ("mips-sony-bsd\n"); exit (0);
+#else
+#include <sys/param.h>
+  printf ("m68k-sony-newsos%s\n",
+#ifdef NEWSOS4
+          "4"
+#else
+	  ""
+#endif
+         ); exit (0);
+#endif
+#endif
+
+#if defined (__arm) && defined (__acorn) && defined (__unix)
+  printf ("arm-acorn-riscix\n"); exit (0);
+#endif
+
+#if defined (hp300) && !defined (hpux)
+  printf ("m68k-hp-bsd\n"); exit (0);
+#endif
+
+#if defined (NeXT)
+#if !defined (__ARCHITECTURE__)
+#define __ARCHITECTURE__ "m68k"
+#endif
+  int version;
+  version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`;
+  if (version < 4)
+    printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version);
+  else
+    printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version);
+  exit (0);
+#endif
+
+#if defined (MULTIMAX) || defined (n16)
+#if defined (UMAXV)
+  printf ("ns32k-encore-sysv\n"); exit (0);
+#else
+#if defined (CMU)
+  printf ("ns32k-encore-mach\n"); exit (0);
+#else
+  printf ("ns32k-encore-bsd\n"); exit (0);
+#endif
+#endif
+#endif
+
+#if defined (__386BSD__)
+  printf ("i386-pc-bsd\n"); exit (0);
+#endif
+
+#if defined (sequent)
+#if defined (i386)
+  printf ("i386-sequent-dynix\n"); exit (0);
+#endif
+#if defined (ns32000)
+  printf ("ns32k-sequent-dynix\n"); exit (0);
+#endif
+#endif
+
+#if defined (_SEQUENT_)
+    struct utsname un;
+
+    uname(&un);
+
+    if (strncmp(un.version, "V2", 2) == 0) {
+	printf ("i386-sequent-ptx2\n"); exit (0);
+    }
+    if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? */
+	printf ("i386-sequent-ptx1\n"); exit (0);
+    }
+    printf ("i386-sequent-ptx\n"); exit (0);
+
+#endif
+
+#if defined (vax)
+# if !defined (ultrix)
+#  include <sys/param.h>
+#  if defined (BSD)
+#   if BSD == 43
+      printf ("vax-dec-bsd4.3\n"); exit (0);
+#   else
+#    if BSD == 199006
+      printf ("vax-dec-bsd4.3reno\n"); exit (0);
+#    else
+      printf ("vax-dec-bsd\n"); exit (0);
+#    endif
+#   endif
+#  else
+    printf ("vax-dec-bsd\n"); exit (0);
+#  endif
+# else
+    printf ("vax-dec-ultrix\n"); exit (0);
+# endif
+#endif
+
+#if defined (alliant) && defined (i860)
+  printf ("i860-alliant-bsd\n"); exit (0);
+#endif
+
+  exit (1);
+}
+EOF
+
+$CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null && SYSTEM_NAME=`$dummy` &&
+	{ echo "$SYSTEM_NAME"; exit; }	# probe compiled and exited 0: print its answer and stop here
+
+# Apollos put the system type in the environment.
+
+test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit; }
+
+# Convex versions that predate uname can use getsysinfo(1)
+
+if [ -x /usr/convex/getsysinfo ]	# NOTE(review): guard tests the absolute path, but the calls below find getsysinfo via PATH
+then
+    case `getsysinfo -f cpu_type` in
+    c1*)
+	echo c1-convex-bsd
+	exit ;;
+    c2*)
+	if getsysinfo -f scalar_acc	# only the exit status is used, to pick c32 vs. c2
+	then echo c32-convex-bsd
+	else echo c2-convex-bsd
+	fi
+	exit ;;
+    c34*)
+	echo c34-convex-bsd
+	exit ;;
+    c38*)
+	echo c38-convex-bsd
+	exit ;;
+    c4*)
+	echo c4-convex-bsd
+	exit ;;
+    esac
+fi	# unrecognized cpu_type: continue to the failure report below
+
+cat >&2 <<EOF
+$0: unable to guess system type
+
+This script, last modified $timestamp, has failed to recognize
+the operating system you are using. It is advised that you
+download the most up to date version of the config scripts from
+
+  https://git.savannah.gnu.org/cgit/config.git/plain/config.guess
+and
+  https://git.savannah.gnu.org/cgit/config.git/plain/config.sub
+
+If the version you run ($0) is already up to date, please
+send the following data and any information you think might be
+pertinent to <config-patches@gnu.org> in order to provide the needed
+information to handle your system.
+
+config.guess timestamp = $timestamp
+
+uname -m = `(uname -m) 2>/dev/null || echo unknown`
+uname -r = `(uname -r) 2>/dev/null || echo unknown`
+uname -s = `(uname -s) 2>/dev/null || echo unknown`
+uname -v = `(uname -v) 2>/dev/null || echo unknown`
+
+/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null`
+/bin/uname -X     = `(/bin/uname -X) 2>/dev/null`
+
+hostinfo               = `(hostinfo) 2>/dev/null`
+/bin/universe          = `(/bin/universe) 2>/dev/null`
+/usr/bin/arch -k       = `(/usr/bin/arch -k) 2>/dev/null`
+/bin/arch              = `(/bin/arch) 2>/dev/null`
+/usr/bin/oslevel       = `(/usr/bin/oslevel) 2>/dev/null`
+/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null`
+
+UNAME_MACHINE = ${UNAME_MACHINE}
+UNAME_RELEASE = ${UNAME_RELEASE}
+UNAME_SYSTEM  = ${UNAME_SYSTEM}
+UNAME_VERSION = ${UNAME_VERSION}
+EOF
+
+exit 1	# every heuristic failed: the diagnostic went to stderr, signal failure to the caller
+
+# Local variables:
+# eval: (add-hook 'write-file-hooks 'time-stamp)
+# time-stamp-start: "timestamp='"
+# time-stamp-format: "%:y-%02m-%02d"
+# time-stamp-end: "'"
+# End:
diff --git a/openmp-avx512/basic/optional/ThreadPool/config/config.sub b/openmp-avx512/basic/optional/ThreadPool/config/config.sub
new file mode 100755
index 0000000..fab0aa3
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/config/config.sub
@@ -0,0 +1,1616 @@
+#! /bin/sh
+# Configuration validation subroutine script.
+#   Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
+#   2000, 2001, 2002, 2003, 2004, 2005, 2006 Free Software Foundation,
+#   Inc.
+
+timestamp='2006-09-20'
+
+# This file is (in principle) common to ALL GNU software.
+# The presence of a machine in this file suggests that SOME GNU software
+# can handle that machine.  It does not imply ALL GNU software can.
+#
+# This file is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+# 02110-1301, USA.
+#
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+
+# Please send patches to <config-patches@gnu.org>.  Submit a context
+# diff and a properly formatted ChangeLog entry.
+#
+# Configuration subroutine to validate and canonicalize a configuration type.
+# Supply the specified configuration type as an argument.
+# If it is invalid, we print an error message on stderr and exit with code 1.
+# Otherwise, we print the canonical config type on stdout and succeed.
+
+# This file is supposed to be the same for all GNU packages
+# and recognize all the CPU types, system types and aliases
+# that are meaningful with *any* GNU software.
+# Each package is responsible for reporting which valid configurations
+# it does not support.  The user should be able to distinguish
+# a failure to support a valid configuration from a meaningless
+# configuration.
+
+# The goal of this file is to map all the various variations of a given
+# machine specification into a single specification in the form:
+#	CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM
+# or in some cases, the newer four-part form:
+#	CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM
+# It is wrong to echo any other type of specification.
+
+me=`echo "$0" | sed -e 's,.*/,,'`
+
+usage="\
+Usage: $0 [OPTION] CPU-MFR-OPSYS
+       $0 [OPTION] ALIAS
+
+Canonicalize a configuration name.
+
+Operation modes:
+  -h, --help         print this help, then exit
+  -t, --time-stamp   print date of last modification, then exit
+  -v, --version      print version number, then exit
+
+Report bugs and patches to <config-patches@gnu.org>."
+
+version="\
+GNU config.sub ($timestamp)
+
+Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005
+Free Software Foundation, Inc.
+
+This is free software; see the source for copying conditions.  There is NO
+warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
+
+help="
+Try \`$me --help' for more information."
+
+# Parse command line
+while test $# -gt 0 ; do	# every arm below exits or breaks, so only the first word is ever examined
+  case $1 in
+    --time-stamp | --time* | -t )
+       echo "$timestamp" ; exit ;;
+    --version | -v )
+       echo "$version" ; exit ;;
+    --help | --h* | -h )
+       echo "$usage"; exit ;;
+    -- )     # Stop option processing
+       shift; break ;;
+    - )	# Use stdin as input.
+       break ;;
+    -* )
+       echo "$me: invalid option $1$help" >&2	# diagnostics go to stderr, like the argument-count errors
+       exit 1 ;;
+
+    *local*)
+       # First pass through any local machine types (quoted: no word splitting or globbing).
+       echo "$1"
+       exit ;;
+
+    * )
+       break ;;
+  esac
+done
+
+if test $# -eq 0; then	# nothing left to canonicalize
+    echo "$me: missing argument$help" >&2
+    exit 1
+elif test $# -ne 1; then	# exactly one configuration name is accepted
+    echo "$me: too many arguments$help" >&2
+    exit 1
+fi
+
+# Separate what the user gave into CPU-COMPANY and OS or KERNEL-OS (if any).
+# Here we must recognize all the valid KERNEL-OS combinations.
+maybe_os=`echo "$1" | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'`
+case $maybe_os in
+  nto-qnx* | linux-gnu* | linux-dietlibc | linux-newlib* | linux-uclibc* | \
+  uclinux-uclibc* | uclinux-gnu* | kfreebsd*-gnu* | knetbsd*-gnu* | netbsd*-gnu* | \
+  storm-chaos* | os2-emx* | rtmk-nova*)
+    os=-$maybe_os	# recognized two-word KERNEL-OS suffix: keep both words as the OS
+    basic_machine=`echo "$1" | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'`
+    ;;
+  *)
+    basic_machine=`echo "$1" | sed 's/-[^-]*$//'`	# drop the last dash-separated word
+    if [ "$basic_machine" != "$1" ]	# quoted so whitespace or glob characters cannot split the test
+    then os=`echo "$1" | sed 's/.*-/-/'`
+    else os=; fi
+    ;;
+esac
+
+### Let's recognize common machines as not being operating systems so
+### that things like config.sub decstation-3100 work.  We also
+### recognize some manufacturers as not being operating systems, so we
+### can provide default operating systems below.
+case $os in
+	-sun*os*)
+		# Prevent following clause from handling this invalid input.
+		;;
+	-dec* | -mips* | -sequent* | -encore* | -pc532* | -sgi* | -sony* | \
+	-att* | -7300* | -3300* | -delta* | -motorola* | -sun[234]* | \
+	-unicom* | -ibm* | -next | -hp | -isi* | -apollo | -altos* | \
+	-convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\
+	-c[123]* | -convex* | -sun | -crds | -omron* | -dg | -ultra | -tti* | \
+	-harris | -dolphin | -highlevel | -gould | -cbm | -ns | -masscomp | \
+	-apple | -axis | -knuth | -cray)
+		os=	# the last word was a machine or vendor name, not an OS
+		basic_machine=$1
+		;;
+	-sim | -cisco | -oki | -wec | -winbond)
+		os=
+		basic_machine=$1
+		;;
+	-scout)
+		;;	# leave $os and $basic_machine exactly as split above
+	-wrs)
+		os=-vxworks
+		basic_machine=$1
+		;;
+	-chorusos*)
+		os=-chorusos
+		basic_machine=$1
+		;;
+ 	-chorusrdb)
+ 		os=-chorusrdb
+		basic_machine=$1
+ 		;;
+	-hiux*)
+		os=-hiuxwe2
+		;;
+	-sco6)
+		os=-sco5v6
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`	# ...86-<anything> becomes ...86-pc
+		;;
+	-sco5)
+		os=-sco3.2v5
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
+	-sco4)
+		os=-sco3.2v4
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
+	-sco3.2.[4-9]*)
+		os=`echo $os | sed -e 's/sco3.2./sco3.2v/'`	# e.g. -sco3.2.4 -> -sco3.2v4
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
+	-sco3.2v[4-9]*)
+		# Don't forget version if it is 3.2v4 or newer.
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
+	-sco5v6*)
+		# Keep $os as given; only the machine part is rewritten here.
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
+	-sco*)
+		os=-sco3.2v2
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
+	-udk*)
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
+	-isc)
+		os=-isc2.2
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
+	-clix*)
+		basic_machine=clipper-intergraph
+		;;
+	-isc*)
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
+	-lynx*)
+		os=-lynxos
+		;;
+	-ptx*)
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-sequent/'`
+		;;
+	-windowsnt*)
+		os=`echo $os | sed -e 's/windowsnt/winnt/'`
+		;;
+	-psos*)
+		os=-psos
+		;;
+	-mint | -mint[0-9]*)
+		basic_machine=m68k-atari
+		os=-mint
+		;;
+esac
+
+# Decode aliases for certain CPU-COMPANY combinations.
+case $basic_machine in
+	# Recognize the basic CPU types without company name.
+	# Some are omitted here because they have special meanings below.
+	1750a | 580 \
+	| a29k \
+	| alpha | alphaev[4-8] | alphaev56 | alphaev6[78] | alphapca5[67] \
+	| alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] | alpha64pca5[67] \
+	| am33_2.0 \
+	| arc | arm | arm[bl]e | arme[lb] | armv[2345] | armv[345][lb] | avr | avr32 \
+	| bfin \
+	| c4x | clipper \
+	| d10v | d30v | dlx | dsp16xx \
+	| fr30 | frv \
+	| h8300 | h8500 | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \
+	| i370 | i860 | i960 | ia64 \
+	| ip2k | iq2000 \
+	| m32c | m32r | m32rle | m68000 | m68k | m88k \
+	| maxq | mb | microblaze | mcore \
+	| mips | mipsbe | mipseb | mipsel | mipsle \
+	| mips16 \
+	| mips64 | mips64el \
+	| mips64vr | mips64vrel \
+	| mips64orion | mips64orionel \
+	| mips64vr4100 | mips64vr4100el \
+	| mips64vr4300 | mips64vr4300el \
+	| mips64vr5000 | mips64vr5000el \
+	| mips64vr5900 | mips64vr5900el \
+	| mipsisa32 | mipsisa32el \
+	| mipsisa32r2 | mipsisa32r2el \
+	| mipsisa64 | mipsisa64el \
+	| mipsisa64r2 | mipsisa64r2el \
+	| mipsisa64sb1 | mipsisa64sb1el \
+	| mipsisa64sr71k | mipsisa64sr71kel \
+	| mipstx39 | mipstx39el \
+	| mn10200 | mn10300 \
+	| mt \
+	| msp430 \
+	| nios | nios2 \
+	| ns16k | ns32k \
+	| or32 \
+	| pdp10 | pdp11 | pj | pjl \
+	| powerpc | powerpc64 | powerpc64le | powerpcle | ppcbe \
+	| pyramid \
+	| score \
+	| sh | sh[1234] | sh[24]a | sh[23]e | sh[34]eb | sheb | shbe | shle | sh[1234]le | sh3ele \
+	| sh64 | sh64le \
+	| sparc | sparc64 | sparc64b | sparc64v | sparc86x | sparclet | sparclite \
+	| sparcv8 | sparcv9 | sparcv9b | sparcv9v \
+	| spu | strongarm \
+	| tahoe | thumb | tic4x | tic80 | tron \
+	| v850 | v850e \
+	| we32k \
+	| x86 | xc16x | xscale | xscalee[bl] | xstormy16 | xtensa \
+	| z8k)
+		basic_machine=$basic_machine-unknown
+		;;
+	m6811 | m68hc11 | m6812 | m68hc12)
+		# Motorola 68HC11/12.
+		basic_machine=$basic_machine-unknown
+		os=-none
+		;;
+	m88110 | m680[12346]0 | m683?2 | m68360 | m5200 | v70 | w65 | z8k)
+		;;
+	ms1)
+		basic_machine=mt-unknown
+		;;
+
+	# We use `pc' rather than `unknown'
+	# because (1) that's what they normally are, and
+	# (2) the word "unknown" tends to confuse beginning users.
+	i*86 | x86_64)
+	  basic_machine=$basic_machine-pc
+	  ;;
+	# Object if more than one company name word.
+	*-*-*)
+		echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2
+		exit 1
+		;;
+	# Recognize the basic CPU types with company name.
+	580-* \
+	| a29k-* \
+	| alpha-* | alphaev[4-8]-* | alphaev56-* | alphaev6[78]-* \
+	| alpha64-* | alpha64ev[4-8]-* | alpha64ev56-* | alpha64ev6[78]-* \
+	| alphapca5[67]-* | alpha64pca5[67]-* | arc-* \
+	| arm-*  | armbe-* | armle-* | armeb-* | armv*-* \
+	| avr-* | avr32-* \
+	| bfin-* | bs2000-* \
+	| c[123]* | c30-* | [cjt]90-* | c4x-* | c54x-* | c55x-* | c6x-* \
+	| clipper-* | craynv-* | cydra-* \
+	| d10v-* | d30v-* | dlx-* \
+	| elxsi-* \
+	| f30[01]-* | f700-* | fr30-* | frv-* | fx80-* \
+	| h8300-* | h8500-* \
+	| hppa-* | hppa1.[01]-* | hppa2.0-* | hppa2.0[nw]-* | hppa64-* \
+	| i*86-* | i860-* | i960-* | ia64-* \
+	| ip2k-* | iq2000-* \
+	| m32c-* | m32r-* | m32rle-* \
+	| m68000-* | m680[012346]0-* | m68360-* | m683?2-* | m68k-* \
+	| m88110-* | m88k-* | maxq-* | mcore-* \
+	| mips-* | mipsbe-* | mipseb-* | mipsel-* | mipsle-* \
+	| mips16-* \
+	| mips64-* | mips64el-* \
+	| mips64vr-* | mips64vrel-* \
+	| mips64orion-* | mips64orionel-* \
+	| mips64vr4100-* | mips64vr4100el-* \
+	| mips64vr4300-* | mips64vr4300el-* \
+	| mips64vr5000-* | mips64vr5000el-* \
+	| mips64vr5900-* | mips64vr5900el-* \
+	| mipsisa32-* | mipsisa32el-* \
+	| mipsisa32r2-* | mipsisa32r2el-* \
+	| mipsisa64-* | mipsisa64el-* \
+	| mipsisa64r2-* | mipsisa64r2el-* \
+	| mipsisa64sb1-* | mipsisa64sb1el-* \
+	| mipsisa64sr71k-* | mipsisa64sr71kel-* \
+	| mipstx39-* | mipstx39el-* \
+	| mmix-* \
+	| mt-* \
+	| msp430-* \
+	| nios-* | nios2-* \
+	| none-* | np1-* | ns16k-* | ns32k-* \
+	| orion-* \
+	| pdp10-* | pdp11-* | pj-* | pjl-* | pn-* | power-* \
+	| powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* | ppcbe-* \
+	| pyramid-* \
+	| romp-* | rs6000-* \
+	| sh-* | sh[1234]-* | sh[24]a-* | sh[23]e-* | sh[34]eb-* | sheb-* | shbe-* \
+	| shle-* | sh[1234]le-* | sh3ele-* | sh64-* | sh64le-* \
+	| sparc-* | sparc64-* | sparc64b-* | sparc64v-* | sparc86x-* | sparclet-* \
+	| sparclite-* \
+	| sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | strongarm-* | sv1-* | sx?-* \
+	| tahoe-* | thumb-* \
+	| tic30-* | tic4x-* | tic54x-* | tic55x-* | tic6x-* | tic80-* \
+	| tron-* \
+	| v850-* | v850e-* | vax-* \
+	| we32k-* \
+	| x86-* | x86_64-* | xc16x-* | xps100-* | xscale-* | xscalee[bl]-* \
+	| xstormy16-* | xtensa-* \
+	| ymp-* \
+	| z8k-*)
+		;;
+	# Recognize the various machine names and aliases which stand
+	# for a CPU type and a company and sometimes even an OS.
+	386bsd)
+		basic_machine=i386-unknown
+		os=-bsd
+		;;
+	3b1 | 7300 | 7300-att | att-7300 | pc7300 | safari | unixpc)
+		basic_machine=m68000-att
+		;;
+	3b*)
+		basic_machine=we32k-att
+		;;
+	a29khif)
+		basic_machine=a29k-amd
+		os=-udi
+		;;
+    	abacus)
+		basic_machine=abacus-unknown
+		;;
+	adobe68k)
+		basic_machine=m68010-adobe
+		os=-scout
+		;;
+	alliant | fx80)
+		basic_machine=fx80-alliant
+		;;
+	altos | altos3068)
+		basic_machine=m68k-altos
+		;;
+	am29k)
+		basic_machine=a29k-none
+		os=-bsd
+		;;
+	amd64)
+		basic_machine=x86_64-pc
+		;;
+	amd64-*)
+		basic_machine=x86_64-`echo $basic_machine | sed 's/^[^-]*-//'`
+		;;
+	amdahl)
+		basic_machine=580-amdahl
+		os=-sysv
+		;;
+	amiga | amiga-*)
+		basic_machine=m68k-unknown
+		;;
+	amigaos | amigados)
+		basic_machine=m68k-unknown
+		os=-amigaos
+		;;
+	amigaunix | amix)
+		basic_machine=m68k-unknown
+		os=-sysv4
+		;;
+	apollo68)
+		basic_machine=m68k-apollo
+		os=-sysv
+		;;
+	apollo68bsd)
+		basic_machine=m68k-apollo
+		os=-bsd
+		;;
+	aux)
+		basic_machine=m68k-apple
+		os=-aux
+		;;
+	balance)
+		basic_machine=ns32k-sequent
+		os=-dynix
+		;;
+	c90)
+		basic_machine=c90-cray
+		os=-unicos
+		;;
+	convex-c1)
+		basic_machine=c1-convex
+		os=-bsd
+		;;
+	convex-c2)
+		basic_machine=c2-convex
+		os=-bsd
+		;;
+	convex-c32)
+		basic_machine=c32-convex
+		os=-bsd
+		;;
+	convex-c34)
+		basic_machine=c34-convex
+		os=-bsd
+		;;
+	convex-c38)
+		basic_machine=c38-convex
+		os=-bsd
+		;;
+	cray | j90)
+		basic_machine=j90-cray
+		os=-unicos
+		;;
+	craynv)
+		basic_machine=craynv-cray
+		os=-unicosmp
+		;;
+	cr16c)
+		basic_machine=cr16c-unknown
+		os=-elf
+		;;
+	crds | unos)
+		basic_machine=m68k-crds
+		;;
+	crisv32 | crisv32-* | etraxfs*)
+		basic_machine=crisv32-axis
+		;;
+	cris | cris-* | etrax*)
+		basic_machine=cris-axis
+		;;
+	crx)
+		basic_machine=crx-unknown
+		os=-elf
+		;;
+	da30 | da30-*)
+		basic_machine=m68k-da30
+		;;
+	decstation | decstation-3100 | pmax | pmax-* | pmin | dec3100 | decstatn)
+		basic_machine=mips-dec
+		;;
+	decsystem10* | dec10*)
+		basic_machine=pdp10-dec
+		os=-tops10
+		;;
+	decsystem20* | dec20*)
+		basic_machine=pdp10-dec
+		os=-tops20
+		;;
+	delta | 3300 | motorola-3300 | motorola-delta \
+	      | 3300-motorola | delta-motorola)
+		basic_machine=m68k-motorola
+		;;
+	delta88)
+		basic_machine=m88k-motorola
+		os=-sysv3
+		;;
+	djgpp)
+		basic_machine=i586-pc
+		os=-msdosdjgpp
+		;;
+	dpx20 | dpx20-*)
+		basic_machine=rs6000-bull
+		os=-bosx
+		;;
+	dpx2* | dpx2*-bull)
+		basic_machine=m68k-bull
+		os=-sysv3
+		;;
+	ebmon29k)
+		basic_machine=a29k-amd
+		os=-ebmon
+		;;
+	elxsi)
+		basic_machine=elxsi-elxsi
+		os=-bsd
+		;;
+	encore | umax | mmax)
+		basic_machine=ns32k-encore
+		;;
+	es1800 | OSE68k | ose68k | ose | OSE)
+		basic_machine=m68k-ericsson
+		os=-ose
+		;;
+	fx2800)
+		basic_machine=i860-alliant
+		;;
+	genix)
+		basic_machine=ns32k-ns
+		;;
+	gmicro)
+		basic_machine=tron-gmicro
+		os=-sysv
+		;;
+	go32)
+		basic_machine=i386-pc
+		os=-go32
+		;;
+	h3050r* | hiux*)
+		basic_machine=hppa1.1-hitachi
+		os=-hiuxwe2
+		;;
+	h8300hms)
+		basic_machine=h8300-hitachi
+		os=-hms
+		;;
+	h8300xray)
+		basic_machine=h8300-hitachi
+		os=-xray
+		;;
+	h8500hms)
+		basic_machine=h8500-hitachi
+		os=-hms
+		;;
+	harris)
+		basic_machine=m88k-harris
+		os=-sysv3
+		;;
+	hp300-*)
+		basic_machine=m68k-hp
+		;;
+	hp300bsd)
+		basic_machine=m68k-hp
+		os=-bsd
+		;;
+	hp300hpux)
+		basic_machine=m68k-hp
+		os=-hpux
+		;;
+	hp3k9[0-9][0-9] | hp9[0-9][0-9])
+		basic_machine=hppa1.0-hp
+		;;
+	hp9k2[0-9][0-9] | hp9k31[0-9])
+		basic_machine=m68000-hp
+		;;
+	hp9k3[2-9][0-9])
+		basic_machine=m68k-hp
+		;;
+	hp9k6[0-9][0-9] | hp6[0-9][0-9])
+		basic_machine=hppa1.0-hp
+		;;
+	hp9k7[0-79][0-9] | hp7[0-79][0-9])
+		basic_machine=hppa1.1-hp
+		;;
+	hp9k78[0-9] | hp78[0-9])
+		# FIXME: really hppa2.0-hp
+		basic_machine=hppa1.1-hp
+		;;
+	hp9k8[67]1 | hp8[67]1 | hp9k80[24] | hp80[24] | hp9k8[78]9 | hp8[78]9 | hp9k893 | hp893)
+		# FIXME: really hppa2.0-hp
+		basic_machine=hppa1.1-hp
+		;;
+	hp9k8[0-9][13679] | hp8[0-9][13679])
+		basic_machine=hppa1.1-hp
+		;;
+	hp9k8[0-9][0-9] | hp8[0-9][0-9])
+		basic_machine=hppa1.0-hp
+		;;
+	hppa-next)
+		os=-nextstep3
+		;;
+	hppaosf)
+		basic_machine=hppa1.1-hp
+		os=-osf
+		;;
+	hppro)
+		basic_machine=hppa1.1-hp
+		os=-proelf
+		;;
+	i370-ibm* | ibm*)
+		basic_machine=i370-ibm
+		;;
+# I'm not sure what "Sysv32" means.  Should this be sysv3.2?
+	i*86v32)
+		basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
+		os=-sysv32
+		;;
+	i*86v4*)
+		basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
+		os=-sysv4
+		;;
+	i*86v)
+		basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
+		os=-sysv
+		;;
+	i*86sol2)
+		basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
+		os=-solaris2
+		;;
+	i386mach)
+		basic_machine=i386-mach
+		os=-mach
+		;;
+	i386-vsta | vsta)
+		basic_machine=i386-unknown
+		os=-vsta
+		;;
+	iris | iris4d)
+		basic_machine=mips-sgi
+		case $os in
+		    -irix*)
+			;;
+		    *)
+			os=-irix4
+			;;
+		esac
+		;;
+	isi68 | isi)
+		basic_machine=m68k-isi
+		os=-sysv
+		;;
+	m88k-omron*)
+		basic_machine=m88k-omron
+		;;
+	magnum | m3230)
+		basic_machine=mips-mips
+		os=-sysv
+		;;
+	merlin)
+		basic_machine=ns32k-utek
+		os=-sysv
+		;;
+	mingw32)
+		basic_machine=i386-pc
+		os=-mingw32
+		;;
+	miniframe)
+		basic_machine=m68000-convergent
+		;;
+	*mint | -mint[0-9]* | *MiNT | *MiNT[0-9]*)
+		basic_machine=m68k-atari
+		os=-mint
+		;;
+	mips3*-*)
+		basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`
+		;;
+	mips3*)
+		basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`-unknown
+		;;
+	monitor)
+		basic_machine=m68k-rom68k
+		os=-coff
+		;;
+	morphos)
+		basic_machine=powerpc-unknown
+		os=-morphos
+		;;
+	msdos)
+		basic_machine=i386-pc
+		os=-msdos
+		;;
+	ms1-*)
+		basic_machine=`echo $basic_machine | sed -e 's/ms1-/mt-/'`
+		;;
+	mvs)
+		basic_machine=i370-ibm
+		os=-mvs
+		;;
+	ncr3000)
+		basic_machine=i486-ncr
+		os=-sysv4
+		;;
+	netbsd386)
+		basic_machine=i386-unknown
+		os=-netbsd
+		;;
+	netwinder)
+		basic_machine=armv4l-rebel
+		os=-linux
+		;;
+	news | news700 | news800 | news900)
+		basic_machine=m68k-sony
+		os=-newsos
+		;;
+	news1000)
+		basic_machine=m68030-sony
+		os=-newsos
+		;;
+	news-3600 | risc-news)
+		basic_machine=mips-sony
+		os=-newsos
+		;;
+	necv70)
+		basic_machine=v70-nec
+		os=-sysv
+		;;
+	next | m*-next )
+		basic_machine=m68k-next
+		case $os in
+		    -nextstep* )
+			;;
+		    -ns2*)
+		      os=-nextstep2
+			;;
+		    *)
+		      os=-nextstep3
+			;;
+		esac
+		;;
+	nh3000)
+		basic_machine=m68k-harris
+		os=-cxux
+		;;
+	nh[45]000)
+		basic_machine=m88k-harris
+		os=-cxux
+		;;
+	nindy960)
+		basic_machine=i960-intel
+		os=-nindy
+		;;
+	mon960)
+		basic_machine=i960-intel
+		os=-mon960
+		;;
+	nonstopux)
+		basic_machine=mips-compaq
+		os=-nonstopux
+		;;
+	np1)
+		basic_machine=np1-gould
+		;;
+	nsr-tandem)
+		basic_machine=nsr-tandem
+		;;
+	op50n-* | op60c-*)
+		basic_machine=hppa1.1-oki
+		os=-proelf
+		;;
+	openrisc | openrisc-*)
+		basic_machine=or32-unknown
+		;;
+	os400)
+		basic_machine=powerpc-ibm
+		os=-os400
+		;;
+	OSE68000 | ose68000)
+		basic_machine=m68000-ericsson
+		os=-ose
+		;;
+	os68k)
+		basic_machine=m68k-none
+		os=-os68k
+		;;
+	pa-hitachi)
+		basic_machine=hppa1.1-hitachi
+		os=-hiuxwe2
+		;;
+	paragon)
+		basic_machine=i860-intel
+		os=-osf
+		;;
+	pbd)
+		basic_machine=sparc-tti
+		;;
+	pbb)
+		basic_machine=m68k-tti
+		;;
+	pc532 | pc532-*)
+		basic_machine=ns32k-pc532
+		;;
+	pc98)
+		basic_machine=i386-pc
+		;;
+	pc98-*)
+		basic_machine=i386-`echo $basic_machine | sed 's/^[^-]*-//'`
+		;;
+	pentium | p5 | k5 | k6 | nexgen | viac3)
+		basic_machine=i586-pc
+		;;
+	pentiumpro | p6 | 6x86 | athlon | athlon_*)
+		basic_machine=i686-pc
+		;;
+	pentiumii | pentium2 | pentiumiii | pentium3)
+		basic_machine=i686-pc
+		;;
+	pentium4)
+		basic_machine=i786-pc
+		;;
+	pentium-* | p5-* | k5-* | k6-* | nexgen-* | viac3-*)
+		basic_machine=i586-`echo $basic_machine | sed 's/^[^-]*-//'`
+		;;
+	pentiumpro-* | p6-* | 6x86-* | athlon-*)
+		basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'`
+		;;
+	pentiumii-* | pentium2-* | pentiumiii-* | pentium3-*)
+		basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'`
+		;;
+	pentium4-*)
+		basic_machine=i786-`echo $basic_machine | sed 's/^[^-]*-//'`
+		;;
+	pn)
+		basic_machine=pn-gould
+		;;
+	power)	basic_machine=power-ibm
+		;;
+	ppc)	basic_machine=powerpc-unknown
+		;;
+	ppc-*)	basic_machine=powerpc-`echo $basic_machine | sed 's/^[^-]*-//'`
+		;;
+	ppcle | powerpclittle | ppc-le | powerpc-little)
+		basic_machine=powerpcle-unknown
+		;;
+	ppcle-* | powerpclittle-*)
+		basic_machine=powerpcle-`echo $basic_machine | sed 's/^[^-]*-//'`
+		;;
+	ppc64)	basic_machine=powerpc64-unknown
+		;;
+	ppc64-*) basic_machine=powerpc64-`echo $basic_machine | sed 's/^[^-]*-//'`
+		;;
+	ppc64le | powerpc64little | ppc64-le | powerpc64-little)
+		basic_machine=powerpc64le-unknown
+		;;
+	ppc64le-* | powerpc64little-*)
+		basic_machine=powerpc64le-`echo $basic_machine | sed 's/^[^-]*-//'`
+		;;
+	ps2)
+		basic_machine=i386-ibm
+		;;
+	pw32)
+		basic_machine=i586-unknown
+		os=-pw32
+		;;
+	rdos)
+		basic_machine=i386-pc
+		os=-rdos
+		;;
+	rom68k)
+		basic_machine=m68k-rom68k
+		os=-coff
+		;;
+	rm[46]00)
+		basic_machine=mips-siemens
+		;;
+	rtpc | rtpc-*)
+		basic_machine=romp-ibm
+		;;
+	s390 | s390-*)
+		basic_machine=s390-ibm
+		;;
+	s390x | s390x-*)
+		basic_machine=s390x-ibm
+		;;
+	sa29200)
+		basic_machine=a29k-amd
+		os=-udi
+		;;
+	sb1)
+		basic_machine=mipsisa64sb1-unknown
+		;;
+	sb1el)
+		basic_machine=mipsisa64sb1el-unknown
+		;;
+	sde)
+		basic_machine=mipsisa32-sde
+		os=-elf
+		;;
+	sei)
+		basic_machine=mips-sei
+		os=-seiux
+		;;
+	sequent)
+		basic_machine=i386-sequent
+		;;
+	sh)
+		basic_machine=sh-hitachi
+		os=-hms
+		;;
+	sh64)
+		basic_machine=sh64-unknown
+		;;
+	sparclite-wrs | simso-wrs)
+		basic_machine=sparclite-wrs
+		os=-vxworks
+		;;
+	sps7)
+		basic_machine=m68k-bull
+		os=-sysv2
+		;;
+	spur)
+		basic_machine=spur-unknown
+		;;
+	st2000)
+		basic_machine=m68k-tandem
+		;;
+	stratus)
+		basic_machine=i860-stratus
+		os=-sysv4
+		;;
+	sun2)
+		basic_machine=m68000-sun
+		;;
+	sun2os3)
+		basic_machine=m68000-sun
+		os=-sunos3
+		;;
+	sun2os4)
+		basic_machine=m68000-sun
+		os=-sunos4
+		;;
+	sun3os3)
+		basic_machine=m68k-sun
+		os=-sunos3
+		;;
+	sun3os4)
+		basic_machine=m68k-sun
+		os=-sunos4
+		;;
+	sun4os3)
+		basic_machine=sparc-sun
+		os=-sunos3
+		;;
+	sun4os4)
+		basic_machine=sparc-sun
+		os=-sunos4
+		;;
+	sun4sol2)
+		basic_machine=sparc-sun
+		os=-solaris2
+		;;
+	sun3 | sun3-*)
+		basic_machine=m68k-sun
+		;;
+	sun4)
+		basic_machine=sparc-sun
+		;;
+	sun386 | sun386i | roadrunner)
+		basic_machine=i386-sun
+		;;
+	sv1)
+		basic_machine=sv1-cray
+		os=-unicos
+		;;
+	symmetry)
+		basic_machine=i386-sequent
+		os=-dynix
+		;;
+	t3e)
+		basic_machine=alphaev5-cray
+		os=-unicos
+		;;
+	t90)
+		basic_machine=t90-cray
+		os=-unicos
+		;;
+	tic54x | c54x*)
+		basic_machine=tic54x-unknown
+		os=-coff
+		;;
+	tic55x | c55x*)
+		basic_machine=tic55x-unknown
+		os=-coff
+		;;
+	tic6x | c6x*)
+		basic_machine=tic6x-unknown
+		os=-coff
+		;;
+	tx39)
+		basic_machine=mipstx39-unknown
+		;;
+	tx39el)
+		basic_machine=mipstx39el-unknown
+		;;
+	toad1)
+		basic_machine=pdp10-xkl
+		os=-tops20
+		;;
+	tower | tower-32)
+		basic_machine=m68k-ncr
+		;;
+	tpf)
+		basic_machine=s390x-ibm
+		os=-tpf
+		;;
+	udi29k)
+		basic_machine=a29k-amd
+		os=-udi
+		;;
+	ultra3)
+		basic_machine=a29k-nyu
+		os=-sym1
+		;;
+	v810 | necv810)
+		basic_machine=v810-nec
+		os=-none
+		;;
+	vaxv)
+		basic_machine=vax-dec
+		os=-sysv
+		;;
+	vms)
+		basic_machine=vax-dec
+		os=-vms
+		;;
+	vpp*|vx|vx-*)
+		basic_machine=f301-fujitsu
+		;;
+	vxworks960)
+		basic_machine=i960-wrs
+		os=-vxworks
+		;;
+	vxworks68)
+		basic_machine=m68k-wrs
+		os=-vxworks
+		;;
+	vxworks29k)
+		basic_machine=a29k-wrs
+		os=-vxworks
+		;;
+	w65*)
+		basic_machine=w65-wdc
+		os=-none
+		;;
+	w89k-*)
+		basic_machine=hppa1.1-winbond
+		os=-proelf
+		;;
+	xbox)
+		basic_machine=i686-pc
+		os=-mingw32
+		;;
+	xps | xps100)
+		basic_machine=xps100-honeywell
+		;;
+	ymp)
+		basic_machine=ymp-cray
+		os=-unicos
+		;;
+	z8k-*-coff)
+		basic_machine=z8k-unknown
+		os=-sim
+		;;
+	none)
+		basic_machine=none-none
+		os=-none
+		;;
+
+# Here we handle the default manufacturer of certain CPU types.  It is in
+# some cases the only manufacturer, in others, it is the most popular.
+	w89k)
+		basic_machine=hppa1.1-winbond
+		;;
+	op50n)
+		basic_machine=hppa1.1-oki
+		;;
+	op60c)
+		basic_machine=hppa1.1-oki
+		;;
+	romp)
+		basic_machine=romp-ibm
+		;;
+	mmix)
+		basic_machine=mmix-knuth
+		;;
+	rs6000)
+		basic_machine=rs6000-ibm
+		;;
+	vax)
+		basic_machine=vax-dec
+		;;
+	pdp10)
+		# there are many clones, so DEC is not a safe bet
+		basic_machine=pdp10-unknown
+		;;
+	pdp11)
+		basic_machine=pdp11-dec
+		;;
+	we32k)
+		basic_machine=we32k-att
+		;;
+	sh[1234] | sh[24]a | sh[34]eb | sh[1234]le | sh[23]ele)
+		basic_machine=sh-unknown
+		;;
+	sparc | sparcv8 | sparcv9 | sparcv9b | sparcv9v)
+		basic_machine=sparc-sun
+		;;
+	cydra)
+		basic_machine=cydra-cydrome
+		;;
+	orion)
+		basic_machine=orion-highlevel
+		;;
+	orion105)
+		basic_machine=clipper-highlevel
+		;;
+	mac | mpw | mac-mpw)
+		basic_machine=m68k-apple
+		;;
+	pmac | pmac-mpw)
+		basic_machine=powerpc-apple
+		;;
+	*-unknown)
+		# Make sure to match an already-canonicalized machine name.
+		;;
+	*)
+		echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2
+		exit 1
+		;;
+esac
+
+# Here we canonicalize certain aliases for manufacturers.
+case $basic_machine in
+	*-digital*)
+		basic_machine=`echo $basic_machine | sed 's/digital.*/dec/'`
+		;;
+	*-commodore*)
+		basic_machine=`echo $basic_machine | sed 's/commodore.*/cbm/'`
+		;;
+	*)
+		;;
+esac
+
+# Decode manufacturer-specific aliases for certain operating systems.
+
+if [ x"$os" != x"" ]
+then
+case $os in
+        # First match some system type aliases
+        # that might get confused with valid system types.
+	# -solaris* is a basic system type, with this one exception.
+	-solaris1 | -solaris1.*)
+		os=`echo $os | sed -e 's|solaris1|sunos4|'`
+		;;
+	-solaris)
+		os=-solaris2
+		;;
+	-svr4*)
+		os=-sysv4
+		;;
+	-unixware*)
+		os=-sysv4.2uw
+		;;
+	-gnu/linux*)
+		os=`echo $os | sed -e 's|gnu/linux|linux-gnu|'`
+		;;
+	# First accept the basic system types.
+	# The portable systems come first.
+	# Each alternative MUST END IN A *, to match a version number.
+	# -sysv* is not here because it comes later, after sysvr4.
+	-gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \
+	      | -*vms* | -sco* | -esix* | -isc* | -aix* | -sunos | -sunos[34]*\
+	      | -hpux* | -unos* | -osf* | -luna* | -dgux* | -solaris* | -sym* \
+	      | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \
+	      | -aos* \
+	      | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \
+	      | -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \
+	      | -hiux* | -386bsd* | -knetbsd* | -mirbsd* | -netbsd* \
+	      | -openbsd* | -solidbsd* \
+	      | -ekkobsd* | -kfreebsd* | -freebsd* | -riscix* | -lynxos* \
+	      | -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \
+	      | -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \
+	      | -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* \
+	      | -chorusos* | -chorusrdb* \
+	      | -cygwin* | -pe* | -psos* | -moss* | -proelf* | -rtems* \
+	      | -mingw32* | -linux-gnu* | -linux-newlib* | -linux-uclibc* \
+	      | -uxpv* | -beos* | -mpeix* | -udk* \
+	      | -interix* | -uwin* | -mks* | -rhapsody* | -darwin* | -opened* \
+	      | -openstep* | -oskit* | -conix* | -pw32* | -nonstopux* \
+	      | -storm-chaos* | -tops10* | -tenex* | -tops20* | -its* \
+	      | -os2* | -vos* | -palmos* | -uclinux* | -nucleus* \
+	      | -morphos* | -superux* | -rtmk* | -rtmk-nova* | -windiss* \
+	      | -powermax* | -dnix* | -nx6 | -nx7 | -sei* | -dragonfly* \
+	      | -skyos* | -haiku* | -rdos* | -toppers*)
+	# Remember, each alternative MUST END IN *, to match a version number.
+		;;
+	-qnx*)
+		case $basic_machine in
+		    x86-* | i*86-*)
+			;;
+		    *)
+			os=-nto$os
+			;;
+		esac
+		;;
+	-nto-qnx*)
+		;;
+	-nto*)
+		os=`echo $os | sed -e 's|nto|nto-qnx|'`
+		;;
+	-sim | -es1800* | -hms* | -xray | -os68k* | -none* | -v88r* \
+	      | -windows* | -osx | -abug | -netware* | -os9* | -beos* | -haiku* \
+	      | -macos* | -mpw* | -magic* | -mmixware* | -mon960* | -lnews*)
+		;;
+	-mac*)
+		os=`echo $os | sed -e 's|mac|macos|'`
+		;;
+	-linux-dietlibc)
+		os=-linux-dietlibc
+		;;
+	-linux*)
+		os=`echo $os | sed -e 's|linux|linux-gnu|'`
+		;;
+	-sunos5*)
+		os=`echo $os | sed -e 's|sunos5|solaris2|'`
+		;;
+	-sunos6*)
+		os=`echo $os | sed -e 's|sunos6|solaris3|'`
+		;;
+	-opened*)
+		os=-openedition
+		;;
+        -os400*)
+		os=-os400
+		;;
+	-wince*)
+		os=-wince
+		;;
+	-osfrose*)
+		os=-osfrose
+		;;
+	-osf*)
+		os=-osf
+		;;
+	-utek*)
+		os=-bsd
+		;;
+	-dynix*)
+		os=-bsd
+		;;
+	-acis*)
+		os=-aos
+		;;
+	-atheos*)
+		os=-atheos
+		;;
+	-syllable*)
+		os=-syllable
+		;;
+	-386bsd)
+		os=-bsd
+		;;
+	-ctix* | -uts*)
+		os=-sysv
+		;;
+	-nova*)
+		os=-rtmk-nova
+		;;
+	-ns2 )
+		os=-nextstep2
+		;;
+	-nsk*)
+		os=-nsk
+		;;
+	# Preserve the version number of sinix5.
+	-sinix5.*)
+		os=`echo $os | sed -e 's|sinix|sysv|'`
+		;;
+	-sinix*)
+		os=-sysv4
+		;;
+        -tpf*)
+		os=-tpf
+		;;
+	-triton*)
+		os=-sysv3
+		;;
+	-oss*)
+		os=-sysv3
+		;;
+	-svr4)
+		os=-sysv4
+		;;
+	-svr3)
+		os=-sysv3
+		;;
+	-sysvr4)
+		os=-sysv4
+		;;
+	# This must come after -sysvr4.
+	-sysv*)
+		;;
+	-ose*)
+		os=-ose
+		;;
+	-es1800*)
+		os=-ose
+		;;
+	-xenix)
+		os=-xenix
+		;;
+	-*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*)
+		os=-mint
+		;;
+	-aros*)
+		os=-aros
+		;;
+	-kaos*)
+		os=-kaos
+		;;
+	-zvmoe)
+		os=-zvmoe
+		;;
+	-none)
+		;;
+	*)
+		# Get rid of the `-' at the beginning of $os.
+		os=`echo $os | sed 's/[^-]*-//'`
+		echo Invalid configuration \`$1\': system \`$os\' not recognized 1>&2
+		exit 1
+		;;
+esac
+else
+
+# Here we handle the default operating systems that come with various machines.
+# The value should be what the vendor currently ships out the door with their
+# machine or put another way, the most popular os provided with the machine.
+
+# Note that if you're going to try to match "-MANUFACTURER" here (say,
+# "-sun"), then you have to tell the case statement up towards the top
+# that MANUFACTURER isn't an operating system.  Otherwise, code above
+# will signal an error saying that MANUFACTURER isn't an operating
+# system, and we'll never get to this point.
+
+case $basic_machine in
+        score-*)
+		os=-elf
+		;;
+        spu-*)
+		os=-elf
+		;;
+	*-acorn)
+		os=-riscix1.2
+		;;
+	arm*-rebel)
+		os=-linux
+		;;
+	arm*-semi)
+		os=-aout
+		;;
+        c4x-* | tic4x-*)
+        	os=-coff
+		;;
+	# This must come before the *-dec entry.
+	pdp10-*)
+		os=-tops20
+		;;
+	pdp11-*)
+		os=-none
+		;;
+	*-dec | vax-*)
+		os=-ultrix4.2
+		;;
+	m68*-apollo)
+		os=-domain
+		;;
+	i386-sun)
+		os=-sunos4.0.2
+		;;
+	m68000-sun)
+		os=-sunos3
+		# This also exists in the configure program, but was not the
+		# default.
+		# os=-sunos4
+		;;
+	m68*-cisco)
+		os=-aout
+		;;
+	mips*-cisco)
+		os=-elf
+		;;
+	mips*-*)
+		os=-elf
+		;;
+	or32-*)
+		os=-coff
+		;;
+	*-tti)	# must be before sparc entry or we get the wrong os.
+		os=-sysv3
+		;;
+	sparc-* | *-sun)
+		os=-sunos4.1.1
+		;;
+	*-be)
+		os=-beos
+		;;
+	*-haiku)
+		os=-haiku
+		;;
+	*-ibm)
+		os=-aix
+		;;
+    	*-knuth)
+		os=-mmixware
+		;;
+	*-wec)
+		os=-proelf
+		;;
+	*-winbond)
+		os=-proelf
+		;;
+	*-oki)
+		os=-proelf
+		;;
+	*-hp)
+		os=-hpux
+		;;
+	*-hitachi)
+		os=-hiux
+		;;
+	i860-* | *-att | *-ncr | *-altos | *-motorola | *-convergent)
+		os=-sysv
+		;;
+	*-cbm)
+		os=-amigaos
+		;;
+	*-dg)
+		os=-dgux
+		;;
+	*-dolphin)
+		os=-sysv3
+		;;
+	m68k-ccur)
+		os=-rtu
+		;;
+	m88k-omron*)
+		os=-luna
+		;;
+	*-next )
+		os=-nextstep
+		;;
+	*-sequent)
+		os=-ptx
+		;;
+	*-crds)
+		os=-unos
+		;;
+	*-ns)
+		os=-genix
+		;;
+	i370-*)
+		os=-mvs
+		;;
+	*-next)
+		os=-nextstep3
+		;;
+	*-gould)
+		os=-sysv
+		;;
+	*-highlevel)
+		os=-bsd
+		;;
+	*-encore)
+		os=-bsd
+		;;
+	*-sgi)
+		os=-irix
+		;;
+	*-siemens)
+		os=-sysv4
+		;;
+	*-masscomp)
+		os=-rtu
+		;;
+	f30[01]-fujitsu | f700-fujitsu)
+		os=-uxpv
+		;;
+	*-rom68k)
+		os=-coff
+		;;
+	*-*bug)
+		os=-coff
+		;;
+	*-apple)
+		os=-macos
+		;;
+	*-atari*)
+		os=-mint
+		;;
+	*)
+		os=-none
+		;;
+esac
+fi
+
+# Here we handle the case where we know the os, and the CPU type, but not the
+# manufacturer.  We pick the logical manufacturer.
+vendor=unknown
+case $basic_machine in
+	*-unknown)
+		case $os in
+			-riscix*)
+				vendor=acorn
+				;;
+			-sunos*)
+				vendor=sun
+				;;
+			-aix*)
+				vendor=ibm
+				;;
+			-beos*)
+				vendor=be
+				;;
+			-hpux*)
+				vendor=hp
+				;;
+			-mpeix*)
+				vendor=hp
+				;;
+			-hiux*)
+				vendor=hitachi
+				;;
+			-unos*)
+				vendor=crds
+				;;
+			-dgux*)
+				vendor=dg
+				;;
+			-luna*)
+				vendor=omron
+				;;
+			-genix*)
+				vendor=ns
+				;;
+			-mvs* | -opened*)
+				vendor=ibm
+				;;
+			-os400*)
+				vendor=ibm
+				;;
+			-ptx*)
+				vendor=sequent
+				;;
+			-tpf*)
+				vendor=ibm
+				;;
+			-vxsim* | -vxworks* | -windiss*)
+				vendor=wrs
+				;;
+			-aux*)
+				vendor=apple
+				;;
+			-hms*)
+				vendor=hitachi
+				;;
+			-mpw* | -macos*)
+				vendor=apple
+				;;
+			-*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*)
+				vendor=atari
+				;;
+			-vos*)
+				vendor=stratus
+				;;
+		esac
+		basic_machine=`echo $basic_machine | sed "s/unknown/$vendor/"`
+		;;
+esac
+
+echo $basic_machine$os
+exit
+
+# Local variables:
+# eval: (add-hook 'write-file-hooks 'time-stamp)
+# time-stamp-start: "timestamp='"
+# time-stamp-format: "%:y-%02m-%02d"
+# time-stamp-end: "'"
+# End:
diff --git a/openmp-avx512/basic/optional/ThreadPool/config/depcomp b/openmp-avx512/basic/optional/ThreadPool/config/depcomp
new file mode 100755
index 0000000..ca5ea4e
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/config/depcomp
@@ -0,0 +1,584 @@
+#! /bin/sh
+# depcomp - compile a program generating dependencies as side-effects
+
+scriptversion=2006-10-15.18
+
+# Copyright (C) 1999, 2000, 2003, 2004, 2005, 2006 Free Software
+# Foundation, Inc.
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+# 02110-1301, USA.
+
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+# Originally written by Alexandre Oliva <oliva@dcc.unicamp.br>.
+
+case $1 in
+  '')
+     echo "$0: No command.  Try \`$0 --help' for more information." 1>&2
+     exit 1;
+     ;;
+  -h | --h*)
+    cat <<\EOF
+Usage: depcomp [--help] [--version] PROGRAM [ARGS]
+
+Run PROGRAMS ARGS to compile a file, generating dependencies
+as side-effects.
+
+Environment variables:
+  depmode     Dependency tracking mode.
+  source      Source file read by `PROGRAMS ARGS'.
+  object      Object file output by `PROGRAMS ARGS'.
+  DEPDIR      directory where to store dependencies.
+  depfile     Dependency file to output.
+  tmpdepfile  Temporary file to use when outputing dependencies.
+  libtool     Whether libtool is used (yes/no).
+
+Report bugs to <bug-automake@gnu.org>.
+EOF
+    exit $?
+    ;;
+  -v | --v*)
+    echo "depcomp $scriptversion"
+    exit $?
+    ;;
+esac
+
+if test -z "$depmode" || test -z "$source" || test -z "$object"; then
+  echo "depcomp: Variables source, object and depmode must be set" 1>&2
+  exit 1
+fi
+
+# Dependencies for sub/bar.o or sub/bar.obj go into sub/.deps/bar.Po.
+depfile=${depfile-`echo "$object" |
+  sed 's|[^\\/]*$|'${DEPDIR-.deps}'/&|;s|\.\([^.]*\)$|.P\1|;s|Pobj$|Po|'`}
+tmpdepfile=${tmpdepfile-`echo "$depfile" | sed 's/\.\([^.]*\)$/.T\1/'`}
+
+rm -f "$tmpdepfile"
+
+# Some modes work just like other modes, but use different flags.  We
+# parameterize here, but still list the modes in the big case below,
+# to make depend.m4 easier to write.  Note that we *cannot* use a case
+# here, because this file can only contain one case statement.
+if test "$depmode" = hp; then
+  # HP compiler uses -M and no extra arg.
+  gccflag=-M
+  depmode=gcc
+fi
+
+if test "$depmode" = dashXmstdout; then
+   # This is just like dashmstdout with a different argument.
+   dashmflag=-xM
+   depmode=dashmstdout
+fi
+
+case "$depmode" in
+gcc3)
+## gcc 3 implements dependency tracking that does exactly what
+## we want.  Yay!  Note: for some reason libtool 1.4 doesn't like
+## it if -MD -MP comes after the -MF stuff.  Hmm.
+## Unfortunately, FreeBSD c89 acceptance of flags depends upon
+## the command line argument order; so add the flags where they
+## appear in depend2.am.  Note that the slowdown incurred here
+## affects only configure: in makefiles, %FASTDEP% shortcuts this.
+  for arg
+  do
+    case $arg in
+    -c) set fnord "$@" -MT "$object" -MD -MP -MF "$tmpdepfile" "$arg" ;;
+    *)  set fnord "$@" "$arg" ;;
+    esac
+    shift # fnord
+    shift # $arg
+  done
+  "$@"
+  stat=$?
+  if test $stat -eq 0; then :
+  else
+    rm -f "$tmpdepfile"
+    exit $stat
+  fi
+  mv "$tmpdepfile" "$depfile"
+  ;;
+
+gcc)
+## There are various ways to get dependency output from gcc.  Here's
+## why we pick this rather obscure method:
+## - Don't want to use -MD because we'd like the dependencies to end
+##   up in a subdir.  Having to rename by hand is ugly.
+##   (We might end up doing this anyway to support other compilers.)
+## - The DEPENDENCIES_OUTPUT environment variable makes gcc act like
+##   -MM, not -M (despite what the docs say).
+## - Using -M directly means running the compiler twice (even worse
+##   than renaming).
+  if test -z "$gccflag"; then
+    gccflag=-MD,
+  fi
+  "$@" -Wp,"$gccflag$tmpdepfile"
+  stat=$?
+  if test $stat -eq 0; then :
+  else
+    rm -f "$tmpdepfile"
+    exit $stat
+  fi
+  rm -f "$depfile"
+  echo "$object : \\" > "$depfile"
+  alpha=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
+## The second -e expression handles DOS-style file names with drive letters.
+  sed -e 's/^[^:]*: / /' \
+      -e 's/^['$alpha']:\/[^:]*: / /' < "$tmpdepfile" >> "$depfile"
+## This next piece of magic avoids the `deleted header file' problem.
+## The problem is that when a header file which appears in a .P file
+## is deleted, the dependency causes make to die (because there is
+## typically no way to rebuild the header).  We avoid this by adding
+## dummy dependencies for each header file.  Too bad gcc doesn't do
+## this for us directly.
+  tr ' ' '
+' < "$tmpdepfile" |
+## Some versions of gcc put a space before the `:'.  On the theory
+## that the space means something, we add a space to the output as
+## well.
+## Some versions of the HPUX 10.20 sed can't process this invocation
+## correctly.  Breaking it into two sed invocations is a workaround.
+    sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile"
+  rm -f "$tmpdepfile"
+  ;;
+
+hp)
+  # This case exists only to let depend.m4 do its work.  It works by
+  # looking at the text of this script.  This case will never be run,
+  # since it is checked for above.
+  exit 1
+  ;;
+
+sgi)
+  if test "$libtool" = yes; then
+    "$@" "-Wp,-MDupdate,$tmpdepfile"
+  else
+    "$@" -MDupdate "$tmpdepfile"
+  fi
+  stat=$?  # exit status of the compile command run just above
+  if test $stat -eq 0; then :
+  else
+    rm -f "$tmpdepfile"
+    exit $stat
+  fi
+  rm -f "$depfile"
+  # $depfile is rebuilt below; it is quoted in every redirection so that a path with spaces works.
+  if test -f "$tmpdepfile"; then  # yes, the sourcefile depends on other files
+    echo "$object : \\" > "$depfile"
+
+    # Clip off the initial element (the dependent).  Don't try to be
+    # clever and replace this with sed code, as IRIX sed won't handle
+    # lines with more than a fixed number of characters (4096 in
+    # IRIX 6.2 sed, 8192 in IRIX 6.5).  We also remove comment lines;
+    # the IRIX cc adds comments like `#:fec' to the end of the
+    # dependency line.
+    tr ' ' '
+' < "$tmpdepfile" \
+    | sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' | \
+    tr '
+' ' ' >> "$depfile"
+    echo >> "$depfile"
+
+    # The second pass generates a dummy entry for each header file.
+    tr ' ' '
+' < "$tmpdepfile" \
+   | sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' -e 's/$/:/' \
+   >> "$depfile"
+  else
+    # The sourcefile does not contain any dependencies, so just
+    # store a dummy comment line, to avoid errors with the Makefile
+    # "include basename.Plo" scheme.
+    echo "#dummy" > "$depfile"
+  fi
+  rm -f "$tmpdepfile"
+  ;;
+
+aix)
+  # The C for AIX Compiler uses -M and outputs the dependencies
+  # in a .u file.  In older versions, this file always lives in the
+  # current directory.  Also, the AIX compiler puts `$object:' at the
+  # start of each line; $object doesn't have directory information.
+  # Version 6 uses the directory in both cases.
+  stripped=`echo "$object" | sed 's/\(.*\)\..*$/\1/'`
+  tmpdepfile="$stripped.u"
+  if test "$libtool" = yes; then
+    "$@" -Wc,-M
+  else
+    "$@" -M
+  fi
+  stat=$?
+
+  if test -f "$tmpdepfile"; then :
+  else
+    stripped=`echo "$stripped" | sed 's,^.*/,,'`
+    tmpdepfile="$stripped.u"
+  fi
+
+  if test $stat -eq 0; then :
+  else
+    rm -f "$tmpdepfile"
+    exit $stat
+  fi
+
+  if test -f "$tmpdepfile"; then
+    outname="$stripped.o"
+    # Each line is of the form `foo.o: dependent.h'.
+    # Do two passes, one to just change these to
+    # `$object: dependent.h' and one to simply `dependent.h:'.
+    sed -e "s,^$outname:,$object :," < "$tmpdepfile" > "$depfile"
+    sed -e "s,^$outname: \(.*\)$,\1:," < "$tmpdepfile" >> "$depfile"
+  else
+    # The sourcefile does not contain any dependencies, so just
+    # store a dummy comment line, to avoid errors with the Makefile
+    # "include basename.Plo" scheme.
+    echo "#dummy" > "$depfile"
+  fi
+  rm -f "$tmpdepfile"
+  ;;
+
+icc)
+  # Intel's C compiler understands `-MD -MF file'.  However on
+  #    icc -MD -MF foo.d -c -o sub/foo.o sub/foo.c
+  # ICC 7.0 will fill foo.d with something like
+  #    foo.o: sub/foo.c
+  #    foo.o: sub/foo.h
+  # which is wrong.  We want:
+  #    sub/foo.o: sub/foo.c
+  #    sub/foo.o: sub/foo.h
+  #    sub/foo.c:
+  #    sub/foo.h:
+  # ICC 7.1 will output
+  #    foo.o: sub/foo.c sub/foo.h
+  # and will wrap long lines using \ :
+  #    foo.o: sub/foo.c ... \
+  #     sub/foo.h ... \
+  #     ...
+
+  "$@" -MD -MF "$tmpdepfile"
+  stat=$?
+  if test $stat -eq 0; then :
+  else
+    rm -f "$tmpdepfile"
+    exit $stat
+  fi
+  rm -f "$depfile"
+  # Each line is of the form `foo.o: dependent.h',
+  # or `foo.o: dep1.h dep2.h \', or ` dep3.h dep4.h \'.
+  # Do two passes, one to just change these to
+  # `$object: dependent.h' and one to simply `dependent.h:'.
+  sed "s,^[^:]*:,$object :," < "$tmpdepfile" > "$depfile"
+  # Some versions of the HPUX 10.20 sed can't process this invocation
+  # correctly.  Breaking it into two sed invocations is a workaround.
+  sed 's,^[^:]*: \(.*\)$,\1,;s/^\\$//;/^$/d;/:$/d' < "$tmpdepfile" |
+    sed -e 's/$/ :/' >> "$depfile"
+  rm -f "$tmpdepfile"
+  ;;
+
+hp2)
+  # The "hp" stanza above does not work with aCC (C++) and HP's ia64
+  # compilers, which have integrated preprocessors.  The correct option
+  # to use with these is +Maked; it writes dependencies to a file named
+  # 'foo.d', which lands next to the object file, wherever that
+  # happens to be.
+  # Much of this is similar to the tru64 case; see comments there.
+  dir=`echo "$object" | sed -e 's|/[^/]*$|/|'`
+  test "x$dir" = "x$object" && dir=
+  base=`echo "$object" | sed -e 's|^.*/||' -e 's/\.o$//' -e 's/\.lo$//'`
+  if test "$libtool" = yes; then
+    tmpdepfile1=$dir$base.d
+    tmpdepfile2=$dir.libs/$base.d
+    "$@" -Wc,+Maked
+  else
+    tmpdepfile1=$dir$base.d
+    tmpdepfile2=$dir$base.d
+    "$@" +Maked
+  fi
+  stat=$?
+  if test $stat -eq 0; then :
+  else
+     rm -f "$tmpdepfile1" "$tmpdepfile2"
+     exit $stat
+  fi
+
+  for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2"
+  do
+    test -f "$tmpdepfile" && break
+  done
+  if test -f "$tmpdepfile"; then
+    sed -e "s,^.*\.[a-z]*:,$object:," "$tmpdepfile" > "$depfile"
+    # Add `dependent.h:' lines.
+    sed -ne '2,${; s/^ *//; s/ \\*$//; s/$/:/; p;}' "$tmpdepfile" >> "$depfile"
+  else
+    echo "#dummy" > "$depfile"
+  fi
+  rm -f "$tmpdepfile" "$tmpdepfile2"
+  ;;
+
+tru64)
+   # The Tru64 compiler uses -MD to generate dependencies as a side
+   # effect.  `cc -MD -o foo.o ...' puts the dependencies into `foo.o.d'.
+   # At least on Alpha/Redhat 6.1, Compaq CCC V6.2-504 seems to put
+   # dependencies in `foo.d' instead, so we check for that too.
+   # Subdirectories are respected.
+   dir=`echo "$object" | sed -e 's|/[^/]*$|/|'`
+   test "x$dir" = "x$object" && dir=
+   base=`echo "$object" | sed -e 's|^.*/||' -e 's/\.o$//' -e 's/\.lo$//'`
+
+   if test "$libtool" = yes; then
+      # With Tru64 cc, shared objects can also be used to make a
+      # static library.  This mechanism is used in libtool 1.4 series to
+      # handle both shared and static libraries in a single compilation.
+      # With libtool 1.4, dependencies were output in $dir.libs/$base.lo.d.
+      #
+      # With libtool 1.5 this exception was removed, and libtool now
+      # generates 2 separate objects for the 2 libraries.  These two
+      # compilations output dependencies in $dir.libs/$base.o.d and
+      # in $dir$base.o.d.  We have to check for both files, because
+      # one of the two compilations can be disabled.  We should prefer
+      # $dir$base.o.d over $dir.libs/$base.o.d because the latter is
+      # automatically cleaned when .libs/ is deleted, while ignoring
+      # the former would cause a distcleancheck panic.
+      tmpdepfile1=$dir.libs/$base.lo.d   # libtool 1.4
+      tmpdepfile2=$dir$base.o.d          # libtool 1.5
+      tmpdepfile3=$dir.libs/$base.o.d    # libtool 1.5
+      tmpdepfile4=$dir.libs/$base.d      # Compaq CCC V6.2-504
+      "$@" -Wc,-MD
+   else
+      tmpdepfile1=$dir$base.o.d
+      tmpdepfile2=$dir$base.d
+      tmpdepfile3=$dir$base.d
+      tmpdepfile4=$dir$base.d
+      "$@" -MD
+   fi
+
+   stat=$?
+   if test $stat -eq 0; then :
+   else
+      rm -f "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" "$tmpdepfile4"
+      exit $stat
+   fi
+
+   for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" "$tmpdepfile4"
+   do
+     test -f "$tmpdepfile" && break
+   done
+   if test -f "$tmpdepfile"; then
+      sed -e "s,^.*\.[a-z]*:,$object:," < "$tmpdepfile" > "$depfile"
+      # That's a tab and a space in the [].
+      sed -e 's,^.*\.[a-z]*:[	 ]*,,' -e 's,$,:,' < "$tmpdepfile" >> "$depfile"
+   else
+      echo "#dummy" > "$depfile"
+   fi
+   rm -f "$tmpdepfile"
+   ;;
+
+#nosideeffect)
+  # This comment above is used by automake to tell side-effect
+  # dependency tracking mechanisms from slower ones.
+
+dashmstdout)
+  # Important note: in order to support this mode, a compiler *must*
+  # always write the preprocessed file to stdout, regardless of -o.
+  "$@" || exit $?
+
+  # Remove the call to Libtool: drop every word up to and including --mode=compile.
+  if test "$libtool" = yes; then
+    while test $# -gt 0 && test "X$1" != 'X--mode=compile'; do
+      shift  # "X$1" is quoted so a word with spaces (or an empty one) cannot break `test'
+    done
+    shift
+  fi
+
+  # Remove `-o $object'.
+  IFS=" "
+  for arg
+  do
+    case $arg in
+    -o)
+      shift
+      ;;
+    $object)
+      shift
+      ;;
+    *)
+      set fnord "$@" "$arg"
+      shift # fnord
+      shift # $arg
+      ;;
+    esac
+  done
+
+  test -z "$dashmflag" && dashmflag=-M
+  # Require at least two characters before searching for `:'
+  # in the target name.  This is to cope with DOS-style filenames:
+  # a dependency such as `c:/foo/bar' could be seen as target `c' otherwise.
+  "$@" $dashmflag |
+    sed 's:^[  ]*[^: ][^:][^:]*\:[    ]*:'"$object"'\: :' > "$tmpdepfile"
+  rm -f "$depfile"
+  cat < "$tmpdepfile" > "$depfile"
+  tr ' ' '
+' < "$tmpdepfile" | \
+## Some versions of the HPUX 10.20 sed can't process this invocation
+## correctly.  Breaking it into two sed invocations is a workaround.
+    sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile"
+  rm -f "$tmpdepfile"
+  ;;
+
+dashXmstdout)
+  # This case only exists to satisfy depend.m4.  It is never actually
+  # run, as this mode is specially recognized in the preamble.
+  exit 1
+  ;;
+
+makedepend)
+  "$@" || exit $?  # run the real command first; give up with its status on failure
+  # Remove any Libtool call: drop every word up to and including --mode=compile.
+  if test "$libtool" = yes; then
+    while test $# -gt 0 && test "X$1" != 'X--mode=compile'; do
+      shift  # "X$1" is quoted so a word with spaces (or an empty one) cannot break `test'
+    done
+    shift
+  fi
+  # Drop the first remaining word; ${MAKEDEPEND-makedepend} is run in its place below.
+  shift
+  cleared=no
+  for arg in "$@"; do
+    case $cleared in
+    no)
+      set ""; shift  # empty "$@" once; the for loop already expanded its word list
+      cleared=yes ;;
+    esac
+    case "$arg" in
+    -D*|-I*)
+      set fnord "$@" "$arg"; shift ;;
+    # Strip any option that makedepend may not understand.  Remove
+    # the object too, otherwise makedepend will parse it as a source file.
+    -*|$object)
+      ;;
+    *)
+      set fnord "$@" "$arg"; shift ;;
+    esac
+  done
+  obj_suffix=`echo "$object" | sed 's/^.*\././'`
+  touch "$tmpdepfile"
+  ${MAKEDEPEND-makedepend} -o"$obj_suffix" -f"$tmpdepfile" "$@"
+  rm -f "$depfile"
+  cat < "$tmpdepfile" > "$depfile"
+  sed '1,2d' "$tmpdepfile" | tr ' ' '
+' | \
+## Some versions of the HPUX 10.20 sed can't process this invocation
+## correctly.  Breaking it into two sed invocations is a workaround.
+    sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile"
+  rm -f "$tmpdepfile" "$tmpdepfile".bak
+  ;;
+
+cpp)
+  # Important note: in order to support this mode, a compiler *must*
+  # always write the preprocessed file to stdout.
+  "$@" || exit $?
+
+  # Remove the call to Libtool: drop every word up to and including --mode=compile.
+  if test "$libtool" = yes; then
+    while test $# -gt 0 && test "X$1" != 'X--mode=compile'; do
+      shift  # "X$1" is quoted so a word with spaces (or an empty one) cannot break `test'
+    done
+    shift
+  fi
+
+  # Remove `-o $object'.
+  IFS=" "
+  for arg
+  do
+    case $arg in
+    -o)
+      shift
+      ;;
+    $object)
+      shift
+      ;;
+    *)
+      set fnord "$@" "$arg"
+      shift # fnord
+      shift # $arg
+      ;;
+    esac
+  done
+
+  "$@" -E |
+    sed -n -e '/^# [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' \
+       -e '/^#line [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' |
+    sed '$ s: \\$::' > "$tmpdepfile"
+  rm -f "$depfile"
+  echo "$object : \\" > "$depfile"
+  cat < "$tmpdepfile" >> "$depfile"
+  sed < "$tmpdepfile" '/^$/d;s/^ //;s/ \\$//;s/$/ :/' >> "$depfile"
+  rm -f "$tmpdepfile"
+  ;;
+
+msvisualcpp)
+  # Important note: in order to support this mode, a compiler *must*
+  # always write the preprocessed file to stdout, regardless of -o,
+  # because we must use -o when running libtool.
+  "$@" || exit $?
+  IFS=" "
+  for arg
+  do
+    case "$arg" in
+    "-Gm"|"/Gm"|"-Gi"|"/Gi"|"-ZI"|"/ZI")
+	set fnord "$@"
+	shift
+	shift
+	;;
+    *)
+	set fnord "$@" "$arg"
+	shift
+	shift
+	;;
+    esac
+  done
+  "$@" -E |
+  sed -n '/^#line [0-9][0-9]* "\([^"]*\)"/ s::echo "`cygpath -u \\"\1\\"`":p' | sort | uniq > "$tmpdepfile"
+  rm -f "$depfile"
+  echo "$object : \\" > "$depfile"
+  . "$tmpdepfile" | sed 's% %\\ %g' | sed -n '/^\(.*\)$/ s::	\1 \\:p' >> "$depfile"
+  echo "	" >> "$depfile"
+  . "$tmpdepfile" | sed 's% %\\ %g' | sed -n '/^\(.*\)$/ s::\1\::p' >> "$depfile"
+  rm -f "$tmpdepfile"
+  ;;
+
+none)
+  exec "$@"
+  ;;
+
+*)
+  echo "Unknown depmode $depmode" 1>&2
+  exit 1
+  ;;
+esac
+
+exit 0
+
+# Local Variables:
+# mode: shell-script
+# sh-indentation: 2
+# eval: (add-hook 'write-file-hooks 'time-stamp)
+# time-stamp-start: "scriptversion="
+# time-stamp-format: "%:y-%02m-%02d.%02H"
+# time-stamp-end: "$"
+# End:
diff --git a/openmp-avx512/basic/optional/ThreadPool/config/generate-makeoptions.pl b/openmp-avx512/basic/optional/ThreadPool/config/generate-makeoptions.pl
new file mode 100755
index 0000000..a39223e
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/config/generate-makeoptions.pl
@@ -0,0 +1,86 @@
+#!/usr/bin/perl -w
+#
+# This perl script grabs a bunch of make macro definitions
+# generated for Teuchos that can be used in other makefiles.
+# This is dumped to stdout and can be redirected to build
+# a makefile.
+#
+# Note, this script must be maintained to be current for
+# the Teuchos makefile.
+#
+use strict;
+
+if( scalar(@ARGV) != 2 ) { # no defined(@ARGV): that is a fatal error in perl >= 5.22
+  die "Error, this script takes two and only two arguments (makefile_name package_name).!\n";
+}
+
+my $makefile_name = shift;
+my $package_name  = shift;
+
+#
+# List the macros you want to grep and include in the output
+#
+my @macros =
+	(
+	 "CC"
+	 ,"CXX"
+	 ,"F77"
+	 ,"CXXLD"
+	 ,"DEFS"
+	 ,"CPPFLAGS"
+	 ,"CFLAGS"
+	 ,"CXXFLAGS"
+	 ,"FFLAGS"
+	 ,"LDFLAGS"
+	 ,"FLIBS"
+	 ,"BLAS_LIBS"
+	 ,"LAPACK_LIBS"
+	 ,"prefix"
+	 ,"AR"
+	 ,"ALTERNATE_AR"
+	 ,"libteuchos_a_AR"
+	 ,"RANLIB"
+	 );
+
+open(FILE_IN, "<", $makefile_name) || die "The file $makefile_name could not be opended for input\n";
+my @makefile_name_array = <FILE_IN>;
+close FILE_IN;
+
+#
+# Find the above macros and prepend "${package_name}_" to their names.
+#
+my @new_macros;
+my $add_next_line = 0; # true while copying continuation lines of a matched macro
+foreach( @makefile_name_array ) {
+	my $line = $_;
+	if($add_next_line) {
+		push @new_macros, $line;
+		if( $line =~ /\\$/ ) { # backslash right before the (unchomped) newline
+			$add_next_line = 1;
+		}
+		else {
+			$add_next_line = 0;
+		}
+		next;
+	}
+	# Not inside a continuation: check whether this line defines a listed macro.
+	foreach( @macros ) {
+		my $macro_search = "^${_} ";
+		# A macro only matches at the start of the line, followed by a space.
+		if( $line=~/$macro_search/ ) {
+			# Point the first "(CXX)" reference at the renamed ${package_name}_CXX macro.
+      my $find_str = '\(CXX\)';
+      my $replace_str = "(${package_name}_CXX)";
+      $line=~s/$find_str/$replace_str/;
+			push @new_macros, "${package_name}_${line}";
+			if( $line =~ /\\$/ ) { # definition continues on the next line
+				$add_next_line = 1;
+			}
+			else {
+				$add_next_line = 0;
+			}
+		}
+	}
+}
+
+print join("",@new_macros);
diff --git a/openmp-avx512/basic/optional/ThreadPool/config/install-sh b/openmp-avx512/basic/optional/ThreadPool/config/install-sh
new file mode 100755
index 0000000..4fbbae7
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/config/install-sh
@@ -0,0 +1,507 @@
+#!/bin/sh
+# install - install a program, script, or datafile
+
+scriptversion=2006-10-14.15
+
+# This originates from X11R5 (mit/util/scripts/install.sh), which was
+# later released in X11R6 (xc/config/util/install.sh) with the
+# following copyright and license.
+#
+# Copyright (C) 1994 X Consortium
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# X CONSORTIUM BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+# AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNEC-
+# TION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#
+# Except as contained in this notice, the name of the X Consortium shall not
+# be used in advertising or otherwise to promote the sale, use or other deal-
+# ings in this Software without prior written authorization from the X Consor-
+# tium.
+#
+#
+# FSF changes to this file are in the public domain.
+#
+# Calling this script install-sh is preferred over install.sh, to prevent
+# `make' implicit rules from creating a file called install from it
+# when there is no Makefile.
+#
+# This script is compatible with the BSD install script, but was written
+# from scratch.
+
+nl='
+'
+IFS=" ""	$nl"
+
+# set DOITPROG to echo to test this script
+
+# Don't use :- since 4.3BSD and earlier shells don't like it.
+doit="${DOITPROG-}"
+if test -z "$doit"; then
+  doit_exec=exec
+else
+  doit_exec=$doit
+fi
+
+# Put in absolute file names if you don't have them in your path;
+# or use environment vars.
+
+mvprog="${MVPROG-mv}"
+cpprog="${CPPROG-cp}"
+chmodprog="${CHMODPROG-chmod}"
+chownprog="${CHOWNPROG-chown}"
+chgrpprog="${CHGRPPROG-chgrp}"
+stripprog="${STRIPPROG-strip}"
+rmprog="${RMPROG-rm}"
+mkdirprog="${MKDIRPROG-mkdir}"
+
+posix_glob=
+posix_mkdir=
+
+# Desired mode of installed file.
+mode=0755
+
+chmodcmd=$chmodprog
+chowncmd=
+chgrpcmd=
+stripcmd=
+rmcmd="$rmprog -f"
+mvcmd="$mvprog"
+src=
+dst=
+dir_arg=
+dstarg=
+no_target_directory=
+
+usage="Usage: $0 [OPTION]... [-T] SRCFILE DSTFILE
+   or: $0 [OPTION]... SRCFILES... DIRECTORY
+   or: $0 [OPTION]... -t DIRECTORY SRCFILES...
+   or: $0 [OPTION]... -d DIRECTORIES...
+
+In the 1st form, copy SRCFILE to DSTFILE.
+In the 2nd and 3rd, copy all SRCFILES to DIRECTORY.
+In the 4th, create DIRECTORIES.
+
+Options:
+-c         (ignored)
+-d         create directories instead of installing files.
+-g GROUP   $chgrpprog installed files to GROUP.
+-m MODE    $chmodprog installed files to MODE.
+-o USER    $chownprog installed files to USER.
+-s         $stripprog installed files.
+-t DIRECTORY  install into DIRECTORY.
+-T         report an error if DSTFILE is a directory.
+--help     display this help and exit.
+--version  display version info and exit.
+
+Environment variables override the default commands:
+  CHGRPPROG CHMODPROG CHOWNPROG CPPROG MKDIRPROG MVPROG RMPROG STRIPPROG
+"
+
+while test $# -ne 0; do
+  case $1 in
+    -c) shift
+        continue;;
+
+    -d) dir_arg=true
+        shift
+        continue;;
+
+    -g) chgrpcmd="$chgrpprog $2"
+        shift
+        shift
+        continue;;
+
+    --help) echo "$usage"; exit $?;;
+
+    -m) mode=$2
+        shift
+        shift
+	case $mode in
+	  *' '* | *'	'* | *'
+'*	  | *'*'* | *'?'* | *'['*)
+	    echo "$0: invalid mode: $mode" >&2
+	    exit 1;;
+	esac
+        continue;;
+
+    -o) chowncmd="$chownprog $2"
+        shift
+        shift
+        continue;;
+
+    -s) stripcmd=$stripprog
+        shift
+        continue;;
+
+    -t) dstarg=$2
+	shift
+	shift
+	continue;;
+
+    -T) no_target_directory=true
+	shift
+	continue;;
+
+    --version) echo "$0 $scriptversion"; exit $?;;
+
+    --)	shift
+	break;;
+
+    -*)	echo "$0: invalid option: $1" >&2
+	exit 1;;
+
+    *)  break;;
+  esac
+done
+
+if test $# -ne 0 && test -z "$dir_arg$dstarg"; then
+  # When -d is used, all remaining arguments are directories to create.
+  # When -t is used, the destination is already specified.
+  # Otherwise, the last argument is the destination.  Remove it from $@.
+  for arg
+  do
+    if test -n "$dstarg"; then
+      # $@ is not empty: it contains at least $arg.
+      set fnord "$@" "$dstarg"
+      shift # fnord
+    fi
+    shift # arg
+    dstarg=$arg
+  done
+fi
+
+if test $# -eq 0; then
+  if test -z "$dir_arg"; then
+    echo "$0: no input file specified." >&2
+    exit 1
+  fi
+  # It's OK to call `install-sh -d' without argument.
+  # This can happen when creating conditional directories.
+  exit 0
+fi
+
+if test -z "$dir_arg"; then
+  trap '(exit $?); exit' 1 2 13 15
+
+  # Set umask so as not to create temps with too-generous modes.
+  # However, 'strip' requires both read and write access to temps.
+  case $mode in
+    # Optimize common cases.
+    *644) cp_umask=133;;
+    *755) cp_umask=22;;
+
+    *[0-7])
+      if test -z "$stripcmd"; then
+	u_plus_rw=
+      else
+	u_plus_rw='% 200'
+      fi
+      cp_umask=`expr '(' 777 - $mode % 1000 ')' $u_plus_rw`;;
+    *)
+      if test -z "$stripcmd"; then
+	u_plus_rw=
+      else
+	u_plus_rw=,u+rw
+      fi
+      cp_umask=$mode$u_plus_rw;;
+  esac
+fi
+
+for src
+do
+  # Protect names starting with `-'.
+  case $src in
+    -*) src=./$src ;;
+  esac
+
+  if test -n "$dir_arg"; then
+    dst=$src
+    dstdir=$dst
+    test -d "$dstdir"
+    dstdir_status=$?
+  else
+
+    # Waiting for this to be detected by the "$cpprog $src $dsttmp" command
+    # might cause directories to be created, which would be especially bad
+    # if $src (and thus $dsttmp) contains '*'.
+    if test ! -f "$src" && test ! -d "$src"; then
+      echo "$0: $src does not exist." >&2
+      exit 1
+    fi
+
+    if test -z "$dstarg"; then
+      echo "$0: no destination specified." >&2
+      exit 1
+    fi
+
+    dst=$dstarg
+    # Protect names starting with `-'.
+    case $dst in
+      -*) dst=./$dst ;;
+    esac
+
+    # If destination is a directory, append the input filename; won't work
+    # if double slashes aren't ignored.
+    if test -d "$dst"; then
+      if test -n "$no_target_directory"; then
+	echo "$0: $dstarg: Is a directory" >&2
+	exit 1
+      fi
+      dstdir=$dst
+      dst=$dstdir/`basename "$src"`
+      dstdir_status=0
+    else
+      # Prefer dirname, but fall back on a substitute if dirname fails.
+      dstdir=`
+	(dirname "$dst") 2>/dev/null ||
+	expr X"$dst" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+	     X"$dst" : 'X\(//\)[^/]' \| \
+	     X"$dst" : 'X\(//\)$' \| \
+	     X"$dst" : 'X\(/\)' \| . 2>/dev/null ||
+	echo X"$dst" |
+	    sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+		   s//\1/
+		   q
+		 }
+		 /^X\(\/\/\)[^/].*/{
+		   s//\1/
+		   q
+		 }
+		 /^X\(\/\/\)$/{
+		   s//\1/
+		   q
+		 }
+		 /^X\(\/\).*/{
+		   s//\1/
+		   q
+		 }
+		 s/.*/./; q'
+      `
+
+      test -d "$dstdir"
+      dstdir_status=$?
+    fi
+  fi
+
+  obsolete_mkdir_used=false
+
+  if test $dstdir_status != 0; then
+    case $posix_mkdir in
+      '')
+	# Create intermediate dirs using mode 755 as modified by the umask.
+	# This is like FreeBSD 'install' as of 1997-10-28.
+	umask=`umask`
+	case $stripcmd.$umask in
+	  # Optimize common cases.
+	  *[2367][2367]) mkdir_umask=$umask;;
+	  .*0[02][02] | .[02][02] | .[02]) mkdir_umask=22;;
+
+	  *[0-7])
+	    mkdir_umask=`expr $umask + 22 \
+	      - $umask % 100 % 40 + $umask % 20 \
+	      - $umask % 10 % 4 + $umask % 2
+	    `;;
+	  *) mkdir_umask=$umask,go-w;;
+	esac
+
+	# With -d, create the new directory with the user-specified mode.
+	# Otherwise, rely on $mkdir_umask.
+	if test -n "$dir_arg"; then
+	  mkdir_mode=-m$mode
+	else
+	  mkdir_mode=
+	fi
+
+	posix_mkdir=false
+	case $umask in
+	  *[123567][0-7][0-7])
+	    # POSIX mkdir -p sets u+wx bits regardless of umask, which
+	    # is incompatible with FreeBSD 'install' when (umask & 300) != 0.
+	    ;;
+	  *)
+	    tmpdir=${TMPDIR-/tmp}/ins$RANDOM-$$
+	    trap 'ret=$?; rmdir "$tmpdir/d" "$tmpdir" 2>/dev/null; exit $ret' 0
+
+	    if (umask $mkdir_umask &&
+		exec $mkdirprog $mkdir_mode -p -- "$tmpdir/d") >/dev/null 2>&1
+	    then
+	      if test -z "$dir_arg" || {
+		   # Check for POSIX incompatibilities with -m.
+		   # HP-UX 11.23 and IRIX 6.5 mkdir -m -p sets group- or
+		   # other-writeable bit of parent directory when it shouldn't.
+		   # FreeBSD 6.1 mkdir -m -p sets mode of existing directory.
+		   ls_ld_tmpdir=`ls -ld "$tmpdir"`
+		   case $ls_ld_tmpdir in
+		     d????-?r-*) different_mode=700;;
+		     d????-?--*) different_mode=755;;
+		     *) false;;
+		   esac &&
+		   $mkdirprog -m$different_mode -p -- "$tmpdir" && {
+		     ls_ld_tmpdir_1=`ls -ld "$tmpdir"`
+		     test "$ls_ld_tmpdir" = "$ls_ld_tmpdir_1"
+		   }
+		 }
+	      then posix_mkdir=:
+	      fi
+	      rmdir "$tmpdir/d" "$tmpdir"
+	    else
+	      # Remove any dirs left behind by ancient mkdir implementations.
+	      rmdir ./$mkdir_mode ./-p ./-- 2>/dev/null
+	    fi
+	    trap '' 0;;
+	esac;;
+    esac
+
+    if
+      $posix_mkdir && (
+	umask $mkdir_umask &&
+	$doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir"
+      )
+    then :
+    else
+
+      # The umask is ridiculous, or mkdir does not conform to POSIX,
+      # or it failed possibly due to a race condition.  Create the
+      # directory the slow way, step by step, checking for races as we go.
+
+      case $dstdir in
+	/*) prefix=/ ;;
+	-*) prefix=./ ;;
+	*)  prefix= ;;
+      esac
+
+      case $posix_glob in
+        '')
+	  if (set -f) 2>/dev/null; then
+	    posix_glob=true
+	  else
+	    posix_glob=false
+	  fi ;;
+      esac
+
+      oIFS=$IFS
+      IFS=/
+      $posix_glob && set -f
+      set fnord $dstdir
+      shift
+      $posix_glob && set +f
+      IFS=$oIFS
+
+      prefixes=
+
+      for d
+      do
+	test -z "$d" && continue   # skip empty path components
+	# Extend the running prefix by one component and create it if missing.
+	prefix=$prefix$d
+	if test -d "$prefix"; then
+	  prefixes=   # this level exists: drop any names queued before it
+	else
+	  if $posix_mkdir; then   # create the whole of $dstdir at once under $mkdir_umask
+	    (umask $mkdir_umask &&
+	     $doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir") && break
+	    # Don't fail if two instances are running concurrently.
+	    test -d "$prefix" || exit 1
+	  else
+	    case $prefix in
+	      *\'*) qprefix=`echo "$prefix" | sed "s/'/'\\\\\\\\''/g"`;;
+	      *) qprefix=$prefix;;
+	    esac
+	    prefixes="$prefixes '$qprefix'"   # single-quoted for the later eval
+	  fi
+	fi
+	prefix=$prefix/
+      done
+
+      if test -n "$prefixes"; then
+	# Don't fail if two instances are running concurrently.
+	(umask $mkdir_umask &&
+	 eval "\$doit_exec \$mkdirprog $prefixes") ||
+	  test -d "$dstdir" || exit 1
+	obsolete_mkdir_used=true
+      fi
+    fi
+  fi
+
+  if test -n "$dir_arg"; then
+    { test -z "$chowncmd" || $doit $chowncmd "$dst"; } &&
+    { test -z "$chgrpcmd" || $doit $chgrpcmd "$dst"; } &&
+    { test "$obsolete_mkdir_used$chowncmd$chgrpcmd" = false ||
+      test -z "$chmodcmd" || $doit $chmodcmd $mode "$dst"; } || exit 1
+  else
+
+    # Make a couple of temp file names in the proper directory.
+    dsttmp=$dstdir/_inst.$$_
+    rmtmp=$dstdir/_rm.$$_
+
+    # Trap to clean up those temp files at exit.
+    trap 'ret=$?; rm -f "$dsttmp" "$rmtmp" && exit $ret' 0
+
+    # Copy the file name to the temp name.
+    (umask $cp_umask && $doit_exec $cpprog "$src" "$dsttmp") &&
+
+    # and set any options; do chmod last to preserve setuid bits.
+    #
+    # If any of these fail, we abort the whole thing.  If we want to
+    # ignore errors from any of these, just make sure not to ignore
+    # errors from the above "$doit $cpprog $src $dsttmp" command.
+    #
+    { test -z "$chowncmd" || $doit $chowncmd "$dsttmp"; } \
+      && { test -z "$chgrpcmd" || $doit $chgrpcmd "$dsttmp"; } \
+      && { test -z "$stripcmd" || $doit $stripcmd "$dsttmp"; } \
+      && { test -z "$chmodcmd" || $doit $chmodcmd $mode "$dsttmp"; } &&
+
+    # Now rename the file to the real destination.
+    { $doit $mvcmd -f "$dsttmp" "$dst" 2>/dev/null \
+      || {
+	   # The rename failed, perhaps because mv can't rename something else
+	   # to itself, or perhaps because mv is so ancient that it does not
+	   # support -f.
+
+	   # Now remove or move aside any old file at destination location.
+	   # We try this two ways since rm can't unlink itself on some
+	   # systems and the destination file might be busy for other
+	   # reasons.  In this case, the final cleanup might fail but the new
+	   # file should still install successfully.
+	   {
+	     if test -f "$dst"; then
+	       $doit $rmcmd -f "$dst" 2>/dev/null \
+	       || { $doit $mvcmd -f "$dst" "$rmtmp" 2>/dev/null \
+		     && { $doit $rmcmd -f "$rmtmp" 2>/dev/null; :; }; }\
+	       || {
+		 echo "$0: cannot unlink or rename $dst" >&2
+		 (exit 1); exit 1
+	       }
+	     else
+	       :
+	     fi
+	   } &&
+
+	   # Now rename the file to the real destination.
+	   $doit $mvcmd "$dsttmp" "$dst"
+	 }
+    } || exit 1
+
+    trap '' 0
+  fi
+done
+
+# Local variables:
+# eval: (add-hook 'write-file-hooks 'time-stamp)
+# time-stamp-start: "scriptversion="
+# time-stamp-format: "%:y-%02m-%02d.%02H"
+# time-stamp-end: "$"
+# End:
diff --git a/openmp-avx512/basic/optional/ThreadPool/config/missing b/openmp-avx512/basic/optional/ThreadPool/config/missing
new file mode 100755
index 0000000..1c8ff70
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/config/missing
@@ -0,0 +1,367 @@
+#! /bin/sh
+# Common stub for a few missing GNU programs while installing.
+
+scriptversion=2006-05-10.23
+
+# Copyright (C) 1996, 1997, 1999, 2000, 2002, 2003, 2004, 2005, 2006
+#   Free Software Foundation, Inc.
+# Originally by Fran,cois Pinard <pinard@iro.umontreal.ca>, 1996.
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+# 02110-1301, USA.
+
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+if test $# -eq 0; then
+  echo 1>&2 "Try \`$0 --help' for more information"
+  exit 1
+fi
+
+run=:
+sed_output='s/.* --output[ =]\([^ ]*\).*/\1/p'
+sed_minuso='s/.* -o \([^ ]*\).*/\1/p'
+
+# In the cases where this matters, `missing' is being run in the
+# srcdir already.
+if test -f configure.ac; then
+  configure_ac=configure.ac
+else
+  configure_ac=configure.in
+fi
+
+msg="missing on your system"
+
+case $1 in
+--run)
+  # Try to run requested program, and just exit if it succeeds.
+  run=
+  shift
+  "$@" && exit 0
+  # Exit code 63 means version mismatch.  This often happens
+  # when the user tries to use an ancient version of a tool on
+  # a file that requires a minimum version.  In this case
+  # we should proceed as if the program had been absent, or
+  # if --run hadn't been passed.
+  if test $? = 63; then
+    run=:
+    msg="probably too old"
+  fi
+  ;;
+
+  -h|--h|--he|--hel|--help)
+    echo "\
+$0 [OPTION]... PROGRAM [ARGUMENT]...
+
+Handle \`PROGRAM [ARGUMENT]...' for when PROGRAM is missing, or return an
+error status if there is no known handling for PROGRAM.
+
+Options:
+  -h, --help      display this help and exit
+  -v, --version   output version information and exit
+  --run           try to run the given command, and emulate it if it fails
+
+Supported PROGRAM values:
+  aclocal      touch file \`aclocal.m4'
+  autoconf     touch file \`configure'
+  autoheader   touch file \`config.h.in'
+  autom4te     touch the output file, or create a stub one
+  automake     touch all \`Makefile.in' files
+  bison        create \`y.tab.[ch]', if possible, from existing .[ch]
+  flex         create \`lex.yy.c', if possible, from existing .c
+  help2man     touch the output file
+  lex          create \`lex.yy.c', if possible, from existing .c
+  makeinfo     touch the output file
+  tar          try tar, gnutar, gtar, then tar without non-portable flags
+  yacc         create \`y.tab.[ch]', if possible, from existing .[ch]
+
+Send bug reports to <bug-automake@gnu.org>."
+    exit $?
+    ;;
+
+  -v|--v|--ve|--ver|--vers|--versi|--versio|--version)
+    echo "missing $scriptversion (GNU Automake)"
+    exit $?
+    ;;
+
+  -*)
+    echo 1>&2 "$0: Unknown \`$1' option"
+    echo 1>&2 "Try \`$0 --help' for more information"
+    exit 1
+    ;;
+
+esac
+
+# Now exit if we have it, but it failed.  Also exit now if we
+# don't have it and --version was passed (most likely to detect
+# the program).
+case $1 in
+  lex|yacc)
+    # Not GNU programs, they don't have --version.
+    ;;
+
+  tar)
+    if test -n "$run"; then
+       echo 1>&2 "ERROR: \`tar' requires --run"
+       exit 1
+    elif test "x$2" = "x--version" || test "x$2" = "x--help"; then
+       exit 1
+    fi
+    ;;
+
+  *)
+    if test -z "$run" && ($1 --version) > /dev/null 2>&1; then
+       # We have it, but it failed.
+       exit 1
+    elif test "x$2" = "x--version" || test "x$2" = "x--help"; then
+       # Could not run --version or --help.  This is probably someone
+       # running `$TOOL --version' or `$TOOL --help' to check whether
+       # $TOOL exists and not knowing $TOOL uses missing.
+       exit 1
+    fi
+    ;;
+esac
+
+# If it does not exist, or fails to run (possibly an outdated version),
+# try to emulate it.
+case $1 in
+  aclocal*)
+    echo 1>&2 "\
+WARNING: \`$1' is $msg.  You should only need it if
+         you modified \`acinclude.m4' or \`${configure_ac}'.  You might want
+         to install the \`Automake' and \`Perl' packages.  Grab them from
+         any GNU archive site."
+    touch aclocal.m4
+    ;;
+
+  autoconf)
+    echo 1>&2 "\
+WARNING: \`$1' is $msg.  You should only need it if
+         you modified \`${configure_ac}'.  You might want to install the
+         \`Autoconf' and \`GNU m4' packages.  Grab them from any GNU
+         archive site."
+    touch configure
+    ;;
+
+  autoheader)
+    echo 1>&2 "\
+WARNING: \`$1' is $msg.  You should only need it if
+         you modified \`acconfig.h' or \`${configure_ac}'.  You might want
+         to install the \`Autoconf' and \`GNU m4' packages.  Grab them
+         from any GNU archive site."
+    files=`sed -n 's/^[ ]*A[CM]_CONFIG_HEADER(\([^)]*\)).*/\1/p' ${configure_ac}`
+    test -z "$files" && files="config.h"
+    touch_files=
+    for f in $files; do
+      case $f in
+      *:*) touch_files="$touch_files "`echo "$f" |
+				       sed -e 's/^[^:]*://' -e 's/:.*//'`;;
+      *) touch_files="$touch_files $f.in";;
+      esac
+    done
+    touch $touch_files
+    ;;
+
+  automake*)
+    echo 1>&2 "\
+WARNING: \`$1' is $msg.  You should only need it if
+         you modified \`Makefile.am', \`acinclude.m4' or \`${configure_ac}'.
+         You might want to install the \`Automake' and \`Perl' packages.
+         Grab them from any GNU archive site."
+    find . -type f -name Makefile.am -print |
+	   sed 's/\.am$/.in/' |
+	   while read f; do touch "$f"; done
+    ;;
+
+  autom4te)
+    echo 1>&2 "\
+WARNING: \`$1' is needed, but is $msg.
+         You might have modified some files without having the
+         proper tools for further handling them.
+         You can get \`$1' as part of \`Autoconf' from any GNU
+         archive site."
+
+    file=`echo "$*" | sed -n "$sed_output"`
+    test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"`
+    if test -f "$file"; then
+	touch $file
+    else
+	test -z "$file" || exec >$file
+	echo "#! /bin/sh"
+	echo "# Created by GNU Automake missing as a replacement of"
+	echo "#  $ $@"
+	echo "exit 0"
+	chmod +x $file
+	exit 1
+    fi
+    ;;
+
+  bison|yacc)
+    echo 1>&2 "\
+WARNING: \`$1' $msg.  You should only need it if
+         you modified a \`.y' file.  You may need the \`Bison' package
+         in order for those modifications to take effect.  You can get
+         \`Bison' from any GNU archive site."
+    rm -f y.tab.c y.tab.h
+    if test $# -ne 1; then
+        eval LASTARG="\${$#}"
+	case $LASTARG in
+	*.y)
+	    SRCFILE=`echo "$LASTARG" | sed 's/y$/c/'`
+	    if test -f "$SRCFILE"; then
+	         cp "$SRCFILE" y.tab.c
+	    fi
+	    SRCFILE=`echo "$LASTARG" | sed 's/y$/h/'`
+	    if test -f "$SRCFILE"; then
+	         cp "$SRCFILE" y.tab.h
+	    fi
+	  ;;
+	esac
+    fi
+    if test ! -f y.tab.h; then
+	echo >y.tab.h
+    fi
+    if test ! -f y.tab.c; then
+	echo 'main() { return 0; }' >y.tab.c
+    fi
+    ;;
+
+  lex|flex)
+    echo 1>&2 "\
+WARNING: \`$1' is $msg.  You should only need it if
+         you modified a \`.l' file.  You may need the \`Flex' package
+         in order for those modifications to take effect.  You can get
+         \`Flex' from any GNU archive site."
+    rm -f lex.yy.c
+    if test $# -ne 1; then
+        eval LASTARG="\${$#}"
+	case $LASTARG in
+	*.l)
+	    SRCFILE=`echo "$LASTARG" | sed 's/l$/c/'`
+	    if test -f "$SRCFILE"; then
+	         cp "$SRCFILE" lex.yy.c
+	    fi
+	  ;;
+	esac
+    fi
+    if test ! -f lex.yy.c; then
+	echo 'main() { return 0; }' >lex.yy.c
+    fi
+    ;;
+
+  help2man)
+    echo 1>&2 "\
+WARNING: \`$1' is $msg.  You should only need it if
+	 you modified a dependency of a manual page.  You may need the
+	 \`Help2man' package in order for those modifications to take
+	 effect.  You can get \`Help2man' from any GNU archive site."
+
+    file=`echo "$*" | sed -n "$sed_output"`
+    test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"`
+    if test -f "$file"; then
+	touch $file
+    else
+	test -z "$file" || exec >$file
+	echo ".ab help2man is required to generate this page"
+	exit 1
+    fi
+    ;;
+
+  makeinfo)
+    echo 1>&2 "\
+WARNING: \`$1' is $msg.  You should only need it if
+         you modified a \`.texi' or \`.texinfo' file, or any other file
+         indirectly affecting the aspect of the manual.  The spurious
+         call might also be the consequence of using a buggy \`make' (AIX,
+         DU, IRIX).  You might want to install the \`Texinfo' package or
+         the \`GNU make' package.  Grab either from any GNU archive site."
+    # The file to touch is that specified with -o ...
+    file=`echo "$*" | sed -n "$sed_output"`
+    test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"`
+    if test -z "$file"; then
+      # ... or it is the one specified with @setfilename ...
+      infile=`echo "$*" | sed 's/.* \([^ ]*\) *$/\1/'`
+      file=`sed -n '
+	/^@setfilename/{
+	  s/.* \([^ ]*\) *$/\1/
+	  p
+	  q
+	}' $infile`
+      # ... or it is derived from the source name (dir/f.texi becomes f.info)
+      test -z "$file" && file=`echo "$infile" | sed 's,.*/,,;s,.[^.]*$,,'`.info
+    fi
+    # If the file does not exist, the user really needs makeinfo;
+    # let's fail without touching anything.
+    test -f $file || exit 1
+    touch $file
+    ;;
+
+  tar)
+    shift
+
+    # We have already tried tar in the generic part.
+    # Look for gnutar/gtar before invocation to avoid ugly error
+    # messages.
+    if (gnutar --version > /dev/null 2>&1); then
+       gnutar "$@" && exit 0
+    fi
+    if (gtar --version > /dev/null 2>&1); then
+       gtar "$@" && exit 0
+    fi
+    firstarg="$1"
+    if shift; then
+	case $firstarg in
+	*o*)
+	    firstarg=`echo "$firstarg" | sed s/o//`
+	    tar "$firstarg" "$@" && exit 0
+	    ;;
+	esac
+	case $firstarg in
+	*h*)
+	    firstarg=`echo "$firstarg" | sed s/h//`
+	    tar "$firstarg" "$@" && exit 0
+	    ;;
+	esac
+    fi
+
+    echo 1>&2 "\
+WARNING: I can't seem to be able to run \`tar' with the given arguments.
+         You may want to install GNU tar or Free paxutils, or check the
+         command line arguments."
+    exit 1
+    ;;
+
+  *)
+    echo 1>&2 "\
+WARNING: \`$1' is needed, and is $msg.
+         You might have modified some files without having the
+         proper tools for further handling them.  Check the \`README' file,
+         it often tells you about the needed prerequisites for installing
+         this package.  You may also peek at any GNU archive site, in case
+         some other package would contain this missing \`$1' program."
+    exit 1
+    ;;
+esac
+
+exit 0
+
+# Local variables:
+# eval: (add-hook 'write-file-hooks 'time-stamp)
+# time-stamp-start: "scriptversion="
+# time-stamp-format: "%:y-%02m-%02d.%02H"
+# time-stamp-end: "$"
+# End:
diff --git a/openmp-avx512/basic/optional/ThreadPool/config/replace-install-prefix.pl b/openmp-avx512/basic/optional/ThreadPool/config/replace-install-prefix.pl
new file mode 100755
index 0000000..7523b08
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/config/replace-install-prefix.pl
@@ -0,0 +1,89 @@
+#!/usr/bin/perl -w
+use strict;
+use Getopt::Long;
+#
+# This script is called to do a set of text replacements for installing
+# a Makefile.export.package file so that external clients can use it.
+#
+# Read in commandline arguments
+#
+my $exec_prefix = "";           # [required] Abs path to base installation directory (i.e. --prefix=??? option passed to configure)
+my $my_export_makefile = "";    # [required] Name only of installed Makefile.export.package file
+my $my_top_srcdir = "";         # [required] Abs path to this package's top source directory
+my $my_incl_dirs = "";          # [required] Abs path to this package's include directories
+my $my_lib_dirs = "";           # [optional] Abs path to this package's library directories (if any exist)
+my $dep_package_builddirs = ""; # [optional] Abs paths to other directly dependent framework package build directories (if any exist)
+GetOptions(
+  "exec-prefix=s"                   => \$exec_prefix,
+  "my-export-makefile=s"            => \$my_export_makefile,
+  "my-abs-top-srcdir=s"             => \$my_top_srcdir,
+  "my-abs-incl-dirs=s"              => \$my_incl_dirs,
+  "my-abs-lib-dirs=s"               => \$my_lib_dirs,
+  "dep-package-abs-builddirs=s"     => \$dep_package_builddirs
+  );
+#
+# Validate commandline arguments (fail with a message naming the problem)
+#
+scalar(@ARGV) == 0 || die "Error, unexpected arguments: @ARGV\n";
+$exec_prefix ne "" || die "Error, --exec-prefix is required!\n";
+$my_export_makefile ne "" || die "Error, --my-export-makefile is required!\n";
+$my_top_srcdir ne "" || die "Error, --my-abs-top-srcdir is required!\n";
+$my_incl_dirs ne "" || die "Error, --my-abs-incl-dirs is required!\n";
+#
+# Interpret commandline arguments (directory lists are ':' separated)
+#
+$exec_prefix = remove_rel_paths($exec_prefix);
+my @my_incl_dirs = split(":",$my_incl_dirs);
+my @my_lib_dirs = split(":",$my_lib_dirs);
+my @dep_export_package_builddirs = split(":",$dep_package_builddirs);
+#
+# Do the replacements (each one rewrites the installed export makefile in place)
+#
+my $my_abs_export_makefile = "${exec_prefix}/include/${my_export_makefile}";
+
+my $cmnd_base = "${my_top_srcdir}/config/token-replace.pl ";
+# Dependent package build directories -> installed include directory
+foreach(@dep_export_package_builddirs) {
+  if($_ ne "") {
+    run_cmnd($cmnd_base . "${_} ${exec_prefix}/include ${my_abs_export_makefile} ${my_abs_export_makefile}");
+  }
+}
+# This package's -I paths -> installed include directory
+foreach(@my_incl_dirs) {
+  if($_ ne "") {
+    run_cmnd($cmnd_base . "-I${_} -I${exec_prefix}/include ${my_abs_export_makefile} ${my_abs_export_makefile}");
+  }
+}
+# This package's -L paths -> installed lib directory
+foreach(@my_lib_dirs) {
+  if($_ ne "") {
+    run_cmnd($cmnd_base . "-L${_} -L${exec_prefix}/lib ${my_abs_export_makefile} ${my_abs_export_makefile}");
+  }
+}
+# This package's source config directory -> installed include directory
+run_cmnd($cmnd_base . "${my_top_srcdir}/config ${exec_prefix}/include ${my_abs_export_makefile} ${my_abs_export_makefile}");
+#
+# Subroutines
+# remove_rel_paths(path): drop each ".." component together with the component before it.
+sub remove_rel_paths {
+	my $entry_in = shift;
+	if ($entry_in=~/-L\.\./) { # entries containing "-L.." are returned untouched
+		return $entry_in;
+	}
+	my @paths = split("/",$entry_in);
+	my @new_paths;
+	foreach( @paths ) {
+		if( $_ ne ".." ) { # exact match only: "foo..bar" is an ordinary name
+			push @new_paths, $_;
+		}
+		else {
+			pop @new_paths;
+		}
+	}
+	return join("/",@new_paths);
+}
+sub run_cmnd {
+  my $cmnd = shift;
+  # Run the command line through system() and abort the script if it fails.
+  system($cmnd)==0 || die "Error, the command '$cmnd' failed (status $?)!\n";
+}
diff --git a/openmp-avx512/basic/optional/ThreadPool/config/string-replace.pl b/openmp-avx512/basic/optional/ThreadPool/config/string-replace.pl
new file mode 100755
index 0000000..adeb1f4
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/config/string-replace.pl
@@ -0,0 +1,43 @@
+#!/usr/bin/perl -w
+#
+# This perl script replaces a string with another string.
+# Here it is allowed for file_in and file_out to be the
+# same file.
+#
+use strict;
+#
+my $g_use_msg =
+  "Use: string-replace.pl find_string replacement_string file_in file_out\n";
+if( scalar(@ARGV) < 4 ) {
+  print STDERR $g_use_msg;
+  exit(-1);
+}
+#
+my $find_string        = shift;
+my $replacement_string = shift;
+my $file_in_name       = shift;
+my $file_out_name      = shift;
+#
+# Input files whose name contains "CVS" are left alone.
+if($file_in_name=~/CVS/) {
+#  print "Do not replace in CVS\n";
+  exit;
+}
+# Read the whole input up front so that file_in and file_out may be the same file.
+open(FILE_IN, "<", $file_in_name) || die "The file $file_in_name could not be opended for input\n";
+my @file_in_array = <FILE_IN>;
+close FILE_IN;
+#
+my @file_out_array;
+my $did_replacement = 0;
+foreach(@file_in_array) {
+  # find_string is applied as a regular expression, to every match on the line.
+  $did_replacement = 1 if $_=~s/$find_string/$replacement_string/g;
+  #print $_;
+  push @file_out_array, $_;
+}
+if($did_replacement || $file_out_name ne $file_in_name) { # skip rewriting an unchanged file in place
+  open(FILE_OUT, ">", $file_out_name) || die "The file $file_out_name could not be opended for output\n";
+  print FILE_OUT @file_out_array;
+  close FILE_OUT;
+}
diff --git a/openmp-avx512/basic/optional/ThreadPool/config/strip_dup_incl_paths.pl b/openmp-avx512/basic/optional/ThreadPool/config/strip_dup_incl_paths.pl
new file mode 100755
index 0000000..c628d31
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/config/strip_dup_incl_paths.pl
@@ -0,0 +1,44 @@
+#!/usr/bin/perl -w
+# This perl script removes duplicate -I include paths, scanning left to right and keeping the first occurrence of each
+use strict;
+my @all_incl_paths = @ARGV;
+my @cleaned_up_incl_paths;
+# Arguments without -I are always kept; a -I argument is kept only the first time it appears.
+foreach my $arg ( @all_incl_paths ) {
+	$arg = remove_rel_paths($arg);
+	my $is_incl_path = ( $arg =~ /-I/ );
+	if( $is_incl_path && entry_exists($arg,\@cleaned_up_incl_paths) ) {
+		next;
+	}
+	push @cleaned_up_incl_paths, $arg;
+}
+print join( " ", @cleaned_up_incl_paths );
+#
+# Subroutines
+#
+sub entry_exists {
+	# Returns 1 when the string $wanted equals some element of @$items, otherwise 0.
+	my ($wanted, $items) = @_;
+	for my $candidate ( @$items ) {
+		return 1 if $candidate eq $wanted;
+	}
+	return 0;
+}
+#
+sub remove_rel_paths {
+	# Collapse "dir/.." pairs in $entry_in; entries containing -I.. are returned unchanged.
+	my $entry_in = shift;
+	return $entry_in if $entry_in=~/-I\.\./;
+	my @new_paths;
+	foreach my $component ( split("/",$entry_in) ) {
+		if( $component eq '..' ) {
+			# A real parent reference cancels the component before it.
+			pop @new_paths;
+		}
+		else {
+			# Names that merely contain ".." (e.g. "v1..2") are ordinary components.
+			push @new_paths, $component;
+		}
+	}
+	return join("/",@new_paths);
+}
diff --git a/openmp-avx512/basic/optional/ThreadPool/config/strip_dup_libs.pl b/openmp-avx512/basic/optional/ThreadPool/config/strip_dup_libs.pl
new file mode 100755
index 0000000..cdf4b42
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/config/strip_dup_libs.pl
@@ -0,0 +1,69 @@
+#!/usr/bin/perl -w
+# This perl script removes duplicate libraries from the right to the left and
+# removes duplicate -L library paths from the left to the right
+use strict;
+
+my @all_libs = @ARGV;
+#
+# Pass 1: walk the arguments from right to left.  -L entries are always kept;
+# any other entry is kept only if the same entry was not already seen to its
+# right, i.e. the rightmost copy of a duplicate survives.
+#
+my @cleaned_up_libs_first;
+foreach my $lib_arg ( reverse @all_libs ) {
+	$lib_arg = remove_rel_paths($lib_arg);
+	my $is_lib_path = ( $lib_arg =~ /-L/ );
+	if( $is_lib_path || !entry_exists($lib_arg,\@cleaned_up_libs_first) ) {
+		# unshift undoes the reversal, so the original ordering is preserved
+		unshift @cleaned_up_libs_first, $lib_arg;
+	}
+}
+
+#
+# Pass 2: walk the survivors from left to right.  Entries without -L are always
+# kept; a -L entry is kept only the first time it appears.
+#
+my @cleaned_up_libs;
+foreach my $lib_arg ( @cleaned_up_libs_first ) {
+	$lib_arg = remove_rel_paths($lib_arg);
+	my $is_lib_path = ( $lib_arg =~ /-L/ );
+	if( $is_lib_path && entry_exists($lib_arg,\@cleaned_up_libs) ) {
+		# duplicate -L path: drop it
+		next;
+	}
+	push @cleaned_up_libs, $lib_arg;
+}
+#
+# Print the cleaned list on one line, separated by single spaces
+#
+print join( " ", @cleaned_up_libs );
+
+#
+# Subroutines
+#
+sub entry_exists {
+	# Returns 1 when the string $wanted equals some element of @$items, otherwise 0.
+	my ($wanted, $items) = @_;
+	for my $candidate ( @$items ) {
+		return 1 if $candidate eq $wanted;
+	}
+	return 0;
+}
+#
+sub remove_rel_paths {
+	# Collapse "dir/.." pairs in $entry_in; entries containing -L.. are returned unchanged.
+	my $entry_in = shift;
+	return $entry_in if $entry_in=~/-L\.\./;
+	my @new_paths;
+	foreach my $component ( split("/",$entry_in) ) {
+		if( $component eq '..' ) {
+			# A real parent reference cancels the component before it.
+			pop @new_paths;
+		}
+		else {
+			# Names that merely contain ".." (e.g. "v1..2") are ordinary components.
+			push @new_paths, $component;
+		}
+	}
+	return join("/",@new_paths);
+}
diff --git a/openmp-avx512/basic/optional/ThreadPool/config/tac_arg_check_mpi.m4 b/openmp-avx512/basic/optional/ThreadPool/config/tac_arg_check_mpi.m4
new file mode 100644
index 0000000..10d569a
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/config/tac_arg_check_mpi.m4
@@ -0,0 +1,68 @@
+dnl @synopsis TAC_ARG_CHECK_MPI
+dnl
+dnl Check to make sure any definitions set in TAC_ARG_CONFIG_MPI
+dnl are valid, set the MPI flags.  Test MPI compile using C++ compiler.
+dnl
+dnl @author Mike Heroux <mheroux@cs.sandia.gov>
+dnl
+AC_DEFUN([TAC_ARG_CHECK_MPI],
+[
+
+if test "X${HAVE_PKG_MPI}" = "Xyes"; then
+
+  if test -n "${MPI_DIR}" && test -z "${MPI_INC}"; then
+    MPI_INC="${MPI_DIR}/include"
+  fi
+
+  if test -n "${MPI_INC}"; then
+    CPPFLAGS="${CPPFLAGS} -I${MPI_INC}"
+  fi
+
+  AC_LANG_CPLUSPLUS  # the mpi.h test below only runs the C++ preprocessor
+  AC_MSG_CHECKING(for mpi.h)
+  AC_TRY_CPP([#include "mpi.h"],
+    [AC_MSG_RESULT(yes)],
+    [
+     AC_MSG_RESULT(no)
+     echo "-----"
+     echo "Cannot find mpi.h with the C++ preprocessor."
+     echo "Try --with-mpi-compilers to specify MPI compilers."
+     echo "Or try --with-mpi-libs, --with-mpi-incdir, --with-mpi-libdir"
+     echo "to specify all the specific MPI compile options."
+     echo "-----"
+     AC_MSG_ERROR([mpi.h not found])
+    ])
+
+  if test -n "${MPI_DIR}" && test -z "${MPI_LIBDIR}"; then
+    MPI_LIBDIR="${MPI_DIR}/lib"
+  fi
+
+  if test -n "${MPI_LIBDIR}"; then
+    LDFLAGS="${LDFLAGS} -L${MPI_LIBDIR}"
+  fi
+
+  if test -z "${MPI_LIBS}" && test -n "${MPI_LIBDIR}"; then
+    MPI_LIBS="-lmpi"
+  fi
+
+  if test -n "${MPI_LIBS}"; then
+    LIBS="${MPI_LIBS} ${LIBS}"
+  fi
+
+#   AC_LANG_CPLUSPLUS 
+#   AC_MSG_CHECKING(whether MPI will link using C++ compiler)
+#   AC_TRY_LINK([#include <mpi.h>],
+#   [int c; char** v; MPI_Init(&c,&v);],
+#   [AC_MSG_RESULT(yes)], 
+#   [AC_MSG_RESULT(no)  
+#    echo "-----"
+#    echo "Cannot link simple MPI program."
+#    echo "Try --with-mpi-cxx to specify MPI C++ compile script."
+#    echo "Or try --with-mpi-libs, --with-mpi-incdir, --with-mpi-libdir"
+#    echo "to specify all the specific MPI compile options."
+#    echo "-----"
+#    AC_MSG_ERROR(MPI cannot link)]
+#   )
+
+fi
+])
diff --git a/openmp-avx512/basic/optional/ThreadPool/config/tac_arg_config_mpi.m4 b/openmp-avx512/basic/optional/ThreadPool/config/tac_arg_config_mpi.m4
new file mode 100644
index 0000000..2d1dd98
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/config/tac_arg_config_mpi.m4
@@ -0,0 +1,188 @@
+dnl @synopsis TAC_ARG_CONFIG_MPI
+dnl
+dnl Test a variety of MPI options:
+dnl --enable-mpi       - Turns MPI compiling mode on
+dnl --with-mpi         - specify root directory of MPI
+dnl --with-mpi-compilers - Turns on MPI compiling mode and sets the MPI C++
+dnl                       compiler = mpicxx, mpic++ or mpiCC,
+dnl                       the MPI C compiler = mpicc and 
+dnl                       the MPI Fortran compiler = mpif77
+dnl --with-mpi-incdir - specify include directory for MPI 
+dnl --with-mpi-libs    - specify MPI libraries
+dnl --with-mpi-libdir  - specify location of MPI libraries
+dnl
+dnl If any of these options are set, HAVE_MPI will be defined for both
+dnl Autoconf and Automake, and HAVE_MPI will be defined in the
+dnl generated config.h file
+dnl
+dnl
+dnl @author Mike Heroux <maherou@sandia.gov>
+dnl Modified 12/26/2007 by Jim Willenbring to skip the Fortran compiler
+dnl check if Fortran is not enabled.
+dnl
+AC_DEFUN([TAC_ARG_CONFIG_MPI],
+[
+
+AC_ARG_ENABLE(mpi,
+[AC_HELP_STRING([--enable-mpi],[MPI support])],
+[HAVE_PKG_MPI=$enableval],
+[HAVE_PKG_MPI=no]
+)
+
+AC_ARG_WITH(mpi-compilers,
+[AC_HELP_STRING([--with-mpi-compilers=PATH],
+[use MPI compilers mpicc, mpif77, and mpicxx, mpic++ or mpiCC in the specified path or in the default path if no path is specified. Enables MPI])],
+[
+  if test X${withval} != Xno; then
+    HAVE_PKG_MPI=yes
+    if test X${withval} = Xyes; then
+      # Check for mpicxx, if it does not exist, check for mpic++, if it does 
+      # not exist, use mpiCC instead.
+      AC_CHECK_PROG(MPI_TEMP_CXX, mpicxx, mpicxx, no)
+      if test X${MPI_TEMP_CXX} = Xno; then
+	AC_CHECK_PROG(MPI_CXX, mpic++, mpic++, mpiCC)
+      else 
+	MPI_CXX=${MPI_TEMP_CXX}
+      fi
+      MPI_CC=mpicc
+      MPI_F77=mpif77
+    else
+      if test -f ${withval}/mpicxx; then
+        MPI_CXX=${withval}/mpicxx
+      elif test -f ${withval}/mpic++; then
+	MPI_CXX=${withval}/mpic++
+      else
+        MPI_CXX=${withval}/mpiCC
+      fi
+      MPI_CC=${withval}/mpicc
+      MPI_F77=${withval}/mpif77
+    fi
+  fi
+]
+)
+
+AC_ARG_WITH(mpi,
+[AC_HELP_STRING([--with-mpi=MPIROOT],[use MPI root directory (enables MPI)])],
+[
+  if test "X${withval}" != "Xno"; then  # --without-mpi arrives here as withval=no
+    HAVE_PKG_MPI=yes; MPI_DIR=${withval}
+    AC_MSG_CHECKING(MPI directory)
+    AC_MSG_RESULT([${MPI_DIR}])
+  fi
+])
+
+#AC_ARG_WITH(mpi-include,
+#[AC_HELP_STRING([--with-mpi-include],[Obsolete.  Use --with-mpi-incdir=DIR instead.  Do not prefix DIR with '-I'.])],
+#[AC_MSG_ERROR([--with-mpi-include is an obsolte option.   Use --with-mpi-incdir=DIR instead.  Do not prefix DIR with '-I'.  For example '--with-mpi-incdir=/usr/lam_path/include'.])]
+#)
+
+AC_ARG_WITH(mpi-libs,
+[AC_HELP_STRING([--with-mpi-libs="LIBS"],[MPI libraries @<:@"-lmpi"@:>@])],
+[
+  MPI_LIBS=${withval}
+  AC_MSG_CHECKING(user-defined MPI libraries)
+  AC_MSG_RESULT([${MPI_LIBS}])
+]
+)
+
+AC_ARG_WITH(mpi-incdir,
+[AC_HELP_STRING([--with-mpi-incdir=DIR],[MPI include directory @<:@MPIROOT/include@:>@  Do not use -I])],
+[
+  MPI_INC=${withval}
+  AC_MSG_CHECKING(user-defined MPI includes)
+  AC_MSG_RESULT([${MPI_INC}])
+]
+)
+
+AC_ARG_WITH(mpi-libdir,
+[AC_HELP_STRING([--with-mpi-libdir=DIR],[MPI library directory @<:@MPIROOT/lib@:>@  Do not use -L])],
+[
+  MPI_LIBDIR=${withval}
+  AC_MSG_CHECKING(user-defined MPI library directory)
+  AC_MSG_RESULT([${MPI_LIBDIR}])
+]
+)
+
+AC_MSG_CHECKING(whether we are using MPI)
+AC_MSG_RESULT([${HAVE_PKG_MPI}])
+
+if test "X${HAVE_PKG_MPI}" = "Xyes"; then
+   AC_DEFINE(HAVE_MPI,,[define if we want to use MPI])
+fi
+
+dnl Define Automake version of HAVE_MPI if appropriate
+
+AM_CONDITIONAL(HAVE_MPI, [test "X${HAVE_PKG_MPI}" = "Xyes"])
+
+
+dnl
+dnl --------------------------------------------------------------------
+dnl Check for MPI compilers (must be done *before* AC_PROG_CXX,
+dnl AC_PROG_CC and AC_PROG_F77)
+dnl 
+dnl --------------------------------------------------------------------
+
+if test -n "${MPI_CXX}"; then
+  if test -f ${MPI_CXX}; then
+    MPI_CXX_EXISTS=yes
+  else
+    AC_CHECK_PROG(MPI_CXX_EXISTS, ${MPI_CXX}, yes, no)
+  fi
+
+  if test "X${MPI_CXX_EXISTS}" = "Xyes"; then
+    CXX=${MPI_CXX}
+  else
+    echo "-----"
+    echo "Cannot find MPI C++ compiler ${MPI_CXX}."
+    echo "Specify a path to all mpi compilers with --with-mpi-compilers=PATH"
+    echo "or specify a C++ compiler using CXX=<compiler>"
+    echo "Do not use --with-mpi-compilers if using CXX=<compiler>"
+    echo "-----"
+    AC_MSG_ERROR([MPI C++ compiler (${MPI_CXX}) not found.])
+  fi
+fi
+
+if test -n "${MPI_CC}"; then
+  if test -f ${MPI_CC}; then
+    MPI_CC_EXISTS=yes
+  else
+    AC_CHECK_PROG(MPI_CC_EXISTS, ${MPI_CC}, yes, no)
+  fi
+
+  if test "X${MPI_CC_EXISTS}" = "Xyes"; then
+    CC=${MPI_CC}
+  else
+    echo "-----"
+    echo "Cannot find MPI C compiler ${MPI_CC}."
+    echo "Specify a path to all mpi compilers with --with-mpi-compilers=PATH"
+    echo "or specify a C compiler using CC=<compiler>"
+    echo "Do not use --with-mpi-compilers if using CC=<compiler>"
+    echo "-----"
+    AC_MSG_ERROR([MPI C compiler (${MPI_CC}) not found.])
+  fi
+fi
+
+if test "X$ac_cv_use_fortran" = "Xyes"; then
+
+if test -n "${MPI_F77}"; then
+  if test -f ${MPI_F77}; then
+    MPI_F77_EXISTS=yes
+  else
+    AC_CHECK_PROG(MPI_F77_EXISTS, ${MPI_F77}, yes, no)
+  fi
+
+  if test "X${MPI_F77_EXISTS}" = "Xyes"; then
+    F77=${MPI_F77}
+  else
+    echo "-----"
+    echo "Cannot find MPI Fortran compiler ${MPI_F77}."
+    echo "Specify a path to all mpi compilers with --with-mpi-compilers=PATH"
+    echo "or specify a Fortran 77 compiler using F77=<compiler>"
+    echo "Do not use --with-mpi-compilers if using F77=<compiler>"
+    echo "-----"
+    AC_MSG_ERROR([MPI Fortran 77 compiler (${MPI_F77}) not found.])
+  fi
+fi
+
+fi dnl ac_cv_use_fortran
+])
diff --git a/openmp-avx512/basic/optional/ThreadPool/config/tac_arg_enable_export-makefiles.m4 b/openmp-avx512/basic/optional/ThreadPool/config/tac_arg_enable_export-makefiles.m4
new file mode 100644
index 0000000..b7a8b38
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/config/tac_arg_enable_export-makefiles.m4
@@ -0,0 +1,76 @@
+dnl Enables export makefile specific code
+dnl 
+dnl The following AM_CONDITIONALS are set for makefiles to access:
+dnl USING_EXPORT_MAKEFILES
+dnl USING_PERL via TAC_ARG_WITH_PERL
+dnl USING_GNUMAKE
+dnl
+dnl The following AC_DEFINES are set:
+dnl HAVE_EXPORT_MAKEFILES
+dnl 
+dnl the following variables are set:
+dnl PERL_EXE for the perl executable via TAC_ARG_WITH_PERL
+dnl 
+dnl This file was based on tac_arg_enable_feature.m4 by Mike Heroux
+dnl @author Roger Pawlowski <rppawlo@sandia.gov>
+dnl
+AC_DEFUN([TAC_ARG_ENABLE_EXPORT_MAKEFILES],
+[
+AC_ARG_ENABLE(export-makefiles,
+AC_HELP_STRING([--enable-export-makefiles],[Creates export makefiles in the install (prefix) directory.  This option requires perl to be set in your path or defined with --with-perl=<perl executable>. Note that the export makefiles are always created and used in the build directory, but will not be installable without this option to change the paths. (default is $1)]),
+ac_cv_use_export_makefiles=$enableval, 
+ac_cv_use_export_makefiles=$1)
+
+AC_MSG_CHECKING(whether to build export makefiles)
+
+if test "X$ac_cv_use_export_makefiles" != "Xno"; then
+
+  AC_MSG_RESULT(yes)
+  AC_DEFINE([HAVE_EXPORT_MAKEFILES],,[Define if you want to build export makefiles.])
+
+else
+
+  AC_MSG_RESULT(no)
+
+fi
+
+AM_CONDITIONAL(USING_EXPORT_MAKEFILES, test X${ac_cv_use_export_makefiles} = Xyes)
+
+# Check for perl to run scripts (Required dependency)
+TAC_ARG_WITH_PERL
+
+if test "X$HAVE_PERL" != "Xyes" && 
+   test "X$ac_cv_use_export_makefiles" != "Xno"; then
+  AC_MSG_RESULT(no)
+  AC_MSG_ERROR([Failed to find the perl executable.  The flag --enable-export-makefiles requires perl to be either in your path or explicitly defined by the flag --with-perl=<executable>.  If you do not require the export makefiles to be installed via 'make install', you can disable the export makefiles with --disable-export-makefiles.])
+fi
+
+# Check for using gnumake to clean up link lines via 
+# gnumake's "shell" command. Optional dependency.
+AC_DEFUN([TAC_ARG_WITH_GNUMAKE],
+[
+AC_ARG_WITH(gnumake,
+AC_HELP_STRING([--with-gnumake],[Gnu's make has special functions we can use to eliminate redundant paths in the build and link lines. Enable this if you use gnu-make to build Trilinos. This requires that perl is in your path or that you have specified the perl executable with --with-perl=<perl executable>.  Configure will check for the existence of the perl executable and quit with an error if it is not found. (default is no)]),
+ac_cv_use_gnumake=$withval, ac_cv_use_gnumake=no)
+
+AC_MSG_CHECKING(whether gnumake specific code should be enabled)
+
+if test "X$ac_cv_use_gnumake" != "Xno"; then
+  AC_MSG_RESULT(yes)  
+  AC_DEFINE([HAVE_GNUMAKE],,[Define if you are using gnumake - this will shorten your link lines.])
+else
+  AC_MSG_RESULT(no)
+fi
+AM_CONDITIONAL(USING_GNUMAKE, test "X$ac_cv_use_gnumake" = "Xyes")
+])
+
+TAC_ARG_WITH_GNUMAKE
+
+if test "X$HAVE_PERL" != "Xyes" && 
+   test "X$ac_cv_use_gnumake" != "Xno"; then
+  AC_MSG_RESULT(no)
+  AC_MSG_ERROR([The flag --with-gnumake requires perl to be in your path.  The perl executable can alternatively be explicitly defined by the flag --with-perl=<executable>.])
+fi
+
+])
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/config/tac_arg_enable_feature.m4 b/openmp-avx512/basic/optional/ThreadPool/config/tac_arg_enable_feature.m4
new file mode 100644
index 0000000..4e22753
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/config/tac_arg_enable_feature.m4
@@ -0,0 +1,40 @@
+dnl @synopsis TAC_ARG_ENABLE_FEATURE(FEATURE_NAME, FEATURE_DESCRIPTION, HAVE_NAME, DEFAULT_VAL)
+dnl
+dnl Test for --enable-${FEATURE_NAME} and set to DEFAULT_VAL value if feature not specified.
+dnl Also calls AC_DEFINE to define HAVE_${HAVE_NAME} if value is not equal to "no"
+dnl 
+dnl Use this macro to help define whether or not optional
+dnl features* should be compiled.  For example:
+dnl
+dnl TAC_ARG_ENABLE_FEATURE(epetra, [Configure and build epetra], EPETRA, yes)
+dnl 
+dnl will test for --enable-epetra when configure is run.  If it is defined 
+dnl and not set to "no" or not defined (default is "yes") then HAVE_EPETRA will
+dnl be defined, if --enable-epetra is defined to be "no", HAVE_EPETRA will not
+dnl be defined.
+dnl
+dnl *NOTE: epetra, aztecoo, komplex, ifpack, and other software found in
+dnl subdirectories of Trilinos/packages are "packages" in their own right.
+dnl However, these packages are also "features" of the larger package
+dnl "Trilinos".  Therefore, when configuring from the Trilinos directory,
+dnl it is appropriate to refer to these software packages as "features".
+dnl
+dnl This file was based on tac_arg_with_package.m4 by Mike Heroux
+dnl @author James Willenbring <jmwille@sandia.gov>
+dnl
+AC_DEFUN([TAC_ARG_ENABLE_FEATURE],
+[
+AC_ARG_ENABLE([$1],
+AC_HELP_STRING([--enable-$1],[$2 (default is [$4])]),
+ac_cv_use_$1=$enableval, ac_cv_use_$1=$4)
+
+AC_MSG_CHECKING(whether to use [$1])
+
+if test "X$ac_cv_use_$1" != "Xno"; then
+  AC_MSG_RESULT(yes)  
+  AC_DEFINE([HAVE_$3],,[Define if want to build $1])
+else
+  AC_MSG_RESULT(no)
+fi
+])
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/config/tac_arg_enable_feature_sub_check.m4 b/openmp-avx512/basic/optional/ThreadPool/config/tac_arg_enable_feature_sub_check.m4
new file mode 100755
index 0000000..b3876fd
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/config/tac_arg_enable_feature_sub_check.m4
@@ -0,0 +1,54 @@
+dnl @synopsis TAC_ARG_ENABLE_FEATURE_SUB_CHECK(FEATURE_NAME, SUB_FEATURE_NAME, FEATURE_DESCRIPTION, HAVE_NAME)
+dnl
+dnl This hack gets around the fact that TAC_ARG_ENABLE_FEATURE does not support underscores
+dnl in its feature names.  TAC_ARG_ENABLE_FEATURE_SUB_CHECK allows exactly one underscore.  Not great,
+dnl but arguably better than supporting no underscores.
+dnl
+dnl TAC_ARG_ENABLE_FEATURE(feature-sub, [Configure and build feature-sub], FEATURE_SUB, yes) 
+dnl   fails because tac_arg_enable_feature tests for ac_cv_use_feature-sub which gets 
+dnl   rejected because the `-' is not allowed in variables.  (AC_ARG_ENABLE sets ac_cv_use_feature_sub
+dnl   to avoid this problem.)  Use:
+dnl 
+dnl TAC_ARG_ENABLE_FEATURE_SUB_CHECK(feature, sub, [Configure and build feature-sub], FEATURE_SUB) 
+dnl   instead.
+dnl
+dnl This macro will test for --enable-${FEATURE_NAME}-${SUB_FEATURE_NAME} when configure is run.  
+dnl If it is defined and not set to "no" or not defined and --disable-${SUB_FEATURE_NAME} is not
+dnl specified then HAVE_${HAVE_NAME} will be defined.
+dnl
+dnl *NOTE: This macro is designed for the use-case when there is an individual Trilinos package 
+dnl offering fine-grained control of a Trilinos option.  This way, the individual package 
+dnl option is enabled, as long as the Trilinos option is not disabled.  If the Trilinos option is
+dnl disabled, then the user must enable each package's option individually.  For instance:
+dnl
+dnl --disable-tests --enable-teuchos-tests
+dnl
+dnl *NOTE: epetra, aztecoo, komplex, ifpack, and other software found in
+dnl subdirectories of Trilinos/packages are "packages" in their own right.
+dnl However, these packages are also "features" of the larger package
+dnl "Trilinos".  Therefore, when configuring from the Trilinos directory,
+dnl it is appropriate to refer to these software packages as "features".
+dnl
+dnl This file was based on tac_arg_enable_package.m4 by Jim Willenbring
+dnl and tac_arg_enable_package_sub.m4 by Ken Stanley.
+dnl
+dnl @author Heidi Thornquist <hkthorn@sandia.gov>
+dnl
+AC_DEFUN([TAC_ARG_ENABLE_FEATURE_SUB_CHECK],
+[
+AC_ARG_ENABLE([$2],, ac_cv_use_$2=$enableval, ac_cv_use_$2=yes)
+
+AC_ARG_ENABLE([$1-$2],
+AC_HELP_STRING([--enable-$1-$2],[$3 (default is yes if --disable-$2 is not specified)]),
+ac_cv_use_$1_$2=$enableval, ac_cv_use_$1_$2=${ac_cv_use_$2})
+
+AC_MSG_CHECKING(whether to use [$1-$2])
+
+if test "X$ac_cv_use_$1_$2" != "Xno"; then
+  AC_MSG_RESULT(yes)  
+  AC_DEFINE([HAVE_$4],,[Define if want to build $1-$2])
+else
+  AC_MSG_RESULT(no)
+fi
+])
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/config/tac_arg_with_ar.m4 b/openmp-avx512/basic/optional/ThreadPool/config/tac_arg_with_ar.m4
new file mode 100644
index 0000000..9568f3e
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/config/tac_arg_with_ar.m4
@@ -0,0 +1,39 @@
+dnl @synopsis TAC_ARG_WITH_AR
+dnl
+dnl Test for --with-ar="ar_program ar_flags".
+dnl Default is "ar cru"
+dnl 
+dnl Generates an Automake conditional USE_ALTERNATE_AR that can be tested.  
+dnl Generates the user-specified archiver command in @ALTERNATE_AR@.
+dnl
+dnl @author Mike Heroux <mheroux@cs.sandia.gov>
+dnl
+AC_DEFUN([TAC_ARG_WITH_AR],
+[
+AC_ARG_WITH(ar,
+AC_HELP_STRING([--with-ar], [override archiver command (default is "ar cru")]),
+[
+AC_MSG_CHECKING(user-defined archiver)
+AC_MSG_RESULT([${withval}])
+USE_ALTERNATE_AR=yes
+ALTERNATE_AR="${withval}"
+]
+)
+
+if test -n "${SPECIAL_AR}" && test "X${USE_ALTERNATE_AR}" != "Xyes";
+then
+  USE_ALTERNATE_AR=yes
+  ALTERNATE_AR="${SPECIAL_AR}"
+fi
+
+AC_MSG_CHECKING(for special archiver command)
+if test "X${USE_ALTERNATE_AR}" = "Xyes"; then
+   AC_MSG_RESULT([${ALTERNATE_AR}])
+   AM_CONDITIONAL(USE_ALTERNATE_AR, true)
+else
+   AC_MSG_RESULT([none])
+   AM_CONDITIONAL(USE_ALTERNATE_AR, false)
+fi
+AC_SUBST(ALTERNATE_AR)
+])
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/config/tac_arg_with_flags.m4 b/openmp-avx512/basic/optional/ThreadPool/config/tac_arg_with_flags.m4
new file mode 100644
index 0000000..256450a
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/config/tac_arg_with_flags.m4
@@ -0,0 +1,31 @@
+dnl @synopsis TAC_ARG_WITH_FLAGS(lcase_name, UCASE_NAME)
+dnl
+dnl Test for --with-lcase_name="compiler/loader flags".  if defined, prepend 
+dnl flags to standard UCASE_NAME definition.
+dnl
+dnl Use this macro to facilitate additional special flags that should be
+dnl passed on to the preprocessor/compilers/loader.
+dnl
+dnl Example use
+dnl 
+dnl TAC_ARG_WITH_FLAGS(cxxflags, CXXFLAGS)
+dnl 
+dnl tests for --with-cxxflags and pre-pends to CXXFLAGS
+dnl 
+dnl
+dnl @author Mike Heroux <mheroux@cs.sandia.gov>
+dnl
+AC_DEFUN([TAC_ARG_WITH_FLAGS],
+[
+AC_MSG_CHECKING([whether additional [$2] flags should be added])
+AC_ARG_WITH($1,
+AC_HELP_STRING([--with-$1], 
+[additional [$2] flags to be added: will prepend to [$2]]),
+[
+$2="${withval} ${$2}"
+AC_MSG_RESULT([$2 = ${$2}])
+],
+AC_MSG_RESULT(no)
+)
+])
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/config/tac_arg_with_incdirs.m4 b/openmp-avx512/basic/optional/ThreadPool/config/tac_arg_with_incdirs.m4
new file mode 100644
index 0000000..f3092e5
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/config/tac_arg_with_incdirs.m4
@@ -0,0 +1,24 @@
+dnl @synopsis TAC_ARG_WITH_INCDIRS
+dnl
+dnl Test for --with-incdirs="-Iincdir1 -Iincdir2".  if defined, prepend 
+dnl "-Iincdir1 -Iincdir2" to CPPFLAGS
+dnl
+dnl Use this macro to facilitate addition of directories to include file search path.
+dnl 
+dnl
+dnl @author Mike Heroux <mheroux@cs.sandia.gov>
+dnl
+AC_DEFUN([TAC_ARG_WITH_INCDIRS],
+[
+AC_MSG_CHECKING([whether additional include search paths defined])
+AC_ARG_WITH(incdirs,
+AC_HELP_STRING([--with-incdirs], 
+[additional directories containing include files: will prepend to search here for includes, use -Idir format]),
+[
+CPPFLAGS="${withval} ${CPPFLAGS}"
+AC_MSG_RESULT([${withval}])
+],
+AC_MSG_RESULT(no)
+)
+])
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/config/tac_arg_with_libdirs.m4 b/openmp-avx512/basic/optional/ThreadPool/config/tac_arg_with_libdirs.m4
new file mode 100644
index 0000000..b2f9438
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/config/tac_arg_with_libdirs.m4
@@ -0,0 +1,24 @@
+dnl @synopsis TAC_ARG_WITH_LIBDIRS
+dnl
+dnl Test for --with-libdirs="-Llibdir1 -Llibdir2".  if defined, 
+dnl prepend "-Llibdir1 -Llibdir2" to LDFLAGS
+dnl
+dnl Use this macro to facilitate addition of directories to library search path.
+dnl 
+dnl
+dnl @author Mike Heroux <mheroux@cs.sandia.gov>
+dnl
+AC_DEFUN([TAC_ARG_WITH_LIBDIRS],
+[
+AC_MSG_CHECKING([whether additional library search paths defined])
+AC_ARG_WITH(libdirs,
+AC_HELP_STRING([--with-libdirs], 
+[OBSOLETE use --with-ldflags instead.  (ex. --with-ldflags="-L<DIR> -L<DIR2>")]),
+[
+LDFLAGS="${withval} ${LDFLAGS}"
+AC_MSG_RESULT([${withval}])
+],
+AC_MSG_RESULT(no)
+)
+])
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/config/tac_arg_with_libs.m4 b/openmp-avx512/basic/optional/ThreadPool/config/tac_arg_with_libs.m4
new file mode 100644
index 0000000..3a64880
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/config/tac_arg_with_libs.m4
@@ -0,0 +1,30 @@
+dnl @synopsis TAC_ARG_WITH_LIBS
+dnl
+dnl Test for --with-libs="name(s)".
+dnl 
+dnl Prepends the specified name(s) to the list of libraries to link 
+dnl with.  
+dnl
+dnl Example use
+dnl
+dnl TAC_ARG_WITH_LIBS
+dnl 
+dnl tests for --with-libs and pre-pends to LIBS
+dnl
+dnl @author Jim Willenbring <jmwille@sandia.gov>
+dnl
+AC_DEFUN([TAC_ARG_WITH_LIBS],
+[
+AC_MSG_CHECKING([whether additional libraries are needed])
+AC_ARG_WITH(libs,
+AC_HELP_STRING([--with-libs], 
+[List additional libraries here.  For example, --with-libs=-lsuperlu
+or --with-libs=/path/libsuperlu.a]),
+[
+LIBS="${withval} ${LIBS}"
+AC_MSG_RESULT([LIBS = ${LIBS}])
+],
+AC_MSG_RESULT(no)
+)
+]
+)
diff --git a/openmp-avx512/basic/optional/ThreadPool/config/tac_arg_with_perl.m4 b/openmp-avx512/basic/optional/ThreadPool/config/tac_arg_with_perl.m4
new file mode 100644
index 0000000..63e74ba
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/config/tac_arg_with_perl.m4
@@ -0,0 +1,34 @@
+dnl @synopsis TAC_ARG_WITH_PERL
+dnl
+dnl Test for --with-perl=<perl executable>; if it is not given, look for "perl" in the path.
+dnl Sets HAVE_PERL to yes or no and substitutes PERL_EXE.
+dnl Calls AM_CONDITIONAL to define USING_PERL to true/false.
+dnl 
+dnl This file was based on tac_arg_with_ar.m4 by Mike Heroux
+dnl @author Roger Pawlowski <rppawlo@sandia.gov>
+dnl
+AC_DEFUN([TAC_ARG_WITH_PERL],
+[
+
+AC_ARG_WITH(perl,
+AC_HELP_STRING([--with-perl], [supply a perl executable.  For example --with-perl=/usr/bin/perl.]),
+[
+AC_MSG_CHECKING(for user supplied perl executable)
+AC_MSG_RESULT([${withval}])
+USER_SPECIFIED_PERL=yes
+PERL_EXE="${withval}"
+],
+[
+USER_SPECIFIED_PERL=no
+])
+
+if test "X${USER_SPECIFIED_PERL}" = "Xyes"; then
+  AC_CHECK_FILE(${PERL_EXE}, [HAVE_PERL=yes], [HAVE_PERL=no])
+  AC_SUBST(PERL_EXE, ${PERL_EXE})
+else
+  AC_CHECK_PROG(HAVE_PERL, perl, yes, no)
+  AC_SUBST(PERL_EXE, perl)
+fi
+AM_CONDITIONAL(USING_PERL, test X${HAVE_PERL} = Xyes)
+])
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/config/token-replace.pl b/openmp-avx512/basic/optional/ThreadPool/config/token-replace.pl
new file mode 100755
index 0000000..c3b413e
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/config/token-replace.pl
@@ -0,0 +1,43 @@
+#!/usr/bin/perl -w
+#
+# This perl script replaces a string with another string
+# on a token basis.  Here it is allowed for file_in and
+# file_out to be the same file.
+#
+use strict;
+#
+my $g_use_msg =
+  "Use: token-replace.pl find_token replacement_token file_in file_out\n";
+if( scalar(@ARGV) < 4 ) {
+  print STDERR $g_use_msg;
+  exit(-1);
+}
+#
+my $find_token         = shift;
+my $replacement_token  = shift;
+my $file_in_name       = shift;
+my $file_out_name      = shift;
+#
+# Input files whose name matches /CVS/ are skipped without any output.
+if($file_in_name=~/CVS/) {
+#  print "Do not replace in CVS\n";
+  exit;
+}
+open(FILE_IN, "<", $file_in_name) or die "The file $file_in_name could not be opened for input: $!\n";
+my @file_in_array = <FILE_IN>;
+close FILE_IN;
+#
+# A token may not touch a word character on either side.  Zero-width look-arounds
+# leave the delimiters unconsumed, so "tok tok" has both occurrences replaced.
+my $match_str = '(?<!\w)(?:' . $find_token . ')(?!\w)';
+my @file_out_array;
+my $did_replacement = 0;
+foreach(@file_in_array) {
+  $did_replacement = 1 if $_=~s/$match_str/$replacement_token/g;
+  push @file_out_array, $_;
+}
+if($did_replacement || $file_out_name ne $file_in_name) {
+  open(FILE_OUT, ">", $file_out_name) or die "The file $file_out_name could not be opened for output: $!\n";
+  print FILE_OUT @file_out_array;
+  close FILE_OUT;
+}
diff --git a/openmp-avx512/basic/optional/ThreadPool/configure b/openmp-avx512/basic/optional/ThreadPool/configure
new file mode 100755
index 0000000..6312db9
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/configure
@@ -0,0 +1,7804 @@
+#! /bin/sh
+# Guess values for system-dependent variables and create Makefiles.
+# Generated by GNU Autoconf 2.61 for ThreadPool 1.1d.
+#
+# Report bugs to <hcedwar@sandia.gov>.
+#
+# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
+# 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
+# This configure script is free software; the Free Software Foundation
+# gives unlimited permission to copy, distribute and modify it.
+## --------------------- ##
+## M4sh Initialization.  ##
+## --------------------- ##
+
+# Be more Bourne compatible
+DUALCASE=1; export DUALCASE # for MKS sh
+if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then
+  emulate sh
+  NULLCMD=:
+  # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which
+  # is contrary to our usage.  Disable this feature.
+  alias -g '${1+"$@"}'='"$@"'
+  setopt NO_GLOB_SUBST
+else
+  case `(set -o) 2>/dev/null` in
+  *posix*) set -o posix ;;
+esac
+
+fi
+
+
+
+
+# PATH needs CR
+# Avoid depending upon Character Ranges.
+as_cr_letters='abcdefghijklmnopqrstuvwxyz'
+as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+as_cr_Letters=$as_cr_letters$as_cr_LETTERS
+as_cr_digits='0123456789'
+as_cr_alnum=$as_cr_Letters$as_cr_digits
+
+# The user is always right.
+if test "${PATH_SEPARATOR+set}" != set; then
+  echo "#! /bin/sh" >conf$$.sh
+  echo  "exit 0"   >>conf$$.sh
+  chmod +x conf$$.sh
+  if (PATH="/nonexistent;."; conf$$.sh) >/dev/null 2>&1; then
+    PATH_SEPARATOR=';'
+  else
+    PATH_SEPARATOR=:
+  fi
+  rm -f conf$$.sh
+fi
+
+# Support unset when possible.
+if ( (MAIL=60; unset MAIL) || exit) >/dev/null 2>&1; then
+  as_unset=unset
+else
+  as_unset=false
+fi
+
+
+# IFS
+# We need space, tab and new line, in precisely that order.  Quoting is
+# there to prevent editors from complaining about space-tab.
+# (If _AS_PATH_WALK were called with IFS unset, it would disable word
+# splitting by setting IFS to empty value.)
+as_nl='
+'
+IFS=" ""	$as_nl"
+
+# Find who we are.  Look in the path if we contain no directory separator.
+case $0 in
+  *[\\/]* ) as_myself=$0 ;;
+  *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+  test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break
+done
+IFS=$as_save_IFS
+
+     ;;
+esac
+# We did not find ourselves, most probably we were run as `sh COMMAND'
+# in which case we are not to be found in the path.
+if test "x$as_myself" = x; then
+  as_myself=$0
+fi
+if test ! -f "$as_myself"; then
+  echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2
+  { (exit 1); exit 1; }
+fi
+
+# Work around bugs in pre-3.0 UWIN ksh.
+for as_var in ENV MAIL MAILPATH
+do ($as_unset $as_var) >/dev/null 2>&1 && $as_unset $as_var
+done
+PS1='$ '
+PS2='> '
+PS4='+ '
+
+# NLS nuisances.
+for as_var in \
+  LANG LANGUAGE LC_ADDRESS LC_ALL LC_COLLATE LC_CTYPE LC_IDENTIFICATION \
+  LC_MEASUREMENT LC_MESSAGES LC_MONETARY LC_NAME LC_NUMERIC LC_PAPER \
+  LC_TELEPHONE LC_TIME
+do
+  if (set +x; test -z "`(eval $as_var=C; export $as_var) 2>&1`"); then
+    eval $as_var=C; export $as_var
+  else
+    ($as_unset $as_var) >/dev/null 2>&1 && $as_unset $as_var
+  fi
+done
+
+# Required to use basename.
+if expr a : '\(a\)' >/dev/null 2>&1 &&
+   test "X`expr 00001 : '.*\(...\)'`" = X001; then
+  as_expr=expr
+else
+  as_expr=false
+fi
+
+if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then
+  as_basename=basename
+else
+  as_basename=false
+fi
+
+
+# Name of the executable.
+as_me=`$as_basename -- "$0" ||
+$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \
+	 X"$0" : 'X\(//\)$' \| \
+	 X"$0" : 'X\(/\)' \| . 2>/dev/null ||
+echo X/"$0" |
+    sed '/^.*\/\([^/][^/]*\)\/*$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\/\(\/\/\)$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\/\(\/\).*/{
+	    s//\1/
+	    q
+	  }
+	  s/.*/./; q'`
+
+# CDPATH.
+$as_unset CDPATH
+
+
+if test "x$CONFIG_SHELL" = x; then
+  if (eval ":") 2>/dev/null; then
+  as_have_required=yes
+else
+  as_have_required=no
+fi
+
+  if test $as_have_required = yes && 	 (eval ":
+(as_func_return () {
+  (exit \$1)
+}
+as_func_success () {
+  as_func_return 0
+}
+as_func_failure () {
+  as_func_return 1
+}
+as_func_ret_success () {
+  return 0
+}
+as_func_ret_failure () {
+  return 1
+}
+
+exitcode=0
+if as_func_success; then
+  :
+else
+  exitcode=1
+  echo as_func_success failed.
+fi
+
+if as_func_failure; then
+  exitcode=1
+  echo as_func_failure succeeded.
+fi
+
+if as_func_ret_success; then
+  :
+else
+  exitcode=1
+  echo as_func_ret_success failed.
+fi
+
+if as_func_ret_failure; then
+  exitcode=1
+  echo as_func_ret_failure succeeded.
+fi
+
+if ( set x; as_func_ret_success y && test x = \"\$1\" ); then
+  :
+else
+  exitcode=1
+  echo positional parameters were not saved.
+fi
+
+test \$exitcode = 0) || { (exit 1); exit 1; }
+
+(
+  as_lineno_1=\$LINENO
+  as_lineno_2=\$LINENO
+  test \"x\$as_lineno_1\" != \"x\$as_lineno_2\" &&
+  test \"x\`expr \$as_lineno_1 + 1\`\" = \"x\$as_lineno_2\") || { (exit 1); exit 1; }
+") 2> /dev/null; then
+  :
+else
+  as_candidate_shells=
+    as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in /bin$PATH_SEPARATOR/usr/bin$PATH_SEPARATOR$PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+  case $as_dir in
+	 /*)
+	   for as_base in sh bash ksh sh5; do
+	     as_candidate_shells="$as_candidate_shells $as_dir/$as_base"
+	   done;;
+       esac
+done
+IFS=$as_save_IFS
+
+
+      for as_shell in $as_candidate_shells $SHELL; do
+	 # Try only shells that exist, to save several forks.
+	 if { test -f "$as_shell" || test -f "$as_shell.exe"; } &&
+		{ ("$as_shell") 2> /dev/null <<\_ASEOF
+if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then
+  emulate sh
+  NULLCMD=:
+  # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which
+  # is contrary to our usage.  Disable this feature.
+  alias -g '${1+"$@"}'='"$@"'
+  setopt NO_GLOB_SUBST
+else
+  case `(set -o) 2>/dev/null` in
+  *posix*) set -o posix ;;
+esac
+
+fi
+
+
+:
+_ASEOF
+}; then
+  CONFIG_SHELL=$as_shell
+	       as_have_required=yes
+	       if { "$as_shell" 2> /dev/null <<\_ASEOF
+if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then
+  emulate sh
+  NULLCMD=:
+  # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which
+  # is contrary to our usage.  Disable this feature.
+  alias -g '${1+"$@"}'='"$@"'
+  setopt NO_GLOB_SUBST
+else
+  case `(set -o) 2>/dev/null` in
+  *posix*) set -o posix ;;
+esac
+
+fi
+
+
+:
+(as_func_return () {
+  (exit $1)
+}
+as_func_success () {
+  as_func_return 0
+}
+as_func_failure () {
+  as_func_return 1
+}
+as_func_ret_success () {
+  return 0
+}
+as_func_ret_failure () {
+  return 1
+}
+
+exitcode=0
+if as_func_success; then
+  :
+else
+  exitcode=1
+  echo as_func_success failed.
+fi
+
+if as_func_failure; then
+  exitcode=1
+  echo as_func_failure succeeded.
+fi
+
+if as_func_ret_success; then
+  :
+else
+  exitcode=1
+  echo as_func_ret_success failed.
+fi
+
+if as_func_ret_failure; then
+  exitcode=1
+  echo as_func_ret_failure succeeded.
+fi
+
+if ( set x; as_func_ret_success y && test x = "$1" ); then
+  :
+else
+  exitcode=1
+  echo positional parameters were not saved.
+fi
+
+test $exitcode = 0) || { (exit 1); exit 1; }
+
+(
+  as_lineno_1=$LINENO
+  as_lineno_2=$LINENO
+  test "x$as_lineno_1" != "x$as_lineno_2" &&
+  test "x`expr $as_lineno_1 + 1`" = "x$as_lineno_2") || { (exit 1); exit 1; }
+
+_ASEOF
+}; then
+  break
+fi
+
+fi
+
+      done
+
+      if test "x$CONFIG_SHELL" != x; then
+  for as_var in BASH_ENV ENV
+        do ($as_unset $as_var) >/dev/null 2>&1 && $as_unset $as_var
+        done
+        export CONFIG_SHELL
+        exec "$CONFIG_SHELL" "$as_myself" ${1+"$@"}
+fi
+
+
+    if test $as_have_required = no; then
+  echo This script requires a shell more modern than all the
+      echo shells that I found on your system.  Please install a
+      echo modern shell, or manually run the script under such a
+      echo shell if you do have one.
+      { (exit 1); exit 1; }
+fi
+
+
+fi
+
+fi
+
+
+
+(eval "as_func_return () {
+  (exit \$1)
+}
+as_func_success () {
+  as_func_return 0
+}
+as_func_failure () {
+  as_func_return 1
+}
+as_func_ret_success () {
+  return 0
+}
+as_func_ret_failure () {
+  return 1
+}
+
+exitcode=0
+if as_func_success; then
+  :
+else
+  exitcode=1
+  echo as_func_success failed.
+fi
+
+if as_func_failure; then
+  exitcode=1
+  echo as_func_failure succeeded.
+fi
+
+if as_func_ret_success; then
+  :
+else
+  exitcode=1
+  echo as_func_ret_success failed.
+fi
+
+if as_func_ret_failure; then
+  exitcode=1
+  echo as_func_ret_failure succeeded.
+fi
+
+if ( set x; as_func_ret_success y && test x = \"\$1\" ); then
+  :
+else
+  exitcode=1
+  echo positional parameters were not saved.
+fi
+
+test \$exitcode = 0") || {
+  echo No shell found that supports shell functions.
+  echo Please tell autoconf@gnu.org about your system,
+  echo including any error possibly output before this
+  echo message
+}
+
+
+
+  as_lineno_1=$LINENO
+  as_lineno_2=$LINENO
+  test "x$as_lineno_1" != "x$as_lineno_2" &&
+  test "x`expr $as_lineno_1 + 1`" = "x$as_lineno_2" || {
+
+  # Create $as_me.lineno as a copy of $as_myself, but with $LINENO
+  # uniformly replaced by the line number.  The first 'sed' inserts a
+  # line-number line after each line using $LINENO; the second 'sed'
+  # does the real work.  The second script uses 'N' to pair each
+  # line-number line with the line containing $LINENO, and appends
+  # trailing '-' during substitution so that $LINENO is not a special
+  # case at line end.
+  # (Raja R Harinath suggested sed '=', and Paul Eggert wrote the
+  # scripts with optimization help from Paolo Bonzini.  Blame Lee
+  # E. McMahon (1931-1989) for sed's syntax.  :-)
+  sed -n '
+    p
+    /[$]LINENO/=
+  ' <$as_myself |
+    sed '
+      s/[$]LINENO.*/&-/
+      t lineno
+      b
+      :lineno
+      N
+      :loop
+      s/[$]LINENO\([^'$as_cr_alnum'_].*\n\)\(.*\)/\2\1\2/
+      t loop
+      s/-\n.*//
+    ' >$as_me.lineno &&
+  chmod +x "$as_me.lineno" ||
+    { echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2
+   { (exit 1); exit 1; }; }
+
+  # Don't try to exec as it changes $[0], causing all sort of problems
+  # (the dirname of $[0] is not the place where we might find the
+  # original and so on.  Autoconf is especially sensitive to this).
+  . "./$as_me.lineno"
+  # Exit status is that of the last command.
+  exit
+}
+
+
+if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then
+  as_dirname=dirname
+else
+  as_dirname=false
+fi
+
+ECHO_C= ECHO_N= ECHO_T=
+case `echo -n x` in
+-n*)
+  case `echo 'x\c'` in
+  *c*) ECHO_T='	';;	# ECHO_T is single tab character.
+  *)   ECHO_C='\c';;
+  esac;;
+*)
+  ECHO_N='-n';;
+esac
+
+if expr a : '\(a\)' >/dev/null 2>&1 &&
+   test "X`expr 00001 : '.*\(...\)'`" = X001; then
+  as_expr=expr
+else
+  as_expr=false
+fi
+
+rm -f conf$$ conf$$.exe conf$$.file
+if test -d conf$$.dir; then
+  rm -f conf$$.dir/conf$$.file
+else
+  rm -f conf$$.dir
+  mkdir conf$$.dir
+fi
+echo >conf$$.file
+if ln -s conf$$.file conf$$ 2>/dev/null; then
+  as_ln_s='ln -s'
+  # ... but there are two gotchas:
+  # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail.
+  # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable.
+  # In both cases, we have to default to `cp -p'.
+  ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe ||
+    as_ln_s='cp -p'
+elif ln conf$$.file conf$$ 2>/dev/null; then
+  as_ln_s=ln
+else
+  as_ln_s='cp -p'
+fi
+rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file
+rmdir conf$$.dir 2>/dev/null
+
+if mkdir -p . 2>/dev/null; then
+  as_mkdir_p=:
+else
+  test -d ./-p && rmdir ./-p
+  as_mkdir_p=false
+fi
+
+if test -x / >/dev/null 2>&1; then
+  as_test_x='test -x'
+else
+  if ls -dL / >/dev/null 2>&1; then
+    as_ls_L_option=L
+  else
+    as_ls_L_option=
+  fi
+  as_test_x='
+    eval sh -c '\''
+      if test -d "$1"; then
+        test -d "$1/.";
+      else
+	case $1 in
+        -*)set "./$1";;
+	esac;
+	case `ls -ld'$as_ls_L_option' "$1" 2>/dev/null` in
+	???[sx]*):;;*)false;;esac;fi
+    '\'' sh
+  '
+fi
+as_executable_p=$as_test_x
+
+# Sed expression to map a string onto a valid CPP name.
+as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'"
+
+# Sed expression to map a string onto a valid variable name.
+as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'"
+
+
+
+exec 7<&0 </dev/null 6>&1
+
+# Name of the host.
+# hostname on some systems (SVR3.2, Linux) returns a bogus exit status,
+# so uname gets run too.
+ac_hostname=`(hostname || uname -n) 2>/dev/null | sed 1q`
+
+#
+# Initializations.
+#
+ac_default_prefix=/usr/local
+ac_clean_files=
+ac_config_libobj_dir=.
+LIBOBJS=
+cross_compiling=no
+subdirs=
+MFLAGS=
+MAKEFLAGS=
+SHELL=${CONFIG_SHELL-/bin/sh}
+
+# Identity of this package.
+PACKAGE_NAME='ThreadPool'
+PACKAGE_TARNAME='threadpool'
+PACKAGE_VERSION='1.1d'
+PACKAGE_STRING='ThreadPool 1.1d'
+PACKAGE_BUGREPORT='hcedwar@sandia.gov'
+
+ac_unique_file="src/TPI.c"
+# Factoring default headers for most tests.
+ac_includes_default="\
+#include <stdio.h>
+#ifdef HAVE_SYS_TYPES_H
+# include <sys/types.h>
+#endif
+#ifdef HAVE_SYS_STAT_H
+# include <sys/stat.h>
+#endif
+#ifdef STDC_HEADERS
+# include <stdlib.h>
+# include <stddef.h>
+#else
+# ifdef HAVE_STDLIB_H
+#  include <stdlib.h>
+# endif
+#endif
+#ifdef HAVE_STRING_H
+# if !defined STDC_HEADERS && defined HAVE_MEMORY_H
+#  include <memory.h>
+# endif
+# include <string.h>
+#endif
+#ifdef HAVE_STRINGS_H
+# include <strings.h>
+#endif
+#ifdef HAVE_INTTYPES_H
+# include <inttypes.h>
+#endif
+#ifdef HAVE_STDINT_H
+# include <stdint.h>
+#endif
+#ifdef HAVE_UNISTD_H
+# include <unistd.h>
+#endif"
+
+ac_subst_vars='SHELL
+PATH_SEPARATOR
+PACKAGE_NAME
+PACKAGE_TARNAME
+PACKAGE_VERSION
+PACKAGE_STRING
+PACKAGE_BUGREPORT
+exec_prefix
+prefix
+program_transform_name
+bindir
+sbindir
+libexecdir
+datarootdir
+datadir
+sysconfdir
+sharedstatedir
+localstatedir
+includedir
+oldincludedir
+docdir
+infodir
+htmldir
+dvidir
+pdfdir
+psdir
+libdir
+localedir
+mandir
+DEFS
+ECHO_C
+ECHO_N
+ECHO_T
+LIBS
+build_alias
+host_alias
+target_alias
+MAINTAINER_MODE_TRUE
+MAINTAINER_MODE_FALSE
+MAINT
+build
+build_cpu
+build_vendor
+build_os
+host
+host_cpu
+host_vendor
+host_os
+target
+target_cpu
+target_vendor
+target_os
+INSTALL_PROGRAM
+INSTALL_SCRIPT
+INSTALL_DATA
+am__isrc
+CYGPATH_W
+PACKAGE
+VERSION
+ACLOCAL
+AUTOCONF
+AUTOMAKE
+AUTOHEADER
+MAKEINFO
+install_sh
+STRIP
+INSTALL_STRIP_PROGRAM
+mkdir_p
+AWK
+SET_MAKE
+am__leading_dot
+AMTAR
+am__tar
+am__untar
+MPI_TEMP_CXX
+MPI_CXX
+HAVE_MPI_TRUE
+HAVE_MPI_FALSE
+MPI_CXX_EXISTS
+MPI_CC_EXISTS
+MPI_F77_EXISTS
+CC
+CFLAGS
+LDFLAGS
+CPPFLAGS
+ac_ct_CC
+EXEEXT
+OBJEXT
+DEPDIR
+am__include
+am__quote
+AMDEP_TRUE
+AMDEP_FALSE
+AMDEPBACKSLASH
+CCDEPMODE
+am__fastdepCC_TRUE
+am__fastdepCC_FALSE
+CXX
+CXXFLAGS
+ac_ct_CXX
+CXXDEPMODE
+am__fastdepCXX_TRUE
+am__fastdepCXX_FALSE
+RANLIB
+USE_ALTERNATE_AR_TRUE
+USE_ALTERNATE_AR_FALSE
+ALTERNATE_AR
+CXXCPP
+USING_EXPORT_MAKEFILES_TRUE
+USING_EXPORT_MAKEFILES_FALSE
+PERL_EXE
+HAVE_PERL
+USING_PERL_TRUE
+USING_PERL_FALSE
+USING_GNUMAKE_TRUE
+USING_GNUMAKE_FALSE
+BUILD_TESTS_TRUE
+BUILD_TESTS_FALSE
+SUB_TEST_TRUE
+SUB_TEST_FALSE
+GREP
+EGREP
+PTHREAD_CC
+PTHREAD_LIBS
+PTHREAD_CFLAGS
+ac_aux_dir
+LIBOBJS
+LTLIBOBJS'
+ac_subst_files=''
+      ac_precious_vars='build_alias
+host_alias
+target_alias
+CC
+CFLAGS
+LDFLAGS
+LIBS
+CPPFLAGS
+CXX
+CXXFLAGS
+CCC
+CXXCPP'
+
+
+# Initialize some variables set by options.
+ac_init_help=
+ac_init_version=false
+# The variables have the same names as the options, with
+# dashes changed to underlines.
+cache_file=/dev/null
+exec_prefix=NONE
+no_create=
+no_recursion=
+prefix=NONE
+program_prefix=NONE
+program_suffix=NONE
+program_transform_name=s,x,x,
+silent=
+site=
+srcdir=
+verbose=
+x_includes=NONE
+x_libraries=NONE
+
+# Installation directory options.
+# These are left unexpanded so users can "make install exec_prefix=/foo"
+# and all the variables that are supposed to be based on exec_prefix
+# by default will actually change.
+# Use braces instead of parens because sh, perl, etc. also accept them.
+# (The list follows the same order as the GNU Coding Standards.)
+bindir='${exec_prefix}/bin'
+sbindir='${exec_prefix}/sbin'
+libexecdir='${exec_prefix}/libexec'
+datarootdir='${prefix}/share'
+datadir='${datarootdir}'
+sysconfdir='${prefix}/etc'
+sharedstatedir='${prefix}/com'
+localstatedir='${prefix}/var'
+includedir='${prefix}/include'
+oldincludedir='/usr/include'
+docdir='${datarootdir}/doc/${PACKAGE_TARNAME}'
+infodir='${datarootdir}/info'
+htmldir='${docdir}'
+dvidir='${docdir}'
+pdfdir='${docdir}'
+psdir='${docdir}'
+libdir='${exec_prefix}/lib'
+localedir='${datarootdir}/locale'
+mandir='${datarootdir}/man'
+
+ac_prev=
+ac_dashdash=
+for ac_option
+do
+  # If the previous option needs an argument, assign it.
+  if test -n "$ac_prev"; then
+    eval $ac_prev=\$ac_option
+    ac_prev=
+    continue
+  fi
+
+  case $ac_option in
+  *=*)	ac_optarg=`expr "X$ac_option" : '[^=]*=\(.*\)'` ;;
+  *)	ac_optarg=yes ;;
+  esac
+
+  # Accept the important Cygnus configure options, so we can diagnose typos.
+
+  case $ac_dashdash$ac_option in
+  --)
+    ac_dashdash=yes ;;
+
+  -bindir | --bindir | --bindi | --bind | --bin | --bi)
+    ac_prev=bindir ;;
+  -bindir=* | --bindir=* | --bindi=* | --bind=* | --bin=* | --bi=*)
+    bindir=$ac_optarg ;;
+
+  -build | --build | --buil | --bui | --bu)
+    ac_prev=build_alias ;;
+  -build=* | --build=* | --buil=* | --bui=* | --bu=*)
+    build_alias=$ac_optarg ;;
+
+  -cache-file | --cache-file | --cache-fil | --cache-fi \
+  | --cache-f | --cache- | --cache | --cach | --cac | --ca | --c)
+    ac_prev=cache_file ;;
+  -cache-file=* | --cache-file=* | --cache-fil=* | --cache-fi=* \
+  | --cache-f=* | --cache-=* | --cache=* | --cach=* | --cac=* | --ca=* | --c=*)
+    cache_file=$ac_optarg ;;
+
+  --config-cache | -C)
+    cache_file=config.cache ;;
+
+  -datadir | --datadir | --datadi | --datad)
+    ac_prev=datadir ;;
+  -datadir=* | --datadir=* | --datadi=* | --datad=*)
+    datadir=$ac_optarg ;;
+
+  -datarootdir | --datarootdir | --datarootdi | --datarootd | --dataroot \
+  | --dataroo | --dataro | --datar)
+    ac_prev=datarootdir ;;
+  -datarootdir=* | --datarootdir=* | --datarootdi=* | --datarootd=* \
+  | --dataroot=* | --dataroo=* | --dataro=* | --datar=*)
+    datarootdir=$ac_optarg ;;
+
+  -disable-* | --disable-*)
+    ac_feature=`expr "x$ac_option" : 'x-*disable-\(.*\)'`
+    # Reject names that are not valid shell variable names.
+    expr "x$ac_feature" : ".*[^-._$as_cr_alnum]" >/dev/null &&
+      { echo "$as_me: error: invalid feature name: $ac_feature" >&2
+   { (exit 1); exit 1; }; }
+    ac_feature=`echo $ac_feature | sed 's/[-.]/_/g'`
+    eval enable_$ac_feature=no ;;
+
+  -docdir | --docdir | --docdi | --doc | --do)
+    ac_prev=docdir ;;
+  -docdir=* | --docdir=* | --docdi=* | --doc=* | --do=*)
+    docdir=$ac_optarg ;;
+
+  -dvidir | --dvidir | --dvidi | --dvid | --dvi | --dv)
+    ac_prev=dvidir ;;
+  -dvidir=* | --dvidir=* | --dvidi=* | --dvid=* | --dvi=* | --dv=*)
+    dvidir=$ac_optarg ;;
+
+  -enable-* | --enable-*)
+    ac_feature=`expr "x$ac_option" : 'x-*enable-\([^=]*\)'`
+    # Reject names that are not valid shell variable names.
+    expr "x$ac_feature" : ".*[^-._$as_cr_alnum]" >/dev/null &&
+      { echo "$as_me: error: invalid feature name: $ac_feature" >&2
+   { (exit 1); exit 1; }; }
+    ac_feature=`echo $ac_feature | sed 's/[-.]/_/g'`
+    eval enable_$ac_feature=\$ac_optarg ;;
+
+  -exec-prefix | --exec_prefix | --exec-prefix | --exec-prefi \
+  | --exec-pref | --exec-pre | --exec-pr | --exec-p | --exec- \
+  | --exec | --exe | --ex)
+    ac_prev=exec_prefix ;;
+  -exec-prefix=* | --exec_prefix=* | --exec-prefix=* | --exec-prefi=* \
+  | --exec-pref=* | --exec-pre=* | --exec-pr=* | --exec-p=* | --exec-=* \
+  | --exec=* | --exe=* | --ex=*)
+    exec_prefix=$ac_optarg ;;
+
+  -gas | --gas | --ga | --g)
+    # Obsolete; use --with-gas.
+    with_gas=yes ;;
+
+  -help | --help | --hel | --he | -h)
+    ac_init_help=long ;;
+  -help=r* | --help=r* | --hel=r* | --he=r* | -hr*)
+    ac_init_help=recursive ;;
+  -help=s* | --help=s* | --hel=s* | --he=s* | -hs*)
+    ac_init_help=short ;;
+
+  -host | --host | --hos | --ho)
+    ac_prev=host_alias ;;
+  -host=* | --host=* | --hos=* | --ho=*)
+    host_alias=$ac_optarg ;;
+
+  -htmldir | --htmldir | --htmldi | --htmld | --html | --htm | --ht)
+    ac_prev=htmldir ;;
+  -htmldir=* | --htmldir=* | --htmldi=* | --htmld=* | --html=* | --htm=* \
+  | --ht=*)
+    htmldir=$ac_optarg ;;
+
+  -includedir | --includedir | --includedi | --included | --include \
+  | --includ | --inclu | --incl | --inc)
+    ac_prev=includedir ;;
+  -includedir=* | --includedir=* | --includedi=* | --included=* | --include=* \
+  | --includ=* | --inclu=* | --incl=* | --inc=*)
+    includedir=$ac_optarg ;;
+
+  -infodir | --infodir | --infodi | --infod | --info | --inf)
+    ac_prev=infodir ;;
+  -infodir=* | --infodir=* | --infodi=* | --infod=* | --info=* | --inf=*)
+    infodir=$ac_optarg ;;
+
+  -libdir | --libdir | --libdi | --libd)
+    ac_prev=libdir ;;
+  -libdir=* | --libdir=* | --libdi=* | --libd=*)
+    libdir=$ac_optarg ;;
+
+  -libexecdir | --libexecdir | --libexecdi | --libexecd | --libexec \
+  | --libexe | --libex | --libe)
+    ac_prev=libexecdir ;;
+  -libexecdir=* | --libexecdir=* | --libexecdi=* | --libexecd=* | --libexec=* \
+  | --libexe=* | --libex=* | --libe=*)
+    libexecdir=$ac_optarg ;;
+
+  -localedir | --localedir | --localedi | --localed | --locale)
+    ac_prev=localedir ;;
+  -localedir=* | --localedir=* | --localedi=* | --localed=* | --locale=*)
+    localedir=$ac_optarg ;;
+
+  -localstatedir | --localstatedir | --localstatedi | --localstated \
+  | --localstate | --localstat | --localsta | --localst | --locals)
+    ac_prev=localstatedir ;;
+  -localstatedir=* | --localstatedir=* | --localstatedi=* | --localstated=* \
+  | --localstate=* | --localstat=* | --localsta=* | --localst=* | --locals=*)
+    localstatedir=$ac_optarg ;;
+
+  -mandir | --mandir | --mandi | --mand | --man | --ma | --m)
+    ac_prev=mandir ;;
+  -mandir=* | --mandir=* | --mandi=* | --mand=* | --man=* | --ma=* | --m=*)
+    mandir=$ac_optarg ;;
+
+  -nfp | --nfp | --nf)
+    # Obsolete; use --without-fp.
+    with_fp=no ;;
+
+  -no-create | --no-create | --no-creat | --no-crea | --no-cre \
+  | --no-cr | --no-c | -n)
+    no_create=yes ;;
+
+  -no-recursion | --no-recursion | --no-recursio | --no-recursi \
+  | --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r)
+    no_recursion=yes ;;
+
+  -oldincludedir | --oldincludedir | --oldincludedi | --oldincluded \
+  | --oldinclude | --oldinclud | --oldinclu | --oldincl | --oldinc \
+  | --oldin | --oldi | --old | --ol | --o)
+    ac_prev=oldincludedir ;;
+  -oldincludedir=* | --oldincludedir=* | --oldincludedi=* | --oldincluded=* \
+  | --oldinclude=* | --oldinclud=* | --oldinclu=* | --oldincl=* | --oldinc=* \
+  | --oldin=* | --oldi=* | --old=* | --ol=* | --o=*)
+    oldincludedir=$ac_optarg ;;
+
+  -prefix | --prefix | --prefi | --pref | --pre | --pr | --p)
+    ac_prev=prefix ;;
+  -prefix=* | --prefix=* | --prefi=* | --pref=* | --pre=* | --pr=* | --p=*)
+    prefix=$ac_optarg ;;
+
+  -program-prefix | --program-prefix | --program-prefi | --program-pref \
+  | --program-pre | --program-pr | --program-p)
+    ac_prev=program_prefix ;;
+  -program-prefix=* | --program-prefix=* | --program-prefi=* \
+  | --program-pref=* | --program-pre=* | --program-pr=* | --program-p=*)
+    program_prefix=$ac_optarg ;;
+
+  -program-suffix | --program-suffix | --program-suffi | --program-suff \
+  | --program-suf | --program-su | --program-s)
+    ac_prev=program_suffix ;;
+  -program-suffix=* | --program-suffix=* | --program-suffi=* \
+  | --program-suff=* | --program-suf=* | --program-su=* | --program-s=*)
+    program_suffix=$ac_optarg ;;
+
+  -program-transform-name | --program-transform-name \
+  | --program-transform-nam | --program-transform-na \
+  | --program-transform-n | --program-transform- \
+  | --program-transform | --program-transfor \
+  | --program-transfo | --program-transf \
+  | --program-trans | --program-tran \
+  | --progr-tra | --program-tr | --program-t)
+    ac_prev=program_transform_name ;;
+  -program-transform-name=* | --program-transform-name=* \
+  | --program-transform-nam=* | --program-transform-na=* \
+  | --program-transform-n=* | --program-transform-=* \
+  | --program-transform=* | --program-transfor=* \
+  | --program-transfo=* | --program-transf=* \
+  | --program-trans=* | --program-tran=* \
+  | --progr-tra=* | --program-tr=* | --program-t=*)
+    program_transform_name=$ac_optarg ;;
+
+  -pdfdir | --pdfdir | --pdfdi | --pdfd | --pdf | --pd)
+    ac_prev=pdfdir ;;
+  -pdfdir=* | --pdfdir=* | --pdfdi=* | --pdfd=* | --pdf=* | --pd=*)
+    pdfdir=$ac_optarg ;;
+
+  -psdir | --psdir | --psdi | --psd | --ps)
+    ac_prev=psdir ;;
+  -psdir=* | --psdir=* | --psdi=* | --psd=* | --ps=*)
+    psdir=$ac_optarg ;;
+
+  -q | -quiet | --quiet | --quie | --qui | --qu | --q \
+  | -silent | --silent | --silen | --sile | --sil)
+    silent=yes ;;
+
+  -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb)
+    ac_prev=sbindir ;;
+  -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \
+  | --sbi=* | --sb=*)
+    sbindir=$ac_optarg ;;
+
+  -sharedstatedir | --sharedstatedir | --sharedstatedi \
+  | --sharedstated | --sharedstate | --sharedstat | --sharedsta \
+  | --sharedst | --shareds | --shared | --share | --shar \
+  | --sha | --sh)
+    ac_prev=sharedstatedir ;;
+  -sharedstatedir=* | --sharedstatedir=* | --sharedstatedi=* \
+  | --sharedstated=* | --sharedstate=* | --sharedstat=* | --sharedsta=* \
+  | --sharedst=* | --shareds=* | --shared=* | --share=* | --shar=* \
+  | --sha=* | --sh=*)
+    sharedstatedir=$ac_optarg ;;
+
+  -site | --site | --sit)
+    ac_prev=site ;;
+  -site=* | --site=* | --sit=*)
+    site=$ac_optarg ;;
+
+  -srcdir | --srcdir | --srcdi | --srcd | --src | --sr)
+    ac_prev=srcdir ;;
+  -srcdir=* | --srcdir=* | --srcdi=* | --srcd=* | --src=* | --sr=*)
+    srcdir=$ac_optarg ;;
+
+  -sysconfdir | --sysconfdir | --sysconfdi | --sysconfd | --sysconf \
+  | --syscon | --sysco | --sysc | --sys | --sy)
+    ac_prev=sysconfdir ;;
+  -sysconfdir=* | --sysconfdir=* | --sysconfdi=* | --sysconfd=* | --sysconf=* \
+  | --syscon=* | --sysco=* | --sysc=* | --sys=* | --sy=*)
+    sysconfdir=$ac_optarg ;;
+
+  -target | --target | --targe | --targ | --tar | --ta | --t)
+    ac_prev=target_alias ;;
+  -target=* | --target=* | --targe=* | --targ=* | --tar=* | --ta=* | --t=*)
+    target_alias=$ac_optarg ;;
+
+  -v | -verbose | --verbose | --verbos | --verbo | --verb)
+    verbose=yes ;;
+
+  -version | --version | --versio | --versi | --vers | -V)
+    ac_init_version=: ;;
+
+  -with-* | --with-*)
+    ac_package=`expr "x$ac_option" : 'x-*with-\([^=]*\)'`
+    # Reject names that are not valid shell variable names.
+    expr "x$ac_package" : ".*[^-._$as_cr_alnum]" >/dev/null &&
+      { echo "$as_me: error: invalid package name: $ac_package" >&2
+   { (exit 1); exit 1; }; }
+    ac_package=`echo $ac_package | sed 's/[-.]/_/g'`
+    eval with_$ac_package=\$ac_optarg ;;
+
+  -without-* | --without-*)
+    ac_package=`expr "x$ac_option" : 'x-*without-\(.*\)'`
+    # Reject names that are not valid shell variable names.
+    expr "x$ac_package" : ".*[^-._$as_cr_alnum]" >/dev/null &&
+      { echo "$as_me: error: invalid package name: $ac_package" >&2
+   { (exit 1); exit 1; }; }
+    ac_package=`echo $ac_package | sed 's/[-.]/_/g'`
+    eval with_$ac_package=no ;;
+
+  --x)
+    # Obsolete; use --with-x.
+    with_x=yes ;;
+
+  -x-includes | --x-includes | --x-include | --x-includ | --x-inclu \
+  | --x-incl | --x-inc | --x-in | --x-i)
+    ac_prev=x_includes ;;
+  -x-includes=* | --x-includes=* | --x-include=* | --x-includ=* | --x-inclu=* \
+  | --x-incl=* | --x-inc=* | --x-in=* | --x-i=*)
+    x_includes=$ac_optarg ;;
+
+  -x-libraries | --x-libraries | --x-librarie | --x-librari \
+  | --x-librar | --x-libra | --x-libr | --x-lib | --x-li | --x-l)
+    ac_prev=x_libraries ;;
+  -x-libraries=* | --x-libraries=* | --x-librarie=* | --x-librari=* \
+  | --x-librar=* | --x-libra=* | --x-libr=* | --x-lib=* | --x-li=* | --x-l=*)
+    x_libraries=$ac_optarg ;;
+
+  -*) { echo "$as_me: error: unrecognized option: $ac_option
+Try \`$0 --help' for more information." >&2
+   { (exit 1); exit 1; }; }
+    ;;
+
+  *=*)
+    ac_envvar=`expr "x$ac_option" : 'x\([^=]*\)='`
+    # Reject names that are not valid shell variable names.
+    expr "x$ac_envvar" : ".*[^_$as_cr_alnum]" >/dev/null &&
+      { echo "$as_me: error: invalid variable name: $ac_envvar" >&2
+   { (exit 1); exit 1; }; }
+    eval $ac_envvar=\$ac_optarg
+    export $ac_envvar ;;
+
+  *)
+    # FIXME: should be removed in autoconf 3.0.
+    echo "$as_me: WARNING: you should use --build, --host, --target" >&2
+    expr "x$ac_option" : ".*[^-._$as_cr_alnum]" >/dev/null &&
+      echo "$as_me: WARNING: invalid host type: $ac_option" >&2
+    : ${build_alias=$ac_option} ${host_alias=$ac_option} ${target_alias=$ac_option}
+    ;;
+
+  esac
+done
+
+if test -n "$ac_prev"; then
+  ac_option=--`echo $ac_prev | sed 's/_/-/g'`
+  { echo "$as_me: error: missing argument to $ac_option" >&2
+   { (exit 1); exit 1; }; }
+fi
+
+# Be sure to have absolute directory names.
+for ac_var in	exec_prefix prefix bindir sbindir libexecdir datarootdir \
+		datadir sysconfdir sharedstatedir localstatedir includedir \
+		oldincludedir docdir infodir htmldir dvidir pdfdir psdir \
+		libdir localedir mandir
+do
+  eval ac_val=\$$ac_var
+  case $ac_val in
+    [\\/$]* | ?:[\\/]* )  continue;;
+    NONE | '' ) case $ac_var in *prefix ) continue;; esac;;
+  esac
+  { echo "$as_me: error: expected an absolute directory name for --$ac_var: $ac_val" >&2
+   { (exit 1); exit 1; }; }
+done
+
+# There might be people who depend on the old broken behavior: `$host'
+# used to hold the argument of --host etc.
+# FIXME: To remove some day.
+build=$build_alias
+host=$host_alias
+target=$target_alias
+
+# FIXME: To remove some day.
+if test "x$host_alias" != x; then
+  if test "x$build_alias" = x; then
+    cross_compiling=maybe
+    echo "$as_me: WARNING: If you wanted to set the --build type, don't use --host.
+    If a cross compiler is detected then cross compile mode will be used." >&2
+  elif test "x$build_alias" != "x$host_alias"; then
+    cross_compiling=yes
+  fi
+fi
+
+ac_tool_prefix=
+test -n "$host_alias" && ac_tool_prefix=$host_alias-
+
+test "$silent" = yes && exec 6>/dev/null
+
+
+ac_pwd=`pwd` && test -n "$ac_pwd" &&
+ac_ls_di=`ls -di .` &&
+ac_pwd_ls_di=`cd "$ac_pwd" && ls -di .` ||
+  { echo "$as_me: error: Working directory cannot be determined" >&2
+   { (exit 1); exit 1; }; }
+test "X$ac_ls_di" = "X$ac_pwd_ls_di" ||
+  { echo "$as_me: error: pwd does not report name of working directory" >&2
+   { (exit 1); exit 1; }; }
+
+
+# Find the source files, if location was not specified.
+if test -z "$srcdir"; then
+  ac_srcdir_defaulted=yes
+  # Try the directory containing this script, then the parent directory.
+  ac_confdir=`$as_dirname -- "$0" ||
+$as_expr X"$0" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+	 X"$0" : 'X\(//\)[^/]' \| \
+	 X"$0" : 'X\(//\)$' \| \
+	 X"$0" : 'X\(/\)' \| . 2>/dev/null ||
+echo X"$0" |
+    sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)[^/].*/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\).*/{
+	    s//\1/
+	    q
+	  }
+	  s/.*/./; q'`
+  srcdir=$ac_confdir
+  if test ! -r "$srcdir/$ac_unique_file"; then
+    srcdir=..
+  fi
+else
+  ac_srcdir_defaulted=no
+fi
+if test ! -r "$srcdir/$ac_unique_file"; then
+  test "$ac_srcdir_defaulted" = yes && srcdir="$ac_confdir or .."
+  { echo "$as_me: error: cannot find sources ($ac_unique_file) in $srcdir" >&2
+   { (exit 1); exit 1; }; }
+fi
+ac_msg="sources are in $srcdir, but \`cd $srcdir' does not work"
+ac_abs_confdir=`(
+	cd "$srcdir" && test -r "./$ac_unique_file" || { echo "$as_me: error: $ac_msg" >&2
+   { (exit 1); exit 1; }; }
+	pwd)`
+# When building in place, set srcdir=.
+if test "$ac_abs_confdir" = "$ac_pwd"; then
+  srcdir=.
+fi
+# Remove unnecessary trailing slashes from srcdir.
+# Double slashes in file names in object file debugging info
+# mess up M-x gdb in Emacs.
+case $srcdir in
+*/) srcdir=`expr "X$srcdir" : 'X\(.*[^/]\)' \| "X$srcdir" : 'X\(.*\)'`;;
+esac
+for ac_var in $ac_precious_vars; do
+  eval ac_env_${ac_var}_set=\${${ac_var}+set}
+  eval ac_env_${ac_var}_value=\$${ac_var}
+  eval ac_cv_env_${ac_var}_set=\${${ac_var}+set}
+  eval ac_cv_env_${ac_var}_value=\$${ac_var}
+done
+
+#
+# Report the --help message.
+#
+if test "$ac_init_help" = "long"; then
+  # Omit some internal or obsolete options to make the list less imposing.
+  # This message is too long to be a string in the A/UX 3.1 sh.
+  cat <<_ACEOF
+\`configure' configures ThreadPool 1.1d to adapt to many kinds of systems.
+
+Usage: $0 [OPTION]... [VAR=VALUE]...
+
+To assign environment variables (e.g., CC, CFLAGS...), specify them as
+VAR=VALUE.  See below for descriptions of some of the useful variables.
+
+Defaults for the options are specified in brackets.
+
+Configuration:
+  -h, --help              display this help and exit
+      --help=short        display options specific to this package
+      --help=recursive    display the short help of all the included packages
+  -V, --version           display version information and exit
+  -q, --quiet, --silent   do not print \`checking...' messages
+      --cache-file=FILE   cache test results in FILE [disabled]
+  -C, --config-cache      alias for \`--cache-file=config.cache'
+  -n, --no-create         do not create output files
+      --srcdir=DIR        find the sources in DIR [configure dir or \`..']
+
+Installation directories:
+  --prefix=PREFIX         install architecture-independent files in PREFIX
+			  [$ac_default_prefix]
+  --exec-prefix=EPREFIX   install architecture-dependent files in EPREFIX
+			  [PREFIX]
+
+By default, \`make install' will install all the files in
+\`$ac_default_prefix/bin', \`$ac_default_prefix/lib' etc.  You can specify
+an installation prefix other than \`$ac_default_prefix' using \`--prefix',
+for instance \`--prefix=\$HOME'.
+
+For better control, use the options below.
+
+Fine tuning of the installation directories:
+  --bindir=DIR           user executables [EPREFIX/bin]
+  --sbindir=DIR          system admin executables [EPREFIX/sbin]
+  --libexecdir=DIR       program executables [EPREFIX/libexec]
+  --sysconfdir=DIR       read-only single-machine data [PREFIX/etc]
+  --sharedstatedir=DIR   modifiable architecture-independent data [PREFIX/com]
+  --localstatedir=DIR    modifiable single-machine data [PREFIX/var]
+  --libdir=DIR           object code libraries [EPREFIX/lib]
+  --includedir=DIR       C header files [PREFIX/include]
+  --oldincludedir=DIR    C header files for non-gcc [/usr/include]
+  --datarootdir=DIR      read-only arch.-independent data root [PREFIX/share]
+  --datadir=DIR          read-only architecture-independent data [DATAROOTDIR]
+  --infodir=DIR          info documentation [DATAROOTDIR/info]
+  --localedir=DIR        locale-dependent data [DATAROOTDIR/locale]
+  --mandir=DIR           man documentation [DATAROOTDIR/man]
+  --docdir=DIR           documentation root [DATAROOTDIR/doc/threadpool]
+  --htmldir=DIR          html documentation [DOCDIR]
+  --dvidir=DIR           dvi documentation [DOCDIR]
+  --pdfdir=DIR           pdf documentation [DOCDIR]
+  --psdir=DIR            ps documentation [DOCDIR]
+_ACEOF
+
+  cat <<\_ACEOF
+
+Program names:
+  --program-prefix=PREFIX            prepend PREFIX to installed program names
+  --program-suffix=SUFFIX            append SUFFIX to installed program names
+  --program-transform-name=PROGRAM   run sed PROGRAM on installed program names
+
+System types:
+  --build=BUILD     configure for building on BUILD [guessed]
+  --host=HOST       cross-compile to build programs to run on HOST [BUILD]
+  --target=TARGET   configure for building compilers for TARGET [HOST]
+_ACEOF
+fi
+
+if test -n "$ac_init_help"; then  # any help mode: long, short or recursive
+  case $ac_init_help in
+     short | recursive ) echo "Configuration of ThreadPool 1.1d:";;
+   esac
+  cat <<\_ACEOF
+
+Optional Features:
+  --disable-FEATURE       do not include FEATURE (same as --enable-FEATURE=no)
+  --enable-FEATURE[=ARG]  include FEATURE [ARG=yes]
+  --enable-maintainer-mode  enable make rules and dependencies not useful
+			  (and sometimes confusing) to the casual installer
+  --enable-mpi            MPI support
+  --disable-dependency-tracking  speeds up one-time build
+  --enable-dependency-tracking   do not reject slow dependency extractors
+  --enable-export-makefiles
+                          Creates export makefiles in the install (prefix)
+                          directory. This option requires perl to be set in
+                          your path or defined with --with-perl=<perl
+                          executable>. Note that the export makefiles are
+                          always created and used in the build directory, but
+                          will not be installable without this option to
+                          change the paths. (default is yes)
+  --enable-tests          Make tests for all Trilinos packages buildable with
+                          'make tests' (default is yes)
+
+  --enable-threadpool-tests
+                          Make ThreadPool tests buildable with 'make tests'
+                          (default is yes if --disable-tests is not specified)
+  --enable-libcheck       Check for some third-party libraries. (Cannot be
+                          disabled unless tests and examples are also
+                          disabled.) (default is yes)
+
+Optional Packages:
+  --with-PACKAGE[=ARG]    use PACKAGE [ARG=yes]
+  --without-PACKAGE       do not use PACKAGE (same as --with-PACKAGE=no)
+  --with-install=INSTALL_PROGRAM
+                          Use the installation program INSTALL_PROGRAM rather
+                          than the default that is provided. For example
+                          --with-install="/path/install -p"
+  --with-mpi-compilers=PATH
+                          use MPI compilers mpicc, mpif77, and mpicxx, mpic++
+                          or mpiCC in the specified path or in the default
+                          path if no path is specified. Enables MPI
+  --with-mpi=MPIROOT      use MPI root directory (enables MPI)
+  --with-mpi-libs="LIBS"  MPI libraries ["-lmpi"]
+  --with-mpi-incdir=DIR   MPI include directory [MPIROOT/include] Do not use
+                          -I
+  --with-mpi-libdir=DIR   MPI library directory [MPIROOT/lib] Do not use -L
+  --with-ccflags          additional CCFLAGS flags to be added: will prepend
+                          to CCFLAGS
+  --with-cxxflags         additional CXXFLAGS flags to be added: will
+                          prepend to CXXFLAGS
+  --with-cflags           additional CFLAGS flags to be added: will prepend
+                          to CFLAGS
+  --with-libs             List additional libraries here. For example,
+                          --with-libs=-lsuperlu or
+                          --with-libs=/path/libsuperlu.a
+  --with-ldflags          additional LDFLAGS flags to be added: will prepend
+                          to LDFLAGS
+  --with-ar               override archiver command (default is "ar cru")
+  --with-perl             supply a perl executable. For example
+                          --with-perl=/usr/bin/perl.
+  --with-gnumake          Gnu's make has special functions we can use to
+                          eliminate redundant paths in the build and link
+                          lines. Enable this if you use gnu-make to build
+                          Trilinos. This requires that perl is in your path or
+                          that you have specified the perl executable with
+                          --with-perl=<perl executable>. Configure will check
+                          for the existence of the perl executable and quit
+                          with an error if it is not found. (default is no)
+  --with-libdirs          OBSOLETE use --with-ldflags instead. (ex.
+                          --with-ldflags="-L<DIR> -L<DIR2>")
+  --with-incdirs          additional directories containing include files:
+                          will prepend to search here for includes, use -Idir
+                          format
+
+Some influential environment variables:
+  CC          C compiler command
+  CFLAGS      C compiler flags
+  LDFLAGS     linker flags, e.g. -L<lib dir> if you have libraries in a
+              nonstandard directory <lib dir>
+  LIBS        libraries to pass to the linker, e.g. -l<library>
+  CPPFLAGS    C/C++/Objective C preprocessor flags, e.g. -I<include dir> if
+              you have headers in a nonstandard directory <include dir>
+  CXX         C++ compiler command
+  CXXFLAGS    C++ compiler flags
+  CXXCPP      C++ preprocessor
+
+Use these variables to override the choices made by `configure' or to help
+it to find libraries and programs with nonstandard names/locations.
+
+Report bugs to <hcedwar@sandia.gov>.
+_ACEOF
+ac_status=$?  # status of the cat above; later used by "exit $ac_status"
+fi
+
+if test "$ac_init_help" = "recursive"; then
+  # If there are subdirs, report their specific --help.
+  for ac_dir in : $ac_subdirs_all; do test "x$ac_dir" = x: && continue
+    test -d "$ac_dir" || continue
+    ac_builddir=.
+
+case "$ac_dir" in
+.) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;;
+*)
+  ac_dir_suffix=/`echo "$ac_dir" | sed 's,^\.[\\/],,'`
+  # A ".." for each directory in $ac_dir_suffix.
+  ac_top_builddir_sub=`echo "$ac_dir_suffix" | sed 's,/[^\\/]*,/..,g;s,/,,'`
+  case $ac_top_builddir_sub in
+  "") ac_top_builddir_sub=. ac_top_build_prefix= ;;
+  *)  ac_top_build_prefix=$ac_top_builddir_sub/ ;;
+  esac ;;
+esac
+ac_abs_top_builddir=$ac_pwd
+ac_abs_builddir=$ac_pwd$ac_dir_suffix
+# for backward compatibility:
+ac_top_builddir=$ac_top_build_prefix
+
+case $srcdir in
+  .)  # We are building in place.
+    ac_srcdir=.
+    ac_top_srcdir=$ac_top_builddir_sub
+    ac_abs_top_srcdir=$ac_pwd ;;
+  [\\/]* | ?:[\\/]* )  # Absolute name.
+    ac_srcdir=$srcdir$ac_dir_suffix;
+    ac_top_srcdir=$srcdir
+    ac_abs_top_srcdir=$srcdir ;;
+  *) # Relative name.
+    ac_srcdir=$ac_top_build_prefix$srcdir$ac_dir_suffix
+    ac_top_srcdir=$ac_top_build_prefix$srcdir
+    ac_abs_top_srcdir=$ac_pwd/$srcdir ;;
+esac
+ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix
+
+    cd "$ac_dir" || { ac_status=$?; continue; }
+    # Check for a guested configure script; configure.gnu takes precedence over configure.
+    if test -f "$ac_srcdir/configure.gnu"; then
+      echo &&
+      $SHELL "$ac_srcdir/configure.gnu" --help=recursive
+    elif test -f "$ac_srcdir/configure"; then
+      echo &&
+      $SHELL "$ac_srcdir/configure" --help=recursive
+    else
+      echo "$as_me: WARNING: no configuration information is in $ac_dir" >&2
+    fi || ac_status=$?
+    cd "$ac_pwd" || { ac_status=$?; break; }
+  done
+fi
+
+test -n "$ac_init_help" && exit $ac_status
+if $ac_init_version; then
+  cat <<\_ACEOF
+ThreadPool configure 1.1d
+generated by GNU Autoconf 2.61
+
+Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
+2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
+This configure script is free software; the Free Software Foundation
+gives unlimited permission to copy, distribute and modify it.
+_ACEOF
+  exit
+fi
+cat >config.log <<_ACEOF
+This file contains any messages produced by compilers while
+running configure, to aid debugging if configure makes a mistake.
+
+It was created by ThreadPool $as_me 1.1d, which was
+generated by GNU Autoconf 2.61.  Invocation command line was
+
+  $ $0 $@
+
+_ACEOF
+exec 5>>config.log
+{
+cat <<_ASUNAME
+## --------- ##
+## Platform. ##
+## --------- ##
+
+hostname = `(hostname || uname -n) 2>/dev/null | sed 1q`
+uname -m = `(uname -m) 2>/dev/null || echo unknown`
+uname -r = `(uname -r) 2>/dev/null || echo unknown`
+uname -s = `(uname -s) 2>/dev/null || echo unknown`
+uname -v = `(uname -v) 2>/dev/null || echo unknown`
+
+/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null || echo unknown`
+/bin/uname -X     = `(/bin/uname -X) 2>/dev/null     || echo unknown`
+
+/bin/arch              = `(/bin/arch) 2>/dev/null              || echo unknown`
+/usr/bin/arch -k       = `(/usr/bin/arch -k) 2>/dev/null       || echo unknown`
+/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null || echo unknown`
+/usr/bin/hostinfo      = `(/usr/bin/hostinfo) 2>/dev/null      || echo unknown`
+/bin/machine           = `(/bin/machine) 2>/dev/null           || echo unknown`
+/usr/bin/oslevel       = `(/usr/bin/oslevel) 2>/dev/null       || echo unknown`
+/bin/universe          = `(/bin/universe) 2>/dev/null          || echo unknown`
+
+_ASUNAME
+
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+  echo "PATH: $as_dir"
+done
+IFS=$as_save_IFS
+
+} >&5
+
+cat >&5 <<_ACEOF
+
+
+## ----------- ##
+## Core tests. ##
+## ----------- ##
+
+_ACEOF
+
+
+# Keep a trace of the command line.
+# Strip out --no-create and --no-recursion so they do not pile up.
+# Strip out --silent because we don't want to record it for future runs.
+# Also quote any args containing shell meta-characters.
+# Make two passes to allow for proper duplicate-argument suppression.
+ac_configure_args=
+ac_configure_args0=
+ac_configure_args1=
+ac_must_keep_next=false
+for ac_pass in 1 2
+do
+  for ac_arg
+  do
+    case $ac_arg in
+    -no-create | --no-c* | -n | -no-recursion | --no-r*) continue ;;
+    -q | -quiet | --quiet | --quie | --qui | --qu | --q \
+    | -silent | --silent | --silen | --sile | --sil)
+      continue ;;
+    *\'*)
+      ac_arg=`echo "$ac_arg" | sed "s/'/'\\\\\\\\''/g"` ;;
+    esac
+    case $ac_pass in
+    1) ac_configure_args0="$ac_configure_args0 '$ac_arg'" ;;
+    2)
+      ac_configure_args1="$ac_configure_args1 '$ac_arg'"
+      if test $ac_must_keep_next = true; then
+	ac_must_keep_next=false # Got value, back to normal.
+      else
+	case $ac_arg in
+	  *=* | --config-cache | -C | -disable-* | --disable-* \
+	  | -enable-* | --enable-* | -gas | --g* | -nfp | --nf* \
+	  | -q | -quiet | --q* | -silent | --sil* | -v | -verb* \
+	  | -with-* | --with-* | -without-* | --without-* | --x)
+	    case "$ac_configure_args0 " in
+	      "$ac_configure_args1"*" '$ac_arg' "* ) continue ;;
+	    esac
+	    ;;
+	  -* ) ac_must_keep_next=true ;;
+	esac
+      fi
+      ac_configure_args="$ac_configure_args '$ac_arg'"
+      ;;
+    esac
+  done
+done
+$as_unset ac_configure_args0 || test "${ac_configure_args0+set}" != set || { ac_configure_args0=; export ac_configure_args0; }
+$as_unset ac_configure_args1 || test "${ac_configure_args1+set}" != set || { ac_configure_args1=; export ac_configure_args1; }
+
+# When interrupted or exit'd, cleanup temporary files, and complete
+# config.log.  We remove comments because anyway the quotes in there
+# would cause problems or look ugly.
+# WARNING: Use '\'' to represent an apostrophe within the trap.
+# WARNING: Do not start the trap code with a newline, due to a FreeBSD 4.0 bug.
+trap 'exit_status=$?
+  # Save into config.log some information that might help in debugging.
+  {
+    echo
+
+    cat <<\_ASBOX
+## ---------------- ##
+## Cache variables. ##
+## ---------------- ##
+_ASBOX
+    echo
+    # The following way of writing the cache mishandles newlines in values, so variables containing newlines are unset first.
+(
+  for ac_var in `(set) 2>&1 | sed -n '\''s/^\([a-zA-Z_][a-zA-Z0-9_]*\)=.*/\1/p'\''`; do
+    eval ac_val=\$$ac_var
+    case $ac_val in #(
+    *${as_nl}*)
+      case $ac_var in #(
+      *_cv_*) { echo "$as_me:$LINENO: WARNING: Cache variable $ac_var contains a newline." >&5
+echo "$as_me: WARNING: Cache variable $ac_var contains a newline." >&2;} ;;
+      esac
+      case $ac_var in #(
+      _ | IFS | as_nl) ;; #(
+      *) $as_unset $ac_var ;;
+      esac ;;
+    esac
+  done
+  (set) 2>&1 |
+    case $as_nl`(ac_space='\'' '\''; set) 2>&1` in #(
+    *${as_nl}ac_space=\ *)
+      sed -n \
+	"s/'\''/'\''\\\\'\'''\''/g;
+	  s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\''\\2'\''/p"
+      ;; #(
+    *)
+      sed -n "/^[_$as_cr_alnum]*_cv_[_$as_cr_alnum]*=/p"
+      ;;
+    esac |
+    sort
+)
+    echo
+
+    cat <<\_ASBOX
+## ----------------- ##
+## Output variables. ##
+## ----------------- ##
+_ASBOX
+    echo
+    for ac_var in $ac_subst_vars
+    do
+      eval ac_val=\$$ac_var
+      case $ac_val in
+      *\'\''*) ac_val=`echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;;
+      esac
+      echo "$ac_var='\''$ac_val'\''"
+    done | sort
+    echo
+
+    if test -n "$ac_subst_files"; then
+      cat <<\_ASBOX
+## ------------------- ##
+## File substitutions. ##
+## ------------------- ##
+_ASBOX
+      echo
+      for ac_var in $ac_subst_files
+      do
+	eval ac_val=\$$ac_var
+	case $ac_val in
+	*\'\''*) ac_val=`echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;;
+	esac
+	echo "$ac_var='\''$ac_val'\''"
+      done | sort
+      echo
+    fi
+
+    if test -s confdefs.h; then
+      cat <<\_ASBOX
+## ----------- ##
+## confdefs.h. ##
+## ----------- ##
+_ASBOX
+      echo
+      cat confdefs.h
+      echo
+    fi
+    test "$ac_signal" != 0 &&
+      echo "$as_me: caught signal $ac_signal"
+    echo "$as_me: exit $exit_status"
+  } >&5
+  rm -f core *.core core.conftest.* &&
+    rm -f -r conftest* confdefs* conf$$* $ac_clean_files &&
+    exit $exit_status
+' 0
+for ac_signal in 1 2 13 15; do
+  trap 'ac_signal='$ac_signal'; { (exit 1); exit 1; }' $ac_signal
+done
+ac_signal=0
+
+# confdefs.h avoids OS command line length limits that DEFS can exceed.
+rm -f -r conftest* confdefs.h
+
+# Predefined preprocessor variables.
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_NAME "$PACKAGE_NAME"
+_ACEOF
+
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_TARNAME "$PACKAGE_TARNAME"
+_ACEOF
+
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_VERSION "$PACKAGE_VERSION"
+_ACEOF
+
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_STRING "$PACKAGE_STRING"
+_ACEOF
+
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_BUGREPORT "$PACKAGE_BUGREPORT"
+_ACEOF
+
+
+# Let the site file select an alternate cache file if it wants to.
+# Prefer explicitly selected file to automatically selected ones.
+if test -n "$CONFIG_SITE"; then
+  set x "$CONFIG_SITE"
+elif test "x$prefix" != xNONE; then
+  set x "$prefix/share/config.site" "$prefix/etc/config.site"
+else
+  set x "$ac_default_prefix/share/config.site" \
+	"$ac_default_prefix/etc/config.site"
+fi
+shift
+for ac_site_file
+do
+  if test -r "$ac_site_file"; then
+    { echo "$as_me:$LINENO: loading site script $ac_site_file" >&5
+echo "$as_me: loading site script $ac_site_file" >&6;}
+    sed 's/^/| /' "$ac_site_file" >&5
+    . "$ac_site_file"
+  fi
+done
+
+if test -r "$cache_file"; then
+  # Some versions of bash will fail to source /dev/null (special
+  # files actually), so we avoid doing that.
+  if test -f "$cache_file"; then
+    { echo "$as_me:$LINENO: loading cache $cache_file" >&5
+echo "$as_me: loading cache $cache_file" >&6;}
+    case $cache_file in
+      [\\/]* | ?:[\\/]* ) . "$cache_file";;
+      *)                      . "./$cache_file";;
+    esac
+  fi
+else
+  { echo "$as_me:$LINENO: creating cache $cache_file" >&5
+echo "$as_me: creating cache $cache_file" >&6;}
+  >$cache_file
+fi
+
+# Check that the precious variables saved in the cache have kept the same
+# value.
+ac_cache_corrupted=false
+for ac_var in $ac_precious_vars; do
+  eval ac_old_set=\$ac_cv_env_${ac_var}_set
+  eval ac_new_set=\$ac_env_${ac_var}_set
+  eval ac_old_val=\$ac_cv_env_${ac_var}_value
+  eval ac_new_val=\$ac_env_${ac_var}_value
+  case $ac_old_set,$ac_new_set in
+    set,)
+      { echo "$as_me:$LINENO: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&5
+echo "$as_me: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&2;}
+      ac_cache_corrupted=: ;;
+    ,set)
+      { echo "$as_me:$LINENO: error: \`$ac_var' was not set in the previous run" >&5
+echo "$as_me: error: \`$ac_var' was not set in the previous run" >&2;}
+      ac_cache_corrupted=: ;;
+    ,);;
+    *)
+      if test "x$ac_old_val" != "x$ac_new_val"; then
+	{ echo "$as_me:$LINENO: error: \`$ac_var' has changed since the previous run:" >&5
+echo "$as_me: error: \`$ac_var' has changed since the previous run:" >&2;}
+	{ echo "$as_me:$LINENO:   former value:  $ac_old_val" >&5
+echo "$as_me:   former value:  $ac_old_val" >&2;}
+	{ echo "$as_me:$LINENO:   current value: $ac_new_val" >&5
+echo "$as_me:   current value: $ac_new_val" >&2;}
+	ac_cache_corrupted=:
+      fi;;
+  esac
+  # Pass precious variables to config.status.
+  if test "$ac_new_set" = set; then
+    case $ac_new_val in
+    *\'*) ac_arg=$ac_var=`echo "$ac_new_val" | sed "s/'/'\\\\\\\\''/g"` ;;
+    *) ac_arg=$ac_var=$ac_new_val ;;
+    esac
+    case " $ac_configure_args " in
+      *" '$ac_arg' "*) ;; # Avoid dups.  Use of quotes ensures accuracy.
+      *) ac_configure_args="$ac_configure_args '$ac_arg'" ;;
+    esac
+  fi
+done
+if $ac_cache_corrupted; then
+  { echo "$as_me:$LINENO: error: changes in the environment can compromise the build" >&5
+echo "$as_me: error: changes in the environment can compromise the build" >&2;}
+  { { echo "$as_me:$LINENO: error: run \`make distclean' and/or \`rm $cache_file' and start over" >&5
+echo "$as_me: error: run \`make distclean' and/or \`rm $cache_file' and start over" >&2;}
+   { (exit 1); exit 1; }; }
+fi
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+
+
+# Print a banner announcing the ThreadPool configure run.
+echo "----------------------------------------"
+echo "Running ThreadPool Configure Script"
+echo "----------------------------------------"
+
+# This is to protect against accidentally specifying the wrong
+# directory with --srcdir.  Any file in that directory will do,
+# preferably one that is unlikely to be removed or renamed.
+
+
+
+# Specify directory for auxiliary build tools (e.g., install-sh,
+# config.sub, config.guess) and M4 files.
+
+ac_aux_dir=
+for ac_dir in config "$srcdir"/config; do
+  if test -f "$ac_dir/install-sh"; then
+    ac_aux_dir=$ac_dir
+    ac_install_sh="$ac_aux_dir/install-sh -c"
+    break
+  elif test -f "$ac_dir/install.sh"; then
+    ac_aux_dir=$ac_dir
+    ac_install_sh="$ac_aux_dir/install.sh -c"
+    break
+  elif test -f "$ac_dir/shtool"; then
+    ac_aux_dir=$ac_dir
+    ac_install_sh="$ac_aux_dir/shtool install -c"
+    break
+  fi
+done
+if test -z "$ac_aux_dir"; then
+  { { echo "$as_me:$LINENO: error: cannot find install-sh or install.sh in config \"$srcdir\"/config" >&5
+echo "$as_me: error: cannot find install-sh or install.sh in config \"$srcdir\"/config" >&2;}
+   { (exit 1); exit 1; }; }
+fi
+
+# These three variables are undocumented and unsupported,
+# and are intended to be withdrawn in a future Autoconf release.
+# They can cause serious problems if a builder's source tree is in a directory
+# whose full name contains unusual characters.
+ac_config_guess="$SHELL $ac_aux_dir/config.guess"  # Please don't use this var.
+ac_config_sub="$SHELL $ac_aux_dir/config.sub"  # Please don't use this var.
+ac_configure="$SHELL $ac_aux_dir/configure"  # Please don't use this var.
+
+
+#  #auto np# - Change file names in next line
+# Configure should create src/ThreadPool_config.h from src/ThreadPool_config.h.in
+
+ac_config_headers="$ac_config_headers src/ThreadPool_config.h:src/ThreadPool_config.h.in"
+
+
+# Allow users to specify their own "install" command.  If none is specified,
+# the default is install-sh found in the config subdirectory.
+
+
+# Check whether --with-install was given.
+if test "${with_install+set}" = set; then
+  withval=$with_install;
+   INSTALL=$withval
+   INSTALL_PROGRAM=$withval
+   INSTALL_SCRIPT=$withval
+   INSTALL_DATA="$withval -m 644"
+
+fi
+
+
+# AM_MAINTAINER_MODE turns off maintainer-only makefile targets by
+# default, and changes configure to understand a
+# --enable-maintainer-mode option. --enable-maintainer-mode turns the
+# maintainer-only targets back on. The maintainer-only makefile
+# targets permit end users to clean automatically-generated files such
+# as configure, which means they have to have autoconf and automake
+# installed to repair the damage. AM_MAINTAINER_MODE makes it a bit
+# harder for users to shoot themselves in the foot.
+
+{ echo "$as_me:$LINENO: checking whether to enable maintainer-specific portions of Makefiles" >&5
+echo $ECHO_N "checking whether to enable maintainer-specific portions of Makefiles... $ECHO_C" >&6; }
+    # Check whether --enable-maintainer-mode was given.
+if test "${enable_maintainer_mode+set}" = set; then
+  enableval=$enable_maintainer_mode; USE_MAINTAINER_MODE=$enableval
+else
+  USE_MAINTAINER_MODE=no
+fi
+
+  { echo "$as_me:$LINENO: result: $USE_MAINTAINER_MODE" >&5
+echo "${ECHO_T}$USE_MAINTAINER_MODE" >&6; }
+   if test $USE_MAINTAINER_MODE = yes; then
+  MAINTAINER_MODE_TRUE=
+  MAINTAINER_MODE_FALSE='#'
+else
+  MAINTAINER_MODE_TRUE='#'
+  MAINTAINER_MODE_FALSE=
+fi
+
+  MAINT=$MAINTAINER_MODE_TRUE
+
+
+
+# Define $build, $host, $target, etc
+
+# Make sure we can run config.sub.
+$SHELL "$ac_aux_dir/config.sub" sun4 >/dev/null 2>&1 ||
+  { { echo "$as_me:$LINENO: error: cannot run $SHELL $ac_aux_dir/config.sub" >&5
+echo "$as_me: error: cannot run $SHELL $ac_aux_dir/config.sub" >&2;}
+   { (exit 1); exit 1; }; }
+
+{ echo "$as_me:$LINENO: checking build system type" >&5
+echo $ECHO_N "checking build system type... $ECHO_C" >&6; }
+if test "${ac_cv_build+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  ac_build_alias=$build_alias
+test "x$ac_build_alias" = x &&
+  ac_build_alias=`$SHELL "$ac_aux_dir/config.guess"`
+test "x$ac_build_alias" = x &&
+  { { echo "$as_me:$LINENO: error: cannot guess build type; you must specify one" >&5
+echo "$as_me: error: cannot guess build type; you must specify one" >&2;}
+   { (exit 1); exit 1; }; }
+ac_cv_build=`$SHELL "$ac_aux_dir/config.sub" $ac_build_alias` ||
+  { { echo "$as_me:$LINENO: error: $SHELL $ac_aux_dir/config.sub $ac_build_alias failed" >&5
+echo "$as_me: error: $SHELL $ac_aux_dir/config.sub $ac_build_alias failed" >&2;}
+   { (exit 1); exit 1; }; }
+
+fi
+{ echo "$as_me:$LINENO: result: $ac_cv_build" >&5
+echo "${ECHO_T}$ac_cv_build" >&6; }
+case $ac_cv_build in
+*-*-*) ;;
+*) { { echo "$as_me:$LINENO: error: invalid value of canonical build" >&5
+echo "$as_me: error: invalid value of canonical build" >&2;}
+   { (exit 1); exit 1; }; };;
+esac
+build=$ac_cv_build
+ac_save_IFS=$IFS; IFS='-'
+set x $ac_cv_build
+shift
+build_cpu=$1
+build_vendor=$2
+shift; shift
+# Remember, the first character of IFS is used to create $*,
+# except with old shells:
+build_os=$*
+IFS=$ac_save_IFS
+case $build_os in *\ *) build_os=`echo "$build_os" | sed 's/ /-/g'`;; esac
+
+
+{ echo "$as_me:$LINENO: checking host system type" >&5
+echo $ECHO_N "checking host system type... $ECHO_C" >&6; }
+if test "${ac_cv_host+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  if test "x$host_alias" = x; then
+  ac_cv_host=$ac_cv_build
+else
+  ac_cv_host=`$SHELL "$ac_aux_dir/config.sub" $host_alias` ||
+    { { echo "$as_me:$LINENO: error: $SHELL $ac_aux_dir/config.sub $host_alias failed" >&5
+echo "$as_me: error: $SHELL $ac_aux_dir/config.sub $host_alias failed" >&2;}
+   { (exit 1); exit 1; }; }
+fi
+
+fi
+{ echo "$as_me:$LINENO: result: $ac_cv_host" >&5
+echo "${ECHO_T}$ac_cv_host" >&6; }
+case $ac_cv_host in
+*-*-*) ;;
+*) { { echo "$as_me:$LINENO: error: invalid value of canonical host" >&5
+echo "$as_me: error: invalid value of canonical host" >&2;}
+   { (exit 1); exit 1; }; };;
+esac
+host=$ac_cv_host
+ac_save_IFS=$IFS; IFS='-'
+set x $ac_cv_host
+shift
+host_cpu=$1
+host_vendor=$2
+shift; shift
+# Remember, the first character of IFS is used to create $*,
+# except with old shells:
+host_os=$*
+IFS=$ac_save_IFS
+case $host_os in *\ *) host_os=`echo "$host_os" | sed 's/ /-/g'`;; esac
+
+
+{ echo "$as_me:$LINENO: checking target system type" >&5
+echo $ECHO_N "checking target system type... $ECHO_C" >&6; }
+if test "${ac_cv_target+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  if test "x$target_alias" = x; then
+  ac_cv_target=$ac_cv_host
+else
+  ac_cv_target=`$SHELL "$ac_aux_dir/config.sub" $target_alias` ||
+    { { echo "$as_me:$LINENO: error: $SHELL $ac_aux_dir/config.sub $target_alias failed" >&5
+echo "$as_me: error: $SHELL $ac_aux_dir/config.sub $target_alias failed" >&2;}
+   { (exit 1); exit 1; }; }
+fi
+
+fi
+{ echo "$as_me:$LINENO: result: $ac_cv_target" >&5
+echo "${ECHO_T}$ac_cv_target" >&6; }
+case $ac_cv_target in
+*-*-*) ;;
+*) { { echo "$as_me:$LINENO: error: invalid value of canonical target" >&5
+echo "$as_me: error: invalid value of canonical target" >&2;}
+   { (exit 1); exit 1; }; };;
+esac
+target=$ac_cv_target
+ac_save_IFS=$IFS; IFS='-'
+set x $ac_cv_target
+shift
+target_cpu=$1
+target_vendor=$2
+shift; shift
+# Remember, the first character of IFS is used to create $*,
+# except with old shells:
+target_os=$*
+IFS=$ac_save_IFS
+case $target_os in *\ *) target_os=`echo "$target_os" | sed 's/ /-/g'`;; esac
+
+
+# The aliases save the names the user supplied, while $host etc.
+# will get canonicalized.
+test -n "$target_alias" &&
+  test "$program_prefix$program_suffix$program_transform_name" = \
+    NONENONEs,x,x, &&
+  program_prefix=${target_alias}-
+
+# Use automake
+
+#  - Required version of automake (also the suffix of the default aclocal-/automake- names below).
+am__api_version='1.10'
+
+# Find a good install program.  We prefer a C program (faster),
+# so one script is as good as another.  But avoid the broken or
+# incompatible versions:
+# SysV /etc/install, /usr/sbin/install
+# SunOS /usr/etc/install
+# IRIX /sbin/install
+# AIX /bin/install
+# AmigaOS /C/install, which installs bootblocks on floppy discs
+# AIX 4 /usr/bin/installbsd, which doesn't work without a -g flag
+# AFS /usr/afsws/bin/install, which mishandles nonexistent args
+# SVR4 /usr/ucb/install, which tries to use the nonexistent group "staff"
+# OS/2's system install, which has a completely different semantic
+# ./install, which can be erroneously created by make from ./install.sh.
+{ echo "$as_me:$LINENO: checking for a BSD-compatible install" >&5
+echo $ECHO_N "checking for a BSD-compatible install... $ECHO_C" >&6; }
+if test -z "$INSTALL"; then
+if test "${ac_cv_path_install+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+  # Account for people who put trailing slashes in PATH elements.
+case $as_dir/ in
+  ./ | .// | /cC/* | \
+  /etc/* | /usr/sbin/* | /usr/etc/* | /sbin/* | /usr/afsws/bin/* | \
+  ?:\\/os2\\/install\\/* | ?:\\/OS2\\/INSTALL\\/* | \
+  /usr/ucb/* ) ;;
+  *)
+    # OSF1 and SCO ODT 3.0 have their own names for install.
+    # Don't use installbsd from OSF since it installs stuff as root
+    # by default.
+    for ac_prog in ginstall scoinst install; do
+      for ac_exec_ext in '' $ac_executable_extensions; do
+	if { test -f "$as_dir/$ac_prog$ac_exec_ext" && $as_test_x "$as_dir/$ac_prog$ac_exec_ext"; }; then
+	  if test $ac_prog = install &&
+	    grep dspmsg "$as_dir/$ac_prog$ac_exec_ext" >/dev/null 2>&1; then
+	    # AIX install.  It has an incompatible calling convention.
+	    :
+	  elif test $ac_prog = install &&
+	    grep pwplus "$as_dir/$ac_prog$ac_exec_ext" >/dev/null 2>&1; then
+	    # program-specific install script used by HP pwplus--don't use.
+	    :
+	  else
+	    ac_cv_path_install="$as_dir/$ac_prog$ac_exec_ext -c"
+	    break 3
+	  fi
+	fi
+      done
+    done
+    ;;
+esac
+done
+IFS=$as_save_IFS
+
+
+fi
+  if test "${ac_cv_path_install+set}" = set; then
+    INSTALL=$ac_cv_path_install
+  else
+    # As a last resort, use the slow shell script.  Don't cache a
+    # value for INSTALL within a source directory, because that will
+    # break other packages using the cache if that directory is
+    # removed, or if the value is a relative name.
+    INSTALL=$ac_install_sh
+  fi
+fi
+{ echo "$as_me:$LINENO: result: $INSTALL" >&5
+echo "${ECHO_T}$INSTALL" >&6; }
+
+# Use test -z because SunOS4 sh mishandles braces in ${var-val}.
+# It thinks the first close brace ends the variable substitution.
+test -z "$INSTALL_PROGRAM" && INSTALL_PROGRAM='${INSTALL}'
+
+test -z "$INSTALL_SCRIPT" && INSTALL_SCRIPT='${INSTALL}'
+
+test -z "$INSTALL_DATA" && INSTALL_DATA='${INSTALL} -m 644'
+
+{ echo "$as_me:$LINENO: checking whether build environment is sane" >&5
+echo $ECHO_N "checking whether build environment is sane... $ECHO_C" >&6; }
+# Just in case (presumably so the new file's timestamp is clearly the later one)
+sleep 1
+echo timestamp > conftest.file
+# Do `set' in a subshell so we don't clobber the current shell's
+# arguments.  Must try -L first in case configure is actually a
+# symlink; some systems play weird games with the mod time of symlinks
+# (eg FreeBSD returns the mod time of the symlink's containing
+# directory).
+if (
+   set X `ls -Lt $srcdir/configure conftest.file 2> /dev/null`
+   if test "$*" = "X"; then
+      # -L didn't work.
+      set X `ls -t $srcdir/configure conftest.file`
+   fi
+   rm -f conftest.file
+   if test "$*" != "X $srcdir/configure conftest.file" \
+      && test "$*" != "X conftest.file $srcdir/configure"; then
+
+      # If neither matched, then we have a broken ls.  This can happen
+      # if, for instance, CONFIG_SHELL is bash and it inherits a
+      # broken ls alias from the environment.  This has actually
+      # happened.  Such a system could not be considered "sane".
+      { { echo "$as_me:$LINENO: error: ls -t appears to fail.  Make sure there is not a broken
+alias in your environment" >&5
+echo "$as_me: error: ls -t appears to fail.  Make sure there is not a broken
+alias in your environment" >&2;}
+   { (exit 1); exit 1; }; }
+   fi
+   # $2 is the first name ls -t listed, i.e. the most recently modified file.
+   test "$2" = conftest.file
+   )
+then
+   # Ok.
+   :
+else
+   { { echo "$as_me:$LINENO: error: newly created file is older than distributed files!
+Check your system clock" >&5
+echo "$as_me: error: newly created file is older than distributed files!
+Check your system clock" >&2;}
+   { (exit 1); exit 1; }; }
+fi
+{ echo "$as_me:$LINENO: result: yes" >&5
+echo "${ECHO_T}yes" >&6; }
+test "$program_prefix" != NONE &&
+  program_transform_name="s&^&$program_prefix&;$program_transform_name"
+# Use a double $ so make ignores it.
+test "$program_suffix" != NONE &&
+  program_transform_name="s&\$&$program_suffix&;$program_transform_name"
+# Double any \ or $.  echo might interpret backslashes.
+# The default was `s,x,x'; drop it when it trails another expression.
+cat <<\_ACEOF >conftest.sed
+s/[\\$]/&&/g;s/;s,x,x,$//
+_ACEOF
+program_transform_name=`echo $program_transform_name | sed -f conftest.sed`
+rm -f conftest.sed
+
+# expand $ac_aux_dir to an absolute path
+am_aux_dir=`cd $ac_aux_dir && pwd`
+
+test x"${MISSING+set}" = xset || MISSING="\${SHELL} $am_aux_dir/missing"
+# Use eval to expand $SHELL
+if eval "$MISSING --run true"; then
+  am_missing_run="$MISSING --run "
+else
+  am_missing_run=
+  { echo "$as_me:$LINENO: WARNING: \`missing' script is too old or missing" >&5
+echo "$as_me: WARNING: \`missing' script is too old or missing" >&2;}
+fi
+
+{ echo "$as_me:$LINENO: checking for a thread-safe mkdir -p" >&5
+echo $ECHO_N "checking for a thread-safe mkdir -p... $ECHO_C" >&6; }
+if test -z "$MKDIR_P"; then
+  if test "${ac_cv_path_mkdir+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH$PATH_SEPARATOR/opt/sfw/bin
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+  for ac_prog in mkdir gmkdir; do
+	 for ac_exec_ext in '' $ac_executable_extensions; do
+	   { test -f "$as_dir/$ac_prog$ac_exec_ext" && $as_test_x "$as_dir/$ac_prog$ac_exec_ext"; } || continue
+	   case `"$as_dir/$ac_prog$ac_exec_ext" --version 2>&1` in #(
+	     'mkdir (GNU coreutils) '* | \
+	     'mkdir (coreutils) '* | \
+	     'mkdir (fileutils) '4.1*)
+	       ac_cv_path_mkdir=$as_dir/$ac_prog$ac_exec_ext
+	       break 3;;
+	   esac
+	 done
+       done
+done
+IFS=$as_save_IFS
+
+fi
+
+  if test "${ac_cv_path_mkdir+set}" = set; then
+    MKDIR_P="$ac_cv_path_mkdir -p"
+  else
+    # As a last resort, use the slow shell script.  Don't cache a
+    # value for MKDIR_P within a source directory, because that will
+    # break other packages using the cache if that directory is
+    # removed, or if the value is a relative name.
+    test -d ./--version && rmdir ./--version
+    MKDIR_P="$ac_install_sh -d"
+  fi
+fi
+{ echo "$as_me:$LINENO: result: $MKDIR_P" >&5
+echo "${ECHO_T}$MKDIR_P" >&6; }
+# mkdir_p: same as MKDIR_P, but a relative name containing a slash is anchored at $(top_builddir).
+mkdir_p="$MKDIR_P"
+case $mkdir_p in
+  [\\/$]* | ?:[\\/]*) ;;
+  */*) mkdir_p="\$(top_builddir)/$mkdir_p" ;;
+esac
+
+for ac_prog in gawk mawk nawk awk
+do
+  # Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+{ echo "$as_me:$LINENO: checking for $ac_word" >&5
+echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; }
+if test "${ac_cv_prog_AWK+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  if test -n "$AWK"; then
+  ac_cv_prog_AWK="$AWK" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+  for ac_exec_ext in '' $ac_executable_extensions; do
+  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+    ac_cv_prog_AWK="$ac_prog"
+    echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+done
+IFS=$as_save_IFS
+
+fi
+fi
+AWK=$ac_cv_prog_AWK
+if test -n "$AWK"; then
+  { echo "$as_me:$LINENO: result: $AWK" >&5
+echo "${ECHO_T}$AWK" >&6; }
+else
+  { echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6; }
+fi
+  # The first candidate found wins; later names are not tried.
+
+  test -n "$AWK" && break
+done
+
+{ echo "$as_me:$LINENO: checking whether ${MAKE-make} sets \$(MAKE)" >&5
+echo $ECHO_N "checking whether ${MAKE-make} sets \$(MAKE)... $ECHO_C" >&6; }
+set x ${MAKE-make}; ac_make=`echo "$2" | sed 's/+/p/g; s/[^a-zA-Z0-9_]/_/g'`
+if { as_var=ac_cv_prog_make_${ac_make}_set; eval "test \"\${$as_var+set}\" = set"; }; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  cat >conftest.make <<\_ACEOF
+SHELL = /bin/sh
+all:
+	@echo '@@@%%%=$(MAKE)=@@@%%%'
+_ACEOF
+# GNU make sometimes prints "make[1]: Entering...", which would confuse us, so match the marker anywhere in the output.
+case `${MAKE-make} -f conftest.make 2>/dev/null` in
+  *@@@%%%=?*=@@@%%%*)
+    eval ac_cv_prog_make_${ac_make}_set=yes;;
+  *)
+    eval ac_cv_prog_make_${ac_make}_set=no;;
+esac
+rm -f conftest.make
+fi
+if eval test \$ac_cv_prog_make_${ac_make}_set = yes; then
+  { echo "$as_me:$LINENO: result: yes" >&5
+echo "${ECHO_T}yes" >&6; }
+  SET_MAKE=
+else
+  { echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6; }
+  SET_MAKE="MAKE=${MAKE-make}"
+fi
+
+rm -rf .tst 2>/dev/null
+mkdir .tst 2>/dev/null
+if test -d .tst; then
+  am__leading_dot=.
+else
+  am__leading_dot=_
+fi
+rmdir .tst 2>/dev/null
+# Is the build directory different from the source directory?
+if test "`cd $srcdir && pwd`" != "`pwd`"; then
+  # Use -I$(srcdir) only when $(srcdir) != ., so that make's output
+  # is not polluted with repeated "-I."
+  am__isrc=' -I$(srcdir)'
+  # test to see if srcdir already configured
+  if test -f $srcdir/config.status; then
+    { { echo "$as_me:$LINENO: error: source directory already configured; run \"make distclean\" there first" >&5
+echo "$as_me: error: source directory already configured; run \"make distclean\" there first" >&2;}
+   { (exit 1); exit 1; }; }
+  fi
+fi
+
+# test whether we have cygpath (only when CYGPATH_W is not already set); otherwise use echo
+if test -z "$CYGPATH_W"; then
+  if (cygpath --version) >/dev/null 2>/dev/null; then
+    CYGPATH_W='cygpath -w'
+  else
+    CYGPATH_W=echo
+  fi
+fi
+
+
+# Define the identity of the package.
+ PACKAGE='threadpool'
+ VERSION='1.1d'
+
+
+# Some tools Automake needs.
+# Each one keeps a value preset in the environment; otherwise it runs via $am_missing_run.
+ACLOCAL=${ACLOCAL-"${am_missing_run}aclocal-${am__api_version}"}
+
+
+AUTOCONF=${AUTOCONF-"${am_missing_run}autoconf"}
+
+
+AUTOMAKE=${AUTOMAKE-"${am_missing_run}automake-${am__api_version}"}
+
+
+AUTOHEADER=${AUTOHEADER-"${am_missing_run}autoheader"}
+
+
+MAKEINFO=${MAKEINFO-"${am_missing_run}makeinfo"}
+
+install_sh=${install_sh-"\$(SHELL) $am_aux_dir/install-sh"}
+
+# Installed binaries are usually stripped using `strip' when the user
+# runs `make install-strip'.  However `strip' might not be the right
+# tool to use in cross-compilation environments, therefore Automake
+# will honor the `STRIP' environment variable to overrule this program.
+if test "$cross_compiling" != no; then
+  if test -n "$ac_tool_prefix"; then
+  # Extract the first word of "${ac_tool_prefix}strip", so it can be a program name with args.
+set dummy ${ac_tool_prefix}strip; ac_word=$2
+{ echo "$as_me:$LINENO: checking for $ac_word" >&5
+echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; }
+if test "${ac_cv_prog_STRIP+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  if test -n "$STRIP"; then
+  ac_cv_prog_STRIP="$STRIP" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+  for ac_exec_ext in '' $ac_executable_extensions; do
+  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+    ac_cv_prog_STRIP="${ac_tool_prefix}strip"
+    echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+done
+IFS=$as_save_IFS
+
+fi
+fi
+STRIP=$ac_cv_prog_STRIP
+if test -n "$STRIP"; then
+  { echo "$as_me:$LINENO: result: $STRIP" >&5
+echo "${ECHO_T}$STRIP" >&6; }
+else
+  { echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6; }
+fi
+
+
+fi
+if test -z "$ac_cv_prog_STRIP"; then
+  ac_ct_STRIP=$STRIP
+  # Extract the first word of "strip", so it can be a program name with args.
+set dummy strip; ac_word=$2
+{ echo "$as_me:$LINENO: checking for $ac_word" >&5
+echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; }
+if test "${ac_cv_prog_ac_ct_STRIP+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  if test -n "$ac_ct_STRIP"; then
+  ac_cv_prog_ac_ct_STRIP="$ac_ct_STRIP" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+  for ac_exec_ext in '' $ac_executable_extensions; do
+  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+    ac_cv_prog_ac_ct_STRIP="strip"
+    echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+done
+IFS=$as_save_IFS
+
+fi
+fi
+ac_ct_STRIP=$ac_cv_prog_ac_ct_STRIP
+if test -n "$ac_ct_STRIP"; then
+  { echo "$as_me:$LINENO: result: $ac_ct_STRIP" >&5
+echo "${ECHO_T}$ac_ct_STRIP" >&6; }
+else
+  { echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6; }
+fi
+
+  if test "x$ac_ct_STRIP" = x; then
+    STRIP=":"
+  else
+    case $cross_compiling:$ac_tool_warned in
+yes:)
+{ echo "$as_me:$LINENO: WARNING: In the future, Autoconf will not detect cross-tools
+whose name does not start with the host triplet.  If you think this
+configuration is useful to you, please write to autoconf@gnu.org." >&5
+echo "$as_me: WARNING: In the future, Autoconf will not detect cross-tools
+whose name does not start with the host triplet.  If you think this
+configuration is useful to you, please write to autoconf@gnu.org." >&2;}
+ac_tool_warned=yes ;;
+esac
+    STRIP=$ac_ct_STRIP
+  fi
+else
+  STRIP="$ac_cv_prog_STRIP"
+fi
+
+fi
+INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s"
+
+# We need awk for the "check" target.  The system "awk" is bad on
+# some platforms.
+# Always define AMTAR for backward compatibility.
+
+AMTAR=${AMTAR-"${am_missing_run}tar"}
+
+
+{ echo "$as_me:$LINENO: checking how to create a ustar tar archive" >&5
+echo $ECHO_N "checking how to create a ustar tar archive... $ECHO_C" >&6; }
+# Loop over all known methods to create a tar archive until one works.
+_am_tools='gnutar plaintar pax cpio none'
+_am_tools=${am_cv_prog_tar_ustar-$_am_tools}
+# Do not fold the above two lines into one, because Tru64 sh and
+# Solaris sh will not grok spaces in the rhs of `-'.
+for _am_tool in $_am_tools
+do
+  case $_am_tool in
+  gnutar)
+    for _am_tar in tar gnutar gtar;
+    do
+      { echo "$as_me:$LINENO: $_am_tar --version" >&5
+   ($_am_tar --version) >&5 2>&5
+   ac_status=$?
+   echo "$as_me:$LINENO: \$? = $ac_status" >&5
+   (exit $ac_status); } && break
+    done
+    am__tar="$_am_tar --format=ustar -chf - "'"$$tardir"'
+    am__tar_="$_am_tar --format=ustar -chf - "'"$tardir"'
+    am__untar="$_am_tar -xf -"
+    ;;
+  plaintar)
+    # Must skip GNU tar: if it does not support --format= it doesn't create
+    # ustar tarball either.
+    (tar --version) >/dev/null 2>&1 && continue
+    am__tar='tar chf - "$$tardir"'
+    am__tar_='tar chf - "$tardir"'
+    am__untar='tar xf -'
+    ;;
+  pax)
+    am__tar='pax -L -x ustar -w "$$tardir"'
+    am__tar_='pax -L -x ustar -w "$tardir"'
+    am__untar='pax -r'
+    ;;
+  cpio)
+    am__tar='find "$$tardir" -print | cpio -o -H ustar -L'
+    am__tar_='find "$tardir" -print | cpio -o -H ustar -L'
+    am__untar='cpio -i -H ustar -d'
+    ;;
+  none)
+    am__tar=false
+    am__tar_=false
+    am__untar=false
+    ;;
+  esac
+
+  # If the value was cached, stop now.  We just wanted to have am__tar
+  # and am__untar set.
+  test -n "${am_cv_prog_tar_ustar}" && break
+
+  # tar/untar a dummy directory, and stop if the command works
+  rm -rf conftest.dir
+  mkdir conftest.dir
+  echo GrepMe > conftest.dir/file
+  { echo "$as_me:$LINENO: tardir=conftest.dir && eval $am__tar_ >conftest.tar" >&5
+   (tardir=conftest.dir && eval $am__tar_ >conftest.tar) >&5 2>&5
+   ac_status=$?
+   echo "$as_me:$LINENO: \$? = $ac_status" >&5
+   (exit $ac_status); }
+  rm -rf conftest.dir
+  if test -s conftest.tar; then
+    { echo "$as_me:$LINENO: $am__untar <conftest.tar" >&5
+   ($am__untar <conftest.tar) >&5 2>&5
+   ac_status=$?
+   echo "$as_me:$LINENO: \$? = $ac_status" >&5
+   (exit $ac_status); }
+    grep GrepMe conftest.dir/file >/dev/null 2>&1 && break
+  fi
+done
+rm -rf conftest.dir
+
+if test "${am_cv_prog_tar_ustar+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  am_cv_prog_tar_ustar=$_am_tool
+fi
+
+{ echo "$as_me:$LINENO: result: $am_cv_prog_tar_ustar" >&5
+echo "${ECHO_T}$am_cv_prog_tar_ustar" >&6; }
+
+
+
+
+
+
+# Specify required version of autoconf.
+
+
+
+# ------------------------------------------------------------------------
+# Check to see if MPI enabled and if any special configuration done
+# ------------------------------------------------------------------------
+
+
+
+# Check whether --enable-mpi was given; its value seeds HAVE_PKG_MPI (default: no).
+if test "${enable_mpi+set}" = set; then
+  enableval=$enable_mpi; HAVE_PKG_MPI=$enableval
+else
+  HAVE_PKG_MPI=no
+
+fi
+
+
+
+# Check whether --with-mpi-compilers was given.  Any value but `no' enables MPI: `yes' looks the wrappers up in PATH, any other value is used as the directory containing them.
+if test "${with_mpi_compilers+set}" = set; then
+  withval=$with_mpi_compilers;
+  if test X${withval} != Xno; then
+    HAVE_PKG_MPI=yes
+    if test X${withval} = Xyes; then
+      # Check for mpicxx, if it does not exist, check for mpic++, if it does
+      # not exist, use mpiCC instead.
+      # Extract the first word of "mpicxx", so it can be a program name with args.
+set dummy mpicxx; ac_word=$2
+{ echo "$as_me:$LINENO: checking for $ac_word" >&5
+echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; }
+if test "${ac_cv_prog_MPI_TEMP_CXX+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  if test -n "$MPI_TEMP_CXX"; then
+  ac_cv_prog_MPI_TEMP_CXX="$MPI_TEMP_CXX" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+  for ac_exec_ext in '' $ac_executable_extensions; do
+  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+    ac_cv_prog_MPI_TEMP_CXX="mpicxx"
+    echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+done
+IFS=$as_save_IFS
+
+  test -z "$ac_cv_prog_MPI_TEMP_CXX" && ac_cv_prog_MPI_TEMP_CXX="no"
+fi
+fi
+MPI_TEMP_CXX=$ac_cv_prog_MPI_TEMP_CXX
+if test -n "$MPI_TEMP_CXX"; then
+  { echo "$as_me:$LINENO: result: $MPI_TEMP_CXX" >&5
+echo "${ECHO_T}$MPI_TEMP_CXX" >&6; }
+else
+  { echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6; }
+fi
+
+
+      if test X${MPI_TEMP_CXX} = Xno; then
+	# Extract the first word of "mpic++", so it can be a program name with args.
+set dummy mpic++; ac_word=$2
+{ echo "$as_me:$LINENO: checking for $ac_word" >&5
+echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; }
+if test "${ac_cv_prog_MPI_CXX+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  if test -n "$MPI_CXX"; then
+  ac_cv_prog_MPI_CXX="$MPI_CXX" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+  for ac_exec_ext in '' $ac_executable_extensions; do
+  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+    ac_cv_prog_MPI_CXX="mpic++"
+    echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+done
+IFS=$as_save_IFS
+
+  test -z "$ac_cv_prog_MPI_CXX" && ac_cv_prog_MPI_CXX="mpiCC"
+fi
+fi
+MPI_CXX=$ac_cv_prog_MPI_CXX
+if test -n "$MPI_CXX"; then
+  { echo "$as_me:$LINENO: result: $MPI_CXX" >&5
+echo "${ECHO_T}$MPI_CXX" >&6; }
+else
+  { echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6; }
+fi
+
+
+      else
+	MPI_CXX=${MPI_TEMP_CXX}
+      fi
+      MPI_CC=mpicc
+      MPI_F77=mpif77
+    else
+      if test -f ${withval}/mpicxx; then
+        MPI_CXX=${withval}/mpicxx
+      elif test -f ${withval}/mpic++; then
+	MPI_CXX=${withval}/mpic++
+      else
+        MPI_CXX=${withval}/mpiCC
+      fi
+      MPI_CC=${withval}/mpicc
+      MPI_F77=${withval}/mpif77
+    fi
+  fi
+
+
+fi
+
+
+
+# Check whether --with-mpi was given.  If so, MPI is enabled and the value becomes MPI_DIR.
+if test "${with_mpi+set}" = set; then
+  withval=$with_mpi;
+  HAVE_PKG_MPI=yes
+  MPI_DIR=${withval}
+  { echo "$as_me:$LINENO: checking MPI directory" >&5
+echo $ECHO_N "checking MPI directory... $ECHO_C" >&6; }
+  { echo "$as_me:$LINENO: result: ${MPI_DIR}" >&5
+echo "${ECHO_T}${MPI_DIR}" >&6; }
+
+
+fi
+
+
+#AC_ARG_WITH(mpi-include,
+#[AC_HELP_STRING([--with-mpi-include],[Obsolete.  Use --with-mpi-incdir=DIR instead.  Do not prefix DIR with '-I'.])],
+#[AC_MSG_ERROR([--with-mpi-include is an obsolete option.   Use --with-mpi-incdir=DIR instead.  Do not prefix DIR with '-I'.  For example '--with-mpi-incdir=/usr/lam_path/include'.])]
+#)
+
+
+# Check whether --with-mpi-libs was given.
+if test "${with_mpi_libs+set}" = set; then
+  withval=$with_mpi_libs;
+  MPI_LIBS=${withval}
+  { echo "$as_me:$LINENO: checking user-defined MPI libraries" >&5
+echo $ECHO_N "checking user-defined MPI libraries... $ECHO_C" >&6; }
+  { echo "$as_me:$LINENO: result: ${MPI_LIBS}" >&5
+echo "${ECHO_T}${MPI_LIBS}" >&6; }
+
+
+fi
+
+
+
+# Check whether --with-mpi-incdir was given.
+if test "${with_mpi_incdir+set}" = set; then
+  withval=$with_mpi_incdir;
+  MPI_INC=${withval}
+  { echo "$as_me:$LINENO: checking user-defined MPI includes" >&5
+echo $ECHO_N "checking user-defined MPI includes... $ECHO_C" >&6; }
+  { echo "$as_me:$LINENO: result: ${MPI_INC}" >&5
+echo "${ECHO_T}${MPI_INC}" >&6; }
+
+
+fi
+
+
+
+# Check whether --with-mpi-libdir was given.
+if test "${with_mpi_libdir+set}" = set; then
+  withval=$with_mpi_libdir;
+  MPI_LIBDIR=${withval}
+  { echo "$as_me:$LINENO: checking user-defined MPI library directory" >&5
+echo $ECHO_N "checking user-defined MPI library directory... $ECHO_C" >&6; }
+  { echo "$as_me:$LINENO: result: ${MPI_LIBDIR}" >&5
+echo "${ECHO_T}${MPI_LIBDIR}" >&6; }
+
+
+fi
+
+# Report the resulting HAVE_PKG_MPI value.
+{ echo "$as_me:$LINENO: checking whether we are using MPI" >&5
+echo $ECHO_N "checking whether we are using MPI... $ECHO_C" >&6; }
+{ echo "$as_me:$LINENO: result: ${HAVE_PKG_MPI}" >&5
+echo "${ECHO_T}${HAVE_PKG_MPI}" >&6; }
+
+if test "X${HAVE_PKG_MPI}" = "Xyes"; then
+# Append the HAVE_MPI preprocessor define to confdefs.h.
+cat >>confdefs.h <<\_ACEOF
+#define HAVE_MPI
+_ACEOF
+
+fi
+
+# HAVE_MPI_TRUE / HAVE_MPI_FALSE: exactly one is empty, the other is '#' (presumably an Automake conditional pair).
+ if test "X${HAVE_PKG_MPI}" = "Xyes"; then
+  HAVE_MPI_TRUE=
+  HAVE_MPI_FALSE='#'
+else
+  HAVE_MPI_TRUE='#'
+  HAVE_MPI_FALSE=
+fi
+
+
+
+
+if test -n "${MPI_CXX}"; then
+  if test -f ${MPI_CXX}; then
+    MPI_CXX_EXISTS=yes
+  else
+    # Extract the first word of "${MPI_CXX}", so it can be a program name with args.
+set dummy ${MPI_CXX}; ac_word=$2
+{ echo "$as_me:$LINENO: checking for $ac_word" >&5
+echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; }
+if test "${ac_cv_prog_MPI_CXX_EXISTS+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  if test -n "$MPI_CXX_EXISTS"; then
+  ac_cv_prog_MPI_CXX_EXISTS="$MPI_CXX_EXISTS" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+  for ac_exec_ext in '' $ac_executable_extensions; do
+  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+    ac_cv_prog_MPI_CXX_EXISTS="yes"
+    echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+done
+IFS=$as_save_IFS
+
+  test -z "$ac_cv_prog_MPI_CXX_EXISTS" && ac_cv_prog_MPI_CXX_EXISTS="no"
+fi
+fi
+MPI_CXX_EXISTS=$ac_cv_prog_MPI_CXX_EXISTS
+if test -n "$MPI_CXX_EXISTS"; then
+  { echo "$as_me:$LINENO: result: $MPI_CXX_EXISTS" >&5
+echo "${ECHO_T}$MPI_CXX_EXISTS" >&6; }
+else
+  { echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6; }
+fi
+
+
+  fi
+  # Make the MPI wrapper the C++ compiler, or abort with guidance when it was not found.
+  if test "X${MPI_CXX_EXISTS}" = "Xyes"; then
+    CXX=${MPI_CXX}
+  else
+    echo "-----"
+    echo "Cannot find MPI C++ compiler ${MPI_CXX}."
+    echo "Specify a path to all mpi compilers with --with-mpi-compilers=PATH"
+    echo "or specify a C++ compiler using CXX=<compiler>"
+    echo "Do not use --with-mpi-compilers if using CXX=<compiler>"
+    echo "-----"
+    { { echo "$as_me:$LINENO: error: MPI C++ compiler (${MPI_CXX}) not found." >&5
+echo "$as_me: error: MPI C++ compiler (${MPI_CXX}) not found." >&2;}
+   { (exit 1); exit 1; }; }
+  fi
+fi
+
+if test -n "${MPI_CC}"; then
+  if test -f ${MPI_CC}; then
+    MPI_CC_EXISTS=yes
+  else
+    # Extract the first word of "${MPI_CC}", so it can be a program name with args.
+set dummy ${MPI_CC}; ac_word=$2
+{ echo "$as_me:$LINENO: checking for $ac_word" >&5
+echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; }
+if test "${ac_cv_prog_MPI_CC_EXISTS+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  if test -n "$MPI_CC_EXISTS"; then
+  ac_cv_prog_MPI_CC_EXISTS="$MPI_CC_EXISTS" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+  for ac_exec_ext in '' $ac_executable_extensions; do
+  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+    ac_cv_prog_MPI_CC_EXISTS="yes"
+    echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+done
+IFS=$as_save_IFS
+
+  test -z "$ac_cv_prog_MPI_CC_EXISTS" && ac_cv_prog_MPI_CC_EXISTS="no"
+fi
+fi
+MPI_CC_EXISTS=$ac_cv_prog_MPI_CC_EXISTS
+if test -n "$MPI_CC_EXISTS"; then
+  { echo "$as_me:$LINENO: result: $MPI_CC_EXISTS" >&5
+echo "${ECHO_T}$MPI_CC_EXISTS" >&6; }
+else
+  { echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6; }
+fi
+
+
+  fi
+  # Make the MPI wrapper the C compiler, or abort with guidance when it was not found.
+  if test "X${MPI_CC_EXISTS}" = "Xyes"; then
+    CC=${MPI_CC}
+  else
+    echo "-----"
+    echo "Cannot find MPI C compiler ${MPI_CC}."
+    echo "Specify a path to all mpi compilers with --with-mpi-compilers=PATH"
+    echo "or specify a C compiler using CC=<compiler>"
+    echo "Do not use --with-mpi-compilers if using CC=<compiler>"
+    echo "-----"
+    { { echo "$as_me:$LINENO: error: MPI C compiler (${MPI_CC}) not found." >&5
+echo "$as_me: error: MPI C compiler (${MPI_CC}) not found." >&2;}
+   { (exit 1); exit 1; }; }
+  fi
+fi
+
+if test "X$ac_cv_use_fortran" = "Xyes"; then
+# The Fortran wrapper is only validated when ac_cv_use_fortran is yes.
+if test -n "${MPI_F77}"; then
+  if test -f ${MPI_F77}; then
+    MPI_F77_EXISTS=yes
+  else
+    # Extract the first word of "${MPI_F77}", so it can be a program name with args.
+set dummy ${MPI_F77}; ac_word=$2
+{ echo "$as_me:$LINENO: checking for $ac_word" >&5
+echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; }
+if test "${ac_cv_prog_MPI_F77_EXISTS+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  if test -n "$MPI_F77_EXISTS"; then
+  ac_cv_prog_MPI_F77_EXISTS="$MPI_F77_EXISTS" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+  for ac_exec_ext in '' $ac_executable_extensions; do
+  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+    ac_cv_prog_MPI_F77_EXISTS="yes"
+    echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+done
+IFS=$as_save_IFS
+
+  test -z "$ac_cv_prog_MPI_F77_EXISTS" && ac_cv_prog_MPI_F77_EXISTS="no"
+fi
+fi
+MPI_F77_EXISTS=$ac_cv_prog_MPI_F77_EXISTS
+if test -n "$MPI_F77_EXISTS"; then
+  { echo "$as_me:$LINENO: result: $MPI_F77_EXISTS" >&5
+echo "${ECHO_T}$MPI_F77_EXISTS" >&6; }
+else
+  { echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6; }
+fi
+
+
+  fi
+  # Make the MPI wrapper the Fortran 77 compiler, or abort with guidance when it was not found.
+  if test "X${MPI_F77_EXISTS}" = "Xyes"; then
+    F77=${MPI_F77}
+  else
+    echo "-----"
+    echo "Cannot find MPI Fortran compiler ${MPI_F77}."
+    echo "Specify a path to all mpi compilers with --with-mpi-compilers=PATH"
+    echo "or specify a Fortran 77 compiler using F77=<compiler>"
+    echo "Do not use --with-mpi-compilers if using F77=<compiler>"
+    echo "-----"
+    { { echo "$as_me:$LINENO: error: MPI Fortran 77 compiler (${MPI_F77}) not found." >&5
+echo "$as_me: error: MPI Fortran 77 compiler (${MPI_F77}) not found." >&2;}
+   { (exit 1); exit 1; }; }
+  fi
+fi
+
+fi
+
+#  #np# - can eliminate compiler checks below if your package does not use the
+#         language corresponding to the check.  Please note that if you use
+#	  F77_FUNC to determine Fortran name mangling, you should not remove
+#	  the Fortran compiler check or the check for Fortran flags.  Doing
+#	  so will prevent the detection of the proper name mangling in some
+#	  cases.
+# ------------------------------------------------------------------------
+# Checks for programs
+# ------------------------------------------------------------------------
+
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+if test -n "$ac_tool_prefix"; then
+  for ac_prog in cc gcc
+  do
+    # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args.
+set dummy $ac_tool_prefix$ac_prog; ac_word=$2
+{ echo "$as_me:$LINENO: checking for $ac_word" >&5
+echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; }
+if test "${ac_cv_prog_CC+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  if test -n "$CC"; then
+  ac_cv_prog_CC="$CC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+  for ac_exec_ext in '' $ac_executable_extensions; do
+  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+    ac_cv_prog_CC="$ac_tool_prefix$ac_prog"
+    echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+done
+IFS=$as_save_IFS
+
+fi
+fi
+CC=$ac_cv_prog_CC
+if test -n "$CC"; then
+  { echo "$as_me:$LINENO: result: $CC" >&5
+echo "${ECHO_T}$CC" >&6; }
+else
+  { echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6; }
+fi
+
+
+    test -n "$CC" && break
+  done
+fi
+if test -z "$CC"; then
+  ac_ct_CC=$CC
+  for ac_prog in cc gcc
+do
+  # Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+{ echo "$as_me:$LINENO: checking for $ac_word" >&5
+echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; }
+if test "${ac_cv_prog_ac_ct_CC+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  if test -n "$ac_ct_CC"; then
+  ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+  for ac_exec_ext in '' $ac_executable_extensions; do
+  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+    ac_cv_prog_ac_ct_CC="$ac_prog"
+    echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+done
+IFS=$as_save_IFS
+
+fi
+fi
+ac_ct_CC=$ac_cv_prog_ac_ct_CC
+if test -n "$ac_ct_CC"; then
+  { echo "$as_me:$LINENO: result: $ac_ct_CC" >&5
+echo "${ECHO_T}$ac_ct_CC" >&6; }
+else
+  { echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6; }
+fi
+
+
+  test -n "$ac_ct_CC" && break
+done
+
+  if test "x$ac_ct_CC" = x; then
+    CC=""
+  else
+    case $cross_compiling:$ac_tool_warned in
+yes:)
+{ echo "$as_me:$LINENO: WARNING: In the future, Autoconf will not detect cross-tools
+whose name does not start with the host triplet.  If you think this
+configuration is useful to you, please write to autoconf@gnu.org." >&5
+echo "$as_me: WARNING: In the future, Autoconf will not detect cross-tools
+whose name does not start with the host triplet.  If you think this
+configuration is useful to you, please write to autoconf@gnu.org." >&2;}
+ac_tool_warned=yes ;;
+esac
+    CC=$ac_ct_CC
+  fi
+fi
+
+# Abort if neither the tool-prefixed nor the plain cc/gcc search left CC set.
+test -z "$CC" && { { echo "$as_me:$LINENO: error: no acceptable C compiler found in \$PATH
+See \`config.log' for more details." >&5
+echo "$as_me: error: no acceptable C compiler found in \$PATH
+See \`config.log' for more details." >&2;}
+   { (exit 1); exit 1; }; }
+
+# Provide some information about the compiler: try --version, -v and -V in turn, sending the output to fd 5.
+echo "$as_me:$LINENO: checking for C compiler version" >&5
+ac_compiler=`set X $ac_compile; echo $2`
+{ (ac_try="$ac_compiler --version >&5"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_compiler --version >&5") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }
+{ (ac_try="$ac_compiler -v >&5"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_compiler -v >&5") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }
+{ (ac_try="$ac_compiler -V >&5"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_compiler -V >&5") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }
+
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+ac_clean_files_save=$ac_clean_files
+ac_clean_files="$ac_clean_files a.out a.exe b.out"
+# Try to create an executable without -o first, disregard a.out.
+# It will help us diagnose broken compilers, and give us a first guess
+# at exeext.
+{ echo "$as_me:$LINENO: checking for C compiler default output file name" >&5
+echo $ECHO_N "checking for C compiler default output file name... $ECHO_C" >&6; }
+ac_link_default=`echo "$ac_link" | sed 's/ -o *conftest[^ ]*//'`
+#
+# List of possible output files, starting from the most likely.
+# The algorithm is not robust to junk in `.', hence go to wildcards (a.*)
+# only as a last resort.  b.out is created by i960 compilers.
+ac_files='a_out.exe a.exe conftest.exe a.out conftest a.* conftest.* b.out'
+#
+# The IRIX 6 linker writes into existing files which may not be
+# executable, retaining their permissions.  Remove them first so a
+# subsequent execution test works.
+ac_rmfiles=
+for ac_file in $ac_files
+do
+  case $ac_file in
+    *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.o | *.obj ) ;;
+    * ) ac_rmfiles="$ac_rmfiles $ac_file";;
+  esac
+done
+rm -f $ac_rmfiles
+
+if { (ac_try="$ac_link_default"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_link_default") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; then
+  # Autoconf-2.13 could set the ac_cv_exeext variable to `no'.
+# So ignore a value of `no', otherwise this would lead to `EXEEXT = no'
+# in a Makefile.  We should not override ac_cv_exeext if it was cached,
+# so that the user can short-circuit this test for compilers unknown to
+# Autoconf.
+for ac_file in $ac_files ''
+do
+  test -f "$ac_file" || continue
+  case $ac_file in
+    *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.o | *.obj )
+	;;
+    [ab].out )
+	# We found the default executable, but exeext='' is most
+	# certainly right.
+	break;;
+    *.* )
+        if test "${ac_cv_exeext+set}" = set && test "$ac_cv_exeext" != no;
+	then :; else
+	   ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'`
+	fi
+	# We set ac_cv_exeext here because the later test for it is not
+	# safe: cross compilers may not add the suffix if given an `-o'
+	# argument, so we may need to know it at that point already.
+	# Even if this section looks crufty: it has the advantage of
+	# actually working.
+	break;;
+    * )
+	break;;
+  esac
+done
+test "$ac_cv_exeext" = no && ac_cv_exeext=
+
+else
+  ac_file=''
+fi
+
+{ echo "$as_me:$LINENO: result: $ac_file" >&5
+echo "${ECHO_T}$ac_file" >&6; }
+if test -z "$ac_file"; then
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+{ { echo "$as_me:$LINENO: error: C compiler cannot create executables
+See \`config.log' for more details." >&5
+echo "$as_me: error: C compiler cannot create executables
+See \`config.log' for more details." >&2;}
+   { (exit 77); exit 77; }; }
+fi
+
+ac_exeext=$ac_cv_exeext
+
+# Check that the compiler produces executables we can run.  If not, either
+# the compiler is broken, or we cross compile.
+{ echo "$as_me:$LINENO: checking whether the C compiler works" >&5
+echo $ECHO_N "checking whether the C compiler works... $ECHO_C" >&6; }
+# FIXME: These cross compiler hacks should be removed for Autoconf 3.0
+# If not cross compiling, check that we can run a simple program.
+if test "$cross_compiling" != yes; then
+  if { ac_try='./$ac_file'
+  { (case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; }; then
+    cross_compiling=no
+  else
+    if test "$cross_compiling" = maybe; then
+	cross_compiling=yes
+    else
+	{ { echo "$as_me:$LINENO: error: cannot run C compiled programs.
+If you meant to cross compile, use \`--host'.
+See \`config.log' for more details." >&5
+echo "$as_me: error: cannot run C compiled programs.
+If you meant to cross compile, use \`--host'.
+See \`config.log' for more details." >&2;}
+   { (exit 1); exit 1; }; }
+    fi
+  fi
+fi
+{ echo "$as_me:$LINENO: result: yes" >&5
+echo "${ECHO_T}yes" >&6; }
+
+rm -f a.out a.exe conftest$ac_cv_exeext b.out
+ac_clean_files=$ac_clean_files_save
+# Report whether we are cross compiling, as settled by the run test above
+# (or by cross_compiling already being `yes' before that test).
+{ echo "$as_me:$LINENO: checking whether we are cross compiling" >&5
+echo $ECHO_N "checking whether we are cross compiling... $ECHO_C" >&6; }
+{ echo "$as_me:$LINENO: result: $cross_compiling" >&5
+echo "${ECHO_T}$cross_compiling" >&6; }
+
+{ echo "$as_me:$LINENO: checking for suffix of executables" >&5
+echo $ECHO_N "checking for suffix of executables... $ECHO_C" >&6; }
+if { (ac_try="$ac_link"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_link") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; then
+  # If both `conftest.exe' and `conftest' are `present' (well, observable)
+# catch `conftest.exe'.  For instance with Cygwin, `ls conftest' will
+# work properly (i.e., refer to `conftest.exe'), while it won't with
+# `rm'.
+for ac_file in conftest.exe conftest conftest.*; do
+  test -f "$ac_file" || continue
+  case $ac_file in
+    *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.o | *.obj ) ;;
+    *.* ) ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'`
+	  break;;
+    * ) break;;
+  esac
+done
+else
+  { { echo "$as_me:$LINENO: error: cannot compute suffix of executables: cannot compile and link
+See \`config.log' for more details." >&5
+echo "$as_me: error: cannot compute suffix of executables: cannot compile and link
+See \`config.log' for more details." >&2;}
+   { (exit 1); exit 1; }; }
+fi
+
+rm -f conftest$ac_cv_exeext
+{ echo "$as_me:$LINENO: result: $ac_cv_exeext" >&5
+echo "${ECHO_T}$ac_cv_exeext" >&6; }
+
+rm -f conftest.$ac_ext
+EXEEXT=$ac_cv_exeext
+ac_exeext=$EXEEXT
+{ echo "$as_me:$LINENO: checking for suffix of object files" >&5
+echo $ECHO_N "checking for suffix of object files... $ECHO_C" >&6; }
+if test "${ac_cv_objext+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.o conftest.obj
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_compile") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; then
+  for ac_file in conftest.o conftest.obj conftest.*; do
+  test -f "$ac_file" || continue;
+  case $ac_file in
+    *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf ) ;;
+    *) ac_cv_objext=`expr "$ac_file" : '.*\.\(.*\)'`
+       break;;
+  esac
+done
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+{ { echo "$as_me:$LINENO: error: cannot compute suffix of object files: cannot compile
+See \`config.log' for more details." >&5
+echo "$as_me: error: cannot compute suffix of object files: cannot compile
+See \`config.log' for more details." >&2;}
+   { (exit 1); exit 1; }; }
+fi
+
+rm -f conftest.$ac_cv_objext conftest.$ac_ext
+fi
+{ echo "$as_me:$LINENO: result: $ac_cv_objext" >&5
+echo "${ECHO_T}$ac_cv_objext" >&6; }
+OBJEXT=$ac_cv_objext
+ac_objext=$OBJEXT
+{ echo "$as_me:$LINENO: checking whether we are using the GNU C compiler" >&5
+echo $ECHO_N "checking whether we are using the GNU C compiler... $ECHO_C" >&6; }
+if test "${ac_cv_c_compiler_gnu+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+
+int
+main ()
+{
+#ifndef __GNUC__
+       choke me
+#endif
+
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
+  ac_compiler_gnu=yes
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ac_compiler_gnu=no
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+ac_cv_c_compiler_gnu=$ac_compiler_gnu
+
+fi
+{ echo "$as_me:$LINENO: result: $ac_cv_c_compiler_gnu" >&5
+echo "${ECHO_T}$ac_cv_c_compiler_gnu" >&6; }
+GCC=`test $ac_compiler_gnu = yes && echo yes`
+ac_test_CFLAGS=${CFLAGS+set}
+ac_save_CFLAGS=$CFLAGS
+{ echo "$as_me:$LINENO: checking whether $CC accepts -g" >&5
+echo $ECHO_N "checking whether $CC accepts -g... $ECHO_C" >&6; }
+if test "${ac_cv_prog_cc_g+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  ac_save_c_werror_flag=$ac_c_werror_flag
+   ac_c_werror_flag=yes
+   ac_cv_prog_cc_g=no
+   CFLAGS="-g"
+   cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
+  ac_cv_prog_cc_g=yes
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	CFLAGS=""
+      cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
+  :
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ac_c_werror_flag=$ac_save_c_werror_flag
+	 CFLAGS="-g"
+	 cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
+  ac_cv_prog_cc_g=yes
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+   ac_c_werror_flag=$ac_save_c_werror_flag
+fi
+{ echo "$as_me:$LINENO: result: $ac_cv_prog_cc_g" >&5
+echo "${ECHO_T}$ac_cv_prog_cc_g" >&6; }
+if test "$ac_test_CFLAGS" = set; then
+  CFLAGS=$ac_save_CFLAGS
+elif test $ac_cv_prog_cc_g = yes; then
+  if test "$GCC" = yes; then
+    CFLAGS="-g -O2"
+  else
+    CFLAGS="-g"
+  fi
+else
+  if test "$GCC" = yes; then
+    CFLAGS="-O2"
+  else
+    CFLAGS=
+  fi
+fi
+{ echo "$as_me:$LINENO: checking for $CC option to accept ISO C89" >&5
+echo $ECHO_N "checking for $CC option to accept ISO C89... $ECHO_C" >&6; }
+if test "${ac_cv_prog_cc_c89+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  ac_cv_prog_cc_c89=no
+ac_save_CC=$CC
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+#include <stdarg.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+/* Most of the following tests are stolen from RCS 5.7's src/conf.sh.  */
+struct buf { int x; };
+FILE * (*rcsopen) (struct buf *, struct stat *, int);
+static char *e (p, i)
+     char **p;
+     int i;
+{
+  return p[i];
+}
+static char *f (char * (*g) (char **, int), char **p, ...)
+{
+  char *s;
+  va_list v;
+  va_start (v,p);
+  s = g (p, va_arg (v,int));
+  va_end (v);
+  return s;
+}
+
+/* OSF 4.0 Compaq cc is some sort of almost-ANSI by default.  It has
+   function prototypes and stuff, but not '\xHH' hex character constants.
+   These don't provoke an error unfortunately, instead are silently treated
+   as 'x'.  The following induces an error, until -std is added to get
+   proper ANSI mode.  Curiously '\x00'!='x' always comes out true, for an
+   array size at least.  It's necessary to write '\x00'==0 to get something
+   that's true only with -std.  */
+int osf4_cc_array ['\x00' == 0 ? 1 : -1];
+
+/* IBM C 6 for AIX is almost-ANSI by default, but it replaces macro parameters
+   inside strings and character constants.  */
+#define FOO(x) 'x'
+int xlc6_cc_array[FOO(a) == 'x' ? 1 : -1];
+
+int test (int i, double x);
+struct s1 {int (*f) (int a);};
+struct s2 {int (*f) (double a);};
+int pairnames (int, char **, FILE *(*)(struct buf *, struct stat *, int), int, int);
+int argc;
+char **argv;
+int
+main ()
+{
+return f (e, argv, 0) != argv[0]  ||  f (e, argv, 1) != argv[1];
+  ;
+  return 0;
+}
+_ACEOF
+for ac_arg in '' -qlanglvl=extc89 -qlanglvl=ansi -std \
+	-Ae "-Aa -D_HPUX_SOURCE" "-Xc -D__EXTENSIONS__"
+do
+  CC="$ac_save_CC $ac_arg"
+  rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
+  ac_cv_prog_cc_c89=$ac_arg
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+
+fi
+
+rm -f core conftest.err conftest.$ac_objext
+  test "x$ac_cv_prog_cc_c89" != "xno" && break
+done
+rm -f conftest.$ac_ext
+CC=$ac_save_CC
+
+fi
+# AC_CACHE_VAL
+case "x$ac_cv_prog_cc_c89" in
+  x)
+    { echo "$as_me:$LINENO: result: none needed" >&5
+echo "${ECHO_T}none needed" >&6; } ;;
+  xno)
+    { echo "$as_me:$LINENO: result: unsupported" >&5
+echo "${ECHO_T}unsupported" >&6; } ;;
+  *)
+    CC="$CC $ac_cv_prog_cc_c89"
+    { echo "$as_me:$LINENO: result: $ac_cv_prog_cc_c89" >&5
+echo "${ECHO_T}$ac_cv_prog_cc_c89" >&6; } ;;
+esac
+
+
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+DEPDIR="${am__leading_dot}deps"
+
+ac_config_commands="$ac_config_commands depfiles"
+
+
+am_make=${MAKE-make}
+cat > confinc << 'END'
+am__doit:
+	@echo done
+.PHONY: am__doit
+END
+# If we don't find an include directive, just comment out the code.
+{ echo "$as_me:$LINENO: checking for style of include used by $am_make" >&5
+echo $ECHO_N "checking for style of include used by $am_make... $ECHO_C" >&6; }
+am__include="#"
+am__quote=
+_am_result=none
+# First try GNU make style include.
+echo "include confinc" > confmf
+# We grep out `Entering directory' and `Leaving directory'
+# messages which can occur if `w' ends up in MAKEFLAGS.
+# In particular we don't look at `^make:' because GNU make might
+# be invoked under some other name (usually "gmake"), in which
+# case it prints its new name instead of `make'.
+if test "`$am_make -s -f confmf 2> /dev/null | grep -v 'ing directory'`" = "done"; then
+   am__include=include
+   am__quote=
+   _am_result=GNU
+fi
+# Now try BSD make style include.
+if test "$am__include" = "#"; then
+   echo '.include "confinc"' > confmf
+   if test "`$am_make -s -f confmf 2> /dev/null`" = "done"; then
+      am__include=.include
+      am__quote="\""
+      _am_result=BSD
+   fi
+fi
+
+
+{ echo "$as_me:$LINENO: result: $_am_result" >&5
+echo "${ECHO_T}$_am_result" >&6; }
+rm -f confinc confmf
+
+# Check whether --enable-dependency-tracking was given.
+if test "${enable_dependency_tracking+set}" = set; then
+  enableval=$enable_dependency_tracking;
+fi
+
+if test "x$enable_dependency_tracking" != xno; then
+  am_depcomp="$ac_aux_dir/depcomp"
+  AMDEPBACKSLASH='\'
+fi
+ if test "x$enable_dependency_tracking" != xno; then
+  AMDEP_TRUE=
+  AMDEP_FALSE='#'
+else
+  AMDEP_TRUE='#'
+  AMDEP_FALSE=
+fi
+
+
+
+depcc="$CC"   am_compiler_list=
+
+{ echo "$as_me:$LINENO: checking dependency style of $depcc" >&5
+echo $ECHO_N "checking dependency style of $depcc... $ECHO_C" >&6; }
+if test "${am_cv_CC_dependencies_compiler_type+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then
+  # We make a subdir and do the tests there.  Otherwise we can end up
+  # making bogus files that we don't know about and never remove.  For
+  # instance it was reported that on HP-UX the gcc test will end up
+  # making a dummy file named `D' -- because `-MD' means `put the output
+  # in D'.
+  mkdir conftest.dir
+  # Copy depcomp to subdir because otherwise we won't find it if we're
+  # using a relative directory.
+  cp "$am_depcomp" conftest.dir
+  cd conftest.dir
+  # We will build objects and dependencies in a subdirectory because
+  # it helps to detect inapplicable dependency modes.  For instance
+  # both Tru64's cc and ICC support -MD to output dependencies as a
+  # side effect of compilation, but ICC will put the dependencies in
+  # the current directory while Tru64 will put them in the object
+  # directory.
+  mkdir sub
+
+  am_cv_CC_dependencies_compiler_type=none
+  if test "$am_compiler_list" = ""; then
+     am_compiler_list=`sed -n 's/^#*\([a-zA-Z0-9]*\))$/\1/p' < ./depcomp`
+  fi
+  for depmode in $am_compiler_list; do
+    # Setup a source with many dependencies, because some compilers
+    # like to wrap large dependency lists on column 80 (with \), and
+    # we should not choose a depcomp mode which is confused by this.
+    #
+    # We need to recreate these files for each test, as the compiler may
+    # overwrite some of them when testing with obscure command lines.
+    # This happens at least with the AIX C compiler.
+    : > sub/conftest.c
+    for i in 1 2 3 4 5 6; do
+      echo '#include "conftst'$i'.h"' >> sub/conftest.c
+      # Using `: > sub/conftst$i.h' creates only sub/conftst1.h with
+      # Solaris 8's {/usr,}/bin/sh.
+      touch sub/conftst$i.h
+    done
+    echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf
+
+    case $depmode in
+    nosideeffect)
+      # after this tag, mechanisms are not by side-effect, so they'll
+      # only be used when explicitly requested
+      if test "x$enable_dependency_tracking" = xyes; then
+	continue
+      else
+	break
+      fi
+      ;;
+    none) break ;;
+    esac
+    # We check with `-c' and `-o' for the sake of the "dashmstdout"
+    # mode.  It turns out that the SunPro C++ compiler does not properly
+    # handle `-M -o', and we need to detect this.
+    if depmode=$depmode \
+       source=sub/conftest.c object=sub/conftest.${OBJEXT-o} \
+       depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \
+       $SHELL ./depcomp $depcc -c -o sub/conftest.${OBJEXT-o} sub/conftest.c \
+         >/dev/null 2>conftest.err &&
+       grep sub/conftst1.h sub/conftest.Po > /dev/null 2>&1 &&
+       grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 &&
+       grep sub/conftest.${OBJEXT-o} sub/conftest.Po > /dev/null 2>&1 &&
+       ${MAKE-make} -s -f confmf > /dev/null 2>&1; then
+      # icc doesn't choke on unknown options, it will just issue warnings
+      # or remarks (even with -Werror).  So we grep stderr for any message
+      # that says an option was ignored or not supported.
+      # When given -MP, icc 7.0 and 7.1 complain thusly:
+      #   icc: Command line warning: ignoring option '-M'; no argument required
+      # The diagnostic changed in icc 8.0:
+      #   icc: Command line remark: option '-MP' not supported
+      if (grep 'ignoring option' conftest.err ||
+          grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else
+        am_cv_CC_dependencies_compiler_type=$depmode
+        break
+      fi
+    fi
+  done
+
+  cd ..
+  rm -rf conftest.dir
+else
+  am_cv_CC_dependencies_compiler_type=none
+fi
+
+fi
+{ echo "$as_me:$LINENO: result: $am_cv_CC_dependencies_compiler_type" >&5
+echo "${ECHO_T}$am_cv_CC_dependencies_compiler_type" >&6; }
+CCDEPMODE=depmode=$am_cv_CC_dependencies_compiler_type
+
+ if
+  test "x$enable_dependency_tracking" != xno \
+  && test "$am_cv_CC_dependencies_compiler_type" = gcc3; then
+  am__fastdepCC_TRUE=
+  am__fastdepCC_FALSE='#'
+else
+  am__fastdepCC_TRUE='#'
+  am__fastdepCC_FALSE=
+fi
+
+
+ac_ext=cpp
+ac_cpp='$CXXCPP $CPPFLAGS'
+ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
+if test -z "$CXX"; then
+  if test -n "$CCC"; then
+    CXX=$CCC
+  else
+    if test -n "$ac_tool_prefix"; then
+  for ac_prog in CC g++ c++ cxx
+  do
+    # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args.
+set dummy $ac_tool_prefix$ac_prog; ac_word=$2
+{ echo "$as_me:$LINENO: checking for $ac_word" >&5
+echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; }
+if test "${ac_cv_prog_CXX+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  if test -n "$CXX"; then
+  ac_cv_prog_CXX="$CXX" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+  for ac_exec_ext in '' $ac_executable_extensions; do
+  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+    ac_cv_prog_CXX="$ac_tool_prefix$ac_prog"
+    echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+done
+IFS=$as_save_IFS
+
+fi
+fi
+CXX=$ac_cv_prog_CXX
+if test -n "$CXX"; then
+  { echo "$as_me:$LINENO: result: $CXX" >&5
+echo "${ECHO_T}$CXX" >&6; }
+else
+  { echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6; }
+fi
+
+
+    test -n "$CXX" && break
+  done
+fi
+if test -z "$CXX"; then
+  ac_ct_CXX=$CXX
+  for ac_prog in CC g++ c++ cxx
+do
+  # Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+{ echo "$as_me:$LINENO: checking for $ac_word" >&5
+echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; }
+if test "${ac_cv_prog_ac_ct_CXX+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  if test -n "$ac_ct_CXX"; then
+  ac_cv_prog_ac_ct_CXX="$ac_ct_CXX" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+  for ac_exec_ext in '' $ac_executable_extensions; do
+  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+    ac_cv_prog_ac_ct_CXX="$ac_prog"
+    echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+done
+IFS=$as_save_IFS
+
+fi
+fi
+ac_ct_CXX=$ac_cv_prog_ac_ct_CXX
+if test -n "$ac_ct_CXX"; then
+  { echo "$as_me:$LINENO: result: $ac_ct_CXX" >&5
+echo "${ECHO_T}$ac_ct_CXX" >&6; }
+else
+  { echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6; }
+fi
+
+
+  test -n "$ac_ct_CXX" && break
+done
+
+  if test "x$ac_ct_CXX" = x; then
+    CXX="g++"
+  else
+    case $cross_compiling:$ac_tool_warned in
+yes:)
+{ echo "$as_me:$LINENO: WARNING: In the future, Autoconf will not detect cross-tools
+whose name does not start with the host triplet.  If you think this
+configuration is useful to you, please write to autoconf@gnu.org." >&5
+echo "$as_me: WARNING: In the future, Autoconf will not detect cross-tools
+whose name does not start with the host triplet.  If you think this
+configuration is useful to you, please write to autoconf@gnu.org." >&2;}
+ac_tool_warned=yes ;;
+esac
+    CXX=$ac_ct_CXX
+  fi
+fi
+
+  fi
+fi
+# Provide some information about the compiler.
+echo "$as_me:$LINENO: checking for C++ compiler version" >&5
+ac_compiler=`set X $ac_compile; echo $2`
+{ (ac_try="$ac_compiler --version >&5"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_compiler --version >&5") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }
+{ (ac_try="$ac_compiler -v >&5"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_compiler -v >&5") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }
+{ (ac_try="$ac_compiler -V >&5"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_compiler -V >&5") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }
+
+{ echo "$as_me:$LINENO: checking whether we are using the GNU C++ compiler" >&5
+echo $ECHO_N "checking whether we are using the GNU C++ compiler... $ECHO_C" >&6; }
+if test "${ac_cv_cxx_compiler_gnu+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+
+int
+main ()
+{
+#ifndef __GNUC__
+       choke me
+#endif
+
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_cxx_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
+  ac_compiler_gnu=yes
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ac_compiler_gnu=no
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+ac_cv_cxx_compiler_gnu=$ac_compiler_gnu
+
+fi
+{ echo "$as_me:$LINENO: result: $ac_cv_cxx_compiler_gnu" >&5
+echo "${ECHO_T}$ac_cv_cxx_compiler_gnu" >&6; }
+GXX=`test $ac_compiler_gnu = yes && echo yes`
+ac_test_CXXFLAGS=${CXXFLAGS+set}
+ac_save_CXXFLAGS=$CXXFLAGS
+{ echo "$as_me:$LINENO: checking whether $CXX accepts -g" >&5
+echo $ECHO_N "checking whether $CXX accepts -g... $ECHO_C" >&6; }
+if test "${ac_cv_prog_cxx_g+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  ac_save_cxx_werror_flag=$ac_cxx_werror_flag
+   ac_cxx_werror_flag=yes
+   ac_cv_prog_cxx_g=no
+   CXXFLAGS="-g"
+   cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_cxx_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
+  ac_cv_prog_cxx_g=yes
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	CXXFLAGS=""
+      cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_cxx_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
+  :
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ac_cxx_werror_flag=$ac_save_cxx_werror_flag
+	 CXXFLAGS="-g"
+	 cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_cxx_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
+  ac_cv_prog_cxx_g=yes
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+   ac_cxx_werror_flag=$ac_save_cxx_werror_flag
+fi
+{ echo "$as_me:$LINENO: result: $ac_cv_prog_cxx_g" >&5
+echo "${ECHO_T}$ac_cv_prog_cxx_g" >&6; }
+if test "$ac_test_CXXFLAGS" = set; then
+  CXXFLAGS=$ac_save_CXXFLAGS
+elif test $ac_cv_prog_cxx_g = yes; then
+  if test "$GXX" = yes; then
+    CXXFLAGS="-g -O2"
+  else
+    CXXFLAGS="-g"
+  fi
+else
+  if test "$GXX" = yes; then
+    CXXFLAGS="-O2"
+  else
+    CXXFLAGS=
+  fi
+fi
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+depcc="$CXX"  am_compiler_list=
+
+{ echo "$as_me:$LINENO: checking dependency style of $depcc" >&5
+echo $ECHO_N "checking dependency style of $depcc... $ECHO_C" >&6; }
+if test "${am_cv_CXX_dependencies_compiler_type+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then
+  # We make a subdir and do the tests there.  Otherwise we can end up
+  # making bogus files that we don't know about and never remove.  For
+  # instance it was reported that on HP-UX the gcc test will end up
+  # making a dummy file named `D' -- because `-MD' means `put the output
+  # in D'.
+  mkdir conftest.dir
+  # Copy depcomp to subdir because otherwise we won't find it if we're
+  # using a relative directory.
+  cp "$am_depcomp" conftest.dir
+  cd conftest.dir
+  # We will build objects and dependencies in a subdirectory because
+  # it helps to detect inapplicable dependency modes.  For instance
+  # both Tru64's cc and ICC support -MD to output dependencies as a
+  # side effect of compilation, but ICC will put the dependencies in
+  # the current directory while Tru64 will put them in the object
+  # directory.
+  mkdir sub
+
+  am_cv_CXX_dependencies_compiler_type=none
+  if test "$am_compiler_list" = ""; then
+     am_compiler_list=`sed -n 's/^#*\([a-zA-Z0-9]*\))$/\1/p' < ./depcomp`
+  fi
+  for depmode in $am_compiler_list; do
+    # Setup a source with many dependencies, because some compilers
+    # like to wrap large dependency lists on column 80 (with \), and
+    # we should not choose a depcomp mode which is confused by this.
+    #
+    # We need to recreate these files for each test, as the compiler may
+    # overwrite some of them when testing with obscure command lines.
+    # This happens at least with the AIX C compiler.
+    : > sub/conftest.c
+    for i in 1 2 3 4 5 6; do
+      echo '#include "conftst'$i'.h"' >> sub/conftest.c
+      # Using `: > sub/conftst$i.h' creates only sub/conftst1.h with
+      # Solaris 8's {/usr,}/bin/sh.
+      touch sub/conftst$i.h
+    done
+    echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf
+
+    case $depmode in
+    nosideeffect)
+      # after this tag, mechanisms are not by side-effect, so they'll
+      # only be used when explicitly requested
+      if test "x$enable_dependency_tracking" = xyes; then
+	continue
+      else
+	break
+      fi
+      ;;
+    none) break ;;
+    esac
+    # We check with `-c' and `-o' for the sake of the "dashmstdout"
+    # mode.  It turns out that the SunPro C++ compiler does not properly
+    # handle `-M -o', and we need to detect this.
+    if depmode=$depmode \
+       source=sub/conftest.c object=sub/conftest.${OBJEXT-o} \
+       depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \
+       $SHELL ./depcomp $depcc -c -o sub/conftest.${OBJEXT-o} sub/conftest.c \
+         >/dev/null 2>conftest.err &&
+       grep sub/conftst1.h sub/conftest.Po > /dev/null 2>&1 &&
+       grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 &&
+       grep sub/conftest.${OBJEXT-o} sub/conftest.Po > /dev/null 2>&1 &&
+       ${MAKE-make} -s -f confmf > /dev/null 2>&1; then
+      # icc doesn't choke on unknown options, it will just issue warnings
+      # or remarks (even with -Werror).  So we grep stderr for any message
+      # that says an option was ignored or not supported.
+      # When given -MP, icc 7.0 and 7.1 complain thusly:
+      #   icc: Command line warning: ignoring option '-M'; no argument required
+      # The diagnosis changed in icc 8.0:
+      #   icc: Command line remark: option '-MP' not supported
+      if (grep 'ignoring option' conftest.err ||
+          grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else
+        am_cv_CXX_dependencies_compiler_type=$depmode
+        break
+      fi
+    fi
+  done
+
+  cd ..
+  rm -rf conftest.dir
+else
+  am_cv_CXX_dependencies_compiler_type=none
+fi
+
+fi
+{ echo "$as_me:$LINENO: result: $am_cv_CXX_dependencies_compiler_type" >&5
+echo "${ECHO_T}$am_cv_CXX_dependencies_compiler_type" >&6; }
+CXXDEPMODE=depmode=$am_cv_CXX_dependencies_compiler_type
+
+ if
+  test "x$enable_dependency_tracking" != xno \
+  && test "$am_cv_CXX_dependencies_compiler_type" = gcc3; then
+  am__fastdepCXX_TRUE=
+  am__fastdepCXX_FALSE='#'
+else
+  am__fastdepCXX_TRUE='#'
+  am__fastdepCXX_FALSE=
+fi
+
+
+#AC_PROG_F77(f77 g77 gfortran f90 xlf90 f95)
+if test -n "$ac_tool_prefix"; then
+  # Extract the first word of "${ac_tool_prefix}ranlib", so it can be a program name with args.
+set dummy ${ac_tool_prefix}ranlib; ac_word=$2
+{ echo "$as_me:$LINENO: checking for $ac_word" >&5
+echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; }
+if test "${ac_cv_prog_RANLIB+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  if test -n "$RANLIB"; then
+  ac_cv_prog_RANLIB="$RANLIB" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+  for ac_exec_ext in '' $ac_executable_extensions; do
+  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+    ac_cv_prog_RANLIB="${ac_tool_prefix}ranlib"
+    echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+done
+IFS=$as_save_IFS
+
+fi
+fi
+RANLIB=$ac_cv_prog_RANLIB
+if test -n "$RANLIB"; then
+  { echo "$as_me:$LINENO: result: $RANLIB" >&5
+echo "${ECHO_T}$RANLIB" >&6; }
+else
+  { echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6; }
+fi
+
+
+fi
+if test -z "$ac_cv_prog_RANLIB"; then
+  ac_ct_RANLIB=$RANLIB
+  # Extract the first word of "ranlib", so it can be a program name with args.
+set dummy ranlib; ac_word=$2
+{ echo "$as_me:$LINENO: checking for $ac_word" >&5
+echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; }
+if test "${ac_cv_prog_ac_ct_RANLIB+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  if test -n "$ac_ct_RANLIB"; then
+  ac_cv_prog_ac_ct_RANLIB="$ac_ct_RANLIB" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+  for ac_exec_ext in '' $ac_executable_extensions; do
+  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+    ac_cv_prog_ac_ct_RANLIB="ranlib"
+    echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+done
+IFS=$as_save_IFS
+
+fi
+fi
+ac_ct_RANLIB=$ac_cv_prog_ac_ct_RANLIB
+if test -n "$ac_ct_RANLIB"; then
+  { echo "$as_me:$LINENO: result: $ac_ct_RANLIB" >&5
+echo "${ECHO_T}$ac_ct_RANLIB" >&6; }
+else
+  { echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6; }
+fi
+
+  if test "x$ac_ct_RANLIB" = x; then
+    RANLIB=":"
+  else
+    case $cross_compiling:$ac_tool_warned in
+yes:)
+{ echo "$as_me:$LINENO: WARNING: In the future, Autoconf will not detect cross-tools
+whose name does not start with the host triplet.  If you think this
+configuration is useful to you, please write to autoconf@gnu.org." >&5
+echo "$as_me: WARNING: In the future, Autoconf will not detect cross-tools
+whose name does not start with the host triplet.  If you think this
+configuration is useful to you, please write to autoconf@gnu.org." >&2;}
+ac_tool_warned=yes ;;
+esac
+    RANLIB=$ac_ct_RANLIB
+  fi
+else
+  RANLIB="$ac_cv_prog_RANLIB"
+fi
+
+
+# Check for --with-ccflags, --with-cxxflags, --with-cflags, --with-libs and --with-ldflags; prepend any given value to the matching variable
+
+
+{ echo "$as_me:$LINENO: checking whether additional CCFLAGS flags should be added" >&5
+echo $ECHO_N "checking whether additional CCFLAGS flags should be added... $ECHO_C" >&6; }
+
+# Check whether --with-ccflags was given.
+if test "${with_ccflags+set}" = set; then
+  withval=$with_ccflags;
+CCFLAGS="${withval} ${CCFLAGS}"
+{ echo "$as_me:$LINENO: result: CCFLAGS = ${CCFLAGS}" >&5
+echo "${ECHO_T}CCFLAGS = ${CCFLAGS}" >&6; }
+
+else
+  { echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6; }
+
+fi
+
+
+
+{ echo "$as_me:$LINENO: checking whether additional CXXFLAGS flags should be added" >&5
+echo $ECHO_N "checking whether additional CXXFLAGS flags should be added... $ECHO_C" >&6; }
+
+# Check whether --with-cxxflags was given.
+if test "${with_cxxflags+set}" = set; then
+  withval=$with_cxxflags;
+CXXFLAGS="${withval} ${CXXFLAGS}"
+{ echo "$as_me:$LINENO: result: CXXFLAGS = ${CXXFLAGS}" >&5
+echo "${ECHO_T}CXXFLAGS = ${CXXFLAGS}" >&6; }
+
+else
+  { echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6; }
+
+fi
+
+
+
+{ echo "$as_me:$LINENO: checking whether additional CFLAGS flags should be added" >&5
+echo $ECHO_N "checking whether additional CFLAGS flags should be added... $ECHO_C" >&6; }
+
+# Check whether --with-cflags was given.
+if test "${with_cflags+set}" = set; then
+  withval=$with_cflags;
+CFLAGS="${withval} ${CFLAGS}"
+{ echo "$as_me:$LINENO: result: CFLAGS = ${CFLAGS}" >&5
+echo "${ECHO_T}CFLAGS = ${CFLAGS}" >&6; }
+
+else
+  { echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6; }
+
+fi
+
+
+#TAC_ARG_WITH_FLAGS(fflags, FFLAGS)
+
+{ echo "$as_me:$LINENO: checking whether additional libraries are needed" >&5
+echo $ECHO_N "checking whether additional libraries are needed... $ECHO_C" >&6; }
+
+# Check whether --with-libs was given.
+if test "${with_libs+set}" = set; then
+  withval=$with_libs;
+LIBS="${withval} ${LIBS}"
+{ echo "$as_me:$LINENO: result: LIBS = ${LIBS}" >&5
+echo "${ECHO_T}LIBS = ${LIBS}" >&6; }
+
+else
+  { echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6; }
+
+fi
+
+
+
+
+{ echo "$as_me:$LINENO: checking whether additional LDFLAGS flags should be added" >&5
+echo $ECHO_N "checking whether additional LDFLAGS flags should be added... $ECHO_C" >&6; }
+
+# Check whether --with-ldflags was given.
+if test "${with_ldflags+set}" = set; then
+  withval=$with_ldflags;
+LDFLAGS="${withval} ${LDFLAGS}"
+{ echo "$as_me:$LINENO: result: LDFLAGS = ${LDFLAGS}" >&5
+echo "${ECHO_T}LDFLAGS = ${LDFLAGS}" >&6; }
+
+else
+  { echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6; }
+
+fi
+
+
+
+# ------------------------------------------------------------------------
+# Alternate archiver
+# ------------------------------------------------------------------------
+
+
+
+# Check whether --with-ar was given.
+if test "${with_ar+set}" = set; then
+  withval=$with_ar;
+{ echo "$as_me:$LINENO: checking user-defined archiver" >&5
+echo $ECHO_N "checking user-defined archiver... $ECHO_C" >&6; }
+{ echo "$as_me:$LINENO: result: ${withval}" >&5
+echo "${ECHO_T}${withval}" >&6; }
+USE_ALTERNATE_AR=yes
+ALTERNATE_AR="${withval}"
+
+
+fi
+
+
+if test -n "${SPECIAL_AR}" && test "X${USE_ALTERNATE_AR}" != "Xyes";
+then
+  USE_ALTERNATE_AR=yes
+  ALTERNATE_AR="${SPECIAL_AR}"
+fi
+
+{ echo "$as_me:$LINENO: checking for special archiver command" >&5
+echo $ECHO_N "checking for special archiver command... $ECHO_C" >&6; }
+if test "X${USE_ALTERNATE_AR}" = "Xyes"; then
+   { echo "$as_me:$LINENO: result: ${ALTERNATE_AR}" >&5
+echo "${ECHO_T}${ALTERNATE_AR}" >&6; }
+    if true; then
+  USE_ALTERNATE_AR_TRUE=
+  USE_ALTERNATE_AR_FALSE='#'
+else
+  USE_ALTERNATE_AR_TRUE='#'
+  USE_ALTERNATE_AR_FALSE=
+fi
+
+else
+   { echo "$as_me:$LINENO: result: none" >&5
+echo "${ECHO_T}none" >&6; }
+    if false; then
+  USE_ALTERNATE_AR_TRUE=
+  USE_ALTERNATE_AR_FALSE='#'
+else
+  USE_ALTERNATE_AR_TRUE='#'
+  USE_ALTERNATE_AR_FALSE=
+fi
+
+fi
+
+
+
+# ------------------------------------------------------------------------
+# MPI link check
+# ------------------------------------------------------------------------
+
+ac_ext=cpp
+ac_cpp='$CXXCPP $CPPFLAGS'
+ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
+{ echo "$as_me:$LINENO: checking how to run the C++ preprocessor" >&5
+echo $ECHO_N "checking how to run the C++ preprocessor... $ECHO_C" >&6; }
+if test -z "$CXXCPP"; then
+  if test "${ac_cv_prog_CXXCPP+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+      # Double quotes because CXXCPP needs to be expanded
+    for CXXCPP in "$CXX -E" "/lib/cpp"
+    do
+      ac_preproc_ok=false
+for ac_cxx_preproc_warn_flag in '' yes
+do
+  # Use a header file that comes with gcc, so configuring glibc
+  # with a fresh cross-compiler works.
+  # Prefer <limits.h> to <assert.h> if __STDC__ is defined, since
+  # <limits.h> exists even on freestanding compilers.
+  # On the NeXT, cc -E runs the code through the compiler's parser,
+  # not just through cpp. "Syntax error" is here to catch this case.
+  cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+#ifdef __STDC__
+# include <limits.h>
+#else
+# include <assert.h>
+#endif
+		     Syntax error
+_ACEOF
+if { (ac_try="$ac_cpp conftest.$ac_ext"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_cpp conftest.$ac_ext") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } >/dev/null && {
+	 test -z "$ac_cxx_preproc_warn_flag$ac_cxx_werror_flag" ||
+	 test ! -s conftest.err
+       }; then
+  :
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+  # Broken: fails on valid input.
+continue
+fi
+
+rm -f conftest.err conftest.$ac_ext
+
+  # OK, works on sane cases.  Now check whether nonexistent headers
+  # can be detected and how.
+  cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+#include <ac_nonexistent.h>
+_ACEOF
+if { (ac_try="$ac_cpp conftest.$ac_ext"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_cpp conftest.$ac_ext") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } >/dev/null && {
+	 test -z "$ac_cxx_preproc_warn_flag$ac_cxx_werror_flag" ||
+	 test ! -s conftest.err
+       }; then
+  # Broken: success on invalid input.
+continue
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+  # Passes both tests.
+ac_preproc_ok=:
+break
+fi
+
+rm -f conftest.err conftest.$ac_ext
+
+done
+# Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped.
+rm -f conftest.err conftest.$ac_ext
+if $ac_preproc_ok; then
+  break
+fi
+
+    done
+    ac_cv_prog_CXXCPP=$CXXCPP
+
+fi
+  CXXCPP=$ac_cv_prog_CXXCPP
+else
+  ac_cv_prog_CXXCPP=$CXXCPP
+fi
+{ echo "$as_me:$LINENO: result: $CXXCPP" >&5
+echo "${ECHO_T}$CXXCPP" >&6; }
+ac_preproc_ok=false
+for ac_cxx_preproc_warn_flag in '' yes
+do
+  # Use a header file that comes with gcc, so configuring glibc
+  # with a fresh cross-compiler works.
+  # Prefer <limits.h> to <assert.h> if __STDC__ is defined, since
+  # <limits.h> exists even on freestanding compilers.
+  # On the NeXT, cc -E runs the code through the compiler's parser,
+  # not just through cpp. "Syntax error" is here to catch this case.
+  cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+#ifdef __STDC__
+# include <limits.h>
+#else
+# include <assert.h>
+#endif
+		     Syntax error
+_ACEOF
+if { (ac_try="$ac_cpp conftest.$ac_ext"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_cpp conftest.$ac_ext") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } >/dev/null && {
+	 test -z "$ac_cxx_preproc_warn_flag$ac_cxx_werror_flag" ||
+	 test ! -s conftest.err
+       }; then
+  :
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+  # Broken: fails on valid input.
+continue
+fi
+
+rm -f conftest.err conftest.$ac_ext
+
+  # OK, works on sane cases.  Now check whether nonexistent headers
+  # can be detected and how.
+  cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+#include <ac_nonexistent.h>
+_ACEOF
+if { (ac_try="$ac_cpp conftest.$ac_ext"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_cpp conftest.$ac_ext") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } >/dev/null && {
+	 test -z "$ac_cxx_preproc_warn_flag$ac_cxx_werror_flag" ||
+	 test ! -s conftest.err
+       }; then
+  # Broken: success on invalid input.
+continue
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+  # Passes both tests.
+ac_preproc_ok=:
+break
+fi
+
+rm -f conftest.err conftest.$ac_ext
+
+done
+# Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped.
+rm -f conftest.err conftest.$ac_ext
+if $ac_preproc_ok; then
+  :
+else
+  { { echo "$as_me:$LINENO: error: C++ preprocessor \"$CXXCPP\" fails sanity check
+See \`config.log' for more details." >&5
+echo "$as_me: error: C++ preprocessor \"$CXXCPP\" fails sanity check
+See \`config.log' for more details." >&2;}
+   { (exit 1); exit 1; }; }
+fi
+
+ac_ext=cpp
+ac_cpp='$CXXCPP $CPPFLAGS'
+ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
+
+
+
+
+if test "X${HAVE_PKG_MPI}" = "Xyes"; then
+
+  if test -n "${MPI_DIR}" && test -z "${MPI_INC}"; then
+    MPI_INC="${MPI_DIR}/include"
+  fi
+
+  if test -n "${MPI_INC}"; then
+    CPPFLAGS="${CPPFLAGS} -I${MPI_INC}"
+  fi
+
+  ac_ext=cpp
+ac_cpp='$CXXCPP $CPPFLAGS'
+ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
+  # Probe for mpi.h by running the C++ preprocessor ($ac_cpp) on a one-line include; failure aborts configure.
+  { echo "$as_me:$LINENO: checking for mpi.h" >&5
+echo $ECHO_N "checking for mpi.h... $ECHO_C" >&6; }
+  cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+#include "mpi.h"
+_ACEOF
+if { (ac_try="$ac_cpp conftest.$ac_ext"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_cpp conftest.$ac_ext") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } >/dev/null && {
+	 test -z "$ac_cxx_preproc_warn_flag$ac_cxx_werror_flag" ||
+	 test ! -s conftest.err
+       }; then
+  { echo "$as_me:$LINENO: result: yes" >&5
+echo "${ECHO_T}yes" >&6; }
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+     # Only the preprocessor was run here, so a failure means mpi.h could not be
+     # found or preprocessed; nothing has been compiled or linked at this point.
+     { echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6; }
+     echo "-----"
+     echo "Cannot find or preprocess the MPI header file mpi.h."
+     echo "Try --with-mpi-compilers to specify MPI compilers."
+     echo "Or try --with-mpi-libs, --with-mpi-incdir, --with-mpi-libdir"
+     echo "to specify all the specific MPI compile options."
+     echo "-----"
+     { { echo "$as_me:$LINENO: error: cannot preprocess mpi.h" >&5
+echo "$as_me: error: cannot preprocess mpi.h" >&2;}
+   { (exit 1); exit 1; }; }
+
+fi
+# Remove the probe's scratch files whether or not the header was found.
+rm -f conftest.err conftest.$ac_ext
+
+  if test -n "${MPI_DIR}" && test -z "${MPI_LIBDIR}"; then
+    MPI_LIBDIR="${MPI_DIR}/lib"
+  fi
+
+  if test -n "${MPI_LIBDIR}"; then
+    LDFLAGS="${LDFLAGS} -L${MPI_LIBDIR}"
+  fi
+
+  if test -z "${MPI_LIBS}" && test -n "${MPI_LIBDIR}"; then
+    MPI_LIBS="-lmpi"
+  fi
+
+  if test -n "${MPI_LIBS}"; then
+    LIBS="${MPI_LIBS} ${LIBS}"
+  fi
+
+#   AC_LANG_CPLUSPLUS
+#   AC_MSG_CHECKING(whether MPI will link using C++ compiler)
+#   AC_TRY_LINK([#include <mpi.h>],
+#   [int c; char** v; MPI_Init(&c,&v);],
+#   [AC_MSG_RESULT(yes)],
+#   [AC_MSG_RESULT(no)
+#    echo "-----"
+#    echo "Cannot link simple MPI program."
+#    echo "Try --with-mpi-cxx to specify MPI C++ compile script."
+#    echo "Or try --with-mpi-libs, --with-mpi-incdir, --with-mpi-libdir"
+#    echo "to specify all the specific MPI compile options."
+#    echo "-----"
+#    AC_MSG_ERROR(MPI cannot link)]
+#   )
+
+fi
+
+
+# ------------------------------------------------------------------------
+# Checks for Makefile.export related systems
+# ------------------------------------------------------------------------
+
+# Check whether --enable-export-makefiles was given.
+if test "${enable_export_makefiles+set}" = set; then
+  enableval=$enable_export_makefiles; ac_cv_use_export_makefiles=$enableval
+else
+  ac_cv_use_export_makefiles=yes
+fi
+
+
+{ echo "$as_me:$LINENO: checking whether to build export makefiles" >&5
+echo $ECHO_N "checking whether to build export makefiles... $ECHO_C" >&6; }
+
+if test "X$ac_cv_use_export_makefiles" != "Xno"; then
+
+  { echo "$as_me:$LINENO: result: yes" >&5
+echo "${ECHO_T}yes" >&6; }
+
+cat >>confdefs.h <<\_ACEOF
+#define HAVE_EXPORT_MAKEFILES
+_ACEOF
+
+
+else
+
+  { echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6; }
+
+fi
+
+ if test "X${ac_cv_use_export_makefiles}" = "Xyes"; then # quoted: the value is the raw --enable-export-makefiles=ARG text
+  USING_EXPORT_MAKEFILES_TRUE=
+  USING_EXPORT_MAKEFILES_FALSE='#'
+else
+  USING_EXPORT_MAKEFILES_TRUE='#'
+  USING_EXPORT_MAKEFILES_FALSE=
+fi
+
+
+# Check for perl to run scripts (required only when export makefiles or --with-gnumake are enabled)
+
+
+
+# Check whether --with-perl was given.
+if test "${with_perl+set}" = set; then
+  withval=$with_perl;
+{ echo "$as_me:$LINENO: checking for user supplied perl executable" >&5
+echo $ECHO_N "checking for user supplied perl executable... $ECHO_C" >&6; }
+{ echo "$as_me:$LINENO: result: ${withval}" >&5
+echo "${ECHO_T}${withval}" >&6; }
+USER_SPECIFIED_PERL=yes
+PERL_EXE="${withval}"
+
+else
+
+USER_SPECIFIED_PERL=no
+
+fi
+
+
+if test "X${USER_SPECIFIED_PERL}" = "Xyes"; then
+  as_ac_File=`echo "ac_cv_file_${PERL_EXE}" | $as_tr_sh`
+{ echo "$as_me:$LINENO: checking for ${PERL_EXE}" >&5
+echo $ECHO_N "checking for ${PERL_EXE}... $ECHO_C" >&6; }
+if { as_var=$as_ac_File; eval "test \"\${$as_var+set}\" = set"; }; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  test "$cross_compiling" = yes &&
+  { { echo "$as_me:$LINENO: error: cannot check for file existence when cross compiling" >&5
+echo "$as_me: error: cannot check for file existence when cross compiling" >&2;}
+   { (exit 1); exit 1; }; }
+if test -r "${PERL_EXE}"; then
+  eval "$as_ac_File=yes"
+else
+  eval "$as_ac_File=no"
+fi
+fi
+ac_res=`eval echo '${'$as_ac_File'}'`
+	       { echo "$as_me:$LINENO: result: $ac_res" >&5
+echo "${ECHO_T}$ac_res" >&6; }
+if test `eval echo '${'$as_ac_File'}'` = yes; then
+  HAVE_PERL=yes
+else
+  HAVE_PERL=no
+fi
+
+  PERL_EXE=${PERL_EXE}
+
+else
+  # Extract the first word of "perl", so it can be a program name with args.
+set dummy perl; ac_word=$2
+{ echo "$as_me:$LINENO: checking for $ac_word" >&5
+echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; }
+if test "${ac_cv_prog_HAVE_PERL+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  if test -n "$HAVE_PERL"; then
+  ac_cv_prog_HAVE_PERL="$HAVE_PERL" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+  for ac_exec_ext in '' $ac_executable_extensions; do
+  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+    ac_cv_prog_HAVE_PERL="yes"
+    echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+done
+IFS=$as_save_IFS
+
+  test -z "$ac_cv_prog_HAVE_PERL" && ac_cv_prog_HAVE_PERL="no"
+fi
+fi
+HAVE_PERL=$ac_cv_prog_HAVE_PERL
+if test -n "$HAVE_PERL"; then
+  { echo "$as_me:$LINENO: result: $HAVE_PERL" >&5
+echo "${ECHO_T}$HAVE_PERL" >&6; }
+else
+  { echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6; }
+fi
+
+
+  PERL_EXE=perl
+
+fi
+ if test "X${HAVE_PERL}" = "Xyes"; then # quoted: a preset HAVE_PERL is honoured by the lookup and may hold any text
+  USING_PERL_TRUE=
+  USING_PERL_FALSE='#'
+else
+  USING_PERL_TRUE='#'
+  USING_PERL_FALSE=
+fi
+
+
+
+if test "X$HAVE_PERL" != "Xyes" &&
+   test "X$ac_cv_use_export_makefiles" != "Xno"; then
+  { echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6; }
+  { { echo "$as_me:$LINENO: error: Failed to find the perl executable.  The flag --enable-export-makefiles requires perl to be either in your path or explicitly defined by the flag --with-perl=<executable>.  If you do not require the export makefiles to be installed via 'make install', you can disable the export makefiles with --disable-export-makefiles." >&5
+echo "$as_me: error: Failed to find the perl executable.  The flag --enable-export-makefiles requires perl to be either in your path or explicitly defined by the flag --with-perl=<executable>.  If you do not require the export makefiles to be installed via 'make install', you can disable the export makefiles with --disable-export-makefiles." >&2;}
+   { (exit 1); exit 1; }; }
+fi
+
+# Check for using gnumake to clean up link lines via
+# gnumake's "shell" command. Optional dependency.
+
+
+
+
+# Check whether --with-gnumake was given.
+if test "${with_gnumake+set}" = set; then
+  withval=$with_gnumake; ac_cv_use_gnumake=$withval
+else
+  ac_cv_use_gnumake=no
+fi
+
+
+{ echo "$as_me:$LINENO: checking whether gnumake specific code should be enabled" >&5
+echo $ECHO_N "checking whether gnumake specific code should be enabled... $ECHO_C" >&6; }
+
+if test "X$ac_cv_use_gnumake" != "Xno"; then
+  { echo "$as_me:$LINENO: result: yes" >&5
+echo "${ECHO_T}yes" >&6; }
+
+cat >>confdefs.h <<\_ACEOF
+#define HAVE_GNUMAKE
+_ACEOF
+
+else
+  { echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6; }
+fi
+ if test "X$ac_cv_use_gnumake" = "Xyes"; then
+  USING_GNUMAKE_TRUE=
+  USING_GNUMAKE_FALSE='#'
+else
+  USING_GNUMAKE_TRUE='#'
+  USING_GNUMAKE_FALSE=
+fi
+
+
+
+if test "X$HAVE_PERL" != "Xyes" &&
+   test "X$ac_cv_use_gnumake" != "Xno"; then
+  { echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6; }
+  { { echo "$as_me:$LINENO: error: The flag --with-gnumake requires perl to be in your path.  The perl executable can alternatively be explicitly defined by the flag --with-perl=<executable>." >&5
+echo "$as_me: error: The flag --with-gnumake requires perl to be in your path.  The perl executable can alternatively be explicitly defined by the flag --with-perl=<executable>." >&2;}
+   { (exit 1); exit 1; }; }
+fi
+
+
+
+# ------------------------------------------------------------------------
+# Checks if tests and examples should be built
+# ------------------------------------------------------------------------
+
+#  #np# - These options can disable the tests and examples of a package.
+#  #np# - Packages that do not have tests or examples should #-out the
+#  #np# - option(s) that does (do) not apply.
+
+
+# Check whether --enable-tests was given.
+if test "${enable_tests+set}" = set; then
+  enableval=$enable_tests; ac_cv_use_tests=$enableval
+else
+  ac_cv_use_tests=yes
+fi
+
+
+{ echo "$as_me:$LINENO: checking whether to use tests" >&5
+echo $ECHO_N "checking whether to use tests... $ECHO_C" >&6; }
+
+if test "X$ac_cv_use_tests" != "Xno"; then
+  { echo "$as_me:$LINENO: result: yes" >&5
+echo "${ECHO_T}yes" >&6; }
+
+cat >>confdefs.h <<\_ACEOF
+#define HAVE_TESTS
+_ACEOF
+
+else
+  { echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6; }
+fi
+
+
+# Re-read --enable-tests: its value is the default for --enable-threadpool-tests below.
+if test "${enable_tests+set}" = set; then
+  enableval=$enable_tests; ac_cv_use_tests=$enableval
+else
+  ac_cv_use_tests=yes
+fi
+
+
+# Check whether --enable-threadpool-tests was given.
+if test "${enable_threadpool_tests+set}" = set; then
+  enableval=$enable_threadpool_tests; ac_cv_use_threadpool_tests=$enableval
+else
+  ac_cv_use_threadpool_tests=${ac_cv_use_tests}
+fi
+
+
+{ echo "$as_me:$LINENO: checking whether to use threadpool-tests" >&5
+echo $ECHO_N "checking whether to use threadpool-tests... $ECHO_C" >&6; }
+
+if test "X$ac_cv_use_threadpool_tests" != "Xno"; then
+  { echo "$as_me:$LINENO: result: yes" >&5
+echo "${ECHO_T}yes" >&6; }
+
+cat >>confdefs.h <<\_ACEOF
+#define HAVE_NEW_PACKAGE_TESTS
+_ACEOF
+
+else
+  { echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6; }
+fi
+
+ if test "X$ac_cv_use_threadpool_tests" != "Xno"; then
+  BUILD_TESTS_TRUE=
+  BUILD_TESTS_FALSE='#'
+else
+  BUILD_TESTS_TRUE='#'
+  BUILD_TESTS_FALSE=
+fi
+
+
+#TAC_ARG_ENABLE_FEATURE(examples, [Make examples for all Trilinos packages buildable with 'make examples'], EXAMPLES, yes)
+#TAC_ARG_ENABLE_FEATURE_SUB_CHECK( new_package, examples, [Make New_Package examples buildable with 'make examples'], NEW_PACKAGE_EXAMPLES)
+#AM_CONDITIONAL(BUILD_EXAMPLES, test "X$ac_cv_use_new_package_examples" != "Xno")
+
+#We now build tests and examples through separate make targets, rather than
+#during "make".  We still need to conditionally include the test and example
+#in SUBDIRS, even though SUB_TEST and SUB_EXAMPLE will never be
+#defined, so that the tests and examples are included in the distribution
+#tarball.
+ if test "X$ac_cv_use_sub_test" = "Xyes"; then
+  SUB_TEST_TRUE=
+  SUB_TEST_FALSE='#'
+else
+  SUB_TEST_TRUE='#'
+  SUB_TEST_FALSE=
+fi
+
+#AM_CONDITIONAL(SUB_EXAMPLE, test "X$ac_cv_use_sub_example" = "Xyes")
+
+
+# Check whether --enable-libcheck was given.
+if test "${enable_libcheck+set}" = set; then
+  enableval=$enable_libcheck; ac_cv_use_libcheck=$enableval
+else
+  ac_cv_use_libcheck=yes
+fi
+
+
+{ echo "$as_me:$LINENO: checking whether to use libcheck" >&5
+echo $ECHO_N "checking whether to use libcheck... $ECHO_C" >&6; }
+
+if test "X$ac_cv_use_libcheck" != "Xno"; then
+  { echo "$as_me:$LINENO: result: yes" >&5
+echo "${ECHO_T}yes" >&6; }
+
+cat >>confdefs.h <<\_ACEOF
+#define HAVE_LIBCHECK
+_ACEOF
+
+else
+  { echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6; }
+fi
+
+
+# ------------------------------------------------------------------------
+# Specify other directories
+# ------------------------------------------------------------------------
+
+# enable use of --with-libdirs="-Llibdir1 -Llibdir2 ..." to prepend to LDFLAGS
+
+{ echo "$as_me:$LINENO: checking whether additional library search paths defined" >&5
+echo $ECHO_N "checking whether additional library search paths defined... $ECHO_C" >&6; }
+
+# Check whether --with-libdirs was given.
+if test "${with_libdirs+set}" = set; then
+  withval=$with_libdirs;
+LDFLAGS="${withval} ${LDFLAGS}"
+{ echo "$as_me:$LINENO: result: ${withval}" >&5
+echo "${ECHO_T}${withval}" >&6; }
+
+else
+  { echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6; }
+
+fi
+
+
+# enable use of --with-incdirs="-Iincdir1 -Iincdir2 ..." to prepend to CPPFLAGS
+
+{ echo "$as_me:$LINENO: checking whether additional include search paths defined" >&5
+echo $ECHO_N "checking whether additional include search paths defined... $ECHO_C" >&6; }
+
+# Check whether --with-incdirs was given.
+if test "${with_incdirs+set}" = set; then
+  withval=$with_incdirs;
+CPPFLAGS="${withval} ${CPPFLAGS}"
+{ echo "$as_me:$LINENO: result: ${withval}" >&5
+echo "${ECHO_T}${withval}" >&6; }
+
+else
+  { echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6; }
+
+fi
+
+
+
+# #np# - Yet another opportunity to remove code if you aren't
+# using Fortran
+# Define F77_FUNC that will be used to link with Fortran subroutines. - trash WORKGXX
+#AC_F77_WRAPPERS
+
+# ------------------------------------------------------------------------
+# Checks for libraries
+# ------------------------------------------------------------------------
+
+# If tests, examples and libcheck are disabled, we don't have to check
+# for these libraries.
+
+# #np# -
+# If a package does not have tests or examples, the corresponding check(s)
+# should be pulled out of the "if" statement below.
+#if test "X$ac_cv_use_new_package_examples" != "Xno" || test "X$ac_cv_use_libcheck" != "Xno"; then
+if test "X$ac_cv_use_threadpool_tests" != "Xno" || test "X$ac_cv_use_libcheck" != "Xno"; then
+
+{ echo "$as_me:$LINENO: checking for grep that handles long lines and -e" >&5
+echo $ECHO_N "checking for grep that handles long lines and -e... $ECHO_C" >&6; }
+if test "${ac_cv_path_GREP+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  # Extract the first word of "grep ggrep" to use in msg output
+if test -z "$GREP"; then
+set dummy grep ggrep; ac_prog_name=$2
+if test "${ac_cv_path_GREP+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  ac_path_GREP_found=false
+# Loop through the user's path and test for each of PROGNAME-LIST
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+  for ac_prog in grep ggrep; do
+  for ac_exec_ext in '' $ac_executable_extensions; do
+    ac_path_GREP="$as_dir/$ac_prog$ac_exec_ext"
+    { test -f "$ac_path_GREP" && $as_test_x "$ac_path_GREP"; } || continue
+    # Check for GNU ac_path_GREP and select it if it is found.
+  # Check for GNU $ac_path_GREP
+case `"$ac_path_GREP" --version 2>&1` in
+*GNU*)
+  ac_cv_path_GREP="$ac_path_GREP" ac_path_GREP_found=:;;
+*)
+  ac_count=0
+  echo $ECHO_N "0123456789$ECHO_C" >"conftest.in"
+  while :
+  do
+    cat "conftest.in" "conftest.in" >"conftest.tmp"
+    mv "conftest.tmp" "conftest.in"
+    cp "conftest.in" "conftest.nl"
+    echo 'GREP' >> "conftest.nl"
+    "$ac_path_GREP" -e 'GREP$' -e '-(cannot match)-' < "conftest.nl" >"conftest.out" 2>/dev/null || break
+    diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break
+    ac_count=`expr $ac_count + 1`
+    if test $ac_count -gt ${ac_path_GREP_max-0}; then
+      # Best one so far, save it but keep looking for a better one
+      ac_cv_path_GREP="$ac_path_GREP"
+      ac_path_GREP_max=$ac_count
+    fi
+    # 10*(2^10) chars as input seems more than enough
+    test $ac_count -gt 10 && break
+  done
+  rm -f conftest.in conftest.tmp conftest.nl conftest.out;;
+esac
+
+
+    $ac_path_GREP_found && break 3
+  done
+done
+
+done
+IFS=$as_save_IFS
+
+
+fi
+
+GREP="$ac_cv_path_GREP"
+if test -z "$GREP"; then
+  { { echo "$as_me:$LINENO: error: no acceptable $ac_prog_name could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" >&5
+echo "$as_me: error: no acceptable $ac_prog_name could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" >&2;}
+   { (exit 1); exit 1; }; }
+fi
+
+else
+  ac_cv_path_GREP=$GREP
+fi
+
+
+fi
+{ echo "$as_me:$LINENO: result: $ac_cv_path_GREP" >&5
+echo "${ECHO_T}$ac_cv_path_GREP" >&6; }
+ GREP="$ac_cv_path_GREP"
+
+
+{ echo "$as_me:$LINENO: checking for egrep" >&5
+echo $ECHO_N "checking for egrep... $ECHO_C" >&6; }
+if test "${ac_cv_path_EGREP+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  if echo a | $GREP -E '(a|b)' >/dev/null 2>&1
+   then ac_cv_path_EGREP="$GREP -E"
+   else
+     # Extract the first word of "egrep" to use in msg output
+if test -z "$EGREP"; then
+set dummy egrep; ac_prog_name=$2
+if test "${ac_cv_path_EGREP+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  ac_path_EGREP_found=false
+# Loop through the user's path and test for each of PROGNAME-LIST
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+  for ac_prog in egrep; do
+  for ac_exec_ext in '' $ac_executable_extensions; do
+    ac_path_EGREP="$as_dir/$ac_prog$ac_exec_ext"
+    { test -f "$ac_path_EGREP" && $as_test_x "$ac_path_EGREP"; } || continue
+    # Check for GNU ac_path_EGREP and select it if it is found.
+  # Check for GNU $ac_path_EGREP
+case `"$ac_path_EGREP" --version 2>&1` in
+*GNU*)
+  ac_cv_path_EGREP="$ac_path_EGREP" ac_path_EGREP_found=:;;
+*)
+  ac_count=0
+  echo $ECHO_N "0123456789$ECHO_C" >"conftest.in"
+  while :
+  do
+    cat "conftest.in" "conftest.in" >"conftest.tmp"
+    mv "conftest.tmp" "conftest.in"
+    cp "conftest.in" "conftest.nl"
+    echo 'EGREP' >> "conftest.nl"
+    "$ac_path_EGREP" 'EGREP$' < "conftest.nl" >"conftest.out" 2>/dev/null || break
+    diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break
+    ac_count=`expr $ac_count + 1`
+    if test $ac_count -gt ${ac_path_EGREP_max-0}; then
+      # Best one so far, save it but keep looking for a better one
+      ac_cv_path_EGREP="$ac_path_EGREP"
+      ac_path_EGREP_max=$ac_count
+    fi
+    # 10*(2^10) chars as input seems more than enough
+    test $ac_count -gt 10 && break
+  done
+  rm -f conftest.in conftest.tmp conftest.nl conftest.out;;
+esac
+
+
+    $ac_path_EGREP_found && break 3
+  done
+done
+
+done
+IFS=$as_save_IFS
+
+
+fi
+
+EGREP="$ac_cv_path_EGREP"
+if test -z "$EGREP"; then
+  { { echo "$as_me:$LINENO: error: no acceptable $ac_prog_name could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" >&5
+echo "$as_me: error: no acceptable $ac_prog_name could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" >&2;}
+   { (exit 1); exit 1; }; }
+fi
+
+else
+  ac_cv_path_EGREP=$EGREP
+fi
+
+
+   fi
+fi
+{ echo "$as_me:$LINENO: result: $ac_cv_path_EGREP" >&5
+echo "${ECHO_T}$ac_cv_path_EGREP" >&6; }
+ EGREP="$ac_cv_path_EGREP"
+
+
+{ echo "$as_me:$LINENO: checking for ANSI C header files" >&5
+echo $ECHO_N "checking for ANSI C header files... $ECHO_C" >&6; }
+if test "${ac_cv_header_stdc+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <float.h>
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_cxx_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
+  ac_cv_header_stdc=yes
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ac_cv_header_stdc=no
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+
+if test $ac_cv_header_stdc = yes; then
+  # SunOS 4.x string.h does not declare mem*, contrary to ANSI.
+  cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+#include <string.h>
+
+_ACEOF
+if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
+  $EGREP "memchr" >/dev/null 2>&1; then
+  :
+else
+  ac_cv_header_stdc=no
+fi
+rm -f conftest*
+
+fi
+
+if test $ac_cv_header_stdc = yes; then
+  # ISC 2.0.2 stdlib.h does not declare free, contrary to ANSI.
+  cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+#include <stdlib.h>
+
+_ACEOF
+if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
+  $EGREP "free" >/dev/null 2>&1; then
+  :
+else
+  ac_cv_header_stdc=no
+fi
+rm -f conftest*
+
+fi
+
+if test $ac_cv_header_stdc = yes; then
+  # /bin/cc in Irix-4.0.5 gets non-ANSI ctype macros unless using -ansi.
+  if test "$cross_compiling" = yes; then
+  :
+else
+  cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+#include <ctype.h>
+#include <stdlib.h>
+#if ((' ' & 0x0FF) == 0x020)
+# define ISLOWER(c) ('a' <= (c) && (c) <= 'z')
+# define TOUPPER(c) (ISLOWER(c) ? 'A' + ((c) - 'a') : (c))
+#else
+# define ISLOWER(c) \
+		   (('a' <= (c) && (c) <= 'i') \
+		     || ('j' <= (c) && (c) <= 'r') \
+		     || ('s' <= (c) && (c) <= 'z'))
+# define TOUPPER(c) (ISLOWER(c) ? ((c) | 0x40) : (c))
+#endif
+
+#define XOR(e, f) (((e) && !(f)) || (!(e) && (f)))
+int
+main ()
+{
+  int i;
+  for (i = 0; i < 256; i++)
+    if (XOR (islower (i), ISLOWER (i))
+	|| toupper (i) != TOUPPER (i))
+      return 2;
+  return 0;
+}
+_ACEOF
+rm -f conftest$ac_exeext
+if { (ac_try="$ac_link"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_link") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && { ac_try='./conftest$ac_exeext'
+  { (case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; }; then
+  :
+else
+  echo "$as_me: program exited with status $ac_status" >&5
+echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+( exit $ac_status )
+ac_cv_header_stdc=no
+fi
+rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext conftest.$ac_objext conftest.$ac_ext
+fi
+
+
+fi
+fi
+{ echo "$as_me:$LINENO: result: $ac_cv_header_stdc" >&5
+echo "${ECHO_T}$ac_cv_header_stdc" >&6; }
+if test $ac_cv_header_stdc = yes; then
+
+cat >>confdefs.h <<\_ACEOF
+#define STDC_HEADERS 1
+_ACEOF
+
+fi
+
+# On IRIX 5.3, sys/types and inttypes.h are conflicting.
+
+
+
+
+
+
+
+
+
+for ac_header in sys/types.h sys/stat.h stdlib.h string.h memory.h strings.h \
+		  inttypes.h stdint.h unistd.h
+do
+as_ac_Header=`echo "ac_cv_header_$ac_header" | $as_tr_sh`
+{ echo "$as_me:$LINENO: checking for $ac_header" >&5
+echo $ECHO_N "checking for $ac_header... $ECHO_C" >&6; }
+if { as_var=$as_ac_Header; eval "test \"\${$as_var+set}\" = set"; }; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+$ac_includes_default
+
+#include <$ac_header>
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_cxx_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
+  eval "$as_ac_Header=yes"
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	eval "$as_ac_Header=no"
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+ac_res=`eval echo '${'$as_ac_Header'}'`
+	       { echo "$as_me:$LINENO: result: $ac_res" >&5
+echo "${ECHO_T}$ac_res" >&6; }
+if test `eval echo '${'$as_ac_Header'}'` = yes; then
+  cat >>confdefs.h <<_ACEOF
+#define `echo "HAVE_$ac_header" | $as_tr_cpp` 1
+_ACEOF
+
+fi
+
+done
+
+
+
+
+acx_pthread_ok=no
+
+# First, check if the POSIX threads header, pthread.h, is available.
+# If it isn't, don't bother looking for the threads libraries.
+if test "${ac_cv_header_pthread_h+set}" = set; then
+  { echo "$as_me:$LINENO: checking for pthread.h" >&5
+echo $ECHO_N "checking for pthread.h... $ECHO_C" >&6; }
+if test "${ac_cv_header_pthread_h+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+fi
+{ echo "$as_me:$LINENO: result: $ac_cv_header_pthread_h" >&5
+echo "${ECHO_T}$ac_cv_header_pthread_h" >&6; }
+else
+  # Is the header compilable?
+{ echo "$as_me:$LINENO: checking pthread.h usability" >&5
+echo $ECHO_N "checking pthread.h usability... $ECHO_C" >&6; }
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+$ac_includes_default
+#include <pthread.h>
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_cxx_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
+  ac_header_compiler=yes
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ac_header_compiler=no
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+{ echo "$as_me:$LINENO: result: $ac_header_compiler" >&5
+echo "${ECHO_T}$ac_header_compiler" >&6; }
+
+# Is the header present?
+{ echo "$as_me:$LINENO: checking pthread.h presence" >&5
+echo $ECHO_N "checking pthread.h presence... $ECHO_C" >&6; }
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+#include <pthread.h>
+_ACEOF
+if { (ac_try="$ac_cpp conftest.$ac_ext"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_cpp conftest.$ac_ext") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } >/dev/null && {
+	 test -z "$ac_cxx_preproc_warn_flag$ac_cxx_werror_flag" ||
+	 test ! -s conftest.err
+       }; then
+  ac_header_preproc=yes
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+  ac_header_preproc=no
+fi
+
+rm -f conftest.err conftest.$ac_ext
+{ echo "$as_me:$LINENO: result: $ac_header_preproc" >&5
+echo "${ECHO_T}$ac_header_preproc" >&6; }
+
+# So?  What about this header?
+case $ac_header_compiler:$ac_header_preproc:$ac_cxx_preproc_warn_flag in
+  yes:no: )
+    { echo "$as_me:$LINENO: WARNING: pthread.h: accepted by the compiler, rejected by the preprocessor!" >&5
+echo "$as_me: WARNING: pthread.h: accepted by the compiler, rejected by the preprocessor!" >&2;}
+    { echo "$as_me:$LINENO: WARNING: pthread.h: proceeding with the compiler's result" >&5
+echo "$as_me: WARNING: pthread.h: proceeding with the compiler's result" >&2;}
+    ac_header_preproc=yes
+    ;;
+  no:yes:* )
+    { echo "$as_me:$LINENO: WARNING: pthread.h: present but cannot be compiled" >&5
+echo "$as_me: WARNING: pthread.h: present but cannot be compiled" >&2;}
+    { echo "$as_me:$LINENO: WARNING: pthread.h:     check for missing prerequisite headers?" >&5
+echo "$as_me: WARNING: pthread.h:     check for missing prerequisite headers?" >&2;}
+    { echo "$as_me:$LINENO: WARNING: pthread.h: see the Autoconf documentation" >&5
+echo "$as_me: WARNING: pthread.h: see the Autoconf documentation" >&2;}
+    { echo "$as_me:$LINENO: WARNING: pthread.h:     section \"Present But Cannot Be Compiled\"" >&5
+echo "$as_me: WARNING: pthread.h:     section \"Present But Cannot Be Compiled\"" >&2;}
+    { echo "$as_me:$LINENO: WARNING: pthread.h: proceeding with the preprocessor's result" >&5
+echo "$as_me: WARNING: pthread.h: proceeding with the preprocessor's result" >&2;}
+    { echo "$as_me:$LINENO: WARNING: pthread.h: in the future, the compiler will take precedence" >&5
+echo "$as_me: WARNING: pthread.h: in the future, the compiler will take precedence" >&2;}
+    ( cat <<\_ASBOX
+## --------------------------------- ##
+## Report this to hcedwar@sandia.gov ##
+## --------------------------------- ##
+_ASBOX
+     ) | sed "s/^/$as_me: WARNING:     /" >&2
+    ;;
+esac
+{ echo "$as_me:$LINENO: checking for pthread.h" >&5
+echo $ECHO_N "checking for pthread.h... $ECHO_C" >&6; }
+if test "${ac_cv_header_pthread_h+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  ac_cv_header_pthread_h=$ac_header_preproc
+fi
+{ echo "$as_me:$LINENO: result: $ac_cv_header_pthread_h" >&5
+echo "${ECHO_T}$ac_cv_header_pthread_h" >&6; }
+
+fi
+if test $ac_cv_header_pthread_h = yes; then
+  :
+else
+  acx_pthread_ok=noheader
+fi
+
+
+
+# We must check for the threads library under a number of different
+# names; the ordering is very important because some systems
+# (e.g. DEC) have both -lpthread and -lpthreads, where one of the
+# libraries is broken (non-POSIX).
+
+# First of all, check if the user has set any of the PTHREAD_LIBS,
+# etcetera environment variables, and if threads linking works using
+# them:
+if test x"$PTHREAD_LIBS$PTHREAD_CFLAGS" != x; then
+        save_CFLAGS="$CFLAGS"
+        CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+        save_LIBS="$LIBS"
+        LIBS="$PTHREAD_LIBS $LIBS"
+        { echo "$as_me:$LINENO: checking for pthread_join in LIBS=$PTHREAD_LIBS with CFLAGS=$PTHREAD_CFLAGS" >&5
+echo $ECHO_N "checking for pthread_join in LIBS=$PTHREAD_LIBS with CFLAGS=$PTHREAD_CFLAGS... $ECHO_C" >&6; }
+        cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char pthread_join ();
+int
+main ()
+{
+return pthread_join ();
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext conftest$ac_exeext
+if { (ac_try="$ac_link"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_link") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_cxx_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest$ac_exeext &&
+       $as_test_x conftest$ac_exeext; then
+  acx_pthread_ok=yes
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
+      conftest$ac_exeext conftest.$ac_ext
+        { echo "$as_me:$LINENO: result: $acx_pthread_ok" >&5
+echo "${ECHO_T}$acx_pthread_ok" >&6; }
+        if test x"$acx_pthread_ok" = xno; then
+                PTHREAD_LIBS=""
+                PTHREAD_CFLAGS=""
+        fi
+        LIBS="$save_LIBS"
+        CFLAGS="$save_CFLAGS"
+fi
+
+# Create a list of thread flags to try.  Items starting with a "-" are
+# C compiler flags, and other items are library names, except for "none"
+# which indicates that we try without any flags at all.
+
+acx_pthread_flags="pthreads none -Kthread -kthread lthread -pthread -pthreads -mthreads pthread --thread-safe -mt"
+
+# The ordering *is* (sometimes) important.  Some notes on the
+# individual items follow:
+
+# pthreads: AIX (must check this before -lpthread)
+# none: in case threads are in libc; should be tried before -Kthread and
+#       other compiler flags to prevent continual compiler warnings
+# -Kthread: Sequent (threads in libc, but -Kthread needed for pthread.h)
+# -kthread: FreeBSD kernel threads (preferred to -pthread since SMP-able)
+# lthread: LinuxThreads port on FreeBSD (also preferred to -pthread)
+# -pthread: Linux/gcc (kernel threads), BSD/gcc (userland threads)
+# -pthreads: Solaris/gcc
+# -mthreads: Mingw32/gcc, Lynx/gcc
+# -mt: Sun Workshop C (may only link SunOS threads [-lthread], but it
+#      doesn't hurt to check since this sometimes defines pthreads too;
+#      also defines -D_REENTRANT)
+# pthread: Linux, etcetera
+# --thread-safe: KAI C++
+
+case "${host_cpu}-${host_os}" in
+        *solaris*)
+
+        # On Solaris (at least, for some versions), libc contains stubbed
+        # (non-functional) versions of the pthreads routines, so link-based
+        # tests will erroneously succeed.  (We need to link with -pthread or
+        # -lpthread.)  (The stubs are missing pthread_cleanup_push, or rather
+        # a function called by this macro, so we could check for that, but
+        # who knows whether they'll stub that too in a future libc.)  So,
+        # we'll just look for -pthreads and -lpthread first:
+
+        acx_pthread_flags="-pthread -pthreads pthread -mt $acx_pthread_flags"
+        ;;
+esac
+
+if test x"$acx_pthread_ok" = xno; then
+for flag in $acx_pthread_flags; do
+
+        case $flag in
+                none)
+                { echo "$as_me:$LINENO: checking whether pthreads work without any flags" >&5
+echo $ECHO_N "checking whether pthreads work without any flags... $ECHO_C" >&6; }
+                ;;
+
+                -*)
+                { echo "$as_me:$LINENO: checking whether pthreads work with $flag" >&5
+echo $ECHO_N "checking whether pthreads work with $flag... $ECHO_C" >&6; }
+                PTHREAD_CFLAGS="$flag"
+                ;;
+
+                *)
+                { echo "$as_me:$LINENO: checking for the pthreads library -l$flag" >&5
+echo $ECHO_N "checking for the pthreads library -l$flag... $ECHO_C" >&6; }
+                PTHREAD_LIBS="-l$flag"
+                ;;
+        esac
+
+        save_LIBS="$LIBS"
+        save_CFLAGS="$CFLAGS"
+        LIBS="$PTHREAD_LIBS $LIBS"
+        CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+
+        # Check for various functions.  We must include pthread.h,
+        # since some functions may be macros.  (On the Sequent, we
+        # need a special flag -Kthread to make this header compile.)
+        # We check for pthread_join because it is in -lpthread on IRIX
+        # while pthread_create is in libc.  We check for pthread_attr_init
+        # due to DEC craziness with -lpthreads.  We check for
+        # pthread_cleanup_push because it is one of the few pthread
+        # functions on Solaris that doesn't have a non-functional libc stub.
+        # We try pthread_create on general principles.
+        cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+#include <pthread.h>
+int
+main ()
+{
+pthread_t th; pthread_join(th, 0);
+                     pthread_attr_init(0); pthread_cleanup_push(0, 0);
+                     pthread_create(0,0,0,0); pthread_cleanup_pop(0);
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext conftest$ac_exeext
+if { (ac_try="$ac_link"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_link") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_cxx_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest$ac_exeext &&
+       $as_test_x conftest$ac_exeext; then
+  acx_pthread_ok=yes
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
+      conftest$ac_exeext conftest.$ac_ext
+
+        LIBS="$save_LIBS"
+        CFLAGS="$save_CFLAGS"
+
+        { echo "$as_me:$LINENO: result: $acx_pthread_ok" >&5
+echo "${ECHO_T}$acx_pthread_ok" >&6; }
+        if test "x$acx_pthread_ok" = xyes; then
+                break;
+        fi
+
+        PTHREAD_LIBS=""
+        PTHREAD_CFLAGS=""
+done
+fi
+
+# Various other checks:
+if test "x$acx_pthread_ok" = xyes; then
+        save_LIBS="$LIBS"
+        LIBS="$PTHREAD_LIBS $LIBS"
+        save_CFLAGS="$CFLAGS"
+        CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+
+        # Detect AIX lossage: threads are created detached by default
+        # and the JOINABLE attribute has a nonstandard name (UNDETACHED).
+        { echo "$as_me:$LINENO: checking for joinable pthread attribute" >&5
+echo $ECHO_N "checking for joinable pthread attribute... $ECHO_C" >&6; }
+        cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+#include <pthread.h>
+int
+main ()
+{
+int attr=PTHREAD_CREATE_JOINABLE;
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext conftest$ac_exeext
+if { (ac_try="$ac_link"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_link") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_cxx_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest$ac_exeext &&
+       $as_test_x conftest$ac_exeext; then
+  ok=PTHREAD_CREATE_JOINABLE
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ok=unknown
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
+      conftest$ac_exeext conftest.$ac_ext
+        if test x"$ok" = xunknown; then
+                cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+#include <pthread.h>
+int
+main ()
+{
+int attr=PTHREAD_CREATE_UNDETACHED;
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext conftest$ac_exeext
+if { (ac_try="$ac_link"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_link") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_cxx_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest$ac_exeext &&
+       $as_test_x conftest$ac_exeext; then
+  ok=PTHREAD_CREATE_UNDETACHED
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ok=unknown
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
+      conftest$ac_exeext conftest.$ac_ext
+        fi
+        if test x"$ok" != xPTHREAD_CREATE_JOINABLE; then
+# Unquoted here-doc delimiter: $ok must be expanded to the detected attribute name.
+cat >>confdefs.h <<_ACEOF
+#define PTHREAD_CREATE_JOINABLE $ok
+_ACEOF
+
+        fi
+        { echo "$as_me:$LINENO: result: ${ok}" >&5
+echo "${ECHO_T}${ok}" >&6; }
+        if test x"$ok" = xunknown; then
+                { echo "$as_me:$LINENO: WARNING: we do not know how to create joinable pthreads" >&5
+echo "$as_me: WARNING: we do not know how to create joinable pthreads" >&2;}
+        fi
+
+        { echo "$as_me:$LINENO: checking if more special flags are required for pthreads" >&5
+echo $ECHO_N "checking if more special flags are required for pthreads... $ECHO_C" >&6; }
+        flag=no
+        case "${host_cpu}-${host_os}" in
+                *-aix* | *-freebsd*)     flag="-D_THREAD_SAFE";;
+                *solaris* | alpha*-osf*) flag="-D_REENTRANT";;
+        esac
+        { echo "$as_me:$LINENO: result: ${flag}" >&5
+echo "${ECHO_T}${flag}" >&6; }
+        if test "x$flag" != xno; then
+                PTHREAD_CFLAGS="$flag $PTHREAD_CFLAGS"
+        fi
+
+        LIBS="$save_LIBS"
+        CFLAGS="$save_CFLAGS"
+
+        # More AIX lossage: must compile with cc_r
+        # Extract the first word of "cc_r", so it can be a program name with args.
+set dummy cc_r; ac_word=$2
+{ echo "$as_me:$LINENO: checking for $ac_word" >&5
+echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; }
+if test "${ac_cv_prog_PTHREAD_CC+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  if test -n "$PTHREAD_CC"; then
+  ac_cv_prog_PTHREAD_CC="$PTHREAD_CC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+  for ac_exec_ext in '' $ac_executable_extensions; do
+  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+    ac_cv_prog_PTHREAD_CC="cc_r"
+    echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+done
+IFS=$as_save_IFS
+
+  test -z "$ac_cv_prog_PTHREAD_CC" && ac_cv_prog_PTHREAD_CC="${CC}"
+fi
+fi
+PTHREAD_CC=$ac_cv_prog_PTHREAD_CC
+if test -n "$PTHREAD_CC"; then
+  { echo "$as_me:$LINENO: result: $PTHREAD_CC" >&5
+echo "${ECHO_T}$PTHREAD_CC" >&6; }
+else
+  { echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6; }
+fi
+
+
+else
+        PTHREAD_CC="$CC"
+fi
+
+
+
+
+
+# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND:
+if test x"$acx_pthread_ok" = xyes; then
+
+cat >>confdefs.h <<\_ACEOF
+#define HAVE_PTHREAD 1
+_ACEOF
+
+        :
+else
+        acx_pthread_ok=no
+
+fi
+
+
+LIBS="$PTHREAD_LIBS $LIBS"
+CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+CC="$PTHREAD_CC"
+
+fi
+# end of the list of libraries that don't need to be checked for if
+# tests and examples are disabled.
+
+# ------------------------------------------------------------------------
+# Checks for linker characteristics
+# ------------------------------------------------------------------------
+
+# Determine libraries needed for linking with Fortran
+#AC_F77_LIBRARY_LDFLAGS
+
+
+# ------------------------------------------------------------------------
+# Perform substitutions in output files
+# ------------------------------------------------------------------------
+
+
+
+# ------------------------------------------------------------------------
+# Output files
+# ------------------------------------------------------------------------
+##
+#  You will need to change AC_CONFIG_FILES below and Makefile.am
+#  to add a new directory.
+ac_config_files="$ac_config_files Makefile Makefile.export.threadpool src/Makefile test/Makefile"
+
+
+cat >confcache <<\_ACEOF
+# This file is a shell script that caches the results of configure
+# tests run on this system so they can be shared between configure
+# scripts and configure runs, see configure's option --config-cache.
+# It is not useful on other systems.  If it contains results you don't
+# want to keep, you may remove or edit it.
+#
+# config.status only pays attention to the cache file if you give it
+# the --recheck option to rerun configure.
+#
+# `ac_cv_env_foo' variables (set or unset) will be overridden when
+# loading this file, other *unset* `ac_cv_foo' will be assigned the
+# following values.
+
+_ACEOF
+
+# The following way of writing the cache mishandles newlines in values,
+# but we know of no workaround that is simple, portable, and efficient.
+# So, we kill variables containing newlines.
+# Ultrix sh set writes to stderr and can't be redirected directly,
+# and sets the high bit in the cache file unless we assign to the vars.
+(
+  for ac_var in `(set) 2>&1 | sed -n 's/^\([a-zA-Z_][a-zA-Z0-9_]*\)=.*/\1/p'`; do
+    eval ac_val=\$$ac_var
+    case $ac_val in #(
+    *${as_nl}*)
+      case $ac_var in #(
+      *_cv_*) { echo "$as_me:$LINENO: WARNING: Cache variable $ac_var contains a newline." >&5
+echo "$as_me: WARNING: Cache variable $ac_var contains a newline." >&2;} ;;
+      esac
+      case $ac_var in #(
+      _ | IFS | as_nl) ;; #(
+      *) $as_unset $ac_var ;;
+      esac ;;
+    esac
+  done
+
+  (set) 2>&1 |
+    case $as_nl`(ac_space=' '; set) 2>&1` in #(
+    *${as_nl}ac_space=\ *)
+      # `set' does not quote correctly, so add quotes (double-quote
+      # substitution turns \\\\ into \\, and sed turns \\ into \).
+      sed -n \
+	"s/'/'\\\\''/g;
+	  s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\\2'/p"
+      ;; #(
+    *)
+      # `set' quotes correctly as required by POSIX, so do not add quotes.
+      sed -n "/^[_$as_cr_alnum]*_cv_[_$as_cr_alnum]*=/p"
+      ;;
+    esac |
+    sort
+) |
+  sed '
+     /^ac_cv_env_/b end
+     t clear
+     :clear
+     s/^\([^=]*\)=\(.*[{}].*\)$/test "${\1+set}" = set || &/
+     t end
+     s/^\([^=]*\)=\(.*\)$/\1=${\1=\2}/
+     :end' >>confcache
+if diff "$cache_file" confcache >/dev/null 2>&1; then :; else
+  if test -w "$cache_file"; then
+    test "x$cache_file" != "x/dev/null" &&
+      { echo "$as_me:$LINENO: updating cache $cache_file" >&5
+echo "$as_me: updating cache $cache_file" >&6;}
+    cat confcache >$cache_file
+  else
+    { echo "$as_me:$LINENO: not updating unwritable cache $cache_file" >&5
+echo "$as_me: not updating unwritable cache $cache_file" >&6;}
+  fi
+fi
+rm -f confcache
+
+test "x$prefix" = xNONE && prefix=$ac_default_prefix
+# Let make expand exec_prefix.
+test "x$exec_prefix" = xNONE && exec_prefix='${prefix}'
+
+DEFS=-DHAVE_CONFIG_H
+
+ac_libobjs=
+ac_ltlibobjs=
+for ac_i in : $LIBOBJS; do test "x$ac_i" = x: && continue
+  # 1. Remove the extension, and $U if already installed.
+  ac_script='s/\$U\././;s/\.o$//;s/\.obj$//'
+  ac_i=`echo "$ac_i" | sed "$ac_script"`
+  # 2. Prepend LIBOBJDIR.  When used with automake>=1.10 LIBOBJDIR
+  #    will be set to the directory where LIBOBJS objects are built.
+  ac_libobjs="$ac_libobjs \${LIBOBJDIR}$ac_i\$U.$ac_objext"
+  ac_ltlibobjs="$ac_ltlibobjs \${LIBOBJDIR}$ac_i"'$U.lo'
+done
+LIBOBJS=$ac_libobjs
+
+LTLIBOBJS=$ac_ltlibobjs
+
+
+if test -z "${MAINTAINER_MODE_TRUE}" && test -z "${MAINTAINER_MODE_FALSE}"; then
+  { { echo "$as_me:$LINENO: error: conditional \"MAINTAINER_MODE\" was never defined.
+Usually this means the macro was only invoked conditionally." >&5
+echo "$as_me: error: conditional \"MAINTAINER_MODE\" was never defined.
+Usually this means the macro was only invoked conditionally." >&2;}
+   { (exit 1); exit 1; }; }
+fi
+if test -z "${HAVE_MPI_TRUE}" && test -z "${HAVE_MPI_FALSE}"; then
+  { { echo "$as_me:$LINENO: error: conditional \"HAVE_MPI\" was never defined.
+Usually this means the macro was only invoked conditionally." >&5
+echo "$as_me: error: conditional \"HAVE_MPI\" was never defined.
+Usually this means the macro was only invoked conditionally." >&2;}
+   { (exit 1); exit 1; }; }
+fi
+if test -z "${AMDEP_TRUE}" && test -z "${AMDEP_FALSE}"; then
+  { { echo "$as_me:$LINENO: error: conditional \"AMDEP\" was never defined.
+Usually this means the macro was only invoked conditionally." >&5
+echo "$as_me: error: conditional \"AMDEP\" was never defined.
+Usually this means the macro was only invoked conditionally." >&2;}
+   { (exit 1); exit 1; }; }
+fi
+if test -z "${am__fastdepCC_TRUE}" && test -z "${am__fastdepCC_FALSE}"; then
+  { { echo "$as_me:$LINENO: error: conditional \"am__fastdepCC\" was never defined.
+Usually this means the macro was only invoked conditionally." >&5
+echo "$as_me: error: conditional \"am__fastdepCC\" was never defined.
+Usually this means the macro was only invoked conditionally." >&2;}
+   { (exit 1); exit 1; }; }
+fi
+if test -z "${am__fastdepCXX_TRUE}" && test -z "${am__fastdepCXX_FALSE}"; then
+  { { echo "$as_me:$LINENO: error: conditional \"am__fastdepCXX\" was never defined.
+Usually this means the macro was only invoked conditionally." >&5
+echo "$as_me: error: conditional \"am__fastdepCXX\" was never defined.
+Usually this means the macro was only invoked conditionally." >&2;}
+   { (exit 1); exit 1; }; }
+fi
+if test -z "${USE_ALTERNATE_AR_TRUE}" && test -z "${USE_ALTERNATE_AR_FALSE}"; then
+  { { echo "$as_me:$LINENO: error: conditional \"USE_ALTERNATE_AR\" was never defined.
+Usually this means the macro was only invoked conditionally." >&5
+echo "$as_me: error: conditional \"USE_ALTERNATE_AR\" was never defined.
+Usually this means the macro was only invoked conditionally." >&2;}
+   { (exit 1); exit 1; }; }
+fi
+if test -z "${USE_ALTERNATE_AR_TRUE}" && test -z "${USE_ALTERNATE_AR_FALSE}"; then
+  { { echo "$as_me:$LINENO: error: conditional \"USE_ALTERNATE_AR\" was never defined.
+Usually this means the macro was only invoked conditionally." >&5
+echo "$as_me: error: conditional \"USE_ALTERNATE_AR\" was never defined.
+Usually this means the macro was only invoked conditionally." >&2;}
+   { (exit 1); exit 1; }; }
+fi
+if test -z "${USING_EXPORT_MAKEFILES_TRUE}" && test -z "${USING_EXPORT_MAKEFILES_FALSE}"; then
+  { { echo "$as_me:$LINENO: error: conditional \"USING_EXPORT_MAKEFILES\" was never defined.
+Usually this means the macro was only invoked conditionally." >&5
+echo "$as_me: error: conditional \"USING_EXPORT_MAKEFILES\" was never defined.
+Usually this means the macro was only invoked conditionally." >&2;}
+   { (exit 1); exit 1; }; }
+fi
+if test -z "${USING_PERL_TRUE}" && test -z "${USING_PERL_FALSE}"; then
+  { { echo "$as_me:$LINENO: error: conditional \"USING_PERL\" was never defined.
+Usually this means the macro was only invoked conditionally." >&5
+echo "$as_me: error: conditional \"USING_PERL\" was never defined.
+Usually this means the macro was only invoked conditionally." >&2;}
+   { (exit 1); exit 1; }; }
+fi
+if test -z "${USING_GNUMAKE_TRUE}" && test -z "${USING_GNUMAKE_FALSE}"; then
+  { { echo "$as_me:$LINENO: error: conditional \"USING_GNUMAKE\" was never defined.
+Usually this means the macro was only invoked conditionally." >&5
+echo "$as_me: error: conditional \"USING_GNUMAKE\" was never defined.
+Usually this means the macro was only invoked conditionally." >&2;}
+   { (exit 1); exit 1; }; }
+fi
+if test -z "${BUILD_TESTS_TRUE}" && test -z "${BUILD_TESTS_FALSE}"; then
+  { { echo "$as_me:$LINENO: error: conditional \"BUILD_TESTS\" was never defined.
+Usually this means the macro was only invoked conditionally." >&5
+echo "$as_me: error: conditional \"BUILD_TESTS\" was never defined.
+Usually this means the macro was only invoked conditionally." >&2;}
+   { (exit 1); exit 1; }; }
+fi
+if test -z "${SUB_TEST_TRUE}" && test -z "${SUB_TEST_FALSE}"; then
+  { { echo "$as_me:$LINENO: error: conditional \"SUB_TEST\" was never defined.
+Usually this means the macro was only invoked conditionally." >&5
+echo "$as_me: error: conditional \"SUB_TEST\" was never defined.
+Usually this means the macro was only invoked conditionally." >&2;}
+   { (exit 1); exit 1; }; }
+fi
+
+: ${CONFIG_STATUS=./config.status}
+ac_clean_files_save=$ac_clean_files
+ac_clean_files="$ac_clean_files $CONFIG_STATUS"
+{ echo "$as_me:$LINENO: creating $CONFIG_STATUS" >&5
+echo "$as_me: creating $CONFIG_STATUS" >&6;}
+cat >$CONFIG_STATUS <<_ACEOF
+#! $SHELL
+# Generated by $as_me.
+# Run this file to recreate the current configuration.
+# Compiler output produced by configure, useful for debugging
+# configure, is in config.log if it exists.
+
+debug=false
+ac_cs_recheck=false
+ac_cs_silent=false
+SHELL=\${CONFIG_SHELL-$SHELL}
+_ACEOF
+
+cat >>$CONFIG_STATUS <<\_ACEOF
+## --------------------- ##
+## M4sh Initialization.  ##
+## --------------------- ##
+
+# Be more Bourne compatible
+DUALCASE=1; export DUALCASE # for MKS sh
+if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then
+  emulate sh
+  NULLCMD=:
+  # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which
+  # is contrary to our usage.  Disable this feature.
+  alias -g '${1+"$@"}'='"$@"'
+  setopt NO_GLOB_SUBST
+else
+  case `(set -o) 2>/dev/null` in
+  *posix*) set -o posix ;;
+esac
+
+fi
+
+
+
+
+# PATH needs CR
+# Avoid depending upon Character Ranges.
+as_cr_letters='abcdefghijklmnopqrstuvwxyz'
+as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+as_cr_Letters=$as_cr_letters$as_cr_LETTERS
+as_cr_digits='0123456789'
+as_cr_alnum=$as_cr_Letters$as_cr_digits
+
+# The user is always right.
+if test "${PATH_SEPARATOR+set}" != set; then
+  echo "#! /bin/sh" >conf$$.sh
+  echo  "exit 0"   >>conf$$.sh
+  chmod +x conf$$.sh
+  if (PATH="/nonexistent;."; conf$$.sh) >/dev/null 2>&1; then
+    PATH_SEPARATOR=';'
+  else
+    PATH_SEPARATOR=:
+  fi
+  rm -f conf$$.sh
+fi
+
+# Support unset when possible.
+if ( (MAIL=60; unset MAIL) || exit) >/dev/null 2>&1; then
+  as_unset=unset
+else
+  as_unset=false
+fi
+
+
+# IFS
+# We need space, tab and new line, in precisely that order.  Quoting is
+# there to prevent editors from complaining about space-tab.
+# (If _AS_PATH_WALK were called with IFS unset, it would disable word
+# splitting by setting IFS to empty value.)
+as_nl='
+'
+IFS=" ""	$as_nl"
+
+# Find who we are.  Look in the path if we contain no directory separator.
+case $0 in
+  *[\\/]* ) as_myself=$0 ;;
+  *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+  test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break
+done
+IFS=$as_save_IFS
+
+     ;;
+esac
+# We did not find ourselves, most probably we were run as `sh COMMAND'
+# in which case we are not to be found in the path.
+if test "x$as_myself" = x; then
+  as_myself=$0
+fi
+if test ! -f "$as_myself"; then
+  echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2
+  { (exit 1); exit 1; }
+fi
+
+# Work around bugs in pre-3.0 UWIN ksh.
+for as_var in ENV MAIL MAILPATH
+do ($as_unset $as_var) >/dev/null 2>&1 && $as_unset $as_var
+done
+PS1='$ '
+PS2='> '
+PS4='+ '
+
+# NLS nuisances.
+for as_var in \
+  LANG LANGUAGE LC_ADDRESS LC_ALL LC_COLLATE LC_CTYPE LC_IDENTIFICATION \
+  LC_MEASUREMENT LC_MESSAGES LC_MONETARY LC_NAME LC_NUMERIC LC_PAPER \
+  LC_TELEPHONE LC_TIME
+do
+  if (set +x; test -z "`(eval $as_var=C; export $as_var) 2>&1`"); then
+    eval $as_var=C; export $as_var
+  else
+    ($as_unset $as_var) >/dev/null 2>&1 && $as_unset $as_var
+  fi
+done
+
+# Required to use basename.
+if expr a : '\(a\)' >/dev/null 2>&1 &&
+   test "X`expr 00001 : '.*\(...\)'`" = X001; then
+  as_expr=expr
+else
+  as_expr=false
+fi
+
+if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then
+  as_basename=basename
+else
+  as_basename=false
+fi
+
+
+# Name of the executable.
+as_me=`$as_basename -- "$0" ||
+$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \
+	 X"$0" : 'X\(//\)$' \| \
+	 X"$0" : 'X\(/\)' \| . 2>/dev/null ||
+echo X/"$0" |
+    sed '/^.*\/\([^/][^/]*\)\/*$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\/\(\/\/\)$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\/\(\/\).*/{
+	    s//\1/
+	    q
+	  }
+	  s/.*/./; q'`
+
+# CDPATH.
+$as_unset CDPATH
+
+
+
+  as_lineno_1=$LINENO
+  as_lineno_2=$LINENO
+  test "x$as_lineno_1" != "x$as_lineno_2" &&
+  test "x`expr $as_lineno_1 + 1`" = "x$as_lineno_2" || {
+
+  # Create $as_me.lineno as a copy of $as_myself, but with $LINENO
+  # uniformly replaced by the line number.  The first 'sed' inserts a
+  # line-number line after each line using $LINENO; the second 'sed'
+  # does the real work.  The second script uses 'N' to pair each
+  # line-number line with the line containing $LINENO, and appends
+  # trailing '-' during substitution so that $LINENO is not a special
+  # case at line end.
+  # (Raja R Harinath suggested sed '=', and Paul Eggert wrote the
+  # scripts with optimization help from Paolo Bonzini.  Blame Lee
+  # E. McMahon (1931-1989) for sed's syntax.  :-)
+  sed -n '
+    p
+    /[$]LINENO/=
+  ' <$as_myself |
+    sed '
+      s/[$]LINENO.*/&-/
+      t lineno
+      b
+      :lineno
+      N
+      :loop
+      s/[$]LINENO\([^'$as_cr_alnum'_].*\n\)\(.*\)/\2\1\2/
+      t loop
+      s/-\n.*//
+    ' >$as_me.lineno &&
+  chmod +x "$as_me.lineno" ||
+    { echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2
+   { (exit 1); exit 1; }; }
+
+  # Don't try to exec as it changes $0, causing all sorts of problems
+  # (the dirname of $0 is not the place where we might find the
+  # original, and so on).  Autoconf is especially sensitive to this.
+  . "./$as_me.lineno"
+  # Exit status is that of the last command.
+  exit
+}
+
+
+if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then
+  as_dirname=dirname
+else
+  as_dirname=false
+fi
+
+ECHO_C= ECHO_N= ECHO_T=
+case `echo -n x` in
+-n*)
+  case `echo 'x\c'` in
+  *c*) ECHO_T='	';;	# ECHO_T is single tab character.
+  *)   ECHO_C='\c';;
+  esac;;
+*)
+  ECHO_N='-n';;
+esac
+
+if expr a : '\(a\)' >/dev/null 2>&1 &&
+   test "X`expr 00001 : '.*\(...\)'`" = X001; then
+  as_expr=expr
+else
+  as_expr=false
+fi
+
+rm -f conf$$ conf$$.exe conf$$.file
+if test -d conf$$.dir; then
+  rm -f conf$$.dir/conf$$.file
+else
+  rm -f conf$$.dir
+  mkdir conf$$.dir
+fi
+echo >conf$$.file
+if ln -s conf$$.file conf$$ 2>/dev/null; then
+  as_ln_s='ln -s'
+  # ... but there are two gotchas:
+  # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail.
+  # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable.
+  # In both cases, we have to default to `cp -p'.
+  ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe ||
+    as_ln_s='cp -p'
+elif ln conf$$.file conf$$ 2>/dev/null; then
+  as_ln_s=ln
+else
+  as_ln_s='cp -p'
+fi
+rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file
+rmdir conf$$.dir 2>/dev/null
+
+if mkdir -p . 2>/dev/null; then
+  as_mkdir_p=:
+else
+  test -d ./-p && rmdir ./-p
+  as_mkdir_p=false
+fi
+
+if test -x / >/dev/null 2>&1; then
+  as_test_x='test -x'
+else
+  if ls -dL / >/dev/null 2>&1; then
+    as_ls_L_option=L
+  else
+    as_ls_L_option=
+  fi
+  as_test_x='
+    eval sh -c '\''
+      if test -d "$1"; then
+        test -d "$1/.";
+      else
+	case $1 in
+        -*)set "./$1";;
+	esac;
+	case `ls -ld'$as_ls_L_option' "$1" 2>/dev/null` in
+	???[sx]*):;;*)false;;esac;fi
+    '\'' sh
+  '
+fi
+as_executable_p=$as_test_x
+
+# Sed expression to map a string onto a valid CPP name.
+as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'"
+
+# Sed expression to map a string onto a valid variable name.
+as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'"
+
+
+exec 6>&1
+
+# Save the log message, to keep $0 and so on meaningful, and to
+# report actual input values of CONFIG_FILES etc. instead of their
+# values after options handling.
+ac_log="
+This file was extended by ThreadPool $as_me 1.1d, which was
+generated by GNU Autoconf 2.61.  Invocation command line was
+
+  CONFIG_FILES    = $CONFIG_FILES
+  CONFIG_HEADERS  = $CONFIG_HEADERS
+  CONFIG_LINKS    = $CONFIG_LINKS
+  CONFIG_COMMANDS = $CONFIG_COMMANDS
+  $ $0 $@
+
+on `(hostname || uname -n) 2>/dev/null | sed 1q`
+"
+
+_ACEOF
+
+cat >>$CONFIG_STATUS <<_ACEOF
+# Files that config.status was made for.
+config_files="$ac_config_files"
+config_headers="$ac_config_headers"
+config_commands="$ac_config_commands"
+
+_ACEOF
+
+cat >>$CONFIG_STATUS <<\_ACEOF
+ac_cs_usage="\
+\`$as_me' instantiates files from templates according to the
+current configuration.
+
+Usage: $0 [OPTIONS] [FILE]...
+
+  -h, --help       print this help, then exit
+  -V, --version    print version number and configuration settings, then exit
+  -q, --quiet      do not print progress messages
+  -d, --debug      don't remove temporary files
+      --recheck    update $as_me by reconfiguring in the same conditions
+  --file=FILE[:TEMPLATE]
+		   instantiate the configuration file FILE
+  --header=FILE[:TEMPLATE]
+		   instantiate the configuration header FILE
+
+Configuration files:
+$config_files
+
+Configuration headers:
+$config_headers
+
+Configuration commands:
+$config_commands
+
+Report bugs to <bug-autoconf@gnu.org>."
+
+_ACEOF
+cat >>$CONFIG_STATUS <<_ACEOF
+ac_cs_version="\\
+ThreadPool config.status 1.1d
+configured by $0, generated by GNU Autoconf 2.61,
+  with options \\"`echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`\\"
+
+Copyright (C) 2006 Free Software Foundation, Inc.
+This config.status script is free software; the Free Software Foundation
+gives unlimited permission to copy, distribute and modify it."
+
+ac_pwd='$ac_pwd'
+srcdir='$srcdir'
+INSTALL='$INSTALL'
+MKDIR_P='$MKDIR_P'
+_ACEOF
+
+cat >>$CONFIG_STATUS <<\_ACEOF
+# If no files are specified by the user, then we need to provide default
+# values.  But we need to know if files were specified by the user.
+ac_need_defaults=:
+while test $# != 0
+do
+  case $1 in
+  --*=*)
+    ac_option=`expr "X$1" : 'X\([^=]*\)='`
+    ac_optarg=`expr "X$1" : 'X[^=]*=\(.*\)'`
+    ac_shift=:
+    ;;
+  *)
+    ac_option=$1
+    ac_optarg=$2
+    ac_shift=shift
+    ;;
+  esac
+
+  case $ac_option in
+  # Handling of the options.
+  -recheck | --recheck | --rechec | --reche | --rech | --rec | --re | --r)
+    ac_cs_recheck=: ;;
+  --version | --versio | --versi | --vers | --ver | --ve | --v | -V )
+    echo "$ac_cs_version"; exit ;;
+  --debug | --debu | --deb | --de | --d | -d )
+    debug=: ;;
+  --file | --fil | --fi | --f )
+    $ac_shift
+    CONFIG_FILES="$CONFIG_FILES $ac_optarg"
+    ac_need_defaults=false;;
+  --header | --heade | --head | --hea )
+    $ac_shift
+    CONFIG_HEADERS="$CONFIG_HEADERS $ac_optarg"
+    ac_need_defaults=false;;
+  --he | --h)
+    # Conflict between --help and --header
+    { echo "$as_me: error: ambiguous option: $1
+Try \`$0 --help' for more information." >&2
+   { (exit 1); exit 1; }; };;
+  --help | --hel | -h )
+    echo "$ac_cs_usage"; exit ;;
+  -q | -quiet | --quiet | --quie | --qui | --qu | --q \
+  | -silent | --silent | --silen | --sile | --sil | --si | --s)
+    ac_cs_silent=: ;;
+
+  # This is an error.
+  -*) { echo "$as_me: error: unrecognized option: $1
+Try \`$0 --help' for more information." >&2
+   { (exit 1); exit 1; }; } ;;
+
+  *) ac_config_targets="$ac_config_targets $1"
+     ac_need_defaults=false ;;
+
+  esac
+  shift
+done
+
+ac_configure_extra_args=
+
+if $ac_cs_silent; then
+  exec 6>/dev/null
+  ac_configure_extra_args="$ac_configure_extra_args --silent"
+fi
+
+_ACEOF
+cat >>$CONFIG_STATUS <<_ACEOF
+if \$ac_cs_recheck; then
+  echo "running CONFIG_SHELL=$SHELL $SHELL $0 "$ac_configure_args \$ac_configure_extra_args " --no-create --no-recursion" >&6
+  CONFIG_SHELL=$SHELL
+  export CONFIG_SHELL
+  exec $SHELL "$0"$ac_configure_args \$ac_configure_extra_args --no-create --no-recursion
+fi
+
+_ACEOF
+cat >>$CONFIG_STATUS <<\_ACEOF
+exec 5>>config.log
+{
+  echo
+  sed 'h;s/./-/g;s/^.../## /;s/...$/ ##/;p;x;p;x' <<_ASBOX
+## Running $as_me. ##
+_ASBOX
+  echo "$ac_log"
+} >&5
+
+_ACEOF
+cat >>$CONFIG_STATUS <<_ACEOF
+#
+# INIT-COMMANDS
+#
+AMDEP_TRUE="$AMDEP_TRUE" ac_aux_dir="$ac_aux_dir"
+
+_ACEOF
+
+cat >>$CONFIG_STATUS <<\_ACEOF
+
+# Handling of arguments.
+for ac_config_target in $ac_config_targets
+do
+  case $ac_config_target in
+    "src/ThreadPool_config.h") CONFIG_HEADERS="$CONFIG_HEADERS src/ThreadPool_config.h:src/ThreadPool_config.h.in" ;;
+    "depfiles") CONFIG_COMMANDS="$CONFIG_COMMANDS depfiles" ;;
+    "Makefile") CONFIG_FILES="$CONFIG_FILES Makefile" ;;
+    "Makefile.export.threadpool") CONFIG_FILES="$CONFIG_FILES Makefile.export.threadpool" ;;
+    "src/Makefile") CONFIG_FILES="$CONFIG_FILES src/Makefile" ;;
+    "test/Makefile") CONFIG_FILES="$CONFIG_FILES test/Makefile" ;;
+
+  *) { { echo "$as_me:$LINENO: error: invalid argument: $ac_config_target" >&5
+echo "$as_me: error: invalid argument: $ac_config_target" >&2;}
+   { (exit 1); exit 1; }; };;
+  esac
+done
+
+
+# If the user did not use the arguments to specify the items to instantiate,
+# then the envvar interface is used.  Set only those that are not.
+# We use the long form for the default assignment because of an extremely
+# bizarre bug on SunOS 4.1.3.
+if $ac_need_defaults; then
+  test "${CONFIG_FILES+set}" = set || CONFIG_FILES=$config_files
+  test "${CONFIG_HEADERS+set}" = set || CONFIG_HEADERS=$config_headers
+  test "${CONFIG_COMMANDS+set}" = set || CONFIG_COMMANDS=$config_commands
+fi
+
+# Have a temporary directory for convenience.  Make it in the build tree
+# simply because there is no reason against having it here, and in addition,
+# creating and moving files from /tmp can sometimes cause problems.
+# Hook for its removal unless debugging.
+# Note that there is a small window in which the directory will not be cleaned:
+# after its creation but before its name has been assigned to `$tmp'.
+$debug ||
+{
+  tmp=
+  trap 'exit_status=$?
+  { test -z "$tmp" || test ! -d "$tmp" || rm -fr "$tmp"; } && exit $exit_status
+' 0
+  trap '{ (exit 1); exit 1; }' 1 2 13 15
+}
+# Create a (secure) tmp directory for tmp files: try `mktemp -d' first, then
+# fall back to mkdir under umask 077; if both fail, report as $as_me and exit 1.
+{
+  tmp=`(umask 077 && mktemp -d "./confXXXXXX") 2>/dev/null` &&
+  test -n "$tmp" && test -d "$tmp"
+}  ||
+{
+  tmp=./conf$$-$RANDOM
+  (umask 077 && mkdir "$tmp")
+} ||
+{
+   echo "$as_me: cannot create a temporary directory in ." >&2
+   { (exit 1); exit 1; }
+}
+
+#
+# Set up the sed scripts for CONFIG_FILES section.
+#
+
+# No need to generate the scripts if there are no CONFIG_FILES.
+# This happens for instance when ./config.status config.h
+if test -n "$CONFIG_FILES"; then
+
+_ACEOF
+
+
+
+ac_delim='%!_!# '
+for ac_last_try in false false false false false :; do
+  cat >conf$$subs.sed <<_ACEOF
+SHELL!$SHELL$ac_delim
+PATH_SEPARATOR!$PATH_SEPARATOR$ac_delim
+PACKAGE_NAME!$PACKAGE_NAME$ac_delim
+PACKAGE_TARNAME!$PACKAGE_TARNAME$ac_delim
+PACKAGE_VERSION!$PACKAGE_VERSION$ac_delim
+PACKAGE_STRING!$PACKAGE_STRING$ac_delim
+PACKAGE_BUGREPORT!$PACKAGE_BUGREPORT$ac_delim
+exec_prefix!$exec_prefix$ac_delim
+prefix!$prefix$ac_delim
+program_transform_name!$program_transform_name$ac_delim
+bindir!$bindir$ac_delim
+sbindir!$sbindir$ac_delim
+libexecdir!$libexecdir$ac_delim
+datarootdir!$datarootdir$ac_delim
+datadir!$datadir$ac_delim
+sysconfdir!$sysconfdir$ac_delim
+sharedstatedir!$sharedstatedir$ac_delim
+localstatedir!$localstatedir$ac_delim
+includedir!$includedir$ac_delim
+oldincludedir!$oldincludedir$ac_delim
+docdir!$docdir$ac_delim
+infodir!$infodir$ac_delim
+htmldir!$htmldir$ac_delim
+dvidir!$dvidir$ac_delim
+pdfdir!$pdfdir$ac_delim
+psdir!$psdir$ac_delim
+libdir!$libdir$ac_delim
+localedir!$localedir$ac_delim
+mandir!$mandir$ac_delim
+DEFS!$DEFS$ac_delim
+ECHO_C!$ECHO_C$ac_delim
+ECHO_N!$ECHO_N$ac_delim
+ECHO_T!$ECHO_T$ac_delim
+LIBS!$LIBS$ac_delim
+build_alias!$build_alias$ac_delim
+host_alias!$host_alias$ac_delim
+target_alias!$target_alias$ac_delim
+MAINTAINER_MODE_TRUE!$MAINTAINER_MODE_TRUE$ac_delim
+MAINTAINER_MODE_FALSE!$MAINTAINER_MODE_FALSE$ac_delim
+MAINT!$MAINT$ac_delim
+build!$build$ac_delim
+build_cpu!$build_cpu$ac_delim
+build_vendor!$build_vendor$ac_delim
+build_os!$build_os$ac_delim
+host!$host$ac_delim
+host_cpu!$host_cpu$ac_delim
+host_vendor!$host_vendor$ac_delim
+host_os!$host_os$ac_delim
+target!$target$ac_delim
+target_cpu!$target_cpu$ac_delim
+target_vendor!$target_vendor$ac_delim
+target_os!$target_os$ac_delim
+INSTALL_PROGRAM!$INSTALL_PROGRAM$ac_delim
+INSTALL_SCRIPT!$INSTALL_SCRIPT$ac_delim
+INSTALL_DATA!$INSTALL_DATA$ac_delim
+am__isrc!$am__isrc$ac_delim
+CYGPATH_W!$CYGPATH_W$ac_delim
+PACKAGE!$PACKAGE$ac_delim
+VERSION!$VERSION$ac_delim
+ACLOCAL!$ACLOCAL$ac_delim
+AUTOCONF!$AUTOCONF$ac_delim
+AUTOMAKE!$AUTOMAKE$ac_delim
+AUTOHEADER!$AUTOHEADER$ac_delim
+MAKEINFO!$MAKEINFO$ac_delim
+install_sh!$install_sh$ac_delim
+STRIP!$STRIP$ac_delim
+INSTALL_STRIP_PROGRAM!$INSTALL_STRIP_PROGRAM$ac_delim
+mkdir_p!$mkdir_p$ac_delim
+AWK!$AWK$ac_delim
+SET_MAKE!$SET_MAKE$ac_delim
+am__leading_dot!$am__leading_dot$ac_delim
+AMTAR!$AMTAR$ac_delim
+am__tar!$am__tar$ac_delim
+am__untar!$am__untar$ac_delim
+MPI_TEMP_CXX!$MPI_TEMP_CXX$ac_delim
+MPI_CXX!$MPI_CXX$ac_delim
+HAVE_MPI_TRUE!$HAVE_MPI_TRUE$ac_delim
+HAVE_MPI_FALSE!$HAVE_MPI_FALSE$ac_delim
+MPI_CXX_EXISTS!$MPI_CXX_EXISTS$ac_delim
+MPI_CC_EXISTS!$MPI_CC_EXISTS$ac_delim
+MPI_F77_EXISTS!$MPI_F77_EXISTS$ac_delim
+CC!$CC$ac_delim
+CFLAGS!$CFLAGS$ac_delim
+LDFLAGS!$LDFLAGS$ac_delim
+CPPFLAGS!$CPPFLAGS$ac_delim
+ac_ct_CC!$ac_ct_CC$ac_delim
+EXEEXT!$EXEEXT$ac_delim
+OBJEXT!$OBJEXT$ac_delim
+DEPDIR!$DEPDIR$ac_delim
+am__include!$am__include$ac_delim
+am__quote!$am__quote$ac_delim
+AMDEP_TRUE!$AMDEP_TRUE$ac_delim
+AMDEP_FALSE!$AMDEP_FALSE$ac_delim
+AMDEPBACKSLASH!$AMDEPBACKSLASH$ac_delim
+CCDEPMODE!$CCDEPMODE$ac_delim
+am__fastdepCC_TRUE!$am__fastdepCC_TRUE$ac_delim
+am__fastdepCC_FALSE!$am__fastdepCC_FALSE$ac_delim
+_ACEOF
+
+  if test `sed -n "s/.*$ac_delim\$/X/p" conf$$subs.sed | grep -c X` = 97; then
+    break
+  elif $ac_last_try; then
+    { { echo "$as_me:$LINENO: error: could not make $CONFIG_STATUS" >&5
+echo "$as_me: error: could not make $CONFIG_STATUS" >&2;}
+   { (exit 1); exit 1; }; }
+  else
+    ac_delim="$ac_delim!$ac_delim _$ac_delim!! "
+  fi
+done
+
+ac_eof=`sed -n '/^CEOF[0-9]*$/s/CEOF/0/p' conf$$subs.sed`
+if test -n "$ac_eof"; then
+  ac_eof=`echo "$ac_eof" | sort -nru | sed 1q`
+  ac_eof=`expr $ac_eof + 1`
+fi
+
+cat >>$CONFIG_STATUS <<_ACEOF
+cat >"\$tmp/subs-1.sed" <<\CEOF$ac_eof
+/@[a-zA-Z_][a-zA-Z_0-9]*@/!b
+_ACEOF
+sed '
+s/[,\\&]/\\&/g; s/@/@|#_!!_#|/g
+s/^/s,@/; s/!/@,|#_!!_#|/
+:n
+t n
+s/'"$ac_delim"'$/,g/; t
+s/$/\\/; p
+N; s/^.*\n//; s/[,\\&]/\\&/g; s/@/@|#_!!_#|/g; b n
+' >>$CONFIG_STATUS <conf$$subs.sed
+rm -f conf$$subs.sed
+cat >>$CONFIG_STATUS <<_ACEOF
+CEOF$ac_eof
+_ACEOF
+
+
+ac_delim='%!_!# '
+for ac_last_try in false false false false false :; do
+  cat >conf$$subs.sed <<_ACEOF
+CXX!$CXX$ac_delim
+CXXFLAGS!$CXXFLAGS$ac_delim
+ac_ct_CXX!$ac_ct_CXX$ac_delim
+CXXDEPMODE!$CXXDEPMODE$ac_delim
+am__fastdepCXX_TRUE!$am__fastdepCXX_TRUE$ac_delim
+am__fastdepCXX_FALSE!$am__fastdepCXX_FALSE$ac_delim
+RANLIB!$RANLIB$ac_delim
+USE_ALTERNATE_AR_TRUE!$USE_ALTERNATE_AR_TRUE$ac_delim
+USE_ALTERNATE_AR_FALSE!$USE_ALTERNATE_AR_FALSE$ac_delim
+ALTERNATE_AR!$ALTERNATE_AR$ac_delim
+CXXCPP!$CXXCPP$ac_delim
+USING_EXPORT_MAKEFILES_TRUE!$USING_EXPORT_MAKEFILES_TRUE$ac_delim
+USING_EXPORT_MAKEFILES_FALSE!$USING_EXPORT_MAKEFILES_FALSE$ac_delim
+PERL_EXE!$PERL_EXE$ac_delim
+HAVE_PERL!$HAVE_PERL$ac_delim
+USING_PERL_TRUE!$USING_PERL_TRUE$ac_delim
+USING_PERL_FALSE!$USING_PERL_FALSE$ac_delim
+USING_GNUMAKE_TRUE!$USING_GNUMAKE_TRUE$ac_delim
+USING_GNUMAKE_FALSE!$USING_GNUMAKE_FALSE$ac_delim
+BUILD_TESTS_TRUE!$BUILD_TESTS_TRUE$ac_delim
+BUILD_TESTS_FALSE!$BUILD_TESTS_FALSE$ac_delim
+SUB_TEST_TRUE!$SUB_TEST_TRUE$ac_delim
+SUB_TEST_FALSE!$SUB_TEST_FALSE$ac_delim
+GREP!$GREP$ac_delim
+EGREP!$EGREP$ac_delim
+PTHREAD_CC!$PTHREAD_CC$ac_delim
+PTHREAD_LIBS!$PTHREAD_LIBS$ac_delim
+PTHREAD_CFLAGS!$PTHREAD_CFLAGS$ac_delim
+ac_aux_dir!$ac_aux_dir$ac_delim
+LIBOBJS!$LIBOBJS$ac_delim
+LTLIBOBJS!$LTLIBOBJS$ac_delim
+_ACEOF
+
+  if test `sed -n "s/.*$ac_delim\$/X/p" conf$$subs.sed | grep -c X` = 31; then
+    break
+  elif $ac_last_try; then
+    { { echo "$as_me:$LINENO: error: could not make $CONFIG_STATUS" >&5
+echo "$as_me: error: could not make $CONFIG_STATUS" >&2;}
+   { (exit 1); exit 1; }; }
+  else
+    ac_delim="$ac_delim!$ac_delim _$ac_delim!! "
+  fi
+done
+
+ac_eof=`sed -n '/^CEOF[0-9]*$/s/CEOF/0/p' conf$$subs.sed`
+if test -n "$ac_eof"; then
+  ac_eof=`echo "$ac_eof" | sort -nru | sed 1q`
+  ac_eof=`expr $ac_eof + 1`
+fi
+
+cat >>$CONFIG_STATUS <<_ACEOF
+cat >"\$tmp/subs-2.sed" <<\CEOF$ac_eof
+/@[a-zA-Z_][a-zA-Z_0-9]*@/!b end
+_ACEOF
+sed '
+s/[,\\&]/\\&/g; s/@/@|#_!!_#|/g
+s/^/s,@/; s/!/@,|#_!!_#|/
+:n
+t n
+s/'"$ac_delim"'$/,g/; t
+s/$/\\/; p
+N; s/^.*\n//; s/[,\\&]/\\&/g; s/@/@|#_!!_#|/g; b n
+' >>$CONFIG_STATUS <conf$$subs.sed
+rm -f conf$$subs.sed
+cat >>$CONFIG_STATUS <<_ACEOF
+:end
+s/|#_!!_#|//g
+CEOF$ac_eof
+_ACEOF
+
+
+# VPATH may cause trouble with some makes, so we remove $(srcdir),
+# ${srcdir} and @srcdir@ from VPATH if srcdir is ".", strip leading and
+# trailing colons and then remove the whole line if VPATH becomes empty
+# (actually we leave an empty line to preserve line numbers).
+if test "x$srcdir" = x.; then
+  ac_vpsub='/^[	 ]*VPATH[	 ]*=/{
+s/:*\$(srcdir):*/:/
+s/:*\${srcdir}:*/:/
+s/:*@srcdir@:*/:/
+s/^\([^=]*=[	 ]*\):*/\1/
+s/:*$//
+s/^[^=]*=[	 ]*$//
+}'
+fi
+
+cat >>$CONFIG_STATUS <<\_ACEOF
+fi # test -n "$CONFIG_FILES"
+
+
+for ac_tag in  :F $CONFIG_FILES  :H $CONFIG_HEADERS    :C $CONFIG_COMMANDS
+do
+  case $ac_tag in
+  :[FHLC]) ac_mode=$ac_tag; continue;;
+  esac
+  case $ac_mode$ac_tag in
+  :[FHL]*:*);;
+  :L* | :C*:*) { { echo "$as_me:$LINENO: error: Invalid tag $ac_tag." >&5
+echo "$as_me: error: Invalid tag $ac_tag." >&2;}
+   { (exit 1); exit 1; }; };;
+  :[FH]-) ac_tag=-:-;;
+  :[FH]*) ac_tag=$ac_tag:$ac_tag.in;;
+  esac
+  ac_save_IFS=$IFS
+  IFS=:
+  set x $ac_tag
+  IFS=$ac_save_IFS
+  shift
+  ac_file=$1
+  shift
+
+  case $ac_mode in
+  :L) ac_source=$1;;
+  :[FH])
+    ac_file_inputs=
+    for ac_f
+    do
+      case $ac_f in
+      -) ac_f="$tmp/stdin";;
+      *) # Look for the file first in the build tree, then in the source tree
+	 # (if the path is not absolute).  The absolute path cannot be DOS-style,
+	 # because $ac_f cannot contain `:'.
+	 test -f "$ac_f" ||
+	   case $ac_f in
+	   [\\/$]*) false;;
+	   *) test -f "$srcdir/$ac_f" && ac_f="$srcdir/$ac_f";;
+	   esac ||
+	   { { echo "$as_me:$LINENO: error: cannot find input file: $ac_f" >&5
+echo "$as_me: error: cannot find input file: $ac_f" >&2;}
+   { (exit 1); exit 1; }; };;
+      esac
+      ac_file_inputs="$ac_file_inputs $ac_f"
+    done
+
+    # Let's still pretend it is `configure' which instantiates (i.e., don't
+    # use $as_me), people would be surprised to read:
+    #    /* config.h.  Generated by config.status.  */
+    configure_input="Generated from "`IFS=:
+	  echo $* | sed 's|^[^:]*/||;s|:[^:]*/|, |g'`" by configure."
+    if test x"$ac_file" != x-; then
+      configure_input="$ac_file.  $configure_input"
+      { echo "$as_me:$LINENO: creating $ac_file" >&5
+echo "$as_me: creating $ac_file" >&6;}
+    fi
+
+    case $ac_tag in
+    *:-:* | *:-) cat >"$tmp/stdin";;
+    esac
+    ;;
+  esac
+
+  ac_dir=`$as_dirname -- "$ac_file" ||
+$as_expr X"$ac_file" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+	 X"$ac_file" : 'X\(//\)[^/]' \| \
+	 X"$ac_file" : 'X\(//\)$' \| \
+	 X"$ac_file" : 'X\(/\)' \| . 2>/dev/null ||
+echo X"$ac_file" |
+    sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)[^/].*/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\).*/{
+	    s//\1/
+	    q
+	  }
+	  s/.*/./; q'`
+  { as_dir="$ac_dir"
+  case $as_dir in #(
+  -*) as_dir=./$as_dir;;
+  esac
+  test -d "$as_dir" || { $as_mkdir_p && mkdir -p "$as_dir"; } || {
+    as_dirs=
+    while :; do
+      case $as_dir in #(
+      *\'*) as_qdir=`echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #(
+      *) as_qdir=$as_dir;;
+      esac
+      as_dirs="'$as_qdir' $as_dirs"
+      as_dir=`$as_dirname -- "$as_dir" ||
+$as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+	 X"$as_dir" : 'X\(//\)[^/]' \| \
+	 X"$as_dir" : 'X\(//\)$' \| \
+	 X"$as_dir" : 'X\(/\)' \| . 2>/dev/null ||
+echo X"$as_dir" |
+    sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)[^/].*/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\).*/{
+	    s//\1/
+	    q
+	  }
+	  s/.*/./; q'`
+      test -d "$as_dir" && break
+    done
+    test -z "$as_dirs" || eval "mkdir $as_dirs"
+  } || test -d "$as_dir" || { { echo "$as_me:$LINENO: error: cannot create directory $as_dir" >&5
+echo "$as_me: error: cannot create directory $as_dir" >&2;}
+   { (exit 1); exit 1; }; }; }
+  ac_builddir=.
+
+case "$ac_dir" in
+.) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;;
+*)
+  ac_dir_suffix=/`echo "$ac_dir" | sed 's,^\.[\\/],,'`
+  # A ".." for each directory in $ac_dir_suffix.
+  ac_top_builddir_sub=`echo "$ac_dir_suffix" | sed 's,/[^\\/]*,/..,g;s,/,,'`
+  case $ac_top_builddir_sub in
+  "") ac_top_builddir_sub=. ac_top_build_prefix= ;;
+  *)  ac_top_build_prefix=$ac_top_builddir_sub/ ;;
+  esac ;;
+esac
+ac_abs_top_builddir=$ac_pwd
+ac_abs_builddir=$ac_pwd$ac_dir_suffix
+# for backward compatibility:
+ac_top_builddir=$ac_top_build_prefix
+
+case $srcdir in
+  .)  # We are building in place.
+    ac_srcdir=.
+    ac_top_srcdir=$ac_top_builddir_sub
+    ac_abs_top_srcdir=$ac_pwd ;;
+  [\\/]* | ?:[\\/]* )  # Absolute name.
+    ac_srcdir=$srcdir$ac_dir_suffix;
+    ac_top_srcdir=$srcdir
+    ac_abs_top_srcdir=$srcdir ;;
+  *) # Relative name.
+    ac_srcdir=$ac_top_build_prefix$srcdir$ac_dir_suffix
+    ac_top_srcdir=$ac_top_build_prefix$srcdir
+    ac_abs_top_srcdir=$ac_pwd/$srcdir ;;
+esac
+ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix
+
+
+  case $ac_mode in
+  :F)
+  #
+  # CONFIG_FILE
+  #
+
+  case $INSTALL in
+  [\\/$]* | ?:[\\/]* ) ac_INSTALL=$INSTALL ;;
+  *) ac_INSTALL=$ac_top_build_prefix$INSTALL ;;
+  esac
+  ac_MKDIR_P=$MKDIR_P
+  case $MKDIR_P in
+  [\\/$]* | ?:[\\/]* ) ;;
+  */*) ac_MKDIR_P=$ac_top_build_prefix$MKDIR_P ;;
+  esac
+_ACEOF
+
+cat >>$CONFIG_STATUS <<\_ACEOF
+# If the template does not know about datarootdir, expand it.
+# FIXME: This hack should be removed a few years after 2.60.
+ac_datarootdir_hack=; ac_datarootdir_seen=
+
+case `sed -n '/datarootdir/ {
+  p
+  q
+}
+/@datadir@/p
+/@docdir@/p
+/@infodir@/p
+/@localedir@/p
+/@mandir@/p
+' $ac_file_inputs` in
+*datarootdir*) ac_datarootdir_seen=yes;;
+*@datadir@*|*@docdir@*|*@infodir@*|*@localedir@*|*@mandir@*)
+  { echo "$as_me:$LINENO: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&5
+echo "$as_me: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&2;}
+_ACEOF
+cat >>$CONFIG_STATUS <<_ACEOF
+  ac_datarootdir_hack='
+  s&@datadir@&$datadir&g
+  s&@docdir@&$docdir&g
+  s&@infodir@&$infodir&g
+  s&@localedir@&$localedir&g
+  s&@mandir@&$mandir&g
+    s&\\\${datarootdir}&$datarootdir&g' ;;
+esac
+_ACEOF
+
+# Neutralize VPATH when `$srcdir' = `.'.
+# Shell code in configure.ac might set extrasub.
+# FIXME: do we really want to maintain this feature?
+cat >>$CONFIG_STATUS <<_ACEOF
+  sed "$ac_vpsub
+$extrasub
+_ACEOF
+cat >>$CONFIG_STATUS <<\_ACEOF
+:t
+/@[a-zA-Z_][a-zA-Z_0-9]*@/!b
+s&@configure_input@&$configure_input&;t t
+s&@top_builddir@&$ac_top_builddir_sub&;t t
+s&@srcdir@&$ac_srcdir&;t t
+s&@abs_srcdir@&$ac_abs_srcdir&;t t
+s&@top_srcdir@&$ac_top_srcdir&;t t
+s&@abs_top_srcdir@&$ac_abs_top_srcdir&;t t
+s&@builddir@&$ac_builddir&;t t
+s&@abs_builddir@&$ac_abs_builddir&;t t
+s&@abs_top_builddir@&$ac_abs_top_builddir&;t t
+s&@INSTALL@&$ac_INSTALL&;t t
+s&@MKDIR_P@&$ac_MKDIR_P&;t t
+$ac_datarootdir_hack
+" $ac_file_inputs | sed -f "$tmp/subs-1.sed" | sed -f "$tmp/subs-2.sed" >$tmp/out
+
+test -z "$ac_datarootdir_hack$ac_datarootdir_seen" &&
+  { ac_out=`sed -n '/\${datarootdir}/p' "$tmp/out"`; test -n "$ac_out"; } &&
+  { ac_out=`sed -n '/^[	 ]*datarootdir[	 ]*:*=/p' "$tmp/out"`; test -z "$ac_out"; } &&
+  { echo "$as_me:$LINENO: WARNING: $ac_file contains a reference to the variable \`datarootdir'
+which seems to be undefined.  Please make sure it is defined." >&5
+echo "$as_me: WARNING: $ac_file contains a reference to the variable \`datarootdir'
+which seems to be undefined.  Please make sure it is defined." >&2;}
+
+  rm -f "$tmp/stdin"
+  case $ac_file in
+  -) cat "$tmp/out"; rm -f "$tmp/out";;
+  *) rm -f "$ac_file"; mv "$tmp/out" $ac_file;;
+  esac
+ ;;
+  :H)
+  #
+  # CONFIG_HEADER
+  #
+_ACEOF
+
+# Transform confdefs.h into a sed script `conftest.defines', that
+# substitutes the proper values into config.h.in to produce config.h.
+rm -f conftest.defines conftest.tail
+# First, append a space to every undef/define line, to ease matching.
+echo 's/$/ /' >conftest.defines
+# Then, protect against being on the right side of a sed subst, or in
+# an unquoted here document, in config.status.  If some macros were
+# called several times there might be several #defines for the same
+# symbol, which is useless.  But do not sort them, since the last
+# AC_DEFINE must be honored.
+ac_word_re=[_$as_cr_Letters][_$as_cr_alnum]*
+# These sed commands are passed to sed as "A NAME B PARAMS C VALUE D", where
+# NAME is the cpp macro being defined, VALUE is the value it is being given.
+# PARAMS is the parameter list in the macro definition--in most cases, it's
+# just an empty string.
+ac_dA='s,^\\([	 #]*\\)[^	 ]*\\([	 ]*'
+ac_dB='\\)[	 (].*,\\1define\\2'
+ac_dC=' '
+ac_dD=' ,'
+
+uniq confdefs.h |
+  sed -n '
+	t rset
+	:rset
+	s/^[	 ]*#[	 ]*define[	 ][	 ]*//
+	t ok
+	d
+	:ok
+	s/[\\&,]/\\&/g
+	s/^\('"$ac_word_re"'\)\(([^()]*)\)[	 ]*\(.*\)/ '"$ac_dA"'\1'"$ac_dB"'\2'"${ac_dC}"'\3'"$ac_dD"'/p
+	s/^\('"$ac_word_re"'\)[	 ]*\(.*\)/'"$ac_dA"'\1'"$ac_dB$ac_dC"'\2'"$ac_dD"'/p
+  ' >>conftest.defines
+
+# Remove the space that was appended to ease matching.
+# Then replace #undef with comments.  This is necessary, for
+# example, in the case of _POSIX_SOURCE, which is predefined and required
+# on some systems where configure will not decide to define it.
+# (The regexp can be short, since the line contains either #define or #undef.)
+echo 's/ $//
+s,^[	 #]*u.*,/* & */,' >>conftest.defines
+
+# Break up conftest.defines:
+ac_max_sed_lines=50
+
+# First sed command is:	 sed -f defines.sed $ac_file_inputs >"$tmp/out1"
+# Second one is:	 sed -f defines.sed "$tmp/out1" >"$tmp/out2"
+# Third one will be:	 sed -f defines.sed "$tmp/out2" >"$tmp/out1"
+# et cetera.
+ac_in='$ac_file_inputs'
+ac_out='"$tmp/out1"'
+ac_nxt='"$tmp/out2"'
+
+while :
+do
+  # Write a here document:
+    cat >>$CONFIG_STATUS <<_ACEOF
+    # First, check the format of the line:
+    cat >"\$tmp/defines.sed" <<\\CEOF
+/^[	 ]*#[	 ]*undef[	 ][	 ]*$ac_word_re[	 ]*\$/b def
+/^[	 ]*#[	 ]*define[	 ][	 ]*$ac_word_re[(	 ]/b def
+b
+:def
+_ACEOF
+  sed ${ac_max_sed_lines}q conftest.defines >>$CONFIG_STATUS
+  echo 'CEOF
+    sed -f "$tmp/defines.sed"' "$ac_in >$ac_out" >>$CONFIG_STATUS
+  ac_in=$ac_out; ac_out=$ac_nxt; ac_nxt=$ac_in
+  sed 1,${ac_max_sed_lines}d conftest.defines >conftest.tail
+  grep . conftest.tail >/dev/null || break
+  rm -f conftest.defines
+  mv conftest.tail conftest.defines
+done
+rm -f conftest.defines conftest.tail
+
+echo "ac_result=$ac_in" >>$CONFIG_STATUS
+cat >>$CONFIG_STATUS <<\_ACEOF
+  if test x"$ac_file" != x-; then
+    echo "/* $configure_input  */" >"$tmp/config.h"
+    cat "$ac_result" >>"$tmp/config.h"
+    if diff $ac_file "$tmp/config.h" >/dev/null 2>&1; then
+      { echo "$as_me:$LINENO: $ac_file is unchanged" >&5
+echo "$as_me: $ac_file is unchanged" >&6;}
+    else
+      rm -f $ac_file
+      mv "$tmp/config.h" $ac_file
+    fi
+  else
+    echo "/* $configure_input  */"
+    cat "$ac_result"
+  fi
+  rm -f "$tmp/out12"
+# Compute $ac_file's index in $config_headers.
+_am_stamp_count=1
+for _am_header in $config_headers :; do
+  case $_am_header in
+    $ac_file | $ac_file:* )
+      break ;;
+    * )
+      _am_stamp_count=`expr $_am_stamp_count + 1` ;;
+  esac
+done
+echo "timestamp for $ac_file" >`$as_dirname -- $ac_file ||
+$as_expr X$ac_file : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+	 X$ac_file : 'X\(//\)[^/]' \| \
+	 X$ac_file : 'X\(//\)$' \| \
+	 X$ac_file : 'X\(/\)' \| . 2>/dev/null ||
+echo X$ac_file |
+    sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)[^/].*/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\).*/{
+	    s//\1/
+	    q
+	  }
+	  s/.*/./; q'`/stamp-h$_am_stamp_count
+ ;;
+
+  :C)  { echo "$as_me:$LINENO: executing $ac_file commands" >&5
+echo "$as_me: executing $ac_file commands" >&6;}
+ ;;
+  esac
+
+
+  case $ac_file$ac_mode in
+    "depfiles":C) test x"$AMDEP_TRUE" != x"" || for mf in $CONFIG_FILES; do
+  # Strip MF so we end up with the name of the file.
+  mf=`echo "$mf" | sed -e 's/:.*$//'`
+  # Check whether this is an Automake generated Makefile or not.
+  # We used to match only the files named `Makefile.in', but
+  # some people rename them; so instead we look at the file content.
+  # Grep'ing the first line is not enough: some people post-process
+  # each Makefile.in and add a new line on top of each file to say so.
+  # Grep'ing the whole file is not good either: AIX grep has a line
+  # limit of 2048, but all sed's we know understand at least 4000.
+  if sed 10q "$mf" | grep '^#.*generated by automake' > /dev/null 2>&1; then
+    dirpart=`$as_dirname -- "$mf" ||
+$as_expr X"$mf" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+	 X"$mf" : 'X\(//\)[^/]' \| \
+	 X"$mf" : 'X\(//\)$' \| \
+	 X"$mf" : 'X\(/\)' \| . 2>/dev/null ||
+echo X"$mf" |
+    sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)[^/].*/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\).*/{
+	    s//\1/
+	    q
+	  }
+	  s/.*/./; q'`
+  else
+    continue
+  fi
+  # Extract DEPDIR, am__include, and am__quote from the Makefile without
+  # running `make'; skip Makefiles that define no DEPDIR or no am__include.
+  DEPDIR=`sed -n 's/^DEPDIR = //p' < "$mf"`
+  test -z "$DEPDIR" && continue
+  am__include=`sed -n 's/^am__include = //p' < "$mf"`
+  test -z "$am__include" && continue
+  am__quote=`sed -n 's/^am__quote = //p' < "$mf"`
+  # When using ansi2knr, U may be empty or an underscore; expand it
+  U=`sed -n 's/^U = //p' < "$mf"`
+  # Find all dependency output files, they are included files with
+  # $(DEPDIR) in their names.  We invoke sed twice because it is the
+  # simplest approach to changing $(DEPDIR) to its actual value in the
+  # expansion.
+  for file in `sed -n "
+    s/^$am__include $am__quote\(.*(DEPDIR).*\)$am__quote"'$/\1/p' <"$mf" | \
+       sed -e 's/\$(DEPDIR)/'"$DEPDIR"'/g' -e 's/\$U/'"$U"'/g'`; do
+    # Make sure the directory exists.
+    test -f "$dirpart/$file" && continue
+    fdir=`$as_dirname -- "$file" ||
+$as_expr X"$file" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+	 X"$file" : 'X\(//\)[^/]' \| \
+	 X"$file" : 'X\(//\)$' \| \
+	 X"$file" : 'X\(/\)' \| . 2>/dev/null ||
+echo X"$file" |
+    sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)[^/].*/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\).*/{
+	    s//\1/
+	    q
+	  }
+	  s/.*/./; q'`
+    { as_dir=$dirpart/$fdir
+  case $as_dir in #(
+  -*) as_dir=./$as_dir;;
+  esac
+  test -d "$as_dir" || { $as_mkdir_p && mkdir -p "$as_dir"; } || {
+    as_dirs=
+    while :; do
+      case $as_dir in #(
+      *\'*) as_qdir=`echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #(
+      *) as_qdir=$as_dir;;
+      esac
+      as_dirs="'$as_qdir' $as_dirs"
+      as_dir=`$as_dirname -- "$as_dir" ||
+$as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+	 X"$as_dir" : 'X\(//\)[^/]' \| \
+	 X"$as_dir" : 'X\(//\)$' \| \
+	 X"$as_dir" : 'X\(/\)' \| . 2>/dev/null ||
+echo X"$as_dir" |
+    sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)[^/].*/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\).*/{
+	    s//\1/
+	    q
+	  }
+	  s/.*/./; q'`
+      test -d "$as_dir" && break
+    done
+    test -z "$as_dirs" || eval "mkdir $as_dirs"
+  } || test -d "$as_dir" || { { echo "$as_me:$LINENO: error: cannot create directory $as_dir" >&5
+echo "$as_me: error: cannot create directory $as_dir" >&2;}
+   { (exit 1); exit 1; }; }; }
+    # echo "creating $dirpart/$file"
+    echo '# dummy' > "$dirpart/$file"
+  done
+done
+ ;;
+
+  esac
+done # for ac_tag
+
+
+{ (exit 0); exit 0; }
+_ACEOF
+chmod +x $CONFIG_STATUS
+ac_clean_files=$ac_clean_files_save
+
+
+# configure is writing to config.log, and then calls config.status.
+# config.status does its own redirection, appending to config.log.
+# Unfortunately, on DOS this fails, as config.log is still kept open
+# by configure, so config.status won't be able to write to it; its
+# output is simply discarded.  So we exec the FD to /dev/null,
+# effectively closing config.log, so it can be properly (re)opened and
+# appended to by config.status.  When coming back to configure, we
+# need to make the FD available again.
+if test "$no_create" != yes; then
+  ac_cs_success=:
+  ac_config_status_args=
+  test "$silent" = yes &&
+    ac_config_status_args="$ac_config_status_args --quiet"
+  exec 5>/dev/null
+  $SHELL $CONFIG_STATUS $ac_config_status_args || ac_cs_success=false
+  exec 5>>config.log
+  # Use ||, not &&, to avoid exiting from the if with $? = 1, which
+  # would make configure fail if this is the last instruction.
+  $ac_cs_success || { (exit 1); exit 1; }
+fi
+
+
+# Bye World!
+echo "---------------------------------------------"
+echo "Finished Running ThreadPool Configure Script"
+echo "---------------------------------------------"
diff --git a/openmp-avx512/basic/optional/ThreadPool/configure.ac b/openmp-avx512/basic/optional/ThreadPool/configure.ac
new file mode 100644
index 0000000..12778f4
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/configure.ac
@@ -0,0 +1,240 @@
+# ------------------------------------------------------------------------
+# Process this file with autoconf to produce a configure script.
+# ------------------------------------------------------------------------
+
+# @HEADER
+# ************************************************************************
+# 
+#                           ThreadPool Package
+#                 Copyright (2008) Sandia Corporation
+# 
+# Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+# license for use of this work by or on behalf of the U.S. Government.
+# 
+# This library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation; either version 2.1 of the
+# License, or (at your option) any later version.
+#  
+# This library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#  
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+# USA
+# Questions? Contact Carter Edwards (hcedwar@sandia.gov) 
+# 
+# ************************************************************************
+# @HEADER
+
+# ------------------------------------------------------------------------
+# Initialization 
+# ------------------------------------------------------------------------
+
+# This must be the first line in configure.ac.
+# Optional 3rd argument is email address for bugs.
+
+#  #np# - package name, version number, and e-mail address below
+AC_INIT(ThreadPool, 1.1d, hcedwar@sandia.gov)
+
+# Hello World!
+echo "----------------------------------------"
+echo "Running ThreadPool Configure Script"
+echo "----------------------------------------"
+
+# This is to protect against accidentally specifying the wrong
+# directory with --srcdir.  Any file in that directory will do,
+# preferably one that is unlikely to be removed or renamed.
+
+AC_CONFIG_SRCDIR([src/TPI.c])
+
+# Specify directory for auxiliary build tools (e.g., install-sh,
+# config.sub, config.guess) and M4 files.
+
+AC_CONFIG_AUX_DIR(config)
+#  #auto np# - Change file names in next line 
+# Configure should create src/ThreadPool_config.h from src/ThreadPool_config.h.in
+# (AC_CONFIG_HEADERS is the supported spelling of the obsolete AM_CONFIG_HEADER.)
+AC_CONFIG_HEADERS([src/ThreadPool_config.h:src/ThreadPool_config.h.in])
+
+# Allow users to specify their own "install" command.  If none is specified,
+# the default is install-sh found in the config subdirectory.
+                                                                                
+AC_ARG_WITH(install,
+ [AS_HELP_STRING([--with-install=INSTALL_PROGRAM],
+ [Use the installation program INSTALL_PROGRAM rather than the default that is provided.  For example --with-install="/path/install -p"])],
+ [
+   INSTALL=$withval
+   INSTALL_PROGRAM=$withval
+   INSTALL_SCRIPT=$withval
+   INSTALL_DATA="$withval -m 644"
+ ],)
+                                                                                
+# AM_MAINTAINER_MODE turns off maintainer-only makefile targets by
+# default, and changes configure to understand a
+# --enable-maintainer-mode option. --enable-maintainer-mode turns the
+# maintainer-only targets back on. The maintainer-only makefile
+# targets permit end users to clean automatically-generated files such
+# as configure, which means they have to have autoconf and automake
+# installed to repair the damage. AM_MAINTAINER_MODE makes it a bit
+# harder for users to shoot themselves in the foot.
+
+AM_MAINTAINER_MODE
+
+# Define $build, $host, $target, etc
+
+AC_CANONICAL_TARGET
+
+# Use automake
+
+#  - Required version of automake.
+AM_INIT_AUTOMAKE(1.10 no-define tar-ustar)
+
+# Specify required version of autoconf.
+
+AC_PREREQ(2.61)
+
+# ------------------------------------------------------------------------
+# Check to see if MPI enabled and if any special configuration done
+# ------------------------------------------------------------------------
+
+TAC_ARG_CONFIG_MPI
+
+#  #np# - can eliminate compiler checks below if your package does not use the
+#         language corresponding to the check.  Please note that if you use
+#	  F77_FUNC to determine Fortran name mangling, you should not remove
+#	  the Fortran compiler check or the check for Fortran flags.  Doing
+#	  so will prevent the detection of the proper name mangling in some
+#	  cases.
+# ------------------------------------------------------------------------
+# Checks for programs
+# ------------------------------------------------------------------------
+
+AC_PROG_CC(cc gcc)
+AC_PROG_CXX(CC g++ c++ cxx)
+#AC_PROG_F77(f77 g77 gfortran f90 xlf90 f95)
+AC_PROG_RANLIB
+
+# Check if --with-flags present, prepend any specs to FLAGS
+
+TAC_ARG_WITH_FLAGS(ccflags, CCFLAGS)
+TAC_ARG_WITH_FLAGS(cxxflags, CXXFLAGS)
+TAC_ARG_WITH_FLAGS(cflags, CFLAGS)
+#TAC_ARG_WITH_FLAGS(fflags, FFLAGS)
+TAC_ARG_WITH_LIBS
+TAC_ARG_WITH_FLAGS(ldflags, LDFLAGS)
+
+# ------------------------------------------------------------------------
+# Alternate archiver
+# ------------------------------------------------------------------------
+
+TAC_ARG_WITH_AR
+
+# ------------------------------------------------------------------------
+# MPI link check
+# ------------------------------------------------------------------------
+TAC_ARG_CHECK_MPI
+
+# ------------------------------------------------------------------------
+# Checks for Makefile.export related systems
+# ------------------------------------------------------------------------
+TAC_ARG_ENABLE_EXPORT_MAKEFILES(yes)
+
+# ------------------------------------------------------------------------
+# Checks if tests and examples should be built
+# ------------------------------------------------------------------------
+
+#  #np# - These options can disable the tests and examples of a package.
+#  #np# - Packages that do not have tests or examples should #-out the 
+#  #np# - option(s) that does (do) not apply.
+
+TAC_ARG_ENABLE_FEATURE(tests, [Make tests for all Trilinos packages buildable with 'make tests'], TESTS, yes)
+TAC_ARG_ENABLE_FEATURE_SUB_CHECK( threadpool, tests, [Make ThreadPool tests buildable with 'make tests'], NEW_PACKAGE_TESTS)
+AM_CONDITIONAL(BUILD_TESTS, test "X$ac_cv_use_threadpool_tests" != "Xno")
+
+#TAC_ARG_ENABLE_FEATURE(examples, [Make examples for all Trilinos packages buildable with 'make examples'], EXAMPLES, yes)
+#TAC_ARG_ENABLE_FEATURE_SUB_CHECK( new_package, examples, [Make New_Package examples buildable with 'make examples'], NEW_PACKAGE_EXAMPLES)
+#AM_CONDITIONAL(BUILD_EXAMPLES, test "X$ac_cv_use_new_package_examples" != "Xno")
+
+#We now build tests and examples through separate make targets, rather than
+#during "make".  We still need to conditionally include the test and example
+#in SUBDIRS, even though SUB_TEST and SUB_EXAMPLE will never be
+#defined, so that the tests and examples are included in the distribution
+#tarball.
+AM_CONDITIONAL(SUB_TEST, test "X$ac_cv_use_sub_test" = "Xyes")
+#AM_CONDITIONAL(SUB_EXAMPLE, test "X$ac_cv_use_sub_example" = "Xyes")
+
+TAC_ARG_ENABLE_FEATURE(libcheck, [Check for some third-party libraries.  (Cannot be disabled unless tests and examples are also disabled.)], LIBCHECK, yes)
+
+# ------------------------------------------------------------------------
+# Specify other directories 
+# ------------------------------------------------------------------------
+
+# enable use of --with-libdirs="-Llibdir1 -Llibdir2 ..." to prepend to LDFLAGS
+TAC_ARG_WITH_LIBDIRS
+# enable use of --with-incdirs="-Iincdir1 -Iincdir2 ..." to prepend to CPPFLAGS
+TAC_ARG_WITH_INCDIRS
+
+# #np# - Yet another opportunity to remove code if you aren't
+# using Fortran
+# Define F77_FUNC that will be used to link with Fortran subroutines. - trash WORKGXX 
+#AC_F77_WRAPPERS
+
+# ------------------------------------------------------------------------
+# Checks for libraries
+# ------------------------------------------------------------------------
+
+# If tests, examples and libcheck are disabled, we don't have to check
+# for these libraries.
+
+# #np# -
+# If a package does not have tests or examples, the corresponding check(s)
+# should be pulled out of the "if" statement below.
+#if test "X$ac_cv_use_new_package_examples" != "Xno" || test "X$ac_cv_use_libcheck" != "Xno"; then
+if test "X$ac_cv_use_threadpool_tests" != "Xno" || test "X$ac_cv_use_libcheck" != "Xno"; then
+
+ACX_PTHREAD
+LIBS="$PTHREAD_LIBS $LIBS"
+CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+CC="$PTHREAD_CC"
+
+fi
+# end of the list of libraries that don't need to be checked for if
+# tests and examples are disabled.
+
+# ------------------------------------------------------------------------
+# Checks for linker characteristics
+# ------------------------------------------------------------------------
+
+# Determine libraries needed for linking with Fortran
+#AC_F77_LIBRARY_LDFLAGS
+
+
+# ------------------------------------------------------------------------
+# Perform substitutions in output files
+# ------------------------------------------------------------------------
+
+AC_SUBST(ac_aux_dir)
+
+# ------------------------------------------------------------------------
+# Output files
+# ------------------------------------------------------------------------
+##
+#  You will need to change AC_CONFIG_FILES below and Makefile.am
+#  to add a new directory.
+AC_CONFIG_FILES([
+		Makefile
+		Makefile.export.threadpool
+		src/Makefile
+		test/Makefile
+		])
+
+AC_OUTPUT()
+
+# Bye World!
+echo "---------------------------------------------"
+echo "Finished Running ThreadPool Configure Script"
+echo "---------------------------------------------"
diff --git a/openmp-avx512/basic/optional/ThreadPool/src/CMakeLists.txt b/openmp-avx512/basic/optional/ThreadPool/src/CMakeLists.txt
new file mode 100644
index 0000000..41a1f39
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/src/CMakeLists.txt
@@ -0,0 +1,70 @@
+
+INCLUDE(PackageLibraryMacros)
+
+#
+# A) Package-specific configuration options
+#
+
+PACKAGE_CONFIGURE_FILE(${PACKAGE_NAME}_config.h)
+
+#
+# B) Define the header and source files (and directories)
+#
+
+#
+# src
+#
+
+SET(HEADERS "")
+SET(SOURCES "")
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+
+SET(HEADERS ${HEADERS}
+  ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}_config.h
+  )
+
+#
+# Core files
+#
+
+# Make this source directory an include path for the core sources listed below.
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+APPEND_SET(HEADERS
+  TPI.h
+  TPI.hpp
+  )
+
+APPEND_SET(SOURCES
+  TPI.c
+  )
+
+#
+# Util files
+#
+APPEND_SET(SOURCES
+  TPI_Walltime.c
+  )
+
+######################################
+
+APPEND_SET(HEADERS
+  )
+
+APPEND_SET(SOURCES
+  )
+
+######################################
+IF (TPL_ENABLE_MPI)
+ENDIF()
+
+#
+# C) Define the targets for package's library(s)
+#
+
+PACKAGE_ADD_LIBRARY(
+  tpi
+  HEADERS ${HEADERS}
+  SOURCES ${SOURCES}
+  )
diff --git a/openmp-avx512/basic/optional/ThreadPool/src/Makefile.am b/openmp-avx512/basic/optional/ThreadPool/src/Makefile.am
new file mode 100644
index 0000000..44c1621
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/src/Makefile.am
@@ -0,0 +1,140 @@
+# @HEADER
+# ************************************************************************
+# 
+#                          ThreadPool Package
+#                 Copyright (2008) Sandia Corporation
+# 
+# Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+# license for use of this work by or on behalf of the U.S. Government.
+# 
+# This library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation; either version 2.1 of the
+# License, or (at your option) any later version.
+#  
+# This library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#  
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+# USA
+# Questions? Contact Carter Edwards (hcedwar@sandia.gov) 
+# 
+# ************************************************************************
+# @HEADER
+
+# The following line helps the test harness recover from build errors.
+
+all-local:
+	@echo ""
+	@echo "Trilinos package ThreadPool subdirectory src built successfully."
+	@echo ""
+
+# ------------------------------------------------------------------------
+# For each category, create two variables - NAME and NAME_H. The
+# second is the list of headers to be installed, i.e., any header that
+# might someday be needed by some other code outside ThreadPool. The first is
+# the list of all source and any other header files.
+# ------------------------------------------------------------------------
+
+#np# Make sure to list all source files in one of the following categories.
+
+CORE = $(srcdir)/TPI.c
+
+CORE_H = \
+	$(srcdir)/TPI.h \
+	$(srcdir)/TPI.hpp
+
+UTIL = \
+	$(srcdir)/TPI_Walltime.c
+
+
+# ------------------------------------------------------------------------
+# ThreadPool library specifications
+# ------------------------------------------------------------------------
+#np# replace new_package with the name of the package being autotool'ed here
+THREADPOOL_LIB = libtpi.a
+
+#np# replace new_package with the name of the package being autotool'ed here
+THREADPOOL_H = \
+	$(CORE_H)
+
+#np# replace new_package with the name of the package being autotool'ed here
+libtpi_a_SOURCES = \
+	$(CORE) \
+	$(UTIL)
+
+#np# replace new_package with the name of the package being autotool'ed here
+#EXTRA_libtpi_a_SOURCES =
+
+include $(top_builddir)/Makefile.export.threadpool
+
+if USING_GNUMAKE
+EXPORT_INCLUDES = $(shell $(PERL_EXE) $(top_srcdir)/config/strip_dup_incl_paths.pl $(THREADPOOL_INCLUDES))
+else
+EXPORT_INCLUDES = $(THREADPOOL_INCLUDES)
+endif
+
+AM_CPPFLAGS = $(EXPORT_INCLUDES)
+
+# ------------------------------------------------------------------------
+# For using a special archiver
+# ------------------------------------------------------------------------
+
+if USE_ALTERNATE_AR
+
+libtpi_a_AR = $(ALTERNATE_AR)
+else
+
+libtpi_a_AR = $(AR) cru
+
+endif
+
+# ------------------------------------------------------------------------
+# Some C++ compilers create extra .o-files for templates. We need to
+# be sure to include these, and this is the hack to do it.
+# ------------------------------------------------------------------------
+
+libtpi_a_LIBADD = $(XTRALDADD)
+
+# ------------------------------------------------------------------------
+# List of all libraries to install in $(libdir)
+# ------------------------------------------------------------------------
+
+lib_LIBRARIES = $(THREADPOOL_LIB)
+
+# ------------------------------------------------------------------------
+# List of all headers to install in $(includedir)
+# ------------------------------------------------------------------------
+
+#np# replace new_package with the name of the package being autotool'ed here
+include_HEADERS = $(THREADPOOL_H) 
+
+# ------------------------------------------------------------------------
+# Special stuff to install in our special $(execincludedir)
+# ------------------------------------------------------------------------
+
+# SPECIAL NOTE: ThreadPool_config.h is a machine-dependent file, so we need
+# to install it in the machine-dependent directory. However, that is
+# not a default installation directory, so we had to create it
+# specially.
+
+# All Trilinos headers are now installed in the same directory
+execincludedir = $(includedir)
+#np# replace new_package with the name of the package being autotool'ed here
+nodist_execinclude_HEADERS = ThreadPool_config.h
+
+# ------------------------------------------------------------------------
+# Files to be deleted by 'make maintainer-clean'
+# ------------------------------------------------------------------------
+
+MAINTAINERCLEANFILES = Makefile.in 
+
+
+
+
+
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/src/Makefile.in b/openmp-avx512/basic/optional/ThreadPool/src/Makefile.in
new file mode 100644
index 0000000..4dd7802
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/src/Makefile.in
@@ -0,0 +1,680 @@
+# Makefile.in generated by automake 1.10 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
+# 2003, 2004, 2005, 2006  Free Software Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+# @HEADER
+# ************************************************************************
+# 
+#                          ThreadPool Package
+#                 Copyright (2008) Sandia Corporation
+# 
+# Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+# license for use of this work by or on behalf of the U.S. Government.
+# 
+# This library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation; either version 2.1 of the
+# License, or (at your option) any later version.
+#  
+# This library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#  
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+# USA
+# Questions? Contact Carter Edwards (hcedwar@sandia.gov) 
+# 
+# ************************************************************************
+# @HEADER
+
+# The following line helps the test harness recover from build errors.
+
+
+VPATH = @srcdir@
+pkgdatadir = $(datadir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+target_triplet = @target@
+subdir = src
+DIST_COMMON = $(include_HEADERS) $(srcdir)/Makefile.am \
+	$(srcdir)/Makefile.in $(srcdir)/ThreadPool_config.h.in
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/config/acx_pthread.m4 \
+	$(top_srcdir)/config/tac_arg_check_mpi.m4 \
+	$(top_srcdir)/config/tac_arg_config_mpi.m4 \
+	$(top_srcdir)/config/tac_arg_enable_export-makefiles.m4 \
+	$(top_srcdir)/config/tac_arg_enable_feature.m4 \
+	$(top_srcdir)/config/tac_arg_enable_feature_sub_check.m4 \
+	$(top_srcdir)/config/tac_arg_with_ar.m4 \
+	$(top_srcdir)/config/tac_arg_with_flags.m4 \
+	$(top_srcdir)/config/tac_arg_with_incdirs.m4 \
+	$(top_srcdir)/config/tac_arg_with_libdirs.m4 \
+	$(top_srcdir)/config/tac_arg_with_libs.m4 \
+	$(top_srcdir)/config/tac_arg_with_perl.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = ThreadPool_config.h
+CONFIG_CLEAN_FILES =
+am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`;
+am__vpath_adj = case $$p in \
+    $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \
+    *) f=$$p;; \
+  esac;
+am__strip_dir = `echo $$p | sed -e 's|^.*/||'`;
+am__installdirs = "$(DESTDIR)$(libdir)" "$(DESTDIR)$(includedir)" \
+	"$(DESTDIR)$(execincludedir)"
+libLIBRARIES_INSTALL = $(INSTALL_DATA)
+LIBRARIES = $(lib_LIBRARIES)
+AR = ar
+ARFLAGS = cru
+libtpi_a_DEPENDENCIES =
+am__objects_1 = TPI.$(OBJEXT)
+am__objects_2 = TPI_Walltime.$(OBJEXT)
+am_libtpi_a_OBJECTS = $(am__objects_1) $(am__objects_2)
+libtpi_a_OBJECTS = $(am_libtpi_a_OBJECTS)
+DEFAULT_INCLUDES = -I.@am__isrc@
+depcomp = $(SHELL) $(top_srcdir)/config/depcomp
+am__depfiles_maybe = depfiles
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+CCLD = $(CC)
+LINK = $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
+SOURCES = $(libtpi_a_SOURCES)
+DIST_SOURCES = $(libtpi_a_SOURCES)
+includeHEADERS_INSTALL = $(INSTALL_HEADER)
+nodist_execincludeHEADERS_INSTALL = $(INSTALL_HEADER)
+HEADERS = $(include_HEADERS) $(nodist_execinclude_HEADERS)
+ETAGS = etags
+CTAGS = ctags
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ACLOCAL = @ACLOCAL@
+ALTERNATE_AR = @ALTERNATE_AR@
+AMTAR = @AMTAR@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CPPFLAGS = @CPPFLAGS@
+CXX = @CXX@
+CXXCPP = @CXXCPP@
+CXXDEPMODE = @CXXDEPMODE@
+CXXFLAGS = @CXXFLAGS@
+CYGPATH_W = @CYGPATH_W@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+GREP = @GREP@
+HAVE_PERL = @HAVE_PERL@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBS = @LIBS@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MKDIR_P = @MKDIR_P@
+MPI_CC_EXISTS = @MPI_CC_EXISTS@
+MPI_CXX = @MPI_CXX@
+MPI_CXX_EXISTS = @MPI_CXX_EXISTS@
+MPI_F77_EXISTS = @MPI_F77_EXISTS@
+MPI_TEMP_CXX = @MPI_TEMP_CXX@
+OBJEXT = @OBJEXT@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+PERL_EXE = @PERL_EXE@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SET_MAKE = @SET_MAKE@
+SHELL = @SHELL@
+STRIP = @STRIP@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_aux_dir = @ac_aux_dir@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_CXX = @ac_ct_CXX@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target = @target@
+target_alias = @target_alias@
+target_cpu = @target_cpu@
+target_os = @target_os@
+target_vendor = @target_vendor@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+
+# ------------------------------------------------------------------------
+# For each category, create two variables - NAME and NAME_H. The
+# second is the list of headers to be installed, i.e., any header that
+# might someday be needed by some other code outside ThreadPool. The first is
+# the list of all source and any other header files.
+# ------------------------------------------------------------------------
+
+#np# Make sure to list all source files in one of the following categories.
+CORE = $(srcdir)/TPI.c
+CORE_H = \
+	$(srcdir)/TPI.h \
+	$(srcdir)/TPI.hpp
+
+UTIL = \
+	$(srcdir)/TPI_Walltime.c
+
+
+# ------------------------------------------------------------------------
+# ThreadPool library specifications
+# ------------------------------------------------------------------------
+#np# replace new_package with the name of the package being autotool'ed here
+THREADPOOL_LIB = libtpi.a
+
+#np# replace new_package with the name of the package being autotool'ed here
+THREADPOOL_H = \
+	$(CORE_H)
+
+
+#np# replace new_package with the name of the package being autotool'ed here
+libtpi_a_SOURCES = \
+	$(CORE) \
+	$(UTIL)
+
+@USING_GNUMAKE_FALSE@EXPORT_INCLUDES = $(THREADPOOL_INCLUDES)
+@USING_GNUMAKE_TRUE@EXPORT_INCLUDES = $(shell $(PERL_EXE) $(top_srcdir)/config/strip_dup_incl_paths.pl $(THREADPOOL_INCLUDES))
+AM_CPPFLAGS = $(EXPORT_INCLUDES)
+@USE_ALTERNATE_AR_FALSE@libtpi_a_AR = $(AR) cru
+
+# ------------------------------------------------------------------------
+# For using a special archiver
+# ------------------------------------------------------------------------
+@USE_ALTERNATE_AR_TRUE@libtpi_a_AR = $(ALTERNATE_AR)
+
+# ------------------------------------------------------------------------
+# Some C++ compilers create extra .o-files for templates. We need to
+# be sure to include these, and this is the hack to do it.
+# ------------------------------------------------------------------------
+libtpi_a_LIBADD = $(XTRALDADD)
+
+# ------------------------------------------------------------------------
+# List of all libraries to install in $(libdir)
+# ------------------------------------------------------------------------
+lib_LIBRARIES = $(THREADPOOL_LIB)
+
+# ------------------------------------------------------------------------
+# List of all headers to install in $(includedir)
+# ------------------------------------------------------------------------
+
+#np# replace new_package with the name of the package being autotool'ed here
+include_HEADERS = $(THREADPOOL_H) 
+
+# ------------------------------------------------------------------------
+# Special stuff to install in our special $(execincludedir)
+# ------------------------------------------------------------------------
+
+# SPECIAL NOTE: ThreadPool_config.h is a machine-dependent file, so we need
+# to install it in the machine-dependent directory. However, that is
+# not a default installation directory, so we had to create it
+# special.
+
+# All Trilinos headers are now installed in the same directory
+execincludedir = $(includedir)
+#np# replace new_package with the name of the package being autotool'ed here
+nodist_execinclude_HEADERS = ThreadPool_config.h
+
+# ------------------------------------------------------------------------
+# Files to be deleted by 'make maintainer-clean'
+# ------------------------------------------------------------------------
+MAINTAINERCLEANFILES = Makefile.in 
+all: ThreadPool_config.h
+	$(MAKE) $(AM_MAKEFLAGS) all-am
+
+.SUFFIXES:
+.SUFFIXES: .c .o .obj
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am  $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh \
+		&& exit 0; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign  src/Makefile'; \
+	cd $(top_srcdir) && \
+	  $(AUTOMAKE) --foreign  src/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+ThreadPool_config.h: stamp-h1
+	@if test ! -f $@; then \
+	  rm -f stamp-h1; \
+	  $(MAKE) $(AM_MAKEFLAGS) stamp-h1; \
+	else :; fi
+
+stamp-h1: $(srcdir)/ThreadPool_config.h.in $(top_builddir)/config.status
+	@rm -f stamp-h1
+	cd $(top_builddir) && $(SHELL) ./config.status src/ThreadPool_config.h
+$(srcdir)/ThreadPool_config.h.in: @MAINTAINER_MODE_TRUE@ $(am__configure_deps) 
+	cd $(top_srcdir) && $(AUTOHEADER)
+	rm -f stamp-h1
+	touch $@
+
+distclean-hdr:
+	-rm -f ThreadPool_config.h stamp-h1
+install-libLIBRARIES: $(lib_LIBRARIES)
+	@$(NORMAL_INSTALL)
+	test -z "$(libdir)" || $(MKDIR_P) "$(DESTDIR)$(libdir)"
+	@list='$(lib_LIBRARIES)'; for p in $$list; do \
+	  if test -f $$p; then \
+	    f=$(am__strip_dir) \
+	    echo " $(libLIBRARIES_INSTALL) '$$p' '$(DESTDIR)$(libdir)/$$f'"; \
+	    $(libLIBRARIES_INSTALL) "$$p" "$(DESTDIR)$(libdir)/$$f"; \
+	  else :; fi; \
+	done
+	@$(POST_INSTALL)
+	@list='$(lib_LIBRARIES)'; for p in $$list; do \
+	  if test -f $$p; then \
+	    p=$(am__strip_dir) \
+	    echo " $(RANLIB) '$(DESTDIR)$(libdir)/$$p'"; \
+	    $(RANLIB) "$(DESTDIR)$(libdir)/$$p"; \
+	  else :; fi; \
+	done
+
+uninstall-libLIBRARIES:
+	@$(NORMAL_UNINSTALL)
+	@list='$(lib_LIBRARIES)'; for p in $$list; do \
+	  p=$(am__strip_dir) \
+	  echo " rm -f '$(DESTDIR)$(libdir)/$$p'"; \
+	  rm -f "$(DESTDIR)$(libdir)/$$p"; \
+	done
+
+clean-libLIBRARIES:
+	-test -z "$(lib_LIBRARIES)" || rm -f $(lib_LIBRARIES)
+libtpi.a: $(libtpi_a_OBJECTS) $(libtpi_a_DEPENDENCIES) 
+	-rm -f libtpi.a
+	$(libtpi_a_AR) libtpi.a $(libtpi_a_OBJECTS) $(libtpi_a_LIBADD)
+	$(RANLIB) libtpi.a
+
+mostlyclean-compile:
+	-rm -f *.$(OBJEXT)
+
+distclean-compile:
+	-rm -f *.tab.c
+
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/TPI.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/TPI_Walltime.Po@am__quote@
+
+.c.o:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	mv -f $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c $<
+
+.c.obj:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCC_TRUE@	mv -f $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c `$(CYGPATH_W) '$<'`
+
+TPI.o: $(srcdir)/TPI.c
+@am__fastdepCC_TRUE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT TPI.o -MD -MP -MF $(DEPDIR)/TPI.Tpo -c -o TPI.o `test -f '$(srcdir)/TPI.c' || echo '$(srcdir)/'`$(srcdir)/TPI.c
+@am__fastdepCC_TRUE@	mv -f $(DEPDIR)/TPI.Tpo $(DEPDIR)/TPI.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$(srcdir)/TPI.c' object='TPI.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o TPI.o `test -f '$(srcdir)/TPI.c' || echo '$(srcdir)/'`$(srcdir)/TPI.c
+
+TPI.obj: $(srcdir)/TPI.c
+@am__fastdepCC_TRUE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT TPI.obj -MD -MP -MF $(DEPDIR)/TPI.Tpo -c -o TPI.obj `if test -f '$(srcdir)/TPI.c'; then $(CYGPATH_W) '$(srcdir)/TPI.c'; else $(CYGPATH_W) '$(srcdir)/$(srcdir)/TPI.c'; fi`
+@am__fastdepCC_TRUE@	mv -f $(DEPDIR)/TPI.Tpo $(DEPDIR)/TPI.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$(srcdir)/TPI.c' object='TPI.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o TPI.obj `if test -f '$(srcdir)/TPI.c'; then $(CYGPATH_W) '$(srcdir)/TPI.c'; else $(CYGPATH_W) '$(srcdir)/$(srcdir)/TPI.c'; fi`
+
+TPI_Walltime.o: $(srcdir)/TPI_Walltime.c
+@am__fastdepCC_TRUE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT TPI_Walltime.o -MD -MP -MF $(DEPDIR)/TPI_Walltime.Tpo -c -o TPI_Walltime.o `test -f '$(srcdir)/TPI_Walltime.c' || echo '$(srcdir)/'`$(srcdir)/TPI_Walltime.c
+@am__fastdepCC_TRUE@	mv -f $(DEPDIR)/TPI_Walltime.Tpo $(DEPDIR)/TPI_Walltime.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$(srcdir)/TPI_Walltime.c' object='TPI_Walltime.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o TPI_Walltime.o `test -f '$(srcdir)/TPI_Walltime.c' || echo '$(srcdir)/'`$(srcdir)/TPI_Walltime.c
+
+TPI_Walltime.obj: $(srcdir)/TPI_Walltime.c
+@am__fastdepCC_TRUE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT TPI_Walltime.obj -MD -MP -MF $(DEPDIR)/TPI_Walltime.Tpo -c -o TPI_Walltime.obj `if test -f '$(srcdir)/TPI_Walltime.c'; then $(CYGPATH_W) '$(srcdir)/TPI_Walltime.c'; else $(CYGPATH_W) '$(srcdir)/$(srcdir)/TPI_Walltime.c'; fi`
+@am__fastdepCC_TRUE@	mv -f $(DEPDIR)/TPI_Walltime.Tpo $(DEPDIR)/TPI_Walltime.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$(srcdir)/TPI_Walltime.c' object='TPI_Walltime.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o TPI_Walltime.obj `if test -f '$(srcdir)/TPI_Walltime.c'; then $(CYGPATH_W) '$(srcdir)/TPI_Walltime.c'; else $(CYGPATH_W) '$(srcdir)/$(srcdir)/TPI_Walltime.c'; fi`
+install-includeHEADERS: $(include_HEADERS)
+	@$(NORMAL_INSTALL)
+	test -z "$(includedir)" || $(MKDIR_P) "$(DESTDIR)$(includedir)"
+	@list='$(include_HEADERS)'; for p in $$list; do \
+	  if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \
+	  f=$(am__strip_dir) \
+	  echo " $(includeHEADERS_INSTALL) '$$d$$p' '$(DESTDIR)$(includedir)/$$f'"; \
+	  $(includeHEADERS_INSTALL) "$$d$$p" "$(DESTDIR)$(includedir)/$$f"; \
+	done
+
+uninstall-includeHEADERS:
+	@$(NORMAL_UNINSTALL)
+	@list='$(include_HEADERS)'; for p in $$list; do \
+	  f=$(am__strip_dir) \
+	  echo " rm -f '$(DESTDIR)$(includedir)/$$f'"; \
+	  rm -f "$(DESTDIR)$(includedir)/$$f"; \
+	done
+install-nodist_execincludeHEADERS: $(nodist_execinclude_HEADERS)
+	@$(NORMAL_INSTALL)
+	test -z "$(execincludedir)" || $(MKDIR_P) "$(DESTDIR)$(execincludedir)"
+	@list='$(nodist_execinclude_HEADERS)'; for p in $$list; do \
+	  if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \
+	  f=$(am__strip_dir) \
+	  echo " $(nodist_execincludeHEADERS_INSTALL) '$$d$$p' '$(DESTDIR)$(execincludedir)/$$f'"; \
+	  $(nodist_execincludeHEADERS_INSTALL) "$$d$$p" "$(DESTDIR)$(execincludedir)/$$f"; \
+	done
+
+uninstall-nodist_execincludeHEADERS:
+	@$(NORMAL_UNINSTALL)
+	@list='$(nodist_execinclude_HEADERS)'; for p in $$list; do \
+	  f=$(am__strip_dir) \
+	  echo " rm -f '$(DESTDIR)$(execincludedir)/$$f'"; \
+	  rm -f "$(DESTDIR)$(execincludedir)/$$f"; \
+	done
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '    { files[$$0] = 1; } \
+	       END { for (i in files) print i; }'`; \
+	mkid -fID $$unique
+tags: TAGS
+
+TAGS:  $(HEADERS) $(SOURCES) ThreadPool_config.h.in $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	tags=; \
+	here=`pwd`; \
+	list='$(SOURCES) $(HEADERS) ThreadPool_config.h.in $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '    { files[$$0] = 1; } \
+	       END { for (i in files) print i; }'`; \
+	if test -z "$(ETAGS_ARGS)$$tags$$unique"; then :; else \
+	  test -n "$$unique" || unique=$$empty_fix; \
+	  $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	    $$tags $$unique; \
+	fi
+ctags: CTAGS
+CTAGS:  $(HEADERS) $(SOURCES) ThreadPool_config.h.in $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	tags=; \
+	here=`pwd`; \
+	list='$(SOURCES) $(HEADERS) ThreadPool_config.h.in $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '    { files[$$0] = 1; } \
+	       END { for (i in files) print i; }'`; \
+	test -z "$(CTAGS_ARGS)$$tags$$unique" \
+	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+	     $$tags $$unique
+
+GTAGS:
+	here=`$(am__cd) $(top_builddir) && pwd` \
+	  && cd $(top_srcdir) \
+	  && gtags -i $(GTAGS_ARGS) $$here
+
+distclean-tags:
+	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -pR $(srcdir)/$$file $(distdir)$$dir || exit 1; \
+	    fi; \
+	    cp -pR $$d/$$file $(distdir)$$dir || exit 1; \
+	  else \
+	    test -f $(distdir)/$$file \
+	    || cp -p $$d/$$file $(distdir)/$$file \
+	    || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: check-am
+all-am: Makefile $(LIBRARIES) $(HEADERS) ThreadPool_config.h all-local
+installdirs:
+	for dir in "$(DESTDIR)$(libdir)" "$(DESTDIR)$(includedir)" "$(DESTDIR)$(execincludedir)"; do \
+	  test -z "$$dir" || $(MKDIR_P) "$$dir"; \
+	done
+install: install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+	$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	  install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	  `test -z '$(STRIP)' || \
+	    echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+	-test -z "$(MAINTAINERCLEANFILES)" || rm -f $(MAINTAINERCLEANFILES)
+clean: clean-am
+
+clean-am: clean-generic clean-libLIBRARIES mostlyclean-am
+
+distclean: distclean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+distclean-am: clean-am distclean-compile distclean-generic \
+	distclean-hdr distclean-tags
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+info: info-am
+
+info-am:
+
+install-data-am: install-includeHEADERS
+
+install-dvi: install-dvi-am
+
+install-exec-am: install-libLIBRARIES \
+	install-nodist_execincludeHEADERS
+
+install-html: install-html-am
+
+install-info: install-info-am
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-ps: install-ps-am
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-compile mostlyclean-generic
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am: uninstall-includeHEADERS uninstall-libLIBRARIES \
+	uninstall-nodist_execincludeHEADERS
+
+.MAKE: install-am install-strip
+
+.PHONY: CTAGS GTAGS all all-am all-local check check-am clean \
+	clean-generic clean-libLIBRARIES ctags distclean \
+	distclean-compile distclean-generic distclean-hdr \
+	distclean-tags distdir dvi dvi-am html html-am info info-am \
+	install install-am install-data install-data-am install-dvi \
+	install-dvi-am install-exec install-exec-am install-html \
+	install-html-am install-includeHEADERS install-info \
+	install-info-am install-libLIBRARIES install-man \
+	install-nodist_execincludeHEADERS install-pdf install-pdf-am \
+	install-ps install-ps-am install-strip installcheck \
+	installcheck-am installdirs maintainer-clean \
+	maintainer-clean-generic mostlyclean mostlyclean-compile \
+	mostlyclean-generic pdf pdf-am ps ps-am tags uninstall \
+	uninstall-am uninstall-includeHEADERS uninstall-libLIBRARIES \
+	uninstall-nodist_execincludeHEADERS
+
+
+all-local:
+	@echo ""
+	@echo "Trilinos package ThreadPool subdirectory src built successfully."
+	@echo ""
+
+#np# replace new_package with the name of the package being autotool'ed here
+#EXTRA_libtpi_a_SOURCES =
+
+include $(top_builddir)/Makefile.export.threadpool
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff --git a/openmp-avx512/basic/optional/ThreadPool/src/TPI.c b/openmp-avx512/basic/optional/ThreadPool/src/TPI.c
new file mode 100644
index 0000000..f2b1566
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/src/TPI.c
@@ -0,0 +1,1016 @@
+/*------------------------------------------------------------------------*/
+/*                    TPI: Thread Pool Interface                          */
+/*                Copyright (2008) Sandia Corporation                     */
+/*                                                                        */
+/*  Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive   */
+/*  license for use of this work by or on behalf of the U.S. Government.  */
+/*                                                                        */
+/*  This library is free software; you can redistribute it and/or modify  */
+/*  it under the terms of the GNU Lesser General Public License as        */
+/*  published by the Free Software Foundation; either version 2.1 of the  */
+/*  License, or (at your option) any later version.                       */
+/*                                                                        */
+/*  This library is distributed in the hope that it will be useful,       */
+/*  but WITHOUT ANY WARRANTY; without even the implied warranty of        */
+/*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU     */
+/*  Lesser General Public License for more details.                       */
+/*                                                                        */
+/*  You should have received a copy of the GNU Lesser General Public      */
+/*  License along with this library; if not, write to the Free Software   */
+/*  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307   */
+/*  USA                                                                   */
+/*------------------------------------------------------------------------*/
+/**
+ * @author H. Carter Edwards
+ */
+
+/*--------------------------------------------------------------------*/
+
+#include <TPI.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <ThreadPool_config.h>
+
+/*--------------------------------------------------------------------*/
+/*----------- PTHREAD CONFIGURATION (BEGIN) --------------------------*/
+/*--------------------------------------------------------------------*/
+
+#if	defined( HAVE_PTHREAD )
+
+#include <errno.h>
+#include <pthread.h>
+#include <sched.h>
+
+/*--------------------------------------------------------------------*/
+/*---------------- COMPILER SPECIFICS (BEGIN) ------------------------*/
+/*--------------------------------------------------------------------*/
+
+/*  Performance is heavily impacted by an
+ *  atomic decrement of the work counter.
+ *  Optimize this if at all possible.
+ */
+
+#if	defined( __INTEL_COMPILER )
+
+#define THREADPOOL_CONFIG "PTHREAD SCHED_YIELD"
+
+#elif	defined( __linux__ ) && \
+	defined( __GNUC__ ) && ( 4 <= __GNUC__ )
+
+#define THREADPOOL_CONFIG "PTHREAD SCHED_YIELD ATOMIC_SYNC"
+
+#define atomic_fetch_and_decrement( VALUE_PTR )	\
+	__sync_fetch_and_sub( VALUE_PTR , 1 )
+
+#else
+
+#define THREADPOOL_CONFIG "PTHREAD SCHED_YIELD"
+
+#endif
+
+#if ! defined( atomic_fetch_and_decrement )
+
+static int atomic_fetch_and_decrement( volatile int * value )
+{ /* Mutex-serialized fallback: returns the value held before the decrement. */
+  static pthread_mutex_t guard = PTHREAD_MUTEX_INITIALIZER ;
+  int previous ;
+  while ( pthread_mutex_trylock( & guard ) == EBUSY ) { /* spin until free */ }
+  previous = *value ; *value = previous - 1 ;
+  pthread_mutex_unlock( & guard );
+  return previous ;
+}
+
+#endif
+
+/*--------------------------------------------------------------------*/
+/*---------------- COMPILER SPECIFICS (END) --------------------------*/
+/*--------------------------------------------------------------------*/
+
+typedef pthread_mutex_t  local_lock_type ;
+
+#else /* ! defined( HAVE_PTHREAD ) */
+
+#define THREADPOOL_CONFIG "NO THREADING"
+
+typedef int  local_lock_type ;
+
+#endif
+
+/*--------------------------------------------------------------------*/
+/*----------- PTHREAD CONFIGURATION (END) ----------------------------*/
+/*--------------------------------------------------------------------*/
+
+const char * TPI_Version()
+{ /* Report the library version together with the compiled-in threading mode. */
+  static const char tpi_version_text[] =
+    "TPI Version 1.1 , November 2009 , Configuration = " THREADPOOL_CONFIG ;
+
+  return & tpi_version_text[0] ;
+}
+
+/*--------------------------------------------------------------------*/
+
+enum { THREAD_COUNT_MAX = 256 };
+enum { LOCK_COUNT_MAX   = 32 };
+
+struct ThreadPool_Data ;
+
+typedef struct Thread_Data {
+  struct Thread_Data * m_thread_fan ; /* Fan-in / fan-out begin; the next entry's value ends the range */
+  void               * m_reduce ;     /* Reduction memory */
+  long                 m_rank ;       /* Index of this entry in m_thread[] */
+  long                 m_barrier_wait_max ;   /* Largest yield count of one wait */
+  long                 m_barrier_wait_total ; /* Sum of yield counts over all waits */
+  long                 m_barrier_wait_count ; /* Number of barrier waits performed */
+  volatile long        m_control ;    /* Polled flag: 1 while running, 0 once parked */
+} Thread ;
+
+typedef struct ThreadPool_Data {
+  TPI_work_subprogram   m_work_routine ;      /* Active work; NULL when idle */
+  const void *          m_work_info ;         /* Caller data passed to the work */
+  TPI_reduce_join       m_reduce_join ;
+  TPI_reduce_init       m_reduce_init ;
+  unsigned char       * m_reduce_alloc ;      /* Workers' reduction buffers */
+  int                   m_reduce_alloc_size ; /* Bytes held by m_reduce_alloc */
+  int                   m_thread_count ;      /* 0 until TPI_Init succeeds */
+  int                   m_lock_init ;         /* Locks initialized so far */
+  int                   m_lock_count ;        /* Locks usable by current work */
+  int                   m_work_thread_count ;
+  int                   m_work_count ;
+  int                   m_work_count_claim ;  /* Countdown of unclaimed work */
+  /* One extra entry: m_thread[n].m_thread_fan ends thread n-1's fan range. */
+  Thread                m_thread[ THREAD_COUNT_MAX + 1 ];
+  local_lock_type       m_lock[ LOCK_COUNT_MAX ];
+} ThreadPool ;
+
+
+static ThreadPool thread_pool =
+{ /* The trailing m_thread[] and m_lock[] members are implicitly zero-initialized. */
+  /* m_work_routine        */  NULL ,
+  /* m_work_info           */  NULL ,
+  /* m_reduce_join         */  NULL ,
+  /* m_reduce_init         */  NULL ,
+  /* m_reduce_alloc        */  NULL ,
+  /* m_reduce_alloc_size   */  0 ,
+  /* m_thread_count        */  0 ,
+  /* m_lock_init           */  0 ,
+  /* m_lock_count          */  0 ,
+  /* m_work_thread_count   */  0 ,
+  /* m_work_count          */  0 ,
+  /* m_work_count_claim    */  0
+};
+
+/*--------------------------------------------------------------------*/
+/*--------------------------------------------------------------------*/
+
+#if defined( HAVE_PTHREAD )
+
+/*--------------------------------------------------------------------*/
+/*--------------------------------------------------------------------*/
+
+int TPI_Lock( int i )
+{ /* Spin on lock #i until acquired; 0 on success, else a TPI_ERROR_* code. */
+  int status = ( 0 <= i && i < thread_pool.m_lock_count ) ? 0 : TPI_ERROR_SIZE ;
+
+  if ( 0 == status ) {
+    do {
+      status = pthread_mutex_trylock( thread_pool.m_lock + i );
+    } while ( EBUSY == status );
+
+    if ( 0 != status ) { status = TPI_ERROR_LOCK ; }
+  }
+  return status ;
+}
+
+int TPI_Unlock( int i )
+{ /* Release lock #i; 0 on success, else a TPI_ERROR_* code. */
+  int status = ( 0 <= i && i < thread_pool.m_lock_count ) ? 0 : TPI_ERROR_SIZE ;
+
+  if ( 0 == status ) {
+    pthread_mutex_t * const mutex = & thread_pool.m_lock[ i ] ;
+    if ( 0 != pthread_mutex_unlock( mutex ) ) { status = TPI_ERROR_LOCK ; }
+  }
+  return status ;
+}
+
+static int local_set_lock_count( const int lock_count )
+{ /* Lazily initialize mutexes until at least lock_count of them exist. */
+  int status = 0 ;
+
+  if ( lock_count < 0 || LOCK_COUNT_MAX < lock_count ) { status = TPI_ERROR_SIZE ; }
+
+  /* m_lock_init counts the mutexes initialized so far; it only grows here. */
+  while ( 0 == status && thread_pool.m_lock_init < lock_count ) {
+    const int next = thread_pool.m_lock_init ;
+
+    if ( 0 == pthread_mutex_init( & thread_pool.m_lock[ next ] , NULL ) ) {
+      thread_pool.m_lock_init = next + 1 ;
+    }
+    else {
+      status = TPI_ERROR_INTERNAL ;
+    }
+  }
+
+  return status ;
+}
+
+static void local_destroy_locks()
+{ /* Destroy every initialized mutex, highest index first, leaving the count 0. */
+  while ( 0 != thread_pool.m_lock_init ) {
+    const int last = --( thread_pool.m_lock_init );
+    pthread_mutex_destroy( & thread_pool.m_lock[ last ] );
+  }
+}
+
+/*--------------------------------------------------------------------*/
+/*--------------------------------------------------------------------*/
+/* Run work if any, then wait for child threads to block. */
+
+static void local_run( Thread * const this_thread , void * reduce )
+{ /* Execute this thread's share of the current work; 'reduce' is handed to it. */
+  struct TPI_Work_Struct work ;
+
+  work.info       = thread_pool.m_work_info ;
+  work.reduce     = reduce ;
+  work.count      = thread_pool.m_work_count ;
+  work.lock_count = thread_pool.m_lock_count ;
+  /* No more work items than work threads: static, rank-based assignment. */
+  if ( work.count <= thread_pool.m_work_thread_count ) {
+    /* Reverse the thread index so the last thread gets work rank 0. */
+    work.rank = ( thread_pool.m_thread_count - 1 ) - this_thread->m_rank ;
+
+    if ( work.rank < work.count ) {
+      (*thread_pool.m_work_routine)( & work );
+    }
+  }
+  else {
+    /* More items than work threads: claim them from the shared countdown. */
+    int * const claim = & thread_pool.m_work_count_claim ;
+    /* Fetched values count..1 map to work ranks 0..count-1. */
+    while ( 0 < ( work.rank = atomic_fetch_and_decrement( claim ))) {
+
+      work.rank = work.count - work.rank ;
+
+      (*thread_pool.m_work_routine)( & work );
+    }
+  }
+}
+
+static int wait_thread( volatile long * const control , const int val )
+{ /* Yield until *control differs from val; returns the number of yields. */
+  int yields ;
+  for ( yields = 0 ; *control == val ; ++yields ) {
+    sched_yield();
+  }
+
+  return yields ;
+}
+
+static void local_barrier_wait( Thread * const this_thread ,
+                                Thread * const thread )
+{ /* Wait for 'thread' to leave the running state; tally this_thread's stats. */
+  /* wait_thread() returns how many times it yielded while m_control was 1. */
+  const long yields = wait_thread( & thread->m_control , 1 );
+
+  this_thread->m_barrier_wait_count += 1 ;
+  this_thread->m_barrier_wait_total += yields ;
+
+  if ( yields > this_thread->m_barrier_wait_max ) {
+    this_thread->m_barrier_wait_max = yields ;
+  }
+}
+
+static void local_barrier( Thread * const this_thread )
+{ /* Run this thread's share of any work, then wait for its child threads. */
+  Thread * const thread_beg = this_thread[0].m_thread_fan ;
+  Thread *       thread     = this_thread[1].m_thread_fan ;
+  /* The children are the entries [ thread_beg , thread ), visited last-to-first. */
+  if ( ! thread_pool.m_work_routine ) {
+    while ( thread_beg < thread ) {
+      --thread ; local_barrier_wait( this_thread , thread );
+    }
+  }
+  else if ( ! thread_pool.m_reduce_join ) {
+    /* Work without a reduction: run it, then wait for each child. */
+    local_run( this_thread , NULL );
+
+    while ( thread_beg < thread ) {
+      --thread ; local_barrier_wait( this_thread , thread );
+    }
+  }
+  else {
+    /* Work with a reduction: run it, then join each child's result into ours. */
+    /* Work data for the reduction initialization and join */
+
+    struct TPI_Work_Struct work ;
+
+    work.info       = thread_pool.m_work_info ;
+    work.reduce     = this_thread->m_reduce ;
+    work.count      = -1 ;
+    work.rank       = -1 ;
+    work.lock_count = -1 ;
+
+    /* Initialize reduction value for non-root thread */
+
+    if ( this_thread->m_rank ) { (*thread_pool.m_reduce_init)( & work ); }
+
+    /* Run the work routine with barrier blocking */
+
+    local_run( this_thread , work.reduce );
+
+    /* Reduction of thread's contributions */
+
+    while ( thread_beg < thread ) {
+      --thread ; local_barrier_wait( this_thread , thread );
+      (*thread_pool.m_reduce_join)( & work , thread->m_reduce );
+    }
+  }
+}
+ 
+/*--------------------------------------------------------------------*/
+/*  The driver given to 'pthread_create'.
+ *  Run work until told to terminate.
+ */
+static void * local_driver( void * arg )
+{ /* 'arg' is this thread's Thread entry; always returns NULL. */
+  Thread * const this_thread = (Thread *) arg ;
+
+  do {
+    /* Wait for my subtree of threads to complete */
+    local_barrier( this_thread );
+    /* Signal completion: barrier waits poll for m_control != 1. */
+    this_thread->m_control = 0 ;
+
+    /*  Spin until I am activated. */
+    wait_thread( & this_thread->m_control , 0 );
+    /* Being activated with no work routine installed means terminate. */
+  } while ( thread_pool.m_work_routine );
+
+  local_barrier( this_thread ); /* Termination barrier */
+  /* Final completion signal; the thread exits after this. */
+  this_thread->m_control = 0 ;
+
+  return NULL ;
+}
+
+/*--------------------------------------------------------------------*/
+/*--------------------------------------------------------------------*/
+ 
+static int alloc_reduce( int reduce_size )
+{ /* Size the workers' reduction buffers; returns 0, or nonzero when out of memory. */
+  const int alloc_count = thread_pool.m_thread_count - 1 ;
+
+  if ( thread_pool.m_reduce_alloc_size < alloc_count * reduce_size ) {
+    const int grain_shift  = 8 ; /* grain_size = 0x100 */
+    const int grain_size   = 1 << grain_shift ; /* Byte grain size */
+    const int grain_count  = ( reduce_size + grain_size - 1 ) >> grain_shift ;
+    const int reduce_grain = grain_size * grain_count ;
+    const int alloc_size   = alloc_count * reduce_grain ;
+
+    int i ;
+
+    /* realloc( NULL , size ) acts as malloc.  On failure the old buffer and
+     * the per-thread pointers into it stay valid, so only report the error. */
+    unsigned char * const grown =
+      (unsigned char *) realloc( thread_pool.m_reduce_alloc , alloc_size );
+
+    if ( NULL == grown ) { return 1 ; }
+
+    thread_pool.m_reduce_alloc      = grown ;
+    thread_pool.m_reduce_alloc_size = alloc_size ;
+
+    for ( i = 0 ; i < alloc_count ; ++i ) {
+      thread_pool.m_thread[i+1].m_reduce =
+        thread_pool.m_reduce_alloc + reduce_grain * i ;
+    }
+  }
+  return 0 ;
+}
+
+static int local_start(
+  int                   work_thread_count ,
+  TPI_work_subprogram   work_subprogram  ,
+  const void *          work_info ,
+  int                   work_count  ,
+  int                   lock_count ,
+  TPI_reduce_join       reduce_join ,
+  TPI_reduce_init       reduce_init ,
+  int                   reduce_size ,
+  void *                reduce_data )
+{ /* Record the work request and wake the workers; returns 0 or a TPI_ERROR_*. */
+  int result = lock_count ? local_set_lock_count( lock_count ) : 0 ;
+  /* Grow the reduction buffers before touching any state, so that an
+   * allocation failure leaves the pool idle. */
+  if ( ! result && reduce_size && 1 < thread_pool.m_thread_count
+       && alloc_reduce( reduce_size ) ) { result = TPI_ERROR_INTERNAL ; }
+
+  if ( ! result ) {
+    thread_pool.m_work_routine     = work_subprogram ;
+    thread_pool.m_work_info        = work_info ;
+    thread_pool.m_work_count       = work_count ;
+    thread_pool.m_lock_count       = lock_count ;
+    thread_pool.m_thread->m_reduce = reduce_data ;
+
+    if ( 1 < thread_pool.m_thread_count ) {
+      thread_pool.m_reduce_join       = reduce_join ;
+      thread_pool.m_reduce_init       = reduce_init ;
+      thread_pool.m_work_thread_count = work_thread_count ;
+      thread_pool.m_work_count_claim  = work_count ;
+
+      /* Activate the spinning worker threads */
+      {
+        Thread * const thread_beg = thread_pool.m_thread + 1 ;
+        Thread *       thread     = thread_pool.m_thread +
+                                    thread_pool.m_thread_count ;
+
+        while ( thread_beg < thread ) { (--thread)->m_control = 1 ; }
+      }
+    }
+  }
+
+  return result ;
+}
+
+static void local_wait()
+{ /* Finish the current request on the calling thread, then mark the pool idle. */
+  if ( 1 < thread_pool.m_thread_count ) {
+    /* Thread 0 goes through the barrier: its share of any work, then its children. */
+    local_barrier( thread_pool.m_thread );
+
+    thread_pool.m_reduce_join       = NULL ;
+    thread_pool.m_reduce_init       = NULL ;
+    thread_pool.m_work_thread_count = 0 ;
+    thread_pool.m_work_count_claim  = 0 ;
+  }
+  else {
+    struct TPI_Work_Struct w = { NULL , NULL , 0 , 0 , 0 };
+    /* No worker threads: run every work item in order on the calling thread. */
+    w.info       = thread_pool.m_work_info ;
+    w.count      = thread_pool.m_work_count ;
+    w.lock_count = thread_pool.m_lock_count ;
+    w.reduce     = thread_pool.m_thread->m_reduce ;
+
+    for ( w.rank = 0 ; w.rank < w.count ; ++( w.rank ) ) {
+      (* thread_pool.m_work_routine )( & w );
+    }
+  }
+  /* Clearing m_work_routine is what marks the pool as idle for the API checks. */
+  thread_pool.m_work_routine     = NULL ;
+  thread_pool.m_work_info        = NULL ;
+  thread_pool.m_work_count       = 0 ;
+  thread_pool.m_lock_count       = 0 ;
+  thread_pool.m_thread->m_reduce = NULL ;
+}
+
+/*--------------------------------------------------------------------*/
+/*--------------------------------------------------------------------*/
+
+int TPI_Init( int n )
+{ /* Start a pool of n threads (the caller is thread 0); returns n or a TPI_ERROR_*. */
+  int result = thread_pool.m_thread_count ? TPI_ERROR_ACTIVE : 0 ;
+
+  if ( ! result && ( n < 1 || THREAD_COUNT_MAX + 1 <= n ) ) {
+    result = TPI_ERROR_SIZE ;
+  }
+
+  if ( ! result ) {
+    pthread_attr_t attr ;
+    /* Workers are created detached; nothing in this function joins them. */
+    if ( pthread_attr_init( & attr )
+         || pthread_attr_setscope(       & attr, PTHREAD_SCOPE_SYSTEM )
+         || pthread_attr_setdetachstate( & attr, PTHREAD_CREATE_DETACHED ) ) {
+      result = TPI_ERROR_INTERNAL ;
+    }
+
+    if ( ! result ) {
+      int thread_rank = 0 ;
+      int count = 1 ;
+
+      /* Initialize one lock for blocking and unblocking */
+
+      local_set_lock_count( 1 );
+
+      /* Initialize threads with fan-in / fan-out span of threads */
+      /* Entry n is set up too: its m_thread_fan closes thread n-1's range. */
+      for ( thread_rank = 0 ; thread_rank <= n ; ++thread_rank ) {
+        Thread * const thread = thread_pool.m_thread + thread_rank ;
+        /* NOTE(review): n == THREAD_COUNT_MAX indexes m_thread[THREAD_COUNT_MAX]; confirm the array has room. */
+        thread->m_thread_fan         = thread_pool.m_thread + count ;
+        thread->m_reduce             = NULL ;
+        thread->m_rank               = thread_rank ;
+        thread->m_barrier_wait_max   = 0 ;
+        thread->m_barrier_wait_total = 0 ;
+        thread->m_barrier_wait_count = 0 ;
+        thread->m_control            = 1 ;
+        /* Advance 'count' once per child: rank + 2^k for each 2^k > rank that stays below n. */
+        {
+          int up = 1 ;
+          while ( up <= thread_rank )    { up <<= 1 ; }
+          while ( thread_rank + up < n ) { up <<= 1 ; ++count ; }
+        }
+      }
+
+      thread_pool.m_thread_count = n ;
+
+      /* Create threads last-to-first for start up fan-in barrier */
+
+      for ( thread_rank = n ; ! result && 1 < thread_rank ; ) {
+        Thread * const thread = thread_pool.m_thread + --thread_rank ;
+
+        pthread_t pt ;
+        /* On failure clear m_control so no barrier wait polls this entry forever. */
+        if ( pthread_create( & pt, & attr, & local_driver, thread ) ) {
+          thread->m_control = 0 ;
+          result = TPI_ERROR_INTERNAL ;
+        }
+      }
+
+      /* If a thread-spawn failed, terminate the created threads */
+
+      if ( result ) {
+        while ( thread_rank < --( thread_pool.m_thread_count ) ) {
+          Thread * thread = thread_pool.m_thread + thread_pool.m_thread_count ;
+          wait_thread( & thread->m_control , 1 ); /* Wait for blocking */
+          thread->m_control = 1 ; /* Reactivate thread */
+          wait_thread( & thread->m_control , 1 ); /* Wait for termination */
+        }
+        thread_pool.m_thread_count = 0 ;
+      }
+
+      pthread_attr_destroy( & attr );
+    }
+  }
+  /* Startup barrier: wait until thread 0's child threads have parked. */
+  if ( ! result ) {
+    local_barrier( thread_pool.m_thread );
+    result = n ;
+  }
+
+  return result ;
+}
+
+/*--------------------------------------------------------------------*/
+
+int TPI_Finalize()
+{ /* Terminate the workers and release locks and buffers; 0 or TPI_ERROR_ACTIVE. */
+  static int print_statistics = 0 ;
+  /* print_statistics is a compiled-in diagnostic switch, off by default. */
+  int result ;
+
+  result = NULL != thread_pool.m_work_routine ? TPI_ERROR_ACTIVE : 0 ;
+
+  if ( ! result ) {
+    /* Activating workers with a NULL work routine makes local_driver exit its loop. */
+    /* Wake up threads then wait for them to terminate */
+    local_start( 0 , NULL , NULL , 0 ,
+                 0 , NULL , NULL , 0 , NULL );
+
+    local_wait();
+    /* Optional per-thread barrier statistics (yield counts from wait_thread). */
+    if ( print_statistics ) {
+      int i = 0 ;
+      for ( ; i < thread_pool.m_thread_count ; ++i ) {
+        if ( thread_pool.m_thread[i].m_barrier_wait_count ) {
+          long mean = ( thread_pool.m_thread[i].m_barrier_wait_total + 0.5 ) /
+                        thread_pool.m_thread[i].m_barrier_wait_count ;
+          fprintf(stdout,"Thread[%d] barrier_wait( max %ld , mean %ld )\n", i ,
+                   thread_pool.m_thread[i].m_barrier_wait_max , mean );
+        }
+      }
+    }
+    /* A zero thread count lets TPI_Init be called again. */
+    thread_pool.m_thread_count = 0 ;
+
+    local_destroy_locks();
+
+    if ( thread_pool.m_reduce_alloc ) {
+      free( thread_pool.m_reduce_alloc );
+      thread_pool.m_reduce_alloc = NULL ;
+      thread_pool.m_reduce_alloc_size = 0 ;
+    }
+  }
+
+  return result ;
+}
+
+/*--------------------------------------------------------------------*/
+/*--------------------------------------------------------------------*/
+
+static void local_block( TPI_Work * work )
+{ /* Work items other than rank 0 pass through lock 0, stalling while it is held. */
+  if ( 0 != work->rank ) {
+    pthread_mutex_t * const gate = thread_pool.m_lock ;
+    pthread_mutex_lock( gate ); pthread_mutex_unlock( gate );
+  }
+}
+
+int TPI_Block()
+{ /* Hold lock 0 and start local_block so that workers stall until TPI_Unblock(). */
+  int status ;
+  if ( NULL != thread_pool.m_work_routine ) { status = TPI_ERROR_ACTIVE ; }
+  else if ( pthread_mutex_lock( thread_pool.m_lock ) ) {
+    status = TPI_ERROR_INTERNAL ;
+  }
+  else {
+    const int nthread = thread_pool.m_thread_count ;
+    status = local_start( nthread , local_block , NULL , nthread ,
+                          0 /* lock_count */ , NULL , NULL , 0 , NULL );
+  }
+  return status ;
+}
+
+int TPI_Unblock()
+{ /* Release lock 0 so the stalled workers finish, then wait for them. */
+  int status = 0 ;
+
+  if ( local_block != thread_pool.m_work_routine ) { status = TPI_ERROR_ACTIVE ; }
+  else if ( pthread_mutex_unlock( thread_pool.m_lock ) ) { status = TPI_ERROR_INTERNAL ; }
+
+  if ( 0 == status ) { local_wait(); }
+  return status ;
+}
+
+int TPI_Isblocked()
+{ /* 1 while the pool's current work routine is local_block, otherwise 0. */
+  return ( thread_pool.m_work_routine == local_block ) ? 1 : 0 ;
+}
+
+/*--------------------------------------------------------------------*/
+/*--------------------------------------------------------------------*/
+
+#else /* ! defined( HAVE_PTHREAD ) */
+
+/*--------------------------------------------------------------------*/
+/*--------------------------------------------------------------------*/
+
+int TPI_Lock( int i )
+{ /* Serial build: lock #i is a plain flag; locking a held lock is an error. */
+  int status = ( 0 <= i && i < thread_pool.m_lock_count ) ? 0 : TPI_ERROR_SIZE ;
+
+  if ( 0 == status ) {
+    local_lock_type * const flag = & thread_pool.m_lock[ i ] ;
+
+    if ( 0 == *flag ) {
+      *flag = 1 ;
+    }
+    else { status = TPI_ERROR_LOCK ; }
+  }
+  return status ;
+}
+
+int TPI_Unlock( int i )
+{ /* Serial build: clear flag #i; releasing a lock that is not held is an error. */
+  int status = ( 0 <= i && i < thread_pool.m_lock_count ) ? 0 : TPI_ERROR_SIZE ;
+
+  if ( 0 == status ) {
+    local_lock_type * const flag = & thread_pool.m_lock[ i ] ;
+
+    if ( 0 != *flag ) {
+      *flag = 0 ;
+    }
+    else { status = TPI_ERROR_LOCK ; }
+  }
+  return status ;
+}
+
+static int local_set_lock_count( const int lock_count )
+{ /* Serial build: clear lock flags so at least lock_count exist; 0 or TPI_ERROR_SIZE. */
+  int result = lock_count < 0 || LOCK_COUNT_MAX < lock_count
+             ? TPI_ERROR_SIZE : 0 ;
+
+  /* Skip initialization on a bad count; otherwise m_lock[] would be overrun. */
+  while ( ! result && thread_pool.m_lock_init < lock_count ) {
+    thread_pool.m_lock[ thread_pool.m_lock_init ] = 0 ;
+
+    ++( thread_pool.m_lock_init );
+  }
+
+  return result ;
+}
+
+/*--------------------------------------------------------------------*/
+
+static int local_start(
+  int                   work_thread_count ,
+  TPI_work_subprogram   work_subprogram  ,
+  const void *          work_info ,
+  int                   work_count  ,
+  int                   lock_count ,
+  TPI_reduce_join       reduce_join ,
+  TPI_reduce_init       reduce_init ,
+  int                   reduce_size ,
+  void *                reduce_data )
+{ /* Serial build: only record the request; local_wait() executes it. */
+  int status = 0 ;
+  if ( 0 != lock_count ) { status = local_set_lock_count( lock_count ); }
+
+  if ( 0 == status ) {
+    thread_pool.m_thread->m_reduce = reduce_data ;
+    thread_pool.m_lock_count       = lock_count ;
+    thread_pool.m_work_count       = work_count ;
+    thread_pool.m_work_info        = work_info ;
+    thread_pool.m_work_routine     = work_subprogram ;
+  }
+  return status ;
+}
+
+static void local_wait()
+{ /* Serial build: run every recorded work item in rank order, then go idle. */
+  struct TPI_Work_Struct item = { NULL , NULL , 0 , 0 , 0 };
+
+  item.info       = thread_pool.m_work_info ;
+  item.reduce     = thread_pool.m_thread->m_reduce ;
+  item.count      = thread_pool.m_work_count ;
+  item.lock_count = thread_pool.m_lock_count ;
+  item.rank       = 0 ;
+  while ( item.rank < item.count ) {
+    (* thread_pool.m_work_routine )( & item );
+    ++( item.rank );
+  }
+  thread_pool.m_thread->m_reduce = NULL ;
+  thread_pool.m_lock_count       = 0 ;
+  thread_pool.m_work_count       = 0 ;
+  thread_pool.m_work_info        = NULL ;
+  thread_pool.m_work_routine     = NULL ;
+}
+
+/*--------------------------------------------------------------------*/
+
+static void local_block( TPI_Work * work ) { (void) work ; /* serial build: nothing to wait on */ }
+
+int TPI_Block()
+{ /* Serial build: mark the pool blocked by installing the no-op work routine. */
+  int status ;
+
+  if ( NULL != thread_pool.m_work_routine ) { status = TPI_ERROR_ACTIVE ; }
+  else {
+    const int nthread = thread_pool.m_thread_count ;
+    status = local_start( nthread , local_block , NULL , nthread ,
+                          0 /* lock_count */ , NULL , NULL , 0 , NULL );
+  }
+
+  return status ;
+}
+
+int TPI_Unblock()
+{ /* Serial build: leave the blocked state; error unless TPI_Block() is in effect. */
+  int status = 0 ;
+
+  if ( thread_pool.m_work_routine != local_block ) { status = TPI_ERROR_ACTIVE ; }
+
+  if ( 0 == status ) { local_wait(); }
+  return status ;
+}
+
+int TPI_Isblocked()
+{ /* 1 while the pool's current work routine is local_block, otherwise 0. */
+  return ( thread_pool.m_work_routine == local_block ) ? 1 : 0 ;
+}
+
+/*--------------------------------------------------------------------*/
+
+int TPI_Init( int n )
+{ /* Serial build: record n as the thread count; returns n or a TPI_ERROR_* code. */
+  int result = thread_pool.m_thread_count ? TPI_ERROR_ACTIVE : 0 ;
+
+  if ( ! result && ( n < 1 || THREAD_COUNT_MAX + 1 <= n ) ) {
+    result = TPI_ERROR_SIZE ;
+  }
+  else if ( ! result ) { /* Never re-initialize a pool that is already active. */
+    Thread * const thread = thread_pool.m_thread ;
+
+    thread->m_thread_fan         = NULL ;
+    thread->m_reduce             = NULL ;
+    thread->m_rank               = 0 ;
+    thread->m_barrier_wait_max   = 0 ;
+    thread->m_barrier_wait_total = 0 ;
+    thread->m_barrier_wait_count = 0 ;
+    thread->m_control            = 1 ;
+
+    thread_pool.m_thread_count = result = n ;
+
+    /* Initialize one lock for blocking and unblocking */
+
+    local_set_lock_count( 1 );
+  }
+
+  return result ;
+}
+
+/*--------------------------------------------------------------------*/
+
+int TPI_Finalize()
+{ /* Serial build: reset the thread and lock counts unless work is pending. */
+  int status = 0 ;
+
+  if ( NULL != thread_pool.m_work_routine ) { status = TPI_ERROR_ACTIVE ; }
+  if ( 0 == status ) {
+    thread_pool.m_lock_init    = 0 ;
+    thread_pool.m_thread_count = 0 ;
+  }
+  return status ;
+}
+
+/*--------------------------------------------------------------------*/
+/*--------------------------------------------------------------------*/
+
+#endif
+
+/*--------------------------------------------------------------------*/
+/*--------------------------------------------------------------------*/
+
+int TPI_Wait()
+{ /* Complete work begun by a TPI_Start* call; error if the pool is idle or blocked. */
+  const TPI_work_subprogram active = thread_pool.m_work_routine ;
+  int status = 0 ;
+
+  if ( NULL == active || local_block == active ) { status = TPI_ERROR_ACTIVE ; }
+
+  if ( 0 == status ) { local_wait(); }
+  return status ;
+}
+
+int TPI_Start( TPI_work_subprogram work_subprogram  ,
+               const void *        work_info ,
+               int                 work_count ,
+               int                 lock_count )
+{ /* Begin work_count work items and return without waiting for them. */
+  int status ;
+  if      ( NULL != thread_pool.m_work_routine ) { status = TPI_ERROR_ACTIVE ; }
+  else if ( NULL == work_subprogram )            { status = TPI_ERROR_NULL ; }
+  else if ( work_count < 0 )                     { status = TPI_ERROR_SIZE ; }
+  else {
+    status = local_start( thread_pool.m_thread_count - 1 , work_subprogram , work_info ,
+                          work_count , lock_count , NULL , NULL , 0 , NULL );
+  }
+  return status ;
+}
+
+int TPI_Run( TPI_work_subprogram work_subprogram  ,
+             const void *        work_info ,
+             int                 work_count ,
+             int                 lock_count )
+{ /* Run work_count work items on the whole pool and wait for completion. */
+  int status ;
+
+  if      ( NULL != thread_pool.m_work_routine ) { status = TPI_ERROR_ACTIVE ; }
+  else if ( NULL == work_subprogram )            { status = TPI_ERROR_NULL ; }
+  else if ( work_count < 0 )                     { status = TPI_ERROR_SIZE ; }
+  else {
+    status = local_start( thread_pool.m_thread_count , work_subprogram , work_info ,
+                          work_count , lock_count , NULL , NULL , 0 , NULL );
+  }
+  if ( 0 == status ) { local_wait(); }
+  return status ;
+}
+
+int TPI_Run_threads( TPI_work_subprogram work_subprogram  ,
+                     const void *        work_info ,
+                     int                 lock_count  )
+{ /* Run one work item per pool thread (one item if uninitialized) and wait. */
+  const int nthread    = thread_pool.m_thread_count ;
+  const int work_count = 0 < nthread ? nthread : 1 ;
+  int status ;
+
+  if      ( NULL != thread_pool.m_work_routine ) { status = TPI_ERROR_ACTIVE ; }
+  else if ( NULL == work_subprogram )            { status = TPI_ERROR_NULL ; }
+  else {
+    status = local_start( nthread , work_subprogram , work_info ,
+                          work_count , lock_count , NULL , NULL , 0 , NULL );
+  }
+
+  if ( 0 == status ) { local_wait(); }
+  return status ;
+}
+
+int TPI_Start_threads( TPI_work_subprogram work_subprogram  ,
+                       const void *        work_info ,
+                       int                 lock_count  )
+{ /* Begin one work item per worker thread; returns 0 or a TPI_ERROR_* code. */
+  const int work_count = 1 < thread_pool.m_thread_count ?
+                             thread_pool.m_thread_count - 1 : 1 ;
+
+  const int result =
+    NULL != thread_pool.m_work_routine ? TPI_ERROR_ACTIVE : (
+    NULL == work_subprogram            ? TPI_ERROR_NULL : (
+    local_start( thread_pool.m_thread_count - 1 ,
+                 work_subprogram , work_info , work_count , lock_count ,
+                 NULL , NULL , 0 , NULL ) ) );
+
+  /* Like the other TPI_Start* calls, return without waiting; the caller
+   * completes the work with TPI_Wait(). */
+  return result ;
+}
+
+/*--------------------------------------------------------------------*/
+/*--------------------------------------------------------------------*/
+
+int TPI_Run_reduce( TPI_work_subprogram   work_subprogram  ,
+                    const void *          work_info ,
+                    int                   work_count  ,
+                    TPI_reduce_join       reduce_join ,
+                    TPI_reduce_init       reduce_init ,
+                    int                   reduce_size ,
+                    void *                reduce_data )
+{ /* Run work_count items with a reduction into reduce_data and wait. */
+  const int lock_count = 0 ;
+  int status ;
+
+  if      ( NULL != thread_pool.m_work_routine ) { status = TPI_ERROR_ACTIVE ; }
+  else if ( NULL == work_subprogram ||
+            NULL == reduce_join     ||
+            NULL == reduce_init     ||
+            NULL == reduce_data )                { status = TPI_ERROR_NULL ; }
+  else if ( work_count <= 0 || reduce_size <= 0 ) { status = TPI_ERROR_SIZE ; }
+  else {
+    status = local_start( thread_pool.m_thread_count ,
+                          work_subprogram , work_info , work_count , lock_count ,
+                          reduce_join , reduce_init , reduce_size , reduce_data );
+  }
+
+  if ( 0 == status ) { local_wait(); }
+
+  return status ;
+}
+
+int TPI_Run_threads_reduce( TPI_work_subprogram   work_subprogram  ,
+                            const void *          work_info ,
+                            TPI_reduce_join       reduce_join ,
+                            TPI_reduce_init       reduce_init ,
+                            int                   reduce_size ,
+                            void *                reduce_data )
+{ /* Run one item per pool thread with a reduction into reduce_data and wait. */
+  const int lock_count = 0 ;
+  const int nthread    = thread_pool.m_thread_count ;
+  const int work_count = 0 < nthread ? nthread : 1 ;
+  int status ;
+
+  if      ( NULL != thread_pool.m_work_routine ) { status = TPI_ERROR_ACTIVE ; }
+  else if ( NULL == work_subprogram ||
+            NULL == reduce_join     ||
+            NULL == reduce_init     ||
+            NULL == reduce_data )                { status = TPI_ERROR_NULL ; }
+  else if ( reduce_size <= 0 )                   { status = TPI_ERROR_SIZE ; }
+  else {
+    status = local_start( nthread ,
+                          work_subprogram , work_info , work_count , lock_count ,
+                          reduce_join , reduce_init , reduce_size , reduce_data );
+  }
+
+  if ( 0 == status ) { local_wait(); }
+  return status ;
+}
+
+int TPI_Start_reduce( TPI_work_subprogram   work_subprogram  ,
+                      const void *          work_info ,
+                      int                   work_count  ,
+                      TPI_reduce_join       reduce_join ,
+                      TPI_reduce_init       reduce_init ,
+                      int                   reduce_size ,
+                      void *                reduce_data )
+{ /* Begin work_count items with a reduction; returns without waiting. */
+  const int lock_count = 0 ;
+  int status ;
+
+  if      ( NULL != thread_pool.m_work_routine ) { status = TPI_ERROR_ACTIVE ; }
+  else if ( NULL == work_subprogram ||
+            NULL == reduce_join     ||
+            NULL == reduce_init     ||
+            NULL == reduce_data )                { status = TPI_ERROR_NULL ; }
+  else if ( work_count <= 0 || reduce_size <= 0 ) { status = TPI_ERROR_SIZE ; }
+  else {
+    status = local_start( thread_pool.m_thread_count - 1 ,
+                          work_subprogram , work_info , work_count , lock_count ,
+                          reduce_join , reduce_init , reduce_size , reduce_data );
+  }
+
+  return status ;
+}
+
+int TPI_Start_threads_reduce( TPI_work_subprogram   work_subprogram  ,
+                              const void *          work_info ,
+                              TPI_reduce_join       reduce_join ,
+                              TPI_reduce_init       reduce_init ,
+                              int                   reduce_size ,
+                              void *                reduce_data )
+{ /* Begin one item per worker thread with a reduction; returns without waiting. */
+  const int lock_count = 0 ;
+  const int nthread    = thread_pool.m_thread_count ;
+  const int work_count = 1 < nthread ? nthread - 1 : 1 ;
+  int status ;
+
+  if      ( NULL != thread_pool.m_work_routine ) { status = TPI_ERROR_ACTIVE ; }
+  else if ( NULL == work_subprogram ||
+            NULL == reduce_join     ||
+            NULL == reduce_init     ||
+            NULL == reduce_data )                { status = TPI_ERROR_NULL ; }
+  else if ( reduce_size <= 0 )                   { status = TPI_ERROR_SIZE ; }
+  else {
+    status = local_start( nthread - 1 ,
+                          work_subprogram , work_info , work_count , lock_count ,
+                          reduce_join , reduce_init , reduce_size , reduce_data );
+  }
+  return status ;
+}
+
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/src/TPI.h b/openmp-avx512/basic/optional/ThreadPool/src/TPI.h
new file mode 100644
index 0000000..939d3be
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/src/TPI.h
@@ -0,0 +1,253 @@
+/*------------------------------------------------------------------------*/
+/*                    TPI: Thread Pool Interface                          */
+/*                Copyright (2008) Sandia Corporation                     */
+/*                                                                        */
+/*  Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive   */
+/*  license for use of this work by or on behalf of the U.S. Government.  */
+/*                                                                        */
+/*  This library is free software; you can redistribute it and/or modify  */
+/*  it under the terms of the GNU Lesser General Public License as        */
+/*  published by the Free Software Foundation; either version 2.1 of the  */
+/*  License, or (at your option) any later version.                       */
+/*                                                                        */
+/*  This library is distributed in the hope that it will be useful,       */
+/*  but WITHOUT ANY WARRANTY; without even the implied warranty of        */
+/*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU     */
+/*  Lesser General Public License for more details.                       */
+/*                                                                        */
+/*  You should have received a copy of the GNU Lesser General Public      */
+/*  License along with this library; if not, write to the Free Software   */
+/*  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307   */
+/*  USA                                                                   */
+/*------------------------------------------------------------------------*/
+/**
+ * @author H. Carter Edwards  <hcedwar@sandia.gov>
+ *
+ *  Thread Pool Interface (TPI).
+ *
+ *  A simple and minimalistic interface for executing subprograms
+ *  in a thread parallel, shared memory mode.
+ *
+ *  States: the underlying thread pool has four states.
+ *    1) Uninitialized: no extra threads exist, this is the initial state.
+ *    2) Ready:  extra threads exist and are ready to run a subprogram.
+ *    3) Active: extra threads are calling the subprogram.
+ *    4) Blocked: extra threads blocked.
+ *
+ *  Threads are created on initialization and placed in the 'Ready' state.
+ *  While in the 'Ready' state the threads are spin-waiting to minimize
+ *  the cost of activating blocked threads.
+ *  Threads can be blocked so that they do not compete for computational
+ *  resources with other threads created external to the TPI interface.
+ *  For example, threads created by OpenMP or TBB.
+ */
+
+#ifndef ThreadPoolInterface_h
+#define ThreadPoolInterface_h
+
+#if defined( __cplusplus )
+extern "C" {
+#endif
+
+/*--------------------------------------------------------------------*/
+/** \brief  Version string. */
+const char * TPI_Version();
+
+/** Start up the requested number of threads, less the calling thread.
+ *  Return the actual number of threads, including the calling thread,
+ *  otherwise return an error.
+ */
+int TPI_Init( int thread_count );
+
+/** Shut down all started threads. */
+int TPI_Finalize();
+
+/*--------------------------------------------------------------------*/
+/** \brief  A utility to measure wall-clock time, which is frequently
+ *          needed when performance testing HPC algorithms.
+ */
+double TPI_Walltime();
+
+/*--------------------------------------------------------------------*/
+/* All functions return zero for success. */
+
+#define TPI_ERROR_NULL     ((int) -1)  /**<  NULL input */
+#define TPI_ERROR_SIZE     ((int) -2)  /**<  BAD input: size or index */
+#define TPI_ERROR_LOCK     ((int) -3)  /**<  BAD lock or unlock */
+#define TPI_ERROR_ACTIVE   ((int) -4)  /**<  BAD input: the pool is active  */
+#define TPI_ERROR_INTERNAL ((int) -5)  /**< internal resource error */
+
+/*--------------------------------------------------------------------*/
+/** \brief  Work information passed to a work subprogram.  Callbacks receive it through TPI_Work, a const-qualified alias. */
+struct TPI_Work_Struct {
+  const void * info ;       /**<  Shared info input to TPI_Run; the pointee is const */
+  void       * reduce ;     /**<  Data for reduce operation, if any; the pointee is writable */
+  int          count ;      /**<  Count of work requested via TPI_Run */
+  int          rank ;       /**<  Rank  of work for the current call */
+  int          lock_count ; /**<  Count of locks requested via TPI_Run */
+};
+
+/** \brief  Typedef for work subprogram argument */
+typedef const struct TPI_Work_Struct TPI_Work ;
+
+/**  The interface for a parallel task */
+typedef void (*TPI_work_subprogram)( TPI_Work * );
+
+/**  The interface for a parallel reduction operation.
+ *   Initialize  work->reduce value.
+ */
+typedef
+void (*TPI_reduce_init)( TPI_Work * work );
+
+/**  The interface for a parallel reduction operation.
+ *   Perform reduction operation  work->reduce OP= reduce.
+ *   Every initialized reduce value will appear exactly
+ *   once as the 'reduce' argument of a call to the join function.
+ */
+typedef
+void (*TPI_reduce_join)( TPI_Work * work , const void * reduce );
+
+/*--------------------------------------------------------------------*/
+/** \brief Run a work subprogram in thread parallel.
+ *
+ *  The thread pool must be in the 'paused' state when this
+ *  function is called.  Thus a recursive call to TPI_Run is illegal.
+ */
+int TPI_Run( TPI_work_subprogram work_subprogram  ,
+             const void *        work_info ,
+             int                 work_count  ,
+             int                 lock_count );
+
+/** \brief Run work and reduction subprograms in thread parallel.
+ *
+ *  Each call to the work_subprogram has exclusive (thread safe)
+ *  access to its work->reduce data.
+ *  The reduce_init and reduce_join subprograms have
+ *  exclusive access to their arguments.
+ */
+int TPI_Run_reduce( TPI_work_subprogram   work_subprogram  ,
+                    const void *          work_info ,
+                    int                   work_count  ,
+                    TPI_reduce_join       reduce_join ,
+                    TPI_reduce_init       reduce_init ,
+                    int                   reduce_size ,
+                    void *                reduce_data );
+
+/** \brief  Run a work subprogram exactly once on each thread.
+ *
+ *  The thread pool must be in the 'paused' state when this
+ *  function is called.  Thus a recursive call to TPI_Run is illegal.
+ */
+int TPI_Run_threads( TPI_work_subprogram work_subprogram ,
+                     const void *        work_info ,
+                     int                 lock_count );
+
+/** \brief Run work and reduction subprograms in thread parallel.
+ *
+ *  Each call to the work_subprogram has exclusive (thread safe)
+ *  access to its work->reduce data.
+ *  The reduce_init and reduce_join subprograms have
+ *  exclusive access to their arguments.
+ */
+int TPI_Run_threads_reduce( TPI_work_subprogram   work_subprogram ,
+                            const void *          work_info ,
+                            TPI_reduce_join       reduce_join ,
+                            TPI_reduce_init       reduce_init ,
+                            int                   reduce_size ,
+                            void *                reduce_data );
+
+/*--------------------------------------------------------------------*/
+/** \brief  Start a work subprogram in thread parallel
+ *          running on all but the 'main' calling thread;
+ *          the 'main' calling thread returns immediately.
+ *
+ *  The thread pool must be in the 'paused' state when this
+ *  function is called.  Thus a recursive call to TPI_Start is illegal.
+ */
+int TPI_Start( TPI_work_subprogram work_subprogram  ,
+               const void *        work_info ,
+               int                 work_count  ,
+               int                 lock_count );
+
+/** \brief  Start work and reduction subprograms in thread parallel
+ *          running on all but the 'main' calling thread;
+ *          the 'main' calling thread returns immediately.
+ *
+ *  Each call to the work_subprogram has exclusive (thread safe)
+ *  access to its work->reduce data.
+ *  The reduce_init and reduce_join subprograms have
+ *  exclusive access to their arguments.
+ */
+int TPI_Start_reduce( TPI_work_subprogram   work_subprogram  ,
+                      const void *          work_info ,
+                      int                   work_count  ,
+                      TPI_reduce_join       reduce_join ,
+                      TPI_reduce_init       reduce_init ,
+                      int                   reduce_size ,
+                      void *                reduce_data );
+
+/** \brief  Run a work subprogram on each thread
+ *          that is not the 'main' calling thread.
+ *          The 'main' calling thread returns immediately.
+ *
+ *  The thread pool must be in the 'paused' state when this
+ *  function is called.  Thus a recursive call to TPI_Start_threads is illegal.
+ */
+int TPI_Start_threads( TPI_work_subprogram work_subprogram ,
+                       const void *        work_info ,
+                       int                 lock_count );
+
+/** \brief  Start a work / reduction subprogram 
+ *          on each thread that is not the 'main' calling thread.
+ *          The 'main' calling thread returns immediately.
+ *
+ *  Each call to the work_subprogram has exclusive (thread safe)
+ *  access to its work->reduce data.
+ *  The reduce_init and reduce_join subprograms have
+ *  exclusive access to their arguments.
+ */
+int TPI_Start_threads_reduce( TPI_work_subprogram   work_subprogram ,
+                              const void *          work_info ,
+                              TPI_reduce_join       reduce_join ,
+                              TPI_reduce_init       reduce_init ,
+                              int                   reduce_size ,
+                              void *                reduce_data );
+
+/** \brief  Wait for a started work subprogram to complete. */
+int TPI_Wait();
+
+/*--------------------------------------------------------------------*/
+/** \brief  Block threads within the operating system.
+ *
+ *  Normally the worker threads are unblocked and spinning for
+ *  minimal start up overhead when running work subprograms.
+ *  If no TPI work is to be performed for a long period of time
+ *  then an application can block the worker threads.
+ */
+int TPI_Block();
+
+/** \brief  Unblock blocked threads within the operating system */
+int TPI_Unblock();
+
+/** \brief  Query if threads are blocked */
+int TPI_Isblocked();
+
+/*--------------------------------------------------------------------*/
+/** \brief  Blocks until lock lock_rank is obtained.
+ *          The thread pool must be in the 'active' state.
+ */
+int TPI_Lock( int lock_rank );
+
+/** \brief  Unlocks lock lock_rank.
+ *          The thread pool must be in the 'active' state.
+ */
+int TPI_Unlock( int lock_rank );
+
+/*--------------------------------------------------------------------*/
+
+#if defined( __cplusplus )
+}
+#endif
+
+#endif
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/src/TPI.hpp b/openmp-avx512/basic/optional/ThreadPool/src/TPI.hpp
new file mode 100644
index 0000000..fc1894e
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/src/TPI.hpp
@@ -0,0 +1,135 @@
+/*------------------------------------------------------------------------*/
+/*                    TPI: Thread Pool Interface                          */
+/*                Copyright (2008) Sandia Corporation                     */
+/*                                                                        */
+/*  Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive   */
+/*  license for use of this work by or on behalf of the U.S. Government.  */
+/*                                                                        */
+/*  This library is free software; you can redistribute it and/or modify  */
+/*  it under the terms of the GNU Lesser General Public License as        */
+/*  published by the Free Software Foundation; either version 2.1 of the  */
+/*  License, or (at your option) any later version.                       */
+/*                                                                        */
+/*  This library is distributed in the hope that it will be useful,       */
+/*  but WITHOUT ANY WARRANTY; without even the implied warranty of        */
+/*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU     */
+/*  Lesser General Public License for more details.                       */
+/*                                                                        */
+/*  You should have received a copy of the GNU Lesser General Public      */
+/*  License along with this library; if not, write to the Free Software   */
+/*  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307   */
+/*  USA                                                                   */
+/*------------------------------------------------------------------------*/
+/**
+ * @author H. Carter Edwards  <hcedwar@sandia.gov>
+ */
+
+#ifndef util_ThreadPool_hpp
+#define util_ThreadPool_hpp
+
+#include <TPI.h>
+
+namespace TPI {
+
+typedef TPI_Work Work ;
+
+//----------------------------------------------------------------------
+/** Run  worker.*method(work)  on all threads.
+ */
+template<class Worker>
+int Run( Worker & worker , void (Worker::*method)(Work &) ,
+         int work_count , int lock_count = 0 );
+
+//----------------------------------------------------------------------
+
+inline int Lock( int n )    { return TPI_Lock( n ); }    // C++ spelling of TPI_Lock; returns its result unchanged
+inline int Unlock( int n )  { return TPI_Unlock( n ); }  // C++ spelling of TPI_Unlock; returns its result unchanged
+
+/** Lock guard: obtains lock i_lock on construction and releases it when
+ *  control exits the enclosing block, e.g.
+ *    {
+ *      TPI::LockGuard local_lock( i );
+ *    }
+ */
+class LockGuard {
+private:
+  LockGuard();                                   // private: no default construction
+  LockGuard( const LockGuard & );                // private: not copyable
+  LockGuard & operator = ( const LockGuard & );  // private: not assignable
+  const int m_value ;   // rank of the guarded lock
+  const int m_result ;  // return value of TPI_Lock; zero means the lock is held
+public:
+  operator int() const { return m_result ; }  // lets callers test the TPI_Lock result
+  // Acquire at construction; i_lock is converted to int for storage and for TPI_Lock.
+  explicit LockGuard( unsigned i_lock )
+    : m_value( i_lock ), m_result( TPI_Lock(i_lock) ) {}
+  // Release only what was actually obtained; a failed TPI_Lock left nothing to unlock.
+  ~LockGuard() { if ( 0 == m_result ) { TPI_Unlock( m_value ); } }
+};
+
+//----------------------------------------------------------------------
+
+inline
+int Init( int n ) { return TPI_Init( n ); }  // forwards n to TPI_Init and returns its result
+
+inline
+int Finalize() { return TPI_Finalize(); }  // forwards to TPI_Finalize and returns its result
+
+inline
+double Walltime() { return TPI_Walltime(); }  // forwards to TPI_Walltime and returns its result
+
+//----------------------------------------------------------------------
+//----------------------------------------------------------------------
+
+namespace {
+// Adapter that bundles a Worker with one of its member functions behind a single pointer.
+template<class Worker>
+class WorkerMethodHelper {
+private:
+  WorkerMethodHelper();
+  WorkerMethodHelper( const WorkerMethodHelper & );
+  WorkerMethodHelper & operator = ( const WorkerMethodHelper & );
+
+public:
+  // Signature every dispatched member function must have.
+  typedef void (Worker::*Method)( Work & );
+
+  Worker & worker ;  // held by reference, not copied
+  Method   method ;  // member function invoked by run()
+
+  WorkerMethodHelper( Worker & w , Method m ) : worker(w), method(m) {}
+  // Trampoline taking a TPI_Work pointer; expects work->info to point at a WorkerMethodHelper.
+  static void run( TPI_Work * work )
+    {
+      try {
+        const WorkerMethodHelper & wm =
+          * reinterpret_cast<const WorkerMethodHelper*>(work->info);
+        (wm.worker.*wm.method)(*work);
+      } catch(...){}  // every exception thrown by the method is discarded here
+    }
+};
+
+}
+
+//----------------------------------------------------------------------
+//----------------------------------------------------------------------
+
+template<class Worker>
+inline
+int Run( Worker & worker, void (Worker::*method)(Work &) ,
+         int work_count , int lock_count )
+{
+  typedef WorkerMethodHelper<Worker> WM ;
+  // tmp is a local object; its address is passed as the work_info argument that WM::run reads back.
+  WM tmp( worker , method );
+  // NOTE(review): relies on TPI_Run no longer using &tmp once it has returned -- confirm.
+  return TPI_Run( reinterpret_cast<TPI_work_subprogram>(& WM::run),&tmp,work_count,lock_count);
+}
+
+//----------------------------------------------------------------------
+//----------------------------------------------------------------------
+
+}
+
+#endif
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/src/TPI_Walltime.c b/openmp-avx512/basic/optional/ThreadPool/src/TPI_Walltime.c
new file mode 100644
index 0000000..d2c1fe4
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/src/TPI_Walltime.c
@@ -0,0 +1,44 @@
+/*------------------------------------------------------------------------*/
+/*                    TPI: Thread Pool Interface                          */
+/*                Copyright (2008) Sandia Corporation                     */
+/*                                                                        */
+/*  Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive   */
+/*  license for use of this work by or on behalf of the U.S. Government.  */
+/*                                                                        */
+/*  This library is free software; you can redistribute it and/or modify  */
+/*  it under the terms of the GNU Lesser General Public License as        */
+/*  published by the Free Software Foundation; either version 2.1 of the  */
+/*  License, or (at your option) any later version.                       */
+/*                                                                        */
+/*  This library is distributed in the hope that it will be useful,       */
+/*  but WITHOUT ANY WARRANTY; without even the implied warranty of        */
+/*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU     */
+/*  Lesser General Public License for more details.                       */
+/*                                                                        */
+/*  You should have received a copy of the GNU Lesser General Public      */
+/*  License along with this library; if not, write to the Free Software   */
+/*  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307   */
+/*  USA                                                                   */
+/*------------------------------------------------------------------------*/
+/**
+ * @author H. Carter Edwards
+ */
+
+#include <TPI.h>
+
+#include <stddef.h>
+#ifdef _MSC_VER
+#include <gettimeofday.c>
+#else
+#include <sys/time.h>
+#endif
+
+double TPI_Walltime()
+{
+  struct timeval tp ;
+  /* No timezone is requested; the return value of gettimeofday is not checked. */
+  gettimeofday( &tp , ((struct timezone *) NULL ) );
+  /* Combine whole seconds and microseconds into a single value in seconds. */
+  return ( (double) tp.tv_sec ) + ( (double) tp.tv_usec ) / 1.0e6 ;
+}
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/src/ThreadPool_config.h.in b/openmp-avx512/basic/optional/ThreadPool/src/ThreadPool_config.h.in
new file mode 100644
index 0000000..752f5c5
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/src/ThreadPool_config.h.in
@@ -0,0 +1,71 @@
+/* src/ThreadPool_config.h.in.  Generated from configure.ac by autoheader.  */
+
+/* Define if you want to build export makefiles. */
+#undef HAVE_EXPORT_MAKEFILES
+
+/* Define if you are using gnumake - this will shorten your link lines. */
+#undef HAVE_GNUMAKE
+
+/* Define to 1 if you have the <inttypes.h> header file. */
+#undef HAVE_INTTYPES_H
+
+/* Define if want to build libcheck */
+#undef HAVE_LIBCHECK
+
+/* Define to 1 if you have the <memory.h> header file. */
+#undef HAVE_MEMORY_H
+
+/* define if we want to use MPI */
+#undef HAVE_MPI
+
+/* Define if want to build threadpool-tests */
+#undef HAVE_NEW_PACKAGE_TESTS
+
+/* Define if you have POSIX threads libraries and header files. */
+#undef HAVE_PTHREAD
+
+/* Define to 1 if you have the <stdint.h> header file. */
+#undef HAVE_STDINT_H
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#undef HAVE_STDLIB_H
+
+/* Define to 1 if you have the <strings.h> header file. */
+#undef HAVE_STRINGS_H
+
+/* Define to 1 if you have the <string.h> header file. */
+#undef HAVE_STRING_H
+
+/* Define to 1 if you have the <sys/stat.h> header file. */
+#undef HAVE_SYS_STAT_H
+
+/* Define to 1 if you have the <sys/types.h> header file. */
+#undef HAVE_SYS_TYPES_H
+
+/* Define if want to build tests */
+#undef HAVE_TESTS
+
+/* Define to 1 if you have the <unistd.h> header file. */
+#undef HAVE_UNISTD_H
+
+/* Define to the address where bug reports for this package should be sent. */
+#undef PACKAGE_BUGREPORT
+
+/* Define to the full name of this package. */
+#undef PACKAGE_NAME
+
+/* Define to the full name and version of this package. */
+#undef PACKAGE_STRING
+
+/* Define to the one symbol short name of this package. */
+#undef PACKAGE_TARNAME
+
+/* Define to the version of this package. */
+#undef PACKAGE_VERSION
+
+/* Define to the necessary symbol if this constant uses a non-standard name on
+   your system. */
+#undef PTHREAD_CREATE_JOINABLE
+
+/* Define to 1 if you have the ANSI C header files. */
+#undef STDC_HEADERS
diff --git a/openmp-avx512/basic/optional/ThreadPool/test/CMakeLists.txt b/openmp-avx512/basic/optional/ThreadPool/test/CMakeLists.txt
new file mode 100644
index 0000000..ff878e7
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/test/CMakeLists.txt
@@ -0,0 +1,86 @@
+
+INCLUDE(PackageAddExecutableAndTest)
+
+PACKAGE_ADD_EXECUTABLE(
+  test_tpi_unit
+  COMM serial mpi
+  SOURCES test_tpi_unit.c
+  DIRECTORY .
+  )
+
+PACKAGE_ADD_EXECUTABLE(
+  test_c_dnax
+  COMM serial
+  SOURCES test_c_dnax.c
+  DIRECTORY .
+  )
+
+PACKAGE_ADD_EXECUTABLE(
+  test_tpi_cpp
+  COMM serial
+  SOURCES test_tpi.cpp
+  DIRECTORY .
+  )
+
+PACKAGE_ADD_EXECUTABLE(
+  test_tpi_sum
+  COMM serial mpi
+  SOURCES test_mpi_sum.c
+  DIRECTORY .
+  )
+
+PACKAGE_ADD_TEST(
+  test_tpi_unit
+  NAME test_tpi_unit_serial
+  COMM serial
+  DIRECTORY .
+  )
+
+PACKAGE_ADD_TEST(
+  test_tpi_unit
+  NAME test_tpi_unit_mpi
+  COMM mpi
+  NUM_MPI_PROCS 1
+  DIRECTORY .
+  )
+
+PACKAGE_ADD_TEST(
+  test_tpi_cpp
+  NAME test_tpi_cpp
+  COMM serial
+  DIRECTORY .
+  )
+
+PACKAGE_ADD_TEST(
+  test_tpi_sum
+  NAME test_tpi_sum_serial
+  COMM serial
+  DIRECTORY .
+  XHOSTTYPE AIX
+  )
+
+PACKAGE_ADD_TEST(
+  test_tpi_sum
+  NAME test_tpi_sum_np1
+  COMM mpi
+  NUM_MPI_PROCS 1
+  DIRECTORY .
+  )
+
+PACKAGE_ADD_TEST(
+  test_tpi_sum
+  NAME test_tpi_sum_np2
+  COMM mpi
+  NUM_MPI_PROCS 2
+  DIRECTORY .
+  )
+
+PACKAGE_ADD_TEST(
+  test_tpi_sum
+  NAME test_tpi_sum_np4
+  COMM mpi
+  NUM_MPI_PROCS 4
+  DIRECTORY .
+  )
+
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/test/Makefile.am b/openmp-avx512/basic/optional/ThreadPool/test/Makefile.am
new file mode 100644
index 0000000..8e78cbf
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/test/Makefile.am
@@ -0,0 +1,55 @@
+#@HEADER
+# ************************************************************************
+# 
+#                          ThreadPool Package 
+#                 Copyright (2008) Sandia Corporation
+# 
+# Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+# license for use of this work by or on behalf of the U.S. Government.
+# 
+# This library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation; either version 2.1 of the
+# License, or (at your option) any later version.
+#  
+# This library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#  
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+# USA
+# Questions? Contact Carter Edwards (hcedwar@sandia.gov) 
+# 
+# ************************************************************************
+#@HEADER
+
+SUBDIRS =
+
+# The following line helps the test harness recover from build errors.
+
+all-local:
+
+include $(top_builddir)/Makefile.export.threadpool
+
+EXEEXT = .exe
+
+noinst_PROGRAMS = test_tpi test_tpi_cpp test_sum 
+
+test_tpi_SOURCES      = test_main.c test_tpi_unit.c test_c_dnax.c test_c_tpi.c test_pthreads.c
+test_tpi_DEPENDENCIES = $(top_builddir)/src/libtpi.a
+test_tpi_CFLAGS     = $(THREADPOOL_INCLUDES)
+test_tpi_LDADD        = $(THREADPOOL_LIBS)
+
+test_tpi_cpp_SOURCES      = test_tpi.cpp
+test_tpi_cpp_DEPENDENCIES = $(top_builddir)/src/libtpi.a
+test_tpi_cpp_CXXFLAGS     = $(THREADPOOL_INCLUDES)
+test_tpi_cpp_LDADD        = $(THREADPOOL_LIBS)
+
+test_sum_SOURCES      = test_mpi_sum.c
+test_sum_DEPENDENCIES = $(top_builddir)/src/libtpi.a
+test_sum_CFLAGS     = $(THREADPOOL_INCLUDES)
+test_sum_LDADD        = $(THREADPOOL_LIBS)
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/test/Makefile.in b/openmp-avx512/basic/optional/ThreadPool/test/Makefile.in
new file mode 100644
index 0000000..ffc5220
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/test/Makefile.in
@@ -0,0 +1,730 @@
+# Makefile.in generated by automake 1.10 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
+# 2003, 2004, 2005, 2006  Free Software Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+#@HEADER
+# ************************************************************************
+# 
+#                          ThreadPool Package 
+#                 Copyright (2008) Sandia Corporation
+# 
+# Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+# license for use of this work by or on behalf of the U.S. Government.
+# 
+# This library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation; either version 2.1 of the
+# License, or (at your option) any later version.
+#  
+# This library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#  
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+# USA
+# Questions? Contact Carter Edwards (hcedwar@sandia.gov) 
+# 
+# ************************************************************************
+#@HEADER
+
+VPATH = @srcdir@
+pkgdatadir = $(datadir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+target_triplet = @target@
+noinst_PROGRAMS = test_tpi$(EXEEXT) test_tpi_cpp$(EXEEXT) \
+	test_sum$(EXEEXT)
+subdir = test
+DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/config/acx_pthread.m4 \
+	$(top_srcdir)/config/tac_arg_check_mpi.m4 \
+	$(top_srcdir)/config/tac_arg_config_mpi.m4 \
+	$(top_srcdir)/config/tac_arg_enable_export-makefiles.m4 \
+	$(top_srcdir)/config/tac_arg_enable_feature.m4 \
+	$(top_srcdir)/config/tac_arg_enable_feature_sub_check.m4 \
+	$(top_srcdir)/config/tac_arg_with_ar.m4 \
+	$(top_srcdir)/config/tac_arg_with_flags.m4 \
+	$(top_srcdir)/config/tac_arg_with_incdirs.m4 \
+	$(top_srcdir)/config/tac_arg_with_libdirs.m4 \
+	$(top_srcdir)/config/tac_arg_with_libs.m4 \
+	$(top_srcdir)/config/tac_arg_with_perl.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/src/ThreadPool_config.h
+CONFIG_CLEAN_FILES =
+PROGRAMS = $(noinst_PROGRAMS)
+am_test_sum_OBJECTS = test_sum-test_mpi_sum.$(OBJEXT)
+test_sum_OBJECTS = $(am_test_sum_OBJECTS)
+test_sum_LINK = $(CCLD) $(test_sum_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
+am_test_tpi_OBJECTS = test_tpi-test_main.$(OBJEXT) \
+	test_tpi-test_tpi_unit.$(OBJEXT) \
+	test_tpi-test_c_dnax.$(OBJEXT) test_tpi-test_c_tpi.$(OBJEXT) \
+	test_tpi-test_pthreads.$(OBJEXT)
+test_tpi_OBJECTS = $(am_test_tpi_OBJECTS)
+test_tpi_LINK = $(CCLD) $(test_tpi_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
+am_test_tpi_cpp_OBJECTS = test_tpi_cpp-test_tpi.$(OBJEXT)
+test_tpi_cpp_OBJECTS = $(am_test_tpi_cpp_OBJECTS)
+test_tpi_cpp_LINK = $(CXXLD) $(test_tpi_cpp_CXXFLAGS) $(CXXFLAGS) \
+	$(AM_LDFLAGS) $(LDFLAGS) -o $@
+DEFAULT_INCLUDES = -I. -I$(top_builddir)/src@am__isrc@
+depcomp = $(SHELL) $(top_srcdir)/config/depcomp
+am__depfiles_maybe = depfiles
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+CCLD = $(CC)
+LINK = $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
+CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
+	$(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS)
+CXXLD = $(CXX)
+CXXLINK = $(CXXLD) $(AM_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) \
+	-o $@
+SOURCES = $(test_sum_SOURCES) $(test_tpi_SOURCES) \
+	$(test_tpi_cpp_SOURCES)
+DIST_SOURCES = $(test_sum_SOURCES) $(test_tpi_SOURCES) \
+	$(test_tpi_cpp_SOURCES)
+RECURSIVE_TARGETS = all-recursive check-recursive dvi-recursive \
+	html-recursive info-recursive install-data-recursive \
+	install-dvi-recursive install-exec-recursive \
+	install-html-recursive install-info-recursive \
+	install-pdf-recursive install-ps-recursive install-recursive \
+	installcheck-recursive installdirs-recursive pdf-recursive \
+	ps-recursive uninstall-recursive
+RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive	\
+  distclean-recursive maintainer-clean-recursive
+ETAGS = etags
+CTAGS = ctags
+DIST_SUBDIRS = $(SUBDIRS)
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ACLOCAL = @ACLOCAL@
+ALTERNATE_AR = @ALTERNATE_AR@
+AMTAR = @AMTAR@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CPPFLAGS = @CPPFLAGS@
+CXX = @CXX@
+CXXCPP = @CXXCPP@
+CXXDEPMODE = @CXXDEPMODE@
+CXXFLAGS = @CXXFLAGS@
+CYGPATH_W = @CYGPATH_W@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = .exe
+GREP = @GREP@
+HAVE_PERL = @HAVE_PERL@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBS = @LIBS@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MKDIR_P = @MKDIR_P@
+MPI_CC_EXISTS = @MPI_CC_EXISTS@
+MPI_CXX = @MPI_CXX@
+MPI_CXX_EXISTS = @MPI_CXX_EXISTS@
+MPI_F77_EXISTS = @MPI_F77_EXISTS@
+MPI_TEMP_CXX = @MPI_TEMP_CXX@
+OBJEXT = @OBJEXT@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+PERL_EXE = @PERL_EXE@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SET_MAKE = @SET_MAKE@
+SHELL = @SHELL@
+STRIP = @STRIP@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_aux_dir = @ac_aux_dir@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_CXX = @ac_ct_CXX@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target = @target@
+target_alias = @target_alias@
+target_cpu = @target_cpu@
+target_os = @target_os@
+target_vendor = @target_vendor@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+SUBDIRS = 
+test_tpi_SOURCES = test_main.c test_tpi_unit.c test_c_dnax.c test_c_tpi.c test_pthreads.c
+test_tpi_DEPENDENCIES = $(top_builddir)/src/libtpi.a
+test_tpi_CFLAGS = $(THREADPOOL_INCLUDES)
+test_tpi_LDADD = $(THREADPOOL_LIBS)
+test_tpi_cpp_SOURCES = test_tpi.cpp
+test_tpi_cpp_DEPENDENCIES = $(top_builddir)/src/libtpi.a
+test_tpi_cpp_CXXFLAGS = $(THREADPOOL_INCLUDES)
+test_tpi_cpp_LDADD = $(THREADPOOL_LIBS)
+test_sum_SOURCES = test_mpi_sum.c
+test_sum_DEPENDENCIES = $(top_builddir)/src/libtpi.a
+test_sum_CFLAGS = $(THREADPOOL_INCLUDES)
+test_sum_LDADD = $(THREADPOOL_LIBS)
+all: all-recursive
+
+.SUFFIXES:
+.SUFFIXES: .c .cpp .o .obj
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am  $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh \
+		&& exit 0; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign  test/Makefile'; \
+	cd $(top_srcdir) && \
+	  $(AUTOMAKE) --foreign  test/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+clean-noinstPROGRAMS:
+	-test -z "$(noinst_PROGRAMS)" || rm -f $(noinst_PROGRAMS)
+test_sum$(EXEEXT): $(test_sum_OBJECTS) $(test_sum_DEPENDENCIES) 
+	@rm -f test_sum$(EXEEXT)
+	$(test_sum_LINK) $(test_sum_OBJECTS) $(test_sum_LDADD) $(LIBS)
+test_tpi$(EXEEXT): $(test_tpi_OBJECTS) $(test_tpi_DEPENDENCIES) 
+	@rm -f test_tpi$(EXEEXT)
+	$(test_tpi_LINK) $(test_tpi_OBJECTS) $(test_tpi_LDADD) $(LIBS)
+test_tpi_cpp$(EXEEXT): $(test_tpi_cpp_OBJECTS) $(test_tpi_cpp_DEPENDENCIES) 
+	@rm -f test_tpi_cpp$(EXEEXT)
+	$(test_tpi_cpp_LINK) $(test_tpi_cpp_OBJECTS) $(test_tpi_cpp_LDADD) $(LIBS)
+
+mostlyclean-compile:
+	-rm -f *.$(OBJEXT)
+
+distclean-compile:
+	-rm -f *.tab.c
+
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/test_sum-test_mpi_sum.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/test_tpi-test_c_dnax.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/test_tpi-test_c_tpi.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/test_tpi-test_main.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/test_tpi-test_pthreads.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/test_tpi-test_tpi_unit.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/test_tpi_cpp-test_tpi.Po@am__quote@
+
+.c.o:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	mv -f $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c $<
+
+.c.obj:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCC_TRUE@	mv -f $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c `$(CYGPATH_W) '$<'`
+
+test_sum-test_mpi_sum.o: test_mpi_sum.c
+@am__fastdepCC_TRUE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_sum_CFLAGS) $(CFLAGS) -MT test_sum-test_mpi_sum.o -MD -MP -MF $(DEPDIR)/test_sum-test_mpi_sum.Tpo -c -o test_sum-test_mpi_sum.o `test -f 'test_mpi_sum.c' || echo '$(srcdir)/'`test_mpi_sum.c
+@am__fastdepCC_TRUE@	mv -f $(DEPDIR)/test_sum-test_mpi_sum.Tpo $(DEPDIR)/test_sum-test_mpi_sum.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='test_mpi_sum.c' object='test_sum-test_mpi_sum.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_sum_CFLAGS) $(CFLAGS) -c -o test_sum-test_mpi_sum.o `test -f 'test_mpi_sum.c' || echo '$(srcdir)/'`test_mpi_sum.c
+
+test_sum-test_mpi_sum.obj: test_mpi_sum.c
+@am__fastdepCC_TRUE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_sum_CFLAGS) $(CFLAGS) -MT test_sum-test_mpi_sum.obj -MD -MP -MF $(DEPDIR)/test_sum-test_mpi_sum.Tpo -c -o test_sum-test_mpi_sum.obj `if test -f 'test_mpi_sum.c'; then $(CYGPATH_W) 'test_mpi_sum.c'; else $(CYGPATH_W) '$(srcdir)/test_mpi_sum.c'; fi`
+@am__fastdepCC_TRUE@	mv -f $(DEPDIR)/test_sum-test_mpi_sum.Tpo $(DEPDIR)/test_sum-test_mpi_sum.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='test_mpi_sum.c' object='test_sum-test_mpi_sum.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_sum_CFLAGS) $(CFLAGS) -c -o test_sum-test_mpi_sum.obj `if test -f 'test_mpi_sum.c'; then $(CYGPATH_W) 'test_mpi_sum.c'; else $(CYGPATH_W) '$(srcdir)/test_mpi_sum.c'; fi`
+
+test_tpi-test_main.o: test_main.c
+@am__fastdepCC_TRUE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_CFLAGS) $(CFLAGS) -MT test_tpi-test_main.o -MD -MP -MF $(DEPDIR)/test_tpi-test_main.Tpo -c -o test_tpi-test_main.o `test -f 'test_main.c' || echo '$(srcdir)/'`test_main.c
+@am__fastdepCC_TRUE@	mv -f $(DEPDIR)/test_tpi-test_main.Tpo $(DEPDIR)/test_tpi-test_main.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='test_main.c' object='test_tpi-test_main.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_CFLAGS) $(CFLAGS) -c -o test_tpi-test_main.o `test -f 'test_main.c' || echo '$(srcdir)/'`test_main.c
+
+test_tpi-test_main.obj: test_main.c
+@am__fastdepCC_TRUE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_CFLAGS) $(CFLAGS) -MT test_tpi-test_main.obj -MD -MP -MF $(DEPDIR)/test_tpi-test_main.Tpo -c -o test_tpi-test_main.obj `if test -f 'test_main.c'; then $(CYGPATH_W) 'test_main.c'; else $(CYGPATH_W) '$(srcdir)/test_main.c'; fi`
+@am__fastdepCC_TRUE@	mv -f $(DEPDIR)/test_tpi-test_main.Tpo $(DEPDIR)/test_tpi-test_main.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='test_main.c' object='test_tpi-test_main.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_CFLAGS) $(CFLAGS) -c -o test_tpi-test_main.obj `if test -f 'test_main.c'; then $(CYGPATH_W) 'test_main.c'; else $(CYGPATH_W) '$(srcdir)/test_main.c'; fi`
+
+test_tpi-test_tpi_unit.o: test_tpi_unit.c
+@am__fastdepCC_TRUE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_CFLAGS) $(CFLAGS) -MT test_tpi-test_tpi_unit.o -MD -MP -MF $(DEPDIR)/test_tpi-test_tpi_unit.Tpo -c -o test_tpi-test_tpi_unit.o `test -f 'test_tpi_unit.c' || echo '$(srcdir)/'`test_tpi_unit.c
+@am__fastdepCC_TRUE@	mv -f $(DEPDIR)/test_tpi-test_tpi_unit.Tpo $(DEPDIR)/test_tpi-test_tpi_unit.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='test_tpi_unit.c' object='test_tpi-test_tpi_unit.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_CFLAGS) $(CFLAGS) -c -o test_tpi-test_tpi_unit.o `test -f 'test_tpi_unit.c' || echo '$(srcdir)/'`test_tpi_unit.c
+
+test_tpi-test_tpi_unit.obj: test_tpi_unit.c
+@am__fastdepCC_TRUE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_CFLAGS) $(CFLAGS) -MT test_tpi-test_tpi_unit.obj -MD -MP -MF $(DEPDIR)/test_tpi-test_tpi_unit.Tpo -c -o test_tpi-test_tpi_unit.obj `if test -f 'test_tpi_unit.c'; then $(CYGPATH_W) 'test_tpi_unit.c'; else $(CYGPATH_W) '$(srcdir)/test_tpi_unit.c'; fi`
+@am__fastdepCC_TRUE@	mv -f $(DEPDIR)/test_tpi-test_tpi_unit.Tpo $(DEPDIR)/test_tpi-test_tpi_unit.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='test_tpi_unit.c' object='test_tpi-test_tpi_unit.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_CFLAGS) $(CFLAGS) -c -o test_tpi-test_tpi_unit.obj `if test -f 'test_tpi_unit.c'; then $(CYGPATH_W) 'test_tpi_unit.c'; else $(CYGPATH_W) '$(srcdir)/test_tpi_unit.c'; fi`
+
+test_tpi-test_c_dnax.o: test_c_dnax.c
+@am__fastdepCC_TRUE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_CFLAGS) $(CFLAGS) -MT test_tpi-test_c_dnax.o -MD -MP -MF $(DEPDIR)/test_tpi-test_c_dnax.Tpo -c -o test_tpi-test_c_dnax.o `test -f 'test_c_dnax.c' || echo '$(srcdir)/'`test_c_dnax.c
+@am__fastdepCC_TRUE@	mv -f $(DEPDIR)/test_tpi-test_c_dnax.Tpo $(DEPDIR)/test_tpi-test_c_dnax.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='test_c_dnax.c' object='test_tpi-test_c_dnax.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_CFLAGS) $(CFLAGS) -c -o test_tpi-test_c_dnax.o `test -f 'test_c_dnax.c' || echo '$(srcdir)/'`test_c_dnax.c
+
+test_tpi-test_c_dnax.obj: test_c_dnax.c
+@am__fastdepCC_TRUE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_CFLAGS) $(CFLAGS) -MT test_tpi-test_c_dnax.obj -MD -MP -MF $(DEPDIR)/test_tpi-test_c_dnax.Tpo -c -o test_tpi-test_c_dnax.obj `if test -f 'test_c_dnax.c'; then $(CYGPATH_W) 'test_c_dnax.c'; else $(CYGPATH_W) '$(srcdir)/test_c_dnax.c'; fi`
+@am__fastdepCC_TRUE@	mv -f $(DEPDIR)/test_tpi-test_c_dnax.Tpo $(DEPDIR)/test_tpi-test_c_dnax.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='test_c_dnax.c' object='test_tpi-test_c_dnax.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_CFLAGS) $(CFLAGS) -c -o test_tpi-test_c_dnax.obj `if test -f 'test_c_dnax.c'; then $(CYGPATH_W) 'test_c_dnax.c'; else $(CYGPATH_W) '$(srcdir)/test_c_dnax.c'; fi`
+
+test_tpi-test_c_tpi.o: test_c_tpi.c
+@am__fastdepCC_TRUE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_CFLAGS) $(CFLAGS) -MT test_tpi-test_c_tpi.o -MD -MP -MF $(DEPDIR)/test_tpi-test_c_tpi.Tpo -c -o test_tpi-test_c_tpi.o `test -f 'test_c_tpi.c' || echo '$(srcdir)/'`test_c_tpi.c
+@am__fastdepCC_TRUE@	mv -f $(DEPDIR)/test_tpi-test_c_tpi.Tpo $(DEPDIR)/test_tpi-test_c_tpi.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='test_c_tpi.c' object='test_tpi-test_c_tpi.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_CFLAGS) $(CFLAGS) -c -o test_tpi-test_c_tpi.o `test -f 'test_c_tpi.c' || echo '$(srcdir)/'`test_c_tpi.c
+
+test_tpi-test_c_tpi.obj: test_c_tpi.c
+@am__fastdepCC_TRUE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_CFLAGS) $(CFLAGS) -MT test_tpi-test_c_tpi.obj -MD -MP -MF $(DEPDIR)/test_tpi-test_c_tpi.Tpo -c -o test_tpi-test_c_tpi.obj `if test -f 'test_c_tpi.c'; then $(CYGPATH_W) 'test_c_tpi.c'; else $(CYGPATH_W) '$(srcdir)/test_c_tpi.c'; fi`
+@am__fastdepCC_TRUE@	mv -f $(DEPDIR)/test_tpi-test_c_tpi.Tpo $(DEPDIR)/test_tpi-test_c_tpi.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='test_c_tpi.c' object='test_tpi-test_c_tpi.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_CFLAGS) $(CFLAGS) -c -o test_tpi-test_c_tpi.obj `if test -f 'test_c_tpi.c'; then $(CYGPATH_W) 'test_c_tpi.c'; else $(CYGPATH_W) '$(srcdir)/test_c_tpi.c'; fi`
+
+test_tpi-test_pthreads.o: test_pthreads.c
+@am__fastdepCC_TRUE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_CFLAGS) $(CFLAGS) -MT test_tpi-test_pthreads.o -MD -MP -MF $(DEPDIR)/test_tpi-test_pthreads.Tpo -c -o test_tpi-test_pthreads.o `test -f 'test_pthreads.c' || echo '$(srcdir)/'`test_pthreads.c
+@am__fastdepCC_TRUE@	mv -f $(DEPDIR)/test_tpi-test_pthreads.Tpo $(DEPDIR)/test_tpi-test_pthreads.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='test_pthreads.c' object='test_tpi-test_pthreads.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_CFLAGS) $(CFLAGS) -c -o test_tpi-test_pthreads.o `test -f 'test_pthreads.c' || echo '$(srcdir)/'`test_pthreads.c
+
+test_tpi-test_pthreads.obj: test_pthreads.c
+@am__fastdepCC_TRUE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_CFLAGS) $(CFLAGS) -MT test_tpi-test_pthreads.obj -MD -MP -MF $(DEPDIR)/test_tpi-test_pthreads.Tpo -c -o test_tpi-test_pthreads.obj `if test -f 'test_pthreads.c'; then $(CYGPATH_W) 'test_pthreads.c'; else $(CYGPATH_W) '$(srcdir)/test_pthreads.c'; fi`
+@am__fastdepCC_TRUE@	mv -f $(DEPDIR)/test_tpi-test_pthreads.Tpo $(DEPDIR)/test_tpi-test_pthreads.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='test_pthreads.c' object='test_tpi-test_pthreads.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_CFLAGS) $(CFLAGS) -c -o test_tpi-test_pthreads.obj `if test -f 'test_pthreads.c'; then $(CYGPATH_W) 'test_pthreads.c'; else $(CYGPATH_W) '$(srcdir)/test_pthreads.c'; fi`
+
+.cpp.o:
+@am__fastdepCXX_TRUE@	$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCXX_TRUE@	mv -f $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(CXXCOMPILE) -c -o $@ $<
+
+.cpp.obj:
+@am__fastdepCXX_TRUE@	$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCXX_TRUE@	mv -f $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(CXXCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
+
+test_tpi_cpp-test_tpi.o: test_tpi.cpp
+@am__fastdepCXX_TRUE@	$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_cpp_CXXFLAGS) $(CXXFLAGS) -MT test_tpi_cpp-test_tpi.o -MD -MP -MF $(DEPDIR)/test_tpi_cpp-test_tpi.Tpo -c -o test_tpi_cpp-test_tpi.o `test -f 'test_tpi.cpp' || echo '$(srcdir)/'`test_tpi.cpp
+@am__fastdepCXX_TRUE@	mv -f $(DEPDIR)/test_tpi_cpp-test_tpi.Tpo $(DEPDIR)/test_tpi_cpp-test_tpi.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	source='test_tpi.cpp' object='test_tpi_cpp-test_tpi.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_cpp_CXXFLAGS) $(CXXFLAGS) -c -o test_tpi_cpp-test_tpi.o `test -f 'test_tpi.cpp' || echo '$(srcdir)/'`test_tpi.cpp
+
+test_tpi_cpp-test_tpi.obj: test_tpi.cpp
+@am__fastdepCXX_TRUE@	$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_cpp_CXXFLAGS) $(CXXFLAGS) -MT test_tpi_cpp-test_tpi.obj -MD -MP -MF $(DEPDIR)/test_tpi_cpp-test_tpi.Tpo -c -o test_tpi_cpp-test_tpi.obj `if test -f 'test_tpi.cpp'; then $(CYGPATH_W) 'test_tpi.cpp'; else $(CYGPATH_W) '$(srcdir)/test_tpi.cpp'; fi`
+@am__fastdepCXX_TRUE@	mv -f $(DEPDIR)/test_tpi_cpp-test_tpi.Tpo $(DEPDIR)/test_tpi_cpp-test_tpi.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	source='test_tpi.cpp' object='test_tpi_cpp-test_tpi.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_cpp_CXXFLAGS) $(CXXFLAGS) -c -o test_tpi_cpp-test_tpi.obj `if test -f 'test_tpi.cpp'; then $(CYGPATH_W) 'test_tpi.cpp'; else $(CYGPATH_W) '$(srcdir)/test_tpi.cpp'; fi`
+
+# This directory's subdirectories are mostly independent; you can cd
+# into them and run `make' without going through this Makefile.
+# To change the values of `make' variables: instead of editing Makefiles,
+# (1) if the variable is set in `config.status', edit `config.status'
+#     (which will cause the Makefiles to be regenerated when you run `make');
+# (2) otherwise, pass the desired values on the `make' command line.
+$(RECURSIVE_TARGETS):
+	@failcom='exit 1'; \
+	for f in x $$MAKEFLAGS; do \
+	  case $$f in \
+	    *=* | --[!k]*);; \
+	    *k*) failcom='fail=yes';; \
+	  esac; \
+	done; \
+	dot_seen=no; \
+	target=`echo $@ | sed s/-recursive//`; \
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  echo "Making $$target in $$subdir"; \
+	  if test "$$subdir" = "."; then \
+	    dot_seen=yes; \
+	    local_target="$$target-am"; \
+	  else \
+	    local_target="$$target"; \
+	  fi; \
+	  (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
+	  || eval $$failcom; \
+	done; \
+	if test "$$dot_seen" = "no"; then \
+	  $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \
+	fi; test -z "$$fail"
+
+$(RECURSIVE_CLEAN_TARGETS):
+	@failcom='exit 1'; \
+	for f in x $$MAKEFLAGS; do \
+	  case $$f in \
+	    *=* | --[!k]*);; \
+	    *k*) failcom='fail=yes';; \
+	  esac; \
+	done; \
+	dot_seen=no; \
+	case "$@" in \
+	  distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \
+	  *) list='$(SUBDIRS)' ;; \
+	esac; \
+	rev=''; for subdir in $$list; do \
+	  if test "$$subdir" = "."; then :; else \
+	    rev="$$subdir $$rev"; \
+	  fi; \
+	done; \
+	rev="$$rev ."; \
+	target=`echo $@ | sed s/-recursive//`; \
+	for subdir in $$rev; do \
+	  echo "Making $$target in $$subdir"; \
+	  if test "$$subdir" = "."; then \
+	    local_target="$$target-am"; \
+	  else \
+	    local_target="$$target"; \
+	  fi; \
+	  (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
+	  || eval $$failcom; \
+	done && test -z "$$fail"
+tags-recursive:
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  test "$$subdir" = . || (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \
+	done
+ctags-recursive:
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  test "$$subdir" = . || (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) ctags); \
+	done
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '    { files[$$0] = 1; } \
+	       END { for (i in files) print i; }'`; \
+	mkid -fID $$unique
+tags: TAGS
+
+TAGS: tags-recursive $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	tags=; \
+	here=`pwd`; \
+	if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \
+	  include_option=--etags-include; \
+	  empty_fix=.; \
+	else \
+	  include_option=--include; \
+	  empty_fix=; \
+	fi; \
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  if test "$$subdir" = .; then :; else \
+	    test ! -f $$subdir/TAGS || \
+	      tags="$$tags $$include_option=$$here/$$subdir/TAGS"; \
+	  fi; \
+	done; \
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '    { files[$$0] = 1; } \
+	       END { for (i in files) print i; }'`; \
+	if test -z "$(ETAGS_ARGS)$$tags$$unique"; then :; else \
+	  test -n "$$unique" || unique=$$empty_fix; \
+	  $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	    $$tags $$unique; \
+	fi
+ctags: CTAGS
+CTAGS: ctags-recursive $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	tags=; \
+	here=`pwd`; \
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '    { files[$$0] = 1; } \
+	       END { for (i in files) print i; }'`; \
+	test -z "$(CTAGS_ARGS)$$tags$$unique" \
+	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+	     $$tags $$unique
+
+GTAGS:
+	here=`$(am__cd) $(top_builddir) && pwd` \
+	  && cd $(top_srcdir) \
+	  && gtags -i $(GTAGS_ARGS) $$here
+
+distclean-tags:
+	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -pR $(srcdir)/$$file $(distdir)$$dir || exit 1; \
+	    fi; \
+	    cp -pR $$d/$$file $(distdir)$$dir || exit 1; \
+	  else \
+	    test -f $(distdir)/$$file \
+	    || cp -p $$d/$$file $(distdir)/$$file \
+	    || exit 1; \
+	  fi; \
+	done
+	list='$(DIST_SUBDIRS)'; for subdir in $$list; do \
+	  if test "$$subdir" = .; then :; else \
+	    test -d "$(distdir)/$$subdir" \
+	    || $(MKDIR_P) "$(distdir)/$$subdir" \
+	    || exit 1; \
+	    distdir=`$(am__cd) $(distdir) && pwd`; \
+	    top_distdir=`$(am__cd) $(top_distdir) && pwd`; \
+	    (cd $$subdir && \
+	      $(MAKE) $(AM_MAKEFLAGS) \
+	        top_distdir="$$top_distdir" \
+	        distdir="$$distdir/$$subdir" \
+		am__remove_distdir=: \
+		am__skip_length_check=: \
+	        distdir) \
+	      || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: check-recursive
+all-am: Makefile $(PROGRAMS) all-local
+installdirs: installdirs-recursive
+installdirs-am:
+install: install-recursive
+install-exec: install-exec-recursive
+install-data: install-data-recursive
+uninstall: uninstall-recursive
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-recursive
+install-strip:
+	$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	  install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	  `test -z '$(STRIP)' || \
+	    echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+clean: clean-recursive
+
+clean-am: clean-generic clean-noinstPROGRAMS mostlyclean-am
+
+distclean: distclean-recursive
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+distclean-am: clean-am distclean-compile distclean-generic \
+	distclean-tags
+
+dvi: dvi-recursive
+
+dvi-am:
+
+html: html-recursive
+
+info: info-recursive
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-recursive
+
+install-exec-am:
+
+install-html: install-html-recursive
+
+install-info: install-info-recursive
+
+install-man:
+
+install-pdf: install-pdf-recursive
+
+install-ps: install-ps-recursive
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-recursive
+	-rm -rf ./$(DEPDIR)
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-recursive
+
+mostlyclean-am: mostlyclean-compile mostlyclean-generic
+
+pdf: pdf-recursive
+
+pdf-am:
+
+ps: ps-recursive
+
+ps-am:
+
+uninstall-am:
+
+.MAKE: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) install-am \
+	install-strip
+
+.PHONY: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) CTAGS GTAGS \
+	all all-am all-local check check-am clean clean-generic \
+	clean-noinstPROGRAMS ctags ctags-recursive distclean \
+	distclean-compile distclean-generic distclean-tags distdir dvi \
+	dvi-am html html-am info info-am install install-am \
+	install-data install-data-am install-dvi install-dvi-am \
+	install-exec install-exec-am install-html install-html-am \
+	install-info install-info-am install-man install-pdf \
+	install-pdf-am install-ps install-ps-am install-strip \
+	installcheck installcheck-am installdirs installdirs-am \
+	maintainer-clean maintainer-clean-generic mostlyclean \
+	mostlyclean-compile mostlyclean-generic pdf pdf-am ps ps-am \
+	tags tags-recursive uninstall uninstall-am
+
+
+# The following line helps the test harness recover from build errors.
+
+all-local:
+
+include $(top_builddir)/Makefile.export.threadpool
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff --git a/openmp-avx512/basic/optional/ThreadPool/test/build_gnu b/openmp-avx512/basic/optional/ThreadPool/test/build_gnu
new file mode 100755
index 0000000..bba4b90
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/test/build_gnu
@@ -0,0 +1,79 @@
+#!/bin/bash
+# Build the TPI test executables with gcc/g++/mpicc, with and without HAVE_PTHREAD.
+TEST_SRC="test_main.c test_c_dnax.c test_tpi_unit.c test_pthreads.c"
+
+LIB_SRC="../src/TPI.c ../src/TPI_Walltime.c"
+
+LIB_OBJ="TPI.o TPI_Walltime.o"
+
+# OPT="-O3"
+OPT="-g"
+# OPT="-O"
+
+#CFLAGS="${OPT} -std=c99   -Wall -Wextra"
+
+CFLAGS=" ${OPT} -std=c89   -Wall -Wextra"
+CCFLAGS="${OPT} -std=c++98 -Wall -Wextra"
+
+echo build: gcc ${CFLAGS}
+
+#-----------------------------------------------------------------------
+# Pass 1: ThreadPool_config.h defines HAVE_PTHREAD.
+rm -f ThreadPool_config.h
+echo "#define HAVE_PTHREAD 1" > ThreadPool_config.h
+
+gcc	${CFLAGS} -c	\
+	-I. -I../src ${LIB_SRC}
+
+gcc	${CFLAGS} \
+	-o test_tpi.gnu.exe	\
+	-I. -I../src ${TEST_SRC} ${LIB_OBJ} -lpthread -lm
+
+g++	${CCFLAGS} \
+	-o test_tpi_cpp.gnu.exe	\
+	-I. -I../src test_tpi.cpp ${LIB_OBJ} -lpthread -lstdc++ -lm
+
+gcc	${CFLAGS} \
+	-o test_sum.gnu.exe	\
+	-I. -I../src test_mpi_sum.c ${LIB_OBJ} -lpthread -lm
+
+#-----------------------------------------------------------------------
+# MPI variant of test_sum, linked against the objects compiled above.
+mpicc	${CFLAGS} \
+	-o test_sum.mpi.gnu.exe	\
+	-I. -I../src -DTEST_WITH_MPI test_mpi_sum.c ${LIB_OBJ} -lpthread -lm
+
+#-----------------------------------------------------------------------
+# Pass 2: HAVE_PTHREAD is commented out in the config; outputs are named *.noth.exe.
+rm -f ThreadPool_config.h
+echo "/* #define HAVE_PTHREAD 1 */" > ThreadPool_config.h
+
+gcc	${CFLAGS} -c	\
+	-I. -I../src ${LIB_SRC}
+
+gcc	${CFLAGS} \
+	-o test_tpi.gnu.noth.exe	\
+	-I. -I../src ${TEST_SRC} ${LIB_OBJ} -lpthread -lm
+
+g++	${CCFLAGS} \
+	-o test_tpi_cpp.gnu.noth.exe	\
+	-I. -I../src test_tpi.cpp ${LIB_OBJ} -lpthread -lstdc++ -lm
+
+gcc	${CFLAGS} \
+	-o test_sum.gnu.noth.exe	\
+	-I. -I../src test_mpi_sum.c ${LIB_OBJ} -lpthread -lm
+
+#-----------------------------------------------------------------------
+# Pass 3: HAVE_MPI is appended to the config; the library objects are not recompiled.
+rm -f ThreadPool_config.h
+echo "/* #define HAVE_PTHREAD 1 */" > ThreadPool_config.h
+echo "#define HAVE_MPI 1" >> ThreadPool_config.h
+
+mpicc	${CFLAGS} \
+	-o test_sum.mpi.gnu.noth.exe	\
+	-I. -I../src -DTEST_WITH_MPI test_mpi_sum.c ${LIB_OBJ} -lpthread -lm
+
+#-----------------------------------------------------------------------
+# Remove the generated config header.
+rm -f ThreadPool_config.h
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/test/build_intel b/openmp-avx512/basic/optional/ThreadPool/test/build_intel
new file mode 100755
index 0000000..accb0a0
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/test/build_intel
@@ -0,0 +1,82 @@
+#!/bin/bash
+# Build the TPI test executables with icc/mpicc, with and without HAVE_PTHREAD.
+# . /usr/local/modules/3.2.6/Modules/$MODULE_VERSION/bin/modulecmd tcsh	\
+#	load sierra-devel-desktop-intel-10.1ip
+
+
+TEST_SRC="test_main.c test_c_dnax.c test_tpi_unit.c test_pthreads.c"
+
+LIB_SRC="../src/TPI.c ../src/TPI_Walltime.c"
+
+LIB_OBJ="TPI.o TPI_Walltime.o"
+# NOTE: CFLAGS is not assigned in this script (both lines below are commented out).
+#CFLAGS="-std=c99 -strict-ansi -Wall -Wcheck -Werror -wd141 -wd869 -wd1418 -wd1419"
+#CFLAGS="-std=c89 -strict-ansi -Wall -Wcheck -Werror -wd141 -wd869 -wd1418 -wd1419"
+CCFLAGS="        -strict-ansi -Wall -Wcheck -Werror -wd141 -wd869 -wd1418 -wd1419"
+
+OPT="-O3"
+# OPT="-g"
+# OPT="-O"
+
+echo build ${OPT}
+
+#-----------------------------------------------------------------------
+# Pass 1: ThreadPool_config.h defines HAVE_PTHREAD.
+rm -f ThreadPool_config.h
+echo "#define HAVE_PTHREAD 1" > ThreadPool_config.h
+
+icc	${CFLAGS} ${OPT} -c	\
+	-I. -I../src ${LIB_SRC}
+
+icc	${CFLAGS} ${OPT}	\
+	-o test_tpi.intel.exe	\
+	-I. -I../src ${TEST_SRC} ${LIB_OBJ} -lpthread
+
+icc	${CCFLAGS} ${OPT}	\
+	-o test_tpi_cpp.intel.exe	\
+	-I. -I../src test_tpi.cpp ${LIB_OBJ} -lpthread -lstdc++
+
+icc	${CFLAGS} ${OPT}	\
+	-o test_sum.intel.exe	\
+	-I. -I../src test_mpi_sum.c ${LIB_OBJ} -lpthread
+
+#-----------------------------------------------------------------------
+# MPI variant of test_sum, linked against the objects compiled above.
+mpicc	${CFLAGS} ${OPT}	\
+	-o test_sum.mpi.intel.exe	\
+	-I. -I../src -DTEST_WITH_MPI test_mpi_sum.c ${LIB_OBJ} -lpthread
+
+#-----------------------------------------------------------------------
+# Pass 2: HAVE_PTHREAD is commented out in the config; outputs are named *.noth.exe.
+rm -f ThreadPool_config.h
+echo "/* #define HAVE_PTHREAD 1 */" > ThreadPool_config.h
+
+icc	${CFLAGS} ${OPT} -c	\
+	-I. -I../src ${LIB_SRC}
+
+icc	${CFLAGS} ${OPT}	\
+	-o test_tpi.intel.noth.exe	\
+	-I. -I../src ${TEST_SRC} ${LIB_OBJ} -lpthread
+
+icc	${CCFLAGS} ${OPT}	\
+	-o test_tpi_cpp.intel.noth.exe	\
+	-I. -I../src test_tpi.cpp ${LIB_OBJ} -lpthread -lstdc++
+
+icc	${CFLAGS} ${OPT}	\
+	-o test_sum.intel.noth.exe	\
+	-I. -I../src test_mpi_sum.c ${LIB_OBJ} -lpthread
+
+#-----------------------------------------------------------------------
+# Pass 3: HAVE_MPI is appended to the config; the library objects are not recompiled.
+rm -f ThreadPool_config.h
+echo "/* #define HAVE_PTHREAD 1 */" > ThreadPool_config.h
+echo "#define HAVE_MPI 1" >> ThreadPool_config.h
+
+mpicc	${CFLAGS} ${OPT}	\
+	-o test_sum.mpi.intel.noth.exe	\
+	-I. -I../src -DTEST_WITH_MPI test_mpi_sum.c ${LIB_OBJ} -lpthread
+
+#-----------------------------------------------------------------------
+# Remove the generated config header.
+rm -f ThreadPool_config.h
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/test/build_pgi b/openmp-avx512/basic/optional/ThreadPool/test/build_pgi
new file mode 100755
index 0000000..85799cc
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/test/build_pgi
@@ -0,0 +1,39 @@
+#!/bin/bash
+# Build the TPI test executables with the PGI compilers (pgcc/pgCC) and mpicc.
+export LM_LICENSE_FILE=7500@reddish
+PGI_HOME="/usr/local/pgi_64/linux86-64/7.0-7"
+MPICH_HOME="/usr/local/mpi/mpich/64Bit/1.2.7/pgi-6.0"
+
+export PATH="${PGI_HOME}/bin:${PATH}"
+# NOTE(review): file names and -I../include differ from build_gnu/build_intel - confirm they exist.
+TEST_SRC="test_main.c test_c_dnax.c test_c_tpi.c test_pthreads.c"
+
+LIB_SRC="../src/TPI_pthreads.c ../src/TPI_Walltime.c ../src/TPI_Concurrency.c"
+
+LIB_OBJ="TPI_pthreads.o TPI_Walltime.o TPI_Concurrency.o"
+
+#-----------------------------------------------------------------------
+# Compile the library objects, then link the C and C++ test drivers against them.
+pgcc	-O4 -c	\
+	-I../include ${LIB_SRC} -lpthread
+
+pgcc	-O4	\
+	-o test_tpi.pgi.exe	\
+	-I../include ${TEST_SRC} ${LIB_OBJ} -lpthread
+
+pgCC	-O4	\
+	-o test_tpi_cpp.pgi.exe	\
+	-I../include test_tpi.cpp ${LIB_OBJ} -lpthread
+
+#-----------------------------------------------------------------------
+# Enable PGI-MPI installation to accept as large a message as possible, 200 Mb
+
+# export P4_GLOBMEMSIZE="268435456"
+# Put the MPICH wrappers ahead of the PGI tools for the MPI build below.
+export PATH="${MPICH_HOME}/bin:${PGI_HOME}/bin:${PATH}"
+
+mpicc	-c99	\
+	-O4	\
+	-o test_sum.mpi.pgi.exe	\
+	-I../include -DTEST_WITH_MPI test_mpi_sum.c ${LIB_OBJ} -lpthread
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/test/hhpccg/BoxPartitionIB.c b/openmp-avx512/basic/optional/ThreadPool/test/hhpccg/BoxPartitionIB.c
new file mode 100644
index 0000000..5f2866f
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/test/hhpccg/BoxPartitionIB.c
@@ -0,0 +1,562 @@
+
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <BoxPartitionIB.h>
+
+/*--------------------------------------------------------------------*/
+/* Recursively split a box into (up-ip) sub-boxes */
+
+typedef const int RangeInput[2] ;
+typedef       int RangeOutput[2] ;
+typedef RangeInput  * const BoxInput ;
+typedef RangeOutput * const BoxOutput ;
+
+static
+void box_partition( int ip , int up , int axis ,
+                    BoxInput box ,
+                    int (* const p_box)[3][2] )
+{
+  /* Recursive coordinate bisection: assign the half-open rank range
+   * [ip,up) to 'box', cutting along 'axis' and moving on to axis
+   * ( axis + 2 ) % 3 for the next level of the recursion.
+   */
+  const int count = up - ip ;
+  int sub[3][2] ;
+  int d ;
+  if ( 1 == count ) {
+    /* Leaf: a single rank is left and receives the whole box. */
+    for ( d = 0 ; d < 3 ; ++d ) {
+      p_box[ip][d][0] = box[d][0] ;
+      p_box[ip][d][1] = box[d][1] ;
+    }
+    return ;
+  }
+  {
+    const int extent    = box[ axis ][1] - box[ axis ][0] ;
+    const int count_low = count / 2 ;  /* Rounded down */
+    const int count_upp = count - count_low ;
+    /* The upper part gets a share of the extent proportional to its ranks. */
+    const int extent_upp =
+      (int) (((double) extent) * ( ((double)count_upp) / ((double)count)));
+    const int extent_low = extent - extent_upp ;
+    const int cut        = box[ axis ][0] + extent_low ;
+    const int rotated    = ( axis + 2 ) % 3 ;
+    for ( d = 0 ; d < 3 ; ++d ) {
+      sub[d][0] = box[d][0] ; sub[d][1] = box[d][1] ;
+    }
+    if ( count_low ) { /* ranks [ip,ip+count_low) take [lower,cut) */
+      sub[ axis ][1] = cut ;
+      box_partition( ip, ip + count_low, rotated,
+                     (const int (*)[2]) sub, p_box );
+    }
+
+    if ( count_upp ) { /* remaining ranks take [cut,cut+extent_upp) */
+      sub[ axis ][0] = cut ;
+      sub[ axis ][1] = cut + extent_upp ;
+      box_partition( ip + count_low, ip + count_low + count_upp, rotated,
+                     (const int (*)[2]) sub, p_box );
+    }
+  }
+}
+
+void box_partition_rcb( const int np ,             /* number of sub-boxes */
+                        const int root_box[3][2] , /* box to be split */
+                        int    pbox[][3][2] )      /* receives the sub-boxes */
+{
+  box_partition( 0 , np , 2 , root_box , pbox );   /* ranks [0,np), first cut on axis 2 */
+}
+
+/*--------------------------------------------------------------------*/
+
+static int box_intersect( BoxInput a , BoxInput b , BoxOutput c )
+{ /* c = per-axis overlap of a and b; returns 1 when c is non-empty on all axes */
+  int axis , overlap = 1 ;
+  for ( axis = 0 ; axis < 3 ; ++axis ) {
+    c[axis][0] = ( b[axis][0] > a[axis][0] ) ? b[axis][0] : a[axis][0] ; /* max */
+    c[axis][1] = ( b[axis][1] > a[axis][1] ) ? a[axis][1] : b[axis][1] ; /* min */
+    if ( c[axis][1] <= c[axis][0] ) { overlap = 0 ; }
+  }
+  return overlap ;
+}
+ 
+
+/*--------------------------------------------------------------------*/
+
+static void global_to_use_box( BoxInput gbox ,
+                               BoxInput pbox ,
+                               const int ghost ,
+                                     BoxOutput interiorBox ,
+                                     BoxOutput useBox )
+{
+  /* Per axis: the interior range is the owned range inset by 'ghost' on
+   * every face that does not coincide with the global box; the use range
+   * is the owned range widened by 'ghost' and clipped to the global box.
+   */
+  int axis ;
+
+  for ( axis = 0 ; axis < 3 ; ++axis ) {
+    const int g_lo = gbox[axis][0] , g_hi = gbox[axis][1] ;
+    const int p_lo = pbox[axis][0] , p_hi = pbox[axis][1] ;
+    int in_lo , in_hi , use_lo , use_hi ;
+
+    /* An owned range with negative extent is a fatal input error. */
+    if ( p_hi - p_lo < 0 ) { abort(); }
+    /* Faces lying on the global boundary are not inset. */
+    in_lo = ( g_lo == p_lo ) ? g_lo : p_lo + ghost ;
+    in_hi = ( g_hi == p_hi ) ? g_hi : p_hi - ghost ;
+
+    /* Keep the interior inside the owned range and never inverted. */
+    if ( in_hi < p_lo  ) { in_hi = p_lo  ; }
+    if ( in_lo > p_hi  ) { in_lo = p_hi  ; }
+    if ( in_hi < in_lo ) { in_hi = in_lo ; }
+
+    /* Widen the owned range by the ghost width, clipped to the global box. */
+    use_lo = p_lo - ghost ;
+    use_hi = p_hi + ghost ;
+    if ( use_lo < g_lo ) { use_lo = g_lo ; }
+    if ( use_hi > g_hi ) { use_hi = g_hi ; }
+
+    interiorBox[axis][0] = in_lo ;
+    interiorBox[axis][1] = in_hi ;
+    useBox[axis][0]      = use_lo ;
+    useBox[axis][1]      = use_hi ;
+  }
+}
+
+
+/*  The use-box is the owned box extended by its ghost layers.
+ *  Convert a global (x,y,z) grid point into its x-fastest offset within
+ *  the use-box.  Aborts when the box is inverted or the point is outside.
+ */
+static int map_global_to_use_box( BoxInput useBox ,
+                                  const int global_x ,
+                                  const int global_y ,
+                                  const int global_z )
+{
+  int extent[3] , offset[3] ;
+  int axis ;
+
+  offset[0] = global_x - useBox[0][0] ;
+  offset[1] = global_y - useBox[1][0] ;
+  offset[2] = global_z - useBox[2][0] ;
+
+  for ( axis = 0 ; axis < 3 ; ++axis ) {
+    extent[axis] = useBox[axis][1] - useBox[axis][0] ;
+    /* Reject inverted boxes and points outside [0,extent). */
+    if ( extent[axis] < 0 ||
+         offset[axis] < 0 || extent[axis] <= offset[axis] ) {
+      abort();
+    }
+  }
+
+  /* x varies fastest, then y, then z. */
+  return offset[0] + offset[1] * extent[0] + offset[2] * extent[0] * extent[1] ;
+}
+
+int box_map_local( const int local_uses[3][2] ,
+                   const int map_local_id[] ,
+                   const int global_x ,
+                   const int global_y ,
+                   const int global_z )
+{
+  /* Offset of the point inside the use-box, then its local ordinal. */
+  const int offset =
+    map_global_to_use_box( local_uses , global_x , global_y , global_z );
+
+  return ( offset < 0 ) ? offset : map_local_id[ offset ] ;
+}
+
+
+/*--------------------------------------------------------------------*/
+
+static void resize_int( int ** a , int * allocLen , int newLen )
+{ /* Ensure *a can hold newLen ints; capacity is a power of two >= 32. */
+  int k = 32;
+  while ( k < newLen ) { k <<= 1 ; }
+  if ( NULL != *a && k <= *allocLen ) { return ; } /* already large enough */
+  *a = (int *) realloc( *a , sizeof(int) * k ); /* realloc(NULL,n) == malloc(n) */
+  if ( NULL == *a ) { abort(); } /* out of memory: abort like the rest of the file */
+  *allocLen = k ;
+}
+
+void box_partition_map( 
+  const int np ,
+  const int my_p ,
+  const int gbox[3][2] ,
+  const int pbox[][3][2] ,
+  const int ghost ,
+  /* Outputs; the three ** arrays are allocated here and handed to the caller. */
+  int    map_use_box[3][2] ,
+  int    map_local_id[] ,
+  int *  map_count_interior ,
+  int *  map_count_owns ,
+  int *  map_count_uses ,
+  int ** map_recv_pc ,
+  int ** map_send_pc ,
+  int ** map_send_id )
+{
+  /* Prefix arrays: span [i,i+1) belongs to rank ( my_p + i ) % np. */
+  int * recv_pc = (int *) malloc( ( np + 1 ) * sizeof(int) );
+  int * send_pc = (int *) malloc( ( np + 1 ) * sizeof(int) );
+
+  /* Send list, grown on demand by resize_int(); stays NULL if nothing is sent. */
+  int * send_id  = NULL ;
+  int   send_id_size = 0 ;
+
+  int own_length , use_length , int_length ;
+  int count_interior , count_parallel ;
+  int iSend ;
+  int g_ix , g_iy , g_iz ;
+  int i ;
+  int my_int_box[3][2] ;
+
+  if ( NULL == recv_pc || NULL == send_pc ) { abort(); } /* out of memory */
+  global_to_use_box( gbox , pbox[my_p] , ghost , my_int_box , map_use_box );
+
+  own_length = ( pbox[my_p][0][1] - pbox[my_p][0][0] ) *
+               ( pbox[my_p][1][1] - pbox[my_p][1][0] ) *
+               ( pbox[my_p][2][1] - pbox[my_p][2][0] );
+
+  use_length = ( map_use_box[0][1] - map_use_box[0][0] ) *
+               ( map_use_box[1][1] - map_use_box[1][0] ) *
+               ( map_use_box[2][1] - map_use_box[2][0] );
+
+  int_length = ( my_int_box[0][1] - my_int_box[0][0] ) *
+               ( my_int_box[1][1] - my_int_box[1][0] ) *
+               ( my_int_box[2][1] - my_int_box[2][0] );
+  /* Mark every use-box slot as unassigned before numbering it. */
+  for ( i = 0 ; i < use_length ; ++i ) { map_local_id[i] = -1 ; }
+
+  /* Fill in locally owned portion: { interior , parallel } */
+
+  count_interior = 0 ;
+  count_parallel = int_length ;
+
+  for ( g_iz = pbox[my_p][2][0] ; g_iz < pbox[my_p][2][1] ; ++g_iz ) {
+  for ( g_iy = pbox[my_p][1][0] ; g_iy < pbox[my_p][1][1] ; ++g_iy ) {
+  for ( g_ix = pbox[my_p][0][0] ; g_ix < pbox[my_p][0][1] ; ++g_ix ) {
+
+    const int local =
+      map_global_to_use_box( (BoxInput) map_use_box, g_ix, g_iy, g_iz );
+
+    if ( local < 0 ) { 
+      abort();
+    }
+
+    if ( my_int_box[2][0] <= g_iz && g_iz < my_int_box[2][1] &&
+         my_int_box[1][0] <= g_iy && g_iy < my_int_box[1][1] &&
+         my_int_box[0][0] <= g_ix && g_ix < my_int_box[0][1] ) {
+      /* Interior */
+      map_local_id[ local ] = count_interior++ ;
+    }
+    else {
+      /* Parallel */
+      map_local_id[ local ] = count_parallel++ ;
+    }
+  }
+  }
+  }
+  /* Interior ids fill [0,int_length); the other owned ids fill [int_length,own_length). */
+  if ( count_interior != int_length ) { abort(); }
+  if ( count_parallel != own_length ) { abort(); }
+
+  /* Fill in off-process received portion: { ( i + my_p ) % np } */
+
+  recv_pc[0] = count_parallel ;
+  recv_pc[1] = count_parallel ;
+  send_pc[0] = 0 ;
+  send_pc[1] = 0 ;
+  iSend = 0 ;
+
+  for ( i = 1 ; i < np ; ++i ) {
+    const int ip = ( i + my_p ) % np ;
+    int recv_box[3][2] ;
+    int send_box[3][2] ;
+    int other_int_box[3][2] ;
+    int other_use_box[3][2] ;
+
+    /* Received portions */
+
+    if ( box_intersect( (BoxInput) map_use_box , (BoxInput) pbox[ip] , recv_box ) ) {
+
+      for ( g_iz = recv_box[2][0] ; g_iz < recv_box[2][1] ; ++g_iz ) {
+      for ( g_iy = recv_box[1][0] ; g_iy < recv_box[1][1] ; ++g_iy ) {
+      for ( g_ix = recv_box[0][0] ; g_ix < recv_box[0][1] ; ++g_ix ) {
+
+        const int local = map_global_to_use_box( (BoxInput) map_use_box, g_ix, g_iy, g_iz );
+
+        map_local_id[ local ] = count_parallel++ ;
+      }
+      }
+      }
+    }
+    recv_pc[i+1] = count_parallel ;
+
+    /* Sent items */
+
+    global_to_use_box( gbox, pbox[ip], ghost, other_int_box, other_use_box );
+
+    if ( box_intersect( (BoxInput) other_use_box , (BoxInput) pbox[my_p] , send_box ) ) {
+
+      int nSend = ( send_box[0][1] - send_box[0][0] ) *
+                  ( send_box[1][1] - send_box[1][0] ) *
+                  ( send_box[2][1] - send_box[2][0] );
+
+      resize_int( & send_id , & send_id_size , (iSend + nSend ) );
+
+      for ( g_iz = send_box[2][0] ; g_iz < send_box[2][1] ; ++g_iz ) {
+      for ( g_iy = send_box[1][0] ; g_iy < send_box[1][1] ; ++g_iy ) {
+      for ( g_ix = send_box[0][0] ; g_ix < send_box[0][1] ; ++g_ix ) {
+
+        const int local = map_global_to_use_box( (BoxInput) map_use_box, g_ix, g_iy, g_iz );
+        /* Only non-interior owned points may be sent. */
+        if ( map_local_id[ local ] < count_interior ) { abort(); }
+
+        send_id[ iSend ] = map_local_id[ local ] ;
+        ++iSend ;
+      }
+      }
+      }
+    }
+    send_pc[i+1] = iSend ;
+  }
+  /* Owned plus received points must account for the whole use-box. */
+  if ( count_parallel != use_length ) { abort(); }
+
+  *map_count_interior = int_length ;
+  *map_count_owns     = own_length ;
+  *map_count_uses     = use_length ;
+  *map_recv_pc        = recv_pc ;
+  *map_send_pc        = send_pc ;
+  *map_send_id        = send_id ;
+}
+
+/*--------------------------------------------------------------------*/
+
+#ifdef UNIT_TEST
+
+static int box_contain( const int a[3][2] , const int b[3][2] )
+{ /* 1 when the range of b lies within the range of a on every axis, else 0 */
+  int d , inside = 1 ;
+  for ( d = 0 ; d < 3 ; ++d ) { if ( b[d][0] < a[d][0] || a[d][1] < b[d][1] ) { inside = 0 ; } }
+  return inside ;
+}
+
+static void box_print( FILE * fp , const int a[][2] )
+{ /* Print the three [begin,end) ranges of box a to fp, without a newline. */
+  const int * const x = a[0] ;
+  const int * const y = a[1] ;
+  const int * const z = a[2] ;
+  fprintf(fp,"{ [ %d , %d ) , [ %d , %d ) , [ %d , %d ) }",x[0],x[1],y[0],y[1],z[0],z[1]);
+}
+
+static int box_disjoint( BoxInput a , BoxInput b )
+{ /* 1 when a and b are separated along at least one axis, else 0 */
+  int d , apart = 0 ;
+  for ( d = 0 ; d < 3 ; ++d ) { if ( a[d][1] <= b[d][0] || b[d][1] <= a[d][0] ) { apart = 1 ; } }
+  return apart ;
+}
+
+
+static void test_box( const int box[3][2] , const int np )
+{
+  /* Partition 'box' into np parts, then check containment, pairwise
+   * disjointness and that the parts' cell counts add up to the box's. */
+  const int ncell_box = ( box[0][1] - box[0][0] ) *
+                        ( box[1][1] - box[1][0] ) *
+                        ( box[2][1] - box[2][0] );
+  int ncell_total = 0 ;
+  int ncell_min = ncell_box ;
+  int ncell_max = 0 ;
+  int (*pbox)[3][2] ;
+  int i , j ;
+
+  pbox = (int (*)[3][2]) malloc( sizeof(int) * np * 3 * 2 );
+  if ( NULL == pbox ) { abort(); }
+
+  box_partition( 0 , np , 2 , box , pbox );
+
+  for ( i = 0 ; i < np ; ++i ) {
+    const int ncell = ( pbox[i][0][1] - pbox[i][0][0] ) *
+                      ( pbox[i][1][1] - pbox[i][1][0] ) *
+                      ( pbox[i][2][1] - pbox[i][2][0] );
+    if ( ! box_contain( box , (const int (*)[2]) pbox[i] ) ) {
+      fprintf(stdout,"  OUT OF BOUNDS pbox[%d/%d] = ",i,np);
+      box_print(stdout,(const int (*)[2]) pbox[i]);
+      fprintf(stdout,"\n");
+      abort();
+    }
+    for ( j = i + 1 ; j < np ; ++j ) {
+      if ( ! box_disjoint( (const int (*)[2]) pbox[i] ,
+                           (const int (*)[2]) pbox[j] ) ) {
+        fprintf(stdout,"  NOT DISJOINT pbox[%d/%d] = ",i,np);
+        box_print(stdout, (const int (*)[2]) pbox[i]);
+        fprintf(stdout,"\n");
+        fprintf(stdout,"               pbox[%d/%d] = ",j,np);
+        box_print(stdout, (const int (*)[2]) pbox[j]);
+        fprintf(stdout,"\n");
+        abort();
+      }
+    }
+    ncell_total += ncell ;
+    if ( ncell_max < ncell ) { ncell_max = ncell ; }
+    if ( ncell < ncell_min ) { ncell_min = ncell ; }
+  }
+  if ( ncell_total != ncell_box ) {
+    fprintf(stdout,"  WRONG CELL COUNT NP = %d\n",np);
+    abort();
+  }
+  fprintf(stdout,"NP = %d, total = %d, avg = %d, min = %d, max = %d\n",
+          np,ncell_box,ncell_box/np,ncell_min,ncell_max);
+  free( pbox );
+}
+
+/*--------------------------------------------------------------------*/
+
+static void test_maps( const int root_box[][2] , const int np )
+{
+  const int ghost = 1 ;
+  const int nx_global = root_box[0][1] - root_box[0][0] ;
+  const int ny_global = root_box[1][1] - root_box[1][0] ;
+  int map_count_interior , map_count_owns , map_count_uses ;
+  int map_use_box[3][2] ;
+  int ieq , i , j ;
+  int (*pbox)[3][2] ;
+  int **local_values ;
+  int **map_local_id ;
+  int **map_recv_pc ;
+  int **map_send_pc ;
+  int **map_send_id ;
+  /* Build every rank's maps, then compare each receive span with the sender's list. */
+  pbox = (int (*)[3][2]) malloc( sizeof(int) * np * 3 * 2 );
+
+  box_partition( 0 , np , 2 , root_box , pbox );
+
+  local_values = (int **) malloc( sizeof(int*) * np );
+  map_local_id = (int **) malloc( sizeof(int*) * np );
+  map_recv_pc  = (int **) malloc( sizeof(int*) * np );
+  map_send_pc  = (int **) malloc( sizeof(int*) * np );
+  map_send_id  = (int **) malloc( sizeof(int*) * np );
+
+  /* Set each local value to the global equation number */
+
+  for ( ieq = i = 0 ; i < np ; ++i ) {
+    const int (*mybox)[2] = (const int (*)[2]) pbox[i] ;
+    const int nx = mybox[0][1] - mybox[0][0] ;
+    const int ny = mybox[1][1] - mybox[1][0] ;
+    const int nz = mybox[2][1] - mybox[2][0] ;
+    int ix , iy , iz ;
+    /* Sized for a ghost layer on every face, an upper bound on the use-box size. */
+    map_local_id[i] = (int *) malloc( sizeof(int) *
+                                      ( nx + 2 * ghost ) *
+                                      ( ny + 2 * ghost ) *
+                                      ( nz + 2 * ghost ) );
+
+    /* Generate the partition maps for this rank */
+    box_partition_map( np , i , root_box ,
+                        (const int (*)[3][2])  pbox , ghost ,
+                        map_use_box ,
+                        map_local_id[i] ,
+                        & map_count_interior ,
+                        & map_count_owns ,
+                        & map_count_uses ,
+                        & map_recv_pc[i] , 
+                        & map_send_pc[i] , & map_send_id[i] );
+    /* The last receive prefix must equal the number of points this rank uses. */
+    if ( map_count_uses != map_recv_pc[i][np] ) { abort(); }
+
+    local_values[i] = (int *) malloc( sizeof(int) * map_count_uses );
+
+    for ( iz = map_use_box[2][0] ; iz < map_use_box[2][1] ; ++iz ) {
+    for ( iy = map_use_box[1][0] ; iy < map_use_box[1][1] ; ++iy ) {
+    for ( ix = map_use_box[0][0] ; ix < map_use_box[0][1] ; ++ix ) {
+      /* NOTE: this 'ieq' shadows the function-scope 'ieq', which is otherwise unused. */
+      const int igrid = map_global_to_use_box((BoxInput)map_use_box,ix,iy,iz);
+      const int ieq   = map_local_id[i][ igrid ];
+
+      if ( 0 <= ieq ) {
+        local_values[i][ ieq ] =
+          ix + iy * nx_global + iz * nx_global * ny_global ;
+      }
+    }
+    }
+    }
+  }
+
+  /* Pair-wise compare the local values */
+  /* i  == receiving processor rank */
+  /* ip == sending   processor rank */
+  /* j  == receiving processor data entry for message from 'ip' */
+  /* jp == sending   processor data entry for message to   'i' */
+
+  for ( i = 0 ; i < np ; ++i ) {
+    for ( j = 1 ; j < np ; ++j ) {
+      const int ip = ( i + j ) % np ;
+      const int jp = ( i + np - ip ) % np ;
+      const int nrecv = map_recv_pc[i] [j+1]  - map_recv_pc[i] [j] ;
+      const int nsend = map_send_pc[ip][jp+1] - map_send_pc[ip][jp] ;
+      int k ;
+      if ( nrecv != nsend ) {
+        fprintf(stderr,"P%d recv %d from P%d\n",i,nrecv,ip);
+        fprintf(stderr,"P%d send %d to   P%d\n",ip,nsend,i);
+        abort();
+      }
+      for ( k = 0 ; k < nrecv ; ++k ) {
+        const int irecv = map_recv_pc[i][j] + k ;
+        const int isend = map_send_pc[ip][jp] + k ;
+        const int val_irecv = local_values[i][irecv] ;
+        const int val_isend = local_values[ip][ map_send_id[ip][isend] ] ;
+        if ( val_irecv != val_isend ) {
+          fprintf(stderr,"P%d recv[%d] = %d , from P%d\n",i,k,val_irecv,ip);
+          fprintf(stderr,"P%d send[%d] = %d , to   P%d\n",ip,k,val_isend,i);
+          abort();
+        }
+      }
+    }
+  }
+
+  for ( i = 0 ; i < np ; ++i ) {
+    free( map_local_id[i] );
+    free( map_recv_pc[i] );
+    free( map_send_pc[i] );
+    free( map_send_id[i] );
+    free( local_values[i] );
+  }
+  free( map_send_id );
+  free( map_send_pc );
+  free( map_recv_pc );
+  free( map_local_id );
+  free( local_values );
+  free( pbox );
+}
+
+/*--------------------------------------------------------------------*/
+
+int main( int argc , char * argv[] )
+{ /* usage: exe [ np NXxNYxNZ ] ; any other argument count sweeps np = 1..np_max */
+  int np_max = 256 ;
+  int box[3][2] = { { 0 , 64 } , { 0 , 64 } , { 0 , 64 } };
+  int np = 0 ;
+  switch( argc ) {
+  case 3:
+    if ( 1 != sscanf(argv[1],"%d",&np) ||
+         3 != sscanf(argv[2],"%dx%dx%d",& box[0][1] , & box[1][1] , & box[2][1] ) ) {
+      fprintf(stderr,"usage: %s [ np NXxNYxNZ ]\n",argv[0]); return 1 ; /* no silent defaults */
+    }
+    if ( 0 < np ) { test_box(  (const int (*)[2]) box , np ); }
+    if ( 0 < np ) { test_maps( (const int (*)[2]) box , np ); }
+    break ;
+  default:
+    for ( np = 1 ; np <= np_max ; ++np ) {
+      test_box(  (const int (*)[2]) box , np );
+      test_maps( (const int (*)[2]) box , np );
+    }
+  }
+  return 0 ;
+}
+
+#endif
+
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/test/hhpccg/BoxPartitionIB.h b/openmp-avx512/basic/optional/ThreadPool/test/hhpccg/BoxPartitionIB.h
new file mode 100644
index 0000000..71d71f5
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/test/hhpccg/BoxPartitionIB.h
@@ -0,0 +1,88 @@
+
+
+#ifndef BoxPartionIB_h
+#define BoxPartionIB_h
+
+/** \brief  Partition a { [ix,jx) X [iy,jy) X [iz,jz) } box.
+ *
+ *  Use recursive coordinate bisection to partition a box 
+ *  into np disjoint sub-boxes.  Allocate (via malloc) and
+ *  populate the sub-boxes, mapping the local (x,y,z) to
+ *  a local ordinal, and mappings for the send-recv messages
+ *  to update the ghost cells.
+ *
+ *  Order local ordinates as follows:
+ *    {
+ *      interior ,
+ *      boundary ,
+ *      remote[ ( my_p + i ) % np ]
+ *    } 
+ *      where i = 1..(np-1)
+ *
+ *  usage:
+ *
+ *  my_nx = pbox[my_p][0][1] - pbox[my_p][0][0] ;
+ *  my_ny = pbox[my_p][1][1] - pbox[my_p][1][0] ;
+ *  my_nz = pbox[my_p][2][1] - pbox[my_p][2][0] ;
+ *
+ *  for ( x = -ghost ; x < my_nx + ghost ; ++x ) {
+ *  for ( y = -ghost ; y < my_ny + ghost ; ++y ) {
+ *  for ( z = -ghost ; z < my_nz + ghost ; ++z ) {
+ *    const int x_global = x + pbox[my_p][0][0] ;
+ *    const int y_global = y + pbox[my_p][1][0] ;
+ *    const int z_global = z + pbox[my_p][2][0] ;
+ *
+ *    const int local_ordinal =
+ *      box_map_local( map_uses_box, map_local_id, x_global, y_global, z_global );
+ *    (the point must lie inside map_uses_box, otherwise box_map_local aborts)
+ *    if ( 0 <= local_ordinal ) {
+ *    }
+ *  }}}
+ *  
+ *  for ( i = 1 ; i < np ; ++i ) {
+ *    const int recv_processor = ( my_p + i ) % np ;
+ *    const int recv_ordinal_begin = map_recv_pc[i];
+ *    const int recv_ordinal_end   = map_recv_pc[i+1];
+ *  }
+ *
+ *  for ( i = 1 ; i < np ; ++i ) {
+ *    const int send_processor = ( my_p + i ) % np ;
+ *    const int send_map_begin = map_send_pc[i];
+ *    const int send_map_end   = map_send_pc[i+1];
+ *    for ( j = send_map_begin ; j < send_map_end ; ++j ) {
+ *      send_ordinal = map_send_id[j] ;
+ *    }
+ *  }
+ */
+
+
+void box_partition_rcb(
+  const int np              /**< [in] Number of partitions */ ,
+  const int root_box[3][2]  /**< [in] Global 3D box to partition  */ ,
+  int       pbox[][3][2]    /**< [out] Partition of global 3D boxes */ );
+
+void box_partition_map(
+  const int np            /**< [in] Number of partitions */ ,
+  const int my_p          /**< [in] My partition */ ,
+  const int gbox[3][2]    /**< [in] Global 3D box */ ,
+  const int pbox[][3][2]  /**< [in] Partitions of global 3D box */ ,
+  const int ghost         /**< [in] Number of grid points to ghost */ ,
+
+  int    map_uses_box[3][2]  /**< [out] Local box expanded by ghosting */ ,
+  int    map_local_id[]      /**< [out] Mapping for local points */ ,
+  int *  map_count_interior  /**< [out] Number of my interior points */ ,
+  int *  map_count_owns      /**< [out] Number of points I own */ ,
+  int *  map_count_uses      /**< [out] Number of points I access */ ,
+  int ** map_recv_pc         /**< [out] Received prefix spans per process */ ,
+  int ** map_send_pc         /**< [out] Send prefix counts per process */ ,
+  int ** map_send_id         /**< [out] Send grid points */ );
+
+/** \brief  Map a global (x,y,z) to a local ordinal.  */
+int box_map_local( const int local_uses[3][2] ,
+                   const int map_local_id[] ,
+                   const int global_x ,
+                   const int global_y ,
+                   const int global_z );
+
+#endif
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/test/hhpccg/CGSolver.c b/openmp-avx512/basic/optional/ThreadPool/test/hhpccg/CGSolver.c
new file mode 100644
index 0000000..55f739d
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/test/hhpccg/CGSolver.c
@@ -0,0 +1,311 @@
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <ThreadPool_config.h>
+#include <TPI.h>
+#include <tpi_vector.h>
+#include <CGSolver.h>
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+/*--------------------------------------------------------------------*/
+
+void cgsolve_set_lhs( const struct distributed_crs_matrix * const matrix ,
+                      const VECTOR_SCALAR * const x ,
+                            VECTOR_SCALAR * const b )
+{
+  /* b = matrix * x, using a scratch copy of 'x' that dcrs_apply may write. */
+  const int n_owned   = matrix->n_local_row ;
+  const int n_scratch = matrix->p_recv_pc[ matrix->p_size ] ;
+  const size_t n_bytes = n_scratch * sizeof(VECTOR_SCALAR) ;
+
+  VECTOR_SCALAR * const scratch = (VECTOR_SCALAR *) malloc( n_bytes );
+
+  tpi_copy( n_owned , x , scratch );
+  dcrs_apply( matrix , scratch , b );
+
+  free( scratch );
+}
+
+/*--------------------------------------------------------------------*/
+
+/*  x += alpha * p ;
+ *  r -= alpha * Ap ;
+ *  return dot( r , r );
+ */
+static
+double cgsolver_update( const int length ,
+                        const VECTOR_SCALAR alpha ,
+                        const VECTOR_SCALAR * p ,
+                        const VECTOR_SCALAR * Ap ,
+                              VECTOR_SCALAR * x ,
+                              VECTOR_SCALAR * r );
+
+/*--------------------------------------------------------------------*/
+
+void cgsolve_blas( const struct distributed_crs_matrix * matrix ,
+                   const VECTOR_SCALAR * const b ,          /* [in] right-hand side */
+                         VECTOR_SCALAR * const x ,          /* [in,out] initial guess, updated in place */
+                   const VECTOR_SCALAR tolerance ,          /* [in] loop stops once dot(r,r) <= tolerance^2 */
+                   const int max_iter ,                     /* [in] upper bound on loop iterations */
+                   const int print_iter ,                   /* [in] not referenced in this function */
+                         int    * const iter_count ,        /* [out] iterations performed */
+                         VECTOR_SCALAR * const norm_resid , /* [out] sqrt( dot(r,r) ) at exit */
+                         double * const solve_dt )          /* [in,out] loop wall time is added to it */
+{ /* Conjugate-gradient loop written with one tpi_* / dcrs_apply call per vector operation. */
+  const int nRow = matrix->n_local_row ;
+  const int nVec = matrix->p_recv_pc[ matrix->p_size ] ; /* length allocated for 'p' */
+
+  const VECTOR_SCALAR tol_2 = tolerance * tolerance ; /* compared against rtrans = dot(r,r) */
+
+  VECTOR_SCALAR * const r  =
+    (VECTOR_SCALAR *) malloc( nRow * sizeof(VECTOR_SCALAR) );
+  VECTOR_SCALAR * const p  =
+    (VECTOR_SCALAR *) malloc( nVec * sizeof(VECTOR_SCALAR) );
+  VECTOR_SCALAR * const Ap =
+    (VECTOR_SCALAR *) malloc( nRow * sizeof(VECTOR_SCALAR) );
+  /* NOTE(review): the three malloc results above are used without a NULL check. */
+  VECTOR_SCALAR rtrans = 0.0 ;
+  VECTOR_SCALAR beta = 0.0 ;
+  VECTOR_SCALAR pAp = 0.0 ;
+  VECTOR_SCALAR alpha ;
+  double time_begin , time_end ;
+
+  int k ;
+
+  tpi_copy( nRow , b , r );
+  tpi_copy( nRow , x , p );
+
+  /*  Ap = matrix * p ;  ( p holds a copy of the initial x here ) */
+  dcrs_apply( matrix , p , Ap );
+
+  /*  r -= Ap ;  ( so r = b - matrix * x ) */
+  tpi_axpy( nRow , -1.0 , Ap , r );
+
+  rtrans = tpi_dot( nRow , r , r );
+
+  time_begin = TPI_Walltime(); /* the setup above is outside the timed region */
+
+  for ( k = 0 ; k < max_iter && tol_2 < rtrans ; ++k ) {
+
+    /*  p = r + beta * p ;  ( beta is 0 on the first pass, so p = r ) */
+    tpi_xpby( nRow, r, beta, p ); /* parallel */
+
+    dcrs_apply( matrix , p , Ap );
+
+    pAp = tpi_dot( nRow , p , Ap );
+
+    /* If orthogonal then cannot update */
+    alpha = 0 < fabs( pAp ) ? rtrans / pAp : 0.0 ;
+
+    /*  x += alpha * p ;
+     *  r -= alpha * Ap ;
+     *  rtrans = dot( r , r ) ;  beta = new rtrans / old rtrans ;
+     */
+    beta = rtrans ; /* holds the old rtrans until the division below */
+
+    tpi_axpy( nRow ,  alpha , p , x );
+    tpi_axpy( nRow , -alpha , Ap , r );
+    rtrans = tpi_dot( nRow , r , r );
+    beta = rtrans / beta ;
+  }
+
+  time_end = TPI_Walltime();
+
+#ifdef HAVE_MPI
+  { /* Use the earliest begin and the latest end over all ranks. */
+    double tb = time_begin ;
+    double te = time_end ;
+    MPI_Allreduce(&tb, &time_begin, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
+    MPI_Allreduce(&te, &time_end,   1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+  }
+#endif
+
+  *solve_dt += time_end - time_begin ;
+
+  *norm_resid = sqrt( rtrans );
+  *iter_count = k ;
+
+  free( Ap );
+  free( p );
+  free( r );
+}
+
+/*--------------------------------------------------------------------*/
+/*--------------------------------------------------------------------*/
+
+void cgsolve( const struct distributed_crs_matrix * matrix ,
+              const VECTOR_SCALAR * const b ,          /* [in] right-hand side */
+                    VECTOR_SCALAR * const x ,          /* [in,out] initial guess, updated in place */
+              const int overlap_comm ,                 /* [in] forwarded to dcrs_apply_and_dot */
+              const VECTOR_SCALAR tolerance ,          /* [in] loop stops once dot(r,r) <= tolerance^2 */
+              const int max_iter ,                     /* [in] upper bound on loop iterations */
+              const int print_iter ,                   /* [in] not referenced in this function */
+                    int    * const iter_count ,        /* [out] iterations performed */
+                    VECTOR_SCALAR * const norm_resid , /* [out] sqrt( dot(r,r) ) at exit */
+                    double * const solve_dt )          /* [in,out] loop wall time is added to it */
+{ /* Conjugate-gradient loop using the fused kernels dcrs_apply_and_dot and cgsolver_update. */
+  const int nRow = matrix->n_local_row ;
+  const int nVec = matrix->p_recv_pc[ matrix->p_size ] ; /* length allocated for 'p' */
+
+  const VECTOR_SCALAR tol_2 = tolerance * tolerance ; /* compared against rtrans = dot(r,r) */
+
+  VECTOR_SCALAR * const r  =
+    (VECTOR_SCALAR *) malloc( nRow * sizeof(VECTOR_SCALAR) );
+  VECTOR_SCALAR * const p  =
+    (VECTOR_SCALAR *) malloc( nVec * sizeof(VECTOR_SCALAR) );
+  VECTOR_SCALAR * const Ap =
+    (VECTOR_SCALAR *) malloc( nRow * sizeof(VECTOR_SCALAR) );
+  /* NOTE(review): the three malloc results above are used without a NULL check. */
+  VECTOR_SCALAR rtrans = 0.0 ;
+  VECTOR_SCALAR beta = 0.0 ;
+  VECTOR_SCALAR pAp = 0.0 ;
+  VECTOR_SCALAR alpha ;
+  double time_begin , time_end ;
+
+  int k ;
+
+  tpi_copy( nRow , b , r );
+  tpi_copy( nRow , x , p );
+
+  /*  gather off-processor components of 'p'.
+   *  Ap = matrix * p ;
+   *  The returned dot( Ap , p ) is overwritten in the loop before it is read.
+   */
+  pAp = dcrs_apply_and_dot( matrix , p , Ap , overlap_comm );
+
+  /*  r -= 1.0 * Ap ;  ( NULL for 'x' skips the x update )
+   *  return dot( r , r );
+   */
+  alpha = 1.0 ;
+  rtrans = cgsolver_update( nRow, alpha, NULL, Ap, NULL, r ); /* parallel */
+
+  time_begin = TPI_Walltime(); /* the setup above is outside the timed region */
+
+  for ( k = 0 ; k < max_iter && tol_2 < rtrans ; ++k ) {
+
+    /*  p = r + beta * p ;  ( beta is 0 on the first pass, so p = r ) */
+    tpi_xpby( nRow, r, beta, p ); /* parallel */
+
+    /*  gather off-processor components of 'p'.
+     *  Ap = matrix * p ;
+     *  return dot( Ap , p );
+     */
+    pAp = dcrs_apply_and_dot( matrix , p , Ap , overlap_comm ); /* parallel */
+
+    /* If orthogonal then cannot update */
+    alpha = 0 < fabs( pAp ) ? rtrans / pAp : 0.0 ;
+
+    /*  x += alpha * p ;
+     *  r -= alpha * Ap ;
+     *  return dot( r , r );
+     */
+    beta = rtrans ; /* holds the old rtrans until the division below */
+    rtrans = cgsolver_update( nRow , alpha , p , Ap , x , r ); /* parallel */
+    beta = rtrans / beta ;
+  }
+
+  time_end = TPI_Walltime();
+
+#ifdef HAVE_MPI
+  { /* Use the earliest begin and the latest end over all ranks. */
+    double tb = time_begin ;
+    double te = time_end ;
+    MPI_Allreduce(&tb, &time_begin, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
+    MPI_Allreduce(&te, &time_end,   1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+  }
+#endif
+
+  *solve_dt += time_end - time_begin ;
+
+  *norm_resid = sqrt( rtrans );
+  *iter_count = k ;
+
+  free( Ap );
+  free( p );
+  free( r );
+}
+
+/*--------------------------------------------------------------------*/
+
+struct tpi_work_cgsolve {
+  const VECTOR_SCALAR * p ;   /* read only; scaled by alpha and added into x */
+  const VECTOR_SCALAR * Ap ;  /* read only; scaled by alpha and subtracted from r */
+        VECTOR_SCALAR * x ;   /* updated in place; NULL skips the x update */
+        VECTOR_SCALAR * r ;   /* updated in place */
+        VECTOR_SCALAR alpha ; /* scale factor shared by both updates */
+  int length ;                /* entry count handed to tpi_work_span */
+};
+
+static void tpi_work_dot_join( TPI_Work * work , const void * src  ) /* reduce += src */
+{ double * const sum = (double *) work->reduce ; *sum += *((const double *) src); }
+ 
+static void tpi_work_dot_init( TPI_Work * work ) /* reduce = 0 */
+{ double * const sum = (double *) work->reduce ; *sum = 0 ; }
+
+static void tpi_work_update( TPI_Work * work )
+{
+  /* Per-thread slice of: x += alpha * p ; r -= alpha * Ap ; reduce = dot(r,r) */
+  const struct tpi_work_cgsolve * const args =
+    (const struct tpi_work_cgsolve *) work->info ;
+
+  const VECTOR_SCALAR         scale = args->alpha ;
+  const VECTOR_SCALAR * const dir   = args->p ;
+  const VECTOR_SCALAR * const Adir  = args->Ap ;
+        VECTOR_SCALAR * const soln  = args->x ;
+        VECTOR_SCALAR * const resid = args->r ;
+
+  double sum_sq = 0 ;
+  int first , last , k ;
+
+  tpi_work_span( work , args->length , & first , & last );
+
+  /* 'x' is optional: a NULL pointer means only 'r' is updated. */
+  if ( soln ) { for ( k = first ; k < last ; ++k ) { soln[k] += scale * dir[k]; } }
+
+  for ( k = first ; k < last ; ++k ) {
+    resid[k] -= scale * Adir[k] ;
+    sum_sq += resid[k] * resid[k] ;
+  }
+  *((double*) work->reduce ) = sum_sq ;
+}
+
+double cgsolver_update( const int length ,
+                        const VECTOR_SCALAR alpha ,
+                        const VECTOR_SCALAR * p ,
+                        const VECTOR_SCALAR * Ap ,
+                              VECTOR_SCALAR * x ,
+                              VECTOR_SCALAR * r )
+{
+  /* Runs tpi_work_update on the thread pool and returns the summed result. */
+  struct tpi_work_cgsolve args ;
+  double dot_rr = 0.0 ;
+
+  args.p      = p ;
+  args.Ap     = Ap ;
+  args.x      = x ;
+  args.r      = r ;
+  args.alpha  = alpha ;
+  args.length = length ;
+
+  TPI_Run_threads_reduce( tpi_work_update , & args ,
+                          tpi_work_dot_join , tpi_work_dot_init ,
+                          sizeof(dot_rr) , & dot_rr );
+
+#ifdef HAVE_MPI
+  { /* Combine the per-process sums across MPI_COMM_WORLD. */
+    double partial = dot_rr ;
+    MPI_Allreduce( & partial, & dot_rr, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );
+  }
+#endif
+
+  return dot_rr ;
+}
+
+/*--------------------------------------------------------------------*/
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/test/hhpccg/CGSolver.h b/openmp-avx512/basic/optional/ThreadPool/test/hhpccg/CGSolver.h
new file mode 100644
index 0000000..f0ee6f6
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/test/hhpccg/CGSolver.h
@@ -0,0 +1,40 @@
+
+#ifndef CGSolver_h
+#define CGSolver_h
+
+#include <tpi_vector.h>
+#include <dcrs_matrix.h>
+
+/*--------------------------------------------------------------------*/
+
+void cgsolve_set_lhs( const struct distributed_crs_matrix * matrix ,
+                      const VECTOR_SCALAR * const x ,
+                            VECTOR_SCALAR * const b );
+
+/* Solve with fused loops */
+void cgsolve( const struct distributed_crs_matrix * matrix ,
+              const VECTOR_SCALAR * const b ,
+                    VECTOR_SCALAR * const x ,
+              const int overlap_comm ,
+              const VECTOR_SCALAR tolerance ,
+              const int max_iter ,
+              const int print_iter ,
+                    int    * const iter_count ,
+                    VECTOR_SCALAR * const norm_resid ,
+                    double * const solve_dt );
+
+/* Solve with blas-like calls */
+void cgsolve_blas( const struct distributed_crs_matrix * matrix ,
+                   const VECTOR_SCALAR * const b ,
+                         VECTOR_SCALAR * const x ,
+                   const VECTOR_SCALAR tolerance ,
+                   const int max_iter ,
+                   const int print_iter ,
+                         int    * const iter_count ,
+                         VECTOR_SCALAR * const norm_resid ,
+                         double * const solve_dt );
+
+/*--------------------------------------------------------------------*/
+
+#endif
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/test/hhpccg/CMakeLists.txt b/openmp-avx512/basic/optional/ThreadPool/test/hhpccg/CMakeLists.txt
new file mode 100644
index 0000000..0c652cd
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/test/hhpccg/CMakeLists.txt
@@ -0,0 +1,83 @@
+
+INCLUDE(PackageAddExecutableAndTest)
+INCLUDE(PackageLibraryMacros)
+
+####################
+
+SET(HEADERS "")
+SET(SOURCES "")
+ 
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+ 
+SET(HEADERS ${HEADERS}
+  ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}_config.h
+  )
+ 
+INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR})
+ 
+APPEND_SET(HEADERS
+  BoxPartition.h
+  CGSolver.h
+  tpi_vector.h
+  dcrs_matrix.h
+  )
+ 
+####################
+
+
+PACKAGE_ADD_EXECUTABLE(
+  test_tpi_hhpccg
+  COMM serial mpi
+  SOURCES main.c CGSolver.c BoxPartitionIB.c tpi_vector.c dcrs_matrix.c
+  DEPLIBS pthread m
+  DIRECTORY .
+  )
+
+PACKAGE_ADD_TEST(
+  test_tpi_hhpccg
+  NAME test_tpi_hhpccg_serial_1
+  COMM serial
+  DIRECTORY .
+  )
+
+PACKAGE_ADD_TEST(
+  test_tpi_hhpccg
+  NAME test_tpi_hhpccg_serial_2
+  COMM serial
+  ARGS "threads=2"
+  DIRECTORY .
+  )
+
+PACKAGE_ADD_TEST(
+  test_tpi_hhpccg
+  NAME test_tpi_hhpccg_serial_4
+  COMM serial
+  ARGS "threads=4"
+  DIRECTORY .
+  )
+
+PACKAGE_ADD_TEST(
+  test_tpi_hhpccg
+  NAME test_tpi_hhpccg_mpi_1
+  COMM mpi
+  NUM_MPI_PROCS 1
+  DIRECTORY .
+  )
+
+PACKAGE_ADD_TEST(
+  test_tpi_hhpccg
+  NAME test_tpi_hhpccg_mpi_2
+  COMM mpi
+  NUM_MPI_PROCS 2
+  DIRECTORY .
+  )
+
+PACKAGE_ADD_TEST(
+  test_tpi_hhpccg
+  NAME test_tpi_hhpccg_mpi_4
+  COMM mpi
+  NUM_MPI_PROCS 4
+  DIRECTORY .
+  )
+
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/test/hhpccg/dcrs_matrix.c b/openmp-avx512/basic/optional/ThreadPool/test/hhpccg/dcrs_matrix.c
new file mode 100644
index 0000000..d61404f
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/test/hhpccg/dcrs_matrix.c
@@ -0,0 +1,314 @@
+
+#include <stdlib.h>
+#include <math.h>
+
+#include <ThreadPool_config.h>
+#include <TPI.h>
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+#include <dcrs_matrix.h>
+
+/*--------------------------------------------------------------------*/
+/*--------------------------------------------------------------------*/
+
+#if ! defined( HAVE_MPI )
+
+static
+double comm_sum( double v ) { return v ; } /* built without MPI: the value is returned unchanged */
+
+#define get_off_process_entries( M , V )  /* */
+
+/*--------------------------------------------------------------------*/
+#else /* defined( HAVE_MPI ) */
+/*--------------------------------------------------------------------*/
+
+static
+double comm_sum( double v )
+{
+  double total = 0 ; /* receives the sum of 'v' over MPI_COMM_WORLD */
+  MPI_Allreduce( & v , & total , 1 , MPI_DOUBLE , MPI_SUM , MPI_COMM_WORLD );
+  return total ;
+}
+
+static
+void get_off_process_entries(
+  const struct distributed_crs_matrix * const matrix ,
+  VECTOR_SCALAR * const vec )
+{ /* Exchange entries of 'vec' with other ranks: receive into vec, send vec[ send_id[] ]. */
+  const int np   = matrix->p_size ;
+  const int my_p = matrix->p_rank ;
+  const int * const recv_pc = matrix->p_recv_pc ;
+  const int * const send_pc = matrix->p_send_pc ;
+  const int * const send_id = matrix->p_send_id ;
+  int i , irecv ;
+  /* Pass 1: count the non-empty receive spans so the request arrays can be sized. */
+  for ( irecv = 0 , i = 1 ; i < np ; ++i ) {
+    if ( recv_pc[i] < recv_pc[i+1] ) ++irecv ;
+  }
+
+  {
+    VECTOR_SCALAR * const send_buf =
+      (VECTOR_SCALAR *) malloc( sizeof(VECTOR_SCALAR) * send_pc[np] );
+
+    MPI_Request * const recv_request =
+      (MPI_Request *) malloc( sizeof(MPI_Request) * irecv );
+
+    MPI_Status * const recv_status =
+      (MPI_Status *) malloc( sizeof(MPI_Status) * irecv );
+    /* Pass 2: post one non-blocking receive per non-empty span, straight into 'vec'. */
+    for ( irecv = 0 , i = 1 ; i < np ; ++i ) {
+      const int ip = ( i + my_p ) % np ; /* span i is exchanged with rank (i + my_p) % np */
+      const int recv_beg    = recv_pc[i];
+      const int recv_length = recv_pc[i+1] - recv_beg ;
+      if ( recv_length ) {
+        MPI_Irecv( vec + recv_beg ,
+                   recv_length * sizeof(VECTOR_SCALAR), MPI_BYTE ,
+                   ip , 0 , MPI_COMM_WORLD , recv_request + irecv );
+        ++irecv ;
+      }
+    }
+
+    /* Gather components into send buffer */
+
+    for ( i = 0 ; i < send_pc[np] ; ++i ) {
+      send_buf[i] = vec[ send_id[i] ];
+    }
+    /* Every rank posts its receives before this barrier, so the ready-mode sends below find them posted. */
+    MPI_Barrier( MPI_COMM_WORLD );
+
+    for ( i = 1 ; i < np ; ++i ) {
+      const int ip = ( i + my_p ) % np ;
+      const int send_beg    = send_pc[i];
+      const int send_length = send_pc[i+1] - send_beg ;
+      if ( send_length ) { /* Send span 'i' to rank 'ip' */
+        MPI_Rsend( send_buf + send_beg ,
+                   send_length * sizeof(VECTOR_SCALAR), MPI_BYTE ,
+                   ip , 0 , MPI_COMM_WORLD );
+      }
+    }
+    /* Block until every posted receive has completed. */
+    MPI_Waitall( irecv , recv_request , recv_status );
+
+    free( recv_status );
+    free( recv_request );
+    free( send_buf );
+  }
+}
+
+#endif
+
+/*--------------------------------------------------------------------*/
+/*--------------------------------------------------------------------*/
+
+static void dcrs_apply_and_dot_span(
+  const struct distributed_crs_matrix * const matrix ,
+  const int span_begin ,
+  const int span_end ,
+  const VECTOR_SCALAR * const x ,
+        VECTOR_SCALAR * const y ,
+        double        * const result )
+{
+  /* For rows [span_begin,span_end): y[row] = sum_k A_a[k] * x[ A_ia[k] ]
+   * over k in [ A_pc[row] , A_pc[row+1] ), and *result += x[row] * y[row].
+   */
+  const int           * const row_offset = matrix->A_pc ;
+  const int           * const column     = matrix->A_ia ;
+  const MATRIX_SCALAR * const coeff      = matrix->A_a ;
+
+  double partial_dot = *result ;
+  int irow , k ;
+
+  for ( irow = span_begin ; irow < span_end ; ++irow ) {
+    const int k_end = row_offset[ irow + 1 ];
+
+    VECTOR_SCALAR row_sum = 0 ;
+
+    for ( k = row_offset[ irow ] ; k != k_end ; ++k ) {
+      row_sum += coeff[k] * x[ column[k] ];
+    }
+
+    partial_dot += x[ irow ] * row_sum ;
+    y[ irow ] = row_sum ;
+  }
+
+  /* Publish the accumulated value once, after the whole span. */
+  *result = partial_dot ;
+}
+
+static void dcrs_apply_span(
+  const struct distributed_crs_matrix * const matrix ,
+  const int span_begin ,
+  const int span_end ,
+  const VECTOR_SCALAR * const x ,
+        VECTOR_SCALAR * const y )
+{
+  /* For rows [span_begin,span_end): y[row] = sum_k A_a[k] * x[ A_ia[k] ]
+   * over k in [ A_pc[row] , A_pc[row+1] ).
+   */
+  const int           * const row_offset = matrix->A_pc ;
+  const int           * const column     = matrix->A_ia ;
+  const MATRIX_SCALAR * const coeff      = matrix->A_a ;
+
+  int irow , k ;
+
+  for ( irow = span_begin ; irow < span_end ; ++irow ) {
+    const int k_end = row_offset[ irow + 1 ];
+
+    VECTOR_SCALAR row_sum = 0 ;
+
+    for ( k = row_offset[ irow ] ; k != k_end ; ++k ) {
+      row_sum += coeff[k] * x[ column[k] ];
+    }
+
+    y[ irow ] = row_sum ;
+  }
+}
+
+static void work_span( const int count , const int rank ,
+                       int * jBeg , int * jEnd )
+{
+  /* Narrow [*jBeg,*jEnd) to the rank-th of 'count' ceil-sized chunks. */
+  const int base      = *jBeg ;
+  const int total     = *jEnd - base ;
+  const int per_rank  = ( total + count - 1 ) / count ;
+  const int lo        = per_rank * rank ;
+  const int hi_unclip = lo + per_rank ;
+
+  *jEnd = base + ( total < hi_unclip ? total : hi_unclip );
+  *jBeg = base + lo ;
+}
+
+/*--------------------------------------------------------------------*/
+/*--------------------------------------------------------------------*/
+
+static void tpi_work_dot_join( TPI_Work * work , const void * src  ) /* reduce += src */
+{ double * const sum = (double *) ( work->reduce) ; *sum += *((const double *) src); }
+
+static void tpi_work_dot_init( TPI_Work * work ) /* reduce = 0 */
+{ double * const sum = (double *) ( work->reduce) ; *sum = 0 ; }
+
+/*--------------------------------------------------------------------*/
+
+struct work_dcrs {
+  const struct distributed_crs_matrix * matrix ; /* matrix whose rows are applied */
+  const VECTOR_SCALAR * x ;  /* input vector, only read by the worker functions */
+        VECTOR_SCALAR * y ;  /* output vector, written at the processed row indices */
+  int   jBeg ;               /* first row of the range that work_span splits among threads */
+  int   jEnd ;               /* one past the last row of that range */
+};
+
+/*--------------------------------------------------------------------*/
+
+static void tpi_work_dcrs_apply_and_dot( TPI_Work * work )
+{
+  /* Thread entry: take this rank's share of [jBeg,jEnd) and process it. */
+  const struct work_dcrs * const job = (const struct work_dcrs *) work->info ;
+  double * const thread_sum = (double *) work->reduce ;
+
+  int row_first = job->jBeg ;
+  int row_last  = job->jEnd ;
+
+  work_span( work->count , work->rank , & row_first , & row_last );
+  dcrs_apply_and_dot_span( job->matrix, row_first, row_last, job->x, job->y, thread_sum );
+}
+
+double dcrs_apply_and_dot(
+  const struct distributed_crs_matrix * matrix ,
+  VECTOR_SCALAR * x ,               /* [in,out] get_off_process_entries may write into it */
+  VECTOR_SCALAR * y ,               /* [out] y[row] for rows [0,n_local_row) */
+  const int overlap_communication ) /* [in] non-zero: run internal rows during the exchange */
+{
+  struct work_dcrs info ;
+
+  double result = 0.0 ; /* accumulated sum of x[row] * y[row] */
+
+  info.matrix = matrix ;
+  info.x      = x ;
+  info.y      = y ;
+  /* The overlapped path is taken only when some rows lie beyond the internal ones. */
+  if ( overlap_communication &&
+       matrix->n_internal_row < matrix->n_local_row ) {
+
+    double remote_result = 0 ;
+
+    /* Start the internal matrix-vector multiply */
+    /* result += dot( output = A * input , input ); */
+
+    info.jBeg = 0 ;
+    info.jEnd = matrix->n_internal_row ;
+
+    /*  Divide internal work evenly among worker threads.
+     *  This leaves the primary thread completely out of the computation.
+     */
+    TPI_Start_threads_reduce( tpi_work_dcrs_apply_and_dot , & info , 
+                              tpi_work_dot_join ,
+                              tpi_work_dot_init ,
+                              sizeof(result) , & result );
+    /* Runs on the calling thread between the start above and the wait below. */
+    get_off_process_entries( matrix , x );
+
+    TPI_Wait(); /* Wait for internal result */
+    /* Remaining rows [n_internal_row,n_local_row) are applied after the exchange. */
+    info.jBeg = matrix->n_internal_row ;
+    info.jEnd = matrix->n_local_row ;
+
+    TPI_Run_threads_reduce( tpi_work_dcrs_apply_and_dot , & info , 
+                            tpi_work_dot_join ,
+                            tpi_work_dot_init ,
+                            sizeof(remote_result) , & remote_result );
+
+    result += remote_result ;
+  }
+  else { /* No overlap: finish the exchange first, then apply every local row. */
+    info.jBeg = 0 ;
+    info.jEnd = matrix->n_local_row ;
+
+    get_off_process_entries( matrix , x );
+
+    TPI_Run_threads_reduce( tpi_work_dcrs_apply_and_dot , & info , 
+                            tpi_work_dot_join ,
+                            tpi_work_dot_init ,
+                            sizeof(result) , & result );
+  }
+
+  result = comm_sum( result ); /* with MPI: summed over all ranks; otherwise unchanged */
+
+  return result ;
+}
+
+/*--------------------------------------------------------------------*/
+
+static void tpi_work_dcrs_apply( TPI_Work * work )
+{
+  /* Thread entry: take this rank's share of [jBeg,jEnd) and process it. */
+  const struct work_dcrs * const job = (const struct work_dcrs *) work->info ;
+
+  int row_first = job->jBeg ;
+  int row_last  = job->jEnd ;
+
+  work_span( work->count , work->rank , & row_first , & row_last );
+
+  dcrs_apply_span( job->matrix , row_first , row_last , job->x , job->y );
+}
+
+void dcrs_apply(
+  const struct distributed_crs_matrix * matrix ,
+  VECTOR_SCALAR * x ,
+  VECTOR_SCALAR * y )
+{
+  struct work_dcrs job ;
+
+  /* Bring in the off-process part of 'x' before any row is applied. */
+  get_off_process_entries( matrix , x );
+
+  job.matrix = matrix ;
+  job.x      = x ;
+  job.y      = y ;
+  job.jBeg   = 0 ;
+  job.jEnd   = matrix->n_local_row ;
+  TPI_Run_threads( tpi_work_dcrs_apply , & job , 0 );
+}
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/test/hhpccg/dcrs_matrix.h b/openmp-avx512/basic/optional/ThreadPool/test/hhpccg/dcrs_matrix.h
new file mode 100644
index 0000000..61f2032
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/test/hhpccg/dcrs_matrix.h
@@ -0,0 +1,41 @@
+
+#ifndef dcrs_matrix_h
+#define dcrs_matrix_h
+
+#include <tpi_vector.h>
+
+struct distributed_crs_matrix {
+  /* Global parallel */
+  int   p_size ;    /* number of processes (np in the array extents below) */
+  int   p_rank ;    /* rank of this process */
+  int * p_recv_pc ; /* [np+1], span of received off-processor elements */
+  int * p_send_pc ; /* [np+1], span of sent off-processor elements */
+  int * p_send_id ; /* [send_pc[np]], indices of sent elements */
+
+  /* Local and local parallel */
+  int   n_local_column ; /* Number of local columns */
+  int   n_local_row ;    /* Number of local rows */
+  int   n_internal_row ; /* Number of local rows with internal columns */
+  int * A_pc ;           /* Row r's entries are A_ia / A_a [ A_pc[r] , A_pc[r+1] ) */
+  int * A_ia ;           /* Column index of each stored entry; indexes the input vector */
+  MATRIX_SCALAR * A_a ;  /* Coefficient of each stored entry, parallel to A_ia */
+};
+
+/*  1) communicate off-processor portions of input.
+ *  2) apply: output = matrix * input ;
+ *  3) return: dot( output , input );
+ */
+double dcrs_apply_and_dot( const struct distributed_crs_matrix * matrix ,
+                           VECTOR_SCALAR * input ,
+                           VECTOR_SCALAR * output ,
+                           const int overlap_communication );
+
+/*  1) communicate off-processor portions of input.
+ *  2) apply: output = matrix * input ;
+ */
+void dcrs_apply( const struct distributed_crs_matrix * matrix ,
+                 VECTOR_SCALAR * input ,
+                 VECTOR_SCALAR * output );
+
+#endif
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/test/hhpccg/main.c b/openmp-avx512/basic/optional/ThreadPool/test/hhpccg/main.c
new file mode 100644
index 0000000..57bb80a
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/test/hhpccg/main.c
@@ -0,0 +1,422 @@
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <ThreadPool_config.h>
+#include <TPI.h>
+#include <BoxPartitionIB.h>
+#include <dcrs_matrix.h>
+#include <CGSolver.h>
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+/*--------------------------------------------------------------------*/
+static
+void hpccg_alloc_and_fill( const int np ,
+                           const int my_p ,
+                           const int gbox[][2] ,
+                           const int ghost ,
+                           struct distributed_crs_matrix * const matrix );
+
+/*--------------------------------------------------------------------*/
+
+int main( int argc , char ** argv )
+{
+  const int ghost = 1 ;
+  const int max_cube = 20 ; /* must match the extent of ncube[] */
+  int ncube[20] = { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
+                    0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 };
+
+  FILE * print_file = stdout ;
+  int print_iter = 500 ;
+  int max_iter = 50 ;
+  int overlap_comm = 0 ;
+
+  float tolerance = 0.0 ; /* Force max iterations */
+
+  int gbox[3][2] = { { 0 , 16 } , { 0 , 16 } , { 0 , 16 } };
+  int nt = 0 ;       /* thread count; 0 means TPI_Init is never called */
+  int trials = 6 ;   /* timed repetitions of each solver */
+  int ntest ;
+  int np = 1;
+  int my_p = 0 ;
+  /* Flow: init MPI, parse arguments on rank 0, share them, then time both solvers per size. */
+#ifdef HAVE_MPI
+  MPI_Init( & argc , & argv );
+  MPI_Comm_size( MPI_COMM_WORLD , & np );
+  MPI_Comm_rank( MPI_COMM_WORLD , & my_p );
+#endif
+  /* Only rank 0 parses the "name=value" arguments; with MPI the values are broadcast below. */
+  if ( ! my_p ) {
+    const char arg_threads[] = "threads=" ;
+    const char arg_cube[] = "cube=" ;
+    const char arg_box[] = "box=" ;
+    const char arg_max[] = "max_iter=" ;
+    const char arg_trials[] = "trials=" ;
+    const char arg_print[] = "print_iter=" ;
+    const char arg_file[] = "print_file=" ;
+    const char arg_comm[] = "overlap_comm=" ;
+    const char arg_tolerance[] = "tolerance=" ;
+    int i ;
+    for ( i = 1 ; i < argc ; ++i ) {
+      if ( ! strncmp(argv[i],arg_threads,strlen(arg_threads)) ) {
+        sscanf(argv[i]+strlen(arg_threads),"%d",&nt);
+      }
+      else if ( ! strncmp(argv[i],arg_box,strlen(arg_box)) ) {
+        sscanf(argv[i]+strlen(arg_box),"%d%*[x]%d%*[x]%d",
+               & gbox[0][1] , & gbox[1][1] , & gbox[2][1] );
+      }
+      else if ( ! strncmp(argv[i],arg_cube,strlen(arg_cube)) ) {
+        sscanf(argv[i]+strlen(arg_cube),
+               "%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d",
+               ncube+0, ncube+1, ncube+2, ncube+3, ncube+4,
+               ncube+5, ncube+6, ncube+7, ncube+8, ncube+9,
+               ncube+10, ncube+11, ncube+12, ncube+13, ncube+14,
+               ncube+15, ncube+16, ncube+17, ncube+18, ncube+19);
+      }
+      else if ( ! strncmp(argv[i],arg_max,strlen(arg_max)) ) {
+        sscanf(argv[i]+strlen(arg_max),"%d",&max_iter);
+      }
+      else if ( ! strncmp(argv[i],arg_trials,strlen(arg_trials)) ) {
+        sscanf(argv[i]+strlen(arg_trials),"%d",&trials);
+      }
+      else if ( ! strncmp(argv[i],arg_print,strlen(arg_print)) ) {
+        sscanf(argv[i]+strlen(arg_print),"%d",&print_iter);
+      }
+      else if ( ! strncmp(argv[i],arg_comm,strlen(arg_comm)) ) {
+        sscanf(argv[i]+strlen(arg_comm),"%d",&overlap_comm);
+      }
+      else if ( ! strncmp(argv[i],arg_tolerance,strlen(arg_tolerance)) ) {
+        sscanf(argv[i]+strlen(arg_tolerance),"%f",&tolerance);
+      }
+      else if ( ! strncmp(argv[i],arg_file,strlen(arg_file)) ) {
+        /* Open the name as given; keep the current stream if that fails. */
+        FILE * const named = fopen( argv[i] + strlen(arg_file) , "a" );
+        if ( named ) { print_file = named ; }
+      }
+    }
+  }
+
+#ifdef HAVE_MPI
+  {
+    MPI_Bcast( & nt , 1 , MPI_INT , 0 , MPI_COMM_WORLD );
+    MPI_Bcast( & gbox[0][0] , 6 , MPI_INT , 0 , MPI_COMM_WORLD );
+    MPI_Bcast( ncube , max_cube , MPI_INT , 0 , MPI_COMM_WORLD );
+    MPI_Bcast( & overlap_comm , 1 , MPI_INT , 0 , MPI_COMM_WORLD );
+    MPI_Bcast( & max_iter , 1 , MPI_INT , 0 , MPI_COMM_WORLD );
+    MPI_Bcast( & print_iter , 1 , MPI_INT , 0 , MPI_COMM_WORLD );
+    MPI_Bcast( & trials , 1 , MPI_INT , 0 , MPI_COMM_WORLD );
+    MPI_Bcast( & tolerance , 1 , MPI_FLOAT , 0 , MPI_COMM_WORLD );
+  }
+#endif
+
+  if ( nt ) {
+    TPI_Init( nt );
+    TPI_Block();
+    TPI_Unblock();
+  }
+
+  if ( ! my_p ) {
+    fprintf(print_file,"\"PROC\" , \"THREAD\" , \"EQUATION\" , \"NON-ZERO\" , \"FUSED-AVG\", \"FUSED-MAX\", \"BLAS-AVG\", \"BLAS-MAX\", \"FUSED\", \"BLAS\"  , \"Iter\"\n");
+    fprintf(print_file,"\"COUNT\", \"COUNT\"  , \"COUNT\"    , \"COUNT\"    , \"Mflops\"   , \"Mflops\"   , \"Mflops\"  , \"Mflops\"  , \"error\", \"error\" , \"COUNT\"\n");
+  }
+  /* The first pass always runs (gbox as given if ncube[0] is 0); later ones while ncube[ntest] != 0. */
+  for ( ntest = 0 ; ! ntest || ( ntest < max_cube && ncube[ntest] ) ; ++ntest ) {
+    struct distributed_crs_matrix matrix ;
+
+    if ( ncube[ntest] ) {
+      gbox[0][1] = gbox[1][1] = gbox[2][1] = ncube[ntest] ;
+    }
+
+    hpccg_alloc_and_fill( np, my_p, (const int (*)[2]) gbox, ghost, &matrix);
+
+    {
+      const int nRow = matrix.n_local_row ;
+
+      double solve_dt[2] = { 0 , 0 };      /* [0] sum over trials, [1] minimum */
+      double solve_blas_dt[2] = { 0 , 0 }; /* [0] sum over trials, [1] minimum */
+      VECTOR_SCALAR norm_resid = 0.0 ;
+      VECTOR_SCALAR norm_resid_blas = 0.0 ;
+      int iter_count = 0 ;
+      int iter_count_blas = 0 ;
+      int k ;
+
+      VECTOR_SCALAR * const b      = (VECTOR_SCALAR *) malloc( sizeof(VECTOR_SCALAR) * nRow );
+      VECTOR_SCALAR * const x      = (VECTOR_SCALAR *) malloc( sizeof(VECTOR_SCALAR) * nRow );
+      VECTOR_SCALAR * const x_blas = (VECTOR_SCALAR *) malloc( sizeof(VECTOR_SCALAR) * nRow );
+      VECTOR_SCALAR * const xexact = (VECTOR_SCALAR *) malloc( sizeof(VECTOR_SCALAR) * nRow );
+
+      {
+        const VECTOR_SCALAR value = 1.0 /* 1.0 / 3.0 */ ;
+        int i ;
+        for ( i = 0 ; i < nRow ; ++i ) xexact[i] = value ;
+      }
+      /* Timed trials of the blas-style solver, each starting from x_blas = 0. */
+      for ( k = 0 ; k < trials ; ++k ) {
+        double dt = 0 ;
+        int i ;
+
+        for ( i = 0 ; i < nRow ; ++i ) { x_blas[i] = 0.0 ; }
+
+        cgsolve_set_lhs( & matrix , xexact , b );
+
+        cgsolve_blas( & matrix, b, x_blas,
+                      tolerance , max_iter , print_iter ,
+                      & iter_count_blas, & norm_resid_blas, & dt );
+
+        solve_blas_dt[0] += dt ;
+        if ( ! k || dt < solve_blas_dt[1] ) { solve_blas_dt[1] = dt ; }
+      }
+      /* Same trial loop for the fused-kernel solver, each starting from x = 0. */
+      for ( k = 0 ; k < trials ; ++k ) {
+        double dt = 0 ;
+        int i ;
+
+        for ( i = 0 ; i < nRow ; ++i ) { x[i] = 0.0 ; }
+
+        cgsolve_set_lhs( & matrix , xexact , b );
+
+        cgsolve( & matrix, b, x, overlap_comm,
+                 tolerance , max_iter , print_iter ,
+                 & iter_count, & norm_resid, & dt );
+
+        solve_dt[0] += dt ;
+        if ( ! k || dt < solve_dt[1] ) { solve_dt[1] = dt ; }
+      }
+
+      {
+        int nnzGlobal = matrix.A_pc[ nRow ];
+        double error[3] = { 0 , 0 , 0 }; /* squared norms: xexact, x - xexact, x_blas - xexact */
+
+        for ( k = 0 ; k < nRow ; ++k ) {
+          error[0] += xexact[k] * xexact[k] ;
+          error[1] += ( x[k] - xexact[k] ) * ( x[k] - xexact[k] );
+          error[2] += ( x_blas[k] - xexact[k] ) * ( x_blas[k] - xexact[k] );
+        }
+
+#ifdef HAVE_MPI
+        {
+          double error_global[3] = { 0.0 , 0.0 , 0.0 };
+          int nnz = nnzGlobal ;
+
+          MPI_Allreduce( & nnz , & nnzGlobal , 1 , MPI_INT , MPI_SUM ,
+                         MPI_COMM_WORLD );
+
+          MPI_Allreduce( error , error_global , 3 , MPI_DOUBLE , MPI_SUM ,
+                         MPI_COMM_WORLD );
+
+          error[0] = error_global[0];
+          error[1] = error_global[1];
+          error[2] = error_global[2];
+        }
+#endif
+
+        error[0] = sqrt( error[0] );
+        error[1] = sqrt( error[1] );
+        error[2] = sqrt( error[2] );
+
+        if ( ! my_p ) {
+          const int nRowGlobal = ( gbox[0][1] - gbox[0][0] ) *
+                                 ( gbox[1][1] - gbox[1][0] ) *
+                                 ( gbox[2][1] - gbox[2][0] );
+          /* Times are scaled to microseconds so that flops / time comes out in Mflops. */
+          const double dt_mean_fuse_step = 1.0e6 * solve_dt[0]      / (double) trials ;
+          const double dt_mean_blas_step = 1.0e6 * solve_blas_dt[0] / (double) trials ;
+          const double dt_min_fuse_step  = 1.0e6 * solve_dt[1] ;
+          const double dt_min_blas_step  = 1.0e6 * solve_blas_dt[1] ;
+          /* Flop count of one iteration, formed in double so it cannot overflow int. */
+          const double Mflop_step = 2.0 * nnzGlobal
+                                  + 3.0 * 2 * nRowGlobal
+                                  + 2.0 * 2 * nRowGlobal ;
+
+          const double Mflop_mean_fuse = Mflop_step * iter_count / dt_mean_fuse_step ;
+          const double Mflop_mean_blas = Mflop_step * iter_count_blas / dt_mean_blas_step ;
+
+          const double Mflop_max_fuse = Mflop_step * iter_count / dt_min_fuse_step ;
+          const double Mflop_max_blas = Mflop_step * iter_count_blas / dt_min_blas_step ;
+
+          fprintf(print_file,"%8d , %8d , %8d , %8d , %10g , %10g , %10g , %10g , %10g , %10g , %d\n",
+                  np , nt , nRowGlobal , nnzGlobal ,
+                  Mflop_mean_fuse , Mflop_max_fuse ,
+                  Mflop_mean_blas , Mflop_max_blas ,
+                  error[1] / error[0] , error[2] / error[0] , iter_count );
+          fflush(print_file);
+        }
+      }
+
+      free( xexact );
+      free( x_blas );
+      free( x );
+      free( b );
+    }
+    free( matrix.A_a );
+    free( matrix.A_ia );
+    free( matrix.A_pc );
+    free( matrix.p_recv_pc );
+    free( matrix.p_send_pc );
+    free( matrix.p_send_id );
+  }
+
+  if ( nt ) { TPI_Finalize(); }
+
+#ifdef HAVE_MPI
+  MPI_Finalize();
+#endif
+
+  return 0 ;
+}
+
+/*--------------------------------------------------------------------*/
+/*--------------------------------------------------------------------*/
+/* Partition gbox across np ranks and build rank my_p's rows of the 27-point-stencil CRS matrix in *matrix. */
+static
+void hpccg_alloc_and_fill( const int np ,
+                           const int my_p ,
+                           const int gbox[][2] ,
+                           const int ghost ,
+                           struct distributed_crs_matrix * const matrix )
+{
+  int (* const pbox)[3][2] = (int (*)[3][2]) malloc( sizeof(int)*np*3*2 ); /* one [3][2] box per rank; freed at the end */
+
+  const int (* const my_box)[2] = (const int (*)[2]) pbox[my_p] ; /* this rank's box; contents valid once box_partition_rcb has run */
+
+  int my_uses_box[3][2] ; /* filled by box_partition_map; bounds the columns this rank may reference */
+  int * map_local_ord = NULL;
+  /* Start from an empty matrix: counts zeroed, array pointers NULL. */
+  matrix->n_local_row     = 0 ;
+  matrix->n_internal_row  = 0 ;
+  matrix->A_pc            = NULL ;
+  matrix->A_ia            = NULL ;
+  matrix->A_a             = NULL ;
+
+  matrix->p_size    = np ;
+  matrix->p_rank    = my_p ;
+  matrix->p_recv_pc = NULL  ;
+  matrix->p_send_pc = NULL ;
+  matrix->p_send_id = NULL ;
+
+  /* Partition the global box */
+  box_partition_rcb( np , gbox , pbox );
+
+  /* Upper bound: this rank's box widened by 'ghost' on both sides of every axis */
+  map_local_ord = (int *) malloc( sizeof(int) *
+                                  ( 2 * ghost + my_box[0][1]- my_box[0][0] ) *
+                                  ( 2 * ghost + my_box[1][1]- my_box[1][0] ) *
+                                  ( 2 * ghost + my_box[2][1]- my_box[2][0] ) );
+
+  /* Generate local layout with ghosting. */
+  box_partition_map( np, my_p, gbox,
+                     (const int (* const)[3][2]) pbox,
+                     ghost,
+                     my_uses_box , map_local_ord ,
+                     & matrix->n_internal_row ,
+                     & matrix->n_local_row ,
+                     & matrix->n_local_column ,
+                     & matrix->p_recv_pc ,
+                     & matrix->p_send_pc ,
+                     & matrix->p_send_id );
+
+  {
+    const int nrow = matrix->n_local_row ;
+    int * const pc = (int *) malloc( sizeof(int) * ( nrow + 1 ) ); /* row offsets, nrow + 1 entries */
+    int * ia = NULL ; /* column index of each stored entry */
+    MATRIX_SCALAR * a = NULL ; /* value of each stored entry */
+
+    int ix , iy , iz ; /* cell coordinates within my_box */
+    int sx , sy , sz ; /* stencil offsets, each in -1..1 */
+
+    /* Number of non zeros in each matrix row,
+     * then prefix the array for offsets.
+     */
+    pc[0] = 0 ;
+
+    for ( iz = my_box[2][0] ; iz < my_box[2][1] ; ++iz ) {
+    for ( iy = my_box[1][0] ; iy < my_box[1][1] ; ++iy ) {
+    for ( ix = my_box[0][0] ; ix < my_box[0][1] ; ++ix ) {
+      const int irow = box_map_local( (const int (*const)[2]) my_uses_box, map_local_ord, ix, iy, iz );
+      int count = 1 ; /* Count the diagonal */
+
+      /* Count the off-diagonal terms to follow */
+      for ( sz = -1 ; sz <= 1 ; ++sz ) {
+      for ( sy = -1 ; sy <= 1 ; ++sy ) {
+      for ( sx = -1 ; sx <= 1 ; ++sx ) {
+        const int g_ix = ix + sx ;
+        const int g_iy = iy + sy ;
+        const int g_iz = iz + sz ;
+
+        if ( my_uses_box[0][0] <= g_ix && g_ix < my_uses_box[0][1] &&
+             my_uses_box[1][0] <= g_iy && g_iy < my_uses_box[1][1] &&
+             my_uses_box[2][0] <= g_iz && g_iz < my_uses_box[2][1] &&
+             ! ( sz == 0 && sy == 0 && sx == 0 ) ) {
+          /* This column is within global bounds and is not a diagonal */
+          ++count ;
+        }
+      }
+      }
+      }
+      pc[ irow + 1 ] = count ; /* stored one slot up so the prefix sum below yields offsets */
+    }
+    }
+    }
+
+    for ( ix = 0 ; ix < nrow ; ++ix ) { pc[ix+1] += pc[ix] ; } /* counts -> offsets; pc[nrow] becomes the total entry count */
+
+    ia = (int *)           malloc( sizeof(int)           * pc[ nrow ]  );
+    a  = (MATRIX_SCALAR *) malloc( sizeof(MATRIX_SCALAR) * pc[ nrow ]  );
+
+    for ( iz = my_box[2][0] ; iz < my_box[2][1] ; ++iz ) {
+    for ( iy = my_box[1][0] ; iy < my_box[1][1] ; ++iy ) {
+    for ( ix = my_box[0][0] ; ix < my_box[0][1] ; ++ix ) {
+      const int irow = box_map_local( (const int (*const)[2]) my_uses_box, map_local_ord, ix, iy, iz );
+      int ipc = pc[ irow ]; /* write cursor: first slot of this row */
+
+      /* Diagonal term first */
+      ia[ ipc ] = irow ;
+      a[  ipc ] = 27.0f ;
+      ++ipc ;
+
+      /* Off-diagonal terms to follow */
+      for ( sz = -1 ; sz <= 1 ; ++sz ) {
+      for ( sy = -1 ; sy <= 1 ; ++sy ) {
+      for ( sx = -1 ; sx <= 1 ; ++sx ) {
+        const int g_ix = ix + sx ;
+        const int g_iy = iy + sy ;
+        const int g_iz = iz + sz ;
+
+        if ( my_uses_box[0][0] <= g_ix && g_ix < my_uses_box[0][1] &&
+             my_uses_box[1][0] <= g_iy && g_iy < my_uses_box[1][1] &&
+             my_uses_box[2][0] <= g_iz && g_iz < my_uses_box[2][1] &&
+             ! ( sz == 0 && sy == 0 && sx == 0 ) ) {
+          /* Column is within global bounds and is not a diagonal */
+          /* 'icol' is mapped for communication */
+
+          const int icol =
+            box_map_local( (const int (*const)[2]) my_uses_box, map_local_ord, g_ix, g_iy, g_iz );
+
+          if ( icol < 0 ) { abort(); } /* a cell inside my_uses_box has no local ordinal */
+
+          ia[ ipc ] = icol ;
+          a[  ipc ] = -1.0f ;
+          ++ipc ;
+        }
+      }
+      }
+      }
+      if ( ipc != pc[ irow + 1 ] ) { abort(); } /* fill pass must agree with the counting pass */
+    }
+    }
+    }
+    /* The three arrays are now owned by *matrix. */
+    matrix->A_pc = pc ;
+    matrix->A_ia = ia ;
+    matrix->A_a  = a ;
+  }
+
+  free( map_local_ord );
+  free( pbox );
+}
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/test/hhpccg/tpi_vector.c b/openmp-avx512/basic/optional/ThreadPool/test/hhpccg/tpi_vector.c
new file mode 100644
index 0000000..e5cc365
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/test/hhpccg/tpi_vector.c
@@ -0,0 +1,277 @@
+#include <stdio.h>
+#include <stddef.h>
+
+#include <ThreadPool_config.h>
+#include <TPI.h>
+#include <tpi_vector.h>
+
+#if defined( HAVE_MPI )
+#include <mpi.h>
+#endif
+
+/*--------------------------------------------------------------------*/
+/* Argument bundle that each driver below passes to its worker through work->info. */
+struct tpi_work_vector {
+        VECTOR_SCALAR alpha ; /* scalar read by the fill, axpby and axpy workers */
+        VECTOR_SCALAR beta ; /* scalar read by the scale, axpby and xpby workers */
+  const VECTOR_SCALAR * x ; /* read-only input vector */
+  const VECTOR_SCALAR * y ; /* second read-only input; only the dot worker reads it */
+        VECTOR_SCALAR * w ; /* vector that is written or updated in place */
+        int  n ; /* number of entries to process */
+};
+/* Slice of [0,n) for thread work->rank: chunks of ceil(n / work->count), returned as [*iBeg,*iEnd). */
+void tpi_work_span( TPI_Work * const work , const int n ,
+                    int * const iBeg , int * const iEnd )
+{
+  const int chunk = ( n + work->count - 1 ) / work->count ; /* n / count rounded up (for n >= 0) */
+  const int i_end = chunk + ( *iBeg = chunk * work->rank ); /* the assignment also stores the slice start */
+
+  *iEnd = n < i_end ? n : i_end ; /* clamp to n; a high rank can end up with *iBeg >= *iEnd (empty slice) */
+}
+
+/*--------------------------------------------------------------------*/
+/* Worker: w[i] = alpha over this thread's slice of [0,n). */
+static void tpi_work_fill( TPI_Work * work )
+{
+  const struct tpi_work_vector * const h =
+    (struct tpi_work_vector *) work->info ;
+
+  const VECTOR_SCALAR alpha = h->alpha ;
+  VECTOR_SCALAR * const w = h->w ;
+
+  int i , iEnd ;
+
+  tpi_work_span( work , h->n , & i , & iEnd ); /* i starts at the slice begin */
+
+  for ( ; i < iEnd ; ++i ) { w[i] = alpha ; }
+}
+/* x[i] = alpha for i in [0,n), dispatched through TPI_Run_threads. */
+void tpi_fill( int n , VECTOR_SCALAR alpha , VECTOR_SCALAR * x )
+{
+  struct tpi_work_vector tmp = { 0.0 , 0.0 , NULL , NULL , NULL , 0 };
+  tmp.alpha = alpha ;
+  tmp.w = x ;
+  tmp.n = n ;
+  TPI_Run_threads( tpi_work_fill , & tmp , 0 ); /* NOTE(review): meaning of the trailing 0 is defined in TPI.h - confirm */
+}
+
+/*--------------------------------------------------------------------*/
+/* Worker: w[i] *= beta over this thread's slice of [0,n). Note that the factor is read from beta, not alpha. */
+static void tpi_work_scale( TPI_Work * work )
+{
+  const struct tpi_work_vector * const h =
+    (struct tpi_work_vector *) work->info ;
+
+  const VECTOR_SCALAR beta = h->beta ;
+  VECTOR_SCALAR * const w = h->w ;
+
+  int i , iEnd ;
+
+  tpi_work_span( work , h->n , & i , & iEnd ); /* i starts at the slice begin */
+
+  for ( ; i < iEnd ; ++i ) { w[i] *= beta ; }
+}
+/* Scale in place: x[i] *= alpha for i in [0,n), dispatched through TPI_Run_threads. */
+void tpi_scale( int n , const VECTOR_SCALAR alpha , VECTOR_SCALAR * x )
+{
+  struct tpi_work_vector tmp = { 0.0 , 0.0 , NULL , NULL , NULL , 0 };
+  tmp.beta = alpha ; /* tpi_work_scale multiplies by h->beta; storing the factor in .alpha left beta at 0.0 and zeroed x */
+  tmp.w = x ;
+  tmp.n = n ;
+  TPI_Run_threads( tpi_work_scale , & tmp , 0 );
+}
+
+/*--------------------------------------------------------------------*/
+/* Worker: w[i] = x[i] over this thread's slice of [0,n). */
+static void tpi_work_copy( TPI_Work * work )
+{
+  const struct tpi_work_vector * const h =
+    (struct tpi_work_vector *) work->info ;
+
+  const VECTOR_SCALAR * const x = h->x ;
+  VECTOR_SCALAR * const w = h->w ;
+
+  int i , iEnd ;
+
+  tpi_work_span( work , h->n , & i , & iEnd ); /* i starts at the slice begin */
+
+  for ( ; i < iEnd ; ++i ) { w[i] = x[i] ; }
+}
+/* y[i] = x[i] for i in [0,n), dispatched through TPI_Run_threads. */
+void tpi_copy( int n , const VECTOR_SCALAR * x , VECTOR_SCALAR * y )
+{
+  struct tpi_work_vector tmp = { 0.0 , 0.0 , NULL , NULL , NULL , 0 };
+  tmp.x = x ;
+  tmp.w = y ; /* the destination travels in the 'w' field */
+  tmp.n = n ;
+  TPI_Run_threads( tpi_work_copy , & tmp , 0 );
+}
+
+/*--------------------------------------------------------------------*/
+/* Worker: w[i] = alpha * x[i] + beta * w[i] over this thread's slice of [0,n). */
+static void tpi_work_axpby( TPI_Work * work )
+{
+  const struct tpi_work_vector * const h =
+    (struct tpi_work_vector *) work->info ;
+
+  const VECTOR_SCALAR alpha = h->alpha ;
+  const VECTOR_SCALAR beta  = h->beta ;
+  const VECTOR_SCALAR * const x = h->x ;
+  VECTOR_SCALAR * const w = h->w ;
+
+  int i , iEnd ;
+
+  tpi_work_span( work , h->n , & i , & iEnd ); /* i starts at the slice begin */
+
+  for ( ; i < iEnd ; ++i ) { w[i] = alpha * x[i] + beta * w[i] ; }
+}
+/* y[i] = alpha * x[i] + beta * y[i] for i in [0,n), dispatched through TPI_Run_threads. */
+void tpi_axpby( int n , VECTOR_SCALAR alpha , const VECTOR_SCALAR * x ,
+                        VECTOR_SCALAR beta  ,       VECTOR_SCALAR * y )
+{
+  struct tpi_work_vector tmp = { 0.0 , 0.0 , NULL , NULL , NULL , 0 };
+  tmp.alpha = alpha ;
+  tmp.beta  = beta ;
+  tmp.x = x ;
+  tmp.w = y ; /* y is updated in place, so it travels in the 'w' field */
+  tmp.n = n ;
+
+  TPI_Run_threads( tpi_work_axpby , & tmp , 0 );
+}
+
+/*--------------------------------------------------------------------*/
+/* Worker: w[i] += alpha * x[i] over this thread's slice of [0,n). */
+static void tpi_work_axpy( TPI_Work * work )
+{
+  const struct tpi_work_vector * const h =
+    (struct tpi_work_vector *) work->info ;
+
+  const VECTOR_SCALAR alpha = h->alpha ;
+  const VECTOR_SCALAR * const x = h->x ;
+  VECTOR_SCALAR * const w = h->w ;
+
+  int i , iEnd ;
+
+  tpi_work_span( work , h->n , & i , & iEnd ); /* i starts at the slice begin */
+
+  for ( ; i < iEnd ; ++i ) { w[i] += alpha * x[i] ; }
+}
+/* y[i] += alpha * x[i] for i in [0,n), dispatched through TPI_Run_threads. */
+void tpi_axpy( int n , VECTOR_SCALAR alpha , const VECTOR_SCALAR * x ,
+                                                   VECTOR_SCALAR * y )
+{
+  struct tpi_work_vector tmp = { 0.0 , 0.0 , NULL , NULL , NULL , 0 };
+  tmp.alpha = alpha ;
+  tmp.x = x ;
+  tmp.w = y ; /* y is updated in place, so it travels in the 'w' field */
+  tmp.n = n ;
+
+  TPI_Run_threads( tpi_work_axpy , & tmp , 0 );
+}
+
+/*--------------------------------------------------------------------*/
+/* Worker: w[i] = x[i] + beta * w[i] over this thread's slice of [0,n). */
+static void tpi_work_xpby( TPI_Work * work )
+{
+  const struct tpi_work_vector * const h =
+    (struct tpi_work_vector *) work->info ;
+
+  const VECTOR_SCALAR beta  = h->beta ;
+  const VECTOR_SCALAR * const x = h->x ;
+  VECTOR_SCALAR * const w = h->w ;
+
+  int i , iEnd ;
+
+  tpi_work_span( work , h->n , & i , & iEnd ); /* i starts at the slice begin */
+
+  for ( ; i < iEnd ; ++i ) { w[i] = x[i] + beta * w[i] ; }
+}
+/* y[i] = x[i] + beta * y[i] for i in [0,n), dispatched through TPI_Run_threads. */
+void tpi_xpby( int n , const VECTOR_SCALAR * x , VECTOR_SCALAR beta  ,
+                                                 VECTOR_SCALAR * y )
+{
+  struct tpi_work_vector tmp = { 0.0 , 0.0 , NULL , NULL , NULL , 0 };
+  tmp.beta  = beta ;
+  tmp.x = x ;
+  tmp.w = y ; /* y is updated in place, so it travels in the 'w' field */
+  tmp.n = n ;
+
+  TPI_Run_threads( tpi_work_xpby , & tmp , 0 );
+}
+
+/*--------------------------------------------------------------------*/
+/* Worker: add sum of x[i] * y[i] over this thread's slice to the double at work->reduce. */
+static void tpi_work_dot_partial( TPI_Work * work )
+{
+  const struct tpi_work_vector * const h =
+    (struct tpi_work_vector *) work->info ;
+
+  const VECTOR_SCALAR * const x = h->x ;
+  const VECTOR_SCALAR * const y = h->y ;
+  double * const s = (double *) work->reduce ;
+  double tmp = *s ; /* continue from the value already in the reduction slot */
+  int i , iEnd ;
+
+  tpi_work_span( work , h->n , & i , & iEnd );
+
+  for ( ; i < iEnd ; ++i ) { tmp += x[i] * y[i] ; }
+
+  *s = tmp ;
+}
+/* Worker: add sum of x[i] * x[i] over this thread's slice to the double at work->reduce (h->y is not read). */
+static void tpi_work_dot_partial_self( TPI_Work * work )
+{
+  const struct tpi_work_vector * const h =
+    (struct tpi_work_vector *) work->info ;
+
+  const VECTOR_SCALAR * const x = h->x ;
+  double * const s = (double *) work->reduce ;
+  double tmp = *s ; /* continue from the value already in the reduction slot */
+
+  int i , iEnd ;
+
+  tpi_work_span( work , h->n , & i , & iEnd );
+
+  for ( ; i < iEnd ; ++i ) { const VECTOR_SCALAR d = x[i] ; tmp += d * d ; } /* each entry is loaded once */
+
+  *s = tmp ;
+}
+/* Reduction join: add the double at src into the double at work->reduce. */
+static void tpi_work_dot_join( TPI_Work * work , const void * src  )
+{
+  *((double *) ( work->reduce) ) += *((const double *) src);
+}
+/* Reduction init: zero the double at work->reduce. */
+static void tpi_work_dot_init( TPI_Work * work )
+{
+  *((double *) ( work->reduce) ) = 0 ;
+}
+/* Dot product of x and y over [0,n), accumulated in double; with HAVE_MPI the result is summed over MPI_COMM_WORLD. */
+double tpi_dot( int n , const VECTOR_SCALAR * x , const VECTOR_SCALAR * y )
+{
+  struct tpi_work_vector tmp = { 0.0 , 0.0 , NULL , NULL , NULL , 0 };
+  double result = 0.0 ;
+  tmp.x = x ;
+  tmp.y = y ;
+  tmp.n = n ;
+  if ( x != y ) {
+    TPI_Run_threads_reduce( tpi_work_dot_partial , & tmp ,
+                            tpi_work_dot_join , tpi_work_dot_init ,
+                            sizeof(result) , & result );
+  }
+  else { /* same pointer twice: use the worker that reads only x */
+    TPI_Run_threads_reduce( tpi_work_dot_partial_self , & tmp ,
+                            tpi_work_dot_join , tpi_work_dot_init ,
+                            sizeof(result) , & result );
+  }
+#if defined HAVE_MPI
+  {
+    double tmp = result ; /* shadows the struct above; holds this rank's partial sum */
+    MPI_Allreduce( & tmp , & result , 1 , MPI_DOUBLE , MPI_SUM , MPI_COMM_WORLD );
+  }
+#endif
+  return result ;
+}
+
+/*--------------------------------------------------------------------*/
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/test/hhpccg/tpi_vector.h b/openmp-avx512/basic/optional/ThreadPool/test/hhpccg/tpi_vector.h
new file mode 100644
index 0000000..fba628f
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/test/hhpccg/tpi_vector.h
@@ -0,0 +1,30 @@
+
+#ifndef tpi_vector_h
+#define tpi_vector_h
+/* Scalar types of vector and matrix entries; both are single precision here. */
+#define VECTOR_SCALAR float
+#define MATRIX_SCALAR float
+/* x[i] = alpha for i in [0,n). */
+void tpi_fill( int n , VECTOR_SCALAR alpha , VECTOR_SCALAR * x );
+/* In-place scaling of x[0..n) by alpha, going by the name and signature; verify against tpi_vector.c. */
+void tpi_scale( int n , const VECTOR_SCALAR alpha , VECTOR_SCALAR * x );
+/* y[i] = x[i] for i in [0,n). */
+void tpi_copy( int n , const VECTOR_SCALAR * x , VECTOR_SCALAR * y );
+/* y[i] = x[i] + beta * y[i] for i in [0,n). */
+void tpi_xpby( int n , const VECTOR_SCALAR * x ,
+                             VECTOR_SCALAR beta  , VECTOR_SCALAR * y );
+/* y[i] += alpha * x[i] for i in [0,n). */
+void tpi_axpy( int n , VECTOR_SCALAR alpha , const VECTOR_SCALAR * x ,
+                                                   VECTOR_SCALAR * y );
+/* y[i] = alpha * x[i] + beta * y[i] for i in [0,n). */
+void tpi_axpby( int n , VECTOR_SCALAR alpha , const VECTOR_SCALAR * x ,
+                        VECTOR_SCALAR beta  ,       VECTOR_SCALAR * y );
+/* Sum of x[i] * y[i] over [0,n), returned as double. */
+double tpi_dot( int n , const VECTOR_SCALAR * x ,
+                        const VECTOR_SCALAR * y );
+/* One thread's slice [*iBeg,*iEnd) of [0,n). TPI_Work must already be declared (TPI.h) when this header is included. */
+void tpi_work_span( TPI_Work * const work , const int n ,
+                    int * const iBeg , int * const iEnd );
+
+#endif
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/test/hpccg/BoxPartition.c b/openmp-avx512/basic/optional/ThreadPool/test/hpccg/BoxPartition.c
new file mode 100644
index 0000000..ef860ae
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/test/hpccg/BoxPartition.c
@@ -0,0 +1,487 @@
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <BoxPartition.h>
+
+/*--------------------------------------------------------------------*/
+/* Flat index (x fastest, then y, then z) of a cell in the ghost-widened box, or -1 if the cell lies outside it. */
+static int box_map_local_entry( const int box[][2] ,
+                                const int ghost ,
+                                int local_x ,
+                                int local_y ,
+                                int local_z )
+{
+  const int nx = 2 * ghost + box[0][1] - box[0][0] ; /* extent including 'ghost' cells on both sides */
+  const int ny = 2 * ghost + box[1][1] - box[1][0] ;
+  const int nz = 2 * ghost + box[2][1] - box[2][0] ;
+  int result = -1 ;
+  /* Shift so that the first ghost cell has coordinate 0. */
+  local_x += ghost ;
+  local_y += ghost ;
+  local_z += ghost ;
+
+  if ( 0 <= local_x && local_x < nx &&
+       0 <= local_y && local_y < ny &&
+       0 <= local_z && local_z < nz ) {
+
+    result = local_z * ny * nx + local_y * nx + local_x ;
+  }
+  return result ;
+}
+/* Local ordinal stored in box_local_map for a cell of the ghost-widened box; -1 if the cell lies outside that box. */
+int box_map_local( const int box_local[][2] ,
+                   const int ghost ,
+                   const int box_local_map[] ,
+                   const int local_x ,
+                   const int local_y ,
+                   const int local_z )
+{
+  int result = box_map_local_entry(box_local,ghost,local_x,local_y,local_z);
+
+  if ( 0 <= result ) {
+    result = box_local_map[ result ]; /* the stored value may itself be negative for an unmapped cell */
+  }
+
+  return result ;
+}
+
+/*--------------------------------------------------------------------*/
+/* Recursively split a box into (up-ip) sub-boxes */
+/* Results go to p_box[ip..up); each level cuts along 'axis' and recurses with axis (axis + 2) % 3. */
+static 
+void box_partition( int ip , int up , int axis ,
+                    const int box[3][2] ,
+                    int p_box[][3][2] )
+{
+  const int np = up - ip ;
+  if ( 1 == np ) { /* leaf: this rank takes the whole remaining box */
+    p_box[ip][0][0] = box[0][0] ; p_box[ip][0][1] = box[0][1] ;
+    p_box[ip][1][0] = box[1][0] ; p_box[ip][1][1] = box[1][1] ;
+    p_box[ip][2][0] = box[2][0] ; p_box[ip][2][1] = box[2][1] ;
+  }
+  else {
+    const int n = box[ axis ][1] - box[ axis ][0] ;
+    const int np_low = np / 2 ;  /* Rounded down */
+    const int np_upp = np - np_low ;
+
+    const int n_upp = (int) (((double) n) * ( ((double)np_upp) / ((double)np))); /* upper share of n, truncated */
+    const int n_low = n - n_upp ;
+    const int next_axis = ( axis + 2 ) % 3 ;
+
+    if ( np_low ) { /* P = [ip,ip+np_low) */
+      int dbox[3][2] ;
+      dbox[0][0] = box[0][0] ; dbox[0][1] = box[0][1] ;
+      dbox[1][0] = box[1][0] ; dbox[1][1] = box[1][1] ;
+      dbox[2][0] = box[2][0] ; dbox[2][1] = box[2][1] ;
+
+      dbox[ axis ][1] = dbox[ axis ][0] + n_low ; /* keep the lower n_low cells */
+
+      box_partition( ip, ip + np_low, next_axis,
+                     (const int (*)[2]) dbox, p_box );
+    }
+
+    if ( np_upp ) { /* P = [ip+np_low,ip+np_low+np_upp) */
+      int dbox[3][2] ;
+      dbox[0][0] = box[0][0] ; dbox[0][1] = box[0][1] ;
+      dbox[1][0] = box[1][0] ; dbox[1][1] = box[1][1] ;
+      dbox[2][0] = box[2][0] ; dbox[2][1] = box[2][1] ;
+
+      ip += np_low ;
+      dbox[ axis ][0] += n_low ; /* start where the lower part ended */
+      dbox[ axis ][1]  = dbox[ axis ][0] + n_upp ;
+
+      box_partition( ip, ip + np_upp, next_axis,
+                     (const int (*)[2]) dbox, p_box );
+    }
+  }
+}
+
+/*--------------------------------------------------------------------*/
+/* Non-zero when half-open boxes a and b fail to overlap on at least one axis. */
+static int box_disjoint( const int a[3][2] , const int b[3][2] )
+{
+  return a[0][1] <= b[0][0] || b[0][1] <= a[0][0] ||
+         a[1][1] <= b[1][0] || b[1][1] <= a[1][0] ||
+         a[2][1] <= b[2][0] || b[2][1] <= a[2][0] ;
+}
+/* Grow *a to hold >= newLen ints (power-of-two capacity, minimum 32); abort() if the allocation fails. */
+static void resize_int( int ** a , int * allocLen , int newLen )
+{
+  int k = 32;
+  while ( k < newLen ) { k <<= 1 ; }
+  if ( NULL == *a || *allocLen < k ) { /* realloc(NULL,size) acts as malloc(size), so one call covers both cases */
+    *a = (int *) realloc( *a , sizeof(int)*(*allocLen = k) );
+    if ( NULL == *a ) { abort(); } /* fail loudly; the caller writes through *a right after this call */
+  }
+}
+/* For rank my_p: number the cells of its ghost-widened box and build the recv/send prefix counts and send list. */
+static void box_partition_maps( 
+  const int np ,
+  const int my_p ,
+  const int pbox[][3][2] ,
+  const int ghost ,
+  int ** map_local_id ,
+  int ** map_recv_pc ,
+  int ** map_send_pc ,
+  int ** map_send_id )
+{
+  const int (*my_box)[2] = pbox[my_p] ;
+
+  const int my_ix = my_box[0][0] ;
+  const int my_iy = my_box[1][0] ;
+  const int my_iz = my_box[2][0] ;
+  const int my_nx = my_box[0][1] - my_box[0][0] ;
+  const int my_ny = my_box[1][1] - my_box[1][0] ;
+  const int my_nz = my_box[2][1] - my_box[2][0] ;
+
+  const int my_use_nx = 2 * ghost + my_nx ;
+  const int my_use_ny = 2 * ghost + my_ny ;
+  const int my_use_nz = 2 * ghost + my_nz ;
+
+  const int id_length = my_use_nx * my_use_ny * my_use_nz ; /* cells in the ghost-widened box */
+
+  int * local_id  = (int *) malloc( id_length * sizeof(int) ); /* flat cell index -> local ordinal */
+  int * recv_pc   = (int *) malloc( ( np + 1 ) * sizeof(int) ); /* prefix counts, np + 1 entries */
+  int * send_pc   = (int *) malloc( ( np + 1 ) * sizeof(int) ); /* prefix counts, np + 1 entries */
+
+  int * send_id  = NULL ; /* grown on demand by resize_int */
+  int   send_id_size = 0 ;
+
+  int iLocal , iSend ; /* running totals of numbered cells and of send entries */
+  int i ;
+
+  int my_use_box[3][2] ; /* my box widened by 'ghost', in global coordinates */
+
+  my_use_box[0][0] = my_box[0][0] - ghost ;
+  my_use_box[0][1] = my_box[0][1] + ghost ;
+  my_use_box[1][0] = my_box[1][0] - ghost ;
+  my_use_box[1][1] = my_box[1][1] + ghost ;
+  my_use_box[2][0] = my_box[2][0] - ghost ;
+  my_use_box[2][1] = my_box[2][1] + ghost ;
+
+  for ( i = 0 ; i < id_length ; ++i ) { local_id[i] = -1 ; } /* -1 remains for cells that no rank's box contains */
+
+  iSend = 0 ;
+  iLocal = 0 ;
+
+  /* The vector space is partitioned by processors */
+
+  for ( i = 0 ; i < np ; ++i ) {
+    const int ip = ( i + my_p ) % np ; /* i == 0 is my own rank, so my owned cells are numbered first */
+    recv_pc[i] = iLocal ;
+    send_pc[i] = iSend ;
+
+    if ( ! box_disjoint( (const int (*)[2]) my_use_box , pbox[ip] ) ) {
+      const int p_ix = pbox[ip][0][0] ;
+      const int p_iy = pbox[ip][1][0] ;
+      const int p_iz = pbox[ip][2][0] ;
+      const int p_ex = pbox[ip][0][1] ;
+      const int p_ey = pbox[ip][1][1] ;
+      const int p_ez = pbox[ip][2][1] ;
+
+      int local_x , local_y , local_z ;
+
+      /* Run the span of global cells that my processor uses */
+
+      for ( local_z = -ghost ; local_z < my_nz + ghost ; ++local_z ) {
+      for ( local_y = -ghost ; local_y < my_ny + ghost ; ++local_y ) {
+      for ( local_x = -ghost ; local_x < my_nx + ghost ; ++local_x ) {
+
+        const int global_z = local_z + my_iz ;
+        const int global_y = local_y + my_iy ;
+        const int global_x = local_x + my_ix ;
+
+        const int entry = 
+          box_map_local_entry(my_box,ghost,local_x,local_y,local_z);
+
+        if ( entry < 0 ) { abort(); } /* the loops stay inside the widened box, so this should not trigger */
+
+        if ( p_iz <= global_z && global_z < p_ez &&
+             p_iy <= global_y && global_y < p_ey &&
+             p_ix <= global_x && global_x < p_ex ) {
+
+          /* This ordinal is owned by processor 'ip' */
+
+          local_id[ entry ] = iLocal++ ;
+
+#if defined(DEBUG_PRINT)
+if ( my_p != ip ) {
+  fprintf(stdout,"  (%d,%d,%d) : P%d recv at local %d from P%d\n",
+                  global_x,global_y,global_z,my_p,local_id[entry],ip);
+  fflush(stdout);
+}
+#endif
+        }
+
+        /* If in my ownership and used by the other processor */
+        if ( my_p != ip &&
+             /* In my ownership: */
+             ( 0 <= local_z && local_z < my_nz &&
+               0 <= local_y && local_y < my_ny &&
+               0 <= local_x && local_x < my_nx ) &&
+             /* In other processors usage: */
+             ( p_iz - ghost <= global_z && global_z < p_ez + ghost &&
+               p_iy - ghost <= global_y && global_y < p_ey + ghost &&
+               p_ix - ghost <= global_x && global_x < p_ex + ghost ) ) {
+
+          resize_int( & send_id , & send_id_size , (iSend + 1) );
+          send_id[ iSend ] = local_id[ entry ] ; /* my own ordinal, assigned during the i == 0 pass */
+          ++iSend ;
+
+#if defined(DEBUG_PRINT)
+{
+  fprintf(stdout,"  (%d,%d,%d) : P%d send at local %d to P%d\n",
+                  global_x,global_y,global_z,my_p,local_id[entry],ip);
+  fflush(stdout);
+}
+#endif
+        }
+      }
+    }
+    }
+    }
+  }
+  recv_pc[np] = iLocal ; /* total number of numbered cells */
+  send_pc[np] = iSend ; /* total number of send entries */
+  /* Hand the arrays to the caller; send_id is still NULL when nothing is sent. */
+  *map_local_id  = local_id ;
+  *map_recv_pc   = recv_pc ;
+  *map_send_pc   = send_pc ;
+  *map_send_id   = send_id ;
+}
+/* Allocate *pbox for np boxes, partition root_box into it, then build rank my_p's maps. */
+void box_partition_rcb( const int np , 
+                        const int my_p ,
+                        const int root_box[][2] , 
+                        const int ghost ,
+                        int (**pbox)[3][2] , 
+                        int ** map_local_id ,
+                        int ** map_recv_pc ,
+                        int ** map_send_pc ,
+                        int ** map_send_id )
+{
+  *pbox = (int (*)[3][2]) malloc( sizeof(int) * np * 3 * 2 ); /* not freed here; returned through *pbox */
+
+  box_partition( 0 , np , 2 , root_box , *pbox ); /* first cut is along axis 2 */
+
+  box_partition_maps( np , my_p , (const int (*)[3][2]) *pbox , ghost ,
+                      map_local_id , map_recv_pc , 
+                      map_send_pc , map_send_id );
+}
+
+/*--------------------------------------------------------------------*/
+
+#ifdef UNIT_TEST
+/* Non-zero when box b lies entirely within box a on all three axes. */
+static int box_contain( const int a[3][2] , const int b[3][2] )
+{
+  return a[0][0] <= b[0][0] && b[0][1] <= a[0][1] &&
+         a[1][0] <= b[1][0] && b[1][1] <= a[1][1] &&
+         a[2][0] <= b[2][0] && b[2][1] <= a[2][1] ;
+}
+/* Print box a to fp as three half-open intervals; no trailing newline is written. */
+static void box_print( FILE * fp , const int a[][2] )
+{
+  fprintf(fp,"{ [ %d , %d ) , [ %d , %d ) , [ %d , %d ) }",
+                a[0][0] , a[0][1] ,  
+                a[1][0] , a[1][1] ,  
+                a[2][0] , a[2][1] );
+}
+/* Unit test: partition 'box' into np parts; abort() unless the parts are in bounds, disjoint and cover every cell. */
+static void test_box( const int box[3][2] , const int np )
+{
+  const int ncell_box = ( box[0][1] - box[0][0] ) * ( box[1][1] - box[1][0] ) * ( box[2][1] - box[2][0] ) ; /* extents, not upper bounds, so a non-zero lower corner is counted correctly */
+  int ncell_total = 0 ;
+  int ncell_min = ncell_box ;
+  int ncell_max = 0 ;
+  int (*pbox)[3][2] ;
+  int i , j ;
+
+  pbox = (int (*)[3][2]) malloc( sizeof(int) * np * 3 * 2 );
+
+  box_partition( 0 , np , 2 , box , pbox );
+
+  for ( i = 0 ; i < np ; ++i ) {
+    const int ncell = ( pbox[i][0][1] - pbox[i][0][0] ) *
+                      ( pbox[i][1][1] - pbox[i][1][0] ) *
+                      ( pbox[i][2][1] - pbox[i][2][0] );
+
+    if ( ! box_contain( box , (const int (*)[2]) pbox[i] ) ) {
+      fprintf(stdout,"  OUT OF BOUNDS pbox[%d/%d] = ",i,np);
+      box_print(stdout,(const int (*)[2]) pbox[i]);
+      fprintf(stdout,"\n");
+      abort();
+    }
+
+    for ( j = i + 1 ; j < np ; ++j ) { /* each unordered pair is checked once */
+      if ( ! box_disjoint( (const int (*)[2]) pbox[i] ,
+                           (const int (*)[2]) pbox[j] ) ) {
+        fprintf(stdout,"  NOT DISJOINT pbox[%d/%d] = ",i,np);
+        box_print(stdout, (const int (*)[2]) pbox[i]);
+        fprintf(stdout,"\n");
+        fprintf(stdout,"               pbox[%d/%d] = ",j,np);
+        box_print(stdout, (const int (*)[2]) pbox[j]);
+        fprintf(stdout,"\n");
+        abort();
+      }
+    }
+    ncell_total += ncell ;
+
+    if ( ncell_max < ncell ) { ncell_max = ncell ; }
+    if ( ncell < ncell_min ) { ncell_min = ncell ; }
+  }
+
+  if ( ncell_total != ncell_box ) { /* disjoint and in bounds, so equal totals mean full coverage */
+    fprintf(stdout,"  WRONG CELL COUNT NP = %d\n",np);
+    abort();
+  }
+  fprintf(stdout,"NP = %d, total = %d, avg = %d, min = %d, max = %d\n",
+          np,ncell_box,ncell_box/np,ncell_min,ncell_max);
+
+  free( pbox );
+}
+
+/*--------------------------------------------------------------------*/
+/* Unit test: build every rank's maps and abort() unless each recv span matches the sender's send list in size and content. */
+static void test_maps( const int root_box[][2] , const int np )
+{
+  const int ghost = 1 ;
+  const int nx_global = root_box[0][1] - root_box[0][0] ;
+  const int ny_global = root_box[1][1] - root_box[1][0] ;
+  int ieq , i , j ; /* this 'ieq' is only initialised; the loop body declares its own */
+  int (*pbox)[3][2] ;
+  int **local_values ; /* per rank: global equation number of each local ordinal */
+  int **map_local_id ;
+  int **map_recv_pc ;
+  int **map_send_pc ;
+  int **map_send_id ;
+  
+  pbox = (int (*)[3][2]) malloc( sizeof(int) * np * 3 * 2 );
+
+  box_partition( 0 , np , 2 , root_box , pbox );
+
+  local_values = (int **) malloc( sizeof(int*) * np );
+  map_local_id = (int **) malloc( sizeof(int*) * np );
+  map_recv_pc  = (int **) malloc( sizeof(int*) * np );
+  map_send_pc  = (int **) malloc( sizeof(int*) * np );
+  map_send_id  = (int **) malloc( sizeof(int*) * np );
+
+  /* Set each local value to the global equation number */
+
+  for ( ieq = i = 0 ; i < np ; ++i ) {
+    const int (*mybox)[2] = (const int (*)[2]) pbox[i] ;
+    const int nx = mybox[0][1] - mybox[0][0] ;
+    const int ny = mybox[1][1] - mybox[1][0] ;
+    const int nz = mybox[2][1] - mybox[2][0] ;
+    int ix , iy , iz ;
+
+    /* Generate the partition maps for this rank */
+    box_partition_maps( np , i , (const int (*)[3][2]) pbox , ghost ,
+                        & map_local_id[i] , & map_recv_pc[i] , 
+                        & map_send_pc[i] , & map_send_id[i] );
+
+    local_values[i] = (int *) malloc( sizeof(int) * map_recv_pc[i][np] ); /* map_recv_pc[i][np] is rank i's total ordinal count */
+
+    for ( iz = -ghost ; iz < nz + ghost ; ++iz ) {
+    for ( iy = -ghost ; iy < ny + ghost ; ++iy ) {
+    for ( ix = -ghost ; ix < nx + ghost ; ++ix ) {
+      const int ieq = box_map_local(mybox,ghost,map_local_id[i],ix,iy,iz); /* shadows the outer 'ieq' */
+
+      if ( 0 <= ieq ) {
+        const int ix_global = ix + mybox[0][0] ;
+        const int iy_global = iy + mybox[1][0] ;
+        const int iz_global = iz + mybox[2][0] ;
+
+        if ( root_box[0][0] <= ix_global && ix_global < root_box[0][1] &&
+             root_box[1][0] <= iy_global && iy_global < root_box[1][1] &&
+             root_box[2][0] <= iz_global && iz_global < root_box[2][1] ) {
+
+          local_values[i][ ieq ] = ix_global +
+                                   iy_global * nx_global +
+                                   iz_global * nx_global * ny_global ;
+        }
+        else {
+          local_values[i][ ieq ] = -1 ;
+        }
+      }
+    }
+    }
+    }
+  }
+
+  /* Pair-wise compare the local values */
+  /* i  == receiving processor rank */
+  /* ip == sending   processor rank */
+  /* j  == receiving processor data entry for message from 'ip' */
+  /* jp == sending   processor data entry for message to   'i' */
+
+  for ( i = 0 ; i < np ; ++i ) {
+    for ( j = 1 ; j < np ; ++j ) { /* j == 0 is rank i itself, so it is skipped */
+      const int ip = ( i + j ) % np ;
+      const int jp = ( i + np - ip ) % np ;
+      const int nrecv = map_recv_pc[i] [j+1]  - map_recv_pc[i] [j] ;
+      const int nsend = map_send_pc[ip][jp+1] - map_send_pc[ip][jp] ;
+      int k ;
+      if ( nrecv != nsend ) {
+        fprintf(stderr,"P%d recv %d from P%d\n",i,nrecv,ip);
+        fprintf(stderr,"P%d send %d to   P%d\n",ip,nsend,i);
+        abort();
+      }
+      for ( k = 0 ; k < nrecv ; ++k ) {
+        const int irecv = map_recv_pc[i][j] + k ;
+        const int isend = map_send_pc[ip][jp] + k ;
+        const int val_irecv = local_values[i][irecv] ;
+        const int val_isend = local_values[ip][ map_send_id[ip][isend] ] ;
+        if ( val_irecv != val_isend ) { /* both sides must refer to the same global cell */
+          fprintf(stderr,"P%d recv[%d] = %d , from P%d\n",i,k,val_irecv,ip);
+          fprintf(stderr,"P%d send[%d] = %d , to   P%d\n",ip,k,val_isend,i);
+          abort();
+        }
+      }
+    }
+  }
+
+  for ( i = 0 ; i < np ; ++i ) {
+    free( map_local_id[i] );
+    free( map_recv_pc[i] );
+    free( map_send_pc[i] );
+    free( map_send_id[i] );
+    free( local_values[i] );
+  }
+  free( map_send_id );
+  free( map_send_pc );
+  free( map_recv_pc );
+  free( map_local_id );
+  free( local_values );
+  free( pbox );
+}
+
+/*--------------------------------------------------------------------*/
+/* Unit-test driver: "<np> <NX>x<NY>x<NZ>" tests one configuration; any other argument count sweeps np = 1..np_max on 64^3. */
+int main( int argc , char * argv[] )
+{
+  int np_max = 256 ;
+  int box[3][2] = { { 0 , 64 } , { 0 , 64 } , { 0 , 64 } };
+  int np = 0 ;
+
+  switch( argc ) {
+  case 3:
+    sscanf(argv[1],"%d",&np); /* np stays 0 if parsing fails, which skips both tests below */
+    sscanf(argv[2],"%dx%dx%d",& box[0][1] , & box[1][1] , & box[2][1] ); /* only the upper bounds are read */
+    if ( 0 < np ) { test_box(  (const int (*)[2]) box , np ); }
+    if ( 0 < np ) { test_maps( (const int (*)[2]) box , np ); }
+    break ;
+  default:
+    for ( np = 1 ; np <= np_max ; ++np ) {
+      test_box(  (const int (*)[2]) box , np );
+      test_maps( (const int (*)[2]) box , np );
+    }
+    break ;
+  }
+  return 0 ;
+}
+
+#endif
+
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/test/hpccg/BoxPartition.h b/openmp-avx512/basic/optional/ThreadPool/test/hpccg/BoxPartition.h
new file mode 100644
index 0000000..3dfd839
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/test/hpccg/BoxPartition.h
@@ -0,0 +1,64 @@
+
+/** \brief  Partition a { [ix,jx) X [iy,jy) X [iz,jz) } box.
+ *
+ *  Use recursive coordinate bisection to partition a box 
+ *  into np disjoint sub-boxes.  Allocate (via malloc) and
+ *  populate the sub-boxes, mapping the local (x,y,z) to
+ *  a local ordinal, and mappings for the send-recv messages
+ *  to update the ghost cells.
+ *
+ *  usage:
+ *
+ *  my_nx = pbox[my_p][0][1] - pbox[my_p][0][0] ;
+ *  my_ny = pbox[my_p][1][1] - pbox[my_p][1][0] ;
+ *  my_nz = pbox[my_p][2][1] - pbox[my_p][2][0] ;
+ *
+ *  for ( x = -ghost ; x < my_nx + ghost ; ++x ) {
+ *  for ( y = -ghost ; y < my_ny + ghost ; ++y ) {
+ *  for ( z = -ghost ; z < my_nz + ghost ; ++z ) {
+ *    const int x_global = x + pbox[my_p][0][0] ;
+ *    const int y_global = y + pbox[my_p][1][0] ;
+ *    const int z_global = z + pbox[my_p][2][0] ;
+ *
+ *    const int local_ordinal =
+ *      box_map_local( pbox[my_p], ghost, map_local_id, x, y, z );
+ *
+ *    if ( 0 <= local_ordinal ) {
+ *    }
+ *  } } }
+ *  
+ *  for ( i = 1 ; i < np ; ++i ) {
+ *    const int recv_processor = ( my_p + i ) % np ;
+ *    const int recv_ordinal_begin = map_recv_pc[i];
+ *    const int recv_ordinal_end   = map_recv_pc[i+1];
+ *  }
+ *
+ *  for ( i = 1 ; i < np ; ++i ) {
+ *    const int send_processor = ( my_p + i ) % np ;
+ *    const int send_map_begin = map_send_pc[i];
+ *    const int send_map_end   = map_send_pc[i+1];
+ *    for ( j = send_map_begin ; j < send_map_end ; ++j ) {
+ *      send_ordinal = map_send_id[j] ;
+ *    }
+ *  }
+ */
+void box_partition_rcb( 
+  const int np            /**< [in]  Number of partitions */ ,
+  const int my_p          /**< [in]  My partition rank    */ ,
+  const int root_box[][2] /**< [in]  3D Box to partition  */ ,
+  const int ghost         /**< [in]  Ghost cell boundary  */ ,
+  int (**pbox)[3][2]      /**< [out] Partition's 3D boxes */ ,
+  int ** map_local_id     /**< [out] Map local cells */ ,
+  int ** map_recv_pc      /**< [out] Receive spans per processor */ ,
+  int ** map_send_pc      /**< [out] Send prefix counts per processor */ ,
+  int ** map_send_id      /**< [out] Send message ordinals */ );
+
+/** \brief  Map a local (x,y,z) to a local ordinal; coordinates are offsets from
+ *  box_local's lower corner and may extend 'ghost' cells beyond the box on either side. */
+int box_map_local( const int box_local[][2] /**< [in] This partition's 3D box */ ,
+                   const int ghost /**< [in] Ghost cell boundary */ ,
+                   const int map_local_id[] /**< [in] Map from box_partition_rcb */ ,
+                   const int local_x /**< [in] X offset from the box's lower corner */ ,
+                   const int local_y /**< [in] Y offset from the box's lower corner */ ,
+                   const int local_z /**< [in] Z offset from the box's lower corner */ );
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/test/hpccg/CGSolver.c b/openmp-avx512/basic/optional/ThreadPool/test/hpccg/CGSolver.c
new file mode 100644
index 0000000..2670bf7
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/test/hpccg/CGSolver.c
@@ -0,0 +1,248 @@
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <ThreadPool_config.h>
+#include <TPI.h>
+#include <tpi_vector.h>
+#include <CGSolver.h>
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+/*--------------------------------------------------------------------*/
+
+#ifdef HAVE_MPI
+/* TIMER( DT , F ): run F, then DT[0] += dt and DT[1] += dt*dt, where dt = (latest end over all ranks) - (earliest start over all ranks). */
+#define TIMER( DT , F )	\
+  { double tb , te , tbg , teg , dt ; \
+    tb = TPI_Walltime(); \
+    F ; \
+    te = TPI_Walltime(); \
+    MPI_Allreduce(&tb, &tbg, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); \
+    MPI_Allreduce(&te, &teg, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); \
+    DT[0] += dt = teg - tbg ; \
+    DT[1] += dt * dt ; }
+
+#else
+/* TIMER( DT , F ) without MPI: dt is the local TPI_Walltime() difference measured around F. */
+#define TIMER( DT , F )	\
+  { const double tb = TPI_Walltime(); double dt ; \
+    F ; \
+    DT[0] += dt = TPI_Walltime() - tb ; \
+    DT[1] += dt * dt ; }
+
+#endif
+
+/*--------------------------------------------------------------------*/
+
+/* Sum v over all MPI ranks; without MPI the local value is returned unchanged. */
+static VECTOR_SCALAR comm_sum( VECTOR_SCALAR v )
+{
+#ifdef HAVE_MPI
+  /* Pick the MPI datatype by comparing the scalar's size with sizeof(double). */
+  const MPI_Datatype scalar_type =
+    ( sizeof(VECTOR_SCALAR) != sizeof(double) ) ? MPI_FLOAT : MPI_DOUBLE ;
+  VECTOR_SCALAR total = 0 ;
+
+  MPI_Allreduce( & v , & total , 1 , scalar_type , MPI_SUM , MPI_COMM_WORLD );
+
+  return total ;
+#else
+  return v ;
+#endif
+}
+
+#ifdef HAVE_MPI
+static
+void comm_rhs_vector( const struct cgsolve_data * const data ,
+                      VECTOR_SCALAR * const vec )
+{
+  const int np = data->np ;
+  const int my_p = data->ip ;
+  const int * const recv_pc = data->recv_pc ;
+  const int * const send_pc = data->send_pc ;
+  const int * const send_id = data->send_id ;
+  int i , irecv ;
+  /* Exchange entries of 'vec' with the other ranks: receive into vec[ recv_pc[i] .. recv_pc[i+1] ) and send vec[ send_id[..] ].  First count the non-empty receive spans. */
+  for ( irecv = 0 , i = 1 ; i < np ; ++i ) {
+    if ( recv_pc[i] < recv_pc[i+1] ) ++irecv ;
+  }
+
+#ifdef DEBUG_PRINT
+  fflush(stdout);
+  MPI_Barrier( MPI_COMM_WORLD );
+  fflush(stdout);
+#endif
+
+  {
+    VECTOR_SCALAR * const send_buf =
+      (VECTOR_SCALAR *) malloc( sizeof(VECTOR_SCALAR) * send_pc[np] );
+
+    MPI_Request * const recv_request =
+      (MPI_Request *) malloc( sizeof(MPI_Request) * irecv );
+
+    MPI_Status * const recv_status =
+      (MPI_Status *) malloc( sizeof(MPI_Status) * irecv );
+    /* Post one non-blocking receive per non-empty span; offset i pairs with rank ( i + my_p ) % np. */
+    for ( irecv = 0 , i = 1 ; i < np ; ++i ) {
+      const int ip = ( i + my_p ) % np ;
+      const int recv_beg    = recv_pc[i];
+      const int recv_length = recv_pc[i+1] - recv_beg ;
+      if ( recv_length ) {
+#ifdef DEBUG_PRINT
+        fprintf(stdout,"  comm_rhs_vector P%d Irecv P%d : %d\n",
+                       my_p, ip, recv_length );
+        fflush(stdout);
+#endif
+        MPI_Irecv( vec + recv_beg ,
+                   recv_length * sizeof(VECTOR_SCALAR), MPI_BYTE ,
+                   ip , 0 , MPI_COMM_WORLD , recv_request + irecv );
+        ++irecv ;
+      }
+    }
+
+    /* Gather components into send buffer */
+
+    for ( i = 0 ; i < send_pc[np] ; ++i ) {
+      send_buf[i] = vec[ send_id[i] ];
+    }
+    /* NOTE(review): the barrier presumably guarantees every rank has posted its receives before the ready-mode sends below - confirm against the MPI_Rsend rules. */
+    MPI_Barrier( MPI_COMM_WORLD );
+
+    for ( i = 1 ; i < np ; ++i ) {
+      const int ip = ( i + my_p ) % np ;
+      const int send_beg    = send_pc[i];
+      const int send_length = send_pc[i+1] - send_beg ;
+      if ( send_length ) { /* Send to 'i' */
+#ifdef DEBUG_PRINT
+        fprintf(stdout,"  comm_rhs_vector P%d Rsend P%d : %d\n",
+                       my_p, ip, send_length );
+        fflush(stdout);
+#endif
+        MPI_Rsend( send_buf + send_beg ,
+                   send_length * sizeof(VECTOR_SCALAR), MPI_BYTE ,
+                   ip , 0 , MPI_COMM_WORLD );
+      }
+    }
+    /* Block until all posted receives have completed; only then release the buffers. */
+    MPI_Waitall( irecv , recv_request , recv_status );
+
+    free( recv_status );
+    free( recv_request );
+    free( send_buf );
+  }
+}
+#else
+#define comm_rhs_vector( D , V ) /* expands to nothing without MPI */
+#endif
+
+/*--------------------------------------------------------------------*/
+
+/* Computes b = A * x for the matrix described by 'data'. */
+void cgsolve_set_lhs( const struct cgsolve_data * const data ,
+                      const VECTOR_SCALAR * const x ,
+                            VECTOR_SCALAR * const b )
+{
+  /* The work vector is sized by the end of the last receive span. */
+  const int work_len = data->recv_pc[ data->np ] ;
+  const int n_rows   = data->nRow ;
+
+  VECTOR_SCALAR * const work =
+    (VECTOR_SCALAR *) malloc( work_len * sizeof(VECTOR_SCALAR) );
+
+  tpi_copy( n_rows , x , work );
+  comm_rhs_vector( data , work );
+
+  /* b = A * work */
+  tpi_crs_matrix_apply( n_rows, data->A_pc, data->A_ia, data->A_a, work, b );
+
+  free( work );
+}
+
+/*--------------------------------------------------------------------*/
+
+void cgsolve( const struct cgsolve_data * const data ,   /* [in]  matrix, communication maps, iteration controls */
+              const VECTOR_SCALAR * const b ,            /* [in]  right-hand side (nRow entries are read) */
+                    VECTOR_SCALAR * const x ,            /* [in,out] initial guess, updated in place */
+                    int    * const iter_count ,          /* [out] number of loop iterations executed */
+                    VECTOR_SCALAR * const norm_resid ,   /* [out] sqrt of the last evaluated r.r */
+                    double * const dt_mxv ,              /* [in,out] TIMER accumulators for the matrix-vector product */
+                    double * const dt_axpby ,            /* [in,out] TIMER accumulators for the axpby updates */
+                    double * const dt_dot )              /* [in,out] TIMER accumulators for the dot products */
+{
+  const int nRow = data->nRow ;
+  const int nVec = data->recv_pc[ data->np ] ;  /* length of p: end of the last receive span */
+  const int max_iter = data->max_iter ;
+  const int print_iter = data->print_iter ;
+  const int   * const A_pc = data->A_pc ;
+  const int   * const A_ia = data->A_ia ;
+  const MATRIX_SCALAR * const A_a  = data->A_a ;
+  const VECTOR_SCALAR tolerance = data->tolerance ;
+  /* Conjugate-gradient loop on A x = b; it runs while k < max_iter and tolerance^2 < r.r. */
+  const VECTOR_SCALAR tol_2 = tolerance * tolerance ;
+
+  VECTOR_SCALAR * const r  = (VECTOR_SCALAR *) malloc( nRow * sizeof(VECTOR_SCALAR) );
+  VECTOR_SCALAR * const p  = (VECTOR_SCALAR *) malloc( nVec * sizeof(VECTOR_SCALAR) );
+  VECTOR_SCALAR * const Ap = (VECTOR_SCALAR *) malloc( nRow * sizeof(VECTOR_SCALAR) );
+
+  VECTOR_SCALAR rtrans = 0.0 ;
+
+  int k ;
+
+  tpi_copy( nRow , b , r );
+  tpi_copy( nRow , x , p );
+
+  comm_rhs_vector( data , p ); tpi_crs_matrix_apply( nRow, A_pc, A_ia, A_a, p, Ap );
+  /* r = b - A * x */
+  tpi_axpby( nRow , -1.0, Ap, 1.0 , r );
+
+  /* Include timing dot product for 2 * #iter dot products */
+  TIMER( dt_dot , rtrans = comm_sum( tpi_dot( nRow , r , r ) ) );
+
+  for ( k = 0 ; k < max_iter && tol_2 < rtrans ; ++k ) {
+    VECTOR_SCALAR alpha ;
+    VECTOR_SCALAR beta = 0.0 ;
+    VECTOR_SCALAR pAp = 0.0 ;
+
+    if ( k ) {
+      const VECTOR_SCALAR oldrtrans = rtrans ;
+      TIMER( dt_dot , rtrans = comm_sum( tpi_dot( nRow , r , r ) ) );
+      beta = rtrans / oldrtrans ;
+    }
+    /* p = r + beta * p ; beta is 0 on the first pass, so p starts as r */
+    TIMER( dt_axpby , tpi_axpby( nRow, 1.0, r, beta, p ) );
+
+    TIMER( dt_mxv , comm_rhs_vector( data , p ); tpi_crs_matrix_apply( nRow, A_pc, A_ia, A_a, p, Ap ) );
+
+    TIMER( dt_dot , pAp = comm_sum( tpi_dot( nRow , p , Ap ) ) );
+
+    if ( 0 < fabs( pAp ) ) {
+      alpha = rtrans / pAp ;
+    }
+    else {
+      alpha = rtrans = 0.0 ; /* Orthogonal, cannot continue */
+    }
+    /* A non-positive print_iter disables the progress line instead of dividing by zero in the modulo. */
+    if ( 0 < print_iter && ! ( ( k + 1 ) % print_iter ) ) {
+      fprintf(stdout,"  cgsolve | r(%d) | = %g\n",k,sqrt(rtrans));
+      fflush(stdout);
+    }
+  
+    TIMER( dt_axpby , tpi_axpby( nRow , alpha,  p,  1.0, x) );
+    TIMER( dt_axpby , tpi_axpby( nRow , -alpha, Ap, 1.0, r) );
+  }
+
+  *norm_resid = sqrt( rtrans );
+  *iter_count = k ;
+
+  free( Ap );
+  free( p );
+  free( r );
+}
+
+/*--------------------------------------------------------------------*/
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/test/hpccg/CGSolver.h b/openmp-avx512/basic/optional/ThreadPool/test/hpccg/CGSolver.h
new file mode 100644
index 0000000..0660a01
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/test/hpccg/CGSolver.h
@@ -0,0 +1,32 @@
+#ifndef CGSolver_h   /* include guard: a second inclusion would otherwise redefine struct cgsolve_data */
+#define CGSolver_h
+#include <tpi_vector.h>
+struct cgsolve_data {
+  int             nRow ;        /* number of matrix rows held by this process */
+  int           * A_pc ;        /* row offsets, nRow+1 entries: row i uses coefficients [ A_pc[i] , A_pc[i+1] ) */
+  int           * A_ia ;        /* column index of each coefficient */
+  MATRIX_SCALAR * A_a ;         /* coefficient values, parallel to A_ia */
+  int             max_iter ;    /* upper limit on cgsolve iterations */
+  int             print_iter ;  /* cgsolve prints the residual every print_iter iterations */
+  VECTOR_SCALAR   tolerance ;   /* cgsolve iterates while tolerance^2 < r.r */
+
+  int     np ;                  /* number of processes */
+  int     ip ;                  /* rank of this process */
+  int   * recv_pc ;             /* receive spans per processor offset; recv_pc[np] sizes the work vectors */
+  int   * send_pc ;             /* send spans per processor offset into send_id */
+  int   * send_id ;             /* vector indices gathered into the send buffer */
+}; 
+/* Computes b = A * x (with the inter-process exchange of x when built with MPI). */
+void cgsolve_set_lhs( const struct cgsolve_data * data ,
+                      const VECTOR_SCALAR * const x ,
+                            VECTOR_SCALAR * const b );
+/* Conjugate-gradient solve of A x = b; each dt_* array accumulates { sum of dt , sum of dt^2 }. */
+void cgsolve( const struct cgsolve_data * data ,
+              const VECTOR_SCALAR * const b ,
+                    VECTOR_SCALAR * const x ,
+                    int    * const iter_count ,
+                    VECTOR_SCALAR * const norm_resid ,
+                    double * const dt_mxv ,
+                    double * const dt_axpby ,
+                    double * const dt_dot );
+#endif
diff --git a/openmp-avx512/basic/optional/ThreadPool/test/hpccg/CMakeLists.txt b/openmp-avx512/basic/optional/ThreadPool/test/hpccg/CMakeLists.txt
new file mode 100644
index 0000000..bfba897
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/test/hpccg/CMakeLists.txt
@@ -0,0 +1,83 @@
+
+INCLUDE(PackageAddExecutableAndTest)
+INCLUDE(PackageLibraryMacros)
+
+####################
+# Header list and include paths for the hpccg test program.
+SET(HEADERS "")
+SET(SOURCES "")
+ 
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+ 
+SET(HEADERS ${HEADERS}
+  ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}_config.h
+  )
+ 
+INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR})
+ 
+APPEND_SET(HEADERS
+  BoxPartition.h
+  CGSolver.h
+  tpi_vector.h
+  )
+ 
+####################
+# SOURCES stays empty: the executable below lists its source files explicitly.
+# One executable, registered as serial tests (default, threads=2, threads=4) and MPI tests (1, 2, 4 processes).
+PACKAGE_ADD_EXECUTABLE(
+  test_tpi_hpccg
+  COMM serial mpi
+  SOURCES main.c CGSolver.c BoxPartition.c tpi_vector.c
+  DEPLIBS pthread m
+  DIRECTORY .
+  )
+
+PACKAGE_ADD_TEST(
+  test_tpi_hpccg
+  NAME test_tpi_hpccg_serial_1
+  COMM serial
+  DIRECTORY .
+  )
+# NOTE(review): only this test carries XHOSTTYPE AIX - presumably it is excluded on AIX hosts; confirm against PACKAGE_ADD_TEST.
+PACKAGE_ADD_TEST(
+  test_tpi_hpccg
+  NAME test_tpi_hpccg_serial_2
+  COMM serial
+  ARGS "threads=2"
+  DIRECTORY .
+  XHOSTTYPE AIX
+  )
+
+PACKAGE_ADD_TEST(
+  test_tpi_hpccg
+  NAME test_tpi_hpccg_serial_4
+  COMM serial
+  ARGS "threads=4"
+  DIRECTORY .
+  )
+
+PACKAGE_ADD_TEST(
+  test_tpi_hpccg
+  NAME test_tpi_hpccg_mpi_1
+  COMM mpi
+  NUM_MPI_PROCS 1
+  DIRECTORY .
+  )
+
+PACKAGE_ADD_TEST(
+  test_tpi_hpccg
+  NAME test_tpi_hpccg_mpi_2
+  COMM mpi
+  NUM_MPI_PROCS 2
+  DIRECTORY .
+  )
+
+PACKAGE_ADD_TEST(
+  test_tpi_hpccg
+  NAME test_tpi_hpccg_mpi_4
+  COMM mpi
+  NUM_MPI_PROCS 4
+  DIRECTORY .
+  )
+
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/test/hpccg/main.c b/openmp-avx512/basic/optional/ThreadPool/test/hpccg/main.c
new file mode 100644
index 0000000..676a02d
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/test/hpccg/main.c
@@ -0,0 +1,340 @@
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <ThreadPool_config.h>
+#include <TPI.h>
+#include <BoxPartition.h>
+#include <CGSolver.h>
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+/*--------------------------------------------------------------------*/
+
+static
+void hpccg_alloc_and_fill( const int np ,
+                           const int my_p ,
+                           const int gbox[][2] ,
+                           const int ghost ,
+                           struct cgsolve_data * const data )
+{
+  int (*pbox)[3][2] = NULL ;
+  int * map_local_ord = NULL;
+  /* Builds this rank's rows of a 27-point stencil matrix (27 on the diagonal, -1 per in-domain neighbour) and the communication maps in *data. */
+  data->nRow = 0 ;
+  data->A_pc = NULL ;
+  data->A_ia = NULL ;
+  data->A_a  = NULL ;
+
+  data->np = np ;
+  data->ip = my_p ;
+  data->recv_pc = NULL  ;
+  data->send_pc = NULL ;
+  data->send_id = NULL ;
+  /* Partition gbox among the np ranks; outputs are every rank's box, the local-ordinal map and the send/receive maps. */
+  box_partition_rcb( np, my_p,
+                     (const int (*)[2]) gbox, ghost,
+                     & pbox ,
+                     & map_local_ord ,
+                     & data->recv_pc ,
+                     & data->send_pc ,
+                     & data->send_id );
+
+  {
+    const int (* const my_box)[2] = (const int (*)[2]) pbox[my_p] ;
+    const int bx = my_box[0][0] ;
+    const int by = my_box[1][0] ;
+    const int bz = my_box[2][0] ;
+    const int nx = my_box[0][1] - bx ;
+    const int ny = my_box[1][1] - by ;
+    const int nz = my_box[2][1] - bz ;
+    const int n = nx * ny * nz ;
+    const int nnz = 27 * n ; /* Upper bound: the diagonal plus at most 26 neighbours per row */
+    int    * const pc = (int *)   malloc( sizeof(int) * ( n + 1 ) );
+    int    * const ia = (int *)   malloc( sizeof(int) * nnz );
+    MATRIX_SCALAR  * const a  = (MATRIX_SCALAR *) malloc( sizeof(MATRIX_SCALAR) * nnz );
+
+    int irow = 0 ;
+    int ipc  = 0 ;
+    int ix , iy , iz ;
+    int sx , sy , sz ;
+    /* Visit this rank's cells with x varying fastest; irow counts the rows in that order. */
+    for ( iz = 0 ; iz < nz ; ++iz ) {
+    for ( iy = 0 ; iy < ny ; ++iy ) {
+    for ( ix = 0 ; ix < nx ; ++ix , ++irow ) {
+      /* Consistency check (report only): the row counter should match box_map_local's ordinal. */
+      if ( irow != box_map_local( my_box, ghost, map_local_ord,ix,iy,iz) ) {
+        fprintf(stderr,"P%d:  irow[%d] != box_map_local(%d,%d,%d) = %d\n",
+                my_p,irow,ix,iy,iz,
+                box_map_local( my_box, ghost, map_local_ord, ix, iy, iz) );
+      }
+
+      pc[ irow ] = ipc ;   /* Beginning of row coefficients */
+      /* Diagonal term first */
+      ia[ ipc ] = irow ;
+      a[  ipc ] = 27.0f ;
+      ++ipc ;
+
+      /* Off-diagonal terms to follow */
+      for ( sz = -1 ; sz <= 1 ; ++sz ) {
+      for ( sy = -1 ; sy <= 1 ; ++sy ) {
+      for ( sx = -1 ; sx <= 1 ; ++sx ) {
+        const int dx = ix + sx ;
+        const int dy = iy + sy ;
+        const int dz = iz + sz ;
+        const int global_x = dx + bx ;
+        const int global_y = dy + by ;
+        const int global_z = dz + bz ;
+        /* Keep neighbours that lie inside the global box, skipping the cell itself. */
+        if ( gbox[0][0] <= global_x && global_x < gbox[0][1] &&
+             gbox[1][0] <= global_y && global_y < gbox[1][1] &&
+             gbox[2][0] <= global_z && global_z < gbox[2][1] &&
+             ! ( sz == 0 && sy == 0 && sx == 0 ) ) {
+          /* 'icol' is mapped for communication */
+
+          const int icol =
+            box_map_local(my_box,ghost,map_local_ord,dx,dy,dz);
+
+          if ( icol < 0 ) {
+            fprintf(stderr,"P%d : bad column at local (%d,%d,%d) global(%d,%d,%d)\n",
+                    my_p, dx,dy,dz,global_x,global_y,global_z);
+            fflush(stderr);
+            abort();
+          }
+
+          ia[ ipc ] = icol ;
+          a[  ipc ] = -1.0f ;
+          ++ipc ;
+        }
+      }
+      }
+      }
+    }
+    }
+    }
+    /* Closing offset, so that row i spans [ pc[i] , pc[i+1] ). */
+    pc[irow] = ipc ;
+
+    data->nRow = irow ;
+    data->A_pc = pc ;
+    data->A_ia = ia ;
+    data->A_a  = a ;
+  }
+  /* The boxes and the ordinal map are not stored in *data, so they are released here. */
+  free( map_local_ord );
+  free( pbox );
+}
+
+/*--------------------------------------------------------------------*/
+
+int main( int argc , char ** argv )
+{
+  const int ghost = 1 ;
+  const int max_cube = 20 ;
+  int ncube[20] = { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
+                    0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 };
+  /* hpccg test driver: builds the stencil system, runs 'trials' CG solves per problem size and prints one CSV row per size. */
+  FILE * print_file = stdout ;
+  int print_iter = 500 ;
+  int max_iter = 50 ;
+
+  VECTOR_SCALAR tolerance = 0.0 ; /* Force max iterations */
+
+  int gbox[3][2] = { { 0 , 16 } , { 0 , 16 } , { 0 , 16 } };
+  int nt = 0 ;       /* threads= : TPI thread count; 0 leaves the pool uninitialised */
+  int trials = 5 ;   /* trials=  : solves per problem size */
+  int ntest ;
+  int np = 1;
+  int my_p = 0 ;
+
+#ifdef HAVE_MPI
+  MPI_Init( & argc , & argv );
+  MPI_Comm_size( MPI_COMM_WORLD , & np );
+  MPI_Comm_rank( MPI_COMM_WORLD , & my_p );
+#endif
+  /* Only rank 0 parses the "key=value" arguments; the values are broadcast below. */
+  if ( ! my_p ) {
+    const char arg_threads[] = "threads=" ;
+    const char arg_cube[] = "cube=" ;
+    const char arg_box[] = "box=" ;
+    const char arg_max[] = "max_iter=" ;
+    const char arg_trials[] = "trials=" ;
+    const char arg_print[] = "print_iter=" ;
+    const char arg_file[] = "print_file=" ;
+    int i ;
+    for ( i = 1 ; i < argc ; ++i ) {
+      if ( ! strncmp(argv[i],arg_threads,strlen(arg_threads)) ) {
+        sscanf(argv[i]+strlen(arg_threads),"%d",&nt);
+      }
+      else if ( ! strncmp(argv[i],arg_box,strlen(arg_box)) ) {
+        sscanf(argv[i]+strlen(arg_box),"%d%*[x]%d%*[x]%d",
+               & gbox[0][1] , & gbox[1][1] , & gbox[2][1] );
+      }
+      else if ( ! strncmp(argv[i],arg_cube,strlen(arg_cube)) ) {
+        sscanf(argv[i]+strlen(arg_cube),
+               "%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d",
+               ncube+0, ncube+1, ncube+2, ncube+3, ncube+4,
+               ncube+5, ncube+6, ncube+7, ncube+8, ncube+9,
+               ncube+10, ncube+11, ncube+12, ncube+13, ncube+14,
+               ncube+15, ncube+16, ncube+17, ncube+18, ncube+19);
+      }
+      else if ( ! strncmp(argv[i],arg_max,strlen(arg_max)) ) {
+        sscanf(argv[i]+strlen(arg_max),"%d",&max_iter);
+      }
+      else if ( ! strncmp(argv[i],arg_trials,strlen(arg_trials)) ) {
+        sscanf(argv[i]+strlen(arg_trials),"%d",&trials);
+      }
+      else if ( ! strncmp(argv[i],arg_print,strlen(arg_print)) ) {
+        sscanf(argv[i]+strlen(arg_print),"%d",&print_iter);
+      }
+      else if ( ! strncmp(argv[i],arg_file,strlen(arg_file)) ) {
+        char buffer[256] = "" ; /* stays empty if nothing follows "print_file=" */
+        sscanf(argv[i]+strlen(arg_file),"%255s",buffer); /* field width keeps the name inside buffer */
+        if ( NULL == ( print_file = fopen(buffer,"a") ) ) { perror(buffer); print_file = stdout ; } /* fall back to stdout */
+      }
+    }
+  }
+
+#ifdef HAVE_MPI
+  {
+    MPI_Bcast( & nt , 1 , MPI_INT , 0 , MPI_COMM_WORLD );
+    MPI_Bcast( & gbox[0][0] , 6 , MPI_INT , 0 , MPI_COMM_WORLD );
+    MPI_Bcast( ncube , max_cube , MPI_INT , 0 , MPI_COMM_WORLD );
+    MPI_Bcast( & max_iter , 1 , MPI_INT , 0 , MPI_COMM_WORLD );
+    MPI_Bcast( & print_iter , 1 , MPI_INT , 0 , MPI_COMM_WORLD );
+    MPI_Bcast( & trials , 1 , MPI_INT , 0 , MPI_COMM_WORLD );
+  }
+#endif
+  /* The thread pool is started only when a non-zero thread count was requested. */
+  if ( nt ) {
+    TPI_Init( nt );
+    TPI_Block();
+    TPI_Unblock();
+  }
+
+  if ( ! my_p ) {
+    fprintf(print_file,"\"PROC\" , \"THREAD\" , \"EQUATION\" , \"NON-ZERO\" , \"MXV\"    , \"AXPBY\"  , \"DOT\" , \"Xerror\" , \"Iter\"\n");
+    fprintf(print_file,"\"COUNT\" , \"COUNT\"  , \"COUNT\"    , \"COUNT\"    , \"Mflops\" , \"Mflops\" , \"Mflops\" , \"L2norm\" , \"COUNT\"\n");
+  }
+  /* The first pass always runs (with gbox unless ncube[0] is set); further passes run while ncube[ntest] is non-zero. */
+  for ( ntest = 0 ; ! ntest || ( ntest < max_cube && ncube[ntest] ) ; ++ntest ) {
+    struct cgsolve_data cgdata ;
+
+    if ( ncube[ntest] ) {
+      gbox[0][1] = gbox[1][1] = gbox[2][1] = ncube[ntest] ;
+    }
+
+    hpccg_alloc_and_fill( np, my_p, (const int (*)[2]) gbox, ghost, &cgdata);
+
+    cgdata.max_iter   = max_iter ;
+    cgdata.print_iter = print_iter ;
+    cgdata.tolerance  = tolerance ;
+
+    {
+      double dt_mxv[2] = { 0 , 0 };
+      double dt_axpby[2] = { 0 , 0 };
+      double dt_dot[2] = { 0 , 0 };
+      VECTOR_SCALAR norm_resid = 0.0 ;
+      int iter_count = 0 ;
+      int iter_total = 0 ;
+      int k ;
+
+      VECTOR_SCALAR * const b      = (VECTOR_SCALAR *) malloc( sizeof(VECTOR_SCALAR) * cgdata.nRow );
+      VECTOR_SCALAR * const x      = (VECTOR_SCALAR *) malloc( sizeof(VECTOR_SCALAR) * cgdata.nRow );
+      VECTOR_SCALAR * const xexact = (VECTOR_SCALAR *) malloc( sizeof(VECTOR_SCALAR) * cgdata.nRow );
+
+      {
+        const VECTOR_SCALAR value = 1.0 /* 1.0 / 3.0 */ ;
+        int i ;
+        for ( i = 0 ; i < cgdata.nRow ; ++i ) xexact[i] = value ;
+      }
+      /* Every trial restarts from x = 0 and solves A x = A xexact; timings and iteration counts accumulate. */
+      for ( k = 0 ; k < trials ; ++k ) {
+        int i ;
+
+        for ( i = 0 ; i < cgdata.nRow ; ++i ) { x[i] = 0.0 ; }
+
+        cgsolve_set_lhs( & cgdata , xexact , b );
+
+        cgsolve( & cgdata, b, x,
+                 & iter_count, & norm_resid,
+                 dt_mxv , dt_axpby , dt_dot );
+
+        iter_total += iter_count ;
+      }
+
+      {
+        int nnzGlobal = cgdata.A_pc[ cgdata.nRow ];
+        double error[2] = { 0 , 0 };
+        /* error[0] = |x - xexact|^2 and error[1] = |xexact|^2 over the local rows. */
+        for ( k = 0 ; k < cgdata.nRow ; ++k ) {
+          error[0] += ( x[k] - xexact[k] ) * ( x[k] - xexact[k] );
+          error[1] += xexact[k] * xexact[k] ;
+        }
+
+#ifdef HAVE_MPI
+        {
+          double error_global[2] = { 0.0 , 0.0 };
+          int nnz = nnzGlobal ;
+
+          MPI_Allreduce( & nnz , & nnzGlobal , 1 , MPI_INT , MPI_SUM ,
+                         MPI_COMM_WORLD );
+
+          MPI_Allreduce( error , error_global , 2 , MPI_DOUBLE , MPI_SUM ,
+                         MPI_COMM_WORLD );
+
+          error[0] = error_global[0];
+          error[1] = error_global[1];
+        }
+#endif
+
+        error[0] = sqrt( error[0] );
+        error[1] = sqrt( error[1] );
+
+        if ( ! my_p ) {
+          const int nRowGlobal = ( gbox[0][1] - gbox[0][0] ) *
+                                 ( gbox[1][1] - gbox[1][0] ) *
+                                 ( gbox[2][1] - gbox[2][0] );
+          /* Rates: estimated operation counts scaled by 1e-6 and divided by the accumulated times. */
+          const double mflop_mxv =
+             1.0e-6 * ( iter_total ) * 2 * nnzGlobal / dt_mxv[0] ;
+
+          const double mflop_axpby =
+             1.0e-6 * ( iter_total * 3 ) * 3 * nRowGlobal / dt_axpby[0] ;
+
+          const double mflop_dot =
+             1.0e-6 * ( iter_total * 2 ) * 2 * nRowGlobal / dt_dot[0] ;
+
+          fprintf(print_file,"%8d , %8d , %8d , %8d , %10g , %10g , %10g , %g , %d\n",
+                  np , nt , nRowGlobal , nnzGlobal ,
+                  mflop_mxv , mflop_axpby , mflop_dot ,
+                  error[0] / error[1] , iter_total );
+          fflush(print_file);
+        }
+      }
+
+      free( xexact );
+      free( x );
+      free( b );
+    }
+    free( cgdata.A_a );
+    free( cgdata.A_ia );
+    free( cgdata.A_pc );
+    free( cgdata.recv_pc );
+    free( cgdata.send_pc );
+    free( cgdata.send_id );
+  }
+
+  if ( nt ) { TPI_Finalize(); }
+  if ( print_file != stdout ) { fclose( print_file ); } /* only set by rank 0 when print_file= opened successfully */
+#ifdef HAVE_MPI
+  MPI_Finalize();
+#endif
+
+  return 0 ;
+}
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/test/hpccg/tpi_vector.c b/openmp-avx512/basic/optional/ThreadPool/test/hpccg/tpi_vector.c
new file mode 100644
index 0000000..1b8a26c
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/test/hpccg/tpi_vector.c
@@ -0,0 +1,273 @@
+#include <stdio.h>
+
+#include <stddef.h>
+
+#include <TPI.h>
+#include <tpi_vector.h>
+
+/*--------------------------------------------------------------------*/
+
+struct tpi_work_vector {
+        VECTOR_SCALAR alpha ;   /* fill value in tpi_work_fill; multiplier of x in tpi_work_axpby */
+        VECTOR_SCALAR beta ;    /* multiplier of w in tpi_work_scale and tpi_work_axpby */
+  const VECTOR_SCALAR * x ;     /* read-only input vector */
+  const VECTOR_SCALAR * y ;     /* second read-only input, used by the dot-product worker */
+        VECTOR_SCALAR * w ;     /* vector written by the fill, scale, copy and axpby workers */
+        int  n ;                /* element count that tpi_work_span splits among the workers */
+};
+
+/* Gives worker work->rank its index range [*iBeg,*iEnd) out of n items; *iEnd never exceeds n. */
+static void tpi_work_span( TPI_Work * const work , const int n ,
+                           int * const iBeg , int * const iEnd )
+{
+  const int span  = ( n + work->count - 1 ) / work->count ; /* n / count, rounded up */
+  const int first = span * work->rank ;
+  *iBeg = first ; *iEnd = ( first + span > n ) ? n : first + span ;
+}
+
+/*--------------------------------------------------------------------*/
+
+/* Worker: stores info->alpha into each element of info->w within this worker's span. */
+static void tpi_work_fill( TPI_Work * work )
+{
+  const struct tpi_work_vector * const info =
+    (struct tpi_work_vector *) work->info ;
+  const VECTOR_SCALAR value = info->alpha ;
+  int pos , stop ;
+
+  tpi_work_span( work , info->n , & pos , & stop );
+
+  while ( pos < stop ) {
+    info->w[ pos++ ] = value ;
+  }
+}
+
+/* Sets x[0..n) to alpha by dispatching tpi_work_fill through TPI_Run_threads. */
+void tpi_fill( int n , VECTOR_SCALAR alpha , VECTOR_SCALAR * x )
+{
+  struct tpi_work_vector args = { 0.0 , 0.0 , NULL , NULL , NULL , 0 };
+  args.n = n ; args.w = x ; args.alpha = alpha ;
+
+  TPI_Run_threads( tpi_work_fill , & args , 0 );
+}
+
+/*--------------------------------------------------------------------*/
+
+/* Worker: multiplies each element of info->w within this worker's span by info->beta. */
+static void tpi_work_scale( TPI_Work * work )
+{
+  const struct tpi_work_vector * const info =
+    (struct tpi_work_vector *) work->info ;
+  const VECTOR_SCALAR factor = info->beta ;
+  VECTOR_SCALAR * const dst = info->w ;
+  int pos , stop ;
+  tpi_work_span( work , info->n , & pos , & stop );
+
+  for ( ; pos < stop ; ++pos ) {
+    dst[pos] = dst[pos] * factor ;
+  }
+}
+
+/* Scales x[0..n) in place by alpha, dispatching tpi_work_scale through TPI_Run_threads. */
+void tpi_scale( int n , const VECTOR_SCALAR alpha , VECTOR_SCALAR * x )
+{
+  struct tpi_work_vector tmp = { 0.0 , 0.0 , NULL , NULL , NULL , 0 };
+  tmp.beta = alpha ; /* tpi_work_scale multiplies by 'beta'; storing the factor in 'alpha' left beta at 0 and zeroed x */
+  tmp.w = x ; tmp.n = n ;
+  TPI_Run_threads( tpi_work_scale , & tmp , 0 );
+}
+
+/*--------------------------------------------------------------------*/
+
+/* Worker: copies info->x into info->w for the indices of this worker's span. */
+static void tpi_work_copy( TPI_Work * work )
+{
+  const struct tpi_work_vector * const info =
+    (struct tpi_work_vector *) work->info ;
+  const VECTOR_SCALAR * const src = info->x ;
+  VECTOR_SCALAR * const dst = info->w ;
+  int pos , stop ;
+  tpi_work_span( work , info->n , & pos , & stop );
+
+  while ( pos < stop ) {
+    dst[pos] = src[pos] ; ++pos ;
+  }
+}
+
+/* Copies x[0..n) into y[0..n) by dispatching tpi_work_copy through TPI_Run_threads. */
+void tpi_copy( int n , const VECTOR_SCALAR * x , VECTOR_SCALAR * y )
+{
+  struct tpi_work_vector args = { 0.0 , 0.0 , NULL , NULL , NULL , 0 };
+  args.n = n ; args.w = y ; args.x = x ;
+
+  TPI_Run_threads( tpi_work_copy , & args , 0 );
+}
+
+/*--------------------------------------------------------------------*/
+
+/* Worker: w[i] = alpha * x[i] + beta * w[i] for the indices of this worker's span. */
+static void tpi_work_axpby( TPI_Work * work )
+{
+  const struct tpi_work_vector * const info =
+    (struct tpi_work_vector *) work->info ;
+  const VECTOR_SCALAR * const src = info->x ;
+  VECTOR_SCALAR * const dst = info->w ;
+  const VECTOR_SCALAR a = info->alpha ;
+  const VECTOR_SCALAR b = info->beta ;
+  int pos , stop ;
+  tpi_work_span( work , info->n , & pos , & stop );
+
+  for ( ; pos < stop ; ++pos ) {
+    dst[pos] = a * src[pos] + b * dst[pos] ;
+  }
+}
+
+/* y = alpha * x + beta * y over [0,n), dispatched through TPI_Run_threads. */
+void tpi_axpby( int n , VECTOR_SCALAR alpha , const VECTOR_SCALAR * x ,
+                        VECTOR_SCALAR beta  ,       VECTOR_SCALAR * y )
+{
+  struct tpi_work_vector args = { 0.0 , 0.0 , NULL , NULL , NULL , 0 };
+  /* Each vector is paired with its coefficient; y is both input and output (field w). */
+  args.n = n ;
+  args.x = x ; args.alpha = alpha ;
+  args.w = y ; args.beta  = beta ;
+
+  TPI_Run_threads( tpi_work_axpby , & args , 0 );
+}
+
+/*--------------------------------------------------------------------*/
+
+/* Worker: adds the sum of x[i] * y[i] over this worker's span to the value in work->reduce. */
+static void tpi_work_dot_partial( TPI_Work * work )
+{
+  const struct tpi_work_vector * const info =
+    (struct tpi_work_vector *) work->info ;
+  VECTOR_SCALAR * const acc = (VECTOR_SCALAR *) work->reduce ;
+  const VECTOR_SCALAR * const lhs = info->x ;
+  const VECTOR_SCALAR * const rhs = info->y ;
+  VECTOR_SCALAR sum = *acc ;   /* continue from the value already stored in the reduce slot */
+  int pos , stop ;
+
+  tpi_work_span( work , info->n , & pos , & stop );
+
+  while ( pos < stop ) { sum += lhs[pos] * rhs[pos] ; ++pos ; }
+
+  *acc = sum ;
+}
+
+/* Worker: adds the sum of x[i] * x[i] over this worker's span to the value in work->reduce. */
+static void tpi_work_dot_partial_self( TPI_Work * work )
+{
+  const struct tpi_work_vector * const info =
+    (struct tpi_work_vector *) work->info ;
+  VECTOR_SCALAR * const acc = (VECTOR_SCALAR *) work->reduce ;
+  const VECTOR_SCALAR * const src = info->x ;
+  VECTOR_SCALAR sum = *acc ;   /* continue from the value already stored in the reduce slot */
+  int pos , stop ;
+
+  tpi_work_span( work , info->n , & pos , & stop );
+
+  for ( ; pos < stop ; ++pos ) {
+    sum += src[pos] * src[pos] ;
+  }
+  *acc = sum ;
+}
+
+static void tpi_work_dot_join( TPI_Work * work , const void * src  )
+{ /* Reduction join callback: adds the partial sum stored at 'src' to the value in work->reduce. */
+  ((VECTOR_SCALAR *) work->reduce)[0] += ((const VECTOR_SCALAR *) src)[0] ;
+}
+
+static void tpi_work_dot_init( TPI_Work * work )
+{ /* Reduction init callback: sets the value in work->reduce to zero. */
+  ((VECTOR_SCALAR *) work->reduce)[0] = 0 ;
+}
+
+/* Returns the sum of x[i] * y[i] over [0,n), reduced through TPI_Run_threads_reduce. */
+VECTOR_SCALAR tpi_dot( int n , const VECTOR_SCALAR * x , const VECTOR_SCALAR * y )
+{
+  struct tpi_work_vector args = { 0.0 , 0.0 , NULL , NULL , NULL , 0 };
+  VECTOR_SCALAR total = 0.0 ;
+  /* When both operands are the same pointer, use the worker that reads only x. */
+  void (* const partial)( TPI_Work * ) =
+    ( x == y ) ? tpi_work_dot_partial_self : tpi_work_dot_partial ;
+
+  args.x = x ;
+  args.y = y ;
+  args.n = n ;
+
+  TPI_Run_threads_reduce( partial , & args ,
+                          tpi_work_dot_join , tpi_work_dot_init ,
+                          sizeof(total) , & total );
+
+  return total ;
+}
+
+/*--------------------------------------------------------------------*/
+
+struct tpi_crs_matrix {
+        int      nRow ;          /* number of rows split among the workers */
+  const int    * A_pc ;          /* row offsets: row i uses coefficients [ A_pc[i] , A_pc[i+1] ) */
+  const int    * A_ia ;          /* index into x for each coefficient */
+  const MATRIX_SCALAR * A_a ;    /* coefficient values, parallel to A_ia */
+  const VECTOR_SCALAR * x ;      /* input vector, indexed through A_ia */
+        VECTOR_SCALAR * y ;      /* output vector, one entry per row */
+};
+
+/* Worker: for each row of this worker's span, y[row] = sum of A_a[k] * x[ A_ia[k] ] for k in [ A_pc[row] , A_pc[row+1] ). */
+static void tpi_work_crs_matrix_apply( TPI_Work * work )
+{
+  const struct tpi_crs_matrix * const h =
+    (struct tpi_crs_matrix *) work->info ;
+  const int   * const A_pc = h->A_pc ;
+  const int   * const A_ia = h->A_ia ;
+  const MATRIX_SCALAR * const A_a  = h->A_a ;
+  const VECTOR_SCALAR * const x = h->x ;
+  const int nRow  = h->nRow ;
+  const int chunk = ( nRow + work->count - 1 ) / work->count ; /* rows per worker, rounded up */
+
+  int row    = chunk * work->rank ;
+  int rowEnd = chunk + row ;
+
+  /* Rounding up can place a trailing worker's whole span beyond nRow (nRow=5, count=4, rank=3 gives row=6): clamp both ends so the '!=' loop below starts at its end. */
+  if ( nRow < rowEnd ) { rowEnd = nRow ; }
+  if ( nRow < row    ) { row    = nRow ; }
+  {
+    const int * const pc_end = A_pc + rowEnd ;
+    const int *       pc     = A_pc + row ;
+    VECTOR_SCALAR *   y      = h->y + row ;
+
+    for ( ; pc != pc_end ; ++pc , ++y ) {
+      const int   *       ia    = A_ia + *pc ;
+      const MATRIX_SCALAR *       a     = A_a  + *pc ;
+      const MATRIX_SCALAR * const a_end = A_a  + pc[1] ;
+      VECTOR_SCALAR tmp = 0 ;
+      for ( ; a != a_end ; ++a , ++ia ) {
+        tmp += *a * x[ *ia ];
+      }
+      *y = tmp ;
+    }
+  }
+}
+
+/*--------------------------------------------------------------------*/
+
+/* y = A * x for the nRow-row matrix ( A_pc , A_ia , A_a ), dispatched through TPI_Run_threads. */
+void tpi_crs_matrix_apply(
+  const int      nRow ,
+  const int    * A_pc ,
+  const int    * A_ia ,
+  const MATRIX_SCALAR * A_a ,
+  const VECTOR_SCALAR * x ,
+        VECTOR_SCALAR * y )
+{
+  struct tpi_crs_matrix args = { 0 , NULL , NULL , NULL , NULL , NULL };
+  args.nRow = nRow ;
+  args.A_pc = A_pc ; args.A_ia = A_ia ; args.A_a = A_a ;
+  args.x = x ;
+  args.y = y ;
+
+  TPI_Run_threads( tpi_work_crs_matrix_apply , & args , 0 );
+}
+
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/test/hpccg/tpi_vector.h b/openmp-avx512/basic/optional/ThreadPool/test/hpccg/tpi_vector.h
new file mode 100644
index 0000000..bcd514e
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/test/hpccg/tpi_vector.h
@@ -0,0 +1,31 @@
+
+#include <ThreadPool_config.h>
+
+#ifndef tpi_vector_h
+#define tpi_vector_h
+/* Scalar types for vector entries and matrix coefficients in the hpccg test. */
+#define VECTOR_SCALAR float
+#define MATRIX_SCALAR float
+/* x[i] = alpha for i in [0,n) */
+void tpi_fill( int n , VECTOR_SCALAR alpha , VECTOR_SCALAR * x );
+/* In-place scaling of x[0..n); alpha is the requested factor. */
+void tpi_scale( int n , const VECTOR_SCALAR alpha , VECTOR_SCALAR * x );
+/* y[i] = x[i] for i in [0,n) */
+void tpi_copy( int n , const VECTOR_SCALAR * x , VECTOR_SCALAR * y );
+/* y[i] = alpha * x[i] + beta * y[i] for i in [0,n) */
+void tpi_axpby( int n , VECTOR_SCALAR alpha , const VECTOR_SCALAR * x ,
+                        VECTOR_SCALAR beta  ,       VECTOR_SCALAR * y );
+/* Returns the sum of x[i] * y[i] for i in [0,n) */
+VECTOR_SCALAR tpi_dot( int n , const VECTOR_SCALAR * x ,
+                               const VECTOR_SCALAR * y );
+/* y[r] = sum of A_a[k] * x[ A_ia[k] ] for k in [ A_pc[r] , A_pc[r+1] ), r in [0,nRow) */
+void tpi_crs_matrix_apply(
+  const int             nRow ,
+  const int           * A_pc ,
+  const int           * A_ia ,
+  const MATRIX_SCALAR * A_a ,
+  const VECTOR_SCALAR * x ,
+        VECTOR_SCALAR * y );
+
+#endif
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/test/test_c_dnax.c b/openmp-avx512/basic/optional/ThreadPool/test/test_c_dnax.c
new file mode 100644
index 0000000..4f6ab9b
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/test/test_c_dnax.c
@@ -0,0 +1,414 @@
+/*------------------------------------------------------------------------*/
+/*                    TPI: Thread Pool Interface                          */
+/*                Copyright (2008) Sandia Corporation                     */
+/*                                                                        */
+/*  Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive   */
+/*  license for use of this work by or on behalf of the U.S. Government.  */
+/*                                                                        */
+/*  This library is free software; you can redistribute it and/or modify  */
+/*  it under the terms of the GNU Lesser General Public License as        */
+/*  published by the Free Software Foundation; either version 2.1 of the  */
+/*  License, or (at your option) any later version.                       */
+/*                                                                        */
+/*  This library is distributed in the hope that it will be useful,       */
+/*  but WITHOUT ANY WARRANTY; without even the implied warranty of        */
+/*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU     */
+/*  Lesser General Public License for more details.                       */
+/*                                                                        */
+/*  You should have received a copy of the GNU Lesser General Public      */
+/*  License along with this library; if not, write to the Free Software   */
+/*  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307   */
+/*  USA                                                                   */
+/*------------------------------------------------------------------------*/
+/**
+ * @author H. Carter Edwards
+ *
+ *  Multi-array 'axpby'
+ */
+
+#include <math.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <TPI.h>
+
+#if defined( HAVE_MPI )
+#include <mpi.h>
+#endif
+
+int test_c_tpi_dnax( int , int );
+
+int main( int argc , char ** argv )  /* Run the multi-array 'axpby' timing test once per entry of num_thread. */
+{
+  int num_thread[] = { 1 , 2 , 4 , 6 , 8 , 12 , 16 };  /* thread counts to test, in this order */
+  int num_test = sizeof(num_thread) / sizeof(int);
+ 
+  const int ntrial = 1 < argc ? atoi( argv[1] ) : 2 ;  /* optional first argument: trials per measurement (default 2) */
+  int i ;
+
+#if defined( HAVE_MPI )
+  int rank ;
+ 
+  MPI_Init( & argc , & argv );
+  MPI_Comm_rank( MPI_COMM_WORLD , & rank );
+  if ( 0 == rank ) {  /* only rank 0 runs the tests; other ranks go straight to MPI_Finalize */
+#endif
+
+
+  fprintf( stdout , "\"TESTING Multiarray 'axpby' with: %s\"\n" ,
+           TPI_Version() );
+ 
+  for ( i = 0 ; i < num_test ; ++i ) {
+    test_c_tpi_dnax( num_thread[i] , ntrial );  /* return value is not inspected */
+  }
+
+#if defined( HAVE_MPI )
+  }
+  MPI_Finalize();
+#endif
+ 
+  return 0 ;
+}
+
+/*------------------------------------------------------------------------*/
+
+typedef double SCALAR ;
+
+/*------------------------------------------------------------------------*/
+
+struct TestTPI_DNAX {       /* shared, read-only description of one timed run, reached through work->info */
+  SCALAR * coef ;           /* one coefficient per array */
+  SCALAR * array ;          /* base of the array set used for the current cycle */
+  unsigned number ;         /* how many arrays are combined */
+  unsigned length ;         /* elements per array */
+  unsigned stride ;         /* distance between consecutive arrays in the flat layout */
+  unsigned chunk_length ;   /* elements handled by one work item (the last one may get fewer) */
+};
+
+/*------------------------------------------------------------------------*/
+
+static
+void test_dnax_column( const unsigned num_array ,
+                       const unsigned stride ,
+                       const unsigned length ,
+                       const SCALAR * const coef ,
+                       SCALAR * const array )
+{
+  /* For each position p: array[p] = sum over k of coef[k] * array[p + k*stride]. */
+  unsigned p , k ;
+  for ( p = 0 ; p < length ; ++p ) {
+    SCALAR * const base = array + p ;
+    SCALAR sum = 0 ;
+    for ( k = 0 ; k < num_array ; ++k ) { sum += coef[k] * base[ k * stride ] ; }
+    *base = sum ;
+  }
+}
+
+static
+void test_dnax_row( const unsigned num_array ,
+                    const unsigned stride ,
+                    const unsigned length ,
+                    const SCALAR * const coef ,
+                    SCALAR * const array )
+{
+  /* For each row r: array[r*stride] = sum over k of coef[k] * array[r*stride + k]. */
+  unsigned r , k ;
+  for ( r = 0 ; r < length ; ++r ) {
+    SCALAR * const row = array + r * stride ;
+    SCALAR sum = 0 ;
+    for ( k = 0 ; k < num_array ; ++k ) { sum += coef[k] * row[k] ; }
+    *row = sum ;
+  }
+}
+
+/*------------------------------------------------------------------------*/
+/*  The multi-array storage is flat: every array is fully contiguous.
+ *  Work corresponds to a span of the array.
+ */
+static
+void test_dnax_flat_work( TPI_Work * work )
+{
+  /* Work item 'rank' handles elements [offset, offset+count) of every array. */
+  const struct TestTPI_DNAX * const d = (struct TestTPI_DNAX *) work->info ;
+
+  const unsigned chunk  = work->rank ;
+  const unsigned offset = d->chunk_length * chunk ;
+  const unsigned remain = d->length - offset ;
+  unsigned count = d->chunk_length ;
+
+  /* The final chunk may be shorter than chunk_length. */
+  if ( remain <= count ) { count = remain ; }
+
+  /* Arrays are 'stride' apart, so this chunk starts 'offset' into the first one. */
+  test_dnax_column( d->number , d->stride , count ,
+                    d->coef , d->array + offset );
+
+  return ;
+}
+
+/*  The multi-array storage is chunked: within a chunk each array's piece
+ *  is contiguous, and the pieces of all arrays for that chunk are adjacent.
+ */
+static
+void test_dnax_column_work( TPI_Work * work )
+{
+  /* Work item 'rank' owns one block of chunk_length * number values. */
+  const struct TestTPI_DNAX * const d = (struct TestTPI_DNAX *) work->info ;
+
+  const unsigned chunk  = work->rank ;
+  const unsigned remain = d->length - d->chunk_length * chunk ;
+  const unsigned block  = d->chunk_length * d->number ;
+  unsigned count = d->chunk_length ;
+
+  /* The final chunk may be shorter than chunk_length. */
+  if ( remain <= count ) { count = remain ; }
+
+  /* Inside the block the pieces of the arrays are chunk_length apart, */
+  /* so chunk_length is passed as the column stride.                   */
+  test_dnax_column( d->number , d->chunk_length , count ,
+                    d->coef , d->array + chunk * block );
+
+
+  return ;
+}
+
+static
+void test_dnax_row_work( TPI_Work * work )
+{
+  /* Work item 'rank' owns one block of chunk_length * number values. */
+  const struct TestTPI_DNAX * const d = (struct TestTPI_DNAX *) work->info ;
+
+  const unsigned chunk  = work->rank ;
+  const unsigned remain = d->length - d->chunk_length * chunk ;
+  const unsigned block  = d->chunk_length * d->number ;
+  unsigned count = d->chunk_length ;
+
+  /* The final chunk may be shorter than chunk_length. */
+  if ( remain <= count ) { count = remain ; }
+
+  /* Inside the block each row holds 'number' adjacent values, */
+  /* so 'number' is passed as the row stride.                  */
+  test_dnax_row( d->number , d->number , count ,
+                 d->coef , d->array + chunk * block );
+
+
+  return ;
+}
+
+/*------------------------------------------------------------------------*/
+/* Process the same block of allocated memory three ways:
+ * as a flat array, as chunked columns, and as chunked rows.
+ */
+
+static  /* Time the flat, chunked-column and chunked-row kernels and print one CSV row per array count. */
+void test_tpi_dnax_driver( const int nthread ,               /* thread count; only used in the report */
+                           const unsigned Mflop_target ,     /* approximate work, in Mflop, per timed trial */
+                           const unsigned num_trials ,       /* timed repetitions per CSV row */
+                           const unsigned num_test ,         /* number of entries in num_test_array */
+                           const unsigned num_test_array[] , /* array counts to test; the last entry sizes the allocation */
+                           const unsigned length_array ,     /* elements per array */
+                           const unsigned length_chunk )     /* elements per work item */
+{
+  const unsigned max_array = num_test_array[ num_test - 1 ];
+
+  const unsigned num_chunk =
+    ( length_array + length_chunk - 1 ) / length_chunk ;
+
+  const unsigned stride_array = num_chunk * length_chunk ;  /* array length rounded up to whole chunks */
+  const unsigned size_alloc   = max_array * stride_array ;
+
+  SCALAR * const coef  = (SCALAR *) malloc( max_array * sizeof(SCALAR) );
+  SCALAR * const array = (SCALAR *) malloc( size_alloc * sizeof(SCALAR) );
+
+  struct TestTPI_DNAX data = { NULL , NULL , 0 , 0 , 0 , 0 };
+
+  unsigned i_test , i , j ;
+
+  data.coef = coef ;
+
+  if ( NULL == coef || NULL == array ) {  /* coef is written right below, so it must be checked as well */
+    fprintf(stderr,"allocation failure for %u\n",size_alloc);
+    abort();
+  }
+
+  for ( i = 0 ; i < max_array ; ++i ) { coef[i] = 0 ; }
+
+  printf("\n\"test_tpi_dnax[%d]( length_array = %u , stride_array = %u )\"\n",
+         nthread , length_array , stride_array );
+  printf("\"NUMBER OF THREADS\" , %d\n" , nthread );
+  printf("\"NUMBER OF CHUNKS\" , %u\n" , num_chunk );
+  printf("\"NUMBER OF TRIALS\" , %u \n", num_trials );
+
+  printf("\"TEST\" , \"#ARRAY\" , \"DT-MEAN\" , \"DT-STDDEV\" , \"MFLOP-MEAN\" , \"MFLOP-STDDEV\"\n");
+
+  /*----------------------------------------------------------------------*/
+
+  for ( i_test = 0 ; i_test < num_test ; ++i_test ) {
+    const unsigned num_array = num_test_array[ i_test ];
+    const unsigned num_sets  = max_array / num_array ;  /* disjoint array sets that fit in the allocation */
+
+    const double mflop_cycle =
+      ((double)( 2 * num_array * length_array )) / 1.0e6 ;
+
+    const unsigned ncycle = 1 + (unsigned)( Mflop_target / mflop_cycle );
+
+    double dt_sum = 0 ;
+    double dt_sum_2 = 0 ;
+
+    data.length       = length_array ;
+    data.number       = num_array ;
+    data.stride       = stride_array ;
+    data.chunk_length = length_chunk ;
+
+    for ( i = 0 ; i < size_alloc ; ++i ) { array[i] = 0 ; }
+
+    for ( j = 0 ; j < num_trials ; ++j ) {
+
+      double dt_tmp = TPI_Walltime();
+      for ( i = 0 ; i < ncycle ; ++i ) {
+        data.array = array + stride_array * num_array * ( i % num_sets );  /* rotate through the sets */
+        TPI_Run( & test_dnax_flat_work , & data , num_chunk , 0 );
+      }
+      dt_tmp = TPI_Walltime() - dt_tmp ;
+
+      dt_sum += dt_tmp ;
+      dt_sum_2 += dt_tmp * dt_tmp ;
+    }
+
+    {
+      const double dt_mean = dt_sum / num_trials ;
+      const double dt_sdev = num_trials < 2 ? 0.0 : sqrt( ( num_trials * dt_sum_2 - dt_sum * dt_sum ) / ( num_trials * ( num_trials - 1 ) ) );  /* one trial has no spread; avoids dividing by zero */
+      const double mflop_mean = mflop_cycle * ncycle / dt_mean ;
+      const double mflop_sdev = mflop_mean * dt_sdev / ( dt_mean + dt_sdev );
+
+      printf("\"FLAT  ARRAY\"  , %6u , %9.5g , %9.3g , %9.5g , %9.3g\n",
+             num_array, dt_mean, dt_sdev, mflop_mean, mflop_sdev );
+    }
+  }
+
+  /*----------------------------------------------------------------------*/
+
+  for ( i_test = 0 ; i_test < num_test ; ++i_test ) {
+
+    const unsigned num_array = num_test_array[ i_test ];
+    const unsigned num_sets  = max_array / num_array ;
+
+    const double mflop_cycle =
+      ((double)( 2 * num_array * length_array )) / 1.0e6 ;
+
+    const unsigned ncycle = 1 + (unsigned)( Mflop_target / mflop_cycle );
+
+    double dt_sum = 0 ;
+    double dt_sum_2 = 0 ;
+
+    data.length       = length_array ;
+    data.number       = num_array ;
+    data.stride       = stride_array ;
+    data.chunk_length = length_chunk ;
+
+    for ( i = 0 ; i < size_alloc ; ++i ) { array[i] = 0 ; }
+
+    for ( j = 0 ; j < num_trials ; ++j ) {
+
+      double dt_tmp = TPI_Walltime();
+      for ( i = 0 ; i < ncycle ; ++i ) {
+        data.array = array + stride_array * num_array * ( i % num_sets );
+        TPI_Run( & test_dnax_column_work , & data , num_chunk , 0 );
+      }
+      dt_tmp = TPI_Walltime() - dt_tmp ;
+
+      dt_sum += dt_tmp ;
+      dt_sum_2 += dt_tmp * dt_tmp ;
+    }
+
+    {
+      const double dt_mean = dt_sum / num_trials ;
+      const double dt_sdev = num_trials < 2 ? 0.0 : sqrt( ( num_trials * dt_sum_2 - dt_sum * dt_sum ) / ( num_trials * ( num_trials - 1 ) ) );
+      const double mflop_mean = mflop_cycle * ncycle / dt_mean ;
+      const double mflop_sdev = mflop_mean * dt_sdev / ( dt_mean + dt_sdev );
+
+      printf("\"CHUNK COLUMN\" , %6u , %9.5g , %9.3g , %9.5g , %9.3g\n",
+             num_array, dt_mean, dt_sdev, mflop_mean, mflop_sdev );
+    }
+  }
+
+  /*----------------------------------------------------------------------*/
+
+  for ( i_test = 0 ; i_test < num_test ; ++i_test ) {
+
+    const unsigned num_array = num_test_array[ i_test ];
+    const unsigned num_sets  = max_array / num_array ;
+
+    const double mflop_cycle =
+      ((double)( 2 * num_array * length_array )) / 1.0e6 ;
+
+    const unsigned ncycle = 1 + (unsigned)( Mflop_target / mflop_cycle );
+
+    double dt_sum = 0 ;
+    double dt_sum_2 = 0 ;
+
+    data.length       = length_array ;
+    data.number       = num_array ;
+    data.stride       = stride_array ;
+    data.chunk_length = length_chunk ;
+
+    for ( i = 0 ; i < size_alloc ; ++i ) { array[i] = 0 ; }
+
+    for ( j = 0 ; j < num_trials ; ++j ) {
+
+      double dt_tmp = TPI_Walltime();
+
+      for ( i = 0 ; i < ncycle ; ++i ) {
+        data.array = array + stride_array * num_array * ( i % num_sets );
+        TPI_Run( & test_dnax_row_work , & data , num_chunk , 0 );
+      }
+      dt_tmp = TPI_Walltime() - dt_tmp ;
+
+      dt_sum += dt_tmp ;
+      dt_sum_2 += dt_tmp * dt_tmp ;
+    }
+
+    {
+      const double dt_mean = dt_sum / num_trials ;
+      const double dt_sdev = num_trials < 2 ? 0.0 : sqrt( ( num_trials * dt_sum_2 - dt_sum * dt_sum ) / ( num_trials * ( num_trials - 1 ) ) );
+      const double mflop_mean = mflop_cycle * ncycle / dt_mean ;
+      const double mflop_sdev = mflop_mean * dt_sdev / ( dt_mean + dt_sdev );
+
+      printf("\"CHUNK ROW\"    , %6u , %9.5g , %9.3g , %9.5g , %9.3g\n",
+             num_array, dt_mean, dt_sdev, mflop_mean, mflop_sdev );
+    }
+  }
+
+  /*----------------------------------------------------------------------*/
+
+  free( array );
+  free( coef );
+}
+
+/*------------------------------------------------------------------------*/
+
+int test_c_tpi_dnax( int nthread , int ntrial )
+{
+  /* Array counts exercised for each of the three storage layouts. */
+  const unsigned array_counts[6] = { 2 , 5 , 10 , 20 , 50 , 100 };
+  const unsigned count_len = sizeof(array_counts) / sizeof(array_counts[0]);
+  const unsigned mflop_per_thread = 10 ;
+  const int trials = ( ntrial <= 0 ) ? 7 : ntrial ;  /* a non-positive request falls back to 7 */
+
+  TPI_Init( nthread );
+
+  test_tpi_dnax_driver( nthread ,
+                        mflop_per_thread * nthread ,
+                        trials       /* number trials */ ,
+                        count_len    /* number of tests */ ,
+                        array_counts /* number of arrays for each test */ ,
+                        1e6          /* array computation length */ ,
+                        1000         /* chunk length */ );
+
+  TPI_Finalize();
+
+  /* Always reports success; the driver only prints timings. */
+  return 0 ;
+}
+
+
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/test/test_mpi_sum.c b/openmp-avx512/basic/optional/ThreadPool/test/test_mpi_sum.c
new file mode 100644
index 0000000..51d6b9e
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/test/test_mpi_sum.c
@@ -0,0 +1,764 @@
+/*------------------------------------------------------------------------*/
+/*                    TPI: Thread Pool Interface                          */
+/*                Copyright (2008) Sandia Corporation                     */
+/*                                                                        */
+/*  Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive   */
+/*  license for use of this work by or on behalf of the U.S. Government.  */
+/*                                                                        */
+/*  This library is free software; you can redistribute it and/or modify  */
+/*  it under the terms of the GNU Lesser General Public License as        */
+/*  published by the Free Software Foundation; either version 2.1 of the  */
+/*  License, or (at your option) any later version.                       */
+/*                                                                        */
+/*  This library is distributed in the hope that it will be useful,       */
+/*  but WITHOUT ANY WARRANTY; without even the implied warranty of        */
+/*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU     */
+/*  Lesser General Public License for more details.                       */
+/*                                                                        */
+/*  You should have received a copy of the GNU Lesser General Public      */
+/*  License along with this library; if not, write to the Free Software   */
+/*  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307   */
+/*  USA                                                                   */
+/*------------------------------------------------------------------------*/
+/**
+ * @author H. Carter Edwards
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <TPI.h>
+#include <ThreadPool_config.h>
+
+int rand_r( unsigned int * );
+
+/*--------------------------------------------------------------------*/
+
+#if defined(HAVE_MPI)
+
+#include <mpi.h>
+
+typedef MPI_Comm COMM ;
+
+#else
+
+typedef int COMM ;
+
+#endif
+
+static int comm_size( COMM );
+static int comm_rank( COMM );
+static void comm_reduce_dmax( COMM , double * );
+static void comm_reduce_dsum( COMM , double * );
+static void comm_reduce_d4_sum( COMM , double * );
+
+/*--------------------------------------------------------------------*/
+
+static void my_span( const unsigned count , const unsigned rank ,  /* Give worker 'rank' of 'count' its [begin, begin+length) slice of 'size' items. */
+                     const unsigned size ,
+                     unsigned * begin , unsigned * length )
+{
+  /* Slices are laid out from the back in steps of max = ceil(size/count); */
+  /* the leading slice(s) absorb the shortfall. Requires count > 0 and rank < count. */
+  const unsigned int max  = ( size + count - 1 ) / count ;
+  const unsigned int tail = max * ( count - ( rank + 1 ) );  /* items owned by higher ranks */
+  /* Clamp instead of wrapping around when size < tail (few items, many workers). */
+  const unsigned int end  = tail < size ? size - tail : 0 ;
+  const unsigned int beg  = ( rank && max < end ) ? end - max : 0 ;
+
+  *begin  = beg ;
+  *length = end - beg ;  /* empty for ranks that fall entirely before the data */
+}
+
+/*--------------------------------------------------------------------*/
+
+#define LESS_ABS( X , Y ) ( ( (X) < 0 ? -(X) : (X) ) < ( (Y) < 0 ? -(Y) : (Y) ) ) /* |X| < |Y| ; arguments are evaluated more than once */
+
+static void d2_add_d( double v[] , const double a )  /* Add a to the two-double accumulator v[0..1]; v[1] carries the correction term. */
+{
+  const int AltV = a < 0 ? ( - a < ( v[0] < 0 ? - v[0] : v[0] ) )  /* nonzero when |a| < |v[0]| */
+                         : (   a < ( v[0] < 0 ? - v[0] : v[0] ) );
+
+  const double VpA = v[0] + a ;  /* rounded sum */
+
+  v[1] += AltV ? ( a - ( VpA - v[0] ) ) : ( v[0] - ( VpA - a ) );  /* remove the larger-magnitude operand from the sum first to recover what the rounding lost */
+  v[0]  = VpA + v[1] ;  /* fold the correction back into the leading term */
+  v[1] += VpA - v[0] ;  /* keep whatever that fold rounded away; statement order matters here */
+}
+
+void d4_dot( double v[] , unsigned n , const double * x , const double * y )
+{
+  /* v[0..1] accumulates the non-negative products, v[2..3] the negative ones. */
+  double * const sum_pos = v ;
+  double * const sum_neg = v + 2 ;
+  unsigned k ;
+  for ( k = 0 ; k < n ; ++k ) {
+    const double prod = x[k] * y[k] ;
+    d2_add_d( prod < 0 ? sum_neg : sum_pos , prod );
+  }
+}
+
+double ddot( unsigned n , const double * x , const double * y )
+{
+  double acc = 0 ;  /* plain front-to-back accumulation in a single double */
+  unsigned k ;
+  for ( k = 0 ; k < n ; ++k ) { acc += x[k] * y[k] ; }
+  return acc ;
+}
+
+/*--------------------------------------------------------------------*/
+
+struct TaskXY {             /* shared description of one dot-product run, reached through work->info */
+  unsigned int   nreduce ;  /* doubles in the reduction buffer: 4 for d4_dot_tp, 1 for ddot_tp */
+  unsigned int   n ;        /* local vector length */
+  const double * x ;        /* first operand */
+  const double * y ;        /* second operand */
+};
+
+static
+void reduce_init( TPI_Work * work )
+{
+  /* Zero the reduction buffer.
+   * Only widths 1 and 4 are recognised; any other nreduce leaves it untouched.
+   */
+  const struct TaskXY * const task = (struct TaskXY *) work->info ;
+  double * const out = (double *) work->reduce ;
+  unsigned k ;
+
+  if ( task->nreduce == 1 || task->nreduce == 4 ) {
+    for ( k = 0 ; k < task->nreduce ; ++k ) {
+      out[k] = 0 ;
+    }
+  }
+}
+
+static
+void reduce_join( TPI_Work * work , const void * arg_src )
+{
+  const struct TaskXY * const task = (struct TaskXY *) work->info ;
+  double       * const out = (double *) work->reduce ;
+  const double * const in  = (const double *) arg_src ;
+
+  switch ( task->nreduce ) {
+  case 4 :  /* two two-double sums: slots [0..1] and [2..3] */
+    d2_add_d( out ,     in[0] );
+    d2_add_d( out ,     in[1] );
+    d2_add_d( out + 2 , in[2] );
+    d2_add_d( out + 2 , in[3] );
+    break ;
+  case 1 : out[0] += in[0] ; break ;  /* plain scalar sum; other widths merge nothing */
+  }
+}
+
+/*--------------------------------------------------------------------*/
+
+static  /* Work item: accumulate this item's slice of x.y into its own four-double reduction buffer. */
+void work_d4_dot_tp( TPI_Work * work )
+{
+  struct TaskXY * const info = (struct TaskXY *) work->info ;
+  double        * const dst  = (double *)        work->reduce ;  /* buffer zeroed by reduce_init */
+
+  unsigned int begin , length ;
+
+  my_span( work->count , work->rank , info->n , & begin , & length );  /* slice of [0,n) for this work item */
+
+  d4_dot( dst , length , info->x + begin , info->y + begin );
+}
+
+double d4_dot_tp( COMM comm, unsigned nwork, unsigned n,  /* Threaded dot product of the local x and y, combined across 'comm', using the four-double accumulator. */
+                  const double * x, const double * y )
+{
+  struct TaskXY info = { 4 , 0 , NULL , NULL };  /* nreduce = 4 selects the two-double paths in reduce_init / reduce_join */
+  double result[4] = { 0 , 0 , 0 , 0 };
+  info.n = n ;
+  info.x = x ;
+  info.y = y ;
+
+  if ( nwork ) {  /* caller chose the number of work items */
+    TPI_Run_reduce( work_d4_dot_tp , & info , nwork ,
+                    reduce_join, reduce_init, sizeof(result) , result );
+  }
+  else {  /* nwork == 0: TPI picks the work count (presumably one item per thread -- confirm in TPI.h) */
+    TPI_Run_threads_reduce( work_d4_dot_tp , & info ,
+                            reduce_join, reduce_init, sizeof(result), result);
+  }
+
+  comm_reduce_d4_sum( comm , result );  /* combine the four-double result across ranks */
+
+  d2_add_d( result , result[2] );  /* fold the negative-side pair into the positive-side pair */
+  d2_add_d( result , result[3] );
+
+  return result[0] ;  /* leading term of the combined sum */
+}
+
+static  /* Work item: add the plain dot product of this item's slice to its reduction slot. */
+void task_ddot_tp( TPI_Work * work )
+{
+  struct TaskXY * const info = (struct TaskXY *) work->info ;
+  double        * const dst  = (double *) work->reduce ;  /* single double, zeroed by reduce_init */
+  unsigned int begin , length ;
+
+  my_span( work->count , work->rank , info->n , & begin , & length );  /* slice of [0,n) for this work item */
+
+  *dst += ddot( length , info->x + begin , info->y + begin );
+
+  return ;
+}
+
+double ddot_tp( COMM comm, unsigned nwork, unsigned n,  /* Threaded dot product of the local x and y, summed across 'comm', in plain double precision. */
+                const double * x, const double * y )
+{
+  struct TaskXY info = { 1 , 0 , NULL , NULL };  /* nreduce = 1 selects the scalar paths in reduce_init / reduce_join */
+  double result = 0 ;
+  info.n = n ;
+  info.x = x ;
+  info.y = y ;
+
+  if ( nwork ) {  /* caller chose the number of work items */
+    TPI_Run_reduce( task_ddot_tp , & info , nwork ,
+                    reduce_join, reduce_init, sizeof(result), & result);
+  }
+  else {  /* nwork == 0: TPI picks the work count (presumably one item per thread -- confirm in TPI.h) */
+    TPI_Run_threads_reduce( task_ddot_tp , & info ,
+                            reduce_join, reduce_init, sizeof(result), & result);
+  }
+
+  comm_reduce_dsum( comm , & result );  /* combine across ranks */
+
+  return result ;
+}
+
+/*--------------------------------------------------------------------*/
+
+void dfill_rand( unsigned seed , unsigned n , double * x , double mag )
+{
+  /* Entry k is derived from seed + k alone, so a value does not depend on how the range is split. */
+  const double scale = 2.0 * mag / (double) RAND_MAX ;
+  unsigned k , state ;
+  for ( k = 0 ; k < n ; ++k ) {
+    state = seed + k ; x[k] = scale * ((double) rand_r( & state )) - mag ;
+  }
+}
+
+struct FillWork {    /* shared description of one random fill, reached through work->info */
+  double   mag ;     /* magnitude handed to dfill_rand */
+  double * beg ;     /* first element of the destination */
+  unsigned length ;  /* number of elements to fill */
+  unsigned seed ;    /* seed of element 0; element k uses seed + k */
+};
+
+static void task_dfill_rand( TPI_Work * work )  /* Work item: fill this item's slice of the destination. */
+{
+  struct FillWork * const w = (struct FillWork *) work->info ;
+
+  unsigned int begin , length ;
+
+  my_span( work->count, work->rank, w->length, & begin , & length );  /* slice of [0,length) for this work item */
+
+  dfill_rand( w->seed + begin , length , w->beg + begin , w->mag );  /* seed offset by begin keeps the values independent of the split */
+}
+
+void dfill_rand_tp( unsigned nblock , unsigned seed ,  /* Threaded fill of x[0..n) through dfill_rand. */
+                    unsigned n , double * x , double mag )
+{
+  struct FillWork data ;
+  data.mag    = mag ;
+  data.beg    = x ;
+  data.length = n ;
+  data.seed   = seed ;
+  if ( nblock ) {  /* blocked: ceil(n / nblock) work items */
+    const int nwork = ( n + nblock - 1 ) / nblock ;
+    TPI_Run( & task_dfill_rand , & data , nwork , 0 );
+  }
+  else {  /* nblock == 0: TPI picks the work count (presumably one item per thread -- confirm in TPI.h) */
+    TPI_Run_threads( & task_dfill_rand , & data , 0 );
+  }
+}
+
+/*--------------------------------------------------------------------*/
+
+static  /* Time ddot_tp and d4_dot_tp for every test length and print one CSV row each (rank 0 only). */
+void test_ddot_performance(
+  COMM comm ,
+  const int nthreads ,              /* passed to TPI_Init */
+  const int nblock ,                /* elements per work item; 0 lets TPI choose the work count */
+  const unsigned int num_trials ,   /* timed repetitions per row */
+  const unsigned int num_tests ,    /* entries in length_array */
+  const unsigned int length_array[]  /* Global array length for each test */ ,
+  const double   mag )              /* magnitude of the random fill */
+{
+  const unsigned int ddot_flop   = 2 ;  /* 1 mult, 1 sum */
+  const unsigned int d4_dot_flop = 12 ; /* 1 mult, 7 sum, 4 compare */
+
+  const unsigned int p_rank = comm_rank( comm );
+  const unsigned int p_size = comm_size( comm );
+
+  const unsigned int max_array = length_array[ num_tests - 1 ];  /* the last entry sets the cycle count of every test */
+
+  unsigned int local_max_size = 0 ;
+  unsigned int i_test ;
+
+  TPI_Init( nthreads );
+
+  if ( 0 == p_rank ) {
+    fprintf(stdout,"\n\"DDOT and D4DOT Performance testing\"\n");
+    fprintf(stdout,"\"MPI size = %u , TPI size = %d , BlockSize = %d , #Trials = %u\"\n",p_size,nthreads,nblock,num_trials);
+    fprintf(stdout,"\"TEST\" , \"LENGTH\" , \"#CYCLE\" , \"DT-MEAN\" , \"DT-STDDEV\" , \"MFLOP-MEAN\" , \"MFLOP-STDDEV\"\n");
+  }
+
+  for ( i_test = 0 ; i_test < num_tests ; ++i_test ) {  /* size the buffers for the largest per-rank footprint */
+    const unsigned length = length_array[ i_test ]; /* Global */
+    const unsigned ncycle = 2 * max_array / length ;
+    const unsigned local_max = ncycle * ( ( length + p_size - 1 ) / p_size );
+    if ( local_max_size < local_max ) { local_max_size = local_max ; }
+  }
+
+  {
+    double * const x = (double*) malloc(local_max_size * 2 * sizeof(double));
+    double * const y = x + local_max_size ;  /* x and y share a single allocation */
+
+    unsigned int i , j ;
+    if ( NULL == x ) { fprintf(stderr,"allocation failure for %u\n",2 * local_max_size); abort(); }
+    dfill_rand_tp( nblock, 0,              local_max_size, x, mag );
+    dfill_rand_tp( nblock, local_max_size, local_max_size, y, mag );
+
+    for ( i_test = 0 ; i_test < num_tests ; ++i_test ) {
+      const unsigned length = length_array[ i_test ]; /* Global */
+      const unsigned ncycle = 2 * max_array / length ;
+
+      unsigned int local_begin , local_length , local_nwork ;
+
+      double dt_sum = 0.0 ;
+      double dt_sum_2 = 0.0 ;
+
+      my_span( p_size, p_rank, length, & local_begin , & local_length );
+
+      local_nwork = nblock ? ( local_length + nblock - 1 ) / nblock : 0 ;
+
+      /*--------------------------------------------------------------*/
+
+      for ( i = 0 ; i < num_trials ; ++i ) {
+        double dt = TPI_Walltime();
+        for ( j = 0 ; j < ncycle ; ++j ) {  /* each cycle reads a different stretch of the buffers */
+            ddot_tp( comm, local_nwork, local_length,
+                     x + j * local_length ,
+                     y + j * local_length );
+        }
+        dt = TPI_Walltime() - dt ;
+        comm_reduce_dmax( comm , & dt );  /* the slowest rank sets the time */
+        dt_sum   += dt ;
+        dt_sum_2 += dt * dt ;
+      }
+
+      if ( 0 == p_rank ) {
+        const double mflop = ((double)( ddot_flop * length * ncycle ) ) / ((double) 1e6 );
+
+        const double dt_mean = dt_sum / num_trials ;
+        const double dt_sdev = num_trials < 2 ? 0.0 : sqrt( ( num_trials * dt_sum_2 - dt_sum * dt_sum ) /
+                                     ( num_trials * ( num_trials - 1 ) ) );  /* one trial has no spread */
+        const double mflop_mean = mflop / dt_mean ;
+        const double mflop_sdev = mflop_mean * dt_sdev / ( dt_mean + dt_sdev );
+
+        fprintf(stdout,"\"DDOT\"  , %8u , %8u , %9.5g , %9.5g , %9.5g , %9.5g\n",
+                length, ncycle, dt_mean, dt_sdev, mflop_mean, mflop_sdev );
+        fflush(stdout);
+      }
+    }
+
+    for ( i_test = 0 ; i_test < num_tests ; ++i_test ) {
+      const unsigned length = length_array[ i_test ]; /* Global */
+      const unsigned ncycle = 2 * max_array / length ;
+
+      unsigned int local_begin , local_length , local_nwork ;
+
+      double dt_sum = 0 ;
+      double dt_sum_2 = 0 ;
+
+      my_span( p_size, p_rank, length, & local_begin , & local_length );
+
+      local_nwork = nblock ? ( local_length + nblock - 1 ) / nblock : 0 ;
+
+      /*--------------------------------------------------------------*/
+
+      for ( i = 0 ; i < num_trials ; ++i ) {
+        double dt = TPI_Walltime();
+        for ( j = 0 ; j < ncycle ; ++j ) {
+            d4_dot_tp( comm, local_nwork, local_length,
+                       x + j * local_length ,
+                       y + j * local_length );
+        }
+        dt = TPI_Walltime() - dt ;
+        comm_reduce_dmax( comm , & dt );
+        dt_sum   += dt ;
+        dt_sum_2 += dt * dt ;
+      }
+
+      if ( 0 == p_rank ) {
+        const double mflop = ((double)( d4_dot_flop * length * ncycle ) ) / ((double) 1e6 );
+
+        const double dt_mean = dt_sum / num_trials ;
+        const double dt_sdev = num_trials < 2 ? 0.0 : sqrt( ( num_trials * dt_sum_2 - dt_sum * dt_sum ) /
+                                     ( num_trials * ( num_trials - 1 ) ) );
+        const double mflop_mean = mflop / dt_mean ;
+        const double mflop_sdev = mflop_mean * dt_sdev / ( dt_mean + dt_sdev );
+
+        fprintf(stdout,"\"D4DOT\" , %8u , %8u , %9.5g , %9.5g , %9.5g , %9.5g\n",
+                length, ncycle, dt_mean, dt_sdev, mflop_mean, mflop_sdev );
+        fflush(stdout);
+      }
+    }
+
+    /*--------------------------------------------------------------*/
+
+    free( x );  /* releases y too */
+  }
+
+  TPI_Finalize();
+
+  return ;
+}
+
+/*--------------------------------------------------------------------*/
+
+static  /* Print ddot_tp and d4_dot_tp results for vectors built so that the two halves cancel. */
+void test_ddot_accuracy(
+  COMM comm ,
+  const int nthreads ,              /* passed to TPI_Init */
+  const int nblock ,                /* elements per work item; 0 lets TPI choose the work count */
+  const unsigned int num_tests ,    /* entries of length_array to run */
+  const unsigned int length_array[]  /* Global array length for each test */ ,
+  const double   mag )              /* magnitude of the random fill */
+{
+  const unsigned int p_rank = comm_rank( comm );
+  const unsigned int p_size = comm_size( comm );
+
+  const unsigned int max_array = length_array[ num_tests - 1 ];  /* buffers are sized from the last entry */
+  const unsigned int local_max_size = ( max_array + p_size - 1 ) / p_size ;
+
+  unsigned int i_test ;
+
+  TPI_Init( nthreads );
+
+  if ( 0 == p_rank ) {
+    fprintf(stdout,"\n\"DDOT and D4DOT Accuracy testing\"\n");
+    fprintf(stdout,"\"MPI size = %u , TPI size = %d , BlockSize = %d\"\n",p_size,nthreads,nblock);
+    fprintf(stdout,"\"TEST\" , \"LENGTH\" , \"VALUE\"\n");
+  }
+
+  {
+    double * const x = (double*) malloc(local_max_size * 2 * sizeof(double));
+    double * const y = x + local_max_size ;  /* x and y share a single allocation */
+    if ( NULL == x ) { fprintf(stderr,"allocation failure for %u\n",2 * local_max_size); abort(); }
+    for ( i_test = 0 ; i_test < num_tests ; ++i_test ) {
+      const unsigned length      = length_array[ i_test ]; /* Global */
+      const unsigned length_half = length / 2 ;
+
+      unsigned local_begin , local_length , local_nwork ;
+
+      double val_ddot ;
+
+      my_span( p_size, p_rank, length, & local_begin , & local_length );
+
+      local_nwork = nblock ? ( local_length + nblock - 1 ) / nblock : 0 ;
+
+      /*--------------------------------------------------------------*/
+
+      if ( local_begin < length_half ) {  /* part of the local span inside the first global half */
+        const unsigned len = local_length < length_half - local_begin
+                           ? local_length : length_half - local_begin ;
+
+        dfill_rand_tp( nblock,          local_begin, len, x, mag );
+        dfill_rand_tp( nblock, length + local_begin, len, y, mag );
+      }
+
+      if ( length_half < local_begin + local_length ) {  /* part inside the second global half */
+        const unsigned beg = length_half > local_begin
+                           ? length_half : local_begin ;
+        const unsigned off = beg - local_begin ;
+        const unsigned len = local_length - off ;
+
+        /* Same seeds as the first half with y negated, so for an even length the halves cancel in exact arithmetic. */
+        dfill_rand_tp( nblock,          beg - length_half, len, x + off, mag );
+        dfill_rand_tp( nblock, length + beg - length_half, len, y + off, - mag );
+      }
+
+      /*--------------------------------------------------------------*/
+
+      val_ddot = ddot_tp( comm, local_nwork, local_length, x, y );
+
+      if ( 0 == p_rank ) {
+        fprintf(stdout,"\"DDOT\"  , %8u , %9.3g\n", length , val_ddot );
+        fflush(stdout);
+      }
+    }
+
+    for ( i_test = 0 ; i_test < num_tests ; ++i_test ) {
+      const unsigned length      = length_array[ i_test ]; /* Global */
+      const unsigned length_half = length / 2 ;
+
+      unsigned local_begin , local_length , local_nwork ;
+
+      double val_d4_dot ;
+
+      my_span( p_size, p_rank, length, & local_begin , & local_length );
+
+      local_nwork = nblock ? ( local_length + nblock - 1 ) / nblock : 0 ;
+
+      /*--------------------------------------------------------------*/
+
+      if ( local_begin < length_half ) {
+        const unsigned len = local_length < length_half - local_begin
+                           ? local_length : length_half - local_begin ;
+
+        dfill_rand_tp( nblock,          local_begin, len, x, mag );
+        dfill_rand_tp( nblock, length + local_begin, len, y, mag );
+      }
+
+      if ( length_half < local_begin + local_length ) {
+        const unsigned beg = length_half > local_begin
+                           ? length_half : local_begin ;
+        const unsigned off = beg - local_begin ;
+        const unsigned len = local_length - off ;
+
+        dfill_rand_tp( nblock,          beg - length_half, len, x + off, mag );
+        dfill_rand_tp( nblock, length + beg - length_half, len, y + off, - mag );
+      }
+
+      /*--------------------------------------------------------------*/
+
+      val_d4_dot = d4_dot_tp( comm, local_nwork, local_length, x , y );
+
+      if ( 0 == p_rank ) {
+        fprintf(stdout,"\"D4DOT\" , %8u , %9.3g\n", length , val_d4_dot );
+        fflush(stdout);
+      }
+    }
+
+    /*--------------------------------------------------------------*/
+
+    free( x );  /* releases y too */
+  }
+
+  TPI_Finalize();
+
+  return ;
+}
+
+/*--------------------------------------------------------------------*/
+
+const unsigned test_lengths[] =   /* global vector lengths, ascending; the callees size buffers from the last entry they are given */
+  { 1e4 , 2e4 , 5e4 ,
+    1e5 , 2e5 , 5e5 ,
+    1e6 , 2e6 , 5e6 , 1e7 };
+
+const unsigned test_count = sizeof(test_lengths) / sizeof(unsigned);  /* number of entries in test_lengths */
+const unsigned nblock = 2500 ;  /* block size passed as 'nblock' for the blocked runs */
+
+const double test_mag = 1e4 ;  /* magnitude passed as 'mag' to the random fills */
+
+/* Run test_ddot_performance for each entry of test_thread[], first with the
+ * file-level block size and then with a block size of zero. */
+static void test_performance(
+  COMM comm , const int test_thread_count , const int test_thread[] )
+{
+  const unsigned trial_total = 11 ;
+  int which = 0 ;
+
+  while ( which < test_thread_count ) {
+    const int thread_total = test_thread[ which++ ];
+    test_ddot_performance( comm , thread_total , nblock , trial_total ,
+                           test_count , test_lengths , test_mag );
+    test_ddot_performance( comm , thread_total , 0 , trial_total ,
+                           test_count , test_lengths , test_mag );
+  }
+}
+
+/* Run test_ddot_accuracy for each entry of test_thread[], with nblock and
+ * then with zero; at most test_count of the lengths are exercised. */
+static void test_accuracy(
+  COMM comm , const int test_thread_count , const int test_thread[] ,
+              unsigned test_do )
+{
+  const unsigned length_total = test_do > test_count ? test_count : test_do ;
+  int which = 0 ;
+
+  while ( which < test_thread_count ) {
+    const int thread_total = test_thread[ which++ ];
+    test_ddot_accuracy( comm, thread_total, nblock,
+                        length_total, test_lengths, test_mag );
+    test_ddot_accuracy( comm, thread_total, 0,
+                        length_total, test_lengths, test_mag );
+  }
+}
+
+/*--------------------------------------------------------------------*/
+/*--------------------------------------------------------------------*/
+
+#define TEST_THREAD_MAX 128
+
+#if defined(HAVE_MPI)
+
+/* MPI driver: rank 0 packs { run-all flag , count , thread counts... } into
+ * nthread[], broadcasts it, and every rank then runs the selected tests. */
+int main( int argc , char **argv )
+{
+  int nthread[ TEST_THREAD_MAX ] = { 0 };
+  int * const thread_list = nthread + 2 ;
+  int arg ;
+
+  MPI_Init( & argc , & argv );
+
+  if ( 0 == comm_rank( MPI_COMM_WORLD ) ) {
+    const int have_args = 1 < argc && argc < TEST_THREAD_MAX ;
+    /* Without usable arguments: accuracy test only, single thread count 1. */
+    nthread[0] = have_args ? 1 : 0 ;
+    nthread[1] = have_args ? argc - 1 : 1 ;
+    nthread[2] = 1 ;
+
+    if ( have_args ) {
+      for ( arg = 1 ; arg < argc ; ++arg ) {
+        thread_list[ arg - 1 ] = atoi( argv[arg] );
+      }
+    }
+  }
+
+  MPI_Bcast( nthread , TEST_THREAD_MAX , MPI_INT , 0 , MPI_COMM_WORLD );
+
+  test_accuracy( MPI_COMM_WORLD , nthread[1] , thread_list ,
+                 nthread[0] ? test_count : 3 );
+
+  if ( nthread[0] ) {
+    test_performance( MPI_COMM_WORLD , nthread[1] , thread_list );
+  }
+
+  MPI_Finalize();
+  return 0 ;
+}
+
+/* Result of MPI_Comm_size for comm; stays 0 if the call writes nothing. */
+static int comm_size( COMM comm ) {
+  int process_total = 0 ;
+  MPI_Comm_size( comm , & process_total );
+  return process_total ;
+}
+
+/* Result of MPI_Comm_rank for comm; stays 0 if the call writes nothing. */
+static int comm_rank( COMM comm ) {
+  int my_rank = 0 ;
+  MPI_Comm_rank( comm , & my_rank );
+  return my_rank ;
+}
+
+/* In-place all-reduce of *val with MPI_MAX over comm; *val becomes 0 when
+ * MPI_Allreduce does not report MPI_SUCCESS. */
+static void comm_reduce_dmax( COMM comm , double * val )
+{
+  double reduced ;
+
+  const int status =
+    MPI_Allreduce( val , & reduced , 1 , MPI_DOUBLE , MPI_MAX , comm );
+
+  *val = ( MPI_SUCCESS == status ) ? reduced : 0 ;
+}
+
+/* In-place all-reduce of *val with MPI_SUM over comm; *val becomes 0 when
+ * MPI_Allreduce does not report MPI_SUCCESS. */
+static void comm_reduce_dsum( COMM comm , double * val )
+{
+  double reduced ;
+
+  const int status =
+    MPI_Allreduce( val , & reduced , 1 , MPI_DOUBLE , MPI_SUM , comm );
+
+  *val = ( MPI_SUCCESS == status ) ? reduced : 0 ;
+}
+
+/* Reduction callback: feeds the four doubles of argin to d2_add_d on argout;
+ * does nothing unless d and n are non-null and *n is 4. */
+static void comm_reduce_d4_op( void * argin ,
+                               void * argout ,
+                               int * n ,
+                               MPI_Datatype * d )
+{
+  const double * const src = (const double*) argin ;
+  double * const dst = (double*) argout ;
+  int k ;
+
+  if ( ! d || ! n || 4 != *n ) { return ; }
+  /* src[0] and src[1] are added at dst, src[2] and src[3] at dst + 2. */
+  for ( k = 0 ; k < 4 ; ++k ) { d2_add_d( dst + ( k / 2 ) * 2 , src[k] ); }
+}
+
+/* Reduce the four doubles at val across comm with comm_reduce_d4_op and
+ * leave the result, as broadcast from rank 0, in val on every rank. */
+static void comm_reduce_d4_sum( COMM comm , double * val )
+{
+  enum { D4_ROOT = 0 , D4_COUNT = 4 };
+  double reduced[ D4_COUNT ] ;
+  MPI_Op d4_op = MPI_OP_NULL ;
+  int k ;
+
+  /* Reduce followed by Bcast stands in for Allreduce: SUN MPI bug. */
+  MPI_Op_create( comm_reduce_d4_op , 0 , & d4_op );
+  MPI_Reduce( val , reduced , D4_COUNT , MPI_DOUBLE , d4_op , D4_ROOT , comm );
+  MPI_Bcast( reduced , D4_COUNT , MPI_DOUBLE , D4_ROOT , comm );
+  MPI_Op_free( & d4_op );
+
+  for ( k = 0 ; k < D4_COUNT ; ++k ) { val[k] = reduced[k] ; }
+}
+
+#else
+
+/* Serial driver: thread counts come from the command line; otherwise only
+ * the accuracy test runs, with the thread counts 1, 2, 4 and 8. */
+int main( int argc , char **argv )
+{
+  int nthread[ TEST_THREAD_MAX ] = { 0 };
+  int * const thread_list = nthread + 2 ;
+  const int have_args = 1 < argc && argc < TEST_THREAD_MAX ;
+
+  if ( ! have_args ) {
+    static const int fallback[] = { 1 , 2 , 4 , 8 };
+    const int fallback_total = (int)( sizeof(fallback) / sizeof(fallback[0]) );
+    int k ;
+
+    nthread[0] = 0 ;
+    nthread[1] = fallback_total ;
+    for ( k = 0 ; k < fallback_total ; ++k ) { thread_list[k] = fallback[k] ; }
+  }
+  else {
+    int arg ;
+    nthread[0] = 1 ;
+    nthread[1] = argc - 1 ;
+    for ( arg = 1 ; arg < argc ; ++arg ) {
+      thread_list[ arg - 1 ] = atoi( argv[arg] );
+    }
+  }
+
+  test_accuracy( 0 , nthread[1] , thread_list , nthread[0] ? test_count : 3 );
+  if ( nthread[0] ) { test_performance( 0 , nthread[1] , thread_list ); }
+
+  return 0 ;
+}
+
+static int comm_size( COMM comm ) { return 0 == comm ? 1 : -1 ; } /* serial: 1, or -1 for a nonzero comm */
+static int comm_rank( COMM comm ) { return 0 == comm ? 0 : -1 ; } /* serial: 0, or -1 for a nonzero comm */
+/* Serial stand-in: *val is left unchanged, or zeroed for a nonzero comm. */
+static void comm_reduce_dmax( COMM comm , double * val )
+{
+  if ( 0 != comm ) { val[0] = 0 ; }
+}
+/* Serial stand-in: *val is left unchanged, or zeroed for a nonzero comm. */
+static void comm_reduce_dsum( COMM comm , double * val )
+{
+  if ( 0 != comm ) { val[0] = 0 ; }
+}
+/* Serial stand-in: val[0..3] is left unchanged, or zeroed for a nonzero comm. */
+static void comm_reduce_d4_sum( COMM comm , double * val )
+{
+  if ( 0 != comm ) { int k ; for ( k = 0 ; k < 4 ; ++k ) { val[k] = 0 ; } }
+}
+
+#endif
+
+/*--------------------------------------------------------------------*/
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/test/test_pthreads.c b/openmp-avx512/basic/optional/ThreadPool/test/test_pthreads.c
new file mode 100644
index 0000000..235eb41
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/test/test_pthreads.c
@@ -0,0 +1,279 @@
+/*------------------------------------------------------------------------*/
+/*                    TPI: Thread Pool Interface                          */
+/*                Copyright (2008) Sandia Corporation                     */
+/*                                                                        */
+/*  Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive   */
+/*  license for use of this work by or on behalf of the U.S. Government.  */
+/*                                                                        */
+/*  This library is free software; you can redistribute it and/or modify  */
+/*  it under the terms of the GNU Lesser General Public License as        */
+/*  published by the Free Software Foundation; either version 2.1 of the  */
+/*  License, or (at your option) any later version.                       */
+/*                                                                        */
+/*  This library is distributed in the hope that it will be useful,       */
+/*  but WITHOUT ANY WARRANTY; without even the implied warranty of        */
+/*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU     */
+/*  Lesser General Public License for more details.                       */
+/*                                                                        */
+/*  You should have received a copy of the GNU Lesser General Public      */
+/*  License along with this library; if not, write to the Free Software   */
+/*  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307   */
+/*  USA                                                                   */
+/*------------------------------------------------------------------------*/
+/**
+ * @author H. Carter Edwards
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <pthread.h>
+#include <TPI.h>
+
+/*------------------------------------------------------------------------*/
+/* Test various ways of controlling worker threads. */
+
+typedef struct TestPthreads_struct { /* Per-thread control record; test_run keeps one per thread in an array. */
+  pthread_mutex_t  m_lock ;          /* Guards this record; held by its thread while it waits on m_cond. */
+  pthread_cond_t   m_cond ;          /* Signalled to wake the thread that owns this record. */
+  int              m_thread_rank ;   /* Index within the array; test_run zeroes it to request termination. */
+  int              m_thread_count ;  /* In entry 0: number of threads that have not yet terminated. */
+} TestPthreads ;
+
+/*------------------------------------------------------------------------*/
+/*------------------------------------------------------------------------*/
+
+static void * test_driver( void * arg ) /* Thread entry point; arg is this thread's TestPthreads record. */
+{
+  TestPthreads * const data = (TestPthreads*) arg ;
+  TestPthreads * const root = data - data->m_thread_rank ; /* step back to entry 0 of the array */
+
+  /*------------------------------*/
+  /* Initializing: take our own lock, then signal entry 0 that we have started. */
+
+  pthread_mutex_lock(   & data->m_lock );
+
+  pthread_mutex_lock(   & root->m_lock );
+  pthread_cond_signal(  & root->m_cond );
+  pthread_mutex_unlock( & root->m_lock );
+
+  /*------------------------------*/
+
+  while ( data->m_thread_rank ) { /* wait until m_thread_rank is cleared; earlier wakeups loop again */
+    pthread_cond_wait( & data->m_cond , & data->m_lock );
+  } 
+  pthread_mutex_unlock( & data->m_lock );
+
+  /*------------------------------*/
+  /* Terminating: decrement the shared count; whoever brings it to 0 signals entry 0. */
+
+  pthread_mutex_lock( & root->m_lock );
+  if ( 0 == --( root->m_thread_count ) ) {
+    pthread_cond_signal( & root->m_cond );
+  }
+  pthread_mutex_unlock( & root->m_lock );
+
+  return NULL ;
+}
+
+
+static void test_run( pthread_attr_t * const thread_attr , /* attributes used for every pthread_create */
+                      const int number_threads , /* array size; number_threads - 1 threads are created */
+                      const int number_trials , /* spawn / run / terminate repetitions */
+                      const int number_loops , /* divisor used for *dt_loop */
+                      double * const dt_start_stop , /* out: 1e6 * ( total - run time ) per trial */
+                      double * const dt_loop ) /* out: 1e6 * run time / ( trials * loops ) */
+{
+  TestPthreads data[ number_threads ]; /* NOTE(review): variable-length array; assumes number_threads >= 1 -- confirm callers */
+  double dt_total ;
+  double dt_run = 0 ; /* accumulated TPI_Walltime difference of the "Running" section */
+  int j ;
+
+  dt_total = TPI_Walltime();
+
+  for ( j = 0 ; j < number_trials ; ++j ) {
+    int i ;
+
+    for ( i = 0 ; i < number_threads ; ++i ) { /* fresh mutex and condition per record, every trial */
+      pthread_cond_init( & data[i].m_cond , NULL );
+      pthread_mutex_init( & data[i].m_lock , NULL );
+      data[i].m_thread_rank = i ;
+      data[i].m_thread_count = number_threads ;
+    }
+
+    pthread_mutex_lock( & data->m_lock ); /* entry 0 is the calling thread's own record */
+
+    for ( i = 1 ; i < number_threads ; ++i ) {
+      pthread_t pt ;
+      pthread_create( & pt, thread_attr, & test_driver , data + i );
+      pthread_cond_wait( & data->m_cond , & data->m_lock ); /* test_driver signals entry 0 once it has started */
+      pthread_mutex_lock( & data[i].m_lock ); /* obtainable once thread i waits on its own condition */
+    }
+
+    /* Running */
+
+    {
+      double dt = TPI_Walltime();
+      int k ;
+
+      for ( k = 1 ; k < number_loops ; ++k ) { /* NOTE(review): number_loops - 1 passes, yet *dt_loop divides by number_loops -- confirm intent */
+        for ( i = 1 ; i < number_threads ; ++i ) { /* signal every spawned thread and release its lock */
+          pthread_cond_signal(  & data[i].m_cond );
+          pthread_mutex_unlock( & data[i].m_lock );
+        }
+
+        /* Work goes here */
+
+        for ( i = 1 ; i < number_threads ; ++i ) { /* take the locks back before the next pass */
+          pthread_mutex_lock( & data[i].m_lock );
+        }
+      }
+
+      dt_run += TPI_Walltime() - dt ;
+    }
+
+    /* Termination */
+
+    --( data->m_thread_count ); /* the calling thread counts itself out */
+
+    if ( data->m_thread_count ) { /* nonzero: spawned threads still have to terminate */
+      for ( i = 1 ; i < number_threads ; ++i ) {
+        data[i].m_thread_rank = 0 ; /* ends the wait loop in test_driver */
+        pthread_cond_signal(  & data[i].m_cond );
+        pthread_mutex_unlock( & data[i].m_lock );
+      }
+
+      pthread_cond_wait( & data->m_cond , & data->m_lock ); /* signalled by the thread that brings the count to 0 */
+    }
+
+    pthread_mutex_unlock( & data->m_lock );
+
+    for ( i = 0 ; i < number_threads ; ++i ) {
+      pthread_cond_destroy( & data[i].m_cond );
+      pthread_mutex_destroy( & data[i].m_lock );
+    }
+  }
+
+  dt_total = TPI_Walltime() - dt_total ;
+
+  *dt_loop       = 1.0e6 * dt_run / (double) ( number_trials * number_loops );
+  *dt_start_stop = 1.0e6 * ( dt_total - dt_run ) / (double) number_trials ;
+}
+
+/*------------------------------------------------------------------------*/
+/*------------------------------------------------------------------------*/
+
+/* Average TPI_Walltime interval per pthread_mutex_init/destroy pair,
+ * measured over `number` consecutive pairs. */
+static double test_mutex_init_destroy( const int number )
+{
+  pthread_mutex_t scratch ;
+  int remaining = number ;
+  const double started = TPI_Walltime();
+  for ( ; 0 < remaining ; --remaining ) {
+    pthread_mutex_init( & scratch , NULL );
+    pthread_mutex_destroy( & scratch );
+  }
+  return ( TPI_Walltime() - started ) / (double) number ;
+}
+
+/* Average TPI_Walltime interval per pthread_mutex_lock/unlock pair,
+ * measured over `number` consecutive pairs on a function-local mutex. */
+static double test_mutex_lock_unlock( const int number )
+{
+  pthread_mutex_t scratch = PTHREAD_MUTEX_INITIALIZER ;
+  int remaining = number ;
+  double elapsed ;
+  const double started = TPI_Walltime();
+  for ( ; 0 < remaining ; --remaining ) {
+    pthread_mutex_lock( & scratch );
+    pthread_mutex_unlock( & scratch );
+  }
+  elapsed = TPI_Walltime() - started ;
+  pthread_mutex_destroy( & scratch );
+  return elapsed / (double) number ;
+}
+
+/*------------------------------------------------------------------------*/
+
+/* Print timings for mutex init/destroy, mutex lock/unlock, and the test_run
+ * spawn/loop cycle for each of the n_test thread counts in n_concurrent. */
+void test_pthreads_performance( int n_test , int * n_concurrent )
+{
+  const int n_mutex = 1e4 /* 1e8 */ ;
+  const int n_trial = 1e2 /* 1e4 */ ;
+  const int n_loop  = 1e3 /* 1e4 */ ;
+
+  {
+    const double dt = 1e6 * test_mutex_init_destroy( n_mutex );
+    fprintf(stdout,"\n\"test pthreads mutex init/destroy (microsec)\" , %g\n",dt);
+  }
+
+  {
+    const double dt = 1e6 * test_mutex_lock_unlock( n_mutex );
+    fprintf(stdout,"\n\"test pthreads mutex lock/unlock (microsec)\" , %g\n",dt);
+  }
+
+  /* Table 1: detached threads created with PTHREAD_SCOPE_SYSTEM. */
+
+  {
+    int i ;
+    pthread_attr_t thread_attr ;
+
+    fprintf(stdout,"\n\"test pthreads SCOPE_SYSTEM run-blocking\"\n");
+    fprintf(stdout,"\"#Threads\" , \"#Spawned\" , \"Spawn (microsec)\" , \"Loop (microsec)\"\n");
+
+    pthread_attr_init( & thread_attr );
+    pthread_attr_setscope(       & thread_attr, PTHREAD_SCOPE_SYSTEM );
+    pthread_attr_setdetachstate( & thread_attr, PTHREAD_CREATE_DETACHED );
+
+    for ( i = 0 ; i < n_test ; ++i ) {
+      const int nthread = n_concurrent[i] ;
+      double dt_start_stop , dt_loop ;
+
+      test_run( & thread_attr, nthread, n_trial, n_loop,
+                & dt_start_stop , & dt_loop );
+
+      fprintf( stdout, "%d , %d , %g , %g\n",
+               nthread , nthread - 1 , dt_start_stop , dt_loop );
+      fflush( stdout );
+    }
+
+    pthread_attr_destroy( & thread_attr );
+  }
+
+  /* Table 2: same measurement with PTHREAD_SCOPE_PROCESS requested. */
+
+  {
+    int i ;
+    pthread_attr_t thread_attr ;
+
+    fprintf(stdout,"\n\"test pthreads SCOPE_PROCESS run-blocking\"\n");
+    fprintf(stdout,"\"#Threads\" , \"#Spawned\" , \"Spawn (microsec)\" , \"Loop (microsec)\"\n");
+
+    pthread_attr_init( & thread_attr );
+    pthread_attr_setscope(       & thread_attr, PTHREAD_SCOPE_PROCESS ); /* NOTE(review): return value unchecked -- confirm the platform accepts this scope */
+    pthread_attr_setdetachstate( & thread_attr, PTHREAD_CREATE_DETACHED );
+
+    for ( i = 0 ; i < n_test ; ++i ) {
+      const int nthread = n_concurrent[i] ;
+      double dt_start_stop , dt_loop ;
+
+      test_run( & thread_attr, nthread, n_trial, n_loop,
+                & dt_start_stop , & dt_loop );
+
+      fprintf( stdout, "%d , %d , %g , %g\n",
+               nthread , nthread - 1 , dt_start_stop , dt_loop );
+      fflush( stdout );
+    }
+
+    pthread_attr_destroy( & thread_attr );
+  }
+
+  /*------------------------------------------------------------------*/
+
+  fflush( stdout );
+}
+
+/*------------------------------------------------------------------------*/
+
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/test/test_tpi.cpp b/openmp-avx512/basic/optional/ThreadPool/test/test_tpi.cpp
new file mode 100644
index 0000000..cf5a649
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/test/test_tpi.cpp
@@ -0,0 +1,123 @@
+/*------------------------------------------------------------------------*/
+/*                    TPI: Thread Pool Interface                          */
+/*                Copyright (2008) Sandia Corporation                     */
+/*                                                                        */
+/*  Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive   */
+/*  license for use of this work by or on behalf of the U.S. Government.  */
+/*                                                                        */
+/*  This library is free software; you can redistribute it and/or modify  */
+/*  it under the terms of the GNU Lesser General Public License as        */
+/*  published by the Free Software Foundation; either version 2.1 of the  */
+/*  License, or (at your option) any later version.                       */
+/*                                                                        */
+/*  This library is distributed in the hope that it will be useful,       */
+/*  but WITHOUT ANY WARRANTY; without even the implied warranty of        */
+/*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU     */
+/*  Lesser General Public License for more details.                       */
+/*                                                                        */
+/*  You should have received a copy of the GNU Lesser General Public      */
+/*  License along with this library; if not, write to the Free Software   */
+/*  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307   */
+/*  USA                                                                   */
+/*------------------------------------------------------------------------*/
+/**
+ * @author H. Carter Edwards
+ */
+
+#include <stdexcept>
+#include <iostream>
+#include <TPI.hpp>
+
+/*------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------*/
+
+template<unsigned N> class TEST ;
+
+template<unsigned N>
+class TEST { /* One int flag per work item: flag() sets one, verify() checks and clears all. */
+private:
+  TEST( const TEST & source );               /* declared only: not copyable */
+  TEST & operator = ( const TEST & source ); /* declared only: not assignable */
+public:
+  TEST();
+  ~TEST() {}
+  void flag( TPI::Work & work );
+  void verify();
+  int m_flag[N] ;
+};
+
+template<unsigned N>
+TEST<N>::TEST() /* Start with every flag cleared. */
+{
+  for ( int * slot = m_flag ; slot != m_flag + N ; ++slot ) { *slot = 0 ; }
+}
+
+/* Work callback: sets m_flag[work.rank]; throws if work.count is not N. */
+template<unsigned N>
+void TEST<N>::flag( TPI::Work & work )
+{
+  const bool count_matches = ( static_cast<int>( N ) == work.count );
+  if ( ! count_matches ) {
+    std::cerr << "TEST::flag" << "<" << N << "> count(" << work.count
+              << ") failed" << std::endl ;
+    throw std::exception();
+  }
+  m_flag[ work.rank ] = 1 ;
+}
+
+/* Check that every flag was set, clearing each one as it passes; the first
+ * unset flag is reported on std::cerr and std::exception is thrown. */
+template<unsigned N>
+void TEST<N>::verify()
+{
+  unsigned index = 0 ;
+
+  while ( index < N ) {
+    if ( 0 == m_flag[index] ) {
+      std::cerr << "TEST::verify" << "<" << N << "> m_flag[" << index
+                << "] failed" << std::endl ;
+      throw std::exception();
+    }
+
+    m_flag[ index++ ] = 0 ;
+  }
+}
+
+/* Run the flag()/verify() round trip for 1, 2, 4, 8 and 16 work items
+ * between TPI::Init( np ) and TPI::Finalize(). */
+void test_tpi_cpp( int np )
+{
+  TEST<1>  one ;
+  TEST<2>  two ;
+  TEST<4>  four ;
+  TEST<8>  eight ;
+  TEST<16> sixteen ;
+
+  TPI::Init( np );
+
+  /* All five runs are issued before any verification. */
+  TPI::Run( one     , & TEST<1>::flag  , 1 );
+  TPI::Run( two     , & TEST<2>::flag  , 2 );
+  TPI::Run( four    , & TEST<4>::flag  , 4 );
+  TPI::Run( eight   , & TEST<8>::flag  , 8 );
+  TPI::Run( sixteen , & TEST<16>::flag , 16 );
+
+  one.verify(); two.verify(); four.verify();
+  eight.verify(); sixteen.verify();
+
+  TPI::Finalize();
+}
+
+/* Call test_tpi_cpp with 1, 2, 4, 8 and 16, echoing each value on success. */
+int main( int argc , char ** argv )
+{
+  static const int np_values[] = { 1 , 2 , 4 , 8 , 16 };
+
+  std::cout << ( argc ? argv[0] : "test" );
+  for ( unsigned k = 0 ; k < sizeof(np_values) / sizeof(np_values[0]) ; ++k ) {
+    test_tpi_cpp( np_values[k] ); std::cout << " " << np_values[k] << " " ;
+  }
+  std::cout << " passed" << std::endl ;
+  return 0 ;
+}
+
diff --git a/openmp-avx512/basic/optional/ThreadPool/test/test_tpi_unit.c b/openmp-avx512/basic/optional/ThreadPool/test/test_tpi_unit.c
new file mode 100644
index 0000000..34faef8
--- /dev/null
+++ b/openmp-avx512/basic/optional/ThreadPool/test/test_tpi_unit.c
@@ -0,0 +1,505 @@
+/*------------------------------------------------------------------------*/
+/*                    TPI: Thread Pool Interface                          */
+/*                Copyright (2008) Sandia Corporation                     */
+/*                                                                        */
+/*  Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive   */
+/*  license for use of this work by or on behalf of the U.S. Government.  */
+/*                                                                        */
+/*  This library is free software; you can redistribute it and/or modify  */
+/*  it under the terms of the GNU Lesser General Public License as        */
+/*  published by the Free Software Foundation; either version 2.1 of the  */
+/*  License, or (at your option) any later version.                       */
+/*                                                                        */
+/*  This library is distributed in the hope that it will be useful,       */
+/*  but WITHOUT ANY WARRANTY; without even the implied warranty of        */
+/*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU     */
+/*  Lesser General Public License for more details.                       */
+/*                                                                        */
+/*  You should have received a copy of the GNU Lesser General Public      */
+/*  License along with this library; if not, write to the Free Software   */
+/*  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307   */
+/*  USA                                                                   */
+/*------------------------------------------------------------------------*/
+/**
+ * @author H. Carter Edwards
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <TPI.h>
+
+#if defined( HAVE_MPI )
+#include <mpi.h>
+#endif
+
+/*--------------------------------------------------------------------*/
+
+static void test_work( TPI_Work * );
+static void test_reduce_work( TPI_Work * );
+static void test_reduce_init( TPI_Work * );
+static void test_reduce_join( TPI_Work * , const void * );
+static void test_reduce_via_lock( TPI_Work * );
+static void test_reduce_via_nolock( TPI_Work * );
+
+void test_tpi_init(   const int ntest, const int nthread[], const int ntrial);
+void test_tpi_block(  const int ntest, const int nthread[], const int ntrial);
+void test_tpi_reduce( const int ntest, const int nthread[], const int ntrial);
+void test_tpi_work(   const int ntest, const int nthread[],
+                      const int nwork , const int ntrial );
+void test_tpi_work_async(
+  const int ntest , const int nthread[] , const int nwork , const int ntrial );
+
+/* Unit-test driver; optional arguments: argv[1] = trial count (default 5),
+ * argv[2] = work count (default 100).  With MPI only rank 0 runs the tests. */
+int main( int argc , char ** argv )
+{
+  int num_thread[] = { 1 , 2 , 4 , 6 , 8 , 12 , 16 };
+  int num_test = sizeof(num_thread) / sizeof(num_thread[0]);
+#if defined( HAVE_MPI )
+  int rank ;
+  MPI_Init( & argc , & argv );
+  MPI_Comm_rank( MPI_COMM_WORLD , & rank );
+  if ( 0 == rank ) {
+#endif
+  int ntrial = 5 ;
+  int nwork  = 100 ;
+  if ( argc > 1 ) { ntrial = atoi( argv[1] ); }
+  if ( argc > 2 ) { nwork  = atoi( argv[2] ); }
+
+  /* Get the configuration print message out. */
+  fprintf( stdout , "\"%s\"\n" , TPI_Version() );
+  fprintf( stdout , "\"Unit Testing: ntrial = %d , nwork = %d\"\n" , ntrial , nwork );
+
+  test_tpi_init(   num_test , num_thread , ntrial );
+  test_tpi_block(  num_test , num_thread , ntrial );
+  test_tpi_reduce( num_test , num_thread , ntrial );
+  test_tpi_work(   num_test , num_thread , nwork , ntrial );
+  test_tpi_work_async( num_test , num_thread , nwork , ntrial );
+#if defined( HAVE_MPI )
+  }
+  MPI_Finalize();
+#endif
+
+  return 0 ;
+}
+
+/*--------------------------------------------------------------------*/
+
+/* Time TPI_Init and TPI_Finalize over ntrial repetitions for every entry of
+ * nthread[]; aborts when TPI_Init does not return the requested count. */
+void test_tpi_init( const int ntest , const int nthread[] , const int ntrial )
+{
+  int test ;
+
+  fprintf( stdout , "\n\"TEST TPI_Init / TPI_Finalize\"\n" );
+  fprintf( stdout , "\"#Thread\" , \"#Trial\" , \"TPI_Init(avg-msec)\" , \"TPI_Init(stddev-msec)\" , \"TPI_Finalize(avg-msec)\" , \"TPI_Finalize(stddev-msec)\"\n");
+
+  for ( test = 0 ; test < ntest ; ++test ) {
+    const int requested = nthread[test];
+    double init_sum = 0.0 , init_sum_sq = 0.0 ;
+    double fin_sum  = 0.0 , fin_sum_sq  = 0.0 ;
+    int trial ;
+
+    for ( trial = 0 ; trial < ntrial ; ++trial ) {
+      double started , elapsed ;
+      int granted ;
+
+      /* Interval around TPI_Init. */
+      started = TPI_Walltime();
+      granted = TPI_Init( requested );
+      elapsed = TPI_Walltime() - started ;
+      init_sum    += elapsed ;
+      init_sum_sq += elapsed * elapsed ;
+
+      if ( granted != requested ) {
+        fprintf(stderr,"%d != TPI_Init(%d) : FAILED at trial %d\n",
+                granted , requested , trial );
+        abort();
+      }
+
+      /* Interval around TPI_Finalize. */
+      started = TPI_Walltime();
+      TPI_Finalize();
+      elapsed = TPI_Walltime() - started ;
+      fin_sum    += elapsed ;
+      fin_sum_sq += elapsed * elapsed ;
+    }
+
+    /* The deviation formula divides by ntrial - 1, so it needs two trials. */
+    if ( ntrial > 1 ) {
+      const double pairs = ntrial * ( ntrial - 1 );
+      const double init_mean = 1.0e6 * init_sum / ntrial ;
+      const double init_sdev =
+        1.0e6 * sqrt( ( ntrial * init_sum_sq - init_sum * init_sum ) / pairs );
+      const double fin_mean = 1.0e6 * fin_sum / ntrial ;
+      const double fin_sdev =
+        1.0e6 * sqrt( ( ntrial * fin_sum_sq - fin_sum * fin_sum ) / pairs );
+      fprintf(stdout,"%d , %d , %10g , %10g , %10g , %10g\n",
+              requested , ntrial , init_mean , init_sdev , fin_mean , fin_sdev );
+    }
+  }
+}
+
+/*--------------------------------------------------------------------*/
+
+/* Time TPI_Block and TPI_Unblock over ntrial repetitions inside one
+ * TPI_Init/TPI_Finalize session per entry of nthread[]. */
+void test_tpi_block( const int ntest , const int nthread[] , const int ntrial )
+{
+  int test ;
+
+  fprintf( stdout , "\n\"TEST TPI_Block / TPI_Unblock\"\n" );
+  fprintf( stdout , "\"#Thread\" , \"#Trial\" , \"TPI_Block(avg-msec)\" , \"TPI_Block(stddev-msec)\" , \"TPI_Unblock(avg-msec)\" , \"TPI_Unblock(stddev-msec)\"\n");
+
+  for ( test = 0 ; test < ntest ; ++test ) {
+    const int requested = nthread[test];
+    const int granted   = TPI_Init( requested );
+
+    double block_sum   = 0.0 , block_sum_sq   = 0.0 ;
+    double unblock_sum = 0.0 , unblock_sum_sq = 0.0 ;
+    int trial ;
+
+    if ( granted != requested ) {
+      fprintf(stderr,"%d != TPI_Init(%d) : FAILED\n", granted , requested );
+      abort();
+    }
+
+    /* Each trial times one TPI_Block call and one TPI_Unblock call. */
+    for ( trial = 0 ; trial < ntrial ; ++trial ) {
+      double started , elapsed ;
+
+      started = TPI_Walltime();
+      TPI_Block();
+      elapsed = TPI_Walltime() - started ;
+      block_sum    += elapsed ;
+      block_sum_sq += elapsed * elapsed ;
+
+      started = TPI_Walltime();
+      TPI_Unblock();
+      elapsed = TPI_Walltime() - started ;
+      unblock_sum    += elapsed ;
+      unblock_sum_sq += elapsed * elapsed ;
+    }
+
+    TPI_Finalize();
+
+    /* The deviation formula divides by ntrial - 1, so it needs two trials. */
+    if ( ntrial > 1 ) {
+      const double pairs = ntrial * ( ntrial - 1 );
+      const double block_mean = 1.0e6 * block_sum / ntrial ;
+      const double block_sdev = 1.0e6 *
+        sqrt( ( ntrial * block_sum_sq - block_sum * block_sum ) / pairs );
+      const double unblock_mean = 1.0e6 * unblock_sum / ntrial ;
+      const double unblock_sdev = 1.0e6 *
+        sqrt( ( ntrial * unblock_sum_sq - unblock_sum * unblock_sum ) / pairs );
+
+      fprintf(stdout,"%d , %d , %10g , %10g , %10g , %10g\n",
+              requested , ntrial , block_mean , block_sdev ,
+              unblock_mean , unblock_sdev );
+    }
+  }
+}
+
+/*--------------------------------------------------------------------*/
+
+/* Time two reductions per trial: TPI_Run_threads with test_reduce_via_lock
+ * and TPI_Run_threads_reduce with test_reduce_via_nolock; each result must
+ * equal the requested thread count, otherwise the test aborts. */
+void test_tpi_reduce( const int ntest , const int nthread[] , const int ntrial )
+{
+  int test ;
+
+  fprintf( stdout , "\n\"TEST TPI_Run_threads(reduce) / TPI_Run_threads_reduce\"\n" );
+  fprintf( stdout , "\"#Thread\" , \"#Trial\" , \"TPI_Run_threads(avg-msec)\" , \"TPI_Run_threads(stddev-msec)\" , \"TPI_Run_threads_reduce(avg-msec)\" , \"TPI_Run_threads_reduce(stddev-msec)\"\n");
+
+  for ( test = 0 ; test < ntest ; ++test ) {
+    const int requested = nthread[test];
+    const int granted   = TPI_Init( requested );
+
+    double lock_sum   = 0.0 , lock_sum_sq   = 0.0 ;
+    double reduce_sum = 0.0 , reduce_sum_sq = 0.0 ;
+    int trial ;
+
+    /* A mismatch is reported but is not fatal at this point. */
+    if ( granted != requested ) {
+      fprintf(stderr,"%d != TPI_Init(%d) : FAILED\n", granted , requested );
+    }
+
+    for ( trial = 0 ; trial < ntrial ; ++trial ) {
+      double started , elapsed ;
+      int total = 0 ;
+      int * const total_ptr = & total ;
+
+      started = TPI_Walltime();
+      TPI_Run_threads( test_reduce_via_lock , & total_ptr , 1 );
+      elapsed = TPI_Walltime() - started ;
+      lock_sum    += elapsed ;
+      lock_sum_sq += elapsed * elapsed ;
+
+      if ( total != requested ) {
+        fprintf(stderr,
+                "TPI_Run_threads(reduce,...) : FAILED at trial %d\n",
+                trial );
+        abort();
+      }
+
+      total = 0 ;
+
+      started = TPI_Walltime();
+      TPI_Run_threads_reduce( test_reduce_via_nolock , NULL ,
+                              test_reduce_join , test_reduce_init ,
+                              sizeof(total) , & total );
+      elapsed = TPI_Walltime() - started ;
+      reduce_sum    += elapsed ;
+      reduce_sum_sq += elapsed * elapsed ;
+
+      if ( total != requested ) {
+        fprintf(stderr,
+                "TPI_Run_threads_reduce(...) : FAILED at trial %d\n",
+                trial );
+        abort();
+      }
+    }
+
+    TPI_Finalize();
+
+    /* The deviation formula divides by ntrial - 1, so it needs two trials. */
+    if ( ntrial > 1 ) {
+      const double pairs = ntrial * ( ntrial - 1 );
+
+      const double lock_mean = 1.0e6 * lock_sum / ntrial ;
+      const double lock_sdev = 1.0e6 *
+        sqrt( ( ntrial * lock_sum_sq - lock_sum * lock_sum ) / pairs );
+      const double reduce_mean = 1.0e6 * reduce_sum / ntrial ;
+      const double reduce_sdev = 1.0e6 *
+        sqrt( ( ntrial * reduce_sum_sq - reduce_sum * reduce_sum ) / pairs );
+
+      fprintf(stdout,"%d , %d , %10g , %10g , %10g , %10g\n",
+              requested, ntrial, lock_mean, lock_sdev, reduce_mean, reduce_sdev);
+    }
+  }
+}
+
+/*--------------------------------------------------------------------*/
+
+/* Time TPI_Run and TPI_Run_reduce over nwork work items, ntrial times for
+ * each entry of nthread[]; every flag must end up set and the reduced value
+ * must equal nwork, otherwise the test aborts. */
+void test_tpi_work( const int ntest , const int nthread[] , const int nwork ,
+                    const int ntrial )
+{
+  int * const flags = (int *) malloc( sizeof(int) * nwork );
+  int j ;
+
+  fprintf( stdout , "\n\"TEST TPI_Run / TPI_Run_reduce\"\n" );
+  fprintf( stdout , "\"#Thread\" , \"#Work\" , \"#Trial\" , \"TPI_Run(avg-msec)\" , \"TPI_Run(stddev-msec)\" , \"TPI_Run_reduce(avg-msec)\" , \"TPI_Run_reduce(stddev-msec)\"\n");
+
+  for ( j = 0 ; j < ntest ; ++j ) {
+    const int nth = nthread[j];
+    double dt_work_total   = 0.0 ;
+    double dt_work_total_2 = 0.0 ;
+    double dt_reduce_total    = 0.0 ;
+    double dt_reduce_total_2  = 0.0 ;
+    int i , k ;
+
+    int result = TPI_Init( nth );
+
+    if ( result != nth ) {
+      fprintf(stderr,"%d != TPI_Init(%d) : FAILED\n", result , nth );
+    }
+
+    for ( i = 0 ; i < ntrial ; ++i ) {
+      double t , dt ;
+      int value = 0 ;
+
+      for ( k = 0 ; k < nwork ; ++k ) { flags[k] = 0 ; }
+
+      t = TPI_Walltime();
+      TPI_Run( test_work , & flags , nwork , 0 );
+      dt = TPI_Walltime() - t ;
+      dt_work_total += dt ;
+      dt_work_total_2 += dt * dt ;
+
+      for ( k = 0 ; k < nwork && flags[k] ; ++k ); /* stops at the first unset flag */
+
+      if ( k < nwork ) {
+        fprintf(stderr, "TPI_Run(...) : FAILED at trial %d\n", i );
+        abort();
+      }
+
+      for ( k = 0 ; k < nwork ; ++k ) { flags[k] = 0 ; }
+
+      t = TPI_Walltime();
+      TPI_Run_reduce( test_reduce_work , & flags , nwork ,
+                      test_reduce_join , test_reduce_init ,
+                      sizeof(value) , & value );
+      dt = TPI_Walltime() - t ;
+      dt_reduce_total += dt ;
+      dt_reduce_total_2 += dt * dt ;
+
+      for ( k = 0 ; k < nwork && flags[k] ; ++k ); /* stops at the first unset flag */
+
+      if ( value != nwork || k < nwork ) {
+        fprintf(stderr, "TPI_Run_reduce(...) : FAILED at trial %d\n", i );
+        abort();
+      }
+    }
+
+    TPI_Finalize();
+
+    if ( 1 < ntrial ) { /* the deviation formula divides by ntrial - 1 */
+      const double work_mean = 1.0e6 * dt_work_total / ntrial ;
+      const double work_sdev = 1.0e6 * sqrt( ( ntrial * dt_work_total_2 -
+                                       dt_work_total * dt_work_total ) /
+                                     ( ntrial * ( ntrial - 1 ) ) );
+
+      const double reduce_mean = 1.0e6 * dt_reduce_total / ntrial ;
+      const double reduce_sdev = 1.0e6 * sqrt( ( ntrial * dt_reduce_total_2 -
+                                         dt_reduce_total * dt_reduce_total) /
+                                       ( ntrial * ( ntrial - 1 ) ) );
+      /* Column order follows the header: #Thread , #Work , #Trial , ... */
+      fprintf(stdout,"%d , %d , %d , %10g , %10g , %10g , %10g\n",
+              nth, nwork, ntrial, work_mean, work_sdev, reduce_mean, reduce_sdev);
+    }
+  }
+
+  free( flags );
+}
+
+/*--------------------------------------------------------------------*/
+
+/* Times TPI_Start + TPI_Wait and TPI_Start_reduce + TPI_Wait over ntrial trials
+ * for each thread count in nthread[0..ntest), verifies that every work item ran,
+ * and prints one CSV row of mean / standard deviation per thread count. */
+void test_tpi_work_async(
+  const int ntest , const int nthread[] , const int nwork , const int ntrial )
+{
+  int * const flags = (int *) malloc( sizeof(int) * nwork );
+  int j ;
+
+  if ( NULL == flags && 0 < nwork ) { /* flags[] is written below */
+    fprintf(stderr, "test_tpi_work_async : malloc of %d flags FAILED\n", nwork );
+    abort();
+  }
+
+  fprintf( stdout , "\n\"TEST TPI_Start / TPI_Start_reduce\"\n" );
+  fprintf( stdout , "\"#Thread\" , \"#Work\" , \"#Trial\" , \"TPI_Start(avg-msec)\" , \"TPI_Start(stddev-msec)\" , \"TPI_Start_reduce(avg-msec)\" , \"TPI_Start_reduce(stddev-msec)\"\n");
+
+  for ( j = 0 ; j < ntest ; ++j ) {
+    const int nth = nthread[j];
+    double dt_work_total   = 0.0 ;
+    double dt_work_total_2 = 0.0 ;
+    double dt_reduce_total    = 0.0 ;
+    double dt_reduce_total_2  = 0.0 ;
+    int i , k ;
+    int result = TPI_Init( nth );
+
+    if ( result != nth ) {
+      fprintf(stderr,"%d != TPI_Init(%d) : FAILED\n", result , nth );
+    }
+
+    for ( i = 0 ; i < ntrial ; ++i ) {
+      double t , dt ;
+      int value = 0 ;
+      for ( k = 0 ; k < nwork ; ++k ) { flags[k] = 0 ; }
+
+      t = TPI_Walltime();
+      TPI_Start( test_work , & flags , nwork , 0 );
+      TPI_Wait();
+      dt = TPI_Walltime() - t ;
+      dt_work_total += dt ;
+      dt_work_total_2 += dt * dt ;
+
+      /* k stops at the first work item whose flag was never set */
+      for ( k = 0 ; k < nwork && flags[k] ; ++k );
+      if ( k < nwork ) {
+        fprintf(stderr, "TPI_Start(...) : FAILED at trial %d\n", i );
+        abort();
+      }
+
+      for ( k = 0 ; k < nwork ; ++k ) { flags[k] = 0 ; }
+
+      t = TPI_Walltime();
+      TPI_Start_reduce( test_reduce_work , & flags , nwork ,
+                        test_reduce_join , test_reduce_init ,
+                        sizeof(value) , & value );
+      TPI_Wait();
+      dt = TPI_Walltime() - t ;
+      dt_reduce_total += dt ;
+      dt_reduce_total_2 += dt * dt ;
+
+      for ( k = 0 ; k < nwork && flags[k] ; ++k );
+      /* every work item adds one, so the reduced value must equal nwork */
+      if ( value != nwork || k < nwork ) {
+        fprintf(stderr, "TPI_Start_reduce(...) : FAILED at trial %d\n", i );
+        abort();
+      }
+    }
+
+    TPI_Finalize();
+
+    if ( 1 < ntrial ) {
+      const double work_mean = 1.0e6 * dt_work_total / ntrial ;
+      const double work_sdev = 1.0e6 * sqrt( ( ntrial * dt_work_total_2 -
+                                       dt_work_total * dt_work_total ) /
+                                     ( ntrial * ( ntrial - 1 ) ) );
+      const double reduce_mean = 1.0e6 * dt_reduce_total / ntrial ;
+      const double reduce_sdev = 1.0e6 * sqrt( ( ntrial * dt_reduce_total_2 -
+                                         dt_reduce_total * dt_reduce_total) /
+                                       ( ntrial * ( ntrial - 1 ) ) );
+      fprintf(stdout,"%d , %d , %d , %10g , %10g , %10g , %10g\n",
+              nth, ntrial, nwork, work_mean, work_sdev, reduce_mean, reduce_sdev);
+    }
+  }
+  free( flags );
+}
+
+/*--------------------------------------------------------------------*/
+
+/* Work routine: flags this work item's rank in the array reached via work->info. */
+static void test_work( TPI_Work * work )
+{
+  (* (int *const*) work->info)[ work->rank ] = 1 ;
+}
+
+/* Work routine for the reduce tests: flags this rank and adds one to work->reduce. */
+static void test_reduce_work( TPI_Work * work )
+{
+  int * const tally = (int *) work->reduce ;
+  (* (int *const*) work->info)[ work->rank ] = 1 ;
+  ++*tally ;
+}
+
+static void test_reduce_init( TPI_Work * work )
+{
+  *((int *) work->reduce) = 0 ; /* the int behind work->reduce starts from zero */
+}
+
+static void test_reduce_join( TPI_Work * work , const void * src )
+{
+  *((int *) work->reduce) += *( (const int *) src ); /* add the int at src into work->reduce */
+}
+
+static void test_reduce_via_lock( TPI_Work * work )
+{
+  int * const value = * ((int *const*) work->info ); /* info holds the address of a pointer to the counter */
+  int result ;
+  if ( ( result = TPI_Lock(0) ) ) { /* a non-zero return is treated as failure */
+    fprintf(stderr,"TPI_Lock(0) = %d : FAILED\n", result);
+    abort();
+  }
+  *value += 1 ; /* increment happens between TPI_Lock(0) and TPI_Unlock(0) */
+  if ( ( result = TPI_Unlock(0) ) ) { /* a non-zero return is treated as failure */
+    fprintf(stderr,"TPI_Unlock(0) = %d : FAILED\n", result);
+    abort();
+  }
+}
+
+static void test_reduce_via_nolock( TPI_Work * work )
+{
+  int * const value = (int *) work->reduce ; /* counts in work->reduce; no TPI_Lock is taken here */
+  *value += 1 ;
+}
+
+/*--------------------------------------------------------------------*/
+
diff --git a/openmp-avx512/basic/optional/copy_from_trilinos b/openmp-avx512/basic/optional/copy_from_trilinos
new file mode 100755
index 0000000..042e4fb
--- /dev/null
+++ b/openmp-avx512/basic/optional/copy_from_trilinos
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+TRILINOS_SRC=$1
+
+if [ -d "${TRILINOS_SRC}" -a -d "${TRILINOS_SRC}/packages" ] ;
+then
+
+#-----------------------------------------------------------------------
+cp -r ${TRILINOS_SRC}/packages/ThreadPool/* ThreadPool
+rm -rf ThreadPool/doc
+
+cat << END_CAT > ThreadPool/ThreadPool_config.h
+#ifndef HAVE_PTHREAD
+#define HAVE_PTHREAD
+#endif
+END_CAT
+
+#-----------------------------------------------------------------------
+
+else
+
+  echo 'usage: ' $0 '<path-to-Trilinos-source>'
+
+fi
+
diff --git a/openmp-avx512/basic/optional/cuda/CudaCall.hpp b/openmp-avx512/basic/optional/cuda/CudaCall.hpp
new file mode 100644
index 0000000..f4b8c70
--- /dev/null
+++ b/openmp-avx512/basic/optional/cuda/CudaCall.hpp
@@ -0,0 +1,21 @@
+#ifndef stk_algsup_CudaCall_hpp
+#define stk_algsup_CudaCall_hpp
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+//----------------------------------------------------------------
+// Terminates the process with a diagnostic when a CUDA call did not succeed.
+// 'name' is the text that identifies the call in the message.
+inline void stk_cuda_call(cudaError err , const char* name )
+{
+  if ( err == cudaSuccess ) return;
+  fprintf(stderr, "%s error: %s\n",name, cudaGetErrorString(err) );
+  exit(-1);
+}
+
+#define CUDA_CALL( cuda_fn ) stk_cuda_call( cuda_fn , #cuda_fn )
+
+
+#endif
+
diff --git a/openmp-avx512/basic/optional/cuda/CudaMemoryModel.hpp b/openmp-avx512/basic/optional/cuda/CudaMemoryModel.hpp
new file mode 100644
index 0000000..54d189e
--- /dev/null
+++ b/openmp-avx512/basic/optional/cuda/CudaMemoryModel.hpp
@@ -0,0 +1,152 @@
+#ifndef _CudaMemoryModel_hpp_
+#define _CudaMemoryModel_hpp_
+
+#include <iostream>
+#ifdef MINIFE_HAVE_CUDA
+
+#include <stdio.h>
+#include <stdexcept>
+#include <map>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <CudaCall.hpp>
+
+class CudaMemoryModel { // tracks the host-pointer <-> device-buffer pairs created by get_buffer
+  public:
+    CudaMemoryModel()
+     : host_to_device_map(),
+       device_to_host_map()
+    {}
+
+    /** Destructor
+     * Upon destruction this class de-allocates all device-buffers that
+     * it was tracking.
+     */
+    virtual ~CudaMemoryModel();
+
+    /** Return a device-pointer corresponding to the given host-ptr and size.
+     * The returned device-pointer points to a buffer which has been allocated
+     * on the CUDA device with length buf_size*sizeof(T), but not initialized.
+     *
+     * If a device-pointer has already been allocated for the given host-pointer
+     * (by a previous call to this method) then that (previously-allocated) device-pointer
+     * is returned; buf_size is not consulted in that case.
+     */
+    template<class T>
+    T* get_buffer(const T* host_ptr, size_t buf_size);
+
+    /** Destroy (free) the specified device-pointer.
+     * A pointer that is not tracked by this object is ignored (nothing is freed).
+     * De-allocates the cuda-device buffer, forgets it, and sets device_ptr to NULL.
+     */
+    template<class T>
+    void destroy_buffer(T*& device_ptr);
+
+    /** Copy the contents of the given host-ptr to the given device-ptr.
+     * If the given device-ptr is not known (was not created by a previous
+     * call to get_buffer), an exception is thrown.
+     */
+    template<class T>
+    void copy_to_buffer(const T* host_ptr, size_t buf_size, T* device_ptr);
+
+    /** Copy the contents of the given device-ptr to the given host-ptr.
+     * If the given device-ptr is not known (was not created by a previous
+     * call to get_buffer), an exception is thrown.
+     */
+    template<class T>
+    void copy_from_buffer(T* host_ptr, size_t buf_size, const T* device_ptr);
+
+ private:
+  std::map<const void*,void*> host_to_device_map;         // host pointer -> device buffer allocated for it
+  std::map<const void*,const void*> device_to_host_map;   // device buffer -> host pointer it was allocated for
+};
+
+//------------------------------------------------------------------------------
+// Returns the device buffer associated with host_ptr, allocating it on first use.
+// buf_size (a count of T elements) is only consulted when a new buffer is allocated.
+template<class T>
+inline
+T* CudaMemoryModel::get_buffer(const T* host_ptr, size_t buf_size)
+{
+  std::map<const void*,void*>::iterator known = host_to_device_map.find(host_ptr);
+  if (known != host_to_device_map.end()) {
+    // already allocated by an earlier call: hand back the same buffer
+    return reinterpret_cast<T*>(known->second);
+  }
+
+  // first request for this host pointer: allocate and record both directions
+  T* fresh = NULL;
+  CUDA_CALL( cudaMalloc( (void**)&fresh, sizeof(T)*buf_size) );
+  host_to_device_map.insert( std::make_pair(host_ptr, fresh) );
+  device_to_host_map.insert( std::make_pair(fresh, host_ptr) );
+
+  return fresh;
+}
+
+//------------------------------------------------------------------------------
+// Frees a device buffer obtained from get_buffer and forgets both map entries.
+// Pointers this object does not track are left untouched.
+template<class T>
+inline
+void CudaMemoryModel::destroy_buffer(T*& device_ptr)
+{
+  std::map<const void*,const void*>::iterator entry = device_to_host_map.find(device_ptr);
+  if (entry == device_to_host_map.end()) {
+    return;
+  }
+
+  if (entry->second != NULL) {
+    host_to_device_map.erase(entry->second);  // erase-by-key: no-op when the key is absent
+  }
+  CUDA_CALL( cudaFree(device_ptr) );
+  device_ptr = NULL;
+  device_to_host_map.erase(entry);
+}
+
+//------------------------------------------------------------------------------
+// Host -> device copy of buf_size elements; throws if device_ptr is not tracked.
+template<class T>
+inline
+void CudaMemoryModel::copy_to_buffer(const T* host_ptr, size_t buf_size, T* device_ptr)
+{
+  const bool tracked = device_to_host_map.find(device_ptr) != device_to_host_map.end();
+  if (!tracked) {
+    throw std::runtime_error("CudaMemoryModel::copy_to_buffer ERROR, device_ptr not known.");
+  }
+
+  CUDA_CALL( cudaMemcpy( device_ptr, host_ptr, sizeof(T)*buf_size, cudaMemcpyHostToDevice) );
+}
+
+//------------------------------------------------------------------------------
+// Device -> host copy of buf_size elements; throws if device_ptr is not tracked.
+template<class T>
+inline
+void CudaMemoryModel::copy_from_buffer(T* host_ptr, size_t buf_size, const T* device_ptr)
+{
+  const bool tracked = device_to_host_map.find(device_ptr) != device_to_host_map.end();
+  if (!tracked) {
+    throw std::runtime_error("CudaMemoryModel::copy_from_buffer ERROR, device_ptr not known.");
+  }
+
+  CUDA_CALL( cudaMemcpy( host_ptr, device_ptr, sizeof(T)*buf_size, cudaMemcpyDeviceToHost) );
+}
+
+// Releases every device buffer still recorded in device_to_host_map.
+inline
+CudaMemoryModel::~CudaMemoryModel()
+{
+  typedef std::map<const void*,const void*>::iterator map_iter;
+
+  for(map_iter entry = device_to_host_map.begin();
+      entry != device_to_host_map.end(); ++entry) {
+    // the key is stored as const void*; cudaFree takes a non-const pointer,
+    // so the constness is cast away here
+    CUDA_CALL( cudaFree( const_cast<void*>(entry->first) ) );
+  }
+}
+
+#endif
+
+#endif
+
diff --git a/openmp-avx512/basic/optional/cuda/CudaNode.cpp b/openmp-avx512/basic/optional/cuda/CudaNode.cpp
new file mode 100644
index 0000000..5ddc580
--- /dev/null
+++ b/openmp-avx512/basic/optional/cuda/CudaNode.cpp
@@ -0,0 +1,96 @@
+#include <CudaNode.hpp>
+#include <stdexcept>
+#include <iostream>
+#include <cutil_inline_runtime.h>
+
+// some CUDA rules of thumb employed here (stolen from slides by Mike Bailey, Oregon State)
+// -The number of Blocks should be at least twice the number of MPs 
+// -The number of Threads per Block should be a multiple of 64 
+// -  192 or 256 are good numbers for Threads/Block 
+// We will enforce that numThreads is a power of two (to ease the reduction kernel)
+// greater than 64
+
+// Attaches to a CUDA device and allocates the per-block scratch buffers.
+// numBlocks <= 0 (the declared default is -1) picks a block count from the device.
+CUDANode::CUDANode(int device, int numBlocks, int numThreads, int verbose)
+: numBlocks_(numBlocks)
+, numThreads_(numThreads)
+, h_blk_mem_(NULL)
+, d_blk_mem_(NULL)
+, blk_mem_size_(0)
+{
+  using std::cout;
+  using std::endl;
+  using std::runtime_error;
+  // numThreads_ should be a power of two in [1,512]; this only warns because the
+  // hard failure below is commented out in the original
+  if (numThreads_ < 1 || numThreads_ > 512 || (numThreads_ & (numThreads_ - 1)) != 0) {
+//    throw runtime_error("CUDANode::CUDANode(): number of threads per block must be a power of two in [1,512].");
+    if (verbose) cout << "CUDANode::CUDANode(): number of threads per block should be a power of two in [1,512]." << endl;
+  }
+  int deviceCount = 0; cudaGetDeviceCount(&deviceCount); // stays 0 if the query fails
+  if (device >= deviceCount) {
+    if (deviceCount == 0) {
+//      throw runtime_error("CUDANode::CUDANode(): system has no CUDA devices.");
+    }
+    if (verbose) {
+      cout << "CUDANode::CUDANode(): specified device number not valid. Using device 0." << endl;
+    }
+    device = 0;
+  }
+  cudaDeviceProp deviceProp = cudaDeviceProp(); // zeroed so a failed query is not read as garbage
+  int deviceAlreadyBeingUsed = -1;
+  cudaGetDevice(&deviceAlreadyBeingUsed);
+  if (deviceAlreadyBeingUsed >= 0 && deviceAlreadyBeingUsed < deviceCount) {
+    device = deviceAlreadyBeingUsed;
+  }
+  else {
+    cudaSetDevice(device);
+  }
+  cudaGetDeviceProperties(&deviceProp, device);
+  // as of CUDA 2.1, device prop contains the following fields
+  // char name[256]; size_t totalGlobalMem, sharedMemPerBlock;
+  // int regsPerBlock, warpSize; size_t memPitch;
+  // int maxThreadsPerBlock, maxThreadsDim[3], maxGridSize[3];
+  // size_t totalConstMem; int major, minor; int clockRate;
+  // size_t textureAlignment; int deviceOverlap;
+  // int multiProcessorCount; int kernelExecTimeoutEnabled;
+  if (verbose) {
+    cout << "CUDANode attached to device #" << device << " \"" << deviceProp.name 
+         << "\", of compute capability " << deviceProp.major << "." << deviceProp.minor
+         << endl;
+  }
+  totalMem_ = deviceProp.totalGlobalMem;
+  // A non-positive block count would wrap numBlocks_*8 to a huge size_t below and is
+  // not a valid launch grid; use two blocks per multiprocessor (rule of thumb at file top).
+  if (numBlocks_ <= 0) {
+    numBlocks_ = (deviceProp.multiProcessorCount > 0) ? 2 * deviceProp.multiProcessorCount : 1;
+  }
+  expand_blk_mem(numBlocks_*8);
+}
+
+// Grows the paired device/host scratch buffers to size_in_bytes (old contents are dropped).
+void CUDANode::expand_blk_mem(size_t size_in_bytes)
+{
+  if (blk_mem_size_ >= size_in_bytes) return;  // already large enough
+  if (d_blk_mem_ != NULL) {
+    cutilSafeCallNoSync( cudaFree(d_blk_mem_) );
+    delete [] h_blk_mem_;
+    // forget the freed buffers so a throwing 'new' below cannot cause a double free later
+    d_blk_mem_ = NULL; h_blk_mem_ = NULL; blk_mem_size_ = 0;
+  }
+  cutilSafeCallNoSync( cudaMalloc(&d_blk_mem_, size_in_bytes) );
+  h_blk_mem_ = new char[size_in_bytes];
+  blk_mem_size_ = size_in_bytes;
+}
+
+// Releases the scratch buffers; the host buffer is only freed together with the
+// device buffer, matching how expand_blk_mem allocates the two as a pair.
+CUDANode::~CUDANode()
+{
+  blk_mem_size_ = 0;
+  if (d_blk_mem_ == NULL) return;
+  cutilSafeCallNoSync( cudaFree(d_blk_mem_) );
+  delete [] h_blk_mem_;
+  d_blk_mem_ = NULL; h_blk_mem_ = NULL;
+}
+
diff --git a/openmp-avx512/basic/optional/cuda/CudaNode.cuh b/openmp-avx512/basic/optional/cuda/CudaNode.cuh
new file mode 100644
index 0000000..9b1b4fb
--- /dev/null
+++ b/openmp-avx512/basic/optional/cuda/CudaNode.cuh
@@ -0,0 +1,66 @@
+#ifndef CUDANODE_CUH_
+#define CUDANODE_CUH_
+
+#include <stdio.h>
+#include <cuda.h>
+#include <sharedmem.cuh>
+#include <cutil_inline_runtime.h>
+#include <cublas.h>
+
+// must define this before including any kernels
+#define KERNEL_PREFIX __device__ __host__
+
+#include <CudaNode.hpp>
+
+#include <DotOp.hpp>
+
+#ifdef CUDANODE_INCLUDE_PARALLEL_FOR
+// Kernel: applies wd to indices start, start+stride, ... while below length, where
+// start is this thread's global index (blockIdx.x*blockDim.x + threadIdx.x).
+template <class WDP>
+__global__ void
+Tkern1D(int length, WDP wd, int stride)
+{
+  for (unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x; idx < length; idx += stride) {
+    wd(idx);
+  }
+}
+
+// Launches Tkern1D on numBlocks_ x numThreads_ threads; empty or negative ranges are skipped.
+template <class WDP>
+void CUDANode::parallel_for(int length, WDP wd) {
+  if (length <= 0) return;  // a negative length would compare as a huge unsigned bound in the kernel
+  Tkern1D<WDP> <<< numBlocks_, numThreads_ >>>(length, wd, numThreads_ * numBlocks_);
+}
+#endif // parallel_for
+
+#ifdef CUDANODE_INCLUDE_PARALLEL_REDUCE
+// Dot-product dispatch on the scalar type of the DotOp. The definitions are
+// 'inline' because explicit specializations defined in a header are otherwise
+// ordinary functions and would be multiply defined if two .cu files enabled them.
+template<typename SCALAR>
+inline void call_dot(DotOp<SCALAR>& wd)
+{
+  printf("ERROR, unknown scalar-type, skipping cuda dot-product.\n");
+}
+// double: forwards to cublasDdot with unit strides
+template<>
+inline void call_dot(DotOp<double>& wd)
+{ wd.result = cublasDdot(wd.n, wd.x, 1, wd.y, 1); }
+template<>
+inline void call_dot(DotOp<float>& wd)
+{ wd.result = cublasSdot(wd.n, wd.x, 1, wd.y, 1); }
+
+// A single-element reduction is evaluated directly via wd.generate(0); otherwise call_dot(wd).
+template <class WDP>
+void CUDANode::parallel_reduce(int length, WDP& wd)
+{
+  if (length != 1) {
+    call_dot(wd);
+    return;
+  }
+  wd.result = wd.generate(0);
+}
+#endif // parallel_reduce
+
+#endif
diff --git a/openmp-avx512/basic/optional/cuda/CudaNode.hpp b/openmp-avx512/basic/optional/cuda/CudaNode.hpp
new file mode 100644
index 0000000..de078ea
--- /dev/null
+++ b/openmp-avx512/basic/optional/cuda/CudaNode.hpp
@@ -0,0 +1,57 @@
+#ifndef CUDANODE_HPP_
+#define CUDANODE_HPP_
+
+#include <CudaMemoryModel.hpp>
+
+// forward declaration
+class CUDANode;
+
+class CUDANode : public CudaMemoryModel {
+  public:
+    // Compute node that launches work on a CUDA device (see parallel_for / parallel_reduce).
+    CUDANode(int device = 0, int numBlocks = -1, int numThreads = 256, int verbose = 1);
+
+    ~CUDANode();
+
+    //@{ Computational methods
+
+    template <class WDP>
+    void parallel_for(int length, WDP wdp);
+
+    template <class WDP>
+    void parallel_reduce(int length, WDP& wd);
+
+    //@} 
+    // Creates one node on the first call; arguments of later calls are ignored and the node is never deleted.
+    static CUDANode& singleton(int device=0, int numBlocks=-1, int numThreads=256)
+    {
+      static CUDANode* cuda_node = NULL;
+      if (cuda_node == NULL) {
+        cuda_node = new CUDANode(device, numBlocks, numThreads);
+      }
+      return *cuda_node;
+    }
+      
+  private:
+    //template <class WDP, int FirstLevel>
+    //void call_reduce(int length, WDP wd, int threads, int blocks, void * d_blkpart);
+    // numBlocks_ is 
+    // - the number of blocks launched in a call to parallel_for()
+    // - not used by parallel_reduce()
+    int numBlocks_;
+    // numThreads_ is required to be a power-of-two (our requirement) between 1 and 512 (CUDA's requirement). It is:
+    // - the maximum number of threads used by parallel_reduce()
+    // - the number of threads per block in a call to parallel_for()
+    int numThreads_;
+    // total global device memory, in bytes (size_t like cudaDeviceProp::totalGlobalMem; int truncates >= 2 GiB)
+    size_t totalMem_;
+
+    void expand_blk_mem(size_t size_in_bytes);
+
+    char* h_blk_mem_;
+    void* d_blk_mem_;
+    size_t blk_mem_size_;
+
+};
+
+#endif
diff --git a/openmp-avx512/basic/optional/cuda/CudaNodeImpl.hpp b/openmp-avx512/basic/optional/cuda/CudaNodeImpl.hpp
new file mode 100644
index 0000000..4b94562
--- /dev/null
+++ b/openmp-avx512/basic/optional/cuda/CudaNodeImpl.hpp
@@ -0,0 +1,15 @@
+#ifndef CUDANODE_IMPL_HPP_
+#define CUDANODE_IMPL_HPP_
+
+#include <CudaNode.hpp>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cutil_inline_runtime.h>
+#include <stdlib.h>
+#include <stdexcept>
+
+// TODO: consider using cudaMallocHost to allocate page-locked host memory
+//       this speeds up transfer between device and host, and could be very 
+//       useful in the case of Import/Export multivector operations
+
+#endif
diff --git a/openmp-avx512/basic/optional/cuda/Matrix.cu b/openmp-avx512/basic/optional/cuda/Matrix.cu
new file mode 100644
index 0000000..1487f1a
--- /dev/null
+++ b/openmp-avx512/basic/optional/cuda/Matrix.cu
@@ -0,0 +1,22 @@
+#define CUDANODE_INCLUDE_PARALLEL_FOR
+
+// include for CudaNode method implementations
+#include <CudaNode.cuh>
+
+// includes for all operators for which Matrix needs support
+#include <MatvecOp.hpp>
+#include <MatrixInitOp.hpp>
+#include <MatrixCopyOp.hpp>
+
+#include <Vector.hpp>
+#include <SparseMatrix.hpp>
+
+// explicit instantiation of CUDANode::parallel_for for MatvecOp (the only operator instantiated in this file)
+#define EXPLICIT_MATRIX_SUPPORT(MATRIX,VECTOR) \
+template void CUDANode::parallel_for<MatvecOp< MATRIX, VECTOR > >(int , MatvecOp< MATRIX, VECTOR >);
+
+typedef miniFE::SparseMatrix<MINIFE_SCALAR,MINIFE_LOCAL_ORDINAL,MINIFE_GLOBAL_ORDINAL,CUDANode> Matrix_type;
+typedef miniFE::Vector<MINIFE_SCALAR,MINIFE_LOCAL_ORDINAL,MINIFE_GLOBAL_ORDINAL,CUDANode> Vector_type;
+
+EXPLICIT_MATRIX_SUPPORT(Matrix_type,Vector_type)
+
diff --git a/openmp-avx512/basic/optional/cuda/Vector.cu b/openmp-avx512/basic/optional/cuda/Vector.cu
new file mode 100644
index 0000000..9a79955
--- /dev/null
+++ b/openmp-avx512/basic/optional/cuda/Vector.cu
@@ -0,0 +1,19 @@
+#define CUDANODE_INCLUDE_PARALLEL_REDUCE
+#define CUDANODE_INCLUDE_PARALLEL_FOR
+
+// include for CudaNode method implementations
+#include <CudaNode.cuh>
+
+// includes for all operators for which Vector needs support
+#include <WaxpbyOp.hpp>
+#include <DotOp.hpp>
+#include <MemInitOp.hpp>
+#include <FEComputeElem.hpp>
+
+// explicit instantiations for Vectors: parallel_for<WaxpbyOp>, parallel_reduce<DotOp>, parallel_for<FEComputeElem>
+#define EXPLICIT_VECTOR_SUPPORT(GLOBALORDINAL, SCALAR) \
+template void CUDANode::parallel_for<WaxpbyOp< SCALAR > >(int , WaxpbyOp< SCALAR >); \
+template void CUDANode::parallel_reduce< DotOp< SCALAR > >(int ,  DotOp< SCALAR >& ); \
+template void CUDANode::parallel_for<FEComputeElem< GLOBALORDINAL, SCALAR > >(int , FEComputeElem< GLOBALORDINAL, SCALAR > );
+
+EXPLICIT_VECTOR_SUPPORT(MINIFE_GLOBAL_ORDINAL, MINIFE_SCALAR)
diff --git a/openmp-avx512/basic/optional/cuda/cutil_inline_runtime.h b/openmp-avx512/basic/optional/cuda/cutil_inline_runtime.h
new file mode 100644
index 0000000..1f49afb
--- /dev/null
+++ b/openmp-avx512/basic/optional/cuda/cutil_inline_runtime.h
@@ -0,0 +1,63 @@
+#ifndef _CUTIL_INLINE_FUNCTIONS_RUNTIME_H_
+#define _CUTIL_INLINE_FUNCTIONS_RUNTIME_H_
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include <cufft.h>
+
+// We define these calls here, so the user doesn't need to include __FILE__ and __LINE__
+// The advantage is the developers gets to use the inline function so they can debug
+#define cutilSafeCallNoSync(err)     __cudaSafeCallNoSync(err, __FILE__, __LINE__)
+#define cutilSafeCall(err)           __cudaSafeCall      (err, __FILE__, __LINE__)
+#define cutilSafeThreadSync()        __cudaSafeThreadSync(__FILE__, __LINE__)
+#define cutilCheckMsg(msg)           __cutilCheckMsg     (msg, __FILE__, __LINE__)
+
+inline void __cudaSafeCallNoSync( cudaError err, const char *file, const int line )
+{
+    if( cudaSuccess != err) {
+        fprintf(stderr, "cudaSafeCallNoSync() Runtime API error in file <%s>, line %i : %s.\n",
+                file, line, cudaGetErrorString( err) );
+        exit(-1);
+    }
+}
+
+inline void __cudaSafeCall( cudaError err, const char *file, const int line )
+{
+    if( cudaSuccess != err) {
+        fprintf(stderr, "cudaSafeCall() Runtime API error in file <%s>, line %i : %s.\n",
+                file, line, cudaGetErrorString( err) );
+        exit(-1);
+    }
+}
+
+// Calls cudaThreadSynchronize() and exits with a file/line diagnostic if it fails.
+inline void __cudaSafeThreadSync( const char *file, const int line )
+{
+    const cudaError status = cudaThreadSynchronize();
+    if ( status == cudaSuccess ) return;
+    fprintf(stderr, "cudaThreadSynchronize() Driver API error in file '%s' in line %i : %s.\n",
+            file, line, cudaGetErrorString( status) );
+    exit(-1);
+}
+
+inline void __cutilCheckMsg( const char *errorMessage, const char *file, const int line )
+{
+    cudaError_t err = cudaGetLastError();
+    if( cudaSuccess != err) {
+        fprintf(stderr, "cutilCheckMsg() CUTIL CUDA error: %s in file <%s>, line %i : %s.\n",
+                errorMessage, file, line, cudaGetErrorString( err) );
+        exit(-1);
+    }
+#ifdef _DEBUG
+    err = cudaThreadSynchronize();
+    if( cudaSuccess != err) {
+        fprintf(stderr, "cutilCheckMsg cudaThreadSynchronize error: %s in file <%s>, line %i : %s.\n",
+                errorMessage, file, line, cudaGetErrorString( err) );
+        exit(-1);
+    }
+#endif
+}
+
+#endif // _CUTIL_INLINE_FUNCTIONS_RUNTIME_H_
diff --git a/openmp-avx512/basic/optional/make_targets b/openmp-avx512/basic/optional/make_targets
new file mode 100644
index 0000000..01ed2c8
--- /dev/null
+++ b/openmp-avx512/basic/optional/make_targets
@@ -0,0 +1,54 @@
+#-----------------------------------------------------------------------
+
+TPI.o : ./optional/ThreadPool/src/TPI.c
+	$(CC) $(CFLAGS) $(CPPFLAGS) -c $<
+
+#-----------------------------------------------------------------------
+
+CudaNode.o : ./optional/cuda/CudaNode.cpp ./optional/cuda/*.hpp ./optional/cuda/*.h
+	$(CXX) $(CXXFLAGS) $(CPPFLAGS) -c $<
+
+CudaVector.o : ./optional/cuda/Vector.cu ./optional/cuda/*.cuh
+	nvcc $(CUDAFLAGS) $(CPPFLAGS) -c -o $@ $<
+
+CudaMatrix.o : ./optional/cuda/Matrix.cu ./optional/cuda/*.cuh
+	nvcc $(CUDAFLAGS) $(CPPFLAGS) -c -o $@ $<
+
+#-----------------------------------------------------------------------
+# Recursive make to create the object files in this directory,
+# generate the archive, and then remove the object files.
+# '&&' keeps the sub-make from running in the wrong directory if the cd fails.
+libstk.a :
+	cd ./optional && \
+	$(MAKE) "CC=$(CC)" "CXX=$(CXX)" "CPPFLAGS=$(CPPFLAGS)" "CFLAGS=$(CFLAGS)" "CXXFLAGS=$(CXXFLAGS)" -f make_targets stk_library
+
+STK_SOURCE =	\
+	./shards/src/*.cpp	\
+	./stk_util/util/*.cpp	\
+	./stk_util/environment/*.cpp	\
+	./stk_util/parallel/*.cpp	\
+	./stk_mesh/base/*.cpp	\
+	./stk_mesh/baseImpl/*.cpp	\
+	./stk_mesh/fem/*.cpp	\
+	stk_helpers.cpp
+
+STK_INCLUDES =	\
+	./shards/src/*.hpp	\
+	./shards/src/*.h	\
+	./stk_util/util/*.hpp	\
+	./stk_util/environment/*.hpp	\
+	./stk_util/parallel/*.hpp	\
+	./stk_mesh/base/*.hpp	\
+	./stk_mesh/fem/*.hpp
+
+STK_INC = -I${PWD}/ThreadPool -I${PWD}/shards
+# 'ar -r' replaces existing members, so a rebuild does not append duplicate objects.
+stk_library : $(STK_SOURCE) $(STK_INCLUDES)
+	$(CXX) $(CXXFLAGS) $(CPPFLAGS) $(STK_INC) -c $(STK_SOURCE)
+	ar -rc ../libstk.a *.o
+	ranlib ../libstk.a
+	rm -f *.o
+
+#-----------------------------------------------------------------------
+
+
diff --git a/openmp-avx512/basic/perform_element_loop.hpp b/openmp-avx512/basic/perform_element_loop.hpp
new file mode 100644
index 0000000..f65ad4f
--- /dev/null
+++ b/openmp-avx512/basic/perform_element_loop.hpp
@@ -0,0 +1,110 @@
+#ifndef _perform_element_loop_hpp_
+#define _perform_element_loop_hpp_
+
+//@HEADER
+// ************************************************************************
+// 
+//               miniFE: simple finite-element assembly and linear-solve
+//                 Copyright (2006) Sandia Corporation
+// 
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+// 
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//  
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//  
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+
+#include <BoxIterator.hpp>
+#include <simple_mesh_description.hpp>
+#include <SparseMatrix_functions.hpp>
+#include <box_utils.hpp>
+#include <Hex8_box_utils.hpp>
+#include <Hex8_ElemData.hpp>
+
+namespace miniFE {
+
+// Serial finite-element assembly: collects the ids of all elements in local_elem_box,
+// then for each element gathers node ids/coords, computes the element matrix and
+// vector, and sums them into A and b. The Parameters argument is unused.
+template<typename GlobalOrdinal,
+         typename MatrixType, typename VectorType>
+void
+perform_element_loop(const simple_mesh_description<GlobalOrdinal>& mesh,
+                     const Box& local_elem_box,
+                     MatrixType& A, VectorType& b,
+                     Parameters& /*params*/)
+{
+  typedef typename MatrixType::ScalarType Scalar;
+
+  int global_elems_x = mesh.global_box[0][1];
+  int global_elems_y = mesh.global_box[1][1];
+  int global_elems_z = mesh.global_box[2][1];
+
+  //We will iterate the local-element-box (local portion of the mesh), and
+  //get element-IDs in preparation for later assembling the FE operators
+  //into the global sparse linear-system.
+
+  GlobalOrdinal num_elems = get_num_ids<GlobalOrdinal>(local_elem_box);
+  std::vector<GlobalOrdinal> elemIDs(num_elems);
+
+  BoxIterator iter = BoxIterator::begin(local_elem_box);
+  BoxIterator end  = BoxIterator::end(local_elem_box);
+
+  for(size_t i=0; iter != end; ++iter, ++i) {
+    elemIDs[i] = get_id<GlobalOrdinal>(global_elems_x, global_elems_y, global_elems_z,
+                                       iter.x, iter.y, iter.z);
+//#ifdef MINIFE_DEBUG
+//std::cout << "elem ID " << elemIDs[i] << " ("<<iter.x<<","<<iter.y<<","<<iter.z<<")"<<std::endl;
+//#endif
+  }
+
+  //Now do the actual finite-element assembly loop:
+  ElemData<GlobalOrdinal,Scalar> elem_data; // one instance, reused for every element
+  compute_gradient_values(elem_data.grad_vals); // called once, before the element loop
+
+  timer_type t_gn = 0, t_ce = 0, t_si = 0; // per-phase accumulators, only reported by the commented-out output below
+  timer_type t0 = 0; // NOTE(review): not referenced directly here; presumably used by TICK()/TOCK() - confirm
+  for(size_t i=0; i<elemIDs.size(); ++i) {
+    //Given an element-id, populate elem_data with the
+    //element's node_ids and nodal-coords:
+    TICK();
+    get_elem_nodes_and_coords(mesh, elemIDs[i], elem_data);
+    TOCK(t_gn);
+
+    //Next compute element-diffusion-matrix and element-source-vector:
+
+    TICK();
+    compute_element_matrix_and_vector(elem_data);
+    TOCK(t_ce);
+
+    //Now assemble the (dense) element-matrix and element-vector into the
+    //global sparse linear system:
+
+    TICK();
+    sum_into_global_linear_system(elem_data, A, b);
+    TOCK(t_si);
+  }
+//std::cout << std::endl<<"get-nodes: " << t_gn << std::endl;
+//std::cout << "compute-elems: " << t_ce << std::endl;
+//std::cout << "sum-in: " << t_si << std::endl;
+}
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/basic/perform_element_loop_TBB_pipe.hpp b/openmp-avx512/basic/perform_element_loop_TBB_pipe.hpp
new file mode 100644
index 0000000..044a049
--- /dev/null
+++ b/openmp-avx512/basic/perform_element_loop_TBB_pipe.hpp
@@ -0,0 +1,382 @@
+#ifndef _perform_element_loop_TBB_pipe_hpp_
+#define _perform_element_loop_TBB_pipe_hpp_
+
+//@HEADER
+// ************************************************************************
+// 
+//               miniFE: simple finite-element assembly and linear-solve
+//                 Copyright (2006) Sandia Corporation
+// 
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+// 
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//  
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//  
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+
+#ifdef MINIFE_HAVE_TBB
+
+#include <LockingMatrix.hpp>
+#include <LockingVector.hpp>
+#include <BoxIterator.hpp>
+#include <simple_mesh_description.hpp>
+#include <SparseMatrix_functions.hpp>
+#include <Hex8_box_utils.hpp>
+#include <Hex8_ElemData.hpp>
+
+#include <tbb/pipeline.h>
+
+namespace miniFE {
+
+//---------------------------------------------------------------------
+//This file contains four 'filter' classes (stage 3 has two alternatives), and a
+//'perform_element_loop' function that uses those filter classes to run a TBB pipeline.
+//
+//The filter classes are as follows:
+//1. GetElemNodesCoords
+//     For each element in the mesh, create an elem-data object with coords
+//     and node-ids.
+//2. Compute_FE_Operators
+//     Given an elem-data object (with coords and node-ids), compute the
+//     diffusion-matrix and source-vector.
+//3. LockingSumIntoLinearSystem
+//     Given an elem-data object (with diffusion-matrix and source-vector),
+//     assemble into global-sparse linear-system. Uses a lock on each
+//     matrix row to ensure that multiple threads don't update the same row
+//     at the same time.
+//... or:
+//3. SumIntoLinearSystem
+//     Given an elem-data object (with diffusion-matrix and source-vector),
+//     assemble into global-sparse linear-system.
+//     There are several of these filters, usually 1 per thread, and each
+//     will be responsible for a certain slice of equations. It will check
+//     the elem-data for equations that are in its slice, assemble those, and
+//     pass the elem-data on so that the next SumIntoLinearSystem filter can
+//     deal with equations in a different 'slice'.
+//
+//---------------------------------------------------------------------
+
+//---------------------------------------------------------------------
+
+/** Filter 1.: GetElemNodesCoords -- serial first stage of the pipeline; hands
+ *  out heap-allocated batches of elem-data with node-ids/coords filled in. */
+template<typename GlobalOrdinal,typename Scalar>
+class GetElemNodesCoords : public tbb::filter {
+  typedef std::vector<ElemData<GlobalOrdinal,Scalar> > ElemDataVec;
+
+public:
+  /** Stores references to elemIDs and mesh (neither is copied). A batch
+    * size below 1 is raised to 1. */
+  GetElemNodesCoords(const std::vector<GlobalOrdinal>& elemIDs,
+                     const simple_mesh_description<GlobalOrdinal>& mesh,
+                     size_t num_elems_at_a_time)
+   : tbb::filter(/*is_serial=*/true),
+     elemIDs_(elemIDs), i_(0), mesh_(mesh),
+     num_elems_(num_elems_at_a_time < 1 ? 1 : num_elems_at_a_time)
+  {}
+
+  ~GetElemNodesCoords(){}
+
+private:
+  /** Produce the next batch: at most num_elems_ elements, fewer for the
+    * final batch. The vector is allocated with new and is not deleted by
+    * this class. Returns NULL once every element ID has been handed out.
+    * The incoming 'item' argument is not used.
+   */
+  void* operator()(void* item) {
+    const size_t total = elemIDs_.size();
+    if (i_ >= total) return NULL;
+
+    size_t batch = num_elems_;
+    if (i_+batch > total) batch = total - i_;
+
+    ElemDataVec* elemdata_vec = new ElemDataVec(batch);
+
+    //Fill every slot of the batch, advancing the persistent cursor i_.
+    for(size_t k=0; k<batch; ++k, ++i_) {
+      get_elem_nodes_and_coords(mesh_, elemIDs_[i_], (*elemdata_vec)[k]);
+    }
+
+    return elemdata_vec;
+  }
+
+  const std::vector<GlobalOrdinal>& elemIDs_;
+  size_t i_;
+  const simple_mesh_description<GlobalOrdinal>& mesh_;
+  size_t num_elems_;
+};
+
+//---------------------------------------------------------------------
+
+/** Filter 2.: Compute_FE_Operators -- the parallel (non-serial) middle stage.
+ */
+template<typename GlobalOrdinal,typename Scalar>
+class Compute_FE_Operators : public tbb::filter {
+public:
+  Compute_FE_Operators() : tbb::filter(/*is_serial=*/false) {}
+  ~Compute_FE_Operators() {}
+
+private:
+  /** Calls compute_element_matrix_and_vector on every elem-data object in
+    * the batch that 'item' points to, then forwards that same pointer to
+    * the next stage. A NULL item is passed through as NULL.
+   */
+  void* operator()(void* item) {
+    typedef std::vector<ElemData<GlobalOrdinal,Scalar> > ElemDataVec;
+    if (item == NULL) return NULL;
+
+    ElemDataVec& batch = *static_cast<ElemDataVec*>(item);
+    typename ElemDataVec::iterator it = batch.begin(), last = batch.end();
+    for(; it != last; ++it) compute_element_matrix_and_vector(*it);
+    return item;
+  }
+};
+
+//---------------------------------------------------------------------
+
+/** Filter 3.: SumIntoLinearSystem
+ */
+template<typename MatrixType, typename VectorType>
+class SumIntoLinearSystem : public tbb::filter {
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal;
+  typedef typename MatrixType::ScalarType Scalar;
+
+public:
+  SumIntoLinearSystem(GlobalOrdinal myFirstRow,
+                      GlobalOrdinal myLastRow,
+                      MatrixType& mat, VectorType& vec)
+   : tbb::filter(/*is_serial=*/true),
+     A_(mat), b_(vec),
+     myFirstRow_(myFirstRow),
+     myLastRow_(myLastRow)
+  {}
+
+  ~SumIntoLinearSystem() {}
+
+private:
+  /** This operator takes a vector of elem-data objects which have an
+    * element-diffusion-matrix and source-vector, assembles the rows that
+    * fall in this filter's slice [myFirstRow_, myLastRow_] into the
+    * linear-system, then passes the elem-data object on for use by the
+    * next assembly filter. If this slice reaches the last row id held in
+    * A_.rows, no later filter has rows left to assemble, so the elem-data
+    * object is deleted here and NULL is returned.
+    */
+  void* operator()(void* item) {
+    if (item == NULL) return NULL;
+    std::vector<ElemData<GlobalOrdinal,Scalar> >* elemdata_vec = static_cast<std::vector<ElemData<GlobalOrdinal,Scalar> >*>(item);
+    for(size_t e=0; e<elemdata_vec->size(); ++e) {
+      ElemData<GlobalOrdinal,Scalar>& elemdata = (*elemdata_vec)[e];
+      size_t nnodes = elemdata.nodes_per_elem;
+      for(size_t i=0; i<nnodes; ++i) {
+        GlobalOrdinal row = elemdata.elem_node_ids[i];
+        if (row < myFirstRow_ || row > myLastRow_) continue;
+        //NOTE(review): i*nnodes assumes a full nnodes*nnodes matrix -- confirm.
+        sum_into_row(row, nnodes, elemdata.elem_node_ids,
+                     &(elemdata.elem_diffusion_matrix[i*nnodes]), A_);
+        sum_into_vector(1, &row, &(elemdata.elem_source_vector[i]), b_);
+      }
+    }
+
+    //The slice bounds are row ids (the caller starts them at A.rows[0]), so
+    //test against the last row id, not the row count, or the final slice
+    //can miss the delete and leak. Assumes A_.rows ascends -- TODO confirm.
+    const size_t nrows = A_.rows.size();
+    if (nrows == 0 || myLastRow_ >= A_.rows[nrows-1]) {
+      delete elemdata_vec;
+      return NULL;
+    }
+    return elemdata_vec;
+  }
+
+  MatrixType& A_;
+  VectorType& b_;
+  GlobalOrdinal myFirstRow_;
+  GlobalOrdinal myLastRow_;
+};
+
+//---------------------------------------------------------------------
+
+static tbb::atomic<size_t> matrix_suminto; //rows summed in by the locking filter
+
+/** Filter 3.: SumIntoLinearSystem with locking
+ */
+template<typename MatrixType, typename VectorType>
+class LockingSumIntoLinearSystem : public tbb::filter {
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal;
+  typedef typename MatrixType::ScalarType Scalar;
+
+public:
+  LockingSumIntoLinearSystem(MatrixType& mat, VectorType& vec)
+   : tbb::filter(/*is_serial=*/false),
+     A_(mat), b_(vec)
+  {
+  }
+
+  ~LockingSumIntoLinearSystem() {}
+
+private:
+  /** This operator takes a vector of elem-data objects which have an
+    * element-diffusion-matrix (read here as a packed upper triangle) and
+    * source-vector, and sums them into the linear-system through the
+    * LockingMatrix/LockingVector wrappers. As the final pipeline stage
+    * it deletes the elem-data vector and returns NULL.
+    */
+  void* operator()(void* item) {
+    if (item == NULL) return NULL;
+    std::vector<ElemData<GlobalOrdinal,Scalar> >* elemdata_vec = static_cast<std::vector<ElemData<GlobalOrdinal,Scalar> >*>(item);
+
+    for(size_t e=0; e<elemdata_vec->size(); ++e) {
+      ElemData<GlobalOrdinal,Scalar>& elemdata = (*elemdata_vec)[e];
+      size_t nnodes = elemdata.nodes_per_elem;
+      size_t offset = 0;
+      for(size_t i=0; i<nnodes; ++i) {
+        GlobalOrdinal row = elemdata.elem_node_ids[i];
+        //The contiguous row starting from the diagonal is the upper triangle.
+        const Scalar* row_coefs = &elemdata.elem_diffusion_matrix[offset];
+        const GlobalOrdinal* col_inds = &elemdata.elem_node_ids[i];
+        size_t row_len = nnodes-i;
+        offset += row_len; //advance to the next packed row
+        ++matrix_suminto;
+
+        A_.sum_in(row, row_len, col_inds, row_coefs);
+
+        //Now we have to loop to sum in the lower triangle. row_coefs and
+        //col_inds already start at the diagonal, so index them relative to it.
+        for(size_t k=1; k<row_len; ++k) {
+          A_.sum_in(col_inds[k], 1, &row, &row_coefs[k]);
+        }
+
+        b_.sum_in(1, &row, &(elemdata.elem_source_vector[i]));
+      }
+    }
+
+    delete elemdata_vec;
+    return NULL;
+  }
+
+  LockingMatrix<MatrixType> A_;
+  LockingVector<VectorType> b_;
+};
+
+//---------------------------------------------------------------------
+
+/** Assemble the FE operators for local_elem_box into A and b by running a
+  * TBB pipeline: GetElemNodesCoords -> Compute_FE_Operators -> assembly.
+  * params.use_locking==1 selects one LockingSumIntoLinearSystem stage,
+  * otherwise a chain of SumIntoLinearSystem stages. No-op if A has no rows. */
+template<typename GlobalOrdinal,
+         typename MatrixType, typename VectorType>
+void
+perform_element_loop(const simple_mesh_description<GlobalOrdinal>& mesh,
+                     const Box& local_elem_box,
+                     MatrixType& A, VectorType& b,
+                     Parameters& params)
+{
+  typedef typename MatrixType::ScalarType Scalar;
+
+  if (A.rows.size() == 0) return;
+
+  int num_threads = params.numthreads;
+
+  //We will iterate the local-element-box (local portion of the mesh), and
+  //assemble the FE operators into the global sparse linear-system.
+  tbb::pipeline pipe;
+
+  int global_elems_x = mesh.global_box[0][1];
+  int global_elems_y = mesh.global_box[1][1];
+  int global_elems_z = mesh.global_box[2][1];
+
+  GlobalOrdinal num_elems = get_num_ids<GlobalOrdinal>(local_elem_box);
+  std::vector<GlobalOrdinal> elemIDs(num_elems);
+
+  BoxIterator iter = BoxIterator::begin(local_elem_box);
+  BoxIterator end  = BoxIterator::end(local_elem_box);
+
+  for(size_t i=0; iter != end; ++iter, ++i) {
+    elemIDs[i] = get_id<GlobalOrdinal>(global_elems_x, global_elems_y, global_elems_z,
+                                       iter.x, iter.y, iter.z);
+  }
+
+  //Create the first stage of the pipeline, the filter that will
+  //launch elem-data from the mesh, through the pipeline.
+  GetElemNodesCoords<GlobalOrdinal,Scalar> get_nodes_coords(elemIDs, mesh, params.elem_group_size);
+
+  //Create the second stage of the pipeline, the parallel filter that will
+  //compute element-matrices and element-vectors.
+  Compute_FE_Operators<GlobalOrdinal,Scalar> fe_ops;
+
+  //Add the filters to the pipeline:
+  pipe.add_filter(get_nodes_coords);
+  pipe.add_filter(fe_ops);
+
+  LockingSumIntoLinearSystem<MatrixType,VectorType>* sum_into_linsys = NULL;
+  size_t num_assembly_filters = 0;
+  std::vector<SumIntoLinearSystem<MatrixType,VectorType>*> linsys;
+
+  bool use_locking = params.use_locking==1;
+  if (use_locking) {
+    sum_into_linsys = new LockingSumIntoLinearSystem<MatrixType,VectorType>(A, b);
+    pipe.add_filter(*sum_into_linsys);
+  }
+  else {
+    //If not using locking, create several assembly filters, each of which
+    //will be responsible for assembling rows into a certain slice of the
+    //global matrix.
+    num_assembly_filters = num_threads/3;
+    if (num_assembly_filters == 0) num_assembly_filters = 1;
+    num_assembly_filters = 2; //NOTE(review): overrides the thread-based count above
+
+    size_t num_rows = A.rows.size();
+    size_t rows_per_thread = num_rows/num_assembly_filters;
+    if (num_rows % num_assembly_filters > 0) ++rows_per_thread;
+    size_t first_row = A.rows[0];
+    for(size_t i=0; i<num_assembly_filters; ++i) {
+      size_t last_row = first_row + rows_per_thread - 1;
+      SumIntoLinearSystem<MatrixType,VectorType> * sum_into = new SumIntoLinearSystem<MatrixType,VectorType>(first_row, last_row, A, b);
+      linsys.push_back(sum_into);
+      pipe.add_filter(*sum_into);
+      first_row += rows_per_thread;
+    }
+  }
+
+  //Running the pipeline carries out the element-loop and assembly.
+  pipe.run(num_threads);
+  pipe.clear();
+
+  if (use_locking) {
+    std::cout << "\n{number of matrix conflicts: " << miniFE_num_matrix_conflicts << "}"<<std::endl;
+    std::cout << "{number of vector conflicts: " << miniFE_num_vector_conflicts << "}"<<std::endl;
+    std::cout << "matrix_suminto: " << matrix_suminto << std::endl;
+  }
+  else {
+    std::cout << "no locking, num-assembly-filters: "<<num_assembly_filters<<std::endl;
+  }
+
+  delete sum_into_linsys;
+  for(size_t i=0; i<linsys.size(); ++i) delete linsys[i];
+}
+
+}//namespace miniFE
+
+#else
+#error "ERROR, this file shouldn't be compiled if MINIFE_HAVE_TBB is not defined."
+#endif
+
+#endif
+
diff --git a/openmp-avx512/basic/perform_element_loop_TBB_pllfor1.hpp b/openmp-avx512/basic/perform_element_loop_TBB_pllfor1.hpp
new file mode 100644
index 0000000..c2afb5b
--- /dev/null
+++ b/openmp-avx512/basic/perform_element_loop_TBB_pllfor1.hpp
@@ -0,0 +1,126 @@
+#ifndef _perform_element_loop_TBB_pllfor1_hpp_
+#define _perform_element_loop_TBB_pllfor1_hpp_
+
+//@HEADER
+// ************************************************************************
+// 
+//               miniFE: simple finite-element assembly and linear-solve
+//                 Copyright (2006) Sandia Corporation
+// 
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+// 
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//  
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//  
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+
+#ifdef MINIFE_HAVE_TBB
+
+#include <LockingMatrix.hpp>
+#include <LockingVector.hpp>
+#include <BoxIterator.hpp>
+#include <simple_mesh_description.hpp>
+#include <SparseMatrix_functions.hpp>
+#include <Hex8_box_utils.hpp>
+#include <Hex8_ElemData.hpp>
+#include <mytimer.hpp>
+
+namespace miniFE {
+
+//---------------------------------------------------------------------
+
+/** parallel_for functor: call i assembles element elemIDs[i] into *A and *b. */
+template<typename GlobalOrdinal,typename Scalar,
+         typename MatrixType, typename VectorType>
+struct FEAssembleSumInto {
+  const simple_mesh_description<GlobalOrdinal>* mesh;
+  GlobalOrdinal* elemIDs;
+  LockingMatrix<MatrixType>* A;
+  LockingVector<VectorType>* b;
+  inline void operator()(int i)
+  {
+    GlobalOrdinal id = elemIDs[i];
+    ElemData<GlobalOrdinal,Scalar> scratch; //local to this call
+    get_elem_nodes_and_coords(*mesh, id, scratch.elem_node_ids,
+                              scratch.elem_node_coords);
+    compute_element_matrix_and_vector(scratch);
+    sum_into_global_linear_system(scratch, *A, *b);
+  }
+};
+
+/** Assemble the FE operators for local_elem_box into A and b, handling one
+  * element per parallel_for iteration through LockingMatrix/LockingVector
+  * wrappers. Does nothing when A has no rows. */
+template<typename GlobalOrdinal,
+         typename MatrixType, typename VectorType>
+void
+perform_element_loop(const simple_mesh_description<GlobalOrdinal>& mesh,
+                     const Box& local_elem_box,
+                     MatrixType& A, VectorType& b,
+                     Parameters& params)
+{
+  typedef typename MatrixType::ScalarType Scalar;
+  typedef typename VectorType::ComputeNodeType ComputeNodeType;
+
+  if (A.rows.size() == 0) return;
+
+  int num_threads = params.numthreads;
+  timer_type t0 = mytimer();
+
+  //Gather the ID of every element in the local box, in BoxIterator order.
+  int nx_global = mesh.global_box[0][1];
+  int ny_global = mesh.global_box[1][1];
+  int nz_global = mesh.global_box[2][1];
+
+  GlobalOrdinal num_elems = get_num_ids<GlobalOrdinal>(local_elem_box);
+  std::vector<GlobalOrdinal> elemIDs(num_elems);
+
+  BoxIterator pos  = BoxIterator::begin(local_elem_box);
+  BoxIterator stop = BoxIterator::end(local_elem_box);
+  size_t slot = 0;
+  while (pos != stop) {
+    elemIDs[slot] = get_id<GlobalOrdinal>(nx_global, ny_global, nz_global,
+                                          pos.x, pos.y, pos.z);
+    ++pos;
+    ++slot;
+  }
+
+  LockingMatrix<MatrixType> lockingA(A);
+  LockingVector<VectorType> lockingb(b);
+
+  //Point the per-element functor at the mesh, the IDs and the wrappers.
+  FEAssembleSumInto<GlobalOrdinal,Scalar,MatrixType,VectorType> fe_op;
+  fe_op.mesh    = &mesh;
+  fe_op.elemIDs = &elemIDs[0];
+  fe_op.A       = &lockingA;
+  fe_op.b       = &lockingb;
+  ComputeNodeType& node = b.compute_node;
+  node.parallel_for(elemIDs.size(), fe_op);
+
+  std::cout << "\n{number of matrix conflicts: " << miniFE_num_matrix_conflicts << "}"<<std::endl;
+  std::cout << "{number of vector conflicts: " << miniFE_num_vector_conflicts << "}"<<std::endl;
+}
+
+}//namespace miniFE
+
+#else
+#error "ERROR, this file shouldn't be compiled if MINIFE_HAVE_TBB is not defined."
+#endif
+
+#endif
+
diff --git a/openmp-avx512/basic/perform_element_loop_TBB_pllfor2.hpp b/openmp-avx512/basic/perform_element_loop_TBB_pllfor2.hpp
new file mode 100644
index 0000000..7889787
--- /dev/null
+++ b/openmp-avx512/basic/perform_element_loop_TBB_pllfor2.hpp
@@ -0,0 +1,162 @@
+#ifndef _perform_element_loop_TBB_pllfor2_hpp_
+#define _perform_element_loop_TBB_pllfor2_hpp_
+
+//@HEADER
+// ************************************************************************
+// 
+//               miniFE: simple finite-element assembly and linear-solve
+//                 Copyright (2006) Sandia Corporation
+// 
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+// 
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//  
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//  
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+
+#ifdef MINIFE_HAVE_TBB
+
+#ifdef MINIFE_HAVE_CUDA
+#include <CudaNode.hpp>
+#endif
+
+#include <LockingMatrix.hpp>
+#include <LockingVector.hpp>
+#include <ElemData.hpp>
+#include <BoxIterator.hpp>
+#include <simple_mesh_description.hpp>
+#include <SparseMatrix_functions.hpp>
+#include <Hex8_box_utils.hpp>
+#include <GetNodesCoords.hpp>
+#include <FEComputeElem.hpp>
+#include <SumInLinSys.hpp>
+#include <mytimer.hpp>
+
+namespace miniFE {
+
+//---------------------------------------------------------------------
+
+/** Assemble the FE operators for local_elem_box into A and b in three
+  * parallel_for passes over the elements: GetNodesCoords, FEComputeElem
+  * (on the CUDA node when MINIFE_HAVE_CUDA is defined), then SumInLinSys.
+  * Prints the time of each phase. Does nothing when A has no rows. */
+template<typename GlobalOrdinal,
+         typename MatrixType, typename VectorType>
+void
+perform_element_loop(const simple_mesh_description<GlobalOrdinal>& mesh,
+                     const Box& local_elem_box,
+                     MatrixType& A, VectorType& b,
+                     Parameters& params)
+{
+  typedef typename MatrixType::ScalarType Scalar;
+
+  if (A.rows.size() == 0) return;
+
+  int num_threads = params.numthreads;
+  timer_type t0 = mytimer();
+
+  //We will iterate the local-element-box (local portion of the mesh), and
+  //assemble the FE operators into the global sparse linear-system.
+  int global_elems_x = mesh.global_box[0][1];
+  int global_elems_y = mesh.global_box[1][1];
+  int global_elems_z = mesh.global_box[2][1];
+
+  GlobalOrdinal num_elems = get_num_ids<GlobalOrdinal>(local_elem_box);
+  std::vector<GlobalOrdinal> elemIDs(num_elems);
+
+  BoxIterator iter = BoxIterator::begin(local_elem_box);
+  BoxIterator end  = BoxIterator::end(local_elem_box);
+
+  for(size_t i=0; iter != end; ++iter, ++i) {
+    elemIDs[i] = get_id<GlobalOrdinal>(global_elems_x, global_elems_y, global_elems_z,
+                                       iter.x, iter.y, iter.z);
+  }
+  //Flat per-element arrays: node ids, node coords, element matrices, element vectors.
+  std::vector<GlobalOrdinal> node_ordinals(num_elems*Hex8::numNodesPerElem);
+  std::vector<Scalar> node_coords(num_elems*Hex8::numNodesPerElem*Hex8::spatialDim);
+  std::vector<Scalar> elem_matrices(num_elems*Hex8::numNodesPerElem*Hex8::numNodesPerElem);
+  std::vector<Scalar> elem_vectors(num_elems*Hex8::numNodesPerElem);
+
+  LockingMatrix<MatrixType> lockingA(A);
+  LockingVector<VectorType> lockingb(b);
+  //Phase 1: run GetNodesCoords over all elements, pointed at node_ordinals/node_coords.
+  GetNodesCoords<GlobalOrdinal,Scalar> get_nodes_coords;
+  get_nodes_coords.elemIDs = &elemIDs[0];
+  get_nodes_coords.mesh = &mesh;
+  get_nodes_coords.node_ordinals = &node_ordinals[0];
+  get_nodes_coords.elem_node_coords = &node_coords[0];
+  
+  typedef typename VectorType::ComputeNodeType ComputeNodeType;
+  ComputeNodeType& compute_node = b.compute_node;
+  compute_node.parallel_for(elemIDs.size(), get_nodes_coords);
+
+  timer_type t_gn = mytimer() - t0;
+  t0 = mytimer();
+  //Phase 2 runs on the CUDA node when MINIFE_HAVE_CUDA is defined, else on compute_node.
+#ifdef MINIFE_HAVE_CUDA
+  CUDANode& elem_compute_node = CUDANode::singleton();
+#else
+  ComputeNodeType& elem_compute_node = compute_node;
+#endif
+  timer_type t_ccn = mytimer() - t0;
+  t0 = mytimer();
+  //Get compute-node buffers for the three element arrays, then copy the coords in.
+  Scalar* d_node_coords = elem_compute_node.get_buffer(&node_coords[0], node_coords.size());
+  Scalar* d_elem_matrices = elem_compute_node.get_buffer(&elem_matrices[0], elem_matrices.size());
+  Scalar* d_elem_vectors  = elem_compute_node.get_buffer(&elem_vectors[0], elem_vectors.size());
+
+  elem_compute_node.copy_to_buffer(&node_coords[0], node_coords.size(), d_node_coords);
+
+  FEComputeElem<GlobalOrdinal,Scalar> fe_compute_elem;
+  fe_compute_elem.elem_node_coords = &d_node_coords[0];
+  fe_compute_elem.elem_diffusion_matrix = &d_elem_matrices[0];
+  fe_compute_elem.elem_source_vector = &d_elem_vectors[0];
+
+  elem_compute_node.parallel_for(elemIDs.size(), fe_compute_elem);
+
+  elem_compute_node.copy_from_buffer(&elem_matrices[0], elem_matrices.size(), d_elem_matrices);
+  elem_compute_node.copy_from_buffer(&elem_vectors[0], elem_vectors.size(), d_elem_vectors);
+
+ timer_type t_ce = mytimer() - t0;
+  //Phase 3: SumInLinSys over all elements, writing through the locking wrappers.
+  t0 = mytimer();
+  SumInLinSys<GlobalOrdinal,Scalar,MatrixType,VectorType> sum_in;
+  sum_in.node_ordinals = &node_ordinals[0];
+  sum_in.elem_diffusion_matrix = &elem_matrices[0];
+  sum_in.elem_source_vector = &elem_vectors[0];
+  sum_in.A = &lockingA;
+  sum_in.b = &lockingb;
+
+  compute_node.parallel_for(elemIDs.size(), sum_in);
+
+  timer_type t_si = mytimer() - t0;
+  std::cout << "time to get nodes/coords: " << t_gn << std::endl;
+  std::cout << "time to create compute-node: " << t_ccn << ", time to compute elements: " << t_ce << std::endl;
+  std::cout << "time to sum into linsys: " << t_si << std::endl;
+  std::cout << "\n{number of matrix conflicts: " << miniFE_num_matrix_conflicts << "}"<<std::endl;
+  std::cout << "{number of vector conflicts: " << miniFE_num_vector_conflicts << "}"<<std::endl;
+}
+
+}//namespace miniFE
+
+#else
+#error "ERROR, this file shouldn't be compiled if MINIFE_HAVE_TBB is not defined."
+#endif
+
+#endif
+
diff --git a/openmp-avx512/basic/run_one_test b/openmp-avx512/basic/run_one_test
new file mode 100755
index 0000000..de4b188
--- /dev/null
+++ b/openmp-avx512/basic/run_one_test
@@ -0,0 +1,60 @@
+#!/bin/bash
+
+if [ $# != 4 ]; then
+echo "usage: run_one_test <np> <nx> <ny> <nz>"
+exit 1
+fi
+
+np=$1
+nx=$2
+ny=$3
+nz=$4
+
+echo " "
+echo "running miniFE test for np=${np}, nx=${nx} ny=${ny} nz=${nz}..."
+
+if [ ! -x miniFE.x ]; then
+echo "miniFE.x doesn't exist or isn't executable. Aborting."
+exit -1
+fi
+
+mpirun -np ${np} miniFE.x nx=${nx} ny=${ny} nz=${nz} >& miniFE_run.out
+rm miniFE_run.out
+
+if [ ! -f A.mtx.${np}.0 ]; then
+echo "matrix file A.mtx.${np}.0 doesn't exist. build miniFE with -DMINIFE_DEBUG."
+fi
+
+p=0
+while [ $p -lt ${np} ]; do
+diff A.mtx.${np}.$p gold_files/1x1x2_A.mtx.${np}.$p >& diff.A.$p.txt
+diff b.vec.${np}.$p gold_files/1x1x2_b.vec.${np}.$p >& diff.b.$p.txt
+diff x.vec.${np}.$p gold_files/1x1x2_x.vec.${np}.$p >& diff.x.$p.txt
+
+test_result="passed"
+if [ -s diff.A.$p.txt ]; then
+echo "TEST FAILED: see diff.A.${p}.txt"
+test_result="failed"
+fi
+
+if [ -s diff.b.$p.txt ]; then
+echo "TEST FAILED: see diff.b.${p}.txt"
+test_result="failed"
+fi
+
+if [ -s diff.x.$p.txt ]; then
+echo "TEST FAILED: see diff.x.${p}.txt"
+test_result="failed"
+fi
+
+if [ $test_result != "passed" ]; then
+echo "test failed"
+exit 1
+fi
+
+let p=p+1
+rm diff.*.txt
+done
+
+echo "tests passed"
+
diff --git a/openmp-avx512/basic/run_tests b/openmp-avx512/basic/run_tests
new file mode 100755
index 0000000..5e03399
--- /dev/null
+++ b/openmp-avx512/basic/run_tests
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+echo " "
+echo "running miniFE tests..."
+
+if [ ! -x miniFE.x ]; then
+echo "miniFE.x doesn't exist or isn't executable. Aborting."
+exit -1
+fi
+
+./run_one_test 1 1 1 2
+if [ $? != 0 ]; then
+echo "test failed"
+exit $?
+fi
+
+./run_one_test 2 1 1 2
+if [ $? != 0 ]; then
+echo "test failed"
+exit $?
+fi
+
diff --git a/openmp-avx512/basic/sharedmem.cuh b/openmp-avx512/basic/sharedmem.cuh
new file mode 100644
index 0000000..b13c4f2
--- /dev/null
+++ b/openmp-avx512/basic/sharedmem.cuh
@@ -0,0 +1,153 @@
+/*
+* Copyright 1993-2006 NVIDIA Corporation.  All rights reserved.
+*
+* NOTICE TO USER:
+*
+* This source code is subject to NVIDIA ownership rights under U.S. and
+* international Copyright laws.
+*
+* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
+* CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
+* IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
+* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
+* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
+* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
+* OR PERFORMANCE OF THIS SOURCE CODE.
+*
+* U.S. Government End Users.  This source code is a "commercial item" as
+* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting  of
+* "commercial computer software" and "commercial computer software
+* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
+* and is provided to the U.S. Government only as a commercial end item.
+* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
+* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
+* source code with only those rights set forth herein.
+*/
+
+#ifndef _SHAREDMEM_H_
+#define _SHAREDMEM_H_
+
+//****************************************************************************
+// Because dynamically sized shared memory arrays are declared "extern",
+// we can't templatize them directly.  To get around this, we declare a 
+// simple wrapper struct that will declare the extern array with a different 
+// name depending on the type.  This avoids compiler errors about duplicate
+// definitions.
+// 
+// To use dynamically allocated shared memory in a templatized __global__ or 
+// __device__ function, just replace code like this:
+//
+//
+//  template<class T>
+//  __global__ void
+//  foo( T* g_idata, T* g_odata) 
+//  {
+//      // Shared mem size is determined by the host app at run time
+//      extern __shared__  T sdata[];
+//      ...
+//      doStuff(sdata);
+//      ...
+//   }
+//  
+//   With this
+//  template<class T>
+//  __global__ void
+//  foo( T* g_idata, T* g_odata) 
+//  {
+//      // Shared mem size is determined by the host app at run time
+//      SharedMemory<T> smem;
+//      T* sdata = smem.getPointer();
+//      ...
+//      doStuff(sdata);
+//      ...
+//   }
+//****************************************************************************
+
+// Primary (un-specialized) template.  getPointer() calls error(), which is only
+// declared here, so using it for a type with no specialization is meant to fail to build.
+template <typename T>
+struct SharedMemory
+{
+    // Ensure that we won't compile any un-specialized types
+    __device__ T* getPointer() {
+        extern __device__ void error(void);
+        error();
+        return NULL;
+    }
+};
+
+// Explicit specializations for: int, unsigned int, char, unsigned char, short,
+// unsigned short, long, unsigned long, bool, float, and double.  Each returns its
+// own extern __shared__ array.  One could also specialize it for user-defined types.
+
+template <>
+struct SharedMemory <int>
+{
+    __device__ int* getPointer() { extern __shared__ int s_int[]; return s_int; }    
+};
+
+template <>
+struct SharedMemory <unsigned int>
+{
+    __device__ unsigned int* getPointer() { extern __shared__ unsigned int s_uint[]; return s_uint; }    
+};
+
+template <>
+struct SharedMemory <char>
+{
+    __device__ char* getPointer() { extern __shared__ char s_char[]; return s_char; }    
+};
+
+template <>
+struct SharedMemory <unsigned char>
+{
+    __device__ unsigned char* getPointer() { extern __shared__ unsigned char s_uchar[]; return s_uchar; }    
+};
+
+template <>
+struct SharedMemory <short>
+{
+    __device__ short* getPointer() { extern __shared__ short s_short[]; return s_short; }    
+};
+
+template <>
+struct SharedMemory <unsigned short>
+{
+    __device__ unsigned short* getPointer() { extern __shared__ unsigned short s_ushort[]; return s_ushort; }    
+};
+
+template <>
+struct SharedMemory <long>
+{
+    __device__ long* getPointer() { extern __shared__ long s_long[]; return s_long; }    
+};
+
+template <>
+struct SharedMemory <unsigned long>
+{
+    __device__ unsigned long* getPointer() { extern __shared__ unsigned long s_ulong[]; return s_ulong; }    
+};
+
+template <>
+struct SharedMemory <bool>
+{
+    __device__ bool* getPointer() { extern __shared__ bool s_bool[]; return s_bool; }    
+};
+
+template <>
+struct SharedMemory <float>
+{
+    __device__ float* getPointer() { extern __shared__ float s_float[]; return s_float; }    
+};
+
+template <>
+struct SharedMemory <double>
+{
+    __device__ double* getPointer() { extern __shared__ double s_double[]; return s_double; }    
+};
+
+
+#endif //_SHAREDMEM_H_
diff --git a/openmp-avx512/basic/simple_mesh_description.hpp b/openmp-avx512/basic/simple_mesh_description.hpp
new file mode 100644
index 0000000..717dc6c
--- /dev/null
+++ b/openmp-avx512/basic/simple_mesh_description.hpp
@@ -0,0 +1,239 @@
+
+#ifndef _simple_mesh_description_hpp_
+#define _simple_mesh_description_hpp_
+
+//@HEADER
+// ************************************************************************
+// 
+//               miniFE: simple finite-element assembly and linear-solve
+//                 Copyright (2006) Sandia Corporation
+// 
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+// 
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//  
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//  
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+
+#include <utils.hpp>
+#include <set>
+#include <map>
+
+namespace miniFE {
+
+// Mesh description for one local element box inside a global element box.
+// Construction builds the node-id -> row map (map_ids_to_rows) and collects the
+// rows carrying boundary conditions: bc_rows_0 (value 0) and bc_rows_1 (value 1).
+template<typename GlobalOrdinal>
+class simple_mesh_description {
+public:
+  // global_box_in / local_box_in hold, per dimension i, the range [i][0]..[i][1].
+  simple_mesh_description(const Box& global_box_in, const Box& local_box_in)
+  {
+    Box local_node_box;
+    for(int i=0; i<3; ++i) {
+      global_box[i][0] = global_box_in[i][0];
+      global_box[i][1] = global_box_in[i][1];
+      local_box[i][0] = local_box_in[i][0];
+      local_box[i][1] = local_box_in[i][1];
+      local_node_box[i][0] = local_box_in[i][0];
+      local_node_box[i][1] = local_box_in[i][1];
+      //num-owned-nodes == num-elems+1 in this dimension if the elem box is not empty
+      //and we are at the high end of the global range in that dimension:
+      if (local_box_in[i][1] > local_box_in[i][0] && local_box_in[i][1] == global_box[i][1]) local_node_box[i][1] += 1;
+    }
+
+    int max_node_x = global_box[0][1]+1;
+    int max_node_y = global_box[1][1]+1;
+    int max_node_z = global_box[2][1]+1;
+    create_map_id_to_row(max_node_x, max_node_y, max_node_z, local_node_box, map_ids_to_rows);
+
+    //As described in analytic_soln.hpp,
+    //we will impose a 0 boundary-condition on faces x=0, y=0, z=0, y=1, z=1
+    //we will impose a 1 boundary-condition on face x=1
+
+#ifdef MINIFE_DEBUG
+std::cout<<std::endl;
+#endif
+    const int X=0, Y=1, Z=2;
+    const int x1 = max_node_x - 1;
+    const int y1 = max_node_y - 1;
+    const int z1 = max_node_z - 1;
+
+    //if we're on the x=0 face:
+    if (global_box[X][0] == local_box[X][0]) {
+      int miny = local_node_box[Y][0];
+      int minz = local_node_box[Z][0];
+      int maxy = local_node_box[Y][1];
+      int maxz = local_node_box[Z][1];
+      //expand y and z dimensions to include ghost layer
+      if (local_node_box[Y][0] > 0) --miny;
+      if (local_node_box[Z][0] > 0) --minz;
+      if (local_node_box[Y][1] < max_node_y) ++maxy;
+      if (local_node_box[Z][1] < max_node_z) ++maxz;
+
+      for(int iz=minz; iz<maxz; ++iz) {
+        for(int iy=miny; iy<maxy; ++iy) {
+          GlobalOrdinal nodeID = get_id<GlobalOrdinal>(max_node_x, max_node_y, max_node_z,
+             0, iy, iz);
+#ifdef MINIFE_DEBUG
+std::cout<<"x=0 BC, node "<<nodeID<<", (0,"<<iy<<","<<iz<<")"<<std::endl;
+#endif
+          bc_rows_0.insert(map_id_to_row(nodeID));
+        }
+      }
+    }
+
+    //if we're on the y=0 face:
+    if (global_box[Y][0] == local_box[Y][0]) {
+      int minx = local_node_box[X][0];
+      int minz = local_node_box[Z][0];
+      int maxx = local_node_box[X][1];
+      int maxz = local_node_box[Z][1];
+      //expand x and z dimensions to include ghost layer
+      if (local_node_box[X][0] > 0) --minx;
+      if (local_node_box[Z][0] > 0) --minz;
+      if (local_node_box[X][1] < max_node_x) ++maxx;
+      if (local_node_box[Z][1] < max_node_z) ++maxz;
+
+      for(int iz=minz; iz<maxz; ++iz) {
+        for(int ix=minx; ix<maxx; ++ix) {
+          GlobalOrdinal nodeID = get_id<GlobalOrdinal>(max_node_x, max_node_y, max_node_z,
+             ix, 0, iz);
+#ifdef MINIFE_DEBUG
+std::cout<<"y=0 BC, node "<<nodeID<<", ("<<ix<<",0,"<<iz<<")"<<std::endl;
+#endif
+          bc_rows_0.insert(map_id_to_row(nodeID));
+        }
+      }
+    }
+
+    //if we're on the z=0 face:
+    if (global_box[Z][0] == local_box[Z][0]) {
+      int minx = local_node_box[X][0];
+      int miny = local_node_box[Y][0];
+      int maxx = local_node_box[X][1];
+      int maxy = local_node_box[Y][1];
+      //expand x and y dimensions to include ghost layer
+      if (local_node_box[X][0] > 0) --minx;
+      if (local_node_box[Y][0] > 0) --miny;
+      if (local_node_box[X][1] < max_node_x) ++maxx;
+      if (local_node_box[Y][1] < max_node_y) ++maxy;
+
+      for(int iy=miny; iy<maxy; ++iy) {
+        for(int ix=minx; ix<maxx; ++ix) {
+          GlobalOrdinal nodeID = get_id<GlobalOrdinal>(max_node_x, max_node_y, max_node_z,
+             ix, iy, 0);
+#ifdef MINIFE_DEBUG
+std::cout<<"z=0 BC, node "<<nodeID<<", ("<<ix<<","<<iy<<",0)"<<std::endl;
+#endif
+          bc_rows_0.insert(map_id_to_row(nodeID));
+        }
+      }
+    }
+
+    //if we're on the x=1 face:
+    if (global_box[X][1] == local_box[X][1]) {
+      int minz = local_node_box[Z][0];
+      int miny = local_node_box[Y][0];
+      int maxz = local_node_box[Z][1];
+      int maxy = local_node_box[Y][1];
+      //expand z and y dimensions to include ghost layer
+      if (local_node_box[Z][0] > 0) --minz;
+      if (local_node_box[Y][0] > 0) --miny;
+      if (local_node_box[Z][1] < max_node_z) ++maxz;
+      if (local_node_box[Y][1] < max_node_y) ++maxy;
+
+      for(int iy=miny; iy<maxy; ++iy) {
+        for(int iz=minz; iz<maxz; ++iz) {
+          GlobalOrdinal nodeID = get_id<GlobalOrdinal>(max_node_x, max_node_y, max_node_z,
+             x1, iy, iz);
+          GlobalOrdinal row = map_id_to_row(nodeID); //keep the full GlobalOrdinal width (was int)
+#ifdef MINIFE_DEBUG
+std::cout<<"x=1 BC, node "<<nodeID<<", row "<<row<<", ("<<x1<<","<<iy<<","<<iz<<")"<<std::endl;
+#endif
+          bc_rows_1.insert(row);
+        }
+      }
+    }
+
+    //if we're on the y=1 face:
+    if (global_box[Y][1] == local_box[Y][1]) {
+      int minz = local_node_box[Z][0];
+      int minx = local_node_box[X][0];
+      int maxz = local_node_box[Z][1];
+      int maxx = local_node_box[X][1];
+      //expand z and x dimensions to include ghost layer
+      if (local_node_box[Z][0] > 0) --minz;
+      if (local_node_box[X][0] > 0) --minx;
+      if (local_node_box[Z][1] < max_node_z) ++maxz;
+      if (local_node_box[X][1] < max_node_x) ++maxx;
+
+      for(int ix=minx; ix<maxx; ++ix) {
+        for(int iz=minz; iz<maxz; ++iz) {
+          GlobalOrdinal nodeID = get_id<GlobalOrdinal>(max_node_x, max_node_y, max_node_z,
+             ix, y1, iz);
+#ifdef MINIFE_DEBUG
+std::cout<<"y=1 BC, node "<<nodeID<<", ("<<ix<<","<<y1<<","<<iz<<")"<<std::endl;
+#endif
+          bc_rows_0.insert(map_id_to_row(nodeID));
+        }
+      }
+    }
+
+    //if we're on the z=1 face:
+    if (global_box[Z][1] == local_box[Z][1]) {
+      int miny = local_node_box[Y][0];
+      int minx = local_node_box[X][0];
+      int maxy = local_node_box[Y][1];
+      int maxx = local_node_box[X][1];
+      //expand x and y dimensions to include ghost layer
+      if (local_node_box[Y][0] > 0) --miny;
+      if (local_node_box[X][0] > 0) --minx;
+      if (local_node_box[Y][1] < max_node_y) ++maxy;
+      if (local_node_box[X][1] < max_node_x) ++maxx;
+
+      for(int ix=minx; ix<maxx; ++ix) {
+        for(int iy=miny; iy<maxy; ++iy) {
+          GlobalOrdinal nodeID = get_id<GlobalOrdinal>(max_node_x, max_node_y, max_node_z,
+             ix, iy, z1);
+#ifdef MINIFE_DEBUG
+std::cout<<"z=1 BC, node "<<nodeID<<", ("<<ix<<","<<iy<<","<<z1<<")"<<std::endl;
+#endif
+          bc_rows_0.insert(map_id_to_row(nodeID));
+        }
+      }
+    }
+
+  }
+  // Returns whatever find_row_for_id yields for 'id' when looked up in map_ids_to_rows.
+  GlobalOrdinal map_id_to_row(const GlobalOrdinal& id) const
+  {
+    return find_row_for_id(id, map_ids_to_rows);
+  }
+
+  std::set<GlobalOrdinal> bc_rows_0; // rows on the faces with the 0 boundary condition
+  std::set<GlobalOrdinal> bc_rows_1; // rows on the x=1 face (1 boundary condition)
+  std::map<GlobalOrdinal,GlobalOrdinal> map_ids_to_rows; // filled by create_map_id_to_row
+  Box global_box; // copy of global_box_in
+  Box local_box;  // copy of local_box_in
+};//class simple_mesh_description
+
+}//namespace miniFE
+
+#endif
diff --git a/openmp-avx512/basic/time_kernels.hpp b/openmp-avx512/basic/time_kernels.hpp
new file mode 100644
index 0000000..b14f743
--- /dev/null
+++ b/openmp-avx512/basic/time_kernels.hpp
@@ -0,0 +1,140 @@
+#ifndef _time_kernels_hpp_
+#define _time_kernels_hpp_
+
+//@HEADER
+// ************************************************************************
+// 
+//               miniFE: simple finite-element assembly and linear-solve
+//                 Copyright (2006) Sandia Corporation
+// 
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+// 
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//  
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//  
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+
+#include <cmath>
+
+#include <Vector_functions.hpp>
+#include <mytimer.hpp>
+
+#ifdef MINIFE_HAVE_CUDA
+#include <cuda.h>
+#endif
+
+namespace miniFE {
+
+// Runs max_iter repetitions each of waxpby, matvec and dot on A/b/x and stores the accumulated
+// WAXPY, MATVEC and DOT times in my_kern_times (TOTAL is set to 0); xdotp receives the summed dots.
+template<typename OperatorType, typename VectorType, typename Matvec>
+void
+time_kernels(OperatorType& A,
+             const VectorType& b,
+             VectorType& x,
+             Matvec matvec,
+             typename OperatorType::LocalOrdinalType max_iter,
+             typename OperatorType::ScalarType& xdotp,
+             timer_type* my_kern_times)
+{
+  typedef typename OperatorType::ScalarType ScalarType;
+  typedef typename OperatorType::LocalOrdinalType OrdinalType;
+  typedef typename TypeTraits<ScalarType>::magnitude_type magnitude_type;
+
+  timer_type t0 = 0, tWAXPY = 0, tDOT = 0, tMATVEC = 0;
+  //t0 is not referenced by name below; presumably the TICK/TOCK macros use it -- confirm before removing.
+  int myproc = 0;
+#ifdef HAVE_MPI
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+#endif
+  //Bail out early -- leaving xdotp and my_kern_times untouched -- when A has no local indices.
+  if (!A.has_local_indices) {
+    std::cerr << "miniFE::time_kernels ERROR, A.has_local_indices is false, needs to be true. This probably means "
+       << "miniFE::make_local_matrix(A) was not called prior to calling miniFE::time_kernels."
+       << std::endl;
+    return;
+  }
+
+  OrdinalType nrows = A.rows.size();
+  OrdinalType ncols = A.num_cols;
+
+  VectorType p(0, ncols, b.compute_node);
+
+  ScalarType one = 1.0;
+  ScalarType zero = 0.0;
+
+  typedef typename VectorType::ComputeNodeType ComputeNodeType;
+  ComputeNodeType& compute_node = x.compute_node;
+
+  //The following lines that create and initialize buffers are no-ops in many
+  //cases, but perform actual allocations and copies if a off-cpu device such as
+  //a GPU is being used by compute_node.
+
+  //Do any required allocations for buffers that will be needed during CG:
+  ScalarType* d_x = compute_node.get_buffer(&x.coefs[0], x.coefs.size());
+  ScalarType* d_p = compute_node.get_buffer(&p.coefs[0], p.coefs.size());
+  ScalarType* d_b = compute_node.get_buffer(&b.coefs[0], b.coefs.size());
+  OrdinalType* d_Arowoff = compute_node.get_buffer(&A.row_offsets[0], A.row_offsets.size());
+  OrdinalType* d_Acols   = compute_node.get_buffer(&A.packed_cols[0], A.packed_cols.size());
+  ScalarType* d_Acoefs  = compute_node.get_buffer(&A.packed_coefs[0], A.packed_coefs.size());
+
+  //Copy data to buffers that need to be initialized from input data:
+  compute_node.copy_to_buffer(&x.coefs[0], x.coefs.size(), d_x);
+  compute_node.copy_to_buffer(&b.coefs[0], b.coefs.size(), d_b);
+  compute_node.copy_to_buffer(&A.row_offsets[0], A.row_offsets.size(), d_Arowoff);
+  compute_node.copy_to_buffer(&A.packed_cols[0], A.packed_cols.size(), d_Acols);
+  compute_node.copy_to_buffer(&A.packed_coefs[0], A.packed_coefs.size(), d_Acoefs);
+
+  TICK();
+  for(OrdinalType i=0; i<max_iter; ++i) {
+    waxpby(one, x, zero, x, p);
+  }
+#ifdef MINIFE_HAVE_CUDA
+  cudaDeviceSynchronize(); //replaces the deprecated cudaThreadSynchronize()
+#endif
+  TOCK(tWAXPY);
+
+  TICK();
+  for(OrdinalType i=0; i<max_iter; ++i) {
+    matvec(A, p, x);
+  }
+#ifdef MINIFE_HAVE_CUDA
+  cudaDeviceSynchronize();
+#endif
+  TOCK(tMATVEC);
+
+  TICK();
+  xdotp = 0;
+  for(OrdinalType i=0; i<max_iter; ++i) {
+    xdotp += dot(x, p);
+  }
+#ifdef MINIFE_HAVE_CUDA
+  cudaDeviceSynchronize();
+#endif
+  TOCK(tDOT);
+
+  my_kern_times[WAXPY] = tWAXPY;
+  my_kern_times[DOT] = tDOT;
+  my_kern_times[MATVEC] = tMATVEC;
+  my_kern_times[TOTAL] = 0;
+}
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/basic/utest.cpp b/openmp-avx512/basic/utest.cpp
new file mode 100644
index 0000000..623c72a
--- /dev/null
+++ b/openmp-avx512/basic/utest.cpp
@@ -0,0 +1,68 @@
+
+//@HEADER
+// ************************************************************************
+// 
+//               miniFE: simple finite-element assembly and linear-solve
+//                 Copyright (2006) Sandia Corporation
+// 
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+// 
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//  
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//  
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+#include <iostream>
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+#include <utest_case.hpp>
+#include <utest_cases.hpp>
+
+int main(int argc, char** argv) {
+  //Runs every registered utest_case, prints pass/FAIL per case, and exits non-zero if any case failed.
+#ifdef HAVE_MPI
+  MPI_Init(&argc, &argv);
+#endif
+
+  //utest_case.hpp declares the 'get_utest_cases' function.
+
+  std::vector<utest_case*>& utest_cases = get_utest_cases();
+  bool tests_passed = true;
+
+  for(size_t i=0; i<utest_cases.size(); ++i) {
+    bool passed = utest_cases[i]->run();
+    if (passed) std::cout << "   pass: " << utest_cases[i]->name() << std::endl;
+    else {
+      std::cout << "!!!FAIL: " << utest_cases[i]->name() << std::endl;
+      tests_passed = false;
+    }
+  }
+
+  if (!tests_passed) {
+    std::cout << "at least 1 test failed."<<std::endl;
+  }
+
+#ifdef HAVE_MPI
+  MPI_Finalize();
+#endif
+
+  return tests_passed ? 0 : 1; //report failures through the exit status as well
+}
+
diff --git a/openmp-avx512/basic/utest_case.hpp b/openmp-avx512/basic/utest_case.hpp
new file mode 100644
index 0000000..d6dbf3d
--- /dev/null
+++ b/openmp-avx512/basic/utest_case.hpp
@@ -0,0 +1,55 @@
+#ifndef _utest_case_hpp_
+#define _utest_case_hpp_
+
+#include <vector>
+
+class utest_case;
+//Returns the single registry of test cases (a function-local static); 'inline' lets several translation units include this header.
+inline std::vector<utest_case*>& get_utest_cases()
+{
+  static std::vector<utest_case*> utest_cases;
+  return utest_cases;
+}
+
+//When a class that inherits the utest_case class is constructed,
+//it gets added to the vector of utest_cases returned by
+//the 'get_utest_cases' function.
+class utest_case {
+public:
+  utest_case(){ get_utest_cases().push_back(this); }
+  virtual ~utest_case(){}              //virtual: this class is a polymorphic base
+  virtual const char* name() = 0;      //label printed by the test driver
+  virtual bool run() = 0;              //true when the test passed
+};
+
+//The following macro declares a class TESTNAME_utest that inherits the
+//utest_case interface, defines one instance of it named
+//instance_TESTNAME_utest, and opens the definition of its run() method.
+//name() returns the TESTNAME text.
+//
+//use the macro like this:
+//   UTEST_CASE(mytest)
+//   {
+//      ... test code here; return true on success, false on failure ...
+//   }
+//See example usages in utest_cases.hpp
+#define UTEST_CASE(TESTNAME) \
+  class TESTNAME##_utest : public utest_case { \
+  public: \
+    TESTNAME##_utest(){} \
+    const char* name() {return #TESTNAME;} \
+    bool run(); \
+  }; \
+  \
+  TESTNAME##_utest instance_##TESTNAME##_utest; \
+  \
+  bool TESTNAME##_utest::run()
+
+#define TEST_EQUAL(A,B) \
+  if (!((A) != (B))) {} else return false;
+//Both helpers return false from the enclosing function on failure; the if/else form cannot capture a following 'else', and tol is parenthesized.
+#define TEST_EQUAL_TOL(A,B,tol) \
+  if (!(std::abs((A) - (B)) > (tol))) {} else return false;
+
+#endif
+
diff --git a/openmp-avx512/basic/utest_cases.hpp b/openmp-avx512/basic/utest_cases.hpp
new file mode 100644
index 0000000..d15ef9d
--- /dev/null
+++ b/openmp-avx512/basic/utest_cases.hpp
@@ -0,0 +1,1232 @@
+#ifndef _utest_cases_hpp_
+#define _utest_cases_hpp_
+
+//@HEADER
+// ************************************************************************
+// 
+//               miniFE: simple finite-element assembly and linear-solve
+//                 Copyright (2006) Sandia Corporation
+// 
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+// 
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//  
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//  
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+
+#include <iostream>
+#include <cmath>
+
+#include <BoxPartition.hpp>
+#include <box_utils.hpp>
+#include <simple_mesh_description.hpp>
+#include <generate_matrix_structure.hpp>
+#include <Hex8.hpp>
+#include <Hex8_box_utils.hpp>
+#include <assemble_FE_data.hpp>
+#include <Parameters.hpp>
+#include <make_local_matrix.hpp>
+#include <exchange_externals.hpp>
+#include <Vector_functions.hpp>
+#include <BoxIterator.hpp>
+#include <mytimer.hpp>
+
+#include <SerialComputeNode.hpp>
+
+#ifdef MINIFE_HAVE_TPI
+#include <TPI.h>
+#include <TPINode.hpp>
+#endif
+
+#ifdef MINIFE_HAVE_TBB
+#include <tbb/task_scheduler_init.h>
+#include <TBBNode.hpp>
+#endif
+
+#ifdef MINIFE_HAVE_CUDA
+#include <CudaNode.hpp>
+#endif
+
+#include <utest_case.hpp>
+
+typedef MINIFE_SCALAR Scalar;
+typedef MINIFE_LOCAL_ORDINAL LocalOrdinal;
+typedef MINIFE_GLOBAL_ORDINAL GlobalOrdinal;
+
+//Compares miniFE::get_id<T>(nx,ny,nz,x,y,z) with 'expected': returns 0 on a match, otherwise prints a message and returns -1.
+template<typename T>
+inline int check_get_id(int nx, int ny, int nz, int x, int y, int z, T expected, const char* testname)
+{
+  const T actual = miniFE::get_id<T>(nx,ny,nz,x,y,z);
+  if (!(actual != expected)) {
+    return 0;
+  }
+  std::cout << testname << " failed. val=" << actual<<", expected " << expected << std::endl;
+  return -1;
+}
+
+//Partitions the [0,2000]^3 global box among 4 procs (four identical calls) and
+//checks that every local box holds as many ids as box 0 and that the count for
+//boxes 1..3 of the first partition lies within [0, 2000000000].
+UTEST_CASE(box_partition)
+{
+  int global_box[3][2] = { { 0, 2000 }, { 0, 2000}, { 0, 2000} };
+  const int numprocs = 4;
+
+  //Automatic arrays instead of std::malloc/std::free: the early 'return false'
+  //exits below would otherwise leak the four buffers.
+  int local_boxes0[numprocs][3][2];
+  int local_boxes1[numprocs][3][2];
+  int local_boxes2[numprocs][3][2];
+  int local_boxes3[numprocs][3][2];
+
+  box_partition(0, numprocs, 2, global_box, local_boxes0);
+  box_partition(0, numprocs, 2, global_box, local_boxes1);
+  box_partition(0, numprocs, 2, global_box, local_boxes2);
+  box_partition(0, numprocs, 2, global_box, local_boxes3);
+
+  for(int i=1; i<numprocs; ++i) {
+    if (miniFE::get_num_ids<int>(local_boxes0[i]) !=
+        miniFE::get_num_ids<int>(local_boxes0[0])) {
+      return false;
+    }
+    if (miniFE::get_num_ids<int>(local_boxes1[i]) !=
+        miniFE::get_num_ids<int>(local_boxes1[0])) {
+      return false;
+    }
+    if (miniFE::get_num_ids<int>(local_boxes2[i]) !=
+        miniFE::get_num_ids<int>(local_boxes2[0])) {
+      return false;
+    }
+    if (miniFE::get_num_ids<int>(local_boxes3[i]) !=
+        miniFE::get_num_ids<int>(local_boxes3[0])) {
+      return false;
+    }
+
+    if (miniFE::get_num_ids<int>(local_boxes0[i]) < 0 ||
+        miniFE::get_num_ids<int>(local_boxes0[i]) > 2000000000) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+//Single-element mesh (2 nodes per dimension): the matrix must have one row per
+//node and row_offsets[number of nodes] must equal 64.
+UTEST_CASE(generate_matrix_structure1)
+{
+  int global_box[3][2] = {{ 0, 1 }, { 0, 1 }, { 0, 1 } };
+  int box[3][2] = {{ 0, 1 }, { 0, 1 }, { 0, 1 } };
+
+  miniFE::simple_mesh_description<int> mesh(global_box, box);
+
+  SerialComputeNode compute_node;
+  miniFE::CSRMatrix<Scalar, int, int, SerialComputeNode> matrix(compute_node);
+  miniFE::generate_matrix_structure(mesh, matrix);
+
+  int num_nodes = 1;
+  for(int dim=0; dim<3; ++dim) {
+    num_nodes *= global_box[dim][1]+1;
+  }
+
+  if (matrix.rows.size() != num_nodes) {
+    return false;
+  }
+
+  if (matrix.row_offsets[num_nodes] != 64) {
+    return false;
+  }
+  return true;
+}
+
+//Mesh with 2 elements (3 nodes) per dimension, 27 nodes in total: checks the
+//length of row_offsets, the final offset (343) and the length of row 13 (27).
+UTEST_CASE(generate_matrix_structure2)
+{
+  int global_box[3][2] = {{ 0, 2 }, { 0, 2 }, { 0, 2 } };
+  int box[3][2] = {{ 0, 2 }, { 0, 2 }, { 0, 2 } };
+
+  miniFE::simple_mesh_description<int> mesh(global_box, box);
+
+  SerialComputeNode compute_node;
+  miniFE::CSRMatrix<Scalar, int, int,SerialComputeNode> matrix(compute_node);
+
+  int num_nodes = 1;
+  for(int dim=0; dim<3; ++dim) {
+    num_nodes *= global_box[dim][1]+1;
+  }
+
+  if (num_nodes != 27) {
+    return false;
+  }
+
+  miniFE::generate_matrix_structure(mesh, matrix);
+
+  if (matrix.row_offsets.size() != num_nodes+1) {
+    return false;
+  }
+
+  if (matrix.row_offsets[num_nodes] != 343) {
+    return false;
+  }
+
+  //the difference of two consecutive offsets is compared with 27;
+  //that comparison is the final verdict of the test
+  return matrix.row_offsets[14]-matrix.row_offsets[13] == 27;
+}
+
+//get_hex8_node_coords_3d called with (0, 0, 0, 1.0) must reproduce the 24
+//reference coordinates listed below exactly.
+//NOTE(review): the meaning of the four leading arguments is not visible here.
+UTEST_CASE(get_hex8_node_coords_3d)
+{
+  std::vector<Scalar> expected(24);
+  //reference values, stored at indices 0..23
+  expected[0] = 0;
+  expected[1] = 0;
+  expected[2] = 0;
+  expected[3] = 1;
+  expected[4] = 0;
+  expected[5] = 0;
+  expected[6] = 1;
+  expected[7] = 0;
+  expected[8] = -1;
+  expected[9] = 0;
+  expected[10] = 0;
+  expected[11] = -1;
+  expected[12] = 0;
+  expected[13] = 1;
+  expected[14] = 0;
+  expected[15] = 1;
+  expected[16] = 1;
+  expected[17] = 0;
+  expected[18] = 1;
+  expected[19] = 1;
+  expected[20] = -1;
+  expected[21] = 0;
+  expected[22] = 1;
+  expected[23] = -1;
+
+  std::vector<Scalar> computed(24);
+  miniFE::get_hex8_node_coords_3d(0, 0, 0, 1.0, &computed[0]);
+
+  //exact element-wise comparison of the two vectors
+  return !(expected != computed);
+}
+
+inline
+void get_test_elem_mat(std::vector<Scalar>& elem_mat)
+{
+//after much careful debugging, I'm convinced that the following is a
+//correct element-diffusion matrix for the element with local-node-0 at
+//coordinates 0,0,0. So pasting this into a unit-test will guard against
+//unintended changes as I continue working on the code for various reasons.
+
+  elem_mat.resize(36); // 36 values; the assemble_FE_data test unpacks them as the upper triangle of an 8x8 symmetric matrix
+elem_mat[0] = 0.6666666664477059; // row 0, columns 0..7
+elem_mat[1] = 1.094804871759614e-10;
+elem_mat[2] = -0.1666666666666667;
+elem_mat[3] = 1.094805019211109e-10;
+elem_mat[4] = 1.094804871759614e-10;
+elem_mat[5] = -0.1666666666666667;
+elem_mat[6] = -0.1666666667761472;
+elem_mat[7] = -0.1666666666666667;
+elem_mat[8] = 0.666666666447706; // row 1, columns 1..7
+elem_mat[9] = 1.094804941148553e-10;
+elem_mat[10] = -0.1666666666666667;
+elem_mat[11] = -0.1666666666666667;
+elem_mat[12] = 1.094804732981736e-10;
+elem_mat[13] = -0.1666666666666667;
+elem_mat[14] = -0.1666666667761472;
+elem_mat[15] = 0.666666666447706; // row 2, columns 2..7
+elem_mat[16] = 1.094804841401953e-10;
+elem_mat[17] = -0.1666666667761472;
+elem_mat[18] = -0.1666666666666667;
+elem_mat[19] = 1.094804871759614e-10;
+elem_mat[20] = -0.1666666666666667;
+elem_mat[21] = 0.6666666664477059; // row 3, columns 3..7
+elem_mat[22] = -0.1666666666666668;
+elem_mat[23] = -0.1666666667761472;
+elem_mat[24] = -0.1666666666666667;
+elem_mat[25] = 1.094804702624075e-10;
+elem_mat[26] = 0.666666666447706; // row 4, columns 4..7
+elem_mat[27] = 1.094804802370675e-10;
+elem_mat[28] = -0.1666666666666667;
+elem_mat[29] = 1.094804698287266e-10;
+elem_mat[30] = 0.666666666447706; // row 5, columns 5..7
+elem_mat[31] = 1.094805079926431e-10;
+elem_mat[32] = -0.1666666666666667;
+elem_mat[33] = 0.666666666447706; // row 6, columns 6..7
+elem_mat[34] = 1.094804663592797e-10;
+elem_mat[35] = 0.666666666447706; // row 7, column 7
+}
+
+//Checks Hex8::diffusionMatrix_symm and Hex8::sourceVector for the element
+//whose node coordinates come from get_hex8_node_coords_3d(0, 0, 0, 1.0, ...).
+UTEST_CASE(diffusionMatrix)
+{
+  //get_test_elem_mat resizes the vector to its 36 reference coefficients.
+  std::vector<Scalar> elem_mat_correct;
+  get_test_elem_mat(elem_mat_correct);
+
+  const size_t len = miniFE::Hex8::numNodesPerElem*miniFE::Hex8::numNodesPerElem;
+  Scalar elem_mat[len];
+  Scalar testcoords[miniFE::Hex8::numNodesPerElem*miniFE::Hex8::spatialDim];
+
+  miniFE::get_hex8_node_coords_3d(0, 0, 0, 1.0, &testcoords[0]);
+
+  miniFE::Hex8::diffusionMatrix_symm(testcoords, elem_mat);
+
+  //Compare only the entries present in the reference data: iterating up to len
+  //read past the end of the 36-entry reference vector (undefined behavior).
+  const size_t num_ref = elem_mat_correct.size();
+  if (num_ref > len) {
+    return false;
+  }
+  for(size_t i=0; i<num_ref; ++i) {
+    if (std::abs(elem_mat[i] - elem_mat_correct[i]) > 1.e-6) {
+      return false;
+    }
+  }
+
+  const size_t nn = miniFE::Hex8::numNodesPerElem;
+  Scalar elem_vec[miniFE::Hex8::numNodesPerElem];
+  miniFE::Hex8::sourceVector(testcoords, elem_vec);
+
+  //every entry of the source vector is expected to be 0.125
+  const Scalar elem_vec_correct = 0.125;
+  for(size_t i=0; i<nn; ++i) {
+    if (std::abs(elem_vec[i] - elem_vec_correct) > 1.e-13) {
+      return false;
+    }
+  }
+  return true;
+}
+
+//sum_into_row on a one-row matrix with columns 0..3: coefficients supplied for
+//columns (2,0,1,3) with values (2,0,1,3) are expected to leave packed_coefs
+//equal to (0,1,2,3).
+UTEST_CASE(sum_into_row)
+{
+  SerialComputeNode compute_node;
+  miniFE::CSRMatrix<Scalar,LocalOrdinal,GlobalOrdinal,SerialComputeNode> matrix(compute_node);
+
+  //one row (id 0) holding four entries, all coefficients zero
+  matrix.rows.resize(1,0);
+  matrix.row_offsets.resize(2,0);
+  matrix.row_offsets[1] = 4;
+  matrix.packed_cols.resize(4);
+  for(int c=0; c<4; ++c) {
+    matrix.packed_cols[c] = c;
+  }
+  matrix.packed_coefs.resize(4,0);
+
+  //input: column indices and the matching coefficients
+  const int input_cols[4] = { 2, 0, 1, 3 };
+  const Scalar input_vals[4] = { 2.0, 0.0, 1.0, 3.0 };
+  std::vector<int> indices(input_cols, input_cols+4);
+  std::vector<Scalar> coefs(input_vals, input_vals+4);
+
+  miniFE::sum_into_row(0, 4, &indices[0], &coefs[0], matrix);
+
+  //expected result: the coefficient stored for column c equals c
+  std::vector<Scalar> expected(4);
+  for(int c=0; c<4; ++c) {
+    expected[c] = c;
+  }
+
+  if (expected != matrix.packed_coefs) {
+    return false;
+  }
+
+  return true;
+}
+
+//sum_in_elem_matrix on a 4x4 matrix whose rows each hold columns 0..3: the
+//element matrix carries (2,0,1,3) in every row for node ids (2,0,1,3), and
+//every assembled row of packed_coefs is expected to read (0,1,2,3).
+UTEST_CASE(sum_in_elem_matrix)
+{
+  SerialComputeNode compute_node;
+  miniFE::CSRMatrix<Scalar,LocalOrdinal,GlobalOrdinal,SerialComputeNode> matrix(compute_node);
+  matrix.rows.resize(4,0);
+  matrix.rows[0] = 0;
+  matrix.rows[1] = 1;
+  matrix.rows[2] = 2;
+  matrix.rows[3] = 3;
+  matrix.row_offsets.resize(5,0);
+  matrix.row_offsets[1] = 4;
+  matrix.row_offsets[2] = 8;
+  matrix.row_offsets[3] = 12;
+  matrix.row_offsets[4] = 16;
+  matrix.packed_cols.resize(16);
+  matrix.packed_cols[0] = 0;
+  matrix.packed_cols[1] = 1;
+  matrix.packed_cols[2] = 2;
+  matrix.packed_cols[3] = 3;
+  matrix.packed_cols[4] = 0;
+  matrix.packed_cols[5] = 1;
+  matrix.packed_cols[6] = 2;
+  matrix.packed_cols[7] = 3;
+  matrix.packed_cols[8] = 0;
+  matrix.packed_cols[9] = 1;
+  matrix.packed_cols[10] = 2;
+  matrix.packed_cols[11] = 3;
+  matrix.packed_cols[12] = 0;
+  matrix.packed_cols[13] = 1;
+  matrix.packed_cols[14] = 2;
+  matrix.packed_cols[15] = 3;
+
+  matrix.packed_coefs.resize(16,0);
+
+  std::vector<int> node_ids(4);
+  node_ids[0] = 2;
+  node_ids[1] = 0;
+  node_ids[2] = 1;
+  node_ids[3] = 3;
+  std::vector<Scalar> values(16);
+  values[0] = 2.0;
+  values[1] = 0.0;
+  values[2] = 1.0;
+  values[3] = 3.0;
+  values[4] = 2.0;
+  values[5] = 0.0;
+  values[6] = 1.0;
+  values[7] = 3.0;
+  values[8] = 2.0;
+  values[9] = 0.0;
+  values[10] = 1.0;
+  values[11] = 3.0;
+  values[12] = 2.0;
+  values[13] = 0.0;
+  values[14] = 1.0;
+  values[15] = 3.0;
+
+  miniFE::sum_in_elem_matrix(4, &node_ids[0], &values[0], matrix);
+
+  //overwrite the input vector with the expected assembled coefficients
+  values[0] = 0.0;
+  values[1] = 1.0;
+  values[2] = 2.0;
+  values[3] = 3.0;
+  values[4] = 0.0;
+  values[5] = 1.0;
+  values[6] = 2.0;
+  values[7] = 3.0;
+  values[8] = 0.0;
+  values[9] = 1.0;
+  values[10] = 2.0;
+  values[11] = 3.0;
+  values[12] = 0.0;
+  values[13] = 1.0;
+  values[14] = 2.0;
+  values[15] = 3.0;
+
+  return !(values != matrix.packed_coefs);
+}
+
+//Assembles the one-element mesh and checks the first 64 assembled coefficients
+//against the reference element matrix reordered by the element's node ids.
+UTEST_CASE(assemble_FE_data)
+{
+  int global_box[3][2] = {{ 0, 1 }, { 0, 1 }, { 0, 1 } };
+  int box[3][2] = {{ 0, 1 }, { 0, 1 }, { 0, 1 } };
+
+  miniFE::simple_mesh_description<int> mesh(global_box, box);
+
+  SerialComputeNode compute_node;
+  miniFE::CSRMatrix<Scalar, int, int, SerialComputeNode> matrix(compute_node);
+
+  miniFE::generate_matrix_structure(mesh, matrix);
+
+  miniFE::Vector<Scalar,LocalOrdinal,GlobalOrdinal,SerialComputeNode> rhs(0, 8, compute_node);
+
+  const int num_nodes = 8;
+
+  std::vector<Scalar> packed_ref;
+  get_test_elem_mat(packed_ref);
+  std::vector<Scalar> full_ref(num_nodes*num_nodes);
+
+  //unpack the upper triangle (row by row) into the full symmetric matrix
+  int next = 0;
+  for(int i=0; i<num_nodes; ++i) {
+    for(int j=i; j<num_nodes; ++j) {
+      const Scalar coef = packed_ref[next++];
+      full_ref[i*num_nodes+j] = coef;
+      full_ref[j*num_nodes+i] = coef;
+    }
+  }
+
+  //node ids used below to reorder the rows and columns of the reference matrix
+  std::vector<int> node_ids(num_nodes);
+  node_ids[0] = 0;
+  node_ids[1] = 1;
+  node_ids[2] = 5;
+  node_ids[3] = 4;
+  node_ids[4] = 2;
+  node_ids[5] = 3;
+  node_ids[6] = 7;
+  node_ids[7] = 6;
+
+  //scatter entry (i,j) of the reference matrix to (node_ids[i], node_ids[j]),
+  //i.e. reorder rows and columns according to node_ids.
+  std::vector<Scalar> reordered(num_nodes*num_nodes);
+  for(int i=0; i<num_nodes; ++i) {
+    const int src_row = num_nodes*i;
+    const int dst_row = num_nodes*node_ids[i];
+    for(int j=0; j<num_nodes; ++j) {
+      reordered[dst_row+node_ids[j]] = full_ref[src_row+j];
+    }
+  }
+
+  //now 'reordered' should contain the same coefficients,
+  //in the same order, as the assembled-matrix coefficients that will be
+  //produced in the matrix by assemble_FE_data:
+
+  miniFE::Parameters params;
+  params.use_locking = 1;
+
+  miniFE::assemble_FE_data(mesh, matrix, rhs, params);
+
+  std::vector<Scalar>& assembled = matrix.packed_coefs;
+
+  for(size_t k=0; k<reordered.size(); ++k) {
+    if (std::abs(reordered[k] - assembled[k]) > 1.e-13) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+UTEST_CASE(pll_matvec2)
+{
+  //Two-rank distributed matvec check: each rank sets up two rows of a
+  //4x4 CSR matrix plus two entries of x, then checks two entries of y.
+  int myproc = 0, numprocs = 1;
+#ifdef HAVE_MPI
+  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+#endif
+
+  if (numprocs != 2) {
+    if (myproc == 0) std::cout <<"pll_matvec2_utest only runs when numprocs=2."<<std::endl;
+    return true;
+  }
+
+  //Global system assembled across the two ranks:
+  //
+  // A = | 1 1      |  x = | 1 |
+  //     | 2 1 -1 1 |      | 2 |
+  //     |  -2  1   |      | 3 |
+  //     |   2    1 |      | 4 |
+  //
+  // rows 0-1 are stored on proc 0, rows 2-3 on proc 1.
+  //
+  //Expected product: y = | 3 |
+  //                      | 5 |
+  //                      |-1 |
+  //                      | 8 |
+
+  typedef miniFE::CSRMatrix<Scalar,LocalOrdinal,GlobalOrdinal,SerialComputeNode> MatrixType;
+  typedef miniFE::Vector<Scalar,LocalOrdinal,GlobalOrdinal,SerialComputeNode> VectorType;
+
+  SerialComputeNode compute_node;
+  MatrixType A(compute_node);
+  VectorType x(myproc, 4, compute_node);
+  VectorType y(myproc, 4, compute_node);
+
+  const bool on_rank0 = (myproc == 0);
+
+  //Two local rows on either rank, hence three row offsets (the first
+  //offset keeps the 0 it is given by resize()).
+  A.rows.resize(2, 0);
+  A.row_offsets.resize(3, 0);
+
+  //resize() pre-fills column ids with 0 and coefficients with 1, so any
+  //entry that is not assigned below keeps that value.
+  if (on_rank0) {
+    A.rows[0] = 0;
+    A.rows[1] = 1;
+
+    A.row_offsets[1] = 2;
+    A.row_offsets[2] = 6;
+
+    A.packed_cols.resize(6, 0);
+    A.packed_cols[1] = 1;
+    A.packed_cols[2] = 0;
+    A.packed_cols[3] = 1;
+    A.packed_cols[4] = 2;
+    A.packed_cols[5] = 3;
+
+    A.packed_coefs.resize(6, 1);
+    A.packed_coefs[2] = 2;
+    A.packed_coefs[4] = -1;
+
+    x.coefs[0] = 1;
+    x.coefs[1] = 2;
+  }
+  else {
+    A.rows[0] = 2;
+    A.rows[1] = 3;
+
+    A.row_offsets[1] = 2;
+    A.row_offsets[2] = 4;
+
+    A.packed_cols.resize(4, 0);
+    A.packed_cols[0] = 1;
+    A.packed_cols[1] = 2;
+    A.packed_cols[2] = 1;
+    A.packed_cols[3] = 3;
+
+    A.packed_coefs.resize(4, 1);
+    A.packed_coefs[0] = -2;
+    A.packed_coefs[2] = 2;
+
+    x.coefs[0] = 3;
+    x.coefs[1] = 4;
+  }
+
+  miniFE::make_local_matrix(A);
+  miniFE::exchange_externals(A, x);
+  miniFE::matvec(A, x, y);
+
+  //Each rank compares only the first two entries of its y.
+  const bool passed = on_rank0 ? (y.coefs[0] == 3.0 && y.coefs[1] == 5.0)
+                               : (y.coefs[0] == -1.0 && y.coefs[1] == 8.0);
+  if (!passed) {
+    std::cout << (on_rank0 ? "proc 0: pll_matvec2_utest failed" : "proc 1: pll_matvec2_utest failed") << std::endl;
+  }
+  return passed;
+}
+
+UTEST_CASE(pll_matvec3)
+{
+  //Three-rank distributed matvec check: each rank sets up two rows of a
+  //6x6 CSR matrix plus two entries of x, then checks two entries of y.
+  int myproc = 0, numprocs = 1;
+#ifdef HAVE_MPI
+  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+#endif
+
+  if (numprocs != 3) {
+    if (myproc == 0) std::cout <<"pll_matvec3_utest only runs when numprocs=3."<<std::endl;
+    return true;
+  }
+
+  //Global system assembled across the three ranks:
+  //
+  // cols: 0  1  2  3  4  5
+  // A = | 1       -1        |  x = | 1 |
+  //     |    1          -1  |      | 2 |
+  //     | 2     1    -1     |      | 3 |
+  //     |          1        |      | 4 |
+  //     |    2        1     |      | 5 |
+  //     |          2     1  |      | 6 |
+  //
+  // rows 0-1 are stored on proc 0, rows 2-3 on proc 1, rows 4-5 on proc 2.
+  //
+  //Expected product: y = |-3 |
+  //                      |-4 |
+  //                      | 0 |
+  //                      | 4 |
+  //                      | 9 |
+  //                      |14 |
+
+  typedef miniFE::CSRMatrix<Scalar,LocalOrdinal,GlobalOrdinal,SerialComputeNode> MatrixType;
+  typedef miniFE::Vector<Scalar,LocalOrdinal,GlobalOrdinal,SerialComputeNode> VectorType;
+
+  SerialComputeNode compute_node;
+  MatrixType A(compute_node);
+  VectorType x(myproc, 6, compute_node);
+  VectorType y(myproc, 6, compute_node);
+
+  //Every rank holds two rows and four nonzeros. resize() pre-fills the
+  //offsets and column ids with 0 and the coefficients with 1, so entries
+  //that are not assigned below keep those values.
+  A.rows.resize(2, 0);
+  A.row_offsets.resize(3, 0);
+  A.packed_cols.resize(4, 0);
+  A.packed_coefs.resize(4, 1);
+
+  //Global row ids follow directly from the rank.
+  A.rows[0] = 2*myproc;
+  A.rows[1] = 2*myproc + 1;
+
+  //The last offset equals the local nonzero count, which is 4 on all ranks.
+  A.row_offsets[2] = 4;
+
+  if (myproc == 0) {
+    A.row_offsets[1] = 2;
+
+    A.packed_cols[1] = 3;
+    A.packed_cols[2] = 1;
+    A.packed_cols[3] = 5;
+
+    A.packed_coefs[1] = -1;
+    A.packed_coefs[3] = -1;
+
+    x.coefs[0] = 1;
+    x.coefs[1] = 2;
+  }
+  else if (myproc == 1) {
+    A.row_offsets[1] = 3;
+
+    A.packed_cols[1] = 2;
+    A.packed_cols[2] = 4;
+    A.packed_cols[3] = 3;
+
+    A.packed_coefs[0] = 2;
+    A.packed_coefs[2] = -1;
+
+    x.coefs[0] = 3;
+    x.coefs[1] = 4;
+  }
+  else {
+    A.row_offsets[1] = 2;
+
+    A.packed_cols[0] = 1;
+    A.packed_cols[1] = 4;
+    A.packed_cols[2] = 3;
+    A.packed_cols[3] = 5;
+
+    A.packed_coefs[0] = 2;
+    A.packed_coefs[2] = 2;
+
+    x.coefs[0] = 5;
+    x.coefs[1] = 6;
+  }
+
+  miniFE::make_local_matrix(A);
+  miniFE::exchange_externals(A, x);
+  miniFE::matvec(A, x, y);
+
+  //Each rank compares only the first two entries of its y.
+  bool passed = false;
+  if (myproc == 0) {
+    passed = (y.coefs[0] == -3.0 && y.coefs[1] == -4.0);
+    if (!passed) std::cout << "proc 0: pll_matvec3 failed" << std::endl;
+  }
+  else if (myproc == 1) {
+    passed = (y.coefs[0] == 0.0 && y.coefs[1] == 4.0);
+    if (!passed) std::cout << "proc 1: pll_matvec3 failed" << std::endl;
+  }
+  else {
+    passed = (y.coefs[0] == 9.0 && y.coefs[1] == 14.0);
+    if (!passed) std::cout << "proc 2: pll_matvec3 failed" << std::endl;
+  }
+  return passed;
+}
+
+UTEST_CASE(ComputeNode_waxpy1)
+{
+  //Single-process check that waxpby(1, x, 1, y, w) leaves 2 (within
+  //1.e-7) in every entry of w after x and y each received 1 per index.
+  int myproc = 0, numprocs = 1;
+#ifdef HAVE_MPI
+  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+#endif
+
+  if (numprocs != 1) {
+    if (myproc == 0) std::cout <<"ComputeNode_waxpy1 only runs when numprocs=1."<<std::endl;
+    return true;
+  }
+
+#ifdef MINIFE_HAVE_CUDA
+  typedef CUDANode ComputeNodeType;
+  ComputeNodeType compute_node(0,16,64);
+#else
+  typedef SerialComputeNode ComputeNodeType;
+  ComputeNodeType compute_node;
+#endif
+  typedef miniFE::Vector<Scalar,LocalOrdinal,GlobalOrdinal,ComputeNodeType> VectorType;
+  size_t len = 10;
+  VectorType x(0, len, compute_node);
+  VectorType y(0, len, compute_node);
+  VectorType w(0, len, compute_node);
+
+  //Hand sum_into_vector the indices 0..len-1, each paired with coefficient 1.
+  std::vector<Scalar> coefs(len, 1);
+  std::vector<GlobalOrdinal> inds(len, 0);
+  for(size_t k=0; k!=len; ++k) inds[k] = k;
+  miniFE::sum_into_vector(len, &inds[0], &coefs[0], x);
+  miniFE::sum_into_vector(len, &inds[0], &coefs[0], y);
+  miniFE::sum_into_vector(len, &inds[0], &coefs[0], w);
+
+  Scalar* d_x = compute_node.get_buffer(&x.coefs[0], x.coefs.size());
+  Scalar* d_y = compute_node.get_buffer(&y.coefs[0], y.coefs.size());
+  compute_node.copy_to_buffer(&x.coefs[0], x.coefs.size(), d_x);
+  compute_node.copy_to_buffer(&y.coefs[0], y.coefs.size(), d_y);
+
+  miniFE::waxpby(1.0, x, 1.0, y, w);
+
+  Scalar* d_w = compute_node.get_buffer(&w.coefs[0], w.coefs.size());
+  compute_node.copy_from_buffer(&w.coefs[0], w.coefs.size(), d_w);
+
+  //Stop at the first entry of w that is further than tol from expected.
+  const Scalar expected = 2;
+  const Scalar tol = 1.e-7;
+  bool within_tol = true;
+  for(size_t k=0; within_tol && k<len; ++k) {
+    within_tol = !(std::abs(w.coefs[k]-expected) > tol);
+  }
+  return within_tol;
+}
+
+UTEST_CASE(ComputeNode_dot1)
+{
+  //Single-process check of miniFE::dot on two vectors of length 100;
+  //the result has to compare equal to that length.
+  int myproc = 0, numprocs = 1;
+#ifdef HAVE_MPI
+  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+#endif
+
+  if (numprocs != 1) {
+    if (myproc == 0) std::cout <<"ComputeNode_dot1 only runs when numprocs=1."<<std::endl;
+    return true;
+  }
+
+#ifdef MINIFE_HAVE_CUDA
+  typedef CUDANode ComputeNodeType;
+  ComputeNodeType compute_node(0,1,64);
+#else
+  typedef SerialComputeNode ComputeNodeType;
+  ComputeNodeType compute_node;
+#endif
+
+  typedef miniFE::Vector<Scalar,LocalOrdinal,GlobalOrdinal,ComputeNodeType> VectorType;
+
+  size_t N = 100;
+  VectorType x(0, N, compute_node);
+  VectorType y(0, N, compute_node);
+
+  //Hand sum_into_vector the indices 0..N-1, each paired with coefficient 1.
+  std::vector<Scalar> coefs(N, 1);
+  std::vector<int> inds(N, 0);
+  for(size_t k=0; k!=N; ++k) inds[k] = k;
+  miniFE::sum_into_vector(N, &inds[0], &coefs[0], x);
+  miniFE::sum_into_vector(N, &inds[0], &coefs[0], y);
+
+  Scalar* d_x = compute_node.get_buffer(&x.coefs[0], x.coefs.size());
+  Scalar* d_y = compute_node.get_buffer(&y.coefs[0], y.coefs.size());
+  compute_node.copy_to_buffer(&x.coefs[0], x.coefs.size(), d_x);
+  compute_node.copy_to_buffer(&y.coefs[0], y.coefs.size(), d_y);
+
+  //The test passes only when the dot product compares equal to N.
+  const Scalar dot_prod = miniFE::dot(x,y);
+  const bool mismatch = (dot_prod != N);
+  return !mismatch;
+}
+
+UTEST_CASE(ComputeNode_TBB_dot1)
+{
+  //Single-process check of miniFE::dot on a TBBNode; without
+  //MINIFE_HAVE_TBB the test only prints a notice and passes.
+  int myproc = 0, numprocs = 1;
+#ifdef HAVE_MPI
+  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+#endif
+
+  if (numprocs != 1) {
+    if (myproc == 0) std::cout <<"ComputeNode_TBB_dot1_utest only runs when numprocs=1."<<std::endl;
+    return true;
+  }
+
+#ifdef MINIFE_HAVE_TBB
+  typedef TBBNode ComputeNodeType;
+  typedef miniFE::Vector<Scalar,LocalOrdinal,GlobalOrdinal,ComputeNodeType> VectorType;
+  ComputeNodeType compute_node(2);
+  size_t N = 10;
+  VectorType x(0, N, compute_node);
+  VectorType y(0, N, compute_node);
+
+  //Hand sum_into_vector the indices 0..N-1, each paired with coefficient 1.
+  std::vector<Scalar> coefs(N, 1);
+  std::vector<GlobalOrdinal> inds(N, 0);
+  for(size_t k=0; k!=N; ++k) inds[k] = k;
+  miniFE::sum_into_vector(inds.size(), &inds[0], &coefs[0], x);
+  miniFE::sum_into_vector(inds.size(), &inds[0], &coefs[0], y);
+
+  //The test passes only when the dot product compares equal to N.
+  const Scalar dot_prod = miniFE::dot(x,y);
+  const bool mismatch = (dot_prod != N);
+  return !mismatch;
+#else
+  std::cout << "ComputeNode_TBB_dot1_utest only runs when MINIFE_HAVE_TBB is defined."<<std::endl;
+  return true;
+#endif
+}
+
+UTEST_CASE(ComputeNode_dot2)
+{
+  //Single-process check of miniFE::dot on two vectors of length 10;
+  //the result has to be within 1.e-12 of 10.
+  int myproc = 0, numprocs = 1;
+#ifdef HAVE_MPI
+  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+#endif
+
+  if (numprocs != 1) {
+    if (myproc == 0) std::cout <<"ComputeNode_dot2_utest only runs when numprocs=1."<<std::endl;
+    return true;
+  }
+
+#ifdef MINIFE_HAVE_CUDA
+  typedef CUDANode ComputeNodeType;
+  ComputeNodeType compute_node(0,64,128);
+#else
+  typedef SerialComputeNode ComputeNodeType;
+  ComputeNodeType compute_node;
+#endif
+
+  typedef miniFE::Vector<Scalar,LocalOrdinal,GlobalOrdinal,ComputeNodeType> VectorType;
+
+  VectorType x(0, 10, compute_node);
+  VectorType y(0, 10, compute_node);
+
+  //Hand sum_into_vector the indices 0..len-1, each paired with coefficient 1.
+  size_t len = 10;
+  std::vector<Scalar> coefs(len, 1);
+  std::vector<int> inds(len, 0);
+  for(size_t k=0; k!=len; ++k) inds[k] = k;
+  miniFE::sum_into_vector(len, &inds[0], &coefs[0], x);
+  miniFE::sum_into_vector(len, &inds[0], &coefs[0], y);
+
+  Scalar* d_x = compute_node.get_buffer(&x.coefs[0], x.coefs.size());
+  Scalar* d_y = compute_node.get_buffer(&y.coefs[0], y.coefs.size());
+  compute_node.copy_to_buffer(&x.coefs[0], x.coefs.size(), d_x);
+  compute_node.copy_to_buffer(&y.coefs[0], y.coefs.size(), d_y);
+
+  //Unlike ComputeNode_dot1 this comparison allows a small tolerance.
+  const Scalar dot_prod = miniFE::dot(x, y);
+  return !(std::abs(dot_prod-10.0) > 1.e-12);
+}
+
+UTEST_CASE(ser_matvec1)
+{
+  int numprocs = 1, myproc = 0;
+#ifdef HAVE_MPI
+  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+#endif
+
+  if (numprocs != 1) {
+    if (myproc == 0) std::cout <<"ser_matvec1_utest only runs when numprocs=1."<<std::endl;
+    return true; //a skipped test is not a failure; the sibling tests also return true on this path
+  }
+
+  //Serial matvec check: build the following matrix and vector on one process:
+  //
+  // A = | 1 1      |  x = | 1 |
+  //     | 2 1 -1 1 |      | 2 |
+  //     |  -2  1   |      | 3 |
+  //     |   2    1 |      | 4 |
+  //
+  // (all four rows are set up on this process; same global system as pll_matvec2.)
+  //
+  //So a matvec should produce y = | 3 |
+  //                               | 5 |
+  //                               |-1 |
+  //                               | 8 |
+
+#ifdef MINIFE_HAVE_CUDA
+  CUDANode compute_node(0,64,128);
+  typedef CUDANode ComputeNodeType;
+#else
+  SerialComputeNode compute_node;
+  typedef SerialComputeNode ComputeNodeType;
+#endif
+
+  miniFE::CSRMatrix<Scalar,LocalOrdinal,GlobalOrdinal,ComputeNodeType> A(compute_node);
+  miniFE::Vector<Scalar,LocalOrdinal,GlobalOrdinal,ComputeNodeType> x(0, 4,compute_node) ,y(0, 4,compute_node);
+
+  A.rows.resize(4, 0);
+  A.rows[0] = 0; A.rows[1] = 1;
+  A.rows[2] = 2; A.rows[3] = 3;
+
+  A.row_offsets.resize(5, 0);
+  A.row_offsets[1] = 2; A.row_offsets[2] = 6;
+  A.row_offsets[3] = 8; A.row_offsets[4] = 10;
+
+  A.packed_cols.resize(10, 0);
+  A.packed_cols[1] = 1;
+  A.packed_cols[2] = 0;
+  A.packed_cols[3] = 1;
+  A.packed_cols[4] = 2;
+  A.packed_cols[5] = 3;
+  A.packed_cols[6] = 1;
+  A.packed_cols[7] = 2;
+  A.packed_cols[8] = 1;
+  A.packed_cols[9] = 3;
+
+  A.packed_coefs.resize(10, 1);
+  A.packed_coefs[2] = 2;
+  A.packed_coefs[4] = -1;
+  A.packed_coefs[6] = -2;
+  A.packed_coefs[8] = 2;
+
+  x.coefs[0] = 1; x.coefs[1] = 2; x.coefs[2] = 3; x.coefs[3] = 4;
+
+  for(size_t i=0; i<y.coefs.size(); ++i) y.coefs[i] = 0;
+
+  miniFE::make_local_matrix(A);
+  //Pass x and the three CSR arrays through the compute node's buffers before the matvec.
+  Scalar* d_x = compute_node.get_buffer(&x.coefs[0], x.coefs.size());
+  compute_node.copy_to_buffer(&x.coefs[0], x.coefs.size(), d_x);
+
+  LocalOrdinal* d_Arowoff = compute_node.get_buffer(&A.row_offsets[0], A.row_offsets.size());
+  GlobalOrdinal* d_Acols   = compute_node.get_buffer(&A.packed_cols[0], A.packed_cols.size());
+  Scalar* d_Acoefs  = compute_node.get_buffer(&A.packed_coefs[0], A.packed_coefs.size());
+
+  compute_node.copy_to_buffer(&A.row_offsets[0], A.row_offsets.size(), d_Arowoff);
+  compute_node.copy_to_buffer(&A.packed_cols[0], A.packed_cols.size(), d_Acols);
+  compute_node.copy_to_buffer(&A.packed_coefs[0], A.packed_coefs.size(), d_Acoefs);
+
+  miniFE::matvec(A, x, y);
+  //Copy y back from the compute node's buffer, then compare each entry with tolerance 1.e-12.
+  Scalar* ybuf = compute_node.get_buffer(&y.coefs[0], y.coefs.size());
+  compute_node.copy_from_buffer(&y.coefs[0], y.coefs.size(), ybuf);
+
+  if (std::abs(y.coefs[0] - 3.0) > 1.e-12) {
+    std::cout << "failed 0. y.coefs[0]=" <<y.coefs[0]<<", expected 3.0" << std::endl;
+    return false;
+  }
+
+  if (std::abs(y.coefs[1] - 5.0) > 1.e-12) {
+    std::cout << "failed 1. y.coefs[1]=" <<y.coefs[1]<<", expected 5.0" << std::endl;
+    return false;
+  }
+
+  if (std::abs(y.coefs[2] - -1.0) > 1.e-12) {
+    std::cout << "failed 2. y.coefs[2]=" <<y.coefs[2]<<", expected -1.0" << std::endl;
+    return false;
+  }
+
+  if (std::abs(y.coefs[3] - 8.0) > 1.e-12) {
+    std::cout << "failed 3. y.coefs[3]=" <<y.coefs[3]<<", expected 8.0" << std::endl;
+    return false;
+  }
+
+  return true;
+}
+
+using miniFE::mytimer;
+
+UTEST_CASE(waxpby_perf)
+{ //Timing (not pass/fail) run of miniFE::waxpby: prints time and MFLOPS, always returns true.
+  int numprocs = 1, myproc = 0;
+#ifdef HAVE_MPI
+  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+#endif
+
+  if (numprocs != 1) {
+    if (myproc == 0) std::cout <<"waxpby_perf_utest only runs when numprocs=1."<<std::endl;
+    return true;
+  }
+
+  size_t num_iters = 10;
+  size_t len = 8193; //doubled at the top of every pass of the loop below, so the first timed length is 16386
+
+#ifdef MINIFE_HAVE_CUDA
+  CUDANode compute_node(0,16,64);
+  typedef CUDANode ComputeNodeType;
+#else
+  SerialComputeNode compute_node;
+  typedef SerialComputeNode ComputeNodeType;
+#endif
+  //NOTE(review): t0 is never referenced by name below; presumably TICK()/TOCK() use it - confirm.
+  miniFE::timer_type t0 = 0, tWAXPY = 0;
+
+  while(tWAXPY < 1.e-2) {
+    //Repeat with twice the vector length until tWAXPY reaches 1.e-2.
+    tWAXPY = 0;
+    len *= 2;
+
+    miniFE::Vector<Scalar,LocalOrdinal,GlobalOrdinal,ComputeNodeType> x(0, len,compute_node) ,y(0, len,compute_node), w(0, len,compute_node);
+    //one and zero are Scalar-typed values reused for the fill and as the waxpby scalars.
+    Scalar one = 1, zero = 0;
+    //Fill x and y with ones and w with zeros.
+    for(size_t i=0; i<len; ++i) {
+      x.coefs[i] = one;
+      y.coefs[i] = one;
+      w.coefs[i] = zero;
+    }
+
+    Scalar* d_x = compute_node.get_buffer(&x.coefs[0], x.coefs.size());
+    Scalar* d_y = compute_node.get_buffer(&y.coefs[0], y.coefs.size());
+
+    compute_node.copy_to_buffer(&x.coefs[0], x.coefs.size(), d_x);
+    compute_node.copy_to_buffer(&y.coefs[0], y.coefs.size(), d_y);
+    //num_iters back-to-back calls sit between TICK() and TOCK(); presumably TOCK leaves the elapsed time in tWAXPY.
+    TICK();
+    for(size_t i=0; i<num_iters; ++i) {
+      miniFE::waxpby(one, x, one, y, w);
+    }
+    TOCK(tWAXPY);
+
+#ifdef MINIFE_HAVE_CUDA
+//on cuda this time tends to stay very small because (I think) the
+//waxpby function returns before the cuda calculation finishes. So if we
+//don't artificially make this time large, this loop will go on forever.
+    if (tWAXPY < 1.e-2) tWAXPY = 1.e-2;
+#endif
+  }
+  //The rate estimate counts 3 flops per vector entry per waxpby call.
+  Scalar waxpy_flops = len*3.0*num_iters;
+  Scalar waxpy_mflops = tWAXPY>1.e-2 ? 1.e-6 * (waxpy_flops/tWAXPY) : 0;
+  //A rate is reported only when tWAXPY is strictly above 1.e-2; otherwise 0 is printed.
+  std::cout << "waxpby_perf_utest: WAXPBY time: " << tWAXPY << ", len: " << len << ", num_iters: " << num_iters
+      << ", MFLOPS: " << waxpy_mflops << std::endl;
+  return true;
+}
+
+UTEST_CASE(matmat3x3_1)
+{ //Checks miniFE::matmat3x3 on one fixed pair of 3x3 inputs.
+  Scalar A[] = {1, 4, 7, 2, 5, 8, 3, 6, 9};
+  Scalar B[] = {1, 1, 1, 2, 2, 2, 3, 3, 3};
+  Scalar C[9];
+  //The expected values below equal A*B when A, B and C are each read column by column.
+  miniFE::matmat3x3<Scalar>(A, B, C);
+
+  TEST_EQUAL(C[0], 6.0);
+  TEST_EQUAL(C[1], 15.0);
+  TEST_EQUAL(C[2], 24.0);
+  TEST_EQUAL(C[3], 12.0);
+  TEST_EQUAL(C[4], 30.0);
+  TEST_EQUAL(C[5], 48.0);
+  TEST_EQUAL(C[6], 18.0);
+  TEST_EQUAL(C[7], 45.0);
+  TEST_EQUAL(C[8], 72.0);
+
+  return true;
+}
+
+UTEST_CASE(matmat3x3_X_3xn_1)
+{ //Checks miniFE::matmat3x3_X_3xn on a 3x3 A and an 18-entry B, with 6 passed as the middle argument.
+  Scalar A[] = {1, 4, 7, 2, 5, 8, 3, 6, 9};
+  Scalar B[] = {1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6};
+  Scalar C[18];
+  //The expected values equal A*B when A (3x3), B (3x6) and C (3x6) are each read column by column.
+  miniFE::matmat3x3_X_3xn<Scalar>(A, 6, B, C);
+
+  TEST_EQUAL(C[0], 6.0);
+  TEST_EQUAL(C[1], 15.0);
+  TEST_EQUAL(C[2], 24.0);
+  TEST_EQUAL(C[3], 12.0);
+  TEST_EQUAL(C[4], 30.0);
+  TEST_EQUAL(C[5], 48.0);
+  TEST_EQUAL(C[6], 18.0);
+  TEST_EQUAL(C[7], 45.0);
+  TEST_EQUAL(C[8], 72.0);
+  TEST_EQUAL(C[9], 24.0);
+  TEST_EQUAL(C[10], 60.0);
+  TEST_EQUAL(C[11], 96.0);
+  TEST_EQUAL(C[12], 30.0);
+  TEST_EQUAL(C[13], 75.0);
+  TEST_EQUAL(C[14], 120.0);
+  TEST_EQUAL(C[15], 36.0);
+  TEST_EQUAL(C[16], 90.0);
+  TEST_EQUAL(C[17], 144.0);
+
+  return true;
+}
+
+UTEST_CASE(matTransMat3x3_X_3xn_1)
+{ //Checks miniFE::matTransMat3x3_X_3xn; expected C is identical to the one in matmat3x3_X_3xn_1.
+  Scalar A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+  Scalar B[] = {1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6};
+  Scalar C[18];
+  //Read column by column, this A is the transpose of the A in matmat3x3_X_3xn_1, so transpose(A)*B is unchanged.
+  miniFE::matTransMat3x3_X_3xn<Scalar>(A, 6, B, C);
+
+  TEST_EQUAL(C[0], 6.0);
+  TEST_EQUAL(C[1], 15.0);
+  TEST_EQUAL(C[2], 24.0);
+  TEST_EQUAL(C[3], 12.0);
+  TEST_EQUAL(C[4], 30.0);
+  TEST_EQUAL(C[5], 48.0);
+  TEST_EQUAL(C[6], 18.0);
+  TEST_EQUAL(C[7], 45.0);
+  TEST_EQUAL(C[8], 72.0);
+  TEST_EQUAL(C[9], 24.0);
+  TEST_EQUAL(C[10], 60.0);
+  TEST_EQUAL(C[11], 96.0);
+  TEST_EQUAL(C[12], 30.0);
+  TEST_EQUAL(C[13], 75.0);
+  TEST_EQUAL(C[14], 120.0);
+  TEST_EQUAL(C[15], 36.0);
+  TEST_EQUAL(C[16], 90.0);
+  TEST_EQUAL(C[17], 144.0);
+
+  return true;
+}
+
+UTEST_CASE(BoxIterator1)
+{ //Checks that a BoxIterator over a 2x2x2 box visits exactly the positions of a triple nested loop.
+  int box1[3][2] = {{ 0, 2 }, { 0, 2 }, { 0, 2 } };
+  miniFE::BoxIterator iter = miniFE::BoxIterator::begin(box1);
+  miniFE::BoxIterator end = miniFE::BoxIterator::end(box1);
+  //The loop nesting fixes the expected visiting order: x varies fastest, then y, then z.
+  for(int iz=box1[2][0]; iz<box1[2][1]; ++iz) {
+   for(int iy=box1[1][0]; iy<box1[1][1]; ++iy) {
+    for(int ix=box1[0][0]; ix<box1[0][1]; ++ix) {
+      TEST_EQUAL((iter == end), false);
+      TEST_EQUAL(ix, iter.x);
+      TEST_EQUAL(iy, iter.y);
+      TEST_EQUAL(iz, iter.z);
+      ++iter;
+    }
+   }
+  }
+  //After the last expected position the iterator must compare equal to end.
+  TEST_EQUAL((iter == end), true);
+
+  return true;
+}
+
+UTEST_CASE(BoxIterator_get_coords)
+{ //Round trip: get_coords applied to the id from get_id must return the coordinates given to get_id.
+  const int nx=2;
+  const int ny=3;
+  const int nz=4;
+  int box1[3][2] = {{ 0, nx }, { 0, ny }, { 0, nz } };
+  miniFE::BoxIterator iter = miniFE::BoxIterator::begin(box1);
+  miniFE::BoxIterator end = miniFE::BoxIterator::end(box1);
+  //NOTE(review): z is passed and compared as -iter.z; the sign convention lives in get_id/get_coords - confirm there.
+  for(; iter!=end; ++iter) {
+    int elemID = miniFE::get_id<int>(nx,ny,nz,iter.x,iter.y,-iter.z);
+    int x, y, z;
+    miniFE::get_coords<int>(elemID, nx,ny,nz, x,y,z);
+    TEST_EQUAL(x,iter.x);
+    TEST_EQUAL(y,iter.y);
+    TEST_EQUAL(z,-iter.z);
+  }
+
+  return true;
+}
+
+#endif
+
diff --git a/openmp-avx512/basic/verify_solution.hpp b/openmp-avx512/basic/verify_solution.hpp
new file mode 100644
index 0000000..fb3bd3b
--- /dev/null
+++ b/openmp-avx512/basic/verify_solution.hpp
@@ -0,0 +1,170 @@
+#ifndef _verify_solution_hpp_
+#define _verify_solution_hpp_
+
+//@HEADER
+// ************************************************************************
+// 
+//               miniFE: simple finite-element assembly and linear-solve
+//                 Copyright (2006) Sandia Corporation
+// 
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+// 
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//  
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//  
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+
+#include <sstream>
+#include <stdexcept>
+#include <map>
+#include <algorithm>
+
+#include <simple_mesh_description.hpp>
+#include <analytic_soln.hpp>
+#include <box_utils.hpp>
+#include <utils.hpp>
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+namespace miniFE {
+
+template<typename Scalar>
+struct err_info { //record of the largest error found by verify_solution
+  Scalar err; //absolute difference between analytic and computed
+  Scalar computed; //computed solution value at that point
+  Scalar analytic; //analytic solution value at that point
+  Scalar coords[3]; //x, y, z of that point
+};
+
+//Compares x (one coefficient per locally-owned node) with the analytic solution; the rank(s) whose local
+//maximum error equals the global maximum print a summary. Throws std::runtime_error on a size mismatch.
+template<typename VectorType> void
+verify_solution(const simple_mesh_description<typename VectorType::GlobalOrdinalType>& mesh, const VectorType& x)
+{
+  typedef typename VectorType::GlobalOrdinalType GlobalOrdinal;
+  typedef typename VectorType::ScalarType Scalar;
+
+  int global_nodes_x = mesh.global_box[0][1]+1;
+  int global_nodes_y = mesh.global_box[1][1]+1;
+  int global_nodes_z = mesh.global_box[2][1]+1;
+  Box box;
+  copy_box(mesh.local_box, box);
+
+  //num-owned-nodes in each dimension is num-elems+1
+  //only if num-elems > 0 in that dimension *and*
+  //we are at the high end of the global range in that dimension:
+  if (box[0][1] > box[0][0] && box[0][1] == mesh.global_box[0][1]) ++box[0][1];
+  if (box[1][1] > box[1][0] && box[1][1] == mesh.global_box[1][1]) ++box[1][1];
+  if (box[2][1] > box[2][0] && box[2][1] == mesh.global_box[2][1]) ++box[2][1];
+
+  GlobalOrdinal nrows = get_num_ids<GlobalOrdinal>(box);
+  //rows[k] and row_coords[3k..3k+2] describe the k-th owned node, in the visiting order used below.
+  std::vector<GlobalOrdinal> rows(nrows);
+  std::vector<Scalar> row_coords(nrows*3);
+
+  unsigned roffset = 0;
+  //Visit the owned nodes with x varying fastest, then y, then z.
+  for(int iz=box[2][0]; iz<box[2][1]; ++iz) {
+   for(int iy=box[1][0]; iy<box[1][1]; ++iy) {
+    for(int ix=box[0][0]; ix<box[0][1]; ++ix) {
+      GlobalOrdinal row_id =
+          get_id<GlobalOrdinal>(global_nodes_x, global_nodes_y, global_nodes_z,
+                                ix, iy, iz);
+      Scalar x, y, z;
+      get_coords(row_id, global_nodes_x, global_nodes_y, global_nodes_z, x, y, z);
+
+      rows[roffset] = mesh.map_id_to_row(row_id);
+      row_coords[roffset*3] = x;
+      row_coords[roffset*3+1] = y;
+      row_coords[roffset*3+2] = z;
+      ++roffset;
+    }
+   }
+  }
+
+  if (x.local_size != rows.size() || x.local_size != nrows) {
+    throw std::runtime_error("verify_solution ERROR, size mismatch");
+  }
+
+  const int num_terms = 300;
+
+  err_info<Scalar> max_error;
+  max_error.err = 0.0;
+  //Track the node with the largest absolute error on this rank.
+  for(size_t i=0; i<rows.size(); ++i) {
+    Scalar computed_soln = x.coefs[i]; //read before the local Scalar x below shadows the vector argument
+    Scalar x = row_coords[i*3];
+    Scalar y = row_coords[i*3+1];
+    Scalar z = row_coords[i*3+2];
+    Scalar analytic_soln = 0.0;
+    //set exact boundary-conditions:
+    if (x == 1.0) {
+      //x==1 is first, we want soln to be 1 even around the edges
+      //of the x==1 plane where y and/or z may be 0 or 1...
+      analytic_soln = 1;
+    }
+    else if (x == 0.0 || y == 0.0 || z == 0.0) {
+      analytic_soln = 0;
+    }
+    else if (y == 1.0 || z == 1.0) {
+      analytic_soln = 0;
+    }
+    else {
+      analytic_soln = soln(x, y, z, num_terms, num_terms);
+    }
+
+#ifdef MINIFE_DEBUG
+std::cout<<"("<<x<<","<<y<<","<<z<<") row "<<rows[i]<<": computed: "<<computed_soln<<",  analytic: "<<analytic_soln<<std::endl;
+#endif
+    Scalar err = std::abs(analytic_soln - computed_soln);
+    if (err > max_error.err) {
+      max_error.err = err;
+      max_error.computed = computed_soln;
+      max_error.analytic = analytic_soln;
+      max_error.coords[0] = x;
+      max_error.coords[1] = y;
+      max_error.coords[2] = z;
+    }
+  }
+  //The global worst error is the maximum (not the sum) of the per-rank maxima, hence MPI_MAX.
+  double local_max_err = max_error.err; //MPI_DOUBLE below needs double buffers whatever Scalar is
+  double global_max_err = 0;
+#ifdef HAVE_MPI
+  MPI_Allreduce(&local_max_err, &global_max_err, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+#else
+  global_max_err = local_max_err;
+#endif
+  //Only the rank(s) whose local maximum equals the global maximum report.
+  if (local_max_err == global_max_err) {
+    if (max_error.err > 1.e-6) {
+      std::cout << "max absolute error is "<<max_error.err<<":"<<std::endl;
+      std::cout << "   at position ("<<max_error.coords[0]<<","<<max_error.coords[1]<<","<<max_error.coords[2]<<"), "<<std::endl;
+      std::cout << "   computed solution: "<<max_error.computed<<",  analytic solution: "<<max_error.analytic<<std::endl;
+    }
+    else {
+      std::cout << "solution matches analytic solution to within 1.e-6 or better."<<std::endl;
+    }
+  }
+}
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/fem/ElemData.hpp b/openmp-avx512/fem/ElemData.hpp
new file mode 100644
index 0000000..a77bfe8
--- /dev/null
+++ b/openmp-avx512/fem/ElemData.hpp
@@ -0,0 +1,64 @@
+#ifndef _ElemData_hpp_
+#define _ElemData_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <Hex8_enums.hpp>
+
+namespace miniFE {
+
+template<typename GlobalOrdinal, typename Scalar>
+struct ElemData { //per-element data for one Hex8 element, with every array stored inline
+  ElemData() : nodes_per_elem(Hex8::numNodesPerElem) {}
+  ~ElemData(){}
+
+  const size_t nodes_per_elem; //always Hex8::numNodesPerElem (set by the constructor)
+  GlobalOrdinal elem_node_ids[Hex8::numNodesPerElem]; //one id per element node
+  Scalar grad_vals[Hex8::numGaussPointsPerDim * Hex8::numGaussPointsPerDim * Hex8::numGaussPointsPerDim * Hex8::numNodesPerElem * Hex8::spatialDim];
+  Scalar elem_node_coords[Hex8::numNodesPerElem*Hex8::spatialDim]; //spatialDim coordinates per node
+  Scalar elem_diffusion_matrix[(Hex8::numNodesPerElem*(Hex8::numNodesPerElem+1))/2]; //n*(n+1)/2 entries: one triangle incl. diagonal; presumably symmetric packed storage
+  Scalar elem_source_vector[Hex8::numNodesPerElem]; //one entry per element node
+};
+
+template<typename GlobalOrdinal, typename Scalar>
+struct ElemDataPtr { //like ElemData, but the diffusion matrix and source vector are pointers instead of inline arrays
+  ElemDataPtr() : nodes_per_elem(Hex8::numNodesPerElem) {}
+  ~ElemDataPtr(){} //does not free the two pointers; the constructor leaves them uninitialized
+
+  const size_t nodes_per_elem; //always Hex8::numNodesPerElem (set by the constructor)
+  GlobalOrdinal elem_node_ids[Hex8::numNodesPerElem]; //one id per element node
+  Scalar grad_vals[Hex8::numGaussPointsPerDim * Hex8::numGaussPointsPerDim * Hex8::numGaussPointsPerDim * Hex8::numNodesPerElem * Hex8::spatialDim];
+  Scalar elem_node_coords[Hex8::numNodesPerElem*Hex8::spatialDim]; //spatialDim coordinates per node, same extent as ElemData::elem_node_coords
+  Scalar* elem_diffusion_matrix; //storage supplied by the user of this struct
+  Scalar* elem_source_vector; //storage supplied by the user of this struct
+};
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/fem/Hex8.hpp b/openmp-avx512/fem/Hex8.hpp
new file mode 100644
index 0000000..d2cd4f2
--- /dev/null
+++ b/openmp-avx512/fem/Hex8.hpp
@@ -0,0 +1,417 @@
+#ifndef _Hex8_hpp_
+#define _Hex8_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#ifndef KERNEL_PREFIX 
+#define KERNEL_PREFIX
+#endif
+
+#include <gauss_pts.hpp>
+#include <matrix_algebra_3x3.hpp>
+#include <Hex8_enums.hpp>
+
+namespace miniFE {
+
+namespace Hex8 {
+
+template<typename Scalar>
+KERNEL_PREFIX void shape_fns(const Scalar* x, Scalar* values_at_nodes)
+{
+  //Evaluates the eight Hex8 shape functions at the point x.
+  //  x               : 3 coordinates (spatialDim is hard-coded to 3 here)
+  //  values_at_nodes : output, numNodesPerElem (8) entries
+  const Scalar xi = x[0], eta = x[1], zeta = x[2];
+
+  //"lo" factors are (1 - coordinate), "hi" factors are (1 + coordinate)
+  const Scalar xlo = 1.0 - xi,   xhi = 1.0 + xi;
+  const Scalar ylo = 1.0 - eta,  yhi = 1.0 + eta;
+  const Scalar zlo = 1.0 - zeta, zhi = 1.0 + zeta;
+
+  //the factors used by each node are spelled out next to its line
+  values_at_nodes[0] = 0.125 * xlo * ylo * zlo;//(1-x)*(1-y)*(1-z)
+  values_at_nodes[1] = 0.125 * xhi * ylo * zlo;//(1+x)*(1-y)*(1-z)
+  values_at_nodes[2] = 0.125 * xhi * yhi * zlo;//(1+x)*(1+y)*(1-z)
+  values_at_nodes[3] = 0.125 * xlo * yhi * zlo;//(1-x)*(1+y)*(1-z)
+  values_at_nodes[4] = 0.125 * xlo * ylo * zhi;//(1-x)*(1-y)*(1+z)
+  values_at_nodes[5] = 0.125 * xhi * ylo * zhi;//(1+x)*(1-y)*(1+z)
+  values_at_nodes[6] = 0.125 * xhi * yhi * zhi;//(1+x)*(1+y)*(1+z)
+  values_at_nodes[7] = 0.125 * xlo * yhi * zhi;//(1-x)*(1+y)*(1+z)
+}
+
+template<typename Scalar>
+KERNEL_PREFIX void gradients(const Scalar* x, Scalar* values_per_fn)
+{
+  //Evaluates the x-, y- and z-derivatives of the eight Hex8 shape functions
+  //at x (3 coordinates) and stores them as 8 triples in values_per_fn
+  //(24 entries = numNodesPerElem*spatialDim, with spatialDim == 3).
+  const Scalar xlo = 1.0 - x[0];
+  const Scalar ylo = 1.0 - x[1];
+  const Scalar zlo = 1.0 - x[2];
+
+  const Scalar xhi = 1.0 + x[0];
+  const Scalar yhi = 1.0 + x[1];
+  const Scalar zhi = 1.0 + x[2];
+
+//shape function 0
+  values_per_fn[0] = -0.125 * ylo * zlo;
+  values_per_fn[1] = -0.125 * xlo * zlo;
+  values_per_fn[2] = -0.125 * xlo * ylo;
+//shape function 1
+  values_per_fn[3] =  0.125 * ylo * zlo;
+  values_per_fn[4] = -0.125 * xhi * zlo;
+  values_per_fn[5] = -0.125 * xhi * ylo;
+//shape function 2
+  values_per_fn[6] =  0.125 * yhi * zlo;
+  values_per_fn[7] =  0.125 * xhi * zlo;
+  values_per_fn[8] = -0.125 * xhi * yhi;
+//shape function 3
+  values_per_fn[9]  = -0.125 * yhi * zlo;
+  values_per_fn[10] =  0.125 * xlo * zlo;
+  values_per_fn[11] = -0.125 * xlo * yhi;
+//shape function 4
+  values_per_fn[12] = -0.125 * ylo * zhi;
+  values_per_fn[13] = -0.125 * xlo * zhi;
+  values_per_fn[14] =  0.125 * xlo * ylo;
+//shape function 5
+  values_per_fn[15] =  0.125 * ylo * zhi;
+  values_per_fn[16] = -0.125 * xhi * zhi;
+  values_per_fn[17] =  0.125 * xhi * ylo;
+//shape function 6
+  values_per_fn[18] =  0.125 * yhi * zhi;
+  values_per_fn[19] =  0.125 * xhi * zhi;
+  values_per_fn[20] =  0.125 * xhi * yhi;
+//shape function 7
+  values_per_fn[21] = -0.125 * yhi * zhi;
+  values_per_fn[22] =  0.125 * xlo * zhi;
+  values_per_fn[23] =  0.125 * xlo * yhi;
+}
+
+template<typename Scalar>
+KERNEL_PREFIX void gradients_and_detJ(const Scalar* elemNodeCoords,
+                                          const Scalar* grad_vals,
+                                          Scalar& detJ)
+{
+/**
+  Computes the determinant of the 3x3 Jacobian J built from one set of
+  shape-function gradients and the element's node coordinates.
+
+  elemNodeCoords : numNodesPerElem*spatialDim values, 3 per node
+  grad_vals      : numNodesPerElem*spatialDim values, 3 per node
+  detJ           : output, receives det(J)
+  (both input arrays are only read)
+
+  J[r][c] is the sum over all nodes of
+    grad_vals[3*node + r] * elemNodeCoords[3*node + c].
+  The loops over r and c are unrolled by hand, which hard-codes the
+  assumption that spatialDim == 3.
+*/
+
+  const Scalar zero = 0;
+
+  Scalar j00 = zero, j01 = zero, j02 = zero;
+  Scalar j10 = zero, j11 = zero, j12 = zero;
+  Scalar j20 = zero, j21 = zero, j22 = zero;
+
+  for(size_t node=0; node<numNodesPerElem; ++node) {
+    //first entry of this node's triple in both input arrays
+    const size_t base = node*spatialDim;
+
+    const Scalar g0 = grad_vals[base+0];
+    const Scalar g1 = grad_vals[base+1];
+    const Scalar g2 = grad_vals[base+2];
+
+    const Scalar c0 = elemNodeCoords[base+0];
+    const Scalar c1 = elemNodeCoords[base+1];
+    const Scalar c2 = elemNodeCoords[base+2];
+
+    //row 0 of J
+    j00 += g0*c0;
+    j01 += g0*c1;
+    j02 += g0*c2;
+
+    //row 1 of J
+    j10 += g1*c0;
+    j11 += g1*c1;
+    j12 += g1*c2;
+
+    //row 2 of J
+    j20 += g2*c0;
+    j21 += g2*c1;
+    j22 += g2*c2;
+  }
+
+  //cofactor expansion along the first column of J
+  const Scalar minor0 = j22*j11 - j21*j12;
+  const Scalar minor1 = j22*j01 - j21*j02;
+  const Scalar minor2 = j12*j01 - j11*j02;
+
+  detJ = j00*minor0 - j10*minor1 + j20*minor2;
+}
+
+template<typename Scalar>
+KERNEL_PREFIX void gradients_and_invJ_and_detJ(const Scalar* elemNodeCoords,
+                                               const Scalar* grad_vals,
+                                               Scalar* invJ,
+                                               Scalar& detJ)
+{
+/**
+  Builds the 3x3 Jacobian J from one set of shape-function gradients and
+  the element's node coordinates, then produces det(J) and the inverse of J.
+
+  elemNodeCoords : numNodesPerElem*spatialDim values, 3 per node
+  grad_vals      : numNodesPerElem*spatialDim values, 3 per node
+  invJ           : output, spatialDim*spatialDim entries; entry (r,c) of
+                   the inverse is stored at invJ[3*r + c]
+  detJ           : output, receives det(J)
+
+  J[r][c] is the sum over all nodes of
+    grad_vals[3*node + r] * elemNodeCoords[3*node + c].
+  The loops over r and c are unrolled by hand, which hard-codes the
+  assumption that spatialDim == 3.
+
+  detJ is not checked against zero before it is divided by.
+*/
+
+  const Scalar zero = 0;
+
+  //Step 1: accumulate the jacobian J.
+  Scalar j00 = zero, j01 = zero, j02 = zero;
+  Scalar j10 = zero, j11 = zero, j12 = zero;
+  Scalar j20 = zero, j21 = zero, j22 = zero;
+
+  for(size_t node=0; node<numNodesPerElem; ++node) {
+    //first entry of this node's triple in both input arrays
+    const size_t base = node*spatialDim;
+
+    const Scalar g0 = grad_vals[base+0];
+    const Scalar g1 = grad_vals[base+1];
+    const Scalar g2 = grad_vals[base+2];
+
+    const Scalar c0 = elemNodeCoords[base+0];
+    const Scalar c1 = elemNodeCoords[base+1];
+    const Scalar c2 = elemNodeCoords[base+2];
+
+    //row 0 of J
+    j00 += g0*c0;
+    j01 += g0*c1;
+    j02 += g0*c2;
+
+    //row 1 of J
+    j10 += g1*c0;
+    j11 += g1*c1;
+    j12 += g1*c2;
+
+    //row 2 of J
+    j20 += g2*c0;
+    j21 += g2*c1;
+    j22 += g2*c2;
+  }
+
+  //Step 2: determinant by cofactor expansion along the first column.
+  const Scalar minor0 = j22*j11 - j21*j12;
+  const Scalar minor1 = j22*j01 - j21*j02;
+  const Scalar minor2 = j12*j01 - j11*j02;
+
+  detJ = j00*minor0 - j10*minor1 + j20*minor2;
+
+  //Step 3: inverse = signed 2x2 minors scaled by 1/det.
+  const Scalar scale = 1.0/detJ;
+
+  invJ[0] =  minor0*scale;
+  invJ[1] = -minor1*scale;
+  invJ[2] =  minor2*scale;
+
+  invJ[3] = -(j22*j10 - j20*j12)*scale;
+  invJ[4] =  (j22*j00 - j20*j02)*scale;
+  invJ[5] = -(j12*j00 - j10*j02)*scale;
+
+  invJ[6] =  (j21*j10 - j20*j11)*scale;
+  invJ[7] = -(j21*j00 - j20*j01)*scale;
+  invJ[8] =  (j11*j00 - j10*j01)*scale;
+}
+
+template<typename Scalar>
+KERNEL_PREFIX void diffusionMatrix_symm(const Scalar* elemNodeCoords,
+                        const Scalar* grad_vals,
+                        Scalar* elem_mat)
+{
+  //Computes the symmetric element diffusion matrix of one Hex8 element.
+  //
+  //  elemNodeCoords : numNodesPerElem*spatialDim node coordinates
+  //  grad_vals      : shape-function gradients for every Gauss point, one
+  //                   slab of numNodesPerElem*spatialDim values per point,
+  //                   in the order the ig/jg/kg loops below visit them
+  //  elem_mat       : output, (numNodesPerElem*(numNodesPerElem+1))/2
+  //                   entries. Only entries (m,n) with n >= m are stored,
+  //                   row after row.
+  //
+  //elem_mat is zeroed first and then accumulated into, so its previous
+  //contents are discarded.
+
+  int len = (numNodesPerElem * (numNodesPerElem+1))/2;
+  const Scalar zero = 0;
+  miniFE::fill(elem_mat, elem_mat+len, zero);
+
+  //Only the weights are read below; the point locations are not needed
+  //because grad_vals is supplied already evaluated.
+  Scalar gpts[numGaussPointsPerDim];
+  Scalar gwts[numGaussPointsPerDim];
+
+  gauss_pts(numGaussPointsPerDim, gpts, gwts);
+
+  const Scalar k = 1.0;
+  Scalar detJ = 0.0;
+
+  Scalar dpsidx[numNodesPerElem], dpsidy[numNodesPerElem], dpsidz[numNodesPerElem];
+
+  Scalar invJ[spatialDim*spatialDim];
+
+  //The following nested loop implements equations 3.4.5 and 3.4.7 on page 88
+  //of Reddy & Gartling, "The Finite Element Method in Heat Transfer and Fluid
+  //Dynamics", 2nd edition,
+  //to compute the element diffusion matrix for the steady conduction equation.
+
+#ifdef MINIFE_DEBUG
+  Scalar volume = zero;
+#endif
+
+  size_t gv_offset = 0;
+  for(size_t ig=0; ig<numGaussPointsPerDim; ++ig) {
+    Scalar wi = gwts[ig];
+
+    for(size_t jg=0; jg<numGaussPointsPerDim; ++jg) {
+      Scalar wi_wj = wi*gwts[jg];
+
+      for(size_t kg=0; kg<numGaussPointsPerDim; ++kg) {
+        Scalar wi_wj_wk = wi_wj*gwts[kg];
+        //gradients for this Gauss point; the offset then moves to the next slab
+        const Scalar* grad_vals_ptr = &grad_vals[gv_offset];
+        gv_offset += numNodesPerElem*spatialDim;
+        gradients_and_invJ_and_detJ(elemNodeCoords, grad_vals_ptr, invJ, detJ);
+
+#ifdef MINIFE_DEBUG
+        volume += detJ;
+#endif
+        Scalar k_detJ_wi_wj_wk = k*detJ*wi_wj_wk;
+
+        //combine each node's gradient triple with the three rows of invJ
+        const Scalar* gv = grad_vals_ptr;
+        for(size_t i=0; i<numNodesPerElem; ++i) {
+          Scalar gv0 = gv[0], gv1 = gv[1], gv2 = gv[2];
+          dpsidx[i] = gv0 * invJ[0] +
+                      gv1 * invJ[1] +
+                      gv2 * invJ[2];
+          dpsidy[i] = gv0 * invJ[3] +
+                      gv1 * invJ[4] +
+                      gv2 * invJ[5];
+          dpsidz[i] = gv0 * invJ[6] +
+                      gv1 * invJ[7] +
+                      gv2 * invJ[8];
+          gv += spatialDim;
+        }
+
+        //accumulate this point's contribution into the packed triangle
+        size_t offset = 0;
+        for(size_t m=0; m<numNodesPerElem; ++m) {
+          const Scalar dpsidx_m = dpsidx[m];
+          const Scalar dpsidy_m = dpsidy[m];
+          const Scalar dpsidz_m = dpsidz[m];
+
+          for(size_t n=m; n<numNodesPerElem; ++n) { //n == m is the diagonal entry
+            elem_mat[offset++] += k_detJ_wi_wj_wk *
+                                  ((dpsidx_m * dpsidx[n]) +
+                                   (dpsidy_m * dpsidy[n]) +
+                                   (dpsidz_m * dpsidz[n]));
+          }
+        }
+      }//for kg
+    }//for jg
+  }//for ig
+
+#ifdef MINIFE_DEBUG
+  //volume (the sum of detJ over the Gauss points) is kept for debugging only
+  (void)volume;
+#endif
+}
+
+template<typename Scalar>
+KERNEL_PREFIX void sourceVector(const Scalar* elemNodeCoords,
+                                const Scalar* grad_vals,
+                                Scalar* elem_vec)
+{
+  //Computes the element source vector of one Hex8 element.
+  //  elemNodeCoords : numNodesPerElem*spatialDim node coordinates
+  //  grad_vals      : one slab of numNodesPerElem*spatialDim gradient values
+  //                   per Gauss point, in the loop order used below
+  //  elem_vec       : output, numNodesPerElem entries (zeroed first)
+
+  int len = numNodesPerElem;
+  const Scalar zero = 0;
+  miniFE::fill(elem_vec, elem_vec+len, zero);
+
+  Scalar points[numGaussPointsPerDim];
+  Scalar weights[numGaussPointsPerDim];
+  gauss_pts(numGaussPointsPerDim, points, weights);
+
+  Scalar Q = 1.0; //source term, fixed at 1
+
+  Scalar psi[numNodesPerElem]; //shape-function values at the current point
+  Scalar pt[spatialDim];       //coordinates of the current Gauss point
+
+  size_t slab = 0; //start of the current point's gradients in grad_vals
+  for(size_t ig=0; ig<numGaussPointsPerDim; ++ig) {
+    pt[0] = points[ig];
+
+    for(size_t jg=0; jg<numGaussPointsPerDim; ++jg) {
+      pt[1] = points[jg];
+
+      for(size_t kg=0; kg<numGaussPointsPerDim; ++kg) {
+        pt[2] = points[kg];
+
+        shape_fns(pt, psi);
+        Scalar detJ;
+        gradients_and_detJ(elemNodeCoords, grad_vals + slab, detJ);
+        slab += numNodesPerElem*spatialDim;
+
+        const Scalar term = Q*detJ*weights[ig]*weights[jg]*weights[kg];
+
+        for(int i=0; i<numNodesPerElem; ++i) {
+          elem_vec[i] += psi[i]*term;
+        }
+      }
+    }
+  }
+}
+
+}//namespace Hex8
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/fem/Hex8_ElemData.hpp b/openmp-avx512/fem/Hex8_ElemData.hpp
new file mode 100644
index 0000000..f4789c6
--- /dev/null
+++ b/openmp-avx512/fem/Hex8_ElemData.hpp
@@ -0,0 +1,86 @@
+#ifndef _Hex8_ElemData_hpp_
+#define _Hex8_ElemData_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <Hex8_enums.hpp>
+#include <Hex8.hpp>
+#include <ElemData.hpp>
+
+namespace miniFE {
+
+template<typename Scalar>
+void compute_gradient_values(Scalar* grad_vals)
+{
+  //Fills grad_vals with the Hex8 shape-function gradients at every Gauss
+  //point: one slab of numNodesPerElem*spatialDim values per point, with
+  //the kg index varying fastest.
+  Scalar gpts[Hex8::numGaussPointsPerDim];
+  Scalar gwts[Hex8::numGaussPointsPerDim]; //filled by gauss_pts, not read here
+  gauss_pts(Hex8::numGaussPointsPerDim, gpts, gwts);
+
+  Scalar pt[Hex8::spatialDim];
+  size_t slab = 0; //start of the slab written next
+
+  for(size_t ig=0; ig<Hex8::numGaussPointsPerDim; ++ig) {
+    pt[0] = gpts[ig];
+    for(size_t jg=0; jg<Hex8::numGaussPointsPerDim; ++jg) {
+      pt[1] = gpts[jg];
+      for(size_t kg=0; kg<Hex8::numGaussPointsPerDim; ++kg) {
+        pt[2] = gpts[kg];
+        Hex8::gradients(pt, grad_vals + slab);
+        slab += Hex8::numNodesPerElem*Hex8::spatialDim;
+      }
+    }
+  }
+}
+
+template<typename GlobalOrdinal,typename Scalar>
+void
+compute_element_matrix_and_vector(ElemData<GlobalOrdinal,Scalar>& elem_data)
+{ //fills elem_data's diffusion matrix, then its source vector, from its coordinates and gradients
+  const Scalar* coords = elem_data.elem_node_coords;
+  const Scalar* grads = elem_data.grad_vals;
+  Hex8::diffusionMatrix_symm(coords, grads, elem_data.elem_diffusion_matrix);
+  Hex8::sourceVector(coords, grads, elem_data.elem_source_vector);
+}
+
+template<typename GlobalOrdinal,typename Scalar>
+void
+compute_element_matrix_and_vector(ElemDataPtr<GlobalOrdinal,Scalar>& elem_data)
+{ //same as the ElemData overload; the results go to the buffers elem_data's two pointers refer to
+  const Scalar* coords = elem_data.elem_node_coords;
+  const Scalar* grads = elem_data.grad_vals;
+  Hex8::diffusionMatrix_symm(coords, grads, elem_data.elem_diffusion_matrix);
+  Hex8::sourceVector(coords, grads, elem_data.elem_source_vector);
+}
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/fem/Hex8_enums.hpp b/openmp-avx512/fem/Hex8_enums.hpp
new file mode 100644
index 0000000..3dfac26
--- /dev/null
+++ b/openmp-avx512/fem/Hex8_enums.hpp
@@ -0,0 +1,52 @@
+#ifndef _Hex8_enums_hpp_
+#define _Hex8_enums_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+namespace miniFE {
+
+namespace Hex8 {
+
+//   !!!!!!!
+//Important note: there are places in miniFE code where
+//loops over spatialDim are unrolled (spatialDim is assumed to be 3).
+//Thus, changing this enum is not enough to make miniFE code
+//work for spatialDim values other than 3.
+//   !!!!!!!
+enum {
+  spatialDim = 3,  // coordinates per node
+  numNodesPerElem = 8,  // nodes in one Hex8 element
+  numGaussPointsPerDim = 2  // Gauss points along each axis; element arrays are sized with this value cubed
+};
+
+}//namespace Hex8
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/fem/analytic_soln.hpp b/openmp-avx512/fem/analytic_soln.hpp
new file mode 100644
index 0000000..2d130e2
--- /dev/null
+++ b/openmp-avx512/fem/analytic_soln.hpp
@@ -0,0 +1,116 @@
+#ifndef _analytic_soln_hpp_
+#define _analytic_soln_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <cmath>
+
+#ifndef MINIFE_SCALAR  // fall back to double when the build does not select a scalar type
+#define MINIFE_SCALAR double
+#endif  // no ';' after the type name: the macro is expanded inside a typedef
+
+namespace miniFE {
+
+typedef MINIFE_SCALAR Scalar;
+
+// The 'soln' function below computes the analytic solution for
+// steady state temperature in a brick-shaped domain (formally called
+// a rectangular parallelepiped). The inputs to the function are
+// the x,y,z coordinates of the point at which temperature is to be
+// computed, and the number of terms p,q in the series expansion.
+//
+// The equations used for the temperature solution are equations 9 and 10
+// in section 6.2 of Carslaw & Jaeger, "Conduction of Heat in Solids".
+//
+// The parallelepiped being used is defined by this domain:
+// 0 <= x <= 1.0
+// 0 <= y <= 1.0
+// 0 <= z <= 1.0
+//
+// With boundary conditions prescribing the temperature to be 1.0 on
+// the x==1.0 face, and 0.0 on all other faces.
+//
+// Thus, in the equations from Carslaw & Jaeger, the following constants
+// are used:
+//
+// a == b == c == 1.0  (the extents of the domain)
+// v1 == 0.0           (temperature at x == 0.0)
+// v2 == 1.0           (temperature at x == 1.0)
+//
+
+const Scalar PI = 3.141592653589793238462;  // pi
+const Scalar PI_SQR = PI*PI;  // pi squared, used by fcn_l
+const Scalar term0 = 16.0/(PI_SQR);  // factor applied to the series sum returned by soln
+
+inline Scalar fcn_l(int p, int q) {  // sqrt((2p+1)^2*pi^2 + (2q+1)^2*pi^2)
+  const int odd_p = 2*p+1, odd_q = 2*q+1;
+  return std::sqrt(odd_p*odd_p*PI_SQR + odd_q*odd_q*PI_SQR);
+}
+
+inline Scalar fcn(int n, Scalar u) {  // (2n+1)*pi*u, the argument soln passes to std::sin
+  const int odd_n = 2*n+1;
+  return odd_n*PI*u;
+}
+
+inline Scalar soln(Scalar x, Scalar y, Scalar z, int max_p, int max_q)
+{
+  //Evaluates the truncated double series for the analytic temperature at
+  //(x,y,z): terms p = 0..max_p and q = 0..max_q are summed and the total
+  //is scaled by term0.
+  //A NaN term ends the q loop for the current p, as explained below.
+
+  Scalar series = 0;
+  for(int p=0; p<=max_p; ++p) {
+    const Scalar y_factor = std::sin(fcn(p, y))/(2*p+1);
+
+    for(int q=0; q<=max_q; ++q) {
+      const Scalar z_factor = std::sin(fcn(q, z))/(2*q+1);
+
+      const Scalar l = fcn_l(p, q);
+      const Scalar numer = std::sinh(l*x);
+      const Scalar denom = std::sinh(l);
+
+      const Scalar contribution = (numer*y_factor)*(z_factor/denom);
+
+      //once l is large enough sinh(l) becomes inf, and the contribution
+      //is then a NaN. NaN is the only value that compares unequal to
+      //itself, which is what this crude check relies on.
+      if (contribution != contribution) {
+        //drop the remaining q terms for this p and continue with the
+        //next iteration of the outer loop.
+        break;
+      }
+      series += contribution;
+    }
+  }
+  return term0*series;
+}
+
+}//namespace miniFE
+
+#endif /* _analytic_soln_hpp_ */
diff --git a/openmp-avx512/fem/gauss_pts.hpp b/openmp-avx512/fem/gauss_pts.hpp
new file mode 100644
index 0000000..7652839
--- /dev/null
+++ b/openmp-avx512/fem/gauss_pts.hpp
@@ -0,0 +1,67 @@
+#ifndef _gauss_pts_hpp_
+#define _gauss_pts_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#ifndef KERNEL_PREFIX 
+#define KERNEL_PREFIX
+#endif
+
+namespace miniFE {
+
+template<typename Scalar>
+inline
+KERNEL_PREFIX void gauss_pts(int N, Scalar* pts, Scalar* wts)
+{
+  // Gauss-Legendre points and weights on [-1,1] for N = 1, 2 or 3; pts and
+  // wts need N entries each. Constants are given to full double precision.
+  const Scalar x2 = 0.57735026918962576451; // 1.0/sqrt(3.0)
+  const Scalar x3 = 0.77459666924148337704; // sqrt(3.0/5.0)
+  const Scalar w1 = 5.0/9.0;
+  const Scalar w2 = 8.0/9.0;
+
+  switch(N) {
+  case 1:
+    pts[0] = 0.0; wts[0] = 2.0;
+    break;
+  case 2:
+    pts[0] = -x2; wts[0] = 1.0;
+    pts[1] = x2;  wts[1] = 1.0;
+    break;
+  case 3:
+    pts[0] =  -x3;  wts[0] = w1;
+    pts[1] =  0.0;  wts[1] = w2;
+    pts[2] =   x3;  wts[2] = w1;
+    break;
+  default: break; // unsupported N: pts and wts are left untouched
+  }
+}
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/fem/matrix_algebra_3x3.hpp b/openmp-avx512/fem/matrix_algebra_3x3.hpp
new file mode 100644
index 0000000..012ae82
--- /dev/null
+++ b/openmp-avx512/fem/matrix_algebra_3x3.hpp
@@ -0,0 +1,166 @@
+#ifndef _matrix_algebra_3x3_hpp_
+#define _matrix_algebra_3x3_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#ifndef KERNEL_PREFIX
+#define KERNEL_PREFIX
+#endif
+
+namespace miniFE {
+
+template<typename Scalar>
+#ifdef __CUDACC__
+  __host__ __device__
+#endif
+KERNEL_PREFIX void fill(Scalar* begin, Scalar* end, const Scalar& val)
+{ //assigns val to every element of the range [begin, end)
+  for(Scalar* cur = begin; cur != end; ++cur) { *cur = val; }
+}
+
+template<typename Scalar>
+KERNEL_PREFIX void inverse_and_determinant3x3(const Scalar* J, Scalar* invJ, Scalar& detJ)
+{
+  //Computes det(J) and the inverse of the 3x3 matrix J.
+  //  J    : 9 entries, entry (r,c) read from J[3*r + c]
+  //  invJ : output, 9 entries, indexed the same way as J
+  //  detJ : output, receives det(J)
+  //The "3x3" in the function name is what fixes both lengths at 9.
+  //detJ is not checked against zero before it is divided by.
+
+  //a<r><c> is entry (r,c) of J; all nine are copied before any write
+  const Scalar a00 = J[0], a01 = J[1], a02 = J[2];
+  const Scalar a10 = J[3], a11 = J[4], a12 = J[5];
+  const Scalar a20 = J[6], a21 = J[7], a22 = J[8];
+
+  //2x2 minors shared by the determinant and the first row of the inverse
+  const Scalar minor0 = a22*a11 - a21*a12;
+  const Scalar minor1 = a22*a01 - a21*a02;
+  const Scalar minor2 = a12*a01 - a11*a02;
+
+  detJ = a00*minor0 - a10*minor1 + a20*minor2;
+
+  //every entry of the inverse is a signed minor times this factor
+  const Scalar scale = 1.0/detJ;
+
+  invJ[0] =  minor0*scale;
+  invJ[1] = -minor1*scale;
+  invJ[2] =  minor2*scale;
+
+  invJ[3] = -(a22*a10 - a20*a12)*scale;
+  invJ[4] =  (a22*a00 - a20*a02)*scale;
+  invJ[5] = -(a12*a00 - a10*a02)*scale;
+
+  invJ[6] =  (a21*a10 - a20*a11)*scale;
+  invJ[7] = -(a21*a00 - a20*a01)*scale;
+  invJ[8] =  (a11*a00 - a10*a01)*scale;
+}
+
+template<typename Scalar>
+KERNEL_PREFIX void matmat3x3(const Scalar* A, const Scalar* B, Scalar* C)
+{
+  //C = A*B for 3x3 matrices stored with contiguous columns (entry (i,j)
+  //lives at index i + 3*j); all three arguments have 9 entries.
+  //Entries of C are written while A and B are still being read, so C
+  //must not overlap A or B.
+  //Every entry of C is assigned below, so C is not zero-filled first.
+
+  for(int i=0; i<3; ++i) {
+    for(int j=0; j<3; ++j) {
+      C[i+j*3] = A[i+0]*B[j*3+0]
+               + A[i+3]*B[j*3+1]
+               + A[i+6]*B[j*3+2];
+    }
+  }
+}
+
+template<typename Scalar>
+KERNEL_PREFIX Scalar determinant3x3(const Scalar* J)
+{
+  //Returns the determinant of the 3x3 matrix J (9 entries; entry (r,c) is
+  //read from J[3*r + c]).
+
+  const Scalar a00 = J[0];
+  const Scalar a01 = J[1];
+  const Scalar a02 = J[2];
+
+  const Scalar a10 = J[3];
+  const Scalar a11 = J[4];
+  const Scalar a12 = J[5];
+
+  const Scalar a20 = J[6];
+  const Scalar a21 = J[7];
+  const Scalar a22 = J[8];
+
+  //cofactor expansion along the first column
+  const Scalar minor0 = a22*a11 - a21*a12;
+  const Scalar minor1 = a22*a01 - a21*a02;
+  const Scalar minor2 = a12*a01 - a11*a02;
+
+  return a00*minor0 - a10*minor1 + a20*minor2;
+}
+
+template<typename Scalar>
+KERNEL_PREFIX void matmat3x3_X_3xn(const Scalar* A, int n, const Scalar* B, Scalar* C)
+{
+  //C = A*B where A is 3x3 and B is 3xn, so C is 3xn as well.
+  //All three are stored with contiguous columns: column j of B and of C
+  //starts at index 3*j.
+  //Each entry of C is stored before the next one reads B again, so an
+  //in-place call (C == B) would see partially updated values.
+
+  for(int j=0; j<n; ++j) {
+    const int col = 3*j;
+    C[col+0] = A[0]*B[col+0] + A[3]*B[col+1] + A[6]*B[col+2];
+    C[col+1] = A[1]*B[col+0] + A[4]*B[col+1] + A[7]*B[col+2];
+    C[col+2] = A[2]*B[col+0] + A[5]*B[col+1] + A[8]*B[col+2];
+  }
+}
+
+template<typename Scalar>
+KERNEL_PREFIX void matTransMat3x3_X_3xn(const Scalar* A, int n, const Scalar* B, Scalar* C)
+{
+  //C = transpose(A)*B where A is 3x3 and B is 3xn, so C is 3xn as well.
+  //All three are stored with contiguous columns: column j of B and of C
+  //starts at index 3*j.
+  //Each entry of C is stored before the next one reads B again, so an
+  //in-place call (C == B) would see partially updated values.
+
+  for(int j=0; j<n; ++j) {
+    const int col = 3*j;
+    C[col+0] = A[0]*B[col+0] + A[1]*B[col+1] + A[2]*B[col+2];
+    C[col+1] = A[3]*B[col+0] + A[4]*B[col+1] + A[5]*B[col+2];
+    C[col+2] = A[6]*B[col+0] + A[7]*B[col+1] + A[8]*B[col+2];
+  }
+}
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/fem/verify_solution.hpp b/openmp-avx512/fem/verify_solution.hpp
new file mode 100644
index 0000000..52d4815
--- /dev/null
+++ b/openmp-avx512/fem/verify_solution.hpp
@@ -0,0 +1,179 @@
+#ifndef _verify_solution_hpp_
+#define _verify_solution_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <sstream>
+#include <stdexcept>
+#include <map>
+#include <algorithm>
+
+#include <simple_mesh_description.hpp>
+#include <analytic_soln.hpp>
+#include <box_utils.hpp>
+#include <utils.hpp>
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+namespace miniFE {
+
+template<typename Scalar>
+struct err_info {
+  Scalar err;        //absolute difference between analytic and computed value
+  Scalar computed;   //computed solution value at the recorded point
+  Scalar analytic;   //analytic solution value at the recorded point
+  Scalar coords[3];  //x, y, z coordinates of the recorded point
+};
+
+//Compares the computed solution x with the analytic solution at the selected
+//locally-owned nodes; returns 0 if the global max error is within tolerance, else 1.
+template<typename VectorType>
+int verify_solution(const simple_mesh_description<typename VectorType::GlobalOrdinalType>& mesh,
+                    const VectorType& x, double tolerance, bool verify_whole_domain = false)
+{
+  typedef typename VectorType::GlobalOrdinalType GlobalOrdinal;
+  typedef typename VectorType::ScalarType Scalar;
+
+  int global_nodes_x = mesh.global_box[0][1]+1;
+  int global_nodes_y = mesh.global_box[1][1]+1;
+  int global_nodes_z = mesh.global_box[2][1]+1;
+  Box box;
+  copy_box(mesh.local_box, box);
+
+  //num-owned-nodes in each dimension is num-elems+1
+  //only if num-elems > 0 in that dimension *and*
+  //we are at the high end of the global range in that dimension:
+  if (box[0][1] > box[0][0] && box[0][1] == mesh.global_box[0][1]) ++box[0][1];
+  if (box[1][1] > box[1][0] && box[1][1] == mesh.global_box[1][1]) ++box[1][1];
+  if (box[2][1] > box[2][0] && box[2][1] == mesh.global_box[2][1]) ++box[2][1];
+
+  std::vector<GlobalOrdinal> rows;
+  std::vector<Scalar> row_coords;
+
+  int roffset = 0;
+  for(int iz=box[2][0]; iz<box[2][1]; ++iz) {
+   for(int iy=box[1][0]; iy<box[1][1]; ++iy) {
+    for(int ix=box[0][0]; ix<box[0][1]; ++ix) {
+      GlobalOrdinal row_id =
+          get_id<GlobalOrdinal>(global_nodes_x, global_nodes_y, global_nodes_z,
+                                ix, iy, iz);
+      //(px,py,pz) rather than (x,y,z): avoids shadowing the solution vector 'x'
+      Scalar px, py, pz;
+      get_coords(row_id, global_nodes_x, global_nodes_y, global_nodes_z, px, py, pz);
+
+      //unless the whole domain is requested, only points whose coordinates
+      //are all within 0.05 of 0.5 are verified:
+      const bool verify_this_point = verify_whole_domain ||
+          (std::abs(px - 0.5) < 0.05 && std::abs(py - 0.5) < 0.05 && std::abs(pz - 0.5) < 0.05);
+
+      if (verify_this_point) {
+        rows.push_back(roffset);
+        row_coords.push_back(px);
+        row_coords.push_back(py);
+        row_coords.push_back(pz);
+      }
+
+      ++roffset;
+    }
+   }
+  }
+
+  const int num_terms = 300;
+
+  //value-initialized so that no field of max_error is ever read uninitialized:
+  err_info<Scalar> max_error = err_info<Scalar>();
+
+  //find the largest absolute error over the selected points:
+  for(size_t i=0; i<rows.size(); ++i) {
+    Scalar computed_soln = x.coefs[rows[i]];
+    Scalar px = row_coords[i*3];
+    Scalar py = row_coords[i*3+1];
+    Scalar pz = row_coords[i*3+2];
+    Scalar analytic_soln = 0.0;
+    //set exact boundary-conditions:
+    if (px == 1.0) {
+      //x==1 is first, we want soln to be 1 even around the edges
+      //of the x==1 plane where y and/or z may be 0 or 1...
+      analytic_soln = 1;
+    }
+    else if (px == 0.0 || py == 0.0 || pz == 0.0) {
+      analytic_soln = 0;
+    }
+    else if (py == 1.0 || pz == 1.0) {
+      analytic_soln = 0;
+    }
+    else {
+      analytic_soln = soln(px, py, pz, num_terms, num_terms);
+    }
+
+#ifdef MINIFE_DEBUG_VERBOSE
+std::cout<<"("<<px<<","<<py<<","<<pz<<") row "<<rows[i]<<": computed: "<<computed_soln<<",  analytic: "<<analytic_soln<<std::endl;
+#endif
+    Scalar err = std::abs(analytic_soln - computed_soln);
+    if (err > max_error.err) {
+      max_error.err = err;
+      max_error.computed = computed_soln;
+      max_error.analytic = analytic_soln;
+      max_error.coords[0] = px;
+      max_error.coords[1] = py;
+      max_error.coords[2] = pz;
+    }
+  }
+
+  //The reduction is done on double buffers so that they always match the
+  //MPI_DOUBLE datatype passed to MPI_Allreduce, whatever Scalar is.
+  double local_max_err = max_error.err;
+  double global_max_err = 0;
+#ifdef HAVE_MPI
+  MPI_Allreduce(&local_max_err, &global_max_err, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+#else
+  global_max_err = local_max_err;
+#endif
+
+  //only a process whose local maximum equals the global maximum prints the report:
+  if (local_max_err == global_max_err) {
+    if (max_error.err > tolerance) {
+      std::cout << "max absolute error is "<<max_error.err<<":"<<std::endl;
+      std::cout << "   at position ("<<max_error.coords[0]<<","<<max_error.coords[1]<<","<<max_error.coords[2]<<"), "<<std::endl;
+      std::cout << "   computed solution: "<<max_error.computed<<",  analytic solution: "<<max_error.analytic<<std::endl;
+    }
+    else {
+      std::cout << "solution matches analytic solution to within "<<tolerance<<" or better."<<std::endl;
+    }
+  }
+
+  //a non-zero return code tells the caller that the tolerance was exceeded:
+  return (global_max_err > tolerance) ? 1 : 0;
+}
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/src/CSRMatrix.hpp b/openmp-avx512/src/CSRMatrix.hpp
new file mode 100644
index 0000000..abebacb
--- /dev/null
+++ b/openmp-avx512/src/CSRMatrix.hpp
@@ -0,0 +1,146 @@
+#ifndef _CSRMatrix_hpp_
+#define _CSRMatrix_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <cstddef>
+#include <vector>
+#include <algorithm>
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+namespace miniFE {
+
+//CSR sparse matrix: the column indices and coefficients of local row i occupy
+//positions [row_offsets[i], row_offsets[i+1]) of packed_cols and packed_coefs.
+template<typename Scalar,
+         typename LocalOrdinal,
+         typename GlobalOrdinal>
+struct CSRMatrix {
+  CSRMatrix()
+   : has_local_indices(false),
+     rows(), row_offsets(), row_offsets_external(),
+     packed_cols(), packed_coefs(),
+     num_cols(0)
+#ifdef HAVE_MPI
+     ,external_index(), external_local_index(), elements_to_send(),
+      neighbors(), recv_length(), send_length(), send_buffer(), request()
+#endif
+  {}
+
+  ~CSRMatrix() {}
+
+  typedef Scalar        ScalarType;
+  typedef LocalOrdinal  LocalOrdinalType;
+  typedef GlobalOrdinal GlobalOrdinalType;
+
+  bool                       has_local_indices;
+  std::vector<GlobalOrdinal> rows;
+  std::vector<LocalOrdinal>  row_offsets;
+  std::vector<LocalOrdinal>  row_offsets_external;
+  std::vector<GlobalOrdinal> packed_cols;
+  std::vector<Scalar>        packed_coefs;
+  LocalOrdinal               num_cols;
+
+#ifdef HAVE_MPI
+  std::vector<GlobalOrdinal> external_index;
+  std::vector<GlobalOrdinal>  external_local_index;
+  std::vector<GlobalOrdinal> elements_to_send;
+  std::vector<int>           neighbors;
+  std::vector<LocalOrdinal>  recv_length;
+  std::vector<LocalOrdinal>  send_length;
+  std::vector<Scalar>        send_buffer;
+  std::vector<MPI_Request>   request;
+#endif
+
+  //Number of stored entries (the last row offset); 0 if no offsets exist yet.
+  size_t num_nonzeros() const
+  {
+    return row_offsets.empty() ? size_t(0) : static_cast<size_t>(row_offsets.back());
+  }
+
+  void reserve_space(unsigned nrows, unsigned ncols_per_row)
+  {
+    rows.resize(nrows);
+    row_offsets.resize(nrows+1);
+    packed_cols.reserve(nrows * ncols_per_row);
+    packed_coefs.reserve(nrows * ncols_per_row);
+
+    #pragma omp parallel for
+    for(MINIFE_GLOBAL_ORDINAL i = 0; i < nrows; ++i) {
+      rows[i] = 0;
+      row_offsets[i] = 0;
+    }
+    //NOTE(review): the loop below writes storage that was only reserved (size() is still 0) - confirm intent.
+    #pragma omp parallel for
+    for(MINIFE_GLOBAL_ORDINAL i = 0; i < (nrows * ncols_per_row); ++i) {
+      packed_cols[i] = 0;
+      packed_coefs[i] = 0;
+    }
+  }
+
+  void get_row_pointers(GlobalOrdinalType row, size_t& row_length,
+                        GlobalOrdinalType*& cols,
+                        ScalarType*& coefs)
+  {
+    ptrdiff_t local_row = -1;
+    //first see if we can get the local-row index using fast direct lookup:
+    if (rows.size() >= 1) {
+      ptrdiff_t idx = row - rows[0];
+      if (idx >= 0 && static_cast<size_t>(idx) < rows.size() && rows[idx] == row) {
+        local_row = idx;
+      }
+    }
+
+    //if we didn't get the local-row index using direct lookup, try a
+    //more expensive binary-search:
+    if (local_row == -1) {
+      typename std::vector<GlobalOrdinal>::iterator row_iter =
+          std::lower_bound(rows.begin(), rows.end(), row);
+
+      //if we still haven't found row, it's not local so jump out:
+      if (row_iter == rows.end() || *row_iter != row) {
+        row_length = 0;
+        return;
+      }
+
+      local_row = row_iter - rows.begin();
+    }
+    //hand out pointers into the packed arrays for the located row:
+    LocalOrdinalType offset = row_offsets[local_row];
+    row_length = row_offsets[local_row+1] - offset;
+    cols = &packed_cols[offset];
+    coefs = &packed_coefs[offset];
+  }
+};
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/src/ELLMatrix.hpp b/openmp-avx512/src/ELLMatrix.hpp
new file mode 100644
index 0000000..f405f6d
--- /dev/null
+++ b/openmp-avx512/src/ELLMatrix.hpp
@@ -0,0 +1,139 @@
+#ifndef _ELLMatrix_hpp_
+#define _ELLMatrix_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <cstddef>
+#include <vector>
+#include <algorithm>
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+namespace miniFE {
+
+//Sparse matrix in ELL layout: every row owns a fixed-width slot of
+//num_cols_per_row entries in the flat cols/coefs arrays.
+template<typename Scalar,
+         typename LocalOrdinal,
+         typename GlobalOrdinal>
+struct ELLMatrix {
+  ELLMatrix()
+   : has_local_indices(false),
+     rows(),
+     cols(), coefs(),
+     num_cols(0),
+     num_cols_per_row(0)
+#ifdef HAVE_MPI
+     ,external_index(), external_local_index(), elements_to_send(),
+      neighbors(), recv_length(), send_length(), send_buffer(), request()
+#endif
+  {}
+
+  ~ELLMatrix() {}
+
+  typedef Scalar        ScalarType;
+  typedef LocalOrdinal  LocalOrdinalType;
+  typedef GlobalOrdinal GlobalOrdinalType;
+
+  bool                       has_local_indices;
+  std::vector<GlobalOrdinal> rows;
+  std::vector<GlobalOrdinal> cols;
+  std::vector<Scalar>        coefs;
+  LocalOrdinal               num_cols;
+  LocalOrdinal               num_cols_per_row;
+
+#ifdef HAVE_MPI
+  std::vector<GlobalOrdinal> external_index;
+  std::vector<GlobalOrdinal>  external_local_index;
+  std::vector<GlobalOrdinal> elements_to_send;
+  std::vector<int>           neighbors;
+  std::vector<LocalOrdinal>  recv_length;
+  std::vector<LocalOrdinal>  send_length;
+  std::vector<Scalar>        send_buffer;
+  std::vector<MPI_Request>   request;
+#endif
+
+  //Counts every slot of every row, whether or not it holds an entry.
+  size_t num_nonzeros() const
+  {
+    return rows.size()*num_cols_per_row;
+  }
+
+  void reserve_space(unsigned nrows, unsigned ncols_per_row)
+  {
+    const unsigned num_slots = nrows * ncols_per_row;
+    rows.resize(nrows);
+    cols.resize(num_slots);
+    coefs.resize(num_slots);
+    num_cols_per_row = ncols_per_row;
+  }
+
+  void get_row_pointers(GlobalOrdinalType row, size_t& row_length,
+                        GlobalOrdinalType*& cols_ptr,
+                        ScalarType*& coefs_ptr)
+  {
+    ptrdiff_t local_row = -1;
+    //cheap guess first: if 'row' is stored at offset (row - rows[0]) we are done:
+    if (!rows.empty()) {
+      const ptrdiff_t guess = row - rows[0];
+      if (guess >= 0 && static_cast<size_t>(guess) < rows.size() && rows[guess] == row) {
+        local_row = guess;
+      }
+    }
+
+    //the guess missed, so fall back to a more expensive binary search:
+    if (local_row < 0) {
+      const typename std::vector<GlobalOrdinal>::iterator found =
+          std::lower_bound(rows.begin(), rows.end(), row);
+
+      //not a local row: report length 0 and leave both pointers untouched
+      if (found == rows.end() || *found != row) {
+        row_length = 0;
+        return;
+      }
+
+      local_row = found - rows.begin();
+    }
+
+    cols_ptr = &cols[local_row*num_cols_per_row];
+    coefs_ptr = &coefs[local_row*num_cols_per_row];
+
+    //the row length is the position just past the last non-zero column index:
+    int last = num_cols_per_row-1;
+    for(; last>=0; --last) {
+      if (cols_ptr[last] != 0) break;
+    }
+    row_length = last+1;
+  }
+};
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/src/GetNodesCoords.hpp b/openmp-avx512/src/GetNodesCoords.hpp
new file mode 100644
index 0000000..5278dd1
--- /dev/null
+++ b/openmp-avx512/src/GetNodesCoords.hpp
@@ -0,0 +1,51 @@
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#ifndef _GETNODESCOORDS_HPP_
+#define _GETNODESCOORDS_HPP_
+
+#include <Hex8_enums.hpp>
+#include <simple_mesh_description.hpp>
+
+template<typename GlobalOrdinal,typename Scalar>
+struct GetNodesCoords {
+  const miniFE::simple_mesh_description<GlobalOrdinal>* mesh;
+  GlobalOrdinal* elemIDs;
+  GlobalOrdinal* node_ordinals;
+  Scalar* elem_node_coords;
+
+//Fills the i-th slice of node_ordinals and of elem_node_coords for element elemIDs[i].
+inline void operator()(int i)
+{
+  const unsigned nodes_per_elem = miniFE::Hex8::numNodesPerElem;
+  GlobalOrdinal* ords_out = node_ordinals+i*nodes_per_elem;
+  Scalar* coords_out = elem_node_coords+i*nodes_per_elem*miniFE::Hex8::spatialDim;
+  get_elem_nodes_and_coords(*mesh, elemIDs[i], ords_out, coords_out);
+}
+};
+
+#endif
diff --git a/openmp-avx512/src/Hex8_box_utils.hpp b/openmp-avx512/src/Hex8_box_utils.hpp
new file mode 100644
index 0000000..da38684
--- /dev/null
+++ b/openmp-avx512/src/Hex8_box_utils.hpp
@@ -0,0 +1,173 @@
+#ifndef _Hex8_box_utils_hpp_
+#define _Hex8_box_utils_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <stdexcept>
+
+#include <box_utils.hpp>
+#include <ElemData.hpp>
+#include <simple_mesh_description.hpp>
+#include <Hex8.hpp>
+
+namespace miniFE {
+
+
+template<typename GlobalOrdinal>
+void get_hex8_node_ids(int nx, int ny,
+                       GlobalOrdinal node0, GlobalOrdinal* elem_node_ids)
+{
+//Computes the 8 node ids of a hex8 (exodus ordering) from its local-node-0 id
+//'node0' and the box dimensions. Strides use GlobalOrdinal so nx*ny is not done in 'int'.
+  const GlobalOrdinal row_stride = static_cast<GlobalOrdinal>(nx);
+  const GlobalOrdinal plane_stride = row_stride * ny;
+  elem_node_ids[0] = node0;
+  elem_node_ids[1] = node0 + 1;
+  elem_node_ids[2] = node0 + row_stride + 1;
+  elem_node_ids[3] = node0 + row_stride;
+  elem_node_ids[4] = node0 +     plane_stride;
+  elem_node_ids[5] = node0 + 1 + plane_stride;
+  elem_node_ids[6] = node0 + row_stride + plane_stride + 1;
+  elem_node_ids[7] = node0 + row_stride + plane_stride;
+}
+
+template<typename Scalar>
+void get_hex8_node_coords_3d(Scalar x, Scalar y, Scalar z,
+                             Scalar hx, Scalar hy, Scalar hz,
+                             Scalar* elem_node_coords)
+{
+  //Input: (x,y,z) is the position of local-node 0 of a Hex8; hx, hy, hz are
+  //the lengths of the element's sides in each direction.
+  //Output: one (x,y,z) triple per node, 24 values in total.
+
+  const Scalar x_hi = x + hx;
+  const Scalar y_hi = y + hy;
+  const Scalar z_hi = z + hz;
+
+  //nodes 0-3: the four corners at z
+  elem_node_coords[0]  = x;
+  elem_node_coords[1]  = y;
+  elem_node_coords[2]  = z;
+  elem_node_coords[3]  = x_hi;
+  elem_node_coords[4]  = y;
+  elem_node_coords[5]  = z;
+  elem_node_coords[6]  = x_hi;
+  elem_node_coords[7]  = y_hi;
+  elem_node_coords[8]  = z;
+  elem_node_coords[9]  = x;
+  elem_node_coords[10] = y_hi;
+  elem_node_coords[11] = z;
+
+  //nodes 4-7: the same four corners at z + hz
+  elem_node_coords[12] = x;
+  elem_node_coords[13] = y;
+  elem_node_coords[14] = z_hi;
+  elem_node_coords[15] = x_hi;
+  elem_node_coords[16] = y;
+  elem_node_coords[17] = z_hi;
+  elem_node_coords[18] = x_hi;
+  elem_node_coords[19] = y_hi;
+  elem_node_coords[20] = z_hi;
+  elem_node_coords[21] = x;
+  elem_node_coords[22] = y_hi;
+  elem_node_coords[23] = z_hi;
+}
+
+//Fills node_ords (row numbers) and node_coords for the nodes of hex8 element elemID; throws std::runtime_error if elemID < 0.
+template<typename GlobalOrdinal, typename Scalar>
+void get_elem_nodes_and_coords(const simple_mesh_description<GlobalOrdinal>& mesh,
+                               GlobalOrdinal elemID,
+                               GlobalOrdinal* node_ords, Scalar* node_coords)
+{
+  int global_nodes_x = mesh.global_box[0][1]+1;
+  int global_nodes_y = mesh.global_box[1][1]+1;
+  int global_nodes_z = mesh.global_box[2][1]+1;
+
+  if (elemID < 0) {
+    //I don't think this can happen, but check for the sake of paranoia...
+    throw std::runtime_error("get_elem_nodes_and_coords ERROR, negative elemID");
+  }
+
+  int elem_int_x, elem_int_y, elem_int_z;
+  get_int_coords(elemID, global_nodes_x-1, global_nodes_y-1, global_nodes_z-1,
+             elem_int_x, elem_int_y, elem_int_z);
+  GlobalOrdinal nodeID = get_id<GlobalOrdinal>(global_nodes_x, global_nodes_y, global_nodes_z, elem_int_x, elem_int_y, elem_int_z);
+
+#ifdef MINIFE_DEBUG_VERBOSE
+  std::cout<<"\nelemID: "<<elemID<<", nodeID: "<<nodeID<<std::endl;
+#endif
+  get_hex8_node_ids(global_nodes_x, global_nodes_y, nodeID, node_ords);
+
+  //Map node-IDs to rows because each processor may have a non-contiguous block of
+  //node-ids, but needs a contiguous block of row-numbers:
+#ifdef MINIFE_DEBUG_VERBOSE
+  std::cout<<"elem "<<elemID<<" nodes: ";
+#endif
+  for(int i=0; i<Hex8::numNodesPerElem; ++i) {
+#ifdef MINIFE_DEBUG_VERBOSE
+    std::cout<<node_ords[i]<<" ";
+#endif
+    node_ords[i] = mesh.map_id_to_row(node_ords[i]);
+  }
+#ifdef MINIFE_DEBUG_VERBOSE
+  std::cout << std::endl;
+#endif
+
+  int global_elems_x = mesh.global_box[0][1];
+  int global_elems_y = mesh.global_box[1][1];
+  int global_elems_z = mesh.global_box[2][1];
+
+  Scalar ix,iy,iz;
+  get_coords<GlobalOrdinal,Scalar>(nodeID, global_nodes_x,global_nodes_y,global_nodes_z,
+                            ix,iy,iz);
+  Scalar hx = 1.0/global_elems_x;
+  Scalar hy = 1.0/global_elems_y;
+  Scalar hz = 1.0/global_elems_z;
+  get_hex8_node_coords_3d(ix, iy, iz, hx, hy, hz, node_coords);
+#ifdef MINIFE_DEBUG_VERBOSE
+  int offset = 0; //advanced in the loop header: several offset++ in one << chain are unsequenced before C++17
+  for(int i=0; i<Hex8::numNodesPerElem; ++i, offset += 3) {
+    std::cout << "("<<node_coords[offset]<<","<<node_coords[offset+1]<<","<<node_coords[offset+2]<<")";
+  }
+  std::cout << std::endl;
+#endif
+}
+
+template<typename GlobalOrdinal, typename Scalar>
+void
+get_elem_nodes_and_coords(const simple_mesh_description<GlobalOrdinal>& mesh,
+                          GlobalOrdinal elemID,
+                          ElemData<GlobalOrdinal,Scalar>& elem_data)
+{ //Convenience overload: forwards elem_data.elem_node_ids and elem_data.elem_node_coords as the output arrays.
+  get_elem_nodes_and_coords(mesh, elemID, elem_data.elem_node_ids, elem_data.elem_node_coords);
+}
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/src/Makefile b/openmp-avx512/src/Makefile
new file mode 100644
index 0000000..4b2888e
--- /dev/null
+++ b/openmp-avx512/src/Makefile
@@ -0,0 +1,42 @@
+#-----------------------------------------------------------------------
+# This file compiles for OpenMP and MPI hybrid operations using the GNU
+# compile chain.
+
+MINIFE_TYPES =  \
+        -DMINIFE_SCALAR=double   \
+        -DMINIFE_LOCAL_ORDINAL=int      \
+		-DMINIFE_GLOBAL_ORDINAL=int
+
+#MINIFE_MATRIX_TYPE = -DMINIFE_CSR_MATRIX
+#MINIFE_MATRIX_TYPE = -DMINIFE_ELL_MATRIX
+MINIFE_MATRIX_TYPE = -DMINIFE_SELL_MATRIX
+
+#-----------------------------------------------------------------------
+
+CFLAGS = -O3 -fopenmp
+CXXFLAGS = $(CFLAGS)
+
+CPPFLAGS = -I. -I../utils -I../fem $(MINIFE_TYPES) \
+	$(MINIFE_MATRIX_TYPE) \
+	-DHAVE_MPI -DMPICH_IGNORE_CXX_SEEK \
+	-DMINIFE_REPORT_RUSAGE
+
+LDFLAGS=$(CFLAGS)
+LIBS=
+
+# The MPICH_IGNORE_CXX_SEEK macro is required for some mpich versions,
+# such as the one on my cygwin machine.
+
+#CXX=mpiicpc
+#CC=mpiicc
+
+#CXX=g++
+#CC=g++
+
+#CXX=icpc
+#CC=icc
+
+CXX=mpicxx
+CC=mpicc
+
+include make_targets
diff --git a/openmp-avx512/src/Makefile.cray.xc30 b/openmp-avx512/src/Makefile.cray.xc30
new file mode 100644
index 0000000..2710a45
--- /dev/null
+++ b/openmp-avx512/src/Makefile.cray.xc30
@@ -0,0 +1,35 @@
+#-----------------------------------------------------------------------
+# ATTENTION:
+#
+# This file enables MPI in the binary: -DHAVE_MPI is already set in
+# CPPFLAGS below. Remove it from CPPFLAGS to build without MPI calls.
+#
+#-----------------------------------------------------------------------
+
+MINIFE_TYPES =  \
+        -DMINIFE_SCALAR=double   \
+        -DMINIFE_LOCAL_ORDINAL=int      \
+        -DMINIFE_GLOBAL_ORDINAL=int
+
+MINIFE_MATRIX_TYPE = -DMINIFE_CSR_MATRIX
+# MINIFE_MATRIX_TYPE = -DMINIFE_ELL_MATRIX
+
+#-----------------------------------------------------------------------
+
+MPIDIR=/opt/cray/mpt/default/gni/mpich2-cray64/81
+
+CFLAGS = -O3
+CXXFLAGS = $(CFLAGS) -I$(MPIDIR)/include
+
+CPPFLAGS = -I. -I../utils -I../fem $(MINIFE_TYPES) $(MINIFE_MATRIX_TYPE) \
+	-DHAVE_MPI
+
+LDFLAGS=-L$(MPIDIR)/lib
+LIBS=-lmpichcxx
+
+CXX=CC
+CC=cc
+
+#-----------------------------------------------------------------------
+
+include make_targets
diff --git a/openmp-avx512/src/Makefile.gnu.openmp b/openmp-avx512/src/Makefile.gnu.openmp
new file mode 100644
index 0000000..b377eac
--- /dev/null
+++ b/openmp-avx512/src/Makefile.gnu.openmp
@@ -0,0 +1,32 @@
+#-----------------------------------------------------------------------
+# ATTENTION:
+#
+# This file does not enable MPI in the binary, this is just OpenMP
+# only. To enable MPI calls add -DHAVE_MPI to CPPFLAGS
+#
+#-----------------------------------------------------------------------
+
+MINIFE_TYPES =  \
+        -DMINIFE_SCALAR=double   \
+        -DMINIFE_LOCAL_ORDINAL=int      \
+        -DMINIFE_GLOBAL_ORDINAL=int
+
+MINIFE_MATRIX_TYPE = -DMINIFE_CSR_MATRIX
+#MINIFE_MATRIX_TYPE = -DMINIFE_ELL_MATRIX
+
+#-----------------------------------------------------------------------
+
+CFLAGS = -O3 -fopenmp -fopt-info-loop-missed-optimized=results/gcc/vec_report
+CXXFLAGS = $(CFLAGS)
+
+CPPFLAGS = -I. -I../utils -I../fem $(MINIFE_TYPES) $(MINIFE_MATRIX_TYPE)
+
+LDFLAGS=
+LIBS=
+
+CXX=g++
+CC=gcc
+
+#-----------------------------------------------------------------------
+
+include make_targets
diff --git a/openmp-avx512/src/Makefile.intel.openmp b/openmp-avx512/src/Makefile.intel.openmp
new file mode 100644
index 0000000..823c211
--- /dev/null
+++ b/openmp-avx512/src/Makefile.intel.openmp
@@ -0,0 +1,34 @@
+#-----------------------------------------------------------------------
+# ATTENTION:
+#
+# This file does not enable MPI in the binary, this is just OpenMP
+# only. To enable MPI calls add -DHAVE_MPI to CPPFLAGS
+#
+#-----------------------------------------------------------------------
+
+MINIFE_TYPES =  \
+        -DMINIFE_SCALAR=double   \
+        -DMINIFE_LOCAL_ORDINAL=int      \
+        -DMINIFE_GLOBAL_ORDINAL=int
+
+# MINIFE_MATRIX_TYPE = -DMINIFE_CSR_MATRIX
+# MINIFE_MATRIX_TYPE = -DMINIFE_ELL_MATRIX
+MINIFE_MATRIX_TYPE = -DMINIFE_SELL_MATRIX
+
+
+#-----------------------------------------------------------------------
+
+CFLAGS = -O3 -fopenmp -march=native -mtune=native -DAVX512_ZMM
+CXXFLAGS = $(CFLAGS)
+
+CPPFLAGS = -I. -I../utils -I../fem $(MINIFE_TYPES) $(MINIFE_MATRIX_TYPE)
+
+LDFLAGS=
+LIBS=
+
+CXX=icpc
+CC=icc
+
+#-----------------------------------------------------------------------
+
+include make_targets
diff --git a/openmp-avx512/src/MatrixCopyOp.hpp b/openmp-avx512/src/MatrixCopyOp.hpp
new file mode 100644
index 0000000..9d24be0
--- /dev/null
+++ b/openmp-avx512/src/MatrixCopyOp.hpp
@@ -0,0 +1,60 @@
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#ifndef _MatrixCopyOp_hpp_
+#define _MatrixCopyOp_hpp_
+
+template<typename MatrixType>
+struct MatrixCopyOp {
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinalType;
+  typedef typename MatrixType::LocalOrdinalType LocalOrdinalType;
+  typedef typename MatrixType::ScalarType ScalarType;
+
+  const GlobalOrdinalType* src_rows;
+  const LocalOrdinalType*  src_rowoffsets;
+  const GlobalOrdinalType* src_cols;
+  const ScalarType*        src_coefs;
+
+  GlobalOrdinalType* dest_rows;
+  LocalOrdinalType*  dest_rowoffsets;
+  GlobalOrdinalType* dest_cols;
+  ScalarType*        dest_coefs;
+  int n;
+
+  //Copies row i: its id, its start offset, and its entries [src_rowoffsets[i], src_rowoffsets[i+1]).
+  inline void operator()(int i) {
+    dest_rows[i] = src_rows[i];
+    dest_rowoffsets[i] = src_rowoffsets[i];
+    for(int entry=src_rowoffsets[i]; entry<src_rowoffsets[i+1]; ++entry) {
+      dest_cols[entry] = src_cols[entry];
+      dest_coefs[entry] = src_coefs[entry];
+    }
+  }
+};
+
+#endif
+
diff --git a/openmp-avx512/src/MatrixInitOp.hpp b/openmp-avx512/src/MatrixInitOp.hpp
new file mode 100644
index 0000000..bc29082
--- /dev/null
+++ b/openmp-avx512/src/MatrixInitOp.hpp
@@ -0,0 +1,320 @@
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#ifndef _MatrixInitOp_hpp_
+#define _MatrixInitOp_hpp_
+
+#include <simple_mesh_description.hpp>
+#include <box_utils.hpp>
+
+#include <CSRMatrix.hpp>
+#include <ELLMatrix.hpp>
+#include <SELLMatrix.hpp>
+
+#include <algorithm>
+
+// Sorts the list_len values list[0], list[stride], list[2*stride], ... into ascending
+// order, in place, if they are not already ordered. Only the slots at multiples of
+// stride are touched, so a SELL row interleaved with other rows can be sorted too.
+// stride must be non-zero (default 1 = contiguous list).
+template<typename GlobalOrdinal>
+void sort_if_needed(GlobalOrdinal* list,
+                    GlobalOrdinal list_len,
+                    int stride=1)
+{
+  // Insertion sort: a no-op pass over sorted input (the common case), cheap for the
+  // short rows this is used on, and it never forms list_len-1, so an empty list with
+  // an unsigned ordinal type cannot wrap around.
+  for(GlobalOrdinal i=1; i<list_len; ++i) {
+    const GlobalOrdinal value = list[i*stride];
+    GlobalOrdinal j = i;
+    for(; j>0 && value < list[(j-1)*stride]; --j) list[j*stride] = list[(j-1)*stride];
+    if (j != i) list[j*stride] = value;
+  }
+}
+
+template<typename MatrixType>
+struct MatrixInitOp { // empty primary template; the usable definitions are the explicit specializations for CSRMatrix, ELLMatrix and SELLMatrix
+};
+
+// MatrixInitOp for CSRMatrix. The constructor sizes the packed arrays; each operator()
+// call fills one row's id, offset and column ids and zeroes its coefficients.
+template<>
+struct MatrixInitOp<miniFE::CSRMatrix<MINIFE_SCALAR,MINIFE_LOCAL_ORDINAL,MINIFE_GLOBAL_ORDINAL> > {
+  MatrixInitOp(const std::vector<MINIFE_GLOBAL_ORDINAL>& rows_vec,
+               const std::vector<MINIFE_LOCAL_ORDINAL>& row_offsets_vec,
+               const std::vector<int>& row_coords_vec,
+               int global_nx, int global_ny, int global_nz,
+               MINIFE_GLOBAL_ORDINAL global_n_rows,
+               const miniFE::simple_mesh_description<MINIFE_GLOBAL_ORDINAL>& input_mesh,
+               miniFE::CSRMatrix<MINIFE_SCALAR,MINIFE_LOCAL_ORDINAL,MINIFE_GLOBAL_ORDINAL>& matrix)
+   : rows(&rows_vec[0]),
+     row_offsets(&row_offsets_vec[0]),
+     row_coords(&row_coords_vec[0]),
+     global_nodes_x(global_nx),
+     global_nodes_y(global_ny),
+     global_nodes_z(global_nz),
+     global_nrows(global_n_rows),
+     dest_rows(&matrix.rows[0]),
+     dest_rowoffsets(&matrix.row_offsets[0]),
+     dest_cols(&matrix.packed_cols[0]),
+     dest_coefs(&matrix.packed_coefs[0]),
+     n(matrix.rows.size()),
+     mesh(&input_mesh)   // listed last to match the member declaration order
+  {
+    if (matrix.packed_cols.capacity() != matrix.packed_coefs.capacity()) {
+      std::cout<<"Warning, packed_cols.capacity ("<<matrix.packed_cols.capacity()<<") != "
+        << "packed_coefs.capacity ("<<matrix.packed_coefs.capacity()<<")"<<std::endl;
+    }
+
+    size_t nnz = row_offsets_vec[n];  // total entry count = offset one past the last row
+    if (matrix.packed_cols.capacity() < nnz) {
+      std::cout<<"Warning, packed_cols.capacity ("<<matrix.packed_cols.capacity()<<") < "
+        " nnz ("<<nnz<<")"<<std::endl;
+    }
+
+    matrix.packed_cols.resize(nnz);
+    matrix.packed_coefs.resize(nnz);
+    // resize() may reallocate (the case warned about above), which would leave the
+    // pointers taken in the initializer list dangling, so take them again.
+    if (nnz > 0) { dest_cols = &matrix.packed_cols[0]; dest_coefs = &matrix.packed_coefs[0]; }
+    dest_rowoffsets[n] = nnz;
+#ifdef HAVE_MPI 
+   MPI_Comm_rank(MPI_COMM_WORLD, &proc);
+#else
+   proc = 0;
+#endif
+  }
+
+  typedef MINIFE_GLOBAL_ORDINAL GlobalOrdinalType;
+  typedef MINIFE_LOCAL_ORDINAL LocalOrdinalType;
+  typedef MINIFE_SCALAR ScalarType;
+
+  const GlobalOrdinalType* rows;         // input row ids
+  const LocalOrdinalType*  row_offsets;  // input start offset of each row
+  const int*               row_coords;   // three ints (x, y, z) per row
+  int global_nodes_x;
+  int global_nodes_y;
+  int global_nodes_z;
+  GlobalOrdinalType global_nrows;
+  GlobalOrdinalType* dest_rows;
+  LocalOrdinalType*  dest_rowoffsets;
+  GlobalOrdinalType* dest_cols;
+  ScalarType*        dest_coefs;
+  int n;     // number of rows
+  int proc;  // MPI rank (0 without MPI); only used in the diagnostic message below
+  const miniFE::simple_mesh_description<GlobalOrdinalType>* mesh;
+  // Fills row i: for each of the 27 offsets around the row's node whose id passes the range check, stores the mapped row as a column id with coefficient 0.
+  inline void operator()(int i)
+  {
+    dest_rows[i] = rows[i];
+    int offset = row_offsets[i];
+    dest_rowoffsets[i] = offset;
+    int ix = row_coords[i*3];
+    int iy = row_coords[i*3+1];
+    int iz = row_coords[i*3+2];
+    GlobalOrdinalType nnz = 0;
+    for(int sz=-1; sz<=1; ++sz) {
+      for(int sy=-1; sy<=1; ++sy) {
+        for(int sx=-1; sx<=1; ++sx) {
+          GlobalOrdinalType col_id =
+              miniFE::get_id<GlobalOrdinalType>(global_nodes_x, global_nodes_y, global_nodes_z,
+                                   ix+sx, iy+sy, iz+sz);
+          if (col_id >= 0 && col_id < global_nrows) {
+            GlobalOrdinalType col = mesh->map_id_to_row(col_id);
+            if (col >= global_nrows) {
+              std::cout << "mesh->map_id_to_row produced col="<<col<<" from col_id="<<col_id<<", but global_nrows="<<global_nrows<<", max_row_in_map="<<mesh->max_row_in_map()<<", proc="<<proc<<std::endl;
+            }
+            dest_cols[offset+nnz] = col;
+            dest_coefs[offset+nnz] = 0;
+            ++nnz;
+          }
+        }
+      }
+    }
+    sort_if_needed(&dest_cols[offset], nnz);  // make sure the row's column ids are ascending
+  }
+};
+
+// MatrixInitOp for ELLMatrix: row i owns the ncols_per_row consecutive slots starting at
+// i*ncols_per_row in the matrix's cols/coefs arrays.
+template<>
+struct MatrixInitOp<miniFE::ELLMatrix<MINIFE_SCALAR,MINIFE_LOCAL_ORDINAL,MINIFE_GLOBAL_ORDINAL> > {
+  typedef MINIFE_GLOBAL_ORDINAL GlobalOrdinalType;
+  typedef MINIFE_LOCAL_ORDINAL LocalOrdinalType;
+  typedef MINIFE_SCALAR ScalarType;
+
+  // Stores raw pointers into the input vectors and into matrix's storage. The second
+  // argument (row offsets) is unnamed and unused by this specialization.
+  MatrixInitOp(const std::vector<MINIFE_GLOBAL_ORDINAL>& rows_vec,
+               const std::vector<MINIFE_LOCAL_ORDINAL>& /*row_offsets_vec*/,
+               const std::vector<int>& row_coords_vec,
+               int global_nx, int global_ny, int global_nz,
+               MINIFE_GLOBAL_ORDINAL global_n_rows,
+               const miniFE::simple_mesh_description<MINIFE_GLOBAL_ORDINAL>& input_mesh,
+               miniFE::ELLMatrix<MINIFE_SCALAR,MINIFE_LOCAL_ORDINAL,MINIFE_GLOBAL_ORDINAL>& matrix)
+   : rows(&rows_vec[0]), row_coords(&row_coords_vec[0]),
+     global_nodes_x(global_nx),
+     global_nodes_y(global_ny),
+     global_nodes_z(global_nz),
+     global_nrows(global_n_rows),
+     dest_rows(&matrix.rows[0]), dest_cols(&matrix.cols[0]), dest_coefs(&matrix.coefs[0]),
+     n(matrix.rows.size()), ncols_per_row(matrix.num_cols_per_row),
+     mesh(&input_mesh)
+  {}
+
+  const GlobalOrdinalType* rows;        // input row ids
+  const int*               row_coords;  // three ints (x, y, z) per row
+
+  int global_nodes_x;
+  int global_nodes_y;
+  int global_nodes_z;
+
+  GlobalOrdinalType global_nrows;
+
+  GlobalOrdinalType* dest_rows;
+  GlobalOrdinalType* dest_cols;
+  ScalarType*        dest_coefs;
+  int n;              // number of rows
+  int ncols_per_row;  // slots per row
+
+  const miniFE::simple_mesh_description<GlobalOrdinalType>* mesh;
+
+  // Fills row i: walks the 27 offsets (-1..1 in x, y and z) around the row's node and, for
+  // every id get_id() returns inside [0, global_nrows), appends the mapped row as a column
+  // id with coefficient 0. Finishes by handing the row's column ids to sort_if_needed().
+  inline void operator()(int i)
+  {
+    dest_rows[i] = rows[i];
+    const int base = i*ncols_per_row;
+    const int ix = row_coords[i*3], iy = row_coords[i*3+1], iz = row_coords[i*3+2];
+
+    GlobalOrdinalType count = 0;
+    // s enumerates the offsets with sx varying fastest, then sy, then sz.
+    for(int s=0; s<27; ++s) {
+      const int sx = s%3 - 1, sy = (s/3)%3 - 1, sz = s/9 - 1;
+      const GlobalOrdinalType col_id =
+          miniFE::get_id<GlobalOrdinalType>(global_nodes_x, global_nodes_y, global_nodes_z,
+                                            ix+sx, iy+sy, iz+sz);
+      if (col_id < 0 || col_id >= global_nrows) continue;  // id fails the range check
+
+      dest_cols[base+count] = mesh->map_id_to_row(col_id);
+      dest_coefs[base+count] = 0;
+      ++count;
+    }
+
+    sort_if_needed(&dest_cols[base], count);
+  }
+};
+
+
+// MatrixInitOp for SELLMatrix. Rows are grouped in blocks of nrows_per_block rows and the
+// entries of a block are interleaved: entry k of a row sits k*stride elements after the
+// row's first entry, where stride is the number of rows in that block.
+template<>
+struct MatrixInitOp<miniFE::SELLMatrix<MINIFE_SCALAR,MINIFE_LOCAL_ORDINAL,MINIFE_GLOBAL_ORDINAL> > {
+  typedef MINIFE_GLOBAL_ORDINAL GlobalOrdinalType;
+  typedef MINIFE_LOCAL_ORDINAL LocalOrdinalType;
+  typedef MINIFE_SCALAR ScalarType;
+
+  // Stores raw pointers into the input vectors and into matrix's storage. The second
+  // argument (row offsets) is unnamed and unused by this specialization.
+  MatrixInitOp(const std::vector<MINIFE_GLOBAL_ORDINAL>& rows_vec,
+               const std::vector<MINIFE_LOCAL_ORDINAL>& /*row_offsets_vec*/,
+               const std::vector<int>& row_coords_vec,
+               int global_nx, int global_ny, int global_nz,
+               MINIFE_GLOBAL_ORDINAL global_n_rows,
+               const miniFE::simple_mesh_description<MINIFE_GLOBAL_ORDINAL>& input_mesh,
+               miniFE::SELLMatrix<MINIFE_SCALAR,MINIFE_LOCAL_ORDINAL,MINIFE_GLOBAL_ORDINAL>& matrix)
+   : rows(&rows_vec[0]), row_coords(&row_coords_vec[0]),
+     global_nodes_x(global_nx),
+     global_nodes_y(global_ny),
+     global_nodes_z(global_nz),
+     global_nrows(global_n_rows),
+     dest_rows(&matrix.rows[0]),
+     dest_cols(&matrix.cols[0]),
+     dest_coefs(&matrix.coefs[0]),
+     n(matrix.rows.size()),
+     ncols_per_row(matrix.num_cols_per_row),
+     nrows_per_block(matrix.num_rows_per_block), n_blocks(matrix.num_blocks),
+     mesh(&input_mesh)
+  {}
+
+  const GlobalOrdinalType* rows;        // input row ids
+  const int*               row_coords;  // three ints (x, y, z) per row
+
+  int global_nodes_x;
+  int global_nodes_y;
+  int global_nodes_z;
+
+  GlobalOrdinalType global_nrows;
+
+  GlobalOrdinalType* dest_rows;
+  GlobalOrdinalType* dest_cols;
+  ScalarType*        dest_coefs;
+  int n;                // number of rows
+  int ncols_per_row;    // slots per row
+  int nrows_per_block;  // rows in a full block
+  int n_blocks;         // number of blocks
+
+  const miniFE::simple_mesh_description<GlobalOrdinalType>* mesh;
+
+  // Fills row i: walks the 27 offsets (-1..1 in x, y and z) around the row's node and, for
+  // every id get_id() returns inside [0, global_nrows), appends the mapped row as a column
+  // id with coefficient 0 at the next strided slot, then calls sort_if_needed() on the row.
+  inline void operator()(int i)
+  {
+    dest_rows[i] = rows[i];
+    const int block_id = i / nrows_per_block;
+    const int base = i%nrows_per_block + (block_id * nrows_per_block * ncols_per_row);
+    const int ix = row_coords[i*3], iy = row_coords[i*3+1], iz = row_coords[i*3+2];
+
+    // A last block that is only partially filled uses its actual row count as stride.
+    const int tail = n % nrows_per_block;
+    const int stride = (block_id == n_blocks - 1 && tail != 0) ? tail : nrows_per_block;
+
+    GlobalOrdinalType count = 0;
+    // s enumerates the offsets with sx varying fastest, then sy, then sz.
+    for(int s=0; s<27; ++s) {
+      const int sx = s%3 - 1, sy = (s/3)%3 - 1, sz = s/9 - 1;
+      const GlobalOrdinalType col_id =
+          miniFE::get_id<GlobalOrdinalType>(global_nodes_x, global_nodes_y, global_nodes_z,
+                                            ix+sx, iy+sy, iz+sz);
+      if (col_id < 0 || col_id >= global_nrows) continue;  // id fails the range check
+
+      dest_cols[base+count*stride] = mesh->map_id_to_row(col_id);
+      dest_coefs[base+count*stride] = 0;
+      ++count;
+    }
+
+    sort_if_needed(&dest_cols[base], count, stride);
+  }
+};
+
+
+#endif
+
diff --git a/openmp-avx512/src/README.md b/openmp-avx512/src/README.md
new file mode 100644
index 0000000..e2a916c
--- /dev/null
+++ b/openmp-avx512/src/README.md
@@ -0,0 +1,6 @@
+This implementation includes the sliced ELLPACK (SELL) matrix format, specifically optimized for AVX-512.
+
+MPI is not yet supported.
+
+Compile with:
+`make -f Makefile.intel.openmp`
diff --git a/openmp-avx512/src/SELLMatrix.hpp b/openmp-avx512/src/SELLMatrix.hpp
new file mode 100644
index 0000000..e2d2623
--- /dev/null
+++ b/openmp-avx512/src/SELLMatrix.hpp
@@ -0,0 +1,160 @@
+#ifndef _SELLMatrix_hpp_
+#define _SELLMatrix_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <cstddef>
+#include <vector>
+#include <algorithm>
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+#include <immintrin.h>
+
+namespace miniFE {
+
+// Sliced ELLPACK (SELL) sparse matrix. Rows are grouped into blocks of
+// num_rows_per_block rows and the entries of a block are interleaved, so consecutive
+// entries of one row are `stride` elements apart in cols/coefs (see get_row_pointers).
+template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal>
+struct SELLMatrix {
+  SELLMatrix()
+   : has_local_indices(false),
+     rows(),
+     cols(), coefs(),
+     num_cols(0),
+     num_cols_per_row(0),
+     num_blocks(0),   // was left indeterminate until reserve_space() ran
+     num_rows_per_block(0)
+#ifdef HAVE_MPI
+     ,external_index(), external_local_index(), elements_to_send(),
+      neighbors(), recv_length(), send_length(), send_buffer(), request()
+#endif
+  {}
+
+  ~SELLMatrix() {}
+
+  typedef Scalar        ScalarType;
+  typedef LocalOrdinal  LocalOrdinalType;
+  typedef GlobalOrdinal GlobalOrdinalType;
+
+  bool                       has_local_indices;
+  std::vector<GlobalOrdinal> rows;                // global id of each stored row
+  std::vector<GlobalOrdinal> cols;                // column ids, interleaved per block
+  std::vector<Scalar>        coefs;               // coefficients, same layout as cols
+  LocalOrdinal               num_cols;
+  LocalOrdinal               num_cols_per_row;    // slots reserved per row
+  LocalOrdinal               num_blocks;          // ceil(rows.size() / num_rows_per_block)
+  LocalOrdinal               num_rows_per_block;  // rows in a full block
+
+#ifdef HAVE_MPI
+  std::vector<GlobalOrdinal> external_index;
+  std::vector<GlobalOrdinal>  external_local_index;
+  std::vector<GlobalOrdinal> elements_to_send;
+  std::vector<int>           neighbors;
+  std::vector<LocalOrdinal>  recv_length;
+  std::vector<LocalOrdinal>  send_length;
+  std::vector<Scalar>        send_buffer;
+  std::vector<MPI_Request>   request;
+#endif
+
+  // Number of reserved entry slots (padding included), not the count of stored values.
+  size_t num_nonzeros() const
+  {
+    return rows.size()*num_cols_per_row;
+  }
+
+  // Sizes the storage for nrows rows of ncols_per_row slots each (new slots are zero)
+  // and picks the block height: 8 rows when AVX512_ZMM is defined, otherwise 4.
+  void reserve_space(unsigned nrows, unsigned ncols_per_row)
+  {
+    rows.resize(nrows);
+    cols.resize(nrows * ncols_per_row);
+    coefs.resize(nrows * ncols_per_row);
+    num_cols_per_row = ncols_per_row;
+#ifdef AVX512_ZMM
+    num_rows_per_block = 8;
+#else
+    num_rows_per_block = 4;
+#endif
+
+    num_blocks = (nrows + num_rows_per_block - 1) / num_rows_per_block;
+  }
+
+  // Locates global row `row`. On success sets stride, points cols_ptr/coefs_ptr at the
+  // row's first entry (entry j is at index j*stride) and sets row_length to one past the
+  // last slot whose column id is non-zero (zero ids at the tail count as padding).
+  // If the row is not stored here, row_length and stride are set to 0; pointers untouched.
+  void get_row_pointers(GlobalOrdinalType row, size_t& row_length, size_t& stride,
+                        GlobalOrdinalType*& cols_ptr,
+                        ScalarType*& coefs_ptr)
+  {
+    ptrdiff_t local_row = -1;
+    //first see if we can get the local-row index using fast direct lookup:
+    if (rows.size() >= 1) {
+      ptrdiff_t idx = row - rows[0];
+      if (idx >= 0 && static_cast<size_t>(idx) < rows.size() && rows[idx] == row) {
+        local_row = idx;
+      }
+    }
+
+    //if the direct lookup failed, try a more expensive binary-search:
+    if (local_row == -1) {
+      typename std::vector<GlobalOrdinal>::iterator row_iter =
+          std::lower_bound(rows.begin(), rows.end(), row);
+      //if we still haven't found row, it's not local so jump out:
+      if (row_iter == rows.end() || *row_iter != row) {
+        row_length = 0;
+        stride = 0;   // give the caller a defined value even on failure
+        return;
+      }
+      local_row = row_iter - rows.begin();
+    }
+
+    int block_id = local_row  / num_rows_per_block;
+    // Only a final, partially filled block uses a smaller stride: its actual row count.
+    const size_t tail_rows = rows.size() % num_rows_per_block;
+    stride = (tail_rows == 0 || block_id < num_blocks - 1) ? static_cast<size_t>(num_rows_per_block) : tail_rows;
+
+    cols_ptr = &cols[local_row % num_rows_per_block + block_id * num_rows_per_block * num_cols_per_row];
+    coefs_ptr = &coefs[local_row % num_rows_per_block + block_id * num_rows_per_block * num_cols_per_row];
+
+    int idx = num_cols_per_row-1;
+    while(idx>=0) {
+      if (cols_ptr[idx * stride] != 0) break;
+      --idx;
+    }
+    row_length = idx+1;
+  }
+};
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/src/SparseMatrix_functions.hpp b/openmp-avx512/src/SparseMatrix_functions.hpp
new file mode 100644
index 0000000..d79a283
--- /dev/null
+++ b/openmp-avx512/src/SparseMatrix_functions.hpp
@@ -0,0 +1,837 @@
+#ifndef _SparseMatrix_functions_hpp_
+#define _SparseMatrix_functions_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <cstddef>
+#include <vector>
+#include <set>
+#include <algorithm>
+#include <sstream>
+#include <fstream>
+
+#include <Vector.hpp>
+#include <Vector_functions.hpp>
+#include <ElemData.hpp>
+#include <MatrixInitOp.hpp>
+#include <MatrixCopyOp.hpp>
+#include <exchange_externals.hpp>
+#include <mytimer.hpp>
+
+#ifdef MINIFE_HAVE_TBB
+#include <LockingMatrix.hpp>
+#endif
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+namespace miniFE {
+
+// Builds a MatrixInitOp for M from the row/mesh description and applies it to every row
+// index 0..n-1 in ascending order; the loop is serial.
+template<typename MatrixType>
+void init_matrix(MatrixType& M,
+                 const std::vector<typename MatrixType::GlobalOrdinalType>& rows,
+                 const std::vector<typename MatrixType::LocalOrdinalType>& row_offsets,
+                 const std::vector<int>& row_coords,
+                 int global_nodes_x, int global_nodes_y, int global_nodes_z,
+                 typename MatrixType::GlobalOrdinalType global_nrows,
+                 const simple_mesh_description<typename MatrixType::GlobalOrdinalType>& mesh)
+{
+  MatrixInitOp<MatrixType> init_row(rows, row_offsets, row_coords,
+                                    global_nodes_x, global_nodes_y, global_nodes_z,
+                                    global_nrows, mesh, M);
+  int row = 0;
+  while (row < init_row.n) {
+    init_row(row++);
+  }
+}
+
+// Insertion-sorts array[0..len) into ascending order and applies the same permutation
+// to companions[0..len). Elements that compare equal keep their relative order.
+// The key is held in a T (it used to be squeezed through ptrdiff_t, which truncated
+// floating-point keys and could mangle values wider than ptrdiff_t).
+template<typename T, typename U>
+void sort_with_companions(ptrdiff_t len, T* array, U* companions)
+{
+  for (ptrdiff_t i=1; i < len; i++) {
+    const T key = array[i];
+    const U companion = companions[i];
+    ptrdiff_t j = i;
+    while ((j > 0) && (array[j-1] > key))
+    {
+      array[j] = array[j-1];
+      companions[j] = companions[j-1];
+      j = j - 1;
+    }
+    array[j] = key;
+    companions[j] = companion;
+  }
+}
+
+// Writes mat as text to the file "<filename>.<numprocs>.<myproc>": rank 0 first writes a
+// "<nrows> <nnz>" header line, then one "row col coef" line is written per entry that
+// get_row_pointers() reports. With MPI the ranks take turns, separated by barriers.
+template<typename MatrixType>
+void write_matrix(const std::string& filename, 
+                  MatrixType& mat)
+{
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinalType;
+  typedef typename MatrixType::ScalarType ScalarType;
+
+  int numprocs = 1, myproc = 0;
+#ifdef HAVE_MPI
+  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+#endif
+
+  std::ostringstream name_builder;
+  name_builder << filename << "." << numprocs << "." << myproc;
+  std::ofstream out(name_builder.str().c_str());
+
+  const size_t nrows = mat.rows.size();
+  const size_t nnz = mat.num_nonzeros();
+
+  // One turn per rank; only the rank whose turn it is writes.
+  for(int turn=0; turn<numprocs; ++turn) {
+    if (turn == myproc) {
+      if (turn == 0) {
+        out << nrows << " " << nnz << std::endl;
+      }
+      for(size_t r=0; r<nrows; ++r) {
+        size_t row_len = 0;
+        GlobalOrdinalType* cols = NULL;
+        ScalarType* coefs = NULL;
+#if defined(MINIFE_SELL_MATRIX)
+        size_t stride;  // SELL rows are interleaved: entry j lives at index j*stride
+        mat.get_row_pointers(mat.rows[r], row_len, stride, cols, coefs);
+#else
+        const size_t stride = 1;  // contiguous row
+        mat.get_row_pointers(mat.rows[r], row_len, cols, coefs);
+#endif
+        for(size_t j=0; j<row_len; ++j) {
+          out << mat.rows[r] << " " << cols[j*stride] << " " << coefs[j*stride] << std::endl;
+        }
+      }
+    }
+#ifdef HAVE_MPI
+    MPI_Barrier(MPI_COMM_WORLD);
+#endif
+  }
+}
+
+// Adds input_coefs[k] to the coefficient of the strided (SELL) row whose column id equals
+// input_indices[k], for each of the num_inputs inputs; inputs with no matching column are
+// ignored. Row entry j lives at row_indices[j*stride] / row_coefs[j*stride], and the
+// row's column ids must be in ascending order for the bisection to find them.
+template<typename GlobalOrdinal,typename Scalar>
+void
+sum_into_row_SELL_matrix(int row_len,
+             GlobalOrdinal* row_indices,
+             Scalar* row_coefs,
+             int stride,
+             int num_inputs,
+             const GlobalOrdinal* input_indices,
+             const Scalar* input_coefs)
+{
+  const size_t input_count = num_inputs;
+  for(size_t k=0; k<input_count; ++k) {
+    const GlobalOrdinal wanted = input_indices[k];
+    // Bisect [lo, hi] down to the first entry that is not less than wanted.
+    int lo = 0, hi = row_len-1;
+    while (lo < hi) {
+      const int mid = lo + (hi-lo) / 2;
+      if (row_indices[mid*stride] < wanted) lo = mid+1;
+      else hi = mid;
+    }
+    const int slot = lo * stride;
+    if (slot < row_len * stride && row_indices[slot] == wanted) {
+      #pragma omp atomic
+      row_coefs[slot] += input_coefs[k];
+    }
+  }
+}
+
+// Adds each input_coefs[k] to the entry of the contiguous, ascending-sorted row whose
+// column id equals input_indices[k]; inputs whose column is absent are skipped.
+template<typename GlobalOrdinal,typename Scalar>
+void
+sum_into_row(int row_len,
+             GlobalOrdinal* row_indices,
+             Scalar* row_coefs,
+             int num_inputs,
+             const GlobalOrdinal* input_indices,
+             const Scalar* input_coefs)
+{
+  GlobalOrdinal* const row_end = row_indices+row_len;
+  const size_t input_count = num_inputs;
+  for(size_t k=0; k<input_count; ++k) {
+    GlobalOrdinal* hit = std::lower_bound(row_indices, row_end, input_indices[k]);
+    if (hit == row_end || *hit != input_indices[k]) continue;  // column not in this row
+    #pragma omp atomic
+    row_coefs[hit-row_indices] += input_coefs[k];
+  }
+}
+
+// Adds coefs[k] into the entry (row, col_inds[k]) of mat for k in [0, num_indices).
+// Does nothing when get_row_pointers() reports no entries for row (e.g. row not local).
+// In the MINIFE_SELL_MATRIX build the row is strided, so the strided helper
+// sum_into_row_SELL_matrix is used; no 7-argument sum_into_row overload exists.
+template<typename MatrixType>
+void
+sum_into_row(typename MatrixType::GlobalOrdinalType row,
+             size_t num_indices,
+             const typename MatrixType::GlobalOrdinalType* col_inds,
+             const typename MatrixType::ScalarType* coefs,
+             MatrixType& mat)
+{
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal;
+  typedef typename MatrixType::ScalarType Scalar;
+  size_t row_len = 0;
+  GlobalOrdinal* mat_row_cols = NULL;
+  Scalar* mat_row_coefs = NULL;
+#if defined(MINIFE_SELL_MATRIX)
+  size_t stride = 0;
+  mat.get_row_pointers(row, row_len, stride, mat_row_cols, mat_row_coefs);
+  if (row_len == 0) return;
+  sum_into_row_SELL_matrix(row_len, mat_row_cols, mat_row_coefs, stride, num_indices, col_inds, coefs);
+#else
+  mat.get_row_pointers(row, row_len, mat_row_cols, mat_row_coefs);
+  if (row_len == 0) return;
+  sum_into_row(row_len, mat_row_cols, mat_row_coefs, num_indices, col_inds, coefs);
+#endif
+}
+
+// Sums a symmetric element matrix into mat. indices holds the num node ids of the
+// element; coefs holds the upper triangle packed row by row (num*(num+1)/2 values).
+// For each row i the stored part (columns indices[i..num)) is added first, then the
+// entries left of the diagonal, which are read from the earlier rows j < i.
+// Rows for which get_row_pointers() reports length 0 are skipped entirely.
+template<typename MatrixType>
+void
+sum_in_symm_elem_matrix(size_t num,
+                   const typename MatrixType::GlobalOrdinalType* indices,
+                   const typename MatrixType::ScalarType* coefs,
+                   MatrixType& mat)
+{
+  typedef typename MatrixType::ScalarType Scalar;
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal;
+
+  int upper_start = 0;  // position in coefs of the diagonal entry of the current row
+  for(size_t i=0; i<num; ++i) {
+    GlobalOrdinal row = indices[i];
+    const size_t upper_len = num - i;
+    const Scalar* upper_coefs = &coefs[upper_start];
+    upper_start += upper_len;
+
+    size_t dest_len = 0;
+    GlobalOrdinal* dest_cols = NULL;
+    Scalar* dest_coefs = NULL;
+#if defined(MINIFE_SELL_MATRIX)
+    size_t stride;
+    mat.get_row_pointers(row, dest_len, stride, dest_cols, dest_coefs);
+#else
+    mat.get_row_pointers(row, dest_len, dest_cols, dest_coefs);
+#endif
+    if (dest_len == 0) continue;
+
+    // Upper-triangle part of row i.
+#if defined(MINIFE_SELL_MATRIX)
+    sum_into_row_SELL_matrix(dest_len, dest_cols, dest_coefs, stride,
+                 upper_len, &indices[i], upper_coefs);
+#else
+    sum_into_row(dest_len, dest_cols, dest_coefs,
+                 upper_len, &indices[i], upper_coefs);
+#endif
+
+    // Lower-triangle part: entry (i,j) equals entry (j,i), which sits at position i-j
+    // of packed row j.
+    int mirrored = i;
+    for(size_t j=0; j<i; ++j) {
+      Scalar coef = coefs[mirrored];
+#if defined(MINIFE_SELL_MATRIX)
+      sum_into_row_SELL_matrix(dest_len, dest_cols, dest_coefs, stride,
+                   1, &indices[j], &coef);
+#else
+      sum_into_row(dest_len, dest_cols, dest_coefs,
+                   1, &indices[j], &coef);
+#endif
+      mirrored += num - (j+1);
+    }
+  }
+}
+
+// Sums a dense num x num element matrix (coefs, stored row by row) into mat, one row at
+// a time; row i is identified by indices[i] and every row uses the columns indices[0..num).
+template<typename MatrixType>
+void
+sum_in_elem_matrix(size_t num,
+                   const typename MatrixType::GlobalOrdinalType* indices,
+                   const typename MatrixType::ScalarType* coefs,
+                   MatrixType& mat)
+{
+  const typename MatrixType::ScalarType* row_coefs = coefs;
+
+  for(size_t i=0; i<num; ++i, row_coefs += num) {
+    sum_into_row(indices[i], num, indices, row_coefs, mat);
+  }
+}
+
+// Adds one element's contribution to the global system: its diffusion matrix (via the
+// symmetric upper-triangle routine) into A and its source vector into b.
+template<typename GlobalOrdinal, typename Scalar,
+         typename MatrixType, typename VectorType>
+void
+sum_into_global_linear_system(ElemData<GlobalOrdinal,Scalar>& elem_data,
+                              MatrixType& A, VectorType& b)
+{
+  sum_in_symm_elem_matrix(elem_data.nodes_per_elem, elem_data.elem_node_ids, elem_data.elem_diffusion_matrix, A);
+  sum_into_vector(elem_data.nodes_per_elem, elem_data.elem_node_ids, elem_data.elem_source_vector, b);
+}
+
+#ifdef MINIFE_HAVE_TBB
+// LockingMatrix variant: sums a dense num x num element matrix (coefs, stored row by
+// row) through mat.sum_in(), one row per call.
+template<typename MatrixType>
+void
+sum_in_elem_matrix(size_t num,
+                   const typename MatrixType::GlobalOrdinalType* indices,
+                   const typename MatrixType::ScalarType* coefs,
+                   LockingMatrix<MatrixType>& mat)
+{
+  const typename MatrixType::ScalarType* row_coefs = coefs;
+  for(size_t i=0; i<num; ++i, row_coefs += num) {
+    mat.sum_in(indices[i], num, indices, row_coefs);
+  }
+}
+
+// LockingMatrix/LockingVector variant: adds one element's diffusion matrix (via the
+// dense sum_in_elem_matrix routine) into A and its source vector into b.
+template<typename GlobalOrdinal, typename Scalar,
+         typename MatrixType, typename VectorType>
+void
+sum_into_global_linear_system(ElemData<GlobalOrdinal,Scalar>& elem_data,
+                              LockingMatrix<MatrixType>& A, LockingVector<VectorType>& b)
+{
+  sum_in_elem_matrix(elem_data.nodes_per_elem, elem_data.elem_node_ids, elem_data.elem_diffusion_matrix, A);
+  sum_into_vector(elem_data.nodes_per_elem, elem_data.elem_node_ids, elem_data.elem_source_vector, b);
+}
+#endif
+
+// Adds value to the entry (rows[i], rows[i]) of every row listed in mat.rows.
+template<typename MatrixType>
+void
+add_to_diagonal(typename MatrixType::ScalarType value, MatrixType& mat)
+{
+  size_t i = 0;
+  while (i < mat.rows.size()) { sum_into_row(mat.rows[i], 1, &mat.rows[i], &value, mat); ++i; }
+}
+
+// Returns the size in MB of A's MPI communication bookkeeping vectors, summed over all
+// ranks with MPI_Allreduce. Without HAVE_MPI the result is 0.
+template<typename MatrixType>
+double
+parallel_memory_overhead_MB(const MatrixType& A)
+{
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal;
+  typedef typename MatrixType::LocalOrdinalType LocalOrdinal;
+  double mem_MB = 0;
+
+#ifdef HAVE_MPI
+  const double invMB = 1.0/(1024*1024);
+  double local_MB = invMB*A.external_index.size()*sizeof(GlobalOrdinal);
+  local_MB += invMB*A.external_local_index.size()*sizeof(GlobalOrdinal);
+  local_MB += invMB*A.elements_to_send.size()*sizeof(GlobalOrdinal);
+  local_MB += invMB*A.neighbors.size()*sizeof(int);
+  local_MB += invMB*A.recv_length.size()*sizeof(LocalOrdinal);
+  local_MB += invMB*A.send_length.size()*sizeof(LocalOrdinal);
+
+  MPI_Allreduce(&local_MB, &mem_MB, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+#endif
+  return mem_MB;
+}
+
+template<typename MatrixType>
+void rearrange_matrix_local_external(MatrixType& A)
+{
+  //This function will rearrange A so that local entries are contiguous at the front
+  //of A's memory, and external entries are contiguous at the back of A's memory.
+  //An entry is treated as external when its column id is >= the number of rows of A.
+  //A.row_offsets will describe where the local entries occur, and
+  //A.row_offsets_external will describe where the external entries occur.
+  //A must provide rows, row_offsets, row_offsets_external, packed_cols and packed_coefs.
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal;
+  typedef typename MatrixType::LocalOrdinalType LocalOrdinal;
+  typedef typename MatrixType::ScalarType Scalar;
+
+  size_t nrows = A.rows.size();
+  std::vector<LocalOrdinal> tmp_row_offsets(nrows*2); // [2i],[2i+1] = begin,end of row i's local part
+  std::vector<LocalOrdinal> tmp_row_offsets_external(nrows*2); // same layout, external part
+
+  LocalOrdinal num_local_nz = 0; // accumulated below but not read again in this function
+  LocalOrdinal num_extern_nz = 0; // sizes the temporary external arrays
+
+  //First sort within each row of A, so that local entries come
+  //before external entries within each row.
+  //tmp_row_offsets describe the locations of the local entries, and
+  //tmp_row_offsets_external describe the locations of the external entries.
+  //
+  for(size_t i=0; i<nrows; ++i) {
+    GlobalOrdinal* row_begin = &A.packed_cols[A.row_offsets[i]];
+    GlobalOrdinal* row_end = &A.packed_cols[A.row_offsets[i+1]];
+
+    Scalar* coef_row_begin = &A.packed_coefs[A.row_offsets[i]];
+
+    tmp_row_offsets[i*2] = A.row_offsets[i];
+    tmp_row_offsets[i*2+1] = A.row_offsets[i+1];
+    tmp_row_offsets_external[i*2] = A.row_offsets[i+1];
+    tmp_row_offsets_external[i*2+1] = A.row_offsets[i+1];
+
+    ptrdiff_t row_len = row_end - row_begin;
+
+    sort_with_companions(row_len, row_begin, coef_row_begin);
+    //in the sorted row, the first column id >= nrows is where the external entries start:
+    GlobalOrdinal* row_iter = std::lower_bound(row_begin, row_end, nrows);
+
+    LocalOrdinal offset = A.row_offsets[i] + row_iter-row_begin;
+    tmp_row_offsets[i*2+1] = offset;
+    tmp_row_offsets_external[i*2] = offset;
+
+    num_local_nz += tmp_row_offsets[i*2+1]-tmp_row_offsets[i*2];
+    num_extern_nz += tmp_row_offsets_external[i*2+1]-tmp_row_offsets_external[i*2];
+  }
+
+  //Next, copy the external entries into separate arrays.
+
+  std::vector<GlobalOrdinal> ext_cols(num_extern_nz);
+  std::vector<Scalar> ext_coefs(num_extern_nz);
+  std::vector<LocalOrdinal> ext_offsets(nrows+1); // start of each row inside ext_cols/ext_coefs
+  LocalOrdinal offset = 0;
+  for(size_t i=0; i<nrows; ++i) {
+    ext_offsets[i] = offset;
+    for(LocalOrdinal j=tmp_row_offsets_external[i*2];
+                     j<tmp_row_offsets_external[i*2+1]; ++j) {
+      ext_cols[offset] = A.packed_cols[j];
+      ext_coefs[offset++] = A.packed_coefs[j];
+    }
+  }
+  ext_offsets[nrows] = offset;
+
+  //Now slide all local entries down to the beginning of A's packed arrays
+  //(offset never exceeds j here, so no entry is overwritten before it has been read).
+  A.row_offsets.resize(nrows+1);
+  offset = 0;
+  for(size_t i=0; i<nrows; ++i) {
+    A.row_offsets[i] = offset;
+    for(LocalOrdinal j=tmp_row_offsets[i*2]; j<tmp_row_offsets[i*2+1]; ++j) {
+      A.packed_cols[offset] = A.packed_cols[j];
+      A.packed_coefs[offset++] = A.packed_coefs[j];
+    }
+  }
+  A.row_offsets[nrows] = offset; // offset is now the total number of local entries
+
+  //Finally, copy the external entries back into A.packed_cols and
+  //A.packed_coefs, starting at the end of the local entries.
+
+  for(LocalOrdinal i=offset; i<offset+ext_cols.size(); ++i) {
+    A.packed_cols[i] = ext_cols[i-offset];
+    A.packed_coefs[i] = ext_coefs[i-offset];
+  }
+
+  A.row_offsets_external.resize(nrows+1);
+  for(size_t i=0; i<=nrows; ++i) A.row_offsets_external[i] = ext_offsets[i] + offset; // shift past the local entries
+}
+
+//------------------------------------------------------------------------
+template<typename MatrixType>
+void
+zero_row_and_put_1_on_diagonal(MatrixType& A, typename MatrixType::GlobalOrdinalType row)
+{
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal;
+  typedef typename MatrixType::LocalOrdinalType LocalOrdinal;
+  typedef typename MatrixType::ScalarType Scalar;
+  // Overwrites every stored coefficient of 'row' in A: 1 where the column index equals 'row', 0 everywhere else.
+  size_t row_len = 0;
+  GlobalOrdinal* cols = NULL;
+  Scalar* coefs = NULL;
+  // get_row_pointers() fills row_len/cols/coefs (plus 'stride' in the SELL build) for the requested row.
+#if defined(MINIFE_SELL_MATRIX)
+  size_t stride;
+  A.get_row_pointers(row, row_len, stride, cols, coefs);
+
+  for(size_t i=0; i<row_len; ++i) {  // SELL: consecutive entries of one row are 'stride' elements apart
+    if (cols[i*stride] == row) coefs[i*stride] = 1;
+    else coefs[i*stride] = 0;
+  }
+#else
+  A.get_row_pointers(row, row_len, cols, coefs);
+  
+  for(size_t i=0; i<row_len; ++i) {  // other formats: the entries of the row are addressed contiguously
+    if (cols[i] == row) coefs[i] = 1;
+    else coefs[i] = 0;
+  }
+#endif
+}
+
+//------------------------------------------------------------------------
+template<typename MatrixType,
+         typename VectorType>
+void
+impose_dirichlet(typename MatrixType::ScalarType prescribed_value,
+                    MatrixType& A,
+                    VectorType& b,
+                    int global_nx,
+                    int global_ny,
+                    int global_nz,
+                    const std::set<typename MatrixType::GlobalOrdinalType>& bc_rows)
+{
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal;
+  typedef typename MatrixType::LocalOrdinalType LocalOrdinal;
+  typedef typename MatrixType::ScalarType Scalar;
+  // Imposes 'prescribed_value' on the rows listed in bc_rows; global_nx/ny/nz are not used in this body.
+  GlobalOrdinal first_local_row = A.rows.size()>0 ? A.rows[0] : 0;
+  GlobalOrdinal last_local_row  = A.rows.size()>0 ? A.rows[A.rows.size()-1] : -1;
+  // Pass 1: for each bc row inside [first_local_row, last_local_row], set b and turn the matrix row into an identity row.
+  typename std::set<GlobalOrdinal>::const_iterator
+    bc_iter = bc_rows.begin(), bc_end = bc_rows.end();
+  for(; bc_iter!=bc_end; ++bc_iter) {
+    GlobalOrdinal row = *bc_iter;
+    if (row >= first_local_row && row <= last_local_row) {
+      size_t local_row = row - first_local_row;
+      b.coefs[local_row] = prescribed_value;
+      zero_row_and_put_1_on_diagonal(A, row);
+    }
+  }
+  // Pass 2: in every local row that is not a bc row, zero the coefficients in bc columns and move their effect into b.
+  const int ROW_COUNT = A.rows.size();
+
+  #pragma omp parallel for
+  for(MINIFE_GLOBAL_ORDINAL i=0; i < ROW_COUNT; ++i) {
+    GlobalOrdinal row = A.rows[i];
+    // bc rows were already handled in pass 1
+    if (bc_rows.find(row) != bc_rows.end()) continue;
+
+    size_t row_length = 0;
+    GlobalOrdinal* cols = NULL;
+    Scalar* coefs = NULL;
+#if defined(MINIFE_SELL_MATRIX)
+    size_t stride;
+    A.get_row_pointers(row, row_length, stride, cols, coefs);
+    // 'sum' collects the coefficients removed from this row
+    Scalar sum = 0;
+    for(size_t j=0; j<row_length; ++j) {
+      if (bc_rows.find(cols[j*stride]) != bc_rows.end()) {
+        sum += coefs[j*stride];
+        coefs[j*stride] = 0;
+      }
+    }
+#else
+    A.get_row_pointers(row, row_length, cols, coefs);
+    // 'sum' collects the coefficients removed from this row
+    Scalar sum = 0;
+    for(size_t j=0; j<row_length; ++j) {
+      if (bc_rows.find(cols[j]) != bc_rows.end()) {
+        sum += coefs[j];
+        coefs[j] = 0;
+      }
+    }
+#endif
+    #pragma omp atomic
+    b.coefs[i] -= sum*prescribed_value;  // each iteration updates only its own entry b.coefs[i]
+  }
+}
+
+static timer_type exchtime = 0; // 'static' in a header: one copy per translation unit. NOTE(review): presumably accumulates time spent exchanging externals - confirm
+
+//------------------------------------------------------------------------
+//Compute matrix vector product y = A*x where:
+//
+// A - input matrix
+// x - input vector
+// y - result vector
+//
+#if defined(MINIFE_CSR_MATRIX)
+template<typename MatrixType,
+         typename VectorType>
+struct matvec_std {
+void operator()(MatrixType& A,
+            VectorType& x,
+            VectorType& y)
+{
+  	exchange_externals(A, x);
+  // CSR product y = A*x: row r owns the packed entries [row_offsets[r], row_offsets[r+1]).
+  	typedef typename MatrixType::ScalarType ScalarType;
+  	typedef typename MatrixType::GlobalOrdinalType GlobalOrdinalType;
+  	typedef typename MatrixType::LocalOrdinalType LocalOrdinalType;
+  // Raw pointers into the containers are taken once, outside the parallel loop.
+        const MINIFE_GLOBAL_ORDINAL rows_size     = A.rows.size();
+        const LocalOrdinalType* const Arowoffsets = &A.row_offsets[0];
+        const GlobalOrdinalType* const Acols      = &A.packed_cols[0];
+        const ScalarType* const Acoefs            = &A.packed_coefs[0];
+        const ScalarType* const xcoefs            = &x.coefs[0];
+        ScalarType* ycoefs                        = &y.coefs[0];
+        const ScalarType beta                     = 0;
+  // 'beta' is not referenced below; y is overwritten, never read.
+        #pragma omp parallel for
+        for(MINIFE_GLOBAL_ORDINAL row = 0; row < rows_size; ++row) {
+                const MINIFE_GLOBAL_ORDINAL row_start = Arowoffsets[row];
+                const MINIFE_GLOBAL_ORDINAL row_end   = Arowoffsets[row+1];
+
+                MINIFE_SCALAR sum = 0;
+                // NOTE(review): loop_count looks like a compiler trip-count hint (about 15 entries per row) - confirm
+                #pragma loop_count(15)
+                for(MINIFE_GLOBAL_ORDINAL i = row_start; i < row_end; ++i) {
+                        sum += Acoefs[i] * xcoefs[Acols[i]];
+                }
+
+                ycoefs[row] = sum;
+        }
+}
+};
+#elif defined(MINIFE_ELL_MATRIX)
+template<typename MatrixType,
+         typename VectorType>
+struct matvec_std {
+void operator()(MatrixType& A,
+            VectorType& x,
+            VectorType& y)
+{
+  exchange_externals(A, x);
+  // ELL product y = A*x: every row stores exactly A.num_cols_per_row entries, row after row.
+  typedef typename MatrixType::ScalarType ScalarType;
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinalType;
+  typedef typename MatrixType::LocalOrdinalType LocalOrdinalType;
+
+  int row_len = A.num_cols_per_row;
+  int n = A.rows.size();
+  const GlobalOrdinalType* Acols      = &A.cols[0];
+  const ScalarType* Acoefs            = &A.coefs[0];
+  const ScalarType* xcoefs = &x.coefs[0];
+        ScalarType* ycoefs = &y.coefs[0];
+  ScalarType beta = 0;
+  // Rows are independent, so the row loop is shared among the OpenMP threads.
+  #pragma omp parallel for
+  for(int row=0; row<n; ++row) {
+    ScalarType sum = beta*ycoefs[row];  // beta is 0, yet y is still read here (a NaN/Inf already in y would propagate)
+
+    int row_start=row*row_len;
+    int row_end=row_start+row_len;
+
+    for(LocalOrdinalType i=row_start; i<row_end; ++i) {
+      sum += Acoefs[i]*xcoefs[Acols[i]];
+    }
+
+    ycoefs[row] = sum;
+  }
+}
+};
+#elif defined(MINIFE_SELL_MATRIX)
+template<typename MatrixType,
+         typename VectorType>
+struct matvec_std {
+void operator()(MatrixType& A,
+            VectorType& x,
+            VectorType& y)
+{
+  exchange_externals(A, x);
+  // SELL product y = A*x: rows are grouped in blocks of A.num_rows_per_block and each block is processed as one SIMD vector.
+  typedef typename MatrixType::ScalarType ScalarType;
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinalType;
+  typedef typename MatrixType::LocalOrdinalType LocalOrdinalType;
+  // Inside a block, entry i of all its rows is contiguous; the last (possibly shorter) block is packed with its own row count as stride.
+  int row_len = A.num_cols_per_row;
+  int n = A.rows.size();
+  const GlobalOrdinalType* Acols      = &A.cols[0];
+  const ScalarType* Acoefs            = &A.coefs[0];
+  const ScalarType* xcoefs = &x.coefs[0];
+        ScalarType* ycoefs = &y.coefs[0];
+  ScalarType beta = 0;
+  int num_blocks = A.num_blocks;
+  int num_rows_per_block = A.num_rows_per_block;
+  // Only unaligned load/store intrinsics are used: a short last block advances by fewer than 8 (or 4) elements, and y is a plain std::vector.
+#pragma omp parallel for
+  for(int block_id=0; block_id < num_blocks; block_id++) {
+#ifdef AVX512_ZMM
+           __m512d sum = _mm512_setzero_pd();
+	  int block_offset = block_id * num_rows_per_block * row_len;
+	  int stride = num_rows_per_block;
+//#define AVOID_MASK
+#ifndef AVOID_MASK
+	  if (block_id == num_blocks-1  && n%num_rows_per_block!=0){
+		  stride = n%num_rows_per_block;
+	  }
+      __mmask8 pg = _cvtu32_mask8((1<<stride)-1);  // one mask bit per row that really exists in this block
+      for(int i=0; i<row_len; i++){
+          __m512d acofs = _mm512_maskz_loadu_pd(pg, &Acoefs[block_offset]);
+          __m256i indices = _mm256_maskz_loadu_epi32(pg, &Acols[block_offset]);
+          __m512d xcofs = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), pg, indices, &xcoefs[0], sizeof(double));
+
+          sum =  _mm512_maskz_fmadd_pd(pg, acofs, xcofs, sum);
+
+          block_offset += stride;
+      }
+
+      _mm512_mask_storeu_pd (&ycoefs[block_id * num_rows_per_block], pg, sum);
+#else
+	  if (block_id == num_blocks-1  && n%num_rows_per_block!=0){
+	    stride = n%num_rows_per_block;
+	    __mmask8 pg = _cvtu32_mask8((1<<stride)-1);
+	    for(int i=0; i<row_len; i++){
+	      __m512d acofs = _mm512_maskz_loadu_pd(pg, &Acoefs[block_offset]);
+	      __m256i indices = _mm256_maskz_loadu_epi32(pg, &Acols[block_offset]);
+	      __m512d xcofs = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), pg, indices, &xcoefs[0], sizeof(double));
+	      
+	      sum =  _mm512_maskz_fmadd_pd(pg, acofs, xcofs, sum);
+	      
+	      block_offset += stride;
+	    }
+	    _mm512_mask_storeu_pd (&ycoefs[block_id * num_rows_per_block], pg, sum);
+	  } else {
+	    for(int i=0; i<row_len; i++){
+	      __m512d acofs = _mm512_loadu_pd(&Acoefs[block_offset]);
+	      __m256i indices = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&Acols[block_offset]));
+	      __m512d xcofs = _mm512_i32gather_pd(indices, &xcoefs[0], sizeof(double));
+	      
+	      sum =  _mm512_fmadd_pd(acofs, xcofs, sum);
+	      
+	      block_offset += 8;  // full block: assumes num_rows_per_block == 8
+	    }
+	    _mm512_storeu_pd (&ycoefs[block_id * num_rows_per_block], sum);
+	  }
+	 
+#endif
+#else
+	  __m256d sum = _mm256_setzero_pd();
+	  int block_offset = block_id * num_rows_per_block * row_len;
+	  int stride = num_rows_per_block;
+	  if (block_id == num_blocks-1  && n%num_rows_per_block!=0){
+		  stride = n%num_rows_per_block;
+	  }
+      __mmask8 pg = _cvtu32_mask8((1<<stride)-1);
+      for(int i=0; i<row_len; i++){
+          __m256d acofs = _mm256_maskz_loadu_pd(pg, &Acoefs[block_offset]);
+          __m128i indices = _mm_maskz_loadu_epi32(pg, &Acols[block_offset]);
+	  __m256d xcofs = _mm256_mmask_i32gather_pd(_mm256_undefined_pd(), pg, indices, &xcoefs[0], sizeof(double)); // beware the second m in 'mmask' ... otherwise you get the AVX2 version with different parameters!
+
+          sum =  _mm256_maskz_fmadd_pd(pg, acofs, xcofs, sum);
+
+          block_offset += stride;
+      }
+
+      _mm256_mask_storeu_pd (&ycoefs[block_id * num_rows_per_block], pg, sum);
+#endif
+  }
+
+
+/*   for(int block_id=0; block_id < num_blocks; block_id++) { */
+/*       std::vector<ScalarType> sum(num_rows_per_block, 0); // initialize to beta*ycoefs[row] not necessary cause beta =0 */
+/* 	  int block_offset = block_id * num_rows_per_block * row_len; */
+/* 	  int stride = num_rows_per_block; */
+/* 	  if (block_id == num_blocks-1  && n%num_rows_per_block!=0){ */
+/* 		  stride = n%num_rows_per_block; */
+/* 	  } */
+/* 	  for(int row=0; row<stride; row++ ){ */
+/* 		  for(int i=0; i<row_len; i++){ */
+/* 			  sum[row] += Acoefs[block_offset + i * stride + row] * xcoefs[Acols[block_offset + i * stride + row]]; */
+/* 		  } */
+/* 	  } */
+
+/* 	  for(int row=0; row<stride; row++){ */
+/* 		  ycoefs[block_id*num_rows_per_block + row] = sum[row]; */
+/* 	  } */
+/*   } */
+}
+};
+#endif
+
+// Free-function entry point for y = A*x: forwards to the matvec_std functor selected by the matrix-format macro.
+template<typename MatrixType, typename VectorType>
+void matvec(MatrixType& A, VectorType& x, VectorType& y)
+{
+  // A temporary functor is created and invoked in a single expression.
+  matvec_std<MatrixType,VectorType>()(A, x, y);
+}
+
+template<typename MatrixType,
+         typename VectorType>
+struct matvec_overlap {
+void operator()(MatrixType& A,
+                    VectorType& x,
+                    VectorType& y)
+{
+#ifdef HAVE_MPI
+  begin_exchange_externals(A, x);
+#endif
+  // y = A*x in two passes over the packed arrays, so the first pass can run while the exchange started above is in flight.
+  typedef typename MatrixType::ScalarType ScalarType;
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinalType;
+  typedef typename MatrixType::LocalOrdinalType LocalOrdinalType;
+
+  // Pass 1 walks the ranges described by A.row_offsets.
+  int n = A.rows.size();
+  const LocalOrdinalType* Arowoffsets = &A.row_offsets[0];
+  const GlobalOrdinalType* Acols      = &A.packed_cols[0];
+  const ScalarType* Acoefs            = &A.packed_coefs[0];
+  const ScalarType* xcoefs = &x.coefs[0];
+        ScalarType* ycoefs = &y.coefs[0];
+  ScalarType beta = 0;
+
+  for(int row=0; row<n; ++row) {
+    ScalarType sum = beta*ycoefs[row];  // beta == 0 here, but y is still read
+
+    for(LocalOrdinalType i=Arowoffsets[row]; i<Arowoffsets[row+1]; ++i) {
+      sum += Acoefs[i]*xcoefs[Acols[i]];
+    }
+
+    ycoefs[row] = sum;
+  }
+
+#ifdef HAVE_MPI
+  finish_exchange_externals(A.neighbors.size());
+  // Pass 2 walks the ranges described by A.row_offsets_external and, with beta == 1, adds onto the y computed in pass 1.
+  Arowoffsets = &A.row_offsets_external[0];
+  beta = 1;
+
+  for(int row=0; row<n; ++row) {
+    ScalarType sum = beta*ycoefs[row];
+
+    for(LocalOrdinalType i=Arowoffsets[row]; i<Arowoffsets[row+1]; ++i) {
+      sum += Acoefs[i]*xcoefs[Acols[i]];
+    }
+
+    ycoefs[row] = sum;
+  }
+#endif
+}
+};
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/src/Vector.hpp b/openmp-avx512/src/Vector.hpp
new file mode 100644
index 0000000..07beef4
--- /dev/null
+++ b/openmp-avx512/src/Vector.hpp
@@ -0,0 +1,68 @@
+#ifndef _Vector_hpp_
+#define _Vector_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <vector>
+
+namespace miniFE {
+
+
+template<typename Scalar,
+         typename LocalOrdinal,
+         typename GlobalOrdinal>
+struct Vector {
+  typedef Scalar ScalarType;
+  typedef LocalOrdinal LocalOrdinalType;
+  typedef GlobalOrdinal GlobalOrdinalType;
+  // Creates a vector of local_sz coefficients, all set to zero; startIdx is recorded as startIndex.
+  Vector(GlobalOrdinal startIdx, LocalOrdinal local_sz)
+   : startIndex(startIdx),
+     local_size(local_sz),
+     coefs(local_size)
+  {
+    #pragma omp parallel for
+    for(MINIFE_LOCAL_ORDINAL i=0; i < local_size; ++i) {
+	coefs[i] = 0;	 
+    }
+  }
+  // Nothing to release explicitly: std::vector frees its own storage.
+  ~Vector()
+  {
+  }
+  // Members are initialized in this declaration order, so local_size is already set when coefs is sized from it.
+  GlobalOrdinal startIndex;
+  LocalOrdinal local_size;
+  std::vector<Scalar> coefs;
+};
+
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/src/Vector_functions.hpp b/openmp-avx512/src/Vector_functions.hpp
new file mode 100644
index 0000000..6b2d815
--- /dev/null
+++ b/openmp-avx512/src/Vector_functions.hpp
@@ -0,0 +1,308 @@
+#ifndef _Vector_functions_hpp_
+#define _Vector_functions_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <vector>
+#include <sstream>
+#include <fstream>
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+#ifdef MINIFE_HAVE_TBB
+#include <LockingVector.hpp>
+#endif
+
+#include <TypeTraits.hpp>
+#include <Vector.hpp>
+
+#define MINIFE_MIN(X, Y)  ((X) < (Y) ? (X) : (Y))
+
+namespace miniFE {
+
+
+template<typename VectorType>
+void write_vector(const std::string& filename,
+                  const VectorType& vec)
+{
+  int numprocs = 1, myproc = 0;
+#ifdef HAVE_MPI
+  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+#endif
+  // Each process writes its own file named "<filename>.<numprocs>.<myproc>".
+  std::ostringstream osstr;
+  osstr << filename << "." << numprocs << "." << myproc;
+  std::string full_name = osstr.str();
+  std::ofstream ofs(full_name.c_str());
+
+  typedef typename VectorType::ScalarType ScalarType;
+  // With MPI, the barrier at the end of each iteration makes the processes write one after another, in rank order.
+  const std::vector<ScalarType>& coefs = vec.coefs;
+  for(int p=0; p<numprocs; ++p) {
+    if (p == myproc) {
+      if (p == 0) {
+        ofs << vec.local_size << std::endl;
+      }
+      // one "<global index> <value>" line per local coefficient; only rank 0 writes the size line above
+      typename VectorType::GlobalOrdinalType first = vec.startIndex;
+      for(size_t i=0; i<vec.local_size; ++i) {
+        ofs << first+i << " " << coefs[i] << std::endl;
+      }
+    }
+#ifdef HAVE_MPI
+    MPI_Barrier(MPI_COMM_WORLD);
+#endif
+  }
+}
+
+template<typename VectorType>
+void sum_into_vector(size_t num_indices,
+                     const typename VectorType::GlobalOrdinalType* indices,
+                     const typename VectorType::ScalarType* coefs,
+                     VectorType& vec)
+{
+  typedef typename VectorType::GlobalOrdinalType GlobalOrdinal;
+  typedef typename VectorType::ScalarType Scalar;
+  // Adds coefs[i] to the entry with global index indices[i]; indices outside [first, last] are skipped.
+  GlobalOrdinal first = vec.startIndex;
+  GlobalOrdinal last = first + vec.local_size - 1;
+
+  std::vector<Scalar>& vec_coefs = vec.coefs;
+
+  for(size_t i=0; i<num_indices; ++i) {
+    if (indices[i] < first || indices[i] > last) continue;
+    size_t idx = indices[i] - first;
+    // the update is atomic, which allows concurrent calls that hit the same entry
+    #pragma omp atomic
+    vec_coefs[idx] += coefs[i];
+  }
+}
+
+#ifdef MINIFE_HAVE_TBB
+template<typename VectorType>
+void sum_into_vector(size_t num_indices,
+                     const typename VectorType::GlobalOrdinalType* indices,
+                     const typename VectorType::ScalarType* coefs,
+                     LockingVector<VectorType>& vec)
+{
+  vec.sum_in(num_indices, indices, coefs);  // TBB overload: the whole operation is delegated to LockingVector::sum_in
+}
+#endif
+
+//------------------------------------------------------------
+//Compute the update of a vector with the sum of two scaled vectors where:
+//
+// w = alpha*x + beta*y
+//
+// x,y - input vectors
+//
+// alpha,beta - scalars applied to x and y respectively
+//
+// w - output vector
+//
+template<typename VectorType>
+void
+  waxpby(typename VectorType::ScalarType alpha, const VectorType& x,
+         typename VectorType::ScalarType beta, const VectorType& y,
+         VectorType& w)
+{
+  typedef typename VectorType::ScalarType ScalarType;
+  // The update runs over x.coefs.size() entries; y and w are only checked for length in MINIFE_DEBUG builds.
+#ifdef MINIFE_DEBUG_OPENMP
+  std::cout << "Starting WAXPBY..." << std::endl;
+#endif
+
+#ifdef MINIFE_DEBUG
+  if (y.local_size < x.local_size || w.local_size < x.local_size) {
+    std::cerr << "miniFE::waxpby ERROR, y and w must be at least as long as x." << std::endl;
+    return;
+  }
+#endif
+
+  const int n = x.coefs.size();
+  const ScalarType*  xcoefs = &x.coefs[0];
+  const ScalarType*  ycoefs = &y.coefs[0];
+        ScalarType*  wcoefs = &w.coefs[0];
+  // Four specialized loops: beta == 0 never reads y, alpha == 1 skips the multiplication by alpha.
+  if(beta == 0.0) {
+	if(alpha == 1.0) {
+  		#pragma omp parallel for
+  		for(int i=0; i<n; ++i) {
+    			wcoefs[i] = xcoefs[i];
+  		}
+  	} else {
+  		#pragma omp parallel for
+  		for(int i=0; i<n; ++i) {
+    			wcoefs[i] = alpha * xcoefs[i];
+  		}
+  	}
+  } else {
+	if(alpha == 1.0) {
+  		#pragma omp parallel for
+  		for(int i=0; i<n; ++i) {
+    			wcoefs[i] = xcoefs[i] + beta * ycoefs[i];
+  		}
+  	} else {
+  		#pragma omp parallel for
+  		for(int i=0; i<n; ++i) {
+    			wcoefs[i] = alpha * xcoefs[i] + beta * ycoefs[i];
+  		}
+  	}
+  }
+
+#ifdef MINIFE_DEBUG_OPENMP
+  std::cout << "Finished WAXPBY." << std::endl;
+#endif
+}
+
+template<typename VectorType>
+void
+  daxpby(const MINIFE_SCALAR alpha, 
+	const VectorType& x,
+	const MINIFE_SCALAR beta, 
+	VectorType& y)
+{
+  // In-place update y = alpha*x + beta*y over the first min(|x|,|y|) entries, special-cased for alpha == 1, beta == 1 and beta == 0.
+  const MINIFE_LOCAL_ORDINAL n = MINIFE_MIN(x.coefs.size(), y.coefs.size());
+  const MINIFE_SCALAR*  xcoefs = &x.coefs[0];
+        MINIFE_SCALAR*  ycoefs = &y.coefs[0];
+  // beta == 0 is tested before alpha == 1 (as in waxpby) so that y is never read then: 0*NaN or 0*Inf must not leak into the result.
+  if(alpha == 1.0 && beta == 1.0) {
+	  #pragma omp parallel for
+	  for(int i = 0; i < n; ++i) {
+	    ycoefs[i] += xcoefs[i];
+  	  }
+  } else if (beta == 1.0) {
+	  #pragma omp parallel for
+	  for(int i = 0; i < n; ++i) {
+	    ycoefs[i] += alpha * xcoefs[i];
+  	  }
+  } else if (beta == 0.0) {
+	  #pragma omp parallel for
+	  for(int i = 0; i < n; ++i) {
+	    ycoefs[i] = alpha * xcoefs[i];
+  	  }
+  } else if (alpha == 1.0) {
+	  #pragma omp parallel for
+	  for(int i = 0; i < n; ++i) {
+	    ycoefs[i] = xcoefs[i] + beta * ycoefs[i];
+  	  }
+  } else {
+	  #pragma omp parallel for
+	  for(int i = 0; i < n; ++i) {
+	    ycoefs[i] = alpha * xcoefs[i] + beta * ycoefs[i];
+  	  }
+  }
+
+}
+
+//-----------------------------------------------------------
+//Compute the dot product of two vectors where:
+//
+// x,y - input vectors
+//
+// result - return-value
+//
+template<typename Vector>
+typename TypeTraits<typename Vector::ScalarType>::magnitude_type
+  dot(const Vector& x,
+      const Vector& y)
+{
+  const MINIFE_LOCAL_ORDINAL n = x.coefs.size();
+  // The loop length comes from x only; y is indexed over the same range.
+  typedef typename Vector::ScalarType Scalar;
+  typedef typename TypeTraits<typename Vector::ScalarType>::magnitude_type magnitude;
+
+  const Scalar*  xcoefs = &x.coefs[0];
+  const Scalar*  ycoefs = &y.coefs[0];
+  MINIFE_SCALAR result = 0;
+  // per-thread partial sums are combined by the OpenMP reduction
+  #pragma omp parallel for reduction(+:result)
+  for(int i=0; i<n; ++i) {
+    result += xcoefs[i] * ycoefs[i];
+  }
+  // With MPI the local result is summed over MPI_COMM_WORLD, so every rank returns the global dot product.
+#ifdef HAVE_MPI
+  magnitude local_dot = result, global_dot = 0;
+  MPI_Datatype mpi_dtype = TypeTraits<magnitude>::mpi_type();  
+  MPI_Allreduce(&local_dot, &global_dot, 1, mpi_dtype, MPI_SUM, MPI_COMM_WORLD);
+  return global_dot;
+#else
+  return result;
+#endif
+}
+
+template<typename Vector>
+typename TypeTraits<typename Vector::ScalarType>::magnitude_type
+  dot_r2(const Vector& x)
+{
+#ifdef MINIFE_DEBUG_OPENMP
+  int myrank = 0;  // stays 0 when built without MPI, so the debug output also compiles in serial builds
+#ifdef HAVE_MPI
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+#endif
+  std::cout << "[" << myrank << "] Starting dot..." << std::endl;
+#endif
+  // Returns the dot product of x with itself; with HAVE_MPI the local sums are added over MPI_COMM_WORLD.
+  const MINIFE_LOCAL_ORDINAL n = x.coefs.size();
+  typedef typename Vector::ScalarType Scalar;
+  typedef typename TypeTraits<typename Vector::ScalarType>::magnitude_type magnitude;
+
+  const MINIFE_SCALAR*  xcoefs = &x.coefs[0];
+  MINIFE_SCALAR result = 0;
+
+  #pragma omp parallel for reduction(+:result)
+  for(int i=0; i<n; ++i) {
+    result += xcoefs[i] * xcoefs[i];
+  }
+
+#ifdef HAVE_MPI
+  magnitude local_dot = result, global_dot = 0;
+  MPI_Datatype mpi_dtype = TypeTraits<magnitude>::mpi_type();  
+  MPI_Allreduce(&local_dot, &global_dot, 1, mpi_dtype, MPI_SUM, MPI_COMM_WORLD);
+
+#ifdef MINIFE_DEBUG_OPENMP
+ 	std::cout << "[" << myrank << "] Completed dot." << std::endl;
+#endif
+
+  return global_dot;
+#else
+#ifdef MINIFE_DEBUG_OPENMP
+ 	std::cout << "[" << myrank << "] Completed dot." << std::endl;
+#endif
+  return result;
+#endif
+}
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/src/YAML_Doc.cpp b/openmp-avx512/src/YAML_Doc.cpp
new file mode 100644
index 0000000..382c8cd
--- /dev/null
+++ b/openmp-avx512/src/YAML_Doc.cpp
@@ -0,0 +1,102 @@
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <ctime>
+#include <cstdlib>
+#include <ctime>
+#include <vector>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#ifdef REDSTORM
+#include <time.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#endif
+#include "YAML_Doc.hpp"
+using namespace std;
+
+//set the microapp_name and version which will become part of the YAML doc.
+YAML_Doc::YAML_Doc(const std::string& miniApp_Name, const std::string& miniApp_Version, const std::string& destination_Directory, const std::string& destination_FileName)
+  : miniAppName(miniApp_Name),                   // application name, printed in the YAML header
+    miniAppVersion(miniApp_Version),             // application version, printed in the YAML header
+    destinationDirectory(destination_Directory), // where generateYAML() writes the file
+    destinationFileName(destination_FileName)    // root of the output file name
+{}
+
+//inherits the destructor from YAML_Element
+YAML_Doc::~YAML_Doc(void){ // empty body: this class adds only std::string members
+}
+
+/*
+* generates YAML from the elements of the document and saves it
+* to a file
+*/
+string YAML_Doc::generateYAML(){
+  string yaml;
+  yaml =  yaml + "Mini-Application Name: " + miniAppName + "\n";
+  yaml =  yaml + "Mini-Application Version: " + miniAppVersion + "\n";
+  for(size_t i=0; i<children.size(); i++){
+    yaml = yaml + children[i]->printYAML("");
+  }
+  // A local-time stamp is appended to the file name, so each run writes a distinct file.
+  time_t rawtime;
+  tm * ptm;
+  time ( &rawtime );
+  ptm = localtime(&rawtime);
+  char sdate[25];
+  //use tm_mon+1 because tm_mon is 0 .. 11 instead of 1 .. 12
+  sprintf (sdate,"%04d:%02d:%02d-%02d:%02d:%02d",ptm->tm_year + 1900, ptm->tm_mon+1,
+    ptm->tm_mday, ptm->tm_hour, ptm->tm_min,ptm->tm_sec);
+  // File name: "<root><timestamp>.yaml", where root defaults to "<name>-<version>_".
+  string filename;
+  if (destinationFileName=="")
+    filename = miniAppName + "-" + miniAppVersion + "_";
+  else
+    filename = destinationFileName;
+  filename = filename + string(sdate) + ".yaml";
+  if (destinationDirectory!="" && destinationDirectory!=".") {
+    string mkdir_cmd = "mkdir " + destinationDirectory; // NOTE(review): the directory name reaches the shell unescaped
+#ifdef REDSTORM
+    mkdir(destinationDirectory.c_str(),0755);
+#else
+    system(mkdir_cmd.c_str());
+#endif
+    filename = destinationDirectory + "/" + filename; // keep the computed name (timestamp + ".yaml"), also when no root name was given
+  }
+  else
+    filename = "./" + filename;
+
+  ofstream myfile;
+  myfile.open(filename.c_str());
+  myfile << yaml;
+  myfile.close();
+  return yaml;
+}
+
+
diff --git a/openmp-avx512/src/YAML_Doc.hpp b/openmp-avx512/src/YAML_Doc.hpp
new file mode 100644
index 0000000..8ab6961
--- /dev/null
+++ b/openmp-avx512/src/YAML_Doc.hpp
@@ -0,0 +1,115 @@
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#ifndef YAML_DOC_H
+#define YAML_DOC_H
+#include <string>
+#include <vector>
+#include "YAML_Element.hpp"
+
+//! The Mantevo YAML_Doc class for the uniform collecting and reporting of performance data for mini-applications
+
+/*!
+
+The YAML_Doc class works in conjunction with the YAML_Element class to facilitate easy collecting and reporting of YAML-formatted
+data that can be then registered with the Mantevo results collection website.
+
+\code
+
+//EXAMPLE CODE FOR GENERATING YAML
+
+  YAML_Doc doc("hpccg","1.0");
+  doc.add("final_residual",1.4523e-13);
+  doc.add("time","4.893"); 
+ 
+//note: the following line will remove the data (4.893) associated with "time"
+  doc.get("time")->add("total",4.243);
+
+//note:  the following line will likewise remove the data (4.243) associated with "total"
+  doc.get("time")->get("total")->add("time",2.457);
+  doc.get("time")->get("total")->add("flops",4.88e5);
+  doc.get("time")->add("ddot",1.243);
+  doc.get("time")->add("sparsemv","");
+  doc.get("time")->get("sparsemv")->add("time",0.3445);
+  doc.get("time")->get("sparsemv")->add("overhead","");
+  doc.get("time")->get("sparsemv")->get("overhead")->add("time",0.0123);
+  doc.get("time")->get("sparsemv")->get("overhead")->add("percentage",0.034);
+  cout << doc.generateYAML() << endl; 
+  return 0;
+
+\endcode
+
+Below is the output generated by the above code:
+
+\verbatim
+
+final_residual: 1.4523e-13
+time: 
+  total:
+    time: 2.457
+    flops: 4.88e5
+  ddot: 1.243
+  sparsemv:
+    time: 0.3445
+    overhead:
+      time: 0.0123
+      percentage: 0.034
+
+\endverbatim
+
+\note {No value is allowed to be attached to a key that has children.  If children are added to a key, the value is simply set to "".}
+
+*/
+class YAML_Doc: public YAML_Element {
+  public:
+  //! Constructor: accepts the mini-application name and version as strings; optionally accepts a directory and file name for the results file.
+  /*!
+    The sole constructor for this class accepts a name and version number for the mini-application as well as optional directory 
+    and file name information for results that are generated by the generateYAML() method.
+    \param miniApp_Name (in) string containing name of the mini-application
+    \param miniApp_Version (in) string containing the version of the mini-application
+    \param destination_Directory (in, optional) path of the directory where the results file will be stored, relative to current working directory. 
+           If this value is not supplied, the results file will be stored in the current working directory.  If the directory does not exist
+	   it will be created.
+    \param destination_FileName (in, optional) root name of the results file.  A suffix of ".yaml" will be automatically appended.  If no
+           file name is specified, the root is built from miniAppName and miniAppVersion; generateYAML() also adds a timestamp.
+  */
+  YAML_Doc(const std::string& miniApp_Name, const std::string& miniApp_Version, const std::string& destination_Directory = "", const std::string& destination_FileName = "");
+  //! Destructor
+  ~YAML_Doc();
+  //! Generate the YAML text, write it to a file in the specified directory (current directory by default) and return it as a string
+  std::string generateYAML();
+
+protected:
+  std::string miniAppName;
+  std::string miniAppVersion;
+  std::string destinationDirectory;
+  std::string destinationFileName;
+};
+#endif /* YAML_DOC_H */
+
diff --git a/openmp-avx512/src/YAML_Element.cpp b/openmp-avx512/src/YAML_Element.cpp
new file mode 100644
index 0000000..26ba04b
--- /dev/null
+++ b/openmp-avx512/src/YAML_Element.cpp
@@ -0,0 +1,148 @@
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <vector>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include "YAML_Element.hpp"
+using namespace std;
+YAML_Element::YAML_Element(const std::string& key_arg, const std::string& value_arg)
+  : key(key_arg),     // text printed before the ": " by printYAML()
+    value(value_arg)  // text printed after it; add() resets it to ""
+{}
+
+YAML_Element::~YAML_Element(){
+  // Children are allocated with new in add(), so this element frees them.
+  std::vector<YAML_Element*>::iterator child = children.begin();
+  for (; child != children.end(); ++child) delete *child;
+  children.clear();
+}
+
+/*
+* Appends a new child holding key_arg and the text form of value_arg, and
+* returns a pointer to it; the child stays owned by this element.
+* Keys are not checked for duplicates, so a child is always added.
+*/
+YAML_Element* YAML_Element::add(const std::string& key_arg, double value_arg) {
+  // A key that has children carries no value of its own.
+  value = "";
+  YAML_Element* const child = new YAML_Element(key_arg, convert_double_to_string(value_arg));
+  children.push_back(child);
+  return child;
+}
+
+YAML_Element* YAML_Element::add(const std::string& key_arg, int value_arg) {
+  // Stores value_arg as text in a new child owned by this element.
+  value = "";
+  YAML_Element* const child = new YAML_Element(key_arg, convert_int_to_string(value_arg));
+  children.push_back(child);
+  return child;
+}
+
+#ifndef MINIFE_NO_LONG_LONG
+
+YAML_Element* YAML_Element::add(const std::string& key_arg, long long value_arg) {
+  // Stores value_arg as text in a new child owned by this element.
+  value = "";
+  YAML_Element* const child = new YAML_Element(key_arg, convert_long_long_to_string(value_arg));
+  children.push_back(child);
+  return child;
+}
+
+#endif
+
+YAML_Element* YAML_Element::add(const std::string& key_arg, size_t value_arg) {
+  // Stores value_arg as text in a new child owned by this element.
+  value = "";
+  YAML_Element* const child = new YAML_Element(key_arg, convert_size_t_to_string(value_arg));
+  children.push_back(child);
+  return child;
+}
+
+YAML_Element* YAML_Element::add(const std::string& key_arg, const std::string& value_arg) {
+  value = "";  // cleared before the child is built: a key with children has no value
+  YAML_Element* const child = new YAML_Element(key_arg, value_arg);
+  children.push_back(child);  // the child is owned by this element from here on
+  return child;
+}
+
+/*
+* Looks up a direct child by key (children of children are not searched).
+* Returns the first child whose key equals key_arg, or a null pointer when
+* there is none, so the result must be checked before it is dereferenced.
+*/
+YAML_Element* YAML_Element::get(const std::string& key_arg) {
+  typedef std::vector<YAML_Element*>::iterator ChildIter;
+  for (ChildIter it = children.begin(); it != children.end(); ++it) {
+    YAML_Element* const candidate = *it;
+    if (candidate->getKey() == key_arg) return candidate;
+  }
+  return 0;
+}
+
+/*
+* Renders this element as "<space><key>: <value>\n", followed by every child
+* rendered recursively with two extra leading blanks. YAML nesting depends on
+* that indentation, so 'space' must be the indent of this element itself.
+*/
+string YAML_Element::printYAML(std::string space){
+  string text(space);
+  text.append(key).append(": ").append(value).append("\n");
+  const string child_indent = space + "  ";
+  for(size_t n=0; n<children.size(); ++n)
+    text += children[n]->printYAML(child_indent);
+  return text;
+}
+
+string YAML_Element::convert_double_to_string(double value_arg){
+  ostringstream formatted;  // default stream formatting, no manipulators applied
+  formatted << value_arg;
+  return formatted.str();
+}
+string YAML_Element::convert_int_to_string(int value_arg){
+  ostringstream formatted;  // default stream formatting, no manipulators applied
+  formatted << value_arg;
+  return formatted.str();
+}
+
+#ifndef MINIFE_NO_LONG_LONG
+
+string YAML_Element::convert_long_long_to_string(long long value_arg){
+  ostringstream formatted;  // default stream formatting, no manipulators applied
+  formatted << value_arg;
+  return formatted.str();
+}
+
+#endif
+
+string YAML_Element::convert_size_t_to_string(size_t value_arg){
+  ostringstream formatted;  // default stream formatting, no manipulators applied
+  formatted << value_arg;
+  return formatted.str();
+}
diff --git a/openmp-avx512/src/YAML_Element.hpp b/openmp-avx512/src/YAML_Element.hpp
new file mode 100644
index 0000000..8ab1e5f
--- /dev/null
+++ b/openmp-avx512/src/YAML_Element.hpp
@@ -0,0 +1,79 @@
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#ifndef YAML_ELEMENT_H
+#define YAML_ELEMENT_H
+#include <string>
+#include <vector>
+//! The Mantevo YAML_Element class for registering key-value pairs of performance data
+
+/*!
+  Mantevo mini-applications generate a collection of performance data for each run of the executable.  YAML_Element, and
+  the related YAML_Doc class, provide a uniform facility for gathering and reporting this data using the YAML text format.
+*/
+class YAML_Element {
+  public:
+
+  //! Default constructor: key and value both start out empty.
+  YAML_Element (){key="";value="";}
+  //! Construct with known key-value pair
+  YAML_Element (const std::string& key_arg, const std::string& value_arg);
+  //! Destructor: deletes every child created by add(). NOTE(review): no copy constructor/assignment is declared, so copying an element would delete its children twice.
+  ~YAML_Element ();
+  //! Key accessor method (returns a copy of the key)
+  std::string getKey(){return key;}
+  //! Add a child element to an element list associated with this element, value of type double
+  YAML_Element* add(const std::string& key_arg, double value_arg);
+  //! Add a child element to an element list associated with this element, value of type int
+  YAML_Element* add(const std::string& key_arg, int value_arg);
+#ifndef MINIFE_NO_LONG_LONG
+  //! Add a child element to an element list associated with this element, value of type long long
+  YAML_Element* add(const std::string& key_arg, long long value_arg);
+#endif
+  //! Add a child element to an element list associated with this element, value of type size_t
+  YAML_Element* add(const std::string& key_arg, size_t value_arg);
+  //! Add a child element to an element list associated with this element, value of type string
+  YAML_Element* add(const std::string& key_arg, const std::string& value_arg);
+  //! get the first element in the list with the given key; returns 0 (null) if no child has that key
+  YAML_Element* get(const std::string& key_arg);
+  std::string printYAML(std::string space); //!< "key: value" line for this element plus all children, indented by 'space'
+  
+protected:
+  std::string key;
+  std::string value;                   // reset to "" by every add(), i.e. once the element has children
+  std::vector<YAML_Element*> children; // owned: allocated in add(), deleted in the destructor
+
+private:
+  std::string convert_double_to_string(double value_arg);
+  std::string convert_int_to_string(int value_arg);
+#ifndef MINIFE_NO_LONG_LONG
+  std::string convert_long_long_to_string(long long value_arg);
+#endif
+  std::string convert_size_t_to_string(size_t value_arg);
+};
+#endif /* YAML_ELEMENT_H */
diff --git a/openmp-avx512/src/assemble_FE_data.hpp b/openmp-avx512/src/assemble_FE_data.hpp
new file mode 100644
index 0000000..a60b29c
--- /dev/null
+++ b/openmp-avx512/src/assemble_FE_data.hpp
@@ -0,0 +1,78 @@
+#ifndef _assemble_FE_data_hpp_
+#define _assemble_FE_data_hpp_
+
+//@HEADER
+// ************************************************************************
+// 
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+// 
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+// 
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//  
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//  
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <box_utils.hpp>
+#include <simple_mesh_description.hpp>
+
+#include <perform_element_loop.hpp>
+
+namespace miniFE {
+
+//Hands this processor's element box, widened by one ghost layer, to
+//perform_element_loop together with A, b and params.
+template<typename MatrixType, typename VectorType>
+void
+assemble_FE_data(const simple_mesh_description<typename MatrixType::GlobalOrdinalType>& mesh,
+                 MatrixType& A,
+                 VectorType& b,
+                 Parameters& params)
+{
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal;
+
+  Box elem_box;
+  copy_box(mesh.local_box, elem_box);
+
+  //Leave A and b untouched when the local box holds no ids at all.
+  if (get_num_ids<GlobalOrdinal>(elem_box) < 1) return;
+
+  //
+  //The element loop must cover our (processor-local) domain plus a ghost
+  //layer so that the complete linear system can be assembled without any
+  //communication. A lower bound is only moved if it is above 0, an upper
+  //bound only if it is below the upper bound of the global box.
+  //
+  const int ghost_width = 1;
+  for(int axis=0; axis<3; ++axis) {
+    const int global_upper = mesh.global_box[axis][1];
+
+    if (elem_box[axis][0] > 0)
+      elem_box[axis][0] -= ghost_width;
+
+    if (elem_box[axis][1] < global_upper)
+      elem_box[axis][1] += ghost_width;
+  }
+
+  perform_element_loop(mesh, elem_box, A, b, params);
+}
+                      
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/src/cg_solve.hpp b/openmp-avx512/src/cg_solve.hpp
new file mode 100644
index 0000000..69812c4
--- /dev/null
+++ b/openmp-avx512/src/cg_solve.hpp
@@ -0,0 +1,215 @@
+#ifndef _cg_solve_hpp_
+#define _cg_solve_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia	Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <cmath>
+#include <limits>
+
+#include <Vector_functions.hpp>
+#include <mytimer.hpp>
+
+#include <outstream.hpp>
+
+namespace miniFE {
+
+//Debug helper: prints each entry of 'vec' on its own line as "name[i]: value".
+template<typename Scalar>
+void print_vec(const std::vector<Scalar>& vec, const std::string& name)
+{
+  const size_t count = vec.size();
+  for(size_t idx=0; idx!=count; ++idx) std::cout << name << "[" << idx << "]: " << vec[idx] << std::endl;
+}
+
+//Orthogonality test copied from Aztec (originally written by Ray Tuminaro).
+//Given inner = <v,w>, the inner product of v and w, the two vectors are
+//treated as orthogonal when |inner| <= 100 * ||v||_2 * ||w||_2 * epsilon.
+template<typename VectorType>
+bool breakdown(typename VectorType::ScalarType inner,
+               const VectorType& v,
+               const VectorType& w)
+{
+  typedef typename TypeTraits<typename VectorType::ScalarType>::magnitude_type magnitude;
+
+  const magnitude norm_v = std::sqrt(dot(v,v));
+  const magnitude norm_w = std::sqrt(dot(w,w));
+  const magnitude eps = std::numeric_limits<magnitude>::epsilon();
+
+  //Evaluated left to right: ((100*norm_v)*norm_w)*eps.
+  const magnitude bound = 100*norm_v*norm_w*eps;
+
+  return std::abs(inner) <= bound;
+}
+
+//Conjugate-gradient iteration on A and b, starting from the x passed in. Runs while k <= max_iter and
+//normr > tolerance, or until numerical breakdown; every exit path writes the timers in my_cg_times.
+template<typename OperatorType, typename VectorType, typename Matvec>
+void
+cg_solve(OperatorType& A,      //operator; A.has_local_indices must be true
+         const VectorType& b,  //right-hand side
+         VectorType& x,        //in: starting vector, updated in every iteration
+         Matvec matvec,        //callable invoked as matvec(A, p, Ap)
+         typename OperatorType::LocalOrdinalType max_iter, //upper limit on the iteration count
+         typename TypeTraits<typename OperatorType::ScalarType>::magnitude_type& tolerance, //only read here
+         typename OperatorType::LocalOrdinalType& num_iters, //out: last iteration completed; untouched if none ran
+         typename TypeTraits<typename OperatorType::ScalarType>::magnitude_type& normr, //out: residual norm
+         timer_type* my_cg_times) //out: entries WAXPY, DOT, MATVEC, MATVECDOT and TOTAL are written
+{
+  typedef typename OperatorType::ScalarType ScalarType;
+  typedef typename OperatorType::LocalOrdinalType LocalOrdinalType;
+  typedef typename TypeTraits<ScalarType>::magnitude_type magnitude_type;
+
+  timer_type t0 = 0, tWAXPY = 0, tDOT = 0, tMATVEC = 0, tMATVECDOT = 0;
+  timer_type total_time = mytimer();
+
+  int myproc = 0;
+#ifdef HAVE_MPI
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+#endif
+
+  if (!A.has_local_indices) {
+    std::cerr << "miniFE::cg_solve ERROR, A.has_local_indices is false, needs to be true. This probably means "
+       << "miniFE::make_local_matrix(A) was not called prior to calling miniFE::cg_solve." << std::endl;
+    my_cg_times[WAXPY] = my_cg_times[DOT] = my_cg_times[MATVEC] = my_cg_times[MATVECDOT] = my_cg_times[TOTAL] = 0; //nothing was timed
+    return;
+  }
+
+  size_t nrows = A.rows.size();
+  LocalOrdinalType ncols = A.num_cols;
+
+  VectorType r(b.startIndex, nrows);
+  VectorType p(0, ncols);
+  VectorType Ap(b.startIndex, nrows);
+
+  normr = 0;
+  magnitude_type rtrans = 0;
+  magnitude_type oldrtrans = 0;
+
+  LocalOrdinalType print_freq = max_iter/10;
+  if (print_freq>50) print_freq = 50;
+  if (print_freq<1)  print_freq = 1;
+
+  ScalarType one = 1.0;
+  ScalarType zero = 0.0;
+
+  TICK(); waxpby(one, x, zero, x, p); TOCK(tWAXPY);
+
+//  print_vec(p.coefs, "p");
+
+  TICK();
+  matvec(A, p, Ap);
+  TOCK(tMATVEC);
+
+  TICK(); waxpby(one, b, -one, Ap, r); TOCK(tWAXPY);
+
+  TICK(); rtrans = dot_r2(r); TOCK(tDOT);
+
+//std::cout << "rtrans="<<rtrans<<std::endl;
+
+  normr = std::sqrt(rtrans);
+
+  if (myproc == 0) {
+    std::cout << "Initial Residual = "<< normr << std::endl;
+  }
+
+  magnitude_type brkdown_tol = std::numeric_limits<magnitude_type>::epsilon();
+
+#ifdef MINIFE_DEBUG
+  std::ostream& os = outstream();
+  os << "brkdown_tol = " << brkdown_tol << std::endl;
+#endif
+
+#ifdef MINIFE_DEBUG_OPENMP
+  std::cout << "Starting CG Solve Phase..." << std::endl;
+#endif
+
+  for(LocalOrdinalType k=1; k <= max_iter && normr > tolerance; ++k) {
+    if (k == 1) {
+      TICK(); waxpby(one, r, zero, r, p); TOCK(tWAXPY);
+    }
+    else {
+      oldrtrans = rtrans;
+      TICK(); rtrans = dot_r2(r); TOCK(tDOT);
+      magnitude_type beta = rtrans/oldrtrans;
+      TICK(); daxpby(one, r, beta, p); TOCK(tWAXPY);
+    }
+
+    normr = std::sqrt(rtrans);
+
+    if (myproc == 0 && (k%print_freq==0 || k==max_iter)) {
+      std::cout << "Iteration = "<<k<<"   Residual = "<<normr<<std::endl;
+    }
+
+    magnitude_type alpha = 0;
+    magnitude_type p_ap_dot = 0;
+
+    TICK(); matvec(A, p, Ap); TOCK(tMATVEC);
+    TICK(); p_ap_dot = dot(Ap, p); TOCK(tDOT);
+
+#ifdef MINIFE_DEBUG
+    os << "iter " << k << ", p_ap_dot = " << p_ap_dot;
+    os.flush();
+#endif
+    if (p_ap_dot < brkdown_tol) {
+      if (p_ap_dot < 0 || breakdown(p_ap_dot, Ap, p)) {
+        std::cerr << "miniFE::cg_solve ERROR, numerical breakdown!"<<std::endl;
+#ifdef MINIFE_DEBUG
+        os << "ERROR, numerical breakdown!"<<std::endl;
+#endif
+        //update the timers before jumping out.
+        my_cg_times[WAXPY] = tWAXPY;
+        my_cg_times[DOT] = tDOT;
+        my_cg_times[MATVEC] = tMATVEC;
+        my_cg_times[MATVECDOT] = tMATVECDOT;
+        my_cg_times[TOTAL] = mytimer() - total_time;
+        return;
+      }
+      else brkdown_tol = 0.1 * p_ap_dot;
+    }
+    alpha = rtrans/p_ap_dot;
+#ifdef MINIFE_DEBUG
+    os << ", rtrans = " << rtrans << ", alpha = " << alpha << std::endl;
+#endif
+
+    TICK(); daxpby(alpha, p, one, x);
+            daxpby(-alpha, Ap, one, r); TOCK(tWAXPY);
+
+    num_iters = k;
+  }
+
+  my_cg_times[WAXPY] = tWAXPY;
+  my_cg_times[DOT] = tDOT;
+  my_cg_times[MATVEC] = tMATVEC;
+  my_cg_times[MATVECDOT] = tMATVECDOT;
+  my_cg_times[TOTAL] = mytimer() - total_time;
+}
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/src/driver.hpp b/openmp-avx512/src/driver.hpp
new file mode 100644
index 0000000..74321ea
--- /dev/null
+++ b/openmp-avx512/src/driver.hpp
@@ -0,0 +1,410 @@
+#ifndef _driver_hpp_
+#define _driver_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia	Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <cstddef>
+#include <cmath>
+#include <cstdlib>
+#include <iostream>
+#include <sstream>
+#include <iomanip>
+
+#include <box_utils.hpp>
+#include <Vector.hpp>
+
+#ifdef MINIFE_SELL_MATRIX
+#include <SELLMatrix.hpp>
+#elif defined(MINIFE_ELL_MATRIX)
+#include <ELLMatrix.hpp>
+#else
+#include <CSRMatrix.hpp>
+#endif
+
+#include <simple_mesh_description.hpp>
+
+#include <SparseMatrix_functions.hpp>
+
+#include <generate_matrix_structure.hpp>
+#include <assemble_FE_data.hpp>
+
+#include <verify_solution.hpp>
+
+#include <compute_matrix_stats.hpp>
+#include <make_local_matrix.hpp>
+#include <imbalance.hpp>
+#include <cg_solve.hpp>
+#if MINIFE_KERNELS != 0
+#include <time_kernels.hpp>
+#endif
+#include <outstream.hpp>
+#include <utils.hpp>
+#include <mytimer.hpp>
+#include <YAML_Doc.hpp>
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+#define RUN_TIMED_FUNCTION(msg, fn, time_inc, time_total) /* expects 'myproc' in the calling scope */ \
+{                                   /* braces keep rtf_t0 local to each use */ \
+  if (myproc==0) {                  /* only the process with myproc==0 prints */ \
+    std::cout.width(30);            \
+    std::cout << msg;               \
+    std::cout.flush();              /* show the label before fn starts running */ \
+  }                                 \
+  timer_type rtf_t0 = mytimer();    \
+  fn;                               /* the statement being timed */ \
+  time_inc = mytimer() - rtf_t0;    /* time_inc is overwritten with this call's duration */ \
+  time_total += time_inc;           /* time_total accumulates across uses */ \
+  if (myproc==0) {                  \
+    std::cout << time_inc << "s, total time: " << time_total << std::endl; \
+  }                                 \
+}
+
+//This program assembles finite-element matrices into a global matrix and
+//vector, then solves the linear-system using Conjugate Gradients.
+//Each finite-element is a hexahedron with 8 vertex-nodes.
+//
+//Notes:
+//- In finite-element terms, the box dimensions are in elements, not nodes.
+//  In other words, a 2x2x2 box describes 8 elements, each of which has 8 nodes,
+//  so it is a 3x3x3 node domain (27 nodes).
+//  The assembled linear system will have 1 equation for each finite element node.
+//
+//- The coordinate origin is at the corner of the global box where x=0,
+//  y=0, z=0, and the box extends along the positive x-axis, positive y-axis,
+//  and the positive z-axis.
+//
+//- Some aspects of matrix-structure generation and finite-element assembly
+//  are convenient to do using global node identifiers.
+//  A global identifier for each node is obtained from coordinates plus
+//  global box dimensions. See the function 'get_id' in box_utils.hpp.
+//
+//- Each node corresponds to a row in the matrix. The RCB partitioning method
+//  we use to split the global box among processors results in some
+//  processors owning non-contiguous blocks of global node identifiers.
+//  Since it is convenient for matrices and vectors to store contiguously-
+//  numbered blocks of rows, we map global node identifiers to a separate
+//  space of row numbers such that each processor's nodes correspond to a
+//  contiguous block of row numbers.
+//
+
+namespace miniFE {
+
+//Builds the mesh description, generates and assembles the linear system, applies Dirichlet BCs and runs
+//cg_solve (or time_kernels when MINIFE_KERNELS != 0); rank 0 records timings in ydoc. Returns verify_result.
+template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal>
+int
+driver(const Box& global_box, Box& my_box,
+       Parameters& params, YAML_Doc& ydoc)
+{
+  int global_nx = global_box[0][1];
+  int global_ny = global_box[1][1];
+  int global_nz = global_box[2][1];
+
+  int numprocs = 1, myproc = 0;
+#ifdef HAVE_MPI
+  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+#endif
+
+  if (params.load_imbalance > 0) {
+    add_imbalance<GlobalOrdinal>(global_box, my_box, params.load_imbalance, ydoc);
+  }
+
+  float largest_imbalance = 0, std_dev = 0;
+  compute_imbalance<GlobalOrdinal>(global_box, my_box, largest_imbalance,
+                                   std_dev, ydoc, true);
+
+
+  //Create a representation of the mesh:
+  //Note that 'simple_mesh_description' is a virtual or conceptual
+  //mesh that doesn't actually store mesh data.
+
+  if (myproc==0) {
+    std::cout.width(30);
+    std::cout << "creating/filling mesh...";
+    std::cout.flush();
+  }
+
+  timer_type t_start = mytimer();
+  timer_type t0 = mytimer();
+
+  simple_mesh_description<GlobalOrdinal> mesh(global_box, my_box);
+
+  timer_type mesh_fill = mytimer() - t0;
+  timer_type t_total = mytimer() - t_start;
+
+  if (myproc==0) {
+    std::cout << mesh_fill << "s, total time: " << t_total << std::endl;
+  }
+
+  //next we will generate the matrix structure.
+
+  //Declare matrix object:
+#ifdef MINIFE_SELL_MATRIX
+  typedef SELLMatrix<Scalar,LocalOrdinal,GlobalOrdinal> MatrixType;
+#elif defined(MINIFE_ELL_MATRIX)
+  typedef ELLMatrix<Scalar,LocalOrdinal,GlobalOrdinal> MatrixType;
+#else
+  typedef CSRMatrix<Scalar,LocalOrdinal,GlobalOrdinal> MatrixType;
+#endif
+
+  MatrixType A;
+
+  timer_type gen_structure;
+  RUN_TIMED_FUNCTION("generating matrix structure...",
+                     generate_matrix_structure(mesh, A),
+                     gen_structure, t_total);
+
+  GlobalOrdinal local_nrows = A.rows.size();
+  GlobalOrdinal my_first_row = local_nrows > 0 ? A.rows[0] : -1;
+
+  Vector<Scalar,LocalOrdinal,GlobalOrdinal> b(my_first_row, local_nrows);
+  Vector<Scalar,LocalOrdinal,GlobalOrdinal> x(my_first_row, local_nrows);
+
+  //Assemble finite-element sub-matrices and sub-vectors into the global
+  //linear system:
+
+  timer_type fe_assembly;
+  RUN_TIMED_FUNCTION("assembling FE data...",
+                     assemble_FE_data(mesh, A, b, params),
+                     fe_assembly, t_total);
+
+  if (myproc == 0) {
+    ydoc.add("Matrix structure generation","");
+    ydoc.get("Matrix structure generation")->add("Mat-struc-gen Time",gen_structure);
+    ydoc.add("FE assembly","");
+    ydoc.get("FE assembly")->add("FE assembly Time",fe_assembly);
+  }
+
+#ifdef MINIFE_DEBUG
+  write_matrix("A_prebc.mtx", A);
+  write_vector("b_prebc.vec", b);
+#endif
+
+  //Now apply dirichlet boundary-conditions
+  //(Apply the 0-valued surfaces first, then the 1-valued surface last.)
+
+  timer_type dirbc_time;
+  RUN_TIMED_FUNCTION("imposing Dirichlet BC...",
+            impose_dirichlet(0.0, A, b, global_nx+1, global_ny+1, global_nz+1, mesh.bc_rows_0), dirbc_time, t_total);
+  RUN_TIMED_FUNCTION("imposing Dirichlet BC...",
+            impose_dirichlet(1.0, A, b, global_nx+1, global_ny+1, global_nz+1, mesh.bc_rows_1), dirbc_time, t_total);
+
+#ifdef MINIFE_DEBUG
+  write_matrix("A.mtx", A);
+  write_vector("b.vec", b);
+#endif
+
+  //Transform global indices to local, set up communication information:
+
+  timer_type make_local_time;
+  RUN_TIMED_FUNCTION("making matrix indices local...",
+                     make_local_matrix(A),
+                     make_local_time, t_total);
+
+#ifdef MINIFE_DEBUG
+  write_matrix("A_local.mtx", A);
+  write_vector("b_local.vec", b);
+#endif
+
+  size_t global_nnz = compute_matrix_stats(A, myproc, numprocs, ydoc);
+
+  //Prepare to perform conjugate gradient solve:
+
+  LocalOrdinal max_iters = 200;
+  LocalOrdinal num_iters = 0;
+  typedef typename TypeTraits<Scalar>::magnitude_type magnitude;
+  magnitude rnorm = 0;
+  magnitude tol = std::numeric_limits<magnitude>::epsilon();
+
+  timer_type cg_times[NUM_TIMERS] = {0}; //zeroed: the reporting below reads entries even when the solver was skipped
+
+  typedef Vector<Scalar,LocalOrdinal,GlobalOrdinal> VectorType;
+
+  t_total = mytimer() - t_start;
+
+  bool matvec_with_comm_overlap = params.mv_overlap_comm_comp==1;
+
+  int verify_result = 0;
+
+#if MINIFE_KERNELS != 0
+  if (myproc==0) {
+    std::cout.width(30);
+    std::cout << "Starting kernel timing loops ..." << std::endl;
+  }
+
+  max_iters = 500;
+  x.coefs[0] = 0.9;
+  if (matvec_with_comm_overlap) {
+    time_kernels(A, b, x, matvec_overlap<MatrixType,VectorType>(), max_iters, rnorm, cg_times);
+  }
+  else {
+    time_kernels(A, b, x, matvec_std<MatrixType,VectorType>(), max_iters, rnorm, cg_times);
+  }
+  num_iters = max_iters;
+  std::string title("Kernel timings");
+#else
+  if (myproc==0) {
+    std::cout << "Starting CG solver ... " << std::endl;
+  }
+
+  if (matvec_with_comm_overlap) {
+#ifdef MINIFE_CSR_MATRIX
+    rearrange_matrix_local_external(A);
+    cg_solve(A, b, x, matvec_overlap<MatrixType,VectorType>(), max_iters, tol,
+           num_iters, rnorm, cg_times);
+#else
+    std::cout << "ERROR, matvec with overlapping comm/comp only works with CSR matrix."<<std::endl;
+#endif
+  }
+  else {
+    cg_solve(A, b, x, matvec_std<MatrixType,VectorType>(), max_iters, tol,
+           num_iters, rnorm, cg_times);
+    if (myproc == 0) {
+      std::cout << "Final Resid Norm: " << rnorm << std::endl;
+    }
+
+    if (params.verify_solution > 0) {
+      double tolerance = 0.06;
+      bool verify_whole_domain = false;
+  #ifdef MINIFE_DEBUG
+      verify_whole_domain = true;
+  #endif
+      if (myproc == 0) {
+        if (verify_whole_domain) std::cout << "verifying solution..." << std::endl;
+        else std::cout << "verifying solution at ~ (0.5, 0.5, 0.5) ..." << std::endl;
+      }
+      verify_result = verify_solution(mesh, x, tolerance, verify_whole_domain);
+    }
+  }
+
+#ifdef MINIFE_DEBUG
+  write_vector("x.vec", x);
+#endif
+  std::string title("CG solve");
+#endif
+
+  if (myproc == 0) {
+    ydoc.get("Global Run Parameters")->add("ScalarType",TypeTraits<Scalar>::name());
+    ydoc.get("Global Run Parameters")->add("GlobalOrdinalType",TypeTraits<GlobalOrdinal>::name());
+    ydoc.get("Global Run Parameters")->add("LocalOrdinalType",TypeTraits<LocalOrdinal>::name());
+    ydoc.add(title,"");
+    ydoc.get(title)->add("Iterations",num_iters);
+    ydoc.get(title)->add("Final Resid Norm",rnorm);
+
+    GlobalOrdinal global_nrows = global_nx;
+    global_nrows *= global_ny; global_nrows *= global_nz; //multiply in GlobalOrdinal, not in int
+
+    //flops-per-mv, flops-per-dot, flops-per-waxpy:
+    double mv_flops = global_nnz*2.0;
+    double dot_flops = global_nrows*2.0;
+    double waxpy_flops = global_nrows*3.0;
+
+#if MINIFE_KERNELS == 0
+//if MINIFE_KERNELS == 0 then we did a CG solve, and in that case
+//there were num_iters+1 matvecs, num_iters*2 dots, and num_iters*3+2 waxpys.
+    mv_flops *= (num_iters+1);
+    dot_flops *= (2*num_iters);
+    waxpy_flops *= (3*num_iters+2);
+#else
+//if MINIFE_KERNELS then we did one of each operation per iteration.
+    mv_flops *= num_iters;
+    dot_flops *= num_iters;
+    waxpy_flops *= num_iters;
+#endif
+
+    double total_flops = mv_flops + dot_flops + waxpy_flops;
+
+    double mv_mflops = -1;
+    if (cg_times[MATVEC] > 1.e-4)
+      mv_mflops = 1.e-6 * (mv_flops/cg_times[MATVEC]);
+
+    double dot_mflops = -1;
+    if (cg_times[DOT] > 1.e-4)
+      dot_mflops = 1.e-6 * (dot_flops/cg_times[DOT]);
+
+    double waxpy_mflops = -1;
+    if (cg_times[WAXPY] > 1.e-4)
+      waxpy_mflops = 1.e-6 *  (waxpy_flops/cg_times[WAXPY]);
+
+    double total_mflops = -1;
+    if (cg_times[TOTAL] > 1.e-4)
+      total_mflops = 1.e-6 * (total_flops/cg_times[TOTAL]);
+
+    ydoc.get(title)->add("WAXPY Time",cg_times[WAXPY]);
+    ydoc.get(title)->add("WAXPY Flops",waxpy_flops);
+    if (waxpy_mflops >= 0)
+      ydoc.get(title)->add("WAXPY Mflops",waxpy_mflops);
+    else
+      ydoc.get(title)->add("WAXPY Mflops","inf");
+
+    ydoc.get(title)->add("DOT Time",cg_times[DOT]);
+    ydoc.get(title)->add("DOT Flops",dot_flops);
+    if (dot_mflops >= 0)
+      ydoc.get(title)->add("DOT Mflops",dot_mflops);
+    else
+      ydoc.get(title)->add("DOT Mflops","inf");
+
+    ydoc.get(title)->add("MATVEC Time",cg_times[MATVEC]);
+    ydoc.get(title)->add("MATVEC Flops",mv_flops);
+    if (mv_mflops >= 0)
+      ydoc.get(title)->add("MATVEC Mflops",mv_mflops);
+    else
+      ydoc.get(title)->add("MATVEC Mflops","inf");
+
+#ifdef MINIFE_FUSED
+    ydoc.get(title)->add("MATVECDOT Time",cg_times[MATVECDOT]);
+    ydoc.get(title)->add("MATVECDOT Flops",mv_flops);
+    if (mv_mflops >= 0)
+      ydoc.get(title)->add("MATVECDOT Mflops",mv_mflops);
+    else
+      ydoc.get(title)->add("MATVECDOT Mflops","inf");
+#endif
+
+#if MINIFE_KERNELS == 0
+    ydoc.get(title)->add("Total","");
+    ydoc.get(title)->get("Total")->add("Total CG Time",cg_times[TOTAL]);
+    ydoc.get(title)->get("Total")->add("Total CG Flops",total_flops);
+    if (total_mflops >= 0)
+      ydoc.get(title)->get("Total")->add("Total CG Mflops",total_mflops);
+    else
+      ydoc.get(title)->get("Total")->add("Total CG Mflops","inf");
+    ydoc.get(title)->add("Time per iteration", num_iters > 0 ? cg_times[TOTAL]/num_iters : 0.0); //no division by zero iterations
+#endif
+  }
+
+  return verify_result;
+}
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/src/exchange_externals.hpp b/openmp-avx512/src/exchange_externals.hpp
new file mode 100644
index 0000000..7faf242
--- /dev/null
+++ b/openmp-avx512/src/exchange_externals.hpp
@@ -0,0 +1,270 @@
+#ifndef _exchange_externals_hpp_
+#define _exchange_externals_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <cstdlib>
+#include <iostream>
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+#include <outstream.hpp>
+
+#include <TypeTraits.hpp>
+
+namespace miniFE {
+
+// Refreshes the externally-owned entries of x from the neighbouring MPI ranks.
+// Posts one MPI_Irecv per neighbour into x.coefs starting at index
+// A.rows.size(), packs x.coefs[A.elements_to_send[i]] into A.send_buffer,
+// MPI_Sends each neighbour its slice and finally waits on every receive.
+// Does nothing without HAVE_MPI or when fewer than two ranks are running;
+// calls MPI_Abort if an MPI_Wait fails.
+template<typename MatrixType,
+         typename VectorType>
+void
+exchange_externals(MatrixType& A,
+                   VectorType& x)
+{
+#ifdef HAVE_MPI
+#ifdef MINIFE_DEBUG
+  std::ostream& os = outstream();
+  os << "entering exchange_externals\n";
+#endif
+
+  int numprocs = 1;
+  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+
+  if (numprocs < 2) return;
+
+  typedef typename MatrixType::ScalarType Scalar;
+  typedef typename MatrixType::LocalOrdinalType LocalOrdinal;
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal;
+
+  // Extract Matrix pieces
+
+  int local_nrow = A.rows.size();
+  int num_neighbors = A.neighbors.size();
+  const std::vector<LocalOrdinal>& recv_length = A.recv_length;
+  const std::vector<LocalOrdinal>& send_length = A.send_length;
+  const std::vector<int>& neighbors = A.neighbors;
+  const std::vector<GlobalOrdinal>& elements_to_send = A.elements_to_send;
+
+  std::vector<Scalar>& send_buffer = A.send_buffer;
+
+  // First post receives; these are immediate (non-blocking) receives.
+  // Do not wait for the results here, that is done by the
+  // MPI_Wait loop at the end of this function.
+
+  int MPI_MY_TAG = 99;
+
+  std::vector<MPI_Request>& request = A.request;
+
+  // Externals are at end of locals. The pointer is formed by offsetting from
+  // element 0 so operator[] is never applied to the one-past-the-end index.
+  std::vector<Scalar>& x_coefs = x.coefs;
+  Scalar* x_external = &x_coefs[0] + local_nrow;
+
+  MPI_Datatype mpi_dtype = TypeTraits<Scalar>::mpi_type();
+
+  // Post receives first
+  for(int i=0; i<num_neighbors; ++i) {
+    int n_recv = recv_length[i];
+    MPI_Irecv(x_external, n_recv, mpi_dtype, neighbors[i], MPI_MY_TAG,
+              MPI_COMM_WORLD, &request[i]);
+    x_external += n_recv;
+  }
+
+#ifdef MINIFE_DEBUG
+  os << "launched recvs\n";
+#endif
+
+  // Fill up send buffer
+
+  size_t total_to_be_sent = elements_to_send.size();
+#ifdef MINIFE_DEBUG
+  os << "total_to_be_sent: " << total_to_be_sent << std::endl;
+#endif
+
+#pragma omp parallel for
+  for(size_t i=0; i<total_to_be_sent; ++i) {
+#ifdef MINIFE_DEBUG
+    //expensive index range-check (valid indices are 0 .. x.coefs.size()-1):
+    if (elements_to_send[i] < 0 || elements_to_send[i] >= x.coefs.size()) {
+      os << "error, out-of-range. x.coefs.size()=="<<x.coefs.size()<<", elements_to_send[i]=="<<elements_to_send[i]<<std::endl;
+    }
+#endif
+    send_buffer[i] = x.coefs[elements_to_send[i]];
+  }
+
+  //
+  // Send to each neighbor
+  //
+
+  Scalar* s_buffer = &send_buffer[0];
+
+  for(int i=0; i<num_neighbors; ++i) {
+    int n_send = send_length[i];
+    MPI_Send(s_buffer, n_send, mpi_dtype, neighbors[i], MPI_MY_TAG,
+             MPI_COMM_WORLD);
+    s_buffer += n_send;
+  }
+
+#ifdef MINIFE_DEBUG
+  os << "send to " << num_neighbors << std::endl;
+#endif
+
+  //
+  // Complete the reads issued above
+  //
+
+  MPI_Status status;
+  for(int i=0; i<num_neighbors; ++i) {
+    if (MPI_Wait(&request[i], &status) != MPI_SUCCESS) {
+      std::cerr << "MPI_Wait error\n"<<std::endl;
+      MPI_Abort(MPI_COMM_WORLD, -1);
+    }
+  }
+
+#ifdef MINIFE_DEBUG
+  os << "leaving exchange_externals"<<std::endl;
+#endif
+
+//endif HAVE_MPI
+#endif
+}
+
+#ifdef HAVE_MPI
+static std::vector<MPI_Request> exch_ext_requests; // receive requests posted by begin_exchange_externals and waited on by finish_exchange_externals; 'static' gives every translation unit including this header its own copy
+#endif
+
+// Starts the exchange of externally-owned entries of x without waiting for it:
+// posts one MPI_Irecv per neighbour into x.coefs starting at index
+// A.rows.size() (requests are kept in exch_ext_requests), packs
+// x.coefs[A.elements_to_send[i]] into a temporary buffer and MPI_Sends each
+// neighbour its slice. finish_exchange_externals must be called afterwards to
+// complete the receives. Does nothing without HAVE_MPI or with a single rank.
+template<typename MatrixType,
+         typename VectorType>
+void
+begin_exchange_externals(MatrixType& A,
+                         VectorType& x)
+{
+#ifdef HAVE_MPI
+  int numprocs = 1;
+  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+
+  if (numprocs < 2) return;
+
+  typedef typename MatrixType::ScalarType Scalar;
+  typedef typename MatrixType::LocalOrdinalType LocalOrdinal;
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal;
+
+  // Extract Matrix pieces
+
+  int local_nrow = A.rows.size();
+  int num_neighbors = A.neighbors.size();
+  const std::vector<LocalOrdinal>& recv_length = A.recv_length;
+  const std::vector<LocalOrdinal>& send_length = A.send_length;
+  const std::vector<int>& neighbors = A.neighbors;
+  const std::vector<GlobalOrdinal>& elements_to_send = A.elements_to_send;
+
+  std::vector<Scalar> send_buffer(elements_to_send.size(), 0);
+
+  // First post receives; these are immediate (non-blocking) receives.
+  // They are not waited on in this function; finish_exchange_externals
+  // performs the corresponding MPI_Wait calls.
+
+  int MPI_MY_TAG = 99;
+
+  exch_ext_requests.resize(num_neighbors);
+
+  // Externals are at end of locals; offset from element 0, never index past the end.
+
+  std::vector<Scalar>& x_coefs = x.coefs;
+  Scalar* x_external = &x_coefs[0] + local_nrow;
+
+  MPI_Datatype mpi_dtype = TypeTraits<Scalar>::mpi_type();
+
+  // Post receives first
+  for(int i=0; i<num_neighbors; ++i) {
+    int n_recv = recv_length[i];
+    MPI_Irecv(x_external, n_recv, mpi_dtype, neighbors[i], MPI_MY_TAG,
+              MPI_COMM_WORLD, &exch_ext_requests[i]);
+    x_external += n_recv;
+  }
+
+  //
+  // Fill up send buffer
+  //
+
+  size_t total_to_be_sent = elements_to_send.size();
+  for(size_t i=0; i<total_to_be_sent; ++i) send_buffer[i] = x.coefs[elements_to_send[i]];
+
+  //
+  // Send to each neighbor
+  //
+
+  Scalar* s_buffer = &send_buffer[0];
+
+  for(int i=0; i<num_neighbors; ++i) {
+    int n_send = send_length[i];
+    MPI_Send(s_buffer, n_send, mpi_dtype, neighbors[i], MPI_MY_TAG,
+             MPI_COMM_WORLD);
+    s_buffer += n_send;
+  }
+#endif
+}
+
+// Completes the receives started by begin_exchange_externals: waits on the
+// first num_neighbors entries of exch_ext_requests and aborts the MPI job
+// if any wait reports an error. Compiles to an empty body without HAVE_MPI.
+inline
+void
+finish_exchange_externals(int num_neighbors)
+{
+#ifdef HAVE_MPI
+  MPI_Status wait_status;
+  int pending = 0;
+  while (pending < num_neighbors) {
+    const int rc = MPI_Wait(&exch_ext_requests[pending], &wait_status);
+    if (rc != MPI_SUCCESS) {
+      std::cerr << "MPI_Wait error\n"<<std::endl;
+      MPI_Abort(MPI_COMM_WORLD, -1);
+    }
+    ++pending;
+  }
+#endif // HAVE_MPI
+}
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/src/generate_info_header b/openmp-avx512/src/generate_info_header
new file mode 100755
index 0000000..469f0b9
--- /dev/null
+++ b/openmp-avx512/src/generate_info_header
@@ -0,0 +1,88 @@
+#!/bin/bash
+
+# usage:
+# generate_info_header <compiler> <flags> <header_prefix> <macro_prefix>
+# example:
+#   % generate_info_header g++ -O3 miniFE MINIFE
+# this will cause the appropriate info to be put in a
+# header named miniFE_info.hpp and the info will be in macros
+# that start with MINIFE. Exactly four arguments are required, so several
+# compiler flags have to be passed as one quoted argument.
+# an example of usage can be seen in miniFE/make_targets
+#
+if [ $# != 4 ] ; then
+echo "error, need 4 arguments.";
+exit 1;
+fi
+# Resolve the compiler's path with 'which'; use "unknown" if that fails.
+cxx=`which ${1}`
+errcode="$?"
+if [ ${errcode} != "0" ] ; then
+cxx="unknown";
+fi
+echo "CXX: ${cxx}"
+# Ask the compiler for its version: try --version first, then -V.
+cxx_ver=`${1} --version 2>&1`
+errcode="$?"
+if [ ${errcode} != "0" ] ; then
+cxx_ver=`${1} -V 2>&1`;
+errcode="$?"
+if [ ${errcode} != "0" ] ; then
+cxx_ver="unknown";
+fi
+fi
+# Keep only the first whitespace-delimited chunk of the version text.
+cxx_ver=${cxx_ver// /@}  # mask spaces as '@' so they do not split words below
+cxx_version=""
+for i in $(echo ${cxx_ver});
+do
+ if [ "$cxx_version" == "" ]; then
+    cxx_version=$i;
+ fi
+done
+cxx_version=${cxx_version//@/ }  # turn the '@' placeholders back into spaces
+echo "Compiler version: ${cxx_version}"
+# Record the flags argument and query uname, defaulting each value to "unknown".
+cxxflags=${2}
+hostname=`uname -n`
+errcode="$?"
+if [ ${errcode} != "0" ] ; then
+hostname="unknown";
+fi
+
+kern_name=`uname -s`
+errcode="$?"
+if [ ${errcode} != "0" ] ; then
+kern_name="unknown";
+fi
+
+kern_rel=`uname -r`
+errcode="$?"
+if [ ${errcode} != "0" ] ; then
+kern_rel="unknown";
+fi
+
+proc=`uname -p`
+errcode="$?"
+if [ ${errcode} != "0" ] ; then
+proc="unknown";
+fi
+# Write <header_prefix>_info.hpp with one string macro per collected value.
+header_prefix=${3}
+macro_prefix=${4}
+
+cat << END_CAT > ${header_prefix}_info.hpp
+#ifndef ${header_prefix}_info_hpp
+#define ${header_prefix}_info_hpp
+
+#define ${macro_prefix}_HOSTNAME "${hostname}"
+#define ${macro_prefix}_KERNEL_NAME "'${kern_name}'"
+#define ${macro_prefix}_KERNEL_RELEASE "'${kern_rel}'"
+#define ${macro_prefix}_PROCESSOR "'${proc}'"
+
+#define ${macro_prefix}_CXX "'${cxx}'"
+#define ${macro_prefix}_CXX_VERSION "'${cxx_version}'"
+#define ${macro_prefix}_CXXFLAGS "'${cxxflags}'"
+
+#endif
+END_CAT
diff --git a/openmp-avx512/src/generate_matrix_structure.hpp b/openmp-avx512/src/generate_matrix_structure.hpp
new file mode 100644
index 0000000..76de40c
--- /dev/null
+++ b/openmp-avx512/src/generate_matrix_structure.hpp
@@ -0,0 +1,165 @@
+#ifndef _generate_matrix_structure_hpp_
+#define _generate_matrix_structure_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <sstream>
+#include <stdexcept>
+#include <map>
+#include <algorithm>
+
+#include <simple_mesh_description.hpp>
+#include <SparseMatrix_functions.hpp>
+#include <box_utils.hpp>
+#include <utils.hpp>
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+namespace miniFE {
+
+// Builds the sparsity structure of A for the nodes of mesh owned by this rank:
+// stores each node's row id and coordinates, counts its 3x3x3-stencil
+// neighbours with a valid global id, prefix-sums the counts into row offsets
+// and calls init_matrix. Returns 0 on success, 1 if an exception was caught
+// (on any rank when HAVE_MPI is defined).
+template<typename MatrixType>
+int
+generate_matrix_structure(const simple_mesh_description<typename MatrixType::GlobalOrdinalType>& mesh,
+                          MatrixType& A)
+{
+  int myproc = 0;
+#ifdef HAVE_MPI
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+#endif
+
+  int threw_exc = 0;
+  try {
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal;
+  typedef typename MatrixType::LocalOrdinalType LocalOrdinal;
+
+  const int global_nodes_x = mesh.global_box[0][1]+1;
+  const int global_nodes_y = mesh.global_box[1][1]+1;
+  const int global_nodes_z = mesh.global_box[2][1]+1;
+  Box box;
+  copy_box(mesh.local_box, box);
+
+  //num-owned-nodes in each dimension is num-elems+1
+  //only if num-elems > 0 in that dimension *and*
+  //we are at the high end of the global range in that dimension:
+  if (box[0][1] > box[0][0] && box[0][1] == mesh.global_box[0][1]) ++box[0][1];
+  if (box[1][1] > box[1][0] && box[1][1] == mesh.global_box[1][1]) ++box[1][1];
+  if (box[2][1] > box[2][0] && box[2][1] == mesh.global_box[2][1]) ++box[2][1];
+
+  // Multiply one factor at a time so that each product is formed in
+  // GlobalOrdinal instead of overflowing in int before being widened.
+  GlobalOrdinal global_nrows = global_nodes_x;
+  global_nrows *= global_nodes_y;
+  global_nrows *= global_nodes_z;
+
+  GlobalOrdinal nrows = get_num_ids<GlobalOrdinal>(box);
+  try {
+    A.reserve_space(nrows, 27);
+  }
+  catch(std::exception& exc) {
+    std::ostringstream osstr;
+    osstr << "One of A.rows.resize, A.row_offsets.resize, A.packed_cols.reserve or A.packed_coefs.reserve: nrows=" <<nrows<<": ";
+    osstr << exc.what();
+    throw std::runtime_error(osstr.str());
+  }
+
+  std::vector<GlobalOrdinal> rows(nrows);
+  std::vector<LocalOrdinal> row_offsets(nrows+1);
+  std::vector<LocalOrdinal> row_coords(nrows*3);
+
+  // Use the matrix's own ordinal types (not the MINIFE_* macros) so the
+  // pointers below always match the element types of the vectors above.
+  const GlobalOrdinal z_width = box[2][1] - box[2][0];
+  const GlobalOrdinal y_width = box[1][1] - box[1][0];
+  const GlobalOrdinal x_width = box[0][1] - box[0][0];
+  const GlobalOrdinal xy_width = x_width * y_width;
+  const GlobalOrdinal r_n = xy_width * z_width;
+  GlobalOrdinal* const row_ptr = &rows[0];
+  LocalOrdinal* const row_offset_ptr = &row_offsets[0];
+  LocalOrdinal* const row_coords_ptr = &row_coords[0];
+
+  #pragma omp parallel for
+  for(GlobalOrdinal r = 0; r < r_n; ++r) {
+    // Decode the linear index r into node coordinates, x varying fastest.
+    int iz = r / (xy_width) + box[2][0];
+    int iy = (r / x_width) % y_width + box[1][0];
+    int ix = r % x_width + box[0][0];
+    GlobalOrdinal row_id = get_id<GlobalOrdinal>(global_nodes_x, global_nodes_y, global_nodes_z,
+                                                 ix, iy, iz);
+    row_ptr[r] = mesh.map_id_to_row(row_id);
+    row_coords_ptr[r*3] = ix;
+    row_coords_ptr[r*3+1] = iy;
+    row_coords_ptr[r*3+2] = iz;
+    // Count the stencil positions (the node itself included) whose id is valid.
+    LocalOrdinal nnz = 0;
+    for(int sz=-1; sz<=1; ++sz) {
+      for(int sy=-1; sy<=1; ++sy) {
+        for(int sx=-1; sx<=1; ++sx) {
+          GlobalOrdinal col_id = get_id<GlobalOrdinal>(global_nodes_x, global_nodes_y, global_nodes_z,
+                                                       ix+sx, iy+sy, iz+sz);
+          if (col_id >= 0 && col_id < global_nrows) ++nnz;
+        }
+      }
+    }
+    row_offset_ptr[r+1] = nnz;
+  }
+
+  // Prefix sum: turn the per-row counts into row offsets.
+  const GlobalOrdinal n = row_offsets.size() - 1;
+  for(GlobalOrdinal i = 0; i < n; ++i) {
+    row_offset_ptr[i+1] += row_offset_ptr[i];
+  }
+
+  init_matrix(A, rows, row_offsets, row_coords,
+              global_nodes_x, global_nodes_y, global_nodes_z, global_nrows, mesh);
+  }
+  catch(const std::exception& exc) {
+    std::cout << "proc " << myproc << " threw an exception in generate_matrix_structure: " << exc.what() << std::endl;
+    threw_exc = 1;
+  }
+  catch(...) {
+    std::cout << "proc " << myproc << " threw an exception in generate_matrix_structure, probably due to running out of memory." << std::endl;
+    threw_exc = 1;
+  }
+#ifdef HAVE_MPI
+  int global_throw = 0;
+  MPI_Allreduce(&threw_exc, &global_throw, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+  threw_exc = global_throw;
+#endif
+  return threw_exc ? 1 : 0;
+}
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/src/get_common_files b/openmp-avx512/src/get_common_files
new file mode 100755
index 0000000..e2448e3
--- /dev/null
+++ b/openmp-avx512/src/get_common_files
@@ -0,0 +1,15 @@
+#!/bin/bash
+# Copies the shared YAML and info-header sources from ../../common into the
+# current directory. Does nothing when ./YAML_Doc.cpp is already present.
+
+if [ -f ./YAML_Doc.cpp ] ; then
+  exit 0;
+fi
+
+dir=../../common
+
+# A failed copy does not stop the remaining ones.
+for f in YAML_Doc.cpp YAML_Doc.hpp YAML_Element.cpp YAML_Element.hpp generate_info_header ; do
+  cp ${dir}/${f} .
+done
+
diff --git a/openmp-avx512/src/main.cpp b/openmp-avx512/src/main.cpp
new file mode 100644
index 0000000..5a38b7f
--- /dev/null
+++ b/openmp-avx512/src/main.cpp
@@ -0,0 +1,276 @@
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <iostream>
+#include <ctime>
+#include <cstdlib>
+#include <vector>
+
+#include <miniFE_version.h>
+
+#include <outstream.hpp>
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#ifdef MINIFE_REPORT_RUSAGE
+#include <sys/time.h>
+#include <sys/resource.h>
+#endif
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+#include <Box.hpp>
+#include <BoxPartition.hpp>
+#include <box_utils.hpp>
+#include <Parameters.hpp>
+#include <utils.hpp>
+#include <driver.hpp>
+#include <YAML_Doc.hpp>
+
+#if MINIFE_INFO != 0
+#include <miniFE_info.hpp>
+#else
+#include <miniFE_no_info.hpp>
+#endif
+
+//The following macros should be specified as compile-macros in the
+//makefile. They are defaulted here just in case...
+#ifndef MINIFE_SCALAR
+#define MINIFE_SCALAR double
+#endif
+#ifndef MINIFE_LOCAL_ORDINAL
+#define MINIFE_LOCAL_ORDINAL int
+#endif
+#ifndef MINIFE_GLOBAL_ORDINAL
+#define MINIFE_GLOBAL_ORDINAL int
+#endif
+
+// ************************************************************************
+
+void add_params_to_yaml(YAML_Doc& doc, miniFE::Parameters& params);
+void add_configuration_to_yaml(YAML_Doc& doc, int numprocs, int numthreads);
+void add_timestring_to_yaml(YAML_Doc& doc);
+
+//
+//We will create a 'box' of size nx X ny X nz, partition it among processors,
+//then call miniFE::driver which will use the partitioned box as the domain
+//from which to assemble finite-element matrices into a global matrix and
+//vector, then solve the linear-system using Conjugate Gradients.
+//
+
+int main(int argc, char** argv) {
+  miniFE::Parameters params;
+  miniFE::get_parameters(argc, argv, params);
+
+  int numprocs = 1, myproc = 0;
+  miniFE::initialize_mpi(argc, argv, numprocs, myproc);
+
+  miniFE::timer_type start_time = miniFE::mytimer();
+
+#ifdef MINIFE_DEBUG
+  outstream(numprocs, myproc);
+#endif
+
+  if(myproc==0) {
+    std::cout << "MiniFE Mini-App, OpenMP Peer Implementation" << std::endl;
+    std::cout << "Creating OpenMP Thread Pool..." << std::endl;
+  }
+  // <omp.h> is only included when _OPENMP is defined, so the runtime query
+  // needs the same guard; a build without OpenMP counts a single thread.
+  int value = 0;
+#ifdef _OPENMP
+  const int thread_count = omp_get_max_threads();
+#else
+  const int thread_count = 1;
+#endif
+#pragma omp parallel for reduction(+:value)
+  for(int i = 0; i < thread_count; ++i) {
+    value += 1;
+  }
+  double global_threadcount;
+  double local_threadcount = value;
+
+#ifdef HAVE_MPI
+  MPI_Allreduce(&local_threadcount,&global_threadcount,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
+#else
+  global_threadcount = local_threadcount;
+#endif
+  if(myproc==0) {
+    std::cout << "Counted: " << global_threadcount << " threads." << std::endl;
+    std::cout << "Running MiniFE Mini-App..." << std::endl;
+  }
+
+  //make sure each processor has the same parameters:
+  miniFE::broadcast_parameters(params);
+
+  Box global_box = { 0, params.nx, 0, params.ny, 0, params.nz };
+  std::vector<Box> local_boxes(numprocs);
+
+  box_partition(0, numprocs, 2, global_box, &local_boxes[0]);
+
+  Box& my_box = local_boxes[myproc];
+  // Every rank must own at least one id; find the smallest per-rank count.
+  MINIFE_GLOBAL_ORDINAL num_my_ids = miniFE::get_num_ids<MINIFE_GLOBAL_ORDINAL>(my_box);
+  MINIFE_GLOBAL_ORDINAL min_ids = num_my_ids;
+
+#ifdef HAVE_MPI
+  MPI_Datatype mpi_dtype = miniFE::TypeTraits<MINIFE_GLOBAL_ORDINAL>::mpi_type();
+  MPI_Allreduce(&num_my_ids, &min_ids, 1, mpi_dtype, MPI_MIN, MPI_COMM_WORLD);
+#endif
+
+  if (min_ids == 0) {
+    std::cout<<"One or more processors have 0 equations. Not currently supported. Exiting."<<std::endl;
+    miniFE::finalize_mpi();
+    return 1;
+  }
+
+  std::ostringstream osstr;
+  osstr << "miniFE." << params.nx << "x" << params.ny << "x" << params.nz;
+#ifdef HAVE_MPI
+  osstr << ".P" << numprocs;
+#endif
+#ifdef _OPENMP
+  osstr << ".T" << omp_get_max_threads();
+#endif
+  osstr << ".";
+  if (params.name != "") osstr << params.name << ".";
+
+  YAML_Doc doc("miniFE", MINIFE_VERSION, ".", osstr.str());
+  if (myproc == 0) {
+    add_params_to_yaml(doc, params);
+    add_configuration_to_yaml(doc, numprocs, params.numthreads);
+    add_timestring_to_yaml(doc);
+  }
+
+  //Most of the program is performed in the 'driver' function, which is
+  //templated on < Scalar, LocalOrdinal, GlobalOrdinal >.
+  //To run miniFE with float instead of double, or 'long long' instead of int,
+  //etc., change these template-parameters by changing the macro definitions in
+  //the makefile or on the make command-line.
+
+  int return_code =
+     miniFE::driver< MINIFE_SCALAR, MINIFE_LOCAL_ORDINAL, MINIFE_GLOBAL_ORDINAL>(global_box, my_box, params, doc);
+
+  miniFE::timer_type total_time = miniFE::mytimer() - start_time;
+
+#ifdef MINIFE_REPORT_RUSAGE
+  struct rusage get_mem;
+  getrusage(RUSAGE_SELF, &get_mem);
+  long long int rank_rss = get_mem.ru_maxrss;
+  long long int global_rss = 0;
+  long long int max_rss = 0;
+#ifdef HAVE_MPI
+  MPI_Reduce(&rank_rss, &global_rss, 1,
+             MPI_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
+  MPI_Reduce(&rank_rss, &max_rss, 1,
+             MPI_LONG_LONG, MPI_MAX, 0, MPI_COMM_WORLD);
+  if (myproc == 0) {
+    doc.add("Global All-RSS (kB)", global_rss);
+    doc.add("Global Max-RSS (kB)", max_rss);
+  }
+#else
+  doc.add("RSS (kB)", rank_rss);
+#endif
+#endif
+
+  if (myproc == 0) {
+    doc.add("Total Program Time",total_time);
+    doc.generateYAML();
+  }
+
+  miniFE::finalize_mpi();
+
+  return return_code;
+}
+
+// Records the run parameters in the "Global Run Parameters" section of doc:
+// the nx/ny/nz dimensions, load_imbalance, the mv_overlap_comm_comp flag and,
+// when built with OpenMP, the value of omp_get_max_threads().
+void add_params_to_yaml(YAML_Doc& doc, miniFE::Parameters& params)
+{
+  doc.add("Global Run Parameters","");
+  doc.get("Global Run Parameters")->add("dimensions","");
+  doc.get("Global Run Parameters")->get("dimensions")->add("nx",params.nx);
+  doc.get("Global Run Parameters")->get("dimensions")->add("ny",params.ny);
+  doc.get("Global Run Parameters")->get("dimensions")->add("nz",params.nz);
+  doc.get("Global Run Parameters")->add("load_imbalance", params.load_imbalance);
+
+  // Only the exact value 1 is reported as enabled.
+  const bool overlap_enabled = (params.mv_overlap_comm_comp == 1);
+  std::string overlap_text(overlap_enabled ? "1 (yes)" : "0 (no)");
+  doc.get("Global Run Parameters")->add("mv_overlap_comm_comp", overlap_text);
+#ifdef _OPENMP
+  doc.get("Global Run Parameters")->add("OpenMP Max Threads:", omp_get_max_threads());
+#endif
+}
+
+// Adds the processor count plus "Platform" and "Build" sections (from the MINIFE_* macros); numthreads is unused.
+void add_configuration_to_yaml(YAML_Doc& doc, int numprocs, int numthreads)
+{
+  doc.get("Global Run Parameters")->add("number of processors", numprocs);
+  doc.add("Platform","");
+  doc.get("Platform")->add("hostname",MINIFE_HOSTNAME);
+  doc.get("Platform")->add("kernel name",MINIFE_KERNEL_NAME);
+  doc.get("Platform")->add("kernel release",MINIFE_KERNEL_RELEASE);
+  doc.get("Platform")->add("processor",MINIFE_PROCESSOR);
+  doc.add("Build","");
+  doc.get("Build")->add("CXX",MINIFE_CXX);
+#if MINIFE_INFO != 0
+  doc.get("Build")->add("compiler version",MINIFE_CXX_VERSION);
+#endif
+  doc.get("Build")->add("CXXFLAGS",MINIFE_CXXFLAGS);
+#ifdef HAVE_MPI
+  std::string mpi_answer("yes");
+#else
+  std::string mpi_answer("no");
+#endif
+  doc.get("Build")->add("using MPI",mpi_answer);
+}
+
+// Adds a "Run Date/Time" entry to doc holding the current time, converted
+// with std::localtime and written in the form YYYY-MM-DD, HH-MM-SS.
+// doc: the YAML document that receives the entry.
+void add_timestring_to_yaml(YAML_Doc& doc)
+{
+  std::time_t now;
+  std::time(&now);
+  const struct tm* local_now = std::localtime(&now);
+
+  // %Y prints the full year; the remaining fields are two digits wide and
+  // zero-padded, separated by '-' with ", " between date and time.
+  char stamp[64];
+  std::strftime(stamp, sizeof(stamp), "%Y-%m-%d, %H-%M-%S", local_now);
+
+  std::string timestring(stamp);
+  doc.add("Run Date/Time",timestring);
+}
+
diff --git a/openmp-avx512/src/make_local_matrix.hpp b/openmp-avx512/src/make_local_matrix.hpp
new file mode 100644
index 0000000..76970d0
--- /dev/null
+++ b/openmp-avx512/src/make_local_matrix.hpp
@@ -0,0 +1,447 @@
+#ifndef _make_local_matrix_hpp_
+#define _make_local_matrix_hpp_
+#include <assert.h>
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <utils.hpp>
+
+#include <map>
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+namespace miniFE {
+
+template<typename MatrixType>
+void
+make_local_matrix(MatrixType& A)
+{ // Rewrites A's column indices into process-local numbering and fills A's MPI exchange lists.
+#ifdef HAVE_MPI
+  int numprocs = 1, myproc = 0;
+  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+  // Single process: column indices are left untouched; only num_cols and the flag are set.
+  if (numprocs < 2) {
+    A.num_cols = A.rows.size();
+    A.has_local_indices = true;
+    return;
+  }
+  // Fills A.external_index, external_local_index, neighbors, recv/send_length, elements_to_send, send_buffer.
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal;
+  typedef typename MatrixType::LocalOrdinalType LocalOrdinal;
+  typedef typename MatrixType::ScalarType Scalar;
+
+  std::map<GlobalOrdinal,GlobalOrdinal> externals; // global column index -> order of first appearance
+  LocalOrdinal num_external = 0;
+
+  //Extract Matrix pieces
+
+  size_t local_nrow = A.rows.size();
+  GlobalOrdinal start_row = local_nrow>0 ? A.rows[0] : -1; // -1 when this process owns no rows
+  GlobalOrdinal stop_row  = local_nrow>0 ? A.rows[local_nrow-1] : -1;
+
+  // We need to convert the index values for the rows on this processor
+  // to a local index space. We need to:
+  // - Determine if each index reaches to a local value or external value
+  // - If local, subtract start_row from index value to get local index
+  // - If external, find out if it is already accounted for.
+  //   - If so, then do nothing,
+  //   - otherwise
+  //     - add it to the list of external indices,
+  //     - find out which processor owns the value.
+  //     - Set up communication for sparse MV operation
+
+  ///////////////////////////////////////////
+  // Scan the indices and transform to local
+  ///////////////////////////////////////////
+
+  std::vector<GlobalOrdinal>& external_index = A.external_index;
+
+  for(size_t i=0; i<A.rows.size(); ++i) {
+    GlobalOrdinal* Acols = NULL;
+    Scalar* Acoefs = NULL;
+    size_t row_len = 0;
+    A.get_row_pointers(A.rows[i], row_len, Acols, Acoefs);
+
+    for(size_t j=0; j<row_len; ++j) {
+      GlobalOrdinal cur_ind = Acols[j];
+      if (start_row <= cur_ind && cur_ind <= stop_row) { // NOTE(review): assumes A.rows is a sorted, contiguous range - confirm
+        Acols[j] -= start_row;
+      }
+      else { // Must find out if we have already set up this point
+        if (externals.find(cur_ind) == externals.end()) {
+          externals[cur_ind] = num_external++;
+          external_index.push_back(cur_ind);
+        }
+        // Mark index as external by adding 1 and negating it
+        Acols[j] = -(Acols[j] + 1);
+      }
+    }
+  }
+
+  ////////////////////////////////////////////////////////////////////////
+  // Go through list of externals to find out which processors must be accessed.
+  ////////////////////////////////////////////////////////////////////////
+
+  std::vector<GlobalOrdinal> tmp_buffer(numprocs, 0); // Temp buffer space needed below
+
+  // Build list of global index offset
+
+  std::vector<GlobalOrdinal> global_index_offsets(numprocs, 0);
+
+  tmp_buffer[myproc] = start_row; // This is my start row
+
+  // This call sends the start_row of each ith processor to the ith
+  // entry of global_index_offsets on all processors.
+  // Thus, each processor knows the range of indices owned by all
+  // other processors.
+  // Note: There might be a better algorithm for doing this, but this
+  //       will work...
+
+  MPI_Datatype mpi_dtype = TypeTraits<GlobalOrdinal>::mpi_type();
+  MPI_Allreduce(&tmp_buffer[0], &global_index_offsets[0], numprocs, mpi_dtype,
+                MPI_SUM, MPI_COMM_WORLD);
+
+  // Go through list of externals and find the processor that owns each
+  std::vector<int> external_processor(num_external);
+
+  for(LocalOrdinal i=0; i<num_external; ++i) {
+    GlobalOrdinal cur_ind = external_index[i];
+    for(int j=numprocs-1; j>=0; --j) {
+      if (global_index_offsets[j] <= cur_ind && global_index_offsets[j] >= 0) { // offset -1 marks a process with no rows
+        external_processor[i] = j;
+        break;
+      }
+    }
+  }
+
+  /////////////////////////////////////////////////////////////////////////
+  // Sift through the external elements. For each newly encountered external
+  // point assign it the next index in the sequence. Then look for other
+  // external elements who are updated by the same node and assign them the next
+  // set of index numbers in the sequence (ie. elements updated by the same node
+  // have consecutive indices).
+  /////////////////////////////////////////////////////////////////////////
+
+  size_t count = local_nrow;
+  std::vector<GlobalOrdinal>& external_local_index = A.external_local_index;
+  external_local_index.assign(num_external, -1);
+
+  for(LocalOrdinal i=0; i<num_external; ++i) {
+    if (external_local_index[i] == -1) {
+      external_local_index[i] = count++;
+
+      for(LocalOrdinal j=i+1; j<num_external; ++j) {
+        if (external_processor[j] == external_processor[i])
+          external_local_index[j] = count++;
+      }
+    }
+  }
+
+  for(size_t i=0; i<local_nrow; ++i) {
+    GlobalOrdinal* Acols = NULL;
+    Scalar* Acoefs = NULL;
+    size_t row_len = 0;
+    A.get_row_pointers(A.rows[i], row_len, Acols, Acoefs);
+
+    for(size_t j=0; j<row_len; ++j) {
+      if (Acols[j] < 0) { // Change index values of externals
+        GlobalOrdinal cur_ind = -Acols[j] - 1;
+        Acols[j] = external_local_index[externals[cur_ind]];
+      }
+    }
+  }
+
+  std::vector<int> new_external_processor(num_external, 0);
+
+  for(int i=0; i<num_external; ++i) {
+    new_external_processor[external_local_index[i]-local_nrow] =
+      external_processor[i];
+  }
+
+  ////////////////////////////////////////////////////////////////////////
+  ///
+  // Count the number of neighbors from which we receive information to update
+  // our external elements. Additionally, fill the array tmp_neighbors in the
+  // following way:
+  //      tmp_neighbors[i] = 0   ==>  No external elements are updated by
+  //                              processor i.
+  //      tmp_neighbors[i] = x   ==>  (x-1)/numprocs elements are updated from
+  //                              processor i.
+  ///
+  ////////////////////////////////////////////////////////////////////////
+
+  std::vector<GlobalOrdinal> tmp_neighbors(numprocs, 0);
+
+  int num_recv_neighbors = 0;
+  int length             = 1;
+
+  for(LocalOrdinal i=0; i<num_external; ++i) {
+    if (tmp_neighbors[new_external_processor[i]] == 0) {
+      ++num_recv_neighbors;
+      tmp_neighbors[new_external_processor[i]] = 1;
+    }
+    tmp_neighbors[new_external_processor[i]] += numprocs;
+  }
+
+  /// sum over all processor all the tmp_neighbors arrays ///
+
+  MPI_Allreduce(&tmp_neighbors[0], &tmp_buffer[0], numprocs, mpi_dtype,
+                MPI_SUM, MPI_COMM_WORLD);
+
+  // decode the combined 'tmp_neighbors' (stored in tmp_buffer)
+  // array from all the processors
+
+  GlobalOrdinal num_send_neighbors = tmp_buffer[myproc] % numprocs;
+
+  /// decode 'tmp_buffer[myproc] to deduce total number of elements
+  //  we must send
+
+  GlobalOrdinal total_to_be_sent = (tmp_buffer[myproc] - num_send_neighbors) / numprocs;
+
+  ///////////////////////////////////////////////////////////////////////
+  ///
+  // Make a list of the neighbors that will send information to update our
+  // external elements (in the order that we will receive this information).
+  ///
+  ///////////////////////////////////////////////////////////////////////
+  // Guard: with no externals new_external_processor is empty, so element 0 must not be read.
+  std::vector<int> recv_list;
+  if (num_external > 0) recv_list.push_back(new_external_processor[0]);
+  for(LocalOrdinal i=1; i<num_external; ++i) {
+    if (new_external_processor[i-1] != new_external_processor[i]) {
+      recv_list.push_back(new_external_processor[i]);
+    }
+  }
+
+  //
+  // Send a one-element message to each of our recv neighbors; the receiver
+  // only uses the sender's rank (status.MPI_SOURCE), not the payload.
+
+  std::vector<int> send_list(num_send_neighbors, 0);
+
+  //
+  // first post receives, these are immediate receives
+  // Do not wait for result to come, will do that at the
+  // wait call below.
+  //
+  int MPI_MY_TAG = 99;
+
+  std::vector<MPI_Request> request(num_send_neighbors);
+  for(int i=0; i<num_send_neighbors; ++i) {
+    MPI_Irecv(&tmp_buffer[i], 1, mpi_dtype, MPI_ANY_SOURCE, MPI_MY_TAG,
+              MPI_COMM_WORLD, &request[i]);
+  }
+
+  // send messages
+
+  for(int i=0; i<num_recv_neighbors; ++i) {
+    MPI_Send(&tmp_buffer[i], 1, mpi_dtype, recv_list[i], MPI_MY_TAG,
+             MPI_COMM_WORLD);
+  }
+
+  ///
+  // Receive message from each send neighbor to construct 'send_list'.
+  ///
+
+  MPI_Status status;
+  for(int i=0; i<num_send_neighbors; ++i) {
+    if (MPI_Wait(&request[i], &status) != MPI_SUCCESS) {
+      std::cerr << "MPI_Wait error\n"<<std::endl;
+      MPI_Abort(MPI_COMM_WORLD, -1);
+    }
+    send_list[i] = status.MPI_SOURCE;
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  ///
+  // Compare the two lists. In most cases they should be the same.
+  // However, if they are not then add new entries to the recv list
+  // that are in the send list (but not already in the recv list).
+  ///
+  //////////////////////////////////////////////////////////////////////
+
+  for(int j=0; j<num_send_neighbors; ++j) {
+    int found = 0;
+    for(int i=0; i<num_recv_neighbors; ++i) {
+      if (recv_list[i] == send_list[j]) found = 1;
+    }
+
+    if (found == 0) {
+      recv_list.push_back(send_list[j]);
+      ++num_recv_neighbors;
+    }
+  }
+
+  num_send_neighbors = num_recv_neighbors;
+  request.resize(num_send_neighbors);
+
+  A.elements_to_send.assign(total_to_be_sent, 0);
+  A.send_buffer.assign(total_to_be_sent, 0);
+
+  //
+  // Create 'new_external' which explicitly put the external elements in the
+  // order given by 'external_local_index'
+  //
+
+  std::vector<GlobalOrdinal> new_external(num_external);
+  for(LocalOrdinal i=0; i<num_external; ++i) {
+    new_external[external_local_index[i] - local_nrow] = external_index[i];
+  }
+
+  /////////////////////////////////////////////////////////////////////////
+  //
+  // Send each processor the global index list of the external elements in the
+  // order that I will want to receive them when updating my external elements.
+  //
+  /////////////////////////////////////////////////////////////////////////
+
+  std::vector<int> lengths(num_recv_neighbors);
+
+  ++MPI_MY_TAG;
+
+  // First post receives
+
+  for(int i=0; i<num_recv_neighbors; ++i) {
+    int partner = recv_list[i];
+    MPI_Irecv(&lengths[i], 1, MPI_INT, partner, MPI_MY_TAG, MPI_COMM_WORLD,
+              &request[i]);
+  }
+
+  std::vector<int>& neighbors = A.neighbors;
+  std::vector<int>& recv_length = A.recv_length;
+  std::vector<int>& send_length = A.send_length;
+
+  neighbors.resize(num_recv_neighbors, 0);
+  A.request.resize(num_recv_neighbors);
+  recv_length.resize(num_recv_neighbors, 0);
+  send_length.resize(num_recv_neighbors, 0);
+
+  LocalOrdinal j = 0;
+  for(int i=0; i<num_recv_neighbors; ++i) {
+    int start = j;
+    int newlength = 0;
+
+    //go through list of external elements until updating
+    //processor changes
+
+    while((j < num_external) &&
+          (new_external_processor[j] == recv_list[i])) {
+      ++newlength;
+      ++j;
+      if (j == num_external) break;
+    }
+
+    recv_length[i] = newlength;
+    neighbors[i] = recv_list[i];
+
+    length = j - start;
+    MPI_Send(&length, 1, MPI_INT, recv_list[i], MPI_MY_TAG, MPI_COMM_WORLD);
+  }
+
+  // Complete the receives of the number of externals
+
+  for(int i=0; i<num_recv_neighbors; ++i) {
+    if (MPI_Wait(&request[i], &status) != MPI_SUCCESS) {
+      std::cerr << "MPI_Wait error\n"<<std::endl;
+      MPI_Abort(MPI_COMM_WORLD, -1);
+    }
+    send_length[i] = lengths[i];
+  }
+
+  ////////////////////////////////////////////////////////////////////////
+  // Build "elements_to_send" list. These are the x elements I own
+  // that need to be sent to other processors.
+  ////////////////////////////////////////////////////////////////////////
+
+  ++MPI_MY_TAG;
+
+  j = 0;
+  for(int i=0; i<num_recv_neighbors; ++i) {
+    MPI_Irecv(&A.elements_to_send[j], send_length[i], mpi_dtype, neighbors[i],
+              MPI_MY_TAG, MPI_COMM_WORLD, &request[i]);
+    j += send_length[i];
+  }
+
+  j = 0;
+  for(int i=0; i<num_recv_neighbors; ++i) {
+    LocalOrdinal start = j;
+    LocalOrdinal newlength = 0;
+
+    // Go through list of external elements
+    // until updating processor changes. This is redundant, but
+    // saves us from recording this information.
+
+    while((j < num_external) &&
+          (new_external_processor[j] == recv_list[i])) {
+      ++newlength;
+      ++j;
+      if (j == num_external) break;
+    }
+    MPI_Send(&new_external[start], j-start, mpi_dtype, recv_list[i],
+             MPI_MY_TAG, MPI_COMM_WORLD);
+  }
+
+  // receive from each neighbor the global index list of external elements
+
+  for(int i=0; i<num_recv_neighbors; ++i) {
+    if (MPI_Wait(&request[i], &status) != MPI_SUCCESS) {
+      std::cerr << "MPI_Wait error\n"<<std::endl;
+      MPI_Abort(MPI_COMM_WORLD, -1);
+    }
+  }
+
+  /// replace global indices by local indices ///
+
+  for(GlobalOrdinal i=0; i<total_to_be_sent; ++i) {
+    A.elements_to_send[i] -= start_row;
+    if (A.elements_to_send[i] >= A.rows.size()) {
+//std::cout<<"start_row: "<<start_row<<", A.elements_to_send[i]: "<<A.elements_to_send[i]<<", A.rows.size(): "<<A.rows.size()<<std::endl;
+    assert(A.elements_to_send[i] < A.rows.size());
+    }
+  }
+
+  //////////////////
+  // Finish up !!
+  //////////////////
+
+  A.num_cols = local_nrow + num_external;
+
+#else
+  A.num_cols = A.rows.size();
+#endif
+
+  A.has_local_indices = true;
+}
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/src/make_targets b/openmp-avx512/src/make_targets
new file mode 100644
index 0000000..a9a8ee6
--- /dev/null
+++ b/openmp-avx512/src/make_targets
@@ -0,0 +1,45 @@
+
+OBJS = \
+	BoxPartition.o \
+	YAML_Doc.o \
+	YAML_Element.o
+# OBJS, UTIL_OBJS and MAIN_OBJ are all linked into miniFE.x by the miniFE.x rule below.
+UTIL_OBJS = \
+	param_utils.o \
+	utils.o \
+	mytimer.o
+# MAIN_OBJ is listed first on the link line.
+MAIN_OBJ = \
+	main.o
+# Passed to every C++ compile as -DMINIFE_INFO / -DMINIFE_KERNELS by the %.o:%.cpp rule.
+MINIFE_INFO = 1
+MINIFE_KERNELS = 0
+# Also look in ../utils when searching for .cpp prerequisites.
+vpath %.cpp ../utils
+.PHONY: all verify clean realclean
+all:common_files generate_info miniFE.x
+# Link step; common_files and generate_info are prerequisites, so their recipes run as well.
+miniFE.x:common_files $(MAIN_OBJ) $(OBJS) $(UTIL_OBJS) $(OPTIONAL_OBJS) *.hpp generate_info
+	$(INSTRUMENT) $(CXX) $(CXXFLAGS) $(CPPFLAGS) $(MAIN_OBJ) $(OBJS) $(UTIL_OBJS) $(OPTIONAL_OBJS) -o miniFE.x $(LDFLAGS) $(OPTIONAL_LIBS) $(LIBS)
+# Runs the ./get_common_files helper script.
+common_files:
+	./get_common_files
+# Runs ./generate_info_header with the compiler name and flags.
+generate_info:
+	./generate_info_header "$(CXX)" "$(CXXFLAGS)" "miniFE" "MINIFE"
+# Builds everything, then runs the ./run_tests script.
+verify:all
+	./run_tests
+# Each object depends on every .hpp in this directory, so any header change recompiles all of them.
+%.o:%.cpp *.hpp
+	$(CXX) $(CXXFLAGS) $(CPPFLAGS) -DMINIFE_INFO=$(MINIFE_INFO) -DMINIFE_KERNELS=$(MINIFE_KERNELS) -c $<
+# Same pattern for C sources, depending on every .h in this directory.
+%.o:%.c *.h
+	$(CC) $(CFLAGS) $(CPPFLAGS) -c $<
+# Removes objects, archives, executables, link info and miniFE_info.hpp.
+clean:
+	rm -f *.o *.a *.x *.linkinfo miniFE_info.hpp
+# clean, plus profiler output, editor backups, YAML reports and matrix/vector dumps.
+realclean: clean
+	rm -f gmon.out gprof.* *~ *.yaml *.TVD.* *.mtx* *.vec* minife_debug*
+
diff --git a/openmp-avx512/src/perform_element_loop.hpp b/openmp-avx512/src/perform_element_loop.hpp
new file mode 100644
index 0000000..5b71b10
--- /dev/null
+++ b/openmp-avx512/src/perform_element_loop.hpp
@@ -0,0 +1,95 @@
+#ifndef _perform_element_loop_hpp_
+#define _perform_element_loop_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <BoxIterator.hpp>
+#include <simple_mesh_description.hpp>
+#include <SparseMatrix_functions.hpp>
+#include <box_utils.hpp>
+#include <Hex8_box_utils.hpp>
+#include <Hex8_ElemData.hpp>
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+//#include "advisor-annotate.h"
+
+namespace miniFE {
+
+template<typename GlobalOrdinal,
+         typename MatrixType, typename VectorType>
+void
+perform_element_loop(const simple_mesh_description<GlobalOrdinal>& mesh,
+                     const Box& local_elem_box,
+                     MatrixType& A, VectorType& b,
+                     Parameters& /*params*/)
+{
+  // Assemble the finite-element contribution of every element in
+  // local_elem_box into the linear system (A, b).
+  //   mesh           - supplies global_box; also passed to get_elem_nodes_and_coords
+  //   local_elem_box - the box of elements this call iterates over
+  //   A, b           - matrix and vector handed to sum_into_global_linear_system
+  // The Parameters argument is unused.
+  typedef typename MatrixType::ScalarType Scalar;
+
+  // Upper bounds of the global box, passed to get_id as the three dimensions.
+  int global_elems_x = mesh.global_box[0][1];
+  int global_elems_y = mesh.global_box[1][1];
+  int global_elems_z = mesh.global_box[2][1];
+
+  // Collect the IDs of all local elements up front so that the assembly loop
+  // below is a plain index loop that OpenMP can split across threads.
+  GlobalOrdinal num_elems = get_num_ids<GlobalOrdinal>(local_elem_box);
+  std::vector<GlobalOrdinal> elemIDs(num_elems);
+
+  BoxIterator iter = BoxIterator::begin(local_elem_box);
+  BoxIterator end  = BoxIterator::end(local_elem_box);
+  for(size_t i=0; iter != end; ++iter, ++i) {
+    elemIDs[i] = get_id<GlobalOrdinal>(global_elems_x, global_elems_y, global_elems_z,
+                                       iter.x, iter.y, iter.z);
+  }
+
+  // NOTE: the loop counter type is the MINIFE_GLOBAL_ORDINAL macro (defined elsewhere), not GlobalOrdinal.
+  const MINIFE_GLOBAL_ORDINAL elemID_size = elemIDs.size();
+
+  #pragma omp parallel for shared (elemIDs)
+  for(MINIFE_GLOBAL_ORDINAL i=0; i < elemID_size; ++i) {
+    ElemData<GlobalOrdinal,Scalar> elem_data; // declared inside the loop, so private to each iteration
+    compute_gradient_values(elem_data.grad_vals);
+    get_elem_nodes_and_coords(mesh, elemIDs[i], elem_data);
+    compute_element_matrix_and_vector(elem_data);
+    // A and b are shared by all threads. NOTE(review): presumably synchronized inside the callee - confirm.
+    sum_into_global_linear_system(elem_data, A, b);
+  }
+}
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/src/simple_mesh_description.hpp b/openmp-avx512/src/simple_mesh_description.hpp
new file mode 100644
index 0000000..b34d44a
--- /dev/null
+++ b/openmp-avx512/src/simple_mesh_description.hpp
@@ -0,0 +1,248 @@
+
+#ifndef _simple_mesh_description_hpp_
+#define _simple_mesh_description_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <utils.hpp>
+#include <set>
+#include <map>
+
+namespace miniFE {
+
+template<typename GlobalOrdinal>
+class simple_mesh_description { // Per-process mesh data: node-ID-to-row map and boundary-condition row sets.
+public:
+  simple_mesh_description(const Box& global_box_in, const Box& local_box_in)
+  { // Copies both boxes, builds map_ids_to_rows for the local node box, then collects BC rows face by face.
+   Box local_node_box;
+    for(int i=0; i<3; ++i) {
+      global_box[i][0] = global_box_in[i][0];
+      global_box[i][1] = global_box_in[i][1];
+      local_box[i][0] = local_box_in[i][0];
+      local_box[i][1] = local_box_in[i][1];
+      local_node_box[i][0] = local_box_in[i][0];
+      local_node_box[i][1] = local_box_in[i][1];
+      //num-owned-nodes == num-elems+1 in this dimension if the elem box is not empty
+      //and we are at the high end of the global range in that dimension:
+      if (local_box_in[i][1] > local_box_in[i][0] && local_box_in[i][1] == global_box[i][1]) local_node_box[i][1] += 1;
+    }
+    // Node counts per dimension: one more than the global upper bound in that dimension.
+    int max_node_x = global_box[0][1]+1;
+    int max_node_y = global_box[1][1]+1;
+    int max_node_z = global_box[2][1]+1;
+    create_map_id_to_row(max_node_x, max_node_y, max_node_z, local_node_box,
+                         map_ids_to_rows);
+
+    //As described in analytic_soln.hpp,
+    //we will impose a 0 boundary-condition on faces x=0, y=0, z=0, y=1, z=1
+    //we will impose a 1 boundary-condition on face x=1
+
+#ifdef MINIFE_DEBUG
+std::cout<<std::endl;
+#endif
+    const int X=0;
+    const int Y=1;
+    const int Z=2;
+    // Largest node coordinate in each dimension (the "=1" faces).
+    const int x1 = max_node_x - 1;
+    const int y1 = max_node_y - 1;
+    const int z1 = max_node_z - 1;
+    // Each face block below runs only when the local box touches that face of the global box.
+    //if we're on the x=0 face:
+    if (global_box[X][0] == local_box[X][0]) {
+      int miny = local_node_box[Y][0];
+      int minz = local_node_box[Z][0];
+      int maxy = local_node_box[Y][1];
+      int maxz = local_node_box[Z][1];
+      //expand y and z dimensions to include ghost layer
+      if (local_node_box[Y][0] > 0) --miny;
+      if (local_node_box[Z][0] > 0) --minz;
+      if (local_node_box[Y][1] < max_node_y) ++maxy;
+      if (local_node_box[Z][1] < max_node_z) ++maxz;
+
+      for(int iz=minz; iz<maxz; ++iz) {
+        for(int iy=miny; iy<maxy; ++iy) {
+          GlobalOrdinal nodeID = get_id<GlobalOrdinal>(max_node_x, max_node_y, max_node_z,
+             0, iy, iz);
+#ifdef MINIFE_DEBUG
+std::cout<<"x=0 BC, node "<<nodeID<<", (0,"<<iy<<","<<iz<<")"<<std::endl;
+#endif
+          bc_rows_0.insert(map_id_to_row(nodeID));
+        }
+      }
+    }
+
+    //if we're on the y=0 face:
+    if (global_box[Y][0] == local_box[Y][0]) {
+      int minx = local_node_box[X][0];
+      int minz = local_node_box[Z][0];
+      int maxx = local_node_box[X][1];
+      int maxz = local_node_box[Z][1];
+      //expand x and z dimensions to include ghost layer
+      if (local_node_box[X][0] > 0) --minx;
+      if (local_node_box[Z][0] > 0) --minz;
+      if (local_node_box[X][1] < max_node_x) ++maxx;
+      if (local_node_box[Z][1] < max_node_z) ++maxz;
+
+      for(int iz=minz; iz<maxz; ++iz) {
+        for(int ix=minx; ix<maxx; ++ix) {
+          GlobalOrdinal nodeID = get_id<GlobalOrdinal>(max_node_x, max_node_y, max_node_z,
+             ix, 0, iz);
+#ifdef MINIFE_DEBUG
+std::cout<<"y=0 BC, node "<<nodeID<<", ("<<ix<<",0,"<<iz<<")"<<std::endl;
+#endif
+          GlobalOrdinal row = map_id_to_row(nodeID);
+          if (row < 0) {
+            std::cout<<"on the y==0 face (ix="<<ix<<", iz="<<iz<<"), ERROR: found negative row ("<<row<<") for nodeID="<<nodeID<<std::endl;
+          }
+          bc_rows_0.insert(row); // inserted even when negative; the check above only reports it
+        }
+      }
+    }
+
+    //if we're on the z=0 face:
+    if (global_box[Z][0] == local_box[Z][0]) {
+      int minx = local_node_box[X][0];
+      int miny = local_node_box[Y][0];
+      int maxx = local_node_box[X][1];
+      int maxy = local_node_box[Y][1];
+      //expand x and y dimensions to include ghost layer
+      if (local_node_box[X][0] > 0) --minx;
+      if (local_node_box[Y][0] > 0) --miny;
+      if (local_node_box[X][1] < max_node_x) ++maxx;
+      if (local_node_box[Y][1] < max_node_y) ++maxy;
+
+      for(int iy=miny; iy<maxy; ++iy) {
+        for(int ix=minx; ix<maxx; ++ix) {
+          GlobalOrdinal nodeID = get_id<GlobalOrdinal>(max_node_x, max_node_y, max_node_z,
+             ix, iy, 0);
+#ifdef MINIFE_DEBUG
+std::cout<<"z=0 BC, node "<<nodeID<<", ("<<ix<<","<<iy<<",0)"<<std::endl;
+#endif
+          bc_rows_0.insert(map_id_to_row(nodeID));
+        }
+      }
+    }
+
+    //if we're on the x=1 face:
+    if (global_box[X][1] == local_box[X][1]) {
+      int minz = local_node_box[Z][0];
+      int miny = local_node_box[Y][0];
+      int maxz = local_node_box[Z][1];
+      int maxy = local_node_box[Y][1];
+      //expand z and y dimensions to include ghost layer
+      if (local_node_box[Z][0] > 0) --minz;
+      if (local_node_box[Y][0] > 0) --miny;
+      if (local_node_box[Z][1] < max_node_z) ++maxz;
+      if (local_node_box[Y][1] < max_node_y) ++maxy;
+
+      for(int iy=miny; iy<maxy; ++iy) {
+        for(int iz=minz; iz<maxz; ++iz) {
+          GlobalOrdinal nodeID = get_id<GlobalOrdinal>(max_node_x, max_node_y, max_node_z,
+             x1, iy, iz);
+          GlobalOrdinal row = map_id_to_row(nodeID);
+#ifdef MINIFE_DEBUG
+std::cout<<"x=1 BC, node "<<nodeID<<", row "<<row<<", ("<<x1<<","<<iy<<","<<iz<<")"<<std::endl;
+#endif
+          bc_rows_1.insert(row); // x=1 is the only face that feeds bc_rows_1
+        }
+      }
+    }
+
+    //if we're on the y=1 face:
+    if (global_box[Y][1] == local_box[Y][1]) {
+      int minz = local_node_box[Z][0];
+      int minx = local_node_box[X][0];
+      int maxz = local_node_box[Z][1];
+      int maxx = local_node_box[X][1];
+      //expand z and x dimensions to include ghost layer
+      if (local_node_box[Z][0] > 0) --minz;
+      if (local_node_box[X][0] > 0) --minx;
+      if (local_node_box[Z][1] < max_node_z) ++maxz;
+      if (local_node_box[X][1] < max_node_x) ++maxx;
+
+      for(int ix=minx; ix<maxx; ++ix) {
+        for(int iz=minz; iz<maxz; ++iz) {
+          GlobalOrdinal nodeID = get_id<GlobalOrdinal>(max_node_x, max_node_y, max_node_z,
+             ix, y1, iz);
+#ifdef MINIFE_DEBUG
+std::cout<<"y=1 BC, node "<<nodeID<<", ("<<ix<<","<<y1<<","<<iz<<")"<<std::endl;
+#endif
+          bc_rows_0.insert(map_id_to_row(nodeID));
+        }
+      }
+    }
+
+    //if we're on the z=1 face:
+    if (global_box[Z][1] == local_box[Z][1]) {
+      int miny = local_node_box[Y][0];
+      int minx = local_node_box[X][0];
+      int maxy = local_node_box[Y][1];
+      int maxx = local_node_box[X][1];
+      //expand x and y dimensions to include ghost layer
+      if (local_node_box[Y][0] > 0) --miny;
+      if (local_node_box[X][0] > 0) --minx;
+      if (local_node_box[Y][1] < max_node_y) ++maxy;
+      if (local_node_box[X][1] < max_node_x) ++maxx;
+
+      for(int ix=minx; ix<maxx; ++ix) {
+        for(int iy=miny; iy<maxy; ++iy) {
+          GlobalOrdinal nodeID = get_id<GlobalOrdinal>(max_node_x, max_node_y, max_node_z,
+             ix, iy, z1);
+#ifdef MINIFE_DEBUG
+std::cout<<"z=1 BC, node "<<nodeID<<", ("<<ix<<","<<iy<<","<<z1<<")"<<std::endl;
+#endif
+          bc_rows_0.insert(map_id_to_row(nodeID));
+        }
+      }
+    }
+
+  }
+  // Returns find_row_for_id(id, map_ids_to_rows); find_row_for_id is defined outside this class.
+  GlobalOrdinal map_id_to_row(const GlobalOrdinal& id) const
+  {
+    return find_row_for_id(id, map_ids_to_rows);
+  }
+  // Returns the row stored under the largest ID key in map_ids_to_rows, or 0 if the map is empty.
+  GlobalOrdinal max_row_in_map() const {
+    if (map_ids_to_rows.empty()) return 0;
+    typename std::map<GlobalOrdinal,GlobalOrdinal>::const_iterator mend = map_ids_to_rows.end();
+    --mend;
+    return mend->second;
+  }
+  std::set<GlobalOrdinal> bc_rows_0; // rows collected on faces x=0, y=0, z=0, y=1, z=1
+  std::set<GlobalOrdinal> bc_rows_1; // rows collected on face x=1
+  std::map<GlobalOrdinal,GlobalOrdinal> map_ids_to_rows; // passed to create_map_id_to_row in the constructor; read by map_id_to_row
+  Box global_box; // copy of global_box_in
+  Box local_box; // copy of local_box_in
+};//class simple_mesh_description
+
+}//namespace miniFE
+
+#endif
diff --git a/openmp-avx512/src/time_kernels.hpp b/openmp-avx512/src/time_kernels.hpp
new file mode 100644
index 0000000..e14ff09
--- /dev/null
+++ b/openmp-avx512/src/time_kernels.hpp
@@ -0,0 +1,139 @@
+#ifndef _time_kernels_hpp_
+#define _time_kernels_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <cmath>
+
+#include <Vector_functions.hpp>
+#include <mytimer.hpp>
+
+#ifdef MINIFE_HAVE_CUDA
+#include <cuda.h>
+#endif
+
+namespace miniFE {
+
+template<typename OperatorType,
+         typename VectorType,
+         typename Matvec>
+void
+time_kernels(OperatorType& A, // matrix; must already have has_local_indices set
+             const VectorType& b, // only its compute_node and coefs are read here
+             VectorType& x, // read by waxpby and dot, passed as the last argument of matvec
+             Matvec matvec, // callable invoked as matvec(A, p, x)
+             typename OperatorType::LocalOrdinalType max_iter, // repetitions of each kernel
+             typename OperatorType::ScalarType& xdotp, // out: sum of the max_iter dot(x, p) results
+             timer_type* my_kern_times) // out: entries WAXPY, DOT, MATVEC and TOTAL are written
+{ // Times max_iter repetitions each of waxpby, matvec and dot and reports the three timings.
+  typedef typename OperatorType::ScalarType ScalarType;
+  typedef typename OperatorType::LocalOrdinalType OrdinalType;
+  typedef typename TypeTraits<ScalarType>::magnitude_type magnitude_type;
+  // t0 is not referenced directly below. NOTE(review): presumably used by the TICK()/TOCK() macros - confirm.
+  timer_type t0 = 0, tWAXPY = 0, tDOT = 0, tMATVEC = 0;
+
+  int myproc = 0;
+#ifdef HAVE_MPI
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+#endif
+  // Bail out, leaving xdotp and my_kern_times untouched, if the matrix was not localized.
+  if (!A.has_local_indices) {
+    std::cerr << "miniFE::time_kernels ERROR, A.has_local_indices is false, needs to be true. This probably means "
+       << "miniFE::make_local_matrix(A) was not called prior to calling miniFE::time_kernels."
+       << std::endl;
+    return;
+  }
+
+  OrdinalType nrows = A.rows.size();
+  OrdinalType ncols = A.num_cols;
+
+  VectorType p(0, ncols, b.compute_node); // NOTE(review): presumably start index 0, length ncols - confirm against the vector type
+
+  ScalarType one = 1.0;
+  ScalarType zero = 0.0;
+
+  typedef typename VectorType::ComputeNodeType ComputeNodeType;
+  ComputeNodeType& compute_node = x.compute_node;
+
+  //The following lines that create and initialize buffers are no-ops in many
+  //cases, but perform actual allocations and copies if a off-cpu device such as
+  //a GPU is being used by compute_node.
+
+  //Do any required allocations for buffers that will be needed during CG:
+  ScalarType* d_x = compute_node.get_buffer(&x.coefs[0], x.coefs.size());
+  ScalarType* d_p = compute_node.get_buffer(&p.coefs[0], p.coefs.size());
+  ScalarType* d_b = compute_node.get_buffer(&b.coefs[0], b.coefs.size());
+  OrdinalType* d_Arowoff = compute_node.get_buffer(&A.row_offsets[0], A.row_offsets.size());
+  OrdinalType* d_Acols   = compute_node.get_buffer(&A.packed_cols[0], A.packed_cols.size());
+  ScalarType* d_Acoefs  = compute_node.get_buffer(&A.packed_coefs[0], A.packed_coefs.size());
+
+  //Copy data to buffers that need to be initialized from input data:
+  compute_node.copy_to_buffer(&x.coefs[0], x.coefs.size(), d_x);
+  compute_node.copy_to_buffer(&b.coefs[0], b.coefs.size(), d_b);
+  compute_node.copy_to_buffer(&A.row_offsets[0], A.row_offsets.size(), d_Arowoff);
+  compute_node.copy_to_buffer(&A.packed_cols[0], A.packed_cols.size(), d_Acols);
+  compute_node.copy_to_buffer(&A.packed_coefs[0], A.packed_coefs.size(), d_Acoefs);
+  // Kernel 1: waxpby(one, x, zero, x, p), repeated max_iter times.
+  TICK();
+  for(OrdinalType i=0; i<max_iter; ++i) {
+    waxpby(one, x, zero, x, p);
+  }
+#ifdef MINIFE_HAVE_CUDA
+  cudaThreadSynchronize();
+#endif
+  TOCK(tWAXPY);
+  // Kernel 2: matvec(A, p, x), repeated max_iter times.
+  TICK();
+  for(OrdinalType i=0; i<max_iter; ++i) {
+    matvec(A, p, x);
+  }
+#ifdef MINIFE_HAVE_CUDA
+  cudaThreadSynchronize();
+#endif
+  TOCK(tMATVEC);
+  // Kernel 3: dot(x, p), accumulated into xdotp over max_iter repetitions.
+  TICK();
+  xdotp = 0;
+  for(OrdinalType i=0; i<max_iter; ++i) {
+    xdotp += dot(x, p);
+  }
+#ifdef MINIFE_HAVE_CUDA
+  cudaThreadSynchronize();
+#endif
+  TOCK(tDOT);
+  // TOTAL is always written as 0 here; no combined time is computed by this function.
+  my_kern_times[WAXPY] = tWAXPY;
+  my_kern_times[DOT] = tDOT;
+  my_kern_times[MATVEC] = tMATVEC;
+  my_kern_times[TOTAL] = 0;
+}
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/utils/Box.hpp b/openmp-avx512/utils/Box.hpp
new file mode 100644
index 0000000..a60a34d
--- /dev/null
+++ b/openmp-avx512/utils/Box.hpp
@@ -0,0 +1,55 @@
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#ifndef _Box_hpp_
+#define _Box_hpp_
+
+/**
+  * a 'Box' is 3 pairs of ints, where each pair specifies a lower
+  * and upper bound for one of the 3 spatial dimensions.
+  *
+  * This struct stores the 3 pairs as a simple array of 6 ints,
+  * but defines the bracket operator so that it can be referenced
+  * using 2-dimensional array notation like this:
+  * int xmin = box[0][0]; int xmax = box[0][1];
+  * int ymin = box[1][0]; int ymax = box[1][1];
+  * int zmin = box[2][0]; int zmax = box[2][1];
+ */
+struct Box {
+  int ranges[6];
+#ifdef __CUDACC__
+__host__ __device__ __inline__
+#endif
+  int* operator[](int xyz) { return &ranges[xyz*2]; }
+#ifdef __CUDACC__
+__host__ __device__ __inline__
+#endif
+  const int* operator[](int xyz) const { return &ranges[xyz*2]; }
+};
+
+#endif
+
diff --git a/openmp-avx512/utils/BoxIterator.hpp b/openmp-avx512/utils/BoxIterator.hpp
new file mode 100644
index 0000000..5856e9f
--- /dev/null
+++ b/openmp-avx512/utils/BoxIterator.hpp
@@ -0,0 +1,142 @@
+#ifndef _BoxTraverser_hpp_
+#define _BoxTraverser_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+namespace miniFE {
+
+/** Class for traversing a 3-dimensional 'box' of indices.
+
+  //One way to traverse a 'box[3][2]' is to use a triply-nested for-loop:
+  for(int z=box[2][0]; z<box[2][1]; ++z) {
+    for(int y=box[1][0]; y<box[1][1]; ++y) {
+      for(int x=box[0][0]; x<box[0][1]; ++x) {
+        ...
+      }
+    }
+  }
+
+  //Another way is to use this BoxIterator class, like so:
+  //BoxIterator iter = BoxIterator::begin(box);
+  //BoxIterator end = BoxIterator::end(box);
+  for(; iter != end; ++iter) {
+    int x = iter.x;
+    int y = iter.y;
+    int z = iter.z;
+    ...
+  }
+*/
+class BoxIterator {
+public:
+  ~BoxIterator(){}
+
+  /** Iterator at the first index of 'box'; equals end(box) when 'box' is empty. */
+  static BoxIterator begin(const Box& box)
+  {
+    return BoxIterator(box);
+  }
+
+  /** Past-the-end iterator: x, y and z hold the upper bounds of 'box'. */
+  static BoxIterator end(const Box& box)
+  {
+    return BoxIterator(box, true/*at_end==true*/);
+  }
+
+  BoxIterator& operator=(const BoxIterator& src)
+  {
+    box_ = src.box_;
+    x = src.x;
+    y = src.y;
+    z = src.z;
+    return *this;
+  }
+
+  /** Advance with x fastest, then y, then z; parks at end() once z is exhausted. */
+  BoxIterator& operator++()
+  {
+    ++x;
+    if (x >= box_[0][1]) {
+      x = box_[0][0];
+      ++y;
+      if (y >= box_[1][1]) {
+        y = box_[1][0];
+        ++z;
+        if (z >= box_[2][1]) {
+          z = box_[2][1];
+          y = box_[1][1];
+          x = box_[0][1];
+        }
+      }
+    }
+    return *this;
+  }
+
+  BoxIterator operator++(int)
+  {
+    BoxIterator temp = *this;
+    ++(*this);
+    return temp;
+  }
+
+  /** Compares position only; the boxes being traversed are not compared. */
+  bool operator==(const BoxIterator& rhs) const
+  {
+    return x == rhs.x && y == rhs.y && z == rhs.z;
+  }
+
+  bool operator!=(const BoxIterator& rhs) const
+  {
+    return !(this->operator==(rhs));
+  }
+
+  int x;
+  int y;
+  int z;
+
+private:
+  BoxIterator(const Box& box, bool at_end = false)
+   : x(box[0][0]),
+     y(box[1][0]),
+     z(box[2][0]),
+     box_(box)
+  {
+    // If any axis is empty the box has nothing to visit: start at end().
+    if (at_end || box[0][1] <= box[0][0] || box[1][1] <= box[1][0] || box[2][1] <= box[2][0]) {
+      x = box[0][1];
+      y = box[1][1];
+      z = box[2][1];
+    }
+  }
+
+  Box box_;
+};//class BoxIterator
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/utils/BoxPartition.cpp b/openmp-avx512/utils/BoxPartition.cpp
new file mode 100644
index 0000000..cb167fb
--- /dev/null
+++ b/openmp-avx512/utils/BoxPartition.cpp
@@ -0,0 +1,503 @@
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <vector>
+#include <Box.hpp>
+#include <BoxPartition.hpp>
+
+/*--------------------------------------------------------------------*/
+
+static int box_map_local_entry( const Box& box ,
+                                const int ghost ,
+                                int local_x ,
+                                int local_y ,
+                                int local_z )
+{
+  const int nx = 2 * ghost + box[0][1] - box[0][0] ;
+  const int ny = 2 * ghost + box[1][1] - box[1][0] ;
+  const int nz = 2 * ghost + box[2][1] - box[2][0] ;
+  int result = -1 ;
+
+  local_x += ghost ;
+  local_y += ghost ;
+  local_z += ghost ;
+
+  if ( 0 <= local_x && local_x < nx &&
+       0 <= local_y && local_y < ny &&
+       0 <= local_z && local_z < nz ) {
+
+    result = local_z * ny * nx + local_y * nx + local_x ;
+  }
+  return result ;
+}
+
+int box_map_local( const Box& box_local,
+                   const int ghost ,
+                   const int box_local_map[] ,
+                   const int local_x ,
+                   const int local_y ,
+                   const int local_z )
+{
+  int result = box_map_local_entry(box_local,ghost,local_x,local_y,local_z);
+
+  if ( 0 <= result ) {
+    result = box_local_map[ result ];
+  }
+
+  return result ;
+}
+
+/*--------------------------------------------------------------------*/
+/* Recursively split a box into (up-ip) sub-boxes */
+
+void box_partition( int ip , int up , int axis ,
+                    const Box& box,
+                    Box* p_box )
+{
+  const int np = up - ip ;
+  if ( 1 == np ) {
+    p_box[ip][0][0] = box[0][0] ; p_box[ip][0][1] = box[0][1] ;
+    p_box[ip][1][0] = box[1][0] ; p_box[ip][1][1] = box[1][1] ;
+    p_box[ip][2][0] = box[2][0] ; p_box[ip][2][1] = box[2][1] ;
+  }
+  else {
+    const int n = box[ axis ][1] - box[ axis ][0] ;
+    const int np_low = np / 2 ;  /* Rounded down */
+    const int np_upp = np - np_low ;
+
+    const int n_upp = (int) (((double) n) * ( ((double)np_upp) / ((double)np)));
+    const int n_low = n - n_upp ;
+    const int next_axis = ( axis + 2 ) % 3 ;
+
+    if ( np_low ) { /* P = [ip,ip+np_low) */
+      Box dbox ;
+      dbox[0][0] = box[0][0] ; dbox[0][1] = box[0][1] ;
+      dbox[1][0] = box[1][0] ; dbox[1][1] = box[1][1] ;
+      dbox[2][0] = box[2][0] ; dbox[2][1] = box[2][1] ;
+
+      dbox[ axis ][1] = dbox[ axis ][0] + n_low ;
+
+      box_partition( ip, ip + np_low, next_axis, dbox, p_box );
+    }
+
+    if ( np_upp ) { /* P = [ip+np_low,ip+np_low+np_upp) */
+      Box dbox;
+      dbox[0][0] = box[0][0] ; dbox[0][1] = box[0][1] ;
+      dbox[1][0] = box[1][0] ; dbox[1][1] = box[1][1] ;
+      dbox[2][0] = box[2][0] ; dbox[2][1] = box[2][1] ;
+
+      ip += np_low ;
+      dbox[ axis ][0] += n_low ;
+      dbox[ axis ][1]  = dbox[ axis ][0] + n_upp ;
+
+      box_partition( ip, ip + np_upp, next_axis, dbox, p_box );
+    }
+  }
+}
+
+/*--------------------------------------------------------------------*/
+
+static int box_disjoint( const Box& a , const Box& b)
+{
+  return a[0][1] <= b[0][0] || b[0][1] <= a[0][0] ||
+         a[1][1] <= b[1][0] || b[1][1] <= a[1][0] ||
+         a[2][1] <= b[2][0] || b[2][1] <= a[2][0] ;
+}
+
+/* Grow *a to a power-of-two capacity (>= 32) of at least newLen ints; abort on failure. */
+static void resize_int( int ** a , int * allocLen , int newLen )
+{
+  int k = 32;
+  while ( k < newLen ) { k <<= 1 ; }
+  if ( NULL != *a && k <= *allocLen ) { return ; } /* already large enough */
+  int * const grown = (int*)realloc( *a , sizeof(int) * k ); /* realloc(NULL,n) acts as malloc(n) */
+  if ( NULL == grown ) { abort(); } else { *a = grown ; *allocLen = k ; }
+}
+
+static void box_partition_maps( 
+  const int np ,
+  const int my_p ,
+  const Box* pbox,
+  const int ghost ,
+  int ** map_local_id ,
+  int ** map_recv_pc ,
+  int ** map_send_pc ,
+  int ** map_send_id )
+{
+  const Box& my_box = pbox[my_p] ;
+
+  const int my_ix = my_box[0][0] ;
+  const int my_iy = my_box[1][0] ;
+  const int my_iz = my_box[2][0] ;
+  const int my_nx = my_box[0][1] - my_box[0][0] ;
+  const int my_ny = my_box[1][1] - my_box[1][0] ;
+  const int my_nz = my_box[2][1] - my_box[2][0] ;
+
+  const int my_use_nx = 2 * ghost + my_nx ;
+  const int my_use_ny = 2 * ghost + my_ny ;
+  const int my_use_nz = 2 * ghost + my_nz ;
+
+  const int id_length = my_use_nx * my_use_ny * my_use_nz ;
+
+  int * local_id  = (int *) malloc( id_length * sizeof(int) );
+  int * recv_pc   = (int *) malloc( ( np + 1 ) * sizeof(int) );
+  int * send_pc   = (int *) malloc( ( np + 1 ) * sizeof(int) );
+
+  int * send_id  = NULL ;
+  int   send_id_size = 0 ;
+
+  int iLocal , iSend ;
+  int i ;
+
+  Box my_use_box;
+
+  my_use_box[0][0] = my_box[0][0] - ghost ;
+  my_use_box[0][1] = my_box[0][1] + ghost ;
+  my_use_box[1][0] = my_box[1][0] - ghost ;
+  my_use_box[1][1] = my_box[1][1] + ghost ;
+  my_use_box[2][0] = my_box[2][0] - ghost ;
+  my_use_box[2][1] = my_box[2][1] + ghost ;
+
+  for ( i = 0 ; i < id_length ; ++i ) { local_id[i] = -1 ; }
+
+  iSend = 0 ;
+  iLocal = 0 ;
+
+  /* The vector space is partitioned by processors */
+
+  for ( i = 0 ; i < np ; ++i ) {
+    const int ip = ( i + my_p ) % np ;
+    recv_pc[i] = iLocal ;
+    send_pc[i] = iSend ;
+
+    if ( ! box_disjoint( my_use_box , pbox[ip] ) ) {
+      const int p_ix = pbox[ip][0][0] ;
+      const int p_iy = pbox[ip][1][0] ;
+      const int p_iz = pbox[ip][2][0] ;
+      const int p_ex = pbox[ip][0][1] ;
+      const int p_ey = pbox[ip][1][1] ;
+      const int p_ez = pbox[ip][2][1] ;
+
+      int local_x , local_y , local_z ;
+
+      /* Run the span of global cells that my processor uses */
+
+      for ( local_z = -ghost ; local_z < my_nz + ghost ; ++local_z ) {
+      for ( local_y = -ghost ; local_y < my_ny + ghost ; ++local_y ) {
+      for ( local_x = -ghost ; local_x < my_nx + ghost ; ++local_x ) {
+
+        const int global_z = local_z + my_iz ;
+        const int global_y = local_y + my_iy ;
+        const int global_x = local_x + my_ix ;
+
+        const int entry = 
+          box_map_local_entry(my_box,ghost,local_x,local_y,local_z);
+
+        if ( entry < 0 ) { abort(); }
+
+        if ( p_iz <= global_z && global_z < p_ez &&
+             p_iy <= global_y && global_y < p_ey &&
+             p_ix <= global_x && global_x < p_ex ) {
+
+          /* This ordinal is owned by processor 'ip' */
+
+          local_id[ entry ] = iLocal++ ;
+
+#if defined(DEBUG_PRINT)
+if ( my_p != ip ) {
+  fprintf(stdout,"  (%d,%d,%d) : P%d recv at local %d from P%d\n",
+                  global_x,global_y,global_z,my_p,local_id[entry],ip);
+  fflush(stdout);
+}
+#endif
+        }
+
+        /* If in my ownership and used by the other processor */
+        if ( my_p != ip &&
+             /* In my ownership: */
+             ( 0 <= local_z && local_z < my_nz &&
+               0 <= local_y && local_y < my_ny &&
+               0 <= local_x && local_x < my_nx ) &&
+             /* In other processors usage: */
+             ( p_iz - ghost <= global_z && global_z < p_ez + ghost &&
+               p_iy - ghost <= global_y && global_y < p_ey + ghost &&
+               p_ix - ghost <= global_x && global_x < p_ex + ghost ) ) {
+
+          resize_int( & send_id , & send_id_size , (iSend + 1) );
+          send_id[ iSend ] = local_id[ entry ] ;
+          ++iSend ;
+
+#if defined(DEBUG_PRINT)
+{
+  fprintf(stdout,"  (%d,%d,%d) : P%d send at local %d to P%d\n",
+                  global_x,global_y,global_z,my_p,local_id[entry],ip);
+  fflush(stdout);
+}
+#endif
+        }
+      }
+    }
+    }
+    }
+  }
+  recv_pc[np] = iLocal ;
+  send_pc[np] = iSend ;
+
+  *map_local_id  = local_id ;
+  *map_recv_pc   = recv_pc ;
+  *map_send_pc   = send_pc ;
+  *map_send_id   = send_id ;
+}
+
+void box_partition_rcb( const int np , 
+                        const int my_p ,
+                        const Box& root_box,
+                        const int ghost ,
+                        Box** pbox,
+                        int ** map_local_id ,
+                        int ** map_recv_pc ,
+                        int ** map_send_pc ,
+                        int ** map_send_id )
+{
+  *pbox = new Box[ np ];
+
+  box_partition( 0 , np , 2 , root_box , *pbox );
+
+  box_partition_maps( np , my_p , *pbox , ghost ,
+                      map_local_id , map_recv_pc , 
+                      map_send_pc , map_send_id );
+}
+
+/*--------------------------------------------------------------------*/
+
+#ifdef UNIT_TEST
+
+static int box_contain( const Box& a , const Box& b )
+{
+  return a[0][0] <= b[0][0] && b[0][1] <= a[0][1] &&
+         a[1][0] <= b[1][0] && b[1][1] <= a[1][1] &&
+         a[2][0] <= b[2][0] && b[2][1] <= a[2][1] ;
+}
+
+static void box_print( FILE * fp , const Box& a )
+{
+  fprintf(fp,"{ [ %d , %d ) , [ %d , %d ) , [ %d , %d ) }",
+                a[0][0] , a[0][1] ,  
+                a[1][0] , a[1][1] ,  
+                a[2][0] , a[2][1] );
+}
+
+static void test_box( const Box& box , const int np )
+{
+  const int ncell_box = box[0][1] * box[1][1] * box[2][1] ;
+  int ncell_total = 0 ;
+  int ncell_min = ncell_box ;
+  int ncell_max = 0 ;
+  std::vector<Box> pbox(np);
+  int i , j ;
+
+  box_partition( 0 , np , 2 , box , &pbox[0] );
+
+  for ( i = 0 ; i < np ; ++i ) {
+    const int ncell = ( pbox[i][0][1] - pbox[i][0][0] ) *
+                      ( pbox[i][1][1] - pbox[i][1][0] ) *
+                      ( pbox[i][2][1] - pbox[i][2][0] );
+
+    if ( ! box_contain( box , pbox[i] ) ) {
+      fprintf(stdout,"  OUT OF BOUNDS pbox[%d/%d] = ",i,np);
+      box_print(stdout,pbox[i]);
+      fprintf(stdout,"\n");
+      abort();
+    }
+
+    for ( j = i + 1 ; j < np ; ++j ) {
+      if ( ! box_disjoint( pbox[i] , pbox[j] ) ) {
+        fprintf(stdout,"  NOT DISJOINT pbox[%d/%d] = ",i,np);
+        box_print(stdout, pbox[i]);
+        fprintf(stdout,"\n");
+        fprintf(stdout,"               pbox[%d/%d] = ",j,np);
+        box_print(stdout, pbox[j]);
+        fprintf(stdout,"\n");
+        abort();
+      }
+    }
+    ncell_total += ncell ;
+
+    if ( ncell_max < ncell ) { ncell_max = ncell ; }
+    if ( ncell < ncell_min ) { ncell_min = ncell ; }
+  }
+
+  if ( ncell_total != ncell_box ) {
+    fprintf(stdout,"  WRONG CELL COUNT NP = %d\n",np);
+    abort();
+  }
+  fprintf(stdout,"NP = %d, total = %d, avg = %d, min = %d, max = %d\n",
+          np,ncell_box,ncell_box/np,ncell_min,ncell_max);
+}
+
+/*--------------------------------------------------------------------*/
+
+static void test_maps( const Box& root_box , const int np )
+{
+  const int ghost = 1 ;
+  const int nx_global = root_box[0][1] - root_box[0][0] ;
+  const int ny_global = root_box[1][1] - root_box[1][0] ;
+  int ieq , i , j ;
+  std::vector<Box> pbox(np);
+  int **local_values ;
+  int **map_local_id ;
+  int **map_recv_pc ;
+  int **map_send_pc ;
+  int **map_send_id ;
+
+  box_partition( 0 , np , 2 , root_box , &pbox[0] );
+
+  local_values = (int **) malloc( sizeof(int*) * np );
+  map_local_id = (int **) malloc( sizeof(int*) * np );
+  map_recv_pc  = (int **) malloc( sizeof(int*) * np );
+  map_send_pc  = (int **) malloc( sizeof(int*) * np );
+  map_send_id  = (int **) malloc( sizeof(int*) * np );
+
+  /* Set each local value to the global equation number */
+
+  for ( ieq = i = 0 ; i < np ; ++i ) {
+    const Box& mybox = pbox[i] ;
+    const int nx = mybox[0][1] - mybox[0][0] ;
+    const int ny = mybox[1][1] - mybox[1][0] ;
+    const int nz = mybox[2][1] - mybox[2][0] ;
+    int ix , iy , iz ;
+
+    /* Generate the partition maps for this rank */
+    box_partition_maps( np , i , &pbox[0] , ghost ,
+                        & map_local_id[i] , & map_recv_pc[i] , 
+                        & map_send_pc[i] , & map_send_id[i] );
+
+    local_values[i] = (int *) malloc( sizeof(int) * map_recv_pc[i][np] );
+
+    for ( iz = -ghost ; iz < nz + ghost ; ++iz ) {
+    for ( iy = -ghost ; iy < ny + ghost ; ++iy ) {
+    for ( ix = -ghost ; ix < nx + ghost ; ++ix ) {
+      const int ieq = box_map_local(mybox,ghost,map_local_id[i],ix,iy,iz);
+
+      if ( 0 <= ieq ) {
+        const int ix_global = ix + mybox[0][0] ;
+        const int iy_global = iy + mybox[1][0] ;
+        const int iz_global = iz + mybox[2][0] ;
+
+        if ( root_box[0][0] <= ix_global && ix_global < root_box[0][1] &&
+             root_box[1][0] <= iy_global && iy_global < root_box[1][1] &&
+             root_box[2][0] <= iz_global && iz_global < root_box[2][1] ) {
+
+          local_values[i][ ieq ] = ix_global +
+                                   iy_global * nx_global +
+                                   iz_global * nx_global * ny_global ;
+        }
+        else {
+          local_values[i][ ieq ] = -1 ;
+        }
+      }
+    }
+    }
+    }
+  }
+
+  /* Pair-wise compare the local values */
+  /* i  == receiving processor rank */
+  /* ip == sending   processor rank */
+  /* j  == receiving processor data entry for message from 'ip' */
+  /* jp == sending   processor data entry for message to   'i' */
+
+  for ( i = 0 ; i < np ; ++i ) {
+    for ( j = 1 ; j < np ; ++j ) {
+      const int ip = ( i + j ) % np ;
+      const int jp = ( i + np - ip ) % np ;
+      const int nrecv = map_recv_pc[i] [j+1]  - map_recv_pc[i] [j] ;
+      const int nsend = map_send_pc[ip][jp+1] - map_send_pc[ip][jp] ;
+      int k ;
+      if ( nrecv != nsend ) {
+        fprintf(stderr,"P%d recv %d from P%d\n",i,nrecv,ip);
+        fprintf(stderr,"P%d send %d to   P%d\n",ip,nsend,i);
+        abort();
+      }
+      for ( k = 0 ; k < nrecv ; ++k ) {
+        const int irecv = map_recv_pc[i][j] + k ;
+        const int isend = map_send_pc[ip][jp] + k ;
+        const int val_irecv = local_values[i][irecv] ;
+        const int val_isend = local_values[ip][ map_send_id[ip][isend] ] ;
+        if ( val_irecv != val_isend ) {
+          fprintf(stderr,"P%d recv[%d] = %d , from P%d\n",i,k,val_irecv,ip);
+          fprintf(stderr,"P%d send[%d] = %d , to   P%d\n",ip,k,val_isend,i);
+          abort();
+        }
+      }
+    }
+  }
+
+  for ( i = 0 ; i < np ; ++i ) {
+    free( map_local_id[i] );
+    free( map_recv_pc[i] );
+    free( map_send_pc[i] );
+    free( map_send_id[i] );
+    free( local_values[i] );
+  }
+  free( map_send_id );
+  free( map_send_pc );
+  free( map_recv_pc );
+  free( map_local_id );
+  free( local_values );
+}
+
+/*--------------------------------------------------------------------*/
+
+int main( int argc , char * argv[] )
+{
+  int np_max = 256 ;
+  Box box = { 0 , 64 , 0 , 64 , 0 , 64 };
+  int np = 0 ;
+
+  switch( argc ) {
+  case 3:
+    sscanf(argv[1],"%d",&np);
+    sscanf(argv[2],"%dx%dx%d",& box[0][1] , & box[1][1] , & box[2][1] );
+    if ( 0 < np ) { test_box( box , np ); }
+    if ( 0 < np ) { test_maps( box , np ); }
+    break ;
+  default:
+    for ( np = 1 ; np <= np_max ; ++np ) {
+      test_box( box , np );
+      test_maps( box , np );
+    }
+    break ;
+  }
+  return 0 ;
+}
+
+#endif
+
+
diff --git a/openmp-avx512/utils/BoxPartition.hpp b/openmp-avx512/utils/BoxPartition.hpp
new file mode 100644
index 0000000..14e4076
--- /dev/null
+++ b/openmp-avx512/utils/BoxPartition.hpp
@@ -0,0 +1,103 @@
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#ifndef _BoxPartition_hpp_
+#define _BoxPartition_hpp_
+
+#include <Box.hpp>
+
+/** \brief Recursively split a box into (up-ip) sub-boxes
+ */
+void box_partition( int ip , int up , int axis ,
+                    const Box& box ,
+                    Box* p_box );
+
+/** \brief  Partition a { [ix,jx) X [iy,jy) X [iz,jz) } box.
+ *
+ *  Use recursive coordinate bisection to partition a box
+ *  into np disjoint sub-boxes.  Allocate and populate the
+ *  sub-boxes (via new[]; release with delete[]) and, via malloc,
+ *  the map from local (x,y,z) to a local ordinal and the maps
+ *  for the send-recv messages that update the ghost cells.
+ *
+ *  usage:
+ *
+ *  my_nx = pbox[my_p][0][1] - pbox[my_p][0][0] ;
+ *  my_ny = pbox[my_p][1][1] - pbox[my_p][1][0] ;
+ *  my_nz = pbox[my_p][2][1] - pbox[my_p][2][0] ;
+ *
+ *  for ( x = -ghost ; x < my_nx + ghost ; ++x ) {
+ *  for ( y = -ghost ; y < my_ny + ghost ; ++y ) {
+ *  for ( z = -ghost ; z < my_nz + ghost ; ++z ) {
+ *    const int x_global = x + pbox[my_p][0][0] ;
+ *    const int y_global = y + pbox[my_p][1][0] ;
+ *    const int z_global = z + pbox[my_p][2][0] ;
+ *
+ *    const int local_ordinal =
+ *      box_map_local( pbox[my_p], ghost, map_local_id, x, y, z );
+ *
+ *    if ( 0 <= local_ordinal ) {
+ *    }
+ *  } } }
+ *  
+ *  for ( i = 1 ; i < np ; ++i ) {
+ *    const int recv_processor = ( my_p + i ) % np ;
+ *    const int recv_ordinal_begin = map_recv_pc[i];
+ *    const int recv_ordinal_end   = map_recv_pc[i+1];
+ *  }
+ *
+ *  for ( i = 1 ; i < np ; ++i ) {
+ *    const int send_processor = ( my_p + i ) % np ;
+ *    const int send_map_begin = map_send_pc[i];
+ *    const int send_map_end   = map_send_pc[i+1];
+ *    for ( j = send_map_begin ; j < send_map_end ; ++j ) {
+ *      send_ordinal = map_send_id[j] ;
+ *    }
+ *  }
+ */
+void box_partition_rcb( 
+  const int np            /**< [in]  Number of partitions */ ,
+  const int my_p          /**< [in]  My partition rank    */ ,
+  const Box& root_box     /**< [in]  3D Box to partition  */ ,
+  const int ghost         /**< [in]  Ghost cell boundary  */ ,
+  Box** pbox              /**< [out] Partition's 3D boxes (array of np, allocated with new[]) */ ,
+  int ** map_local_id     /**< [out] Map local cells */ ,
+  int ** map_recv_pc      /**< [out] Receive spans per processor */ ,
+  int ** map_send_pc      /**< [out] Send prefix counts per processor */ ,
+  int ** map_send_id      /**< [out] Send message ordinals */ );
+
+/** \brief  Map a local (x,y,z) to a local ordinal.
+ */
+int box_map_local( const Box& box_local ,
+                   const int ghost ,
+                   const int map_local_id[] ,
+                   const int local_x ,
+                   const int local_y ,
+                   const int local_z );
+
+#endif
+
diff --git a/openmp-avx512/utils/Parameters.hpp b/openmp-avx512/utils/Parameters.hpp
new file mode 100644
index 0000000..44841cf
--- /dev/null
+++ b/openmp-avx512/utils/Parameters.hpp
@@ -0,0 +1,64 @@
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#ifndef _parameters_hpp_
+#define _parameters_hpp_
+
+#include <string>
+
+namespace miniFE {
+
+struct Parameters {
+  Parameters()
+   : nx(5), ny(nx), nz(nx), numthreads(1),
+     mv_overlap_comm_comp(0), use_locking(0),
+     load_imbalance(0), name(), elem_group_size(1),
+     use_elem_mat_fields(1), verify_solution(0),
+     device(0),num_devices(2),skip_device(9999),numa(1)
+  {}
+
+  int nx;
+  int ny;
+  int nz;
+  int numthreads;
+  int mv_overlap_comm_comp;
+  int use_locking;
+  float load_imbalance;
+  std::string name;
+  int elem_group_size;
+  int use_elem_mat_fields;
+  int verify_solution;
+  int device;
+  int num_devices;
+  int skip_device;
+  int numa;
+};//struct Parameters
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/utils/TypeTraits.hpp b/openmp-avx512/utils/TypeTraits.hpp
new file mode 100644
index 0000000..c7bcb44
--- /dev/null
+++ b/openmp-avx512/utils/TypeTraits.hpp
@@ -0,0 +1,136 @@
+#ifndef _TypeTraits_hpp_
+#define _TypeTraits_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <complex>
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+namespace miniFE {
+
+template<typename T> struct TypeTraits {};
+
+template<>
+struct TypeTraits<float> {
+  typedef float magnitude_type;
+
+  static const char* name() {return "float";}
+
+#ifdef HAVE_MPI
+  static MPI_Datatype mpi_type() {return MPI_FLOAT;}
+#endif
+};
+
+template<>
+struct TypeTraits<double> {
+  typedef double magnitude_type;
+
+  static const char* name() {return "double";}
+
+#ifdef HAVE_MPI
+  static MPI_Datatype mpi_type() {return MPI_DOUBLE;}
+#endif
+};
+
+template<>
+struct TypeTraits<int> {
+  typedef int magnitude_type;
+
+  static const char* name() {return "int";}
+
+#ifdef HAVE_MPI
+  static MPI_Datatype mpi_type() {return MPI_INT;}
+#endif
+};
+
+template<>
+struct TypeTraits<long int> {
+  typedef long int magnitude_type;
+
+  static const char* name() {return "long int";}
+
+#ifdef HAVE_MPI
+  static MPI_Datatype mpi_type() {return MPI_LONG;}
+#endif
+};
+
+#ifndef MINIFE_NO_LONG_LONG
+
+template<>
+struct TypeTraits<long long> {
+  typedef long long magnitude_type;
+
+  static const char* name() {return "long long";}
+
+#ifdef HAVE_MPI
+  static MPI_Datatype mpi_type() {return MPI_LONG_LONG;}
+#endif
+};
+
+#endif
+
+template<>
+struct TypeTraits<unsigned> {
+  typedef unsigned magnitude_type;
+
+  static const char* name() {return "unsigned";}
+
+#ifdef HAVE_MPI
+  static MPI_Datatype mpi_type() {return MPI_UNSIGNED;}
+#endif
+};
+
+template<>
+struct TypeTraits<std::complex<float> > {
+  typedef float magnitude_type;
+
+  static const char* name() {return "std::complex<float>";}
+
+#ifdef HAVE_MPI
+  static MPI_Datatype mpi_type() {return MPI_COMPLEX;}
+#endif
+};
+
+template<>
+struct TypeTraits<std::complex<double> > {
+  typedef double magnitude_type;
+
+  static const char* name() {return "std::complex<double>";}
+
+#ifdef HAVE_MPI
+  static MPI_Datatype mpi_type() {return MPI_DOUBLE_COMPLEX;}
+#endif
+};
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/utils/box_utils.hpp b/openmp-avx512/utils/box_utils.hpp
new file mode 100644
index 0000000..f5bdb40
--- /dev/null
+++ b/openmp-avx512/utils/box_utils.hpp
@@ -0,0 +1,320 @@
+#ifndef _box_utils_hpp_
+#define _box_utils_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <vector>
+#include <set>
+#include <map>
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+#include <TypeTraits.hpp>
+#include <Box.hpp>
+
+namespace miniFE {
+
+inline void copy_box(const Box& from_box, Box& to_box)
+{
+  //Copy the lower [0] and upper [1] bound of each of the 3 axes.
+  for(int axis=0; axis<3; ++axis)
+    for(int end=0; end<2; ++end)
+      to_box[axis][end] = from_box[axis][end];
+}
+
+template<typename GlobalOrdinal>
+#ifdef __CUDACC__
+__host__ __device__ __inline__
+#endif
+void get_int_coords(GlobalOrdinal ID, int nx, int ny, int nz, int& x, int& y, int& z)
+{ //Split a linear ID (x fastest, z slowest) into integer x,y,z; nz is not used.
+  const int plane = nx*ny;
+  x = ID%nx;
+  y = (ID%plane)/nx;
+  z = ID/plane;
+}
+
+template<typename GlobalOrdinal,typename Scalar>
+#ifdef __CUDACC__
+__host__ __device__ __inline__
+#endif
+void get_coords(GlobalOrdinal ID, int nx, int ny, int nz,
+                Scalar& x, Scalar& y, Scalar& z)
+{
+//Convert a linear ID into coordinates that lie on (or in) the unit cube.
+//ID is assumed to be 0-based, with x varying fastest and z slowest.
+//
+//Each integer index is divided by (extent-1) so that indices 0..extent-1
+//map onto [0,1]; when an extent is 1 (or less) the divisor is 1 instead,
+//which avoids a division by zero.
+  const int plane = nx*ny;
+
+  x = 1.0*(ID%nx)/(nx>1 ? nx-1 : 1);
+  y = 1.0*((ID%plane)/nx)/(ny>1 ? ny-1 : 1);
+  z = (1.0*(ID/plane))/(nz>1 ? nz-1 : 1);
+}
+
+template<typename GlobalOrdinal>
+GlobalOrdinal get_num_ids(const Box& box)
+{
+  //Number of IDs in the box = product of its three extents (upper - lower).
+  int extent[3];
+  for(int d=0; d<3; ++d) extent[d] = box[d][1] - box[d][0];
+  //x*y is formed in int, then widened before multiplying by the z extent.
+  GlobalOrdinal count = extent[0]*extent[1];
+  return count *= extent[2];
+}
+
+template<typename GlobalOrdinal>
+#ifdef __CUDACC__
+__host__ __device__ __inline__
+#endif
+GlobalOrdinal get_id(int nx, int ny, int nz,
+                     int x, int y, int z)
+{
+  //Coordinates outside [0,n) on any axis have no ID; signal that with -1.
+  const bool in_range = x>=0 && y>=0 && z>=0 && x<nx && y<ny && z<nz;
+  if (!in_range) return -1;
+
+  //ID = x + nx*y + (nx*ny)*z. nx*ny is formed in int and widened before it
+  //is multiplied by z; the x + nx*y part is formed in int as well.
+  GlobalOrdinal plane_offset = nx*ny;
+  plane_offset *= z;
+  return x + nx * y + plane_offset;
+}
+
+template<typename GlobalOrdinal>
+void get_ids(int nx, int ny, int nz,
+             const Box& box,
+             std::vector<GlobalOrdinal>& ids,
+             bool include_ghost_layer=false)
+{
+  //Fill 'ids' with the global ID of every point in 'box', x varying fastest,
+  //then y, then z. Any previous contents of 'ids' are discarded.
+  //With include_ghost_layer the box is first widened by one on every side
+  //whose lower bound is > 0 or whose upper bound is below nx/ny/nz.
+  ids.clear();
+
+  const int n[3] = {nx, ny, nz};
+  int lo[3], hi[3];
+  for(int d=0; d<3; ++d) {
+    lo[d] = box[d][0];
+    hi[d] = box[d][1];
+    if (include_ghost_layer && lo[d] > 0) --lo[d];
+    if (include_ghost_layer && hi[d] < n[d]) ++hi[d];
+  }
+
+  //number of points visited by the loops below, computed in int and used
+  //only to reserve storage up front
+  size_t ids_size = ((hi[2] - lo[2]) * (hi[1] - lo[1])) * (hi[0] - lo[0]);
+  ids.reserve(ids_size);
+
+  for(int z=lo[2]; z<hi[2]; ++z) {
+    for(int y=lo[1]; y<hi[1]; ++y) {
+      for(int x=lo[0]; x<hi[0]; ++x) {
+        ids.push_back(get_id<GlobalOrdinal>(nx, ny, nz, x, y, z));
+      }
+    }
+  }
+}
+
+template<typename GlobalOrdinal>
+void get_ghost_ids(int nx, int ny, int nz,
+             const Box& box,
+             std::vector<GlobalOrdinal>& ids)
+{
+  //Fill 'ids' with the IDs of the one-thick shell surrounding 'box': the
+  //box is widened by one on each side whose lower bound is > 0 or whose
+  //upper bound is below nx/ny/nz, and every point of the widened box that
+  //lies outside the original box is emitted (x fastest, then y, then z).
+  //Any previous contents of 'ids' are discarded.
+  ids.clear();
+
+  const int n[3] = {nx, ny, nz};
+  int own_lo[3], own_hi[3], lo[3], hi[3];
+  for(int d=0; d<3; ++d) {
+    own_lo[d] = lo[d] = box[d][0];
+    own_hi[d] = hi[d] = box[d][1];
+    if (lo[d] > 0) --lo[d];
+    if (hi[d] < n[d]) ++hi[d];
+  }
+
+  for(int z=lo[2]; z<hi[2]; ++z) {
+    const bool z_owned = (z >= own_lo[2]) && (z < own_hi[2]);
+    for(int y=lo[1]; y<hi[1]; ++y) {
+      const bool y_owned = (y >= own_lo[1]) && (y < own_hi[1]);
+      for(int x=lo[0]; x<hi[0]; ++x) {
+        const bool x_owned = (x >= own_lo[0]) && (x < own_hi[0]);
+        //a point is a ghost unless it is inside the box along all three axes
+        if (x_owned && y_owned && z_owned) continue;
+        ids.push_back(get_id<GlobalOrdinal>(nx, ny, nz, x, y, z));
+      }
+    }
+  }
+}
+
+ inline void print_box(int myproc, const char* name, const Box& box,
+                      const char* name2, const Box& box2)
+{
+  //Print both boxes on one std::cout line: "proc <rank> <name> (lo,hi) ... <name2> (lo,hi) ..."
+  std::cout << "proc " << myproc << " "<<name;
+  for(int d=0; d<3; ++d) {
+    std::cout <<" ("<<box[d][0]<<","<<box[d][1]<<") ";
+  }
+  std::cout <<name2;
+  for(int d=0; d<3; ++d) std::cout <<" ("<<box2[d][0]<<","<<box2[d][1]<<") ";
+  std::cout <<std::endl;
+}
+
+inline bool is_neighbor(const Box& box1, const Box& box2)
+{ //'inline' because this lives in a header: avoids multiple definitions across translation units.
+  //True when, along every axis, the two ranges touch, overlap, or are exactly one apart. Along x:
+  bool x_neighbor = (box1[0][1] == box2[0][0]) || (box1[0][0] == box2[0][1]) || // min matches max
+                    (box1[0][0] == box2[0][0]) || (box1[0][1] == box2[0][1]) || // mins or maxs match
+                    (box1[0][0] >  box2[0][0]  &&  box1[0][1] <  box2[0][1]) || // range contains other
+                    (box2[0][0] >  box1[0][0]  &&  box2[0][1] <  box1[0][1]) || // range contains other
+                    (box1[0][0] >  box2[0][0]  &&  box1[0][0] <  box2[0][1]) || // min contained in rng
+                    (box2[0][0] >  box1[0][0]  &&  box2[0][0] <  box1[0][1]);   // min contained in rng
+  if (!x_neighbor) {
+    x_neighbor = (box1[0][1] == box2[0][0]-1) || (box1[0][0] == box2[0][1]+1);
+  }
+  //same test along y (the fallback accepts ranges separated by a gap of exactly one):
+  bool y_neighbor = (box1[1][1] == box2[1][0]) || (box1[1][0] == box2[1][1]) || // min matches max
+                    (box1[1][0] == box2[1][0]) || (box1[1][1] == box2[1][1]) || // mins or maxs match
+                    (box1[1][0] >  box2[1][0]  &&  box1[1][1] <  box2[1][1]) || // range contains other
+                    (box2[1][0] >  box1[1][0]  &&  box2[1][1] <  box1[1][1]) || // range contains other
+                    (box1[1][0] >  box2[1][0]  &&  box1[1][0] <  box2[1][1]) || // min contained in rng
+                    (box2[1][0] >  box1[1][0]  &&  box2[1][0] <  box1[1][1]);   // min contained in rng
+  if (!y_neighbor) {
+    y_neighbor = (box1[1][1] == box2[1][0]-1) || (box1[1][0] == box2[1][1]+1);
+  }
+  //same test along z:
+  bool z_neighbor = (box1[2][1] == box2[2][0]) || (box1[2][0] == box2[2][1]) || // min matches max
+                    (box1[2][0] == box2[2][0]) || (box1[2][1] == box2[2][1]) || // mins or maxs match
+                    (box1[2][0] >  box2[2][0]  &&  box1[2][1] <  box2[2][1]) || // range contains other
+                    (box2[2][0] >  box1[2][0]  &&  box2[2][1] <  box1[2][1]) || // range contains other
+                    (box1[2][0] >  box2[2][0]  &&  box1[2][0] <  box2[2][1]) || // min contained in rng
+                    (box2[2][0] >  box1[2][0]  &&  box2[2][0] <  box1[2][1]);   // min contained in rng
+  if (!z_neighbor) {
+    z_neighbor = (box1[2][1] == box2[2][0]-1) || (box1[2][0] == box2[2][1]+1);
+  }
+  //the boxes are neighbors only if they qualify along all three axes
+  return x_neighbor && y_neighbor && z_neighbor;
+}
+
+template<typename GlobalOrdinal>
+void create_map_id_to_row(int global_nx, int global_ny, int global_nz,
+                     const Box& box,
+                     std::map<GlobalOrdinal,GlobalOrdinal>& id_to_row)
+{ //Adds (ID -> row) entries to id_to_row, one per run of consecutive IDs, for this box and its neighbors.
+  GlobalOrdinal num_my_ids = get_num_ids<GlobalOrdinal>(box);
+  //IDs of the local box in the order produced by get_ids, without a ghost layer:
+  typename std::vector<GlobalOrdinal> all_ids;
+  bool include_ghost_layer = false;
+  get_ids(global_nx, global_ny, global_nz, box, all_ids, include_ghost_layer);
+  //Defaults for a build without HAVE_MPI: one process whose first row is 0.
+  GlobalOrdinal my_first_row = 0;
+  typename std::vector<GlobalOrdinal> global_offsets;
+  std::vector<int> all_boxes;
+  int numprocs = 1, myproc = 0;
+#ifdef HAVE_MPI
+  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+  //Gather every rank's ID count, then turn the counts into an exclusive prefix sum:
+  GlobalOrdinal local_num_ids = num_my_ids;
+  global_offsets.resize(numprocs);
+  MPI_Datatype mpi_dtype = TypeTraits<GlobalOrdinal>::mpi_type();
+  MPI_Allgather(&local_num_ids, 1, mpi_dtype, &global_offsets[0], 1, mpi_dtype, MPI_COMM_WORLD);
+  GlobalOrdinal offset = 0;
+  for(int i=0; i<numprocs; ++i) {
+    GlobalOrdinal tmp = global_offsets[i];
+    global_offsets[i] = offset;
+    offset += tmp;
+  }
+  //global_offsets[p] is now the sum of the counts of ranks 0..p-1, i.e. rank p's first row.
+  my_first_row = global_offsets[myproc];
+  //Gather the 6 box bounds of every rank so other ranks' boxes can be examined locally:
+  all_boxes.resize(6*numprocs);
+  int* local_box_ranges = const_cast<int*>(&box.ranges[0]);
+  MPI_Allgather(local_box_ranges, 6, MPI_INT, &all_boxes[0], 6, MPI_INT, MPI_COMM_WORLD);
+#endif
+  //Local box: the first ID is mapped to my_first_row ...
+  if (all_ids.size() > 0) {
+    id_to_row.insert(std::make_pair(all_ids[0], my_first_row));
+  }
+  //... and each ID that does not directly follow its predecessor is mapped to my_first_row + its position.
+  for(size_t i=1; i<all_ids.size(); ++i) {
+    if (all_ids[i] != all_ids[i-1]+1) {
+      id_to_row.insert(std::make_pair(all_ids[i], my_first_row+i));
+    }
+  }
+  //Repeat for the box of every other rank that is_neighbor() accepts (no iterations when numprocs == 1).
+//  int num_neighbors = 0;
+  for(int i=0; i<numprocs; ++i) {
+    if (i == myproc) continue;
+    Box box_i;
+    for(int r=0; r<6; ++r) box_i.ranges[r] = all_boxes[i*6 + r];
+//    bool neighbor= is_neighbor(box, box_i);
+//if(myproc==2) {
+//  std::cout<<"i: "<<i<<" "<<neighbor<<" ";
+//  print_box(myproc, " ", box, " ", box_i);
+//}
+    if (!is_neighbor(box, box_i)) {
+//      if (myproc==50) {
+//        std::cout<<"box ("<<box[0][0]<<","<<box[0][1]<<" - "<<box[1][0]<<","<<box[1][1]<<" - "<<box[2][0]<<","<<box[2][1]<<")"<<std::endl<<" and ("<<box_i[0][0]<<","<<box_i[0][1]<<" - "<<box_i[1][0]<<","<<box_i[1][1]<<" - "<<box_i[2][0]<<","<<box_i[2][1]<<") not neighbors."<<std::endl;
+//      }
+      continue;
+    }
+//    ++num_neighbors;
+    //all_ids is reused here: get_ids clears it and refills it with rank i's IDs.
+    get_ids(global_nx, global_ny, global_nz, box_i, all_ids, include_ghost_layer);
+
+    GlobalOrdinal first_row = global_offsets[i];
+    if (all_ids.size() > 0) {
+      id_to_row.insert(std::make_pair(all_ids[0], first_row));
+    }
+    for(size_t j=1; j<all_ids.size(); ++j) {
+      if (all_ids[j] != all_ids[j-1]+1) {
+        id_to_row.insert(std::make_pair(all_ids[j], first_row+j));
+      }
+    }
+  }
+
+//std::cout<<"proc "<<myproc<<": num_neighbors: "<<num_neighbors<<", id_to_row.size(): "<<id_to_row.size()<<std::endl;
+//typename std::map<GlobalOrdinal,GlobalOrdinal>::iterator iter = id_to_row.begin(), end = id_to_row.end();
+//for(; iter!=end; ++iter) {
+//  std::cout<<"proc "<<myproc<<": "<<iter->first<<" :: "<<iter->second<<std::endl;
+//}
+}
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/utils/compute_matrix_stats.hpp b/openmp-avx512/utils/compute_matrix_stats.hpp
new file mode 100644
index 0000000..76d5f94
--- /dev/null
+++ b/openmp-avx512/utils/compute_matrix_stats.hpp
@@ -0,0 +1,116 @@
+#ifndef _compute_matrix_stats_hpp_
+#define _compute_matrix_stats_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <cstddef>
+#include <cmath>
+#include <cstdlib>
+#include <iostream>
+#include <sstream>
+#include <iomanip>
+
+#include <outstream.hpp>
+#include <utils.hpp>
+#include <YAML_Doc.hpp>
+
+namespace miniFE {
+
+template<typename MatrixType>
+size_t
+compute_matrix_stats(const MatrixType& A, int myproc, int numprocs, YAML_Doc& ydoc)
+{ //Collects row/nonzero statistics for A, records them in ydoc on rank 0, returns the global nonzero count.
+  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal;
+  typedef typename MatrixType::LocalOrdinalType LocalOrdinal;
+  typedef typename MatrixType::ScalarType Scalar;
+
+  GlobalOrdinal min_nrows = 0, max_nrows = 0, global_nrows = 0;
+  int min_proc = 0, max_proc = 0;
+  //min_proc/max_proc are overwritten by both calls below and are not read afterwards.
+  GlobalOrdinal local_nrows = A.rows.size();
+  //presumably fills the global total and the per-process min/max of local_nrows -- see get_global_min_max
+  get_global_min_max(local_nrows, global_nrows, min_nrows, min_proc,
+                     max_nrows, max_proc);
+
+  //Gather stats on global, min/max matrix num-nonzeros:
+  //(nonzero counts are carried as double here and rounded up to size_t further down)
+  double local_nnz = A.num_nonzeros();
+  double dglobal_nnz = 0, dmin_nnz = 0, dmax_nnz = 0;
+
+  get_global_min_max(local_nnz, dglobal_nnz, dmin_nnz, min_proc,
+                     dmax_nnz, max_proc);
+  //per-process averages: global totals divided by the number of processes
+  double avg_nrows = global_nrows;
+  avg_nrows /= numprocs;
+  double avg_nnz = dglobal_nnz;
+  avg_nnz /= numprocs;
+  //called on every rank, before the rank-0-only reporting block
+  double mem_overhead_MB = parallel_memory_overhead_MB(A);
+
+  size_t global_nnz = static_cast<size_t>(std::ceil(dglobal_nnz));
+  size_t min_nnz = static_cast<size_t>(std::ceil(dmin_nnz));
+  size_t max_nnz = static_cast<size_t>(std::ceil(dmax_nnz));
+  size_t global_num_rows = global_nrows;
+  //only rank 0 writes to the YAML document
+  if (myproc == 0) {
+    ydoc.add("Matrix attributes","");
+    ydoc.get("Matrix attributes")->add("Global Nrows",global_num_rows);
+    ydoc.get("Matrix attributes")->add("Global NNZ",global_nnz);
+
+    //compute how much memory the matrix occupies:
+    //num-bytes = sizeof(GlobalOrdinal)*global_nrows   for A.rows
+    //          + sizeof(LocalOrdinal)*global_nrows    for A.rows_offsets
+    //          + sizeof(GlobalOrdinal)*global_nnz     for A.packed_cols
+    //          + sizeof(Scalar)*global_nnz            for A.packed_coefs
+    //each term is scaled by 1/2^30 as it is accumulated, so memGB is in GiB
+    double invGB = 1.0/(1024*1024*1024);
+    double memGB = invGB*global_nrows*sizeof(GlobalOrdinal);
+    memGB += invGB*global_nrows*sizeof(LocalOrdinal);
+    memGB += invGB*global_nnz*sizeof(GlobalOrdinal);
+    memGB += invGB*global_nnz*sizeof(Scalar);
+    ydoc.get("Matrix attributes")->add("Global Memory (GB)",memGB);
+
+    ydoc.get("Matrix attributes")->add("Pll Memory Overhead (MB)",mem_overhead_MB);
+    //row counts are copied into size_t before being handed to add()
+    size_t min_num_rows = min_nrows;
+    size_t max_num_rows = max_nrows;
+    ydoc.get("Matrix attributes")->add("Rows per proc MIN",min_num_rows);
+    ydoc.get("Matrix attributes")->add("Rows per proc MAX",max_num_rows);
+    ydoc.get("Matrix attributes")->add("Rows per proc AVG",avg_nrows);
+    ydoc.get("Matrix attributes")->add("NNZ per proc MIN",min_nnz);
+    ydoc.get("Matrix attributes")->add("NNZ per proc MAX",max_nnz);
+    ydoc.get("Matrix attributes")->add("NNZ per proc AVG",avg_nnz);
+  }
+  //every rank returns the global nonzero count, rounded up to an integer
+  return global_nnz;
+}
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/utils/imbalance.hpp b/openmp-avx512/utils/imbalance.hpp
new file mode 100644
index 0000000..bd6c6c8
--- /dev/null
+++ b/openmp-avx512/utils/imbalance.hpp
@@ -0,0 +1,298 @@
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#ifndef _imbalance_hpp_
+#define _imbalance_hpp_
+
+#include <cmath>
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+#include <box_utils.hpp>
+#include <utils.hpp>
+#include <YAML_Doc.hpp>
+
+namespace miniFE {
+
+const int X = 0; // axis indices, used as the first Box subscript
+const int Y = 1;
+const int Z = 2;
+const int NONE = 3; // "no axis" marker: the grow/shrink deciders return it when no face qualifies
+// Indices of the two ends of an axis range, used as the second Box subscript.
+const int LOWER = 0;
+const int UPPER = 1;
+
+template<typename GlobalOrdinal>
+void
+compute_imbalance(const Box& global_box,
+                  const Box& local_box,
+                  float& largest_imbalance,
+                  float& std_dev,
+                  YAML_Doc& doc,
+                  bool record_in_doc)
+{
+  //Reports how unevenly rows are distributed: the larger of the min-side and
+  //max-side percentage differences from the average rows-per-process, plus a
+  //std-dev percentage. global_box is not read by this function.
+  int myproc = 0, numprocs = 1;
+#ifdef HAVE_MPI
+  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+#endif
+
+  GlobalOrdinal global_nrows = 0, min_nrows = 0, max_nrows = 0;
+  int min_proc = myproc, max_proc = myproc;
+  GlobalOrdinal local_nrows = get_num_ids<GlobalOrdinal>(local_box);
+  get_global_min_max(local_nrows, global_nrows, min_nrows, min_proc,
+                     max_nrows, max_proc);
+
+  float avg_nrows = static_cast<float>(global_nrows) / numprocs;
+
+  //keep whichever of the min-side and max-side differences is larger
+  float min_side = percentage_difference<float>(min_nrows, avg_nrows);
+  float max_side = percentage_difference<float>(max_nrows, avg_nrows);
+  largest_imbalance = (max_side > min_side) ? max_side : min_side;
+
+  std_dev = compute_std_dev_as_percentage<float>(local_nrows, avg_nrows);
+
+  if (record_in_doc && myproc == 0) {
+    doc.add("Rows-per-proc Load Imbalance","");
+    doc.get("Rows-per-proc Load Imbalance")->add("Largest (from avg, %)",largest_imbalance);
+    doc.get("Rows-per-proc Load Imbalance")->add("Std Dev (%)",std_dev);
+  }
+}
+
+inline std::pair<int,int> //'inline': defined in a header, so it must not create one definition per translation unit
+decide_how_to_grow(const Box& global_box, const Box& local_box)
+{ //Picks the face of local_box to push outward; returns (axis, end), or (NONE,UPPER) if none qualifies.
+  std::pair<int,int> result(NONE,UPPER);
+  //A face qualifies when it lies strictly inside global_box. Order tried: Z, Y, X; upper before lower.
+  if (local_box[Z][UPPER] < global_box[Z][UPPER]) {
+    result.first = Z;
+    result.second = UPPER;
+    return result;
+  }
+  if (local_box[Z][LOWER] > global_box[Z][LOWER]) {
+    result.first = Z;
+    result.second = LOWER;
+    return result;
+  }
+  if (local_box[Y][UPPER] < global_box[Y][UPPER]) {
+    result.first = Y;
+    result.second = UPPER;
+    return result;
+  }
+  if (local_box[Y][LOWER] > global_box[Y][LOWER]) {
+    result.first = Y;
+    result.second = LOWER;
+    return result;
+  }
+  if (local_box[X][UPPER] < global_box[X][UPPER]) {
+    result.first = X;
+    result.second = UPPER;
+    return result;
+  }
+  if (local_box[X][LOWER] > global_box[X][LOWER]) {
+    result.first = X;
+    result.second = LOWER;
+    return result;
+  }
+  return result;
+}
+
+inline std::pair<int,int> //'inline': defined in a header, so it must not create one definition per translation unit
+decide_how_to_shrink(const Box& global_box, const Box& local_box)
+{ //Picks the face of local_box to pull inward; returns (axis, end), or (NONE,UPPER) if none qualifies.
+  std::pair<int,int> result(NONE,UPPER);
+  //A face qualifies when it lies strictly inside global_box and the box is more than 2 wide on that axis.
+  if (local_box[Z][UPPER] < global_box[Z][UPPER] && local_box[Z][UPPER]-local_box[Z][LOWER] > 2) {
+    result.first = Z;
+    result.second = UPPER;
+    return result;
+  }
+  if (local_box[Z][LOWER] > global_box[Z][LOWER] && local_box[Z][UPPER]-local_box[Z][LOWER] > 2) {
+    result.first = Z;
+    result.second = LOWER;
+    return result;
+  }
+  if (local_box[Y][UPPER] < global_box[Y][UPPER] && local_box[Y][UPPER]-local_box[Y][LOWER] > 2) {
+    result.first = Y;
+    result.second = UPPER;
+    return result;
+  }
+  if (local_box[Y][LOWER] > global_box[Y][LOWER] && local_box[Y][UPPER]-local_box[Y][LOWER] > 2) {
+    result.first = Y;
+    result.second = LOWER;
+    return result;
+  }
+  if (local_box[X][UPPER] < global_box[X][UPPER] && local_box[X][UPPER]-local_box[X][LOWER] > 2) {
+    result.first = X;
+    result.second = UPPER;
+    return result;
+  }
+  if (local_box[X][LOWER] > global_box[X][LOWER] && local_box[X][UPPER]-local_box[X][LOWER] > 2) {
+    result.first = X;
+    result.second = LOWER;
+    return result;
+  }
+  return result;
+}
+
+template<typename GlobalOrdinal>
+void
+add_imbalance(const Box& global_box,
+              Box& local_box,
+              float imbalance,
+              YAML_Doc& doc)
+{ //Moves box faces between ranks until compute_imbalance reports at least 'imbalance', or no move is possible.
+  int numprocs = 1, myproc = 0;
+#ifdef HAVE_MPI
+  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+#endif
+  //with a single process (always the case without HAVE_MPI) there is nothing to do
+  if (numprocs == 1) {
+    return;
+  }
+  //doc is only passed through to compute_imbalance, always with record_in_doc == false
+  float cur_imbalance = 0, cur_std_dev = 0;
+  compute_imbalance<GlobalOrdinal>(global_box, local_box,
+                                  cur_imbalance, cur_std_dev, doc, false);
+  //each pass tries to grow the box of max_proc and shrink the box of min_proc by one layer
+  while (cur_imbalance < imbalance) {
+    GlobalOrdinal local_nrows = get_num_ids<GlobalOrdinal>(local_box);
+    GlobalOrdinal min_nrows = 0, max_nrows = 0, global_nrows = 0;
+    int min_proc = myproc, max_proc = myproc;
+    get_global_min_max(local_nrows, global_nrows, min_nrows, min_proc,
+                       max_nrows, max_proc);
+
+    std::pair<int,int> grow(NONE,UPPER);
+    int grow_axis_val = -1;
+    std::pair<int,int> shrink(NONE,UPPER);
+    int shrink_axis_val = -1;
+    //only max_proc / min_proc make a real decision; every other rank keeps (NONE,UPPER)
+    if (myproc == max_proc) {
+      grow = decide_how_to_grow(global_box, local_box);
+      if (grow.first != NONE) {
+        grow_axis_val = local_box[grow.first][grow.second];
+      }
+    }
+    if (myproc == min_proc) {
+      shrink = decide_how_to_shrink(global_box, local_box);
+      if (shrink.first != NONE) {
+        shrink_axis_val = local_box[shrink.first][shrink.second];
+      }
+    }
+    //message layout: [0]=axis, [1]=end, [2..7]=X lower, X upper, Y lower, Y upper, Z lower, Z upper
+    int grow_info[8] = {grow.first, grow.second,
+                        local_box[X][0], local_box[X][1],
+                        local_box[Y][0], local_box[Y][1],
+                        local_box[Z][0], local_box[Z][1]};
+
+    int shrink_info[8] = {shrink.first, shrink.second,
+                        local_box[X][0], local_box[X][1],
+                        local_box[Y][0], local_box[Y][1],
+                        local_box[Z][0], local_box[Z][1]};
+#ifdef HAVE_MPI
+    MPI_Bcast(&grow_info[0], 8, MPI_INT, max_proc, MPI_COMM_WORLD);
+    MPI_Bcast(&shrink_info[0], 8, MPI_INT, min_proc, MPI_COMM_WORLD);
+#endif
+    //from here on every rank works with the decisions and boxes of max_proc / min_proc
+    int grow_axis = grow_info[0];
+    int grow_end = grow_info[1];
+    int shrink_axis = shrink_info[0];
+    int shrink_end = shrink_info[1];
+    int grow_incr = 1;
+    if (grow_end == LOWER) grow_incr = -1;
+    int shrink_incr = -1;
+    if (shrink_end == LOWER) shrink_incr = 1;
+    if (grow_axis != NONE) grow_axis_val = grow_info[2+grow_axis*2+grow_end];
+    if (shrink_axis != NONE) shrink_axis_val = shrink_info[2+shrink_axis*2+shrink_end];
+    //neither rank found a face to move: stop
+    if (grow_axis == NONE && shrink_axis == NONE) break;
+    //veto the grow if this rank's box would lose a layer at that coordinate and is fewer than 2 wide there
+    bool grow_status = grow_axis==NONE ? false : true;
+    if (grow_axis != NONE) {
+      if ((grow_incr ==  1 && local_box[grow_axis][0] == grow_axis_val) ||
+          (grow_incr == -1 && local_box[grow_axis][1] == grow_axis_val)) {
+        if (local_box[grow_axis][1] - local_box[grow_axis][0] < 2) {
+          grow_status = false;
+        }
+      }
+    }
+    //same veto rule for the shrink move
+    bool shrink_status = shrink_axis==NONE ? false : true;
+    if (shrink_axis != NONE) {
+      if ((shrink_incr ==  1 && local_box[shrink_axis][0] == shrink_axis_val) ||
+          (shrink_incr == -1 && local_box[shrink_axis][1] == shrink_axis_val)) {
+        if (local_box[shrink_axis][1] - local_box[shrink_axis][0] < 2) {
+          shrink_status = false;
+        }
+      }
+    }
+    //vetoes are summed over all ranks: a single veto cancels that move everywhere
+#ifdef HAVE_MPI
+    int statusints[2] = { grow_status ? 0 : 1, shrink_status ? 0 : 1 };
+    int globalstatus[2] = { 0, 0 };
+    MPI_Allreduce(&statusints, &globalstatus, 2, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+    grow_status = globalstatus[0]>0 ? false : true;
+    shrink_status = globalstatus[1]>0 ? false : true;
+#endif
+    //both moves cancelled: stop
+    if (grow_status == false && shrink_status == false) break;
+    //every rank moves each of its bounds on that axis that equals the chosen coordinate
+    if (grow_status && grow_axis != NONE) {
+      if (local_box[grow_axis][0] == grow_axis_val) {
+        local_box[grow_axis][0] += grow_incr;
+      }
+
+      if (local_box[grow_axis][1] == grow_axis_val) {
+        local_box[grow_axis][1] += grow_incr;
+      }
+    }
+
+    if (shrink_status && shrink_axis != NONE) {
+      if (local_box[shrink_axis][0] == shrink_axis_val) {
+        local_box[shrink_axis][0] += shrink_incr;
+      }
+
+      if (local_box[shrink_axis][1] == shrink_axis_val) {
+        local_box[shrink_axis][1] += shrink_incr;
+      }
+    }
+    //re-measure with the updated boxes; the loop condition compares this against the target
+    compute_imbalance<GlobalOrdinal>(global_box, local_box,
+                                    cur_imbalance, cur_std_dev, doc, false);
+  }
+}
+
+}//namespace miniFE
+
+#endif
+
diff --git a/openmp-avx512/utils/miniFE_no_info.hpp b/openmp-avx512/utils/miniFE_no_info.hpp
new file mode 100644
index 0000000..7dc5d6f
--- /dev/null
+++ b/openmp-avx512/utils/miniFE_no_info.hpp
@@ -0,0 +1,39 @@
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#ifndef miniFE_no_info_hpp
+#define miniFE_no_info_hpp
+
+#define MINIFE_HOSTNAME "unknown" //host, kernel and processor strings; all set to "unknown" in this header
+#define MINIFE_KERNEL_NAME "unknown"
+#define MINIFE_KERNEL_RELEASE "unknown"
+#define MINIFE_PROCESSOR "unknown"
+//Compiler and compiler-flag strings, likewise "unknown" here.
+#define MINIFE_CXX "unknown"
+#define MINIFE_CXXFLAGS "unknown"
+
+#endif
diff --git a/openmp-avx512/utils/miniFE_version.h b/openmp-avx512/utils/miniFE_version.h
new file mode 100644
index 0000000..6ae8398
--- /dev/null
+++ b/openmp-avx512/utils/miniFE_version.h
@@ -0,0 +1,35 @@
+#ifndef _minife_version_h_
+#define _minife_version_h_
+
+//@HEADER
+// ************************************************************************
+// 
+//               miniFE: simple finite-element assembly and linear-solve
+//                 Copyright (2006) Sandia Corporation
+// 
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+// 
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//  
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//  
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
+// 
+// ************************************************************************
+//@HEADER
+
+#define MINIFE_VERSION "2.0" //miniFE version, as a string literal
+
+#endif
+
diff --git a/openmp-avx512/utils/mytimer.cpp b/openmp-avx512/utils/mytimer.cpp
new file mode 100644
index 0000000..c896263
--- /dev/null
+++ b/openmp-avx512/utils/mytimer.cpp
@@ -0,0 +1,132 @@
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <cstddef>
+#include <cstdlib>
+#include <mytimer.hpp>
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+namespace miniFE {
+
+/////////////////////////////////////////////////////////////////////////
+
+// Function to return time in seconds.
+// If compiled with no flags, returns wall-clock time from gettimeofday().
+// If compiled with -DWALL, returns elapsed time since the first call.
+
+/////////////////////////////////////////////////////////////////////////
+
+#if defined(HAVE_MPI) && defined(USE_MPI_WTIME)
+
+timer_type mytimer()
+{ //Time in seconds as reported by MPI_Wtime().
+  return static_cast<timer_type>(MPI_Wtime());
+}
+
+
+#elif defined(UseClock)
+
+#include <time.h>
+timer_type mytimer(void)
+{ //Processor time, as counted by clock(), consumed since the first call, in seconds.
+   clock_t t1;
+   static clock_t t0=0;
+   static timer_type CPS = CLOCKS_PER_SEC;
+   timer_type d;
+   //the first call records the reference reading in t0
+   if (t0 == 0) t0 = clock();
+   t1 = clock() - t0;
+   d = t1 / CPS;
+   return(d);
+}
+
+#elif defined(WALL)
+
+#include <cstdlib>
+#include <sys/time.h>
+#include <sys/resource.h>
+timer_type mytimer(void)
+{
+   //Seconds elapsed since the first call; the first call stores the reference time and returns 0.
+   static long start=0, startu;
+   struct timeval tp;
+   gettimeofday(&tp, NULL);
+   if (start == 0) {
+      start = tp.tv_sec;
+      startu = tp.tv_usec;
+      return(0.0);
+   }
+   //whole seconds plus the (possibly negative) microsecond difference
+   return( ((timer_type) (tp.tv_sec - start)) + (tp.tv_usec-startu)/1000000.0 );
+}
+
+#elif defined(UseTimes)
+
+#include <cstdlib>
+#include <sys/times.h>
+#include <unistd.h>
+timer_type mytimer(void)
+{
+   //User CPU time of this process in seconds: tms_utime clock ticks / ticks per second.
+   static timer_type ticks_per_sec=0.0; //queried from sysconf once, on the first call
+   if (ticks_per_sec == 0.0) ticks_per_sec = static_cast<timer_type>(sysconf(_SC_CLK_TCK));
+   struct tms usage;
+   times(&usage);
+   return( static_cast<timer_type>(usage.tms_utime) / ticks_per_sec );
+}
+
+#else
+
+#include <cstdlib>
+#include <sys/time.h>
+#include <sys/resource.h>
+timer_type mytimer(void)
+{
+//This function now uses gettimeofday instead of getrusage. See note below.
+//It returns the gettimeofday reading (seconds plus microseconds) as one value in seconds.
+  struct timeval tv;
+  //POSIX leaves the behavior unspecified for a non-null timezone argument, so pass NULL.
+  gettimeofday(&tv, NULL);
+  return ( (timer_type)tv.tv_sec + tv.tv_usec/1000000.0 );
+
+//The below use of 'getrusage' is not used because it doesn't do the right thing
+//for the case of using threads. It adds up the time spent in multiple threads,
+//rather than giving elapsed time.
+//
+//   struct rusage ruse;
+//   getrusage(RUSAGE_SELF, &ruse);
+//   return( (timer_type)(ruse.ru_utime.tv_sec+ruse.ru_utime.tv_usec / 1000000.0) );
+}
+
+#endif
+
+}//namespace miniFE
+
diff --git a/openmp-avx512/utils/mytimer.hpp b/openmp-avx512/utils/mytimer.hpp
new file mode 100644
index 0000000..824dbee
--- /dev/null
+++ b/openmp-avx512/utils/mytimer.hpp
@@ -0,0 +1,52 @@
+#ifndef _mytimer_hpp_
+#define _mytimer_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+namespace miniFE {
+
+typedef double timer_type;
+
+timer_type mytimer();
+
+enum CG_TIMES { //named timer slots (presumably one per CG solver kernel -- TODO confirm)
+  WAXPY = 0,
+  DOT = 1,
+  MATVEC = 2,
+  MATVECDOT = 3,
+  TOTAL = 4,
+  NUM_TIMERS = 5 //equals the number of slots listed above (0..4)
+};
+
+//Use TICK and TOCK to time a code section (a variable named 't0' must be in scope)
+#define TICK() t0 = mytimer();
+#define TOCK(t) t += mytimer() - t0;
+
+}//namespace miniFE
+
+#endif
diff --git a/openmp-avx512/utils/outstream.hpp b/openmp-avx512/utils/outstream.hpp
new file mode 100644
index 0000000..bff02cd
--- /dev/null
+++ b/openmp-avx512/utils/outstream.hpp
@@ -0,0 +1,45 @@
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#ifndef _outstream_hpp_
+#define _outstream_hpp_
+
+#include <fstream>
+#include <sstream>
+
+inline
+std::ostream& outstream(int np=1, int p=0)
+{
+  //Name and open the debug file on the first call only; later calls reuse it.
+  static bool name_pending = true;
+  static std::ostringstream name;
+  if (name_pending) { name << "minife_debug."<<np<<"."<<p; name_pending = false; }
+  static std::ofstream debug_file(name.str().c_str(), std::ios::out);
+  return debug_file;
+}
+
+#endif
diff --git a/openmp-avx512/utils/param_utils.cpp b/openmp-avx512/utils/param_utils.cpp
new file mode 100644
index 0000000..c81901f
--- /dev/null
+++ b/openmp-avx512/utils/param_utils.cpp
@@ -0,0 +1,58 @@
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <param_utils.hpp>
+
+#include <sstream>
+#include <fstream>
+
+namespace Mantevo {
+
+//-------------------------------------------------------------
+void read_args_into_string(int argc, char** argv, std::string& arg_string)
+{
+  //argv[0] can be absent/null when argc is 0; never build a string from null.
+  arg_string = (argc > 0 && argv[0]) ? argv[0] : "";
+  for(int i=1; i<argc; ++i)
+    arg_string += " " + std::string(argv[i]);
+}
+
+//-------------------------------------------------------------
+void read_file_into_string(const std::string& filename,
+                           std::string& file_contents)
+{
+  file_contents.clear();
+  std::ifstream ifs(filename.c_str());
+  std::string line;
+  //Loop on the read itself: an unopenable file or a very long line ends the loop
+  while(std::getline(ifs, line)) {
+    file_contents += " " + line;
+  }
+}
+
+}//namespace Mantevo
+
diff --git a/openmp-avx512/utils/param_utils.hpp b/openmp-avx512/utils/param_utils.hpp
new file mode 100644
index 0000000..42d7b7c
--- /dev/null
+++ b/openmp-avx512/utils/param_utils.hpp
@@ -0,0 +1,160 @@
+#ifndef _param_utils_hpp_
+#define _param_utils_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <string>
+#include <sstream>
+
+//Parameter-parsing Utilities:
+//
+//The functions declared below are intended to assist with parsing
+//input-parameters which may be command-line arguments and/or lines in a
+//text file.
+//
+// Scenario: You want your program to accept parameters that are specified
+// as command-line arguments and/or as lines in a text file (such
+// as a YAML output file). i.e., your program can be run like this:
+// % program.exe foo=3.14159 bar: 42
+// or
+// % program.exe input_file=params.txt
+// or
+// % program.exe foo=3.14159 input_file = params.txt
+//
+//Example:
+// Here is example code to obtain parameters using the 3 functions
+// 'read_args_into_string', 'read_file_into_string' and 'parse_parameter':
+//
+//   std::string arg_string;
+//
+//   //put command-line-arguments into 'arg_string':
+//   read_args_into_string(argc, argv, arg_string);
+//
+//   //do the command-line-arguments specify an 'input_file'?
+//   std::string filename =
+//      parse_parameter<std::string>(arg_string,"input_file","none-specified");
+//
+//   if (filename != "none-specified") {
+//     std::string tmp;
+//     read_file_into_string(filename, tmp);
+//     arg_string += tmp;
+//   }
+//
+//  //now parse the parameters:
+//  float foo = parse_parameter<float>(arg_string, "foo", -9.9);
+//  int bar   = parse_parameter<int>(arg_string, "bar", -1);
+//
+//See the comments below for parse_parameter, for formatting requirements of
+//named parameter-value pairs.
+//
+
+namespace Mantevo {
+
+/**
+ * Concatenate command-line arguments into a single string.
+ *
+ * Note: this function is purely serial. If argc and argv have different
+ * values on different MPI processes, then you need to resolve that by
+ * broadcasting arg_string's contents.
+ */
+void read_args_into_string(int argc, char** argv, std::string& arg_string);
+
+/**
+ * Read the contents of a text-file into a single string.
+ *
+ * Note: this function is purely serial. If you want file_contents on multiple
+ * MPI processes, you need to broadcast it (or call this function on each
+ * MPI process...).
+ */
+void read_file_into_string(const std::string& filename,
+                           std::string& file_contents);
+
+/**
+ * Parse a named parameter value from input 'arg_string'.
+ *
+ * Search 'arg_string' for an occurrence of param_name and attempt to parse
+ * a value into the return-type. If param_name is not found, then default_value
+ * is returned.
+ *
+ * Example:
+ * arg_string = "foo = 3.14159";
+ * float foo = parse_parameter<float>(arg_string, "foo", -999.9);
+ * //foo should now contain the value 3.14159; if 'foo' was not found in
+ * //arg_string, then -999.9 would have been returned.
+ *
+ * Other legal name-value separators are ':' and ' '. Extra spaces are also ok,
+ * e.g. "foo : 3.14159".
+ *
+ * Note that if a YAML file is read into a string, that would be a valid input
+ * string for this function.
+ */
+template<typename T>
+T parse_parameter(const std::string& arg_string,
+                const std::string& param_name,
+                const T& default_value)
+{
+  //A match only counts when it is not embedded in a longer name, so that
+  //e.g. "device" is not found inside "skip_device" or "num_devices".
+  const std::string name_chars(
+      "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_");
+  std::string::size_type pos = arg_string.find(param_name);
+  while (pos != std::string::npos) {
+    const std::string::size_type end = pos + param_name.size();
+    const bool starts_name = pos == 0 ||
+        name_chars.find(arg_string[pos-1]) == std::string::npos;
+    const bool ends_name = end >= arg_string.size() ||
+        name_chars.find(arg_string[end]) == std::string::npos;
+    if (starts_name && ends_name) break;
+    pos = arg_string.find(param_name, pos+1);
+  }
+  //if param_name is not found in arg_string, return default_value:
+  if (pos == std::string::npos) return default_value;
+
+  pos += param_name.size();
+
+  //skip past any run of ' ', '=' or ':' between the name and its value:
+  while(pos < arg_string.size() &&
+        (arg_string[pos] == ' ' ||
+         arg_string[pos] == '=' ||
+         arg_string[pos] == ':'))
+  {
+    ++pos;
+  }
+
+  std::istringstream isstr(arg_string.substr(pos));
+  T return_val = default_value;
+
+  //parse value into return_val; if parse failed, return default_value:
+  isstr >> return_val;
+  if (!isstr) return default_value;
+  return return_val;
+}
+
+}//namespace Mantevo
+
+#endif
+
diff --git a/openmp-avx512/utils/utils.cpp b/openmp-avx512/utils/utils.cpp
new file mode 100644
index 0000000..29fcc8f
--- /dev/null
+++ b/openmp-avx512/utils/utils.cpp
@@ -0,0 +1,136 @@
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <string>
+#include <iostream>
+#include <sstream>
+#include <fstream>
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+#ifdef MINIFE_HAVE_TPI
+#include <TPI.h>
+#endif
+
+#ifdef MINIFE_HAVE_TBB
+#include <tbb/task_scheduler_init.h>
+#endif
+
+#include <param_utils.hpp>
+#include <Parameters.hpp>
+#include <utils.hpp>
+
+namespace miniFE {
+
+//-------------------------------------------------------------
+void get_parameters(int argc, char** argv, Parameters& params)
+{
+  //Flatten the command line into a single string to search.
+  std::string args;
+  Mantevo::read_args_into_string(argc, argv, args);
+
+  //An "input_file" argument makes that file's text replace the command line.
+  const std::string no_file("garbage");
+  const std::string input_file =
+      Mantevo::parse_parameter<std::string>(args, "input_file", no_file);
+  if (input_file != no_file) Mantevo::read_file_into_string(input_file, args);
+
+  //ny falls back to nx, and nz to ny, when not given explicitly.
+  params.nx = Mantevo::parse_parameter<int>(args, "nx", 10);
+  params.ny = Mantevo::parse_parameter<int>(args, "ny", params.nx);
+  params.nz = Mantevo::parse_parameter<int>(args, "nz", params.ny);
+  params.load_imbalance =
+      Mantevo::parse_parameter<float>(args, "load_imbalance", 0);
+  params.numthreads = Mantevo::parse_parameter<int>(args, "numthreads", 1);
+  params.mv_overlap_comm_comp = Mantevo::parse_parameter<int>(args, "mv_overlap_comm_comp", 0);
+  params.use_locking = Mantevo::parse_parameter<int>(args, "use_locking", 0);
+  params.name = Mantevo::parse_parameter<std::string>(args, "name","");
+  params.elem_group_size = Mantevo::parse_parameter<int>(args, "elem_group_size", 1);
+  params.use_elem_mat_fields = Mantevo::parse_parameter<int>(args, "use_elem_mat_fields", 1);
+  params.verify_solution = Mantevo::parse_parameter<int>(args, "verify_solution", 0);
+  params.device = Mantevo::parse_parameter<int>(args, "device", 0);
+  params.num_devices = Mantevo::parse_parameter<int>(args, "num_devices", 2);
+  params.skip_device = Mantevo::parse_parameter<int>(args, "skip_device", 9999);
+  params.numa = Mantevo::parse_parameter<int>(args, "numa", 1);
+}
+
+//-------------------------------------------------------------
+void broadcast_parameters(Parameters& params)
+{ //does nothing unless HAVE_MPI is defined
+#ifdef HAVE_MPI
+  const int num_int_params = 13; //must match the number of entries packed into iparams
+  int iparams[num_int_params] = {params.nx, params.ny, params.nz, params.numthreads, params.mv_overlap_comm_comp, params.use_locking,
+		     params.elem_group_size, params.use_elem_mat_fields, params.verify_solution,
+		     params.device, params.num_devices,params.skip_device,params.numa};
+  MPI_Bcast(&iparams[0], num_int_params, MPI_INT, 0, MPI_COMM_WORLD); //root is rank 0
+  params.nx = iparams[0]; //unpack in exactly the order used to pack above
+  params.ny = iparams[1];
+  params.nz = iparams[2];
+  params.numthreads = iparams[3];
+  params.mv_overlap_comm_comp = iparams[4];
+  params.use_locking = iparams[5];
+  params.elem_group_size = iparams[6];
+  params.use_elem_mat_fields = iparams[7];
+  params.verify_solution = iparams[8];
+  params.device = iparams[9];
+  params.num_devices = iparams[10];
+  params.skip_device = iparams[11];
+  params.numa = iparams[12];
+  //load_imbalance is sent separately, as an MPI_FLOAT:
+  float fparams[1] = {params.load_imbalance};
+  MPI_Bcast(&fparams[0], 1, MPI_FLOAT, 0, MPI_COMM_WORLD);
+  params.load_imbalance = fparams[0];
+  //Note: params.name is not broadcast by this function.
+#endif
+}
+
+//-------------------------------------------------------------
+void initialize_mpi(int argc, char** argv, int& numprocs, int& myproc)
+{
+  //Serial defaults; overwritten below when built with MPI.
+  numprocs = 1;
+  myproc = 0;
+#ifdef HAVE_MPI
+  MPI_Init(&argc, &argv);
+  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+#endif
+}
+
+//-------------------------------------------------------------
+void finalize_mpi()
+{ //calls MPI_Finalize when built with HAVE_MPI; otherwise does nothing
+#ifdef HAVE_MPI
+  MPI_Finalize();
+#endif
+}
+
+}//namespace miniFE
+
diff --git a/openmp-avx512/utils/utils.hpp b/openmp-avx512/utils/utils.hpp
new file mode 100644
index 0000000..263294d
--- /dev/null
+++ b/openmp-avx512/utils/utils.hpp
@@ -0,0 +1,204 @@
+#ifndef _utils_hpp_
+#define _utils_hpp_
+
+//@HEADER
+// ************************************************************************
+//
+// MiniFE: Simple Finite Element Assembly and Solve
+// Copyright (2006-2013) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+// USA
+//
+// ************************************************************************
+//@HEADER
+
+#include <cstdlib>
+#include <cmath>
+#include <vector>
+#include <map>
+
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+#include <TypeTraits.hpp>
+#include <Parameters.hpp>
+
+namespace miniFE {
+
+void get_parameters(int argc, char** argv, Parameters& params);
+
+void broadcast_parameters(Parameters& params);
+
+void initialize_mpi(int argc, char** argv, int& numprocs, int& myproc);
+
+void finalize_mpi();
+
+template<typename Scalar>
+Scalar percentage_difference(Scalar value, Scalar average)
+{
+  //result will be the absolute difference between value and average,
+  //represented as a percentage of the magnitude of average.
+  //Examples:
+  //  if value=100 and average=50, result is 100%
+  //  if value=500 and average=400, result is 25%
+  //
+  //Note: if average is (nearly) 0, result is undefined and -1 is returned.
+  //Dividing by |average| keeps every defined result non-negative, so -1
+  //cannot be confused with a real percentage when average is negative.
+  const Scalar magnitude = std::abs(average);
+  if (!(magnitude > 1.e-5)) return -1;
+
+  Scalar result = std::abs(value-average);
+  result /= magnitude;
+  result *= 100;
+  return result;
+}
+
+template<typename GlobalOrdinal>
+void get_global_min_max(GlobalOrdinal local_n,
+                        GlobalOrdinal& global_n,
+                        GlobalOrdinal& min_n,
+                        int& min_proc,
+                        GlobalOrdinal& max_n,
+                        int& max_proc)
+{
+//Given a local_n, compute global_n (the sum), min/max and the processors that
+//own them. All computed results will be returned on all processors.
+//Each local_n is placed in a zero-filled array that is combined with MPI_MAX.
+  int numprocs = 1, myproc = 0;
+#ifdef HAVE_MPI
+  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
+#endif
+
+  std::vector<GlobalOrdinal> all_n(numprocs, 0);
+  all_n[myproc] = local_n;
+#ifdef HAVE_MPI
+  std::vector<GlobalOrdinal> tmp(all_n);
+  MPI_Datatype mpi_dtype = TypeTraits<GlobalOrdinal>::mpi_type();
+  MPI_Allreduce(&tmp[0], &all_n[0], numprocs, mpi_dtype, MPI_MAX, MPI_COMM_WORLD);
+#endif
+
+  //Seed min/max from entry 0, not from guesses (5*local_n, 0): a guess that no
+  //entry beats leaves min_proc/max_proc at 0 and can differ between processors.
+  global_n = 0;
+  min_n = max_n = all_n[0];
+  min_proc = max_proc = 0;
+
+  for(int i=0; i<numprocs; ++i) {
+    global_n += all_n[i];
+    //min_proc will be the lowest-numbered proc with n = min_n
+    if (all_n[i] < min_n) {
+      min_n = all_n[i];
+      min_proc = i;
+    }
+    //max_proc will be the highest-numbered proc with n = max_n
+    if (all_n[i] >= max_n) {
+      max_n = all_n[i];
+      max_proc = i;
+    }
+  }
+}
+
+template<typename Scalar>
+Scalar compute_std_dev_as_percentage(Scalar local_nrows,
+                                     Scalar avg_nrows)
+{
+//Returns the standard deviation of the per-processor local_nrows values
+//about avg_nrows, expressed as a percentage of avg_nrows. The divisor is
+//numprocs-1, and builds without MPI always return 0.
+//
+//Input argument local_nrows is really an integer, but taking it as a
+//floating-point scalar is harmless.
+#ifndef HAVE_MPI
+  return 0;
+#else
+  int nprocs = 1, rank = 0;
+  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  const MPI_Datatype scalar_type = TypeTraits<Scalar>::mpi_type();
+
+//Every processor gathers all the counts and computes the same result. Only
+//proc 0 really needs it, so MPI_Gather could replace MPI_Allgather if that
+//proves significantly more efficient.
+//
+  std::vector<Scalar> counts(nprocs, 0);
+  MPI_Allgather(&local_nrows, 1, scalar_type, &counts[0], 1, scalar_type, MPI_COMM_WORLD);
+
+  //accumulate the squared deviation of every count from avg_nrows:
+  Scalar sum_sqr_dev = 0;
+  typedef typename std::vector<Scalar>::const_iterator count_iter;
+  for(count_iter it = counts.begin(); it != counts.end(); ++it) {
+    const Scalar dev = *it - avg_nrows;
+    sum_sqr_dev += dev * dev;
+  }
+
+  //with a single processor the deviation is taken to be 0; otherwise the
+  //sum of squares is divided by nprocs-1 before the square root:
+  Scalar std_dev = 0;
+  if (nprocs > 1) std_dev = std::sqrt(sum_sqr_dev/(nprocs-1));
+
+  //std_dev is measured in rows; convert it to a percentage of avg_nrows:
+  std_dev /= avg_nrows;
+  std_dev *= 100;
+  return std_dev;
+#endif
+}
+
+template<typename GlobalOrdinal>
+GlobalOrdinal find_row_for_id(GlobalOrdinal id,
+                              const std::map<GlobalOrdinal,GlobalOrdinal>& ids_to_rows)
+{
+  //Looks up the entry with the largest key <= id and returns its row plus
+  //(id - key). Returns -99, after printing an error, when no such entry exists.
+  if (ids_to_rows.empty()) {
+    std::cout << "ERROR, failed to map id to row."<<std::endl;
+    return -99;
+  }
+
+  typename std::map<GlobalOrdinal,GlobalOrdinal>::const_iterator
+    iter = ids_to_rows.lower_bound(id);
+
+  if (iter != ids_to_rows.end() && iter->first == id) {
+    return iter->second;
+  }
+
+  //id is smaller than every key: there is no earlier entry to step back to,
+  //and decrementing begin() would be undefined behavior.
+  if (iter == ids_to_rows.begin()) {
+    std::cout << "ERROR, id:" << id << ", ids_to_rows.begin(): " << iter->first<<std::endl;
+    return -99;
+  }
+
+  --iter;
+  GlobalOrdinal offset = id - iter->first;
+
+  if (offset < 0) {
+    std::cout << "ERROR, negative offset in find_row_for_id for id="<<id<<std::endl;
+    return -99;
+  }
+
+  return iter->second + offset;
+}
+
+}//namespace miniFE
+
+#endif
+
-- 
GitLab