diff --git a/Golden_Repo/e/ELPA/ELPA-2021.11.001-gomkl-2021b.eb b/Golden_Repo/e/ELPA/ELPA-2021.11.001-gomkl-2021b.eb
new file mode 100644
index 0000000000000000000000000000000000000000..cd313db7f33e93dd93834a820b48cc2727306f6c
--- /dev/null
+++ b/Golden_Repo/e/ELPA/ELPA-2021.11.001-gomkl-2021b.eb
@@ -0,0 +1,107 @@
+name = 'ELPA'
+version = '2021.11.001'
+
+homepage = 'https://elpa.rzg.mpg.de'
+description = """Eigenvalue SoLvers for Petaflop-Applications. ELPA has been installed as a module in
+$EBROOTELPA ($ELPA_ROOT is also defined). This installation
+contains the pure MPI version and the hybrid MPI/OpenMP version.
+Note: if you want to use OpenMP threads, you have to set
+export ELPA_DEFAULT_omp=<number of threads per MPI process>
+in your batch job and start MPI with MPI_INIT_THREAD(MPI_THREAD_MULTIPLE, ...).
+
+Several assembly kernels have been compiled. They can be chosen at runtime when calling the library or
+with the environment variables REAL_ELPA_KERNEL or COMPLEX_ELPA_KERNEL.
+
+An example is
+export REAL_ELPA_KERNEL=REAL_ELPA_KERNEL_GENERIC
+which chooses the generic real kernel for elpa2.
+Starting with version 2019.11.001 the legacy interface is no longer available.
+"""
+
+usage = """You can get an overview over the available kernels by loading ELPA and then submitting a batch job with
+
+srun --ntasks=1 $EBROOTELPA/bin/elpa2_print_kernels
+
+Programs using this ELPA library have to be compiled with
+
+-I$ELPA_INCLUDE[_OPENMP]/ -I$ELPA_INCLUDE[_OPENMP]/elpa -I$ELPA_MODULES[_OPENMP]
+
+and linked with
+
+-L$EBROOTELPA/lib -lelpa[_openmp]
+-lmkl_scalapack_lp64
+${MKLROOT}/lib/intel64/libmkl_blacs_openmpi_lp64.a
+-lmkl_gf_lp64 -lmkl_sequential (or -lmkl_gnu_thread for the OpenMP version)
+-lmkl_core -lgomp -lpthread -lm -ldl -lstdc++
+"""
+
+examples = 'Examples can be found in $EBROOTELPA/examples'
+
+toolchain = {'name': 'gomkl', 'version': '2021b'}
+toolchainopts = {'openmp': True, 'usempi': True}
+
+source_urls = ['https://gitlab.mpcdf.mpg.de/elpa/elpa/-/archive/new_release_%(version)s/']
+sources = ["elpa-new_release_%(version)s.tar.gz"]
+patches = [
+    '%(name)s-%(version)s_fix_hardcoded_perl_path.patch',
+    'ELPA-%(version)s_install-libelpatest.patch',
+]
+checksums = [
+    'e61048393a5e5f460858a11b216547fa3f434dd620c478cb20a52ebf543260f1',  # elpa-new_release_2021.11.001.tar.gz
+    # ELPA-2021.11.001_fix_hardcoded_perl_path.patch
+    '5fc40b6f3f948fd026efc688f9bafba0461d68ad007d9dc161bfd1507e2fc13b',
+    '2ce155ccbcdd61e8036d859aa204b48883695eff5f4decee3e5c2677516d8272',  # ELPA-2021.11.001_install-libelpatest.patch
+]
+
+builddependencies = [
+    ('Autotools', '20210726'),
+    # the remove_xcompiler script requires the 'python' command
+    ('Python', '3.9.6'),
+    ('Perl', '5.34.0'),
+]
+
+preconfigopts = './autogen.sh && '
+preconfigopts += 'export LDFLAGS="-lm $LDFLAGS" && '
+preconfigopts += 'autoreconf && '
+
+# The check for MPI_THREAD_MULTIPLE does not work because the check uses an
+# MPI program that is then executed by just ./conftest (without an MPI launcher).
+# Unfortunately, the corresponding runtime check cannot be turned off either.
+configopts = '--without-threading-support-check-during-build '
+
+with_single = False
+
+# When building in parallel, the file test_setup_mpi.mod is sometimes
+# used before it is built, leading to an error.  This must be a bug in
+# the makefile affecting parallel builds.
+maxparallel = 1
+
+
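+# The library itself is built with OpenMP enabled, so config.h and config-f90.h
+# carry the WITH_OPENMP* defines; the grep commands below strip those lines so
+# the headers installed for the pure MPI variant match a non-OpenMP build.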
+postinstallcmds = [
+    'cp -r %(builddir)s/elpa-new_release_%(version)s/examples %(installdir)s/examples/',
+    'rm %(installdir)s/examples/*.orig',
+    'rm %(installdir)s/examples/*_cuda',
+    'rm %(installdir)s/examples/C/*.orig',
+    'rm %(installdir)s/examples/C/*_cuda',
+    'rm %(installdir)s/examples/Fortran/*.orig',
+    'rm %(installdir)s/examples/Fortran/*_cuda',
+    'cp %(builddir)s/elpa-new_release_%(version)s/test/shared/generated.h %(installdir)s/examples/C/generated.h',
+    'cp config.h config-f90.h %(installdir)s/include/elpa_openmp-%(version)s/elpa/',
+    'grep -v WITH_OPENMP config.h > %(installdir)s/include/elpa-%(version)s/elpa/config.h',
+    'grep -v WITH_OPENMP config-f90.h > %(installdir)s/include/elpa-%(version)s/elpa/config-f90.h',
+    'cp %(builddir)s/elpa-new_release_%(version)s/private_modules/* %(installdir)s/include/elpa-%(version)s/modules',
+    'cp %(builddir)s/elpa-new_release_%(version)s/test_modules/* %(installdir)s/include/elpa-%(version)s/modules',
+]
+
+modextravars = {
+    'ELPA_ROOT': '%(installdir)s',
+    'ELPAROOT': '%(installdir)s',
+    'ELPA_INCLUDE': '%(installdir)s/include/elpa-%(version)s/',
+    'ELPA_INCLUDE_OPENMP': '%(installdir)s/include/elpa_openmp-%(version)s/',
+    'ELPA_LIB': '%(installdir)s/lib',
+    'ELPA_LIB_OPENMP': '%(installdir)s/lib',
+    'ELPA_MODULES': '%(installdir)s/include/elpa-%(version)s/modules',
+    'ELPA_MODULES_OPENMP': '%(installdir)s/include/elpa_openmp-%(version)s/modules',
+}
+
+moduleclass = 'math'
diff --git a/Golden_Repo/e/ELPA/ELPA-2021.11.001-gpsmkl-2021b.eb b/Golden_Repo/e/ELPA/ELPA-2021.11.001-gpsmkl-2021b.eb
new file mode 100644
index 0000000000000000000000000000000000000000..eeeef09bff2024c840a5b531087aebdc8b6181ae
--- /dev/null
+++ b/Golden_Repo/e/ELPA/ELPA-2021.11.001-gpsmkl-2021b.eb
@@ -0,0 +1,105 @@
+name = 'ELPA'
+version = '2021.11.001'
+
+homepage = 'https://elpa.rzg.mpg.de'
+description = """Eigenvalue SoLvers for Petaflop-Applications. ELPA has been installed as a module in
+$EBROOTELPA ($ELPA_ROOT is also defined). This installation
+contains the pure MPI version and the hybrid MPI/OpenMP version.
+Note: if you want to use OpenMP threads, you have to set
+export ELPA_DEFAULT_omp=<number of threads per MPI process>
+in your batch job and start MPI with MPI_INIT_THREAD(MPI_THREAD_MULTIPLE, ...).
+
+Several assembly kernels have been compiled. They can be chosen at runtime when calling the library or
+with the environment variables REAL_ELPA_KERNEL or COMPLEX_ELPA_KERNEL.
+
+An example is
+export REAL_ELPA_KERNEL=REAL_ELPA_KERNEL_GENERIC
+which chooses the generic real kernel for elpa2.
+Starting with version 2019.11.001 the legacy interface is no longer available.
+"""
+
+usage = """You can get an overview over the available kernels by loading ELPA and then submitting a batch job with
+
+srun --ntasks=1 $EBROOTELPA/bin/elpa2_print_kernels
+
+Programs using this ELPA library have to be compiled with
+
+-I$ELPA_INCLUDE[_OPENMP]/ -I$ELPA_INCLUDE[_OPENMP]/elpa -I$ELPA_MODULES[_OPENMP]
+
+and linked with
+
+-L$EBROOTELPA/lib -lelpa[_openmp]
+-lmkl_scalapack_lp64 -lmkl_blacs_intelmpi_lp64 -lmkl_gf_lp64
+-lmkl_sequential (or -lmkl_gnu_thread for the OpenMP version)
+-lmkl_core -lgomp -lpthread -lm -ldl -lstdc++
+"""
+
+examples = 'Examples can be found in $EBROOTELPA/examples'
+
+toolchain = {'name': 'gpsmkl', 'version': '2021b'}
+toolchainopts = {'openmp': True, 'usempi': True}
+
+source_urls = ['https://gitlab.mpcdf.mpg.de/elpa/elpa/-/archive/new_release_%(version)s/']
+sources = ["elpa-new_release_%(version)s.tar.gz"]
+patches = [
+    '%(name)s-%(version)s_fix_hardcoded_perl_path.patch',
+    'ELPA-%(version)s_install-libelpatest.patch',
+]
+checksums = [
+    'e61048393a5e5f460858a11b216547fa3f434dd620c478cb20a52ebf543260f1',  # elpa-new_release_2021.11.001.tar.gz
+    # ELPA-2021.11.001_fix_hardcoded_perl_path.patch
+    '5fc40b6f3f948fd026efc688f9bafba0461d68ad007d9dc161bfd1507e2fc13b',
+    '2ce155ccbcdd61e8036d859aa204b48883695eff5f4decee3e5c2677516d8272',  # ELPA-2021.11.001_install-libelpatest.patch
+]
+
+builddependencies = [
+    ('Autotools', '20210726'),
+    # the remove_xcompiler script requires the 'python' command
+    ('Python', '3.9.6'),
+    ('Perl', '5.34.0'),
+]
+
+preconfigopts = './autogen.sh && '
+preconfigopts += 'export LDFLAGS="-lm $LDFLAGS" && '
+preconfigopts += 'autoreconf && '
+
+# The check for MPI_THREAD_MULTIPLE does not work because the check uses an
+# MPI program that is then executed by just ./conftest (without an MPI launcher).
+# Unfortunately, the corresponding runtime check cannot be turned off either.
+configopts = '--without-threading-support-check-during-build '
+
+with_single = False
+
+# When building in parallel, the file test_setup_mpi.mod is sometimes
+# used before it is built, leading to an error.  This must be a bug in
+# the makefile affecting parallel builds.
+maxparallel = 1
+
+postinstallcmds = [
+    'cp -r %(builddir)s/elpa-new_release_%(version)s/examples %(installdir)s/examples/',
+    'rm %(installdir)s/examples/*.orig',
+    'rm %(installdir)s/examples/*_cuda',
+    'rm %(installdir)s/examples/C/*.orig',
+    'rm %(installdir)s/examples/C/*_cuda',
+    'rm %(installdir)s/examples/Fortran/*.orig',
+    'rm %(installdir)s/examples/Fortran/*_cuda',
+    'cp config.h config-f90.h %(installdir)s/include/elpa_openmp-%(version)s/elpa/',
+    'grep -v WITH_OPENMP config.h > %(installdir)s/include/elpa-%(version)s/elpa/config.h',
+    'grep -v WITH_OPENMP config-f90.h > %(installdir)s/include/elpa-%(version)s/elpa/config-f90.h',
+    'cp %(builddir)s/elpa-new_release_%(version)s/test/shared/generated.h %(installdir)s/examples/C/generated.h',
+    'cp %(builddir)s/elpa-new_release_%(version)s/private_modules/* %(installdir)s/include/elpa-%(version)s/modules',
+    'cp %(builddir)s/elpa-new_release_%(version)s/test_modules/* %(installdir)s/include/elpa-%(version)s/modules',
+]
+
+modextravars = {
+    'ELPA_ROOT': '%(installdir)s',
+    'ELPAROOT': '%(installdir)s',
+    'ELPA_INCLUDE': '%(installdir)s/include/elpa-%(version)s/',
+    'ELPA_INCLUDE_OPENMP': '%(installdir)s/include/elpa_openmp-%(version)s/',
+    'ELPA_LIB': '%(installdir)s/lib',
+    'ELPA_LIB_OPENMP': '%(installdir)s/lib',
+    'ELPA_MODULES': '%(installdir)s/include/elpa-%(version)s/modules',
+    'ELPA_MODULES_OPENMP': '%(installdir)s/include/elpa_openmp-%(version)s/modules',
+}
+
+moduleclass = 'math'
diff --git a/Golden_Repo/e/ELPA/ELPA-2021.11.001-intel-2021b.eb b/Golden_Repo/e/ELPA/ELPA-2021.11.001-intel-2021b.eb
new file mode 100644
index 0000000000000000000000000000000000000000..2f80e05e2ba06c1ed2fb01d8dffdb1af26859701
--- /dev/null
+++ b/Golden_Repo/e/ELPA/ELPA-2021.11.001-intel-2021b.eb
@@ -0,0 +1,104 @@
+name = 'ELPA'
+version = '2021.11.001'
+
+homepage = 'https://elpa.rzg.mpg.de'
+description = """Eigenvalue SoLvers for Petaflop-Applications. ELPA has been installed as a module in
+$EBROOTELPA ($ELPA_ROOT is also defined). This installation
+contains the pure MPI version and the hybrid MPI/OpenMP version.
+Note: if you want to use OpenMP threads, you have to set
+export ELPA_DEFAULT_omp=<number of threads per MPI process>
+in your batch job and start MPI with MPI_INIT_THREAD(MPI_THREAD_MULTIPLE, ...).
+
+Several assembly kernels have been compiled. They can be chosen at runtime when calling the library or
+with the environment variables REAL_ELPA_KERNEL or COMPLEX_ELPA_KERNEL.
+
+An example is
+export REAL_ELPA_KERNEL=REAL_ELPA_KERNEL_GENERIC
+which chooses the generic real kernel for elpa2.
+Starting with version 2019.11.001 the legacy interface is no longer available.
+"""
+
+usage = """You can get an overview over the available kernels by loading ELPA and then submitting a batch job with
+
+srun --ntasks=1 $EBROOTELPA/bin/elpa2_print_kernels
+
+Programs using this ELPA library have to be compiled with
+
+-I$ELPA_INCLUDE[_OPENMP]/ -I$ELPA_INCLUDE[_OPENMP]/elpa -I$ELPA_MODULES[_OPENMP]
+
+and linked with
+
+-L$EBROOTELPA/lib -lelpa[_openmp]
+-lmkl_scalapack_lp64 -lmkl_blacs_intelmpi_lp64 -lmkl_intel_lp64
+-lmkl_sequential (or -lmkl_intel_thread for the OpenMP version)
+-lmkl_core -liomp5 -lpthread -lstdc++
+"""
+
+examples = 'Examples can be found in $EBROOTELPA/examples'
+
+toolchain = {'name': 'intel', 'version': '2021b'}
+toolchainopts = {'openmp': True, 'usempi': True}
+
+source_urls = ['https://gitlab.mpcdf.mpg.de/elpa/elpa/-/archive/new_release_%(version)s/']
+sources = ["elpa-new_release_%(version)s.tar.gz"]
+patches = [
+    '%(name)s-%(version)s_fix_hardcoded_perl_path.patch',
+    'ELPA-%(version)s_install-libelpatest.patch',
+]
+checksums = [
+    'e61048393a5e5f460858a11b216547fa3f434dd620c478cb20a52ebf543260f1',  # elpa-new_release_2021.11.001.tar.gz
+    # ELPA-2021.11.001_fix_hardcoded_perl_path.patch
+    '5fc40b6f3f948fd026efc688f9bafba0461d68ad007d9dc161bfd1507e2fc13b',
+    '2ce155ccbcdd61e8036d859aa204b48883695eff5f4decee3e5c2677516d8272',  # ELPA-2021.11.001_install-libelpatest.patch
+]
+
+builddependencies = [
+    ('Autotools', '20210726'),
+    # the remove_xcompiler script requires the 'python' command
+    ('Python', '3.9.6'),
+    ('Perl', '5.34.0'),
+]
+
+preconfigopts = './autogen.sh && '
+preconfigopts += 'autoreconf && '
+
+# The check for MPI_THREAD_MULTIPLE does not work because the check uses an
+# MPI program that is then executed by just ./conftest (without an MPI launcher).
+# Unfortunately, the corresponding runtime check cannot be turned off either.
+configopts = '--without-threading-support-check-during-build '
+
+with_single = False
+
+# When building in parallel, the file test_setup_mpi.mod is sometimes
+# used before it is built, leading to an error.  This must be a bug in
+# the makefile affecting parallel builds.
+maxparallel = 1
+
+postinstallcmds = [
+    'cp -r %(builddir)s/elpa-new_release_%(version)s/examples %(installdir)s/examples/',
+    'rm %(installdir)s/examples/*.orig',
+    'rm %(installdir)s/examples/*_cuda',
+    'rm %(installdir)s/examples/C/*.orig',
+    'rm %(installdir)s/examples/C/*_cuda',
+    'rm %(installdir)s/examples/Fortran/*.orig',
+    'rm %(installdir)s/examples/Fortran/*_cuda',
+    'cp config.h config-f90.h %(installdir)s/include/elpa_openmp-%(version)s/elpa/',
+    'grep -v WITH_OPENMP config.h > %(installdir)s/include/elpa-%(version)s/elpa/config.h',
+    'grep -v WITH_OPENMP config-f90.h > %(installdir)s/include/elpa-%(version)s/elpa/config-f90.h',
+    'cp %(builddir)s/elpa-new_release_%(version)s/test/shared/generated.h %(installdir)s/examples/C/generated.h',
+    'cp %(builddir)s/elpa-new_release_%(version)s/private_modules/* %(installdir)s/include/elpa-%(version)s/modules',
+    'cp %(builddir)s/elpa-new_release_%(version)s/test_modules/* %(installdir)s/include/elpa-%(version)s/modules',
+]
+
+modextravars = {
+    'ELPA_ROOT': '%(installdir)s',
+    'ELPAROOT': '%(installdir)s',
+    'ELPA_INCLUDE': '%(installdir)s/include/elpa-%(version)s/',
+    'ELPA_INCLUDE_OPENMP': '%(installdir)s/include/elpa_openmp-%(version)s/',
+    'ELPA_LIB': '%(installdir)s/lib',
+    'ELPA_LIB_OPENMP': '%(installdir)s/lib',
+    'ELPA_MODULES': '%(installdir)s/include/elpa-%(version)s/modules',
+    'ELPA_MODULES_OPENMP': '%(installdir)s/include/elpa_openmp-%(version)s/modules',
+}
+
+moduleclass = 'math'
diff --git a/Golden_Repo/e/ELPA/ELPA-2021.11.001-intel-para-2021b.eb b/Golden_Repo/e/ELPA/ELPA-2021.11.001-intel-para-2021b.eb
new file mode 100644
index 0000000000000000000000000000000000000000..8eda37582dd7d40372a09d88561616236b566c80
--- /dev/null
+++ b/Golden_Repo/e/ELPA/ELPA-2021.11.001-intel-para-2021b.eb
@@ -0,0 +1,104 @@
+name = 'ELPA'
+version = '2021.11.001'
+
+homepage = 'https://elpa.rzg.mpg.de'
+description = """Eigenvalue SoLvers for Petaflop-Applications. ELPA has been installed as a module in
+$EBROOTELPA ($ELPA_ROOT is also defined). This installation
+contains the pure MPI version and the hybrid MPI/OpenMP version.
+Note: if you want to use OpenMP threads, you have to set
+export ELPA_DEFAULT_omp=<number of threads per MPI process>
+in your batch job and start MPI with MPI_INIT_THREAD(MPI_THREAD_MULTIPLE, ...).
+
+Several assembly kernels have been compiled. They can be chosen at runtime when calling the library or
+with the environment variables REAL_ELPA_KERNEL or COMPLEX_ELPA_KERNEL.
+
+An example is
+export REAL_ELPA_KERNEL=REAL_ELPA_KERNEL_GENERIC
+which chooses the generic real kernel for elpa2.
+Starting with version 2019.11.001 the legacy interface is no longer available.
+"""
+
+usage = """You can get an overview over the available kernels by loading ELPA and then submitting a batch job with
+
+srun --ntasks=1 $EBROOTELPA/bin/elpa2_print_kernels
+
+Programs using this ELPA library have to be compiled with
+
+-I$ELPA_INCLUDE[_OPENMP]/ -I$ELPA_INCLUDE[_OPENMP]/elpa -I$ELPA_MODULES[_OPENMP]
+
+and linked with
+
+-L$EBROOTELPA/lib -lelpa[_openmp]
+-lmkl_scalapack_lp64 -lmkl_blacs_intelmpi_lp64 -lmkl_intel_lp64
+-lmkl_sequential (or -lmkl_intel_thread for the OpenMP version)
+-lmkl_core -liomp5 -lpthread -lstdc++
+"""
+
+examples = 'Examples can be found in $EBROOTELPA/examples'
+
+toolchain = {'name': 'intel-para', 'version': '2021b'}
+toolchainopts = {'openmp': True, 'usempi': True}
+
+source_urls = ['https://gitlab.mpcdf.mpg.de/elpa/elpa/-/archive/new_release_%(version)s/']
+sources = ["elpa-new_release_%(version)s.tar.gz"]
+patches = [
+    '%(name)s-%(version)s_fix_hardcoded_perl_path.patch',
+    'ELPA-%(version)s_install-libelpatest.patch',
+]
+checksums = [
+    'e61048393a5e5f460858a11b216547fa3f434dd620c478cb20a52ebf543260f1',  # elpa-new_release_2021.11.001.tar.gz
+    # ELPA-2021.11.001_fix_hardcoded_perl_path.patch
+    '5fc40b6f3f948fd026efc688f9bafba0461d68ad007d9dc161bfd1507e2fc13b',
+    '2ce155ccbcdd61e8036d859aa204b48883695eff5f4decee3e5c2677516d8272',  # ELPA-2021.11.001_install-libelpatest.patch
+]
+
+builddependencies = [
+    ('Autotools', '20210726'),
+    # the remove_xcompiler script requires the 'python' command
+    ('Python', '3.9.6'),
+    ('Perl', '5.34.0'),
+]
+
+preconfigopts = './autogen.sh && '
+preconfigopts += 'autoreconf && '
+
+# The check for MPI_THREAD_MULTIPLE does not work because the check uses an
+# MPI program that is then executed by just ./conftest (without an MPI launcher).
+# Unfortunately, the corresponding runtime check cannot be turned off either.
+configopts = '--without-threading-support-check-during-build '
+
+with_single = False
+
+# When building in parallel, the file test_setup_mpi.mod is sometimes
+# used before it is built, leading to an error.  This must be a bug in
+# the makefile affecting parallel builds.
+maxparallel = 1
+
+postinstallcmds = [
+    'cp -r %(builddir)s/elpa-new_release_%(version)s/examples %(installdir)s/examples/',
+    'rm %(installdir)s/examples/*.orig',
+    'rm %(installdir)s/examples/*_cuda',
+    'rm %(installdir)s/examples/C/*.orig',
+    'rm %(installdir)s/examples/C/*_cuda',
+    'rm %(installdir)s/examples/Fortran/*.orig',
+    'rm %(installdir)s/examples/Fortran/*_cuda',
+    'cp config.h config-f90.h %(installdir)s/include/elpa_openmp-%(version)s/elpa/',
+    'grep -v WITH_OPENMP config.h > %(installdir)s/include/elpa-%(version)s/elpa/config.h',
+    'grep -v WITH_OPENMP config-f90.h > %(installdir)s/include/elpa-%(version)s/elpa/config-f90.h',
+    'cp %(builddir)s/elpa-new_release_%(version)s/test/shared/generated.h %(installdir)s/examples/C/generated.h',
+    'cp %(builddir)s/elpa-new_release_%(version)s/private_modules/* %(installdir)s/include/elpa-%(version)s/modules',
+    'cp %(builddir)s/elpa-new_release_%(version)s/test_modules/* %(installdir)s/include/elpa-%(version)s/modules',
+]
+
+modextravars = {
+    'ELPA_ROOT': '%(installdir)s',
+    'ELPAROOT': '%(installdir)s',
+    'ELPA_INCLUDE': '%(installdir)s/include/elpa-%(version)s/',
+    'ELPA_INCLUDE_OPENMP': '%(installdir)s/include/elpa_openmp-%(version)s/',
+    'ELPA_LIB': '%(installdir)s/lib',
+    'ELPA_LIB_OPENMP': '%(installdir)s/lib',
+    'ELPA_MODULES': '%(installdir)s/include/elpa-%(version)s/modules',
+    'ELPA_MODULES_OPENMP': '%(installdir)s/include/elpa_openmp-%(version)s/modules',
+}
+
+moduleclass = 'math'
diff --git a/Golden_Repo/e/ELPA/ELPA-2021.11.001-iomkl-2021b.eb b/Golden_Repo/e/ELPA/ELPA-2021.11.001-iomkl-2021b.eb
new file mode 100644
index 0000000000000000000000000000000000000000..08e7f242458aa3a1f8959cd1a89621a97e85a3cf
--- /dev/null
+++ b/Golden_Repo/e/ELPA/ELPA-2021.11.001-iomkl-2021b.eb
@@ -0,0 +1,105 @@
+name = 'ELPA'
+version = '2021.11.001'
+
+homepage = 'https://elpa.rzg.mpg.de'
+description = """Eigenvalue SoLvers for Petaflop-Applications. ELPA has been installed as a module in
+$EBROOTELPA ($ELPA_ROOT is also defined). This installation
+contains the pure MPI version and the hybrid MPI/OpenMP version.
+Note: if you want to use OpenMP threads, you have to set
+export ELPA_DEFAULT_omp=<number of threads per MPI process>
+in your batch job and start MPI with MPI_INIT_THREAD(MPI_THREAD_MULTIPLE, ...).
+
+Several assembly kernels have been compiled. They can be chosen at runtime when calling the library or
+with the environment variables REAL_ELPA_KERNEL or COMPLEX_ELPA_KERNEL.
+
+An example is
+export REAL_ELPA_KERNEL=REAL_ELPA_KERNEL_GENERIC
+which chooses the generic real kernel for elpa2.
+Starting with version 2019.11.001 the legacy interface is no longer available.
+"""
+
+usage = """You can get an overview over the available kernels by loading ELPA and then submitting a batch job with
+
+srun --ntasks=1 $EBROOTELPA/bin/elpa2_print_kernels
+
+Programs using this ELPA library have to be compiled with
+
+-I$ELPA_INCLUDE[_OPENMP]/ -I$ELPA_INCLUDE[_OPENMP]/elpa -I$ELPA_MODULES[_OPENMP]
+
+and linked with
+
+-L$EBROOTELPA/lib -lelpa[_openmp]
+-lmkl_scalapack_lp64
+${MKLROOT}/lib/intel64/libmkl_blacs_openmpi_lp64.a
+-lmkl_intel_lp64 -lmkl_sequential (or -lmkl_intel_thread for the OpenMP version)
+-lmkl_core -liomp5 -lpthread -ldl -lstdc++
+"""
+
+examples = 'Examples can be found in $EBROOTELPA/examples'
+
+toolchain = {'name': 'iomkl', 'version': '2021b'}
+toolchainopts = {'openmp': True, 'usempi': True}
+
+source_urls = ['https://gitlab.mpcdf.mpg.de/elpa/elpa/-/archive/new_release_%(version)s/']
+sources = ["elpa-new_release_%(version)s.tar.gz"]
+patches = [
+    '%(name)s-%(version)s_fix_hardcoded_perl_path.patch',
+    'ELPA-%(version)s_install-libelpatest.patch',
+]
+checksums = [
+    'e61048393a5e5f460858a11b216547fa3f434dd620c478cb20a52ebf543260f1',  # elpa-new_release_2021.11.001.tar.gz
+    # ELPA-2021.11.001_fix_hardcoded_perl_path.patch
+    '5fc40b6f3f948fd026efc688f9bafba0461d68ad007d9dc161bfd1507e2fc13b',
+    '2ce155ccbcdd61e8036d859aa204b48883695eff5f4decee3e5c2677516d8272',  # ELPA-2021.11.001_install-libelpatest.patch
+]
+
+builddependencies = [
+    ('Autotools', '20210726'),
+    # the remove_xcompiler script requires the 'python' command
+    ('Python', '3.9.6'),
+    ('Perl', '5.34.0'),
+]
+
+preconfigopts = './autogen.sh && '
+preconfigopts += 'autoreconf && '
+
+# The check for MPI_THREAD_MULTIPLE does not work because the check uses an
+# MPI program that is then executed by just ./conftest (without an MPI launcher).
+# Unfortunately, the corresponding runtime check cannot be turned off either.
+configopts = '--without-threading-support-check-during-build '
+
+with_single = False
+
+# When building in parallel, the file test_setup_mpi.mod is sometimes
+# used before it is built, leading to an error.  This must be a bug in
+# the makefile affecting parallel builds.
+maxparallel = 1
+
+postinstallcmds = [
+    'cp -r %(builddir)s/elpa-new_release_%(version)s/examples %(installdir)s/examples/',
+    'rm %(installdir)s/examples/*.orig',
+    'rm %(installdir)s/examples/*_cuda',
+    'rm %(installdir)s/examples/C/*.orig',
+    'rm %(installdir)s/examples/C/*_cuda',
+    'rm %(installdir)s/examples/Fortran/*.orig',
+    'rm %(installdir)s/examples/Fortran/*_cuda',
+    'cp %(builddir)s/elpa-new_release_%(version)s/test/shared/generated.h %(installdir)s/examples/C/generated.h',
+    'cp config.h config-f90.h %(installdir)s/include/elpa_openmp-%(version)s/elpa/',
+    'grep -v WITH_OPENMP config.h > %(installdir)s/include/elpa-%(version)s/elpa/config.h',
+    'grep -v WITH_OPENMP config-f90.h > %(installdir)s/include/elpa-%(version)s/elpa/config-f90.h',
+    'cp %(builddir)s/elpa-new_release_%(version)s/private_modules/* %(installdir)s/include/elpa-%(version)s/modules',
+    'cp %(builddir)s/elpa-new_release_%(version)s/test_modules/* %(installdir)s/include/elpa-%(version)s/modules',
+]
+
+modextravars = {
+    'ELPA_ROOT': '%(installdir)s',
+    'ELPAROOT': '%(installdir)s',
+    'ELPA_INCLUDE': '%(installdir)s/include/elpa-%(version)s/',
+    'ELPA_INCLUDE_OPENMP': '%(installdir)s/include/elpa_openmp-%(version)s/',
+    'ELPA_LIB': '%(installdir)s/lib',
+    'ELPA_LIB_OPENMP': '%(installdir)s/lib',
+    'ELPA_MODULES': '%(installdir)s/include/elpa-%(version)s/modules',
+    'ELPA_MODULES_OPENMP': '%(installdir)s/include/elpa_openmp-%(version)s/modules',
+}
+
+moduleclass = 'math'
diff --git a/Golden_Repo/e/ELPA/ELPA-2021.11.001_fix_hardcoded_perl_path.patch b/Golden_Repo/e/ELPA/ELPA-2021.11.001_fix_hardcoded_perl_path.patch
new file mode 100644
index 0000000000000000000000000000000000000000..5c7ae5eae05ce11d3a24b3202194616933db38da
--- /dev/null
+++ b/Golden_Repo/e/ELPA/ELPA-2021.11.001_fix_hardcoded_perl_path.patch
@@ -0,0 +1,40 @@
+--- elpa-new_release_2021.11.001/test_project_1stage/fdep/fortran_dependencies.pl	2021-12-17 08:20:49.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/test_project_1stage/fdep/fortran_dependencies.pl	2022-01-25 17:07:21.169362000 +0100
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl -w
++#!/usr/bin/env perl
+ 
+ use strict;
+ 
+--- elpa-new_release_2021.11.001/fdep/fortran_dependencies.pl	2021-12-17 08:20:49.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/fdep/fortran_dependencies.pl	2022-01-25 17:08:17.272544000 +0100
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl -w
++#!/usr/bin/env perl
+ #
+ # Copyright 2015 Lorenz Hüdepohl
+ #
+--- elpa-new_release_2021.11.001/test_project_C_2stage/fdep/fortran_dependencies.pl	2021-12-17 08:20:49.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/test_project_C_2stage/fdep/fortran_dependencies.pl	2022-01-25 17:06:20.088471000 +0100
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl -w
++#!/usr/bin/env perl
+ 
+ use strict;
+ 
+--- elpa-new_release_2021.11.001/test_project_2stage/fdep/fortran_dependencies.pl	2021-12-17 08:20:49.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/test_project_2stage/fdep/fortran_dependencies.pl	2022-01-25 17:05:10.675886000 +0100
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl -w
++#!/usr/bin/env perl
+ 
+ use strict;
+ 
+--- elpa-new_release_2021.11.001/test_project_C/fdep/fortran_dependencies.pl	2021-12-17 08:20:49.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/test_project_C/fdep/fortran_dependencies.pl	2022-01-25 17:04:14.834326000 +0100
+@@ -1,4 +1,4 @@
+-#!/usr/bin/perl -w
++#!/usr/bin/env perl
+ 
+ use strict;
+ 
diff --git a/Golden_Repo/e/ELPA/ELPA-2021.11.001_install-libelpatest.patch b/Golden_Repo/e/ELPA/ELPA-2021.11.001_install-libelpatest.patch
new file mode 100644
index 0000000000000000000000000000000000000000..ad5240130a3f09af760ed386157af998597a8f24
--- /dev/null
+++ b/Golden_Repo/e/ELPA/ELPA-2021.11.001_install-libelpatest.patch
@@ -0,0 +1,12790 @@
+--- elpa-new_release_2021.11.001/Makefile.am	2021-12-17 08:20:49.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/Makefile.am	2022-01-26 10:47:31.956245709 +0100
+@@ -665,7 +665,7 @@
+ test_program_fcflags = $(AM_FCFLAGS) $(FC_MODOUT)test_modules $(FC_MODINC)test_modules $(FC_MODINC)modules $(FC_MODINC)private_modules
+ 
+ # library with shared sources for the test files
+-noinst_LTLIBRARIES += libelpatest@SUFFIX@.la
++lib_LTLIBRARIES += libelpatest@SUFFIX@.la
+ libelpatest@SUFFIX@_la_FCFLAGS = $(test_program_fcflags)
+ libelpatest@SUFFIX@_la_SOURCES = \
+   test/shared/tests_variable_definitions.F90 \
+diff -ruN elpa-new_release_2021.11.001/examples/C/Makefile_examples_hybrid elpa-new_release_2021.11.001_ok/examples/C/Makefile_examples_hybrid
+--- elpa-new_release_2021.11.001/examples/C/Makefile_examples_hybrid	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/C/Makefile_examples_hybrid	2022-01-28 09:53:19.118256000 +0100
+@@ -0,0 +1,31 @@
++# MPICH, that is IntelMPI or ParaStationMPI
++SCALAPACK_LIB = -lmkl_scalapack_lp64 -lmkl_blacs_intelmpi_lp64
++# OpenMPI
++# SCALAPACK_LIB = -lmkl_scalapack_lp64 $(MKLROOT)/lib/intel64/libmkl_blacs_openmpi_lp64.a
++LAPACK_LIB = 
++# Intel compiler
++MKL = -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread -lstdc++
++# GCC
++# MKL = -lmkl_gf_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lstdc++ -lm
++F90 = mpif90 -O3 -qopenmp -I$(ELPA_MODULES_OPENMP) -I$(ELPA_INCLUDE_OPENMP) -I$(ELPA_INCLUDE_OPENMP)/elpa -I$(ELPA_INCLUDE) -I$(ELPA_INCLUDE)/elpa
++# GCC
++# F90 = mpif90 -O3 -fopenmp -I$(ELPA_MODULES_OPENMP) -I$(ELPA_INCLUDE_OPENMP) -I$(ELPA_INCLUDE_OPENMP)/elpa -I$(ELPA_INCLUDE) -I$(ELPA_INCLUDE)/elpa
++LIBS = -L$(ELPA_LIB_OPENMP) -lelpa_openmp -lelpatest_openmp $(SCALAPACK_LIB) $(MKL)
++CC            = mpicc -O3 -qopenmp
++# GCC
++# CC            = mpicc -O3 -fopenmp
++
++all: test_real_1stage_hybrid test_real_2stage_all_kernels_hybrid test_autotune_hybrid test_multiple_objs_hybrid
++
++test_real_1stage_hybrid: test.c
++	$(CC) -DCURRENT_API_VERSION=20211125 -DTEST_GPU=0 -DTEST_REAL -DTEST_DOUBLE -DTEST_SOLVER_1STAGE -DWITH_OPENMP_TRADITIONAL -DTEST_EIGENVECTORS -DWITH_MPI -I$(ELPA_INCLUDE_OPENMP) -I$(ELPA_INCLUDE_OPENMP)/elpa -I$(ELPA_INCLUDE) -I$(ELPA_INCLUDE)/elpa -I. -o $@ test.c $(LIBS)
++
++test_real_2stage_all_kernels_hybrid: test.c
++	$(CC) -DCURRENT_API_VERSION=20211125 -DTEST_GPU=0 -DTEST_REAL -DTEST_DOUBLE -DTEST_SOLVER_2STAGE -DWITH_OPENMP_TRADITIONAL -DTEST_EIGENVECTORS -DTEST_ALL_KERNELS -DWITH_MPI -I$(ELPA_INCLUDE_OPENMP) -I$(ELPA_INCLUDE_OPENMP)/elpa -I$(ELPA_INCLUDE) -I$(ELPA_INCLUDE)/elpa -I. -o $@ test.c $(LIBS)
++
++test_autotune_hybrid: test_autotune.c
++	$(CC) -DCURRENT_API_VERSION=20211125 -DTEST_GPU=0 -DTEST_REAL -DTEST_DOUBLE -DWITH_MPI -DWITH_OPENMP_TRADITIONAL -I$(ELPA_INCLUDE_OPENMP) -I$(ELPA_INCLUDE_OPENMP)/elpa -I$(ELPA_INCLUDE) -I$(ELPA_INCLUDE)/elpa -I. -o $@ test_autotune.c $(LIBS)
++
++test_multiple_objs_hybrid: test_multiple_objs.c
++	$(CC) -DCURRENT_API_VERSION=20211125 -DTEST_GPU=0 -DTEST_REAL -DTEST_DOUBLE -DWITH_MPI -DWITH_OPENMP_TRADITIONAL -I$(ELPA_INCLUDE_OPENMP) -I$(ELPA_INCLUDE_OPENMP)/elpa -I$(ELPA_INCLUDE) -I$(ELPA_INCLUDE)/elpa -I. -o $@ test_multiple_objs.c $(LIBS)
++
+diff -ruN elpa-new_release_2021.11.001/examples/C/Makefile_examples_pure elpa-new_release_2021.11.001_ok/examples/C/Makefile_examples_pure
+--- elpa-new_release_2021.11.001/examples/C/Makefile_examples_pure	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/C/Makefile_examples_pure	2022-01-28 09:53:42.223490000 +0100
+@@ -0,0 +1,27 @@
++# MPICH, that is IntelMPI or ParaStationMPI
++SCALAPACK_LIB = -lmkl_scalapack_lp64 -lmkl_blacs_intelmpi_lp64
++# OpenMPI
++# SCALAPACK_LIB = -lmkl_scalapack_lp64 $(MKLROOT)/lib/intel64/libmkl_blacs_openmpi_lp64.a
++LAPACK_LIB = 
++# Intel compiler
++MKL = -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -liomp5 -lpthread -lstdc++
++# GCC
++# MKL = -lmkl_gf_lp64 -lmkl_sequential -lmkl_core -lgomp -lpthread -lstdc++ -lm
++F90 = mpif90 -O3 -I$(ELPA_MODULES) -I$(ELPA_INCLUDE) -I$(ELPA_INCLUDE)/elpa
++LIBS = -L$(ELPA_LIB) -lelpa -lelpatest $(SCALAPACK_LIB) $(MKL)
++CC            = mpicc -O3 
++
++all: test_real_1stage test_real_2stage_all_kernels test_autotune test_multiple_objs
++
++test_real_1stage: test.c
++	$(CC) -DCURRENT_API_VERSION=20211125 -DTEST_GPU=0 -DTEST_REAL -DTEST_DOUBLE -DTEST_SOLVER_1STAGE -DTEST_EIGENVECTORS -DWITH_MPI -I$(ELPA_INCLUDE) -I$(ELPA_INCLUDE)/elpa -I. -o $@ test.c $(LIBS)
++
++test_real_2stage_all_kernels: test.c
++	$(CC) -DCURRENT_API_VERSION=20211125 -DTEST_GPU=0 -DTEST_REAL -DTEST_DOUBLE -DTEST_SOLVER_2STAGE -DTEST_EIGENVECTORS -DTEST_ALL_KERNELS -DWITH_MPI -I$(ELPA_INCLUDE) -I$(ELPA_INCLUDE)/elpa -I. -o $@ test.c $(LIBS)
++
++test_autotune: test_autotune.c
++	$(CC) -DCURRENT_API_VERSION=20211125 -DTEST_GPU=0 -DTEST_REAL -DTEST_DOUBLE -DWITH_MPI -I$(ELPA_INCLUDE) -I$(ELPA_INCLUDE)/elpa -I. -o $@ test_autotune.c $(LIBS)
++
++test_multiple_objs: test_multiple_objs.c
++	$(CC) -DCURRENT_API_VERSION=20211125 -DTEST_GPU=0 -DTEST_REAL -DTEST_DOUBLE -DWITH_MPI -I$(ELPA_INCLUDE) -I$(ELPA_INCLUDE)/elpa -I. -o $@ test_multiple_objs.c $(LIBS)
++
+diff -ruN elpa-new_release_2021.11.001/examples/C/Makefile_examples_pure_cuda elpa-new_release_2021.11.001_ok/examples/C/Makefile_examples_pure_cuda
+--- elpa-new_release_2021.11.001/examples/C/Makefile_examples_pure_cuda	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/C/Makefile_examples_pure_cuda	2022-01-28 09:53:55.785592000 +0100
+@@ -0,0 +1,27 @@
++# MPICH, that is IntelMPI or ParaStationMPI
++SCALAPACK_LIB = -lmkl_scalapack_lp64 -lmkl_blacs_intelmpi_lp64
++# OpenMPI
++# SCALAPACK_LIB = -lmkl_scalapack_lp64 $(MKLROOT)/lib/intel64/libmkl_blacs_openmpi_lp64.a
++LAPACK_LIB = 
++# Intel compiler
++MKL = -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -liomp5 -lpthread -lstdc++
++# GCC
++# MKL = -lmkl_gf_lp64 -lmkl_sequential -lmkl_core -lgomp -lpthread -lstdc++ -lm
++F90 = mpif90 -O3 -I$(ELPA_MODULES) -I$(ELPA_INCLUDE) -I$(ELPA_INCLUDE)/elpa
++LIBS = -L$(ELPA_LIB) -lelpa -lelpatest $(SCALAPACK_LIB) $(MKL) -lcublas -lcudart
++CC            = mpicc -O3 
++
++all: test_real_1stage test_real_2stage_all_kernels test_autotune test_multiple_objs
++
++test_real_1stage: test.c
++	$(CC) -DCURRENT_API_VERSION=20211125 -DTEST_NVIDIA_GPU=1 -DTEST_REAL -DTEST_DOUBLE -DTEST_SOLVER_1STAGE -DTEST_EIGENVECTORS -DWITH_MPI -I$(ELPA_INCLUDE) -I$(ELPA_INCLUDE)/elpa -I. -o $@ test.c $(LIBS)
++
++test_real_2stage_all_kernels: test.c
++	$(CC) -DCURRENT_API_VERSION=20211125 -DTEST_NVIDIA_GPU=1 -DTEST_REAL -DTEST_DOUBLE -DTEST_SOLVER_2STAGE -DTEST_EIGENVECTORS -DTEST_ALL_KERNELS -DWITH_MPI -I$(ELPA_INCLUDE) -I$(ELPA_INCLUDE)/elpa -I. -o $@ test.c $(LIBS)
++
++test_autotune: test_autotune.c
++	$(CC) -DCURRENT_API_VERSION=20211125 -DTEST_NVIDIA_GPU=1 -DTEST_REAL -DTEST_DOUBLE -DWITH_MPI -I$(ELPA_INCLUDE) -I$(ELPA_INCLUDE)/elpa -I. -o $@ test_autotune.c $(LIBS)
++
++test_multiple_objs: test_multiple_objs.c
++	$(CC) -DCURRENT_API_VERSION=20211125 -DTEST_NVIDIA_GPU=1 -DTEST_REAL -DTEST_DOUBLE -DWITH_MPI -I$(ELPA_INCLUDE) -I$(ELPA_INCLUDE)/elpa -I. -o $@ test_multiple_objs.c $(LIBS)
++
+diff -ruN elpa-new_release_2021.11.001/examples/C/test_autotune.c elpa-new_release_2021.11.001_ok/examples/C/test_autotune.c
+--- elpa-new_release_2021.11.001/examples/C/test_autotune.c	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/C/test_autotune.c	2022-02-01 18:20:00.273429184 +0100
+@@ -0,0 +1,342 @@
++/*   This file is part of ELPA.
++
++     The ELPA library was originally created by the ELPA consortium,
++     consisting of the following organizations:
++
++     - Max Planck Computing and Data Facility (MPCDF), formerly known as
++       Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
++     - Bergische Universität Wuppertal, Lehrstuhl für angewandte
++       Informatik,
++     - Technische Universität München, Lehrstuhl für Informatik mit
++       Schwerpunkt Wissenschaftliches Rechnen ,
++     - Fritz-Haber-Institut, Berlin, Abt. Theorie,
++     - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
++       Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
++       and
++     - IBM Deutschland GmbH
++
++
++     More information can be found here:
++     http://elpa.mpcdf.mpg.de/
++
++     ELPA is free software: you can redistribute it and/or modify
++     it under the terms of the version 3 of the license of the
++     GNU Lesser General Public License as published by the Free
++     Software Foundation.
++
++     ELPA is distributed in the hope that it will be useful,
++     but WITHOUT ANY WARRANTY; without even the implied warranty of
++     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++     GNU Lesser General Public License for more details.
++
++     You should have received a copy of the GNU Lesser General Public License
++     along with ELPA.  If not, see <http://www.gnu.org/licenses/>
++
++     ELPA reflects a substantial effort on the part of the original
++     ELPA consortium, and we ask you to respect the spirit of the
++     license that we chose: i.e., please contribute any changes you
++     may have back to the original ELPA library distribution, and keep
++     any derivatives of ELPA under the same license that we chose for
++     the original distribution, the GNU Lesser General Public License.
++*/
++
++#include "config.h"
++
++#include <string.h>
++#include <stdio.h>
++#include <stdlib.h>
++#ifdef WITH_MPI
++#include <mpi.h>
++#endif
++#include <math.h>
++
++#include <elpa/elpa.h>
++#include <assert.h>
++
++#if !(defined(TEST_REAL) ^ defined(TEST_COMPLEX))
++//#error "define exactly one of TEST_REAL or TEST_COMPLEX"
++#endif
++
++#if !(defined(TEST_SINGLE) ^ defined(TEST_DOUBLE))
++//#error "define exactly one of TEST_SINGLE or TEST_DOUBLE"
++#endif
++
++#if !(defined(TEST_SOLVER_1STAGE) ^ defined(TEST_SOLVER_2STAGE))
++//#error "define exactly one of TEST_SOLVER_1STAGE or TEST_SOLVER_2STAGE"
++#endif
++
++#ifdef TEST_SINGLE
++#  define EV_TYPE float
++#  ifdef TEST_REAL
++#    define MATRIX_TYPE float
++#  else
++#    define MATRIX_TYPE complex float
++#  endif
++#else
++#  define EV_TYPE double
++#  ifdef TEST_REAL
++#    define MATRIX_TYPE double
++#  else
++#    define MATRIX_TYPE complex double
++#  endif
++#endif
++
++#define assert_elpa_ok(x) assert(x == ELPA_OK)
++
++#ifdef HAVE_64BIT_INTEGER_SUPPORT
++#define TEST_C_INT_TYPE_PTR long int*
++#define C_INT_TYPE_PTR long int*
++#define TEST_C_INT_TYPE long int
++#define C_INT_TYPE long int
++#else
++#define TEST_C_INT_TYPE_PTR int*
++#define C_INT_TYPE_PTR int*
++#define TEST_C_INT_TYPE int
++#define C_INT_TYPE int
++#endif
++
++#include "generated.h"
++
++int main(int argc, char** argv) {
++   /* matrix dimensions */
++   C_INT_TYPE na, nev, nblk;
++
++   /* mpi */
++   C_INT_TYPE myid, nprocs;
++   C_INT_TYPE na_cols, na_rows;
++   C_INT_TYPE np_cols, np_rows;
++   C_INT_TYPE my_prow, my_pcol;
++   C_INT_TYPE mpi_comm;
++
++   /* blacs */
++   C_INT_TYPE my_blacs_ctxt, sc_desc[9], info;
++
++   /* The Matrix */
++   MATRIX_TYPE *a, *as, *z;
++   EV_TYPE *ev;
++
++   C_INT_TYPE status;
++   int error_elpa; 
++   elpa_t handle;
++
++   elpa_autotune_t autotune_handle;
++   C_INT_TYPE i, unfinished;
++
++   C_INT_TYPE value;
++#ifdef WITH_MPI
++   MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &info);
++   MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
++   MPI_Comm_rank(MPI_COMM_WORLD, &myid);
++#else
++   nprocs = 1;
++   myid = 0;
++#endif
++
++   if (argc == 4) {
++     na = atoi(argv[1]);
++     nev = atoi(argv[2]);
++     nblk = atoi(argv[3]);
++   } else {
++     na = 500;
++     nev = 250;
++     nblk = 16;
++   }
++
++   for (np_cols = (C_INT_TYPE) sqrt((double) nprocs); np_cols > 1; np_cols--) {
++     if (nprocs % np_cols == 0) {
++       break;
++     }
++   }
++
++   np_rows = nprocs/np_cols;
++
++   /* set up blacs */
++   /* convert communicators before */
++#ifdef WITH_MPI
++   mpi_comm = MPI_Comm_c2f(MPI_COMM_WORLD);
++#else
++   mpi_comm = 0;
++#endif
++   set_up_blacsgrid_f(mpi_comm, np_rows, np_cols, 'C', &my_blacs_ctxt, &my_prow, &my_pcol);
++   set_up_blacs_descriptor_f(na, nblk, my_prow, my_pcol, np_rows, np_cols, &na_rows, &na_cols, sc_desc, my_blacs_ctxt, &info);
++
++   /* allocate the matrices needed for elpa */
++   a  = calloc(na_rows*na_cols, sizeof(MATRIX_TYPE));
++   z  = calloc(na_rows*na_cols, sizeof(MATRIX_TYPE));
++   as = calloc(na_rows*na_cols, sizeof(MATRIX_TYPE));
++   ev = calloc(na, sizeof(EV_TYPE));
++
++#ifdef TEST_REAL
++#ifdef TEST_DOUBLE
++   prepare_matrix_random_real_double_f(na, myid, na_rows, na_cols, sc_desc, a, z, as);
++#else
++   prepare_matrix_random_real_single_f(na, myid, na_rows, na_cols, sc_desc, a, z, as);
++#endif
++#else
++#ifdef TEST_DOUBLE
++   prepare_matrix_random_complex_double_f(na, myid, na_rows, na_cols, sc_desc, a, z, as);
++#else
++   prepare_matrix_random_complex_single_f(na, myid, na_rows, na_cols, sc_desc, a, z, as);
++#endif
++#endif
++
++   if (elpa_init(CURRENT_API_VERSION) != ELPA_OK) {
++     fprintf(stderr, "Error: ELPA API version not supported");
++     exit(1);
++   }
++
++#if OPTIONAL_C_ERROR_ARGUMENT == 1
++   handle = elpa_allocate();
++#else
++   handle = elpa_allocate(&error_elpa);
++   assert_elpa_ok(error_elpa);
++#endif
++
++   /* Set parameters */
++   elpa_set(handle, "na", (int) na, &error_elpa);
++   assert_elpa_ok(error_elpa);
++
++   elpa_set(handle, "nev", (int) nev, &error_elpa);
++   assert_elpa_ok(error_elpa);
++
++   if (myid == 0) {
++     printf("Setting the matrix parameters na=%d, nev=%d \n",na,nev);
++   }
++   elpa_set(handle, "local_nrows", (int) na_rows, &error_elpa);
++   assert_elpa_ok(error_elpa);
++
++   elpa_set(handle, "local_ncols", (int) na_cols, &error_elpa);
++   assert_elpa_ok(error_elpa);
++
++   elpa_set(handle, "nblk", (int) nblk, &error_elpa);
++   assert_elpa_ok(error_elpa);
++
++#ifdef WITH_MPI
++   elpa_set(handle, "mpi_comm_parent", (int) (MPI_Comm_c2f(MPI_COMM_WORLD)), &error_elpa);
++   assert_elpa_ok(error_elpa);
++
++   elpa_set(handle, "process_row", (int) my_prow, &error_elpa);
++   assert_elpa_ok(error_elpa);
++
++   elpa_set(handle, "process_col", (int) my_pcol, &error_elpa);
++   assert_elpa_ok(error_elpa);
++#endif
++
++   /* Setup */
++   assert_elpa_ok(elpa_setup(handle));
++
++#if TEST_NVIDIA_GPU == 1
++   elpa_set(handle, "nvidia-gpu", 0, &error_elpa);
++   assert_elpa_ok(error_elpa);
++#endif
++
++#if TEST_INTEL_GPU == 1
++   elpa_set(handle, "intel-gpu", 0, &error_elpa);
++   assert_elpa_ok(error_elpa);
++#endif
++ 
++   autotune_handle = elpa_autotune_setup(handle, ELPA_AUTOTUNE_FAST, ELPA_AUTOTUNE_DOMAIN_REAL, &error_elpa);
++   assert_elpa_ok(error_elpa);
++   /* mimic 20 scf steps */
++
++   for (i=0; i < 20; i++) {
++
++      unfinished = elpa_autotune_step(handle, autotune_handle, &error_elpa);
++
++      if (unfinished == 0) {
++        if (myid == 0) {
++          printf("ELPA autotuning finished in the %d-th scf step \n",i);
++        }
++	break;
++      }
++      if (myid == 0) {
++	printf("The current setting of the ELPA object: \n");
++        elpa_print_settings(handle, &error_elpa);
++
++	printf("The state of the autotuning: \n");
++        elpa_autotune_print_state(handle, autotune_handle, &error_elpa);
++      }
++
++
++      /* Solve EV problem */
++      elpa_eigenvectors(handle, a, ev, z, &error_elpa);
++      assert_elpa_ok(error_elpa);
++
++      /* check the results */
++#ifdef TEST_REAL
++#ifdef TEST_DOUBLE
++      status = check_correctness_evp_numeric_residuals_real_double_f(na, nev, na_rows, na_cols, as, z, ev,
++                                sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol);
++      memcpy(a, as, na_rows*na_cols*sizeof(double));
++
++#else
++      status = check_correctness_evp_numeric_residuals_real_single_f(na, nev, na_rows, na_cols, as, z, ev,
++                                sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol);
++      memcpy(a, as, na_rows*na_cols*sizeof(float));
++#endif
++#else
++#ifdef TEST_DOUBLE
++      status = check_correctness_evp_numeric_residuals_complex_double_f(na, nev, na_rows, na_cols, as, z, ev,
++                                sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol);
++      memcpy(a, as, na_rows*na_cols*sizeof(complex double));
++#else
++      status = check_correctness_evp_numeric_residuals_complex_single_f(na, nev, na_rows, na_cols, as, z, ev,
++                                sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol);
++      memcpy(a, as, na_rows*na_cols*sizeof(complex float));
++#endif
++#endif
++
++      if (status !=0){
++        printf("The computed EVs are not correct !\n");
++	break;
++      }
++      printf("hier %d \n",myid);
++   }
++
++   if (unfinished == 1) {
++     if (myid == 0) {
++        printf("ELPA autotuning did not finished during %d scf cycles\n",i);
++
++     }	     
++
++   }
++   elpa_autotune_set_best(handle, autotune_handle, &error_elpa);
++
++   if (myid == 0) {
++     printf("The best combination found by the autotuning:\n");
++     elpa_autotune_print_best(handle, autotune_handle, &error_elpa);
++   }
++
++#if OPTIONAL_C_ERROR_ARGUMENT == 1
++   elpa_autotune_deallocate(autotune_handle);
++   elpa_deallocate(handle);
++#else
++   elpa_autotune_deallocate(autotune_handle, &error_elpa);
++   elpa_deallocate(handle, &error_elpa);
++#endif
++   elpa_uninit(&error_elpa);
++
++   if (myid == 0) {
++     printf("\n");
++     printf("2stage ELPA real solver complete\n");
++     printf("\n");
++   }
++
++   if (status ==0){
++     if (myid ==0) {
++       printf("All ok!\n");
++     }
++   }
++
++   free(a);
++   free(z);
++   free(as);
++   free(ev);
++
++#ifdef WITH_MPI
++   MPI_Finalize();
++#endif
++
++   return !!status;
++}
+diff -ruN elpa-new_release_2021.11.001/examples/C/test.c elpa-new_release_2021.11.001_ok/examples/C/test.c
+--- elpa-new_release_2021.11.001/examples/C/test.c	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/C/test.c	2022-01-28 09:45:19.528434910 +0100
+@@ -0,0 +1,359 @@
++/*   This file is part of ELPA.
++
++     The ELPA library was originally created by the ELPA consortium,
++     consisting of the following organizations:
++
++     - Max Planck Computing and Data Facility (MPCDF), formerly known as
++       Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
++     - Bergische Universität Wuppertal, Lehrstuhl für angewandte
++       Informatik,
++     - Technische Universität München, Lehrstuhl für Informatik mit
++       Schwerpunkt Wissenschaftliches Rechnen ,
++     - Fritz-Haber-Institut, Berlin, Abt. Theorie,
++     - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
++       Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
++       and
++     - IBM Deutschland GmbH
++
++
++     More information can be found here:
++     http://elpa.mpcdf.mpg.de/
++
++     ELPA is free software: you can redistribute it and/or modify
++     it under the terms of the version 3 of the license of the
++     GNU Lesser General Public License as published by the Free
++     Software Foundation.
++
++     ELPA is distributed in the hope that it will be useful,
++     but WITHOUT ANY WARRANTY; without even the implied warranty of
++     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++     GNU Lesser General Public License for more details.
++
++     You should have received a copy of the GNU Lesser General Public License
++     along with ELPA.  If not, see <http://www.gnu.org/licenses/>
++
++     ELPA reflects a substantial effort on the part of the original
++     ELPA consortium, and we ask you to respect the spirit of the
++     license that we chose: i.e., please contribute any changes you
++     may have back to the original ELPA library distribution, and keep
++     any derivatives of ELPA under the same license that we chose for
++     the original distribution, the GNU Lesser General Public License.
++*/
++
++#include "config.h"
++
++#include <stdio.h>
++#include <stdlib.h>
++#include <string.h>
++#ifdef WITH_MPI
++#include <mpi.h>
++#endif
++#include <math.h>
++
++#include <elpa/elpa.h>
++#include <assert.h>
++
++#if !(defined(TEST_REAL) ^ defined(TEST_COMPLEX))
++#error "define exactly one of TEST_REAL or TEST_COMPLEX"
++#endif
++
++#if !(defined(TEST_SINGLE) ^ defined(TEST_DOUBLE))
++#error "define exactly one of TEST_SINGLE or TEST_DOUBLE"
++#endif
++
++#if !(defined(TEST_SOLVER_1STAGE) ^ defined(TEST_SOLVER_2STAGE))
++#error "define exactly one of TEST_SOLVER_1STAGE or TEST_SOLVER_2STAGE"
++#endif
++
++#ifdef TEST_GENERALIZED_DECOMP_EIGENPROBLEM
++#define TEST_GENERALIZED_EIGENPROBLEM
++#endif
++
++#ifdef TEST_SINGLE
++#  define EV_TYPE float
++#  ifdef TEST_REAL
++#    define MATRIX_TYPE float
++#    define PREPARE_MATRIX_RANDOM prepare_matrix_random_real_single_f
++#    define PREPARE_MATRIX_RANDOM_SPD prepare_matrix_random_spd_real_single_f
++#    define CHECK_CORRECTNESS_EVP_NUMERIC_RESIDUALS check_correctness_evp_numeric_residuals_real_single_f
++#    define CHECK_CORRECTNESS_EVP_GEN_NUMERIC_RESIDUALS check_correctness_evp_gen_numeric_residuals_real_single_f
++#  else
++#    define MATRIX_TYPE complex float
++#    define PREPARE_MATRIX_RANDOM prepare_matrix_random_complex_single_f
++#    define PREPARE_MATRIX_RANDOM_SPD prepare_matrix_random_spd_complex_single_f
++#    define CHECK_CORRECTNESS_EVP_NUMERIC_RESIDUALS check_correctness_evp_numeric_residuals_complex_single_f
++#    define CHECK_CORRECTNESS_EVP_GEN_NUMERIC_RESIDUALS check_correctness_evp_gen_numeric_residuals_complex_single_f
++#  endif
++#else
++#  define EV_TYPE double
++#  ifdef TEST_REAL
++#    define MATRIX_TYPE double
++#    define PREPARE_MATRIX_RANDOM prepare_matrix_random_real_double_f
++#    define PREPARE_MATRIX_RANDOM_SPD prepare_matrix_random_spd_real_double_f
++#    define CHECK_CORRECTNESS_EVP_NUMERIC_RESIDUALS check_correctness_evp_numeric_residuals_real_double_f
++#    define CHECK_CORRECTNESS_EVP_GEN_NUMERIC_RESIDUALS check_correctness_evp_gen_numeric_residuals_real_double_f
++#  else
++#    define MATRIX_TYPE complex double
++#    define PREPARE_MATRIX_RANDOM prepare_matrix_random_complex_double_f
++#    define PREPARE_MATRIX_RANDOM_SPD prepare_matrix_random_spd_complex_double_f
++#    define CHECK_CORRECTNESS_EVP_NUMERIC_RESIDUALS check_correctness_evp_numeric_residuals_complex_double_f
++#    define CHECK_CORRECTNESS_EVP_GEN_NUMERIC_RESIDUALS check_correctness_evp_gen_numeric_residuals_complex_double_f
++#  endif
++#endif
++
++#define assert_elpa_ok(x) assert(x == ELPA_OK)
++
++
++#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT
++#define TEST_C_INT_TYPE_PTR long int*
++#define C_INT_TYPE_PTR long int*
++#define TEST_C_INT_TYPE long int
++#define C_INT_TYPE long int
++#else
++#define TEST_C_INT_TYPE_PTR int*
++#define C_INT_TYPE_PTR int*
++#define TEST_C_INT_TYPE int
++#define C_INT_TYPE int
++#endif
++
++#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT
++#define TEST_C_INT_MPI_TYPE_PTR long int*
++#define C_INT_MPI_TYPE_PTR long int*
++#define TEST_C_INT_MPI_TYPE long int
++#define C_INT_MPI_TYPE long int
++#else
++#define TEST_C_INT_MPI_TYPE_PTR int*
++#define C_INT_MPI_TYPE_PTR int*
++#define TEST_C_INT_MPI_TYPE int
++#define C_INT_MPI_TYPE int
++#endif
++
++#define TEST_GPU  0
++#if (TEST_NVIDIA_GPU == 1) || (TEST_AMD_GPU == 1) || (TEST_INTEL_GPU == 1)
++#undef TEST_GPU
++#define TEST_GPU  1
++#endif
++
++
++#include "generated.h"
++
++int main(int argc, char** argv) {
++   /* matrix dimensions */
++   C_INT_TYPE na, nev, nblk;
++
++   /* mpi */
++   C_INT_TYPE myid, nprocs;
++   C_INT_MPI_TYPE myidMPI, nprocsMPI;
++   C_INT_TYPE na_cols, na_rows;
++   C_INT_TYPE np_cols, np_rows;
++   C_INT_TYPE my_prow, my_pcol;
++   C_INT_TYPE mpi_comm;
++   C_INT_MPI_TYPE provided_mpi_thread_level;
++
++   /* blacs */
++   C_INT_TYPE my_blacs_ctxt, sc_desc[9], info;
++
++   /* The Matrix */
++   MATRIX_TYPE *a, *as, *z, *b, *bs;
++   EV_TYPE *ev;
++
++   C_INT_TYPE error, status;
++   int error_elpa;
++
++   elpa_t handle;
++
++   int  value;
++#ifdef WITH_MPI
++#ifndef WITH_OPENMP_TRADITIONAL
++   MPI_Init(&argc, &argv);
++#else
++   MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &provided_mpi_thread_level);
++
++   if (provided_mpi_thread_level != MPI_THREAD_SERIALIZED) {
++     fprintf(stderr, "MPI ERROR: MPI_THREAD_SERIALIZED is not provided on this system\n");
++     MPI_Finalize();
++     exit(77);
++   }
++#endif
++
++   MPI_Comm_size(MPI_COMM_WORLD, &nprocsMPI);
++   nprocs = (C_INT_TYPE) nprocsMPI;
++   MPI_Comm_rank(MPI_COMM_WORLD, &myidMPI);
++   myid = (C_INT_TYPE) myidMPI;
++
++#else
++   nprocs = 1;
++   myid = 0;
++#endif
++
++   if (argc == 4) {
++     na = atoi(argv[1]);
++     nev = atoi(argv[2]);
++     nblk = atoi(argv[3]);
++   } else {
++     na = 500;
++     nev = 250;
++     nblk = 16;
++   }
++
++   for (np_cols = (C_INT_TYPE) sqrt((double) nprocs); np_cols > 1; np_cols--) {
++     if (nprocs % np_cols == 0) {
++       break;
++     }
++   }
++
++   np_rows = nprocs/np_cols;
++
++   /* set up blacs */
++   /* convert communicators before */
++#ifdef WITH_MPI
++   mpi_comm = MPI_Comm_c2f(MPI_COMM_WORLD);
++#else
++   mpi_comm = 0;
++#endif
++   set_up_blacsgrid_f(mpi_comm, np_rows, np_cols, 'C', &my_blacs_ctxt, &my_prow, &my_pcol);
++   set_up_blacs_descriptor_f(na, nblk, my_prow, my_pcol, np_rows, np_cols, &na_rows, &na_cols, sc_desc, my_blacs_ctxt, &info);
++
++   /* allocate the matrices needed for elpa */
++   a  = calloc(na_rows*na_cols, sizeof(MATRIX_TYPE));
++   z  = calloc(na_rows*na_cols, sizeof(MATRIX_TYPE));
++   as = calloc(na_rows*na_cols, sizeof(MATRIX_TYPE));
++   ev = calloc(na, sizeof(EV_TYPE));
++
++   PREPARE_MATRIX_RANDOM(na, myid, na_rows, na_cols, sc_desc, a, z, as);
++
++#if defined(TEST_GENERALIZED_EIGENPROBLEM)
++   b  = calloc(na_rows*na_cols, sizeof(MATRIX_TYPE));
++   bs = calloc(na_rows*na_cols, sizeof(MATRIX_TYPE));
++   PREPARE_MATRIX_RANDOM_SPD(na, myid, na_rows, na_cols, sc_desc, b, z, bs, nblk, np_rows, np_cols, my_prow, my_pcol);
++#endif
++
++   if (elpa_init(CURRENT_API_VERSION) != ELPA_OK) {
++     fprintf(stderr, "Error: ELPA API version not supported");
++     exit(1);
++   }
++
++   handle = elpa_allocate(&error_elpa);
++   //assert_elpa_ok(error_elpa);
++
++   /* Set parameters */
++   elpa_set(handle, "na", (int) na, &error_elpa);
++   assert_elpa_ok(error_elpa);
++
++   elpa_set(handle, "nev", (int) nev, &error_elpa);
++   assert_elpa_ok(error_elpa);
++
++   if (myid == 0) {
++     printf("Setting the matrix parameters na=%d, nev=%d \n",na,nev);
++   }
++   elpa_set(handle, "local_nrows", (int) na_rows, &error_elpa);
++   assert_elpa_ok(error_elpa);
++
++   elpa_set(handle, "local_ncols", (int) na_cols, &error_elpa);
++   assert_elpa_ok(error_elpa);
++
++   elpa_set(handle, "nblk", (int) nblk, &error_elpa);
++   assert_elpa_ok(error_elpa);
++
++#ifdef WITH_MPI
++   elpa_set(handle, "mpi_comm_parent", (int) (MPI_Comm_c2f(MPI_COMM_WORLD)), &error_elpa);
++   assert_elpa_ok(error_elpa);
++
++   elpa_set(handle, "process_row", (int) my_prow, &error_elpa);
++   assert_elpa_ok(error_elpa);
++
++   elpa_set(handle, "process_col", (int) my_pcol, &error_elpa);
++   assert_elpa_ok(error_elpa);
++#endif
++#ifdef TEST_GENERALIZED_EIGENPROBLEM
++   elpa_set(handle, "blacs_context", (int) my_blacs_ctxt, &error_elpa);
++   assert_elpa_ok(error_elpa);
++#endif
++
++   /* Setup */
++   assert_elpa_ok(elpa_setup(handle));
++
++   /* Set tunables */
++#ifdef TEST_SOLVER_1STAGE
++   elpa_set(handle, "solver", ELPA_SOLVER_1STAGE, &error_elpa);
++#else
++   elpa_set(handle, "solver", ELPA_SOLVER_2STAGE, &error_elpa);
++#endif
++   assert_elpa_ok(error_elpa);
++
++#if TEST_NVIDIA_GPU == 1
++   elpa_set(handle, "nvidia-gpu", TEST_GPU, &error_elpa);
++   assert_elpa_ok(error_elpa);
++#endif
++
++#if TEST_AMD_GPU == 1
++   elpa_set(handle, "amd-gpu", TEST_GPU, &error_elpa);
++   assert_elpa_ok(error_elpa);
++#endif
++
++#if TEST_INTEL_GPU == 1
++   elpa_set(handle, "intel-gpu", TEST_GPU, &error_elpa);
++   assert_elpa_ok(error_elpa);
++#endif
++
++#if defined(TEST_SOLVER_2STAGE) && defined(TEST_KERNEL)
++# ifdef TEST_COMPLEX
++   elpa_set(handle, "complex_kernel", TEST_KERNEL, &error_elpa);
++# else
++   elpa_set(handle, "real_kernel", TEST_KERNEL, &error_elpa);
++# endif
++   assert_elpa_ok(error_elpa);
++#endif
++
++   elpa_get(handle, "solver", &value, &error_elpa);
++   if (myid == 0) {
++     printf("Solver is set to %d \n", value);
++   }
++
++#if defined(TEST_GENERALIZED_EIGENPROBLEM)
++     elpa_generalized_eigenvectors(handle, a, b, ev, z, 0, &error_elpa);
++#if defined(TEST_GENERALIZED_DECOMP_EIGENPROBLEM)
++     //a = as, so that the problem can be solved again
++     memcpy(a, as, na_rows * na_cols * sizeof(MATRIX_TYPE));
++     elpa_generalized_eigenvectors(handle, a, b, ev, z, 1, &error_elpa);
++#endif
++#else
++   /* Solve EV problem */
++   elpa_eigenvectors(handle, a, ev, z, &error_elpa);
++#endif
++   assert_elpa_ok(error_elpa);
++
++   elpa_deallocate(handle, &error_elpa);
++   elpa_uninit(&error_elpa);
++
++   /* check the results */
++#if defined(TEST_GENERALIZED_EIGENPROBLEM)
++   status = CHECK_CORRECTNESS_EVP_GEN_NUMERIC_RESIDUALS(na, nev, na_rows, na_cols, as, z, ev,
++                                sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol, bs);
++#else
++   status = CHECK_CORRECTNESS_EVP_NUMERIC_RESIDUALS(na, nev, na_rows, na_cols, as, z, ev,
++                                sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol);
++#endif
++
++   if (status !=0){
++     printf("The computed EVs are not correct !\n");
++   }
++   if (status ==0){
++     printf("All ok!\n");
++   }
++
++   free(a);
++   free(z);
++   free(as);
++   free(ev);
++#if defined(TEST_GENERALIZED_EIGENPROBLEM)
++   free(b);
++   free(bs);
++#endif
++
++#ifdef WITH_MPI
++   MPI_Finalize();
++#endif
++
++   return !!status;
++}
+diff -ruN elpa-new_release_2021.11.001/examples/C/test_multiple_objs.c elpa-new_release_2021.11.001_ok/examples/C/test_multiple_objs.c
+--- elpa-new_release_2021.11.001/examples/C/test_multiple_objs.c	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/C/test_multiple_objs.c	2022-02-01 18:20:51.698668546 +0100
+@@ -0,0 +1,401 @@
++/*   This file is part of ELPA.
++
++     The ELPA library was originally created by the ELPA consortium,
++     consisting of the following organizations:
++
++     - Max Planck Computing and Data Facility (MPCDF), formerly known as
++       Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
++     - Bergische Universität Wuppertal, Lehrstuhl für angewandte
++       Informatik,
++     - Technische Universität München, Lehrstuhl für Informatik mit
++       Schwerpunkt Wissenschaftliches Rechnen,
++     - Fritz-Haber-Institut, Berlin, Abt. Theorie,
++     - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
++       Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
++       and
++     - IBM Deutschland GmbH
++
++
++     More information can be found here:
++     http://elpa.mpcdf.mpg.de/
++
++     ELPA is free software: you can redistribute it and/or modify
++     it under the terms of the version 3 of the license of the
++     GNU Lesser General Public License as published by the Free
++     Software Foundation.
++
++     ELPA is distributed in the hope that it will be useful,
++     but WITHOUT ANY WARRANTY; without even the implied warranty of
++     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++     GNU Lesser General Public License for more details.
++
++     You should have received a copy of the GNU Lesser General Public License
++     along with ELPA.  If not, see <http://www.gnu.org/licenses/>
++
++     ELPA reflects a substantial effort on the part of the original
++     ELPA consortium, and we ask you to respect the spirit of the
++     license that we chose: i.e., please contribute any changes you
++     may have back to the original ELPA library distribution, and keep
++     any derivatives of ELPA under the same license that we chose for
++     the original distribution, the GNU Lesser General Public License.
++*/
++
++#include "config.h"
++
++#include <string.h>
++#include <stdio.h>
++#include <stdlib.h>
++#ifdef WITH_MPI
++#include <mpi.h>
++#endif
++#include <math.h>
++
++#include <elpa/elpa.h>
++#include <assert.h>
++
++#if !(defined(TEST_REAL) ^ defined(TEST_COMPLEX))
++//#error "define exactly one of TEST_REAL or TEST_COMPLEX"
++#endif
++
++#if !(defined(TEST_SINGLE) ^ defined(TEST_DOUBLE))
++//#error "define exactly one of TEST_SINGLE or TEST_DOUBLE"
++#endif
++
++#if !(defined(TEST_SOLVER_1STAGE) ^ defined(TEST_SOLVER_2STAGE))
++//#error "define exactly one of TEST_SOLVER_1STAGE or TEST_SOLVER_2STAGE"
++#endif
++
++#ifdef TEST_SINGLE
++#  define EV_TYPE float
++#  ifdef TEST_REAL
++#    define MATRIX_TYPE float
++#  else
++#    define MATRIX_TYPE complex float
++#  endif
++#else
++#  define EV_TYPE double
++#  ifdef TEST_REAL
++#    define MATRIX_TYPE double
++#  else
++#    define MATRIX_TYPE complex double
++#  endif
++#endif
++
++#define assert_elpa_ok(x) assert(x == ELPA_OK)
++#ifdef HAVE_64BIT_INTEGER_SUPPORT
++#define TEST_C_INT_TYPE_PTR long int*
++#define C_INT_TYPE_PTR long int*
++#define TEST_C_INT_TYPE long int
++#define C_INT_TYPE long int
++#else
++#define TEST_C_INT_TYPE_PTR int*
++#define C_INT_TYPE_PTR int*
++#define TEST_C_INT_TYPE int
++#define C_INT_TYPE int
++#endif
++
++#include "generated.h"
++void set_basic_parameters(elpa_t *handle, C_INT_TYPE na, C_INT_TYPE nev, C_INT_TYPE na_rows, C_INT_TYPE na_cols, C_INT_TYPE nblk, C_INT_TYPE my_prow, C_INT_TYPE my_pcol){
++   int error_elpa;
++   elpa_set(*handle, "na", (int) na, &error_elpa);
++   assert_elpa_ok(error_elpa);
++
++   elpa_set(*handle, "nev", (int) nev, &error_elpa);
++   assert_elpa_ok(error_elpa);
++
++   elpa_set(*handle, "local_nrows", (int) na_rows, &error_elpa);
++   assert_elpa_ok(error_elpa);
++
++   elpa_set(*handle, "local_ncols", (int) na_cols, &error_elpa);
++   assert_elpa_ok(error_elpa);
++
++   elpa_set(*handle, "nblk", (int) nblk, &error_elpa);
++   assert_elpa_ok(error_elpa);
++
++#ifdef WITH_MPI
++   elpa_set(*handle, "mpi_comm_parent", (int) (MPI_Comm_c2f(MPI_COMM_WORLD)), &error_elpa);
++   assert_elpa_ok(error_elpa);
++
++   elpa_set(*handle, "process_row", (int) my_prow, &error_elpa);
++   assert_elpa_ok(error_elpa);
++
++   elpa_set(*handle, "process_col", (int) my_pcol, &error_elpa);
++   assert_elpa_ok(error_elpa);
++#endif
++}
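++
++/* Minimal usage sketch (illustrative only): the helper configures any
++   handle before elpa_setup(), e.g.
++
++     elpa_t h = elpa_allocate(&err);
++     set_basic_parameters(&h, na, nev, na_rows, na_cols, nblk, my_prow, my_pcol);
++     assert_elpa_ok(elpa_setup(h));
++
++   All handles are attached to MPI_COMM_WORLD via MPI_Comm_c2f above. */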
++
++
++int main(int argc, char** argv) {
++   /* matrix dimensions */
++   C_INT_TYPE na, nev, nblk;
++
++   /* mpi */
++   C_INT_TYPE myid, nprocs;
++   C_INT_TYPE na_cols, na_rows;
++   C_INT_TYPE np_cols, np_rows;
++   C_INT_TYPE my_prow, my_pcol;
++   C_INT_TYPE mpi_comm;
++
++   /* blacs */
++   C_INT_TYPE my_blacs_ctxt, sc_desc[9], info;
++
++   /* The Matrix */
++   MATRIX_TYPE *a, *as, *z;
++   EV_TYPE *ev;
++
++   C_INT_TYPE status;
++   int error_elpa;
++   int gpu = 0, timings, debug;  /* gpu stays 0 unless a GPU test macro is enabled below */
++   char str[400];
++
++   elpa_t elpa_handle_1, elpa_handle_2, *elpa_handle_ptr;
++
++   elpa_autotune_t autotune_handle;
++   C_INT_TYPE i, unfinished;
++
++   C_INT_TYPE value;
++#ifdef WITH_MPI
++   MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &info);
++   MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
++   MPI_Comm_rank(MPI_COMM_WORLD, &myid);
++#else
++   nprocs = 1;
++   myid = 0;
++#endif
++
++   if (argc == 4) {
++     na = atoi(argv[1]);
++     nev = atoi(argv[2]);
++     nblk = atoi(argv[3]);
++   } else {
++     na = 500;
++     nev = 250;
++     nblk = 16;
++   }
++
++   for (np_cols = (C_INT_TYPE) sqrt((double) nprocs); np_cols > 1; np_cols--) {
++     if (nprocs % np_cols == 0) {
++       break;
++     }
++   }
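++   /* np_cols is now the largest divisor of nprocs that is <= sqrt(nprocs)
++      (or 1), giving the most square-like process grid possible. */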
++
++   np_rows = nprocs/np_cols;
++
++   /* set up blacs */
++   /* convert communicators before */
++#ifdef WITH_MPI
++   mpi_comm = MPI_Comm_c2f(MPI_COMM_WORLD);
++#else
++   mpi_comm = 0;
++#endif
++   set_up_blacsgrid_f(mpi_comm, np_rows, np_cols, 'C', &my_blacs_ctxt, &my_prow, &my_pcol);
++   set_up_blacs_descriptor_f(na, nblk, my_prow, my_pcol, np_rows, np_cols, &na_rows, &na_cols, sc_desc, my_blacs_ctxt, &info);
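++   /* na_rows/na_cols are the local dimensions of the block-cyclically
++      distributed matrix on this process; sc_desc is the corresponding
++      ScaLAPACK descriptor. */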
++
++   /* allocate the matrices needed for elpa */
++   a  = calloc(na_rows*na_cols, sizeof(MATRIX_TYPE));
++   z  = calloc(na_rows*na_cols, sizeof(MATRIX_TYPE));
++   as = calloc(na_rows*na_cols, sizeof(MATRIX_TYPE));
++   ev = calloc(na, sizeof(EV_TYPE));
++
++#ifdef TEST_REAL
++#ifdef TEST_DOUBLE
++   prepare_matrix_random_real_double_f(na, myid, na_rows, na_cols, sc_desc, a, z, as);
++#else
++   prepare_matrix_random_real_single_f(na, myid, na_rows, na_cols, sc_desc, a, z, as);
++#endif
++#else
++#ifdef TEST_DOUBLE
++   prepare_matrix_random_complex_double_f(na, myid, na_rows, na_cols, sc_desc, a, z, as);
++#else
++   prepare_matrix_random_complex_single_f(na, myid, na_rows, na_cols, sc_desc, a, z, as);
++#endif
++#endif
++
++   if (elpa_init(CURRENT_API_VERSION) != ELPA_OK) {
++     fprintf(stderr, "Error: ELPA API version not supported\n");
++     exit(1);
++   }
++
++   elpa_handle_1 = elpa_allocate(&error_elpa);
++   assert_elpa_ok(error_elpa);
++
++   set_basic_parameters(&elpa_handle_1, na, nev, na_rows, na_cols, nblk, my_prow, my_pcol);
++   /* Setup */
++   assert_elpa_ok(elpa_setup(elpa_handle_1));
++
++#if TEST_NVIDIA_GPU == 1
++   elpa_set(elpa_handle_1, "nvidia-gpu", 0, &error_elpa);
++   assert_elpa_ok(error_elpa);
++#endif
++
++#if TEST_INTEL_GPU == 1
++   elpa_set(elpa_handle_1, "intel-gpu", 0, &error_elpa);
++   assert_elpa_ok(error_elpa);
++#endif
++
++   elpa_set(elpa_handle_1, "timings", 1, &error_elpa);
++   assert_elpa_ok(error_elpa);
++
++   elpa_set(elpa_handle_1, "debug", 1, &error_elpa);
++   assert_elpa_ok(error_elpa);
++
++   elpa_store_settings(elpa_handle_1, "initial_parameters.txt", &error_elpa);
++   assert_elpa_ok(error_elpa);
++
++#ifdef WITH_MPI
++   /* barrier after store settings: the file is created by one MPI rank
++      only, but loaded everywhere */
++   MPI_Barrier(MPI_COMM_WORLD);
++#endif
++
++#if OPTIONAL_C_ERROR_ARGUMENT == 1
++   elpa_handle_2 = elpa_allocate();
++#else
++   elpa_handle_2 = elpa_allocate(&error_elpa);
++   assert_elpa_ok(error_elpa);
++#endif
++
++   set_basic_parameters(&elpa_handle_2, na, nev, na_rows, na_cols, nblk, my_prow, my_pcol);
++   /* Setup */
++   assert_elpa_ok(elpa_setup(elpa_handle_2));
++
++   elpa_load_settings(elpa_handle_2, "initial_parameters.txt", &error_elpa);
++
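++   /* Verify the store/load round trip: handle 2 must now report the same
++      timings/debug/gpu values that were stored from handle 1. */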
++#if TEST_NVIDIA_GPU == 1
++   elpa_get(elpa_handle_2, "nvidia-gpu", &gpu, &error_elpa);
++   assert_elpa_ok(error_elpa);
++#endif
++
++#if TEST_INTEL_GPU == 1
++   elpa_get(elpa_handle_2, "intel-gpu", &gpu, &error_elpa);
++   assert_elpa_ok(error_elpa);
++#endif
++
++   elpa_get(elpa_handle_2, "timings", &timings, &error_elpa);
++   assert_elpa_ok(error_elpa);
++
++   elpa_get(elpa_handle_2, "debug", &debug, &error_elpa);
++   assert_elpa_ok(error_elpa);
++
++   if ((timings != 1) || (debug != 1) || (gpu != 0)){
++     printf("Parameters not stored or loaded correctly. Aborting... %d, %d, %d\n", timings, debug, gpu);
++     exit(1);
++   }
++
++   elpa_handle_ptr = &elpa_handle_2;
++
++   autotune_handle = elpa_autotune_setup(*elpa_handle_ptr, ELPA_AUTOTUNE_FAST, ELPA_AUTOTUNE_DOMAIN_REAL, &error_elpa);
++   assert_elpa_ok(error_elpa);
++   /* mimic 20 scf steps */
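++   /* Autotuning life cycle (sketch): elpa_autotune_setup() creates the
++      search object; each elpa_autotune_step() selects the next parameter
++      combination, which is then timed by the following solve; once it
++      returns 0 the search is exhausted and elpa_autotune_set_best() fixes
++      the fastest combination on the handle. */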
++
++   for (i=0; i < 20; i++) {
++
++      unfinished = elpa_autotune_step(*elpa_handle_ptr, autotune_handle, &error_elpa);
++
++      if (unfinished == 0) {
++        if (myid == 0) {
++          printf("ELPA autotuning finished in the %d th scf step \n",i);
++        }
++        break;
++      }
++
++      elpa_print_settings(*elpa_handle_ptr, &error_elpa);
++      elpa_autotune_print_state(*elpa_handle_ptr, autotune_handle, &error_elpa);
++
++      sprintf(str, "saved_parameters_%d.txt", i);
++      elpa_store_settings(*elpa_handle_ptr, str, &error_elpa);
++      assert_elpa_ok(error_elpa);
++
++      /* Solve EV problem */
++      elpa_eigenvectors(*elpa_handle_ptr, a, ev, z, &error_elpa);
++      assert_elpa_ok(error_elpa);
++
++      /* check the results */
++#ifdef TEST_REAL
++#ifdef TEST_DOUBLE
++      status = check_correctness_evp_numeric_residuals_real_double_f(na, nev, na_rows, na_cols, as, z, ev,
++                                sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol);
++      memcpy(a, as, na_rows*na_cols*sizeof(double));
++
++#else
++      status = check_correctness_evp_numeric_residuals_real_single_f(na, nev, na_rows, na_cols, as, z, ev,
++                                sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol);
++      memcpy(a, as, na_rows*na_cols*sizeof(float));
++#endif
++#else
++#ifdef TEST_DOUBLE
++      status = check_correctness_evp_numeric_residuals_complex_double_f(na, nev, na_rows, na_cols, as, z, ev,
++                                sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol);
++      memcpy(a, as, na_rows*na_cols*sizeof(complex double));
++#else
++      status = check_correctness_evp_numeric_residuals_complex_single_f(na, nev, na_rows, na_cols, as, z, ev,
++                                sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol);
++      memcpy(a, as, na_rows*na_cols*sizeof(complex float));
++#endif
++#endif
++
++      if (status != 0) {
++        printf("The computed EVs are not correct!\n");
++        break;
++      }
++
++      elpa_autotune_print_state(*elpa_handle_ptr, autotune_handle, &error_elpa);
++      assert_elpa_ok(error_elpa);
++
++      sprintf(str, "saved_state_%d.txt", i);
++      elpa_autotune_save_state(*elpa_handle_ptr, autotune_handle, str, &error_elpa);
++      assert_elpa_ok(error_elpa);
++
++#ifdef WITH_MPI
++      /* barrier after save state: the file is created by one MPI rank
++         only, but loaded everywhere */
++      MPI_Barrier(MPI_COMM_WORLD);
++#endif
++
++      elpa_autotune_load_state(*elpa_handle_ptr, autotune_handle, str, &error_elpa);
++      assert_elpa_ok(error_elpa);
++
++      if (unfinished == 1) {
++        if (myid == 0) {
++          printf("ELPA autotuning did not finish during %d scf cycles\n", i);
++        }
++      }
++
++   }
++   elpa_autotune_set_best(*elpa_handle_ptr, autotune_handle, &error_elpa);
++
++   if (myid == 0) {
++     printf("The best combination found by the autotuning:\n");
++     elpa_autotune_print_best(*elpa_handle_ptr, autotune_handle, &error_elpa);
++   }
++
++   elpa_autotune_deallocate(autotune_handle, &error_elpa);
++   elpa_deallocate(elpa_handle_1, &error_elpa);
++#if OPTIONAL_C_ERROR_ARGUMENT == 1
++   elpa_deallocate(elpa_handle_2);
++#else
++   elpa_deallocate(elpa_handle_2, &error_elpa);
++#endif
++   elpa_uninit(&error_elpa);
++
++   if (myid == 0) {
++     printf("\n");
++     printf("2stage ELPA real solver complete\n");
++     printf("\n");
++   }
++
++   if (status == 0) {
++     if (myid == 0) {
++       printf("All ok!\n");
++     }
++   }
++
++   free(a);
++   free(z);
++   free(as);
++   free(ev);
++
++#ifdef WITH_MPI
++   MPI_Finalize();
++#endif
++
++   return !!status;
++}
+diff -ruN elpa-new_release_2021.11.001/examples/Fortran/assert.h elpa-new_release_2021.11.001_ok/examples/Fortran/assert.h
+--- elpa-new_release_2021.11.001/examples/Fortran/assert.h	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/Fortran/assert.h	2022-01-26 10:05:16.438246000 +0100
+@@ -0,0 +1,7 @@
++#define stringify_(x) "x"
++#define stringify(x) stringify_(x)
++#define assert(x) call x_a(x, stringify(x), __FILE__, __LINE__)
++
++#define assert_elpa_ok(error_code) call x_ao(error_code, stringify(error_code), __FILE__, __LINE__)
++
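++! Note: stringify works because Fortran sources are run through the
++! traditional C preprocessor, which substitutes macro arguments inside
++! string literals, so stringify_(x) = "x" expands to the argument's text.
++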
++! vim: syntax=fortran
+diff -ruN elpa-new_release_2021.11.001/examples/Fortran/elpa2/complex_2stage_banded.F90 elpa-new_release_2021.11.001_ok/examples/Fortran/elpa2/complex_2stage_banded.F90
+--- elpa-new_release_2021.11.001/examples/Fortran/elpa2/complex_2stage_banded.F90	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/Fortran/elpa2/complex_2stage_banded.F90	2022-01-26 10:09:19.163552000 +0100
+@@ -0,0 +1,300 @@
++!    This file is part of ELPA.
++!
++!    The ELPA library was originally created by the ELPA consortium,
++!    consisting of the following organizations:
++!
++!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
++!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
++!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
++!      Informatik,
++!    - Technische Universität München, Lehrstuhl für Informatik mit
++!      Schwerpunkt Wissenschaftliches Rechnen,
++!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
++!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
++!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
++!      and
++!    - IBM Deutschland GmbH
++!
++!
++!    More information can be found here:
++!    http://elpa.mpcdf.mpg.de/
++!
++!    ELPA is free software: you can redistribute it and/or modify
++!    it under the terms of the version 3 of the license of the
++!    GNU Lesser General Public License as published by the Free
++!    Software Foundation.
++!
++!    ELPA is distributed in the hope that it will be useful,
++!    but WITHOUT ANY WARRANTY; without even the implied warranty of
++!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++!    GNU Lesser General Public License for more details.
++!
++!    You should have received a copy of the GNU Lesser General Public License
++!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
++!
++!    ELPA reflects a substantial effort on the part of the original
++!    ELPA consortium, and we ask you to respect the spirit of the
++!    license that we chose: i.e., please contribute any changes you
++!    may have back to the original ELPA library distribution, and keep
++!    any derivatives of ELPA under the same license that we chose for
++!    the original distribution, the GNU Lesser General Public License.
++!
++!
++#include "config-f90.h"
++#include "../assert.h"
++!>
++!> Fortran test program demonstrating the use of the
++!> ELPA 2 complex case library.
++!> If "HAVE_REDIRECT" was defined at build time,
++!> the stdout and stderr output of each MPI task
++!> can be redirected to files if the environment
++!> variable "REDIRECT_ELPA_TEST_OUTPUT" is set
++!> to "true".
++!>
++!> By calling executable [arg1] [arg2] [arg3] [arg4]
++!> one can define the matrix size (arg1), the number of
++!> eigenvectors to compute (arg2), and the blocking (arg3).
++!> If these values are not set, the default values (500, 150, 16)
++!> are chosen.
++!> If these values are set, the 4th argument can be
++!> "output", which specifies that the EVs are written to
++!> an ASCII file.
++!>
++!> The complex ELPA 2 kernel is set as the default kernel.
++!> However, this can be overridden by setting
++!> the environment variable "COMPLEX_ELPA_KERNEL" to an
++!> appropriate value.
++!>
++
++#include "config-f90.h"
++
++#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT
++#define TEST_INT_TYPE integer(kind=c_int64_t)
++#define INT_TYPE c_int64_t
++#else
++#define TEST_INT_TYPE integer(kind=c_int32_t)
++#define INT_TYPE c_int32_t
++#endif
++#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT
++#define TEST_INT_MPI_TYPE integer(kind=c_int64_t)
++#define INT_MPI_TYPE c_int64_t
++#else
++#define TEST_INT_MPI_TYPE integer(kind=c_int32_t)
++#define INT_MPI_TYPE c_int32_t
++#endif
++
++program test_complex2_double_banded
++
++!-------------------------------------------------------------------------------
++! Standard eigenvalue problem - COMPLEX version
++!
++! This program demonstrates the use of the ELPA module
++! together with standard scalapack routines
++!
++! Copyright of the original code rests with the authors inside the ELPA
++! consortium. The copyright of any additional modifications shall rest
++! with their original authors, but shall adhere to the licensing terms
++! distributed along with the original code in the file "COPYING".
++!-------------------------------------------------------------------------------
++   use elpa
++
++   !use test_util
++   use test_read_input_parameters
++   use test_check_correctness
++   use test_setup_mpi
++   use test_blacs_infrastructure
++   use test_prepare_matrix
++#ifdef HAVE_REDIRECT
++   use test_redirect
++#endif
++   use test_output_type
++   implicit none
++
++   !-------------------------------------------------------------------------------
++   ! Please set system size parameters below!
++   ! na:   System size
++   ! nev:  Number of eigenvectors to be calculated
++   ! nblk: Blocking factor in block cyclic distribution
++   !-------------------------------------------------------------------------------
++
++   TEST_INT_TYPE              :: nblk
++   TEST_INT_TYPE              :: na, nev
++
++   TEST_INT_TYPE              :: np_rows, np_cols, na_rows, na_cols
++
++   TEST_INT_TYPE              :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols
++   TEST_INT_TYPE              :: i, my_blacs_ctxt, sc_desc(9), info, nprow, npcol
++   TEST_INT_MPI_TYPE          :: mpierr
++#ifdef WITH_MPI
++   !TEST_INT_TYPE, external    :: numroc
++#endif
++   complex(kind=ck8), parameter   :: CZERO = (0.0_rk8,0.0_rk8), CONE = (1.0_rk8,0.0_rk8)
++   real(kind=rk8), allocatable    :: ev(:)
++
++   complex(kind=ck8), allocatable :: a(:,:), z(:,:), as(:,:)
++
++   TEST_INT_TYPE              :: STATUS
++#ifdef WITH_OPENMP_TRADITIONAL
++   TEST_INT_TYPE              :: omp_get_max_threads,  required_mpi_thread_level, provided_mpi_thread_level
++#endif
++   type(output_t)                :: write_to_file
++   integer(kind=c_int)          :: error_elpa
++   character(len=8)              :: task_suffix
++   TEST_INT_TYPE              :: j
++
++
++   TEST_INT_TYPE              :: numberOfDevices
++   TEST_INT_TYPE              :: global_row, global_col, local_row, local_col
++   TEST_INT_TYPE              :: bandwidth
++   class(elpa_t), pointer        :: e
++
++#define COMPLEXCASE
++#define DOUBLE_PRECISION_COMPLEX 1
++
++   call read_input_parameters(na, nev, nblk, write_to_file)
++
++   if (nblk .eq. 1) then
++     stop 77
++   endif
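++   ! (exit code 77 is the automake convention for a skipped test:
++   !  with nblk == 1 the banded case below would be degenerate)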
++
++   !-------------------------------------------------------------------------------
++   !  MPI Initialization
++   call setup_mpi(myid, nprocs)
++
++   STATUS = 0
++
++   !-------------------------------------------------------------------------------
++   ! Selection of number of processor rows/columns
++   ! We try to set up the grid square-like, i.e. start the search for possible
++   ! divisors of nprocs with a number next to the square root of nprocs
++   ! and decrement it until a divisor is found.
++
++   do np_cols = NINT(SQRT(REAL(nprocs))),2,-1
++      if(mod(nprocs,np_cols) == 0 ) exit
++   enddo
++   ! at the end of the above loop, nprocs is always divisible by np_cols
++
++   np_rows = nprocs/np_cols
++
++   if(myid==0) then
++      print *
++      print '(a)','Standard eigenvalue problem - COMPLEX version'
++      print *
++      print '(3(a,i0))','Matrix size=',na,', Number of eigenvectors=',nev,', Block size=',nblk
++      print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs
++      print *
++   endif
++
++   !-------------------------------------------------------------------------------
++   ! Set up BLACS context and MPI communicators
++   !
++   ! The BLACS context is only necessary for using Scalapack.
++   !
++   ! For ELPA, the MPI communicators along rows/cols are sufficient,
++   ! and the grid setup may be done in an arbitrary way as long as it is
++   ! consistent (i.e. 0<=my_prow<np_rows, 0<=my_pcol<np_cols and every
++   ! process has a unique (my_prow,my_pcol) pair).
++
++   call set_up_blacsgrid(int(mpi_comm_world,kind=BLAS_KIND), np_rows, np_cols, 'C', &
++                         my_blacs_ctxt, my_prow, my_pcol)
++
++   if (myid==0) then
++     print '(a)','| Past BLACS_Gridinfo.'
++   end if
++
++   ! Determine the necessary size of the distributed matrices,
++   ! we use the Scalapack tools routine NUMROC for that.
++
++   call set_up_blacs_descriptor(na ,nblk, my_prow, my_pcol, np_rows, np_cols, &
++                                na_rows, na_cols, sc_desc, my_blacs_ctxt, info)
++
++   if (myid==0) then
++     print '(a)','| Past scalapack descriptor setup.'
++   end if
++   !-------------------------------------------------------------------------------
++   ! Allocate matrices and set up a test matrix for the eigenvalue problem
++
++   allocate(a (na_rows,na_cols))
++   allocate(z (na_rows,na_cols))
++   allocate(as(na_rows,na_cols))
++
++   allocate(ev(na))
++
++   call prepare_matrix_random(na, myid, sc_desc, a, z, as)
++
++   ! set values outside of the bandwidth to zero
++   bandwidth = nblk
++
++   do local_row = 1, na_rows
++     global_row = index_l2g( local_row, nblk, my_prow, np_rows )
++     do local_col = 1, na_cols
++       global_col = index_l2g( local_col, nblk, my_pcol, np_cols )
++
++       if (ABS(global_row-global_col) > bandwidth) then
++         a(local_row, local_col) = 0
++         as(local_row, local_col) = 0
++       end if
++     end do
++   end do
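++   ! Example: with the defaults na=500, nblk=16 this leaves a banded matrix
++   ! of semi-bandwidth 16, i.e. entry (i,j) may be nonzero only if |i-j| <= 16.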
++
++
++   if (elpa_init(CURRENT_API_VERSION) /= ELPA_OK) then
++     print *, "ELPA API version not supported"
++     stop 1
++   endif
++
++   e => elpa_allocate(error_elpa)
++   assert_elpa_ok(error_elpa)
++
++   call e%set("na", int(na,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%set("nev", int(nev,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%set("local_nrows", int(na_rows,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%set("local_ncols", int(na_cols,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%set("nblk", int(nblk,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++#ifdef WITH_MPI
++   call e%set("mpi_comm_parent", int(MPI_COMM_WORLD,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%set("process_row", int(my_prow,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%set("process_col", int(my_pcol,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++#endif
++
++   call e%set("bandwidth", int(bandwidth,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
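++   ! Declaring the semi-bandwidth lets the 2-stage solver skip the
++   ! full-to-banded reduction stage: the input is already banded.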
++
++   assert(e%setup() .eq. ELPA_OK)
++
++   call e%set("solver", ELPA_SOLVER_2STAGE, error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%eigenvectors(a, ev, z, error_elpa)
++   assert_elpa_ok(error_elpa)
++   call elpa_deallocate(e, error_elpa)
++   assert_elpa_ok(error_elpa)
++
++   call elpa_uninit(error_elpa)
++   assert_elpa_ok(error_elpa)
++
++   !-------------------------------------------------------------------------------
++   ! Test correctness of result (using plain scalapack routines)
++   status = check_correctness_evp_numeric_residuals(na, nev, as, z, ev, sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol)
++
++   deallocate(a)
++   deallocate(as)
++
++   deallocate(z)
++   deallocate(ev)
++
++#ifdef WITH_MPI
++   call blacs_gridexit(my_blacs_ctxt)
++   call mpi_finalize(mpierr)
++#endif
++   call EXIT(STATUS)
++end
++
++!-------------------------------------------------------------------------------
+diff -ruN elpa-new_release_2021.11.001/examples/Fortran/elpa2/double_instance.F90 elpa-new_release_2021.11.001_ok/examples/Fortran/elpa2/double_instance.F90
+--- elpa-new_release_2021.11.001/examples/Fortran/elpa2/double_instance.F90	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/Fortran/elpa2/double_instance.F90	2022-01-26 10:09:19.164133000 +0100
+@@ -0,0 +1,244 @@
++!    This file is part of ELPA.
++!
++!    The ELPA library was originally created by the ELPA consortium,
++!    consisting of the following organizations:
++!
++!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
++!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
++!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
++!      Informatik,
++!    - Technische Universität München, Lehrstuhl für Informatik mit
++!      Schwerpunkt Wissenschaftliches Rechnen,
++!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
++!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
++!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
++!      and
++!    - IBM Deutschland GmbH
++!
++!
++!    More information can be found here:
++!    http://elpa.mpcdf.mpg.de/
++!
++!    ELPA is free software: you can redistribute it and/or modify
++!    it under the terms of the version 3 of the license of the
++!    GNU Lesser General Public License as published by the Free
++!    Software Foundation.
++!
++!    ELPA is distributed in the hope that it will be useful,
++!    but WITHOUT ANY WARRANTY; without even the implied warranty of
++!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++!    GNU Lesser General Public License for more details.
++!
++!    You should have received a copy of the GNU Lesser General Public License
++!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
++!
++!    ELPA reflects a substantial effort on the part of the original
++!    ELPA consortium, and we ask you to respect the spirit of the
++!    license that we chose: i.e., please contribute any changes you
++!    may have back to the original ELPA library distribution, and keep
++!    any derivatives of ELPA under the same license that we chose for
++!    the original distribution, the GNU Lesser General Public License.
++!
++!
++#include "config-f90.h"
++
++#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT
++#define TEST_INT_TYPE integer(kind=c_int64_t)
++#define INT_TYPE c_int64_t
++#else
++#define TEST_INT_TYPE integer(kind=c_int32_t)
++#define INT_TYPE c_int32_t
++#endif
++#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT
++#define TEST_INT_MPI_TYPE integer(kind=c_int64_t)
++#define INT_MPI_TYPE c_int64_t
++#else
++#define TEST_INT_MPI_TYPE integer(kind=c_int32_t)
++#define INT_MPI_TYPE c_int32_t
++#endif
++#include "../assert.h"
++
++program test_interface
++   use elpa
++
++   use precision_for_tests
++   !use test_util
++   use test_setup_mpi
++   use test_prepare_matrix
++   use test_read_input_parameters
++   use test_blacs_infrastructure
++   use test_check_correctness
++   implicit none
++
++   ! matrix dimensions
++   TEST_INT_TYPE :: na, nev, nblk
++
++   ! mpi
++   TEST_INT_TYPE :: myid, nprocs
++   TEST_INT_TYPE :: na_cols, na_rows  ! local matrix size
++   TEST_INT_TYPE :: np_cols, np_rows  ! number of MPI processes per column/row
++   TEST_INT_TYPE :: my_prow, my_pcol  ! local MPI task position (my_prow, my_pcol) in the grid (0..np_rows-1, 0..np_cols-1)
++   TEST_INT_MPI_TYPE :: mpierr
++
++   ! blacs
++   TEST_INT_TYPE :: my_blacs_ctxt, sc_desc(9), info, nprow, npcol
++
++   ! The Matrix
++   real(kind=C_DOUBLE), allocatable :: a1(:,:), as1(:,:)
++   ! eigenvectors
++   real(kind=C_DOUBLE), allocatable :: z1(:,:)
++   ! eigenvalues
++   real(kind=C_DOUBLE), allocatable :: ev1(:)
++
++   ! The Matrix
++   complex(kind=C_DOUBLE_COMPLEX), allocatable :: a2(:,:), as2(:,:)
++   ! eigenvectors
++   complex(kind=C_DOUBLE_COMPLEX), allocatable :: z2(:,:)
++   ! eigenvalues
++   real(kind=C_DOUBLE), allocatable :: ev2(:)
++   TEST_INT_TYPE :: status
++   integer(kind=c_int) :: error_elpa
++
++   TEST_INT_TYPE :: solver
++   TEST_INT_TYPE :: qr
++
++   type(output_t) :: write_to_file
++   class(elpa_t), pointer :: e1, e2
++
++   call read_input_parameters(na, nev, nblk, write_to_file)
++   call setup_mpi(myid, nprocs)
++
++   status = 0
++
++   do np_cols = NINT(SQRT(REAL(nprocs))),2,-1
++      if(mod(nprocs,np_cols) == 0 ) exit
++   enddo
++
++   np_rows = nprocs/np_cols
++
++   call set_up_blacsgrid(int(mpi_comm_world,kind=BLAS_KIND), np_rows, np_cols, 'C', &
++                         my_blacs_ctxt, my_prow, my_pcol)
++
++   call set_up_blacs_descriptor(na, nblk, my_prow, my_pcol, np_rows, np_cols, &
++                                na_rows, na_cols, sc_desc, my_blacs_ctxt, info)
++
++   allocate(a1 (na_rows,na_cols), as1(na_rows,na_cols))
++   allocate(z1 (na_rows,na_cols))
++   allocate(ev1(na))
++
++   a1(:,:) = 0.0
++   z1(:,:) = 0.0
++   ev1(:) = 0.0
++
++   call prepare_matrix_random(na, myid, sc_desc, a1, z1, as1)
++   allocate(a2 (na_rows,na_cols), as2(na_rows,na_cols))
++   allocate(z2 (na_rows,na_cols))
++   allocate(ev2(na))
++
++   a2(:,:) = 0.0
++   z2(:,:) = 0.0
++   ev2(:) = 0.0
++
++   call prepare_matrix_random(na, myid, sc_desc, a2, z2, as2)
++
++   if (elpa_init(CURRENT_API_VERSION) /= ELPA_OK) then
++     print *, "ELPA API version not supported"
++     stop 1
++   endif
++
++   e1 => elpa_allocate(error_elpa)
++   assert_elpa_ok(error_elpa)
++
++   call e1%set("na", int(na,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e1%set("nev", int(nev,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e1%set("local_nrows", int(na_rows,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e1%set("local_ncols", int(na_cols,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e1%set("nblk", int(nblk,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++#ifdef WITH_MPI
++   call e1%set("mpi_comm_parent", int(MPI_COMM_WORLD,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e1%set("process_row", int(my_prow,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e1%set("process_col", int(my_pcol,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++#endif
++
++   assert(e1%setup() .eq. ELPA_OK)
++
++   call e1%set("solver", ELPA_SOLVER_2STAGE, error_elpa)
++   assert_elpa_ok(error_elpa)
++
++   call e1%set("real_kernel", ELPA_2STAGE_REAL_DEFAULT, error_elpa)
++   assert_elpa_ok(error_elpa)
++
++
++   e2 => elpa_allocate(error_elpa)
++   assert_elpa_ok(error_elpa)
++
++   call e2%set("na", int(na,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e2%set("nev", int(nev,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e2%set("local_nrows", int(na_rows,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e2%set("local_ncols", int(na_cols,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e2%set("nblk", int(nblk,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++#ifdef WITH_MPI
++   call e2%set("mpi_comm_parent", int(MPI_COMM_WORLD,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e2%set("process_row", int(my_prow,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e2%set("process_col", int(my_pcol,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++#endif
++   assert(e2%setup() .eq. ELPA_OK)
++
++   call e2%set("solver", ELPA_SOLVER_1STAGE, error_elpa)
++   assert_elpa_ok(error_elpa)
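++   ! The two instances are configured independently: e1 runs the 2-stage
++   ! solver with the default real kernel, e2 the 1-stage solver, each on
++   ! its own matrix.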
++
++   call e1%eigenvectors(a1, ev1, z1, error_elpa)
++   assert_elpa_ok(error_elpa)
++
++   call elpa_deallocate(e1, error_elpa)
++   assert_elpa_ok(error_elpa)
++
++   call e2%eigenvectors(a2, ev2, z2, error_elpa)
++   assert_elpa_ok(error_elpa)
++
++   call elpa_deallocate(e2, error_elpa)
++   assert_elpa_ok(error_elpa)
++
++   call elpa_uninit(error_elpa)
++
++   status = check_correctness_evp_numeric_residuals(na, nev, as1, z1, ev1, sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol)
++
++   deallocate(a1)
++   deallocate(as1)
++   deallocate(z1)
++   deallocate(ev1)
++
++   status = check_correctness_evp_numeric_residuals(na, nev, as2, z2, ev2, sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol)
++
++   deallocate(a2)
++   deallocate(as2)
++   deallocate(z2)
++   deallocate(ev2)
++
++#ifdef WITH_MPI
++   call blacs_gridexit(my_blacs_ctxt)
++   call mpi_finalize(mpierr)
++#endif
++   call EXIT(STATUS)
++
++
++end program
+diff -ruN elpa-new_release_2021.11.001/examples/Fortran/elpa2/real_2stage_banded.F90 elpa-new_release_2021.11.001_ok/examples/Fortran/elpa2/real_2stage_banded.F90
+--- elpa-new_release_2021.11.001/examples/Fortran/elpa2/real_2stage_banded.F90	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/Fortran/elpa2/real_2stage_banded.F90	2022-01-26 10:09:19.164859000 +0100
+@@ -0,0 +1,298 @@
++!    This file is part of ELPA.
++!
++!    The ELPA library was originally created by the ELPA consortium,
++!    consisting of the following organizations:
++!
++!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
++!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
++!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
++!      Informatik,
++!    - Technische Universität München, Lehrstuhl für Informatik mit
++!      Schwerpunkt Wissenschaftliches Rechnen,
++!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
++!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
++!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
++!      and
++!    - IBM Deutschland GmbH
++!
++!
++!    More information can be found here:
++!    http://elpa.mpcdf.mpg.de/
++!
++!    ELPA is free software: you can redistribute it and/or modify
++!    it under the terms of the version 3 of the license of the
++!    GNU Lesser General Public License as published by the Free
++!    Software Foundation.
++!
++!    ELPA is distributed in the hope that it will be useful,
++!    but WITHOUT ANY WARRANTY; without even the implied warranty of
++!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++!    GNU Lesser General Public License for more details.
++!
++!    You should have received a copy of the GNU Lesser General Public License
++!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
++!
++!    ELPA reflects a substantial effort on the part of the original
++!    ELPA consortium, and we ask you to respect the spirit of the
++!    license that we chose: i.e., please contribute any changes you
++!    may have back to the original ELPA library distribution, and keep
++!    any derivatives of ELPA under the same license that we chose for
++!    the original distribution, the GNU Lesser General Public License.
++!
++!
++#include "config-f90.h"
++#include "../assert.h"
++!>
++!> Fortran test program demonstrating the use of the
++!> ELPA 2 real case library.
++!> If "HAVE_REDIRECT" was defined at build time,
++!> the stdout and stderr output of each MPI task
++!> can be redirected to files if the environment
++!> variable "REDIRECT_ELPA_TEST_OUTPUT" is set
++!> to "true".
++!>
++!> By calling executable [arg1] [arg2] [arg3] [arg4]
++!> one can define the matrix size (arg1), the number of
++!> eigenvectors to compute (arg2), and the blocking (arg3).
++!> If these values are not set, the default values (500, 150, 16)
++!> are chosen.
++!> If these values are set, the 4th argument can be
++!> "output", which specifies that the EVs are written to
++!> an ASCII file.
++!>
++!> The real ELPA 2 kernel is set as the default kernel.
++!> However, this can be overridden by setting
++!> the environment variable "REAL_ELPA_KERNEL" to an
++!> appropriate value.
++!>
++
++#include "config-f90.h"
++
++#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT
++#define TEST_INT_TYPE integer(kind=c_int64_t)
++#define INT_TYPE c_int64_t
++#else
++#define TEST_INT_TYPE integer(kind=c_int32_t)
++#define INT_TYPE c_int32_t
++#endif
++
++#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT
++#define TEST_INT_MPI_TYPE integer(kind=c_int64_t)
++#define INT_MPI_TYPE c_int64_t
++#else
++#define TEST_INT_MPI_TYPE integer(kind=c_int32_t)
++#define INT_MPI_TYPE c_int32_t
++#endif
++
++program test_real2_double_banded
++
++!-------------------------------------------------------------------------------
++! Standard eigenvalue problem - REAL version
++!
++! This program demonstrates the use of the ELPA module
++! together with standard scalapack routines
++!
++! Copyright of the original code rests with the authors inside the ELPA
++! consortium. The copyright of any additional modifications shall rest
++! with their original authors, but shall adhere to the licensing terms
++! distributed along with the original code in the file "COPYING".
++!
++!-------------------------------------------------------------------------------
++   use elpa
++
++   !use test_util
++   use test_read_input_parameters
++   use test_check_correctness
++   use test_setup_mpi
++   use test_blacs_infrastructure
++   use test_prepare_matrix
++#ifdef HAVE_REDIRECT
++   use test_redirect
++#endif
++   use test_output_type
++   implicit none
++
++   !-------------------------------------------------------------------------------
++   ! Please set system size parameters below!
++   ! na:   System size
++   ! nev:  Number of eigenvectors to be calculated
++   ! nblk: Blocking factor in block cyclic distribution
++   !-------------------------------------------------------------------------------
++
++   TEST_INT_TYPE           :: nblk
++   TEST_INT_TYPE           :: na, nev
++
++   TEST_INT_TYPE           :: np_rows, np_cols, na_rows, na_cols
++
++   TEST_INT_TYPE           :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols
++   TEST_INT_TYPE           :: i,  my_blacs_ctxt, sc_desc(9), info, nprow, npcol
++   TEST_INT_MPI_TYPE       :: mpierr
++   !TEST_INT_TYPE, external :: numroc
++
++   real(kind=rk8), allocatable :: a(:,:), z(:,:), as(:,:), ev(:)
++
++   TEST_INT_TYPE           :: STATUS
++#ifdef WITH_OPENMP_TRADITIONAL
++   TEST_INT_TYPE           :: omp_get_max_threads,  required_mpi_thread_level, provided_mpi_thread_level
++#endif
++   integer(kind=c_int)          :: error_elpa
++   TEST_INT_TYPE           :: numberOfDevices
++   type(output_t)             :: write_to_file
++   character(len=8)           :: task_suffix
++   TEST_INT_TYPE           :: j
++   TEST_INT_TYPE           :: global_row, global_col, local_row, local_col
++   TEST_INT_TYPE           :: bandwidth
++   class(elpa_t), pointer     :: e
++#define DOUBLE_PRECISION_REAL 1
++
++
++   call read_input_parameters(na, nev, nblk, write_to_file)
++
++   if (nblk .eq. 1) then
++     stop 77
++   endif
++
++   !-------------------------------------------------------------------------------
++   !  MPI Initialization
++   call setup_mpi(myid, nprocs)
++
++   STATUS = 0
++
++#define REALCASE
++
++   !-------------------------------------------------------------------------------
++   ! Selection of number of processor rows/columns
++   ! We try to set up the grid square-like, i.e. start the search for possible
++   ! divisors of nprocs with a number next to the square root of nprocs
++   ! and decrement it until a divisor is found.
++
++   do np_cols = NINT(SQRT(REAL(nprocs))),2,-1
++      if(mod(nprocs,np_cols) == 0 ) exit
++   enddo
++   ! at the end of the above loop, nprocs is always divisible by np_cols
++
++   np_rows = nprocs/np_cols
++
++   if(myid==0) then
++      print *
++      print '(a)','Standard eigenvalue problem - REAL version'
++      print *
++      print '(3(a,i0))','Matrix size=',na,', Number of eigenvectors=',nev,', Block size=',nblk
++      print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs
++      print *
++   endif
++
++   !-------------------------------------------------------------------------------
++   ! Set up BLACS context and MPI communicators
++   !
++   ! The BLACS context is only necessary for using Scalapack.
++   !
++   ! For ELPA, the MPI communicators along rows/cols are sufficient,
++   ! and the grid setup may be done in an arbitrary way as long as it is
++   ! consistent (i.e. 0<=my_prow<np_rows, 0<=my_pcol<np_cols and every
++   ! process has a unique (my_prow,my_pcol) pair).
++
++   call set_up_blacsgrid(int(mpi_comm_world,kind=BLAS_KIND), np_rows, np_cols, 'C', &
++                         my_blacs_ctxt, my_prow, my_pcol)
++
++   if (myid==0) then
++     print '(a)','| Past BLACS_Gridinfo.'
++   end if
++
++   call set_up_blacs_descriptor(na ,nblk, my_prow, my_pcol, np_rows, np_cols, &
++                                na_rows, na_cols, sc_desc, my_blacs_ctxt, info)
++
++   if (myid==0) then
++     print '(a)','| Past scalapack descriptor setup.'
++   end if
++
++   !-------------------------------------------------------------------------------
++   ! Allocate matrices and set up a test matrix for the eigenvalue problem
++   allocate(a (na_rows,na_cols))
++   allocate(z (na_rows,na_cols))
++   allocate(as(na_rows,na_cols))
++
++   allocate(ev(na))
++
++   call prepare_matrix_random(na, myid, sc_desc, a, z, as)
++
++   ! set values outside of the bandwidth to zero
++   bandwidth = nblk
++
++   do local_row = 1, na_rows
++     global_row = index_l2g(local_row, nblk, my_prow, np_rows)
++     do local_col = 1, na_cols
++       global_col = index_l2g(local_col, nblk, my_pcol, np_cols)
++
++       if (ABS(global_row-global_col) > bandwidth) then
++         a(local_row, local_col) = 0.0
++         as(local_row, local_col) = 0.0
++       end if
++     end do
++   end do
++
++   if (elpa_init(CURRENT_API_VERSION) /= ELPA_OK) then
++     print *, "ELPA API version not supported"
++     stop 1
++   endif
++   e => elpa_allocate(error_elpa)
++   assert_elpa_ok(error_elpa)
++
++   call e%set("na", int(na,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%set("nev", int(nev,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%set("local_nrows", int(na_rows,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%set("local_ncols", int(na_cols,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%set("nblk", int(nblk,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++#ifdef WITH_MPI
++   call e%set("mpi_comm_parent", int(MPI_COMM_WORLD,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%set("process_row", int(my_prow,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%set("process_col", int(my_pcol,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++#endif
++
++   call e%set("bandwidth", int(bandwidth,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++
++   assert(e%setup() .eq. ELPA_OK)
++
++   call e%set("solver", ELPA_SOLVER_2STAGE, error_elpa)
++   assert_elpa_ok(error_elpa)
++
++   call e%eigenvectors(a, ev, z, error_elpa)
++   assert_elpa_ok(error_elpa)
++
++   call elpa_deallocate(e, error_elpa)
++   assert_elpa_ok(error_elpa)
++
++   call elpa_uninit(error_elpa)
++   assert_elpa_ok(error_elpa)
++
++
++   !-------------------------------------------------------------------------------
++   ! Test correctness of result (using plain scalapack routines)
++
++   status = check_correctness_evp_numeric_residuals(na, nev, as, z, ev, sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol)
++
++
++   deallocate(a)
++   deallocate(as)
++
++   deallocate(z)
++   deallocate(ev)
++
++#ifdef WITH_MPI
++   call blacs_gridexit(my_blacs_ctxt)
++   call mpi_finalize(mpierr)
++#endif
++   call EXIT(STATUS)
++end
++
++!-------------------------------------------------------------------------------
+diff -ruN elpa-new_release_2021.11.001/examples/Fortran/elpa2/single_complex_2stage_banded.F90 elpa-new_release_2021.11.001_ok/examples/Fortran/elpa2/single_complex_2stage_banded.F90
+--- elpa-new_release_2021.11.001/examples/Fortran/elpa2/single_complex_2stage_banded.F90	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/Fortran/elpa2/single_complex_2stage_banded.F90	2022-01-26 10:09:19.166040000 +0100
+@@ -0,0 +1,299 @@
++!    This file is part of ELPA.
++!
++!    The ELPA library was originally created by the ELPA consortium,
++!    consisting of the following organizations:
++!
++!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
++!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
++!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
++!      Informatik,
++!    - Technische Universität München, Lehrstuhl für Informatik mit
++!      Schwerpunkt Wissenschaftliches Rechnen,
++!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
++!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
++!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
++!      and
++!    - IBM Deutschland GmbH
++!
++!
++!    More information can be found here:
++!    http://elpa.mpcdf.mpg.de/
++!
++!    ELPA is free software: you can redistribute it and/or modify
++!    it under the terms of the version 3 of the license of the
++!    GNU Lesser General Public License as published by the Free
++!    Software Foundation.
++!
++!    ELPA is distributed in the hope that it will be useful,
++!    but WITHOUT ANY WARRANTY; without even the implied warranty of
++!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++!    GNU Lesser General Public License for more details.
++!
++!    You should have received a copy of the GNU Lesser General Public License
++!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
++!
++!    ELPA reflects a substantial effort on the part of the original
++!    ELPA consortium, and we ask you to respect the spirit of the
++!    license that we chose: i.e., please contribute any changes you
++!    may have back to the original ELPA library distribution, and keep
++!    any derivatives of ELPA under the same license that we chose for
++!    the original distribution, the GNU Lesser General Public License.
++!
++!
++#include "config-f90.h"
++
++
++
++#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT
++#define TEST_INT_TYPE integer(kind=c_int64_t)
++#define INT_TYPE c_int64_t
++#else
++#define TEST_INT_TYPE integer(kind=c_int32_t)
++#define INT_TYPE c_int32_t
++#endif
++#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT
++#define TEST_INT_MPI_TYPE integer(kind=c_int64_t)
++#define INT_MPI_TYPE c_int64_t
++#else
++#define TEST_INT_MPI_TYPE integer(kind=c_int32_t)
++#define INT_MPI_TYPE c_int32_t
++#endif
++
++#include "../assert.h"
++!>
++!> Fortran test program demonstrating the use of the
++!> ELPA 2 complex case library.
++!> If "HAVE_REDIRECT" was defined at build time,
++!> the stdout and stderr output of each MPI task
++!> can be redirected to files if the environment
++!> variable "REDIRECT_ELPA_TEST_OUTPUT" is set
++!> to "true".
++!>
++!> By calling executable [arg1] [arg2] [arg3] [arg4]
++!> one can define the matrix size (arg1), the number of
++!> eigenvectors to compute (arg2), and the blocking (arg3).
++!> If these values are not set, the default values (500, 150, 16)
++!> are chosen.
++!> If these values are set, the 4th argument can be
++!> "output", which specifies that the EVs are written to
++!> an ASCII file.
++!>
++!> The complex ELPA 2 kernel is set as the default kernel.
++!> However, this can be overridden by setting
++!> the environment variable "COMPLEX_ELPA_KERNEL" to an
++!> appropriate value.
++!>
++program test_complex2_single_banded
++
++!-------------------------------------------------------------------------------
++! Standard eigenvalue problem - COMPLEX version
++!
++! This program demonstrates the use of the ELPA module
++! together with standard scalapack routines
++!
++! Copyright of the original code rests with the authors inside the ELPA
++! consortium. The copyright of any additional modifications shall rest
++! with their original authors, but shall adhere to the licensing terms
++! distributed along with the original code in the file "COPYING".
++!-------------------------------------------------------------------------------
++   use elpa
++
++   use test_util
++   use test_read_input_parameters
++   use test_check_correctness
++   use test_setup_mpi
++   use test_blacs_infrastructure
++   use test_prepare_matrix
++#ifdef HAVE_REDIRECT
++   use test_redirect
++#endif
++
++   use test_output_type
++   implicit none
++
++   !-------------------------------------------------------------------------------
++   ! Please set system size parameters below!
++   ! na:   System size
++   ! nev:  Number of eigenvectors to be calculated
++   ! nblk: Blocking factor in block cyclic distribution
++   !-------------------------------------------------------------------------------
++
++   TEST_INT_TYPE              :: nblk
++   TEST_INT_TYPE              :: na, nev
++
++   TEST_INT_TYPE              :: np_rows, np_cols, na_rows, na_cols
++
++   TEST_INT_TYPE              :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols
++   TEST_INT_TYPE              :: i, my_blacs_ctxt, sc_desc(9), info, nprow, npcol
++   TEST_INT_MPI_TYPE          :: mpierr
++#ifdef WITH_MPI
++   !TEST_INT_TYPE, external    :: numroc
++#endif
++   complex(kind=ck4), parameter   :: CZERO = (0.0_rk4,0.0_rk4), CONE = (1.0_rk4,0.0_rk4)
++   real(kind=rk4), allocatable    :: ev(:)
++
++   complex(kind=ck4), allocatable :: a(:,:), z(:,:), as(:,:)
++
++   TEST_INT_TYPE              :: STATUS
++#ifdef WITH_OPENMP_TRADITIONAL
++   TEST_INT_TYPE              :: omp_get_max_threads,  required_mpi_thread_level, provided_mpi_thread_level
++#endif
++   type(output_t)                :: write_to_file
++   integer(kind=ik)              :: error_elpa
++   character(len=8)              :: task_suffix
++   TEST_INT_TYPE              :: j
++
++
++   TEST_INT_TYPE              :: global_row, global_col, local_row, local_col
++   TEST_INT_TYPE              :: bandwidth
++   class(elpa_t), pointer        :: e
++
++#define COMPLEXCASE
++#define SINGLE_PRECISION_COMPLEX 1
++
++   call read_input_parameters(na, nev, nblk, write_to_file)
++   if (nblk .eq. 1) then
++     stop 77
++   endif
++
++   !-------------------------------------------------------------------------------
++   !  MPI Initialization
++   call setup_mpi(myid, nprocs)
++
++   STATUS = 0
++
++   !-------------------------------------------------------------------------------
++   ! Selection of number of processor rows/columns
++   ! We try to set up the grid square-like, i.e. start the search for possible
++   ! divisors of nprocs with a number next to the square root of nprocs
++   ! and decrement it until a divisor is found.
++
++   do np_cols = NINT(SQRT(REAL(nprocs))),2,-1
++      if(mod(nprocs,np_cols) == 0 ) exit
++   enddo
++   ! at the end of the above loop, nprocs is always divisible by np_cols
++
++   np_rows = nprocs/np_cols
++
++   if(myid==0) then
++      print *
++      print '(a)','Standard eigenvalue problem - COMPLEX version'
++      print *
++      print '(3(a,i0))','Matrix size=',na,', Number of eigenvectors=',nev,', Block size=',nblk
++      print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs
++      print *
++   endif
++
++   !-------------------------------------------------------------------------------
++   ! Set up BLACS context and MPI communicators
++   !
++   ! The BLACS context is only necessary for using Scalapack.
++   !
++   ! For ELPA, the MPI communicators along rows/cols are sufficient,
++   ! and the grid setup may be done in an arbitrary way as long as it is
++   ! consistent (i.e. 0<=my_prow<np_rows, 0<=my_pcol<np_cols and every
++   ! process has a unique (my_prow,my_pcol) pair).
++
++   call set_up_blacsgrid(int(mpi_comm_world,kind=BLAS_KIND), np_rows, np_cols, 'C', &
++                         my_blacs_ctxt, my_prow, my_pcol)
++
++   if (myid==0) then
++     print '(a)','| Past BLACS_Gridinfo.'
++   end if
++
++   ! Determine the necessary size of the distributed matrices,
++   ! we use the Scalapack tools routine NUMROC for that.
++
++   call set_up_blacs_descriptor(na ,nblk, my_prow, my_pcol, np_rows, np_cols, &
++                                na_rows, na_cols, sc_desc, my_blacs_ctxt, info)
++
++   if (myid==0) then
++     print '(a)','| Past scalapack descriptor setup.'
++   end if
++   !-------------------------------------------------------------------------------
++   ! Allocate matrices and set up a test matrix for the eigenvalue problem
++
++   allocate(a (na_rows,na_cols))
++   allocate(z (na_rows,na_cols))
++   allocate(as(na_rows,na_cols))
++
++   allocate(ev(na))
++
++   call prepare_matrix_random(na, myid, sc_desc, a, z, as)
++
++   ! set values outside of the bandwidth to zero
++   bandwidth = nblk
++
++   do local_row = 1, na_rows
++     global_row = index_l2g( local_row, nblk, my_prow, np_rows )
++     do local_col = 1, na_cols
++       global_col = index_l2g( local_col, nblk, my_pcol, np_cols )
++
++       if (ABS(global_row-global_col) > bandwidth) then
++         a(local_row, local_col) = 0
++         as(local_row, local_col) = 0
++       end if
++     end do
++   end do
++
++   if (elpa_init(CURRENT_API_VERSION) /= ELPA_OK) then
++     print *, "ELPA API version not supported"
++     stop 1
++   endif
++
++   e => elpa_allocate(error_elpa)
++   assert_elpa_ok(error_elpa)
++
++   call e%set("na", int(na,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%set("nev", int(nev,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%set("local_nrows", int(na_rows,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%set("local_ncols", int(na_cols,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%set("nblk", int(nblk,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++#ifdef WITH_MPI
++   call e%set("mpi_comm_parent", int(MPI_COMM_WORLD,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%set("process_row", int(my_prow,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%set("process_col", int(my_pcol,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++#endif
++
++   call e%set("bandwidth", int(bandwidth,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++
++   assert(e%setup() .eq. ELPA_OK)
++
++   call e%set("solver", ELPA_SOLVER_2STAGE, error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%eigenvectors(a, ev, z, error_elpa)
++   assert_elpa_ok(error_elpa)
++
++   call elpa_deallocate(e, error_elpa)
++   assert_elpa_ok(error_elpa)
++
++   call elpa_uninit(error_elpa)
++   assert_elpa_ok(error_elpa)
++
++   !-------------------------------------------------------------------------------
++   ! Test correctness of result (using plain scalapack routines)
++   status = check_correctness_evp_numeric_residuals(na, nev, as, z, ev, sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol)
++
++   deallocate(a)
++   deallocate(as)
++
++   deallocate(z)
++   deallocate(ev)
++
++#ifdef WITH_MPI
++   call blacs_gridexit(my_blacs_ctxt)
++   call mpi_finalize(mpierr)
++#endif
++   call EXIT(STATUS)
++end
++
++!-------------------------------------------------------------------------------
+diff -ruN elpa-new_release_2021.11.001/examples/Fortran/elpa2/single_real_2stage_banded.F90 elpa-new_release_2021.11.001_ok/examples/Fortran/elpa2/single_real_2stage_banded.F90
+--- elpa-new_release_2021.11.001/examples/Fortran/elpa2/single_real_2stage_banded.F90	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/Fortran/elpa2/single_real_2stage_banded.F90	2022-01-26 10:09:19.166871000 +0100
+@@ -0,0 +1,290 @@
++!    This file is part of ELPA.
++!
++!    The ELPA library was originally created by the ELPA consortium,
++!    consisting of the following organizations:
++!
++!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
++!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
++!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
++!      Informatik,
++!    - Technische Universität München, Lehrstuhl für Informatik mit
++!      Schwerpunkt Wissenschaftliches Rechnen,
++!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
++!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
++!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
++!      and
++!    - IBM Deutschland GmbH
++!
++!
++!    More information can be found here:
++!    http://elpa.mpcdf.mpg.de/
++!
++!    ELPA is free software: you can redistribute it and/or modify
++!    it under the terms of the version 3 of the license of the
++!    GNU Lesser General Public License as published by the Free
++!    Software Foundation.
++!
++!    ELPA is distributed in the hope that it will be useful,
++!    but WITHOUT ANY WARRANTY; without even the implied warranty of
++!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++!    GNU Lesser General Public License for more details.
++!
++!    You should have received a copy of the GNU Lesser General Public License
++!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
++!
++!    ELPA reflects a substantial effort on the part of the original
++!    ELPA consortium, and we ask you to respect the spirit of the
++!    license that we chose: i.e., please contribute any changes you
++!    may have back to the original ELPA library distribution, and keep
++!    any derivatives of ELPA under the same license that we chose for
++!    the original distribution, the GNU Lesser General Public License.
++!
++!
++#include "config-f90.h"
++
++
++#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT
++#define TEST_INT_TYPE integer(kind=c_int64_t)
++#define INT_TYPE c_int64_t
++#else
++#define TEST_INT_TYPE integer(kind=c_int32_t)
++#define INT_TYPE c_int32_t
++#endif
++#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT
++#define TEST_INT_MPI_TYPE integer(kind=c_int64_t)
++#define INT_MPI_TYPE c_int64_t
++#else
++#define TEST_INT_MPI_TYPE integer(kind=c_int32_t)
++#define INT_MPI_TYPE c_int32_t
++#endif
++#include "../assert.h"
++!>
++!> Fortran test program demonstrating the use of the
++!> ELPA 2 real case library.
++!> If "HAVE_REDIRECT" was defined at build time
++!> the stdout and stderr output of each MPI task
++!> can be redirected to files if the environment
++!> variable "REDIRECT_ELPA_TEST_OUTPUT" is set
++!> to "true".
++!>
++!> By calling the executable with [arg1] [arg2] [arg3] [arg4]
++!> one can define the size (arg1), the number of
++!> eigenvectors to compute (arg2), and the blocking (arg3).
++!> If these values are not set, the default values (500, 150, 16)
++!> are chosen.
++!> If these values are set, the 4th argument can be
++!> "output", which specifies that the EVs are written to
++!> an ASCII file.
++!>
++!> The real ELPA 2 kernel is set as the default kernel.
++!> However, this can be overridden by setting
++!> the environment variable "REAL_ELPA_KERNEL" to an
++!> appropriate value.
++!>
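++!> A typical invocation (the launcher and the argument values below are
++!> only an illustration, not a fixed requirement) could be:
++!>   srun --ntasks=4 ./single_real_2stage_banded 1000 300 16
++!>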
++program test_real2_single_banded
++
++!-------------------------------------------------------------------------------
++! Standard eigenvalue problem - REAL version
++!
++! This program demonstrates the use of the ELPA module
++! together with standard scalapack routines
++!
++! Copyright of the original code rests with the authors inside the ELPA
++! consortium. The copyright of any additional modifications shall rest
++! with their original authors, but shall adhere to the licensing terms
++! distributed along with the original code in the file "COPYING".
++!
++!-------------------------------------------------------------------------------
++   use elpa
++
++   !use test_util
++   use test_read_input_parameters
++   use test_check_correctness
++   use test_setup_mpi
++   use test_blacs_infrastructure
++   use test_prepare_matrix
++#ifdef HAVE_REDIRECT
++   use test_redirect
++#endif
++   use test_output_type
++   use tests_scalapack_interfaces
++   implicit none
++
++   !-------------------------------------------------------------------------------
++   ! Please set system size parameters below!
++   ! na:   System size
++   ! nev:  Number of eigenvectors to be calculated
++   ! nblk: Blocking factor in block cyclic distribution
++   !-------------------------------------------------------------------------------
++
++   TEST_INT_TYPE           :: nblk
++   TEST_INT_TYPE           :: na, nev
++
++   TEST_INT_TYPE           :: np_rows, np_cols, na_rows, na_cols
++
++   TEST_INT_TYPE           :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols
++   TEST_INT_TYPE           :: i, my_blacs_ctxt, sc_desc(9), info, nprow, npcol
++   TEST_INT_MPI_TYPE       :: mpierr
++
++   real(kind=rk4), allocatable :: a(:,:), z(:,:), as(:,:), ev(:)
++
++   TEST_INT_TYPE           :: STATUS
++#ifdef WITH_OPENMP_TRADITIONAL
++   TEST_INT_TYPE           :: omp_get_max_threads,  required_mpi_thread_level, provided_mpi_thread_level
++#endif
++   integer(kind=c_int)     :: error_elpa
++   type(output_t)          :: write_to_file
++   character(len=8)        :: task_suffix
++   TEST_INT_TYPE           :: j
++   TEST_INT_TYPE           :: global_row, global_col, local_row, local_col
++   TEST_INT_TYPE           :: bandwidth
++   class(elpa_t), pointer  :: e
++#define DOUBLE_PRECISION_REAL 1
++
++   call read_input_parameters(na, nev, nblk, write_to_file)
++   if (nblk .eq. 1) then
++     stop 77
++   endif
++
++   !-------------------------------------------------------------------------------
++   !  MPI Initialization
++   call setup_mpi(myid, nprocs)
++
++
++   STATUS = 0
++
++#define REALCASE
++
++   !-------------------------------------------------------------------------------
++   ! Selection of number of processor rows/columns
++   ! We try to set up the grid square-like, i.e. start the search for possible
++   ! divisors of nprocs with a number next to the square root of nprocs
++   ! and decrement it until a divisor is found.
++
++   do np_cols = NINT(SQRT(REAL(nprocs))),2,-1
++      if(mod(nprocs,np_cols) == 0 ) exit
++   enddo
++   ! at the end of the above loop, nprocs is always divisible by np_cols
++
++   np_rows = nprocs/np_cols
++
++   if(myid==0) then
++      print *
++      print '(a)','Standard eigenvalue problem - REAL version'
++      print *
++      print '(3(a,i0))','Matrix size=',na,', Number of eigenvectors=',nev,', Block size=',nblk
++      print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs
++      print *
++   endif
++
++   !-------------------------------------------------------------------------------
++   ! Set up BLACS context and MPI communicators
++   !
++   ! The BLACS context is only necessary for using Scalapack.
++   !
++   ! For ELPA, the MPI communicators along rows/cols are sufficient,
++   ! and the grid setup may be done in an arbitrary way as long as it is
++   ! consistent (i.e. 0<=my_prow<np_rows, 0<=my_pcol<np_cols and every
++   ! process has a unique (my_prow,my_pcol) pair).
++
++   call set_up_blacsgrid(int(mpi_comm_world,kind=BLAS_KIND), np_rows, np_cols, 'C', &
++                         my_blacs_ctxt, my_prow, my_pcol)
++
++   if (myid==0) then
++     print '(a)','| Past BLACS_Gridinfo.'
++   end if
++
++   call set_up_blacs_descriptor(na ,nblk, my_prow, my_pcol, np_rows, np_cols, &
++                                na_rows, na_cols, sc_desc, my_blacs_ctxt, info)
++
++   if (myid==0) then
++     print '(a)','| Past scalapack descriptor setup.'
++   end if
++
++   !-------------------------------------------------------------------------------
++   ! Allocate matrices and set up a test matrix for the eigenvalue problem
++   allocate(a (na_rows,na_cols))
++   allocate(z (na_rows,na_cols))
++   allocate(as(na_rows,na_cols))
++
++   allocate(ev(na))
++
++   call prepare_matrix_random(na, myid, sc_desc, a, z, as)
++
++   ! set values outside of the bandwidth to zero
++   bandwidth = nblk
++
++   do local_row = 1, na_rows
++     global_row = index_l2g( local_row, nblk, my_prow, np_rows )
++     do local_col = 1, na_cols
++       global_col = index_l2g( local_col, nblk, my_pcol, np_cols )
++
++       if (ABS(global_row-global_col) > bandwidth) then
++         a(local_row, local_col) = 0.0
++         as(local_row, local_col) = 0.0
++       end if
++     end do
++   end do
++
++   if (elpa_init(CURRENT_API_VERSION) /= ELPA_OK) then
++     print *, "ELPA API version not supported"
++     stop 1
++   endif
++   e => elpa_allocate(error_elpa)
++   assert_elpa_ok(error_elpa)
++
++   call e%set("na", int(na,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%set("nev", int(nev,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%set("local_nrows", int(na_rows,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%set("local_ncols", int(na_cols,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%set("nblk", int(nblk,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++#ifdef WITH_MPI
++   call e%set("mpi_comm_parent", int(MPI_COMM_WORLD,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%set("process_row", int(my_prow,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%set("process_col", int(my_pcol,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++#endif
++
++   call e%set("bandwidth", int(bandwidth,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++
++   assert(e%setup() .eq. ELPA_OK)
++
++   call e%set("solver", ELPA_SOLVER_2STAGE, error_elpa)
++   assert_elpa_ok(error_elpa)
++
++   call e%eigenvectors(a, ev, z, error_elpa)
++   assert_elpa_ok(error_elpa)
++
++   call elpa_deallocate(e, error_elpa)
++   assert_elpa_ok(error_elpa)
++
++   call elpa_uninit(error_elpa)
++   assert_elpa_ok(error_elpa)
++
++
++   !-------------------------------------------------------------------------------
++   ! Test correctness of result (using plain scalapack routines)
++
++   status = check_correctness_evp_numeric_residuals(na, nev, as, z, ev, sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol)
++   deallocate(a)
++   deallocate(as)
++
++   deallocate(z)
++   deallocate(ev)
++
++#ifdef WITH_MPI
++   call blacs_gridexit(my_blacs_ctxt)
++   call mpi_finalize(mpierr)
++#endif
++   call EXIT(STATUS)
++end
++
++!-------------------------------------------------------------------------------
+diff -ruN elpa-new_release_2021.11.001/examples/Fortran/elpa_generalized/test_bindings.F90 elpa-new_release_2021.11.001_ok/examples/Fortran/elpa_generalized/test_bindings.F90
+--- elpa-new_release_2021.11.001/examples/Fortran/elpa_generalized/test_bindings.F90	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/Fortran/elpa_generalized/test_bindings.F90	2022-01-26 10:09:56.136747000 +0100
+@@ -0,0 +1,168 @@
++#include "config-f90.h"
++
++#include "../assert.h"
++
++program test_bindings
++   use elpa
++
++   use test_util
++   use test_setup_mpi
++!   use test_prepare_matrix
++   use test_read_input_parameters
++   use test_blacs_infrastructure
++!   use test_check_correctness
++!   use test_analytic
++!   use test_scalapack
++
++
++   implicit none
++
++#include "src/elpa_generated_fortran_interfaces.h"
++
++   ! matrix dimensions
++   integer                     :: na, nev, nblk
++
++   ! mpi
++   integer                     :: myid, nprocs
++   integer                     :: na_cols, na_rows  ! local matrix size
++   integer                     :: np_cols, np_rows  ! number of MPI processes per column/row
++   integer                     :: my_prow, my_pcol  ! local MPI task position (my_prow, my_pcol) in the grid (0..np_rows -1, 0..np_cols -1)
++   integer                     :: mpierr, mpi_comm_rows, mpi_comm_cols
++   type(output_t)              :: write_to_file
++
++   ! blacs
++   integer                     :: my_blacs_ctxt, sc_desc(9), info, nprow, npcol, i, j
++   character(len=1)            :: layout
++
++
++   ! The Matrix
++   real(kind=C_DOUBLE) , allocatable    :: a(:,:), res(:,:)
++
++   logical                     :: skip_check_correctness
++
++   class(elpa_t), pointer      :: e
++
++   integer                     :: error, status
++
++     call read_input_parameters_traditional(na, nev, nblk, write_to_file, skip_check_correctness)
++     call setup_mpi(myid, nprocs)
++#ifdef WITH_MPI
++     call MPI_BARRIER(MPI_COMM_WORLD, mpierr)
++     !call redirect_stdout(myid)
++#endif
++
++   if (elpa_init(CURRENT_API_VERSION) /= ELPA_OK) then
++     print *, "ELPA API version not supported"
++     stop 1
++   endif
++
++   layout = 'C'
++   do np_cols = NINT(SQRT(REAL(nprocs))),2,-1
++      if(mod(nprocs,np_cols) == 0 ) exit
++   enddo
++
++   np_rows = nprocs/np_cols
++   assert(nprocs == np_rows * np_cols)
++
++   if (myid == 0) then
++     print '((a,i0))', 'Matrix size: ', na
++     print '((a,i0))', 'Num eigenvectors: ', nev
++     print '((a,i0))', 'Blocksize: ', nblk
++#ifdef WITH_MPI
++     print '((a,i0))', 'Num MPI proc: ', nprocs
++     print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs
++     print '(a)',      'Process layout: ' // layout
++#endif
++     print *,''
++   endif
++
++
++   call set_up_blacsgrid(mpi_comm_world, np_rows, np_cols, layout, &
++                         my_blacs_ctxt, my_prow, my_pcol)
++
++   call set_up_blacs_descriptor(na, nblk, my_prow, my_pcol, np_rows, np_cols, &
++                                na_rows, na_cols, sc_desc, my_blacs_ctxt, info)
++
++   allocate(a (na_rows,na_cols))
++   allocate(res(na_rows,na_cols))   
++
++   e => elpa_allocate(error)
++   assert_elpa_ok(error)
++
++   call e%set("na", na, error)
++   assert_elpa_ok(error)
++   call e%set("nev", nev, error)
++   assert_elpa_ok(error)
++   call e%set("local_nrows", na_rows, error)
++   assert_elpa_ok(error)
++   call e%set("local_ncols", na_cols, error)
++   assert_elpa_ok(error)
++   call e%set("nblk", nblk, error)
++   assert_elpa_ok(error)
++
++#ifdef WITH_MPI
++   call e%set("mpi_comm_parent", MPI_COMM_WORLD, error)
++   assert_elpa_ok(error)
++   call e%set("process_row", my_prow, error)
++   assert_elpa_ok(error)
++   call e%set("process_col", my_pcol, error)
++   assert_elpa_ok(error)
++#endif
++
++   call e%get("mpi_comm_rows",mpi_comm_rows, error)
++   assert_elpa_ok(error)
++   call e%get("mpi_comm_cols",mpi_comm_cols, error)
++   assert_elpa_ok(error)
++
++   a(:,:) = 1.0
++   res(:,:) = 0.0
++
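++   ! The C-binding test routine below is expected to leave a(:,:) untouched
++   ! and to fill res(:,:) with the value 3.0; the check loop that follows
++   ! verifies exactly that.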
++   call test_c_bindings(a, na_rows, na_cols, np_rows, np_cols, my_prow, my_pcol, sc_desc, res, mpi_comm_rows, mpi_comm_cols)
++
++   status = 0
++   do i = 1, na_rows
++     do j = 1, na_cols
++       if(a(i,j) .ne. 1.0) then
++         write(*,*) i, j, ": wrong value of A: ", a(i,j), ", should be 1"
++         status = 1
++       endif
++       if(res(i,j) .ne. 3.0) then
++         write(*,*) i, j, ": wrong value of res: ", res(i,j), ", should be 3"
++         status = 1
++       endif
++     enddo
++   enddo
++
++   call check_status(status, myid)
++
++   call elpa_deallocate(e, error)
++   assert_elpa_ok(error)
++
++   deallocate(a)
++   deallocate(res)
++   call elpa_uninit(error)
++   assert_elpa_ok(error)
++
++#ifdef WITH_MPI
++   call blacs_gridexit(my_blacs_ctxt)
++   call mpi_finalize(mpierr)
++#endif
++
++   call exit(status)
++
++   contains
++
++     subroutine check_status(status, myid)
++       implicit none
++       integer, intent(in) :: status, myid
++       integer :: mpierr
++       if (status /= 0) then
++         if (myid == 0) print *, "Result incorrect!"
++#ifdef WITH_MPI
++         call mpi_finalize(mpierr)
++#endif
++         call exit(status)
++       endif
++     end subroutine
++
++end program
+diff -ruN elpa-new_release_2021.11.001/examples/Fortran/elpa_print_headers.F90 elpa-new_release_2021.11.001_ok/examples/Fortran/elpa_print_headers.F90
+--- elpa-new_release_2021.11.001/examples/Fortran/elpa_print_headers.F90	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/Fortran/elpa_print_headers.F90	2022-01-26 10:04:52.441241000 +0100
+@@ -0,0 +1,282 @@
++#if 0
++!    This file is part of ELPA.
++!
++!    The ELPA library was originally created by the ELPA consortium,
++!    consisting of the following organizations:
++!
++!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
++!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
++!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
++!      Informatik,
++!    - Technische Universität München, Lehrstuhl für Informatik mit
++!      Schwerpunkt Wissenschaftliches Rechnen,
++!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
++!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
++!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
++!      and
++!    - IBM Deutschland GmbH
++!
++!
++!    More information can be found here:
++!    http://elpa.mpcdf.mpg.de/
++!
++!    ELPA is free software: you can redistribute it and/or modify
++!    it under the terms of the version 3 of the license of the
++!    GNU Lesser General Public License as published by the Free
++!    Software Foundation.
++!
++!    ELPA is distributed in the hope that it will be useful,
++!    but WITHOUT ANY WARRANTY; without even the implied warranty of
++!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++!    GNU Lesser General Public License for more details.
++!
++!    You should have received a copy of the GNU Lesser General Public License
++!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
++!
++!    ELPA reflects a substantial effort on the part of the original
++!    ELPA consortium, and we ask you to respect the spirit of the
++!    license that we chose: i.e., please contribute any changes you
++!    may have back to the original ELPA library distribution, and keep
++!    any derivatives of ELPA under the same license that we chose for
++!    the original distribution, the GNU Lesser General Public License.
++!
++!
++! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
++!
++! Copyright of the original code rests with the authors inside the ELPA
++! consortium. The copyright of any additional modifications shall rest
++! with their original authors, but shall adhere to the licensing terms
++! distributed along with the original code in the file "COPYING".
++#endif
++
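++! Note: this fragment has no program unit of its own; it is included into
++! the example programs to report (on MPI rank 0) which precision/data-type
++! variant and which compiled-in kernels are in use.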
++#ifdef WITH_OPENMP_TRADITIONAL
++   if (myid .eq. 0) then
++      print *,"Threaded version of test program"
++      print *,"Using ",omp_get_max_threads()," threads"
++      print *," "
++   endif
++#endif
++
++#ifndef WITH_MPI
++   if (myid .eq. 0) then
++     print *,"This version of ELPA does not support MPI parallelisation"
++     print *,"For MPI support re-build ELPA with appropiate flags"
++     print *," "
++   endif
++#endif
++
++#ifdef ELPA1
++
++#ifdef REALCASE
++#ifdef DOUBLE_PRECISION_REAL
++   if (myid .eq. 0) then
++     print *," "
++     print *,"Real valued double-precision version of ELPA1 is used"
++     print *," "
++   endif
++#else
++   if (myid .eq. 0) then
++     print *," "
++     print *,"Real valued single-precision version of ELPA1 is used"
++     print *," "
++   endif
++#endif
++
++#endif
++
++#ifdef COMPLEXCASE
++#ifdef DOUBLE_PRECISION_COMPLEX
++   if (myid .eq. 0) then
++     print *," "
++     print *,"Complex valued double-precision version of ELPA1 is used"
++     print *," "
++   endif
++#else
++   if (myid .eq. 0) then
++     print *," "
++     print *,"Complex valued single-precision version of ELPA1 is used"
++     print *," "
++   endif
++#endif
++
++#endif /* DATATYPE */
++
++#else /* ELPA1 */
++
++#ifdef REALCASE
++#ifdef DOUBLE_PRECISION_REAL
++   if (myid .eq. 0) then
++     print *," "
++     print *,"Real valued double-precision version of ELPA2 is used"
++     print *," "
++   endif
++#else
++   if (myid .eq. 0) then
++     print *," "
++     print *,"Real valued single-precision version of ELPA2 is used"
++     print *," "
++   endif
++#endif
++
++#endif
++
++#ifdef COMPLEXCASE
++#ifdef DOUBLE_PRECISION_COMPLEX
++   if (myid .eq. 0) then
++     print *," "
++     print *,"Complex valued double-precision version of ELPA2 is used"
++     print *," "
++   endif
++#else
++   if (myid .eq. 0) then
++     print *," "
++     print *,"Complex valued single-precision version of ELPA2 is used"
++     print *," "
++   endif
++#endif
++
++#endif /* DATATYPE */
++
++#endif /* ELPA1 */
++
++#ifdef WITH_MPI
++    call MPI_BARRIER(MPI_COMM_WORLD, mpierr)
++#endif
++#ifdef HAVE_REDIRECT
++   if (check_redirect_environment_variable()) then
++     if (myid .eq. 0) then
++       print *," "
++       print *,"Redirection of mpi processes is used"
++       print *," "
++       if (create_directories() .ne. 1) then
++         write(error_unit,*) "Unable to create directory for stdout and stderr!"
++         stop 1
++       endif
++     endif
++#ifdef WITH_MPI
++     call MPI_BARRIER(MPI_COMM_WORLD, mpierr)
++#endif
++     call redirect_stdout(myid)
++   endif
++#endif
++
++#ifndef ELPA1
++
++   if (myid .eq. 0) then
++      print *," "
++      print *,"This ELPA2 is build with"
++#ifdef WITH_NVIDIA_GPU_KERNEL
++        print *,"CUDA GPU support"
++#endif
++#ifdef WITH_NVIDIA_SM80_GPU_KERNEL
++        print *,"CUDA sm_80 GPU support"
++#endif
++#ifdef WITH_INTEL_GPU_KERNEL
++        print *,"INTEL GPU support"
++#endif
++#ifdef WITH_AMD_GPU_KERNEL
++        print *,"AMD GPU support"
++#endif
++      print *," "
++#ifdef REALCASE
++
++#ifdef HAVE_AVX2
++
++#ifdef WITH_REAL_AVX_BLOCK2_KERNEL
++      print *,"AVX2 optimized kernel (2 blocking) for real matrices"
++#endif
++#ifdef WITH_REAL_AVX_BLOCK4_KERNEL
++      print *,"AVX2 optimized kernel (4 blocking) for real matrices"
++#endif
++#ifdef WITH_REAL_AVX_BLOCK6_KERNEL
++      print *,"AVX2 optimized kernel (6 blocking) for real matrices"
++#endif
++
++#else /* no HAVE_AVX2 */
++
++#ifdef HAVE_AVX
++
++#ifdef WITH_REAL_AVX_BLOCK2_KERNEL
++      print *,"AVX optimized kernel (2 blocking) for real matrices"
++#endif
++#ifdef WITH_REAL_AVX_BLOCK4_KERNEL
++      print *,"AVX optimized kernel (4 blocking) for real matrices"
++#endif
++#ifdef WITH_REAL_AVX_BLOCK6_KERNEL
++      print *,"AVX optimized kernel (6 blocking) for real matrices"
++#endif
++
++#endif
++
++#endif /* HAVE_AVX2 */
++
++
++#ifdef WITH_REAL_GENERIC_KERNEL
++     print *,"GENERIC kernel for real matrices"
++#endif
++#ifdef WITH_REAL_GENERIC_SIMPLE_KERNEL
++     print *,"GENERIC SIMPLE kernel for real matrices"
++#endif
++#ifdef WITH_REAL_SSE_ASSEMBLY_KERNEL
++     print *,"SSE ASSEMBLER kernel for real matrices"
++#endif
++#ifdef WITH_REAL_BGP_KERNEL
++     print *,"BGP kernel for real matrices"
++#endif
++#ifdef WITH_REAL_BGQ_KERNEL
++     print *,"BGQ kernel for real matrices"
++#endif
++
++#endif /* DATATYPE == REAL */
++
++#ifdef COMPLEXCASE
++
++#ifdef HAVE_AVX2
++
++#ifdef  WITH_COMPLEX_AVX_BLOCK2_KERNEL
++      print *,"AVX2 optimized kernel (2 blocking) for complex matrices"
++#endif
++#ifdef WITH_COMPLEX_AVX_BLOCK1_KERNEL
++      print *,"AVX2 optimized kernel (1 blocking) for complex matrices"
++#endif
++
++#else /* no HAVE_AVX2 */
++
++#ifdef HAVE_AVX
++
++#ifdef  WITH_COMPLEX_AVX_BLOCK2_KERNEL
++      print *,"AVX optimized kernel (2 blocking) for complex matrices"
++#endif
++#ifdef WITH_COMPLEX_AVX_BLOCK1_KERNEL
++      print *,"AVX optimized kernel (1 blocking) for complex matrices"
++#endif
++
++#endif
++
++#endif /* HAVE_AVX2 */
++
++
++#ifdef WITH_COMPLEX_GENERIC_KERNEL
++     print *,"GENERIC kernel for complex matrices"
++#endif
++#ifdef WITH_COMPLEX_GENERIC_SIMPLE_KERNEL
++     print *,"GENERIC SIMPLE kernel for complex matrices"
++#endif
++#ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL
++     print *,"SSE ASSEMBLER kernel for complex matrices"
++#endif
++
++#endif /* DATATYPE == COMPLEX */
++
++   endif
++#endif /* ELPA1 */
++
++   if (write_to_file%eigenvectors) then
++     if (myid .eq. 0) print *,"Writing Eigenvectors to files"
++   endif
++
++   if (write_to_file%eigenvalues) then
++     if (myid .eq. 0) print *,"Writing Eigenvalues to files"
++   endif
++
++
+diff -ruN elpa-new_release_2021.11.001/examples/Fortran/Makefile_examples_hybrid elpa-new_release_2021.11.001_ok/examples/Fortran/Makefile_examples_hybrid
+--- elpa-new_release_2021.11.001/examples/Fortran/Makefile_examples_hybrid	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/Fortran/Makefile_examples_hybrid	2022-01-28 09:55:45.556223000 +0100
+@@ -0,0 +1,38 @@
++# MPICH-based MPI libraries, i.e. IntelMPI or ParaStationMPI
++SCALAPACK_LIB = -lmkl_scalapack_lp64 -lmkl_blacs_intelmpi_lp64
++# OpenMPI
++# SCALAPACK_LIB = -lmkl_scalapack_lp64 $(MKLROOT)/lib/intel64/libmkl_blacs_openmpi_lp64.a
++LAPACK_LIB = 
++# Intel compiler
++MKL = -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread -lstdc++
++# GCC
++# MKL = -lmkl_gf_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lstdc++ -lm
++F90 = mpif90 -O3 -qopenmp -I$(ELPA_MODULES_OPENMP) -I$(ELPA_MODULES) -I$(ELPA_INCLUDE_OPENMP) -I$(ELPA_INCLUDE_OPENMP)/elpa
++# GCC
++# F90 = mpif90 -O3 -fopenmp -I$(ELPA_MODULES_OPENMP) -I$(ELPA_MODULES) -I$(ELPA_INCLUDE_OPENMP) -I$(ELPA_INCLUDE_OPENMP)/elpa
++LIBS = -L$(ELPA_LIB) -lelpatest_openmp -lelpa_openmp $(SCALAPACK_LIB) $(MKL)
++# CC            = mpicc -qopenmp -O3 
++# GCC
++# CC            = mpicc -fopenmp -O3
++
++all: test_real_1stage_omp test_real_2stage_all_kernels_omp test_autotune_omp test_multiple_objs_omp test_split_comm_omp test_skewsymmetric_omp
++
++test_real_1stage_omp: test.F90
++	/usr/bin/cpp -P -DTEST_GPU=0 -DTEST_REAL -DTEST_DOUBLE -DTEST_SOLVER_1STAGE -DTEST_EIGENVECTORS -DWITH_OPENMP_TRADITIONAL -DCURRENT_API_VERSION=20211125 -DWITH_MPI -I$(ELPA_INCLUDE_OPENMP)/elpa -o test_real_1stage_omp.F90 test.F90
++	$(F90) -o $@ test_real_1stage_omp.F90 $(LIBS)
++
++test_real_2stage_all_kernels_omp: test.F90
++	/usr/bin/cpp -P -DTEST_GPU=0 -DTEST_REAL -DTEST_DOUBLE -DTEST_SOLVER_2STAGE -DTEST_EIGENVECTORS -DTEST_ALL_KERNELS -DWITH_OPENMP_TRADITIONAL -DCURRENT_API_VERSION=20211125 -DWITH_MPI -I$(ELPA_INCLUDE_OPENMP)/elpa -o test_real_2stage_all_kernels_omp.F90 test.F90
++	$(F90) -o $@ test_real_2stage_all_kernels_omp.F90 $(LIBS)
++
++test_autotune_omp: test_autotune.F90
++	$(F90) -DTEST_REAL -DTEST_DOUBLE -DWITH_MPI -DWITH_OPENMP_TRADITIONAL -DCURRENT_API_VERSION=20211125 -I$(ELPA_INCLUDE_OPENMP)/elpa -o $@ test_autotune.F90 $(LIBS)
++
++test_multiple_objs_omp: test_multiple_objs.F90
++	$(F90) -DTEST_REAL -DTEST_DOUBLE -DWITH_MPI -DWITH_OPENMP_TRADITIONAL -DCURRENT_API_VERSION=20211125 -I$(ELPA_INCLUDE_OPENMP)/elpa -o $@ test_multiple_objs.F90 $(LIBS)
++
++test_split_comm_omp: test_split_comm.F90
++	$(F90) -DTEST_REAL -DTEST_DOUBLE -DWITH_MPI -DWITH_OPENMP_TRADITIONAL -DCURRENT_API_VERSION=20211125 -I$(ELPA_INCLUDE_OPENMP)/elpa -o $@ test_split_comm.F90 $(LIBS)
++
++test_skewsymmetric_omp: test_skewsymmetric.F90
++	$(F90) -DTEST_REAL -DTEST_DOUBLE -DWITH_MPI -DWITH_OPENMP_TRADITIONAL -DCURRENT_API_VERSION=20211125 -I$(ELPA_INCLUDE_OPENMP)/elpa -o $@ test_skewsymmetric.F90 $(LIBS)
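++
++# Hypothetical usage, assuming the ELPA module has been loaded so that
++# ELPA_LIB, ELPA_MODULES[_OPENMP] and ELPA_INCLUDE[_OPENMP] are set:
++#   make -f Makefile_examples_hybrid all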
+diff -ruN elpa-new_release_2021.11.001/examples/Fortran/Makefile_examples_pure elpa-new_release_2021.11.001_ok/examples/Fortran/Makefile_examples_pure
+--- elpa-new_release_2021.11.001/examples/Fortran/Makefile_examples_pure	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/Fortran/Makefile_examples_pure	2022-01-28 09:54:41.176236000 +0100
+@@ -0,0 +1,34 @@
++# MPICH-based MPI libraries, i.e. IntelMPI or ParaStationMPI
++SCALAPACK_LIB = -lmkl_scalapack_lp64 -lmkl_blacs_intelmpi_lp64
++# OpenMPI
++# SCALAPACK_LIB = -lmkl_scalapack_lp64 $(MKLROOT)/lib/intel64/libmkl_blacs_openmpi_lp64.a
++LAPACK_LIB = 
++# Intel compiler
++MKL = -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -liomp5 -lpthread -lstdc++
++# GCC
++# MKL = -lmkl_gf_lp64 -lmkl_sequential -lmkl_core -lgomp -lpthread -lstdc++ -lm
++F90 = mpif90 -O3 -I$(ELPA_MODULES) -I$(ELPA_INCLUDE) -I$(ELPA_INCLUDE)/elpa
++LIBS = -L$(ELPA_LIB) -lelpa -lelpatest $(SCALAPACK_LIB) $(MKL)
++# CC            = mpicc -O3 
++
++all: test_real_1stage test_real_2stage_all_kernels test_autotune test_multiple_objs test_split_comm test_skewsymmetric
++
++test_real_1stage: test.F90
++	/usr/bin/cpp -P -DTEST_GPU=0 -DTEST_REAL -DTEST_DOUBLE -DTEST_SOLVER_1STAGE -DTEST_EIGENVECTORS -DWITH_MPI -DCURRENT_API_VERSION=20211125 -I$(ELPA_INCLUDE)/elpa -o test_real_1stage.F90 test.F90
++	$(F90) -o $@ test_real_1stage.F90 $(LIBS)
++
++test_real_2stage_all_kernels: test.F90
++	/usr/bin/cpp -P -DTEST_GPU=0 -DTEST_REAL -DTEST_DOUBLE -DTEST_SOLVER_2STAGE -DTEST_EIGENVECTORS -DTEST_ALL_KERNELS -DWITH_MPI -DCURRENT_API_VERSION=20211125 -I$(ELPA_INCLUDE)/elpa -o test_real_2stage_all_kernels.F90 test.F90
++	$(F90) -o $@ test_real_2stage_all_kernels.F90 $(LIBS)
++
++test_autotune: test_autotune.F90
++	$(F90) -DTEST_REAL -DTEST_DOUBLE -DWITH_MPI -DCURRENT_API_VERSION=20211125 -I$(ELPA_INCLUDE)/elpa -o $@ test_autotune.F90 $(LIBS)
++
++test_multiple_objs: test_multiple_objs.F90
++	$(F90) -DTEST_REAL -DTEST_DOUBLE -DWITH_MPI -DCURRENT_API_VERSION=20211125 -I$(ELPA_INCLUDE)/elpa -o $@ test_multiple_objs.F90 $(LIBS)
++
++test_split_comm: test_split_comm.F90
++	$(F90) -DTEST_REAL -DTEST_DOUBLE -DWITH_MPI -DCURRENT_API_VERSION=20211125 -I$(ELPA_INCLUDE)/elpa -o $@ test_split_comm.F90 $(LIBS)
++
++test_skewsymmetric: test_skewsymmetric.F90
++	$(F90) -DTEST_REAL -DTEST_DOUBLE -DWITH_MPI -DCURRENT_API_VERSION=20211125 -I$(ELPA_INCLUDE)/elpa -o $@ test_skewsymmetric.F90 $(LIBS)
+diff -ruN elpa-new_release_2021.11.001/examples/Fortran/Makefile_examples_pure_cuda elpa-new_release_2021.11.001_ok/examples/Fortran/Makefile_examples_pure_cuda
+--- elpa-new_release_2021.11.001/examples/Fortran/Makefile_examples_pure_cuda	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/Fortran/Makefile_examples_pure_cuda	2022-01-28 09:54:52.690358000 +0100
+@@ -0,0 +1,34 @@
++# MPICH-based MPI libraries, i.e. IntelMPI or ParaStationMPI
++SCALAPACK_LIB = -lmkl_scalapack_lp64 -lmkl_blacs_intelmpi_lp64
++# OpenMPI
++# SCALAPACK_LIB = -lmkl_scalapack_lp64 $(MKLROOT)/lib/intel64/libmkl_blacs_openmpi_lp64.a
++LAPACK_LIB = 
++# Intel compiler
++MKL = -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -liomp5 -lpthread -lstdc++
++# GCC
++# MKL = -lmkl_gf_lp64 -lmkl_sequential -lmkl_core -lgomp -lpthread -lstdc++ -lm
++F90 = mpif90 -O3 -I$(ELPA_MODULES) -I$(ELPA_INCLUDE) -I$(ELPA_INCLUDE)/elpa
++LIBS = -L$(ELPA_LIB) -lelpa -lelpatest $(SCALAPACK_LIB) $(MKL) -lcudart
++# CC            = mpicc -O3 
++
++all: test_real_1stage test_real_2stage_all_kernels test_autotune test_multiple_objs test_split_comm test_skewsymmetric
++
++test_real_1stage: test.F90
++	/usr/bin/cpp -P -DTEST_NVIDIA_GPU=1 -DTEST_REAL -DTEST_DOUBLE -DTEST_SOLVER_1STAGE -DTEST_EIGENVECTORS -DWITH_MPI -DCURRENT_API_VERSION=20211125 -I$(ELPA_INCLUDE)/elpa -o test_real_1stage.F90 test.F90
++	$(F90) -o $@ test_real_1stage.F90 $(LIBS)
++
++test_real_2stage_all_kernels: test.F90
++	/usr/bin/cpp -P -DTEST_NVIDIA_GPU=1 -DTEST_REAL -DTEST_DOUBLE -DTEST_SOLVER_2STAGE -DTEST_EIGENVECTORS -DTEST_ALL_KERNELS -DWITH_MPI -DCURRENT_API_VERSION=20211125 -I$(ELPA_INCLUDE)/elpa -o test_real_2stage_all_kernels.F90 test.F90
++	$(F90) -o $@ test_real_2stage_all_kernels.F90 $(LIBS)
++
++test_autotune: test_autotune.F90
++	$(F90) -DTEST_REAL -DTEST_NVIDIA_GPU=1 -DTEST_DOUBLE -DWITH_MPI -DCURRENT_API_VERSION=20211125 -I$(ELPA_INCLUDE)/elpa -o $@ test_autotune.F90 $(LIBS)
++
++test_multiple_objs: test_multiple_objs.F90
++	$(F90) -DTEST_REAL -DTEST_NVIDIA_GPU=1 -DTEST_DOUBLE -DWITH_MPI -DCURRENT_API_VERSION=20211125 -I$(ELPA_INCLUDE)/elpa -o $@ test_multiple_objs.F90 $(LIBS)
++
++test_split_comm: test_split_comm.F90
++	$(F90) -DTEST_NVIDIA_GPU=1 -DTEST_REAL -DTEST_DOUBLE -DWITH_MPI -DCURRENT_API_VERSION=20211125 -I$(ELPA_INCLUDE)/elpa -o $@ test_split_comm.F90 $(LIBS)
++
++test_skewsymmetric: test_skewsymmetric.F90
++	$(F90) -DTEST_REAL -DTEST_NVIDIA_GPU=1 -DTEST_DOUBLE -DWITH_MPI -DCURRENT_API_VERSION=20211125 -I$(ELPA_INCLUDE)/elpa -o $@ test_skewsymmetric.F90 $(LIBS)
+diff -ruN elpa-new_release_2021.11.001/examples/Fortran/test_autotune.F90 elpa-new_release_2021.11.001_ok/examples/Fortran/test_autotune.F90
+--- elpa-new_release_2021.11.001/examples/Fortran/test_autotune.F90	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/Fortran/test_autotune.F90	2022-01-28 18:31:05.305617544 +0100
+@@ -0,0 +1,345 @@
++!    This file is part of ELPA.
++!
++!    The ELPA library was originally created by the ELPA consortium,
++!    consisting of the following organizations:
++!
++!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
++!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
++!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
++!      Informatik,
++!    - Technische Universität München, Lehrstuhl für Informatik mit
++!      Schwerpunkt Wissenschaftliches Rechnen,
++!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
++!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
++!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
++!      and
++!    - IBM Deutschland GmbH
++!
++!
++!    More information can be found here:
++!    http://elpa.mpcdf.mpg.de/
++!
++!    ELPA is free software: you can redistribute it and/or modify
++!    it under the terms of the version 3 of the license of the
++!    GNU Lesser General Public License as published by the Free
++!    Software Foundation.
++!
++!    ELPA is distributed in the hope that it will be useful,
++!    but WITHOUT ANY WARRANTY; without even the implied warranty of
++!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++!    GNU Lesser General Public License for more details.
++!
++!    You should have received a copy of the GNU Lesser General Public License
++!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
++!
++!    ELPA reflects a substantial effort on the part of the original
++!    ELPA consortium, and we ask you to respect the spirit of the
++!    license that we chose: i.e., please contribute any changes you
++!    may have back to the original ELPA library distribution, and keep
++!    any derivatives of ELPA under the same license that we chose for
++!    the original distribution, the GNU Lesser General Public License.
++!
++!
++#include "config-f90.h"
++
++! Define one of TEST_REAL or TEST_COMPLEX
++! Define one of TEST_SINGLE or TEST_DOUBLE
++! Define one of TEST_SOLVER_1STAGE or TEST_SOLVER_2STAGE
++! Define TEST_NVIDIA_GPU \in [0, 1]
++! Define TEST_INTEL_GPU \in [0, 1]
++! Define TEST_AMD_GPU \in [0, 1]
++! Define either TEST_ALL_KERNELS or a TEST_KERNEL \in [any valid kernel]
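++!
++! For illustration only (the exact define set is chosen by the Makefiles
++! shipped with these examples), a double-precision real CPU build would use:
++!   -DTEST_REAL -DTEST_DOUBLE -DWITH_MPI -DCURRENT_API_VERSION=20211125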
++
++#if !(defined(TEST_REAL) ^ defined(TEST_COMPLEX))
++error: define exactly one of TEST_REAL or TEST_COMPLEX
++#endif
++
++#if !(defined(TEST_SINGLE) ^ defined(TEST_DOUBLE))
++error: define exactly one of TEST_SINGLE or TEST_DOUBLE
++#endif
++
++#ifdef TEST_SINGLE
++#  define EV_TYPE real(kind=C_FLOAT)
++#  ifdef TEST_REAL
++#    define MATRIX_TYPE real(kind=C_FLOAT)
++#  else
++#    define MATRIX_TYPE complex(kind=C_FLOAT_COMPLEX)
++#  endif
++#else
++#  define EV_TYPE real(kind=C_DOUBLE)
++#  ifdef TEST_REAL
++#    define MATRIX_TYPE real(kind=C_DOUBLE)
++#  else
++#    define MATRIX_TYPE complex(kind=C_DOUBLE_COMPLEX)
++#  endif
++#endif
++
++
++#ifdef TEST_REAL
++#  define AUTOTUNE_DOMAIN ELPA_AUTOTUNE_DOMAIN_REAL
++#else
++#  define AUTOTUNE_DOMAIN ELPA_AUTOTUNE_DOMAIN_COMPLEX
++#endif
++
++#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT
++#define TEST_INT_TYPE integer(kind=c_int64_t)
++#define INT_TYPE c_int64_t
++#else
++#define TEST_INT_TYPE integer(kind=c_int32_t)
++#define INT_TYPE c_int32_t
++#endif
++
++#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT
++#define TEST_INT_MPI_TYPE integer(kind=c_int64_t)
++#define INT_MPI_TYPE c_int64_t
++#else
++#define TEST_INT_MPI_TYPE integer(kind=c_int32_t)
++#define INT_MPI_TYPE c_int32_t
++#endif
++
++
++
++#define TEST_GPU 0
++#if (TEST_NVIDIA_GPU == 1) || (TEST_AMD_GPU == 1)
++#undef TEST_GPU
++#define TEST_GPU 1
++#endif
++
++
++#include "assert.h"
++
++program test
++   use elpa
++
++   !use test_util
++   use test_setup_mpi
++   use test_prepare_matrix
++   use test_read_input_parameters
++   use test_blacs_infrastructure
++   use test_check_correctness
++   use test_analytic
++   use iso_fortran_env
++
++#ifdef HAVE_REDIRECT
++   use test_redirect
++#endif
++   implicit none
++
++   ! matrix dimensions
++   TEST_INT_TYPE                     :: na, nev, nblk
++
++   ! mpi
++   TEST_INT_TYPE                     :: myid, nprocs
++   TEST_INT_TYPE                     :: na_cols, na_rows  ! local matrix size
++   TEST_INT_TYPE                     :: np_cols, np_rows  ! number of MPI processes per column/row
++   TEST_INT_TYPE                     :: my_prow, my_pcol  ! local MPI task position (my_prow, my_pcol) in the grid (0..np_rows -1, 0..np_cols -1)
++   TEST_INT_MPI_TYPE                 :: mpierr
++
++   ! blacs
++   character(len=1)            :: layout
++   TEST_INT_TYPE                     :: my_blacs_ctxt, sc_desc(9), info, nprow, npcol
++
++   ! The Matrix
++   MATRIX_TYPE, allocatable    :: a(:,:), as(:,:)
++   ! eigenvectors
++   MATRIX_TYPE, allocatable    :: z(:,:)
++   ! eigenvalues
++   EV_TYPE, allocatable        :: ev(:)
++
++   TEST_INT_TYPE               :: status
++   integer(kind=c_int)         :: error_elpa
++
++   type(output_t)              :: write_to_file
++   class(elpa_t), pointer      :: e
++   class(elpa_autotune_t), pointer :: tune_state
++
++   TEST_INT_TYPE                     :: iter
++   character(len=5)            :: iter_string
++
++   call read_input_parameters(na, nev, nblk, write_to_file)
++!   call setup_mpi(myid, nprocs)
++   call MPI_INIT_THREAD(MPI_THREAD_SERIALIZED, info, mpierr)
++   call MPI_COMM_SIZE(MPI_COMM_WORLD, nprocs, mpierr)
++   call MPI_COMM_RANK(MPI_COMM_WORLD, myid, mpierr)
++#ifdef HAVE_REDIRECT
++#ifdef WITH_MPI
++   call MPI_BARRIER(MPI_COMM_WORLD, mpierr)
++   call redirect_stdout(myid)
++#endif
++#endif
++
++   if (elpa_init(CURRENT_API_VERSION) /= ELPA_OK) then
++     print *, "ELPA API version not supported"
++     stop 1
++   endif
++
++   layout = 'C'
++   do np_cols = NINT(SQRT(REAL(nprocs))),2,-1
++      if(mod(nprocs,np_cols) == 0 ) exit
++   enddo
++   np_rows = nprocs/np_cols
++   assert(nprocs == np_rows * np_cols)
++
++   if (myid == 0) then
++     print '((a,i0))', 'Matrix size: ', na
++     print '((a,i0))', 'Num eigenvectors: ', nev
++     print '((a,i0))', 'Blocksize: ', nblk
++#ifdef WITH_MPI
++     print '((a,i0))', 'Num MPI proc: ', nprocs
++     print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs
++     print '(a)',      'Process layout: ' // layout
++#endif
++     print *,''
++   endif
++
++   call set_up_blacsgrid(int(mpi_comm_world,kind=BLAS_KIND), np_rows, np_cols, layout, &
++                         my_blacs_ctxt, my_prow, my_pcol)
++
++   call set_up_blacs_descriptor(na, nblk, my_prow, my_pcol, np_rows, np_cols, &
++                                na_rows, na_cols, sc_desc, my_blacs_ctxt, info)
++
++   allocate(a (na_rows,na_cols))
++   allocate(as(na_rows,na_cols))
++   allocate(z (na_rows,na_cols))
++   allocate(ev(na))
++
++   a(:,:) = 0.0
++   z(:,:) = 0.0
++   ev(:) = 0.0
++
++   call prepare_matrix_analytic(na, a, nblk, myid, np_rows, np_cols, my_prow, my_pcol, print_times=.false.)
++   as(:,:) = a(:,:)
++
++   e => elpa_allocate(error_elpa)
++   assert_elpa_ok(error_elpa)
++
++   call e%set("na", int(na,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%set("nev", int(nev,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%set("local_nrows", int(na_rows,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%set("local_ncols", int(na_cols,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%set("nblk", int(nblk,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++
++   if (layout .eq. 'C') then
++     call e%set("matrix_order",COLUMN_MAJOR_ORDER,error_elpa)
++   else
++     call e%set("matrix_order",ROW_MAJOR_ORDER,error_elpa)
++   endif
++
++#ifdef WITH_MPI
++   call e%set("mpi_comm_parent", int(MPI_COMM_WORLD,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%set("process_row", int(my_prow,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%set("process_col", int(my_pcol,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++#endif
++   call e%set("timings",1, error_elpa)
++   assert_elpa_ok(error_elpa)
++
++   call e%set("debug",1, error_elpa)
++   assert_elpa_ok(error_elpa)
++#if TEST_NVIDIA_GPU == 1 || (TEST_NVIDIA_GPU == 0) && (TEST_AMD_GPU == 0) && (TEST_INTEL_GPU == 0)
++   call e%set("nvidia-gpu", 0, error_elpa)
++   assert_elpa_ok(error_elpa)
++#endif
++#if TEST_AMD_GPU == 1
++   call e%set("amd-gpu", 0, error_elpa)
++   assert_elpa_ok(error_elpa)
++#endif
++#if TEST_INTEL_GPU == 1
++   call e%set("intel-gpu", 0, error_elpa)
++   assert_elpa_ok(error_elpa)
++#endif
++
++   !call e%set("max_stored_rows", 15, error_elpa)
++
++   !call e%set("solver", ELPA_SOLVER_2STAGE, error_elpa)
++
++   assert_elpa_ok(e%setup())
++
++   if (myid == 0) print *, ""
++
++   ! if you want to use the new autotuning implementation
++   !call e%autotune_set_api_version(20211125, error_elpa)
++   !assert_elpa_ok(error_elpa)
++   ! if you want to use the old one, either do not set autotune_set_api_version
++   ! or set autotune_set_api_version to a supported api version < 20211125
++   tune_state => e%autotune_setup(ELPA_AUTOTUNE_MEDIUM, AUTOTUNE_DOMAIN, error_elpa)
++   assert_elpa_ok(error_elpa)
++
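++   ! Each successful autotune_step selects the next parameter combination to
++   ! try; the loop below times one full eigenvector solve per combination and
++   ! restores the input matrix from the backup copy "as" before continuing.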
++   iter=0
++   do while (e%autotune_step(tune_state, error_elpa))
++     assert_elpa_ok(error_elpa)
++     iter=iter+1
++     write(iter_string,'(I5.5)') iter
++     !call e%print_settings()
++     !call e%store_settings("saved_parameters_"//trim(iter_string)//".txt")
++     call e%timer_start("eigenvectors: iteration "//trim(iter_string))
++     call e%eigenvectors(a, ev, z, error_elpa)
++     assert_elpa_ok(error_elpa)
++     call e%timer_stop("eigenvectors: iteration "//trim(iter_string))
++
++     assert_elpa_ok(error_elpa)
++     if (myid .eq. 0) then
++       print *, ""
++       call e%print_times("eigenvectors: iteration "//trim(iter_string))
++     endif
++     status = check_correctness_analytic(na, nev, ev, z, nblk, myid, np_rows, np_cols, my_prow, my_pcol, &
++                                         .true., .true., print_times=.false.)
++     a(:,:) = as(:,:)
++     call e%autotune_print_state(tune_state)
++     call e%autotune_save_state(tune_state, "saved_state_"//trim(iter_string)//".txt")
++   end do
++
++   ! set and print the autotuned settings
++   call e%autotune_set_best(tune_state, error_elpa)
++   assert_elpa_ok(error_elpa)
++   if (myid .eq. 0) then
++     flush(output_unit)
++     print *, "The best combination found by the autotuning:"
++     call e%autotune_print_best(tune_state, error_elpa)
++     assert_elpa_ok(error_elpa)
++   endif
++   ! de-allocate autotune object
++   call elpa_autotune_deallocate(tune_state, error_elpa)
++   assert_elpa_ok(error_elpa)
++
++   if (myid .eq. 0) then
++     print *, "Running once more time with the best found setting..."
++   endif
++   call e%timer_start("eigenvectors: best setting")
++   call e%eigenvectors(a, ev, z, error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%timer_stop("eigenvectors: best setting")
++   assert_elpa_ok(error_elpa)
++   if (myid .eq. 0) then
++   !  print *, ""
++     call e%print_times("eigenvectors: best setting")
++   endif
++   status = check_correctness_analytic(na, nev, ev, z, nblk, myid, np_rows, np_cols, my_prow, my_pcol, &
++                                       .true., .true., print_times=.false.)
++
++   call elpa_deallocate(e,error_elpa)
++   assert_elpa_ok(error_elpa)
++
++   deallocate(a)
++   deallocate(as)
++   deallocate(z)
++   deallocate(ev)
++
++   call elpa_uninit(error_elpa)
++   assert_elpa_ok(error_elpa)
++
++#ifdef WITH_MPI
++   call blacs_gridexit(my_blacs_ctxt)
++   call mpi_finalize(mpierr)
++#endif
++
++   call exit(status)
++
++end program
+diff -ruN elpa-new_release_2021.11.001/examples/Fortran/test.F90 elpa-new_release_2021.11.001_ok/examples/Fortran/test.F90
+--- elpa-new_release_2021.11.001/examples/Fortran/test.F90	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/Fortran/test.F90	2022-01-28 12:00:32.452129948 +0100
+@@ -0,0 +1,1207 @@
++!    This file is part of ELPA.
++!
++!    The ELPA library was originally created by the ELPA consortium,
++!    consisting of the following organizations:
++!
++!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
++!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
++!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
++!      Informatik,
++!    - Technische Universität München, Lehrstuhl für Informatik mit
++!      Schwerpunkt Wissenschaftliches Rechnen,
++!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
++!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
++!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
++!      and
++!    - IBM Deutschland GmbH
++!
++!
++!    More information can be found here:
++!    http://elpa.mpcdf.mpg.de/
++!
++!    ELPA is free software: you can redistribute it and/or modify
++!    it under the terms of the version 3 of the license of the
++!    GNU Lesser General Public License as published by the Free
++!    Software Foundation.
++!
++!    ELPA is distributed in the hope that it will be useful,
++!    but WITHOUT ANY WARRANTY; without even the implied warranty of
++!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++!    GNU Lesser General Public License for more details.
++!
++!    You should have received a copy of the GNU Lesser General Public License
++!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
++!
++!    ELPA reflects a substantial effort on the part of the original
++!    ELPA consortium, and we ask you to respect the spirit of the
++!    license that we chose: i.e., please contribute any changes you
++!    may have back to the original ELPA library distribution, and keep
++!    any derivatives of ELPA under the same license that we chose for
++!    the original distribution, the GNU Lesser General Public License.
++!
++!
++#include "config-f90.h"
++
++! Define one of TEST_REAL or TEST_COMPLEX
++! Define one of TEST_SINGLE or TEST_DOUBLE
++! Define one of TEST_SOLVER_1STAGE or TEST_SOLVER_2STAGE
++! Define TEST_NVIDIA_GPU \in [0, 1]
++! Define TEST_INTEL_GPU \in [0, 1]
++! Define TEST_AMD_GPU \in [0, 1]
++! Define either TEST_ALL_KERNELS or a TEST_KERNEL \in [any valid kernel]
++
++#if !(defined(TEST_REAL) ^ defined(TEST_COMPLEX))
++error: define exactly one of TEST_REAL or TEST_COMPLEX
++#endif
++
++#if !(defined(TEST_SINGLE) ^ defined(TEST_DOUBLE))
++error: define exactly one of TEST_SINGLE or TEST_DOUBLE
++#endif
++
++#if !(defined(TEST_SOLVER_1STAGE) ^ defined(TEST_SOLVER_2STAGE) ^ defined(TEST_SCALAPACK_ALL) ^ defined(TEST_SCALAPACK_PART))
++error: define exactly one of TEST_SOLVER_1STAGE or TEST_SOLVER_2STAGE or TEST_SCALAPACK_ALL or TEST_SCALAPACK_PART
++#endif
++
++#ifdef TEST_SOLVER_1STAGE
++#ifdef TEST_ALL_KERNELS
++error: TEST_ALL_KERNELS cannot be defined for TEST_SOLVER_1STAGE
++#endif
++#ifdef TEST_KERNEL
++error: TEST_KERNEL cannot be defined for TEST_SOLVER_1STAGE
++#endif
++#endif
++
++#ifdef TEST_SOLVER_2STAGE
++#if !(defined(TEST_KERNEL) ^ defined(TEST_ALL_KERNELS))
++error: define either TEST_ALL_KERNELS or a valid TEST_KERNEL
++#endif
++#endif
++
++#ifdef TEST_GENERALIZED_DECOMP_EIGENPROBLEM
++#define TEST_GENERALIZED_EIGENPROBLEM
++#endif
++
++#ifdef TEST_SINGLE
++#  define EV_TYPE real(kind=C_FLOAT)
++#  ifdef TEST_REAL
++#    define MATRIX_TYPE real(kind=C_FLOAT)
++#  else
++#    define MATRIX_TYPE complex(kind=C_FLOAT_COMPLEX)
++#  endif
++#else
++#  define EV_TYPE real(kind=C_DOUBLE)
++#  ifdef TEST_REAL
++#    define MATRIX_TYPE real(kind=C_DOUBLE)
++#  else
++#    define MATRIX_TYPE complex(kind=C_DOUBLE_COMPLEX)
++#  endif
++#endif
++
++#ifdef TEST_REAL
++#define KERNEL_KEY "real_kernel"
++#endif
++#ifdef TEST_COMPLEX
++#define KERNEL_KEY "complex_kernel"
++#endif
++
++#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT
++#define TEST_INT_TYPE integer(kind=c_int64_t)
++#define INT_TYPE c_int64_t
++#else
++#define TEST_INT_TYPE integer(kind=c_int32_t)
++#define INT_TYPE c_int32_t
++#endif
++#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT
++#define TEST_INT_MPI_TYPE integer(kind=c_int64_t)
++#define INT_MPI_TYPE c_int64_t
++#else
++#define TEST_INT_MPI_TYPE integer(kind=c_int32_t)
++#define INT_MPI_TYPE c_int32_t
++#endif
++
++#define TEST_GPU 0
++#if (TEST_NVIDIA_GPU == 1) || (TEST_AMD_GPU == 1) || (TEST_INTEL_GPU == 1)
++#undef TEST_GPU
++#define TEST_GPU 1
++#endif
++
++#include "assert.h"
++
++program test
++   use elpa 
++   !use test_util
++   use test_setup_mpi
++   use test_prepare_matrix
++   use test_read_input_parameters
++   use test_blacs_infrastructure
++   use test_check_correctness
++   use test_analytic
++#ifdef WITH_SCALAPACK_TESTS
++   use test_scalapack
++#endif
++
++#ifdef HAVE_REDIRECT
++   use test_redirect
++#endif
++#ifdef WITH_OPENMP_TRADITIONAL
++   use omp_lib
++#endif
++   use precision_for_tests
++
++#if TEST_GPU_DEVICE_POINTER_API == 1
++   use test_gpu
++#if TEST_NVIDIA_GPU == 1
++   use test_cuda_functions
++#endif
++#if TEST_AMD_GPU == 1
++   use test_hip_functions
++#endif
++
++#endif /* TEST_GPU_DEVICE_POINTER_API */
++
++   implicit none
++
++   ! matrix dimensions
++   TEST_INT_TYPE     :: na, nev, nblk
++
++   ! mpi
++   TEST_INT_TYPE     :: myid, nprocs
++   TEST_INT_MPI_TYPE :: myidMPI, nprocsMPI
++   TEST_INT_TYPE     :: na_cols, na_rows  ! local matrix size
++   TEST_INT_TYPE     :: np_cols, np_rows  ! number of MPI processes per column/row
++   TEST_INT_TYPE     :: my_prow, my_pcol  ! local MPI task position (my_prow, my_pcol) in the grid (0..np_rows -1, 0..np_cols -1)
++   TEST_INT_MPI_TYPE :: mpierr
++
++   ! blacs
++   TEST_INT_TYPE     :: my_blacs_ctxt, sc_desc(9), info, nprow, npcol
++
++   ! The Matrix
++   MATRIX_TYPE, allocatable, target    :: a(:,:)
++   MATRIX_TYPE, allocatable           :: as(:,:)
++#if defined(TEST_HERMITIAN_MULTIPLY)
++   MATRIX_TYPE, allocatable    :: b(:,:), c(:,:)
++#endif
++#if defined(TEST_GENERALIZED_EIGENPROBLEM)
++   MATRIX_TYPE, allocatable    :: b(:,:), bs(:,:)
++#endif
++   ! eigenvectors
++   MATRIX_TYPE, allocatable, target    :: z(:,:)
++   ! eigenvalues
++   EV_TYPE, allocatable, target        :: ev(:)
++
++#if TEST_GPU_DEVICE_POINTER_API == 1
++   type(c_ptr)                :: a_dev, q_dev, ev_dev
++#endif
++
++
++   logical                     :: check_all_evals, skip_check_correctness
++
++#if defined(TEST_MATRIX_TOEPLITZ) || defined(TEST_MATRIX_FRANK)
++   EV_TYPE, allocatable        :: d(:), sd(:), ds(:), sds(:)
++   EV_TYPE                     :: diagonalELement, subdiagonalElement
++#endif
++
++   TEST_INT_TYPE               :: status
++   integer(kind=c_int)         :: error_elpa
++
++   type(output_t)              :: write_to_file
++   class(elpa_t), pointer      :: e
++#ifdef TEST_ALL_KERNELS
++   TEST_INT_TYPE      :: i
++#endif
++#ifdef TEST_ALL_LAYOUTS
++   TEST_INT_TYPE      :: i_layout
++#ifdef BUILD_FUGAKU
++   character(len=1) :: layouts(2)
++#else
++   character(len=1), parameter :: layouts(2) = [ 'C', 'R' ]
++#endif
++#endif
++   integer(kind=c_int):: kernel
++   character(len=1)   :: layout
++   logical            :: do_test_numeric_residual, do_test_numeric_residual_generalized, &
++                         do_test_analytic_eigenvalues, &
++                         do_test_analytic_eigenvalues_eigenvectors,   &
++                         do_test_frank_eigenvalues,  &
++                         do_test_toeplitz_eigenvalues, do_test_cholesky,   &
++                         do_test_hermitian_multiply
++   logical            :: ignoreError, success, successGPU
++#ifdef WITH_OPENMP_TRADITIONAL
++   TEST_INT_TYPE      :: max_threads, threads_caller
++#endif
++#if TEST_GPU_SET_ID == 1
++   TEST_INT_TYPE      :: gpuID
++#endif
++#ifdef SPLIT_COMM_MYSELF
++   TEST_INT_MPI_TYPE  :: mpi_comm_rows, mpi_comm_cols, mpi_string_length, mpierr2
++   character(len=MPI_MAX_ERROR_STRING) :: mpierr_string
++#endif
++
++
++#if TEST_GPU_DEVICE_POINTER_API == 1
++#if TEST_REAL == 1
++#if TEST_DOUBLE
++   integer(kind=c_intptr_t), parameter           :: size_of_datatype = size_of_double_real
++#endif
++#if TEST_SINGLE
++   integer(kind=c_intptr_t), parameter           :: size_of_datatype = size_of_single_real
++#endif
++#endif /* TEST_REAL == 1 */
++
++#if TEST_COMPLEX == 1
++#if TEST_DOUBLE
++   integer(kind=c_intptr_t), parameter           :: size_of_datatype = size_of_double_complex
++#endif
++#if TEST_SINGLE
++   integer(kind=c_intptr_t), parameter           :: size_of_datatype = size_of_single_complex
++#endif
++#endif
++#endif /* TEST_GPU_DEVICE_POINTER_API == 1 */
++
++#ifdef TEST_ALL_LAYOUTS
++#ifdef BUILD_FUGAKU
++   layouts(1) = 'C'
++   layouts(2) = 'R'
++#endif
++#endif
++
++   ignoreError = .false.
++
++   call read_input_parameters_traditional(na, nev, nblk, write_to_file, skip_check_correctness)
++!   call setup_mpi(myid, nprocs)
++   call MPI_INIT_THREAD(MPI_THREAD_SERIALIZED, info, mpierr)
++   call MPI_COMM_SIZE(MPI_COMM_WORLD, nprocs, mpierr)
++   call MPI_COMM_RANK(MPI_COMM_WORLD, myid, mpierr)
++
++#ifdef HAVE_REDIRECT
++#ifdef WITH_MPI
++     call MPI_BARRIER(MPI_COMM_WORLD, mpierr)
++     call redirect_stdout(myid)
++#endif
++#endif
++
++   check_all_evals = .true.
++
++
++   do_test_numeric_residual = .false.
++   do_test_numeric_residual_generalized = .false.
++   do_test_analytic_eigenvalues = .false.
++   do_test_analytic_eigenvalues_eigenvectors = .false.
++   do_test_frank_eigenvalues = .false.
++   do_test_toeplitz_eigenvalues = .false. 
++
++   do_test_cholesky = .false.
++#if defined(TEST_CHOLESKY)
++   do_test_cholesky = .true.
++#endif
++   do_test_hermitian_multiply = .false.
++#if defined(TEST_HERMITIAN_MULTIPLY)
++   do_test_hermitian_multiply = .true.
++#endif
++
++   status = 0
++   if (elpa_init(CURRENT_API_VERSION) /= ELPA_OK) then
++     print *, "ELPA API version not supported"
++     stop 1
++   endif
++
++   if (myid == 0) then
++     print '(a)', 'Program ' // TEST_CASE
++     print *, ""
++   endif
++
++#ifdef TEST_ALL_LAYOUTS
++   do i_layout = 1, size(layouts)               ! layouts
++     layout = layouts(i_layout)
++     do np_cols = 1, nprocs                     ! factors
++       if (mod(nprocs,np_cols) /= 0 ) then
++         cycle
++       endif
++#else
++   layout = 'C'
++   do np_cols = NINT(SQRT(REAL(nprocs))),2,-1
++      if(mod(nprocs,np_cols) == 0 ) exit
++   enddo
++#endif
++
++   np_rows = nprocs/np_cols
++   assert(nprocs == np_rows * np_cols)
++
++   if (myid == 0) then
++     print '((a,i0))', 'Matrix size: ', na
++     print '((a,i0))', 'Num eigenvectors: ', nev
++     print '((a,i0))', 'Blocksize: ', nblk
++#ifdef WITH_MPI
++     print '((a,i0))', 'Num MPI proc: ', nprocs
++     print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs
++     print '(a)',      'Process layout: ' // layout
++#endif
++     print *,''
++   endif
++
++#if TEST_QR_DECOMPOSITION == 1
++
++#if (TEST_NVIDIA_GPU == 1) || (TEST_INTEL_GPU == 1) || (TEST_AMD_GPU == 1)
++#ifdef WITH_MPI
++     call mpi_finalize(mpierr)
++#endif
++     stop 77
++#endif /* TEST_NVIDIA_GPU || TEST_INTEL_GPU */
++   if (nblk .lt. 64) then
++     if (myid .eq. 0) then
++       print *,"At the moment QR decomposition need blocksize of at least 64"
++     endif
++     if ((na .lt. 64) .and. (myid .eq. 0)) then
++       print *,"This is why the matrix size must also be at least 64 or only 1 MPI task can be used"
++     endif
++
++#ifdef WITH_MPI
++     call mpi_finalize(mpierr)
++#endif
++     stop 77
++   endif
++#endif /* TEST_QR_DECOMPOSITION */
++
++
++   call set_up_blacsgrid(int(mpi_comm_world,kind=BLAS_KIND), np_rows, &
++                         np_cols, layout, my_blacs_ctxt, my_prow, &
++                         my_pcol)
++
++
++#if defined(TEST_GENERALIZED_EIGENPROBLEM) && defined(TEST_ALL_LAYOUTS)
++#ifdef WITH_MPI
++     call mpi_finalize(mpierr)
++#endif
++     stop 77
++#endif
++
++
++   call set_up_blacs_descriptor(na, nblk, my_prow, my_pcol, &
++                                np_rows, np_cols, &
++                                na_rows, na_cols, sc_desc, my_blacs_ctxt, info)
++
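++   ! na_rows/na_cols are the local dimensions of the block-cyclically
++   ! distributed matrices on this process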
++   allocate(a (na_rows,na_cols))
++   allocate(as(na_rows,na_cols))
++   allocate(z (na_rows,na_cols))
++   allocate(ev(na))
++
++#ifdef TEST_HERMITIAN_MULTIPLY
++   allocate(b (na_rows,na_cols))
++   allocate(c (na_rows,na_cols))
++#endif
++
++#ifdef TEST_GENERALIZED_EIGENPROBLEM
++   allocate(b (na_rows,na_cols))
++   allocate(bs (na_rows,na_cols))
++#endif
++
++#if defined(TEST_MATRIX_TOEPLITZ) || defined(TEST_MATRIX_FRANK)
++   allocate(d (na), ds(na))
++   allocate(sd (na), sds(na))
++#endif
++
++   a(:,:) = 0.0
++   z(:,:) = 0.0
++   ev(:) = 0.0
++
++#if defined(TEST_MATRIX_RANDOM) && !defined(TEST_SOLVE_TRIDIAGONAL) && !defined(TEST_CHOLESKY) && !defined(TEST_EIGENVALUES)
++   ! the random matrix can be used in almost all tests; but for some no
++   ! correctness checks have been implemented; do not allow these
++   ! combinations
++   ! RANDOM + TEST_SOLVE_TRIDIAGONAL: we need a TOEPLITZ MATRIX
++   ! RANDOM + TEST_CHOLESKY: we need an SPD matrix
++   ! RANDOM + TEST_EIGENVALUES: no correctness check known
++
++   ! We also have to take care of special case in TEST_EIGENVECTORS
++#if !defined(TEST_EIGENVECTORS)
++    call prepare_matrix_random(na, myid, sc_desc, a, z, as)
++#else /* TEST_EIGENVECTORS */
++    if (nev .ge. 1) then
++      call prepare_matrix_random(na, myid, sc_desc, a, z, as)
++#ifndef TEST_HERMITIAN_MULTIPLY
++      do_test_numeric_residual = .true.
++#endif
++   else
++     if (myid .eq. 0) then
++       print *,"At the moment with the random matrix you need nev >=1"
++     endif
++#ifdef WITH_MPI
++     call mpi_finalize(mpierr)
++#endif
++     stop 77
++   endif
++#endif /* TEST_EIGENVECTORS */
++    do_test_analytic_eigenvalues = .false.
++    do_test_analytic_eigenvalues_eigenvectors = .false.
++    do_test_frank_eigenvalues = .false.
++    do_test_toeplitz_eigenvalues = .false.
++#endif /* (TEST_MATRIX_RANDOM) */
++
++#if defined(TEST_MATRIX_RANDOM) && defined(TEST_CHOLESKY)
++     call prepare_matrix_random_spd(na, myid, sc_desc, a, z, as, &
++                 nblk, np_rows, np_cols, my_prow, my_pcol)
++    do_test_analytic_eigenvalues = .false.
++    do_test_analytic_eigenvalues_eigenvectors = .false.
++    do_test_frank_eigenvalues = .false.
++    do_test_toeplitz_eigenvalues = .false.
++#endif /* TEST_MATRIX_RANDOM and TEST_CHOLESKY */
++
++#if defined(TEST_MATRIX_RANDOM) && defined(TEST_GENERALIZED_EIGENPROBLEM)
++   ! call prepare_matrix_random(na, myid, sc_desc, a, z, as)
++    call prepare_matrix_random_spd(na, myid, sc_desc, b, z, bs, &
++                 nblk, np_rows, np_cols, my_prow, my_pcol)
++    do_test_analytic_eigenvalues = .false.
++    do_test_analytic_eigenvalues_eigenvectors = .false.
++    do_test_frank_eigenvalues = .false.
++    do_test_toeplitz_eigenvalues = .false.
++    do_test_numeric_residual = .false.
++    do_test_numeric_residual_generalized = .true.
++#endif /* TEST_MATRIX_RANDOM and TEST_GENERALIZED_EIGENPROBLEM */
++
++#if defined(TEST_MATRIX_RANDOM) && (defined(TEST_SOLVE_TRIDIAGONAL) || defined(TEST_EIGENVALUES))
++#error "Random matrix is not allowed in this configuration"
++#endif
++
++#if defined(TEST_MATRIX_ANALYTIC)  && !defined(TEST_SOLVE_TRIDIAGONAL) && !defined(TEST_CHOLESKY)
++   ! the analytic matrix can be used in almost all tests; but for some no
++   ! correctness checks have been implemented; do not allow these
++   ! combinations
++   ! ANALYTIC + TEST_SOLVE_TRIDIAGONAL: we need a TOEPLITZ MATRIX
++   ! ANALYTIC + TEST_CHOLESKY: no correctness check yet implemented
++
++   call prepare_matrix_analytic(na, a, nblk, myid, np_rows, np_cols, my_prow, my_pcol)
++   as(:,:) = a
++
++   do_test_numeric_residual = .false.
++   do_test_analytic_eigenvalues_eigenvectors = .false.
++#ifndef TEST_HERMITIAN_MULTIPLY
++   do_test_analytic_eigenvalues = .true.
++#endif
++#if defined(TEST_EIGENVECTORS)
++   if (nev .ge. 1) then
++     do_test_analytic_eigenvalues_eigenvectors = .true.
++     do_test_analytic_eigenvalues = .false.
++   else
++     do_test_analytic_eigenvalues_eigenvectors = .false.
++   endif
++#endif
++   do_test_frank_eigenvalues = .false.
++   do_test_toeplitz_eigenvalues = .false.
++#endif /* TEST_MATRIX_ANALYTIC */
++#if defined(TEST_MATRIX_ANALYTIC) && (defined(TEST_SOLVE_TRIDIAGONAL) || defined(TEST_CHOLESKY))
++#error "Analytic matrix is not allowd in this configuration"
++#endif
++
++#if defined(TEST_MATRIX_TOEPLITZ)
++   ! The Toeplitz matrix works in each test
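++   ! (a symmetric tridiagonal Toeplitz matrix has the analytically known
++   ! eigenvalues d + 2*sd*cos(k*pi/(na+1)), k = 1..na, which is what the
++   ! Toeplitz eigenvalue correctness check relies on)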
++#ifdef TEST_SINGLE
++   diagonalElement = 0.45_c_float
++   subdiagonalElement =  0.78_c_float
++#else
++   diagonalElement = 0.45_c_double
++   subdiagonalElement =  0.78_c_double
++#endif
++
++! actually we test cholesky for diagonal matrix only
++#if defined(TEST_CHOLESKY)
++#ifdef TEST_SINGLE
++  diagonalElement = (2.546_c_float, 0.0_c_float)
++  subdiagonalElement =  (0.0_c_float, 0.0_c_float)
++#else
++  diagonalElement = (2.546_c_double, 0.0_c_double)
++  subdiagonalElement =  (0.0_c_double, 0.0_c_double)
++#endif
++#endif /* TEST_CHOLESKY */
++
++   ! check first whether to abort
++   if (na < 10) then
++#ifdef WITH_MPI
++     call mpi_finalize(mpierr)
++#endif
++     stop 77
++   endif
++   call prepare_matrix_toeplitz(na, diagonalElement, subdiagonalElement, &
++                                d, sd, ds, sds, a, as, nblk, np_rows, &
++                                np_cols, my_prow, my_pcol)
++
++
++   do_test_numeric_residual = .false.
++#if defined(TEST_EIGENVECTORS)
++   if (nev .ge. 1) then
++     do_test_numeric_residual = .true.
++   else
++     do_test_numeric_residual = .false.
++   endif
++#endif
++
++   do_test_analytic_eigenvalues = .false.
++   do_test_analytic_eigenvalues_eigenvectors = .false.
++   do_test_frank_eigenvalues = .false.
++#if defined(TEST_CHOLESKY)
++   do_test_toeplitz_eigenvalues = .false.
++#else
++   do_test_toeplitz_eigenvalues = .true.
++#endif
++
++#endif /* TEST_MATRIX_TOEPLITZ */
++
++
++#if defined(TEST_MATRIX_FRANK) && !defined(TEST_SOLVE_TRIDIAGONAL) && !defined(TEST_CHOLESKY)
++   ! the Frank matrix can be used in almost all tests; but for some no
++   ! correctness checks have been implemented; do not allow these
++   ! combinations
++   ! FRANK + TEST_SOLVE_TRIDIAGONAL: we need a TOEPLITZ MATRIX
++   ! FRANK + TEST_CHOLESKY: no correctness check yet implemented
++
++   ! We also have to take care of special case in TEST_EIGENVECTORS
++#if !defined(TEST_EIGENVECTORS)
++    call prepare_matrix_frank(na, a, z, as, nblk, np_rows, np_cols, my_prow, my_pcol)
++
++    do_test_analytic_eigenvalues = .false.
++    do_test_analytic_eigenvalues_eigenvectors = .false.
++#ifndef TEST_HERMITIAN_MULTIPLY
++    do_test_frank_eigenvalues = .true.
++#endif
++    do_test_toeplitz_eigenvalues = .false.
++
++#else /* TEST_EIGENVECTORS */
++
++    if (nev .ge. 1) then
++      call prepare_matrix_frank(na, a, z, as, nblk, np_rows, np_cols, my_prow, my_pcol)
++
++    do_test_analytic_eigenvalues = .false.
++    do_test_analytic_eigenvalues_eigenvectors = .false.
++#ifndef TEST_HERMITIAN_MULTIPLY
++    do_test_frank_eigenvalues = .true.
++#endif
++    do_test_toeplitz_eigenvalues = .false.
++    do_test_numeric_residual = .false.
++   else
++    do_test_analytic_eigenvalues = .false.
++    do_test_analytic_eigenvalues_eigenvectors = .false.
++#ifndef TEST_HERMITIAN_MULTIPLY
++    do_test_frank_eigenvalues = .true.
++#endif
++    do_test_toeplitz_eigenvalues = .false.
++    do_test_numeric_residual = .false.
++
++   endif
++
++#endif /* TEST_EIGENVECTORS */
++#endif /* (TEST_MATRIX_FRANK) */
++#if defined(TEST_MATRIX_FRANK) && (defined(TEST_SOLVE_TRIDIAGONAL) || defined(TEST_CHOLESKY))
++#error "FRANK matrix is not allowed in this configuration"
++#endif
++
++
++#ifdef TEST_HERMITIAN_MULTIPLY
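++! set up the operands for the hermitian_multiply test: B = 2*A and C = 0,
++! so that the computed C can be verified against A and B afterwards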
++#ifdef TEST_REAL
++
++#ifdef TEST_DOUBLE
++   b(:,:) = 2.0_c_double * a(:,:)
++   c(:,:) = 0.0_c_double
++#else
++   b(:,:) = 2.0_c_float * a(:,:)
++   c(:,:) = 0.0_c_float
++#endif
++
++#endif /* TEST_REAL */
++
++#ifdef TEST_COMPLEX
++
++#ifdef TEST_DOUBLE
++   b(:,:) = 2.0_c_double * a(:,:)
++   c(:,:) = (0.0_c_double, 0.0_c_double)
++#else
++   b(:,:) = 2.0_c_float * a(:,:)
++   c(:,:) = (0.0_c_float, 0.0_c_float)
++#endif
++
++#endif /* TEST_COMPLEX */
++
++#endif /* TEST_HERMITIAN_MULTIPLY */
++
++! if the test is used for (repeated) performance tests, one might want to skip the checking
++! of the results, which can be time-consuming and unnecessary.
++   if(skip_check_correctness) then
++     do_test_numeric_residual = .false.
++     do_test_numeric_residual_generalized = .false.
++     do_test_analytic_eigenvalues = .false.
++     do_test_analytic_eigenvalues_eigenvectors = .false.
++     do_test_frank_eigenvalues = .false.
++     do_test_toeplitz_eigenvalues = .false. 
++     do_test_cholesky = .false.
++   endif
++
++
++#ifdef WITH_OPENMP_TRADITIONAL
++   threads_caller = omp_get_max_threads()
++   if (myid == 0) then
++     print *,"The calling program uses ",threads_caller," threads"
++   endif
++#endif
++
++   e => elpa_allocate(error_elpa)
++   assert_elpa_ok(error_elpa)
++
++   call e%set("na", int(na,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%set("nev", int(nev,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%set("local_nrows", int(na_rows,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%set("local_ncols", int(na_cols,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%set("nblk", int(nblk,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++
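++   ! tell ELPA whether the local matrices follow the column-major ('C')
++   ! or row-major ('R') process grid layout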
++   if (layout .eq. 'C') then
++     call e%set("matrix_order",COLUMN_MAJOR_ORDER,error_elpa)
++   else
++     call e%set("matrix_order",ROW_MAJOR_ORDER,error_elpa)
++   endif
++
++#ifdef WITH_MPI
++#ifdef SPLIT_COMM_MYSELF
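++   ! split MPI_COMM_WORLD into per-column and per-row communicators and
++   ! hand them to ELPA directly, instead of letting ELPA derive them from
++   ! process_row/process_col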
++   call mpi_comm_split(MPI_COMM_WORLD, int(my_pcol,kind=MPI_KIND), int(my_prow,kind=MPI_KIND), &
++                       mpi_comm_rows, mpierr)
++   if (mpierr .ne. MPI_SUCCESS) then
++     call MPI_ERROR_STRING(mpierr, mpierr_string, mpi_string_length, mpierr2)
++     write(error_unit,*) "MPI ERROR occured during mpi_comm_split for row communicator: ", trim(mpierr_string)
++     stop 1
++   endif
++
++   call mpi_comm_split(MPI_COMM_WORLD, int(my_prow,kind=MPI_KIND), int(my_pcol,kind=MPI_KIND), &
++                       mpi_comm_cols, mpierr)
++   if (mpierr .ne. MPI_SUCCESS) then
++     call MPI_ERROR_STRING(mpierr,mpierr_string, mpi_string_length, mpierr2)
++     write(error_unit,*) "MPI ERROR occured during mpi_comm_split for col communicator: ", trim(mpierr_string)
++     stop 1
++   endif
++
++   call e%set("mpi_comm_parent", int(MPI_COMM_WORLD,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%set("mpi_comm_rows", int(mpi_comm_rows,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%set("mpi_comm_cols", int(mpi_comm_cols,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++
++#else
++   call e%set("mpi_comm_parent", int(MPI_COMM_WORLD,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%set("process_row", int(my_prow,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%set("process_col", int(my_pcol,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++   call e%set("verbose", 1, error_elpa)
++   assert_elpa_ok(error_elpa)
++#endif
++#endif
++#ifdef TEST_GENERALIZED_EIGENPROBLEM
++   call e%set("blacs_context", int(my_blacs_ctxt,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++#endif
++   call e%set("timings", 1_ik, error_elpa)
++   assert_elpa_ok(e%setup())
++
++#ifdef TEST_SOLVER_1STAGE
++   call e%set("solver", ELPA_SOLVER_1STAGE, error_elpa)
++#else
++   call e%set("solver", ELPA_SOLVER_2STAGE, error_elpa)
++#endif
++   assert_elpa_ok(error_elpa)
++
++#if TEST_NVIDIA_GPU == 1
++   call e%set("nvidia-gpu", TEST_GPU, error_elpa)
++   assert_elpa_ok(error_elpa)
++#endif
++
++#if TEST_AMD_GPU == 1
++   call e%set("amd-gpu", TEST_GPU, error_elpa)
++   assert_elpa_ok(error_elpa)
++#endif
++
++#if TEST_INTEL_GPU == 1
++   call e%set("intel-gpu", TEST_GPU, error_elpa)
++   assert_elpa_ok(error_elpa)
++#endif
++
++#if (TEST_GPU_SET_ID == 1) && (TEST_INTEL_GPU == 0)
++   ! simple test
++   ! Can (and should) fail often
++   !gpuID = mod(myid,2)
++   gpuID = mod(myid,1)
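++   ! mod(myid,1) is always 0, so every rank selects GPU device 0 here;
++   ! the commented variant above would alternate between two devices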
++   call e%set("use_gpu_id", int(gpuID,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++#endif
++
++#if TEST_GPU_DEVICE_POINTER_API == 1
++#if defined(TEST_EIGENVECTORS) && defined(TEST_MATRIX_RANDOM)
++   ! create device pointers for a, q, ev; copy a to the device
++#if TEST_NVIDIA_GPU == 1
++   if (gpu_vendor(NVIDIA_GPU) == NVIDIA_GPU) then
++     call set_gpu_parameters()
++   endif
++#endif
++#if TEST_AMD_GPU == 1
++   if (gpu_vendor(AMD_GPU) == AMD_GPU) then
++     call set_gpu_parameters()
++   endif
++#endif
++
++   ! set device
++   success = .true.
++#if TEST_NVIDIA_GPU == 1
++   success = cuda_setdevice(gpuID)
++#endif
++#if TEST_AMD_GPU == 1
++   success = cuda_setdevice(gpuID)
++#endif
++   if (.not.(success)) then
++     print *,"Cannot set GPU device. Aborting..."
++     stop
++   endif
++
++   ! malloc
++   successGPU = gpu_malloc(a_dev, na_rows*na_cols*size_of_datatype)
++   if (.not.(successGPU)) then
++     print *,"Cannot allocate matrix a on GPU! Aborting..."
++     stop
++   endif
++   successGPU = gpu_malloc(q_dev, na_rows*na_cols*size_of_datatype)
++   if (.not.(successGPU)) then
++     print *,"Cannot allocate matrix q on GPU! Aborting..."
++     stop
++   endif
++   successGPU = gpu_malloc(ev_dev, na*size_of_datatype)
++   if (.not.(successGPU)) then
++     print *,"Cannot allocate vector of eigenvalues on GPU! Aborting..."
++     stop
++   endif
++
++   successGPU = gpu_memcpy(a_dev, c_loc(a), na_rows*na_cols*size_of_datatype, &
++                           gpuMemcpyHostToDevice)
++   if (.not.(successGPU)) then
++     print *,"Cannot copy matrix a to GPU! Aborting..."
++     stop
++   endif
++#endif
++#endif /* TEST_GPU_DEVICE_POINTER_API */
++
++#if TEST_QR_DECOMPOSITION == 1
++   call e%set("qr", 1_ik, error_elpa)
++   assert_elpa_ok(error_elpa)
++#endif
++
++#ifdef WITH_OPENMP_TRADITIONAL
++   max_threads=omp_get_max_threads()
++   call e%set("omp_threads", int(max_threads,kind=c_int), error_elpa)
++   assert_elpa_ok(error_elpa)
++#endif
++
++   if (myid == 0) print *, ""
++
++#ifdef TEST_ALL_KERNELS
++   do i = 0, elpa_option_cardinality(KERNEL_KEY)  ! kernels
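++     ! iterate over all kernel ids known to this ELPA build; ids that
++     ! cannot be set (e.g. not compiled in) are skipped further below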
++#if (TEST_NVIDIA_GPU == 0) && (TEST_INTEL_GPU == 0) && (TEST_AMD_GPU == 0)
++     !if (TEST_GPU .eq. 0) then
++       kernel = elpa_option_enumerate(KERNEL_KEY, int(i,kind=c_int))
++       if (kernel .eq. ELPA_2STAGE_REAL_NVIDIA_GPU) cycle
++       if (kernel .eq. ELPA_2STAGE_COMPLEX_NVIDIA_GPU) cycle
++       if (kernel .eq. ELPA_2STAGE_REAL_AMD_GPU) cycle
++       if (kernel .eq. ELPA_2STAGE_COMPLEX_AMD_GPU) cycle
++       if (kernel .eq. ELPA_2STAGE_REAL_INTEL_GPU) cycle
++       if (kernel .eq. ELPA_2STAGE_COMPLEX_INTEL_GPU) cycle
++     !endif
++#endif
++#endif
++
++#ifdef TEST_KERNEL
++     kernel = TEST_KERNEL
++#endif
++
++#ifdef TEST_SOLVER_2STAGE
++#if TEST_GPU == 1
++#if defined TEST_REAL
++#if (TEST_NVIDIA_GPU == 1)
++#if WITH_NVIDIA_GPU_SM80_COMPUTE_CAPABILITY == 1
++     kernel = ELPA_2STAGE_REAL_NVIDIA_SM80_GPU
++#else
++     kernel = ELPA_2STAGE_REAL_NVIDIA_GPU
++#endif
++#endif /* TEST_NVIDIA_GPU */
++#if (TEST_AMD_GPU == 1)
++     kernel = ELPA_2STAGE_REAL_AMD_GPU
++#endif
++#if (TEST_INTEL_GPU == 1)
++     kernel = ELPA_2STAGE_REAL_INTEL_GPU
++#endif
++#endif /* TEST_REAL */
++
++#if defined TEST_COMPLEX
++#if (TEST_NVIDIA_GPU == 1)
++     kernel = ELPA_2STAGE_COMPLEX_NVIDIA_GPU
++#endif
++#if (TEST_AMD_GPU == 1)
++     kernel = ELPA_2STAGE_COMPLEX_AMD_GPU
++#endif
++#if (TEST_INTEL_GPU == 1)
++     kernel = ELPA_2STAGE_COMPLEX_INTEL_GPU
++#endif
++#endif /* TEST_COMPLEX */
++#endif /* TEST_GPU == 1 */
++
++
++     call e%set(KERNEL_KEY, kernel, error_elpa)
++#ifdef TEST_KERNEL
++     assert_elpa_ok(error_elpa)
++#else
++     if (error_elpa /= ELPA_OK) then
++       cycle
++     endif
++     ! actually used kernel might be different if forced via environment variables
++     call e%get(KERNEL_KEY, kernel, error_elpa)
++     assert_elpa_ok(error_elpa)
++#endif
++     if (myid == 0) then
++       print *, elpa_int_value_to_string(KERNEL_KEY, kernel) // " kernel"
++     endif
++#endif
++
++#if !defined(TEST_ALL_LAYOUTS)
++! print all parameters
++     call e%print_settings(error_elpa)
++     assert_elpa_ok(error_elpa)
++#endif
++
++#ifdef TEST_ALL_KERNELS
++     call e%timer_start(elpa_int_value_to_string(KERNEL_KEY, kernel))
++#endif
++
++     ! The actual solve step
++#if defined(TEST_EIGENVECTORS)
++#if TEST_QR_DECOMPOSITION == 1
++     call e%timer_start("e%eigenvectors_qr()")
++#else
++     call e%timer_start("e%eigenvectors()")
++#endif
++#ifdef TEST_SCALAPACK_ALL
++     call solve_scalapack_all(na, a, sc_desc, ev, z)
++#elif TEST_SCALAPACK_PART
++     call solve_scalapack_part(na, a, sc_desc, nev, ev, z)
++     check_all_evals = .false. ! only nev eigenpairs are computed, so not all eigenvalues can be checked
++#else /* TEST_SCALAPACK_PART */
++#ifdef TEST_EXPLICIT_NAME
++#if defined(TEST_REAL)
++#if defined(TEST_DOUBLE)
++#if (TEST_GPU_DEVICE_POINTER_API == 1) && defined(TEST_MATRIX_RANDOM) && defined(TEST_EIGENVECTORS)
++     call e%eigenvectors_double(a_dev, ev_dev, q_dev, error_elpa)
++#else
++     call e%eigenvectors_double(a, ev, z, error_elpa)
++#endif
++#endif /* TEST_DOUBLE */
++#if defined(TEST_SINGLE)
++#if (TEST_GPU_DEVICE_POINTER_API == 1) && defined(TEST_MATRIX_RANDOM) && defined(TEST_EIGENVECTORS)
++     call e%eigenvectors_float(a_dev, ev_dev, q_dev, error_elpa)
++#else
++     call e%eigenvectors_float(a, ev, z, error_elpa)
++#endif
++#endif /* TEST_SINGLE */
++#endif /* TEST_REAL */
++#if defined(TEST_COMPLEX)
++#if defined(TEST_DOUBLE)
++#if (TEST_GPU_DEVICE_POINTER_API == 1) && defined(TEST_MATRIX_RANDOM) && defined(TEST_EIGENVECTORS)
++     call e%eigenvectors_double_complex(a_dev, ev_dev, q_dev, error_elpa)
++#else
++     call e%eigenvectors_double_complex(a, ev, z, error_elpa)
++#endif
++#endif /* TEST_DOUBLE */
++#if defined(TEST_SINGLE)
++#if (TEST_GPU_DEVICE_POINTER_API == 1) && defined(TEST_MATRIX_RANDOM) && defined(TEST_EIGENVECTORS)
++     call e%eigenvectors_float_complex(a_dev, ev_dev, q_dev, error_elpa)
++#else
++     call e%eigenvectors_float_complex(a, ev, z, error_elpa)
++#endif
++#endif /* TEST_SINGLE */
++#endif /* TEST_COMPLEX */
++#else /* TEST_EXPLICIT_NAME */
++     call e%eigenvectors(a, ev, z, error_elpa)
++#endif /* TEST_EXPLICIT_NAME */
++#endif /* TEST_SCALAPACK_PART */
++#if TEST_QR_DECOMPOSITION == 1
++     call e%timer_stop("e%eigenvectors_qr()")
++#else
++     call e%timer_stop("e%eigenvectors()")
++#endif
++#endif /* TEST_EIGENVECTORS  */
++
++#ifdef TEST_EIGENVALUES
++     call e%timer_start("e%eigenvalues()")
++#ifdef TEST_EXPLICIT_NAME
++#if defined(TEST_REAL)
++#if defined(TEST_DOUBLE)
++     call e%eigenvalues_double(a, ev, error_elpa)
++#endif
++#if defined(TEST_SINGLE)
++     call e%eigenvalues_float(a, ev, error_elpa)
++#endif
++#endif /* TEST_REAL */
++#if defined(TEST_COMPLEX)
++#if defined(TEST_DOUBLE)
++     call e%eigenvalues_double_complex(a, ev, error_elpa)
++#endif
++#if defined(TEST_SINGLE)
++     call e%eigenvalues_float_complex(a, ev, error_elpa)
++#endif
++#endif
++#else /* TEST_EXPLICIT_NAME */
++     call e%eigenvalues(a, ev, error_elpa)
++#endif /* TEST_EXPLICIT_NAME */
++     call e%timer_stop("e%eigenvalues()")
++#endif
++
++#if defined(TEST_SOLVE_TRIDIAGONAL)
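++     ! solve_tridiagonal overwrites d with the eigenvalues (copied to ev
++     ! below) and z with the eigenvectors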
++     call e%timer_start("e%solve_tridiagonal()")
++     call e%solve_tridiagonal(d, sd, z, error_elpa)
++     call e%timer_stop("e%solve_tridiagonal()")
++     ev(:) = d(:)
++#endif
++
++#if defined(TEST_CHOLESKY)
++     call e%timer_start("e%cholesky()")
++     call e%cholesky(a, error_elpa)
++     assert_elpa_ok(error_elpa)
++     call e%timer_stop("e%cholesky()")
++#endif
++
++#if defined(TEST_HERMITIAN_MULTIPLY)
++     call e%timer_start("e%hermitian_multiply()")
++     call e%hermitian_multiply('F','F', int(na,kind=c_int), a, b, int(na_rows,kind=c_int), &
++                               int(na_cols,kind=c_int), c, int(na_rows,kind=c_int),        &
++                               int(na_cols,kind=c_int), error_elpa)
++     call e%timer_stop("e%hermitian_multiply()")
++#endif
++
++#if defined(TEST_GENERALIZED_EIGENPROBLEM)
++     call e%timer_start("e%generalized_eigenvectors()")
++#if defined(TEST_GENERALIZED_DECOMP_EIGENPROBLEM)
++     call e%timer_start("is_already_decomposed=.false.")
++#endif
++     call e%generalized_eigenvectors(a, b, ev, z, .false., error_elpa)
++#if defined(TEST_GENERALIZED_DECOMP_EIGENPROBLEM)
++     call e%timer_stop("is_already_decomposed=.false.")
++     a = as
++     call e%timer_start("is_already_decomposed=.true.")
++     call e%generalized_eigenvectors(a, b, ev, z, .true., error_elpa)
++     call e%timer_stop("is_already_decomposed=.true.")
++#endif
++     call e%timer_stop("e%generalized_eigenvectors()")
++#endif
++
++     assert_elpa_ok(error_elpa)
++
++#ifdef TEST_ALL_KERNELS
++     call e%timer_stop(elpa_int_value_to_string(KERNEL_KEY, kernel))
++#endif
++
++     if (myid .eq. 0) then
++#ifdef TEST_ALL_KERNELS
++       call e%print_times(elpa_int_value_to_string(KERNEL_KEY, kernel))
++#else /* TEST_ALL_KERNELS */
++
++#if defined(TEST_EIGENVECTORS)
++#if TEST_QR_DECOMPOSITION == 1
++       call e%print_times("e%eigenvectors_qr()")
++#else
++       call e%print_times("e%eigenvectors()")
++#endif
++#endif
++#ifdef TEST_EIGENVALUES
++       call e%print_times("e%eigenvalues()")
++#endif
++#ifdef TEST_SOLVE_TRIDIAGONAL
++       call e%print_times("e%solve_tridiagonal()")
++#endif
++#ifdef TEST_CHOLESKY
++       call e%print_times("e%cholesky()")
++#endif
++#ifdef TEST_HERMITIAN_MULTIPLY
++       call e%print_times("e%hermitian_multiply()")
++#endif
++#ifdef TEST_GENERALIZED_EIGENPROBLEM
++      call e%print_times("e%generalized_eigenvectors()")
++#endif
++#endif /* TEST_ALL_KERNELS */
++     endif
++
++
++
++
++#if TEST_GPU_DEVICE_POINTER_API == 1
++#if defined(TEST_EIGENVECTORS) && defined(TEST_MATRIX_RANDOM)
++   ! copy for testing
++   successGPU = gpu_memcpy(c_loc(z), q_dev, na_rows*na_cols*size_of_datatype, &
++                           gpuMemcpyDeviceToHost)
++   if (.not.(successGPU)) then
++     print *,"cannot copy matrix of eigenvectors from GPU to host! Aborting..."
++     stop
++   endif
++
++   successGPU = gpu_memcpy(c_loc(ev), ev_dev, na*&
++#ifdef TEST_DOUBLE
++           size_of_double_real, &
++#endif
++#ifdef TEST_SINGLE
++           size_of_single_real, &
++#endif
++                           gpuMemcpyDeviceToHost)
++   if (.not.(successGPU)) then
++     print *,"cannot copy vector of eigenvalues from GPU to host! Aborting..."
++     stop
++   endif
++
++   ! and deallocate device pointer
++   successGPU = gpu_free(a_dev)
++   if (.not.(successGPU)) then
++     print *,"cannot free memory of a_dev on GPU. Aborting..."
++     stop
++   endif
++   successGPU = gpu_free(q_dev)
++   if (.not.(successGPU)) then
++     print *,"cannot free memory of q_dev on GPU. Aborting..."
++     stop
++   endif
++   successGPU = gpu_free(ev_dev)
++   if (.not.(successGPU)) then
++     print *,"cannot free memory of ev_dev on GPU. Aborting..."
++     stop
++   endif
++#endif
++#endif
++
++
++     if (do_test_analytic_eigenvalues) then
++       status = check_correctness_analytic(na, nev, ev, z, nblk, myid, np_rows, np_cols, &
++                                           my_prow, my_pcol, check_all_evals, .false.)
++       call check_status(status, myid)
++     endif
++
++     if (do_test_analytic_eigenvalues_eigenvectors) then
++       status = check_correctness_analytic(na, nev, ev, z, nblk, myid, np_rows, np_cols, &
++                                           my_prow, my_pcol, check_all_evals, .true.)
++       call check_status(status, myid)
++     endif
++
++     if(do_test_numeric_residual) then
++       status = check_correctness_evp_numeric_residuals(na, nev, as, z, ev, sc_desc, nblk, myid, &
++                                                        np_rows,np_cols, my_prow, my_pcol)
++       call check_status(status, myid)
++     endif
++
++     if (do_test_frank_eigenvalues) then
++       status = check_correctness_eigenvalues_frank(na, ev, z, myid)
++       call check_status(status, myid)
++     endif
++
++     if (do_test_toeplitz_eigenvalues) then
++#if defined(TEST_EIGENVALUES) || defined(TEST_SOLVE_TRIDIAGONAL)
++       status = check_correctness_eigenvalues_toeplitz(na, diagonalElement, &
++                                                       subdiagonalElement, ev, z, myid)
++       call check_status(status, myid)
++#endif
++     endif
++
++     if (do_test_cholesky) then
++       status = check_correctness_cholesky(na, a, as, na_rows, sc_desc, myid )
++       call check_status(status, myid)
++     endif
++
++#ifdef TEST_HERMITIAN_MULTIPLY
++     if (do_test_hermitian_multiply) then
++       status = check_correctness_hermitian_multiply(na, a, b, c, na_rows, sc_desc, myid )
++       call check_status(status, myid)
++     endif
++#endif
++
++#ifdef TEST_GENERALIZED_EIGENPROBLEM
++     if(do_test_numeric_residual_generalized) then
++       status = check_correctness_evp_numeric_residuals(na, nev, as, z, ev, sc_desc, nblk, myid, &
++                                                        np_rows, np_cols, my_prow, my_pcol, bs)
++       call check_status(status, myid)
++     endif
++#endif
++
++
++#ifdef WITH_OPENMP_TRADITIONAL
++     if (threads_caller .ne. omp_get_max_threads()) then
++       if (myid .eq. 0) then
++         print *, " ERROR! the number of OpenMP threads has not been restored correctly"
++       endif
++       status = 1
++     endif
++#endif
++     if (myid == 0) then
++       print *, ""
++     endif
++
++#ifdef TEST_ALL_KERNELS
++     a(:,:) = as(:,:)
++#if defined(TEST_MATRIX_TOEPLITZ) || defined(TEST_MATRIX_FRANK)
++     d = ds
++     sd = sds
++#endif
++   end do ! kernels
++#endif
++
++   call elpa_deallocate(e, error_elpa)
++   assert_elpa_ok(error_elpa)
++
++   deallocate(a)
++   deallocate(as)
++   deallocate(z)
++   deallocate(ev)
++
++#ifdef TEST_HERMITIAN_MULTIPLY
++   deallocate(b)
++   deallocate(c)
++#endif
++#if defined(TEST_MATRIX_TOEPLITZ) || defined(TEST_MATRIX_FRANK)
++   deallocate(d, ds)
++   deallocate(sd, sds)
++#endif
++#if defined(TEST_GENERALIZED_EIGENPROBLEM)
++  deallocate(b, bs)
++#endif
++
++#ifdef TEST_ALL_LAYOUTS
++   end do ! factors
++   end do ! layouts
++#endif
++   call elpa_uninit(error_elpa)
++   assert_elpa_ok(error_elpa)
++
++#ifdef WITH_MPI
++   call blacs_gridexit(my_blacs_ctxt)
++   call mpi_finalize(mpierr)
++#endif
++
++   call exit(status)
++
++   contains
++
++     subroutine check_status(status, myid)
++       implicit none
++       TEST_INT_TYPE, intent(in) :: status, myid
++       TEST_INT_MPI_TYPE         :: mpierr
++       if (status /= 0) then
++         if (myid == 0) print *, "Result incorrect!"
++#ifdef WITH_MPI
++         call mpi_finalize(mpierr)
++#endif
++         call exit(status)
++       endif
++     end subroutine
++
++end program
+diff -ruN elpa-new_release_2021.11.001/examples/Fortran/test_multiple_objs.F90 elpa-new_release_2021.11.001_ok/examples/Fortran/test_multiple_objs.F90
+--- elpa-new_release_2021.11.001/examples/Fortran/test_multiple_objs.F90	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/Fortran/test_multiple_objs.F90	2022-01-28 09:48:29.071140954 +0100
+@@ -0,0 +1,411 @@
++!    This file is part of ELPA.
++!
++!    The ELPA library was originally created by the ELPA consortium,
++!    consisting of the following organizations:
++!
++!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
++!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
++!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
++!      Informatik,
++!    - Technische Universität München, Lehrstuhl für Informatik mit
++!      Schwerpunkt Wissenschaftliches Rechnen ,
++!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
++!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
++!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
++!      and
++!    - IBM Deutschland GmbH
++!
++!
++!    More information can be found here:
++!    http://elpa.mpcdf.mpg.de/
++!
++!    ELPA is free software: you can redistribute it and/or modify
++!    it under the terms of the version 3 of the license of the
++!    GNU Lesser General Public License as published by the Free
++!    Software Foundation.
++!
++!    ELPA is distributed in the hope that it will be useful,
++!    but WITHOUT ANY WARRANTY; without even the implied warranty of
++!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++!    GNU Lesser General Public License for more details.
++!
++!    You should have received a copy of the GNU Lesser General Public License
++!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
++!
++!    ELPA reflects a substantial effort on the part of the original
++!    ELPA consortium, and we ask you to respect the spirit of the
++!    license that we chose: i.e., please contribute any changes you
++!    may have back to the original ELPA library distribution, and keep
++!    any derivatives of ELPA under the same license that we chose for
++!    the original distribution, the GNU Lesser General Public License.
++!
++!
++#include "config-f90.h"
++
++! Define one of TEST_REAL or TEST_COMPLEX
++! Define one of TEST_SINGLE or TEST_DOUBLE
++! Define one of TEST_SOLVER_1STAGE or TEST_SOLVER_2STAGE
++! Define TEST_NVIDIA_GPU \in [0, 1]
++! Define TEST_INTEL_GPU \in [0, 1]
++! Define TEST_AMD_GPU \in [0, 1]
++! Define either TEST_ALL_KERNELS or a TEST_KERNEL \in [any valid kernel]
++
++#if !(defined(TEST_REAL) ^ defined(TEST_COMPLEX))
++error: define exactly one of TEST_REAL or TEST_COMPLEX
++#endif
++
++#if !(defined(TEST_SINGLE) ^ defined(TEST_DOUBLE))
++error: define exactly one of TEST_SINGLE or TEST_DOUBLE
++#endif
++
++#ifdef TEST_SINGLE
++#  define EV_TYPE real(kind=C_FLOAT)
++#  ifdef TEST_REAL
++#    define MATRIX_TYPE real(kind=C_FLOAT)
++#  else
++#    define MATRIX_TYPE complex(kind=C_FLOAT_COMPLEX)
++#  endif
++#else
++#  define EV_TYPE real(kind=C_DOUBLE)
++#  ifdef TEST_REAL
++#    define MATRIX_TYPE real(kind=C_DOUBLE)
++#  else
++#    define MATRIX_TYPE complex(kind=C_DOUBLE_COMPLEX)
++#  endif
++#endif
++
++
++#ifdef TEST_REAL
++#  define AUTOTUNE_DOMAIN ELPA_AUTOTUNE_DOMAIN_REAL
++#else
++#  define AUTOTUNE_DOMAIN ELPA_AUTOTUNE_DOMAIN_COMPLEX
++#endif
++
++#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT
++#define TEST_INT_TYPE integer(kind=c_int64_t)
++#define INT_TYPE c_int64_t
++#else
++#define TEST_INT_TYPE integer(kind=c_int32_t)
++#define INT_TYPE c_int32_t
++#endif
++
++#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT
++#define TEST_INT_MPI_TYPE integer(kind=c_int64_t)
++#define INT_MPI_TYPE c_int64_t
++#else
++#define TEST_INT_MPI_TYPE integer(kind=c_int32_t)
++#define INT_MPI_TYPE c_int32_t
++#endif
++
++#define TEST_GPU 0
++#if (TEST_NVIDIA_GPU == 1) || (TEST_AMD_GPU == 1) || (TEST_INTEL_GPU == 1)
++#undef TEST_GPU
++#define TEST_GPU 1
++#endif
++
++#include "assert.h"
++
++program test
++   use elpa
++
++   !use test_util
++   use test_setup_mpi
++   use test_prepare_matrix
++   use test_read_input_parameters
++   use test_blacs_infrastructure
++   use test_check_correctness
++   use test_analytic
++   use iso_fortran_env
++
++#ifdef HAVE_REDIRECT
++   use test_redirect
++#endif
++   implicit none
++
++   ! matrix dimensions
++   TEST_INT_TYPE                     :: na, nev, nblk
++
++   ! mpi
++   TEST_INT_TYPE                     :: myid, nprocs
++   TEST_INT_TYPE                     :: na_cols, na_rows  ! local matrix size
++   TEST_INT_TYPE                     :: np_cols, np_rows  ! number of MPI processes per column/row
++   TEST_INT_TYPE                     :: my_prow, my_pcol  ! local MPI task position (my_prow, my_pcol) in the grid (0..np_rows -1, 0..np_cols -1)
++   TEST_INT_TYPE                     :: ierr
++   TEST_INT_MPI_TYPE                 :: mpierr
++   ! blacs
++   character(len=1)                  :: layout
++   TEST_INT_TYPE                     :: my_blacs_ctxt, sc_desc(9), info, nprow, npcol
++
++   ! The Matrix
++   MATRIX_TYPE, allocatable    :: a(:,:), as(:,:)
++   ! eigenvectors
++   MATRIX_TYPE, allocatable    :: z(:,:)
++   ! eigenvalues
++   EV_TYPE, allocatable        :: ev(:)
++
++   TEST_INT_TYPE               :: status
++   integer(kind=c_int)         :: error_elpa
++
++   type(output_t)              :: write_to_file
++   class(elpa_t), pointer      :: e1, e2, e_ptr
++   class(elpa_autotune_t), pointer :: tune_state
++
++   TEST_INT_TYPE                     :: iter
++   character(len=5)            :: iter_string
++   integer(kind=c_int)         :: timings, debug, gpu
++
++   call read_input_parameters(na, nev, nblk, write_to_file)
++!   call setup_mpi(myid, nprocs)
++    call MPI_INIT_THREAD(MPI_THREAD_SERIALIZED,info, mpierr)
++    call MPI_COMM_SIZE(MPI_COMM_WORLD,nprocs, mpierr)
++    call MPI_COMM_RANK(MPI_COMM_WORLD,myid, mpierr)
++#ifdef HAVE_REDIRECT
++#ifdef WITH_MPI
++   call MPI_BARRIER(MPI_COMM_WORLD, mpierr)
++   call redirect_stdout(myid)
++#endif
++#endif
++
++   if (elpa_init(CURRENT_API_VERSION) /= ELPA_OK) then
++     print *, "ELPA API version not supported"
++     stop 1
++   endif
++
++   layout = 'C'
++   do np_cols = NINT(SQRT(REAL(nprocs))),2,-1
++      if(mod(nprocs,np_cols) == 0 ) exit
++   enddo
++   np_rows = nprocs/np_cols
++   assert(nprocs == np_rows * np_cols)
++
++   if (myid == 0) then
++     print '((a,i0))', 'Matrix size: ', na
++     print '((a,i0))', 'Num eigenvectors: ', nev
++     print '((a,i0))', 'Blocksize: ', nblk
++#ifdef WITH_MPI
++     print '((a,i0))', 'Num MPI proc: ', nprocs
++     print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs
++     print '(a)',      'Process layout: ' // layout
++#endif
++     print *,''
++   endif
++
++   call set_up_blacsgrid(int(mpi_comm_world,kind=BLAS_KIND), np_rows, np_cols, layout, &
++                         my_blacs_ctxt, my_prow, my_pcol)
++
++   call set_up_blacs_descriptor(na, nblk, my_prow, my_pcol, np_rows, np_cols, &
++                                na_rows, na_cols, sc_desc, my_blacs_ctxt, info)
++
++   allocate(a (na_rows,na_cols))
++   allocate(as(na_rows,na_cols))
++   allocate(z (na_rows,na_cols))
++   allocate(ev(na))
++
++   a(:,:) = 0.0
++   z(:,:) = 0.0
++   ev(:) = 0.0
++
++   call prepare_matrix_analytic(na, a, nblk, myid, np_rows, np_cols, my_prow, my_pcol, print_times=.false.)
++   as(:,:) = a(:,:)
++
++   e1 => elpa_allocate(error_elpa)
++   !assert_elpa_ok(error_elpa)
++
++   call set_basic_params(e1, na, nev, na_rows, na_cols, my_prow, my_pcol)
++
++   call e1%set("timings",1, error_elpa)
++   assert_elpa_ok(error_elpa)
++
++   call e1%set("debug",1, error_elpa)
++   assert_elpa_ok(error_elpa)
++
++#if TEST_NVIDIA_GPU == 1 || (TEST_NVIDIA_GPU == 0) && (TEST_AMD_GPU == 0) && (TEST_INTEL_GPU == 0)
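++   ! "nvidia-gpu" is set both when an NVIDIA GPU is requested and in the
++   ! pure CPU case, where TEST_GPU is 0 and the option is explicitly disabled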
++   call e1%set("nvidia-gpu", TEST_GPU, error_elpa)
++   assert_elpa_ok(error_elpa)
++#endif
++
++#if TEST_AMD_GPU == 1
++   call e1%set("amd-gpu", TEST_GPU, error_elpa)
++   assert_elpa_ok(error_elpa)
++#endif
++
++#if TEST_INTEL_GPU == 1
++   call e1%set("intel-gpu", TEST_GPU, error_elpa)
++   assert_elpa_ok(error_elpa)
++#endif
++   !call e1%set("max_stored_rows", 15, error_elpa)
++
++   assert_elpa_ok(e1%setup())
++
++   call e1%store_settings("initial_parameters.txt", error_elpa)
++   assert_elpa_ok(error_elpa)
++
++#ifdef WITH_MPI
++     ! barrier after store settings, file created from one MPI rank only, but loaded everywhere
++     call MPI_BARRIER(MPI_COMM_WORLD, mpierr)
++#endif
++
++   ! try to load parameters into another object
++   e2 => elpa_allocate(error_elpa)
++   assert_elpa_ok(error_elpa)
++
++   call set_basic_params(e2, na, nev, na_rows, na_cols, my_prow, my_pcol)
++   call e2%load_settings("initial_parameters.txt", error_elpa)
++   assert_elpa_ok(error_elpa)
++
++   assert_elpa_ok(e2%setup())
++
++   call e2%get("timings", timings, error_elpa)
++   assert_elpa_ok(error_elpa)
++
++   call e2%get("debug", debug, error_elpa)
++   assert_elpa_ok(error_elpa)
++#if TEST_NVIDIA_GPU == 1 || (TEST_NVIDIA_GPU == 0) && (TEST_AMD_GPU == 0) && (TEST_INTEL_GPU == 0)
++   call e2%get("nvidia-gpu", gpu, error_elpa)
++   assert_elpa_ok(error_elpa)
++#endif
++#if TEST_AMD_GPU == 1
++   call e2%get("amd-gpu", gpu, error_elpa)
++   assert_elpa_ok(error_elpa)
++#endif
++#if TEST_INTEL_GPU == 1
++   call e2%get("intel-gpu", gpu, error_elpa)
++   assert_elpa_ok(error_elpa)
++#endif
++
++   if ((timings .ne. 1) .or. (debug .ne. 1) .or. (gpu .ne. 0)) then
++     print *, "Parameters not stored or loaded correctly. Aborting...", timings, debug, gpu
++     stop 1
++   endif
++
++   if(myid == 0) print *, "parameters of e1"
++   call e1%print_settings(error_elpa)
++   assert_elpa_ok(error_elpa)
++
++   if(myid == 0) print *, ""
++   if(myid == 0) print *, "parameters of e2"
++   call e2%print_settings(error_elpa)
++   assert_elpa_ok(error_elpa)
++
++   e_ptr => e2
++
++
++   tune_state => e_ptr%autotune_setup(ELPA_AUTOTUNE_FAST, AUTOTUNE_DOMAIN, error_elpa)
++   assert_elpa_ok(error_elpa)
++
++
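++   ! autotuning loop: autotune_step() returns .true. while there are still
++   ! parameter combinations to try; each pass solves the same problem with
++   ! the next candidate settings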
++   iter=0
++   do while (e_ptr%autotune_step(tune_state, error_elpa))
++     assert_elpa_ok(error_elpa)
++ 
++     iter=iter+1
++     write(iter_string,'(I5.5)') iter
++     call e_ptr%print_settings(error_elpa)
++     assert_elpa_ok(error_elpa)
++
++     call e_ptr%store_settings("saved_parameters_"//trim(iter_string)//".txt", error_elpa)
++     assert_elpa_ok(error_elpa)
++
++     call e_ptr%timer_start("eigenvectors: iteration "//trim(iter_string))
++     call e_ptr%eigenvectors(a, ev, z, error_elpa)
++     assert_elpa_ok(error_elpa)
++     call e_ptr%timer_stop("eigenvectors: iteration "//trim(iter_string))
++
++     assert_elpa_ok(error_elpa)
++     if (myid .eq. 0) then
++       print *, ""
++       call e_ptr%print_times("eigenvectors: iteration "//trim(iter_string))
++     endif
++     status = check_correctness_analytic(na, nev, ev, z, nblk, myid, np_rows, np_cols, my_prow, my_pcol, &
++                                         .true., .true., print_times=.false.)
++     a(:,:) = as(:,:)
++     call e_ptr%autotune_print_state(tune_state, error_elpa)
++     assert_elpa_ok(error_elpa)
++
++     call e_ptr%autotune_save_state(tune_state, "saved_state_"//trim(iter_string)//".txt", error_elpa)
++     assert_elpa_ok(error_elpa)
++#ifdef WITH_MPI
++     ! barrier after save state, file created from one MPI rank only, but loaded everywhere
++     call MPI_BARRIER(MPI_COMM_WORLD, mpierr)
++#endif
++     call e_ptr%autotune_load_state(tune_state, "saved_state_"//trim(iter_string)//".txt", error_elpa)
++     assert_elpa_ok(error_elpa)
++
++   end do
++
++   ! set and print the autotuned-settings
++   call e_ptr%autotune_set_best(tune_state, error_elpa)
++   assert_elpa_ok(error_elpa)
++
++   if (myid .eq. 0) then
++     print *, "The best combination found by the autotuning:"
++     flush(output_unit)
++     call e_ptr%autotune_print_best(tune_state, error_elpa)
++     assert_elpa_ok(error_elpa)
++   endif
++   ! de-allocate autotune object
++   call elpa_autotune_deallocate(tune_state, error_elpa)
++   assert_elpa_ok(error_elpa)
++
++   if (myid .eq. 0) then
++     print *, "Running once more time with the best found setting..."
++   endif
++   call e_ptr%timer_start("eigenvectors: best setting")
++   call e_ptr%eigenvectors(a, ev, z, error_elpa)
++   assert_elpa_ok(error_elpa)
++
++   call e_ptr%timer_stop("eigenvectors: best setting")
++   assert_elpa_ok(error_elpa)
++   if (myid .eq. 0) then
++     print *, ""
++     call e_ptr%print_times("eigenvectors: best setting")
++   endif
++   status = check_correctness_analytic(na, nev, ev, z, nblk, myid, np_rows, np_cols, my_prow, my_pcol, &
++                                       .true., .true., print_times=.false.)
++
++   call elpa_deallocate(e_ptr, error_elpa)
++   !assert_elpa_ok(error_elpa)
++
++   deallocate(a)
++   deallocate(as)
++   deallocate(z)
++   deallocate(ev)
++
++   call elpa_uninit(error_elpa)
++   !assert_elpa_ok(error_elpa)
++
++#ifdef WITH_MPI
++   call blacs_gridexit(my_blacs_ctxt)
++   call mpi_finalize(mpierr)
++#endif
++
++   call exit(status)
++
++contains
++   subroutine set_basic_params(elpa, na, nev, na_rows, na_cols, my_prow, my_pcol)
++     implicit none
++     class(elpa_t), pointer      :: elpa
++     TEST_INT_TYPE, intent(in)   :: na, nev, na_rows, na_cols, my_prow, my_pcol
++
++     call elpa%set("na", int(na,kind=c_int), error_elpa)
++     assert_elpa_ok(error_elpa)
++     call elpa%set("nev", int(nev,kind=c_int), error_elpa)
++     assert_elpa_ok(error_elpa)
++     call elpa%set("local_nrows", int(na_rows,kind=c_int), error_elpa)
++     assert_elpa_ok(error_elpa)
++     call elpa%set("local_ncols", int(na_cols,kind=c_int), error_elpa)
++     assert_elpa_ok(error_elpa)
++     call elpa%set("nblk", int(nblk,kind=c_int), error_elpa)
++     assert_elpa_ok(error_elpa)
++
++#ifdef WITH_MPI
++     call elpa%set("mpi_comm_parent", int(MPI_COMM_WORLD,kind=c_int), error_elpa)
++     assert_elpa_ok(error_elpa)
++     call elpa%set("process_row", int(my_prow,kind=c_int), error_elpa)
++     assert_elpa_ok(error_elpa)
++     call elpa%set("process_col", int(my_pcol,kind=c_int), error_elpa)
++     assert_elpa_ok(error_elpa)
++#endif
++   end subroutine
++
++end program
+diff -ruN elpa-new_release_2021.11.001/examples/Fortran/test_skewsymmetric.F90 elpa-new_release_2021.11.001_ok/examples/Fortran/test_skewsymmetric.F90
+--- elpa-new_release_2021.11.001/examples/Fortran/test_skewsymmetric.F90	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/Fortran/test_skewsymmetric.F90	2022-01-28 09:50:11.240867016 +0100
+@@ -0,0 +1,431 @@
++!    This file is part of ELPA.
++!
++!    The ELPA library was originally created by the ELPA consortium,
++!    consisting of the following organizations:
++!
++!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
++!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
++!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
++!      Informatik,
++!    - Technische Universität München, Lehrstuhl für Informatik mit
++!      Schwerpunkt Wissenschaftliches Rechnen ,
++!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
++!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
++!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
++!      and
++!    - IBM Deutschland GmbH
++!
++!
++!    More information can be found here:
++!    http://elpa.mpcdf.mpg.de/
++!
++!    ELPA is free software: you can redistribute it and/or modify
++!    it under the terms of the version 3 of the license of the
++!    GNU Lesser General Public License as published by the Free
++!    Software Foundation.
++!
++!    ELPA is distributed in the hope that it will be useful,
++!    but WITHOUT ANY WARRANTY; without even the implied warranty of
++!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++!    GNU Lesser General Public License for more details.
++!
++!    You should have received a copy of the GNU Lesser General Public License
++!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
++!
++!    ELPA reflects a substantial effort on the part of the original
++!    ELPA consortium, and we ask you to respect the spirit of the
++!    license that we chose: i.e., please contribute any changes you
++!    may have back to the original ELPA library distribution, and keep
++!    any derivatives of ELPA under the same license that we chose for
++!    the original distribution, the GNU Lesser General Public License.
++!
++!
++#include "config-f90.h"
++
++! Define one of TEST_REAL or TEST_COMPLEX
++! Define one of TEST_SINGLE or TEST_DOUBLE
++! Define one of TEST_SOLVER_1STAGE or TEST_SOLVER_2STAGE
++! Define TEST_NVIDIA_GPU \in [0, 1]
++! Define TEST_INTEL_GPU \in [0, 1]
++! Define TEST_AMD_GPU \in [0, 1]
++! Define either TEST_ALL_KERNELS or a TEST_KERNEL \in [any valid kernel]
++
++#if !(defined(TEST_REAL) ^ defined(TEST_COMPLEX))
++error: define exactly one of TEST_REAL or TEST_COMPLEX
++#endif
++
++#if !(defined(TEST_SINGLE) ^ defined(TEST_DOUBLE))
++error: define exactly one of TEST_SINGLE or TEST_DOUBLE
++#endif
++
++#ifdef TEST_SINGLE
++#  define EV_TYPE real(kind=C_FLOAT)
++#  define EV_TYPE_COMPLEX complex(kind=C_FLOAT_COMPLEX)
++#  define MATRIX_TYPE_COMPLEX complex(kind=C_FLOAT_COMPLEX)
++#  ifdef TEST_REAL
++#    define MATRIX_TYPE real(kind=C_FLOAT)
++#  else
++#    define MATRIX_TYPE complex(kind=C_FLOAT_COMPLEX)
++#  endif
++#else
++#  define MATRIX_TYPE_COMPLEX complex(kind=C_DOUBLE_COMPLEX)
++#  define EV_TYPE_COMPLEX complex(kind=C_DOUBLE_COMPLEX)
++#  define EV_TYPE real(kind=C_DOUBLE)
++#  ifdef TEST_REAL
++#    define MATRIX_TYPE real(kind=C_DOUBLE)
++#  else
++#    define MATRIX_TYPE complex(kind=C_DOUBLE_COMPLEX)
++#  endif
++#endif
++
++#ifdef TEST_REAL
++#  define AUTOTUNE_DOMAIN ELPA_AUTOTUNE_DOMAIN_REAL
++#else
++#  define AUTOTUNE_DOMAIN ELPA_AUTOTUNE_DOMAIN_COMPLEX
++#endif
++#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT
++#define TEST_INT_TYPE integer(kind=c_int64_t)
++#define INT_TYPE c_int64_t
++#else
++#define TEST_INT_TYPE integer(kind=c_int32_t)
++#define INT_TYPE c_int32_t
++#endif
++#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT
++#define TEST_INT_MPI_TYPE integer(kind=c_int64_t)
++#define INT_MPI_TYPE c_int64_t
++#else
++#define TEST_INT_MPI_TYPE integer(kind=c_int32_t)
++#define INT_MPI_TYPE c_int32_t
++#endif
++
++#define TEST_GPU 0
++#if (TEST_NVIDIA_GPU == 1) || (TEST_AMD_GPU == 1) || (TEST_INTEL_GPU == 1)
++#undef TEST_GPU
++#define TEST_GPU 1
++#endif
++
++#include "assert.h"
++
++program test
++   use elpa
++
++   !use test_util
++   use test_setup_mpi
++   use test_prepare_matrix
++   use test_read_input_parameters
++   use test_blacs_infrastructure
++   use test_check_correctness
++   use precision_for_tests
++   use iso_fortran_env
++
++#ifdef HAVE_REDIRECT
++   use test_redirect
++#endif
++   implicit none
++
++   ! matrix dimensions
++   TEST_INT_TYPE                          :: na, nev, nblk
++
++   ! mpi
++   TEST_INT_TYPE                          :: myid, nprocs
++   TEST_INT_TYPE                          :: na_cols, na_rows  ! local matrix size
++   TEST_INT_TYPE                          :: np_cols, np_rows  ! number of MPI processes per column/row
++   TEST_INT_TYPE                          :: my_prow, my_pcol  ! local MPI task position (my_prow, my_pcol) in the grid (0..np_rows -1, 0..np_cols -1)
++   TEST_INT_MPI_TYPE                      :: mpierr
++
++   ! blacs
++   character(len=1)                 :: layout
++   TEST_INT_TYPE                          :: my_blacs_ctxt, sc_desc(9), info, nprow, npcol
++
++   ! The Matrix
++   MATRIX_TYPE, allocatable         :: a_skewsymmetric(:,:), as_skewsymmetric(:,:)
++   MATRIX_TYPE_COMPLEX, allocatable :: a_complex(:,:), as_complex(:,:)
++   ! eigenvectors
++   MATRIX_TYPE, allocatable         :: z_skewsymmetric(:,:)
++   MATRIX_TYPE_COMPLEX, allocatable :: z_complex(:,:)
++   ! eigenvalues
++   EV_TYPE, allocatable             :: ev_skewsymmetric(:), ev_complex(:)
++
++   TEST_INT_TYPE                    :: status, i, j
++   integer(kind=c_int)              :: error_elpa
++
++   type(output_t)                   :: write_to_file
++   class(elpa_t), pointer           :: e_complex, e_skewsymmetric
++           
++   call read_input_parameters(na, nev, nblk, write_to_file)
++!   call setup_mpi(myid, nprocs)
++    call MPI_INIT_THREAD(MPI_THREAD_SERIALIZED,info, mpierr)
++    call MPI_COMM_SIZE(MPI_COMM_WORLD,nprocs, mpierr)
++    call MPI_COMM_RANK(MPI_COMM_WORLD,myid, mpierr)
++#ifdef HAVE_REDIRECT
++#ifdef WITH_MPI
++   call MPI_BARRIER(MPI_COMM_WORLD, mpierr)
++   call redirect_stdout(myid)
++#endif
++#endif
++
++   if (elpa_init(CURRENT_API_VERSION) /= ELPA_OK) then
++     print *, "ELPA API version not supported"
++     stop 1
++   endif
++! 
++   layout = 'C'
++   do np_cols = NINT(SQRT(REAL(nprocs))),2,-1
++      if(mod(nprocs,np_cols) == 0 ) exit
++   enddo
++   np_rows = nprocs/np_cols
++   assert(nprocs == np_rows * np_cols)
++
++   if (myid == 0) then
++     print '((a,i0))', 'Matrix size: ', na
++     print '((a,i0))', 'Num eigenvectors: ', nev
++     print '((a,i0))', 'Blocksize: ', nblk
++#ifdef WITH_MPI
++     print '((a,i0))', 'Num MPI proc: ', nprocs
++     print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs
++     print '(a)',      'Process layout: ' // layout
++#endif
++     print *,''
++   endif
++
++   call set_up_blacsgrid(int(mpi_comm_world,kind=BLAS_KIND), np_rows, &
++                             np_cols, layout, &
++                             my_blacs_ctxt, my_prow, my_pcol)
++
++   call set_up_blacs_descriptor(na, nblk, my_prow, my_pcol, np_rows, np_cols, &
++                                na_rows, na_cols, sc_desc, my_blacs_ctxt, info)
++
++   allocate(a_skewsymmetric (na_rows,na_cols))
++   allocate(as_skewsymmetric(na_rows,na_cols))
++   allocate(z_skewsymmetric (na_rows,2*na_cols))
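++   ! 2*na_cols: skew_eigenvectors returns the real parts of the complex
++   ! eigenvectors in columns 1..na_cols and the imaginary parts in columns
++   ! na_cols+1..2*na_cols (see the reconstruction of z_complex below)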
++   allocate(ev_skewsymmetric(na))
++
++   a_skewsymmetric(:,:) = 0.0
++   z_skewsymmetric(:,:) = 0.0
++   ev_skewsymmetric(:) = 0.0
++
++   call prepare_matrix_random(na, myid, sc_desc, a_skewsymmetric, &
++   z_skewsymmetric(:,1:na_cols), as_skewsymmetric, is_skewsymmetric=1)
++   
++   !call MPI_BARRIER(MPI_COMM_WORLD, mpierr)  
++   as_skewsymmetric(:,:) = a_skewsymmetric(:,:)
++   
++
++   ! prepare the complex matrix for the "brute force" case
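++   ! for a real skew-symmetric matrix S, i*S is Hermitian with real
++   ! eigenvalues, so the standard complex eigensolver yields a reference
++   ! solution to compare the skew-symmetric solver against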
++   allocate(a_complex (na_rows,na_cols))
++   allocate(as_complex(na_rows,na_cols))
++   allocate(z_complex (na_rows,na_cols))
++   allocate(ev_complex(na))
++
++   a_complex(1:na_rows,1:na_cols) = 0.0
++   z_complex(1:na_rows,1:na_cols) = 0.0
++   as_complex(1:na_rows,1:na_cols) = 0.0
++   
++
++   do j=1, na_cols
++     do i=1,na_rows
++#ifdef TEST_DOUBLE
++       a_complex(i,j) = dcmplx(0.0, a_skewsymmetric(i,j))
++#endif
++#ifdef TEST_SINGLE
++       a_complex(i,j) = cmplx(0.0, a_skewsymmetric(i,j))
++#endif
++     enddo
++   enddo
++   
++
++
++   z_complex(1:na_rows,1:na_cols)  = a_complex(1:na_rows,1:na_cols)
++   as_complex(1:na_rows,1:na_cols) = a_complex(1:na_rows,1:na_cols)
++
++   ! first set up and solve the brute force problem
++   e_complex => elpa_allocate(error_elpa)
++   call set_basic_params(e_complex, na, nev, na_rows, na_cols, my_prow, my_pcol)
++
++   call e_complex%set("timings",1, error_elpa)
++
++   call e_complex%set("debug",1,error_elpa)
++
++#if TEST_NVIDIA_GPU == 1 || (TEST_NVIDIA_GPU == 0) && (TEST_AMD_GPU == 0) && (TEST_INTEL_GPU == 0)  
++   call e_complex%set("nvidia-gpu", TEST_GPU,error_elpa)
++#endif
++#if TEST_AMD_GPU == 1
++   call e_complex%set("amd-gpu", TEST_GPU,error_elpa)
++#endif
++#if TEST_INTEL_GPU == 1
++   call e_complex%set("intel-gpu", TEST_GPU,error_elpa)
++#endif
++
++   call e_complex%set("omp_threads", 8, error_elpa)
++
++   assert_elpa_ok(e_complex%setup())
++   call e_complex%set("solver", elpa_solver_2stage, error_elpa)
++
++   call e_complex%timer_start("eigenvectors: brute force as complex matrix")
++   call e_complex%eigenvectors(a_complex, ev_complex, z_complex, error_elpa)
++   call e_complex%timer_stop("eigenvectors: brute force as complex matrix")
++
++   if (myid .eq. 0) then
++     print *, ""
++     call e_complex%print_times("eigenvectors: brute force as complex matrix")
++   endif 
++#ifdef WITH_MPI
++     call MPI_BARRIER(MPI_COMM_WORLD, mpierr)
++#endif     
++!      as_complex(:,:) = z_complex(:,:)
++#ifdef TEST_SINGLE
++     status = check_correctness_evp_numeric_residuals_complex_single(na, nev, as_complex, z_complex, ev_complex, sc_desc, &
++                                                    nblk, myid, np_rows,np_cols, my_prow, my_pcol)
++#else
++     status = check_correctness_evp_numeric_residuals_complex_double(na, nev, as_complex, z_complex, ev_complex, sc_desc, &
++                                                    nblk, myid, np_rows,np_cols, my_prow, my_pcol)
++#endif
++    status = 0
++    call check_status(status, myid)
++
++#ifdef WITH_MPI
++     call MPI_BARRIER(MPI_COMM_WORLD, mpierr)
++#endif
++   ! now run the skewsymmetric case
++   e_skewsymmetric => elpa_allocate(error_elpa)
++   call set_basic_params(e_skewsymmetric, na, nev, na_rows, na_cols, my_prow, my_pcol)
++
++   call e_skewsymmetric%set("timings",1, error_elpa)
++
++   call e_skewsymmetric%set("debug",1,error_elpa)
++
++#if TEST_NVIDIA_GPU == 1 || (TEST_NVIDIA_GPU == 0) && (TEST_AMD_GPU == 0) && (TEST_INTEL_GPU == 0)
++   call e_skewsymmetric%set("nvidia-gpu", TEST_GPU,error_elpa)
++#endif
++#if TEST_AMD_GPU == 1
++   call e_skewsymmetric%set("amd-gpu", TEST_GPU,error_elpa)
++#endif
++#if TEST_INTEL_GPU == 1
++   call e_skewsymmetric%set("intel-gpu", TEST_GPU,error_elpa)
++#endif
++   call e_skewsymmetric%set("omp_threads",8, error_elpa)
++
++   assert_elpa_ok(e_skewsymmetric%setup())
++   
++   call e_skewsymmetric%set("solver", elpa_solver_2stage, error_elpa)
++
++   call e_skewsymmetric%timer_start("eigenvectors: skewsymmetric ")
++   call e_skewsymmetric%skew_eigenvectors(a_skewsymmetric, ev_skewsymmetric, z_skewsymmetric, error_elpa)
++   call e_skewsymmetric%timer_stop("eigenvectors: skewsymmetric ")
++
++   if (myid .eq. 0) then
++     print *, ""
++     call e_skewsymmetric%print_times("eigenvectors: skewsymmetric")
++   endif
++   
++   ! check eigenvalues
++   do i=1, na
++     if (myid == 0) then
++#ifdef TEST_DOUBLE
++       if (abs(ev_complex(i)-ev_skewsymmetric(i))/abs(ev_complex(i)) .gt. 1e-10) then
++#endif
++#ifdef TEST_SINGLE
++       if (abs(ev_complex(i)-ev_skewsymmetric(i))/abs(ev_complex(i)) .gt. 1e-4) then
++#endif
++         print *,"ev: i=",i,ev_complex(i),ev_skewsymmetric(i)
++         status = 1
++       endif
++     endif
++   enddo
++
++
++!    call check_status(status, myid)
++   
++   z_complex(:,:) = 0
++   do j=1, na_cols
++     do i=1,na_rows
++#ifdef TEST_DOUBLE
++       z_complex(i,j) = dcmplx(z_skewsymmetric(i,j), z_skewsymmetric(i,na_cols+j))
++#endif
++#ifdef TEST_SINGLE
++       z_complex(i,j) = cmplx(z_skewsymmetric(i,j), z_skewsymmetric(i,na_cols+j))
++#endif
++     enddo
++   enddo
++#ifdef WITH_MPI
++   call MPI_BARRIER(MPI_COMM_WORLD, mpierr)
++#endif
++
++#ifdef TEST_SINGLE
++   status = check_correctness_evp_numeric_residuals_ss_real_single(na, nev, as_skewsymmetric, z_complex, ev_skewsymmetric, &
++                              sc_desc, nblk, myid, np_rows,np_cols, my_prow, my_pcol)
++#else
++   status = check_correctness_evp_numeric_residuals_ss_real_double(na, nev, as_skewsymmetric, z_complex, ev_skewsymmetric, &
++                              sc_desc, nblk, myid, np_rows,np_cols, my_prow, my_pcol)
++#endif
++   
++#ifdef WITH_MPI
++    call MPI_BARRIER(MPI_COMM_WORLD, mpierr)
++#endif
++   call elpa_deallocate(e_complex,error_elpa)
++   call elpa_deallocate(e_skewsymmetric,error_elpa)
++
++
++   ! to do:
++   ! - check whether the brute-force check_correctness_evp_numeric_residuals works (complex ev)
++   ! - invent a test for skewsymmetric residuals
++
++   deallocate(a_complex)
++   deallocate(as_complex)
++   deallocate(z_complex)
++   deallocate(ev_complex)
++
++   deallocate(a_skewsymmetric)
++   deallocate(as_skewsymmetric)
++   deallocate(z_skewsymmetric)
++   deallocate(ev_skewsymmetric)
++   call elpa_uninit(error_elpa)
++
++
++
++#ifdef WITH_MPI
++   call blacs_gridexit(my_blacs_ctxt)
++   call mpi_finalize(mpierr)
++#endif
++
++   call exit(status)
++
++contains
++   subroutine set_basic_params(elpa, na, nev, na_rows, na_cols, my_prow, my_pcol)
++     implicit none
++     class(elpa_t), pointer      :: elpa
++     TEST_INT_TYPE, intent(in)         :: na, nev, na_rows, na_cols, my_prow, my_pcol
++
++     call elpa%set("na", int(na,kind=c_int), error_elpa)
++     assert_elpa_ok(error_elpa)
++     call elpa%set("nev", int(nev,kind=c_int), error_elpa)
++     assert_elpa_ok(error_elpa)
++     call elpa%set("local_nrows", int(na_rows,kind=c_int), error_elpa)
++     assert_elpa_ok(error_elpa)
++     call elpa%set("local_ncols", int(na_cols,kind=c_int), error_elpa)
++     assert_elpa_ok(error_elpa)
++     call elpa%set("nblk", int(nblk,kind=c_int), error_elpa)
++     assert_elpa_ok(error_elpa)
++
++#ifdef WITH_MPI
++     call elpa%set("mpi_comm_parent", int(MPI_COMM_WORLD,kind=c_int), error_elpa)
++     assert_elpa_ok(error_elpa)
++     call elpa%set("process_row", int(my_prow,kind=c_int), error_elpa)
++     assert_elpa_ok(error_elpa)
++     call elpa%set("process_col", int(my_pcol,kind=c_int), error_elpa)
++     assert_elpa_ok(error_elpa)
++#endif
++   end subroutine
++   subroutine check_status(status, myid)
++     implicit none
++     TEST_INT_TYPE, intent(in) :: status, myid
++     TEST_INT_MPI_TYPE         :: mpierr
++     if (status /= 0) then
++       if (myid == 0) print *, "Result incorrect!"
++#ifdef WITH_MPI
++       call mpi_finalize(mpierr)
++#endif
++       call exit(status)
++     endif
++   end subroutine
++end program
+diff -ruN elpa-new_release_2021.11.001/examples/Fortran/test_split_comm.F90 elpa-new_release_2021.11.001_ok/examples/Fortran/test_split_comm.F90
+--- elpa-new_release_2021.11.001/examples/Fortran/test_split_comm.F90	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/Fortran/test_split_comm.F90	2022-02-01 17:13:58.420500580 +0100
+@@ -0,0 +1,346 @@
++!    This file is part of ELPA.
++!
++!    The ELPA library was originally created by the ELPA consortium,
++!    consisting of the following organizations:
++!
++!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
++!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
++!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
++!      Informatik,
++!    - Technische Universität München, Lehrstuhl für Informatik mit
++!      Schwerpunkt Wissenschaftliches Rechnen ,
++!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
++!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
++!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
++!      and
++!    - IBM Deutschland GmbH
++!
++!
++!    More information can be found here:
++!    http://elpa.mpcdf.mpg.de/
++!
++!    ELPA is free software: you can redistribute it and/or modify
++!    it under the terms of the version 3 of the license of the
++!    GNU Lesser General Public License as published by the Free
++!    Software Foundation.
++!
++!    ELPA is distributed in the hope that it will be useful,
++!    but WITHOUT ANY WARRANTY; without even the implied warranty of
++!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++!    GNU Lesser General Public License for more details.
++!
++!    You should have received a copy of the GNU Lesser General Public License
++!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
++!
++!    ELPA reflects a substantial effort on the part of the original
++!    ELPA consortium, and we ask you to respect the spirit of the
++!    license that we chose: i.e., please contribute any changes you
++!    may have back to the original ELPA library distribution, and keep
++!    any derivatives of ELPA under the same license that we chose for
++!    the original distribution, the GNU Lesser General Public License.
++!
++!
++#include "config-f90.h"
++
++! Define one of TEST_REAL or TEST_COMPLEX
++! Define one of TEST_SINGLE or TEST_DOUBLE
++! Define one of TEST_SOLVER_1STAGE or TEST_SOLVER_2STAGE
++! Define TEST_NVIDIA_GPU \in [0, 1]
++! Define TEST_INTEL_GPU \in [0, 1]
++! Define either TEST_ALL_KERNELS or a TEST_KERNEL \in [any valid kernel]
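++! As a sketch, a hypothetical compile line selecting one option from each group
++! above could look like (include paths are site-specific and not part of this file):
++!   mpif90 -DTEST_REAL -DTEST_DOUBLE -DTEST_SOLVER_1STAGE -DWITH_MPI \
++!          -I<elpa_includes> test_split_comm.F90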
++
++#if !(defined(TEST_REAL) ^ defined(TEST_COMPLEX))
++error: define exactly one of TEST_REAL or TEST_COMPLEX
++#endif
++
++#if !(defined(TEST_SINGLE) ^ defined(TEST_DOUBLE))
++error: define exactly one of TEST_SINGLE or TEST_DOUBLE
++#endif
++
++#ifdef TEST_SINGLE
++#  define EV_TYPE real(kind=C_FLOAT)
++#  ifdef TEST_REAL
++#    define MATRIX_TYPE real(kind=C_FLOAT)
++#  else
++#    define MATRIX_TYPE complex(kind=C_FLOAT_COMPLEX)
++#  endif
++#else
++#  define EV_TYPE real(kind=C_DOUBLE)
++#  ifdef TEST_REAL
++#    define MATRIX_TYPE real(kind=C_DOUBLE)
++#  else
++#    define MATRIX_TYPE complex(kind=C_DOUBLE_COMPLEX)
++#  endif
++#endif
++
++
++#ifdef TEST_REAL
++#  define AUTOTUNE_DOMAIN ELPA_AUTOTUNE_DOMAIN_REAL
++#else
++#  define AUTOTUNE_DOMAIN ELPA_AUTOTUNE_DOMAIN_COMPLEX
++#endif
++
++
++#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT
++#define TEST_INT_TYPE integer(kind=c_int64_t)
++#define INT_TYPE c_int64_t
++#else
++#define TEST_INT_TYPE integer(kind=c_int32_t)
++#define INT_TYPE c_int32_t
++#endif
++
++#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT
++#define TEST_INT_MPI_TYPE integer(kind=c_int64_t)
++#define INT_MPI_TYPE c_int64_t
++#else
++#define TEST_INT_MPI_TYPE integer(kind=c_int32_t)
++#define INT_MPI_TYPE c_int32_t
++#endif
++#include "assert.h"
++
++program test
++   use elpa
++
++   !use test_util
++   use test_setup_mpi
++   use test_prepare_matrix
++   use test_read_input_parameters
++   use test_blacs_infrastructure
++   use test_check_correctness
++   use test_analytic
++   use iso_fortran_env
++
++#ifdef HAVE_REDIRECT
++   use test_redirect
++#endif
++   implicit none
++
++   ! matrix dimensions
++   TEST_INT_TYPE                     :: na, nev, nblk
++   TEST_INT_TYPE                     :: num_groups, group_size, color, key
++
++   ! mpi
++   TEST_INT_TYPE                     :: myid, nprocs
++   TEST_INT_TYPE                     :: na_cols, na_rows  ! local matrix size
++   TEST_INT_TYPE                     :: np_cols, np_rows  ! number of MPI processes per column/row
++   TEST_INT_TYPE                     :: my_prow, my_pcol  ! local MPI task position (my_prow, my_pcol) in the grid (0..np_rows -1, 0..np_cols -1)
++   TEST_INT_MPI_TYPE                 :: mpierr, ierr, mpi_sub_commMPI, myidMPI, nprocsMPI, colorMPI, keyMPI, &
++                                        myid_subMPI, nprocs_subMPI
++   TEST_INT_TYPE                     :: mpi_sub_comm
++   TEST_INT_TYPE                     :: myid_sub, nprocs_sub
++
++   ! blacs
++   character(len=1)            :: layout
++   TEST_INT_TYPE                     :: my_blacs_ctxt, sc_desc(9), info, nprow, npcol
++
++   ! The Matrix
++   MATRIX_TYPE, allocatable    :: a(:,:), as(:,:)
++   ! eigenvectors
++   MATRIX_TYPE, allocatable    :: z(:,:)
++   ! eigenvalues
++   EV_TYPE, allocatable        :: ev(:)
++
++   TEST_INT_TYPE               :: status
++   integer(kind=c_int)         :: error_elpa
++
++   type(output_t)              :: write_to_file
++   class(elpa_t), pointer      :: e
++
++   TEST_INT_TYPE                     :: iter
++   character(len=5)            :: iter_string
++
++   status = 0
++#ifdef WITH_MPI
++
++   call read_input_parameters(na, nev, nblk, write_to_file)
++   !call setup_mpi(myid, nprocs)
++   call mpi_init_thread(MPI_THREAD_SERIALIZED, info, mpierr)
++   call mpi_comm_rank(mpi_comm_world, myidMPI,mpierr)
++   call mpi_comm_size(mpi_comm_world, nprocsMPI,mpierr)
++   myid = int(myidMPI,kind=BLAS_KIND)
++   nprocs = int(nprocsMPI,kind=BLAS_KIND)
++
++   if((mod(nprocs, 4) == 0) .and. (nprocs > 4)) then
++     num_groups = 4
++   else if(mod(nprocs, 3) == 0) then
++     num_groups = 3
++   else if(mod(nprocs, 2) == 0) then
++     num_groups = 2
++   else
++     num_groups = 1
++   endif
++
++   group_size = nprocs / num_groups
++
++   if(num_groups * group_size .ne. nprocs) then 
++     print *, "Something went wrong before splitting the communicator"
++     stop 1
++   else
++     if(myid == 0) then
++       print '((a,i0,a,i0))', "The test will split the global communicator into ", num_groups, " groups of size ", group_size
++     endif
++   endif
++
++   ! each group of processors will have the same color
++   color = mod(myid, num_groups)
++   ! this will determine the myid in each group
++   key = myid/num_groups
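++   ! e.g. with nprocs = 8 and num_groups = 4: ranks 0..7 get colors 0,1,2,3,0,1,2,3
++   ! and keys 0,0,0,0,1,1,1,1, i.e. four groups of size 2 such as {0,4} and {1,5}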
++   !split the communicator
++   colorMPI=int(color,kind=MPI_KIND)
++   keyMPI = int(key, kind=MPI_KIND)
++   call mpi_comm_split(mpi_comm_world, colorMPI, keyMPI, mpi_sub_commMPI, mpierr)
++   mpi_sub_comm = int(mpi_sub_commMPI,kind=BLAS_KIND)
++   color = int(colorMPI,kind=BLAS_KIND)
++   key = int(keyMPI,kind=BLAS_KIND)
++   if(mpierr .ne. MPI_SUCCESS) then 
++     print *, "communicator splitting not successfull", mpierr
++     stop 1
++   endif
++
++   call mpi_comm_rank(mpi_sub_commMPI, myid_subMPI, mpierr)
++   call mpi_comm_size(mpi_sub_commMPI, nprocs_subMPI, mpierr)
++   myid_sub = int(myid_subMPI,kind=BLAS_KIND)
++   nprocs_sub = int(nprocs_subMPI,kind=BLAS_KIND)
++
++   !print *, "glob ", myid, nprocs, ", loc ", myid_sub, nprocs_sub, ", color ", color, ", key ", key
++
++   if((mpierr .ne. MPI_SUCCESS) .or. (nprocs_sub .ne. group_size) .or. (myid_sub >= group_size)) then
++     print *, "something wrong with the sub communicators"
++     stop 1
++   endif
++
++
++#ifdef HAVE_REDIRECT
++   call MPI_BARRIER(MPI_COMM_WORLD, mpierr)
++   call redirect_stdout(myid)
++#endif
++
++   if (elpa_init(CURRENT_API_VERSION) /= ELPA_OK) then
++     print *, "ELPA API version not supported"
++     stop 1
++   endif
++
++   layout = 'C'
++   do np_cols = NINT(SQRT(REAL(nprocs_sub))),2,-1
++      if(mod(nprocs_sub,np_cols) == 0 ) exit
++   enddo
++   np_rows = nprocs_sub/np_cols
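++   ! e.g. for nprocs_sub = 6 the loop starts at NINT(SQRT(6.)) = 2, finds that
++   ! 2 divides 6, and yields a 3 x 2 grid (np_rows = 3, np_cols = 2)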
++   assert(nprocs_sub == np_rows * np_cols)
++   assert(nprocs == np_rows * np_cols * num_groups)
++
++   if (myid == 0) then
++     print '((a,i0))', 'Matrix size: ', na
++     print '((a,i0))', 'Num eigenvectors: ', nev
++     print '((a,i0))', 'Blocksize: ', nblk
++     print '(a)',      'Process layout: ' // layout
++     print *,''
++   endif
++   if (myid_sub == 0) then
++     print '(4(a,i0))','GROUP ', color, ': Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs_sub
++   endif
++
++   ! USING the subcommunicator
++   call set_up_blacsgrid(int(mpi_sub_comm,kind=BLAS_KIND), np_rows, np_cols, layout, &
++                         my_blacs_ctxt, my_prow, my_pcol)
++
++   call set_up_blacs_descriptor(na, nblk, my_prow, my_pcol, np_rows, np_cols, &
++                                na_rows, na_cols, sc_desc, my_blacs_ctxt, info)
++
++   allocate(a (na_rows,na_cols))
++   allocate(as(na_rows,na_cols))
++   allocate(z (na_rows,na_cols))
++   allocate(ev(na))
++
++   a(:,:) = 0.0
++   z(:,:) = 0.0
++   ev(:) = 0.0
++
++   !call prepare_matrix_analytic(na, a, nblk, myid_sub, np_rows, np_cols, my_prow, my_pcol, print_times=.false.)
++   call prepare_matrix_random(na, myid_sub, sc_desc, a, z, as)
++   as(:,:) = a(:,:)
++
++   e => elpa_allocate(error_elpa)
++   call set_basic_params(e, na, nev, na_rows, na_cols, mpi_sub_comm, my_prow, my_pcol)
++
++   call e%set("timings",1, error_elpa)
++
++   call e%set("debug",1, error_elpa)
++#if TEST_NVIDIA_GPU == 1 || (TEST_NVIDIA_GPU == 0) && (TEST_AMD_GPU == 0) && (TEST_INTEL_GPU == 0)
++   call e%set("nvidia-gpu", 0, error_elpa)
++#endif
++#if TEST_INTEL_GPU == 1
++   call e%set("intel-gpu", 0, error_elpa)
++#endif
++   !call e%set("max_stored_rows", 15, error_elpa)
++
++   assert_elpa_ok(e%setup())
++
++
++
++!   if(myid == 0) print *, "parameters of e"
++!   call e%print_all_parameters()
++!   if(myid == 0) print *, ""
++
++
++   call e%timer_start("eigenvectors")
++   call e%eigenvectors(a, ev, z, error_elpa)
++   call e%timer_stop("eigenvectors")
++
++   assert_elpa_ok(error_elpa)
++
++   !status = check_correctness_analytic(na, nev, ev, z, nblk, myid_sub, np_rows, np_cols, my_prow, my_pcol, &
++    !                   .true., .true., print_times=.false.)
++   status = check_correctness_evp_numeric_residuals(na, nev, as, z, ev, sc_desc, nblk, myid_sub, &
++                    np_rows,np_cols, my_prow, my_pcol)
++   if (status /= 0) &
++     print *, "processor ", myid, ": Result incorrect for processor group ", color
++
++   if (myid .eq. 0) then
++     print *, "Showing times of one goup only"
++     call e%print_times("eigenvectors")
++   endif
++
++   call elpa_deallocate(e, error_elpa)
++
++   deallocate(a)
++   deallocate(as)
++   deallocate(z)
++   deallocate(ev)
++
++   call elpa_uninit(error_elpa)
++
++   call blacs_gridexit(my_blacs_ctxt)
++   call mpi_finalize(mpierr)
++
++#endif
++   call exit(status)
++
++contains
++   subroutine set_basic_params(elpa, na, nev, na_rows, na_cols, communicator, my_prow, my_pcol)
++     use iso_c_binding
++     implicit none
++     class(elpa_t), pointer      :: elpa
++     TEST_INT_TYPE, intent(in)         :: na, nev, na_rows, na_cols, my_prow, my_pcol, communicator
++
++#ifdef WITH_MPI
++     call elpa%set("na", int(na,kind=c_int), error_elpa)
++     assert_elpa_ok(error_elpa)
++     call elpa%set("nev", int(nev,kind=c_int), error_elpa)
++     assert_elpa_ok(error_elpa)
++     call elpa%set("local_nrows", int(na_rows,kind=c_int), error_elpa)
++     assert_elpa_ok(error_elpa)
++     call elpa%set("local_ncols", int(na_cols,kind=c_int), error_elpa)
++     assert_elpa_ok(error_elpa)
++     call elpa%set("nblk", int(nblk,kind=c_int), error_elpa)
++     assert_elpa_ok(error_elpa)
++
++     call elpa%set("mpi_comm_parent", int(communicator,kind=c_int), error_elpa)
++     assert_elpa_ok(error_elpa)
++     call elpa%set("process_row", int(my_prow,kind=c_int), error_elpa)
++     assert_elpa_ok(error_elpa)
++     call elpa%set("process_col", int(my_pcol,kind=c_int), error_elpa)
++     assert_elpa_ok(error_elpa)
++#endif
++   end subroutine
++
++end program
+diff -ruN elpa-new_release_2021.11.001/examples/Makefile_hybrid elpa-new_release_2021.11.001_ok/examples/Makefile_hybrid
+--- elpa-new_release_2021.11.001/examples/Makefile_hybrid	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/Makefile_hybrid	2022-01-28 09:56:18.548921000 +0100
+@@ -0,0 +1,24 @@
++# MPICH-based MPI, i.e. IntelMPI or ParaStationMPI
++SCALAPACK_LIB = -lmkl_scalapack_lp64 -lmkl_blacs_intelmpi_lp64
++# OpenMPI
++# SCALAPACK_LIB = -lmkl_scalapack_lp64 $(MKLROOT)/lib/intel64/libmkl_blacs_openmpi_lp64.a
++LAPACK_LIB =
++# Intel compiler
++MKL = -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread -lstdc++
++# GCC
++# MKL = -lmkl_gf_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lstdc++ -lm
++F90 = mpif90 -O3 -qopenmp -I$(ELPA_MODULES_OPENMP) -I$(ELPA_INCLUDE_OPENMP) -I$(ELPA_INCLUDE_OPENMP)/elpa
++# GCC
++# F90 = mpif90 -O3 -fopenmp -I$(ELPA_MODULES_OPENMP) -I$(ELPA_INCLUDE_OPENMP) -I$(ELPA_INCLUDE_OPENMP)/elpa
++LIBS = -L$(ELPA_LIB_OPENMP) -lelpa_openmp -lelpatest_openmp -lelpa $(SCALAPACK_LIB) $(MKL)
++CC            = mpicc -O3 -qopenmp
++# GCC
++# CC            = mpicc -O3 -fopenmp
++
++all: test_real_e1_omp test_real_e2_omp
++
++test_real_e1_omp: test_real_e1.F90
++	$(F90) -DWITH_OPENMP_TRADITIONAL -o $@ test_real_e1.F90 $(LIBS)
++
++test_real_e2_omp: test_real_e2.F90
++	$(F90) -DWITH_OPENMP_TRADITIONAL -o $@ test_real_e2.F90 $(LIBS)
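++
++# Usage sketch (assumes the ELPA module provides the ELPA_* variables used above;
++# the test parameters <na> <nev> <nblk> are an assumption of this example):
++#   make -f Makefile_hybrid
++#   srun --ntasks=4 ./test_real_e1_omp <na> <nev> <nblk>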
+diff -ruN elpa-new_release_2021.11.001/examples/Makefile_pure elpa-new_release_2021.11.001_ok/examples/Makefile_pure
+--- elpa-new_release_2021.11.001/examples/Makefile_pure	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/Makefile_pure	2022-01-26 09:43:16.599164000 +0100
+@@ -0,0 +1,20 @@
++# MPICH-based MPI, i.e. IntelMPI or ParaStationMPI
++SCALAPACK_LIB = -lmkl_scalapack_lp64 -lmkl_blacs_intelmpi_lp64
++# OpenMPI
++# SCALAPACK_LIB = -lmkl_scalapack_lp64 $(MKLROOT)/lib/intel64/libmkl_blacs_openmpi_lp64.a
++LAPACK_LIB =
++# Intel compiler
++MKL = -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -liomp5 -lpthread -lstdc++
++# GCC
++# MKL = -lmkl_gf_lp64 -lmkl_sequential -lmkl_core -lgomp -lpthread -lstdc++ -lm
++F90 = mpif90 -O3 -I$(ELPA_MODULES) -I$(ELPA_INCLUDE) -I$(ELPA_INCLUDE)/elpa
++LIBS = -L$(ELPA_LIB) -lelpa -lelpatest $(SCALAPACK_LIB) $(MKL)
++CC            = mpicc -O3
++
++all: test_real_e1 test_real_e2
++
++test_real_e1: test_real_e1.F90
++	$(F90) -o $@ test_real_e1.F90 $(LIBS)
++
++test_real_e2: test_real_e2.F90
++	$(F90) -o $@ test_real_e2.F90 $(LIBS)
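++
++# Usage sketch: 'make -f Makefile_pure' builds the pure-MPI test binaries,
++# which are then launched with the system's MPI starter (e.g. srun)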
+diff -ruN elpa-new_release_2021.11.001/examples/Makefile_pure_cuda elpa-new_release_2021.11.001_ok/examples/Makefile_pure_cuda
+--- elpa-new_release_2021.11.001/examples/Makefile_pure_cuda	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/Makefile_pure_cuda	2022-01-26 09:43:16.600617000 +0100
+@@ -0,0 +1,20 @@
++# MPICH-based MPI, i.e. IntelMPI or ParaStationMPI
++SCALAPACK_LIB = -lmkl_scalapack_lp64 -lmkl_blacs_intelmpi_lp64
++# OpenMPI
++# SCALAPACK_LIB = -lmkl_scalapack_lp64 $(MKLROOT)/lib/intel64/libmkl_blacs_openmpi_lp64.a
++LAPACK_LIB =
++# Intel compiler
++MKL = -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -liomp5 -lpthread -lstdc++
++# GCC
++# MKL = -lmkl_gf_lp64 -lmkl_sequential -lmkl_core -lgomp -lpthread -lstdc++ -lm
++F90 = mpif90 -O3 -I$(ELPA_MODULES) -I$(ELPA_INCLUDE) -I$(ELPA_INCLUDE)/elpa
++LIBS = -L$(ELPA_LIB) -lelpa -lelpatest $(SCALAPACK_LIB) $(MKL) -lcudart
++CC            = mpicc -O3
++
++all: test_real_e1 test_real_e2
++
++test_real_e1: test_real_e1.F90
++	$(F90) -DCUDA -o $@ test_real_e1.F90 $(LIBS)
++
++test_real_e2: test_real_e2.F90
++	$(F90) -DCUDA -DCUDAKERNEL -o $@ test_real_e2.F90 $(LIBS)
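++
++# Note: -lcudart assumes the CUDA runtime is on the library search path; if it
++# is not, a flag like -L$(CUDA_HOME)/lib64 may be needed (path is site-specific)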
+diff -ruN elpa-new_release_2021.11.001/examples/shared/GPU/CUDA/test_cuda.F90 elpa-new_release_2021.11.001_ok/examples/shared/GPU/CUDA/test_cuda.F90
+--- elpa-new_release_2021.11.001/examples/shared/GPU/CUDA/test_cuda.F90	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/shared/GPU/CUDA/test_cuda.F90	2022-01-26 10:10:58.319812000 +0100
+@@ -0,0 +1,455 @@
++!    Copyright 2014, A. Marek
++!
++!    This file is part of ELPA.
++!
++!    The ELPA library was originally created by the ELPA consortium,
++!    consisting of the following organizations:
++!
++!    - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
++!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
++!      Informatik,
++!    - Technische Universität München, Lehrstuhl für Informatik mit
++!      Schwerpunkt Wissenschaftliches Rechnen ,
++!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
++!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
++!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
++!      and
++!    - IBM Deutschland GmbH
++!
++!
++!    More information can be found here:
++!    http://elpa.mpcdf.mpg.de/
++!
++!    ELPA is free software: you can redistribute it and/or modify
++!    it under the terms of the version 3 of the license of the
++!    GNU Lesser General Public License as published by the Free
++!    Software Foundation.
++!
++!    ELPA is distributed in the hope that it will be useful,
++!    but WITHOUT ANY WARRANTY; without even the implied warranty of
++!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++!    GNU Lesser General Public License for more details.
++!
++!    You should have received a copy of the GNU Lesser General Public License
++!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
++!
++!    ELPA reflects a substantial effort on the part of the original
++!    ELPA consortium, and we ask you to respect the spirit of the
++!    license that we chose: i.e., please contribute any changes you
++!    may have back to the original ELPA library distribution, and keep
++!    any derivatives of ELPA under the same license that we chose for
++!    the original distribution, the GNU Lesser General Public License.
++!
++! This file was written by A. Marek, MPCDF
++
++
++#include "config-f90.h"
++module test_cuda_functions
++  use, intrinsic :: iso_c_binding
++  use precision_for_tests
++  implicit none
++
++  public
++
++  integer(kind=ik) :: cudaMemcpyHostToDevice
++  integer(kind=ik) :: cudaMemcpyDeviceToHost
++  integer(kind=ik) :: cudaMemcpyDeviceToDevice
++  integer(kind=ik) :: cudaHostRegisterDefault
++  integer(kind=ik) :: cudaHostRegisterPortable
++  integer(kind=ik) :: cudaHostRegisterMapped
++
++  ! TODO global variable, has to be changed
++  integer(kind=C_intptr_T) :: cublasHandle = -1
++
++  ! functions to set and query the CUDA devices
++  interface
++    function cuda_setdevice_c(n) result(istat) &
++             bind(C, name="cudaSetDeviceFromC")
++
++      use, intrinsic :: iso_c_binding
++      implicit none
++      integer(kind=C_INT), value    :: n
++      integer(kind=C_INT)           :: istat
++    end function cuda_setdevice_c
++  end interface
++
++  ! functions to copy CUDA memory
++  interface
++    function cuda_memcpyDeviceToDevice_c() result(flag) &
++            bind(C, name="cudaMemcpyDeviceToDeviceFromC")
++      use, intrinsic :: iso_c_binding
++      implicit none
++      integer(kind=c_int) :: flag
++    end function
++  end interface
++
++  interface
++    function cuda_memcpyHostToDevice_c() result(flag) &
++             bind(C, name="cudaMemcpyHostToDeviceFromC")
++      use, intrinsic :: iso_c_binding
++      implicit none
++      integer(kind=c_int) :: flag
++    end function
++  end interface
++
++  interface
++    function cuda_memcpyDeviceToHost_c() result(flag) &
++             bind(C, name="cudaMemcpyDeviceToHostFromC")
++      use, intrinsic :: iso_c_binding
++      implicit none
++      integer(kind=c_int) :: flag
++    end function
++  end interface
++
++  interface
++    function cuda_hostRegisterDefault_c() result(flag) &
++             bind(C, name="cudaHostRegisterDefaultFromC")
++      use, intrinsic :: iso_c_binding
++      implicit none
++      integer(kind=c_int) :: flag
++    end function
++  end interface
++
++  interface
++    function cuda_hostRegisterPortable_c() result(flag) &
++             bind(C, name="cudaHostRegisterPortableFromC")
++      use, intrinsic :: iso_c_binding
++      implicit none
++      integer(kind=c_int) :: flag
++    end function
++  end interface
++
++  interface
++    function cuda_hostRegisterMapped_c() result(flag) &
++             bind(C, name="cudaHostRegisterMappedFromC")
++      use, intrinsic :: iso_c_binding
++      implicit none
++      integer(kind=c_int) :: flag
++    end function
++  end interface
++
++  interface
++    function cuda_memcpy_intptr_c(dst, src, size, dir) result(istat) &
++             bind(C, name="cudaMemcpyFromC")
++
++      use, intrinsic :: iso_c_binding
++
++      implicit none
++      integer(kind=C_intptr_t), value              :: dst
++      integer(kind=C_intptr_t), value              :: src
++      integer(kind=c_intptr_t), intent(in), value    :: size
++      integer(kind=C_INT), intent(in), value       :: dir
++      integer(kind=C_INT)                          :: istat
++
++    end function cuda_memcpy_intptr_c
++  end interface
++
++  interface
++    function cuda_memcpy_cptr_c(dst, src, size, dir) result(istat) &
++             bind(C, name="cudaMemcpyFromC")
++
++      use, intrinsic :: iso_c_binding
++
++      implicit none
++      type(c_ptr), value                           :: dst
++      type(c_ptr), value                           :: src
++      integer(kind=c_intptr_t), intent(in), value  :: size
++      integer(kind=C_INT), intent(in), value       :: dir
++      integer(kind=C_INT)                          :: istat
++
++    end function cuda_memcpy_cptr_c
++  end interface
++
++  interface
++    function cuda_memcpy_mixed_c(dst, src, size, dir) result(istat) &
++             bind(C, name="cudaMemcpyFromC")
++
++      use, intrinsic :: iso_c_binding
++
++      implicit none
++      type(c_ptr), value                           :: dst
++      integer(kind=C_intptr_t), value              :: src
++      integer(kind=c_intptr_t), intent(in), value  :: size
++      integer(kind=C_INT), intent(in), value       :: dir
++      integer(kind=C_INT)                          :: istat
++
++    end function cuda_memcpy_mixed_c
++  end interface
++
++  ! functions to allocate and free CUDA memory
++
++  interface
++    function cuda_free_intptr_c(a) result(istat) &
++             bind(C, name="cudaFreeFromC")
++
++      use, intrinsic :: iso_c_binding
++
++      implicit none
++      integer(kind=C_intptr_T), value  :: a
++      integer(kind=C_INT)              :: istat
++
++    end function cuda_free_intptr_c
++  end interface
++
++  interface
++    function cuda_free_cptr_c(a) result(istat) &
++             bind(C, name="cudaFreeFromC")
++
++      use, intrinsic :: iso_c_binding
++
++      implicit none
++      type(c_ptr), value  :: a
++      integer(kind=C_INT)              :: istat
++
++    end function cuda_free_cptr_c
++  end interface
++
++  interface cuda_memcpy
++    module procedure cuda_memcpy_intptr
++    module procedure cuda_memcpy_cptr
++    module procedure cuda_memcpy_mixed
++  end interface
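++  ! generic resolution follows the argument types: two intptr addresses go to
++  ! cuda_memcpy_intptr, two c_ptr arguments to cuda_memcpy_cptr, and a c_ptr
++  ! destination with an intptr source to cuda_memcpy_mixed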
++
++  interface cuda_free
++    module procedure cuda_free_intptr
++    module procedure cuda_free_cptr
++  end interface
++
++  interface
++    function cuda_malloc_intptr_c(a, width_height) result(istat) &
++             bind(C, name="cudaMallocFromC")
++
++      use, intrinsic :: iso_c_binding
++      implicit none
++
++      integer(kind=C_intptr_T)                    :: a
++      integer(kind=c_intptr_t), intent(in), value :: width_height
++      integer(kind=C_INT)                         :: istat
++
++    end function cuda_malloc_intptr_c
++  end interface
++
++  interface
++    function cuda_malloc_cptr_c(a, width_height) result(istat) &
++             bind(C, name="cudaMallocFromC")
++
++      use, intrinsic :: iso_c_binding
++      implicit none
++
++      type(c_ptr)                    :: a
++      integer(kind=c_intptr_t), intent(in), value :: width_height
++      integer(kind=C_INT)                         :: istat
++
++    end function cuda_malloc_cptr_c
++  end interface
++
++  !interface cuda_malloc
++  !  module procedure cuda_malloc_intptr
++  !  module procedure cuda_malloc_cptr
++  !end interface
++
++  contains
++
++    function cuda_setdevice(n) result(success)
++      use, intrinsic :: iso_c_binding
++
++      implicit none
++
++      integer(kind=ik), intent(in)  :: n
++      logical                       :: success
++#ifdef WITH_NVIDIA_GPU_VERSION
++      success = cuda_setdevice_c(int(n,kind=c_int)) /= 0
++#else
++      success = .true.
++#endif
++    end function cuda_setdevice
++
++
++    function cuda_malloc_intptr(a, width_height) result(success)
++
++     use, intrinsic :: iso_c_binding
++     implicit none
++
++     integer(kind=c_intptr_t)                  :: a
++     integer(kind=c_intptr_t), intent(in)      :: width_height
++     logical                                   :: success
++#ifdef WITH_NVIDIA_GPU_VERSION
++     success = cuda_malloc_intptr_c(a, width_height) /= 0
++#else
++     success = .true.
++#endif
++   end function
++
++
++    function cuda_malloc_cptr(a, width_height) result(success)
++
++     use, intrinsic :: iso_c_binding
++     implicit none
++
++     type(c_ptr)                  :: a
++     integer(kind=c_intptr_t), intent(in)      :: width_height
++     logical                                   :: success
++#ifdef WITH_NVIDIA_GPU_VERSION
++     success = cuda_malloc_cptr_c(a, width_height) /= 0
++#else
++     success = .true.
++#endif
++   end function
++
++   function cuda_free_intptr(a) result(success)
++
++     use, intrinsic :: iso_c_binding
++
++     implicit none
++     integer(kind=C_intptr_T) :: a
++     logical                  :: success
++#ifdef WITH_NVIDIA_GPU_VERSION
++     success = cuda_free_intptr_c(a) /= 0
++#else
++     success = .true.
++#endif
++   end function cuda_free_intptr
++
++   function cuda_free_cptr(a) result(success)
++
++     use, intrinsic :: iso_c_binding
++
++     implicit none
++     type(c_ptr) :: a
++     logical                  :: success
++#ifdef WITH_NVIDIA_GPU_VERSION
++     success = cuda_free_cptr_c(a) /= 0
++#else
++     success = .true.
++#endif
++   end function cuda_free_cptr
++
++ ! functions to copy CUDA memory
++
++ function cuda_memcpyDeviceToDevice() result(flag)
++   use, intrinsic :: iso_c_binding
++   implicit none
++   integer(kind=ik) :: flag
++#ifdef WITH_NVIDIA_GPU_VERSION
++   flag = int(cuda_memcpyDeviceToDevice_c())
++#else
++   flag = 0
++#endif
++ end function
++
++ function cuda_memcpyHostToDevice() result(flag)
++   use, intrinsic :: iso_c_binding
++   use precision_for_tests
++   implicit none
++   integer(kind=ik) :: flag
++#ifdef WITH_NVIDIA_GPU_VERSION
++   flag = int(cuda_memcpyHostToDevice_c())
++#else
++   flag = 0
++#endif
++ end function
++
++ function cuda_memcpyDeviceToHost() result(flag)
++   use, intrinsic :: iso_c_binding
++   use precision_for_tests
++   implicit none
++   integer(kind=ik) :: flag
++#ifdef WITH_NVIDIA_GPU_VERSION
++   flag = int( cuda_memcpyDeviceToHost_c())
++#else
++   flag = 0
++#endif
++ end function
++
++ function cuda_hostRegisterDefault() result(flag)
++   use, intrinsic :: iso_c_binding
++   use precision_for_tests
++   implicit none
++   integer(kind=ik) :: flag
++#ifdef WITH_NVIDIA_GPU_VERSION
++   flag = int(cuda_hostRegisterDefault_c())
++#else
++   flag = 0
++#endif
++ end function
++
++ function cuda_hostRegisterPortable() result(flag)
++   use, intrinsic :: iso_c_binding
++   use precision_for_tests
++   implicit none
++   integer(kind=ik) :: flag
++#ifdef WITH_NVIDIA_GPU_VERSION
++   flag = int(cuda_hostRegisterPortable_c())
++#else
++   flag = 0
++#endif
++ end function
++
++ function cuda_hostRegisterMapped() result(flag)
++   use, intrinsic :: iso_c_binding
++   use precision_for_tests
++   implicit none
++   integer(kind=ik) :: flag
++#ifdef WITH_NVIDIA_GPU_VERSION
++   flag = int(cuda_hostRegisterMapped_c())
++#else
++   flag = 0
++#endif
++ end function
++
++ function cuda_memcpy_intptr(dst, src, size, dir) result(success)
++
++      use, intrinsic :: iso_c_binding
++
++      implicit none
++      integer(kind=C_intptr_t)              :: dst
++      integer(kind=C_intptr_t)              :: src
++      integer(kind=c_intptr_t), intent(in)  :: size
++      integer(kind=C_INT), intent(in)       :: dir
++      logical :: success
++
++#ifdef WITH_NVIDIA_GPU_VERSION
++        success = cuda_memcpy_intptr_c(dst, src, size, dir) /= 0
++#else
++        success = .true.
++#endif
++    end function
++
++ function cuda_memcpy_cptr(dst, src, size, dir) result(success)
++
++      use, intrinsic :: iso_c_binding
++
++      implicit none
++      type(c_ptr)                           :: dst
++      type(c_ptr)                           :: src
++      integer(kind=c_intptr_t), intent(in)  :: size
++      integer(kind=C_INT), intent(in)       :: dir
++      logical :: success
++
++#ifdef WITH_NVIDIA_GPU_VERSION
++        success = cuda_memcpy_cptr_c(dst, src, size, dir) /= 0
++#else
++        success = .false.
++#endif
++    end function
++
++ function cuda_memcpy_mixed(dst, src, size, dir) result(success)
++
++      use, intrinsic :: iso_c_binding
++
++      implicit none
++      type(c_ptr)                           :: dst
++      integer(kind=C_intptr_t)              :: src
++      integer(kind=c_intptr_t), intent(in)  :: size
++      integer(kind=C_INT), intent(in)       :: dir
++      logical :: success
++
++#ifdef WITH_NVIDIA_GPU_VERSION
++        success = cuda_memcpy_mixed_c(dst, src, size, dir) /= 0
++#else
++        success = .true.
++#endif
++    end function
++
++end module test_cuda_functions
+diff -ruN elpa-new_release_2021.11.001/examples/shared/GPU/CUDA/test_cudaFunctions.cu elpa-new_release_2021.11.001_ok/examples/shared/GPU/CUDA/test_cudaFunctions.cu
+--- elpa-new_release_2021.11.001/examples/shared/GPU/CUDA/test_cudaFunctions.cu	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/shared/GPU/CUDA/test_cudaFunctions.cu	2022-01-26 10:10:58.320705000 +0100
+@@ -0,0 +1,152 @@
++//
++//    Copyright 2014, A. Marek
++//
++//    This file is part of ELPA.
++//
++//    The ELPA library was originally created by the ELPA consortium,
++//    consisting of the following organizations:
++//
++//    - Max Planck Computing and Data Facility (MPCDF), formerly known as
++//      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
++//    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
++//      Informatik,
++//    - Technische Universität München, Lehrstuhl für Informatik mit
++//      Schwerpunkt Wissenschaftliches Rechnen ,
++//    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
++//    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
++//      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
++//      and
++//    - IBM Deutschland GmbH
++//
++//    This particular source code file contains additions, changes and
++//    enhancements authored by Intel Corporation which is not part of
++//    the ELPA consortium.
++//
++//    More information can be found here:
++//    http://elpa.mpcdf.mpg.de/
++//
++//    ELPA is free software: you can redistribute it and/or modify
++//    it under the terms of the version 3 of the license of the
++//    GNU Lesser General Public License as published by the Free
++//    Software Foundation.
++//
++//    ELPA is distributed in the hope that it will be useful,
++//    but WITHOUT ANY WARRANTY; without even the implied warranty of
++//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++//    GNU Lesser General Public License for more details.
++//
++//    You should have received a copy of the GNU Lesser General Public License
++//    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
++//
++//    ELPA reflects a substantial effort on the part of the original
++//    ELPA consortium, and we ask you to respect the spirit of the
++//    license that we chose: i.e., please contribute any changes you
++//    may have back to the original ELPA library distribution, and keep
++//    any derivatives of ELPA under the same license that we chose for
++//    the original distribution, the GNU Lesser General Public License.
++//
++//
++// --------------------------------------------------------------------------------------------------
++//
++// This file was written by A. Marek, MPCDF
++#include "config-f90.h"
++
++#include <stdio.h>
++#include <math.h>
++#include <stdio.h>
++
++#include <stdlib.h>
++#include <string.h>
++#include <time.h>
++#include <alloca.h>
++#include <stdint.h>
++#include <complex.h>
++#ifdef WITH_NVIDIA_GPU_VERSION
++#include <cublas_v2.h>
++#endif
++
++
++#define errormessage(x, ...) do { fprintf(stderr, "%s:%d " x, __FILE__, __LINE__, __VA_ARGS__ ); } while (0)
++
++#ifdef DEBUG_CUDA
++#define debugmessage(x, ...) do { fprintf(stderr, "%s:%d " x, __FILE__, __LINE__, __VA_ARGS__ ); } while (0)
++#else
++#define debugmessage(x, ...)
++#endif
++
++#ifdef WITH_NVIDIA_GPU_VERSION
++extern "C" {
++
++  int cudaSetDeviceFromC(int n) {
++
++    cudaError_t cuerr = cudaSetDevice(n);
++    if (cuerr != cudaSuccess) {
++      errormessage("Error in cudaSetDevice: %s\n",cudaGetErrorString(cuerr));
++      return 0;
++    }
++    return 1;
++  }
++
++  int cudaMallocFromC(intptr_t *a, size_t width_height) {
++
++    cudaError_t cuerr = cudaMalloc((void **) a, width_height);
++#ifdef DEBUG_CUDA
++    printf("CUDA Malloc,  pointer address: %p, size: %d \n", *a, width_height);
++#endif
++    if (cuerr != cudaSuccess) {
++      errormessage("Error in cudaMalloc: %s\n",cudaGetErrorString(cuerr));
++      return 0;
++    }
++    return 1;
++  }
++
++  int cudaFreeFromC(intptr_t *a) {
++#ifdef DEBUG_CUDA
++    printf("CUDA Free, pointer address: %p \n", a);
++#endif
++    cudaError_t cuerr = cudaFree(a);
++
++    if (cuerr != cudaSuccess) {
++      errormessage("Error in cudaFree: %s\n",cudaGetErrorString(cuerr));
++      return 0;
++    }
++    return 1;
++  }
++
++  int cudaMemcpyFromC(intptr_t *dest, intptr_t *src, size_t count, int dir) {
++
++    cudaError_t cuerr = cudaMemcpy( dest, src, count, (cudaMemcpyKind)dir);
++    if (cuerr != cudaSuccess) {
++      errormessage("Error in cudaMemcpy: %s\n",cudaGetErrorString(cuerr));
++      return 0;
++    }
++    return 1;
++  }
++
++  int cudaMemcpyDeviceToDeviceFromC(void) {
++      int val = cudaMemcpyDeviceToDevice;
++      return val;
++  }
++  int cudaMemcpyHostToDeviceFromC(void) {
++      int val = cudaMemcpyHostToDevice;
++      return val;
++  }
++  int cudaMemcpyDeviceToHostFromC(void) {
++      int val = cudaMemcpyDeviceToHost;
++      return val;
++  }
++  int cudaHostRegisterDefaultFromC(void) {
++      int val = cudaHostRegisterDefault;
++      return val;
++  }
++  int cudaHostRegisterPortableFromC(void) {
++      int val = cudaHostRegisterPortable;
++      return val;
++  }
++  int cudaHostRegisterMappedFromC(void) {
++      int val = cudaHostRegisterMapped;
++      return val;
++  }
++
++}
++#endif /* WITH_NVIDIA_GPU_VERSION */
+diff -ruN elpa-new_release_2021.11.001/examples/shared/GPU/ROCm/test_hip.F90 elpa-new_release_2021.11.001_ok/examples/shared/GPU/ROCm/test_hip.F90
+--- elpa-new_release_2021.11.001/examples/shared/GPU/ROCm/test_hip.F90	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/shared/GPU/ROCm/test_hip.F90	2022-01-26 10:10:58.327011000 +0100
+@@ -0,0 +1,449 @@
++!    Copyright 2021, A. Marek
++!
++!    This file is part of ELPA.
++!
++!    The ELPA library was originally created by the ELPA consortium,
++!    consisting of the following organizations:
++!
++!    - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
++!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
++!      Informatik,
++!    - Technische Universität München, Lehrstuhl für Informatik mit
++!      Schwerpunkt Wissenschaftliches Rechnen ,
++!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
++!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
++!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
++!      and
++!    - IBM Deutschland GmbH
++!
++!
++!    More information can be found here:
++!    http://elpa.mpcdf.mpg.de/
++!
++!    ELPA is free software: you can redistribute it and/or modify
++!    it under the terms of the version 3 of the license of the
++!    GNU Lesser General Public License as published by the Free
++!    Software Foundation.
++!
++!    ELPA is distributed in the hope that it will be useful,
++!    but WITHOUT ANY WARRANTY; without even the implied warranty of
++!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++!    GNU Lesser General Public License for more details.
++!
++!    You should have received a copy of the GNU Lesser General Public License
++!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
++!
++!    ELPA reflects a substantial effort on the part of the original
++!    ELPA consortium, and we ask you to respect the spirit of the
++!    license that we chose: i.e., please contribute any changes you
++!    may have back to the original ELPA library distribution, and keep
++!    any derivatives of ELPA under the same license that we chose for
++!    the original distribution, the GNU Lesser General Public License.
++!
++! This file was written by A. Marek, MPCDF
++
++
++#include "config-f90.h"
++module test_hip_functions
++  use, intrinsic :: iso_c_binding
++  use precision_for_tests
++  implicit none
++
++  public
++
++  integer(kind=ik) :: hipMemcpyHostToDevice
++  integer(kind=ik) :: hipMemcpyDeviceToHost
++  integer(kind=ik) :: hipMemcpyDeviceToDevice
++  integer(kind=ik) :: hipHostRegisterDefault
++  integer(kind=ik) :: hipHostRegisterPortable
++  integer(kind=ik) :: hipHostRegisterMapped
++
++  ! TODO global variable, has to be changed
++  integer(kind=C_intptr_T) :: rocblasHandle = -1
++
++  ! functions to set and query the HIP devices
++  interface
++    function hip_setdevice_c(n) result(istat) &
++             bind(C, name="hipSetDeviceFromC")
++
++      use, intrinsic :: iso_c_binding
++      implicit none
++      integer(kind=C_INT), value    :: n
++      integer(kind=C_INT)           :: istat
++    end function hip_setdevice_c
++  end interface
++
++  ! functions to copy HIP memory
++  interface
++    function hip_memcpyDeviceToDevice_c() result(flag) &
++             bind(C, name="hipMemcpyDeviceToDeviceFromC")
++      use, intrinsic :: iso_c_binding
++      implicit none
++      integer(kind=c_int) :: flag
++    end function
++  end interface
++
++  interface
++    function hip_memcpyHostToDevice_c() result(flag) &
++             bind(C, name="hipMemcpyHostToDeviceFromC")
++      use, intrinsic :: iso_c_binding
++      implicit none
++      integer(kind=c_int) :: flag
++    end function
++  end interface
++
++  interface
++    function hip_memcpyDeviceToHost_c() result(flag) &
++             bind(C, name="hipMemcpyDeviceToHostFromC")
++      use, intrinsic :: iso_c_binding
++      implicit none
++      integer(kind=c_int) :: flag
++    end function
++  end interface
++
++  interface
++    function hip_hostRegisterDefault_c() result(flag) &
++             bind(C, name="hipHostRegisterDefaultFromC")
++      use, intrinsic :: iso_c_binding
++      implicit none
++      integer(kind=c_int) :: flag
++    end function
++  end interface
++
++  interface
++    function hip_hostRegisterPortable_c() result(flag) &
++             bind(C, name="hipHostRegisterPortableFromC")
++      use, intrinsic :: iso_c_binding
++      implicit none
++      integer(kind=c_int) :: flag
++    end function
++  end interface
++
++  interface
++    function hip_hostRegisterMapped_c() result(flag) &
++             bind(C, name="hipHostRegisterMappedFromC")
++      use, intrinsic :: iso_c_binding
++      implicit none
++      integer(kind=c_int) :: flag
++    end function
++  end interface
++
++  interface
++    function hip_memcpy_intptr_c(dst, src, size, dir) result(istat) &
++             bind(C, name="hipMemcpyFromC")
++
++      use, intrinsic :: iso_c_binding
++
++      implicit none
++      integer(kind=C_intptr_t), value              :: dst
++      integer(kind=C_intptr_t), value              :: src
++      integer(kind=c_intptr_t), intent(in), value  :: size
++      integer(kind=C_INT), intent(in), value       :: dir
++      integer(kind=C_INT)                          :: istat
++
++    end function hip_memcpy_intptr_c
++  end interface
++
++  interface
++    function hip_memcpy_cptr_c(dst, src, size, dir) result(istat) &
++             bind(C, name="hipMemcpyFromC")
++
++      use, intrinsic :: iso_c_binding
++
++      implicit none
++      type(c_ptr), value                           :: dst
++      type(c_ptr), value                           :: src
++      integer(kind=c_intptr_t), intent(in), value  :: size
++      integer(kind=C_INT), intent(in), value       :: dir
++      integer(kind=C_INT)                          :: istat
++
++    end function hip_memcpy_cptr_c
++  end interface
++
++  interface
++    function hip_memcpy_mixed_c(dst, src, size, dir) result(istat) &
++             bind(C, name="hipMemcpyFromC")
++
++      use, intrinsic :: iso_c_binding
++
++      implicit none
++      type(c_ptr), value                           :: dst
++      integer(kind=c_intptr_t), value              :: src
++      integer(kind=c_intptr_t), intent(in), value  :: size
++      integer(kind=C_INT), intent(in), value       :: dir
++      integer(kind=C_INT)                          :: istat
++
++    end function hip_memcpy_mixed_c
++  end interface
++
++  ! functions to allocate and free HIP memory
++
++  interface
++    function hip_free_intptr_c(a) result(istat) &
++             bind(C, name="hipFreeFromC")
++
++      use, intrinsic :: iso_c_binding
++
++      implicit none
++      integer(kind=C_intptr_T), value  :: a
++      integer(kind=C_INT)              :: istat
++
++    end function hip_free_intptr_c
++  end interface
++
++  interface
++    function hip_free_cptr_c(a) result(istat) &
++             bind(C, name="hipFreeFromC")
++
++      use, intrinsic :: iso_c_binding
++
++      implicit none
++      type(c_ptr), value  :: a
++      integer(kind=C_INT)              :: istat
++
++    end function hip_free_cptr_c
++  end interface
++
++  interface hip_memcpy
++    module procedure hip_memcpy_intptr
++    module procedure hip_memcpy_cptr
++    module procedure hip_memcpy_mixed
++  end interface
++
++  interface hip_free
++    module procedure hip_free_intptr
++    module procedure hip_free_cptr
++  end interface
++
++  interface
++    function hip_malloc_intptr_c(a, width_height) result(istat) &
++             bind(C, name="hipMallocFromC")
++
++      use, intrinsic :: iso_c_binding
++      implicit none
++
++      integer(kind=C_intptr_T)                    :: a
++      integer(kind=c_intptr_t), intent(in), value   :: width_height
++      integer(kind=C_INT)                         :: istat
++
++    end function hip_malloc_intptr_c
++  end interface
++
++  interface
++    function hip_malloc_cptr_c(a, width_height) result(istat) &
++             bind(C, name="hipMallocFromC")
++
++      use, intrinsic :: iso_c_binding
++      implicit none
++
++      type(c_ptr)                    :: a
++      integer(kind=c_intptr_t), intent(in), value   :: width_height
++      integer(kind=C_INT)                         :: istat
++
++    end function hip_malloc_cptr_c
++  end interface
++
++  contains
++
++    function hip_setdevice(n) result(success)
++      use, intrinsic :: iso_c_binding
++
++      implicit none
++
++      integer(kind=ik), intent(in)  :: n
++      logical                       :: success
++#ifdef WITH_AMD_GPU_VERSION
++      success = hip_setdevice_c(int(n,kind=c_int)) /= 0
++#else
++      success = .true.
++#endif
++    end function hip_setdevice
++
++    ! functions to allocate and free memory
++
++    function hip_malloc_intptr(a, width_height) result(success)
++
++     use, intrinsic :: iso_c_binding
++     implicit none
++
++     integer(kind=C_intptr_t)                  :: a
++     integer(kind=c_intptr_t), intent(in)        :: width_height
++     logical                                   :: success
++#ifdef WITH_AMD_GPU_VERSION
++     success = hip_malloc_intptr_c(a, width_height) /= 0
++#else
++     success = .true.
++#endif
++   end function
++
++    function hip_malloc_cptr(a, width_height) result(success)
++
++     use, intrinsic :: iso_c_binding
++     implicit none
++
++     type(c_ptr)                  :: a
++     integer(kind=c_intptr_t), intent(in)        :: width_height
++     logical                                   :: success
++#ifdef WITH_AMD_GPU_VERSION
++     success = hip_malloc_cptr_c(a, width_height) /= 0
++#else
++     success = .true.
++#endif
++   end function
++
++   function hip_free_intptr(a) result(success)
++
++     use, intrinsic :: iso_c_binding
++
++     implicit none
++     integer(kind=C_intptr_T) :: a
++     logical                  :: success
++#ifdef WITH_AMD_GPU_VERSION
++     success = hip_free_intptr_c(a) /= 0
++#else
++     success = .true.
++#endif
++   end function hip_free_intptr
++
++   function hip_free_cptr(a) result(success)
++
++     use, intrinsic :: iso_c_binding
++
++     implicit none
++     type(c_ptr) :: a
++     logical                  :: success
++#ifdef WITH_AMD_GPU_VERSION
++     success = hip_free_cptr_c(a) /= 0
++#else
++     success = .true.
++#endif
++   end function hip_free_cptr
++
++ ! functions to copy HIP memory
++
++ function hip_memcpyDeviceToDevice() result(flag)
++   use, intrinsic :: iso_c_binding
++   implicit none
++   integer(kind=ik) :: flag
++#ifdef WITH_AMD_GPU_VERSION
++   flag = int(hip_memcpyDeviceToDevice_c())
++#else
++   flag = 0
++#endif
++ end function
++
++ function hip_memcpyHostToDevice() result(flag)
++   use, intrinsic :: iso_c_binding
++   use precision_for_tests
++   implicit none
++   integer(kind=ik) :: flag
++#ifdef WITH_AMD_GPU_VERSION
++   flag = int(hip_memcpyHostToDevice_c())
++#else
++   flag = 0
++#endif
++ end function
++
++ function hip_memcpyDeviceToHost() result(flag)
++   use, intrinsic :: iso_c_binding
++   use precision_for_tests
++   implicit none
++   integer(kind=ik) :: flag
++#ifdef WITH_AMD_GPU_VERSION
++   flag = int( hip_memcpyDeviceToHost_c())
++#else
++   flag = 0
++#endif
++ end function
++
++ function hip_hostRegisterDefault() result(flag)
++   use, intrinsic :: iso_c_binding
++   use precision_for_tests
++   implicit none
++   integer(kind=ik) :: flag
++#ifdef WITH_AMD_GPU_VERSION
++   flag = int(hip_hostRegisterDefault_c())
++#else
++   flag = 0
++#endif
++ end function
++
++ function hip_hostRegisterPortable() result(flag)
++   use, intrinsic :: iso_c_binding
++   use precision_for_tests
++   implicit none
++   integer(kind=ik) :: flag
++#ifdef WITH_AMD_GPU_VERSION
++   flag = int(hip_hostRegisterPortable_c())
++#else
++   flag = 0
++#endif
++ end function
++
++ function hip_hostRegisterMapped() result(flag)
++   use, intrinsic :: iso_c_binding
++   use precision_for_tests
++   implicit none
++   integer(kind=ik) :: flag
++#ifdef WITH_AMD_GPU_VERSION
++   flag = int(hip_hostRegisterMapped_c())
++#else
++   flag = 0
++#endif
++ end function
++
++ function hip_memcpy_intptr(dst, src, size, dir) result(success)
++
++      use, intrinsic :: iso_c_binding
++
++      implicit none
++      integer(kind=C_intptr_t)              :: dst
++      integer(kind=C_intptr_t)              :: src
++      integer(kind=c_intptr_t), intent(in)    :: size
++      integer(kind=C_INT), intent(in)       :: dir
++      logical :: success
++
++#ifdef WITH_AMD_GPU_VERSION
++        success = hip_memcpy_intptr_c(dst, src, size, dir) /= 0
++#else
++        success = .true.
++#endif
++    end function
++
++ function hip_memcpy_cptr(dst, src, size, dir) result(success)
++
++      use, intrinsic :: iso_c_binding
++
++      implicit none
++      type(c_ptr)                           :: dst
++      type(c_ptr)                           :: src
++      integer(kind=c_intptr_t), intent(in)  :: size
++      integer(kind=C_INT), intent(in)       :: dir
++      logical :: success
++
++#ifdef WITH_AMD_GPU_VERSION
++        success = hip_memcpy_cptr_c(dst, src, size, dir) /= 0
++#else
++        success = .true.
++#endif
++    end function
++
++ function hip_memcpy_mixed(dst, src, size, dir) result(success)
++
++      use, intrinsic :: iso_c_binding
++
++      implicit none
++      type(c_ptr)                           :: dst
++      integer(kind=c_intptr_t)              :: src
++      integer(kind=c_intptr_t), intent(in)  :: size
++      integer(kind=C_INT), intent(in)       :: dir
++      logical :: success
++
++#ifdef WITH_AMD_GPU_VERSION
++        success = hip_memcpy_mixed_c(dst, src, size, dir) /= 0
++#else
++        success = .true.
++#endif
++    end function
++
++end module test_hip_functions
+diff -ruN elpa-new_release_2021.11.001/examples/shared/GPU/ROCm/test_rocmFunctions.cpp elpa-new_release_2021.11.001_ok/examples/shared/GPU/ROCm/test_rocmFunctions.cpp
+--- elpa-new_release_2021.11.001/examples/shared/GPU/ROCm/test_rocmFunctions.cpp	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/shared/GPU/ROCm/test_rocmFunctions.cpp	2022-01-26 10:10:58.328343000 +0100
+@@ -0,0 +1,153 @@
++//
++//    Copyright 2021, A. Marek
++//
++//    This file is part of ELPA.
++//
++//    The ELPA library was originally created by the ELPA consortium,
++//    consisting of the following organizations:
++//
++//    - Max Planck Computing and Data Facility (MPCDF), formerly known as
++//      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
++//    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
++//      Informatik,
++//    - Technische Universität München, Lehrstuhl für Informatik mit
++//      Schwerpunkt Wissenschaftliches Rechnen ,
++//    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
++//    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
++//      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
++//      and
++//    - IBM Deutschland GmbH
++//
++//    This particular source code file contains additions, changes and
++//    enhancements authored by Intel Corporation which is not part of
++//    the ELPA consortium.
++//
++//    More information can be found here:
++//    http://elpa.mpcdf.mpg.de/
++//
++//    ELPA is free software: you can redistribute it and/or modify
++//    it under the terms of the version 3 of the license of the
++//    GNU Lesser General Public License as published by the Free
++//    Software Foundation.
++//
++//    ELPA is distributed in the hope that it will be useful,
++//    but WITHOUT ANY WARRANTY; without even the implied warranty of
++//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++//    GNU Lesser General Public License for more details.
++//
++//    You should have received a copy of the GNU Lesser General Public License
++//    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
++//
++//    ELPA reflects a substantial effort on the part of the original
++//    ELPA consortium, and we ask you to respect the spirit of the
++//    license that we chose: i.e., please contribute any changes you
++//    may have back to the original ELPA library distribution, and keep
++//    any derivatives of ELPA under the same license that we chose for
++//    the original distribution, the GNU Lesser General Public License.
++//
++//
++// --------------------------------------------------------------------------------------------------
++//
++// This file was written by A. Marek, MPCDF
++#include "config-f90.h"
++
++#include <stdio.h>
++#include <math.h>
++#include <stdio.h>
++
++#include <stdlib.h>
++#include <string.h>
++#include <time.h>
++#include <alloca.h>
++#include <stdint.h>
++#include <complex.h>
++#ifdef WITH_AMD_GPU_VERSION
++// headers for rocBLAS and the HIP runtime
++#include "rocblas.h"
++#include "hip/hip_runtime_api.h"
++#endif
++
++#define errormessage(x, ...) do { fprintf(stderr, "%s:%d " x, __FILE__, __LINE__, __VA_ARGS__ ); } while (0)
++
++#ifdef DEBUG_HIP
++#define debugmessage(x, ...) do { fprintf(stderr, "%s:%d " x, __FILE__, __LINE__, __VA_ARGS__ ); } while (0)
++#else
++#define debugmessage(x, ...)
++#endif
++
++#ifdef WITH_AMD_GPU_VERSION
++extern "C" {
++
++  int hipSetDeviceFromC(int n) {
++
++    hipError_t hiperr = hipSetDevice(n);
++    if (hiperr != hipSuccess) {
++      errormessage("Error in hipSetDevice: %s\n",hipGetErrorString(hiperr));
++      return 0;
++    }
++    return 1;
++  }
++
++  int hipMallocFromC(intptr_t *a, size_t width_height) {
++
++    hipError_t hiperr = hipMalloc((void **) a, width_height);
++#ifdef DEBUG_HIP
++    printf("HIP Malloc,  pointer address: %p, size: %d \n", *a, width_height);
++#endif
++    if (hiperr != hipSuccess) {
++      errormessage("Error in hipMalloc: %s\n",hipGetErrorString(hiperr));
++      return 0;
++    }
++    return 1;
++  }
++
++  int hipFreeFromC(intptr_t *a) {
++#ifdef DEBUG_HIP
++    printf("HIP Free, pointer address: %p \n", a);
++#endif
++    hipError_t hiperr = hipFree(a);
++
++    if (hiperr != hipSuccess) {
++      errormessage("Error in hipFree: %s\n",hipGetErrorString(hiperr));
++      return 0;
++    }
++    return 1;
++  }
++
++  int hipMemcpyFromC(intptr_t *dest, intptr_t *src, size_t count, int dir) {
++
++    hipError_t hiperr = hipMemcpy( dest, src, count, (hipMemcpyKind)dir);
++    if (hiperr != hipSuccess) {
++      errormessage("Error in hipMemcpy: %s\n",hipGetErrorString(hiperr));
++      return 0;
++    }
++    return 1;
++  }
++
++  int hipMemcpyDeviceToDeviceFromC(void) {
++      int val = (int)hipMemcpyDeviceToDevice;
++      return val;
++  }
++  int hipMemcpyHostToDeviceFromC(void) {
++      int val = (int)hipMemcpyHostToDevice;
++      return val;
++  }
++  int hipMemcpyDeviceToHostFromC(void) {
++      int val = (int)hipMemcpyDeviceToHost;
++      return val;
++  }
++  int hipHostRegisterDefaultFromC(void) {
++      int val = (int)hipHostRegisterDefault;
++      return val;
++  }
++  int hipHostRegisterPortableFromC(void) {
++      int val = (int)hipHostRegisterPortable;
++      return val;
++  }
++  int hipHostRegisterMappedFromC(void) {
++      int val = (int)hipHostRegisterMapped;
++      return val;
++  }
++
++}
++#endif /* WITH_AMD_GPU_VERSION */
+diff -ruN elpa-new_release_2021.11.001/examples/shared/GPU/test_gpu_vendor_agnostic_layer.F90 elpa-new_release_2021.11.001_ok/examples/shared/GPU/test_gpu_vendor_agnostic_layer.F90
+--- elpa-new_release_2021.11.001/examples/shared/GPU/test_gpu_vendor_agnostic_layer.F90	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/shared/GPU/test_gpu_vendor_agnostic_layer.F90	2022-01-26 10:10:58.329439000 +0100
+@@ -0,0 +1,357 @@
++#if 0
++!    Copyright 2021, A. Marek, MPCDF
++!
++!    This file is part of ELPA.
++!
++!    The ELPA library was originally created by the ELPA consortium,
++!    consisting of the following organizations:
++!
++!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
++!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
++!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
++!      Informatik,
++!    - Technische Universität München, Lehrstuhl für Informatik mit
++!      Schwerpunkt Wissenschaftliches Rechnen ,
++!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
++!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
++!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
++!      and
++!    - IBM Deutschland GmbH
++!
++!    This particular source code file contains additions, changes and
++!    enhancements authored by Intel Corporation which is not part of
++!    the ELPA consortium.
++!
++!    More information can be found here:
++!    http://elpa.mpcdf.mpg.de/
++!
++!    ELPA is free software: you can redistribute it and/or modify
++!    it under the terms of the version 3 of the license of the
++!    GNU Lesser General Public License as published by the Free
++!    Software Foundation.
++!
++!    ELPA is distributed in the hope that it will be useful,
++!    but WITHOUT ANY WARRANTY; without even the implied warranty of
++!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++!    GNU Lesser General Public License for more details.
++!
++!    You should have received a copy of the GNU Lesser General Public License
++!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
++!
++!    ELPA reflects a substantial effort on the part of the original
++!    ELPA consortium, and we ask you to respect the spirit of the
++!    license that we chose: i.e., please contribute any changes you
++!    may have back to the original ELPA library distribution, and keep
++!    any derivatives of ELPA under the same license that we chose for
++!    the original distribution, the GNU Lesser General Public License.
++#endif
++
++
++#include "config-f90.h"
++module test_gpu
++  !use precision_for_tests
++  use precision_for_tests
++  use iso_c_binding
++!#if TEST_INTEL_GPU == 1
++!  use mkl_offload
++!#endif
++  integer(kind=c_int), parameter :: nvidia_gpu = 1
++  integer(kind=c_int), parameter :: amd_gpu = 2
++  integer(kind=c_int), parameter :: intel_gpu = 3
++  integer(kind=c_int), parameter :: no_gpu = -1
++  integer(kind=c_int)            :: use_gpu_vendor
++  integer(kind=c_int)            :: gpuHostRegisterDefault    
++  integer(kind=c_int)            :: gpuMemcpyHostToDevice    
++  integer(kind=c_int)            :: gpuMemcpyDeviceToHost   
++  integer(kind=c_int)            :: gpuMemcpyDeviceToDevice
++  integer(kind=c_int)            :: gpuHostRegisterMapped
++  integer(kind=c_int)            :: gpuHostRegisterPortable
++
++  integer(kind=c_intptr_t), parameter :: size_of_double_real    = 8_rk8
++#ifdef WANT_SINGLE_PRECISION_REAL
++  integer(kind=c_intptr_t), parameter :: size_of_single_real    = 4_rk4
++#endif
++
++  integer(kind=c_intptr_t), parameter :: size_of_double_complex = 16_ck8
++#ifdef WANT_SINGLE_PRECISION_COMPLEX
++  integer(kind=c_intptr_t), parameter :: size_of_single_complex = 8_ck4
++#endif
++
++  interface gpu_memcpy
++    module procedure gpu_memcpy_intptr
++    module procedure gpu_memcpy_cptr
++    module procedure gpu_memcpy_mixed
++  end interface
++
++  interface gpu_malloc
++    module procedure gpu_malloc_intptr
++    module procedure gpu_malloc_cptr
++  end interface
++
++  interface gpu_free
++    module procedure gpu_free_intptr
++    module procedure gpu_free_cptr
++  end interface
++
++  contains
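++    ! Record which GPU backend the tests should use. The selected vendor is
++    ! stored in the module variable use_gpu_vendor, on which all gpu_*
++    ! wrappers below dispatch to the CUDA or HIP implementation.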
++    function gpu_vendor(set_vendor) result(vendor)
++      use precision_for_tests
++      implicit none
++      integer(kind=c_int)             :: vendor
++      integer(kind=c_int), intent(in) :: set_vendor
++      ! default
++      vendor = no_gpu
++      if (set_vendor == nvidia_gpu) then
++        vendor = nvidia_gpu
++      endif
++      if (set_vendor == amd_gpu) then
++        vendor = amd_gpu
++      endif
++!#if TEST_INTEL_GPU == 1
++!      vendor = intel_gpu
++!#endif
++      use_gpu_vendor = vendor
++      return
++    end function
++
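++    ! Copy the vendor-specific runtime constants (memcpy directions and
++    ! host-register flags) into the vendor-agnostic gpu* module variables,
++    ! so that test code can use a single set of names for both backends.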
++    subroutine set_gpu_parameters
++#ifdef WITH_NVIDIA_GPU_VERSION
++      use test_cuda_functions
++#endif
++#ifdef WITH_AMD_GPU_VERSION
++      use test_hip_functions
++#endif
++      implicit none
++
++#ifdef WITH_NVIDIA_GPU_VERSION
++      if (use_gpu_vendor == nvidia_gpu) then
++        cudaMemcpyHostToDevice   = cuda_memcpyHostToDevice()
++        gpuMemcpyHostToDevice    = cudaMemcpyHostToDevice
++        cudaMemcpyDeviceToHost   = cuda_memcpyDeviceToHost()
++        gpuMemcpyDeviceToHost    = cudaMemcpyDeviceToHost
++        cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
++        gpuMemcpyDeviceToDevice  = cudaMemcpyDeviceToDevice
++        cudaHostRegisterPortable = cuda_hostRegisterPortable()
++        gpuHostRegisterPortable  = cudaHostRegisterPortable
++        cudaHostRegisterMapped   = cuda_hostRegisterMapped()
++        gpuHostRegisterMapped    = cudaHostRegisterMapped
++        cudaHostRegisterDefault  = cuda_hostRegisterDefault()
++        gpuHostRegisterDefault   = cudaHostRegisterDefault
++      endif
++#endif
++
++#ifdef WITH_AMD_GPU_VERSION
++      if (use_gpu_vendor == amd_gpu) then
++        hipMemcpyHostToDevice   = hip_memcpyHostToDevice()
++        gpuMemcpyHostToDevice   = hipMemcpyHostToDevice
++        hipMemcpyDeviceToHost   = hip_memcpyDeviceToHost()
++        gpuMemcpyDeviceToHost   = hipMemcpyDeviceToHost
++        hipMemcpyDeviceToDevice = hip_memcpyDeviceToDevice()
++        gpuMemcpyDeviceToDevice = hipMemcpyDeviceToDevice
++        hipHostRegisterPortable = hip_hostRegisterPortable()
++        gpuHostRegisterPortable = hipHostRegisterPortable
++        hipHostRegisterMapped   = hip_hostRegisterMapped()
++        gpuHostRegisterMapped   = hipHostRegisterMapped
++        hipHostRegisterDefault  = hip_hostRegisterDefault()
++        gpuHostRegisterDefault  = hipHostRegisterDefault
++      endif
++#endif
++
++    end subroutine
++
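++    ! Each wrapper below dispatches on use_gpu_vendor to the corresponding
++    ! CUDA or HIP test function and returns .false. if no GPU backend matches.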
++    function gpu_malloc_intptr(array, elements) result(success)
++      use, intrinsic :: iso_c_binding
++#ifdef WITH_NVIDIA_GPU_VERSION
++      use test_cuda_functions
++#endif
++#ifdef WITH_AMD_GPU_VERSION
++      use test_hip_functions
++#endif
++      implicit none
++      integer(kind=C_intptr_T)             :: array
++      integer(kind=c_intptr_t), intent(in) :: elements
++      logical                              :: success
++      success = .false.   ! safe default in case no GPU backend matches
++
++#ifdef WITH_NVIDIA_GPU_VERSION
++      if (use_gpu_vendor == nvidia_gpu) then
++        success = cuda_malloc_intptr(array, elements)
++      endif
++#endif
++#ifdef WITH_AMD_GPU_VERSION
++      if (use_gpu_vendor == amd_gpu) then
++        success = hip_malloc_intptr(array, elements)
++      endif
++#endif
++
++    end function
++
++    function gpu_malloc_cptr(array, elements) result(success)
++      use, intrinsic :: iso_c_binding
++#ifdef WITH_NVIDIA_GPU_VERSION
++      use test_cuda_functions
++#endif
++#ifdef WITH_AMD_GPU_VERSION
++      use test_hip_functions
++#endif
++      implicit none
++      type(c_ptr)                          :: array
++      integer(kind=c_intptr_t), intent(in) :: elements
++      logical                              :: success
++      success = .false.
++
++#ifdef WITH_NVIDIA_GPU_VERSION
++      if (use_gpu_vendor == nvidia_gpu) then
++        success = cuda_malloc_cptr(array, elements)
++      endif
++#endif
++
++#ifdef WITH_AMD_GPU_VERSION
++      if (use_gpu_vendor == amd_gpu) then
++        success = hip_malloc_cptr(array, elements)
++      endif
++#endif
++
++    end function
++
++    function gpu_memcpy_intptr(dst, src, size, dir) result(success)
++      use, intrinsic :: iso_c_binding
++#ifdef WITH_NVIDIA_GPU_VERSION
++      use test_cuda_functions
++#endif
++#ifdef WITH_AMD_GPU_VERSION
++      use test_hip_functions
++#endif
++      implicit none
++      integer(kind=C_intptr_t)              :: dst
++      integer(kind=C_intptr_t)              :: src
++      integer(kind=c_intptr_t), intent(in)  :: size
++      integer(kind=C_INT), intent(in)       :: dir
++      logical :: success
++      success = .false.   ! safe default in case no GPU backend matches
++
++#ifdef WITH_NVIDIA_GPU_VERSION
++      if (use_gpu_vendor == nvidia_gpu) then
++        success = cuda_memcpy_intptr(dst, src, size, dir)
++      endif
++#endif
++
++#ifdef WITH_AMD_GPU_VERSION
++      if (use_gpu_vendor == amd_gpu) then
++        success = hip_memcpy_intptr(dst, src, size, dir)
++      endif
++#endif
++
++    end function
++    
++    function gpu_memcpy_cptr(dst, src, size, dir) result(success)
++      use, intrinsic :: iso_c_binding
++#ifdef WITH_NVIDIA_GPU_VERSION
++      use test_cuda_functions
++#endif
++#ifdef WITH_AMD_GPU_VERSION
++      use test_hip_functions
++#endif
++      implicit none
++      type(c_ptr)                           :: dst
++      type(c_ptr)                           :: src
++      integer(kind=c_intptr_t), intent(in)  :: size
++      integer(kind=C_INT), intent(in)       :: dir
++      logical :: success
++      success = .false.   ! safe default in case no GPU backend matches
++
++#ifdef WITH_NVIDIA_GPU_VERSION
++      if (use_gpu_vendor == nvidia_gpu) then
++        success = cuda_memcpy_cptr(dst, src, size, dir)
++      endif
++#endif
++
++#ifdef WITH_AMD_GPU_VERSION
++      if (use_gpu_vendor == amd_gpu) then
++        success = hip_memcpy_cptr(dst, src, size, dir)
++      endif
++#endif
++
++    end function
++    
++    function gpu_memcpy_mixed(dst, src, size, dir) result(success)
++      use, intrinsic :: iso_c_binding
++#ifdef WITH_NVIDIA_GPU_VERSION
++      use test_cuda_functions
++#endif
++#ifdef WITH_AMD_GPU_VERSION
++      use test_hip_functions
++#endif
++      implicit none
++      type(c_ptr)                           :: dst
++      integer(kind=C_intptr_t)              :: src
++      integer(kind=c_intptr_t), intent(in)  :: size
++      integer(kind=C_INT), intent(in)       :: dir
++      logical :: success
++      success = .false.   ! safe default in case no GPU backend matches
++
++#ifdef WITH_NVIDIA_GPU_VERSION
++      if (use_gpu_vendor == nvidia_gpu) then
++        success = cuda_memcpy_mixed(dst, src, size, dir)
++      endif
++#endif
++
++#ifdef WITH_AMD_GPU_VERSION
++      if (use_gpu_vendor == amd_gpu) then
++        success = hip_memcpy_mixed(dst, src, size, dir)
++      endif
++#endif
++
++    end function
++
++    function gpu_free_intptr(a) result(success)
++      use, intrinsic :: iso_c_binding
++#ifdef WITH_NVIDIA_GPU_VERSION
++      use test_cuda_functions
++#endif
++#ifdef WITH_AMD_GPU_VERSION
++      use test_hip_functions
++#endif
++      implicit none
++      integer(kind=c_intptr_t)                :: a
++
++      logical :: success
++      success = .false.   ! safe default in case no GPU backend matches
++
++#ifdef WITH_NVIDIA_GPU_VERSION
++      if (use_gpu_vendor == nvidia_gpu) then
++        success = cuda_free_intptr(a)
++      endif
++#endif
++
++#ifdef WITH_AMD_GPU_VERSION
++      if (use_gpu_vendor == amd_gpu) then
++        success = hip_free_intptr(a)
++      endif
++#endif
++
++    end function
++
++    function gpu_free_cptr(a) result(success)
++      use, intrinsic :: iso_c_binding
++#ifdef WITH_NVIDIA_GPU_VERSION
++      use test_cuda_functions
++#endif
++#ifdef WITH_AMD_GPU_VERSION
++      use test_hip_functions
++#endif
++      implicit none
++      type(c_ptr)                :: a
++
++      logical :: success
++      success = .false.   ! safe default in case no GPU backend matches
++
++#ifdef WITH_NVIDIA_GPU_VERSION
++      if (use_gpu_vendor == nvidia_gpu) then
++        success = cuda_free_cptr(a)
++      endif
++#endif
++
++#ifdef WITH_AMD_GPU_VERSION
++      if (use_gpu_vendor == amd_gpu) then
++        success = hip_free_cptr(a)
++      endif
++#endif
++
++    end function
++
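++    ! A minimal usage sketch of this layer (not part of the tests; the
++    ! variables a_dev and host_buf are hypothetical, and an NVIDIA build is
++    ! assumed):
++    !
++    !   vendor = gpu_vendor(nvidia_gpu)
++    !   call set_gpu_parameters()
++    !   ok = gpu_malloc(a_dev, na * size_of_double_real)
++    !   ok = gpu_memcpy(a_dev, host_buf, na * size_of_double_real, &
++    !                   gpuMemcpyHostToDevice)
++    !   ok = gpu_free(a_dev)
++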
++end module 
+diff -ruN elpa-new_release_2021.11.001/examples/shared/mod_tests_blas_interfaces.F90 elpa-new_release_2021.11.001_ok/examples/shared/mod_tests_blas_interfaces.F90
+--- elpa-new_release_2021.11.001/examples/shared/mod_tests_blas_interfaces.F90	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/shared/mod_tests_blas_interfaces.F90	2022-01-26 10:10:58.330923000 +0100
+@@ -0,0 +1,53 @@
++!    This file is part of ELPA.
++!
++!    The ELPA library was originally created by the ELPA consortium,
++!    consisting of the following organizations:
++!
++!    - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
++!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
++!      Informatik,
++!    - Technische Universität München, Lehrstuhl für Informatik mit
++!      Schwerpunkt Wissenschaftliches Rechnen ,
++!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
++!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
++!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
++!      and
++!    - IBM Deutschland GmbH
++!
++!
++!    More information can be found here:
++!    https://elpa.mpcdf.mpg.de/
++!
++!    ELPA is free software: you can redistribute it and/or modify
++!    it under the terms of the version 3 of the license of the
++!    GNU Lesser General Public License as published by the Free
++!    Software Foundation.
++!
++!    ELPA is distributed in the hope that it will be useful,
++!    but WITHOUT ANY WARRANTY; without even the implied warranty of
++!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++!    GNU Lesser General Public License for more details.
++!
++!    You should have received a copy of the GNU Lesser General Public License
++!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
++!
++!    ELPA reflects a substantial effort on the part of the original
++!    ELPA consortium, and we ask you to respect the spirit of the
++!    license that we chose: i.e., please contribute any changes you
++!    may have back to the original ELPA library distribution, and keep
++!    any derivatives of ELPA under the same license that we chose for
++!    the original distribution, the GNU Lesser General Public License.
++!
++! This file was written by A. Marek, MPCDF
++
++#include "config-f90.h"
++#define PRECISION_MODULE precision_for_tests
++module tests_blas_interfaces
++  use iso_c_binding
++  use precision_for_tests
++
++  implicit none
++
++#include "../../src/helpers/fortran_blas_interfaces.F90"
++
++end module
+diff -ruN elpa-new_release_2021.11.001/examples/shared/mod_tests_scalapack_interfaces.F90 elpa-new_release_2021.11.001_ok/examples/shared/mod_tests_scalapack_interfaces.F90
+--- elpa-new_release_2021.11.001/examples/shared/mod_tests_scalapack_interfaces.F90	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/shared/mod_tests_scalapack_interfaces.F90	2022-01-26 10:10:58.331765000 +0100
+@@ -0,0 +1,56 @@
++!    This file is part of ELPA.
++!
++!    The ELPA library was originally created by the ELPA consortium,
++!    consisting of the following organizations:
++!
++!    - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
++!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
++!      Informatik,
++!    - Technische Universität München, Lehrstuhl für Informatik mit
++!      Schwerpunkt Wissenschaftliches Rechnen ,
++!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
++!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
++!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
++!      and
++!    - IBM Deutschland GmbH
++!
++!
++!    More information can be found here:
++!    https://elpa.mpcdf.mpg.de/
++!
++!    ELPA is free software: you can redistribute it and/or modify
++!    it under the terms of the version 3 of the license of the
++!    GNU Lesser General Public License as published by the Free
++!    Software Foundation.
++!
++!    ELPA is distributed in the hope that it will be useful,
++!    but WITHOUT ANY WARRANTY; without even the implied warranty of
++!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++!    GNU Lesser General Public License for more details.
++!
++!    You should have received a copy of the GNU Lesser General Public License
++!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
++!
++!    ELPA reflects a substantial effort on the part of the original
++!    ELPA consortium, and we ask you to respect the spirit of the
++!    license that we chose: i.e., please contribute any changes you
++!    may have back to the original ELPA library distribution, and keep
++!    any derivatives of ELPA under the same license that we chose for
++!    the original distribution, the GNU Lesser General Public License.
++!
++! This file was written by A. Marek, MPCDF
++
++
++#include "config-f90.h"
++#define PRECISION_MODULE precision_for_tests
++module tests_scalapack_interfaces
++  use iso_c_binding
++  use precision_for_tests
++
++  implicit none
++
++#include "../../src/helpers/fortran_scalapack_interfaces.F90"
++
++end module
++
++
+diff -ruN elpa-new_release_2021.11.001/examples/shared/test_analytic.F90 elpa-new_release_2021.11.001_ok/examples/shared/test_analytic.F90
+--- elpa-new_release_2021.11.001/examples/shared/test_analytic.F90	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/shared/test_analytic.F90	2022-01-26 10:10:58.332553000 +0100
+@@ -0,0 +1,204 @@
++! (c) Copyright Pavel Kus, 2017, MPCDF
++!
++!    This file is part of ELPA.
++!
++!    The ELPA library was originally created by the ELPA consortium,
++!    consisting of the following organizations:
++!
++!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
++!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
++!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
++!      Informatik,
++!    - Technische Universität München, Lehrstuhl für Informatik mit
++!      Schwerpunkt Wissenschaftliches Rechnen ,
++!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
++!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
++!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
++!      and
++!    - IBM Deutschland GmbH
++!
++!
++!    More information can be found here:
++!    http://elpa.mpcdf.mpg.de/
++!
++!    ELPA is free software: you can redistribute it and/or modify
++!    it under the terms of the version 3 of the license of the
++!    GNU Lesser General Public License as published by the Free
++!    Software Foundation.
++!
++!    ELPA is distributed in the hope that it will be useful,
++!    but WITHOUT ANY WARRANTY; without even the implied warranty of
++!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++!    GNU Lesser General Public License for more details.
++!
++!    You should have received a copy of the GNU Lesser General Public License
++!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
++!
++!    ELPA reflects a substantial effort on the part of the original
++!    ELPA consortium, and we ask you to respect the spirit of the
++!    license that we chose: i.e., please contribute any changes you
++!    may have back to the original ELPA library distribution, and keep
++!    any derivatives of ELPA under the same license that we chose for
++!    the original distribution, the GNU Lesser General Public License.
++
++#include "../Fortran/assert.h"
++#include "config-f90.h"
++
++#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT
++#define TEST_INT_TYPE integer(kind=c_int64_t)
++#define INT_TYPE c_int64_t
++#else
++#define TEST_INT_TYPE integer(kind=c_int32_t)
++#define INT_TYPE c_int32_t
++#endif
++#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT
++#define TEST_INT_MPI_TYPE integer(kind=c_int64_t)
++#define INT_MPI_TYPE c_int64_t
++#else
++#define TEST_INT_MPI_TYPE integer(kind=c_int32_t)
++#define INT_MPI_TYPE c_int32_t
++#endif
++
++module test_analytic
++
++  use test_util
++#ifdef HAVE_DETAILED_TIMINGS
++  use ftimings
++#else
++  use timings_dummy
++#endif
++  use precision_for_tests
++
++  interface prepare_matrix_analytic
++    module procedure prepare_matrix_analytic_complex_double
++    module procedure prepare_matrix_analytic_real_double
++#ifdef WANT_SINGLE_PRECISION_REAL
++    module procedure prepare_matrix_analytic_real_single
++#endif
++#ifdef WANT_SINGLE_PRECISION_COMPLEX
++    module procedure prepare_matrix_analytic_complex_single
++#endif
++  end interface
++
++  interface check_correctness_analytic
++    module procedure check_correctness_analytic_complex_double
++    module procedure check_correctness_analytic_real_double
++#ifdef WANT_SINGLE_PRECISION_REAL
++    module procedure check_correctness_analytic_real_single
++#endif
++#ifdef WANT_SINGLE_PRECISION_COMPLEX
++    module procedure check_correctness_analytic_complex_single
++#endif
++  end interface
++
++
++  interface print_matrix
++    module procedure print_matrix_complex_double
++    module procedure print_matrix_real_double
++#ifdef WANT_SINGLE_PRECISION_REAL
++    module procedure print_matrix_real_single
++#endif
++#ifdef WANT_SINGLE_PRECISION_COMPLEX
++    module procedure print_matrix_complex_single
++#endif
++  end interface
++
++  TEST_INT_TYPE, parameter, private  :: num_primes = 3
++#ifdef BUILD_FUGAKU
++  TEST_INT_TYPE, private  :: primes(num_primes)
++#else
++  TEST_INT_TYPE, parameter, private  :: primes(num_primes) = (/2,3,5/)
++#endif
++
++  TEST_INT_TYPE, parameter, private  :: ANALYTIC_MATRIX = 0
++  TEST_INT_TYPE, parameter, private  :: ANALYTIC_EIGENVECTORS = 1
++  TEST_INT_TYPE, parameter, private  :: ANALYTIC_EIGENVALUES = 2
++
++  contains
++
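++  ! Decompose num into powers of the primes 2, 3 and 5: on success,
++  ! num == 2**decomposition(1) * 3**decomposition(2) * 5**decomposition(3);
++  ! returns .false. if num contains any other prime factor.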
++  function decompose(num, decomposition) result(possible)
++    implicit none
++    TEST_INT_TYPE, intent(in)   :: num
++    TEST_INT_TYPE, intent(out)  :: decomposition(num_primes)
++    logical                        :: possible
++    TEST_INT_TYPE               :: remainder, prime, prime_id
++
++#ifdef BUILD_FUGAKU
++    primes(1) = 2
++    primes(2) = 3
++    primes(3) = 5
++#endif
++    decomposition = 0
++    possible = .true.
++    remainder = num
++    do prime_id = 1, num_primes
++      prime = primes(prime_id)
++      do while (MOD(remainder, prime) == 0)
++        decomposition(prime_id) = decomposition(prime_id) + 1
++        remainder = remainder / prime
++      end do
++    end do
++    if(remainder > 1) then
++      possible = .false.
++    end if
++  end function
++
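++  ! Inverse of decompose: rebuild the number from its prime-power exponents.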
++  function compose(decomposition) result(num)
++    implicit none
++    TEST_INT_TYPE, intent(in)   :: decomposition(num_primes)
++    TEST_INT_TYPE               :: num, prime_id
++
++    num = 1
++#ifdef BUILD_FUGAKU
++    primes(1) = 2
++    primes(2) = 3
++    primes(3) = 5
++#endif
++    do prime_id = 1, num_primes
++      num = num * primes(prime_id) ** decomposition(prime_id)
++    end do
++  end function
++
++
++#include "../../src/general/prow_pcol.F90"
++#include "../../src/general/map_global_to_local.F90"
++
++
++#define COMPLEXCASE 1
++#define DOUBLE_PRECISION 1
++#include "../../src/general/precision_macros.h"
++#include "test_analytic_template.F90"
++#undef DOUBLE_PRECISION
++#undef COMPLEXCASE
++
++#ifdef WANT_SINGLE_PRECISION_COMPLEX
++
++#define COMPLEXCASE 1
++#define SINGLE_PRECISION 1
++#include "../../src/general/precision_macros.h"
++#include "test_analytic_template.F90"
++#undef SINGLE_PRECISION
++#undef COMPLEXCASE
++
++#endif /* WANT_SINGLE_PRECISION_COMPLEX */
++
++#define REALCASE 1
++#define DOUBLE_PRECISION 1
++#include "../../src/general/precision_macros.h"
++#include "test_analytic_template.F90"
++#undef DOUBLE_PRECISION
++#undef REALCASE
++
++#ifdef WANT_SINGLE_PRECISION_REAL
++
++#define REALCASE 1
++#define SINGLE_PRECISION 1
++#include "../../src/general/precision_macros.h"
++#include "test_analytic_template.F90"
++#undef SINGLE_PRECISION
++#undef REALCASE
++
++#endif /* WANT_SINGLE_PRECISION_REAL */
++
++
++end module
+diff -ruN elpa-new_release_2021.11.001/examples/shared/test_analytic_template.F90 elpa-new_release_2021.11.001_ok/examples/shared/test_analytic_template.F90
+--- elpa-new_release_2021.11.001/examples/shared/test_analytic_template.F90	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/shared/test_analytic_template.F90	2022-01-26 10:10:58.333552000 +0100
+@@ -0,0 +1,701 @@
++!  (c) Copyright Pavel Kus, 2017, MPCDF
++!
++!    This file is part of ELPA.
++!
++!    The ELPA library was originally created by the ELPA consortium,
++!    consisting of the following organizations:
++!
++!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
++!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
++!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
++!      Informatik,
++!    - Technische Universität München, Lehrstuhl für Informatik mit
++!      Schwerpunkt Wissenschaftliches Rechnen ,
++!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
++!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
++!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
++!      and
++!    - IBM Deutschland GmbH
++!
++!
++!    More information can be found here:
++!    http://elpa.mpcdf.mpg.de/
++!
++!    ELPA is free software: you can redistribute it and/or modify
++!    it under the terms of the version 3 of the license of the
++!    GNU Lesser General Public License as published by the Free
++!    Software Foundation.
++!
++!    ELPA is distributed in the hope that it will be useful,
++!    but WITHOUT ANY WARRANTY; without even the implied warranty of
++!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++!    GNU Lesser General Public License for more details.
++!
++!    You should have received a copy of the GNU Lesser General Public License
++!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
++!
++!    ELPA reflects a substantial effort on the part of the original
++!    ELPA consortium, and we ask you to respect the spirit of the
++!    license that we chose: i.e., please contribute any changes you
++!    may have back to the original ELPA library distribution, and keep
++!    any derivatives of ELPA under the same license that we chose for
++!    the original distribution, the GNU Lesser General Public License.
++
++#include "config-f90.h"
++
++#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT
++#define TEST_INT_TYPE integer(kind=c_int64_t)
++#define INT_TYPE c_int64_t
++#else
++#define TEST_INT_TYPE integer(kind=c_int32_t)
++#define INT_TYPE c_int32_t
++#endif
++#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT
++#define TEST_INT_MPI_TYPE integer(kind=c_int64_t)
++#define INT_MPI_TYPE c_int64_t
++#else
++#define TEST_INT_MPI_TYPE integer(kind=c_int32_t)
++#define INT_MPI_TYPE c_int32_t
++#endif
++
++
++  subroutine prepare_matrix_analytic_&
++    &MATH_DATATYPE&
++    &_&
++    &PRECISION&
++    &(na, a, nblk, myid, np_rows, np_cols, my_prow, my_pcol, print_times)
++    use precision_for_tests
++
++    implicit none
++    TEST_INT_TYPE, intent(in)                       :: na, nblk, myid, np_rows, np_cols, my_prow, my_pcol
++    MATH_DATATYPE(kind=REAL_DATATYPE), intent(inout):: a(:,:)
++    logical, optional                               :: print_times
++    logical                                         :: print_timer
++    TEST_INT_TYPE                                   :: globI, globJ, locI, locJ, pi, pj, levels(num_primes)
++    integer(kind=c_int)                             :: loc_I, loc_J, p_i, p_j
++#ifdef HAVE_DETAILED_TIMINGS
++    type(timer_t)                                   :: timer
++#else
++    type(timer_dummy_t)                             :: timer
++#endif
++
++    call timer%enable()
++    call timer%start("prepare_matrix_analytic")
++
++    print_timer = .true.
++
++    if (present(print_times)) then
++      print_timer = print_times
++    endif
++
++    ! for debugging only; this should eventually be done systematically, e.g. via unit tests
++    call check_module_sanity_&
++            &MATH_DATATYPE&
++            &_&
++            &PRECISION&
++            &(myid)
++
++    if(.not. decompose(na, levels)) then
++      if(myid == 0) then
++        print *, "Analytic test can be run only with matrix sizes of the form 2^n * 3^m * 5^o"
++        stop 1
++      end if
++    end if
++
++    call timer%start("loop")
++    do globI = 1, na
++
++      p_i = prow(int(globI,kind=c_int), int(nblk,kind=c_int), int(np_rows,kind=c_int))
++      pi = int(p_i,kind=INT_TYPE)
++      if (my_prow .ne. pi) cycle
++
++      do globJ = 1, na
++
++        p_j = pcol(int(globJ,kind=c_int), int(nblk,kind=c_int), int(np_cols,kind=c_int))
++        pj = int(p_j,kind=INT_TYPE)
++        if (my_pcol .ne. pj) cycle
++
++        if(map_global_array_index_to_local_index(int(globI,kind=c_int), int(globJ,kind=c_int), loc_I, loc_J, &
++                 int(nblk,kind=c_int), int(np_rows,kind=c_int), int(np_cols,kind=c_int), &
++                 int(my_prow,kind=c_int), int(my_pcol,kind=c_int) )) then
++           locI = int(loc_i,kind=INT_TYPE)      
++           locJ = int(loc_j,kind=INT_TYPE)      
++           call timer%start("evaluation")
++           a(locI, locJ) = analytic_matrix_&
++              &MATH_DATATYPE&
++              &_&
++              &PRECISION&
++              &(na, globI, globJ)
++          call timer%stop("evaluation")
++        else
++          print *, "Warning ... error in preparation loop of the analytic test"
++        end if
++      end do
++    end do
++    call timer%stop("loop")
++
++    call timer%stop("prepare_matrix_analytic")
++    if(myid == 0 .and. print_timer) then
++      call timer%print("prepare_matrix_analytic")
++    end if
++    call timer%free()
++  end subroutine
++
++  function check_correctness_analytic_&
++    &MATH_DATATYPE&
++    &_&
++    &PRECISION&
++    &(na, nev, ev, z, nblk, myid, np_rows, np_cols, my_prow, my_pcol, check_all_evals, &
++      check_eigenvectors, print_times) result(status)
++    use precision_for_tests
++    
++    implicit none
++#include "./test_precision_kinds.F90"
++    TEST_INT_TYPE, intent(in)              :: na, nev, nblk, myid, np_rows, &
++                                              np_cols, my_prow, my_pcol
++    TEST_INT_TYPE                          :: status
++    TEST_INT_MPI_TYPE                      :: mpierr
++    MATH_DATATYPE(kind=rck), intent(inout) :: z(:,:)
++    real(kind=rk), intent(inout)           :: ev(:)
++    logical, intent(in)                    :: check_all_evals, check_eigenvectors
++
++    TEST_INT_TYPE                          :: globI, globJ, locI, locJ, &
++                                              levels(num_primes)
++    integer(kind=c_int)                    :: loc_I, loc_J
++    real(kind=rk)                          :: diff, max_z_diff, max_ev_diff, &
++                                              glob_max_z_diff, max_curr_z_diff
++#ifdef DOUBLE_PRECISION
++    real(kind=rk), parameter               :: tol_eigenvalues = 5e-14_rk8
++    real(kind=rk), parameter               :: tol_eigenvectors = 6e-11_rk8
++#endif
++#ifdef SINGLE_PRECISION
++    ! tolerance needs to be very high due to qr tests
++    ! it should be distinguished somehow!
++    real(kind=rk), parameter               :: tol_eigenvalues = 7e-6_rk4
++    real(kind=rk), parameter               :: tol_eigenvectors = 4e-3_rk4
++#endif
++    real(kind=rk)                          :: computed_ev, expected_ev
++    MATH_DATATYPE(kind=rck)                :: computed_z,  expected_z
++
++    MATH_DATATYPE(kind=rck)                :: max_value_for_normalization, &
++                                              computed_z_on_max_position,  &
++                                              normalization_quotient
++    MATH_DATATYPE(kind=rck)                :: max_values_array(np_rows * np_cols), &
++                                              corresponding_exact_value
++    integer(kind=c_int)                    :: max_value_idx, rank_with_max, &
++                                              rank_with_max_reduced,        &
++                                              num_checked_evals
++    integer(kind=c_int)                    :: max_idx_array(np_rows * np_cols), &
++                                              rank
++    logical, optional                      :: print_times
++    logical                                :: print_timer
++
++#ifdef HAVE_DETAILED_TIMINGS
++    type(timer_t)    :: timer
++#else
++    type(timer_dummy_t)    :: timer
++#endif
++
++    call timer%enable()
++    call timer%start("check_correctness_analytic")
++
++
++    print_timer = .true.
++    if (present(print_times)) then
++      print_timer = print_times
++    endif
++
++    if(.not. decompose(na, levels)) then
++      print *, "can not decomopse matrix size"
++      stop 1
++    end if
++
++    if(check_all_evals) then
++        num_checked_evals = na
++    else
++        num_checked_evals = nev
++    endif
++    !call print_matrix(myid, na, z, "z")
++    max_z_diff = 0.0_rk
++    max_ev_diff = 0.0_rk
++    call timer%start("loop_eigenvalues")
++    do globJ = 1, num_checked_evals
++      computed_ev = ev(globJ)
++      call timer%start("evaluation")
++      expected_ev = analytic_eigenvalues_real_&
++              &PRECISION&
++              &(na, globJ)
++      call timer%stop("evaluation")
++      diff = abs(computed_ev - expected_ev)
++      max_ev_diff = max(diff, max_ev_diff)
++    end do
++    call timer%stop("loop_eigenvalues")
++
++    call timer%start("loop_eigenvectors")
++    do globJ = 1, nev
++      max_curr_z_diff = 0.0_rk
++
++      ! eigenvectors are unique only up to multiplication by a scalar (a complex one in the complex case)
++      ! to be able to compare them with the analytic ones, we have to normalize them somehow:
++      ! we find the entry of the computed eigenvector with the largest absolute value and rescale
++      ! the whole vector so that this entry matches the corresponding entry of the
++      ! analytic eigenvector
++
++      ! find the maximal value in the local part of the given eigenvector (with index globJ)
++      max_value_for_normalization = 0.0_rk
++      max_value_idx = -1
++      do globI = 1, na
++        if(map_global_array_index_to_local_index(int(globI,kind=c_int), int(globJ,kind=c_int), loc_I, loc_J, &
++                 int(nblk,kind=c_int), int(np_rows,kind=c_int), int(np_cols,kind=c_int), &
++                 int(my_prow,kind=c_int), int(my_pcol,kind=c_int) )) then
++          locI = int(loc_I,kind=INT_TYPE)
++          locJ = int(loc_J,kind=INT_TYPE)
++          computed_z = z(locI, locJ)
++          if(abs(computed_z) > abs(max_value_for_normalization)) then
++            max_value_for_normalization = computed_z
++            max_value_idx = int(globI,kind=c_int)
++          end if
++        end if
++      end do
++
++      ! find the global maximum and its position. For technical reasons (we are looking for the
++      ! maximum of complex numbers), this is hard to do elegantly: two complex numbers can have
++      ! the same absolute value and yet be completely different, so a plain max-reduction has
++      ! no unique answer. Therefore we gather the local maxima on MPI rank 0 and resolve the
++      ! choice there.
++#ifdef WITH_MPI
++      call MPI_Gather(max_value_for_normalization, 1_MPI_KIND, MPI_MATH_DATATYPE_PRECISION, &
++                      max_values_array, 1_MPI_KIND, MPI_MATH_DATATYPE_PRECISION, 0_MPI_KIND, &
++                      int(MPI_COMM_WORLD,kind=MPI_KIND), mpierr)
++      call MPI_Gather(max_value_idx, 1_MPI_KIND, MPI_INT, max_idx_array, 1_MPI_KIND, MPI_INT, &
++                      0_MPI_KIND, int(MPI_COMM_WORLD,kind=MPI_KIND), mpierr)
++      max_value_for_normalization = 0.0_rk
++      max_value_idx = -1
++      do rank = 1, np_cols * np_rows 
++        if(abs(max_values_array(rank)) > abs(max_value_for_normalization)) then
++          max_value_for_normalization = max_values_array(rank)
++          max_value_idx = max_idx_array(rank)
++        end if
++      end do
++      call MPI_Bcast(max_value_for_normalization, 1_MPI_KIND, MPI_MATH_DATATYPE_PRECISION, &
++                     0_MPI_KIND, int(MPI_COMM_WORLD,kind=MPI_KIND), mpierr)
++      call MPI_Bcast(max_value_idx, 1_MPI_KIND, MPI_INT, 0_MPI_KIND, &
++                     int(MPI_COMM_WORLD,kind=MPI_KIND), mpierr)
++#endif
++      ! we have decided what the maximum computed value is; calculate the expected value at the same position
++      if(abs(max_value_for_normalization) < 0.0001_rk) then 
++        if(myid == 0) print *, 'Maximal value in eigenvector too small     :', max_value_for_normalization
++        status = 1
++        return
++      end if
++      call timer%start("evaluation_helper")
++      corresponding_exact_value  = analytic_eigenvectors_&
++                                       &MATH_DATATYPE&
++                                       &_&
++                                       &PRECISION&
++                                       &(na, int(max_value_idx,kind=INT_TYPE), globJ)
++      call timer%stop("evaluation_helper")
++      normalization_quotient = corresponding_exact_value / max_value_for_normalization
++      ! write(*,*) "normalization q", normalization_quotient
++
++      ! compare computed and expected eigenvector values, taking the normalization quotient into account
++      do globI = 1, na
++        if(map_global_array_index_to_local_index(int(globI,kind=c_int), int(globJ,kind=c_int), loc_I, loc_J, &
++                 int(nblk,kind=c_int), int(np_rows,kind=c_int), int(np_cols,kind=c_int), &
++                 int(my_prow,kind=c_int), int(my_pcol,kind=c_int) )) then
++           locI = int(loc_I,kind=INT_TYPE)
++           locJ = int(loc_J,kind=INT_TYPE)
++           computed_z = z(locI, locJ)
++           call timer%start("evaluation")
++           expected_z = analytic_eigenvectors_&
++              &MATH_DATATYPE&
++              &_&
++              &PRECISION&
++              &(na, globI, globJ)
++           call timer%stop("evaluation")
++           max_curr_z_diff = max(abs(normalization_quotient * computed_z - expected_z), max_curr_z_diff)
++        end if
++      end do
++      ! we have the max difference for this eigenvector; update the global maximum
++      max_z_diff = max(max_z_diff, max_curr_z_diff)
++    end do !globJ
++    call timer%stop("loop_eigenvectors")
++
++#ifdef WITH_MPI
++    call mpi_allreduce(max_z_diff, glob_max_z_diff, 1_MPI_KIND, MPI_REAL_PRECISION, MPI_MAX, &
++                       int(MPI_COMM_WORLD,kind=MPI_KIND), mpierr)
++#else
++    glob_max_z_diff = max_z_diff
++#endif
++    if(myid == 0) print *, 'Maximum error in eigenvalues      :', max_ev_diff
++    if (check_eigenvectors) then
++      if(myid == 0) print *, 'Maximum error in eigenvectors     :', glob_max_z_diff
++    endif
++
++    status = 0
++    if (nev .gt. 2) then
++      if (max_ev_diff .gt. tol_eigenvalues .or. max_ev_diff .eq. 0.0_rk) status = 1
++      if (check_eigenvectors) then
++        if (glob_max_z_diff .gt. tol_eigenvectors .or. glob_max_z_diff .eq. 0.0_rk) status = 1
++      endif
++    else
++      if (max_ev_diff .gt. tol_eigenvalues) status = 1
++      if (check_eigenvectors) then
++        if (glob_max_z_diff .gt. tol_eigenvectors) status = 1
++      endif
++    endif
++
++    call timer%stop("check_correctness_analytic")
++    if(myid == 0 .and. print_timer) then
++      call timer%print("check_correctness_analytic")
++    end if
++    call timer%free()
++  end function
++
++
++  function analytic_matrix_&
++    &MATH_DATATYPE&
++    &_&
++    &PRECISION&
++    &(na, i, j) result(element)
++    use precision_for_tests
++
++    implicit none
++    TEST_INT_TYPE, intent(in) :: na, i, j
++    MATH_DATATYPE(kind=REAL_DATATYPE)     :: element
++
++    element = analytic_&
++    &MATH_DATATYPE&
++    &_&
++    &PRECISION&
++    &(na, i, j, ANALYTIC_MATRIX)
++
++  end function
++
++  function analytic_eigenvectors_&
++    &MATH_DATATYPE&
++    &_&
++    &PRECISION&
++    &(na, i, j) result(element)
++    use precision_for_tests
++
++    implicit none
++    TEST_INT_TYPE, intent(in) :: na, i, j
++    MATH_DATATYPE(kind=REAL_DATATYPE)               :: element
++
++    element = analytic_&
++    &MATH_DATATYPE&
++    &_&
++    &PRECISION&
++    &(na, i, j, ANALYTIC_EIGENVECTORS)
++
++  end function
++
++  function analytic_eigenvalues_&
++    &MATH_DATATYPE&
++    &_&
++    &PRECISION&
++    &(na, i) result(element)
++    use precision_for_tests
++
++    implicit none
++    TEST_INT_TYPE, intent(in) :: na, i
++    real(kind=REAL_DATATYPE)              :: element
++
++    element = analytic_real_&
++    &PRECISION&
++    &(na, i, i, ANALYTIC_EIGENVALUES)
++
++  end function
++
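++  ! Evaluate one element of the analytic matrix, of its eigenvector matrix,
++  ! or of its (diagonal) eigenvalue matrix. The full matrix is a Kronecker
++  ! product of small 2x2/3x3/5x5 blocks (one factor per prime factor of na),
++  ! each with a known eigendecomposition, so every element can be computed
++  ! in closed form from the mixed-radix digits of i-1 and j-1.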
++  function analytic_&
++    &MATH_DATATYPE&
++    &_&
++    &PRECISION&
++    &(na, i, j, what) result(element)
++    use precision_for_tests
++
++    implicit none
++#include "./test_precision_kinds.F90"
++    TEST_INT_TYPE, intent(in)     :: na, i, j, what
++    MATH_DATATYPE(kind=rck)       :: element, mat2x2(2,2), mat(5,5)
++    real(kind=rk)                 :: a, am, amp
++    TEST_INT_TYPE                 :: levels(num_primes)
++    TEST_INT_TYPE                 :: ii, jj, m, prime_id, prime, total_level, level
++
++    real(kind=rk), parameter      :: s = 0.5_rk
++    real(kind=rk), parameter      :: c = 0.86602540378443864679_rk
++    real(kind=rk), parameter      :: sq2 = 1.4142135623730950488_rk
++
++    real(kind=rk), parameter      :: largest_ev = 2.0_rk
++
++    assert(i <= na)
++    assert(j <= na)
++    assert(i >= 0)
++    assert(j >= 0)
++    assert(decompose(na, levels))
++    ! go to zero-based indexing
++    ii = i - 1
++    jj = j - 1
++    if (na .gt. 2) then
++      a = exp(log(largest_ev)/(na-1))
++    else
++      a = exp(log(largest_ev)/(1))
++    endif
++
++    element = 1.0_rck
++#ifdef COMPLEXCASE
++    element = (1.0_rk, 0.0_rk)
++#endif
++    total_level = 0
++    am = a
++#ifdef BUILD_FUGAKU
++    primes(1) = 2
++    primes(2) = 3
++    primes(3) = 5
++#endif
++    do prime_id = 1,num_primes
++      prime = primes(prime_id)
++      do  level = 1, levels(prime_id)
++        amp = am**(prime-1)
++        total_level = total_level + 1
++        if(what == ANALYTIC_MATRIX) then
++#ifdef REALCASE
++#ifndef FUGAKU
++          mat2x2 = reshape((/ c*c + amp * s*s, (amp - 1.0_rk) * s*c,  &
++                           (amp - 1.0_rk) * s*c, s*s + amp * c*c  /), &
++                                      (/2, 2/), order=(/2,1/))
++#endif
++#endif
++#ifdef COMPLEXCASE
++#ifndef FUGAKU
++          mat2x2 = reshape((/ 0.5_rck * (amp + 1.0_rck) * (1.0_rk, 0.0_rk),   sq2/4.0_rk * (amp - 1.0_rk) * (1.0_rk, 1.0_rk),   &
++                              sq2/4.0_rk * (amp - 1.0_rk) * (1.0_rk, -1.0_rk),  0.5_rck * (amp + 1.0_rck) * (1.0_rk, 0.0_rk) /), &
++                                      (/2, 2/), order=(/2,1/))
++! intel 2018 does not reshape correctly (one would have to specify order=(/1,2/));
++! until this is resolved, I resorted to the following
++          mat2x2(1,2) = sq2/4.0_rk * (amp - 1.0_rk) * (1.0_rk, 1.0_rk)
++          mat2x2(2,1) = sq2/4.0_rk * (amp - 1.0_rk) * (1.0_rk, -1.0_rk)
++#endif
++#endif
++        else if(what == ANALYTIC_EIGENVECTORS) then
++#ifdef REALCASE
++#ifndef FUGAKU
++          mat2x2 = reshape((/ c, s,  &
++                           -s,  c  /), &
++                                (/2, 2/), order=(/2,1/))
++! intel 2018 does not reshape correctly (one would have to specify order=(/1,2/));
++! until this is resolved, I resorted to the following
++          mat2x2(1,2) = s
++          mat2x2(2,1) = -s
++#endif
++#endif
++#ifdef COMPLEXCASE
++#ifndef FUGAKU
++          mat2x2 = reshape((/ -sq2/2.0_rck * (1.0_rk, 0.0_rk),       -sq2/2.0_rck * (1.0_rk, 0.0_rk),  &
++                              0.5_rk * (1.0_rk, -1.0_rk),  0.5_rk * (-1.0_rk, 1.0_rk)  /), &
++                                (/2, 2/), order=(/2,1/))
++! intel 2018 does not reshape correctly (one would have to specify order=(/1,2/));
++! until this is resolved, I resorted to the following
++          mat2x2(1,2) = -sq2/2.0_rck * (1.0_rk, 0.0_rk)
++          mat2x2(2,1) = 0.5_rk * (1.0_rk, -1.0_rk)
++#endif
++#endif
++        else if(what == ANALYTIC_EIGENVALUES) then
++#ifndef FUGAKU
++          mat2x2 = reshape((/ 1.0_rck, 0.0_rck,  &
++                           0.0_rck, amp  /), &
++                                 (/2, 2/), order=(/2,1/))
++#endif
++        else
++          assert(.false.)
++        end if
++
++        mat = 0.0_rck
++        if(prime == 2) then
++#ifndef BUILD_FUGAKU
++          mat(1:2, 1:2) = mat2x2
++#endif
++        else if(prime == 3) then
++#ifndef BUILD_FUGAKU
++          mat((/1,3/),(/1,3/)) = mat2x2
++#endif
++          if(what == ANALYTIC_EIGENVECTORS) then
++            mat(2,2) = 1.0_rck
++          else
++            mat(2,2) = am
++          end if
++        else if(prime == 5) then
++#ifndef BUILD_FUGAKU
++          mat((/1,5/),(/1,5/)) = mat2x2
++#endif
++          if(what == ANALYTIC_EIGENVECTORS) then
++            mat(2,2) = 1.0_rck
++            mat(3,3) = 1.0_rck
++            mat(4,4) = 1.0_rck
++          else
++            mat(2,2) = am
++            mat(3,3) = am**2
++            mat(4,4) = am**3
++          end if
++        else
++          assert(.false.)
++        end if
++
++  !      write(*,*) "calc value, elem: ", element, ", mat: ", mod(ii,2), mod(jj,2),  mat(mod(ii,2), mod(jj,2)), "am ", am
++  !      write(*,*) " matrix mat", mat
++        element = element * mat(mod(ii,prime) + 1, mod(jj,prime) + 1)
++        ii = ii / prime
++        jj = jj / prime
++
++        am = am**prime
++      end do
++    end do
++    !write(*,*) "returning value ", element
++  end function
++
++
++  subroutine print_matrix_&
++    &MATH_DATATYPE&
++    &_&
++    &PRECISION&
++    &(myid, na, mat, mat_name)
++    use precision_for_tests
++
++    implicit none
++#include "./test_precision_kinds.F90"
++    TEST_INT_TYPE, intent(in)    :: myid, na
++    character(len=*), intent(in) :: mat_name
++    MATH_DATATYPE(kind=rck)      :: mat(na, na)
++    TEST_INT_TYPE                :: i,j
++    character(len=20)            :: na_str
++
++    if(myid .ne. 0) &
++      return
++    write(*,*) "Matrix: "//trim(mat_name)
++    write(na_str, *) na
++    do i = 1, na
++#ifdef REALCASE
++      write(*, '('//trim(na_str)//'f8.3)') mat(i, :)
++#endif
++#ifdef COMPLEXCASE
++      write(*,'('//trim(na_str)//'(A,f8.3,A,f8.3,A))') ('(', real(mat(i,j)), ',', aimag(mat(i,j)), ')', j=1,na)
++#endif
++    end do
++    write(*,*)
++  end subroutine
++
++
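++  ! Sanity check of the closed-form formulas: assemble A, S and L element by
++  ! element and verify the eigendecomposition identity A*S = S*L up to TOL.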
++  subroutine check_matrices_&
++    &MATH_DATATYPE&
++    &_&
++    &PRECISION&
++    &(myid, na)
++    use precision_for_tests
++
++    implicit none
++#include "./test_precision_kinds.F90"
++    TEST_INT_TYPE, intent(in) :: myid, na
++    MATH_DATATYPE(kind=rck)   :: A(na, na), S(na, na), L(na, na), res(na, na)
++    TEST_INT_TYPE             :: i, j, decomposition(num_primes)
++
++    real(kind=rk)             :: err
++#ifdef DOUBLE_PRECISION
++    real(kind=rk), parameter  :: TOL =  1e-8
++#endif
++#ifdef SINGLE_PRECISION
++    real(kind=rk), parameter  :: TOL =  1e-4
++#endif
++
++    assert(decompose(na, decomposition))
++
++    do i = 1, na
++      do j = 1, na
++        A(i,j) = analytic_matrix_&
++              &MATH_DATATYPE&
++              &_&
++              &PRECISION&
++              &(na, i, j)
++        S(i,j) = analytic_eigenvectors_&
++              &MATH_DATATYPE&
++              &_&
++              &PRECISION&
++              &(na, i, j)
++        L(i,j) = analytic_&
++              &MATH_DATATYPE&
++              &_&
++              &PRECISION&
++              &(na, i, j, ANALYTIC_EIGENVALUES)
++      end do
++    end do
++
++    res = matmul(A,S) - matmul(S,L)
++    err = maxval(abs(res))
++    
++    if(err > TOL) then
++      print *, "WARNING: sanity test in module analytic failed, error is ", err
++    end if
++
++    if(.false.) then
++    !if(na == 2 .or. na == 5) then
++      call print_matrix(myid, na, A, "A")
++      call print_matrix(myid, na, S, "S")
++      call print_matrix(myid, na, L, "L")
++
++      call print_matrix(myid, na, matmul(A,S), "AS")
++      call print_matrix(myid, na, matmul(S,L), "SL")
++
++      call print_matrix(myid, na, res , "res")
++    end if
++
++  end subroutine
++
++  subroutine check_module_sanity_&
++    &MATH_DATATYPE&
++    &_&
++    &PRECISION&
++    &(myid)
++    use precision_for_tests
++
++    implicit none
++    TEST_INT_TYPE, intent(in)   :: myid
++    TEST_INT_TYPE               :: decomposition(num_primes), i
++#ifndef BUILD_FUGAKU
++    TEST_INT_TYPE, parameter    :: check_sizes(7) = (/2, 3, 5, 6, 10, 25, 150/)
++#else
++    TEST_INT_TYPE    :: check_sizes(7)
++#endif
++    if(myid == 0) print *, "Checking test_analytic module sanity.... "
++#ifndef BUILD_FUGAKU
++#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT
++    assert(decompose(1500_lik, decomposition))
++#else
++    assert(decompose(1500_ik, decomposition))
++#endif
++    assert(all(decomposition == (/2,1,3/)))
++#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT
++    assert(decompose(6_lik,decomposition))
++#else
++    assert(decompose(6_ik,decomposition))
++#endif
++    assert(all(decomposition == (/1,1,0/)))
++
++#ifdef BUILD_FUGAKU
++    check_sizes(1) = 2
++    check_sizes(2) = 3
++    check_sizes(3) = 5
++    check_sizes(4) = 10
++    check_sizes(5) = 25
++    check_sizes(6) = 150
++#endif
++    do i =1, size(check_sizes)
++      call check_matrices_&
++          &MATH_DATATYPE&
++          &_&
++          &PRECISION&
++          &(myid, check_sizes(i))
++    end do
++
++    if(myid == 0) print *, "Checking test_analytic module sanity.... DONE"
++#endif
++  end subroutine
+diff -ruN elpa-new_release_2021.11.001/examples/shared/test_blacs_infrastructure.F90 elpa-new_release_2021.11.001_ok/examples/shared/test_blacs_infrastructure.F90
+--- elpa-new_release_2021.11.001/examples/shared/test_blacs_infrastructure.F90	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/shared/test_blacs_infrastructure.F90	2022-01-26 10:10:58.368066000 +0100
+@@ -0,0 +1,208 @@
++!    This file is part of ELPA.
++!
++!    The ELPA library was originally created by the ELPA consortium,
++!    consisting of the following organizations:
++!
++!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
++!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
++!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
++!      Informatik,
++!    - Technische Universität München, Lehrstuhl für Informatik mit
++!      Schwerpunkt Wissenschaftliches Rechnen ,
++!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
++!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
++!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
++!      and
++!    - IBM Deutschland GmbH
++!
++!
++!    More information can be found here:
++!    http://elpa.mpcdf.mpg.de/
++!
++!    ELPA is free software: you can redistribute it and/or modify
++!    it under the terms of the version 3 of the license of the
++!    GNU Lesser General Public License as published by the Free
++!    Software Foundation.
++!
++!    ELPA is distributed in the hope that it will be useful,
++!    but WITHOUT ANY WARRANTY; without even the implied warranty of
++!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++!    GNU Lesser General Public License for more details.
++!
++!    You should have received a copy of the GNU Lesser General Public License
++!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
++!
++!    ELPA reflects a substantial effort on the part of the original
++!    ELPA consortium, and we ask you to respect the spirit of the
++!    license that we chose: i.e., please contribute any changes you
++!    may have back to the original ELPA library distribution, and keep
++!    any derivatives of ELPA under the same license that we chose for
++!    the original distribution, the GNU Lesser General Public License.
++!
++!
++#include "config-f90.h"
++
++#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT
++#define TEST_INT_TYPE integer(kind=c_int64_t)
++#define INT_TYPE c_int64_t
++#define TEST_C_INT_TYPE_PTR long int*
++#define TEST_C_INT_TYPE long int
++#else
++#define TEST_INT_TYPE integer(kind=c_int32_t)
++#define INT_TYPE c_int32_t
++#define TEST_C_INT_TYPE_PTR int*
++#define TEST_C_INT_TYPE int
++#endif
++#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT
++#define TEST_INT_MPI_TYPE integer(kind=c_int64_t)
++#define INT_MPI_TYPE c_int64_t
++#define TEST_C_INT_MPI_TYPE_PTR long int*
++#define TEST_C_INT_MPI_TYPE long int
++#else
++#define TEST_INT_MPI_TYPE integer(kind=c_int32_t)
++#define INT_MPI_TYPE c_int32_t
++#define TEST_C_INT_MPI_TYPE_PTR int*
++#define TEST_C_INT_MPI_TYPE int
++#endif
++
++module test_blacs_infrastructure
++
++  contains
++
++    !c> void set_up_blacsgrid_f(TEST_C_INT_TYPE mpi_comm_parent, TEST_C_INT_TYPE np_rows, 
++    !c>                         TEST_C_INT_TYPE np_cols, char layout,
++    !c>                         TEST_C_INT_TYPE_PTR my_blacs_ctxt, TEST_C_INT_TYPE_PTR my_prow, 
++    !c>                         TEST_C_INT_TYPE_PTR my_pcol);
++    subroutine set_up_blacsgrid(mpi_comm_parent, np_rows, np_cols, layout, &
++                                my_blacs_ctxt, my_prow, my_pcol) bind(C, name="set_up_blacsgrid_f")
++
++      use precision_for_tests
++      use test_util
++      use iso_c_binding
++
++      implicit none
++      TEST_INT_TYPE, intent(in), value          :: mpi_comm_parent, np_rows, np_cols
++#ifdef SXAURORA
++      character(len=1), intent(in)              :: layout
++#else
++      character(kind=c_char), intent(in), value :: layout
++#endif
++      TEST_INT_TYPE, intent(out)                :: my_blacs_ctxt, my_prow, my_pcol
++
++#ifdef WITH_MPI
++      TEST_INT_TYPE :: np_rows_, np_cols_
++#endif
++
++      if (layout /= 'R' .and. layout /= 'C') then
++        print *, "layout must be 'R' or 'C'"
++        stop 1
++      end if
++
++      my_blacs_ctxt = mpi_comm_parent
++#ifdef WITH_MPI
++      call BLACS_Gridinit(my_blacs_ctxt, layout, np_rows, np_cols)
++      call BLACS_Gridinfo(my_blacs_ctxt, np_rows_, np_cols_, my_prow, my_pcol)
++      if (np_rows /= np_rows_) then
++        print *, "BLACS_Gridinfo returned different values for np_rows as set by BLACS_Gridinit"
++        stop 1
++      endif
++      if (np_cols /= np_cols_) then
++        print *, "BLACS_Gridinfo returned different values for np_cols as set by BLACS_Gridinit"
++        stop 1
++      endif
++#else
++      my_prow = 0
++      my_pcol = 0
++#endif
++    end subroutine
++
++    subroutine set_up_blacs_descriptor(na, nblk, my_prow, my_pcol, &
++                                       np_rows, np_cols, na_rows,  &
++                                       na_cols, sc_desc, my_blacs_ctxt, info)
++
++      use elpa_utilities, only : error_unit
++      use test_util
++      use precision_for_tests
++      use tests_scalapack_interfaces
++      implicit none
++
++      TEST_INT_TYPE, intent(in)    :: na, nblk, my_prow, my_pcol, np_rows,   &
++                                       np_cols, &
++                                       my_blacs_ctxt
++      TEST_INT_TYPE, intent(inout) :: info
++      TEST_INT_TYPE, intent(out)   :: na_rows, na_cols, sc_desc(1:9)
++
++#ifdef WITH_MPI
++      TEST_INT_MPI_TYPE            :: mpierr
++
++      sc_desc(:) = 0
++      ! determine the necessary size of the distributed matrices;
++      ! we use the ScaLAPACK tools routine NUMROC
++
++      na_rows = numroc(na, nblk, my_prow, 0_BLAS_KIND, np_rows)
++      na_cols = numroc(na, nblk, my_pcol, 0_BLAS_KIND, np_cols)
++
++      ! set up the ScaLAPACK descriptor for the checks below
++      ! For ELPA the following restrictions hold:
++      ! - block sizes in both directions must be identical (args 4 and 5)
++      ! - first row and column of the distributed matrix must be on
++      !   row/col 0/0 (args 6 and 7)
++
++      call descinit(sc_desc, na, na, nblk, nblk, 0_BLAS_KIND, 0_BLAS_KIND, &
++                    my_blacs_ctxt, na_rows, info)
++
++      if (info .ne. 0) then
++        write(error_unit,*) 'Error in BLACS descinit! info=',info
++        write(error_unit,*) 'Most likely this happened because you want to use'
++        write(error_unit,*) 'more MPI tasks than are possible for your'
++        write(error_unit,*) 'problem size (matrix size and block size)!'
++        write(error_unit,*) 'The BLACS grid cannot be set up properly.'
++        write(error_unit,*) 'Try reducing the number of MPI tasks...'
++        call MPI_ABORT(int(mpi_comm_world,kind=MPI_KIND), 1_MPI_KIND, mpierr)
++      endif
++#else /* WITH_MPI */
++      na_rows = na
++      na_cols = na
++#endif /* WITH_MPI */
++
++    end subroutine
++
++    !c> void set_up_blacs_descriptor_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE nblk, 
++    !c>                                TEST_C_INT_TYPE my_prow, TEST_C_INT_TYPE my_pcol,
++    !c>                                TEST_C_INT_TYPE np_rows, TEST_C_INT_TYPE np_cols,
++    !c>                                TEST_C_INT_TYPE_PTR na_rows, TEST_C_INT_TYPE_PTR na_cols,
++    !c>                                TEST_C_INT_TYPE sc_desc[9],
++    !c>                                TEST_C_INT_TYPE my_blacs_ctxt,
++    !c>                                TEST_C_INT_TYPE_PTR info);
++    subroutine set_up_blacs_descriptor_f(na, nblk, my_prow, my_pcol, &
++                                         np_rows, np_cols, na_rows,  &
++                                         na_cols, sc_desc,           &
++                                         my_blacs_ctxt, info)        &
++                                         bind(C, name="set_up_blacs_descriptor_f")
++
++      use iso_c_binding
++      implicit none
++
++
++      TEST_INT_TYPE, value :: na, nblk, my_prow, my_pcol, np_rows, &
++                                    np_cols, my_blacs_ctxt
++      TEST_INT_TYPE        :: na_rows, na_cols, info, sc_desc(1:9)
++
++      call set_up_blacs_descriptor(na, nblk, my_prow, my_pcol, &
++                                   np_rows, np_cols, na_rows,  &
++                                   na_cols, sc_desc, my_blacs_ctxt, info)
++
++
++    end subroutine
++
++    
++   function index_l2g(idx_loc, nblk, iproc, nprocs) result(indexl2g)
++     use precision_for_tests
++     implicit none
++     TEST_INT_TYPE :: indexl2g
++     TEST_INT_TYPE :: idx_loc, nblk, iproc, nprocs
++     indexl2g = nprocs * nblk * ((idx_loc-1) / nblk) + mod(idx_loc-1,nblk) + mod(nprocs+iproc, nprocs)*nblk + 1
++     return
++   end function
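++
++   ! Worked example (illustration): for a 1-D block-cyclic distribution with
++   ! nblk = 2 and nprocs = 2, the formula above maps local indices 1,2,3,4
++   ! on process iproc = 0 to global indices 1,2,5,6, and on process
++   ! iproc = 1 to global indices 3,4,7,8.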
++
++end module
+diff -ruN elpa-new_release_2021.11.001/examples/shared/test_check_correctness.F90 elpa-new_release_2021.11.001_ok/examples/shared/test_check_correctness.F90
+--- elpa-new_release_2021.11.001/examples/shared/test_check_correctness.F90	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/shared/test_check_correctness.F90	2022-01-26 10:10:58.369246000 +0100
+@@ -0,0 +1,156 @@
++!    This file is part of ELPA.
++!
++!    The ELPA library was originally created by the ELPA consortium,
++!    consisting of the following organizations:
++!
++!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
++!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
++!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
++!      Informatik,
++!    - Technische Universität München, Lehrstuhl für Informatik mit
++!      Schwerpunkt Wissenschaftliches Rechnen,
++!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
++!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
++!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
++!      and
++!    - IBM Deutschland GmbH
++!
++!
++!    More information can be found here:
++!    http://elpa.mpcdf.mpg.de/
++!
++!    ELPA is free software: you can redistribute it and/or modify
++!    it under the terms of the version 3 of the license of the
++!    GNU Lesser General Public License as published by the Free
++!    Software Foundation.
++!
++!    ELPA is distributed in the hope that it will be useful,
++!    but WITHOUT ANY WARRANTY; without even the implied warranty of
++!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++!    GNU Lesser General Public License for more details.
++!
++!    You should have received a copy of the GNU Lesser General Public License
++!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
++!
++!    ELPA reflects a substantial effort on the part of the original
++!    ELPA consortium, and we ask you to respect the spirit of the
++!    license that we chose: i.e., please contribute any changes you
++!    may have back to the original ELPA library distribution, and keep
++!    any derivatives of ELPA under the same license that we chose for
++!    the original distribution, the GNU Lesser General Public License.
++!
++! Author: A. Marek, MPCDF
++#include "config-f90.h"
++
++module test_check_correctness
++  use test_util
++
++  interface check_correctness_evp_numeric_residuals
++    module procedure check_correctness_evp_numeric_residuals_complex_double
++    module procedure check_correctness_evp_numeric_residuals_real_double
++#ifdef WANT_SINGLE_PRECISION_REAL
++    module procedure check_correctness_evp_numeric_residuals_real_single
++#endif
++#ifdef WANT_SINGLE_PRECISION_COMPLEX
++    module procedure check_correctness_evp_numeric_residuals_complex_single
++#endif
++  end interface
++  
++  interface check_correctness_evp_numeric_residuals_ss
++!     module procedure check_correctness_evp_numeric_residuals_ss_complex_double
++    module procedure check_correctness_evp_numeric_residuals_ss_real_double
++#ifdef WANT_SINGLE_PRECISION_REAL
++    module procedure check_correctness_evp_numeric_residuals_ss_real_single
++#endif
++! #ifdef WANT_SINGLE_PRECISION_COMPLEX
++!     module procedure check_correctness_evp_numeric_residuals_ss_complex_single
++! #endif
++  end interface
++
++  interface check_correctness_eigenvalues_toeplitz
++    module procedure check_correctness_eigenvalues_toeplitz_complex_double
++    module procedure check_correctness_eigenvalues_toeplitz_real_double
++#ifdef WANT_SINGLE_PRECISION_REAL
++    module procedure check_correctness_eigenvalues_toeplitz_real_single
++#endif
++#ifdef WANT_SINGLE_PRECISION_COMPLEX
++    module procedure check_correctness_eigenvalues_toeplitz_complex_single
++#endif
++  end interface
++
++  interface check_correctness_eigenvalues_frank
++    module procedure check_correctness_eigenvalues_frank_complex_double
++    module procedure check_correctness_eigenvalues_frank_real_double
++#ifdef WANT_SINGLE_PRECISION_REAL
++    module procedure check_correctness_eigenvalues_frank_real_single
++#endif
++#ifdef WANT_SINGLE_PRECISION_COMPLEX
++    module procedure check_correctness_eigenvalues_frank_complex_single
++#endif
++  end interface
++
++  interface check_correctness_cholesky
++    module procedure check_correctness_cholesky_complex_double
++    module procedure check_correctness_cholesky_real_double
++#ifdef WANT_SINGLE_PRECISION_REAL
++    module procedure check_correctness_cholesky_real_single
++#endif
++#ifdef WANT_SINGLE_PRECISION_COMPLEX
++    module procedure check_correctness_cholesky_complex_single
++#endif
++  end interface
++
++  interface check_correctness_hermitian_multiply
++    module procedure check_correctness_hermitian_multiply_complex_double
++    module procedure check_correctness_hermitian_multiply_real_double
++#ifdef WANT_SINGLE_PRECISION_REAL
++    module procedure check_correctness_hermitian_multiply_real_single
++#endif
++#ifdef WANT_SINGLE_PRECISION_COMPLEX
++    module procedure check_correctness_hermitian_multiply_complex_single
++#endif
++  end interface
++
++
++  contains
++
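++! the template below is instantiated once per (math datatype, precision)
++! combination: the macros defined before each include select which variant
++! of test_check_correctness_template.F90 is generated
++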
++#define COMPLEXCASE 1
++#define DOUBLE_PRECISION 1
++#include "../../src/general/precision_macros.h"
++#include "test_check_correctness_template.F90"
++#undef DOUBLE_PRECISION
++#undef COMPLEXCASE
++
++#ifdef WANT_SINGLE_PRECISION_COMPLEX
++
++#define COMPLEXCASE 1
++#define SINGLE_PRECISION 1
++#include "../../src/general/precision_macros.h"
++#include "test_check_correctness_template.F90"
++#undef SINGLE_PRECISION
++#undef COMPLEXCASE
++#endif /* WANT_SINGLE_PRECISION_COMPLEX */
++
++#define REALCASE 1
++#define DOUBLE_PRECISION 1
++#include "../../src/general/precision_macros.h"
++#include "test_check_correctness_template.F90"
++#undef DOUBLE_PRECISION
++#undef REALCASE
++
++#ifdef WANT_SINGLE_PRECISION_REAL
++
++#define REALCASE 1
++#define SINGLE_PRECISION 1
++#include "../../src/general/precision_macros.h"
++#include "test_check_correctness_template.F90"
++#undef SINGLE_PRECISION
++#undef REALCASE
++
++
++#endif /* WANT_SINGLE_PRECISION_REAL */
++
++#include "../../src/general/prow_pcol.F90"
++#include "../../src/general/map_global_to_local.F90"
++
++end module
+diff -ruN elpa-new_release_2021.11.001/examples/shared/test_check_correctness_template.F90 elpa-new_release_2021.11.001_ok/examples/shared/test_check_correctness_template.F90
+--- elpa-new_release_2021.11.001/examples/shared/test_check_correctness_template.F90	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/shared/test_check_correctness_template.F90	2022-01-26 10:10:58.370443000 +0100
+@@ -0,0 +1,1134 @@
++!    This file is part of ELPA.
++!
++!    The ELPA library was originally created by the ELPA consortium,
++!    consisting of the following organizations:
++!
++!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
++!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
++!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
++!      Informatik,
++!    - Technische Universität München, Lehrstuhl für Informatik mit
++!      Schwerpunkt Wissenschaftliches Rechnen,
++!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
++!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
++!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
++!      and
++!    - IBM Deutschland GmbH
++!
++!
++!    More information can be found here:
++!    http://elpa.mpcdf.mpg.de/
++!
++!    ELPA is free software: you can redistribute it and/or modify
++!    it under the terms of the version 3 of the license of the
++!    GNU Lesser General Public License as published by the Free
++!    Software Foundation.
++!
++!    ELPA is distributed in the hope that it will be useful,
++!    but WITHOUT ANY WARRANTY; without even the implied warranty of
++!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++!    GNU Lesser General Public License for more details.
++!
++!    You should have received a copy of the GNU Lesser General Public License
++!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
++!
++!    ELPA reflects a substantial effort on the part of the original
++!    ELPA consortium, and we ask you to respect the spirit of the
++!    license that we chose: i.e., please contribute any changes you
++!    may have back to the original ELPA library distribution, and keep
++!    any derivatives of ELPA under the same license that we chose for
++!    the original distribution, the GNU Lesser General Public License.
++!
++! Author: A. Marek, MPCDF
++
++
++#include "config-f90.h"
++
++#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT
++#define TEST_INT_TYPE integer(kind=c_int64_t)
++#define INT_TYPE lik
++#define TEST_C_INT_TYPE_PTR long int*
++#define TEST_C_INT_TYPE long int
++#else
++#define TEST_INT_TYPE integer(kind=c_int32_t)
++#define INT_TYPE ik
++#define TEST_C_INT_TYPE_PTR int*
++#define TEST_C_INT_TYPE int
++#endif
++
++#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT
++#define TEST_INT_MPI_TYPE integer(kind=c_int64_t)
++#define INT_MPI_TYPE lik
++#define TEST_C_INT_MPI_TYPE_PTR long int*
++#define TEST_C_INT_MPI_TYPE long int
++#else
++#define TEST_INT_MPI_TYPE integer(kind=c_int32_t)
++#define INT_MPI_TYPE ik
++#define TEST_C_INT_MPI_TYPE_PTR int*
++#define TEST_C_INT_MPI_TYPE int
++#endif
++
++#if REALCASE == 1
++    function check_correctness_evp_numeric_residuals_ss_real_&
++    &PRECISION&
++    & (na, nev, as, z, ev, sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol) result(status)
++      use tests_blas_interfaces
++      use tests_scalapack_interfaces
++      use precision_for_tests
++      use iso_c_binding
++      implicit none
++#include "../../src/general/precision_kinds.F90"
++      integer(kind=BLAS_KIND)             :: status, na_cols, na_rows
++      integer(kind=BLAS_KIND), intent(in) :: na, nev, nblk, myid, np_rows, np_cols, my_prow, my_pcol
++      real(kind=rk), intent(in)           :: as(:,:)
++      real(kind=rk)                       :: tmpr
++      complex(kind=rck), intent(in)       :: z(:,:)
++      real(kind=rk)                       :: ev(:)
++      complex(kind=rck), dimension(size(as,dim=1),size(as,dim=2)) :: tmp1, tmp2
++      complex(kind=rck)                   :: xc
++      
++      complex(kind=rck), allocatable      :: as_complex(:,:)
++
++      integer(kind=BLAS_KIND)             :: sc_desc(:)
++
++      integer(kind=BLAS_KIND)             :: i, j, rowLocal, colLocal
++      integer(kind=c_int)                 :: row_Local, col_Local
++      real(kind=rck)                      :: err, errmax
++
++      TEST_INT_MPI_TYPE                   :: mpierr
++
++      ! tolerance for the residual test for different math type/precision setups
++      real(kind=rk), parameter       :: tol_res_real_double      = 5e-4_rk
++      real(kind=rk), parameter       :: tol_res_real_single      = 3e-2_rk
++      real(kind=rk), parameter       :: tol_res_complex_double   = 5e-12_rk
++      real(kind=rk), parameter       :: tol_res_complex_single   = 3e-2_rk
++      real(kind=rk)                  :: tol_res                  = tol_res_&
++                                                                          &MATH_DATATYPE&
++                                                                          &_&
++                                                                          &PRECISION
++      ! precision of generalized problem is lower
++      real(kind=rk), parameter       :: generalized_penalty = 10.0_rk
++
++      ! tolerance for the orthogonality test for different math type/precision setups
++!       real(kind=rk), parameter       :: tol_orth_real_double     = 5e-11_rk
++      real(kind=rk), parameter       :: tol_orth_real_double     = 5e-4_rk
++      real(kind=rk), parameter       :: tol_orth_real_single     = 9e-2_rk
++      real(kind=rk), parameter       :: tol_orth_complex_double  = 5e-11_rk
++      real(kind=rk), parameter       :: tol_orth_complex_single  = 9e-3_rk
++      real(kind=rk), parameter       :: tol_orth                 = tol_orth_&
++                                                                          &MATH_DATATYPE&
++                                                                          &_&
++                                                                          &PRECISION
++                                                  
++      complex(kind=rck), parameter   :: CZERO = (0.0_rck,0.0_rck), CONE = (1.0_rck,0.0_rck)
++
++
++      status = 0
++      ! Setup complex matrices and eigenvalues
++      na_rows = size(as,dim=1)
++      na_cols = size(as,dim=2)
++      
++      allocate(as_complex(na_rows,na_cols))
++      do j=1, na_cols
++        do i=1,na_rows
++#ifdef DOUBLE_PRECISION_REAL
++          as_complex(i,j) = dcmplx(as(i,j),0.0_rk)
++#else
++          as_complex(i,j) = cmplx(as(i,j),0.0_rk)
++#endif
++       enddo
++      enddo
++      
++      ! 1. Residual (maximum of || A*Zi - Zi*EVi ||)
++
++      ! tmp1 = Zi*EVi
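++      ! note: this "_ss_" (skew-symmetric) variant scales with the purely
++      ! imaginary values i*ev(i), as expected for the eigenvalues of a real
++      ! skew-symmetric matrix, so the whole check runs in complex arithmetic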
++      tmp1(:,:) = z(:,:)
++      do i=1,nev
++#ifdef DOUBLE_PRECISION_REAL
++        xc = dcmplx(0.0_rk,ev(i))
++#else
++        xc = cmplx(0.0_rk,ev(i))
++#endif
++#ifdef WITH_MPI
++#ifdef DOUBLE_PRECISION_REAL
++        call pzscal(int(na,kind=BLAS_KIND), xc, tmp1, 1_BLAS_KIND, int(i,kind=BLAS_KIND), sc_desc, 1_BLAS_KIND)
++#else
++        call pcscal(int(na,kind=BLAS_KIND), xc, tmp1, 1_BLAS_KIND, int(i,kind=BLAS_KIND), sc_desc, 1_BLAS_KIND)
++#endif
++#else /* WITH_MPI */
++#ifdef DOUBLE_PRECISION_REAL
++        call zscal(int(na,kind=BLAS_KIND), xc, tmp1(:,i), 1_BLAS_KIND)
++#else
++        call cscal(int(na,kind=BLAS_KIND), xc, tmp1(:,i), 1_BLAS_KIND)
++#endif
++#endif /* WITH_MPI */
++      enddo
++
++      ! normal eigenvalue problem .. no need to multiply
++      tmp2(:,:) = tmp1(:,:)
++
++      ! tmp1 =  A * Z
++      ! as is original stored matrix, Z are the EVs
++#ifdef WITH_MPI
++#ifdef DOUBLE_PRECISION_REAL
++      call PZGEMM('N', 'N', int(na,kind=BLAS_KIND), int(nev,kind=BLAS_KIND), int(na,kind=BLAS_KIND), &
++                  CONE, as_complex, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, &
++                  z, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, CZERO, tmp1, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc)
++#else
++      call PCGEMM('N', 'N', int(na,kind=BLAS_KIND), int(nev,kind=BLAS_KIND), int(na,kind=BLAS_KIND), &
++                  CONE, as_complex, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, &
++                  z, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, CZERO, tmp1, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc)
++#endif
++#else /* WITH_MPI */
++#ifdef DOUBLE_PRECISION_REAL
++      call ZGEMM('N','N',int(na,kind=BLAS_KIND), int(nev,kind=BLAS_KIND), int(na,kind=BLAS_KIND), CONE, &
++                 as_complex, int(na,kind=BLAS_KIND), z,int(na,kind=BLAS_KIND), CZERO, tmp1, int(na,kind=BLAS_KIND) )
++#else
++      call CGEMM('N','N', int(na,kind=BLAS_KIND), int(nev,kind=BLAS_KIND), int(na,kind=BLAS_KIND), CONE, &
++                  as_complex, int(na,kind=BLAS_KIND), z, int(na,kind=BLAS_KIND), CZERO, tmp1, int(na,kind=BLAS_KIND) )
++#endif
++#endif /* WITH_MPI */
++
++      !  tmp1 = A*Zi - Zi*EVi
++      tmp1(:,:) =  tmp1(:,:) - tmp2(:,:)
++      
++      ! Get maximum norm of columns of tmp1
++      errmax = 0.0_rk
++
++      do i=1,nev
++        xc = (0.0_rk,0.0_rk)
++#ifdef WITH_MPI
++#ifdef DOUBLE_PRECISION_REAL
++        call PZDOTC(int(na,kind=BLAS_KIND), xc, tmp1, 1_BLAS_KIND, int(i,kind=BLAS_KIND), sc_desc, &
++                    1_BLAS_KIND, tmp1, 1_BLAS_KIND, int(i,kind=BLAS_KIND), sc_desc, 1_BLAS_KIND)
++#else
++        call PCDOTC(int(na,kind=BLAS_KIND), xc, tmp1, 1_BLAS_KIND, int(i,kind=BLAS_KIND), sc_desc, &
++                    1_BLAS_KIND, tmp1, 1_BLAS_KIND, int(i,kind=BLAS_KIND), sc_desc, 1_BLAS_KIND)
++#endif
++#else /* WITH_MPI */
++#ifdef DOUBLE_PRECISION_REAL
++        xc = ZDOTC(int(na,kind=BLAS_KIND) ,tmp1, 1_BLAS_KIND, tmp1, 1_BLAS_KIND)
++#else
++        xc = CDOTC(int(na,kind=BLAS_KIND) ,tmp1, 1_BLAS_KIND, tmp1, 1_BLAS_KIND)
++#endif
++#endif /* WITH_MPI */
++        errmax = max(errmax, sqrt(real(xc,kind=REAL_DATATYPE)))
++      enddo
++
++      ! Get maximum error norm over all processors
++      err = errmax
++#ifdef WITH_MPI
++      call mpi_allreduce(err, errmax, 1_MPI_KIND, MPI_REAL_PRECISION, MPI_MAX, int(MPI_COMM_WORLD,kind=MPI_KIND), mpierr)
++#else /* WITH_MPI */
++      errmax = err
++#endif /* WITH_MPI */
++      if (myid==0) print *,'%Results of numerical residual checks, using complex arithmetic:'
++      if (myid==0) print *,'%Error Residual     :',errmax
++      if (nev .ge. 2) then
++        if (errmax .gt. tol_res .or. errmax .eq. 0.0_rk) then
++          status = 1
++        endif
++      else
++        if (errmax .gt. tol_res) then
++          status = 1
++        endif
++      endif
++
++      ! 2. Eigenvector orthogonality
++      tmp2(:,:) = z(:,:)
++      tmp1 = 0
++#ifdef WITH_MPI
++#ifdef DOUBLE_PRECISION_REAL
++      call PZGEMM('C', 'N', int(nev,kind=BLAS_KIND), int(nev,kind=BLAS_KIND), int(na,kind=BLAS_KIND), &
++                  CONE, z, 1_BLAS_KIND, 1_BLAS_KIND, &
++                  sc_desc, tmp2, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, CZERO, tmp1, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc)
++#else
++      call PCGEMM('C', 'N', int(nev,kind=BLAS_KIND), int(nev,kind=BLAS_KIND), int(na,kind=BLAS_KIND), &
++                  CONE, z, 1_BLAS_KIND, 1_BLAS_KIND, &
++                  sc_desc, tmp2, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, CZERO, tmp1, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc)
++#endif
++
++#else /* WITH_MPI */
++#ifdef DOUBLE_PRECISION_REAL
++      call ZGEMM('C','N', int(nev,kind=BLAS_KIND) , int(nev,kind=BLAS_KIND), int(na,kind=BLAS_KIND),CONE, z, &
++                 int(na,kind=BLAS_KIND), tmp2, int(na,kind=BLAS_KIND), CZERO, tmp1, int(na,kind=BLAS_KIND))
++#else
++      call CGEMM('C','N', int(nev,kind=BLAS_KIND) , int(nev,kind=BLAS_KIND), int(na,kind=BLAS_KIND),CONE, z, &
++                 int(na,kind=BLAS_KIND), tmp2, int(na,kind=BLAS_KIND), CZERO, tmp1, int(na,kind=BLAS_KIND))
++#endif
++#endif /* WITH_MPI */
++      ! First check whether the diagonal elements are 1, i.e. the "normality" of the eigenvectors
++      err = 0.0_rk
++      do i=1, nev
++        if (map_global_array_index_to_local_index(int(i,kind=c_int), int(i,kind=c_int), row_Local, col_Local, &
++                                                  int(nblk,kind=c_int), int(np_rows,kind=c_int), int(np_cols,kind=c_int), &
++                                                  int(my_prow,kind=c_int), int(my_pcol,kind=c_int)) ) then
++           rowLocal = int(row_Local,kind=INT_TYPE)
++           colLocal = int(col_Local,kind=INT_TYPE)
++           err = max(err, abs(tmp1(rowLocal,colLocal) - CONE))
++         endif
++      end do
++#ifdef WITH_MPI
++      call mpi_allreduce(err, errmax, 1_MPI_KIND, MPI_REAL_PRECISION, MPI_MAX, int(MPI_COMM_WORLD,kind=MPI_KIND), mpierr)
++#else /* WITH_MPI */
++      errmax = err
++#endif /* WITH_MPI */
++      if (myid==0) print *,'%Maximal error in eigenvector lengths:',errmax
++
++      ! Second, find the maximal error in the whole Z**T * Z matrix (its difference from the identity matrix)
++      ! Initialize tmp2 to unit matrix
++      tmp2 = 0
++#ifdef WITH_MPI
++#ifdef DOUBLE_PRECISION_REAL
++      call PZLASET('A', int(nev,kind=BLAS_KIND), int(nev,kind=BLAS_KIND), CZERO, CONE, tmp2, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc)
++#else
++      call PCLASET('A', int(nev,kind=BLAS_KIND), int(nev,kind=BLAS_KIND), CZERO, CONE, tmp2, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc)
++#endif
++#else /* WITH_MPI */
++#ifdef DOUBLE_PRECISION_REAL
++      call ZLASET('A',int(nev,kind=BLAS_KIND) ,int(nev,kind=BLAS_KIND) ,CZERO, CONE, tmp2, int(na,kind=BLAS_KIND))
++#else
++      call CLASET('A',int(nev,kind=BLAS_KIND) ,int(nev,kind=BLAS_KIND) ,CZERO, CONE, tmp2, int(na,kind=BLAS_KIND))
++#endif
++#endif /* WITH_MPI */
++
++      ! tmp1 = Z**T * Z - unit matrix
++      tmp1(:,:) =  tmp1(:,:) - tmp2(:,:)
++
++      ! Get maximum error (max abs value in tmp1)
++      err = maxval(abs(tmp1))
++#ifdef WITH_MPI
++      call mpi_allreduce(err, errmax, 1_MPI_KIND, MPI_REAL_PRECISION, MPI_MAX, int(MPI_COMM_WORLD,kind=MPI_KIND), mpierr)
++#else /* WITH_MPI */
++      errmax = err
++#endif /* WITH_MPI */
++      if (myid==0) print *,'%Error Orthogonality:',errmax
++
++      if (nev .ge. 2) then
++        if (errmax .gt. tol_orth .or. errmax .eq. 0.0_rk) then
++          status = 1
++        endif
++      else
++        if (errmax .gt. tol_orth) then
++          status = 1
++        endif
++      endif
++      
++      deallocate(as_complex)
++    end function
++
++#endif /* REALCASE */
++
++#if REALCASE == 1
++#ifdef DOUBLE_PRECISION_REAL
++    !c> TEST_C_INT_TYPE check_correctness_evp_numeric_residuals_ss_real_double_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE nev, TEST_C_INT_TYPE na_rows, TEST_C_INT_TYPE na_cols,
++    !c>                                         double *as, complex double *z, double *ev,  TEST_C_INT_TYPE sc_desc[9],
++    !c>                                         TEST_C_INT_TYPE nblk, TEST_C_INT_TYPE myid, TEST_C_INT_TYPE np_rows, TEST_C_INT_TYPE np_cols, TEST_C_INT_TYPE my_prow, TEST_C_INT_TYPE my_pcol);
++#else
++    !c> TEST_C_INT_TYPE check_correctness_evp_numeric_residuals_ss_real_single_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE nev, TEST_C_INT_TYPE na_rows, TEST_C_INT_TYPE na_cols,
++    !c>                                         float *as, complex float *z, float *ev, TEST_C_INT_TYPE sc_desc[9],
++    !c>                                         TEST_C_INT_TYPE nblk, TEST_C_INT_TYPE myid, TEST_C_INT_TYPE np_rows, TEST_C_INT_TYPE np_cols, TEST_C_INT_TYPE my_prow, TEST_C_INT_TYPE my_pcol);
++#endif
++#endif /* REALCASE */
++
++#if REALCASE == 1
++function check_correctness_evp_numeric_residuals_ss_real_&
++&PRECISION&
++&_f (na, nev, na_rows, na_cols, as, z, ev, sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol) result(status) &
++      bind(C,name="check_correctness_evp_numeric_residuals_ss_&
++      &MATH_DATATYPE&
++      &_&
++      &PRECISION&
++      &_f")
++
++      use precision_for_tests
++      use iso_c_binding
++
++      implicit none
++#include "./test_precision_kinds.F90"
++
++      TEST_INT_TYPE            :: status
++      TEST_INT_TYPE, value     :: na, nev, myid, na_rows, na_cols, nblk, np_rows, np_cols, my_prow, my_pcol
++      real(kind=rck)            :: as(1:na_rows,1:na_cols)
++      complex(kind=rck)         :: z(1:na_rows,1:na_cols)
++      real(kind=rck)            :: ev(1:na)
++      TEST_INT_TYPE            :: sc_desc(1:9)
++
++      status = check_correctness_evp_numeric_residuals_ss_real_&
++      &PRECISION&
++      & (na, nev, as, z, ev, sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol)
++    end function
++#endif /* REALCASE */
++
++function check_correctness_evp_numeric_residuals_&
++    &MATH_DATATYPE&
++    &_&
++    &PRECISION&
++    & (na, nev, as, z, ev, sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol, bs) result(status)
++ 
++      use tests_blas_interfaces
++      use tests_scalapack_interfaces
++      use precision_for_tests
++      implicit none
++#include "./test_precision_kinds.F90"
++      TEST_INT_TYPE                                 :: status
++      TEST_INT_TYPE, intent(in)                     :: na, nev, nblk, myid, np_rows, np_cols, my_prow, my_pcol
++      MATH_DATATYPE(kind=rck), intent(in)           :: as(:,:), z(:,:)
++      MATH_DATATYPE(kind=rck), intent(in), optional :: bs(:,:)
++      real(kind=rk)                                 :: ev(:)
++      MATH_DATATYPE(kind=rck), dimension(size(as,dim=1),size(as,dim=2)) :: tmp1, tmp2
++      MATH_DATATYPE(kind=rck)                       :: xc
++
++      TEST_INT_TYPE                 :: sc_desc(:)
++
++      TEST_INT_TYPE                 :: i, rowLocal, colLocal
++      integer(kind=c_int)           :: row_Local, col_Local
++      real(kind=rck)                :: err, errmax
++
++      TEST_INT_MPI_TYPE             :: mpierr
++
++! tolerance for the residual test for different math type/precision setups
++      real(kind=rk), parameter       :: tol_res_real_double      = 5e-12_rk
++      real(kind=rk), parameter       :: tol_res_real_single      = 3e-2_rk
++      real(kind=rk), parameter       :: tol_res_complex_double   = 5e-12_rk
++      real(kind=rk), parameter       :: tol_res_complex_single   = 3e-2_rk
++      ! note: no initializer here, since an initializer would imply the SAVE
++      ! attribute and the generalized penalty applied below would then
++      ! accumulate across repeated calls
++      real(kind=rk)                  :: tol_res
++      ! precision of generalized problem is lower
++      real(kind=rk), parameter       :: generalized_penalty = 10.0_rk
++
++      ! tolerance for the orthogonality test for different math type/precision setups
++      real(kind=rk), parameter       :: tol_orth_real_double     = 5e-11_rk
++      real(kind=rk), parameter       :: tol_orth_real_single     = 9e-2_rk
++      real(kind=rk), parameter       :: tol_orth_complex_double  = 5e-11_rk
++      real(kind=rk), parameter       :: tol_orth_complex_single  = 9e-3_rk
++      real(kind=rk), parameter       :: tol_orth                 = tol_orth_&
++                                                                          &MATH_DATATYPE&
++                                                                          &_&
++                                                                          &PRECISION
++
++      tol_res = tol_res_&
++                &MATH_DATATYPE&
++                &_&
++                &PRECISION
++      if (present(bs)) then
++        tol_res = generalized_penalty * tol_res
++      endif
++      status = 0
++
++      ! 1. Residual (maximum of || A*Zi - Zi*EVi ||)
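++      ! i.e. for each eigenpair (lambda_i, z_i) compute the 2-norm of
++      ! A*z_i - lambda_i*z_i (A*z_i - lambda_i*B*z_i in the generalized case)
++      ! and compare the maximum over i against tol_res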
++     
++!       tmp1 = Zi*EVi
++      tmp1(:,:) = z(:,:)
++      do i=1,nev
++        xc = ev(i)
++#ifdef WITH_MPI
++        call p&
++            &BLAS_CHAR&
++            &scal(na, xc, tmp1, 1_BLAS_KIND, i, sc_desc, 1_BLAS_KIND)
++#else /* WITH_MPI */
++        call BLAS_CHAR&
++            &scal(na, xc, tmp1(:,i), 1_BLAS_KIND)
++#endif /* WITH_MPI */
++      enddo
++
++      ! for generalized EV problem, multiply by bs as well
++      ! tmp2 = B * tmp1
++      if(present(bs)) then
++#ifdef WITH_MPI
++      call scal_PRECISION_GEMM('N', 'N', na, nev, na, ONE, bs, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, &
++                               tmp1, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, ZERO, tmp2, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc)
++#else /* WITH_MPI */
++      call PRECISION_GEMM('N','N',na,nev,na,ONE,bs,na,tmp1,na,ZERO,tmp2,na)
++#endif /* WITH_MPI */
++      else
++        ! normal eigenvalue problem .. no need to multiply
++        tmp2(:,:) = tmp1(:,:)
++      end if
++
++      ! tmp1 =  A * Z
++      ! as is original stored matrix, Z are the EVs
++#ifdef WITH_MPI
++      call scal_PRECISION_GEMM('N', 'N', na, nev, na, ONE, as, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, &
++                  z, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, ZERO, tmp1, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc)
++#else /* WITH_MPI */
++      call PRECISION_GEMM('N','N',na,nev,na,ONE,as,na,z,na,ZERO,tmp1,na)
++#endif /* WITH_MPI */
++
++      !  tmp1 = A*Zi - Zi*EVi
++      tmp1(:,:) =  tmp1(:,:) - tmp2(:,:)
++
++      ! Get maximum norm of columns of tmp1
++      errmax = 0.0_rk
++
++      do i=1,nev
++#if REALCASE == 1
++        err = 0.0_rk
++#ifdef WITH_MPI
++        call scal_PRECISION_NRM2(na, err, tmp1, 1_BLAS_KIND, i, sc_desc, 1_BLAS_KIND)
++#else /* WITH_MPI */
++        err = PRECISION_NRM2(na,tmp1(1,i),1_BLAS_KIND)
++#endif /* WITH_MPI */
++        errmax = max(errmax, err)
++#endif /* REALCASE */
++
++#if COMPLEXCASE == 1
++        xc = 0
++#ifdef WITH_MPI
++        call scal_PRECISION_DOTC(na, xc, tmp1, 1_BLAS_KIND, i, sc_desc, &
++                                 1_BLAS_KIND, tmp1, 1_BLAS_KIND, i, sc_desc, 1_BLAS_KIND)
++#else /* WITH_MPI */
++        xc = PRECISION_DOTC(na,tmp1,1_BLAS_KIND,tmp1,1_BLAS_KIND)
++#endif /* WITH_MPI */
++        errmax = max(errmax, sqrt(real(xc,kind=REAL_DATATYPE)))
++#endif /* COMPLEXCASE */
++      enddo
++
++      ! Get maximum error norm over all processors
++      err = errmax
++#ifdef WITH_MPI
++      call mpi_allreduce(err, errmax, 1_MPI_KIND, MPI_REAL_PRECISION, MPI_MAX, MPI_COMM_WORLD, mpierr)
++#else /* WITH_MPI */
++      errmax = err
++#endif /* WITH_MPI */
++      if (myid==0) print *,'Results of numerical residual checks:'
++      if (myid==0) print *,'Error Residual     :',errmax
++      if (nev .ge. 2) then
++        if (errmax .gt. tol_res .or. errmax .eq. 0.0_rk) then
++          status = 1
++        endif
++      else
++        if (errmax .gt. tol_res) then
++          status = 1
++        endif
++      endif
++
++      ! 2. Eigenvector orthogonality
++      if(present(bs)) then
++        !for the generalized EVP, the eigenvectors should be B-orthogonal, not orthogonal
++        ! tmp2 = B * Z
++        tmp2(:,:) = 0.0_rck
++#ifdef WITH_MPI
++        call scal_PRECISION_GEMM('N', 'N', na, nev, na, ONE, bs, 1_BLAS_KIND, 1_BLAS_KIND, &
++                        sc_desc, z, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, ZERO, tmp2, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc)
++#else /* WITH_MPI */
++        call PRECISION_GEMM('N','N', na, nev, na, ONE, bs, na, z, na, ZERO, tmp2, na)
++#endif /* WITH_MPI */
++
++      else
++        tmp2(:,:) = z(:,:)
++      endif
++      ! tmp1 = Z**T * tmp2
++      ! actually tmp1 = Z**T * Z for standard case and tmp1 = Z**T * B * Z for generalized
++      tmp1 = 0
++#ifdef WITH_MPI
++      call scal_PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', nev, nev, na, ONE, z, 1_BLAS_KIND, 1_BLAS_KIND, &
++                        sc_desc, tmp2, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, ZERO, &
++                        tmp1, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc)
++#else /* WITH_MPI */
++      call PRECISION_GEMM(BLAS_TRANS_OR_CONJ,'N',nev,nev,na,ONE,z,na,tmp2,na,ZERO,tmp1,na)
++#endif /* WITH_MPI */
++      ! First check whether the diagonal elements are 1, i.e. the "normality" of the eigenvectors
++      err = 0.0_rk
++      do i=1, nev
++        if (map_global_array_index_to_local_index(int(i,kind=c_int), int(i,kind=c_int) , row_Local, col_Local, &
++                                                  int(nblk,kind=c_int), int(np_rows,kind=c_int), &
++                                                  int(np_cols,kind=c_int), int(my_prow,kind=c_int), &
++                                                  int(my_pcol,kind=c_int) )) then
++           rowLocal = int(row_Local,kind=INT_TYPE)
++           colLocal = int(col_Local,kind=INT_TYPE)
++           err = max(err, abs(tmp1(rowLocal,colLocal) - 1.0_rk))
++         endif
++      end do
++#ifdef WITH_MPI
++      call mpi_allreduce(err, errmax, 1_MPI_KIND, MPI_REAL_PRECISION, MPI_MAX, MPI_COMM_WORLD, mpierr)
++#else /* WITH_MPI */
++      errmax = err
++#endif /* WITH_MPI */
++      if (myid==0) print *,'Maximal error in eigenvector lengths:',errmax
++
++      ! Second, find the maximal error in the whole Z**T * Z matrix (its difference from the identity matrix)
++      ! Initialize tmp2 to unit matrix
++      tmp2 = 0
++#ifdef WITH_MPI
++      call scal_PRECISION_LASET('A', nev, nev, ZERO, ONE, tmp2, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc)
++#else /* WITH_MPI */
++      call PRECISION_LASET('A',nev,nev,ZERO,ONE,tmp2,na)
++#endif /* WITH_MPI */
++
++      ! tmp1 = Z**T * Z - unit matrix
++      tmp1(:,:) =  tmp1(:,:) - tmp2(:,:)
++
++      ! Get maximum error (max abs value in tmp1)
++      err = maxval(abs(tmp1))
++#ifdef WITH_MPI
++      call mpi_allreduce(err, errmax, 1_MPI_KIND, MPI_REAL_PRECISION, MPI_MAX, MPI_COMM_WORLD, mpierr)
++#else /* WITH_MPI */
++      errmax = err
++#endif /* WITH_MPI */
++      if (myid==0) print *,'Error Orthogonality:',errmax
++
++      if (nev .ge. 2) then
++        if (errmax .gt. tol_orth .or. errmax .eq. 0.0_rk) then
++          status = 1
++        endif
++      else
++        if (errmax .gt. tol_orth) then
++          status = 1
++        endif
++      endif
++    end function
++
++#if REALCASE == 1
++#ifdef DOUBLE_PRECISION_REAL
++    !c> TEST_C_INT_TYPE check_correctness_evp_numeric_residuals_real_double_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE nev,
++    !c>                                                                       TEST_C_INT_TYPE na_rows, TEST_C_INT_TYPE na_cols,
++    !c>                                                                       double *as, double *z, double *ev, 
++    !c>                                                                       TEST_C_INT_TYPE sc_desc[9],
++    !c>                                                                       TEST_C_INT_TYPE nblk, TEST_C_INT_TYPE myid, 
++    !c>                                                                       TEST_C_INT_TYPE np_rows, 
++    !c>                                                                       TEST_C_INT_TYPE np_cols, 
++    !c>                                                                       TEST_C_INT_TYPE my_prow, TEST_C_INT_TYPE my_pcol);
++#else
++    !c> TEST_C_INT_TYPE check_correctness_evp_numeric_residuals_real_single_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE nev, 
++    !c>                                                                       TEST_C_INT_TYPE na_rows, TEST_C_INT_TYPE na_cols,
++    !c>                                                                       float *as, float *z, float *ev, 
++    !c>                                                                       TEST_C_INT_TYPE sc_desc[9],
++    !c>                                                                       TEST_C_INT_TYPE nblk, TEST_C_INT_TYPE myid, 
++    !c>                                                                       TEST_C_INT_TYPE np_rows, 
++    !c>                                                                       TEST_C_INT_TYPE np_cols, 
++    !c>                                                                       TEST_C_INT_TYPE my_prow, TEST_C_INT_TYPE my_pcol);
++#endif
++#endif /* REALCASE */
++
++#if COMPLEXCASE == 1
++#ifdef DOUBLE_PRECISION_COMPLEX
++    !c> TEST_C_INT_TYPE check_correctness_evp_numeric_residuals_complex_double_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE nev, 
++    !c>                                                              TEST_C_INT_TYPE na_rows, TEST_C_INT_TYPE na_cols,
++    !c>                                                              complex double *as, complex double *z, double *ev, 
++    !c>                                                              TEST_C_INT_TYPE sc_desc[9],
++    !c>                                                              TEST_C_INT_TYPE nblk, TEST_C_INT_TYPE myid, 
++    !c>                                                              TEST_C_INT_TYPE np_rows, TEST_C_INT_TYPE np_cols, 
++    !c>                                                              TEST_C_INT_TYPE my_prow, TEST_C_INT_TYPE my_pcol);
++#else
++    !c> TEST_C_INT_TYPE check_correctness_evp_numeric_residuals_complex_single_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE nev, 
++    !c>                                                                  TEST_C_INT_TYPE na_rows, TEST_C_INT_TYPE na_cols,
++    !c>                                                                  complex float *as, complex float *z, float *ev, 
++    !c>                                                                  TEST_C_INT_TYPE sc_desc[9],
++    !c>                                                                  TEST_C_INT_TYPE nblk, TEST_C_INT_TYPE myid, 
++    !c>                                                                  TEST_C_INT_TYPE np_rows, TEST_C_INT_TYPE np_cols, 
++    !c>                                                                  TEST_C_INT_TYPE my_prow, TEST_C_INT_TYPE my_pcol);
++#endif
++#endif /* COMPLEXCASE */
++
++function check_correctness_evp_numeric_residuals_&
++&MATH_DATATYPE&
++&_&
++&PRECISION&
++&_f (na, nev, na_rows, na_cols, as, z, ev, sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol) result(status) &
++      bind(C,name="check_correctness_evp_numeric_residuals_&
++      &MATH_DATATYPE&
++      &_&
++      &PRECISION&
++      &_f")
++
++      use precision_for_tests
++      use iso_c_binding
++
++      implicit none
++#include "./test_precision_kinds.F90"
++
++      TEST_INT_TYPE            :: status
++      TEST_INT_TYPE, value     :: na, nev, myid, na_rows, na_cols, nblk, np_rows, np_cols, my_prow, my_pcol
++      MATH_DATATYPE(kind=rck)  :: as(1:na_rows,1:na_cols), z(1:na_rows,1:na_cols)
++      real(kind=rck)           :: ev(1:na)
++      TEST_INT_TYPE            :: sc_desc(1:9)
++
++      status = check_correctness_evp_numeric_residuals_&
++      &MATH_DATATYPE&
++      &_&
++      &PRECISION&
++      & (na, nev, as, z, ev, sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol)
++
++    end function
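++
++    ! C-side usage sketch (illustration only; assumes a 32-bit integer build,
++    ! i.e. TEST_C_INT_TYPE is int):
++    !
++    !   int ok = check_correctness_evp_numeric_residuals_real_double_f(
++    !              na, nev, na_rows, na_cols, as, z, ev, sc_desc,
++    !              nblk, myid, np_rows, np_cols, my_prow, my_pcol);
++    !   /* ok == 0 means both residual and orthogonality tests passed */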
++
++!---- variant for the generalized eigenproblem
++!---- unlike Fortran, C cannot pass an optional parameter,
++!---- so we define a separate function
++#if REALCASE == 1
++#ifdef DOUBLE_PRECISION_REAL
++    !c> TEST_C_INT_TYPE check_correctness_evp_gen_numeric_residuals_real_double_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE nev, 
++    !c>                                                               TEST_C_INT_TYPE na_rows, TEST_C_INT_TYPE na_cols,
++    !c>                                                               double *as, double *z, double *ev,
++    !c>                                                               TEST_C_INT_TYPE sc_desc[9],
++    !c>                                                               TEST_C_INT_TYPE nblk, TEST_C_INT_TYPE myid, 
++    !c>                                                               TEST_C_INT_TYPE np_rows, TEST_C_INT_TYPE np_cols, 
++    !c>                                                               TEST_C_INT_TYPE my_prow, TEST_C_INT_TYPE my_pcol,
++    !c>                                                               double *bs);
++#else
++    !c> TEST_C_INT_TYPE check_correctness_evp_gen_numeric_residuals_real_single_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE nev, 
++    !c>                                                                           TEST_C_INT_TYPE na_rows, TEST_C_INT_TYPE na_cols,
++    !c>                                                                           float *as, float *z, float *ev, 
++    !c>                                                                           TEST_C_INT_TYPE sc_desc[9],
++    !c>                                                                           TEST_C_INT_TYPE nblk, TEST_C_INT_TYPE myid,
++    !c>                                                                           TEST_C_INT_TYPE np_rows, 
++    !c>                                                                           TEST_C_INT_TYPE np_cols, 
++    !c>                                                                           TEST_C_INT_TYPE my_prow, 
++    !c>                                                                           TEST_C_INT_TYPE my_pcol, 
++    !c>                                                                           float *bs);
++#endif
++#endif /* REALCASE */
++
++#if COMPLEXCASE == 1
++#ifdef DOUBLE_PRECISION_COMPLEX
++    !c> TEST_C_INT_TYPE check_correctness_evp_gen_numeric_residuals_complex_double_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE nev,
++    !c>                                                                    TEST_C_INT_TYPE na_rows, TEST_C_INT_TYPE na_cols,
++    !c>                                                                    complex double *as, complex double *z, double *ev,
++    !c>                                                                    TEST_C_INT_TYPE sc_desc[9],
++    !c>                                                                    TEST_C_INT_TYPE nblk, TEST_C_INT_TYPE myid, 
++    !c>                                                                    TEST_C_INT_TYPE np_rows, TEST_C_INT_TYPE np_cols,
++    !c>                                                                    TEST_C_INT_TYPE my_prow, TEST_C_INT_TYPE my_pcol,
++    !c>                                                                    complex double *bs);
++#else
++    !c> TEST_C_INT_TYPE check_correctness_evp_gen_numeric_residuals_complex_single_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE nev,
++    !c>                                                                    TEST_C_INT_TYPE na_rows, TEST_C_INT_TYPE na_cols,
++    !c>                                                                    complex float *as, complex float *z, float *ev, 
++    !c>                                                                    TEST_C_INT_TYPE sc_desc[9],
++    !c>                                                                    TEST_C_INT_TYPE nblk, TEST_C_INT_TYPE myid, 
++    !c>                                                                    TEST_C_INT_TYPE np_rows, TEST_C_INT_TYPE np_cols, 
++    !c>                                                                    TEST_C_INT_TYPE my_prow, TEST_C_INT_TYPE my_pcol,
++    !c>                                                                    complex float *bs);
++#endif
++#endif /* COMPLEXCASE */
++
++function check_correctness_evp_gen_numeric_residuals_&
++&MATH_DATATYPE&
++&_&
++&PRECISION&
++&_f (na, nev, na_rows, na_cols, as, z, ev, sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol, bs) result(status) &
++      bind(C,name="check_correctness_evp_gen_numeric_residuals_&
++      &MATH_DATATYPE&
++      &_&
++      &PRECISION&
++      &_f")
++
++      use iso_c_binding
++      use precision_for_tests
++      implicit none
++#include "./test_precision_kinds.F90"
++
++      TEST_INT_TYPE            :: status
++      TEST_INT_TYPE, value     :: na, nev, myid, na_rows, na_cols, nblk, np_rows, np_cols, my_prow, my_pcol
++      MATH_DATATYPE(kind=rck)  :: as(1:na_rows,1:na_cols), z(1:na_rows,1:na_cols), bs(1:na_rows,1:na_cols)
++      real(kind=rck)           :: ev(1:na)
++      TEST_INT_TYPE            :: sc_desc(1:9)
++
++      status = check_correctness_evp_numeric_residuals_&
++      &MATH_DATATYPE&
++      &_&
++      &PRECISION&
++      & (na, nev, as, z, ev, sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol, bs)
++
++    end function
++
++    !-----------------------------------------------------------------------------------------------------------
++
++    function check_correctness_eigenvalues_toeplitz_&
++    &MATH_DATATYPE&
++    &_&
++    &PRECISION&
++    & (na, diagonalElement, subdiagonalElement, ev, z, myid) result(status)
++      use iso_c_binding
++      use precision_for_tests
++      implicit none
++#include "./test_precision_kinds.F90"
++
++      TEST_INT_TYPE               :: status, ii, j, myid
++      TEST_INT_TYPE, intent(in)   :: na
++      real(kind=rck) :: diagonalElement, subdiagonalElement
++      real(kind=rck) :: ev_analytic(na), ev(na)
++      MATH_DATATYPE(kind=rck) :: z(:,:)
++
++#if defined(DOUBLE_PRECISION_REAL) || defined(DOUBLE_PRECISION_COMPLEX)
++      real(kind=rck), parameter   :: pi = 3.141592653589793238462643383279_c_double
++#else
++      real(kind=rck), parameter   :: pi = 3.1415926535897932_c_float
++#endif
++      real(kind=rck)              :: tmp, maxerr
++      TEST_INT_TYPE               :: loctmp
++      status = 0
++
++     ! analytic solution
++     do ii=1, na
++       ev_analytic(ii) = diagonalElement + 2.0_rk * &
++                         subdiagonalElement *cos( pi*real(ii,kind=rk)/ &
++                         real(na+1,kind=rk) )
++     enddo
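++     ! (this is the closed-form spectrum of a symmetric tridiagonal Toeplitz
++     !  matrix: lambda_k = diagonalElement + 2*subdiagonalElement*cos(k*pi/(na+1)))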
++
++     ! sort analytic solution:
++
++     ! this hack is neither elegant nor optimized: for huge matrices it can be
++     ! expensive; a proper sorting algorithm could be implemented here
++
++     tmp    = minval(ev_analytic)
++     loctmp = minloc(ev_analytic, 1)
++
++     ev_analytic(loctmp) = ev_analytic(1)
++     ev_analytic(1) = tmp
++     do ii=2, na
++       tmp    = ev_analytic(ii)
++       loctmp = ii   ! reset, so a stale index from a previous pass is never used
++       do j= ii+1, na
++         if (ev_analytic(j) .lt. tmp) then
++           tmp    = ev_analytic(j)
++           loctmp = j
++         endif
++       enddo
++       ev_analytic(loctmp) = ev_analytic(ii)
++       ev_analytic(ii) = tmp
++     enddo
++
++     ! compute the maximum relative error of the eigenvalues
++     maxerr = maxval( abs(ev(:) - ev_analytic(:)) / abs(ev_analytic(:)) )
++
++#if defined(DOUBLE_PRECISION_REAL) || defined(DOUBLE_PRECISION_COMPLEX)
++     if (abs(maxerr) .gt. 8.e-13_c_double) then
++#else
++     if (abs(maxerr) .gt. 8.e-4_c_float) then
++#endif
++       status = 1
++       if (myid .eq. 0) then
++         print *,"Result of Toeplitz matrix test: "
++         print *,"Eigenvalues differ from analytic solution: maxerr = ",abs(maxerr)
++       endif
++     endif
++
++    if (status .eq. 0) then
++       if (myid .eq. 0) then
++         print *,"Result of Toeplitz matrix test: test passed"
++         print *,"Eigenvalues differ from analytic solution: maxerr = ",abs(maxerr)
++       endif
++    endif
++    end function
++
++    function check_correctness_cholesky_&
++    &MATH_DATATYPE&
++    &_&
++    &PRECISION&
++    & (na, a, as, na_rows, sc_desc, myid) result(status)
++      use precision_for_tests
++      use tests_blas_interfaces
++      use tests_scalapack_interfaces
++      implicit none
++#include "./test_precision_kinds.F90"
++      TEST_INT_TYPE                                                     :: status
++      TEST_INT_TYPE, intent(in)                                         :: na, myid, na_rows
++
++      MATH_DATATYPE(kind=rck), intent(in)                               :: a(:,:), as(:,:)
++      MATH_DATATYPE(kind=rck), dimension(size(as,dim=1),size(as,dim=2)) :: tmp1, tmp2
++#if COMPLEXCASE == 1
++      ! needed for [z,c]lange from scalapack
++      real(kind=rk), dimension(2*size(as,dim=1),size(as,dim=2))         :: tmp1_real
++#endif
++      real(kind=rk)                                                     :: norm, normmax
++
++      TEST_INT_TYPE                                                     :: sc_desc(:)
++      real(kind=rck)                                                    :: err, errmax
++      TEST_INT_MPI_TYPE                                                 :: mpierr
++
++      status = 0
++      tmp1(:,:) = 0.0_rck
++
++
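++      ! the check: a holds the Cholesky factor U computed by the library for
++      ! the saved matrix as; tmp2 = U**T * U (U**H * U in the complex case)
++      ! is rebuilt below and must reproduce as up to the reported error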
++#if REALCASE == 1
++      ! tmp1 = a**T
++#ifdef WITH_MPI
++      call p&
++          &BLAS_CHAR&
++          &tran(na, na, 1.0_rck, a, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, &
++                0.0_rck, tmp1, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc)
++#else /* WITH_MPI */
++      tmp1 = transpose(a)
++#endif /* WITH_MPI */
++#endif /* REALCASE == 1 */
++
++#if COMPLEXCASE == 1
++      ! tmp1 = a**H
++#ifdef WITH_MPI
++      call p&
++            &BLAS_CHAR&
++            &tranc(na, na, ONE, a, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, &
++                   ZERO, tmp1, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc)
++#else /* WITH_MPI */
++      tmp1 = transpose(conjg(a))
++#endif /* WITH_MPI */
++#endif /* COMPLEXCASE == 1 */
++
++      ! tmp2 = a**T * a  (a**H * a in the complex case)
++#ifdef WITH_MPI
++      call p&
++            &BLAS_CHAR&
++            &gemm("N","N", na, na, na, ONE, tmp1, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, &
++                  a, 1_BLAS_KIND, 1_BLAS_KIND, &
++                  sc_desc, ZERO, tmp2, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc)
++#else /* WITH_MPI */
++      call BLAS_CHAR&
++                    &gemm("N","N", na, na, na, ONE, tmp1, na, a, na, ZERO, tmp2, na)
++#endif /* WITH_MPI */
++
++      ! compare tmp2 with original matrix
++      tmp2(:,:) = tmp2(:,:) - as(:,:)
++
++#ifdef WITH_MPI
++      norm = p&
++              &BLAS_CHAR&
++              &lange("M",na, na, tmp2, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, &
++#if COMPLEXCASE == 1
++              tmp1_real)
++#else
++              tmp1)
++#endif
++#else /* WITH_MPI */
++      norm = BLAS_CHAR&
++             &lange("M", na, na, tmp2, na_rows, &
++#if COMPLEXCASE == 1
++             tmp1_real)
++#else
++             tmp1)
++#endif
++#endif /* WITH_MPI */
++
++
++#ifdef WITH_MPI
++      call mpi_allreduce(norm, normmax, 1_MPI_KIND, MPI_REAL_PRECISION, MPI_MAX, MPI_COMM_WORLD, mpierr)
++#else /* WITH_MPI */
++      normmax = norm
++#endif /* WITH_MPI */
++
++      if (myid .eq. 0) then
++        print *," Maximum error of result: ", normmax
++      endif
++
++#if REALCASE == 1
++#ifdef DOUBLE_PRECISION_REAL
++!      if (normmax .gt. 5e-12_rk8 .or. normmax .eq. 0.0_rk8) then
++      if (normmax .gt. 5e-12_rk8) then
++        status = 1
++      endif
++#else
++!      if (normmax .gt. 5e-4_rk4 .or. normmax .eq. 0.0_rk4) then
++      if (normmax .gt. 5e-4_rk4 ) then
++        status = 1
++      endif
++#endif
++#endif
++
++#if COMPLEXCASE == 1
++#ifdef DOUBLE_PRECISION_COMPLEX
++!      if (normmax .gt. 5e-11_rk8 .or. normmax .eq. 0.0_rk8) then
++      if (normmax .gt. 5e-11_rk8 ) then
++        status = 1
++      endif
++#else
++!      if (normmax .gt. 5e-3_rk4 .or. normmax .eq. 0.0_rk4) then
++      if (normmax .gt. 5e-3_rk4) then
++        status = 1
++      endif
++#endif
++#endif
++    end function
++
++    function check_correctness_hermitian_multiply_&
++    &MATH_DATATYPE&
++    &_&
++    &PRECISION&
++    & (na, a, b, c, na_rows, sc_desc, myid) result(status)
++      use precision_for_tests
++      use tests_blas_interfaces
++      use tests_scalapack_interfaces
++      implicit none
++#include "./test_precision_kinds.F90"
++      TEST_INT_TYPE                                                   :: status
++      TEST_INT_TYPE, intent(in)                                       :: na, myid, na_rows
++      MATH_DATATYPE(kind=rck), intent(in)                             :: a(:,:), b(:,:), c(:,:)
++      MATH_DATATYPE(kind=rck), dimension(size(a,dim=1),size(a,dim=2)) :: tmp1, tmp2
++#if COMPLEXCASE == 1
++      real(kind=rk), dimension(2*size(a,dim=1),size(a,dim=2))         :: tmp1_real
++#endif
++      real(kind=rck)                                                  :: norm, normmax
++
++
++      TEST_INT_TYPE                                                   :: sc_desc(:)
++      real(kind=rck)                                                  :: err, errmax
++      TEST_INT_MPI_TYPE                                               :: mpierr
++
++      status = 0
++      tmp1(:,:) = ZERO
++
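++      ! the check: c as computed by the library must match a**T * b
++      ! (a**H * b in the complex case), which is recomputed here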
++#if REALCASE == 1
++      ! tmp1 = a**T
++#ifdef WITH_MPI
++      call p&
++            &BLAS_CHAR&
++            &tran(na, na, ONE, a, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, ZERO, tmp1, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc)
++#else /* WITH_MPI */
++      tmp1 = transpose(a)
++#endif /* WITH_MPI */
++
++#endif /* REALCASE == 1 */
++
++#if COMPLEXCASE == 1
++      ! tmp1 = a**H
++#ifdef WITH_MPI
++      call p&
++            &BLAS_CHAR&
++            &tranc(na, na, ONE, a, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, ZERO, tmp1, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc)
++#else /* WITH_MPI */
++      tmp1 = transpose(conjg(a))
++#endif /* WITH_MPI */
++#endif /* COMPLEXCASE == 1 */
++
++   ! tmp2 = tmp1 * b
++#ifdef WITH_MPI
++   call p&
++         &BLAS_CHAR&
++         &gemm("N","N", na, na, na, ONE, tmp1, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, b, 1_BLAS_KIND, 1_BLAS_KIND, &
++               sc_desc, ZERO, tmp2, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc)
++#else
++   call BLAS_CHAR&
++        &gemm("N","N", na, na, na, ONE, tmp1, na, b, na, ZERO, tmp2, na)
++#endif
++
++      ! compare tmp2 with c
++      tmp2(:,:) = tmp2(:,:) - c(:,:)
++
++#ifdef WITH_MPI
++      ! the work array (last argument) is not referenced for norm = "M",
++      ! so in the complex case the real helper array tmp1_real can be passed
++      norm = p&
++              &BLAS_CHAR&
++              &lange("M", na, na, tmp2, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, &
++#if COMPLEXCASE == 1              
++              tmp1_real)
++#else
++              tmp1)
++#endif
++#else /* WITH_MPI */
++      ! the work array (last argument) is not referenced for norm = "M",
++      ! so in the complex case the real helper array tmp1_real can be passed
++      norm = BLAS_CHAR&
++             &lange("M", na, na, tmp2, na_rows, &
++#if COMPLEXCASE == 1              
++              tmp1_real)
++#else
++              tmp1)
++#endif
++#endif /* WITH_MPI */
++
++#ifdef WITH_MPI
++      call mpi_allreduce(norm, normmax, 1_MPI_KIND, MPI_REAL_PRECISION, MPI_MAX, MPI_COMM_WORLD, mpierr)
++#else /* WITH_MPI */
++      normmax = norm
++#endif /* WITH_MPI */
++
++      if (myid .eq. 0) then
++        print *," Maximum error of result: ", normmax
++      endif
++
++#if REALCASE == 1
++#ifdef DOUBLE_PRECISION_REAL
++      if (normmax .gt. 5e-11_rk8 ) then
++        status = 1
++      endif
++#else
++      if (normmax .gt. 5e-3_rk4 ) then
++        status = 1
++      endif
++#endif
++#endif
++
++#if COMPLEXCASE == 1
++#ifdef DOUBLE_PRECISION_COMPLEX
++      if (normmax .gt. 5e-11_rk8 ) then
++        status = 1
++      endif
++#else
++      if (normmax .gt. 5e-3_rk4 ) then
++        status = 1
++      endif
++#endif
++#endif
++    end function
++
++    function check_correctness_eigenvalues_frank_&
++    &MATH_DATATYPE&
++    &_&
++    &PRECISION&
++    & (na, ev, z, myid) result(status)
++      use iso_c_binding
++      use precision_for_tests
++      implicit none
++#include "./test_precision_kinds.F90"
++
++      TEST_INT_TYPE                   :: status, i, j, myid
++      TEST_INT_TYPE, intent(in)       :: na
++      real(kind=rck)            :: ev_analytic(na), ev(na)
++      MATH_DATATYPE(kind=rck)   :: z(:,:)
++
++#if defined(DOUBLE_PRECISION_REAL) || defined(DOUBLE_PRECISION_COMPLEX)
++      real(kind=rck), parameter :: pi = 3.141592653589793238462643383279_c_double
++#else
++      real(kind=rck), parameter :: pi = 3.1415926535897932_c_float
++#endif
++      real(kind=rck)            :: tmp, maxerr
++      TEST_INT_TYPE                  :: loctmp
++      status = 0
++
++     ! analytic solution
++     do i = 1, na
++       j = na - i
++#if defined(DOUBLE_PRECISION_REAL) || defined(DOUBLE_PRECISION_COMPLEX)
++       ev_analytic(i) = pi * (2.0_c_double * real(j,kind=c_double) + 1.0_c_double) / &
++           (2.0_c_double * real(na,kind=c_double) + 1.0_c_double)
++       ev_analytic(i) = 0.5_c_double / (1.0_c_double - cos(ev_analytic(i)))
++#else
++       ev_analytic(i) = pi * (2.0_c_float * real(j,kind=c_float) + 1.0_c_float) / &
++           (2.0_c_float * real(na,kind=c_float) + 1.0_c_float)
++       ev_analytic(i) = 0.5_c_float / (1.0_c_float - cos(ev_analytic(i)))
++#endif
++     enddo
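++     ! (closed-form eigenvalues of the na x na Frank matrix, as coded above:
++     !  lambda_i = 1 / (2*(1 - cos(pi*(2*(na-i)+1)/(2*na+1)))))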
++
++     ! sort analytic solution:
++
++     ! this hack is neither elegant nor optimized: for huge matrices it can be
++     ! expensive; a proper sorting algorithm could be implemented here
++
++     tmp    = minval(ev_analytic)
++     loctmp = minloc(ev_analytic, 1)
++
++     ev_analytic(loctmp) = ev_analytic(1)
++     ev_analytic(1) = tmp
++     do i=2, na
++       tmp    = ev_analytic(i)
++       loctmp = i   ! reset, so a stale index from a previous pass is never used
++       do j= i+1, na
++         if (ev_analytic(j) .lt. tmp) then
++           tmp    = ev_analytic(j)
++           loctmp = j
++         endif
++       enddo
++       ev_analytic(loctmp) = ev_analytic(i)
++       ev_analytic(i) = tmp
++     enddo
++
++     ! compute the maximum relative error of the eigenvalues
++     maxerr = maxval( abs(ev(:) - ev_analytic(:)) / abs(ev_analytic(:)) )
++
++#if defined(DOUBLE_PRECISION_REAL) || defined(DOUBLE_PRECISION_COMPLEX)
++     if (maxerr .gt. 8.e-13_c_double) then
++#else
++     if (maxerr .gt. 8.e-4_c_float) then
++#endif
++       status = 1
++       if (myid .eq. 0) then
++         print *,"Result of Frank matrix test: "
++         print *,"Eigenvalues differ from analytic solution: maxerr = ",maxerr
++       endif
++     endif
++    end function
++
++! vim: syntax=fortran
+diff -ruN elpa-new_release_2021.11.001/examples/shared/test_output_type.F90 elpa-new_release_2021.11.001_ok/examples/shared/test_output_type.F90
+--- elpa-new_release_2021.11.001/examples/shared/test_output_type.F90	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/shared/test_output_type.F90	2022-01-26 10:10:58.372166000 +0100
+@@ -0,0 +1,11 @@
++#include "config-f90.h"
++
++module test_output_type
++
++ type :: output_t
++   logical :: eigenvectors
++   logical :: eigenvalues
++ end type
++
++
++end module
+diff -ruN elpa-new_release_2021.11.001/examples/shared/test_precision_kinds.F90 elpa-new_release_2021.11.001_ok/examples/shared/test_precision_kinds.F90
+--- elpa-new_release_2021.11.001/examples/shared/test_precision_kinds.F90	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/shared/test_precision_kinds.F90	2022-01-26 10:10:58.373135000 +0100
+@@ -0,0 +1,25 @@
++#ifdef REALCASE
++#ifdef DOUBLE_PRECISION
++  integer, parameter :: rk = C_DOUBLE
++  integer, parameter :: rck = C_DOUBLE
++#endif
++#ifdef SINGLE_PRECISION
++  integer, parameter :: rk = C_FLOAT
++  integer, parameter :: rck = C_FLOAT
++#endif
++  real(kind=rck), parameter      :: ZERO=0.0_rk, ONE = 1.0_rk
++#endif
++
++#ifdef COMPLEXCASE
++#ifdef DOUBLE_PRECISION
++  integer, parameter :: rk = C_DOUBLE
++  integer, parameter :: ck = C_DOUBLE_COMPLEX
++  integer, parameter :: rck = C_DOUBLE_COMPLEX
++#endif
++#ifdef SINGLE_PRECISION
++  integer, parameter :: rk = C_FLOAT
++  integer, parameter :: ck = C_FLOAT_COMPLEX
++  integer, parameter :: rck = C_FLOAT_COMPLEX
++#endif
++  complex(kind=rck), parameter     :: ZERO = (0.0_rk,0.0_rk), ONE = (1.0_rk,0.0_rk)
++#endif
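++
++! Note: this fragment deliberately defines no module; it is textually
++! #include'd after an "implicit none" in the test sources, with the
++! REALCASE/COMPLEXCASE and DOUBLE/SINGLE_PRECISION macros selecting the
++! kind parameters rk, ck and rck used throughout the templates.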
+diff -ruN elpa-new_release_2021.11.001/examples/shared/test_prepare_matrix.F90 elpa-new_release_2021.11.001_ok/examples/shared/test_prepare_matrix.F90
+--- elpa-new_release_2021.11.001/examples/shared/test_prepare_matrix.F90	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/shared/test_prepare_matrix.F90	2022-01-26 10:10:58.374062000 +0100
+@@ -0,0 +1,145 @@
++!    This file is part of ELPA.
++!
++!    The ELPA library was originally created by the ELPA consortium,
++!    consisting of the following organizations:
++!
++!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
++!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
++!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
++!      Informatik,
++!    - Technische Universität München, Lehrstuhl für Informatik mit
++!      Schwerpunkt Wissenschaftliches Rechnen,
++!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
++!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
++!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
++!      and
++!    - IBM Deutschland GmbH
++!
++!
++!    More information can be found here:
++!    http://elpa.mpcdf.mpg.de/
++!
++!    ELPA is free software: you can redistribute it and/or modify
++!    it under the terms of the version 3 of the license of the
++!    GNU Lesser General Public License as published by the Free
++!    Software Foundation.
++!
++!    ELPA is distributed in the hope that it will be useful,
++!    but WITHOUT ANY WARRANTY; without even the implied warranty of
++!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++!    GNU Lesser General Public License for more details.
++!
++!    You should have received a copy of the GNU Lesser General Public License
++!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
++!
++!    ELPA reflects a substantial effort on the part of the original
++!    ELPA consortium, and we ask you to respect the spirit of the
++!    license that we chose: i.e., please contribute any changes you
++!    may have back to the original ELPA library distribution, and keep
++!    any derivatives of ELPA under the same license that we chose for
++!    the original distribution, the GNU Lesser General Public License.
++!
++! Author: A. Marek, MPCDF
++#include "config-f90.h"
++
++module test_prepare_matrix
++
++  use precision_for_tests
++  interface prepare_matrix_random
++    module procedure prepare_matrix_random_complex_double
++    module procedure prepare_matrix_random_real_double
++#ifdef WANT_SINGLE_PRECISION_REAL
++    module procedure prepare_matrix_random_real_single
++#endif
++#ifdef WANT_SINGLE_PRECISION_COMPLEX
++    module procedure prepare_matrix_random_complex_single
++#endif
++   end interface
++
++
++  interface prepare_matrix_random_spd
++    module procedure prepare_matrix_random_spd_complex_double
++    module procedure prepare_matrix_random_spd_real_double
++#ifdef WANT_SINGLE_PRECISION_REAL
++    module procedure prepare_matrix_random_spd_real_single
++#endif
++#ifdef WANT_SINGLE_PRECISION_COMPLEX
++    module procedure prepare_matrix_random_spd_complex_single
++#endif
++   end interface
++
++
++  interface prepare_matrix_toeplitz
++    module procedure prepare_matrix_toeplitz_complex_double
++    module procedure prepare_matrix_toeplitz_real_double
++    module procedure prepare_matrix_toeplitz_mixed_complex_complex_double
++#ifdef WANT_SINGLE_PRECISION_REAL
++    module procedure prepare_matrix_toeplitz_real_single
++#endif
++#ifdef WANT_SINGLE_PRECISION_COMPLEX
++    module procedure prepare_matrix_toeplitz_complex_single
++    module procedure prepare_matrix_toeplitz_mixed_complex_complex_single
++#endif
++   end interface
++
++  interface prepare_matrix_frank
++    module procedure prepare_matrix_frank_complex_double
++    module procedure prepare_matrix_frank_real_double
++#ifdef WANT_SINGLE_PRECISION_REAL
++    module procedure prepare_matrix_frank_real_single
++#endif
++#ifdef WANT_SINGLE_PRECISION_COMPLEX
++    module procedure prepare_matrix_frank_complex_single
++#endif
++   end interface
++
++
++
++   private prows, pcols, map_global_array_index_to_local_index
++
++  contains
++
++#include "../../src/general/prow_pcol.F90"
++#include "../../src/general/map_global_to_local.F90"
++
++#define COMPLEXCASE 1
++#define DOUBLE_PRECISION 1
++#include "../../src/general/precision_macros.h"
++#include "test_prepare_matrix_template.F90"
++#undef DOUBLE_PRECISION
++#undef COMPLEXCASE
++
++
++#ifdef WANT_SINGLE_PRECISION_COMPLEX
++
++
++#define COMPLEXCASE 1
++#define SINGLE_PRECISION 1
++#include "../../src/general/precision_macros.h"
++#include "test_prepare_matrix_template.F90"
++#undef SINGLE_PRECISION
++#undef COMPLEXCASE
++#endif /* WANT_SINGLE_PRECISION_COMPLEX */
++
++
++#define REALCASE 1
++#define DOUBLE_PRECISION 1
++#include "../../src/general/precision_macros.h"
++#include "test_prepare_matrix_template.F90"
++#undef DOUBLE_PRECISION
++#undef REALCASE
++
++#ifdef WANT_SINGLE_PRECISION_REAL
++
++
++#define REALCASE 1
++#define SINGLE_PRECISION 1
++#include "../../src/general/precision_macros.h"
++#include "test_prepare_matrix_template.F90"
++#undef SINGLE_PRECISION
++#undef REALCASE
++
++#endif /* WANT_SINGLE_PRECISION_REAL */
++
++
++end module
+diff -ruN elpa-new_release_2021.11.001/examples/shared/test_prepare_matrix_template.F90 elpa-new_release_2021.11.001_ok/examples/shared/test_prepare_matrix_template.F90
+--- elpa-new_release_2021.11.001/examples/shared/test_prepare_matrix_template.F90	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/shared/test_prepare_matrix_template.F90	2022-01-26 10:10:58.375330000 +0100
+@@ -0,0 +1,510 @@
++!    This file is part of ELPA.
++!
++!    The ELPA library was originally created by the ELPA consortium,
++!    consisting of the following organizations:
++!
++!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
++!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
++!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
++!      Informatik,
++!    - Technische Universität München, Lehrstuhl für Informatik mit
++!      Schwerpunkt Wissenschaftliches Rechnen,
++!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
++!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
++!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
++!      and
++!    - IBM Deutschland GmbH
++!
++!
++!    More information can be found here:
++!    http://elpa.mpcdf.mpg.de/
++!
++!    ELPA is free software: you can redistribute it and/or modify
++!    it under the terms of the version 3 of the license of the
++!    GNU Lesser General Public License as published by the Free
++!    Software Foundation.
++!
++!    ELPA is distributed in the hope that it will be useful,
++!    but WITHOUT ANY WARRANTY; without even the implied warranty of
++!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++!    GNU Lesser General Public License for more details.
++!
++!    You should have received a copy of the GNU Lesser General Public License
++!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
++!
++!    ELPA reflects a substantial effort on the part of the original
++!    ELPA consortium, and we ask you to respect the spirit of the
++!    license that we chose: i.e., please contribute any changes you
++!    may have back to the original ELPA library distribution, and keep
++!    any derivatives of ELPA under the same license that we chose for
++!    the original distribution, the GNU Lesser General Public License.
++!
++! Author: A. Marek, MPCDF
++
++#include "config-f90.h"
++
++#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT
++#define TEST_INT_TYPE integer(kind=c_int64_t)
++#define INT_TYPE c_int64_t
++#define TEST_C_INT_TYPE_PTR long int*
++#define TEST_C_INT_TYPE long int
++#else
++#define TEST_INT_TYPE integer(kind=c_int32_t)
++#define INT_TYPE c_int32_t
++#define TEST_C_INT_TYPE_PTR int*
++#define TEST_C_INT_TYPE int
++#endif
++#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT
++#define TEST_INT_MPI_TYPE integer(kind=c_int64_t)
++#define INT_MPI_TYPE c_int64_t
++#define TEST_C_INT_MPI_TYPE_PTR long int*
++#define TEST_C_INT_MPI_TYPE long int
++#else
++#define TEST_INT_MPI_TYPE integer(kind=c_int32_t)
++#define INT_MPI_TYPE c_int32_t
++#define TEST_C_INT_MPI_TYPE_PTR int*
++#define TEST_C_INT_MPI_TYPE int
++#endif
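++
++! These macros let the same template be compiled against 64-bit integer
++! BLAS/ScaLAPACK and MPI interfaces as well as the default 32-bit ones.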
++
++
++    subroutine prepare_matrix_random_&
++    &MATH_DATATYPE&
++    &_&
++    &PRECISION&
++    & (na, myid, sc_desc, a, z, as, is_skewsymmetric)
++
++
++      !use test_util
++      use tests_scalapack_interfaces
++
++      implicit none
++#include "./test_precision_kinds.F90"
++      TEST_INT_TYPE, intent(in)                 :: myid, na, sc_desc(:)
++      MATH_DATATYPE(kind=rck), intent(inout)    :: z(:,:), a(:,:), as(:,:)
++
++#if COMPLEXCASE == 1
++      real(kind=rk)                             :: xr(size(a,dim=1), size(a,dim=2))
++#endif /* COMPLEXCASE */
++
++      integer(kind=c_int), allocatable          :: iseed(:)
++      integer(kind=c_int)                       ::  n
++      integer(kind=c_int), intent(in), optional :: is_skewsymmetric
++      logical                                   :: skewsymmetric
++
++      if (present(is_skewsymmetric)) then
++        if (is_skewsymmetric .eq. 1) then
++          skewsymmetric = .true.
++        else
++          skewsymmetric = .false.
++        endif      
++      else
++        skewsymmetric = .false.
++      endif
++
++      ! to get a Hermitian test matrix A we generate a random matrix Z
++      ! and calculate A = Z + Z**H
++      ! (in the skew-symmetric case A = Z - Z**H)
++
++      ! we want different random numbers on every process
++      ! (otherwise A might become rank deficient):
++
++      call random_seed(size=n)
++      allocate(iseed(n))
++      iseed(:) = myid
++      call random_seed(put=iseed)
++#if REALCASE == 1
++      call random_number(z)
++
++      a(:,:) = z(:,:)
++#endif /* REALCASE */
++
++#if COMPLEXCASE == 1
++      call random_number(xr)
++
++      z(:,:) = xr(:,:)
++      call RANDOM_NUMBER(xr)
++      z(:,:) = z(:,:) + (0.0_rk,1.0_rk)*xr(:,:)
++      a(:,:) = z(:,:)
++#endif /* COMPLEXCASE */
++
++      if (myid == 0) then
++        print '(a)','| Random matrix block has been set up. (only processor 0 confirms this step)'
++      endif
++
++#if REALCASE == 1
++#ifdef WITH_MPI
++      if (skewsymmetric) then
++        call p&
++             &BLAS_CHAR&
++             &tran(int(na,kind=BLAS_KIND), int(na,kind=BLAS_KIND), -ONE, z, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, &
++                   ONE, a, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc) ! A = A - Z**T
++      else
++        call p&
++             &BLAS_CHAR&
++             &tran(int(na,kind=BLAS_KIND), int(na,kind=BLAS_KIND), ONE, z, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, &
++                   ONE, a, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc) ! A = A + Z**T
++      endif
++#else /* WITH_MPI */
++      if (skewsymmetric) then
++        a = a - transpose(z)
++      else
++        a = a + transpose(z)
++      endif
++#endif /* WITH_MPI */
++#endif /* REALCASE */
++
++#if COMPLEXCASE == 1
++#ifdef WITH_MPI
++      if (skewsymmetric) then
++        call p&
++             &BLAS_CHAR&
++             &tranc(int(na,kind=BLAS_KIND), int(na,kind=BLAS_KIND), -ONE, z, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, &
++                    ONE, a, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc) ! A = A - Z**H
++      else
++        call p&
++             &BLAS_CHAR&
++             &tranc(int(na,kind=BLAS_KIND), int(na,kind=BLAS_KIND), ONE, z, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc, &
++                    ONE, a, 1_BLAS_KIND, 1_BLAS_KIND, sc_desc) ! A = A + Z**H
++      endif
++#else /* WITH_MPI */
++      if (skewsymmetric) then
++        a = a - transpose(conjg(z))
++      else
++        a = a + transpose(conjg(z))
++      endif
++#endif /* WITH_MPI */
++#endif /* COMPLEXCASE */
++
++
++      if (myid == 0) then
++        print '(a)','| Random matrix block has been symmetrized'
++      endif
++
++      ! save original matrix A for later accuracy checks
++
++      as = a
++
++      deallocate(iseed)
++
++    end subroutine
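++
++    ! Illustrative serial sketch (not part of the harness) of the
++    ! construction above, for a plain symmetric double-precision matrix
++    ! without ScaLAPACK or the template macros:
++    !
++    !   real(kind=c_double) :: z(n,n), a(n,n)
++    !   call random_number(z)
++    !   a = z + transpose(z)           ! symmetric/Hermitian test matrix
++    !   ! skew-symmetric variant: a = z - transpose(z)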
++
++#if REALCASE == 1
++#ifdef DOUBLE_PRECISION_REAL
++    !c> void prepare_matrix_random_real_double_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE myid, TEST_C_INT_TYPE na_rows, 
++    !c>                                          TEST_C_INT_TYPE na_cols, TEST_C_INT_TYPE sc_desc[9],
++    !c>                                          double *a, double *z, double *as);
++#else
++    !c> void prepare_matrix_random_real_single_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE myid, TEST_C_INT_TYPE na_rows, 
++    !c>                                          TEST_C_INT_TYPE na_cols, TEST_C_INT_TYPE sc_desc[9],
++    !c>                                          float *a, float *z, float *as);
++#endif
++#endif /* REALCASE */
++
++#if COMPLEXCASE == 1
++#ifdef DOUBLE_PRECISION_COMPLEX
++    !c> void prepare_matrix_random_complex_double_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE myid, TEST_C_INT_TYPE na_rows, 
++    !c>                                             TEST_C_INT_TYPE na_cols, TEST_C_INT_TYPE sc_desc[9],
++    !c>                                             complex double *a, complex double *z, complex double *as);
++#else
++    !c> void prepare_matrix_random_complex_single_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE myid, TEST_C_INT_TYPE na_rows, 
++    !c>                                             TEST_C_INT_TYPE na_cols, TEST_C_INT_TYPE sc_desc[9],
++    !c>                                             complex float *a, complex float *z, complex float *as);
++#endif
++#endif /* COMPLEXCASE */
++
++subroutine prepare_matrix_random_&
++&MATH_DATATYPE&
++&_wrapper_&
++&PRECISION&
++& (na, myid, na_rows, na_cols, sc_desc, a, z, as) &
++   bind(C, name="prepare_matrix_random_&
++   &MATH_DATATYPE&
++   &_&
++   &PRECISION&
++   &_f")
++      use iso_c_binding
++
++      implicit none
++#include "./test_precision_kinds.F90"
++
++      TEST_INT_TYPE , value   :: myid, na, na_rows, na_cols
++      TEST_INT_TYPE           :: sc_desc(1:9)
++      MATH_DATATYPE(kind=rck) :: z(1:na_rows,1:na_cols), a(1:na_rows,1:na_cols),  &
++                                 as(1:na_rows,1:na_cols)
++      call prepare_matrix_random_&
++      &MATH_DATATYPE&
++      &_&
++      &PRECISION&
++      & (na, myid, sc_desc, a, z, as)
++    end subroutine
++
++!----------------------------------------------------------------------------------------------------------------
++
++    subroutine prepare_matrix_random_spd_&
++    &MATH_DATATYPE&
++    &_&
++    &PRECISION&
++    & (na, myid, sc_desc, a, z, as, nblk, np_rows, np_cols, my_prow, my_pcol)
++
++      !use test_util
++      use precision_for_tests
++      implicit none
++#include "./test_precision_kinds.F90"
++      TEST_INT_TYPE, intent(in)              :: myid, na, sc_desc(:)
++      MATH_DATATYPE(kind=rck), intent(inout) :: z(:,:), a(:,:), as(:,:)
++      TEST_INT_TYPE, intent(in)              ::  nblk, np_rows, np_cols, my_prow, my_pcol
++
++      TEST_INT_TYPE                          :: ii
++      integer(kind=c_int)                    :: rowLocal, colLocal
++
++
++      call prepare_matrix_random_&
++        &MATH_DATATYPE&
++        &_&
++        &PRECISION&
++        & (na, myid, sc_desc, a, z, as)
++
++      ! Hermitian diagonally dominant matrix => positive definite
++      do ii=1, na
++        if (map_global_array_index_to_local_index(int(ii,kind=c_int), int(ii,kind=c_int), &
++                                                  rowLocal, colLocal, &
++                                                  int(nblk,kind=c_int), int(np_rows,kind=c_int),      &
++                                                  int(np_cols,kind=c_int), int(my_prow,kind=c_int),  &
++                                                  int(my_pcol,kind=c_int) )) then
++          a(int(rowLocal,kind=INT_TYPE),int(colLocal,kind=INT_TYPE)) = &
++                  real(a(int(rowLocal,kind=INT_TYPE), int(colLocal,kind=INT_TYPE))) + na + 1
++        end if
++      end do
++
++      as = a
++
++   end subroutine
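++
++   ! The diagonal shift above (adding na+1 to every global diagonal
++   ! element) is what is intended to make the random Hermitian matrix
++   ! positive definite, hence the "_spd" name.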
++
++#if REALCASE == 1
++#ifdef DOUBLE_PRECISION_REAL
++    !c> void prepare_matrix_random_spd_real_double_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE myid, TEST_C_INT_TYPE na_rows, 
++    !c>                                              TEST_C_INT_TYPE na_cols, TEST_C_INT_TYPE sc_desc[9],
++    !c>                                              double *a, double *z, double *as,
++    !c>                                              TEST_C_INT_TYPE nblk, TEST_C_INT_TYPE np_rows, TEST_C_INT_TYPE np_cols, 
++    !c>                                              TEST_C_INT_TYPE my_prow, TEST_C_INT_TYPE my_pcol);
++#else
++    !c> void prepare_matrix_random_spd_real_single_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE myid, TEST_C_INT_TYPE na_rows, 
++    !c>                                              TEST_C_INT_TYPE na_cols, TEST_C_INT_TYPE sc_desc[9],
++    !c>                                              float *a, float *z, float *as,
++    !c>                                              TEST_C_INT_TYPE nblk, TEST_C_INT_TYPE np_rows, TEST_C_INT_TYPE np_cols,
++    !c>                                              TEST_C_INT_TYPE my_prow, TEST_C_INT_TYPE my_pcol);
++#endif
++#endif /* REALCASE */
++
++#if COMPLEXCASE == 1
++#ifdef DOUBLE_PRECISION_COMPLEX
++    !c> void prepare_matrix_random_spd_complex_double_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE myid, TEST_C_INT_TYPE na_rows, 
++    !c>                                                 TEST_C_INT_TYPE na_cols, TEST_C_INT_TYPE sc_desc[9],
++    !c>                                                 complex double *a, complex double *z, complex double *as,
++    !c>                                                 TEST_C_INT_TYPE nblk, TEST_C_INT_TYPE np_rows, 
++    !c>                                                 TEST_C_INT_TYPE np_cols, TEST_C_INT_TYPE my_prow, TEST_C_INT_TYPE my_pcol);
++#else
++    !c> void prepare_matrix_random_spd_complex_single_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE myid, TEST_C_INT_TYPE na_rows,
++    !c>                                                 TEST_C_INT_TYPE na_cols, TEST_C_INT_TYPE sc_desc[9],
++    !c>                                                 complex float *a, complex float *z, complex float *as,
++    !c>                                                 TEST_C_INT_TYPE nblk, TEST_C_INT_TYPE np_rows, 
++    !c>                                                 TEST_C_INT_TYPE np_cols, TEST_C_INT_TYPE my_prow, TEST_C_INT_TYPE my_pcol);
++#endif
++#endif /* COMPLEXCASE */
++
++subroutine prepare_matrix_random_spd_&
++&MATH_DATATYPE&
++&_wrapper_&
++&PRECISION&
++& (na, myid, na_rows, na_cols, sc_desc, a, z, as, nblk, np_rows, np_cols, my_prow, my_pcol) &
++   bind(C, name="prepare_matrix_random_spd_&
++   &MATH_DATATYPE&
++   &_&
++   &PRECISION&
++   &_f")
++      use iso_c_binding
++
++      implicit none
++#include "./test_precision_kinds.F90"
++
++      TEST_INT_TYPE , value   :: myid, na, na_rows, na_cols
++      TEST_INT_TYPE           :: sc_desc(1:9)
++      MATH_DATATYPE(kind=rck) :: z(1:na_rows,1:na_cols), a(1:na_rows,1:na_cols),  &
++                                 as(1:na_rows,1:na_cols)
++      TEST_INT_TYPE , value   :: nblk, np_rows, np_cols, my_prow, my_pcol
++      call prepare_matrix_random_spd_&
++      &MATH_DATATYPE&
++      &_&
++      &PRECISION&
++      & (na, myid, sc_desc, a, z, as, nblk, np_rows, np_cols, my_prow, my_pcol)
++    end subroutine
++
++
++!----------------------------------------------------------------------------------------------------------------
++
++   subroutine prepare_matrix_toeplitz_&
++   &MATH_DATATYPE&
++   &_&
++   &PRECISION&
++   & (na, diagonalElement, subdiagonalElement, d, sd, ds, sds, a, as, &
++      nblk, np_rows, np_cols, my_prow, my_pcol)
++     !use test_util
++     use precision_for_tests
++     implicit none
++#include "./test_precision_kinds.F90"
++
++     TEST_INT_TYPE, intent(in) :: na, nblk, np_rows, np_cols, my_prow, my_pcol
++     MATH_DATATYPE(kind=rck)   :: diagonalElement, subdiagonalElement
++     MATH_DATATYPE(kind=rck)   :: d(:), sd(:), ds(:), sds(:)
++     MATH_DATATYPE(kind=rck)   :: a(:,:), as(:,:)
++
++     TEST_INT_TYPE             :: ii
++     integer(kind=c_int)       :: rowLocal, colLocal
++
++     d(:) = diagonalElement
++     sd(:) = subdiagonalElement
++     a(:,:) = ZERO
++
++     ! set up the diagonal and subdiagonals (for general solver test)
++     do ii=1, na ! for diagonal elements
++       if (map_global_array_index_to_local_index(int(ii,kind=c_int), int(ii,kind=c_int), rowLocal, &
++                                                 colLocal, int(nblk,kind=c_int), int(np_rows,kind=c_int), &
++                                                 int(np_cols,kind=c_int), int(my_prow,kind=c_int), &
++                                                 int(my_pcol,kind=c_int) ) ) then
++         a(int(rowLocal,kind=INT_TYPE),int(colLocal,kind=INT_TYPE)) = diagonalElement
++       endif
++     enddo
++     do ii=1, na-1
++       if (map_global_array_index_to_local_index(int(ii,kind=c_int), int(ii+1,kind=c_int), rowLocal, &
++                                                 colLocal, int(nblk,kind=c_int), int(np_rows,kind=c_int), &
++                                                 int(np_cols,kind=c_int), int(my_prow,kind=c_int), &
++                                                 int(my_pcol,kind=c_int) ) ) then
++         a(int(rowLocal,kind=INT_TYPE),int(colLocal,kind=INT_TYPE)) = subdiagonalElement
++       endif
++     enddo
++
++     do ii=2, na
++       if (map_global_array_index_to_local_index(int(ii,kind=c_int), int(ii-1,kind=c_int), rowLocal, &
++                                                 colLocal, int(nblk,kind=c_int), int(np_rows,kind=c_int), &
++                                                 int(np_cols,kind=c_int), int(my_prow,kind=c_int), &
++                                                 int(my_pcol,kind=c_int) ) ) then
++         a(int(rowLocal,kind=INT_TYPE),int(colLocal,kind=INT_TYPE)) = subdiagonalElement
++       endif
++     enddo
++
++     ds = d
++     sds = sd
++     as = a
++   end subroutine
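++
++   ! The routine above sets up a symmetric tridiagonal Toeplitz matrix:
++   ! diagonalElement on the diagonal, subdiagonalElement on the first
++   ! sub- and superdiagonal; ds, sds and as keep copies for the later
++   ! correctness checks.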
++
++   subroutine prepare_matrix_toeplitz_mixed_complex&
++   &_&
++   &MATH_DATATYPE&
++   &_&
++   &PRECISION&
++#if COMPLEXCASE == 1
++   & (na, diagonalElement, subdiagonalElement, d, sd, ds, sds, a, as, &
++      nblk, np_rows, np_cols, my_prow, my_pcol)
++#endif
++#if REALCASE == 1
++   & (na, diagonalElement, subdiagonalElement, d, sd, ds, sds, &
++      nblk, np_rows, np_cols, my_prow, my_pcol)
++#endif
++     !use test_util
++     implicit none
++
++     TEST_INT_TYPE, intent(in)     :: na, nblk, np_rows, np_cols, my_prow, my_pcol
++     real(kind=C_DATATYPE_KIND)    :: diagonalElement, subdiagonalElement
++
++     real(kind=C_DATATYPE_KIND)    :: d(:), sd(:), ds(:), sds(:)
++
++#if COMPLEXCASE == 1
++     complex(kind=C_DATATYPE_KIND) :: a(:,:), as(:,:)
++#endif
++#if REALCASE == 1
++#endif
++
++     TEST_INT_TYPE                 :: ii
++     integer(kind=c_int)           :: rowLocal, colLocal
++#if COMPLEXCASE == 1
++     d(:) = diagonalElement
++     sd(:) = subdiagonalElement
++
++     ! set up the diagonal and subdiagonals (for general solver test)
++     do ii=1, na ! for diagonal elements
++       if (map_global_array_index_to_local_index(int(ii,kind=c_int), int(ii,kind=c_int), rowLocal, &
++                                                 colLocal, int(nblk,kind=c_int),                   &
++                                                 int(np_rows,kind=c_int), int(np_cols,kind=c_int),                 &
++                                                 int(my_prow,kind=c_int), int(my_pcol,kind=c_int) )) then
++         a(int(rowLocal,kind=INT_TYPE),int(colLocal,kind=INT_TYPE)) = diagonalElement
++       endif
++     enddo
++     do ii=1, na-1
++       if (map_global_array_index_to_local_index(int(ii,kind=c_int), int(ii+1,kind=c_int), rowLocal, &
++                                                 colLocal, int(nblk,kind=c_int),                   &
++                                                 int(np_rows,kind=c_int), int(np_cols,kind=c_int),                 &
++                                                 int(my_prow,kind=c_int), int(my_pcol,kind=c_int) )) then
++         a(int(rowLocal,kind=INT_TYPE),int(colLocal,kind=INT_TYPE)) = subdiagonalElement
++       endif
++     enddo
++
++     do ii=2, na
++       if (map_global_array_index_to_local_index(int(ii,kind=c_int), int(ii-1,kind=c_int), rowLocal, &
++                                                 colLocal, int(nblk,kind=c_int),                   &
++                                                 int(np_rows,kind=c_int), int(np_cols,kind=c_int),                 &
++                                                 int(my_prow,kind=c_int), int(my_pcol,kind=c_int) )) then
++         a(int(rowLocal,kind=INT_TYPE),int(colLocal,kind=INT_TYPE)) = subdiagonalElement
++       endif
++     enddo
++
++     ds = d
++     sds = sd
++     as = a
++#endif
++   end subroutine
++
++   subroutine prepare_matrix_frank_&
++   &MATH_DATATYPE&
++   &_&
++   &PRECISION&
++   & (na, a, z, as, nblk, np_rows, np_cols, my_prow, my_pcol)
++     !use test_util
++     use precision_for_tests
++     implicit none
++
++     TEST_INT_TYPE, intent(in)     :: na, nblk, np_rows, np_cols, my_prow, my_pcol
++
++#if REALCASE == 1
++     real(kind=C_DATATYPE_KIND)    :: a(:,:), z(:,:), as(:,:)
++#endif
++#if COMPLEXCASE == 1
++     complex(kind=C_DATATYPE_KIND) :: a(:,:), z(:,:), as(:,:)
++#endif
++
++     TEST_INT_TYPE                 :: i, j
++     integer(kind=c_int)           :: rowLocal, colLocal
++
++     do i = 1, na
++       do j = 1, na
++         if (map_global_array_index_to_local_index(int(i,kind=c_int), int(j,kind=c_int), rowLocal, &
++                                                 colLocal, int(nblk,kind=c_int),                   &
++                                                 int(np_rows,kind=c_int), int(np_cols,kind=c_int),                 &
++                                                 int(my_prow,kind=c_int), int(my_pcol,kind=c_int) )) then
++           if (j .le. i) then
++             a(int(rowLocal,kind=INT_TYPE),int(colLocal,kind=INT_TYPE)) = &
++                     real((na+1-i), kind=C_DATATYPE_KIND) / real(na, kind=C_DATATYPE_KIND)
++           else
++             a(int(rowLocal,kind=INT_TYPE),int(colLocal,kind=INT_TYPE)) = &
++                     real((na+1-j), kind=C_DATATYPE_KIND) / real(na, kind=C_DATATYPE_KIND)
++           endif
++         endif
++       enddo
++     enddo
++
++     z(:,:)  = a(:,:)
++     as(:,:) = a(:,:)
++
++   end subroutine
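++
++   ! The loops above fill the (scaled) Frank-type matrix
++   !   a(i,j) = (na + 1 - max(i,j)) / na
++   ! in the block-cyclic layout; z and as receive copies of a for the
++   ! later accuracy checks.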
++
++
++! vim: syntax=fortran
+diff -ruN elpa-new_release_2021.11.001/examples/shared/test_read_input_parameters.F90 elpa-new_release_2021.11.001_ok/examples/shared/test_read_input_parameters.F90
+--- elpa-new_release_2021.11.001/examples/shared/test_read_input_parameters.F90	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/shared/test_read_input_parameters.F90	2022-01-26 10:10:58.376155000 +0100
+@@ -0,0 +1,455 @@
++!    This file is part of ELPA.
++!
++!    The ELPA library was originally created by the ELPA consortium,
++!    consisting of the following organizations:
++!
++!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
++!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
++!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
++!      Informatik,
++!    - Technische Universität München, Lehrstuhl für Informatik mit
++!      Schwerpunkt Wissenschaftliches Rechnen,
++!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
++!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
++!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
++!      and
++!    - IBM Deutschland GmbH
++!
++!
++!    More information can be found here:
++!    http://elpa.mpcdf.mpg.de/
++!
++!    ELPA is free software: you can redistribute it and/or modify
++!    it under the terms of the version 3 of the license of the
++!    GNU Lesser General Public License as published by the Free
++!    Software Foundation.
++!
++!    ELPA is distributed in the hope that it will be useful,
++!    but WITHOUT ANY WARRANTY; without even the implied warranty of
++!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++!    GNU Lesser General Public License for more details.
++!
++!    You should have received a copy of the GNU Lesser General Public License
++!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
++!
++!    ELPA reflects a substantial effort on the part of the original
++!    ELPA consortium, and we ask you to respect the spirit of the
++!    license that we chose: i.e., please contribute any changes you
++!    may have back to the original ELPA library distribution, and keep
++!    any derivatives of ELPA under the same license that we chose for
++!    the original distribution, the GNU Lesser General Public License.
++!
++!
++#include "config-f90.h"
++
++
++#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT
++#define TEST_INT_TYPE integer(kind=c_int64_t)
++#define INT_TYPE c_int64_t
++#else
++#define TEST_INT_TYPE integer(kind=c_int32_t)
++#define INT_TYPE c_int32_t
++#endif
++
++module test_read_input_parameters
++  use elpa, only : ELPA_2STAGE_COMPLEX_DEFAULT, ELPA_2STAGE_REAL_DEFAULT, elpa_int_string_to_value, &
++                   elpa_int_value_to_string, ELPA_OK
++  use elpa_utilities, only : error_unit
++  use iso_c_binding
++  use test_util, only : x_ao, x_a
++  use test_output_type
++
++  implicit none
++
++  type input_options_t
++    TEST_INT_TYPE        :: datatype
++    TEST_INT_TYPE        :: na, nev, nblk
++    type(output_t) :: write_to_file
++    TEST_INT_TYPE        :: this_real_kernel, this_complex_kernel
++    logical        :: realKernelIsSet, complexKernelIsSet
++    TEST_INT_TYPE        :: useQrIsSet, useGPUIsSet
++    logical        :: doSolveTridi, do1stage, do2stage, justHelpMessage, &
++                      doCholesky, doInvertTrm, doTransposeMultiply
++  end type
++
++  interface read_input_parameters
++    module procedure read_input_parameters_general
++    module procedure read_input_parameters_traditional
++    module procedure read_input_parameters_traditional_noskip
++  end interface
++
++  contains
++
++    subroutine parse_arguments(command_line_argument, input_options)
++      implicit none
++
++      type(input_options_t)  :: input_options
++      character(len=128)     :: command_line_argument
++      integer(kind=c_int)    :: elpa_error
++
++      if (command_line_argument == "--help") then
++        print *,"usage: elpa_tests [--help] [datatype={real|complex}] [na=number] [nev=number] "
++        print *,"                  [nblk=size of block cyclic distribution] [--output_eigenvalues]"
++        print *,"                  [--output_eigenvectors] [--real-kernel=name_of_kernel]"
++        print *,"                  [--complex-kernel=name_of_kernel] [--use-gpu={0|1}]"
++        print *,"                  [--use-qr={0,1}] [--tests={all|solve-tridi|1stage|2stage|cholesky&
++            &|invert-triangular|transpose-mulitply}]"
++        input_options%justHelpMessage=.true.
++        return
++      endif
++
++
++      if (command_line_argument(1:11) == "--datatype=") then
++        if (command_line_argument(12:15) == "real") then
++          input_options%datatype=1
++        else
++          if (command_line_argument(12:18) == "complex") then
++            input_options%datatype=2
++          else
++            print *,"datatype unknown! use either --datatype=real or --datatpye=complex"
++            stop 1
++          endif
++        endif
++      endif
++
++      if (command_line_argument(1:3) == "na=") then
++        read(command_line_argument(4:), *) input_options%na
++      endif
++      if (command_line_argument(1:4) == "nev=") then
++        read(command_line_argument(5:), *) input_options%nev
++      endif
++      if (command_line_argument(1:5) == "nblk=") then
++        read(command_line_argument(6:), *) input_options%nblk
++      endif
++
++      if (command_line_argument(1:21)   == "--output_eigenvectors") then
++        input_options%write_to_file%eigenvectors = .true.
++      endif
++
++      if (command_line_argument(1:20)   == "--output_eigenvalues") then
++        input_options%write_to_file%eigenvalues = .true.
++      endif
++
++      if (command_line_argument(1:14) == "--real-kernel=") then
++        input_options%this_real_kernel = int(elpa_int_string_to_value("real_kernel",     &
++                                             command_line_argument(15:), elpa_error), &
++                                             kind=INT_TYPE)
++        if (elpa_error /= ELPA_OK) then
++          print *, "Invalid argument for --real-kernel"
++          stop 1
++        endif
++        print *,"Setting ELPA2 real kernel to ", elpa_int_value_to_string("real_kernel", &
++                                                                        int(input_options%this_real_kernel,kind=c_int))
++        input_options%realKernelIsSet = .true.
++      endif
++
++      if (command_line_argument(1:17) == "--complex-kernel=") then
++        input_options%this_complex_kernel = int(elpa_int_string_to_value("complex_kernel",    &
++                                                command_line_argument(18:), elpa_error), kind=INT_TYPE)
++        if (elpa_error /= ELPA_OK) then
++          print *, "Invalid argument for --complex-kernel"
++          stop 1
++        endif
++        print *,"Setting ELPA2 complex kernel to ", elpa_int_value_to_string("complex_kernel", &
++                                                                          int(input_options%this_complex_kernel,kind=c_int))
++        input_options%complexKernelIsSet = .true.
++      endif
++
++      if (command_line_argument(1:9) == "--use-qr=") then
++        read(command_line_argument(10:), *) input_options%useQrIsSet
++      endif
++
++      if (command_line_argument(1:10) == "--use-gpu=") then
++        read(command_line_argument(11:), *) input_options%useGPUIsSet
++      endif
++
++      if (command_line_argument(1:8) == "--tests=") then
++        if (command_line_argument(9:11) == "all") then
++          input_options%doSolveTridi=.true.
++          input_options%do1stage=.true.
++          input_options%do2stage=.true.
++          input_options%doCholesky=.true.
++          input_options%doInvertTrm=.true.
++          input_options%doTransposeMultiply=.true.
++        else if (command_line_argument(9:19) == "solve-tridi") then
++          input_options%doSolveTridi=.true.
++          input_options%do1stage=.false.
++          input_options%do2stage=.false.
++          input_options%doCholesky=.false.
++          input_options%doInvertTrm=.false.
++          input_options%doTransposeMultiply=.false.
++        else if (command_line_argument(9:14) == "1stage") then
++          input_options%doSolveTridi=.false.
++          input_options%do1stage=.true.
++          input_options%do2stage=.false.
++          input_options%doCholesky=.false.
++          input_options%doInvertTrm=.false.
++          input_options%doTransposeMultiply=.false.
++        else if (command_line_argument(9:14) == "2stage") then
++          input_options%doSolveTridi=.false.
++          input_options%do1stage=.false.
++          input_options%do2stage=.true.
++          input_options%doCholesky=.false.
++          input_options%doInvertTrm=.false.
++          input_options%doTransposeMultiply=.false.
++        else if (command_line_argument(9:16) == "cholesky") then
++          input_options%doSolveTridi=.false.
++          input_options%do1stage=.false.
++          input_options%do2stage=.false.
++          input_options%doCholesky=.true.
++          input_options%doInvertTrm=.false.
++          input_options%doTransposeMultiply=.false.
++        else if (command_line_argument(9:25) == "invert-triangular") then
++          input_options%doSolveTridi=.false.
++          input_options%do1stage=.false.
++          input_options%do2stage=.false.
++          input_options%doCholesky=.false.
++          input_options%doInvertTrm=.true.
++          input_options%doTransposeMultiply=.false.
++        else if (command_line_argument(9:26) == "transpose-multiply") then
++          input_options%doSolveTridi=.false.
++          input_options%do1stage=.false.
++          input_options%do2stage=.false.
++          input_options%doCholesky=.false.
++          input_options%doInvertTrm=.false.
++          input_options%doTransposeMultiply=.true.
++        else
++           print *,"unknown test specified"
++           stop 1
++        endif
++      endif
++
++    end subroutine
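++
++    ! Example invocation (hypothetical sizes) combining the options
++    ! parsed above:
++    !   srun -n 4 ./elpa_tests na=1000 nev=200 nblk=32 --tests=2stage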
++
++    subroutine read_input_parameters_general(input_options)
++      use precision_for_tests
++      implicit none
++
++      type(input_options_t)         :: input_options
++
++      ! Command line arguments
++      character(len=128)            :: arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10
++
++      ! default parameters
++      input_options%datatype = 1
++      input_options%na = 500
++      input_options%nev = 150
++      input_options%nblk = 16
++
++      input_options%write_to_file%eigenvectors = .false.
++      input_options%write_to_file%eigenvalues  = .false.
++
++      input_options%this_real_kernel = ELPA_2STAGE_REAL_DEFAULT
++      input_options%this_complex_kernel = ELPA_2STAGE_COMPLEX_DEFAULT
++      input_options%realKernelIsSet = .false.
++      input_options%complexKernelIsSet = .false.
++
++      input_options%useQrIsSet = 0
++
++      input_options%useGPUIsSet = 0
++
++      input_options%do1Stage = .true.
++      input_options%do2Stage = .true.
++      input_options%doSolveTridi = .true.
++      input_options%doCholesky=.true.
++      input_options%doInvertTrm=.true.
++      input_options%doTransposeMultiply=.true.
++      input_options%justHelpMessage=.false.
++
++      ! test na=1500 nev=50 nblk=16 --help --kernel --output_eigenvectors --output_eigenvalues
++      if (COMMAND_ARGUMENT_COUNT() .gt. 10) then
++        write(error_unit, '(a,i0,a)') "Invalid number (", COMMAND_ARGUMENT_COUNT(), ") of command line arguments!"
++        stop 1
++      endif
++
++      if (COMMAND_ARGUMENT_COUNT() .gt. 0) then
++
++        call get_COMMAND_ARGUMENT(1, arg1)
++
++        call parse_arguments(arg1, input_options)
++
++
++
++        if (COMMAND_ARGUMENT_COUNT() .ge. 2) then
++          ! argument 2
++          call get_COMMAND_ARGUMENT(2, arg2)
++
++          call parse_arguments(arg2, input_options)
++        endif
++
++        ! argument 3
++        if (COMMAND_ARGUMENT_COUNT() .ge. 3) then
++
++          call get_COMMAND_ARGUMENT(3, arg3)
++
++          call parse_arguments(arg3, input_options)
++        endif
++
++        ! argument 4
++        if (COMMAND_ARGUMENT_COUNT() .ge. 4) then
++
++          call get_COMMAND_ARGUMENT(4, arg4)
++
++          call parse_arguments(arg4, input_options)
++
++        endif
++
++        ! argument 5
++        if (COMMAND_ARGUMENT_COUNT() .ge. 5) then
++
++          call get_COMMAND_ARGUMENT(5, arg5)
++
++          call parse_arguments(arg5, input_options)
++        endif
++
++        ! argument 6
++        if (COMMAND_ARGUMENT_COUNT() .ge. 6) then
++
++          call get_COMMAND_ARGUMENT(6, arg6)
++
++          call parse_arguments(arg6, input_options)
++        endif
++
++        ! argument 7
++        if (COMMAND_ARGUMENT_COUNT() .ge. 7) then
++
++          call get_COMMAND_ARGUMENT(7, arg7)
++
++          call parse_arguments(arg7, input_options)
++
++        endif
++
++        ! argument 8
++        if (COMMAND_ARGUMENT_COUNT() .ge. 8) then
++
++          call get_COMMAND_ARGUMENT(8, arg8)
++
++          call parse_arguments(arg8, input_options)
++
++        endif
++
++        ! argument 9
++        if (COMMAND_ARGUMENT_COUNT() .ge. 9) then
++
++          call get_COMMAND_ARGUMENT(9, arg9)
++
++          call parse_arguments(arg9, input_options)
++
++        endif
++
++        ! argument 10
++        if (COMMAND_ARGUMENT_COUNT() .ge. 10) then
++
++          call get_COMMAND_ARGUMENT(10, arg10)
++
++          call parse_arguments(arg10, input_options)
++
++        endif
++
++      endif
++
++      if (input_options%useQrIsSet .eq. 1 .and. input_options%datatype .eq. 2) then
++        print *,"You cannot use QR-decomposition in complex case"
++        stop 1
++      endif
++
++    end subroutine
++
++    subroutine read_input_parameters_traditional_noskip(na, nev, nblk, write_to_file)
++      use precision_for_tests
++      implicit none
++
++      TEST_INT_TYPE, intent(out) :: na, nev, nblk
++
++      type(output_t), intent(out)   :: write_to_file
++      logical                       :: skip_check_correctness
++
++      call read_input_parameters_traditional(na, nev, nblk, write_to_file, skip_check_correctness)
++    end subroutine
++
++    subroutine read_input_parameters_traditional(na, nev, nblk, write_to_file, skip_check_correctness)
++      use precision_for_tests
++      implicit none
++
++      TEST_INT_TYPE, intent(out) :: na, nev, nblk
++
++      type(output_t), intent(out)   :: write_to_file
++      logical, intent(out)          :: skip_check_correctness
++
++      ! Command line arguments
++      character(len=128)            :: arg1, arg2, arg3, arg4, arg5
++
++      ! default parameters
++      na = 5000
++      nev = 150
++      nblk = 16
++      write_to_file%eigenvectors = .false.
++      write_to_file%eigenvalues  = .false.
++      skip_check_correctness = .false.
++
++      if (.not. any(COMMAND_ARGUMENT_COUNT() == [0, 3, 4, 5])) then
++        write(error_unit, '(a,i0,a)') "Invalid number (", COMMAND_ARGUMENT_COUNT(), ") of command line arguments!"
++        write(error_unit, *) "Expected: program [ [matrix_size num_eigenvalues block_size] &
++            ""output_eigenvalues"" ""output_eigenvectors""]"
++        stop 1
++      endif
++
++      if (COMMAND_ARGUMENT_COUNT() == 3) then
++        call GET_COMMAND_ARGUMENT(1, arg1)
++        call GET_COMMAND_ARGUMENT(2, arg2)
++        call GET_COMMAND_ARGUMENT(3, arg3)
++
++        read(arg1, *) na
++        read(arg2, *) nev
++        read(arg3, *) nblk
++      endif
++
++      if (COMMAND_ARGUMENT_COUNT() == 4) then
++        call GET_COMMAND_ARGUMENT(1, arg1)
++        call GET_COMMAND_ARGUMENT(2, arg2)
++        call GET_COMMAND_ARGUMENT(3, arg3)
++        call GET_COMMAND_ARGUMENT(4, arg4)
++        read(arg1, *) na
++        read(arg2, *) nev
++        read(arg3, *) nblk
++
++        if (arg4 .eq. "output_eigenvalues") then
++          write_to_file%eigenvalues = .true.
++        elseif (arg4 .eq. "skip_check_correctness") then
++          skip_check_correctness = .true.
++        else
++          write(error_unit, *) &
++          "Invalid value for parameter 4.  Must be ""output_eigenvalues"", ""skip_check_correctness"" or omitted"
++          stop 1
++        endif
++
++      endif
++
++      if (COMMAND_ARGUMENT_COUNT() == 5) then
++        call GET_COMMAND_ARGUMENT(1, arg1)
++        call GET_COMMAND_ARGUMENT(2, arg2)
++        call GET_COMMAND_ARGUMENT(3, arg3)
++        call GET_COMMAND_ARGUMENT(4, arg4)
++        call GET_COMMAND_ARGUMENT(5, arg5)
++        read(arg1, *) na
++        read(arg2, *) nev
++        read(arg3, *) nblk
++
++        if (arg4 .eq. "output_eigenvalues") then
++          write_to_file%eigenvalues = .true.
++        else
++          write(error_unit, *) "Invalid value for output flag! Must be ""output_eigenvalues"" or omitted"
++          stop 1
++        endif
++
++        if (arg5 .eq. "output_eigenvectors") then
++          write_to_file%eigenvectors = .true.
++        else
++          write(error_unit, *) "Invalid value for output flag! Must be ""output_eigenvectors"" or omitted"
++          stop 1
++        endif
++
++      endif
++    end subroutine
++
++end module
+diff -ruN elpa-new_release_2021.11.001/examples/shared/test_redir.c elpa-new_release_2021.11.001_ok/examples/shared/test_redir.c
+--- elpa-new_release_2021.11.001/examples/shared/test_redir.c	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/shared/test_redir.c	2022-01-26 10:10:58.378020000 +0100
+@@ -0,0 +1,125 @@
++//    This file is part of ELPA.
++//
++//    The ELPA library was originally created by the ELPA consortium,
++//    consisting of the following organizations:
++//
++//    - Max Planck Computing and Data Facility (MPCDF), formerly known as
++//      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
++//    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
++//      Informatik,
++//    - Technische Universität München, Lehrstuhl für Informatik mit
++//      Schwerpunkt Wissenschaftliches Rechnen,
++//    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
++//    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
++//      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
++//      and
++//    - IBM Deutschland GmbH
++//
++//
++//    More information can be found here:
++//    http://elpa.mpcdf.mpg.de/
++//
++//    ELPA is free software: you can redistribute it and/or modify
++//    it under the terms of the version 3 of the license of the
++//    GNU Lesser General Public License as published by the Free
++//    Software Foundation.
++//
++//    ELPA is distributed in the hope that it will be useful,
++//    but WITHOUT ANY WARRANTY; without even the implied warranty of
++//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++//    GNU Lesser General Public License for more details.
++//
++//    You should have received a copy of the GNU Lesser General Public License
++//    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
++//
++//    ELPA reflects a substantial effort on the part of the original
++//    ELPA consortium, and we ask you to respect the spirit of the
++//    license that we chose: i.e., please contribute any changes you
++//    may have back to the original ELPA library distribution, and keep
++//    any derivatives of ELPA under the same license that we chose for
++//    the original distribution, the GNU Lesser General Public License.
++//
++//
++// --------------------------------------------------------------------------------------------------
++#include <stdio.h>
++#include <fcntl.h>
++#include <stdlib.h>
++#include <unistd.h>
++#include <sys/stat.h>
++#include <sys/types.h>
++#include <unistd.h>
++#include <errno.h>
++
++#define NAME_LENGTH 4096
++#define FILENAME "./mpi_stdout/std%3s_rank%04d.txt"
++
++FILE *tout, *terr;
++void dup_filename(char *filename, int dupfd);
++void dup_fd(int fd, int dupfd);
++
++int _mkdirifnotexists(const char *dir) {
++    struct stat s;
++    if (stat(dir, &s) != 0) {
++        if (errno == ENOENT) {
++            if (mkdir(dir, 0755) != 0) {
++                perror("mkdir");
++                return 0;
++            } else {
++                return 1;
++            }
++        } else {
++            perror("stat()");
++	    return 0;
++        }
++    } else if (!S_ISDIR(s.st_mode)) {
++        fprintf(stderr, "\"%s\" does exist and is not a directory\n", dir);
++        return 0;
++    } else {
++        return 1;
++    }
++}
++
++int create_directories(void) {
++    if (!_mkdirifnotexists("mpi_stdout")) return 0;
++    return 1;
++}
++
++void redirect_stdout(int *myproc) {
++  char buf[NAME_LENGTH];
++
++  if (*myproc == 0) {
++    snprintf(buf, NAME_LENGTH, "tee " FILENAME, "out", *myproc);
++    tout = popen(buf, "w");
++    dup_fd(fileno(tout), 1);
++
++    snprintf(buf, NAME_LENGTH, "tee " FILENAME, "err", *myproc);
++    terr = popen(buf, "w");
++    dup_fd(fileno(terr), 2);
++  } else {
++    snprintf(buf, NAME_LENGTH, FILENAME, "out", *myproc);
++    dup_filename(buf, 1);
++
++    snprintf(buf, NAME_LENGTH, FILENAME, "err", *myproc);
++    dup_filename(buf, 2);
++  }
++
++  return;
++}
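++
++/* In redirect_stdout() above, rank 0 pipes its streams through `tee`, so
++ * output is both shown on the terminal and captured under mpi_stdout/;
++ * every other rank writes only to its per-rank file via dup_filename(). */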
++
++/* Redirect file descriptor dupfd to file filename */
++void dup_filename(char *filename, int dupfd) {
++  int fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644);
++  if(fd < 0) {
++    perror("open()");
++    exit(1);
++  }
++  dup_fd(fd, dupfd);
++}
++
++/* Redirect file descriptor dupfd to file descriptor fd */
++void dup_fd(int fd, int dupfd) {
++  if(dup2(fd,dupfd) < 0) {
++    perror("dup2()");
++    exit(1);
++  }
++}
+diff -ruN elpa-new_release_2021.11.001/examples/shared/test_redirect.F90 elpa-new_release_2021.11.001_ok/examples/shared/test_redirect.F90
+--- elpa-new_release_2021.11.001/examples/shared/test_redirect.F90	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/shared/test_redirect.F90	2022-01-26 10:10:58.379123000 +0100
+@@ -0,0 +1,116 @@
++!    This file is part of ELPA.
++!
++!    The ELPA library was originally created by the ELPA consortium,
++!    consisting of the following organizations:
++!
++!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
++!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
++!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
++!      Informatik,
++!    - Technische Universität München, Lehrstuhl für Informatik mit
++!      Schwerpunkt Wissenschaftliches Rechnen,
++!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
++!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
++!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
++!      and
++!    - IBM Deutschland GmbH
++!
++!
++!    More information can be found here:
++!    http://elpa.mpcdf.mpg.de/
++!
++!    ELPA is free software: you can redistribute it and/or modify
++!    it under the terms of the version 3 of the license of the
++!    GNU Lesser General Public License as published by the Free
++!    Software Foundation.
++!
++!    ELPA is distributed in the hope that it will be useful,
++!    but WITHOUT ANY WARRANTY; without even the implied warranty of
++!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++!    GNU Lesser General Public License for more details.
++!
++!    You should have received a copy of the GNU Lesser General Public License
++!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
++!
++!    ELPA reflects a substantial effort on the part of the original
++!    ELPA consortium, and we ask you to respect the spirit of the
++!    license that we chose: i.e., please contribute any changes you
++!    may have back to the original ELPA library distribution, and keep
++!    any derivatives of ELPA under the same license that we chose for
++!    the original distribution, the GNU Lesser General Public License.
++!
++!
++#include "config-f90.h"
++
++module test_redirect
++  use, intrinsic :: iso_c_binding
++
++  implicit none
++  public
++
++  logical :: use_redirect_stdout
++
++  interface
++    integer(kind=C_INT) function create_directories_c() bind(C, name="create_directories")
++      use, intrinsic :: iso_c_binding
++      implicit none
++    end function
++  end interface
++
++  interface
++    subroutine redirect_stdout_c(myproc) bind(C, name="redirect_stdout")
++      use, intrinsic :: iso_c_binding
++      implicit none
++      integer(kind=C_INT), intent(in) :: myproc
++    end subroutine
++  end interface
++
++  contains
++!>
++!> This function is the Fortran driver for the
++!> C function that creates the directory for
++!> the redirected output
++!>
++!> \param none
++!> \result res integer indicates success or failure
++    function create_directories() result(res)
++      implicit none
++      integer(kind=C_INT) :: res
++      res = int(create_directories_c())
++    end function
++!>
++!> This subroutine is the Fortran driver for the
++!> redirection of stdout and stderr of each MPI
++!> task
++!>
++!> \param myproc MPI task id
++    subroutine redirect_stdout(myproc)
++      use, intrinsic :: iso_c_binding
++      implicit none
++      integer(kind=C_INT), intent(in) :: myproc
++      call redirect_stdout_c(int(myproc, kind=C_INT))
++    end subroutine
++!>
++!> This function checks whether the environment variable
++!> "REDIRECT_ELPA_TEST_OUTPUT" is set to "true".
++!> Returns ".true." if the variable is set, otherwise ".false.".
++!> This function only works if "HAVE_ENVIRONMENT_CHECKING" was
++!> tested successfully during the build process
++!>
++!> \param none
++!> \return logical
++    function check_redirect_environment_variable() result(redirect)
++      implicit none
++      logical            :: redirect
++      character(len=255) :: REDIRECT_VARIABLE
++
++      redirect = .false.
++      REDIRECT_VARIABLE = ""
++
++#if defined(HAVE_ENVIRONMENT_CHECKING)
++      call get_environment_variable("REDIRECT_ELPA_TEST_OUTPUT",REDIRECT_VARIABLE)
++#endif
++      if (trim(REDIRECT_VARIABLE) .eq. "true") redirect = .true.
++
++    end function
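++
++! Typical use (sketch): export REDIRECT_ELPA_TEST_OUTPUT=true before
++! running a test binary; each MPI task's stdout/stderr then ends up in
++! ./mpi_stdout/ (see test_redir.c above).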
++
++end module test_redirect
+diff -ruN elpa-new_release_2021.11.001/examples/shared/test_scalapack.F90 elpa-new_release_2021.11.001_ok/examples/shared/test_scalapack.F90
+--- elpa-new_release_2021.11.001/examples/shared/test_scalapack.F90	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/shared/test_scalapack.F90	2022-01-26 10:10:58.380143000 +0100
+@@ -0,0 +1,111 @@
++! (c) Copyright Pavel Kus, 2017, MPCDF
++!
++!    This file is part of ELPA.
++!
++!    The ELPA library was originally created by the ELPA consortium,
++!    consisting of the following organizations:
++!
++!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
++!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
++!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
++!      Informatik,
++!    - Technische Universität München, Lehrstuhl für Informatik mit
++!      Schwerpunkt Wissenschaftliches Rechnen,
++!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
++!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
++!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
++!      and
++!    - IBM Deutschland GmbH
++!
++!
++!    More information can be found here:
++!    http://elpa.mpcdf.mpg.de/
++!
++!    ELPA is free software: you can redistribute it and/or modify
++!    it under the terms of the version 3 of the license of the
++!    GNU Lesser General Public License as published by the Free
++!    Software Foundation.
++!
++!    ELPA is distributed in the hope that it will be useful,
++!    but WITHOUT ANY WARRANTY; without even the implied warranty of
++!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++!    GNU Lesser General Public License for more details.
++!
++!    You should have received a copy of the GNU Lesser General Public License
++!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
++!
++!    ELPA reflects a substantial effort on the part of the original
++!    ELPA consortium, and we ask you to respect the spirit of the
++!    license that we chose: i.e., please contribute any changes you
++!    may have back to the original ELPA library distribution, and keep
++!    any derivatives of ELPA under the same license that we chose for
++!    the original distribution, the GNU Lesser General Public License.
++
++#include "../Fortran/assert.h"
++#include "config-f90.h"
++
++module test_scalapack
++  use test_util
++
++  interface solve_scalapack_all
++    module procedure solve_pdsyevd
++    module procedure solve_pzheevd
++#ifdef WANT_SINGLE_PRECISION_REAL
++    module procedure solve_pssyevd
++#endif
++#ifdef WANT_SINGLE_PRECISION_COMPLEX
++    module procedure solve_pcheevd
++#endif
++  end interface
++
++  interface solve_scalapack_part
++    module procedure solve_pdsyevr
++    module procedure solve_pzheevr
++#ifdef WANT_SINGLE_PRECISION_REAL
++    module procedure solve_pssyevr
++#endif
++#ifdef WANT_SINGLE_PRECISION_COMPLEX
++    module procedure solve_pcheevr
++#endif
++  end interface
++
++contains
++
++#define COMPLEXCASE 1
++#define DOUBLE_PRECISION 1
++#include "../../src/general/precision_macros.h"
++#include "test_scalapack_template.F90"
++#undef DOUBLE_PRECISION
++#undef COMPLEXCASE
++
++#ifdef WANT_SINGLE_PRECISION_COMPLEX
++
++#define COMPLEXCASE 1
++#define SINGLE_PRECISION 1
++#include "../../src/general/precision_macros.h"
++#include "test_scalapack_template.F90"
++#undef SINGLE_PRECISION
++#undef COMPLEXCASE
++
++#endif /* WANT_SINGLE_PRECISION_COMPLEX */
++
++#define REALCASE 1
++#define DOUBLE_PRECISION 1
++#include "../../src/general/precision_macros.h"
++#include "test_scalapack_template.F90"
++#undef DOUBLE_PRECISION
++#undef REALCASE
++
++#ifdef WANT_SINGLE_PRECISION_REAL
++
++#define REALCASE 1
++#define SINGLE_PRECISION 1
++#include "../../src/general/precision_macros.h"
++#include "test_scalapack_template.F90"
++#undef SINGLE_PRECISION
++#undef REALCASE
++
++#endif /* WANT_SINGLE_PRECISION_REAL */
++
++
++end module
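++
++! Hedged usage sketch (assumption, not part of the upstream sources): with
++! a distributed matrix a, a ScaLAPACK descriptor sc_desc, and arrays ev/z
++! allocated as in the example programs, the generic interfaces above are
++! invoked as
++!
++!   call solve_scalapack_all(na, a, sc_desc, ev, z)        ! all eigenpairs
++!   call solve_scalapack_part(na, a, sc_desc, nev, ev, z)  ! first nev pairs
++!
++! The matching p?syevd/p?heevd or p?syevr/p?heevr routine is selected by
++! the argument types.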
+diff -ruN elpa-new_release_2021.11.001/examples/shared/test_scalapack_template.F90 elpa-new_release_2021.11.001_ok/examples/shared/test_scalapack_template.F90
+--- elpa-new_release_2021.11.001/examples/shared/test_scalapack_template.F90	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/shared/test_scalapack_template.F90	2022-01-26 10:10:58.381125000 +0100
+@@ -0,0 +1,161 @@
++! (c) Copyright Pavel Kus, 2017, MPCDF
++!
++!    This file is part of ELPA.
++!
++!    The ELPA library was originally created by the ELPA consortium,
++!    consisting of the following organizations:
++!
++!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
++!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
++!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
++!      Informatik,
++!    - Technische Universität München, Lehrstuhl für Informatik mit
++!      Schwerpunkt Wissenschaftliches Rechnen,
++!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
++!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
++!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
++!      and
++!    - IBM Deutschland GmbH
++!
++!
++!    More information can be found here:
++!    http://elpa.mpcdf.mpg.de/
++!
++!    ELPA is free software: you can redistribute it and/or modify
++!    it under the terms of the version 3 of the license of the
++!    GNU Lesser General Public License as published by the Free
++!    Software Foundation.
++!
++!    ELPA is distributed in the hope that it will be useful,
++!    but WITHOUT ANY WARRANTY; without even the implied warranty of
++!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++!    GNU Lesser General Public License for more details.
++!
++!    You should have received a copy of the GNU Lesser General Public License
++!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
++!
++!    ELPA reflects a substantial effort on the part of the original
++!    ELPA consortium, and we ask you to respect the spirit of the
++!    license that we chose: i.e., please contribute any changes you
++!    may have back to the original ELPA library distribution, and keep
++!    any derivatives of ELPA under the same license that we chose for
++!    the original distribution, the GNU Lesser General Public License.
++
++  ! compute all eigenvectors
++  subroutine solve_p&
++      &BLAS_CHAR_AND_SY_OR_HE&
++      &evd(na, a, sc_desc, ev, z)
++    implicit none
++#include "./test_precision_kinds.F90"
++    integer(kind=ik), intent(in)     :: na
++    MATH_DATATYPE(kind=rck), intent(in)       :: a(:,:)
++    MATH_DATATYPE(kind=rck), intent(inout)    :: z(:,:)
++    real(kind=rk), intent(inout)    :: ev(:)
++    integer(kind=ik), intent(in)     :: sc_desc(:)
++    integer(kind=ik)                 :: info, lwork, liwork, lrwork
++    MATH_DATATYPE(kind=rck), allocatable      :: work(:)
++    real(kind=rk), allocatable       :: rwork(:)
++    integer, allocatable             :: iwork(:)
++
++    allocate(work(1), iwork(1), rwork(1))
++
++    ! query for required workspace
++#ifdef REALCASE
++    call p&
++         &BLAS_CHAR&
++         &syevd('V', 'U', na, a, 1, 1, sc_desc, ev, z, 1, 1, sc_desc, work, -1, iwork, -1, info)
++#endif
++#ifdef COMPLEXCASE
++    call p&
++         &BLAS_CHAR&
++         &heevd('V', 'U', na, a, 1, 1, sc_desc, ev, z, 1, 1, sc_desc, work, -1, rwork, -1, iwork, -1, info)
++#endif
++    !  write(*,*) "computed sizes", lwork, liwork, "required sizes ", work(1), iwork(1)
++    lwork = work(1)
++    liwork = iwork(1)
++    deallocate(work, iwork)
++    allocate(work(lwork), stat = info)
++    allocate(iwork(liwork), stat = info)
++#ifdef COMPLEXCASE
++    lrwork = rwork(1)
++    deallocate(rwork)
++    allocate(rwork(lrwork), stat = info)
++#endif
++    ! the actual call to the solver routine
++#ifdef REALCASE
++    call p&
++         &BLAS_CHAR&
++         &syevd('V', 'U', na, a, 1, 1, sc_desc, ev, z, 1, 1, sc_desc, work, lwork, iwork, liwork, info)
++#endif
++#ifdef COMPLEXCASE
++    call p&
++         &BLAS_CHAR&
++         &heevd('V', 'U', na, a, 1, 1, sc_desc, ev, z, 1, 1, sc_desc, work, lwork, rwork, lrwork, iwork, liwork, info)
++#endif
++
++    deallocate(iwork, work, rwork)
++  end subroutine
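++
++  ! Note on the idiom above: the ScaLAPACK solver is first called with
++  ! lwork = -1 (and liwork = -1, plus lrwork = -1 in the complex case),
++  ! which performs no computation and only returns the required workspace
++  ! sizes in work(1)/iwork(1)/rwork(1); the arrays are then reallocated
++  ! to those sizes before the actual solve.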
++
++
++  ! compute part of eigenvectors
++  subroutine solve_p&
++      &BLAS_CHAR_AND_SY_OR_HE&
++      &evr(na, a, sc_desc, nev, ev, z)
++    implicit none
++#include "./test_precision_kinds.F90"
++    integer(kind=ik), intent(in)     :: na, nev
++    MATH_DATATYPE(kind=rck), intent(in)       :: a(:,:)
++    MATH_DATATYPE(kind=rck), intent(inout)    :: z(:,:)
++    real(kind=rk), intent(inout)    :: ev(:)
++    integer(kind=ik), intent(in)     :: sc_desc(:)
++    integer(kind=ik)                 :: info, lwork, liwork, lrwork
++    MATH_DATATYPE(kind=rck), allocatable      :: work(:)
++    real(kind=rk), allocatable       :: rwork(:)
++    integer, allocatable             :: iwork(:)
++    integer(kind=ik)                 :: comp_eigenval, comp_eigenvec, smallest_ev_idx, largest_ev_idx
++
++    allocate(work(1), iwork(1), rwork(1))
++    smallest_ev_idx = 1
++    largest_ev_idx = nev
++    ! query for required workspace
++#ifdef REALCASE
++    call p&
++         &BLAS_CHAR&
++         &syevr('V', 'I', 'U', na, a, 1, 1, sc_desc, 0.0_rk, 0.0_rk, smallest_ev_idx, largest_ev_idx, &
++                comp_eigenval, comp_eigenvec,  ev, z, 1, 1, sc_desc, work, -1, iwork, -1, info)
++#endif
++#ifdef COMPLEXCASE
++    call p&
++         &BLAS_CHAR&
++         &heevr('V', 'I', 'U', na, a, 1, 1, sc_desc, 0.0_rk, 0.0_rk, smallest_ev_idx, largest_ev_idx, &
++                comp_eigenval, comp_eigenvec,  ev, z, 1, 1, sc_desc, work, -1, rwork, -1, iwork, -1, info)
++#endif
++    !  write(*,*) "computed sizes", lwork, liwork, "required sizes ", work(1), iwork(1)
++    lwork = work(1)
++    liwork = iwork(1)
++    deallocate(work, iwork)
++    allocate(work(lwork), stat = info)
++    allocate(iwork(liwork), stat = info)
++#ifdef COMPLEXCASE
++    lrwork = rwork(1)
++    deallocate(rwork)
++    allocate(rwork(lrwork), stat = info)
++#endif
++    ! the actual call to the solver routine
++#ifdef REALCASE
++    call p&
++         &BLAS_CHAR&
++         &syevr('V', 'I', 'U', na, a, 1, 1, sc_desc, 0.0_rk, 0.0_rk, smallest_ev_idx, largest_ev_idx, &
++                comp_eigenval, comp_eigenvec,  ev, z, 1, 1, sc_desc, work, lwork, iwork, liwork, info)
++#endif
++#ifdef COMPLEXCASE
++    call p&
++         &BLAS_CHAR&
++         &heevr('V', 'I', 'U', na, a, 1, 1, sc_desc, 0.0_rk, 0.0_rk, smallest_ev_idx, largest_ev_idx, &
++                comp_eigenval, comp_eigenvec,  ev, z, 1, 1, sc_desc, work, lwork, rwork, lrwork, iwork, liwork, info)
++#endif
++    assert(comp_eigenval == nev)
++    assert(comp_eigenvec == nev)
++    deallocate(iwork, work, rwork)
++  end subroutine
++
+diff -ruN elpa-new_release_2021.11.001/examples/shared/test_setup_mpi.F90 elpa-new_release_2021.11.001_ok/examples/shared/test_setup_mpi.F90
+--- elpa-new_release_2021.11.001/examples/shared/test_setup_mpi.F90	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/shared/test_setup_mpi.F90	2022-01-26 10:10:58.382203000 +0100
+@@ -0,0 +1,115 @@
++!    This file is part of ELPA.
++!
++!    The ELPA library was originally created by the ELPA consortium,
++!    consisting of the following organizations:
++!
++!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
++!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
++!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
++!      Informatik,
++!    - Technische Universität München, Lehrstuhl für Informatik mit
++!      Schwerpunkt Wissenschaftliches Rechnen,
++!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
++!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
++!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
++!      and
++!    - IBM Deutschland GmbH
++!
++!
++!    More information can be found here:
++!    http://elpa.mpcdf.mpg.de/
++!
++!    ELPA is free software: you can redistribute it and/or modify
++!    it under the terms of the version 3 of the license of the
++!    GNU Lesser General Public License as published by the Free
++!    Software Foundation.
++!
++!    ELPA is distributed in the hope that it will be useful,
++!    but WITHOUT ANY WARRANTY; without even the implied warranty of
++!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++!    GNU Lesser General Public License for more details.
++!
++!    You should have received a copy of the GNU Lesser General Public License
++!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
++!
++!    ELPA reflects a substantial effort on the part of the original
++!    ELPA consortium, and we ask you to respect the spirit of the
++!    license that we chose: i.e., please contribute any changes you
++!    may have back to the original ELPA library distribution, and keep
++!    any derivatives of ELPA under the same license that we chose for
++!    the original distribution, the GNU Lesser General Public License.
++!
++!
++#include "config-f90.h"
++
++#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT
++#define TEST_INT_TYPE integer(kind=c_int64_t)
++#define INT_TYPE c_int64_t
++#else
++#define TEST_INT_TYPE integer(kind=c_int32_t)
++#define INT_TYPE c_int32_t
++#endif
++#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT
++#define TEST_INT_MPI_TYPE integer(kind=c_int64_t)
++#define INT_MPI_TYPE c_int64_t
++#else
++#define TEST_INT_MPI_TYPE integer(kind=c_int32_t)
++#define INT_MPI_TYPE c_int32_t
++#endif
++
++module test_setup_mpi
++
++  contains
++
++    subroutine setup_mpi(myid, nprocs)
++      use test_util
++      use ELPA_utilities
++      use precision_for_tests
++      implicit none
++
++      TEST_INT_MPI_TYPE              :: mpierr
++
++      TEST_INT_TYPE, intent(out)     :: myid, nprocs
++      TEST_INT_MPI_TYPE              :: myidMPI, nprocsMPI
++#ifdef WITH_OPENMP_TRADITIONAL
++      TEST_INT_MPI_TYPE              :: required_mpi_thread_level, &
++                                        provided_mpi_thread_level
++#endif
++
++
++#ifdef WITH_MPI
++
++#ifndef WITH_OPENMP_TRADITIONAL
++      call mpi_init(mpierr)
++#else
++      required_mpi_thread_level = MPI_THREAD_MULTIPLE
++
++      call mpi_init_thread(required_mpi_thread_level,     &
++                           provided_mpi_thread_level, mpierr)
++
++      if (required_mpi_thread_level .ne. provided_mpi_thread_level) then
++        write(error_unit,*) "MPI ERROR: MPI_THREAD_MULTIPLE is not provided on this system"
++        write(error_unit,*) "           only ", mpi_thread_level_name(provided_mpi_thread_level), " is available"
++        call MPI_FINALIZE(mpierr)
++        call exit(77)
++      endif
++#endif
++      call mpi_comm_rank(mpi_comm_world, myidMPI,  mpierr)
++      call mpi_comm_size(mpi_comm_world, nprocsMPI,mpierr)
++
++      myid   = int(myidMPI,kind=BLAS_KIND)
++      nprocs = int(nprocsMPI,kind=BLAS_KIND)
++
++      if (nprocs <= 1) then
++        print *, "The test programs must be run with more than 1 task to ensure that usage with MPI is actually tested"
++        stop 1
++      endif
++#else
++      myid = 0
++      nprocs = 1
++#endif
++
++    end subroutine
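++
++    ! Hedged usage sketch (assumption): a test driver typically calls
++    !
++    !   TEST_INT_TYPE :: myid, nprocs
++    !   call setup_mpi(myid, nprocs)
++    !
++    ! With MPI this initialises the library (requesting MPI_THREAD_MULTIPLE
++    ! in the OpenMP build) and returns rank and size; without MPI it simply
++    ! returns myid = 0 and nprocs = 1.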
++
++
++end module
+diff -ruN elpa-new_release_2021.11.001/examples/shared/tests_variable_definitions.F90 elpa-new_release_2021.11.001_ok/examples/shared/tests_variable_definitions.F90
+--- elpa-new_release_2021.11.001/examples/shared/tests_variable_definitions.F90	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/shared/tests_variable_definitions.F90	2022-01-26 10:10:58.385037000 +0100
+@@ -0,0 +1,65 @@
++!    This file is part of ELPA.
++!
++!    The ELPA library was originally created by the ELPA consortium,
++!    consisting of the following organizations:
++!
++!    - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
++!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
++!      Informatik,
++!    - Technische Universität München, Lehrstuhl für Informatik mit
++!      Schwerpunkt Wissenschaftliches Rechnen,
++!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
++!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
++!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
++!      and
++!    - IBM Deutschland GmbH
++!
++!
++!    More information can be found here:
++!    https://elpa.mpcdf.mpg.de/
++!
++!    ELPA is free software: you can redistribute it and/or modify
++!    it under the terms of the version 3 of the license of the
++!    GNU Lesser General Public License as published by the Free
++!    Software Foundation.
++!
++!    ELPA is distributed in the hope that it will be useful,
++!    but WITHOUT ANY WARRANTY; without even the implied warranty of
++!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++!    GNU Lesser General Public License for more details.
++!
++!    You should have received a copy of the GNU Lesser General Public License
++!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
++!
++!    ELPA reflects a substantial effort on the part of the original
++!    ELPA consortium, and we ask you to respect the spirit of the
++!    license that we chose: i.e., please contribute any changes you
++!    may have back to the original ELPA library distribution, and keep
++!    any derivatives of ELPA under the same license that we chose for
++!    the original distribution, the GNU Lesser General Public License.
++!
++! This file was written by A. Marek, MPC
++
++#include "config-f90.h"
++module precision_for_tests
++  use iso_c_binding, only : C_FLOAT, C_DOUBLE, C_FLOAT_COMPLEX, C_DOUBLE_COMPLEX, C_INT32_T, C_INT64_T, C_INT
++
++  implicit none
++  integer, parameter :: rk8  = C_DOUBLE
++  integer, parameter :: rk4  = C_FLOAT
++  integer, parameter :: ck8  = C_DOUBLE_COMPLEX
++  integer, parameter :: ck4  = C_FLOAT_COMPLEX
++  integer, parameter :: ik  = C_INT32_T
++  integer, parameter :: lik = C_INT64_T
++
++#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT
++  integer, parameter :: BLAS_KIND = C_INT64_T
++#else
++  integer, parameter :: BLAS_KIND = C_INT32_T
++#endif
++#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT
++  integer, parameter :: MPI_KIND  = C_INT64_T
++#else
++  integer, parameter :: MPI_KIND  = C_INT32_T
++#endif
++end module precision_for_tests
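++
++! Hedged usage sketch (assumption): BLAS_KIND lets the tests match the
++! integer width their BLAS/ScaLAPACK library was built with, e.g.
++!
++!   use precision_for_tests
++!   integer(kind=BLAS_KIND) :: na_rows, info
++!
++! so that such variables can be passed to routines like numroc or descinit
++! regardless of whether 32-bit or 64-bit integer support was configured.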
+diff -ruN elpa-new_release_2021.11.001/examples/shared/test_util.F90 elpa-new_release_2021.11.001_ok/examples/shared/test_util.F90
+--- elpa-new_release_2021.11.001/examples/shared/test_util.F90	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/shared/test_util.F90	2022-01-26 10:10:58.383252000 +0100
+@@ -0,0 +1,156 @@
++!    This file is part of ELPA.
++!
++!    The ELPA library was originally created by the ELPA consortium,
++!    consisting of the following organizations:
++!
++!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
++!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
++!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
++!      Informatik,
++!    - Technische Universität München, Lehrstuhl für Informatik mit
++!      Schwerpunkt Wissenschaftliches Rechnen,
++!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
++!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
++!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
++!      and
++!    - IBM Deutschland GmbH
++!
++!
++!    More information can be found here:
++!    http://elpa.mpcdf.mpg.de/
++!
++!    ELPA is free software: you can redistribute it and/or modify
++!    it under the terms of the version 3 of the license of the
++!    GNU Lesser General Public License as published by the Free
++!    Software Foundation.
++!
++!    ELPA is distributed in the hope that it will be useful,
++!    but WITHOUT ANY WARRANTY; without even the implied warranty of
++!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++!    GNU Lesser General Public License for more details.
++!
++!    You should have received a copy of the GNU Lesser General Public License
++!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
++!
++!    ELPA reflects a substantial effort on the part of the original
++!    ELPA consortium, and we ask you to respect the spirit of the
++!    license that we chose: i.e., please contribute any changes you
++!    may have back to the original ELPA library distribution, and keep
++!    any derivatives of ELPA under the same license that we chose for
++!    the original distribution, the GNU Lesser General Public License.
++!
++!
++#include "config-f90.h"
++#undef TEST_INT_TYPE
++#undef INT_TYPE
++#undef TEST_INT_MPI_TYPE
++#undef INT_MPI_TYPE
++
++#ifdef HAVE_64BIT_INTEGER_MATH_SUPPORT
++#define TEST_INT_TYPE integer(kind=c_int64_t)
++#define INT_TYPE c_int64_t
++#else
++#define TEST_INT_TYPE integer(kind=c_int32_t)
++#define INT_TYPE c_int32_t
++#endif
++#ifdef HAVE_64BIT_INTEGER_MPI_SUPPORT
++#define TEST_INT_MPI_TYPE integer(kind=c_int64_t)
++#define INT_MPI_TYPE c_int64_t
++#else
++#define TEST_INT_MPI_TYPE integer(kind=c_int32_t)
++#define INT_MPI_TYPE c_int32_t
++#endif
++
++module test_util
++  use iso_c_binding
++  use precision_for_tests
++#ifdef WITH_MPI
++#ifdef HAVE_MPI_MODULE
++  use mpi
++  implicit none
++#else
++  implicit none
++  include 'mpif.h'
++#endif
++#else
++  TEST_INT_MPI_TYPE, parameter :: mpi_comm_world = -1
++#endif
++
++  contains
++!>
++!> This function translates, if ELPA was built with OpenMP support,
++!> the level of thread support reported by the MPI library from its
++!> internal numeric value into a human-readable string
++!>
++!> \param level thread-support level of the MPI library
++!> \return str human-readable name of the thread-support level
++  pure function mpi_thread_level_name(level) result(str)
++    use, intrinsic :: iso_c_binding
++    implicit none
++    integer(kind=c_int), intent(in) :: level
++    character(len=21)            :: str
++#ifdef WITH_MPI
++    select case(level)
++      case (MPI_THREAD_SINGLE)
++        str = "MPI_THREAD_SINGLE"
++      case (MPI_THREAD_FUNNELED)
++        str = "MPI_THREAD_FUNNELED"
++      case (MPI_THREAD_SERIALIZED)
++        str = "MPI_THREAD_SERIALIZED"
++      case (MPI_THREAD_MULTIPLE)
++        str = "MPI_THREAD_MULTIPLE"
++      case default
++        write(str,'(i0,1x,a)') level, "(Unknown level)"
++    end select
++#endif
++  end function
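++
++  ! Hedged usage sketch (assumption): after mpi_init_thread, the granted
++  ! level can be reported in readable form, e.g.
++  !
++  !   call mpi_init_thread(MPI_THREAD_MULTIPLE, provided, mpierr)
++  !   write(*,*) "MPI provides ", mpi_thread_level_name(provided)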
++
++  function seconds() result(s)
++    integer :: ticks, tick_rate
++    real(kind=c_double) :: s
++
++    call system_clock(count=ticks, count_rate=tick_rate)
++    s = real(ticks, kind=c_double) / tick_rate
++  end function
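++
++  ! Hedged usage sketch (assumption): seconds() is a simple wall-clock
++  ! timer, e.g.
++  !
++  !   t0 = seconds()
++  !   call e%eigenvectors(a, ev, z, success)
++  !   write(*,*) "solver took ", seconds() - t0, " s"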
++
++    subroutine x_a(condition, condition_string, file, line)
++#ifdef HAVE_ISO_FORTRAN_ENV
++      use iso_fortran_env, only : error_unit
++#endif
++      implicit none
++#ifndef HAVE_ISO_FORTRAN_ENV
++      integer, parameter :: error_unit = 0
++#endif
++      logical, intent(in) :: condition
++      character(len=*), intent(in) :: condition_string
++      character(len=*), intent(in) :: file
++      integer, intent(in) :: line
++
++      if (.not. condition) then
++        write(error_unit,'(a,i0)') "Assertion `" // condition_string // "` failed at " // file // ":", line
++        stop 1
++      end if
++    end subroutine
++
++    subroutine x_ao(error_code, error_code_string, file, line)
++      use elpa
++#ifdef HAVE_ISO_FORTRAN_ENV
++      use iso_fortran_env, only : error_unit
++#endif
++      implicit none
++#ifndef HAVE_ISO_FORTRAN_ENV
++      integer, parameter :: error_unit = 0
++#endif
++      integer, intent(in) :: error_code
++      character(len=*), intent(in) :: error_code_string
++      character(len=*), intent(in) :: file
++      integer, intent(in) :: line
++
++      if (error_code /= ELPA_OK) then
++        write(error_unit,'(a,i0)') "Assertion failed: `" // error_code_string // &
++           " is " // elpa_strerr(error_code) // "` at " // file // ":", line
++        stop 1
++      end if
++    end subroutine
++end module
++
+diff -ruN elpa-new_release_2021.11.001/examples/test_real_e1.F90 elpa-new_release_2021.11.001_ok/examples/test_real_e1.F90
+--- elpa-new_release_2021.11.001/examples/test_real_e1.F90	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/test_real_e1.F90	2022-01-28 16:43:29.688434545 +0100
+@@ -0,0 +1,255 @@
++!    This file is part of ELPA.
++!
++!    The ELPA library was originally created by the ELPA consortium,
++!    consisting of the following organizations:
++!
++!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
++!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
++!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
++!      Informatik,
++!    - Technische Universität München, Lehrstuhl für Informatik mit
++!      Schwerpunkt Wissenschaftliches Rechnen,
++!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
++!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
++!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
++!      and
++!    - IBM Deutschland GmbH
++!
++!
++!    More information can be found here:
++!    http://elpa.mpcdf.mpg.de/
++!
++!    ELPA is free software: you can redistribute it and/or modify
++!    it under the terms of the version 3 of the license of the
++!    GNU Lesser General Public License as published by the Free
++!    Software Foundation.
++!
++!    ELPA is distributed in the hope that it will be useful,
++!    but WITHOUT ANY WARRANTY; without even the implied warranty of
++!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++!    GNU Lesser General Public License for more details.
++!
++!    You should have received a copy of the GNU Lesser General Public License
++!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
++!
++!    ELPA reflects a substantial effort on the part of the original
++!    ELPA consortium, and we ask you to respect the spirit of the
++!    license that we chose: i.e., please contribute any changes you
++!    may have back to the original ELPA library distribution, and keep
++!    any derivatives of ELPA under the same license that we chose for
++!    the original distribution, the GNU Lesser General Public License.
++!
++!
++!>
++!> Fortran test program demonstrating the use of the
++!> ELPA 1 real case library.
++!> If "HAVE_REDIRECT" was defined at build time,
++!> the stdout and stderr output of each MPI task
++!> can be redirected to files by setting the environment
++!> variable "REDIRECT_ELPA_TEST_OUTPUT" to "true".
++!>
++!> The matrix size (na), the number of eigenvectors to
++!> compute (nev), and the blocking factor (nblk) are set
++!> in the source below; this example uses (1000, 500, 16)
++!> and does not parse command-line arguments.
++!>
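++!> Hedged usage sketch (assumption; launcher, task count, and binary name
++!> are placeholders): after compiling this file and linking against ELPA,
++!> run it under MPI, e.g.
++!>
++!>   srun --ntasks=4 ./test_real_e1
++!>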
++program test_real_example
++
++!-------------------------------------------------------------------------------
++! Standard eigenvalue problem - REAL version
++!
++! This program demonstrates the use of the ELPA module
++! together with standard scalapack routines
++!
++! Copyright of the original code rests with the authors inside the ELPA
++! consortium. The copyright of any additional modifications shall rest
++! with their original authors, but shall adhere to the licensing terms
++! distributed along with the original code in the file "COPYING".
++!
++!-------------------------------------------------------------------------------
++
++   use iso_c_binding
++
++   use elpa
++#ifdef WITH_OPENMP_TRADITIONAL
++   use omp_lib
++#endif
++
++
++#ifdef HAVE_MPI_MODULE
++   use mpi
++   implicit none
++#else
++   implicit none
++   include 'mpif.h'
++#endif
++
++   !-------------------------------------------------------------------------------
++   ! Please set system size parameters below!
++   ! na:   System size
++   ! nev:  Number of eigenvectors to be calculated
++   ! nblk: Blocking factor in block cyclic distribution
++   !-------------------------------------------------------------------------------
++
++   integer           :: nblk
++   integer                          :: na, nev
++
++   integer                          :: np_rows, np_cols, na_rows, na_cols
++
++   integer                          :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols
++   integer                          :: i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol
++
++   integer, external                :: numroc
++
++   real(kind=c_double), allocatable :: a(:,:), z(:,:), ev(:)
++
++   integer                          :: iseed(4096) ! Random seed, size should be sufficient for every generator
++
++   integer                          :: STATUS
++   integer                          :: success
++   character(len=8)                 :: task_suffix
++   integer                          :: j
++
++   integer, parameter               :: error_units = 0
++
++#ifdef WITH_OPENMP_TRADITIONAL
++   integer n_threads
++#endif
++   class(elpa_t), pointer           :: e
++   !-------------------------------------------------------------------------------
++
++
++   ! default parameters
++   na = 1000
++   nev = 500
++   nblk = 16
++
++   call mpi_init_thread(MPI_THREAD_SERIALIZED,info,mpierr)
++   call mpi_comm_rank(mpi_comm_world,myid,mpierr)
++   call mpi_comm_size(mpi_comm_world,nprocs,mpierr)
++
++   do np_cols = NINT(SQRT(REAL(nprocs))),2,-1
++     if(mod(nprocs,np_cols) == 0 ) exit
++   enddo
++   ! at the end of the above loop, nprocs is always divisible by np_cols
++
++   np_rows = nprocs/np_cols
++
++   ! initialise BLACS
++   my_blacs_ctxt = mpi_comm_world
++   call BLACS_Gridinit(my_blacs_ctxt, 'C', np_rows, np_cols)
++   call BLACS_Gridinfo(my_blacs_ctxt, nprow, npcol, my_prow, my_pcol)
++
++   if (myid==0) then
++     print '(a)','| Past BLACS_Gridinfo.'
++   end if
++   ! determine the necessary size of the distributed matrices;
++   ! we use the ScaLAPACK tool routine NUMROC
++
++#ifdef WITH_OPENMP_TRADITIONAL
++   n_threads=omp_get_max_threads()
++#endif
++
++
++   na_rows = numroc(na, nblk, my_prow, 0, np_rows)
++   na_cols = numroc(na, nblk, my_pcol, 0, np_cols)
++
++
++   ! set up the scalapack descriptor for the checks below
++   ! For ELPA the following restrictions hold:
++   ! - block sizes in both directions must be identical (args 4 a. 5)
++   ! - first row and column of the distributed matrix must be on
++   !   row/col 0/0 (arg 6 and 7)
++
++   call descinit(sc_desc, na, na, nblk, nblk, 0, 0, my_blacs_ctxt, na_rows, info)
++
++   if (info .ne. 0) then
++     write(error_units,*) 'Error in BLACS descinit! info=',info
++     write(error_units,*) 'Most likely this happened because you are using'
++     write(error_units,*) 'more MPI tasks than are possible for your'
++     write(error_units,*) 'problem size (matrix size and block size)!'
++     write(error_units,*) 'The BLACS grid cannot be set up properly.'
++     write(error_units,*) 'Try reducing the number of MPI tasks...'
++     call MPI_ABORT(mpi_comm_world, 1, mpierr)
++   endif
++
++   if (myid==0) then
++     print '(a)','| Past scalapack descriptor setup.'
++   end if
++
++   allocate(a (na_rows,na_cols))
++   allocate(z (na_rows,na_cols))
++
++   allocate(ev(na))
++
++   ! we want different random numbers on every process
++   ! (otherwise A might become rank-deficient):
++
++   iseed(:) = myid
++   call RANDOM_SEED(put=iseed)
++   call RANDOM_NUMBER(z)
++
++   a(:,:) = z(:,:)
++
++   if (myid == 0) then
++     print '(a)','| Random matrix block has been set up. (only processor 0 confirms this step)'
++   endif
++   call pdtran(na, na, 1.d0, z, 1, 1, sc_desc, 1.d0, a, 1, 1, sc_desc) ! A = A + Z**T
++
++   !-------------------------------------------------------------------------------
++
++   if (elpa_init(20171201) /= elpa_ok) then
++     print *, "ELPA API version not supported"
++     stop
++   endif
++   e => elpa_allocate()
++
++   ! set parameters describing the matrix and its MPI distribution
++   call e%set("na", na, success)
++   call e%set("nev", nev, success)
++   call e%set("local_nrows", na_rows, success)
++   call e%set("local_ncols", na_cols, success)
++   call e%set("nblk", nblk, success)
++   call e%set("mpi_comm_parent", mpi_comm_world, success)
++   call e%set("process_row", my_prow, success)
++   call e%set("process_col", my_pcol, success)
++
++#ifdef CUDA
++         call e%set("nvidia-gpu", 1, success)
++#endif
++#ifdef WITH_OPENMP_TRADITIONAL
++   call e%set("omp_threads", n_threads, success)
++#endif
++   success = e%setup()
++
++   call e%set("solver", elpa_solver_1stage, success)
++
++
++   ! Calculate eigenvalues/eigenvectors
++
++   if (myid==0) then
++     print '(a)','| Entering one-step ELPA solver ... '
++     print *
++   end if
++
++   call mpi_barrier(mpi_comm_world, mpierr) ! for correct timings only
++   call e%eigenvectors(a, ev, z, success)
++
++   if (myid==0) then
++     print '(a)','| One-step ELPA solver complete.'
++     print *
++   end if
++
++   call elpa_deallocate(e)
++   call elpa_uninit()
++
++   call blacs_gridexit(my_blacs_ctxt)
++   call mpi_finalize(mpierr)
++
++end
++
+diff -ruN elpa-new_release_2021.11.001/examples/test_real_e2.F90 elpa-new_release_2021.11.001_ok/examples/test_real_e2.F90
+--- elpa-new_release_2021.11.001/examples/test_real_e2.F90	1970-01-01 01:00:00.000000000 +0100
++++ elpa-new_release_2021.11.001_ok/examples/test_real_e2.F90	2022-02-01 09:28:16.102146696 +0100
+@@ -0,0 +1,262 @@
++!    This file is part of ELPA.
++!
++!    The ELPA library was originally created by the ELPA consortium,
++!    consisting of the following organizations:
++!
++!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
++!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
++!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
++!      Informatik,
++!    - Technische Universität München, Lehrstuhl für Informatik mit
++!      Schwerpunkt Wissenschaftliches Rechnen,
++!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
++!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
++!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
++!      and
++!    - IBM Deutschland GmbH
++!
++!
++!    More information can be found here:
++!    http://elpa.mpcdf.mpg.de/
++!
++!    ELPA is free software: you can redistribute it and/or modify
++!    it under the terms of the version 3 of the license of the
++!    GNU Lesser General Public License as published by the Free
++!    Software Foundation.
++!
++!    ELPA is distributed in the hope that it will be useful,
++!    but WITHOUT ANY WARRANTY; without even the implied warranty of
++!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++!    GNU Lesser General Public License for more details.
++!
++!    You should have received a copy of the GNU Lesser General Public License
++!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
++!
++!    ELPA reflects a substantial effort on the part of the original
++!    ELPA consortium, and we ask you to respect the spirit of the
++!    license that we chose: i.e., please contribute any changes you
++!    may have back to the original ELPA library distribution, and keep
++!    any derivatives of ELPA under the same license that we chose for
++!    the original distribution, the GNU Lesser General Public License.
++!
++!
++!>
++!> Fortran test program demonstrating the use of the
++!> ELPA 2 real case library.
++!> If "HAVE_REDIRECT" was defined at build time,
++!> the stdout and stderr output of each MPI task
++!> can be redirected to files by setting the environment
++!> variable "REDIRECT_ELPA_TEST_OUTPUT" to "true".
++!>
++!> The matrix size (na), the number of eigenvectors to
++!> compute (nev), and the blocking factor (nblk) are set
++!> in the source below; this example uses (1000, 500, 16)
++!> and does not parse command-line arguments.
++!>
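++!> Hedged usage sketch (assumption; launcher, task count, and binary name
++!> are placeholders): after compiling this file and linking against ELPA,
++!> run it under MPI, e.g.
++!>
++!>   srun --ntasks=4 ./test_real_e2
++!>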
++program test_real_example
++
++!-------------------------------------------------------------------------------
++! Standard eigenvalue problem - REAL version
++!
++! This program demonstrates the use of the ELPA module
++! together with standard scalapack routines
++!
++! Copyright of the original code rests with the authors inside the ELPA
++! consortium. The copyright of any additional modifications shall rest
++! with their original authors, but shall adhere to the licensing terms
++! distributed along with the original code in the file "COPYING".
++!
++!-------------------------------------------------------------------------------
++
++   use iso_c_binding
++
++   use elpa
++
++#ifdef HAVE_MPI_MODULE
++   use mpi
++   implicit none
++#else
++   implicit none
++   include 'mpif.h'
++#endif
++
++   !-------------------------------------------------------------------------------
++   ! Please set system size parameters below!
++   ! na:   System size
++   ! nev:  Number of eigenvectors to be calculated
++   ! nblk: Blocking factor in block cyclic distribution
++   !-------------------------------------------------------------------------------
++
++   integer           :: nblk
++   integer                          :: na, nev
++
++   integer                          :: np_rows, np_cols, na_rows, na_cols
++
++   integer                          :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols
++   integer                          :: i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol
++
++   integer, external                :: numroc
++
++   real(kind=c_double), allocatable :: a(:,:), z(:,:), ev(:)
++
++   integer                          :: iseed(4096) ! Random seed, size should be sufficient for every generator
++
++   integer                          :: STATUS
++   integer                          :: success
++   character(len=8)                 :: task_suffix
++   integer                          :: j
++
++   integer, parameter               :: error_units = 0
++
++   class(elpa_t), pointer           :: e
++   !-------------------------------------------------------------------------------
++
++
++   ! default parameters
++   na = 1000
++   nev = 500
++   nblk = 16
++
++   call mpi_init_thread(MPI_THREAD_SERIALIZED,info,mpierr)
++   call mpi_comm_rank(mpi_comm_world,myid,mpierr)
++   call mpi_comm_size(mpi_comm_world,nprocs,mpierr)
++
++   do np_cols = NINT(SQRT(REAL(nprocs))),2,-1
++     if(mod(nprocs,np_cols) == 0 ) exit
++   enddo
++   ! at the end of the above loop, nprocs is always divisible by np_cols
++
++   np_rows = nprocs/np_cols
++
++   ! initialise BLACS
++   my_blacs_ctxt = mpi_comm_world
++   call BLACS_Gridinit(my_blacs_ctxt, 'C', np_rows, np_cols)
++   call BLACS_Gridinfo(my_blacs_ctxt, nprow, npcol, my_prow, my_pcol)
++
++   if (myid==0) then
++     print '(a)','| Past BLACS_Gridinfo.'
++   end if
++   ! determine the necessary size of the distributed matrices;
++   ! we use the ScaLAPACK tool routine NUMROC
++
++   na_rows = numroc(na, nblk, my_prow, 0, np_rows)
++   na_cols = numroc(na, nblk, my_pcol, 0, np_cols)
++
++
++   ! set up the scalapack descriptor for the checks below
++   ! For ELPA the following restrictions hold:
++   ! - block sizes in both directions must be identical (args 4 a. 5)
++   ! - first row and column of the distributed matrix must be on
++   !   row/col 0/0 (arg 6 and 7)
++
++   call descinit(sc_desc, na, na, nblk, nblk, 0, 0, my_blacs_ctxt, na_rows, info)
++
++   if (info .ne. 0) then
++     write(error_units,*) 'Error in BLACS descinit! info=',info
++     write(error_units,*) 'Most likely this happened because you are using'
++     write(error_units,*) 'more MPI tasks than are possible for your'
++     write(error_units,*) 'problem size (matrix size and block size)!'
++     write(error_units,*) 'The BLACS grid cannot be set up properly.'
++     write(error_units,*) 'Try reducing the number of MPI tasks...'
++     call MPI_ABORT(mpi_comm_world, 1, mpierr)
++   endif
++
++   if (myid==0) then
++     print '(a)','| Past scalapack descriptor setup.'
++   end if
++
++   allocate(a (na_rows,na_cols))
++   allocate(z (na_rows,na_cols))
++
++   allocate(ev(na))
++
++   ! we want different random numbers on every process
++   ! (otherwise A might become rank-deficient):
++
++   iseed(:) = myid
++   call RANDOM_SEED(put=iseed)
++   call RANDOM_NUMBER(z)
++
++   a(:,:) = z(:,:)
++
++   if (myid == 0) then
++     print '(a)','| Random matrix block has been set up. (only processor 0 confirms this step)'
++   endif
++   call pdtran(na, na, 1.d0, z, 1, 1, sc_desc, 1.d0, a, 1, 1, sc_desc) ! A = A + Z**T
++
++   !-------------------------------------------------------------------------------
++
++   if (elpa_init(20171201) /= elpa_ok) then
++     print *, "ELPA API version not supported"
++     stop
++   endif
++   e => elpa_allocate()
++
++   ! set parameters describing the matrix and its MPI distribution
++   call e%set("na", na, success)
++   call e%set("nev", nev, success)
++   call e%set("local_nrows", na_rows, success)
++   call e%set("local_ncols", na_cols, success)
++   call e%set("nblk", nblk, success)
++   call e%set("mpi_comm_parent", mpi_comm_world, success)
++   call e%set("process_row", my_prow, success)
++   call e%set("process_col", my_pcol, success)
++#ifdef CUDA
++   call e%set("nvidia-gpu", 1, success)
++#endif
++
++   success = e%setup()
++
++#ifdef CUDAKERNEL
++   call e%set("real_kernel", ELPA_2STAGE_REAL_NVIDIA_GPU, success)
++#endif
++#ifdef AVX512
++   call e%set("real_kernel", ELPA_2STAGE_REAL_AVX512_BLOCK2, success)
++#endif
++#ifdef AVX2_B6
++   call e%set("real_kernel", ELPA_2STAGE_REAL_AVX2_BLOCK6, success)
++#endif
++#ifdef AVX2_B4
++   call e%set("real_kernel", ELPA_2STAGE_REAL_AVX2_BLOCK4, success)
++#endif
++#ifdef AVX2_B2
++   call e%set("real_kernel", ELPA_2STAGE_REAL_AVX2_BLOCK2, success)
++#endif
++#ifdef GENERIC
++   call e%set("real_kernel", ELPA_2STAGE_REAL_GENERIC, success)
++#endif
++#ifdef GENERIC_SIMPLE
++   call e%set("real_kernel", ELPA_2STAGE_REAL_GENERIC_SIMPLE, success)
++#endif
++
++   call e%set("solver", elpa_solver_2stage, success)
++
++
++   ! Calculate eigenvalues/eigenvectors
++
++   if (myid==0) then
++     print '(a)','| Entering two-step ELPA solver ... '
++     print *
++   end if
++
++   call mpi_barrier(mpi_comm_world, mpierr) ! for correct timings only
++   call e%eigenvectors(a, ev, z, success)
++
++   if (myid==0) then
++     print '(a)','| Two-step ELPA solver complete.'
++     print *
++   end if
++
++   call elpa_deallocate(e)
++   call elpa_uninit()
++
++   call blacs_gridexit(my_blacs_ctxt)
++   call mpi_finalize(mpierr)
++
++end
++