From cfd8ebca3888dfadeb424858d9a2a9646a120c9d Mon Sep 17 00:00:00 2001
From: Xinzhe Wu <wu7@juwels01.ib.juwels.fzj.de>
Date: Wed, 21 Oct 2020 00:26:19 +0200
Subject: [PATCH] update examples of sum

---
 3_affinity_query/CMakeLists.txt               |   3 -
 .../3_sum_mpi_omp.cpp                         |  34 ++++--
 3_sum/3_sum_one_domain.cpp                    | 116 ++++++++++++++++++
 3_sum/3_sum_one_node.cpp                      |  99 +++++++++++++++
 3_sum/3_sum_one_node_numa_aware.cpp           | 104 ++++++++++++++++
 3_sum/3_sum_serial.cpp                        |  56 +++++++++
 3_sum/CMakeLists.txt                          |  34 ++++++
 3_sum/README.md                               | 102 ++++++++++++++++
 CMakeLists.txt                                |   3 +-
 9 files changed, 538 insertions(+), 13 deletions(-)
 delete mode 100644 3_affinity_query/CMakeLists.txt
 rename 3_affinity_query/3_affinity_query.cpp => 3_sum/3_sum_mpi_omp.cpp (82%)
 create mode 100644 3_sum/3_sum_one_domain.cpp
 create mode 100644 3_sum/3_sum_one_node.cpp
 create mode 100644 3_sum/3_sum_one_node_numa_aware.cpp
 create mode 100644 3_sum/3_sum_serial.cpp
 create mode 100644 3_sum/CMakeLists.txt
 create mode 100644 3_sum/README.md

diff --git a/3_affinity_query/CMakeLists.txt b/3_affinity_query/CMakeLists.txt
deleted file mode 100644
index d46704f..0000000
--- a/3_affinity_query/CMakeLists.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-add_executable(3_affinity_query.exe 3_affinity_query.cpp)
-target_link_libraries(3_affinity_query.exe PUBLIC OpenMP::OpenMP_CXX)
-
diff --git a/3_affinity_query/3_affinity_query.cpp b/3_sum/3_sum_mpi_omp.cpp
similarity index 82%
rename from 3_affinity_query/3_affinity_query.cpp
rename to 3_sum/3_sum_mpi_omp.cpp
index e0ead30..cd6a4a5 100644
--- a/3_affinity_query/3_affinity_query.cpp
+++ b/3_sum/3_sum_mpi_omp.cpp
@@ -2,6 +2,9 @@
 #include <omp.h>
 #include <iostream>
 #include <chrono>
+#include <mpi.h>
+#include <cstdlib> // for atoi()
+
 using namespace std::chrono;
 
 void socket_init(int socket_num)
@@ -33,22 +36,33 @@ void numa_in_operations(int socket_num){
 
 }
 
-int main()
+int main(int argc, char** argv)
 {
 /*
- * output: Size, Sum, serial time, NUMA domain time, node time, NUMA-aware time
+ * output: Type, Threads, Sockets, Size, Sum, serial time, NUMA domain time, node time, NUMA-aware time
  * 
 */
+   MPI_Init(&argc,&argv);
+ 
+   int rank;
+
+   MPI_Comm_rank(MPI_COMM_WORLD, &rank);
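+   // Only rank 0 prints the timings below, so output is not duplicated across ranks.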
+
    int n_sockets, socket_num;
    int n_procs;
 
+   int num_thread;
+
+   num_thread = (argc > 1) ? atoi(argv[1]) : omp_get_max_threads(); // thread count from argv
+
    omp_set_nested(1);
 
    omp_set_max_active_levels(2);
 
    n_sockets = omp_get_num_places();
 
-//   printf("number of sockets = %d \n", n_sockets);
+   int thread_per_socket = num_thread / n_sockets; // split the requested threads evenly across sockets
 
    int size = 100000000;
 
@@ -71,7 +85,7 @@ int main()
 
    auto t = duration_cast<duration<double>>(t2 - t1);
 
-   printf("%d,%f,%f,", size, sum, t.count());
+   if (rank == 0) printf("MPI+OpenMP,%d,%d,%d,%f,%f,", num_thread, n_sockets, size, sum, t.count());
 
    delete []a;
 
@@ -91,7 +105,7 @@ int main()
       socket_num = omp_get_place_num();
       n_procs = omp_get_place_num_procs(socket_num);
       if(socket_num == 0){
-           #pragma omp parallel for reduction(+:sum) num_threads(n_procs) 
+           #pragma omp parallel for reduction(+:sum) num_threads(thread_per_socket) 
 	   for(int i = 0; i < size; i++){
                sum += b[i];
            } 
@@ -107,7 +121,7 @@ int main()
 
    t = duration_cast<duration<double>>(t2 - t1);
 
-   printf("%f,", t.count());
+   if (rank == 0) printf("%f,", t.count());
 
    delete [] b;
 
@@ -131,7 +145,7 @@ int main()
 
    t = duration_cast<duration<double>>(t2 - t1);
 
-   printf("%f,", t.count());
+   if (rank == 0) printf("%f,", t.count());
 
 //   printf("Node: Sum of array is : %f in %f seconds\n", sum, t.count());
 
@@ -158,9 +172,11 @@ int main()
 
    t = duration_cast<duration<double>>(t2 - t1);
 
-   printf("%f\n", t.count());
+   if (rank == 0) printf("%f\n", t.count());
 
    delete [] d;
 
-   return 0;
+   MPI_Finalize();
+
+   return 0;
 }
diff --git a/3_sum/3_sum_one_domain.cpp b/3_sum/3_sum_one_domain.cpp
new file mode 100644
index 0000000..c01496f
--- /dev/null
+++ b/3_sum/3_sum_one_domain.cpp
@@ -0,0 +1,116 @@
+#include <stdio.h>
+#include <omp.h>
+#include <iostream>
+#include <chrono>
+#include <cstdlib> // for atoi()
+
+#ifdef USE_MPI
+#include <mpi.h>
+#endif
+
+using namespace std::chrono;
+
+void socket_init(int socket_num)
+{
+   int n_procs;
+
+   n_procs = omp_get_place_num_procs(socket_num);
+   #pragma omp parallel num_threads(n_procs) proc_bind(close)
+   {
+      printf("Reporting in from socket %d, thread ID: %d\n",
+                                socket_num,omp_get_thread_num() );
+   }
+}
+
+void numa_in_operations(int socket_num){
+
+   int n_procs;
+
+   n_procs = omp_get_place_num_procs(socket_num);
+
+   if(socket_num == 0){
+        #pragma omp parallel num_threads(n_procs)
+        {
+            printf("The first socket does the computation in parallel\n");
+        }
+   }else{
+        printf("The other sockets do nothing\n");
+   }
+
+}
+
+int main(int argc, char** argv)
+{
+
+   int rank;
+
+#ifdef USE_MPI
+   MPI_Init(&argc,&argv);
+
+   MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+#else
+   rank = 0;
+#endif
+
+   int n_sockets, socket_num;
+   int n_procs;
+
+   int num_thread;
+   
+   num_thread = (argc > 1) ? atoi(argv[1]) : omp_get_max_threads(); // thread count from argv
+
+   omp_set_nested(1);
+
+   omp_set_max_active_levels(2);
+
+   n_sockets = omp_get_num_places();
+
+   int thread_per_socket = num_thread / n_sockets; // split the requested threads evenly across sockets
+
+   int size = 100000000;
+
+   double *b = new double[size];
+
+   for(int i = 0; i < size; i++){
+        b[i] = i + 1;
+   }
+
+   double sum = 0;
+
+   auto t1 = high_resolution_clock::now();
+
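+   // num_threads(n_sockets) with proc_bind(spread) places one outer thread on
+   // each place (NUMA domain); only the thread on place 0 performs the sum.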
+   #pragma omp parallel num_threads(n_sockets) shared(sum) private(socket_num, n_procs) proc_bind(spread)
+  {
+      socket_num = omp_get_place_num();
+      n_procs = omp_get_place_num_procs(socket_num);
+      if(socket_num == 0){
+           #pragma omp parallel for reduction(+:sum) num_threads(thread_per_socket) 
+           for(int i = 0; i < size; i++){
+               sum += b[i];
+           } 
+      }else{
+/*
+          printf("The other sockets do nothing\n");
+*/
+      }
+
+   }
+
+   auto t2 = high_resolution_clock::now();
+
+   auto t = duration_cast<duration<double>>(t2 - t1);
+
+   if(rank == 0)
+       std::cout << "OMP (1 DOMAIN)," << size << "," << sum << "," << num_thread << "," << n_sockets << "," << t.count() << std::endl;
+
+   delete [] b;
+
+#ifdef USE_MPI
+   MPI_Finalize();
+#endif
+
+   return 0;
+
+}
diff --git a/3_sum/3_sum_one_node.cpp b/3_sum/3_sum_one_node.cpp
new file mode 100644
index 0000000..3e0cbcc
--- /dev/null
+++ b/3_sum/3_sum_one_node.cpp
@@ -0,0 +1,99 @@
+#include <stdio.h>
+#include <omp.h>
+#include <iostream>
+#include <chrono>
+#include <cstdlib> // for atoi()
+
+#ifdef USE_MPI
+#include <mpi.h>
+#endif
+
+using namespace std::chrono;
+
+void socket_init(int socket_num)
+{
+   int n_procs;
+
+   n_procs = omp_get_place_num_procs(socket_num);
+   #pragma omp parallel num_threads(n_procs) proc_bind(close)
+   {
+      printf("Reporting in from socket %d, thread ID: %d\n",
+                                socket_num,omp_get_thread_num() );
+   }
+}
+
+void numa_in_operations(int socket_num){
+
+   int n_procs;
+
+   n_procs = omp_get_place_num_procs(socket_num);
+
+   if(socket_num == 0){
+        #pragma omp parallel num_threads(n_procs)
+        {
+            printf("The first socket does the computation in parallel\n");
+        }
+   }else{
+        printf("The other sockets do nothing\n");
+   }
+
+}
+
+int main(int argc, char** argv)
+{
+
+   int rank;
+
+#ifdef USE_MPI
+   MPI_Init(&argc,&argv);
+
+   MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+#else
+   rank = 0;
+#endif
+
+   int n_sockets, socket_num;
+   int n_procs;
+
+   int num_thread;
+   
+   num_thread = (argc > 1) ? atoi(argv[1]) : omp_get_max_threads(); // thread count from argv
+
+   n_sockets = omp_get_num_places();
+
+   int thread_per_socket = num_thread / n_sockets;
+
+   int size = 100000000;
+
+/*Node*/
+   double *c = new double[size];
+
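+   // The array is initialized serially, so first-touch places all of its pages on one NUMA domain.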
+   for(int i = 0; i < size; i++){
+        c[i] = i + 1;
+   }
+
+   double sum = 0;
+
+   auto t1 = high_resolution_clock::now();
+
+   #pragma omp parallel for reduction(+:sum) num_threads(num_thread)
+   for(int i = 0; i < size; i++){
+       sum += c[i];
+   }
+
+   auto t2 = high_resolution_clock::now();
+
+   auto t = duration_cast<duration<double>>(t2 - t1);
+
+   if(rank == 0)
+       std::cout << "Naive OMP (1 NODE)," << size << "," << sum << "," << num_thread << "," << n_sockets << "," << t.count() << std::endl;
+   delete [] c;
+
+#ifdef USE_MPI
+   MPI_Finalize();
+#endif
+
+   return 0;
+
+}
diff --git a/3_sum/3_sum_one_node_numa_aware.cpp b/3_sum/3_sum_one_node_numa_aware.cpp
new file mode 100644
index 0000000..ffb5b03
--- /dev/null
+++ b/3_sum/3_sum_one_node_numa_aware.cpp
@@ -0,0 +1,104 @@
+#include <stdio.h>
+#include <omp.h>
+#include <iostream>
+#include <chrono>
+#include <cstdlib> // for atoi()
+
+#ifdef USE_MPI
+#include <mpi.h>
+#endif
+
+using namespace std::chrono;
+
+void socket_init(int socket_num)
+{
+   int n_procs;
+
+   n_procs = omp_get_place_num_procs(socket_num);
+   #pragma omp parallel num_threads(n_procs) proc_bind(close)
+   {
+      printf("Reporting in from socket %d, thread ID: %d\n",
+                                socket_num,omp_get_thread_num() );
+   }
+}
+
+void numa_in_operations(int socket_num){
+
+   int n_procs;
+
+   n_procs = omp_get_place_num_procs(socket_num);
+
+   if(socket_num == 0){
+        #pragma omp parallel num_threads(n_procs)
+        {
+            printf("The first socket does the computation in parallel\n");
+        }
+   }else{
+        printf("The other sockets do nothing\n");
+   }
+
+}
+
+int main(int argc, char** argv)
+{
+   int rank;
+
+#ifdef USE_MPI
+   MPI_Init(&argc,&argv);
+
+   MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+#else
+   rank = 0;
+#endif
+
+   int n_sockets, socket_num;
+   int n_procs;
+
+   int num_thread;
+   
+   num_thread = (argc > 1) ? atoi(argv[1]) : omp_get_max_threads(); // thread count from argv
+
+   omp_set_nested(1);
+
+   omp_set_max_active_levels(2);
+
+   n_sockets = omp_get_num_places();
+
+   int thread_per_socket = num_thread / n_sockets;
+
+   int size = 100000000;
+
+/*Node with NUMA-Aware*/
+   double *d = new double[size];
+
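+   // First-touch initialization: each page ends up on the NUMA domain of the
+   // thread that writes it first, so the sum loop below reads mostly local memory.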
+   #pragma omp parallel for num_threads(num_thread)
+   for(int i = 0; i < size; i++){
+        d[i] = i + 1;
+   }
+
+   double sum = 0;
+
+   auto t1 = high_resolution_clock::now();
+
+   #pragma omp parallel for reduction(+:sum) num_threads(num_thread)
+   for(int i = 0; i < size; i++){
+       sum += d[i];
+   }
+
+   auto t2 = high_resolution_clock::now();
+
+   auto t = duration_cast<duration<double>>(t2 - t1);
+
+   if(rank == 0)
+       std::cout << "NUMA-aware OMP (1 NODE)," << size << "," << sum << "," << num_thread << "," << n_sockets << "," << t.count() << std::endl;
+   delete [] d;
+
+#ifdef USE_MPI
+   MPI_Finalize();
+#endif
+
+   return 0;
+
+}
diff --git a/3_sum/3_sum_serial.cpp b/3_sum/3_sum_serial.cpp
new file mode 100644
index 0000000..445ac38
--- /dev/null
+++ b/3_sum/3_sum_serial.cpp
@@ -0,0 +1,56 @@
+#include <stdio.h>
+#include <iostream>
+#include <chrono>
+
+#ifdef USE_MPI
+#include <mpi.h>
+#endif
+
+using namespace std::chrono;
+
+int main(int argc, char** argv)
+{
+
+   int rank;
+
+#ifdef USE_MPI
+   MPI_Init(&argc,&argv);
+
+   MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+#else
+   rank = 0;
+#endif
+
+   int size = 100000000;
+
+/*Serial Sum*/
+   double *a = new double[size];
+
+   for(int i = 0; i < size; i++){
+       a[i] = i + 1;
+   }
+
+   double sum = 0;
+
+   auto t1 = high_resolution_clock::now();
+
+   for(int i = 0; i < size; i++){
+       sum += a[i];
+   }
+
+   auto t2 = high_resolution_clock::now();
+
+   auto t = duration_cast<duration<double>>(t2 - t1);
+
+   if(rank == 0)
+       std::cout << "Serial," << size << "," << sum << "," << 1 << "," << 1 << "," << t.count() << std::endl;
+
+   delete []a;
+
+#ifdef USE_MPI
+   MPI_Finalize();
+#endif
+
+   return 0;
+
+}
diff --git a/3_sum/CMakeLists.txt b/3_sum/CMakeLists.txt
new file mode 100644
index 0000000..9d1ab2e
--- /dev/null
+++ b/3_sum/CMakeLists.txt
@@ -0,0 +1,34 @@
+add_executable(3_sum_mpi_omp.exe 3_sum_mpi_omp.cpp)
+target_link_libraries(3_sum_mpi_omp.exe PUBLIC OpenMP::OpenMP_CXX MPI::MPI_CXX)
+
+####
+add_executable(3_sum_serial.exe 3_sum_serial.cpp)
+
+add_executable(3_sum_one_node.exe 3_sum_one_node.cpp)
+target_link_libraries(3_sum_one_node.exe PUBLIC OpenMP::OpenMP_CXX)
+
+add_executable(3_sum_one_node_numa_aware.exe 3_sum_one_node_numa_aware.cpp)
+target_link_libraries(3_sum_one_node_numa_aware.exe PUBLIC OpenMP::OpenMP_CXX)
+
+add_executable(3_sum_one_domain.exe 3_sum_one_domain.cpp)
+target_link_libraries(3_sum_one_domain.exe PUBLIC OpenMP::OpenMP_CXX)
+
+
+####
+add_executable(3_sum_serial_mpi.exe 3_sum_serial.cpp)
+target_link_libraries(3_sum_serial_mpi.exe PUBLIC MPI::MPI_CXX)
+target_compile_definitions(3_sum_serial_mpi.exe PRIVATE USE_MPI=1 )
+
+add_executable(3_sum_one_node_mpi.exe 3_sum_one_node.cpp)
+target_link_libraries(3_sum_one_node_mpi.exe PUBLIC OpenMP::OpenMP_CXX MPI::MPI_CXX)
+target_compile_definitions(3_sum_one_node_mpi.exe PRIVATE USE_MPI=1 )
+
+add_executable(3_sum_one_node_numa_aware_mpi.exe 3_sum_one_node_numa_aware.cpp)
+target_link_libraries(3_sum_one_node_numa_aware_mpi.exe PUBLIC OpenMP::OpenMP_CXX MPI::MPI_CXX)
+target_compile_definitions(3_sum_one_node_numa_aware_mpi.exe PRIVATE USE_MPI=1 )
+
+add_executable(3_sum_one_domain_mpi.exe 3_sum_one_domain.cpp)
+target_link_libraries(3_sum_one_domain_mpi.exe PUBLIC OpenMP::OpenMP_CXX MPI::MPI_CXX)
+target_compile_definitions(3_sum_one_domain_mpi.exe PRIVATE USE_MPI=1 )
+
+
diff --git a/3_sum/README.md b/3_sum/README.md
new file mode 100644
index 0000000..035dd01
--- /dev/null
+++ b/3_sum/README.md
@@ -0,0 +1,102 @@
+# Multiple Implementations of SUM
+
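+Several implementations of a simple array sum: a serial baseline, a naive
+OpenMP version, a NUMA-aware (first-touch) OpenMP version, and a version that
+restricts the computation to a single NUMA domain. Each program can also be
+built against MPI (the `*_mpi.exe` targets).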
+
+
+1. Load the required modules
+
+```bash
+module load GCC
+module load ParaStationMPI
+module load CMake
+```
+2. Compile with CMake
+
+```bash
+mkdir build
+cd build
+cmake ..
+make -j
+```
+
+3. Run on the cluster by submitting the job script below with `sbatch`
+
+```bash
+#!/bin/bash -x
+#SBATCH --account=slai
+#SBATCH --nodes=1
+#SBATCH --hint=nomultithread
+#SBATCH --ntasks=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=48
+#SBATCH --output=sum-out.%j
+#SBATCH --error=sum-err.%j
+#SBATCH --time=00:30:00
+#SBATCH --partition=batch
+
+export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK}
+export OMP_PLACES=sockets
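+# With OMP_PLACES=sockets each OpenMP place is one socket, so
+# omp_get_num_places() inside the programs returns the socket count.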
+
+echo "PURE OMP"
+
+echo "Impl,Problem size,Sum,number of threads,number of sockets,Time(s)"
+
+for T in {2..48..2}
+do
+export OMP_NUM_THREADS=${T}
+srun ./3_sum/3_sum_serial.exe ${T}
+done
+
+for T in {2..48..2}
+do
+export OMP_NUM_THREADS=${T}
+srun ./3_sum/3_sum_one_node.exe ${T}
+done
+
+for T in {2..48..2}
+do
+export OMP_NUM_THREADS=${T}
+srun ./3_sum/3_sum_one_node_numa_aware.exe ${T}
+done
+
+for T in {2..48..2}
+do
+export OMP_NUM_THREADS=${T}
+srun ./3_sum/3_sum_one_domain.exe ${T}
+done
+
+
+
+echo "MPI+OMP"
+
+echo "Impl,Problem size,Sum,number of threads,number of sockets,Time(s)"
+
+for T in {2..48..2}
+do
+export OMP_NUM_THREADS=${T}
+srun ./3_sum/3_sum_serial_mpi.exe ${T}
+done
+
+for T in {2..48..2}
+do
+export OMP_NUM_THREADS=${T}
+srun ./3_sum/3_sum_one_node_mpi.exe ${T}
+done
+
+for T in {2..48..2}
+do
+export OMP_NUM_THREADS=${T}
+srun ./3_sum/3_sum_one_node_numa_aware_mpi.exe ${T}
+done
+
+for T in {2..48..2}
+do
+export OMP_NUM_THREADS=${T}
+srun ./3_sum/3_sum_one_domain_mpi.exe ${T}
+done
+```
diff --git a/CMakeLists.txt b/CMakeLists.txt
index cf3e531..b700afd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,10 +10,11 @@ set(CMAKE_CXX_FLAGS_DEBUG "-g")
 set(CMAKE_CXX_FLAGS_RELEASE "-O3")
 
 find_package(OpenMP REQUIRED)
+find_package(MPI REQUIRED)
 
 ADD_SUBDIRECTORY(0_hello_world)
 ADD_SUBDIRECTORY(1_integral)
-ADD_SUBDIRECTORY(3_affinity_query)
+ADD_SUBDIRECTORY(3_sum)
 
 include(Dart)
 include(CPack)
-- 
GitLab