From cfd8ebca3888dfadeb424858d9a2a9646a120c9d Mon Sep 17 00:00:00 2001
From: Xinzhe Wu <wu7@juwels01.ib.juwels.fzj.de>
Date: Wed, 21 Oct 2020 00:26:19 +0200
Subject: [PATCH] update examples of sum

---
 3_affinity_query/CMakeLists.txt     |   3 -
 .../3_sum_mpi_omp.cpp               |  31 +++--
 3_sum/3_sum_one_domain.cpp          | 113 ++++++++++++++++++
 3_sum/3_sum_one_node.cpp            |  97 +++++++++++++++
 3_sum/3_sum_one_node_numa_aware.cpp | 101 ++++++++++++++++
 3_sum/3_sum_serial.cpp              |  56 +++++++++
 3_sum/CMakeLists.txt                |  34 ++++++
 3_sum/README.md                     |  96 +++++++++++++++
 CMakeLists.txt                      |   3 +-
 9 files changed, 521 insertions(+), 13 deletions(-)
 delete mode 100644 3_affinity_query/CMakeLists.txt
 rename 3_affinity_query/3_affinity_query.cpp => 3_sum/3_sum_mpi_omp.cpp (82%)
 create mode 100644 3_sum/3_sum_one_domain.cpp
 create mode 100644 3_sum/3_sum_one_node.cpp
 create mode 100644 3_sum/3_sum_one_node_numa_aware.cpp
 create mode 100644 3_sum/3_sum_serial.cpp
 create mode 100644 3_sum/CMakeLists.txt
 create mode 100644 3_sum/README.md

diff --git a/3_affinity_query/CMakeLists.txt b/3_affinity_query/CMakeLists.txt
deleted file mode 100644
index d46704f..0000000
--- a/3_affinity_query/CMakeLists.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-add_executable(3_affinity_query.exe 3_affinity_query.cpp)
-target_link_libraries(3_affinity_query.exe PUBLIC OpenMP::OpenMP_CXX)
-
diff --git a/3_affinity_query/3_affinity_query.cpp b/3_sum/3_sum_mpi_omp.cpp
similarity index 82%
rename from 3_affinity_query/3_affinity_query.cpp
rename to 3_sum/3_sum_mpi_omp.cpp
index e0ead30..cd6a4a5 100644
--- a/3_affinity_query/3_affinity_query.cpp
+++ b/3_sum/3_sum_mpi_omp.cpp
@@ -2,6 +2,8 @@
 #include <omp.h>
 #include <iostream>
 #include <chrono>
+#include <mpi.h>
+
 using namespace std::chrono;
 
 void socket_init(int socket_num)
@@ -33,22 +35,32 @@
 
 }
 
-int main()
+int main(int argc, char** argv)
 {
 
 /*
- * output: Size, Sum, serial time, NUMA domain time, node time, NUMA-aware time
+ * output: Type, Threads, Sockets, Size, Sum, serial time, NUMA domain time, node time, NUMA-aware time
 *
 */
+    MPI_Init(&argc,&argv);
+
+    int rank;
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
     int n_sockets, socket_num;
     int n_procs;
 
+    int num_thread;
+
+    num_thread = atoi(argv[1]);
+
     omp_set_nested(1);
 
     omp_set_max_active_levels(2);
 
     n_sockets = omp_get_num_places();
 
-//    printf("number of sockets = %d \n", n_sockets);
+    int thread_per_socket = num_thread / n_sockets;
 
     int size = 100000000;
@@ -71,7 +83,7 @@ int main()
 
     auto t = duration_cast<duration<double>>(t2 - t1);
 
-    printf("%d,%f,%f,", size, sum, t.count());
+    if (rank == 0) printf("MPI+OpenMP,%d,%d,%d,%f,%f,", num_thread, n_sockets, size, sum, t.count());
 
     delete []a;
 
@@ -91,7 +103,7 @@ int main()
         socket_num = omp_get_place_num();
         n_procs = omp_get_place_num_procs(socket_num);
         if(socket_num == 0){
-            #pragma omp parallel for reduction(+:sum) num_threads(n_procs)
+            #pragma omp parallel for reduction(+:sum) num_threads(thread_per_socket)
             for(int i = 0; i < size; i++){
                 sum += b[i];
             }
@@ -107,7 +119,7 @@ int main()
 
     t = duration_cast<duration<double>>(t2 - t1);
 
-    printf("%f,", t.count());
+    if (rank == 0) printf("%f,", t.count());
 
     delete [] b;
 
@@ -131,7 +143,7 @@ int main()
 
     t = duration_cast<duration<double>>(t2 - t1);
 
-    printf("%f,", t.count());
+    if (rank == 0) printf("%f,", t.count());
 
 //    printf("Node: Sum of array is : %f in %f seconds\n", sum, t.count());
 
@@ -158,9 +170,10 @@ int main()
 
     t = duration_cast<duration<double>>(t2 - t1);
 
-    printf("%f\n", t.count());
+    if (rank == 0) printf("%f\n", t.count());
 
     delete [] d;
 
-    return 0;
+    MPI_Finalize();
+
 }
diff --git a/3_sum/3_sum_one_domain.cpp b/3_sum/3_sum_one_domain.cpp
new file mode 100644
index 0000000..c01496f
--- /dev/null
+++ b/3_sum/3_sum_one_domain.cpp
@@ -0,0 +1,113 @@
+#include <stdio.h>
+#include <omp.h>
+#include <iostream>
+#include <chrono>
+
+#ifdef USE_MPI
+#include <mpi.h>
+#endif
+
+using namespace std::chrono;
+
+void socket_init(int socket_num)
+{
+    int n_procs;
+
+    n_procs = omp_get_place_num_procs(socket_num);
+    #pragma omp parallel num_threads(n_procs) proc_bind(close)
+    {
+        printf("Reporting in from socket %d, thread ID: %d\n",
+                socket_num,omp_get_thread_num() );
+    }
+}
+
+void numa_in_operations(int socket_num){
+
+    int n_procs;
+
+    n_procs = omp_get_place_num_procs(socket_num);
+
+    if(socket_num == 0){
+        #pragma omp parallel num_threads(n_procs)
+        {
+            printf("The first socket does the computation in parallel\n");
+        }
+    }else{
+        printf("The other sockets do nothing\n");
+    }
+
+}
+
+int main(int argc, char** argv)
+{
+
+    int rank;
+
+#ifdef USE_MPI
+    MPI_Init(&argc,&argv);
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+#else
+    rank = 0;
+#endif
+
+    int n_sockets, socket_num;
+    int n_procs;
+
+    int num_thread;
+
+    num_thread = atoi(argv[1]);
+
+    omp_set_nested(1);
+
+    omp_set_max_active_levels(2);
+
+    n_sockets = omp_get_num_places();
+
+    int thread_per_socket = num_thread / n_sockets;
+
+    int size = 100000000;
+
+    double *b = new double[size];
+
+    for(int i = 0; i < size; i++){
+        b[i] = i + 1;
+    }
+
+    double sum = 0;
+
+    auto t1 = high_resolution_clock::now();
+
+    #pragma omp parallel num_threads(n_sockets) shared(sum) private(socket_num, n_procs) proc_bind(spread)
+    {
+        socket_num = omp_get_place_num();
+        n_procs = omp_get_place_num_procs(socket_num);
+        if(socket_num == 0){
+            #pragma omp parallel for reduction(+:sum) num_threads(thread_per_socket)
+            for(int i = 0; i < size; i++){
+                sum += b[i];
+            }
+        }else{
+/*
+            printf("The other sockets do nothing\n");
+*/
+        }
+
+    }
+
+    auto t2 = high_resolution_clock::now();
+
+    auto t = duration_cast<duration<double>>(t2 - t1);
+
+    if(rank == 0)
+        std::cout << "OMP (1 DOMAIN)," << size << "," << sum << "," << num_thread << "," << n_sockets << "," << t.count() << std::endl;
+
+    delete [] b;
+
+#ifdef USE_MPI
+    MPI_Finalize();
+#else
+    return 0;
+#endif
+
+}
diff --git a/3_sum/3_sum_one_node.cpp b/3_sum/3_sum_one_node.cpp
new file mode 100644
index 0000000..3e0cbcc
--- /dev/null
+++ b/3_sum/3_sum_one_node.cpp
@@ -0,0 +1,97 @@
+#include <stdio.h>
+#include <omp.h>
+#include <iostream>
+#include <chrono>
+
+#ifdef USE_MPI
+#include <mpi.h>
+#endif
+
+using namespace std::chrono;
+
+void socket_init(int socket_num)
+{
+    int n_procs;
+
+    n_procs = omp_get_place_num_procs(socket_num);
+    #pragma omp parallel num_threads(n_procs) proc_bind(close)
+    {
+        printf("Reporting in from socket %d, thread ID: %d\n",
+                socket_num,omp_get_thread_num() );
+    }
+}
+
+void numa_in_operations(int socket_num){
+
+    int n_procs;
+
+    n_procs = omp_get_place_num_procs(socket_num);
+
+    if(socket_num == 0){
+        #pragma omp parallel num_threads(n_procs)
+        {
+            printf("The first socket does the computation in parallel\n");
+        }
+    }else{
+        printf("The other sockets do nothing\n");
+    }
+
+}
+
+int main(int argc, char** argv)
+{
+
+    int rank;
+
+#ifdef USE_MPI
+    MPI_Init(&argc,&argv);
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+#else
+    rank = 0;
+#endif
+
+    int n_sockets, socket_num;
+    int n_procs;
+
+    int num_thread;
+
+    num_thread = atoi(argv[1]);
+
+    n_sockets = omp_get_num_places();
+
+    int thread_per_socket = num_thread / n_sockets;
+
+    int size = 100000000;
+
+/*Node*/
+    double *c = new double[size];
+
+    for(int i = 0; i < size; i++){
+        c[i] = i + 1;
+    }
+
+    double sum = 0;
+
+    auto t1 = high_resolution_clock::now();
+
+    #pragma omp parallel for reduction(+:sum) num_threads(num_thread)
+    for(int i = 0; i < size; i++){
+        sum += c[i];
+    }
+
+    auto t2 = high_resolution_clock::now();
+
+    auto t = duration_cast<duration<double>>(t2 - t1);
+
+    if(rank == 0)
+        std::cout << "Naive OMP (1 NODE)," << size << "," << sum << "," << num_thread << "," << n_sockets << "," << t.count() << std::endl;
+
+    delete [] c;
+
+#ifdef USE_MPI
+    MPI_Finalize();
+#else
+    return 0;
+#endif
+
+}
diff --git a/3_sum/3_sum_one_node_numa_aware.cpp b/3_sum/3_sum_one_node_numa_aware.cpp
new file mode 100644
index 0000000..ffb5b03
--- /dev/null
+++ b/3_sum/3_sum_one_node_numa_aware.cpp
@@ -0,0 +1,101 @@
+#include <stdio.h>
+#include <omp.h>
+#include <iostream>
+#include <chrono>
+
+#ifdef USE_MPI
+#include <mpi.h>
+#endif
+
+using namespace std::chrono;
+
+void socket_init(int socket_num)
+{
+    int n_procs;
+
+    n_procs = omp_get_place_num_procs(socket_num);
+    #pragma omp parallel num_threads(n_procs) proc_bind(close)
+    {
+        printf("Reporting in from socket %d, thread ID: %d\n",
+                socket_num,omp_get_thread_num() );
+    }
+}
+
+void numa_in_operations(int socket_num){
+
+    int n_procs;
+
+    n_procs = omp_get_place_num_procs(socket_num);
+
+    if(socket_num == 0){
+        #pragma omp parallel num_threads(n_procs)
+        {
+            printf("The first socket does the computation in parallel\n");
+        }
+    }else{
+        printf("The other sockets do nothing\n");
+    }
+
+}
+
+int main(int argc, char** argv)
+{
+    int rank;
+
+#ifdef USE_MPI
+    MPI_Init(&argc,&argv);
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+#else
+    rank = 0;
+#endif
+
+    int n_sockets, socket_num;
+    int n_procs;
+
+    int num_thread;
+
+    num_thread = atoi(argv[1]);
+
+    omp_set_nested(1);
+
+    omp_set_max_active_levels(2);
+
+    n_sockets = omp_get_num_places();
+
+    int thread_per_socket = num_thread / n_sockets;
+
+    int size = 100000000;
+
+/*Node with NUMA-Aware*/
+    double *d = new double[size];
+
+    #pragma omp parallel for num_threads(num_thread)
+    for(int i = 0; i < size; i++){
+        d[i] = i + 1;
+    }
+
+    double sum = 0;
+
+    auto t1 = high_resolution_clock::now();
+
+    #pragma omp parallel for reduction(+:sum) num_threads(num_thread)
+    for(int i = 0; i < size; i++){
+        sum += d[i];
+    }
+
+    auto t2 = high_resolution_clock::now();
+
+    auto t = duration_cast<duration<double>>(t2 - t1);
+
+    if(rank == 0)
+        std::cout << "NUMA-aware OMP (1 NODE)," << size << "," << sum << "," << num_thread << "," << n_sockets << "," << t.count() << std::endl;
+
+    delete [] d;
+
+#ifdef USE_MPI
+    MPI_Finalize();
+#else
+    return 0;
+#endif
+
+}
diff --git a/3_sum/3_sum_serial.cpp b/3_sum/3_sum_serial.cpp
new file mode 100644
index 0000000..445ac38
--- /dev/null
+++ b/3_sum/3_sum_serial.cpp
@@ -0,0 +1,56 @@
+#include <stdio.h>
+#include <iostream>
+#include <chrono>
+
+#ifdef USE_MPI
+#include <mpi.h>
+#endif
+
+using namespace std::chrono;
+
+int main(int argc, char** argv)
+{
+
+    int rank;
+
+#ifdef USE_MPI
+    MPI_Init(&argc,&argv);
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+#else
+    rank = 0;
+#endif
+
+    int size = 100000000;
+
+/*Serial Sum*/
+    double *a = new double[size];
+
+    for(int i = 0; i < size; i++){
+        a[i] = i + 1;
+    }
+
+    double sum = 0;
+
+    auto t1 = high_resolution_clock::now();
+
+    for(int i = 0; i < size; i++){
+        sum += a[i];
+    }
+
+    auto t2 = high_resolution_clock::now();
+
+    auto t = duration_cast<duration<double>>(t2 - t1);
+
+    if(rank == 0)
"Serial," << size << "," << sum << "," << 1 << "," << 1 << "," << t.count() << std::endl; + + delete []a; + +#ifdef USE_MPI + MPI_Finalize(); +#else + return 0; +#endif + +} diff --git a/3_sum/CMakeLists.txt b/3_sum/CMakeLists.txt new file mode 100644 index 0000000..9d1ab2e --- /dev/null +++ b/3_sum/CMakeLists.txt @@ -0,0 +1,34 @@ +add_executable(3_sum_mpi_omp.exe 3_sum_mpi_omp.cpp) +target_link_libraries(3_sum_mpi_omp.exe PUBLIC OpenMP::OpenMP_CXX MPI::MPI_CXX) + +#### +add_executable(3_sum_serial.exe 3_sum_serial.cpp) + +add_executable(3_sum_one_node.exe 3_sum_one_node.cpp) +target_link_libraries(3_sum_one_node.exe PUBLIC OpenMP::OpenMP_CXX) + +add_executable(3_sum_one_node_numa_aware.exe 3_sum_one_node_numa_aware.cpp) +target_link_libraries(3_sum_one_node_numa_aware.exe PUBLIC OpenMP::OpenMP_CXX) + +add_executable(3_sum_one_domain.exe 3_sum_one_domain.cpp) +target_link_libraries(3_sum_one_domain.exe PUBLIC OpenMP::OpenMP_CXX) + + +#### +add_executable(3_sum_serial_mpi.exe 3_sum_serial.cpp) +target_link_libraries(3_sum_serial_mpi.exe PUBLIC MPI::MPI_CXX) +target_compile_definitions(3_sum_serial_mpi.exe PRIVATE USE_MPI=1 ) + +add_executable(3_sum_one_node_mpi.exe 3_sum_one_node.cpp) +target_link_libraries(3_sum_one_node_mpi.exe PUBLIC OpenMP::OpenMP_CXX MPI::MPI_CXX) +target_compile_definitions(3_sum_one_node_mpi.exe PRIVATE USE_MPI=1 ) + +add_executable(3_sum_one_node_numa_aware_mpi.exe 3_sum_one_node_numa_aware.cpp) +target_link_libraries(3_sum_one_node_numa_aware_mpi.exe PUBLIC OpenMP::OpenMP_CXX MPI::MPI_CXX) +target_compile_definitions(3_sum_one_node_numa_aware_mpi.exe PRIVATE USE_MPI=1 ) + +add_executable(3_sum_one_domain_mpi.exe 3_sum_one_domain.cpp) +target_link_libraries(3_sum_one_domain_mpi.exe PUBLIC OpenMP::OpenMP_CXX MPI::MPI_CXX) +target_compile_definitions(3_sum_one_domain_mpi.exe PRIVATE USE_MPI=1 ) + + diff --git a/3_sum/README.md b/3_sum/README.md new file mode 100644 index 0000000..035dd01 --- /dev/null +++ b/3_sum/README.md @@ -0,0 +1,96 @@ +# Multiple Implementation of SUM + + + +1. load the modules + +```bash +module load GCC +module load ParaStationMPI +module load CMake +``` +2. compile by CMake + +```bash +mkdir build +cd build +cmake .. +make -j +``` + +3. 
+3. Execute on the cluster
+
+```bash
+#!/bin/bash -x
+#SBATCH --account=slai
+#SBATCH --nodes=1
+#SBATCH --hint=nomultithread
+#SBATCH --ntasks=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=48
+#SBATCH --output=sum-out.%j
+#SBATCH --error=sum-err.%j
+#SBATCH --time=00:30:00
+#SBATCH --partition=batch
+
+export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK}
+export OMP_PLACES=sockets
+
+echo "PURE OMP"
+
+echo "Impl,Problem size,Sum,number of threads,number of sockets,Time(s)"
+
+for T in {2..48..2}
+do
+export OMP_NUM_THREADS=${T}
+srun ./3_sum/3_sum_serial.exe ${T}
+done
+
+for T in {2..48..2}
+do
+export OMP_NUM_THREADS=${T}
+srun ./3_sum/3_sum_one_node.exe ${T}
+done
+
+for T in {2..48..2}
+do
+export OMP_NUM_THREADS=${T}
+srun ./3_sum/3_sum_one_node_numa_aware.exe ${T}
+done
+
+for T in {2..48..2}
+do
+export OMP_NUM_THREADS=${T}
+srun ./3_sum/3_sum_one_domain.exe ${T}
+done
+
+
+echo "MPI+OMP"
+
+echo "Impl,Problem size,Sum,number of threads,number of sockets,Time(s)"
+
+for T in {2..48..2}
+do
+export OMP_NUM_THREADS=${T}
+srun ./3_sum/3_sum_serial_mpi.exe ${T}
+done
+
+for T in {2..48..2}
+do
+export OMP_NUM_THREADS=${T}
+srun ./3_sum/3_sum_one_node_mpi.exe ${T}
+done
+
+for T in {2..48..2}
+do
+export OMP_NUM_THREADS=${T}
+srun ./3_sum/3_sum_one_node_numa_aware_mpi.exe ${T}
+done
+
+for T in {2..48..2}
+do
+export OMP_NUM_THREADS=${T}
+srun ./3_sum/3_sum_one_domain_mpi.exe ${T}
+done
+```
diff --git a/CMakeLists.txt b/CMakeLists.txt
index cf3e531..b700afd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,10 +10,11 @@ set(CMAKE_CXX_FLAGS_DEBUG "-g")
 set(CMAKE_CXX_FLAGS_RELEASE "-O3")
 
 find_package(OpenMP REQUIRED)
+find_package(MPI REQUIRED)
 
 ADD_SUBDIRECTORY(0_hello_world)
 ADD_SUBDIRECTORY(1_integral)
-ADD_SUBDIRECTORY(3_affinity_query)
+ADD_SUBDIRECTORY(3_sum)
 
 include(Dart)
 include(CPack)
-- 
GitLab