diff --git a/3_affinity_query/CMakeLists.txt b/3_affinity_query/CMakeLists.txt
deleted file mode 100644
index d46704f911ec66a9405b16cd931083857b1e179b..0000000000000000000000000000000000000000
--- a/3_affinity_query/CMakeLists.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-add_executable(3_affinity_query.exe 3_affinity_query.cpp)
-target_link_libraries(3_affinity_query.exe PUBLIC OpenMP::OpenMP_CXX)
-
diff --git a/3_affinity_query/3_affinity_query.cpp b/3_sum/3_sum_mpi_omp.cpp
similarity index 82%
rename from 3_affinity_query/3_affinity_query.cpp
rename to 3_sum/3_sum_mpi_omp.cpp
index e0ead30aad1b7433c0eb8fca20333ca177fdd5b0..cd6a4a53a7382291c2e927e21ad1eef94aedeed9 100644
--- a/3_affinity_query/3_affinity_query.cpp
+++ b/3_sum/3_sum_mpi_omp.cpp
@@ -2,6 +2,8 @@
 #include <omp.h>
 #include <iostream>
 #include <chrono>
+#include <mpi.h>
+#include <cstdlib>
 using namespace std::chrono;
 
 void socket_init(int socket_num)
@@ -33,22 +35,32 @@ void numa_in_operations(int socket_num){
 
 }
 
-int main()
+int main(int argc, char** argv)
 {
     /*
-     * output: Size, Sum, serial time, NUMA domain time, node time, NUMA-aware time
+     * output: Type, Threads, Sockets, Size, Sum, serial time, NUMA domain time, node time, NUMA-aware time
      * */
 
+    MPI_Init(&argc,&argv);
+    // Every rank runs the full benchmark; only rank 0 prints the CSV row.
+    int rank;
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
     int n_sockets, socket_num;
     int n_procs;
 
+    int num_thread = (argc > 1) ? atoi(argv[1]) : 0;
+    // Missing or non-positive argument: fall back to the OpenMP default team size.
+    if (num_thread <= 0) num_thread = omp_get_max_threads();
+
     omp_set_nested(1);
 
     omp_set_max_active_levels(2);
 
     n_sockets = omp_get_num_places();
 
-// printf("number of sockets = %d \n", n_sockets);
+    int thread_per_socket = (n_sockets > 0 && num_thread >= n_sockets) ? num_thread / n_sockets : 1; /* n_sockets may be 0; never request 0 threads */
 
     int size = 100000000;
 
@@ -71,7 +83,7 @@ int main()
 
     auto t = duration_cast<duration<double>>(t2 - t1);
 
-    printf("%d,%f,%f,", size, sum, t.count());
+    if (rank == 0) printf("MPI+OpenMP,%d,%d,%d,%f,%f,", num_thread, n_sockets, size, sum, t.count());
 
     delete []a;
 
@@ -91,7 +103,7 @@ int main()
         socket_num = omp_get_place_num();
         n_procs = omp_get_place_num_procs(socket_num);
         if(socket_num == 0){
-            #pragma omp parallel for reduction(+:sum) num_threads(n_procs)
+            #pragma omp parallel for reduction(+:sum) num_threads(thread_per_socket)
             for(int i = 0; i < size; i++){
                 sum += b[i];
             }
@@ -107,7 +119,7 @@ int main()
 
     t = duration_cast<duration<double>>(t2 - t1);
 
-    printf("%f,", t.count());
+    if (rank == 0) printf("%f,", t.count());
 
     delete [] b;
 
@@ -131,7 +143,7 @@ int main()
 
     t = duration_cast<duration<double>>(t2 - t1);
 
-    printf("%f,", t.count());
+    if (rank == 0) printf("%f,", t.count());
 
 // printf("Node: Sum of array is : %f in %f seconds\n", sum, t.count());
 
@@ -158,9 +170,10 @@ int main()
 
     t = duration_cast<duration<double>>(t2 - t1);
 
-    printf("%f\n", t.count());
+    if (rank == 0) printf("%f\n", t.count());
 
     delete [] d;
 
-    return 0;
+    MPI_Finalize();
+
 }
diff --git a/3_sum/3_sum_one_domain.cpp b/3_sum/3_sum_one_domain.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c01496f27316cc3a95682fc6cd6bc8b7e7309aa9
--- /dev/null
+++ b/3_sum/3_sum_one_domain.cpp
@@ -0,0 +1,113 @@
+#include <stdio.h>
+#include <omp.h>
+#include <iostream>
+#include <chrono>
+#include <cstdlib>
+#ifdef USE_MPI
+#include <mpi.h>
+#endif
+
+using namespace std::chrono;
+// Prints one line per thread from a team sized to the processor count of place socket_num.
+void socket_init(int socket_num)
+{
+    int n_procs;
+
+    n_procs = omp_get_place_num_procs(socket_num);
+    #pragma omp parallel num_threads(n_procs) proc_bind(close)
+    {
+        printf("Reporting in from socket %d, thread ID: %d\n",
+                socket_num,omp_get_thread_num() );
+    }
+}
+// Place 0 prints from a parallel team; every other place prints a single message.
+void numa_in_operations(int socket_num){
+
+    int n_procs;
+
+    n_procs = omp_get_place_num_procs(socket_num);
+
+    if(socket_num == 0){
+        #pragma omp parallel num_threads(n_procs)
+        {
+            printf("The first socket does the computation in parallel\n");
+        }
+    }else{
+        printf("The other sockets do nothing\n");
+    }
+
+}
+// Times the sum of b[] computed by a nested team launched only from place 0; rank 0 prints one CSV row.
+int main(int argc, char** argv)
+{
+
+    int rank;
+
+#ifdef USE_MPI
+    MPI_Init(&argc,&argv);
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+#else
+    rank = 0;
+#endif
+
+    int n_sockets, socket_num;
+    int n_procs;
+
+    int num_thread = (argc > 1) ? atoi(argv[1]) : 0;
+    // Missing or non-positive argument: fall back to the OpenMP default team size.
+    if (num_thread <= 0) num_thread = omp_get_max_threads();
+
+    omp_set_nested(1);
+
+    omp_set_max_active_levels(2);
+
+    n_sockets = omp_get_num_places();
+    // omp_get_num_places() may be 0 when no place list is defined; never divide by it or request 0 threads.
+    int thread_per_socket = (n_sockets > 0 && num_thread >= n_sockets) ? num_thread / n_sockets : 1;
+
+    int size = 100000000;
+
+    double *b = new double[size];
+
+    for(int i = 0; i < size; i++){
+        b[i] = i + 1;
+    }
+
+    double sum = 0;
+
+    auto t1 = high_resolution_clock::now();
+
+    #pragma omp parallel num_threads(n_sockets) shared(sum) private(socket_num, n_procs) proc_bind(spread)
+    {
+        socket_num = omp_get_place_num();
+        n_procs = omp_get_place_num_procs(socket_num);
+        if(socket_num == 0){
+            #pragma omp parallel for reduction(+:sum) num_threads(thread_per_socket)
+            for(int i = 0; i < size; i++){
+                sum += b[i];
+            }
+        }else{
+/*
+            printf("The other sockets do nothing\n");
+*/
+        }
+
+    }
+
+    auto t2 = high_resolution_clock::now();
+
+    auto t = duration_cast<duration<double>>(t2 - t1);
+
+    if(rank == 0)
+        std::cout << "OMP (1 DOMAIN)," << size << "," << sum << "," << num_thread << "," << n_sockets << "," << t.count() << std::endl;
+
+    delete [] b;
+
+#ifdef USE_MPI
+    MPI_Finalize();
+#else
+    return 0;
+#endif
+
+}
diff --git a/3_sum/3_sum_one_node.cpp b/3_sum/3_sum_one_node.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3e0cbcc3ae9e7072ca3951efb3decbce128e76ad
--- /dev/null
+++ b/3_sum/3_sum_one_node.cpp
@@ -0,0 +1,97 @@
+#include <stdio.h>
+#include <omp.h>
+#include <iostream>
+#include <chrono>
+#include <cstdlib>
+#ifdef USE_MPI
+#include <mpi.h>
+#endif
+
+using namespace std::chrono;
+// Prints one line per thread from a team sized to the processor count of place socket_num.
+void socket_init(int socket_num)
+{
+    int n_procs;
+
+    n_procs = omp_get_place_num_procs(socket_num);
+    #pragma omp parallel num_threads(n_procs) proc_bind(close)
+    {
+        printf("Reporting in from socket %d, thread ID: %d\n",
+                socket_num,omp_get_thread_num() );
+    }
+}
+// Place 0 prints from a parallel team; every other place prints a single message.
+void numa_in_operations(int socket_num){
+
+    int n_procs;
+
+    n_procs = omp_get_place_num_procs(socket_num);
+
+    if(socket_num == 0){
+        #pragma omp parallel num_threads(n_procs)
+        {
+            printf("The first socket does the computation in parallel\n");
+        }
+    }else{
+        printf("The other sockets do nothing\n");
+    }
+
+}
+// Times a plain OpenMP reduction over c[] with num_thread threads; rank 0 prints one CSV row.
+int main(int argc, char** argv)
+{
+
+    int rank;
+
+#ifdef USE_MPI
+    MPI_Init(&argc,&argv);
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+#else
+    rank = 0;
+#endif
+
+    int n_sockets, socket_num;
+    int n_procs;
+
+    int num_thread = (argc > 1) ? atoi(argv[1]) : 0;
+    // Missing or non-positive argument: fall back to the OpenMP default team size.
+    if (num_thread <= 0) num_thread = omp_get_max_threads();
+
+    n_sockets = omp_get_num_places();
+    // omp_get_num_places() may be 0 when no place list is defined; never divide by it.
+    int thread_per_socket = (n_sockets > 0 && num_thread >= n_sockets) ? num_thread / n_sockets : 1;
+
+    int size = 100000000;
+
+/*Node*/
+    double *c = new double[size];
+
+    for(int i = 0; i < size; i++){
+        c[i] = i + 1;
+    }
+
+    double sum = 0;
+
+    auto t1 = high_resolution_clock::now();
+
+    #pragma omp parallel for reduction(+:sum) num_threads(num_thread)
+    for(int i = 0; i < size; i++){
+        sum += c[i];
+    }
+
+    auto t2 = high_resolution_clock::now();
+
+    auto t = duration_cast<duration<double>>(t2 - t1);
+
+    if(rank == 0)
+        std::cout << "Naive OMP (1 NODE)," << size << "," << sum << "," << num_thread << "," << n_sockets << "," << t.count() << std::endl;
+    delete [] c;
+
+#ifdef USE_MPI
+    MPI_Finalize();
+#else
+    return 0;
+#endif
+
+}
diff --git a/3_sum/3_sum_one_node_numa_aware.cpp b/3_sum/3_sum_one_node_numa_aware.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ffb5b03f65bcfab879d9dcd6e5cd2442d4d1ef19
--- /dev/null
+++ b/3_sum/3_sum_one_node_numa_aware.cpp
@@ -0,0 +1,101 @@
+#include <stdio.h>
+#include <omp.h>
+#include <iostream>
+#include <chrono>
+#include <cstdlib>
+#ifdef USE_MPI
+#include <mpi.h>
+#endif
+
+using namespace std::chrono;
+// Prints one line per thread from a team sized to the processor count of place socket_num.
+void socket_init(int socket_num)
+{
+    int n_procs;
+
+    n_procs = omp_get_place_num_procs(socket_num);
+    #pragma omp parallel num_threads(n_procs) proc_bind(close)
+    {
+        printf("Reporting in from socket %d, thread ID: %d\n",
+                socket_num,omp_get_thread_num() );
+    }
+}
+// Place 0 prints from a parallel team; every other place prints a single message.
+void numa_in_operations(int socket_num){
+
+    int n_procs;
+
+    n_procs = omp_get_place_num_procs(socket_num);
+
+    if(socket_num == 0){
+        #pragma omp parallel num_threads(n_procs)
+        {
+            printf("The first socket does the computation in parallel\n");
+        }
+    }else{
+        printf("The other sockets do nothing\n");
+    }
+
+}
+// Times an OpenMP reduction over d[], which is also initialised by a parallel loop; rank 0 prints one CSV row.
+int main(int argc, char** argv)
+{
+    int rank;
+
+#ifdef USE_MPI
+    MPI_Init(&argc,&argv);
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+#else
+    rank = 0;
+#endif
+
+    int n_sockets, socket_num;
+    int n_procs;
+
+    int num_thread = (argc > 1) ? atoi(argv[1]) : 0;
+    // Missing or non-positive argument: fall back to the OpenMP default team size.
+    if (num_thread <= 0) num_thread = omp_get_max_threads();
+
+    omp_set_nested(1);
+
+    omp_set_max_active_levels(2);
+
+    n_sockets = omp_get_num_places();
+    // omp_get_num_places() may be 0 when no place list is defined; never divide by it.
+    int thread_per_socket = (n_sockets > 0 && num_thread >= n_sockets) ? num_thread / n_sockets : 1;
+
+    int size = 100000000;
+
+/*Node with NUMA-Aware*/
+    double *d = new double[size];
+    // Initialise with the same team size as the timed sum -- presumably for first-touch page placement (confirm).
+    #pragma omp parallel for num_threads(num_thread)
+    for(int i = 0; i < size; i++){
+        d[i] = i + 1;
+    }
+
+    double sum = 0;
+
+    auto t1 = high_resolution_clock::now();
+
+    #pragma omp parallel for reduction(+:sum) num_threads(num_thread)
+    for(int i = 0; i < size; i++){
+        sum += d[i];
+    }
+
+    auto t2 = high_resolution_clock::now();
+
+    auto t = duration_cast<duration<double>>(t2 - t1);
+
+    if(rank == 0)
+        std::cout << "NUMA-aware OMP (1 NODE)," << size << "," << sum << "," << num_thread << "," << n_sockets << "," << t.count() << std::endl;
+    delete [] d;
+
+#ifdef USE_MPI
+    MPI_Finalize();
+#else
+    return 0;
+#endif
+
+}
diff --git a/3_sum/3_sum_serial.cpp b/3_sum/3_sum_serial.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..445ac38cfbc8079e24837d2d1bdc79a3ad25f83e
--- /dev/null
+++ b/3_sum/3_sum_serial.cpp
@@ -0,0 +1,56 @@
+#include <stdio.h>
+#include <iostream>
+#include <chrono>
+
+#ifdef USE_MPI
+#include <mpi.h>
+#endif
+
+using namespace std::chrono;
+// Times a single-threaded sum of a[]; rank 0 prints one CSV row.
+int main(int argc, char** argv)
+{
+
+    int rank;
+
+#ifdef USE_MPI
+    MPI_Init(&argc,&argv);
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+#else
+    rank = 0;
+#endif
+
+    int size = 100000000;
+
+/*Serial Sum*/
+    double *a = new double[size];
+
+    for(int i = 0; i < size; i++){
+        a[i] = i + 1;
+    }
+
+    double sum = 0;
+
+    auto t1 = high_resolution_clock::now();
+
+    for(int i = 0; i < size; i++){
+        sum += a[i];
+    }
+
+    auto t2 = high_resolution_clock::now();
+
+    auto t = duration_cast<duration<double>>(t2 - t1);
+
+    if(rank == 0)
+        std::cout << "Serial," << size << "," << sum << "," << 1 << "," << 1 << "," << t.count() << std::endl;
+
+    delete []a;
+
+#ifdef USE_MPI
+    MPI_Finalize();
+#else
+    return 0;
+#endif
+
+}
diff --git a/3_sum/CMakeLists.txt b/3_sum/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9d1ab2e5b730c20e10ee2a3e01eb1618be290010
--- /dev/null
+++ b/3_sum/CMakeLists.txt
@@ -0,0 +1,34 @@
+add_executable(3_sum_mpi_omp.exe 3_sum_mpi_omp.cpp)
+target_link_libraries(3_sum_mpi_omp.exe PUBLIC OpenMP::OpenMP_CXX MPI::MPI_CXX)
+
+####
+add_executable(3_sum_serial.exe 3_sum_serial.cpp)
+
+add_executable(3_sum_one_node.exe 3_sum_one_node.cpp)
+target_link_libraries(3_sum_one_node.exe PUBLIC OpenMP::OpenMP_CXX)
+
+add_executable(3_sum_one_node_numa_aware.exe 3_sum_one_node_numa_aware.cpp)
+target_link_libraries(3_sum_one_node_numa_aware.exe PUBLIC OpenMP::OpenMP_CXX)
+
+add_executable(3_sum_one_domain.exe 3_sum_one_domain.cpp)
+target_link_libraries(3_sum_one_domain.exe PUBLIC OpenMP::OpenMP_CXX)
+
+
+####
+add_executable(3_sum_serial_mpi.exe 3_sum_serial.cpp)
+target_link_libraries(3_sum_serial_mpi.exe PUBLIC MPI::MPI_CXX)
+target_compile_definitions(3_sum_serial_mpi.exe PRIVATE USE_MPI=1 )
+
+add_executable(3_sum_one_node_mpi.exe 3_sum_one_node.cpp)
+target_link_libraries(3_sum_one_node_mpi.exe PUBLIC OpenMP::OpenMP_CXX MPI::MPI_CXX)
+target_compile_definitions(3_sum_one_node_mpi.exe PRIVATE USE_MPI=1 )
+
+add_executable(3_sum_one_node_numa_aware_mpi.exe 3_sum_one_node_numa_aware.cpp)
+target_link_libraries(3_sum_one_node_numa_aware_mpi.exe PUBLIC OpenMP::OpenMP_CXX MPI::MPI_CXX)
+target_compile_definitions(3_sum_one_node_numa_aware_mpi.exe PRIVATE USE_MPI=1 )
+
+add_executable(3_sum_one_domain_mpi.exe 3_sum_one_domain.cpp)
+target_link_libraries(3_sum_one_domain_mpi.exe PUBLIC OpenMP::OpenMP_CXX MPI::MPI_CXX)
+target_compile_definitions(3_sum_one_domain_mpi.exe PRIVATE USE_MPI=1 )
+
+
diff --git a/3_sum/README.md b/3_sum/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..035dd01b1c16e80bbc13399d2eea48b6c1177c97
--- /dev/null
+++ b/3_sum/README.md
@@ -0,0 +1,96 @@
+# Multiple Implementations of SUM
+
+
+
+1. Load the modules
+
+```bash
+module load GCC
+module load ParaStationMPI
+module load CMake
+```
+2. Compile with CMake
+
+```bash
+mkdir build
+cd build
+cmake ..
+make -j
+```
+
+3. Run on the cluster
+
+```bash
+#!/bin/bash -x
+#SBATCH --account=slai
+#SBATCH --nodes=1
+#SBATCH --hint=nomultithread
+#SBATCH --ntasks=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=48
+#SBATCH --output=sum-out.%j
+#SBATCH --error=sum-err.%j
+#SBATCH --time=00:30:00
+#SBATCH --partition=batch
+
+export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK}
+export OMP_PLACES=sockets
+
+echo "PURE OMP"
+
+echo "Impl,Problem size,Sum,number of threads,number of sockets,Time(s)"
+
+for T in {2..48..2}
+do
+export OMP_NUM_THREADS=${T}
+srun ./3_sum/3_sum_serial.exe ${T}
+done
+
+for T in {2..48..2}
+do
+export OMP_NUM_THREADS=${T}
+srun ./3_sum/3_sum_one_node.exe ${T}
+done
+
+for T in {2..48..2}
+do
+export OMP_NUM_THREADS=${T}
+srun ./3_sum/3_sum_one_node_numa_aware.exe ${T}
+done
+
+for T in {2..48..2}
+do
+export OMP_NUM_THREADS=${T}
+srun ./3_sum/3_sum_one_domain.exe ${T}
+done
+
+
+
+echo "MPI+OMP"
+
+echo "Impl,Problem size,Sum,number of threads,number of sockets,Time(s)"
+
+for T in {2..48..2}
+do
+export OMP_NUM_THREADS=${T}
+srun ./3_sum/3_sum_serial_mpi.exe ${T}
+done
+
+for T in {2..48..2}
+do
+export OMP_NUM_THREADS=${T}
+srun ./3_sum/3_sum_one_node_mpi.exe ${T}
+done
+
+for T in {2..48..2}
+do
+export OMP_NUM_THREADS=${T}
+srun ./3_sum/3_sum_one_node_numa_aware_mpi.exe ${T}
+done
+
+for T in {2..48..2}
+do
+export OMP_NUM_THREADS=${T}
+srun ./3_sum/3_sum_one_domain_mpi.exe ${T}
+done
+```
diff --git a/CMakeLists.txt b/CMakeLists.txt
index cf3e5317f8644a2a65382865012f696ed78f7e6b..b700afdf463c290cd18df1884734a24087522bb5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,10 +10,11 @@ set(CMAKE_CXX_FLAGS_DEBUG "-g")
 set(CMAKE_CXX_FLAGS_RELEASE "-O3")
 
 find_package(OpenMP REQUIRED)
+find_package(MPI REQUIRED)
 
 ADD_SUBDIRECTORY(0_hello_world)
 ADD_SUBDIRECTORY(1_integral)
-ADD_SUBDIRECTORY(3_affinity_query)
+ADD_SUBDIRECTORY(3_sum)
 
 include(Dart)
 include(CPack)