From 255608ff74acd37de260f28f6cc0872c1a063a61 Mon Sep 17 00:00:00 2001
From: Sebastian Achilles <s.achilles@fz-juelich.de>
Date: Wed, 19 Apr 2023 12:06:56 +0200
Subject: [PATCH] add 02 MSA Ping-Pong

---
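Notes: a quick build-and-run sketch for this exercise, assuming the
JUWELS/JURECA module environment used below (adjust --account and the
partitions to your project if needed):

    cd 02-MSA-ping-pong
    ./compile_all.sh            # loads GCC, OpenMPI, MPI-settings/CUDA and runs "make all"
    sbatch job_msa_juwels.sh    # on JUWELS; use job_msa_jureca.sh on JURECA

The job scripts request a heterogeneous job (a CPU partition plus a GPU
partition), and the colon-separated srun line launches pp_cpu.out in the first
component and pp_cuda_aware.out in the second, so the two ranks of the
ping-pong run on different modules.
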
 02-MSA-ping-pong/Makefile          |  22 ++++++
 02-MSA-ping-pong/compile_all.sh    |   2 +
 02-MSA-ping-pong/job_msa_jureca.sh |  14 ++++
 02-MSA-ping-pong/job_msa_juwels.sh |  14 ++++
 02-MSA-ping-pong/ping-pong.c       |  93 ++++++++++++++++++++++++
 02-MSA-ping-pong/ping-pong.cu      | 109 +++++++++++++++++++++++++++++
 6 files changed, 254 insertions(+)
 create mode 100644 02-MSA-ping-pong/Makefile
 create mode 100755 02-MSA-ping-pong/compile_all.sh
 create mode 100644 02-MSA-ping-pong/job_msa_jureca.sh
 create mode 100644 02-MSA-ping-pong/job_msa_juwels.sh
 create mode 100644 02-MSA-ping-pong/ping-pong.c
 create mode 100644 02-MSA-ping-pong/ping-pong.cu

diff --git a/02-MSA-ping-pong/Makefile b/02-MSA-ping-pong/Makefile
new file mode 100644
index 0000000..0fbc367
--- /dev/null
+++ b/02-MSA-ping-pong/Makefile
@@ -0,0 +1,22 @@
+CUCOMP  = nvcc
+CUFLAGS = -arch=sm_80
+
+INCLUDES  = -I$(EBROOTOPENMPI)/include
+LIBRARIES = -L$(EBROOTOPENMPI)/lib -lmpi
+
+all: pp_cpu.out pp_cuda_aware.out
+
+pp_cpu.out: ping-pong.c
+	mpicc ping-pong.c -o pp_cpu.out
+
+pp_cuda_aware.out: ping-pong.o
+	$(CUCOMP) $(CUFLAGS) $(LIBRARIES) ping-pong.o -o pp_cuda_aware.out
+
+ping-pong.o: ping-pong.cu
+	$(CUCOMP) $(CUFLAGS) $(INCLUDES) -c ping-pong.cu
+
+.PHONY: clean
+
+clean:
+	rm -f pp_cpu.out pp_cuda_aware.out *.o
+
diff --git a/02-MSA-ping-pong/compile_all.sh b/02-MSA-ping-pong/compile_all.sh
new file mode 100755
index 0000000..1f677bb
--- /dev/null
+++ b/02-MSA-ping-pong/compile_all.sh
@@ -0,0 +1,2 @@
+ml GCC OpenMPI MPI-settings/CUDA
+make all
diff --git a/02-MSA-ping-pong/job_msa_jureca.sh b/02-MSA-ping-pong/job_msa_jureca.sh
new file mode 100644
index 0000000..467ff8c
--- /dev/null
+++ b/02-MSA-ping-pong/job_msa_jureca.sh
@@ -0,0 +1,14 @@
+#!/bin/bash -x
+#SBATCH --account=exalab
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --output=slurm-out.%j
+#SBATCH --error=slurm-err.%j
+#SBATCH --time=00:15:00
+#SBATCH --partition=dc-cpu-devel
+#SBATCH hetjob
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --partition=dc-gpu-devel
+
+srun xenv -P -L GCC -L OpenMPI -L MPI-settings/CUDA ./pp_cpu.out : xenv -P -L GCC -L OpenMPI -L MPI-settings/CUDA ./pp_cuda_aware.out
diff --git a/02-MSA-ping-pong/job_msa_juwels.sh b/02-MSA-ping-pong/job_msa_juwels.sh
new file mode 100644
index 0000000..5e6f8bb
--- /dev/null
+++ b/02-MSA-ping-pong/job_msa_juwels.sh
@@ -0,0 +1,14 @@
+#!/bin/bash -x
+#SBATCH --account=exalab
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --output=slurm-out.%j
+#SBATCH --error=slurm-err.%j
+#SBATCH --time=00:15:00
+#SBATCH --partition=devel
+#SBATCH hetjob
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --partition=develbooster
+
+srun xenv -P -L GCC -L OpenMPI -L MPI-settings/CUDA ./pp_cpu.out : xenv -P -L GCC -L OpenMPI -L MPI-settings/CUDA ./pp_cuda_aware.out
diff --git a/02-MSA-ping-pong/ping-pong.c b/02-MSA-ping-pong/ping-pong.c
new file mode 100644
index 0000000..29343e7
--- /dev/null
+++ b/02-MSA-ping-pong/ping-pong.c
@@ -0,0 +1,93 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <mpi.h>
+
+int main(int argc, char *argv[])
+{
+	/* -------------------------------------------------------------------------------------------
+		MPI Initialization 
+	--------------------------------------------------------------------------------------------*/
+	MPI_Init(&argc, &argv);
+
+	int size;
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+	MPI_Status stat;
+
+	if(size != 2){
+		if(rank == 0){
+			printf("This program requires exactly 2 MPI ranks, but you are attempting to use %d! Exiting...\n", size);
+		}
+		MPI_Finalize();
+		exit(0);
+	}
+
+	/* -------------------------------------------------------------------------------------------
+		Loop over message sizes from 8 B (one double) to 1 GiB (2^27 doubles)
+	--------------------------------------------------------------------------------------------*/
+
+	for(int i=0; i<=27; i++){
+
+		long int N = 1 << i;
+	
+		// Allocate memory for A on CPU
+		double *A = (double*)malloc(N*sizeof(double));
+
+		// Initialize all elements of A to 0.0
+		for(int j=0; j<N; j++){
+			A[j] = 0.0;
+		}
+	
+		int tag1 = 10;
+		int tag2 = 20;
+	
+		int loop_count = 50;
+
+		// Warm-up loop
+		for(int iter=1; iter<=5; iter++){
+			if(rank == 0){
+				MPI_Send(A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD);
+				MPI_Recv(A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat);
+			}
+			else if(rank == 1){
+				MPI_Recv(A, N, MPI_DOUBLE, 0, tag1, MPI_COMM_WORLD, &stat);
+				MPI_Send(A, N, MPI_DOUBLE, 0, tag2, MPI_COMM_WORLD);
+			}
+		}
+
+		// Time loop_count ping-pong iterations, each transferring 8*N bytes in each direction
+		double start_time, stop_time, elapsed_time;
+		start_time = MPI_Wtime();
+	
+		for(int iter=1; iter<=loop_count; iter++){
+			if(rank == 0){
+				MPI_Send(A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD);
+				MPI_Recv(A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat);
+			}
+			else if(rank == 1){
+				MPI_Recv(A, N, MPI_DOUBLE, 0, tag1, MPI_COMM_WORLD, &stat);
+				MPI_Send(A, N, MPI_DOUBLE, 0, tag2, MPI_COMM_WORLD);
+			}
+		}
+
+		stop_time = MPI_Wtime();
+		elapsed_time = stop_time - start_time;
+
+		long int num_B = 8*N;
+		long int B_in_GB = 1 << 30;
+		double num_GB = (double)num_B / (double)B_in_GB;
+		double avg_time_per_transfer = elapsed_time / (2.0*(double)loop_count);
+
+		if(rank == 0) printf("Transfer size (B): %10li, Transfer Time (s): %15.9f, Bandwidth (GB/s): %15.9f\n", num_B, avg_time_per_transfer, num_GB/avg_time_per_transfer );
+
+		free(A);
+	}
+
+	MPI_Finalize();
+
+	return 0;
+}
+
diff --git a/02-MSA-ping-pong/ping-pong.cu b/02-MSA-ping-pong/ping-pong.cu
new file mode 100644
index 0000000..ae313a4
--- /dev/null
+++ b/02-MSA-ping-pong/ping-pong.cu
@@ -0,0 +1,109 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <mpi.h>
+
+// Macro for checking errors in CUDA API calls
+#define cudaErrorCheck(call)                                                              \
+do{                                                                                       \
+	cudaError_t cuErr = call;                                                             \
+	if(cudaSuccess != cuErr){                                                             \
+		printf("CUDA Error - %s:%d: '%s'\n", __FILE__, __LINE__, cudaGetErrorString(cuErr));\
+		exit(1);                                                                            \
+	}                                                                                     \
+}while(0)
+
+
+int main(int argc, char *argv[])
+{
+	/* -------------------------------------------------------------------------------------------
+		MPI Initialization 
+	--------------------------------------------------------------------------------------------*/
+	MPI_Init(&argc, &argv);
+
+	int size;
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+	MPI_Status stat;
+
+	if(size != 2){
+		if(rank == 0){
+			printf("This program requires exactly 2 MPI ranks, but you are attempting to use %d! Exiting...\n", size);
+		}
+		MPI_Finalize();
+		exit(0);
+	}
+
+	/* -------------------------------------------------------------------------------------------
+		Loop over message sizes from 8 B (one double) to 1 GiB (2^27 doubles)
+	--------------------------------------------------------------------------------------------*/
+
+	for(int i=0; i<=27; i++){
+
+		long int N = 1 << i;
+	
+		// Allocate memory for A on CPU
+		double *A = (double*)malloc(N*sizeof(double));
+
+		// Initialize all elements of A to 0.0
+		for(int j=0; j<N; j++){
+			A[j] = 0.0;
+		}
+
+		double *d_A;
+		cudaErrorCheck( cudaMalloc(&d_A, N*sizeof(double)) );
+		cudaErrorCheck( cudaMemcpy(d_A, A, N*sizeof(double), cudaMemcpyHostToDevice) );
+	
+		int tag1 = 10;
+		int tag2 = 20;
+	
+		int loop_count = 50;
+
+		// Warm-up loop
+		for(int iter=1; iter<=5; iter++){
+			if(rank == 0){
+				MPI_Send(d_A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD);
+				MPI_Recv(d_A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat);
+			}
+			else if(rank == 1){
+				MPI_Recv(d_A, N, MPI_DOUBLE, 0, tag1, MPI_COMM_WORLD, &stat);
+				MPI_Send(d_A, N, MPI_DOUBLE, 0, tag2, MPI_COMM_WORLD);
+			}
+		}
+
+		// Time loop_count ping-pong iterations, each transferring 8*N bytes in each direction
+		double start_time, stop_time, elapsed_time;
+		start_time = MPI_Wtime();
+	
+		for(int iter=1; iter<=loop_count; iter++){
+			if(rank == 0){
+				MPI_Send(d_A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD);
+				MPI_Recv(d_A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat);
+			}
+			else if(rank == 1){
+				MPI_Recv(d_A, N, MPI_DOUBLE, 0, tag1, MPI_COMM_WORLD, &stat);
+				MPI_Send(d_A, N, MPI_DOUBLE, 0, tag2, MPI_COMM_WORLD);
+			}
+		}
+
+		stop_time = MPI_Wtime();
+		elapsed_time = stop_time - start_time;
+
+		long int num_B = 8*N;
+		long int B_in_GB = 1 << 30;
+		double num_GB = (double)num_B / (double)B_in_GB;
+		double avg_time_per_transfer = elapsed_time / (2.0*(double)loop_count);
+
+		if(rank == 0) printf("Transfer size (B): %10li, Transfer Time (s): %15.9f, Bandwidth (GB/s): %15.9f\n", num_B, avg_time_per_transfer, num_GB/avg_time_per_transfer );
+
+		cudaErrorCheck( cudaFree(d_A) );
+		free(A);
+	}
+
+	MPI_Finalize();
+
+	return 0;
+}
+
-- 
GitLab