From 3d892e12685f1296b0de7f097d12eed2d3d4789c Mon Sep 17 00:00:00 2001
From: Andreas Herten <a.herten@fz-juelich.de>
Date: Sat, 17 Jun 2023 14:55:31 +0200
Subject: [PATCH] Add TODOs

---
 03-MSA-ping-pong/README.md    |  8 +++++---
 03-MSA-ping-pong/ping-pong.cu | 25 +++++++++++++++++++++++++
 2 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/03-MSA-ping-pong/README.md b/03-MSA-ping-pong/README.md
index 75fef91..5d44932 100644
--- a/03-MSA-ping-pong/README.md
+++ b/03-MSA-ping-pong/README.md
@@ -1,14 +1,16 @@
 # MSA CPU-GPU Ping Pong
 
-We extend the simple previous examples to now use a heterogeneous job to send _ping pong_ messages between the job components of increasing size.
+Extending the previous examples, we now send _ping pong_ messages between the CPU memory of one node and the GPU memory of another node, using the heterogeneous job features of Slurm.
 
-Execute the following on JUWELS Booster
+TODOs in `ping-pong.cu` mark the places where pointers to GPU memory need to be used instead of pointers to CPU memory.
+
+After working on the TODOs, execute the following on JUWELS Booster to compile `ping-pong.cu` with the right modules:
 
 ```bash
 bash compile.sh
 ```
 
-Execute the following on JUWELS Cluster
+Execute the following on JUWELS Cluster to compile the CPU part of the application and submit a batch job:
 
 ```bash
 bash compile.sh
diff --git a/03-MSA-ping-pong/ping-pong.cu b/03-MSA-ping-pong/ping-pong.cu
index ae313a4..845b56c 100644
--- a/03-MSA-ping-pong/ping-pong.cu
+++ b/03-MSA-ping-pong/ping-pong.cu
@@ -52,24 +52,38 @@ int main(int argc, char *argv[])
 			A[i] = 0.0;
 		}
 
+		// TODO: Declare a pointer to double, d_A; allocate memory for d_A on the GPU; copy the contents of A to d_A
+		#ifdef SOLUTION
 		double *d_A;
 		cudaErrorCheck( cudaMalloc(&d_A, N*sizeof(double)) );
 		cudaErrorCheck( cudaMemcpy(d_A, A, N*sizeof(double), cudaMemcpyHostToDevice) );
+		#endif
 	
 		int tag1 = 10;
 		int tag2 = 20;
 	
 		int loop_count = 50;
 
+		// TODO: Use the GPU pointer d_A in the following MPI calls instead of A
 		// Warm-up loop
 		for(int i=1; i<=5; i++){
 			if(rank == 0){
+				#ifdef SOLUTION
 				MPI_Send(d_A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD);
 				MPI_Recv(d_A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat);
+				#else
+				MPI_Send(A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD);
+				MPI_Recv(A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat);
+				#endif
 			}
 			else if(rank == 1){
+				#ifdef SOLUTION
 				MPI_Recv(d_A, N, MPI_DOUBLE, 0, tag1, MPI_COMM_WORLD, &stat);
 				MPI_Send(d_A, N, MPI_DOUBLE, 0, tag2, MPI_COMM_WORLD);
+				#else
+				MPI_Recv(A, N, MPI_DOUBLE, 0, tag1, MPI_COMM_WORLD, &stat);
+				MPI_Send(A, N, MPI_DOUBLE, 0, tag2, MPI_COMM_WORLD);
+				#endif
 			}
 		}
 
@@ -77,14 +91,25 @@ int main(int argc, char *argv[])
 		double start_time, stop_time, elapsed_time;
 		start_time = MPI_Wtime();
 	
+		// TODO: Use the GPU pointer d_A in the following MPI calls instead of A
 		for(int i=1; i<=loop_count; i++){
 			if(rank == 0){
+				#ifdef SOLUTION
 				MPI_Send(d_A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD);
 				MPI_Recv(d_A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat);
+				#else
+				MPI_Send(A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD);
+				MPI_Recv(A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat);
+				#endif
 			}
 			else if(rank == 1){
+				#ifdef SOLUTION
 				MPI_Recv(d_A, N, MPI_DOUBLE, 0, tag1, MPI_COMM_WORLD, &stat);
 				MPI_Send(d_A, N, MPI_DOUBLE, 0, tag2, MPI_COMM_WORLD);
+				#else
+				MPI_Recv(A, N, MPI_DOUBLE, 0, tag1, MPI_COMM_WORLD, &stat);
+				MPI_Send(A, N, MPI_DOUBLE, 0, tag2, MPI_COMM_WORLD);
+				#endif
 			}
 		}
 
-- 
GitLab
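
For reference, below is a minimal standalone sketch of what the completed TODOs amount to: a device buffer is allocated with `cudaMalloc`, filled from the host array with `cudaMemcpy`, and then passed directly to `MPI_Send`/`MPI_Recv`, which requires a CUDA-aware MPI. The `SOLUTION` branches in the patch are enabled by defining the `SOLUTION` macro at compile time (e.g. by passing `-DSOLUTION` to the compiler); the exact build invocation and the buffer size below are illustrative assumptions, not taken from `compile.sh`. Error checking (the `cudaErrorCheck` macro used in the exercise) is omitted for brevity.

```cuda
// Sketch only: GPU-buffer ping pong between two ranks with CUDA-aware MPI.
// In the MSA exercise, the CPU-only counterpart runs in the other job
// component; here both branches use the device pointer, mirroring the
// SOLUTION branches of ping-pong.cu.
#include <mpi.h>
#include <cuda_runtime.h>
#include <cstdlib>

int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Status stat;

    const int N = 1 << 20;                      // illustrative message size
    double *A = (double *)malloc(N * sizeof(double));
    for (int i = 0; i < N; i++) A[i] = 0.0;

    // The TODO steps: declare a device pointer, allocate it on the GPU,
    // and copy the host buffer into it.
    double *d_A;
    cudaMalloc(&d_A, N * sizeof(double));
    cudaMemcpy(d_A, A, N * sizeof(double), cudaMemcpyHostToDevice);

    // Ping pong with the device pointer passed straight to MPI.
    if (rank == 0) {
        MPI_Send(d_A, N, MPI_DOUBLE, 1, 10, MPI_COMM_WORLD);
        MPI_Recv(d_A, N, MPI_DOUBLE, 1, 20, MPI_COMM_WORLD, &stat);
    } else if (rank == 1) {
        MPI_Recv(d_A, N, MPI_DOUBLE, 0, 10, MPI_COMM_WORLD, &stat);
        MPI_Send(d_A, N, MPI_DOUBLE, 0, 20, MPI_COMM_WORLD);
    }

    cudaFree(d_A);
    free(A);
    MPI_Finalize();
    return 0;
}
```

Without a CUDA-aware MPI, passing `d_A` to `MPI_Send`/`MPI_Recv` would fail; the buffer would then have to be staged through host memory with `cudaMemcpy` around each call.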