diff --git a/03-MSA-ping-pong/README.md b/03-MSA-ping-pong/README.md
index 75fef91442bbcf57676ed3620007a96ded5a29b6..5d44932ca6a1157c0b450e11abb83cb52946b760 100644
--- a/03-MSA-ping-pong/README.md
+++ b/03-MSA-ping-pong/README.md
@@ -1,14 +1,35 @@
 # MSA CPU-GPU Ping Pong
 
-We extend the simple previous examples to now use a heterogeneous job to send _ping pong_ messages between the job components of increasing size.
+Extending the previous examples, we now send ping pong messages between the CPU memory of one node and the GPU memory of another node, using the heterogeneous job features of Slurm.
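+
+The two job components are coupled in one heterogeneous Slurm job, with the CPU rank placed on JUWELS Cluster and the GPU rank on JUWELS Booster. As a rough, hypothetical sketch (the job script shipped with this example may differ; partition names, GPU count, and the script name below are placeholders), such a submission could look like:
+
+```bash
+# Hypothetical sketch of a heterogeneous submission; not the actual script of this example.
+# Component 0 requests a CPU node, component 1 a GPU node; components are separated by ":".
+sbatch --partition=batch   --nodes=1 --ntasks=1 : \
+       --partition=booster --nodes=1 --ntasks=1 --gres=gpu:1 \
+       job_script.sh
+```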
 
-Execute the following on JUWELS Booster
+TODOs in `ping-pong.cu` mark the places where the code needs to be changed to use pointers to GPU memory instead of pointers to CPU memory.
+
+After working through the TODOs, execute the following on JUWELS Booster to compile `ping-pong.cu` with the right modules:
 
 ```bash
 bash compile.sh
 ```
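+
+The script is expected to load a CUDA-aware MPI stack and build the GPU side of the ping pong. A hypothetical sketch of the kind of steps involved (module names, compiler wrappers, and flags are placeholders; the actual `compile.sh` in this directory is authoritative):
+
+```bash
+# Hypothetical sketch only; see compile.sh for the real commands.
+module load GCC ParaStationMPI CUDA        # load a CUDA-aware MPI toolchain (placeholder module names)
+nvcc -c ping-pong.cu -o ping-pong.o        # compile the CUDA source
+mpicxx ping-pong.o -o ping-pong -lcudart   # link against MPI and the CUDA runtime
+```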
 
-Execute the following on JUWELS Cluster
+Execute the following on JUWELS Cluster to compile the CPU part of the application and submit a batch job:
 
 ```bash
 bash compile.sh
diff --git a/03-MSA-ping-pong/ping-pong.cu b/03-MSA-ping-pong/ping-pong.cu
index ae313a40be8f8e87795f47c4d8523d63020542f2..845b56c8ebee3acae03c8c2f25eaf5b42bdfc4a1 100644
--- a/03-MSA-ping-pong/ping-pong.cu
+++ b/03-MSA-ping-pong/ping-pong.cu
@@ -52,24 +52,38 @@ int main(int argc, char *argv[])
 			A[i] = 0.0;
 		}
 
+		// TODO: Declare a double pointer, d_A; allocate memory for it on the GPU; copy the contents of A to d_A
+		#ifdef SOLUTION
 		double *d_A;
 		cudaErrorCheck( cudaMalloc(&d_A, N*sizeof(double)) );
 		cudaErrorCheck( cudaMemcpy(d_A, A, N*sizeof(double), cudaMemcpyHostToDevice) );
+		#endif
 	
 		int tag1 = 10;
 		int tag2 = 20;
 	
 		int loop_count = 50;
 
+		// TODO: Use the GPU pointer d_A in the following MPI calls instead of A
 		// Warm-up loop
 		for(int i=1; i<=5; i++){
 			if(rank == 0){
+				#ifdef SOLUTION
 				MPI_Send(d_A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD);
 				MPI_Recv(d_A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat);
+				#else
+				MPI_Send(A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD);
+				MPI_Recv(A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat);
+				#endif
 			}
 			else if(rank == 1){
+				#ifdef SOLUTION
 				MPI_Recv(d_A, N, MPI_DOUBLE, 0, tag1, MPI_COMM_WORLD, &stat);
 				MPI_Send(d_A, N, MPI_DOUBLE, 0, tag2, MPI_COMM_WORLD);
+				#else
+				MPI_Recv(A, N, MPI_DOUBLE, 0, tag1, MPI_COMM_WORLD, &stat);
+				MPI_Send(A, N, MPI_DOUBLE, 0, tag2, MPI_COMM_WORLD);
+				#endif
 			}
 		}
 
@@ -77,14 +91,25 @@ int main(int argc, char *argv[])
 		double start_time, stop_time, elapsed_time;
 		start_time = MPI_Wtime();
 	
+		// TODO: Use the GPU pointer d_A in the following MPI calls instead of A
 		for(int i=1; i<=loop_count; i++){
 			if(rank == 0){
+				#ifdef SOLUTION
 				MPI_Send(d_A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD);
 				MPI_Recv(d_A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat);
+				#else
+				MPI_Send(A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD);
+				MPI_Recv(A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat);
+				#endif
 			}
 			else if(rank == 1){
+				#ifdef SOLUTION
 				MPI_Recv(d_A, N, MPI_DOUBLE, 0, tag1, MPI_COMM_WORLD, &stat);
 				MPI_Send(d_A, N, MPI_DOUBLE, 0, tag2, MPI_COMM_WORLD);
+				#else
+				MPI_Recv(A, N, MPI_DOUBLE, 0, tag1, MPI_COMM_WORLD, &stat);
+				MPI_Send(A, N, MPI_DOUBLE, 0, tag2, MPI_COMM_WORLD);
+				#endif
 			}
 		}