diff --git a/03-MSA-ping-pong/Makefile b/03-MSA-ping-pong/.master/Makefile similarity index 100% rename from 03-MSA-ping-pong/Makefile rename to 03-MSA-ping-pong/.master/Makefile diff --git a/03-MSA-ping-pong/README.md b/03-MSA-ping-pong/.master/README.md similarity index 100% rename from 03-MSA-ping-pong/README.md rename to 03-MSA-ping-pong/.master/README.md diff --git a/03-MSA-ping-pong/compile.sh b/03-MSA-ping-pong/.master/compile.sh similarity index 100% rename from 03-MSA-ping-pong/compile.sh rename to 03-MSA-ping-pong/.master/compile.sh diff --git a/03-MSA-ping-pong/.master/copy.mk b/03-MSA-ping-pong/.master/copy.mk new file mode 100755 index 0000000000000000000000000000000000000000..bfd2eee41e88a5450ef1fa77b7056cbc0fcadb56 --- /dev/null +++ b/03-MSA-ping-pong/.master/copy.mk @@ -0,0 +1,34 @@ +#!/usr/bin/make -f +TASKDIR = ../tasks +SOLUTIONDIR = ../solutions + +PROCESSFILES = ping-pong.cu +COPYFILES = Makefile README.md job_msa_juwels.sh job_msa_jureca.sh compile.sh ping-pong.c + + +TASKPROCCESFILES = $(addprefix $(TASKDIR)/,$(PROCESSFILES)) +TASKCOPYFILES = $(addprefix $(TASKDIR)/,$(COPYFILES)) +SOLUTIONPROCCESFILES = $(addprefix $(SOLUTIONDIR)/,$(PROCESSFILES)) +SOLUTIONCOPYFILES = $(addprefix $(SOLUTIONDIR)/,$(COPYFILES)) + +.PHONY: all task +all: task +task: ${TASKPROCCESFILES} ${TASKCOPYFILES} ${SOLUTIONPROCCESFILES} ${SOLUTIONCOPYFILES} + + +${TASKPROCCESFILES}: $(PROCESSFILES) + mkdir -p $(TASKDIR)/ + cppp -USOLUTION $(notdir $@) $@ + +${SOLUTIONPROCCESFILES}: $(PROCESSFILES) + mkdir -p $(SOLUTIONDIR)/ + cppp -DSOLUTION $(notdir $@) $@ + + +${TASKCOPYFILES}: $(COPYFILES) + mkdir -p $(TASKDIR)/ + cp $(notdir $@) $@ + +${SOLUTIONCOPYFILES}: $(COPYFILES) + mkdir -p $(SOLUTIONDIR)/ + cp $(notdir $@) $@ \ No newline at end of file diff --git a/03-MSA-ping-pong/job_msa_jureca.sh b/03-MSA-ping-pong/.master/job_msa_jureca.sh similarity index 100% rename from 03-MSA-ping-pong/job_msa_jureca.sh rename to 03-MSA-ping-pong/.master/job_msa_jureca.sh diff --git a/03-MSA-ping-pong/job_msa_juwels.sh b/03-MSA-ping-pong/.master/job_msa_juwels.sh similarity index 100% rename from 03-MSA-ping-pong/job_msa_juwels.sh rename to 03-MSA-ping-pong/.master/job_msa_juwels.sh diff --git a/03-MSA-ping-pong/ping-pong.c b/03-MSA-ping-pong/.master/ping-pong.c similarity index 100% rename from 03-MSA-ping-pong/ping-pong.c rename to 03-MSA-ping-pong/.master/ping-pong.c diff --git a/03-MSA-ping-pong/ping-pong.cu b/03-MSA-ping-pong/.master/ping-pong.cu similarity index 100% rename from 03-MSA-ping-pong/ping-pong.cu rename to 03-MSA-ping-pong/.master/ping-pong.cu diff --git a/03-MSA-ping-pong/solutions/Makefile b/03-MSA-ping-pong/solutions/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..f7233d293b3d3bca9d4bced8af5ada4752477b75 --- /dev/null +++ b/03-MSA-ping-pong/solutions/Makefile @@ -0,0 +1,29 @@ +CUCOMP = nvcc +CUFLAGS = -arch=sm_80 + +ifdef EBROOTOPENMPI +MPI_HOME+=$(EBROOTOPENMPI) +endif +ifdef EBROOTPSMPI +MPI_HOME+=$(EBROOTPSMPI) +endif + +INCLUDES = -I$(MPI_HOME)/include +LIBRARIES = -L$(MPI_HOME)/lib -lmpi + +all: ping-pong.cpu.out ping-pong.gpu.out + +ping-pong.cpu.out: ping-pong.c + mpicc $< -o $@ + +ping-pong.gpu.out: ping-pong.gpu.o + $(CUCOMP) $(CUFLAGS) $(LIBRARIES) $< -o $@ + +ping-pong.gpu.o: ping-pong.cu + $(CUCOMP) $(CUFLAGS) $(INCLUDES) -c $< -o $@ + +.PHONY: clean + +clean: + rm -f ping-pong.cpu.out ping-pong.gpu.out *.o + diff --git a/03-MSA-ping-pong/solutions/README.md b/03-MSA-ping-pong/solutions/README.md new file mode 100644 index 
0000000000000000000000000000000000000000..5d44932ca6a1157c0b450e11abb83cb52946b760 --- /dev/null +++ b/03-MSA-ping-pong/solutions/README.md @@ -0,0 +1,20 @@ +# MSA CPU-GPU Ping Pong + +Extending the previous examples, we now send ping pong messages between the CPU memory of one node and the GPU memory of another node, using the heterogeneous job features of Slurm. + +TODOs in `ping-pong.cu` mark the places where pointers to GPU memory should be used instead of pointers to CPU memory. + +After working on the TODOs, execute the following on JUWELS Booster to compile `ping-pong.cu` with the right modules: + +```bash +bash compile.sh +``` + +Execute the following on JUWELS Cluster to compile the CPU part of the application and submit a batch job: + +```bash +bash compile.sh +sbatch job_msa_juwels.sh +``` + +Monitor your job with `squeue --me`. Once it has run through successfully, have a look at the output in `slurm-out.N`, with `N` being your job number. \ No newline at end of file diff --git a/03-MSA-ping-pong/solutions/compile.sh b/03-MSA-ping-pong/solutions/compile.sh new file mode 100755 index 0000000000000000000000000000000000000000..302fa09c0ea2489033882647ba3c7278dc198f96 --- /dev/null +++ b/03-MSA-ping-pong/solutions/compile.sh @@ -0,0 +1,11 @@ +if [[ "$SYSTEMNAME" == "juwelsbooster" ]]; then + echo "Building GPU-aware version for $SYSTEMNAME" + ml GCC ParaStationMPI MPI-settings/CUDA + make ping-pong.gpu.out +elif [[ "$SYSTEMNAME" == "juwels" ]]; then + echo "Building CPU version for $SYSTEMNAME" + ml GCC ParaStationMPI + make ping-pong.cpu.out +else + echo "The system $SYSTEMNAME is not supported!" +fi \ No newline at end of file diff --git a/03-MSA-ping-pong/solutions/job_msa_jureca.sh b/03-MSA-ping-pong/solutions/job_msa_jureca.sh new file mode 100644 index 0000000000000000000000000000000000000000..d328271e5fdf8031af52d77d9b9d1dc7911bdac9 --- /dev/null +++ b/03-MSA-ping-pong/solutions/job_msa_jureca.sh @@ -0,0 +1,14 @@ +#!/bin/bash -x +#SBATCH --account=exalab +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --output=slurm-out.%j +#SBATCH --error=slurm-err.%j +#SBATCH --time=00:15:00 +#SBATCH --partition=dc-cpu-devel +#SBATCH hetjob +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --partition=dc-gpu-devel + +srun xenv -P -L GCC -L ParaStationMPI ./ping-pong.cpu.out : xenv -P -L GCC -L ParaStationMPI -L MPI-settings/CUDA ./ping-pong.gpu.out diff --git a/03-MSA-ping-pong/solutions/job_msa_juwels.sh b/03-MSA-ping-pong/solutions/job_msa_juwels.sh new file mode 100644 index 0000000000000000000000000000000000000000..e6f2ba4bdb4fd07222f526aecf43a57f0040a878 --- /dev/null +++ b/03-MSA-ping-pong/solutions/job_msa_juwels.sh @@ -0,0 +1,14 @@ +#!/bin/bash -x +#SBATCH --account=training2317 +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --output=slurm-out.%j +#SBATCH --error=slurm-err.%j +#SBATCH --time=00:15:00 +#SBATCH --partition=devel +#SBATCH hetjob +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --partition=develbooster + +srun xenv -P -L GCC -L ParaStationMPI ./ping-pong.cpu.out : xenv -P -L GCC -L ParaStationMPI -L MPI-settings/CUDA ./ping-pong.gpu.out diff --git a/03-MSA-ping-pong/solutions/ping-pong.c b/03-MSA-ping-pong/solutions/ping-pong.c new file mode 100644 index 0000000000000000000000000000000000000000..29343e7ab69e5a8aef58a1ed12e8d95525cb0a33 --- /dev/null +++ b/03-MSA-ping-pong/solutions/ping-pong.c @@ -0,0 +1,93 @@ +#include <stdio.h> +#include <stdlib.h> +#include <mpi.h> + +int main(int argc, char *argv[]) +{ + /* 
------------------------------------------------------------------------------------------- + MPI Initialization + --------------------------------------------------------------------------------------------*/ + MPI_Init(&argc, &argv); + + int size; + MPI_Comm_size(MPI_COMM_WORLD, &size); + + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + MPI_Status stat; + + if(size != 2){ + if(rank == 0){ + printf("This program requires exactly 2 MPI ranks, but you are attempting to use %d! Exiting...\n", size); + } + MPI_Finalize(); + exit(0); + } + + /* ------------------------------------------------------------------------------------------- + Loop from 8 B to 1 GB + --------------------------------------------------------------------------------------------*/ + + for(int i=0; i<=27; i++){ + + long int N = 1 << i; + + // Allocate memory for A on CPU + double *A = (double*)malloc(N*sizeof(double)); + + // Initialize all elements of A to 0.0 + for(int i=0; i<N; i++){ + A[i] = 0.0; + } + + int tag1 = 10; + int tag2 = 20; + + int loop_count = 50; + + // Warm-up loop + for(int i=1; i<=5; i++){ + if(rank == 0){ + MPI_Send(A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD); + MPI_Recv(A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat); + } + else if(rank == 1){ + MPI_Recv(A, N, MPI_DOUBLE, 0, tag1, MPI_COMM_WORLD, &stat); + MPI_Send(A, N, MPI_DOUBLE, 0, tag2, MPI_COMM_WORLD); + } + } + + // Time ping-pong for loop_count iterations of data transfer size 8*N bytes + double start_time, stop_time, elapsed_time; + start_time = MPI_Wtime(); + + for(int i=1; i<=loop_count; i++){ + if(rank == 0){ + MPI_Send(A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD); + MPI_Recv(A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat); + } + else if(rank == 1){ + MPI_Recv(A, N, MPI_DOUBLE, 0, tag1, MPI_COMM_WORLD, &stat); + MPI_Send(A, N, MPI_DOUBLE, 0, tag2, MPI_COMM_WORLD); + } + } + + stop_time = MPI_Wtime(); + elapsed_time = stop_time - start_time; + + long int num_B = 8*N; + long int B_in_GB = 1 << 30; + double num_GB = (double)num_B / (double)B_in_GB; + double avg_time_per_transfer = elapsed_time / (2.0*(double)loop_count); + + if(rank == 0) printf("Transfer size (B): %10li, Transfer Time (s): %15.9f, Bandwidth (GB/s): %15.9f\n", num_B, avg_time_per_transfer, num_GB/avg_time_per_transfer ); + + free(A); + } + + MPI_Finalize(); + + return 0; +} + diff --git a/03-MSA-ping-pong/solutions/ping-pong.cu b/03-MSA-ping-pong/solutions/ping-pong.cu new file mode 100644 index 0000000000000000000000000000000000000000..2c64a6b2c89c4c5e4fd5e9d2b49d9aeceee535e2 --- /dev/null +++ b/03-MSA-ping-pong/solutions/ping-pong.cu @@ -0,0 +1,112 @@ +#include <stdio.h> +#include <stdlib.h> +#include <mpi.h> + +// Macro for checking errors in CUDA API calls +#define cudaErrorCheck(call) \ +do{ \ + cudaError_t cuErr = call; \ + if(cudaSuccess != cuErr){ \ + printf("CUDA Error - %s:%d: '%s'\n", __FILE__, __LINE__, cudaGetErrorString(cuErr));\ + exit(0); \ + } \ +}while(0) + + +int main(int argc, char *argv[]) +{ + /* ------------------------------------------------------------------------------------------- + MPI Initialization + --------------------------------------------------------------------------------------------*/ + MPI_Init(&argc, &argv); + + int size; + MPI_Comm_size(MPI_COMM_WORLD, &size); + + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + MPI_Status stat; + + if(size != 2){ + if(rank == 0){ + printf("This program requires exactly 2 MPI ranks, but you are attempting to use %d! 
Exiting...\n", size); + } + MPI_Finalize(); + exit(0); + } + + /* ------------------------------------------------------------------------------------------- + Loop from 8 B to 1 GB + --------------------------------------------------------------------------------------------*/ + + for(int i=0; i<=27; i++){ + + long int N = 1 << i; + + // Allocate memory for A on CPU + double *A = (double*)malloc(N*sizeof(double)); + + // Initialize all elements of A to 0.0 + for(int i=0; i<N; i++){ + A[i] = 0.0; + } + + // TODO: Create an empty double pointer, d_A; allocate d_A on the GPU; copy the content of A to d_A + double *d_A; + cudaErrorCheck( cudaMalloc(&d_A, N*sizeof(double)) ); + cudaErrorCheck( cudaMemcpy(d_A, A, N*sizeof(double), cudaMemcpyHostToDevice) ); + + int tag1 = 10; + int tag2 = 20; + + int loop_count = 50; + + // TODO: Use the GPU pointer d_A in the following MPI calls instead of A + // Warm-up loop + for(int i=1; i<=5; i++){ + if(rank == 0){ + MPI_Send(d_A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD); + MPI_Recv(d_A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat); + } + else if(rank == 1){ + MPI_Recv(d_A, N, MPI_DOUBLE, 0, tag1, MPI_COMM_WORLD, &stat); + MPI_Send(d_A, N, MPI_DOUBLE, 0, tag2, MPI_COMM_WORLD); + } + } + + // Time ping-pong for loop_count iterations of data transfer size 8*N bytes + double start_time, stop_time, elapsed_time; + start_time = MPI_Wtime(); + + // TODO: Use the GPU pointer d_A in the following MPI calls instead of A + for(int i=1; i<=loop_count; i++){ + if(rank == 0){ + MPI_Send(d_A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD); + MPI_Recv(d_A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat); + } + else if(rank == 1){ + MPI_Recv(d_A, N, MPI_DOUBLE, 0, tag1, MPI_COMM_WORLD, &stat); + MPI_Send(d_A, N, MPI_DOUBLE, 0, tag2, MPI_COMM_WORLD); + } + } + + stop_time = MPI_Wtime(); + elapsed_time = stop_time - start_time; + + long int num_B = 8*N; + long int B_in_GB = 1 << 30; + double num_GB = (double)num_B / (double)B_in_GB; + double avg_time_per_transfer = elapsed_time / (2.0*(double)loop_count); + + if(rank == 0) printf("Transfer size (B): %10li, Transfer Time (s): %15.9f, Bandwidth (GB/s): %15.9f\n", num_B, avg_time_per_transfer, num_GB/avg_time_per_transfer ); + + cudaErrorCheck( cudaFree(d_A) ); + free(A); + } + + MPI_Finalize(); + + return 0; +} + diff --git a/03-MSA-ping-pong/tasks/Makefile b/03-MSA-ping-pong/tasks/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..f7233d293b3d3bca9d4bced8af5ada4752477b75 --- /dev/null +++ b/03-MSA-ping-pong/tasks/Makefile @@ -0,0 +1,29 @@ +CUCOMP = nvcc +CUFLAGS = -arch=sm_80 + +ifdef EBROOTOPENMPI +MPI_HOME+=$(EBROOTOPENMPI) +endif +ifdef EBROOTPSMPI +MPI_HOME+=$(EBROOTPSMPI) +endif + +INCLUDES = -I$(MPI_HOME)/include +LIBRARIES = -L$(MPI_HOME)/lib -lmpi + +all: ping-pong.cpu.out ping-pong.gpu.out + +ping-pong.cpu.out: ping-pong.c + mpicc $< -o $@ + +ping-pong.gpu.out: ping-pong.gpu.o + $(CUCOMP) $(CUFLAGS) $(LIBRARIES) $< -o $@ + +ping-pong.gpu.o: ping-pong.cu + $(CUCOMP) $(CUFLAGS) $(INCLUDES) -c $< -o $@ + +.PHONY: clean + +clean: + rm -f ping-pong.cpu.out ping-pong.gpu.out *.o + diff --git a/03-MSA-ping-pong/tasks/README.md b/03-MSA-ping-pong/tasks/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5d44932ca6a1157c0b450e11abb83cb52946b760 --- /dev/null +++ b/03-MSA-ping-pong/tasks/README.md @@ -0,0 +1,20 @@ +# MSA CPU-GPU Ping Pong + +Extending the previous examples, we now send ping pong messages between the CPU memory of one node and the GPU memory of another node, using 
the heterogeneous job features of Slurm. + +TODOs in `ping-pong.cu` mark the places where pointers to GPU memory should be used instead of pointers to CPU memory. + +After working on the TODOs, execute the following on JUWELS Booster to compile `ping-pong.cu` with the right modules: + +```bash +bash compile.sh +``` + +Execute the following on JUWELS Cluster to compile the CPU part of the application and submit a batch job: + +```bash +bash compile.sh +sbatch job_msa_juwels.sh +``` + +Monitor your job with `squeue --me`. Once it has run through successfully, have a look at the output in `slurm-out.N`, with `N` being your job number. \ No newline at end of file diff --git a/03-MSA-ping-pong/tasks/compile.sh b/03-MSA-ping-pong/tasks/compile.sh new file mode 100755 index 0000000000000000000000000000000000000000..302fa09c0ea2489033882647ba3c7278dc198f96 --- /dev/null +++ b/03-MSA-ping-pong/tasks/compile.sh @@ -0,0 +1,11 @@ +if [[ "$SYSTEMNAME" == "juwelsbooster" ]]; then + echo "Building GPU-aware version for $SYSTEMNAME" + ml GCC ParaStationMPI MPI-settings/CUDA + make ping-pong.gpu.out +elif [[ "$SYSTEMNAME" == "juwels" ]]; then + echo "Building CPU version for $SYSTEMNAME" + ml GCC ParaStationMPI + make ping-pong.cpu.out +else + echo "The system $SYSTEMNAME is not supported!" +fi \ No newline at end of file diff --git a/03-MSA-ping-pong/tasks/job_msa_jureca.sh b/03-MSA-ping-pong/tasks/job_msa_jureca.sh new file mode 100644 index 0000000000000000000000000000000000000000..d328271e5fdf8031af52d77d9b9d1dc7911bdac9 --- /dev/null +++ b/03-MSA-ping-pong/tasks/job_msa_jureca.sh @@ -0,0 +1,14 @@ +#!/bin/bash -x +#SBATCH --account=exalab +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --output=slurm-out.%j +#SBATCH --error=slurm-err.%j +#SBATCH --time=00:15:00 +#SBATCH --partition=dc-cpu-devel +#SBATCH hetjob +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --partition=dc-gpu-devel + +srun xenv -P -L GCC -L ParaStationMPI ./ping-pong.cpu.out : xenv -P -L GCC -L ParaStationMPI -L MPI-settings/CUDA ./ping-pong.gpu.out diff --git a/03-MSA-ping-pong/tasks/job_msa_juwels.sh b/03-MSA-ping-pong/tasks/job_msa_juwels.sh new file mode 100644 index 0000000000000000000000000000000000000000..e6f2ba4bdb4fd07222f526aecf43a57f0040a878 --- /dev/null +++ b/03-MSA-ping-pong/tasks/job_msa_juwels.sh @@ -0,0 +1,14 @@ +#!/bin/bash -x +#SBATCH --account=training2317 +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --output=slurm-out.%j +#SBATCH --error=slurm-err.%j +#SBATCH --time=00:15:00 +#SBATCH --partition=devel +#SBATCH hetjob +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --partition=develbooster + +srun xenv -P -L GCC -L ParaStationMPI ./ping-pong.cpu.out : xenv -P -L GCC -L ParaStationMPI -L MPI-settings/CUDA ./ping-pong.gpu.out diff --git a/03-MSA-ping-pong/tasks/ping-pong.c b/03-MSA-ping-pong/tasks/ping-pong.c new file mode 100644 index 0000000000000000000000000000000000000000..29343e7ab69e5a8aef58a1ed12e8d95525cb0a33 --- /dev/null +++ b/03-MSA-ping-pong/tasks/ping-pong.c @@ -0,0 +1,93 @@ +#include <stdio.h> +#include <stdlib.h> +#include <mpi.h> + +int main(int argc, char *argv[]) +{ + /* ------------------------------------------------------------------------------------------- + MPI Initialization + --------------------------------------------------------------------------------------------*/ + MPI_Init(&argc, &argv); + + int size; + MPI_Comm_size(MPI_COMM_WORLD, &size); + + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + MPI_Status stat; + + if(size != 2){ + if(rank == 0){ + printf("This 
program requires exactly 2 MPI ranks, but you are attempting to use %d! Exiting...\n", size); + } + MPI_Finalize(); + exit(0); + } + + /* ------------------------------------------------------------------------------------------- + Loop from 8 B to 1 GB + --------------------------------------------------------------------------------------------*/ + + for(int i=0; i<=27; i++){ + + long int N = 1 << i; + + // Allocate memory for A on CPU + double *A = (double*)malloc(N*sizeof(double)); + + // Initialize all elements of A to 0.0 + for(int i=0; i<N; i++){ + A[i] = 0.0; + } + + int tag1 = 10; + int tag2 = 20; + + int loop_count = 50; + + // Warm-up loop + for(int i=1; i<=5; i++){ + if(rank == 0){ + MPI_Send(A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD); + MPI_Recv(A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat); + } + else if(rank == 1){ + MPI_Recv(A, N, MPI_DOUBLE, 0, tag1, MPI_COMM_WORLD, &stat); + MPI_Send(A, N, MPI_DOUBLE, 0, tag2, MPI_COMM_WORLD); + } + } + + // Time ping-pong for loop_count iterations of data transfer size 8*N bytes + double start_time, stop_time, elapsed_time; + start_time = MPI_Wtime(); + + for(int i=1; i<=loop_count; i++){ + if(rank == 0){ + MPI_Send(A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD); + MPI_Recv(A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat); + } + else if(rank == 1){ + MPI_Recv(A, N, MPI_DOUBLE, 0, tag1, MPI_COMM_WORLD, &stat); + MPI_Send(A, N, MPI_DOUBLE, 0, tag2, MPI_COMM_WORLD); + } + } + + stop_time = MPI_Wtime(); + elapsed_time = stop_time - start_time; + + long int num_B = 8*N; + long int B_in_GB = 1 << 30; + double num_GB = (double)num_B / (double)B_in_GB; + double avg_time_per_transfer = elapsed_time / (2.0*(double)loop_count); + + if(rank == 0) printf("Transfer size (B): %10li, Transfer Time (s): %15.9f, Bandwidth (GB/s): %15.9f\n", num_B, avg_time_per_transfer, num_GB/avg_time_per_transfer ); + + free(A); + } + + MPI_Finalize(); + + return 0; +} + diff --git a/03-MSA-ping-pong/tasks/ping-pong.cu b/03-MSA-ping-pong/tasks/ping-pong.cu new file mode 100644 index 0000000000000000000000000000000000000000..f0042aa96769e632fda2f778689e6e139b5e305d --- /dev/null +++ b/03-MSA-ping-pong/tasks/ping-pong.cu @@ -0,0 +1,109 @@ +#include <stdio.h> +#include <stdlib.h> +#include <mpi.h> + +// Macro for checking errors in CUDA API calls +#define cudaErrorCheck(call) \ +do{ \ + cudaError_t cuErr = call; \ + if(cudaSuccess != cuErr){ \ + printf("CUDA Error - %s:%d: '%s'\n", __FILE__, __LINE__, cudaGetErrorString(cuErr));\ + exit(0); \ + } \ +}while(0) + + +int main(int argc, char *argv[]) +{ + /* ------------------------------------------------------------------------------------------- + MPI Initialization + --------------------------------------------------------------------------------------------*/ + MPI_Init(&argc, &argv); + + int size; + MPI_Comm_size(MPI_COMM_WORLD, &size); + + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + MPI_Status stat; + + if(size != 2){ + if(rank == 0){ + printf("This program requires exactly 2 MPI ranks, but you are attempting to use %d! 
Exiting...\n", size); + } + MPI_Finalize(); + exit(0); + } + + /* ------------------------------------------------------------------------------------------- + Loop from 8 B to 1 GB + --------------------------------------------------------------------------------------------*/ + + for(int i=0; i<=27; i++){ + + long int N = 1 << i; + + // Allocate memory for A on CPU + double *A = (double*)malloc(N*sizeof(double)); + + // Initialize all elements of A to 0.0 + for(int i=0; i<N; i++){ + A[i] = 0.0; + } + + // TODO: Create an empty double pointer, d_A; allocate d_A on the GPU; copy the content of A to d_A + + int tag1 = 10; + int tag2 = 20; + + int loop_count = 50; + + // TODO: Use the GPU pointer d_A in the following MPI calls instead of A + // Warm-up loop + for(int i=1; i<=5; i++){ + if(rank == 0){ + MPI_Send(A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD); + MPI_Recv(A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat); + } + else if(rank == 1){ + MPI_Recv(A, N, MPI_DOUBLE, 0, tag1, MPI_COMM_WORLD, &stat); + MPI_Send(A, N, MPI_DOUBLE, 0, tag2, MPI_COMM_WORLD); + } + } + + // Time ping-pong for loop_count iterations of data transfer size 8*N bytes + double start_time, stop_time, elapsed_time; + start_time = MPI_Wtime(); + + // TODO: Use the GPU pointer d_A in the following MPI calls instead of A + for(int i=1; i<=loop_count; i++){ + if(rank == 0){ + MPI_Send(A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD); + MPI_Recv(A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat); + } + else if(rank == 1){ + MPI_Recv(A, N, MPI_DOUBLE, 0, tag1, MPI_COMM_WORLD, &stat); + MPI_Send(A, N, MPI_DOUBLE, 0, tag2, MPI_COMM_WORLD); + } + } + + stop_time = MPI_Wtime(); + elapsed_time = stop_time - start_time; + + long int num_B = 8*N; + long int B_in_GB = 1 << 30; + double num_GB = (double)num_B / (double)B_in_GB; + double avg_time_per_transfer = elapsed_time / (2.0*(double)loop_count); + + if(rank == 0) printf("Transfer size (B): %10li, Transfer Time (s): %15.9f, Bandwidth (GB/s): %15.9f\n", num_B, avg_time_per_transfer, num_GB/avg_time_per_transfer ); + + cudaErrorCheck( cudaFree(d_A) ); + free(A); + } + + MPI_Finalize(); + + return 0; +} +
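Note on the generation setup introduced by this change: `.master/copy.mk` produces both `tasks/` and `solutions/` from the master sources. Files listed in `PROCESSFILES` (here only `ping-pong.cu`) are run through `cppp`, a partial C preprocessor, with `-USOLUTION` for the task variant and `-DSOLUTION` for the solution variant, while files in `COPYFILES` are copied unchanged to both directories. The master `ping-pong.cu` itself is not part of this diff, so the snippet below is only a minimal sketch, inferred from the generated `tasks/ping-pong.cu` and `solutions/ping-pong.cu` above, of how its guarded region presumably looks.

```cuda
// Hypothetical excerpt of .master/ping-pong.cu (the master file is not shown in this diff).
// cppp -DSOLUTION keeps the SOLUTION branches (-> solutions/ping-pong.cu);
// cppp -USOLUTION drops them and keeps the #else branches (-> tasks/ping-pong.cu).
// The #ifdef/#else/#endif directives themselves are removed from both generated files.

// TODO: Create an empty double pointer, d_A; allocate d_A on the GPU; copy the content of A to d_A
#ifdef SOLUTION
double *d_A;
cudaErrorCheck( cudaMalloc(&d_A, N*sizeof(double)) );
cudaErrorCheck( cudaMemcpy(d_A, A, N*sizeof(double), cudaMemcpyHostToDevice) );
#endif

// TODO: Use the GPU pointer d_A in the following MPI calls instead of A
if(rank == 0){
#ifdef SOLUTION
    MPI_Send(d_A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD);
    MPI_Recv(d_A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat);
#else
    MPI_Send(A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD);
    MPI_Recv(A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat);
#endif
}
```

Assuming `cppp` is available in the environment, running `./copy.mk` (it is executable and carries a `#!/usr/bin/make -f` shebang) or `make -f copy.mk` inside `.master/` regenerates both directories; the default `all`/`task` target rebuilds the processed and copied files for tasks and solutions alike.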