diff --git a/02-MSA-hello-world-gpu/solutions/Makefile b/02-MSA-hello-world-gpu/solutions/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..c51d61d957d0decf1cb391bed05c7a34ddaac7e3
--- /dev/null
+++ b/02-MSA-hello-world-gpu/solutions/Makefile
@@ -0,0 +1,28 @@
+CUCOMP  = nvcc
+CUFLAGS = -arch=sm_80
+
+ifdef EBROOTOPENMPI
+MPI_HOME+=$(EBROOTOPENMPI)
+endif
+ifdef EBROOTPSMPI
+MPI_HOME+=$(EBROOTPSMPI)
+endif
+
+INCLUDES  = -I$(MPI_HOME)/include
+LIBRARIES = -L$(MPI_HOME)/lib -lmpi
+
+all: hello.cpu.out hello.gpu.out
+
+hello.cpu.out: hello-world.c
+	mpicc $< -o $@
+
+hello.gpu.out: hello.gpu.o
+	$(CUCOMP) $(CUFLAGS) $(LIBRARIES) $< -o $@
+
+hello.gpu.o: hello-world.cu
+	$(CUCOMP) $(CUFLAGS) $(INCLUDES) -c $< -o $@
+
+.PHONY: clean
+
+clean:
+	rm -f hello.cpu.out hello.gpu.out *.o
\ No newline at end of file
diff --git a/02-MSA-hello-world-gpu/solutions/README.md b/02-MSA-hello-world-gpu/solutions/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..dd4ba44ce5142d231041d64cd3af2de2ce6527be
--- /dev/null
+++ b/02-MSA-hello-world-gpu/solutions/README.md
@@ -0,0 +1,23 @@
+# MSA GPU Hello World
+
+Building on the previous exercise, this exercise uses the GPU-side `printf()` function to print "hello world!" from a kernel, where the second word is received directly from the CPU process.
+
+TODOs are included in `hello-world.c` and `hello-world.cu`, indicating how to implement the `MPI_Send()` / `MPI_Recv()` structure.
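+
+For orientation, the completed pair looks roughly as follows; the arguments (6 `MPI_CHAR` elements, ranks 0/1, tag 10, and the device buffer `d_payload`) are the ones used in the solution sources in this directory:
+
+```c
+// Rank 0 (CPU program, hello-world.c): send the six characters of "world!"
+MPI_Send(payload, 6, MPI_CHAR, 1, tag, MPI_COMM_WORLD);
+
+// Rank 1 (GPU program, hello-world.cu): receive straight into the device buffer,
+// relying on CUDA-aware MPI (enabled at run time via the MPI-settings/CUDA module)
+MPI_Recv(d_payload, 6, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &stat);
+```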
+
+Once the function calls are implemented, execute the following on JUWELS Booster:
+
+```bash
+bash compile.sh
+```
+(This is equivalent to calling `make` for the GPU part of the application with the right modules loaded.)
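+
+If you prefer to do this step by hand, it boils down to loading the modules used by `compile.sh` and calling the corresponding `make` target, roughly:
+
+```bash
+ml GCC CUDA ParaStationMPI
+make hello.gpu.out
+```
+
+(On JUWELS Cluster, the script loads `GCC ParaStationMPI` and builds `hello.cpu.out` instead.)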
+
+Then, execute the following on JUWELS Cluster:
+
+```bash
+bash compile.sh
+sbatch job_msa_juwels.sh
+```
+
+(This compiles the CPU part of the application and then submits a heterogeneous job to the batch queue.)
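+
+The heterogeneous job is expressed in `job_msa_juwels.sh` by two blocks of `#SBATCH` directives separated by `#SBATCH hetjob`, plus a single `srun` whose components are separated by `:`. Sketched in abbreviated form (without the `xenv` module wrappers):
+
+```bash
+# first component: JUWELS Cluster (CPU part)
+#SBATCH --partition=devel
+#SBATCH hetjob
+# second component: JUWELS Booster (GPU part)
+#SBATCH --partition=develbooster
+
+srun ./hello.cpu.out : ./hello.gpu.out
+```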
+
+Monitor your job with `squeue --me`. Once it has run through successfully, have a look at the output in `slurm-out.N`, with `N` being your job number.
\ No newline at end of file
diff --git a/02-MSA-hello-world-gpu/solutions/compile.sh b/02-MSA-hello-world-gpu/solutions/compile.sh
new file mode 100755
index 0000000000000000000000000000000000000000..4c9bb3e28b8370255cf2accdbd1cbd4f6ca290a5
--- /dev/null
+++ b/02-MSA-hello-world-gpu/solutions/compile.sh
@@ -0,0 +1,12 @@
+if [[ "$SYSTEMNAME" == "juwelsbooster" ]]; then
+	echo "Building for $SYSTEMNAME"
+	ml GCC CUDA ParaStationMPI
+	make hello.gpu.out
+elif [[ "$SYSTEMNAME" == "juwels" ]]; then
+	echo "Building for $SYSTEMNAME"
+	ml GCC ParaStationMPI
+	make hello.cpu.out
+else
+	echo "The system $SYSTEMNAME is not supported!"
+	echo "Please manually load the environment modules for compiler and MPI and compile with the Makefile"
+fi
\ No newline at end of file
diff --git a/02-MSA-hello-world-gpu/solutions/hello-world.c b/02-MSA-hello-world-gpu/solutions/hello-world.c
new file mode 100644
index 0000000000000000000000000000000000000000..562b47bb502a3361079267555b2350e9b626d574
--- /dev/null
+++ b/02-MSA-hello-world-gpu/solutions/hello-world.c
@@ -0,0 +1,35 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <mpi.h>
+
+int main(int argc, char** argv){
+    MPI_Init(&argc, &argv);
+
+    int size;
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    int rank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    MPI_Status stat;
+
+    if(size != 2){
+        if(rank == 0){
+            printf("This program requires exactly 2 MPI ranks, but you are attempting to use %d! Exiting...\n", size);
+        }
+        MPI_Finalize();
+        exit(0);
+    }
+    int tag = 10;
+    
+    const char *payload = "world!";
+
+    // TODO: Implement the MPI_Send call to send the six characters of "payload" to rank 1
+    if (rank == 0) {
+        MPI_Send(payload, 6, MPI_CHAR, 1, tag, MPI_COMM_WORLD);
+    }
+
+    printf("\n");
+    MPI_Finalize();
+    return 0;
+}
diff --git a/02-MSA-hello-world-gpu/solutions/hello-world.cu b/02-MSA-hello-world-gpu/solutions/hello-world.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7a2bf1688937b3b93b47e7c8f59ee320bbc452c7
--- /dev/null
+++ b/02-MSA-hello-world-gpu/solutions/hello-world.cu
@@ -0,0 +1,67 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <mpi.h>
+
+#define CUDA_RT_CALL(call)                                                                  \
+    {                                                                                       \
+        cudaError_t cudaStatus = call;                                                      \
+        if (cudaSuccess != cudaStatus)                                                      \
+            fprintf(stderr,                                                                 \
+                    "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "              \
+                    "with "                                                                 \
+                    "%s (%d).\n",                                                           \
+                    #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
+    }
+__global__ void hello(const char * payload){
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    
+    if (i == 0) {
+        printf("%s", payload);
+    }
+}
+
+int main(int argc, char** argv){
+    MPI_Init(&argc, &argv);
+
+    int size;
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    int rank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    MPI_Status stat;
+
+    if(size != 2){
+        if(rank == 0){
+            printf("This program requires exactly 2 MPI ranks, but you are attempting to use %d! Exiting...\n", size);
+        }
+        MPI_Finalize();
+        exit(0);
+    }
+    int tag = 10;
+
+    const char *payload = "hello ";
+
+    char * d_payload;
+    CUDA_RT_CALL( cudaMalloc((void**)&d_payload, 6) );
+    CUDA_RT_CALL( cudaMemcpy(d_payload, payload, 6, cudaMemcpyHostToDevice) );
+
+    hello<<<1, 1>>>(d_payload);
+
+    CUDA_RT_CALL( cudaPeekAtLastError() );
+    CUDA_RT_CALL( cudaDeviceSynchronize() );
+    
+    // TODO: Implement the MPI_Recv() call to receive the "payload" from rank 1 using directly "d_payload" as the target buffer on the GPU
+    if (rank == 1) {
+        MPI_Recv(d_payload, 6, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &stat);
+    }
+    
+    hello<<<1, 1>>>(d_payload);
+
+    CUDA_RT_CALL( cudaPeekAtLastError() );
+    CUDA_RT_CALL( cudaDeviceSynchronize() );
+    
+    printf("\n");
+    MPI_Finalize();
+    return 0;
+}
diff --git a/02-MSA-hello-world-gpu/solutions/job_msa_juwels.sh b/02-MSA-hello-world-gpu/solutions/job_msa_juwels.sh
new file mode 100644
index 0000000000000000000000000000000000000000..32bd3696ee2e173aaa8207ee2d0c3466e2ffafc6
--- /dev/null
+++ b/02-MSA-hello-world-gpu/solutions/job_msa_juwels.sh
@@ -0,0 +1,14 @@
+#!/bin/bash -x
+#SBATCH --account=training2317
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --output=slurm-out.%j
+#SBATCH --error=slurm-err.%j
+#SBATCH --time=00:15:00
+#SBATCH --partition=devel
+#SBATCH hetjob
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --partition=develbooster
+
+srun xenv -P -L GCC -L ParaStationMPI ./hello.cpu.out : xenv -P -L GCC -L ParaStationMPI -L MPI-settings/CUDA ./hello.gpu.out
diff --git a/02-MSA-hello-world-gpu/tasks/Makefile b/02-MSA-hello-world-gpu/tasks/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..c51d61d957d0decf1cb391bed05c7a34ddaac7e3
--- /dev/null
+++ b/02-MSA-hello-world-gpu/tasks/Makefile
@@ -0,0 +1,28 @@
+CUCOMP  = nvcc
+CUFLAGS = -arch=sm_80
+
+ifdef EBROOTOPENMPI
+MPI_HOME+=$(EBROOTOPENMPI)
+endif
+ifdef EBROOTPSMPI
+MPI_HOME+=$(EBROOTPSMPI)
+endif
+
+INCLUDES  = -I$(MPI_HOME)/include
+LIBRARIES = -L$(MPI_HOME)/lib -lmpi
+
+all: hello.cpu.out hello.gpu.out
+
+hello.cpu.out: hello-world.c
+	mpicc $< -o $@
+
+hello.gpu.out: hello.gpu.o
+	$(CUCOMP) $(CUFLAGS) $(LIBRARIES) $< -o $@
+
+hello.gpu.o: hello-world.cu
+	$(CUCOMP) $(CUFLAGS) $(INCLUDES) -c $< -o $@
+
+.PHONY: clean
+
+clean:
+	rm -f hello.cpu.out hello.gpu.out *.o
\ No newline at end of file
diff --git a/02-MSA-hello-world-gpu/tasks/README.md b/02-MSA-hello-world-gpu/tasks/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..dd4ba44ce5142d231041d64cd3af2de2ce6527be
--- /dev/null
+++ b/02-MSA-hello-world-gpu/tasks/README.md
@@ -0,0 +1,23 @@
+# MSA GPU Hello World
+
+Building on the previous exercise, this exercise uses the GPU-side `printf()` function to print "hello world!" from a kernel, where the second word is received directly from the CPU process.
+
+TODOs are included in `hello-world.c` and `hello-world.cu`, indicating how to implement the `MPI_Send()` / `MPI_Recv()` structure.
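+
+As a reminder (not the finished solution), the standard prototypes of the two calls are:
+
+```c
+int MPI_Send(const void *buf, int count, MPI_Datatype datatype,
+             int dest, int tag, MPI_Comm comm);
+int MPI_Recv(void *buf, int count, MPI_Datatype datatype,
+             int source, int tag, MPI_Comm comm, MPI_Status *status);
+```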
+
+Once the function calls are implemented, execute the following on JUWELS Booster:
+
+```bash
+bash compile.sh
+```
+(This is equivalent to calling `make` for the GPU part of the application with the right modules loaded.)
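+
+If you prefer to do this step by hand, it boils down to loading the modules used by `compile.sh` and calling the corresponding `make` target, roughly:
+
+```bash
+ml GCC CUDA ParaStationMPI
+make hello.gpu.out
+```
+
+(On JUWELS Cluster, the script loads `GCC ParaStationMPI` and builds `hello.cpu.out` instead.)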
+
+Then, execute the following on JUWELS Cluster:
+
+```bash
+bash compile.sh
+sbatch job_msa_juwels.sh
+```
+
+(This compiles the CPU part of the application and then submits a heterogeneous job to the batch queue.)
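+
+The heterogeneous job is expressed in `job_msa_juwels.sh` by two blocks of `#SBATCH` directives separated by `#SBATCH hetjob`, plus a single `srun` whose components are separated by `:`. Sketched in abbreviated form (without the `xenv` module wrappers):
+
+```bash
+# first component: JUWELS Cluster (CPU part)
+#SBATCH --partition=devel
+#SBATCH hetjob
+# second component: JUWELS Booster (GPU part)
+#SBATCH --partition=develbooster
+
+srun ./hello.cpu.out : ./hello.gpu.out
+```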
+
+Monitor your job with `squeue --me`. Once it has run through successfully, have a look at the output in `slurm-out.N`, with `N` being your job number.
\ No newline at end of file
diff --git a/02-MSA-hello-world-gpu/tasks/compile.sh b/02-MSA-hello-world-gpu/tasks/compile.sh
new file mode 100755
index 0000000000000000000000000000000000000000..4c9bb3e28b8370255cf2accdbd1cbd4f6ca290a5
--- /dev/null
+++ b/02-MSA-hello-world-gpu/tasks/compile.sh
@@ -0,0 +1,12 @@
+if [[ "$SYSTEMNAME" == "juwelsbooster" ]]; then
+	echo "Building for $SYSTEMNAME"
+	ml GCC CUDA ParaStationMPI
+	make hello.gpu.out
+elif [[ "$SYSTEMNAME" == "juwels" ]]; then
+	echo "Building for $SYSTEMNAME"
+	ml GCC ParaStationMPI
+	make hello.cpu.out
+else
+	echo "The system $SYSTEMNAME is not supported!"
+	echo "Please manually load the environment modules for compiler and MPI and compile with the Makefile"
+fi
\ No newline at end of file
diff --git a/02-MSA-hello-world-gpu/tasks/hello-world.c b/02-MSA-hello-world-gpu/tasks/hello-world.c
new file mode 100644
index 0000000000000000000000000000000000000000..4c6384d012b24c68208e62455915d4080dfd5fab
--- /dev/null
+++ b/02-MSA-hello-world-gpu/tasks/hello-world.c
@@ -0,0 +1,35 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <mpi.h>
+
+int main(int argc, char** argv){
+    MPI_Init(&argc, &argv);
+
+    int size;
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    int rank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    MPI_Status stat;
+
+    if(size != 2){
+        if(rank == 0){
+            printf("This program requires exactly 2 MPI ranks, but you are attempting to use %d! Exiting...\n", size);
+        }
+        MPI_Finalize();
+        exit(0);
+    }
+    int tag = 10;
+    
+    const char *payload = "world!";
+
+    // TODO: Implement the MPI_Send call to send the six characters of "payload" to rank 1
+    if (rank == 0) {
+        MPI_Send();
+    }
+
+    printf("\n");
+    MPI_Finalize();
+    return 0;
+}
diff --git a/02-MSA-hello-world-gpu/tasks/hello-world.cu b/02-MSA-hello-world-gpu/tasks/hello-world.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6f6e11b58f60ade0e16c9b2b66556266262c230f
--- /dev/null
+++ b/02-MSA-hello-world-gpu/tasks/hello-world.cu
@@ -0,0 +1,67 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <mpi.h>
+
+#define CUDA_RT_CALL(call)                                                                  \
+    {                                                                                       \
+        cudaError_t cudaStatus = call;                                                      \
+        if (cudaSuccess != cudaStatus)                                                      \
+            fprintf(stderr,                                                                 \
+                    "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "              \
+                    "with "                                                                 \
+                    "%s (%d).\n",                                                           \
+                    #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
+    }
+__global__ void hello(const char * payload){
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    
+    if (i == 0) {
+        printf("%s", payload);
+    }
+}
+
+int main(int argc, char** argv){
+    MPI_Init(&argc, &argv);
+
+    int size;
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    int rank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    MPI_Status stat;
+
+    if(size != 2){
+        if(rank == 0){
+            printf("This program requires exactly 2 MPI ranks, but you are attempting to use %d! Exiting...\n", size);
+        }
+        MPI_Finalize();
+        exit(0);
+    }
+    int tag = 10;
+
+    const char *payload = "hello ";
+
+    char * d_payload;
+    CUDA_RT_CALL( cudaMalloc((void**)&d_payload, 6) );
+    CUDA_RT_CALL( cudaMemcpy(d_payload, payload, 6, cudaMemcpyHostToDevice) );
+
+    hello<<<1, 1>>>(d_payload);
+
+    CUDA_RT_CALL( cudaPeekAtLastError() );
+    CUDA_RT_CALL( cudaDeviceSynchronize() );
+    
+    // TODO: Implement the MPI_Recv() call to receive the "payload" from rank 1 using directly "d_payload" as the target buffer on the GPU
+    if (rank == 1) {
+        MPI_Recv();
+    }
+    
+    hello<<<1, 1>>>(d_payload);
+
+    CUDA_RT_CALL( cudaPeekAtLastError() );
+    CUDA_RT_CALL( cudaDeviceSynchronize() );
+    
+    printf("\n");
+    MPI_Finalize();
+    return 0;
+}
diff --git a/02-MSA-hello-world-gpu/tasks/job_msa_juwels.sh b/02-MSA-hello-world-gpu/tasks/job_msa_juwels.sh
new file mode 100644
index 0000000000000000000000000000000000000000..32bd3696ee2e173aaa8207ee2d0c3466e2ffafc6
--- /dev/null
+++ b/02-MSA-hello-world-gpu/tasks/job_msa_juwels.sh
@@ -0,0 +1,14 @@
+#!/bin/bash -x
+#SBATCH --account=training2317
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --output=slurm-out.%j
+#SBATCH --error=slurm-err.%j
+#SBATCH --time=00:15:00
+#SBATCH --partition=devel
+#SBATCH hetjob
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --partition=develbooster
+
+srun xenv -P -L GCC -L ParaStationMPI ./hello.cpu.out : xenv -P -L GCC -L ParaStationMPI -L MPI-settings/CUDA ./hello.gpu.out