From 3d892e12685f1296b0de7f097d12eed2d3d4789c Mon Sep 17 00:00:00 2001
From: Andreas Herten <a.herten@fz-juelich.de>
Date: Sat, 17 Jun 2023 14:55:31 +0200
Subject: [PATCH] Add TODOs

---
 03-MSA-ping-pong/README.md    |  8 +++++---
 03-MSA-ping-pong/ping-pong.cu | 25 +++++++++++++++++++++++++
 2 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/03-MSA-ping-pong/README.md b/03-MSA-ping-pong/README.md
index 75fef91..5d44932 100644
--- a/03-MSA-ping-pong/README.md
+++ b/03-MSA-ping-pong/README.md
@@ -1,14 +1,16 @@
 # MSA CPU-GPU Ping Pong
 
-We extend the simple previous examples to now use a heterogeneous job to send _ping pong_ messages between the job components of increasing size.
+Extending the previous examples, we now send ping pong messages between the CPU memory of one node and the GPU memory of another node, using the heterogeneous job features of Slurm.
 
-Execute the following on JUWELS Booster
+TODOs in `ping-pong.cu` indicate points to implement pointers to GPU memory instead of CPU memory.
+
+After working on the TODOs, execute the following on JUWELS Booster to compile `ping-pong.cu` with the right modules.
 
 ```bash
 bash compile.sh
 ```
 
-Execute the following on JUWELS Cluster
+Execute the following on JUWELS Cluster to compile the CPU part of the application and submit a batch job:
 
 ```bash
 bash compile.sh
diff --git a/03-MSA-ping-pong/ping-pong.cu b/03-MSA-ping-pong/ping-pong.cu
index ae313a4..845b56c 100644
--- a/03-MSA-ping-pong/ping-pong.cu
+++ b/03-MSA-ping-pong/ping-pong.cu
@@ -52,24 +52,38 @@ int main(int argc, char *argv[])
         A[i] = 0.0;
     }
 
+    // TODO: Create an empty double pointer, d_A; allocate d_A on the GPU; copy the content of A to d_A
+    #ifdef SOLUTION
     double *d_A;
     cudaErrorCheck( cudaMalloc(&d_A, N*sizeof(double)) );
     cudaErrorCheck( cudaMemcpy(d_A, A, N*sizeof(double), cudaMemcpyHostToDevice) );
+    #endif
 
     int tag1 = 10;
     int tag2 = 20;
 
     int loop_count = 50;
 
+    // TODO: Use the GPU pointer d_A in the following MPI calls instead of A
     // Warm-up loop
     for(int i=1; i<=5; i++){
         if(rank == 0){
+            #ifdef SOLUTION
             MPI_Send(d_A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD);
             MPI_Recv(d_A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat);
+            #else
+            MPI_Send(A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD);
+            MPI_Recv(A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat);
+            #endif
         }
         else if(rank == 1){
+            #ifdef SOLUTION
             MPI_Recv(d_A, N, MPI_DOUBLE, 0, tag1, MPI_COMM_WORLD, &stat);
             MPI_Send(d_A, N, MPI_DOUBLE, 0, tag2, MPI_COMM_WORLD);
+            #else
+            MPI_Recv(A, N, MPI_DOUBLE, 0, tag1, MPI_COMM_WORLD, &stat);
+            MPI_Send(A, N, MPI_DOUBLE, 0, tag2, MPI_COMM_WORLD);
+            #endif
         }
     }
 
@@ -77,14 +91,25 @@ int main(int argc, char *argv[])
     double start_time, stop_time, elapsed_time;
     start_time = MPI_Wtime();
 
+    // TODO: Use the GPU pointer d_A in the following MPI calls instead of A
     for(int i=1; i<=loop_count; i++){
         if(rank == 0){
+            #ifdef SOLUTION
             MPI_Send(d_A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD);
             MPI_Recv(d_A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat);
+            #else
+            MPI_Send(A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD);
+            MPI_Recv(A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat);
+            #endif
         }
         else if(rank == 1){
+            #ifdef SOLUTION
             MPI_Recv(d_A, N, MPI_DOUBLE, 0, tag1, MPI_COMM_WORLD, &stat);
             MPI_Send(d_A, N, MPI_DOUBLE, 0, tag2, MPI_COMM_WORLD);
+            #else
+            MPI_Recv(A, N, MPI_DOUBLE, 0, tag1, MPI_COMM_WORLD, &stat);
+            MPI_Send(A, N, MPI_DOUBLE, 0, tag2, MPI_COMM_WORLD);
+            #endif
         }
     }
 
-- 
GitLab