diff --git a/02-MSA-hello-world-gpu/solutions/Makefile b/02-MSA-hello-world-gpu/solutions/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..c51d61d957d0decf1cb391bed05c7a34ddaac7e3
--- /dev/null
+++ b/02-MSA-hello-world-gpu/solutions/Makefile
@@ -0,0 +1,28 @@
+CUCOMP = nvcc
+CUFLAGS = -arch=sm_80
+
+ifdef EBROOTOPENMPI
+MPI_HOME+=$(EBROOTOPENMPI)
+endif
+ifdef EBROOTPSMPI
+MPI_HOME+=$(EBROOTPSMPI)
+endif
+
+INCLUDES = -I$(MPI_HOME)/include
+LIBRARIES = -L$(MPI_HOME)/lib -lmpi
+
+all: hello.cpu.out hello.gpu.out
+
+hello.cpu.out: hello-world.c
+	mpicc $< -o $@
+
+hello.gpu.out: hello.gpu.o
+	$(CUCOMP) $(CUFLAGS) $(LIBRARIES) $< -o $@
+
+hello.gpu.o: hello-world.cu
+	$(CUCOMP) $(CUFLAGS) $(INCLUDES) -c $< -o $@
+
+.PHONY: clean
+
+clean:
+	rm -f hello.cpu.out hello.gpu.out *.o
\ No newline at end of file
diff --git a/02-MSA-hello-world-gpu/solutions/README.md b/02-MSA-hello-world-gpu/solutions/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..dd4ba44ce5142d231041d64cd3af2de2ce6527be
--- /dev/null
+++ b/02-MSA-hello-world-gpu/solutions/README.md
@@ -0,0 +1,29 @@
+# MSA GPU Hello World
+
+Building on the previous exercise, this exercise uses the GPU-side `printf()` function to print "hello world!" from a kernel, where the second word is received directly from the CPU process.
+
+TODOs are included in `hello-world.c` and `hello-world.cu`, indicating how to implement the `MPI_Send()` / `MPI_Recv()` structure.
+
+Once the function calls are implemented, execute the following on JUWELS Booster
+
+```bash
+bash compile.sh
+```
+(This is equivalent to calling `make` for the GPU part of the application with the right modules loaded.)
+
+Also, execute the following on JUWELS Cluster
+
+```bash
+bash compile.sh
+sbatch job_msa_juwels.sh
+```
+
+(This compiles the CPU part of the application and then submits a heterogeneous job to the batch queue.)
+
+Monitor your job with `squeue --me`. Once it has run successfully, have a look at the output in `slurm-out.N`, with `N` being your job number.
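+
+If everything worked, `slurm-out.N` should contain output similar to the following (the interleaving of the two ranks' lines is not guaranteed, and the CPU rank only contributes an empty line):
+
+```
+hello world!
+```
\ No newline at end of file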
diff --git a/02-MSA-hello-world-gpu/solutions/compile.sh b/02-MSA-hello-world-gpu/solutions/compile.sh
new file mode 100755
index 0000000000000000000000000000000000000000..4c9bb3e28b8370255cf2accdbd1cbd4f6ca290a5
--- /dev/null
+++ b/02-MSA-hello-world-gpu/solutions/compile.sh
@@ -0,0 +1,12 @@
+if [[ "$SYSTEMNAME" == "juwelsbooster" ]]; then
+    echo "Building for $SYSTEMNAME"
+    ml GCC CUDA ParaStationMPI
+    make hello.gpu.out
+elif [[ "$SYSTEMNAME" == "juwels" ]]; then
+    echo "Building for $SYSTEMNAME"
+    ml GCC ParaStationMPI
+    make hello.cpu.out
+else
+    echo "The system $SYSTEMNAME is not supported!"
+    echo "Please manually load environment modules for compiler and MPI and compile with the Makefile."
+fi
\ No newline at end of file
diff --git a/02-MSA-hello-world-gpu/solutions/hello-world.c b/02-MSA-hello-world-gpu/solutions/hello-world.c
new file mode 100644
index 0000000000000000000000000000000000000000..562b47bb502a3361079267555b2350e9b626d574
--- /dev/null
+++ b/02-MSA-hello-world-gpu/solutions/hello-world.c
@@ -0,0 +1,37 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <mpi.h>
+
+int main(int argc, char** argv){
+    MPI_Init(&argc, &argv);
+
+    int size;
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    int rank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    MPI_Status stat;
+
+    if(size != 2){
+        if(rank == 0){
+            printf("This program requires exactly 2 MPI ranks, but you are attempting to use %d! Exiting...\n", size);
+        }
+        MPI_Finalize();
+        exit(0);
+    }
+    int tag = 10;
+
+    const char *payload = "world!";
+
+    // TODO: Implement the MPI_Send call to send the six characters of "payload" to rank 1
+    if (rank == 0) {
+        MPI_Send(payload, 6, MPI_CHAR, 1, tag, MPI_COMM_WORLD);
+    }
+
+    printf("\n");
+
+    MPI_Finalize();
+
+    return 0;
+}
diff --git a/02-MSA-hello-world-gpu/solutions/hello-world.cu b/02-MSA-hello-world-gpu/solutions/hello-world.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7a2bf1688937b3b93b47e7c8f59ee320bbc452c7
--- /dev/null
+++ b/02-MSA-hello-world-gpu/solutions/hello-world.cu
@@ -0,0 +1,74 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <mpi.h>
+
+#define CUDA_RT_CALL(call)                                                     \
+    {                                                                          \
+        cudaError_t cudaStatus = call;                                         \
+        if (cudaSuccess != cudaStatus)                                         \
+            fprintf(stderr,                                                    \
+                    "ERROR: CUDA RT call \"%s\" in line %d of file %s failed " \
+                    "with "                                                    \
+                    "%s (%d).\n",                                              \
+                    #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
+    }
+__global__ void hello(const char * payload){
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i == 0) {
+        printf("%s", payload);
+    }
+}
+
+int main(int argc, char** argv){
+    MPI_Init(&argc, &argv);
+
+    int size;
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    int rank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    MPI_Status stat;
+
+    if(size != 2){
+        if(rank == 0){
+            printf("This program requires exactly 2 MPI ranks, but you are attempting to use %d! Exiting...\n", size);
+        }
+        MPI_Finalize();
+        exit(0);
+    }
+    int tag = 10;
+
+    const char *payload = "hello ";
+
+    char * d_payload;
+    CUDA_RT_CALL( cudaMalloc((void**)&d_payload, 7) ); // 6 payload characters plus the trailing '\0'
+    CUDA_RT_CALL( cudaMemcpy(d_payload, payload, 7, cudaMemcpyHostToDevice) );
+
+    hello<<<1, 1>>>(d_payload);
+
+    CUDA_RT_CALL( cudaPeekAtLastError() );
+    CUDA_RT_CALL( cudaDeviceSynchronize() );
+
+    // TODO: Implement the MPI_Recv() call to receive the "payload" from rank 0, directly using "d_payload" as the target buffer on the GPU
+    if (rank == 1) {
+        MPI_Recv(d_payload, 6, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &stat); // overwrites the first 6 characters, keeping the '\0'
+    }
+
+    hello<<<1, 1>>>(d_payload);
+
+    CUDA_RT_CALL( cudaPeekAtLastError() );
+    CUDA_RT_CALL( cudaDeviceSynchronize() );
+
+    printf("\n");
+
+    CUDA_RT_CALL( cudaFree(d_payload) );
+    MPI_Finalize();
+
+    return 0;
+}
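+
+// Note: passing the device pointer "d_payload" directly to MPI_Recv() requires
+// a CUDA-aware MPI. For the GPU binary, the job script enables this by loading
+// the MPI-settings/CUDA module (see job_msa_juwels.sh).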
diff --git a/02-MSA-hello-world-gpu/solutions/job_msa_juwels.sh b/02-MSA-hello-world-gpu/solutions/job_msa_juwels.sh
new file mode 100644
index 0000000000000000000000000000000000000000..32bd3696ee2e173aaa8207ee2d0c3466e2ffafc6
--- /dev/null
+++ b/02-MSA-hello-world-gpu/solutions/job_msa_juwels.sh
@@ -0,0 +1,14 @@
+#!/bin/bash -x
+#SBATCH --account=training2317
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --output=slurm-out.%j
+#SBATCH --error=slurm-err.%j
+#SBATCH --time=00:15:00
+#SBATCH --partition=devel
+#SBATCH hetjob
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --partition=develbooster
+
+srun xenv -P -L GCC -L ParaStationMPI ./hello.cpu.out : xenv -P -L GCC -L ParaStationMPI -L MPI-settings/CUDA ./hello.gpu.out
diff --git a/02-MSA-hello-world-gpu/tasks/Makefile b/02-MSA-hello-world-gpu/tasks/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..c51d61d957d0decf1cb391bed05c7a34ddaac7e3
--- /dev/null
+++ b/02-MSA-hello-world-gpu/tasks/Makefile
@@ -0,0 +1,28 @@
+CUCOMP = nvcc
+CUFLAGS = -arch=sm_80
+
+ifdef EBROOTOPENMPI
+MPI_HOME+=$(EBROOTOPENMPI)
+endif
+ifdef EBROOTPSMPI
+MPI_HOME+=$(EBROOTPSMPI)
+endif
+
+INCLUDES = -I$(MPI_HOME)/include
+LIBRARIES = -L$(MPI_HOME)/lib -lmpi
+
+all: hello.cpu.out hello.gpu.out
+
+hello.cpu.out: hello-world.c
+	mpicc $< -o $@
+
+hello.gpu.out: hello.gpu.o
+	$(CUCOMP) $(CUFLAGS) $(LIBRARIES) $< -o $@
+
+hello.gpu.o: hello-world.cu
+	$(CUCOMP) $(CUFLAGS) $(INCLUDES) -c $< -o $@
+
+.PHONY: clean
+
+clean:
+	rm -f hello.cpu.out hello.gpu.out *.o
\ No newline at end of file
diff --git a/02-MSA-hello-world-gpu/tasks/README.md b/02-MSA-hello-world-gpu/tasks/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..dd4ba44ce5142d231041d64cd3af2de2ce6527be
--- /dev/null
+++ b/02-MSA-hello-world-gpu/tasks/README.md
@@ -0,0 +1,30 @@
+# MSA GPU Hello World
+
+Building on the previous exercise, this exercise uses the GPU-side `printf()` function to print "hello world!" from a kernel, where the second word is received directly from the CPU process.
+
+TODOs are included in `hello-world.c` and `hello-world.cu`, indicating how to implement the `MPI_Send()` / `MPI_Recv()` structure.
+
+Once the function calls are implemented, execute the following on JUWELS Booster
+
+```bash
+bash compile.sh
+```
+(This is equivalent to calling `make` for the GPU part of the application with the right modules loaded.)
+
+Also, execute the following on JUWELS Cluster
+
+```bash
+bash compile.sh
+sbatch job_msa_juwels.sh
+```
+
+(This compiles the CPU part of the application and then submits a heterogeneous job to the batch queue.)
+
+Monitor your job with `squeue --me`. Once it has run successfully, have a look at the output in `slurm-out.N`, with `N` being your job number.
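+
+As a reminder, the two MPI calls to be implemented have the following signatures:
+
+```c
+int MPI_Send(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm);
+int MPI_Recv(void *buf, int count, MPI_Datatype datatype, int source, int tag, MPI_Comm comm, MPI_Status *status);
+```
\ No newline at end of file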
diff --git a/02-MSA-hello-world-gpu/tasks/compile.sh b/02-MSA-hello-world-gpu/tasks/compile.sh
new file mode 100755
index 0000000000000000000000000000000000000000..4c9bb3e28b8370255cf2accdbd1cbd4f6ca290a5
--- /dev/null
+++ b/02-MSA-hello-world-gpu/tasks/compile.sh
@@ -0,0 +1,12 @@
+if [[ "$SYSTEMNAME" == "juwelsbooster" ]]; then
+    echo "Building for $SYSTEMNAME"
+    ml GCC CUDA ParaStationMPI
+    make hello.gpu.out
+elif [[ "$SYSTEMNAME" == "juwels" ]]; then
+    echo "Building for $SYSTEMNAME"
+    ml GCC ParaStationMPI
+    make hello.cpu.out
+else
+    echo "The system $SYSTEMNAME is not supported!"
+    echo "Please manually load environment modules for compiler and MPI and compile with the Makefile."
+fi
\ No newline at end of file
diff --git a/02-MSA-hello-world-gpu/tasks/hello-world.c b/02-MSA-hello-world-gpu/tasks/hello-world.c
new file mode 100644
index 0000000000000000000000000000000000000000..4c6384d012b24c68208e62455915d4080dfd5fab
--- /dev/null
+++ b/02-MSA-hello-world-gpu/tasks/hello-world.c
@@ -0,0 +1,37 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <mpi.h>
+
+int main(int argc, char** argv){
+    MPI_Init(&argc, &argv);
+
+    int size;
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    int rank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    MPI_Status stat;
+
+    if(size != 2){
+        if(rank == 0){
+            printf("This program requires exactly 2 MPI ranks, but you are attempting to use %d! Exiting...\n", size);
+        }
+        MPI_Finalize();
+        exit(0);
+    }
+    int tag = 10;
+
+    const char *payload = "world!";
+
+    // TODO: Implement the MPI_Send call to send the six characters of "payload" to rank 1
+    if (rank == 0) {
+        MPI_Send();
+    }
+
+    printf("\n");
+
+    MPI_Finalize();
+
+    return 0;
+}
diff --git a/02-MSA-hello-world-gpu/tasks/hello-world.cu b/02-MSA-hello-world-gpu/tasks/hello-world.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6f6e11b58f60ade0e16c9b2b66556266262c230f
--- /dev/null
+++ b/02-MSA-hello-world-gpu/tasks/hello-world.cu
@@ -0,0 +1,70 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <mpi.h>
+
+#define CUDA_RT_CALL(call)                                                     \
+    {                                                                          \
+        cudaError_t cudaStatus = call;                                         \
+        if (cudaSuccess != cudaStatus)                                         \
+            fprintf(stderr,                                                    \
+                    "ERROR: CUDA RT call \"%s\" in line %d of file %s failed " \
+                    "with "                                                    \
+                    "%s (%d).\n",                                              \
+                    #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
+    }
+__global__ void hello(const char * payload){
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i == 0) {
+        printf("%s", payload);
+    }
+}
+
+int main(int argc, char** argv){
+    MPI_Init(&argc, &argv);
+
+    int size;
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    int rank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    MPI_Status stat;
+
+    if(size != 2){
+        if(rank == 0){
+            printf("This program requires exactly 2 MPI ranks, but you are attempting to use %d! Exiting...\n", size);
+        }
+        MPI_Finalize();
+        exit(0);
+    }
+    int tag = 10;
+
+    const char *payload = "hello ";
+
+    char * d_payload;
+    CUDA_RT_CALL( cudaMalloc((void**)&d_payload, 7) ); // 6 payload characters plus the trailing '\0'
+    CUDA_RT_CALL( cudaMemcpy(d_payload, payload, 7, cudaMemcpyHostToDevice) );
+
+    hello<<<1, 1>>>(d_payload);
+
+    CUDA_RT_CALL( cudaPeekAtLastError() );
+    CUDA_RT_CALL( cudaDeviceSynchronize() );
+
+    // TODO: Implement the MPI_Recv() call to receive the "payload" from rank 0, directly using "d_payload" as the target buffer on the GPU
+    if (rank == 1) {
+        MPI_Recv();
+    }
+
+    hello<<<1, 1>>>(d_payload);
+
+    CUDA_RT_CALL( cudaPeekAtLastError() );
+    CUDA_RT_CALL( cudaDeviceSynchronize() );
+
+    printf("\n");
+
+    CUDA_RT_CALL( cudaFree(d_payload) );
+    MPI_Finalize();
+
+    return 0;
+}
diff --git a/02-MSA-hello-world-gpu/tasks/job_msa_juwels.sh b/02-MSA-hello-world-gpu/tasks/job_msa_juwels.sh
new file mode 100644
index 0000000000000000000000000000000000000000..32bd3696ee2e173aaa8207ee2d0c3466e2ffafc6
--- /dev/null
+++ b/02-MSA-hello-world-gpu/tasks/job_msa_juwels.sh
@@ -0,0 +1,20 @@
+#!/bin/bash -x
+#SBATCH --account=training2317
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --output=slurm-out.%j
+#SBATCH --error=slurm-err.%j
+#SBATCH --time=00:15:00
+#SBATCH --partition=devel
+#SBATCH hetjob
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --partition=develbooster
+
+srun xenv -P -L GCC -L ParaStationMPI ./hello.cpu.out : xenv -P -L GCC -L ParaStationMPI -L MPI-settings/CUDA ./hello.gpu.out
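+
+# The colon separates the two components of the heterogeneous job: the first
+# component runs hello.cpu.out on the Cluster partition, the second runs
+# hello.gpu.out on the Booster partition. xenv loads the listed environment
+# modules for each component; MPI-settings/CUDA enables the CUDA-aware MPI
+# needed to receive directly into the GPU buffer.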