From 255608ff74acd37de260f28f6cc0872c1a063a61 Mon Sep 17 00:00:00 2001
From: Sebastian Achilles <s.achilles@fz-juelich.de>
Date: Wed, 19 Apr 2023 12:06:56 +0200
Subject: [PATCH] add 02 MSA Ping-Pong

---
 02-MSA-ping-pong/Makefile          |  22 ++++++
 02-MSA-ping-pong/compile_all.sh    |   2 +
 02-MSA-ping-pong/job_msa_jureca.sh |  14 ++++
 02-MSA-ping-pong/job_msa_juwels.sh |  14 ++++
 02-MSA-ping-pong/ping-pong.c       |  93 ++++++++++++++++++++++++
 02-MSA-ping-pong/ping-pong.cu      | 109 +++++++++++++++++++++++++++++
 6 files changed, 254 insertions(+)
 create mode 100644 02-MSA-ping-pong/Makefile
 create mode 100755 02-MSA-ping-pong/compile_all.sh
 create mode 100644 02-MSA-ping-pong/job_msa_jureca.sh
 create mode 100644 02-MSA-ping-pong/job_msa_juwels.sh
 create mode 100644 02-MSA-ping-pong/ping-pong.c
 create mode 100644 02-MSA-ping-pong/ping-pong.cu

diff --git a/02-MSA-ping-pong/Makefile b/02-MSA-ping-pong/Makefile
new file mode 100644
index 0000000..0fbc367
--- /dev/null
+++ b/02-MSA-ping-pong/Makefile
@@ -0,0 +1,22 @@
+CUCOMP  = nvcc
+CUFLAGS = -arch=sm_80
+
+INCLUDES  = -I$(EBROOTOPENMPI)/include
+LIBRARIES = -L$(EBROOTOPENMPI)/lib -lmpi
+
+all: pp_cpu.out pp_cuda_aware.out
+
+pp_cpu.out: ping-pong.c
+	mpicc ping-pong.c -o pp_cpu.out
+
+pp_cuda_aware.out: ping-pong.o
+	$(CUCOMP) $(CUFLAGS) $(LIBRARIES) ping-pong.o -o pp_cuda_aware.out
+
+ping-pong.o: ping-pong.cu
+	$(CUCOMP) $(CUFLAGS) $(INCLUDES) -c ping-pong.cu
+
+.PHONY: clean
+
+clean:
+	rm -f pp_cpu.out pp_cuda_aware.out *.o
+
diff --git a/02-MSA-ping-pong/compile_all.sh b/02-MSA-ping-pong/compile_all.sh
new file mode 100755
index 0000000..1f677bb
--- /dev/null
+++ b/02-MSA-ping-pong/compile_all.sh
@@ -0,0 +1,2 @@
+ml GCC OpenMPI MPI-settings/CUDA
+make all
diff --git a/02-MSA-ping-pong/job_msa_jureca.sh b/02-MSA-ping-pong/job_msa_jureca.sh
new file mode 100644
index 0000000..467ff8c
--- /dev/null
+++ b/02-MSA-ping-pong/job_msa_jureca.sh
@@ -0,0 +1,14 @@
+#!/bin/bash -x
+#SBATCH --account=exalab
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --output=slurm-out.%j
+#SBATCH --error=slurm-err.%j
+#SBATCH --time=00:15:00
+#SBATCH --partition=dc-cpu-devel
+#SBATCH hetjob
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --partition=dc-gpu-devel
+
+srun xenv -P -L GCC -L OpenMPI -L MPI-settings/CUDA ./pp_cpu.out : xenv -P -L GCC -L OpenMPI -L MPI-settings/CUDA ./pp_cuda_aware.out
diff --git a/02-MSA-ping-pong/job_msa_juwels.sh b/02-MSA-ping-pong/job_msa_juwels.sh
new file mode 100644
index 0000000..5e6f8bb
--- /dev/null
+++ b/02-MSA-ping-pong/job_msa_juwels.sh
@@ -0,0 +1,14 @@
+#!/bin/bash -x
+#SBATCH --account=exalab
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --output=slurm-out.%j
+#SBATCH --error=slurm-err.%j
+#SBATCH --time=00:15:00
+#SBATCH --partition=devel
+#SBATCH hetjob
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --partition=develbooster
+
+srun xenv -P -L GCC -L OpenMPI -L MPI-settings/CUDA ./pp_cpu.out : xenv -P -L GCC -L OpenMPI -L MPI-settings/CUDA ./pp_cuda_aware.out
diff --git a/02-MSA-ping-pong/ping-pong.c b/02-MSA-ping-pong/ping-pong.c
new file mode 100644
index 0000000..29343e7
--- /dev/null
+++ b/02-MSA-ping-pong/ping-pong.c
@@ -0,0 +1,93 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <mpi.h>
+
+int main(int argc, char *argv[])
+{
+    /* -------------------------------------------------------------------------------------------
+        MPI Initialization
+    --------------------------------------------------------------------------------------------*/
+    MPI_Init(&argc, &argv);
+
+    int size;
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    int rank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    MPI_Status stat;
+
+    if(size != 2){
+        if(rank == 0){
+            printf("This program requires exactly 2 MPI ranks, but you are attempting to use %d! Exiting...\n", size);
+        }
+        MPI_Finalize();
+        exit(0);
+    }
+
+    /* -------------------------------------------------------------------------------------------
+        Loop from 8 B to 1 GB
+    --------------------------------------------------------------------------------------------*/
+
+    for(int i=0; i<=27; i++){
+
+        long int N = 1 << i;
+
+        // Allocate memory for A on CPU
+        double *A = (double*)malloc(N*sizeof(double));
+
+        // Initialize all elements of A to 0.0
+        for(int i=0; i<N; i++){
+            A[i] = 0.0;
+        }
+
+        int tag1 = 10;
+        int tag2 = 20;
+
+        int loop_count = 50;
+
+        // Warm-up loop
+        for(int i=1; i<=5; i++){
+            if(rank == 0){
+                MPI_Send(A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD);
+                MPI_Recv(A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat);
+            }
+            else if(rank == 1){
+                MPI_Recv(A, N, MPI_DOUBLE, 0, tag1, MPI_COMM_WORLD, &stat);
+                MPI_Send(A, N, MPI_DOUBLE, 0, tag2, MPI_COMM_WORLD);
+            }
+        }
+
+        // Time ping-pong for loop_count iterations of data transfer size 8*N bytes
+        double start_time, stop_time, elapsed_time;
+        start_time = MPI_Wtime();
+
+        for(int i=1; i<=loop_count; i++){
+            if(rank == 0){
+                MPI_Send(A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD);
+                MPI_Recv(A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat);
+            }
+            else if(rank == 1){
+                MPI_Recv(A, N, MPI_DOUBLE, 0, tag1, MPI_COMM_WORLD, &stat);
+                MPI_Send(A, N, MPI_DOUBLE, 0, tag2, MPI_COMM_WORLD);
+            }
+        }
+
+        stop_time = MPI_Wtime();
+        elapsed_time = stop_time - start_time;
+
+        long int num_B = 8*N;
+        long int B_in_GB = 1 << 30;
+        double num_GB = (double)num_B / (double)B_in_GB;
+        double avg_time_per_transfer = elapsed_time / (2.0*(double)loop_count);
+
+        if(rank == 0) printf("Transfer size (B): %10li, Transfer Time (s): %15.9f, Bandwidth (GB/s): %15.9f\n", num_B, avg_time_per_transfer, num_GB/avg_time_per_transfer );
+
+        free(A);
+    }
+
+    MPI_Finalize();
+
+    return 0;
+}
+
diff --git a/02-MSA-ping-pong/ping-pong.cu b/02-MSA-ping-pong/ping-pong.cu
new file mode 100644
index 0000000..ae313a4
--- /dev/null
+++ b/02-MSA-ping-pong/ping-pong.cu
@@ -0,0 +1,109 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <mpi.h>
+
+// Macro for checking errors in CUDA API calls
+#define cudaErrorCheck(call)                                                                  \
+do{                                                                                           \
+    cudaError_t cuErr = call;                                                                 \
+    if(cudaSuccess != cuErr){                                                                 \
+        printf("CUDA Error - %s:%d: '%s'\n", __FILE__, __LINE__, cudaGetErrorString(cuErr));  \
+        exit(0);                                                                              \
+    }                                                                                         \
+}while(0)
+
+
+int main(int argc, char *argv[])
+{
+    /* -------------------------------------------------------------------------------------------
+        MPI Initialization
+    --------------------------------------------------------------------------------------------*/
+    MPI_Init(&argc, &argv);
+
+    int size;
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    int rank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    MPI_Status stat;
+
+    if(size != 2){
+        if(rank == 0){
+            printf("This program requires exactly 2 MPI ranks, but you are attempting to use %d! Exiting...\n", size);
+        }
+        MPI_Finalize();
+        exit(0);
+    }
+
+    /* -------------------------------------------------------------------------------------------
+        Loop from 8 B to 1 GB
+    --------------------------------------------------------------------------------------------*/
+
+    for(int i=0; i<=27; i++){
+
+        long int N = 1 << i;
+
+        // Allocate memory for A on CPU
+        double *A = (double*)malloc(N*sizeof(double));
+
+        // Initialize all elements of A to 0.0
+        for(int i=0; i<N; i++){
+            A[i] = 0.0;
+        }
+
+        double *d_A;
+        cudaErrorCheck( cudaMalloc(&d_A, N*sizeof(double)) );
+        cudaErrorCheck( cudaMemcpy(d_A, A, N*sizeof(double), cudaMemcpyHostToDevice) );
+
+        int tag1 = 10;
+        int tag2 = 20;
+
+        int loop_count = 50;
+
+        // Warm-up loop
+        for(int i=1; i<=5; i++){
+            if(rank == 0){
+                MPI_Send(d_A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD);
+                MPI_Recv(d_A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat);
+            }
+            else if(rank == 1){
+                MPI_Recv(d_A, N, MPI_DOUBLE, 0, tag1, MPI_COMM_WORLD, &stat);
+                MPI_Send(d_A, N, MPI_DOUBLE, 0, tag2, MPI_COMM_WORLD);
+            }
+        }
+
+        // Time ping-pong for loop_count iterations of data transfer size 8*N bytes
+        double start_time, stop_time, elapsed_time;
+        start_time = MPI_Wtime();
+
+        for(int i=1; i<=loop_count; i++){
+            if(rank == 0){
+                MPI_Send(d_A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD);
+                MPI_Recv(d_A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat);
+            }
+            else if(rank == 1){
+                MPI_Recv(d_A, N, MPI_DOUBLE, 0, tag1, MPI_COMM_WORLD, &stat);
+                MPI_Send(d_A, N, MPI_DOUBLE, 0, tag2, MPI_COMM_WORLD);
+            }
+        }
+
+        stop_time = MPI_Wtime();
+        elapsed_time = stop_time - start_time;
+
+        long int num_B = 8*N;
+        long int B_in_GB = 1 << 30;
+        double num_GB = (double)num_B / (double)B_in_GB;
+        double avg_time_per_transfer = elapsed_time / (2.0*(double)loop_count);
+
+        if(rank == 0) printf("Transfer size (B): %10li, Transfer Time (s): %15.9f, Bandwidth (GB/s): %15.9f\n", num_B, avg_time_per_transfer, num_GB/avg_time_per_transfer );
+
+        cudaErrorCheck( cudaFree(d_A) );
+        free(A);
+    }
+
+    MPI_Finalize();
+
+    return 0;
+}
+
--
GitLab