Commit 255608ff authored by Sebastian Achilles

add 02 MSA Ping-Pong

parent b24d6111
1 merge request: !3 add 02 MSA Ping-Pong
CUCOMP  = nvcc
CUFLAGS = -arch=sm_80

INCLUDES  = -I$(EBROOTOPENMPI)/include
LIBRARIES = -L$(EBROOTOPENMPI)/lib -lmpi

all: pp_cpu.out pp_cuda_aware.out

pp_cpu.out: ping-pong.c
	mpicc ping-pong.c -o pp_cpu.out

pp_cuda_aware.out: ping-pong.o
	$(CUCOMP) $(CUFLAGS) $(LIBRARIES) ping-pong.o -o pp_cuda_aware.out

ping-pong.o: ping-pong.cu
	$(CUCOMP) $(CUFLAGS) $(INCLUDES) -c ping-pong.cu

.PHONY: clean

clean:
	rm -f pp_cpu.out pp_cuda_aware.out *.o
ml GCC OpenMPI MPI-settings/CUDA
make all
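The Makefile builds the CPU binary with mpicc and links the CUDA-aware binary with nvcc against the Open MPI installation pointed to by EBROOTOPENMPI, an environment variable set by the EasyBuild-provided OpenMPI module loaded above. Before running the GPU side it can be worth confirming that this Open MPI build actually has CUDA support; a minimal sketch using the standard ompi_info query:

ompi_info --parsable --all | grep mpi_built_with_cuda_support:value

The job scripts below then submit the two binaries together as one heterogeneous job.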
#!/bin/bash -x
#SBATCH --account=exalab
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --output=slurm-out.%j
#SBATCH --error=slurm-err.%j
#SBATCH --time=00:15:00
#SBATCH --partition=dc-cpu-devel
#SBATCH hetjob
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --partition=dc-gpu-devel
srun xenv -P -L GCC -L OpenMPI -L MPI-settings/CUDA ./pp_cpu.out : xenv -P -L GCC -L OpenMPI -L MPI-settings/CUDA ./pp_cuda_aware.out
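The script above requests a Slurm heterogeneous job: the #SBATCH hetjob line separates a one-node CPU component (partition dc-cpu-devel) from a one-node GPU component (partition dc-gpu-devel), and the colon in the srun command starts pp_cpu.out on the first component and pp_cuda_aware.out on the second, so the two ranks share a single MPI_COMM_WORLD spanning both modules. A usage sketch, assuming a hypothetical file name for the script:

sbatch ping-pong-dc.sbatch    # output and errors go to slurm-out.<jobid> and slurm-err.<jobid>

A second variant of the job script, targeting different partitions, follows.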
#!/bin/bash -x
#SBATCH --account=exalab
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --output=slurm-out.%j
#SBATCH --error=slurm-err.%j
#SBATCH --time=00:15:00
#SBATCH --partition=devel
#SBATCH hetjob
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --partition=develbooster
srun xenv -P -L GCC -L OpenMPI -L MPI-settings/CUDA ./pp_cpu.out : xenv -P -L GCC -L OpenMPI -L MPI-settings/CUDA ./pp_cuda_aware.out
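This second script uses the same one-CPU-node plus one-GPU-node layout for the devel and develbooster partitions (presumably the JUWELS Cluster and Booster modules); only the partition names differ. Once a job has finished, the bandwidth table can be pulled out of the job output, e.g. (the job id is a placeholder):

grep "Bandwidth" slurm-out.<jobid>

The two source files referenced by the Makefile, ping-pong.c (host buffers) and ping-pong.cu (device buffers, CUDA-aware), follow.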
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

int main(int argc, char *argv[])
{
    /* -------------------------------------------------------------------------------------------
        MPI Initialization
    --------------------------------------------------------------------------------------------*/
    MPI_Init(&argc, &argv);

    int size;
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    MPI_Status stat;

    if(size != 2){
        if(rank == 0){
            printf("This program requires exactly 2 MPI ranks, but you are attempting to use %d! Exiting...\n", size);
        }
        MPI_Finalize();
        exit(0);
    }

    /* -------------------------------------------------------------------------------------------
        Loop from 8 B to 1 GB
    --------------------------------------------------------------------------------------------*/
    for(int i=0; i<=27; i++){

        long int N = 1 << i;

        // Allocate memory for A on CPU
        double *A = (double*)malloc(N*sizeof(double));

        // Initialize all elements of A to 0.0
        for(int i=0; i<N; i++){
            A[i] = 0.0;
        }

        int tag1 = 10;
        int tag2 = 20;

        int loop_count = 50;

        // Warm-up loop
        for(int i=1; i<=5; i++){
            if(rank == 0){
                MPI_Send(A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD);
                MPI_Recv(A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat);
            }
            else if(rank == 1){
                MPI_Recv(A, N, MPI_DOUBLE, 0, tag1, MPI_COMM_WORLD, &stat);
                MPI_Send(A, N, MPI_DOUBLE, 0, tag2, MPI_COMM_WORLD);
            }
        }

        // Time ping-pong for loop_count iterations of data transfer size 8*N bytes
        double start_time, stop_time, elapsed_time;
        start_time = MPI_Wtime();

        for(int i=1; i<=loop_count; i++){
            if(rank == 0){
                MPI_Send(A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD);
                MPI_Recv(A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat);
            }
            else if(rank == 1){
                MPI_Recv(A, N, MPI_DOUBLE, 0, tag1, MPI_COMM_WORLD, &stat);
                MPI_Send(A, N, MPI_DOUBLE, 0, tag2, MPI_COMM_WORLD);
            }
        }

        stop_time = MPI_Wtime();
        elapsed_time = stop_time - start_time;

        long int num_B = 8*N;
        long int B_in_GB = 1 << 30;
        double num_GB = (double)num_B / (double)B_in_GB;
        double avg_time_per_transfer = elapsed_time / (2.0*(double)loop_count);

        if(rank == 0) printf("Transfer size (B): %10li, Transfer Time (s): %15.9f, Bandwidth (GB/s): %15.9f\n", num_B, avg_time_per_transfer, num_GB/avg_time_per_transfer );

        free(A);
    }

    MPI_Finalize();

    return 0;
}
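A worked instance of the bandwidth arithmetic in ping-pong.c, for one illustrative message size (the timing value is a placeholder, not a measurement): for i = 20 the buffer holds N = 2^20 doubles, so num_B = 8 * 2^20 B = 8 MiB = 0.0078125 GB, and with an average one-way transfer time of 1 ms the reported bandwidth would be about 7.8 GB/s. A standalone sketch of just that calculation:

#include <stdio.h>

/* Standalone sketch of the bandwidth arithmetic used in ping-pong.c.
   The elapsed time is a made-up placeholder, not a measured value. */
int main(void)
{
    int i = 20;                                /* example message-size index           */
    long int N = 1L << i;                      /* number of doubles in the buffer      */
    long int num_B = 8*N;                      /* message size in bytes                */
    double num_GB = (double)num_B / (double)(1 << 30);
    int loop_count = 50;
    double elapsed_time = 0.1;                 /* placeholder: 50 round trips in 0.1 s */
    double avg_time_per_transfer = elapsed_time / (2.0*(double)loop_count);
    printf("Transfer size (B): %10li, Bandwidth (GB/s): %15.9f\n",
           num_B, num_GB/avg_time_per_transfer);
    return 0;
}

The CUDA-aware variant, ping-pong.cu, follows.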
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

// Macro for checking errors in CUDA API calls
#define cudaErrorCheck(call) \
do{ \
    cudaError_t cuErr = call; \
    if(cudaSuccess != cuErr){ \
        printf("CUDA Error - %s:%d: '%s'\n", __FILE__, __LINE__, cudaGetErrorString(cuErr));\
        exit(0); \
    } \
}while(0)

int main(int argc, char *argv[])
{
    /* -------------------------------------------------------------------------------------------
        MPI Initialization
    --------------------------------------------------------------------------------------------*/
    MPI_Init(&argc, &argv);

    int size;
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    MPI_Status stat;

    if(size != 2){
        if(rank == 0){
            printf("This program requires exactly 2 MPI ranks, but you are attempting to use %d! Exiting...\n", size);
        }
        MPI_Finalize();
        exit(0);
    }

    /* -------------------------------------------------------------------------------------------
        Loop from 8 B to 1 GB
    --------------------------------------------------------------------------------------------*/
    for(int i=0; i<=27; i++){

        long int N = 1 << i;

        // Allocate memory for A on CPU
        double *A = (double*)malloc(N*sizeof(double));

        // Initialize all elements of A to 0.0
        for(int i=0; i<N; i++){
            A[i] = 0.0;
        }

        double *d_A;
        cudaErrorCheck( cudaMalloc(&d_A, N*sizeof(double)) );
        cudaErrorCheck( cudaMemcpy(d_A, A, N*sizeof(double), cudaMemcpyHostToDevice) );

        int tag1 = 10;
        int tag2 = 20;

        int loop_count = 50;

        // Warm-up loop
        for(int i=1; i<=5; i++){
            if(rank == 0){
                MPI_Send(d_A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD);
                MPI_Recv(d_A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat);
            }
            else if(rank == 1){
                MPI_Recv(d_A, N, MPI_DOUBLE, 0, tag1, MPI_COMM_WORLD, &stat);
                MPI_Send(d_A, N, MPI_DOUBLE, 0, tag2, MPI_COMM_WORLD);
            }
        }

        // Time ping-pong for loop_count iterations of data transfer size 8*N bytes
        double start_time, stop_time, elapsed_time;
        start_time = MPI_Wtime();

        for(int i=1; i<=loop_count; i++){
            if(rank == 0){
                MPI_Send(d_A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD);
                MPI_Recv(d_A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat);
            }
            else if(rank == 1){
                MPI_Recv(d_A, N, MPI_DOUBLE, 0, tag1, MPI_COMM_WORLD, &stat);
                MPI_Send(d_A, N, MPI_DOUBLE, 0, tag2, MPI_COMM_WORLD);
            }
        }

        stop_time = MPI_Wtime();
        elapsed_time = stop_time - start_time;

        long int num_B = 8*N;
        long int B_in_GB = 1 << 30;
        double num_GB = (double)num_B / (double)B_in_GB;
        double avg_time_per_transfer = elapsed_time / (2.0*(double)loop_count);

        if(rank == 0) printf("Transfer size (B): %10li, Transfer Time (s): %15.9f, Bandwidth (GB/s): %15.9f\n", num_B, avg_time_per_transfer, num_GB/avg_time_per_transfer );

        cudaErrorCheck( cudaFree(d_A) );
        free(A);
    }

    MPI_Finalize();

    return 0;
}
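Not part of the commit, but useful for comparison: a minimal sketch of the staged alternative that CUDA-aware MPI makes unnecessary, in which each rank copies the device buffer through a host buffer around the MPI calls. It mirrors ping-pong.cu but shows only a single exchange at one message size; the file name and the simplified structure are illustrative, not the author's code, and CUDA error checks are omitted for brevity.

// staged-ping-pong.cu (hypothetical): explicit device<->host staging around MPI_Send/MPI_Recv
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <cuda_runtime.h>

int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);

    int rank, size;
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Status stat;

    if(size != 2){ MPI_Finalize(); return 0; }

    long int N = 1 << 20;                       // one example message size (2^20 doubles)
    double *A = (double*)malloc(N*sizeof(double));
    for(long int j=0; j<N; j++) A[j] = 0.0;

    double *d_A;
    cudaMalloc(&d_A, N*sizeof(double));
    cudaMemcpy(d_A, A, N*sizeof(double), cudaMemcpyHostToDevice);

    if(rank == 0){
        cudaMemcpy(A, d_A, N*sizeof(double), cudaMemcpyDeviceToHost); // stage device data out to the host
        MPI_Send(A, N, MPI_DOUBLE, 1, 10, MPI_COMM_WORLD);            // the host buffer goes on the wire
        MPI_Recv(A, N, MPI_DOUBLE, 1, 20, MPI_COMM_WORLD, &stat);
        cudaMemcpy(d_A, A, N*sizeof(double), cudaMemcpyHostToDevice); // stage the reply back to the device
    }
    else{
        MPI_Recv(A, N, MPI_DOUBLE, 0, 10, MPI_COMM_WORLD, &stat);
        cudaMemcpy(d_A, A, N*sizeof(double), cudaMemcpyHostToDevice); // hand the data to the device
        cudaMemcpy(A, d_A, N*sizeof(double), cudaMemcpyDeviceToHost); // and fetch it back before replying
        MPI_Send(A, N, MPI_DOUBLE, 0, 20, MPI_COMM_WORLD);
    }

    cudaFree(d_A);
    free(A);
    MPI_Finalize();
    return 0;
}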