Commit bca7f165 authored by Andreas Herten

Add static versions of files

parent 3d892e12
Showing with 415 additions and 0 deletions
File moved
File moved
File moved
#!/usr/bin/make -f
TASKDIR = ../tasks
SOLUTIONDIR = ../solutions
# Files run through cppp to generate task/solution variants
PROCESSFILES = ping-pong.cu
# Files copied verbatim
COPYFILES = Makefile README.md job_msa_juwels.sh job_msa_jureca.sh compile.sh ping-pong.c

TASKPROCESSFILES = $(addprefix $(TASKDIR)/,$(PROCESSFILES))
TASKCOPYFILES = $(addprefix $(TASKDIR)/,$(COPYFILES))
SOLUTIONPROCESSFILES = $(addprefix $(SOLUTIONDIR)/,$(PROCESSFILES))
SOLUTIONCOPYFILES = $(addprefix $(SOLUTIONDIR)/,$(COPYFILES))

.PHONY: all task
all: task

task: $(TASKPROCESSFILES) $(TASKCOPYFILES) $(SOLUTIONPROCESSFILES) $(SOLUTIONCOPYFILES)

# cppp keeps or strips the preprocessor branches that depend on SOLUTION:
# -U produces the task variant, -D the solution variant.
$(TASKPROCESSFILES): $(PROCESSFILES)
	mkdir -p $(TASKDIR)/
	cppp -USOLUTION $(notdir $@) $@

$(SOLUTIONPROCESSFILES): $(PROCESSFILES)
	mkdir -p $(SOLUTIONDIR)/
	cppp -DSOLUTION $(notdir $@) $@

$(TASKCOPYFILES): $(COPYFILES)
	mkdir -p $(TASKDIR)/
	cp $(notdir $@) $@

$(SOLUTIONCOPYFILES): $(COPYFILES)
	mkdir -p $(SOLUTIONDIR)/
	cp $(notdir $@) $@
\ No newline at end of file
File moved
CUCOMP = nvcc
CUFLAGS = -arch=sm_80

# Derive MPI_HOME from the EasyBuild root of whichever MPI module is loaded
ifdef EBROOTOPENMPI
MPI_HOME+=$(EBROOTOPENMPI)
endif
ifdef EBROOTPSMPI
MPI_HOME+=$(EBROOTPSMPI)
endif

INCLUDES = -I$(MPI_HOME)/include
LIBRARIES = -L$(MPI_HOME)/lib -lmpi

all: ping-pong.cpu.out ping-pong.gpu.out

ping-pong.cpu.out: ping-pong.c
	mpicc $< -o $@

ping-pong.gpu.out: ping-pong.gpu.o
	$(CUCOMP) $(CUFLAGS) $(LIBRARIES) $< -o $@

ping-pong.gpu.o: ping-pong.cu
	$(CUCOMP) $(CUFLAGS) $(INCLUDES) -c $< -o $@

.PHONY: all clean
clean:
	rm -f ping-pong.cpu.out ping-pong.gpu.out *.o
# MSA CPU-GPU Ping Pong
Extending the previous examples, we now send ping-pong messages between the CPU memory of one node and the GPU memory of another node, using the heterogeneous job features of Slurm.
The TODOs in `ping-pong.cu` mark the places where pointers to GPU memory have to be used instead of pointers to CPU memory.
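For orientation, the pattern the TODOs aim at looks roughly like the sketch below (using the names already present in `ping-pong.cu`; it mirrors the solution, so only peek if you are stuck). A device buffer is allocated and its pointer is handed to the MPI calls instead of the host buffer, which relies on a CUDA-aware MPI:
```c
// Sketch: allocate a device buffer d_A and use it in the MPI calls instead of the host buffer A
double *d_A;
cudaErrorCheck( cudaMalloc(&d_A, N*sizeof(double)) );
cudaErrorCheck( cudaMemcpy(d_A, A, N*sizeof(double), cudaMemcpyHostToDevice) );

// ... later, in the ping-pong loops, e.g. on rank 0:
MPI_Send(d_A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD);
MPI_Recv(d_A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat);
```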
After working on the TODOs, execute the following on JUWELS Booster to compile `ping-pong.cu` with the right modules:
```bash
bash compile.sh
```
Execute the following on JUWELS Cluster to compile the CPU part of the application and submit a batch job:
```bash
bash compile.sh
sbatch job_msa_juwels.sh
```
Monitor your job with `squeue --me`. Once it has run through successfully, have a look at the output in `slurm-out.N`, with `N` being your job number.
\ No newline at end of file
if [[ "$SYSTEMNAME" == "juwelsbooster" ]]; then
echo "Building GPU-aware version for $SYSTEMNAME"
ml GCC ParaStationMPI MPI-settings/CUDA
make ping-pong.gpu.out
elif [[ "$SYSTEMNAME" == "juwels" ]]; then
echo "Building CPU version for $SYSTEMNAME"
ml GCC ParaStationMPI
make ping-pong.cpu.out
else
echo "The system $SYSTEMNAME is not supported!"
fi
\ No newline at end of file
#!/bin/bash -x
#SBATCH --account=exalab
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --output=slurm-out.%j
#SBATCH --error=slurm-err.%j
#SBATCH --time=00:15:00
#SBATCH --partition=dc-cpu-devel
#SBATCH hetjob
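# Directives above the 'hetjob' separator configure the first (CPU, dc-cpu-devel)
# component of the heterogeneous job; the directives below configure the second
# (GPU, dc-gpu-devel) component.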
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --partition=dc-gpu-devel
srun xenv -P -L GCC -L ParaStationMPI ./ping-pong.cpu.out : xenv -P -L GCC -L ParaStationMPI -L MPI-settings/CUDA ./ping-pong.gpu.out
#!/bin/bash -x
#SBATCH --account=training2317
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --output=slurm-out.%j
#SBATCH --error=slurm-err.%j
#SBATCH --time=00:15:00
#SBATCH --partition=devel
#SBATCH hetjob
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --partition=develbooster
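# The ':' in the srun line separates the two components of the heterogeneous job step:
# the CPU binary runs in the first component (devel partition), the GPU binary in the
# second (develbooster). xenv -L loads the listed modules for the respective component.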
srun xenv -P -L GCC -L ParaStationMPI ./ping-pong.cpu.out : xenv -P -L GCC -L ParaStationMPI -L MPI-settings/CUDA ./ping-pong.gpu.out
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

int main(int argc, char *argv[])
{
    /* -------------------------------------------------------------------------------------------
        MPI Initialization
    --------------------------------------------------------------------------------------------*/
    MPI_Init(&argc, &argv);

    int size;
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    MPI_Status stat;

    if(size != 2){
        if(rank == 0){
            printf("This program requires exactly 2 MPI ranks, but you are attempting to use %d! Exiting...\n", size);
        }
        MPI_Finalize();
        exit(0);
    }

    /* -------------------------------------------------------------------------------------------
        Loop from 8 B to 1 GB
    --------------------------------------------------------------------------------------------*/
    for(int i=0; i<=27; i++){

        long int N = 1 << i;

        // Allocate memory for A on CPU
        double *A = (double*)malloc(N*sizeof(double));

        // Initialize all elements of A to 0.0
        for(int i=0; i<N; i++){
            A[i] = 0.0;
        }

        int tag1 = 10;
        int tag2 = 20;

        int loop_count = 50;

        // Warm-up loop
        for(int i=1; i<=5; i++){
            if(rank == 0){
                MPI_Send(A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD);
                MPI_Recv(A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat);
            }
            else if(rank == 1){
                MPI_Recv(A, N, MPI_DOUBLE, 0, tag1, MPI_COMM_WORLD, &stat);
                MPI_Send(A, N, MPI_DOUBLE, 0, tag2, MPI_COMM_WORLD);
            }
        }

        // Time ping-pong for loop_count iterations of data transfer size 8*N bytes
        double start_time, stop_time, elapsed_time;
        start_time = MPI_Wtime();

        for(int i=1; i<=loop_count; i++){
            if(rank == 0){
                MPI_Send(A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD);
                MPI_Recv(A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat);
            }
            else if(rank == 1){
                MPI_Recv(A, N, MPI_DOUBLE, 0, tag1, MPI_COMM_WORLD, &stat);
                MPI_Send(A, N, MPI_DOUBLE, 0, tag2, MPI_COMM_WORLD);
            }
        }

        stop_time = MPI_Wtime();
        elapsed_time = stop_time - start_time;

        long int num_B = 8*N;
        long int B_in_GB = 1 << 30;
        double num_GB = (double)num_B / (double)B_in_GB;
        double avg_time_per_transfer = elapsed_time / (2.0*(double)loop_count);

        if(rank == 0) printf("Transfer size (B): %10li, Transfer Time (s): %15.9f, Bandwidth (GB/s): %15.9f\n", num_B, avg_time_per_transfer, num_GB/avg_time_per_transfer );

        free(A);
    }

    MPI_Finalize();

    return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

// Macro for checking errors in CUDA API calls
#define cudaErrorCheck(call) \
do{ \
    cudaError_t cuErr = call; \
    if(cudaSuccess != cuErr){ \
        printf("CUDA Error - %s:%d: '%s'\n", __FILE__, __LINE__, cudaGetErrorString(cuErr));\
        exit(0); \
    } \
}while(0)

int main(int argc, char *argv[])
{
    /* -------------------------------------------------------------------------------------------
        MPI Initialization
    --------------------------------------------------------------------------------------------*/
    MPI_Init(&argc, &argv);

    int size;
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    MPI_Status stat;

    if(size != 2){
        if(rank == 0){
            printf("This program requires exactly 2 MPI ranks, but you are attempting to use %d! Exiting...\n", size);
        }
        MPI_Finalize();
        exit(0);
    }

    /* -------------------------------------------------------------------------------------------
        Loop from 8 B to 1 GB
    --------------------------------------------------------------------------------------------*/
    for(int i=0; i<=27; i++){

        long int N = 1 << i;

        // Allocate memory for A on CPU
        double *A = (double*)malloc(N*sizeof(double));

        // Initialize all elements of A to 0.0
        for(int i=0; i<N; i++){
            A[i] = 0.0;
        }

        // TODO: Create an empty double pointer, d_A; allocate d_A on the GPU; copy the content of A to d_A
        double *d_A;
        cudaErrorCheck( cudaMalloc(&d_A, N*sizeof(double)) );
        cudaErrorCheck( cudaMemcpy(d_A, A, N*sizeof(double), cudaMemcpyHostToDevice) );

        int tag1 = 10;
        int tag2 = 20;

        int loop_count = 50;

        // TODO: Use the GPU pointer d_A in the following MPI calls instead of A
        // Note: passing a device pointer to MPI requires a CUDA-aware MPI
        // (provided here via the MPI-settings/CUDA module).
        // Warm-up loop
        for(int i=1; i<=5; i++){
            if(rank == 0){
                MPI_Send(d_A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD);
                MPI_Recv(d_A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat);
            }
            else if(rank == 1){
                MPI_Recv(d_A, N, MPI_DOUBLE, 0, tag1, MPI_COMM_WORLD, &stat);
                MPI_Send(d_A, N, MPI_DOUBLE, 0, tag2, MPI_COMM_WORLD);
            }
        }

        // Time ping-pong for loop_count iterations of data transfer size 8*N bytes
        double start_time, stop_time, elapsed_time;
        start_time = MPI_Wtime();

        // TODO: Use the GPU pointer d_A in the following MPI calls instead of A
        for(int i=1; i<=loop_count; i++){
            if(rank == 0){
                MPI_Send(d_A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD);
                MPI_Recv(d_A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat);
            }
            else if(rank == 1){
                MPI_Recv(d_A, N, MPI_DOUBLE, 0, tag1, MPI_COMM_WORLD, &stat);
                MPI_Send(d_A, N, MPI_DOUBLE, 0, tag2, MPI_COMM_WORLD);
            }
        }

        stop_time = MPI_Wtime();
        elapsed_time = stop_time - start_time;

        long int num_B = 8*N;
        long int B_in_GB = 1 << 30;
        double num_GB = (double)num_B / (double)B_in_GB;
        double avg_time_per_transfer = elapsed_time / (2.0*(double)loop_count);

        if(rank == 0) printf("Transfer size (B): %10li, Transfer Time (s): %15.9f, Bandwidth (GB/s): %15.9f\n", num_B, avg_time_per_transfer, num_GB/avg_time_per_transfer );

        cudaErrorCheck( cudaFree(d_A) );
        free(A);
    }

    MPI_Finalize();

    return 0;
}