Commit bca7f165 authored by Andreas Herten

Add static versions of files

parent 3d892e12
Showing with 415 additions and 0 deletions
File moved
File moved
File moved
#!/usr/bin/make -f
TASKDIR = ../tasks
SOLUTIONDIR = ../solutions
# Files run through cppp to generate task/solution variants
PROCESSFILES = ping-pong.cu
# Files copied verbatim
COPYFILES = Makefile README.md job_msa_juwels.sh job_msa_jureca.sh compile.sh ping-pong.c

TASKPROCESSFILES = $(addprefix $(TASKDIR)/,$(PROCESSFILES))
TASKCOPYFILES = $(addprefix $(TASKDIR)/,$(COPYFILES))
SOLUTIONPROCESSFILES = $(addprefix $(SOLUTIONDIR)/,$(PROCESSFILES))
SOLUTIONCOPYFILES = $(addprefix $(SOLUTIONDIR)/,$(COPYFILES))

.PHONY: all task
all: task

task: $(TASKPROCESSFILES) $(TASKCOPYFILES) $(SOLUTIONPROCESSFILES) $(SOLUTIONCOPYFILES)

# cppp keeps or strips the preprocessor branches that depend on SOLUTION:
# -U produces the task variant, -D the solution variant.
$(TASKPROCESSFILES): $(PROCESSFILES)
	mkdir -p $(TASKDIR)/
	cppp -USOLUTION $(notdir $@) $@

$(SOLUTIONPROCESSFILES): $(PROCESSFILES)
	mkdir -p $(SOLUTIONDIR)/
	cppp -DSOLUTION $(notdir $@) $@

$(TASKCOPYFILES): $(COPYFILES)
	mkdir -p $(TASKDIR)/
	cp $(notdir $@) $@

$(SOLUTIONCOPYFILES): $(COPYFILES)
	mkdir -p $(SOLUTIONDIR)/
	cp $(notdir $@) $@
\ No newline at end of file
File moved
CUCOMP = nvcc
CUFLAGS = -arch=sm_80

# Derive MPI_HOME from the EasyBuild root of whichever MPI module is loaded
ifdef EBROOTOPENMPI
MPI_HOME+=$(EBROOTOPENMPI)
endif
ifdef EBROOTPSMPI
MPI_HOME+=$(EBROOTPSMPI)
endif

INCLUDES = -I$(MPI_HOME)/include
LIBRARIES = -L$(MPI_HOME)/lib -lmpi

all: ping-pong.cpu.out ping-pong.gpu.out

ping-pong.cpu.out: ping-pong.c
	mpicc $< -o $@

ping-pong.gpu.out: ping-pong.gpu.o
	$(CUCOMP) $(CUFLAGS) $(LIBRARIES) $< -o $@

ping-pong.gpu.o: ping-pong.cu
	$(CUCOMP) $(CUFLAGS) $(INCLUDES) -c $< -o $@

.PHONY: all clean
clean:
	rm -f ping-pong.cpu.out ping-pong.gpu.out *.o
# MSA CPU-GPU Ping Pong
Extending the previous examples, we now send ping-pong messages between the CPU memory of one node and the GPU memory of another node, using the heterogeneous job features of Slurm.
The TODOs in `ping-pong.cu` mark the places where pointers to GPU memory have to be used instead of pointers to CPU memory.
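For orientation, the pattern the TODOs aim at looks roughly like the sketch below (using the names already present in `ping-pong.cu`; it mirrors the solution, so only peek if you are stuck). A device buffer is allocated and its pointer is handed to the MPI calls instead of the host buffer, which relies on a CUDA-aware MPI:
```c
// Sketch: allocate a device buffer d_A and use it in the MPI calls instead of the host buffer A
double *d_A;
cudaErrorCheck( cudaMalloc(&d_A, N*sizeof(double)) );
cudaErrorCheck( cudaMemcpy(d_A, A, N*sizeof(double), cudaMemcpyHostToDevice) );

// ... later, in the ping-pong loops, e.g. on rank 0:
MPI_Send(d_A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD);
MPI_Recv(d_A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat);
```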
After working on the TODOs, execute the following on JUWELS Booster to compile `ping-pong.cu` with the right modules:
```bash
bash compile.sh
```
Execute the following on JUWELS Cluster to compile the CPU part of the application and submit a batch job:
```bash
bash compile.sh
sbatch job_msa_juwels.sh
```
Monitor your job with `squeue --me`. Once it has run through successfully, have a look at the output in `slurm-out.N`, with `N` being your job number.
\ No newline at end of file
if [[ "$SYSTEMNAME" == "juwelsbooster" ]]; then
echo "Building GPU-aware version for $SYSTEMNAME"
ml GCC ParaStationMPI MPI-settings/CUDA
make ping-pong.gpu.out
elif [[ "$SYSTEMNAME" == "juwels" ]]; then
echo "Building CPU version for $SYSTEMNAME"
ml GCC ParaStationMPI
make ping-pong.cpu.out
else
echo "The system $SYSTEMNAME is not supported!"
fi
\ No newline at end of file
#!/bin/bash -x
#SBATCH --account=exalab
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --output=slurm-out.%j
#SBATCH --error=slurm-err.%j
#SBATCH --time=00:15:00
#SBATCH --partition=dc-cpu-devel
#SBATCH hetjob
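# Directives above the 'hetjob' separator configure the first (CPU, dc-cpu-devel)
# component of the heterogeneous job; the directives below configure the second
# (GPU, dc-gpu-devel) component.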
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --partition=dc-gpu-devel
srun xenv -P -L GCC -L ParaStationMPI ./ping-pong.cpu.out : xenv -P -L GCC -L ParaStationMPI -L MPI-settings/CUDA ./ping-pong.gpu.out
#!/bin/bash -x
#SBATCH --account=training2317
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --output=slurm-out.%j
#SBATCH --error=slurm-err.%j
#SBATCH --time=00:15:00
#SBATCH --partition=devel
#SBATCH hetjob
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --partition=develbooster
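# The ':' in the srun line separates the two components of the heterogeneous job step:
# the CPU binary runs in the first component (devel partition), the GPU binary in the
# second (develbooster). xenv -L loads the listed modules for the respective component.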
srun xenv -P -L GCC -L ParaStationMPI ./ping-pong.cpu.out : xenv -P -L GCC -L ParaStationMPI -L MPI-settings/CUDA ./ping-pong.gpu.out
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

int main(int argc, char *argv[])
{
    /* -------------------------------------------------------------------------------------------
        MPI Initialization
    --------------------------------------------------------------------------------------------*/
    MPI_Init(&argc, &argv);

    int size;
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    MPI_Status stat;

    if(size != 2){
        if(rank == 0){
            printf("This program requires exactly 2 MPI ranks, but you are attempting to use %d! Exiting...\n", size);
        }
        MPI_Finalize();
        exit(0);
    }

    /* -------------------------------------------------------------------------------------------
        Loop from 8 B to 1 GB
    --------------------------------------------------------------------------------------------*/
    for(int i=0; i<=27; i++){

        long int N = 1 << i;

        // Allocate memory for A on CPU
        double *A = (double*)malloc(N*sizeof(double));

        // Initialize all elements of A to 0.0
        for(int i=0; i<N; i++){
            A[i] = 0.0;
        }

        int tag1 = 10;
        int tag2 = 20;

        int loop_count = 50;

        // Warm-up loop
        for(int i=1; i<=5; i++){
            if(rank == 0){
                MPI_Send(A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD);
                MPI_Recv(A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat);
            }
            else if(rank == 1){
                MPI_Recv(A, N, MPI_DOUBLE, 0, tag1, MPI_COMM_WORLD, &stat);
                MPI_Send(A, N, MPI_DOUBLE, 0, tag2, MPI_COMM_WORLD);
            }
        }

        // Time ping-pong for loop_count iterations of data transfer size 8*N bytes
        double start_time, stop_time, elapsed_time;
        start_time = MPI_Wtime();

        for(int i=1; i<=loop_count; i++){
            if(rank == 0){
                MPI_Send(A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD);
                MPI_Recv(A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat);
            }
            else if(rank == 1){
                MPI_Recv(A, N, MPI_DOUBLE, 0, tag1, MPI_COMM_WORLD, &stat);
                MPI_Send(A, N, MPI_DOUBLE, 0, tag2, MPI_COMM_WORLD);
            }
        }

        stop_time = MPI_Wtime();
        elapsed_time = stop_time - start_time;

        long int num_B = 8*N;
        long int B_in_GB = 1 << 30;
        double num_GB = (double)num_B / (double)B_in_GB;
        double avg_time_per_transfer = elapsed_time / (2.0*(double)loop_count);

        if(rank == 0) printf("Transfer size (B): %10li, Transfer Time (s): %15.9f, Bandwidth (GB/s): %15.9f\n", num_B, avg_time_per_transfer, num_GB/avg_time_per_transfer );

        free(A);
    }

    MPI_Finalize();

    return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

// Macro for checking errors in CUDA API calls
#define cudaErrorCheck(call) \
do{ \
    cudaError_t cuErr = call; \
    if(cudaSuccess != cuErr){ \
        printf("CUDA Error - %s:%d: '%s'\n", __FILE__, __LINE__, cudaGetErrorString(cuErr));\
        exit(0); \
    } \
}while(0)

int main(int argc, char *argv[])
{
    /* -------------------------------------------------------------------------------------------
        MPI Initialization
    --------------------------------------------------------------------------------------------*/
    MPI_Init(&argc, &argv);

    int size;
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    MPI_Status stat;

    if(size != 2){
        if(rank == 0){
            printf("This program requires exactly 2 MPI ranks, but you are attempting to use %d! Exiting...\n", size);
        }
        MPI_Finalize();
        exit(0);
    }

    /* -------------------------------------------------------------------------------------------
        Loop from 8 B to 1 GB
    --------------------------------------------------------------------------------------------*/
    for(int i=0; i<=27; i++){

        long int N = 1 << i;

        // Allocate memory for A on CPU
        double *A = (double*)malloc(N*sizeof(double));

        // Initialize all elements of A to 0.0
        for(int i=0; i<N; i++){
            A[i] = 0.0;
        }

        // TODO: Create an empty double pointer, d_A; allocate d_A on the GPU; copy the content of A to d_A
        double *d_A;
        cudaErrorCheck( cudaMalloc(&d_A, N*sizeof(double)) );
        cudaErrorCheck( cudaMemcpy(d_A, A, N*sizeof(double), cudaMemcpyHostToDevice) );

        int tag1 = 10;
        int tag2 = 20;

        int loop_count = 50;

        // TODO: Use the GPU pointer d_A in the following MPI calls instead of A
        // Note: passing a device pointer to MPI requires a CUDA-aware MPI
        // (provided here via the MPI-settings/CUDA module).
        // Warm-up loop
        for(int i=1; i<=5; i++){
            if(rank == 0){
                MPI_Send(d_A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD);
                MPI_Recv(d_A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat);
            }
            else if(rank == 1){
                MPI_Recv(d_A, N, MPI_DOUBLE, 0, tag1, MPI_COMM_WORLD, &stat);
                MPI_Send(d_A, N, MPI_DOUBLE, 0, tag2, MPI_COMM_WORLD);
            }
        }

        // Time ping-pong for loop_count iterations of data transfer size 8*N bytes
        double start_time, stop_time, elapsed_time;
        start_time = MPI_Wtime();

        // TODO: Use the GPU pointer d_A in the following MPI calls instead of A
        for(int i=1; i<=loop_count; i++){
            if(rank == 0){
                MPI_Send(d_A, N, MPI_DOUBLE, 1, tag1, MPI_COMM_WORLD);
                MPI_Recv(d_A, N, MPI_DOUBLE, 1, tag2, MPI_COMM_WORLD, &stat);
            }
            else if(rank == 1){
                MPI_Recv(d_A, N, MPI_DOUBLE, 0, tag1, MPI_COMM_WORLD, &stat);
                MPI_Send(d_A, N, MPI_DOUBLE, 0, tag2, MPI_COMM_WORLD);
            }
        }

        stop_time = MPI_Wtime();
        elapsed_time = stop_time - start_time;

        long int num_B = 8*N;
        long int B_in_GB = 1 << 30;
        double num_GB = (double)num_B / (double)B_in_GB;
        double avg_time_per_transfer = elapsed_time / (2.0*(double)loop_count);

        if(rank == 0) printf("Transfer size (B): %10li, Transfer Time (s): %15.9f, Bandwidth (GB/s): %15.9f\n", num_B, avg_time_per_transfer, num_GB/avg_time_per_transfer );

        cudaErrorCheck( cudaFree(d_A) );
        free(A);
    }

    MPI_Finalize();

    return 0;
}