Commit 52e22bfb authored by Andreas Herten

Add processed versions of files

parent 03712a37
Showing with 358 additions and 0 deletions
CUCOMP = nvcc
CUFLAGS = -arch=sm_80

ifdef EBROOTOPENMPI
MPI_HOME+=$(EBROOTOPENMPI)
endif
ifdef EBROOTPSMPI
MPI_HOME+=$(EBROOTPSMPI)
endif

INCLUDES = -I$(MPI_HOME)/include
LIBRARIES = -L$(MPI_HOME)/lib -lmpi

all: hello.cpu.out hello.gpu.out

hello.cpu.out: hello-world.c
	mpicc $< -o $@

hello.gpu.out: hello.gpu.o
	$(CUCOMP) $(CUFLAGS) $(LIBRARIES) $< -o $@

hello.gpu.o: hello-world.cu
	$(CUCOMP) $(CUFLAGS) $(INCLUDES) -c $< -o $@

.PHONY: clean
clean:
	rm -f hello.cpu.out hello.gpu.out *.o
# MSA GPU Hello World
Building on the previous exercise, this exercise uses the GPU-side `printf()` function to print "hello world!" from a kernel, where the second word is received directly from the CPU process.
TODOs are included in `hello-world.c` and `hello-world.cu`, indicating how to implement the `MPI_Send()` / `MPI_Recv()` structure.
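For reference, the two calls to be filled in have the following standard MPI signatures (a sketch only; the concrete arguments to use are spelled out by the TODO comments in the source files):

```c
int MPI_Send(const void *buf, int count, MPI_Datatype datatype,
             int dest, int tag, MPI_Comm comm);
int MPI_Recv(void *buf, int count, MPI_Datatype datatype,
             int source, int tag, MPI_Comm comm, MPI_Status *status);
```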
Once the function calls are implemented, execute the following on JUWELS Booster:
```bash
bash compile.sh
```
(This is equivalent to calling `make` for the GPU part of the application, with the right modules loaded.)
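If you prefer to build by hand instead of using the script, this corresponds roughly to the following (module names taken from `compile.sh`):

```bash
ml GCC CUDA ParaStationMPI   # modules loaded by compile.sh on JUWELS Booster
make hello.gpu.out           # build only the GPU binary
```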
Also, execute the following on JUWELS Cluster:
```bash
bash compile.sh
sbatch job_msa_juwels.sh
```
(This compiles the CPU part of the application and then submits a heterogeneous job to the batch queue.)
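Done manually, this corresponds roughly to the following (module names again taken from `compile.sh`):

```bash
ml GCC ParaStationMPI        # modules loaded by compile.sh on JUWELS Cluster
make hello.cpu.out           # build only the CPU binary
sbatch job_msa_juwels.sh     # submit the heterogeneous job to the batch queue
```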
Monitor your job with `squeue --me`. Once it has run through successfully, have a look at the output in `slurm-out.N`, with `N` being your job number.
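For example (the job ID below is a placeholder for the number reported by `sbatch`; the output should eventually contain the combined "hello world!"):

```bash
squeue --me              # check whether the job is still pending or running
cat slurm-out.<jobid>    # inspect the output once the job has finished
```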
if [[ "$SYSTEMNAME" == "juwelsbooster" ]]; then
echo "Building for $SYSTEMNAME"
ml GCC CUDA ParaStationMPI
make hello.gpu.out
elif [[ "$SYSTEMNAME" == "juwels" ]]; then
echo "Building for $SYSTEMNAME"
ml GCC ParaStationMPI
make hello.cpu.out
else
echo "The system $SYSTEMNAME is not supported!"
echo "Please load manually load environment modules for compiler and MPI and compile with the Makefile"
fi
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

int main(int argc, char** argv){
    MPI_Init(&argc, &argv);

    int size;
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Status stat;

    if(size != 2){
        if(rank == 0){
            printf("This program requires exactly 2 MPI ranks, but you are attempting to use %d! Exiting...\n", size);
        }
        MPI_Finalize();
        exit(0);
    }

    int tag = 10;
    const char *payload = "world!";

    // TODO: Implement the MPI_Send call to send the six characters of "payload" to rank 1
    if (rank == 0) {
        MPI_Send(payload, 6, MPI_CHAR, 1, tag, MPI_COMM_WORLD);
    }

    printf("\n");

    MPI_Finalize();
    return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

#define CUDA_RT_CALL(call)                                                          \
    {                                                                               \
        cudaError_t cudaStatus = call;                                              \
        if (cudaSuccess != cudaStatus)                                              \
            fprintf(stderr,                                                         \
                    "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "      \
                    "with "                                                         \
                    "%s (%d).\n",                                                   \
                    #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
    }

__global__ void hello(const char * payload){
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i == 0) {
        printf("%s", payload);
    }
}

int main(int argc, char** argv){
    MPI_Init(&argc, &argv);

    int size;
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Status stat;

    if(size != 2){
        if(rank == 0){
            printf("This program requires exactly 2 MPI ranks, but you are attempting to use %d! Exiting...\n", size);
        }
        MPI_Finalize();
        exit(0);
    }

    int tag = 10;
    const char *payload = "hello ";

    // Allocate and copy 7 bytes so the trailing null terminator of "hello " is
    // present on the device as well; printf("%s") in the kernel then stops at it.
    char * d_payload;
    CUDA_RT_CALL( cudaMalloc((void**)&d_payload, 7) );
    CUDA_RT_CALL( cudaMemcpy(d_payload, payload, 7, cudaMemcpyHostToDevice) );

    hello<<<1, 1>>>(d_payload);
    CUDA_RT_CALL( cudaPeekAtLastError() );
    CUDA_RT_CALL( cudaDeviceSynchronize() );

    // TODO: Implement the MPI_Recv() call on rank 1 to receive the "payload" from rank 0, using "d_payload" directly as the target buffer on the GPU
    // Note: passing the device pointer directly to MPI requires a CUDA-aware MPI
    // (provided via the MPI-settings/CUDA module loaded in the job script).
    if (rank == 1) {
        MPI_Recv(d_payload, 6, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &stat);
    }

    hello<<<1, 1>>>(d_payload);
    CUDA_RT_CALL( cudaPeekAtLastError() );
    CUDA_RT_CALL( cudaDeviceSynchronize() );

    printf("\n");

    CUDA_RT_CALL( cudaFree(d_payload) );
    MPI_Finalize();
    return 0;
}
#!/bin/bash -x
#SBATCH --account=training2317
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --output=slurm-out.%j
#SBATCH --error=slurm-err.%j
#SBATCH --time=00:15:00
#SBATCH --partition=devel
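# The "hetjob" directive below separates the two components of this heterogeneous
# job; the directives after it configure the second (Booster) component.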
#SBATCH hetjob
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --partition=develbooster
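# srun launches both components together: the part before the colon runs the CPU
# binary in the first (Cluster) component, the part after the colon runs the GPU
# binary in the second (Booster) component; xenv loads the listed modules for each.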
srun xenv -P -L GCC -L ParaStationMPI ./hello.cpu.out : xenv -P -L GCC -L ParaStationMPI -L MPI-settings/CUDA ./hello.gpu.out
CUCOMP = nvcc
CUFLAGS = -arch=sm_80

ifdef EBROOTOPENMPI
MPI_HOME+=$(EBROOTOPENMPI)
endif
ifdef EBROOTPSMPI
MPI_HOME+=$(EBROOTPSMPI)
endif

INCLUDES = -I$(MPI_HOME)/include
LIBRARIES = -L$(MPI_HOME)/lib -lmpi

all: hello.cpu.out hello.gpu.out

hello.cpu.out: hello-world.c
	mpicc $< -o $@

hello.gpu.out: hello.gpu.o
	$(CUCOMP) $(CUFLAGS) $(LIBRARIES) $< -o $@

hello.gpu.o: hello-world.cu
	$(CUCOMP) $(CUFLAGS) $(INCLUDES) -c $< -o $@

.PHONY: clean
clean:
	rm -f hello.cpu.out hello.gpu.out *.o
# MSA GPU Hello World
Building on the previous exercise, this exercise uses the GPU-side `printf()` function to print "hello world!" from a kernel, where the second word is received directly from the CPU process.
TODOs are included in `hello-world.c` and `hello-world.cu`, indicating how to implement the `MPI_Send()` / `MPI_Recv()` structure.
Once the function calls are implemented, execute the following on JUWELS Booster:
```bash
bash compile.sh
```
(This is equivalent to calling `make` for the GPU part of the application, with the right modules loaded.)
Also, execute the following on JUWELS Cluster:
```bash
bash compile.sh
sbatch job_msa_juwels.sh
```
(This compiles the CPU part of the application and then submits a heterogeneous job to the batch queue.)
Monitor your job with `squeue --me`. Once it has run through successfully, have a look at the output in `slurm-out.N`, with `N` being your job number.
if [[ "$SYSTEMNAME" == "juwelsbooster" ]]; then
echo "Building for $SYSTEMNAME"
ml GCC CUDA ParaStationMPI
make hello.gpu.out
elif [[ "$SYSTEMNAME" == "juwels" ]]; then
echo "Building for $SYSTEMNAME"
ml GCC ParaStationMPI
make hello.cpu.out
else
echo "The system $SYSTEMNAME is not supported!"
echo "Please load manually load environment modules for compiler and MPI and compile with the Makefile"
fi
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

int main(int argc, char** argv){
    MPI_Init(&argc, &argv);

    int size;
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Status stat;

    if(size != 2){
        if(rank == 0){
            printf("This program requires exactly 2 MPI ranks, but you are attempting to use %d! Exiting...\n", size);
        }
        MPI_Finalize();
        exit(0);
    }

    int tag = 10;
    const char *payload = "world!";

    // TODO: Implement the MPI_Send call to send the six characters of "payload" to rank 1
    if (rank == 0) {
        MPI_Send();
    }

    printf("\n");

    MPI_Finalize();
    return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

#define CUDA_RT_CALL(call)                                                          \
    {                                                                               \
        cudaError_t cudaStatus = call;                                              \
        if (cudaSuccess != cudaStatus)                                              \
            fprintf(stderr,                                                         \
                    "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "      \
                    "with "                                                         \
                    "%s (%d).\n",                                                   \
                    #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
    }

__global__ void hello(const char * payload){
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i == 0) {
        printf("%s", payload);
    }
}

int main(int argc, char** argv){
    MPI_Init(&argc, &argv);

    int size;
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Status stat;

    if(size != 2){
        if(rank == 0){
            printf("This program requires exactly 2 MPI ranks, but you are attempting to use %d! Exiting...\n", size);
        }
        MPI_Finalize();
        exit(0);
    }

    int tag = 10;
    const char *payload = "hello ";

    // Allocate and copy 7 bytes so the trailing null terminator of "hello " is
    // present on the device as well; printf("%s") in the kernel then stops at it.
    char * d_payload;
    CUDA_RT_CALL( cudaMalloc((void**)&d_payload, 7) );
    CUDA_RT_CALL( cudaMemcpy(d_payload, payload, 7, cudaMemcpyHostToDevice) );

    hello<<<1, 1>>>(d_payload);
    CUDA_RT_CALL( cudaPeekAtLastError() );
    CUDA_RT_CALL( cudaDeviceSynchronize() );

    // TODO: Implement the MPI_Recv() call on rank 1 to receive the "payload" from rank 0, using "d_payload" directly as the target buffer on the GPU
    // Note: passing the device pointer directly to MPI requires a CUDA-aware MPI
    // (provided via the MPI-settings/CUDA module loaded in the job script).
    if (rank == 1) {
        MPI_Recv();
    }

    hello<<<1, 1>>>(d_payload);
    CUDA_RT_CALL( cudaPeekAtLastError() );
    CUDA_RT_CALL( cudaDeviceSynchronize() );

    printf("\n");

    CUDA_RT_CALL( cudaFree(d_payload) );
    MPI_Finalize();
    return 0;
}
#!/bin/bash -x
#SBATCH --account=training2317
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --output=slurm-out.%j
#SBATCH --error=slurm-err.%j
#SBATCH --time=00:15:00
#SBATCH --partition=devel
#SBATCH hetjob
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --partition=develbooster
srun xenv -P -L GCC -L ParaStationMPI ./hello.cpu.out : xenv -P -L GCC -L ParaStationMPI -L MPI-settings/CUDA ./hello.gpu.out