/* poisson2d.solution.c */
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <math.h>
#include <mpi.h>
#include <openacc.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "common.h"
// NVSHMEM
#include <nvshmem.h>
#include <nvshmemx.h>
// Helper function to register an existing device allocation (e.g. from nvshmem_malloc)
// as the OpenACC device copy of the corresponding host allocation
void map(real *restrict harr, real *restrict darr, size_t size) { acc_map_data(harr, darr, size); }
int main(int argc, char **argv) {
int ny = 4096;
int nx = 4096;
int iter_max = 1000;
const real tol = 1.0e-5;
if (argc == 2) {
iter_max = atoi(argv[1]);
}
int rank = 0;
int size = 1;
// Initialize MPI and determine rank and size
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
// NVSHMEM
MPI_Comm mpi_comm = MPI_COMM_WORLD;
nvshmemx_init_attr_t attr;
attr.mpi_comm = &mpi_comm;
nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr);
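    // NVSHMEM is bootstrapped from the MPI communicator, so each MPI rank becomes the
    // NVSHMEM PE with the same id.
    // Select the GPU for this rank (assumes ranks map one-to-one onto visible devices).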
#pragma acc set device_num(rank)
real *restrict const A = (real *)malloc(nx * ny * sizeof(real));
real *restrict const Aref = (real *)malloc(nx * ny * sizeof(real));
real *restrict const Anew = (real *)malloc(nx * ny * sizeof(real));
real *restrict const rhs = (real *)malloc(nx * ny * sizeof(real));
// NVSHMEM
real *d_A = (real *)nvshmem_malloc(nx * ny * sizeof(real));
map(A, d_A, nx * ny * sizeof(real));
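    // d_A lives in the NVSHMEM symmetric heap, so remote PEs can write into it with
    // one-sided puts; mapping it to A makes it the OpenACC device copy of A.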
// set rhs
for (int iy = 1; iy < ny - 1; iy++) {
for (int ix = 1; ix < nx - 1; ix++) {
const real x = -1.0 + (2.0 * ix / (nx - 1));
const real y = -1.0 + (2.0 * iy / (ny - 1));
rhs[iy * nx + ix] = expr(-10.0 * (x * x + y * y));
}
}
#pragma acc enter data create(A [0:nx * ny], Aref [0:nx * ny], Anew [0:nx * ny], rhs [0:nx * ny])
int ix_start = 1;
int ix_end = (nx - 1);
// Ensure correctness if ny%size != 0
int chunk_size = ceil((1.0 * ny) / size);
int iy_start = rank * chunk_size;
int iy_end = iy_start + chunk_size;
// Do not process boundaries
iy_start = max(iy_start, 1);
iy_end = min(iy_end, ny - 1);
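    // This rank updates rows [iy_start, iy_end); rows iy_start - 1 and iy_end are halo
    // rows that the neighboring ranks fill during the halo exchange.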
// OpenACC Warm-up
#pragma acc parallel loop present(A, Aref)
for (int iy = 0; iy < ny; iy++) {
for (int ix = 0; ix < nx; ix++) {
Aref[iy * nx + ix] = 0.0;
A[iy * nx + ix] = 0.0;
}
}
// Wait for all processes to finish Warm-up
MPI_Barrier(MPI_COMM_WORLD);
if (rank == 0) printf("Jacobi relaxation Calculation: %d x %d mesh\n", ny, nx);
double runtime_serial = 0.0;
if (rank == 0) {
printf("Calculate reference solution and time serial execution.\n");
        // Timing of MPI rank 0 is used to calculate speedup; do this in isolation.
double start = MPI_Wtime();
poisson2d_serial(rank, iter_max, tol, Aref, Anew, nx, ny, rhs);
runtime_serial = MPI_Wtime() - start;
}
MPI_Bcast(Aref, nx * ny, MPI_REAL_TYPE, 0, MPI_COMM_WORLD);
// Wait for all processes to ensure correct timing of the parallel version
MPI_Barrier(MPI_COMM_WORLD);
if (rank == 0) printf("Parallel execution.\n");
// double mpi_time = 0.0;
double start = MPI_Wtime();
int iter = 0;
real error = 1.0;
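    // Copy this rank's rows of A (plus one halo row on each side) and its rows of rhs
    // to the device before starting the iteration.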
#pragma acc update device(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx], \
rhs [iy_start * nx:(iy_end - iy_start) * nx])
while (error > tol && iter < iter_max) {
error = 0.0;
#pragma acc parallel loop present(A, Anew, rhs) reduction(max : error) async
for (int iy = iy_start; iy < iy_end; iy++) {
for (int ix = ix_start; ix < ix_end; ix++) {
Anew[iy * nx + ix] =
-0.25 * (rhs[iy * nx + ix] - (A[iy * nx + ix + 1] + A[iy * nx + ix - 1] +
A[(iy - 1) * nx + ix] + A[(iy + 1) * nx + ix]));
error = fmaxr(error, fabsr(Anew[iy * nx + ix] - A[iy * nx + ix]));
}
}
#pragma acc wait
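        // Combine the per-rank residuals into a global maximum so every rank applies
        // the same convergence test.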
real globalerror = 0.0;
MPI_Allreduce(&error, &globalerror, 1, MPI_REAL_TYPE, MPI_MAX, MPI_COMM_WORLD);
error = globalerror;
#pragma acc parallel loop present(A, Anew) async
for (int iy = iy_start; iy < iy_end; iy++) {
for (int ix = ix_start; ix < ix_end; ix++) {
A[iy * nx + ix] = Anew[iy * nx + ix];
}
}
// Periodic boundary conditions
int top = (rank == 0) ? (size - 1) : rank - 1;
int bottom = (rank == (size - 1)) ? 0 : rank + 1;
int iy_start_top = top * chunk_size;
int iy_end_top = iy_start_top + chunk_size;
// Do not process boundaries
iy_start_top = max(iy_start_top, 1);
iy_end_top = min(iy_end_top, ny - 1);
int iy_start_bottom = bottom * chunk_size;
int iy_end_bottom = iy_start_bottom + chunk_size;
// Do not process boundaries
iy_start_bottom = max(iy_start_bottom, 1);
iy_end_bottom = min(iy_end_bottom, ny - 1);
// Halo exchange
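        // Each rank pushes its first owned row into the top neighbor's bottom halo row
        // (iy_end_top) and its last owned row into the bottom neighbor's top halo row
        // (iy_start_bottom - 1) with one-sided NVSHMEM puts on the OpenACC async stream.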
#pragma acc host_data use_device(A)
{
nvshmemx_double_put_on_stream(
(double *)(A + iy_end_top * nx + ix_start),
(double *)(A + iy_start * nx + ix_start), (ix_end - ix_start), top,
(cudaStream_t)acc_get_cuda_stream(acc_get_default_async()));
nvshmemx_double_put_on_stream(
(double *)(A + (iy_start_bottom - 1) * nx + ix_start),
(double *)(A + (iy_end - 1) * nx + ix_start), (ix_end - ix_start), bottom,
(cudaStream_t)acc_get_cuda_stream(acc_get_default_async()));
}
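        // Stream-ordered barrier: all PEs' puts have completed before any work enqueued
        // on this stream afterwards reads the halo rows.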
nvshmemx_barrier_all_on_stream((cudaStream_t)acc_get_cuda_stream(acc_get_default_async()));
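        // Apply periodic boundary conditions in x by wrapping the first and last columns.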
#pragma acc parallel loop present(A) async
for (int iy = iy_start; iy < iy_end; iy++) {
A[iy * nx + 0] = A[iy * nx + (nx - 2)];
A[iy * nx + (nx - 1)] = A[iy * nx + 1];
}
if (rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
iter++;
}
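    // Copy this rank's rows (including the halo rows) back to the host for the result check.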
#pragma acc update self(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx]) wait
MPI_Barrier(MPI_COMM_WORLD);
double runtime = MPI_Wtime() - start;
int errors = 0;
if (check_results(rank, ix_start, ix_end, iy_start, iy_end, tol, A, Aref, nx)) {
if (rank == 0) {
printf("Num GPUs: %d.\n", size);
printf("%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%\n",
ny, nx, runtime_serial, size, runtime, runtime_serial / runtime,
runtime_serial / (size * runtime) * 100);
// printf( "MPI time: %8.4f s, inter GPU BW: %8.2f GiB/s\n", mpi_time,
// (iter*4*(ix_end-ix_start)*sizeof(real))/(1024*1024*1024*mpi_time) );
}
} else {
errors = -1;
}
#pragma acc exit data delete (A, Aref, Anew, rhs)
    // Free the symmetric allocation and shut down NVSHMEM before MPI_Finalize, since
    // NVSHMEM was bootstrapped from the MPI communicator.
    nvshmem_free(d_A);
    nvshmem_finalize();
    MPI_Finalize();
    free(rhs);
    free(Anew);
    free(Aref);
    free(A);
return errors;
}