    poisson2d.solution.c
    /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
     *
     * Redistribution and use in source and binary forms, with or without
     * modification, are permitted provided that the following conditions
     * are met:
     *  * Redistributions of source code must retain the above copyright
     *    notice, this list of conditions and the following disclaimer.
     *  * Redistributions in binary form must reproduce the above copyright
     *    notice, this list of conditions and the following disclaimer in the
     *    documentation and/or other materials provided with the distribution.
     *  * Neither the name of NVIDIA CORPORATION nor the names of its
     *    contributors may be used to endorse or promote products derived
     *    from this software without specific prior written permission.
     *
     * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
     * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
     * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     */
    
    #include <math.h>
    #include <mpi.h>
    #include <openacc.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    
    #include "common.h"
    
    // NVSHMEM
    #include <nvshmem.h>
    #include <nvshmemx.h>
    
    // Helper function to register an existing device allocation (here: NVSHMEM
    // symmetric memory) as the device copy of a host allocation
    void map(real *restrict harr, real *restrict darr, size_t size) { acc_map_data(harr, darr, size); }
    
    int main(int argc, char **argv) {
        int ny = 4096;
        int nx = 4096;
        int iter_max = 1000;
        const real tol = 1.0e-5;
    
        if (argc == 2) {
            iter_max = atoi(argv[1]);
        }
    
        int rank = 0;
        int size = 1;
    
        // Initialize MPI and determine rank and size
        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_size(MPI_COMM_WORLD, &size);
    
        // NVSHMEM: bootstrap on top of the existing MPI communicator so that every
        // MPI rank becomes an NVSHMEM PE with the same numbering
        MPI_Comm mpi_comm = MPI_COMM_WORLD;
        nvshmemx_init_attr_t attr;
        attr.mpi_comm = &mpi_comm;
        nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr);
    
    // Use one GPU per MPI rank: rank r drives device r
    #pragma acc set device_num(rank)
    
        real *restrict const A = (real *)malloc(nx * ny * sizeof(real));
        real *restrict const Aref = (real *)malloc(nx * ny * sizeof(real));
        real *restrict const Anew = (real *)malloc(nx * ny * sizeof(real));
        real *restrict const rhs = (real *)malloc(nx * ny * sizeof(real));
    
        // NVSHMEM: allocate the device copy of A from the symmetric heap and register
        // it with OpenACC so that present(A) and host_data use_device(A) resolve to it
        real *d_A = (real *)nvshmem_malloc(nx * ny * sizeof(real));
        map(A, d_A, nx * ny * sizeof(real));
    
        // set rhs
        for (int iy = 1; iy < ny - 1; iy++) {
            for (int ix = 1; ix < nx - 1; ix++) {
                const real x = -1.0 + (2.0 * ix / (nx - 1));
                const real y = -1.0 + (2.0 * iy / (ny - 1));
                rhs[iy * nx + ix] = expr(-10.0 * (x * x + y * y));
            }
        }
    
    #pragma acc enter data create(A [0:nx * ny], Aref [0:nx * ny], Anew [0:nx * ny], rhs [0:nx * ny])
    
        int ix_start = 1;
        int ix_end = (nx - 1);
    
        // Ensure correctness if ny%size != 0
        int chunk_size = ceil((1.0 * ny) / size);
    
        int iy_start = rank * chunk_size;
        int iy_end = iy_start + chunk_size;
    
        // Do not process boundaries
        iy_start = max(iy_start, 1);
        iy_end = min(iy_end, ny - 1);
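        // Example (assuming 4 ranks): ny = 4096 gives chunk_size = 1024, so rank 1
        // owns rows iy = 1024 .. 2047; the clamping above additionally drops the
        // global boundary rows 0 and ny-1 from rank 0 and rank size-1.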
    
    // OpenACC Warm-up
    #pragma acc parallel loop present(A, Aref)
        for (int iy = 0; iy < ny; iy++) {
            for (int ix = 0; ix < nx; ix++) {
                Aref[iy * nx + ix] = 0.0;
                A[iy * nx + ix] = 0.0;
            }
        }
    
        // Wait for all processes to finish Warm-up
        MPI_Barrier(MPI_COMM_WORLD);
    
        if (rank == 0) printf("Jacobi relaxation Calculation: %d x %d mesh\n", ny, nx);
    
        double runtime_serial = 0.0;
        if (rank == 0) {
            printf("Calculate reference solution and time serial execution.\n");
            // Timing of MPI rank 0 is used to calculate the speedup; do this in isolation
            double start = MPI_Wtime();
            poisson2d_serial(rank, iter_max, tol, Aref, Anew, nx, ny, rhs);
            runtime_serial = MPI_Wtime() - start;
        }
        MPI_Bcast(Aref, nx * ny, MPI_REAL_TYPE, 0, MPI_COMM_WORLD);
    
        // Wait for all processes to ensure correct timing of the parallel version
        MPI_Barrier(MPI_COMM_WORLD);
        if (rank == 0) printf("Parallel execution.\n");
        // double mpi_time = 0.0;
        double start = MPI_Wtime();
        int iter = 0;
        real error = 1.0;
    
    #pragma acc update device(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx], \
                              rhs [iy_start * nx:(iy_end - iy_start) * nx])
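        // Copy only the rows this rank owns, plus one halo row above and below, to
        // the device before iterating; the update self after the loop copies the
        // same window back to the host.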
        while (error > tol && iter < iter_max) {
            error = 0.0;
    
    #pragma acc parallel loop present(A, Anew, rhs) reduction(max : error) async
            for (int iy = iy_start; iy < iy_end; iy++) {
                for (int ix = ix_start; ix < ix_end; ix++) {
                    Anew[iy * nx + ix] =
                        -0.25 * (rhs[iy * nx + ix] - (A[iy * nx + ix + 1] + A[iy * nx + ix - 1] +
                                                      A[(iy - 1) * nx + ix] + A[(iy + 1) * nx + ix]));
                    error = fmaxr(error, fabsr(Anew[iy * nx + ix] - A[iy * nx + ix]));
                }
            }
    
    #pragma acc wait
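            // The sweep above ran asynchronously, so block here before the host reads
            // error; the allreduce below then combines the per-rank maxima so every
            // rank sees the same global error and leaves the loop in the same iteration.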
    
            real globalerror = 0.0;
            MPI_Allreduce(&error, &globalerror, 1, MPI_REAL_TYPE, MPI_MAX, MPI_COMM_WORLD);
            error = globalerror;
    
    #pragma acc parallel loop present(A, Anew) async
            for (int iy = iy_start; iy < iy_end; iy++) {
                for (int ix = ix_start; ix < ix_end; ix++) {
                    A[iy * nx + ix] = Anew[iy * nx + ix];
                }
            }
    
            // Periodic boundary conditions
            int top = (rank == 0) ? (size - 1) : rank - 1;
            int bottom = (rank == (size - 1)) ? 0 : rank + 1;
            int iy_start_top = top * chunk_size;
            int iy_end_top = iy_start_top + chunk_size;
    
            // Do not process boundaries
            iy_start_top = max(iy_start_top, 1);
            iy_end_top = min(iy_end_top, ny - 1);
    
            int iy_start_bottom = bottom * chunk_size;
            int iy_end_bottom = iy_start_bottom + chunk_size;
    
            // Do not process boundaries
            iy_start_bottom = max(iy_start_bottom, 1);
            iy_end_bottom = min(iy_end_bottom, ny - 1);
    
            // Halo exchange
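            // Inside host_data use_device(A), A resolves to the device pointer that
            // acc_map_data registered, i.e. the NVSHMEM symmetric allocation, so it can
            // be passed directly to nvshmemx_double_put_on_stream (this path assumes
            // real is double). Each rank writes its first owned row into the top
            // neighbor's bottom halo row and its last owned row into the bottom
            // neighbor's top halo row, on the same CUDA stream used by the OpenACC
            // kernels; with the wrap-around neighbors this also gives the periodic
            // boundary in y.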
    #pragma acc host_data use_device(A)
            {
                nvshmemx_double_put_on_stream(
                    (double *)(A + iy_end_top * nx + ix_start),
                    (double *)(A + iy_start * nx + ix_start), (ix_end - ix_start), top,
                    (cudaStream_t)acc_get_cuda_stream(acc_get_default_async()));
                nvshmemx_double_put_on_stream(
                    (double *)(A + (iy_start_bottom - 1) * nx + ix_start),
                    (double *)(A + (iy_end - 1) * nx + ix_start), (ix_end - ix_start), bottom,
                    (cudaStream_t)acc_get_cuda_stream(acc_get_default_async()));
            }
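            // The puts are one-sided; a barrier on the same stream guarantees that all
            // halo rows have arrived on every PE before the next sweep reads them.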
            nvshmemx_barrier_all_on_stream((cudaStream_t)acc_get_cuda_stream(acc_get_default_async()));
    
    // Periodic boundary conditions in x: copy the interior columns into the ghost
    // columns 0 and nx-1
    #pragma acc parallel loop present(A) async
            for (int iy = iy_start; iy < iy_end; iy++) {
                A[iy * nx + 0] = A[iy * nx + (nx - 2)];
                A[iy * nx + (nx - 1)] = A[iy * nx + 1];
            }
    
            if (rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
    
            iter++;
        }
    #pragma acc update self(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx]) wait
        MPI_Barrier(MPI_COMM_WORLD);
        double runtime = MPI_Wtime() - start;
    
        int errors = 0;
        if (check_results(rank, ix_start, ix_end, iy_start, iy_end, tol, A, Aref, nx)) {
            if (rank == 0) {
                printf("Num GPUs: %d.\n", size);
                printf("%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%\n",
                       ny, nx, runtime_serial, size, runtime, runtime_serial / runtime,
                       runtime_serial / (size * runtime) * 100);
                // printf( "MPI time: %8.4f s, inter GPU BW: %8.2f GiB/s\n", mpi_time,
                // (iter*4*(ix_end-ix_start)*sizeof(real))/(1024*1024*1024*mpi_time) );
            }
        } else {
            errors = -1;
        }
    
    // Release the OpenACC device data
    #pragma acc exit data delete (A, Aref, Anew, rhs)

        // NVSHMEM was bootstrapped on top of MPI, so release the symmetric
        // allocation and shut NVSHMEM down before finalizing MPI itself
        nvshmem_free(d_A);
        nvshmem_finalize();

        MPI_Finalize();

        free(rhs);
        free(Anew);
        free(Aref);
        free(A);
        return errors;
    }