    poisson2d.solution.c
    /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
     *
     * Redistribution and use in source and binary forms, with or without
     * modification, are permitted provided that the following conditions
     * are met:
     *  * Redistributions of source code must retain the above copyright
     *    notice, this list of conditions and the following disclaimer.
     *  * Redistributions in binary form must reproduce the above copyright
     *    notice, this list of conditions and the following disclaimer in the
     *    documentation and/or other materials provided with the distribution.
     *  * Neither the name of NVIDIA CORPORATION nor the names of its
     *    contributors may be used to endorse or promote products derived
     *    from this software without specific prior written permission.
     *
     * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
     * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
     * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     */
    
    #include <math.h>
    #include <mpi.h>
    #include <openacc.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    
    #include "common.h"
    
    // NVSHMEM
    #include <nvshmem.h>
    #include <nvshmemx.h>
    
    // Helper function to register an existing device allocation (here: NVSHMEM
    // symmetric memory) as the device copy of a host allocation
    void map(real *restrict harr, real *restrict darr, size_t size) { acc_map_data(harr, darr, size); }
    
    int main(int argc, char **argv) {
        int ny = 4096;
        int nx = 4096;
        int iter_max = 1000;
        const real tol = 1.0e-5;
    
        if (argc == 2) {
            iter_max = atoi(argv[1]);
        }
    
        int rank = 0;
        int size = 1;
    
        // Initialize MPI and determine rank and size
        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_size(MPI_COMM_WORLD, &size);
    
        // NVSHMEM: bootstrap on top of the existing MPI communicator so that every
        // MPI rank becomes an NVSHMEM PE with the same numbering
        MPI_Comm mpi_comm = MPI_COMM_WORLD;
        nvshmemx_init_attr_t attr;
        attr.mpi_comm = &mpi_comm;
        nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr);
    
    // Use one GPU per MPI rank: rank r drives device r
    #pragma acc set device_num(rank)
    
        real *restrict const A = (real *)malloc(nx * ny * sizeof(real));
        real *restrict const Aref = (real *)malloc(nx * ny * sizeof(real));
        real *restrict const Anew = (real *)malloc(nx * ny * sizeof(real));
        real *restrict const rhs = (real *)malloc(nx * ny * sizeof(real));
    
        // NVSHMEM: allocate the device copy of A from the symmetric heap and register
        // it with OpenACC so that present(A) and host_data use_device(A) resolve to it
        real *d_A = (real *)nvshmem_malloc(nx * ny * sizeof(real));
        map(A, d_A, nx * ny * sizeof(real));
    
        // set rhs
        for (int iy = 1; iy < ny - 1; iy++) {
            for (int ix = 1; ix < nx - 1; ix++) {
                const real x = -1.0 + (2.0 * ix / (nx - 1));
                const real y = -1.0 + (2.0 * iy / (ny - 1));
                rhs[iy * nx + ix] = expr(-10.0 * (x * x + y * y));
            }
        }
    
    #pragma acc enter data create(A [0:nx * ny], Aref [0:nx * ny], Anew [0:nx * ny], rhs [0:nx * ny])
    
        int ix_start = 1;
        int ix_end = (nx - 1);
    
        // Ensure correctness if ny%size != 0
        int chunk_size = ceil((1.0 * ny) / size);
    
        int iy_start = rank * chunk_size;
        int iy_end = iy_start + chunk_size;
    
        // Do not process boundaries
        iy_start = max(iy_start, 1);
        iy_end = min(iy_end, ny - 1);
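        // Example (assuming 4 ranks): ny = 4096 gives chunk_size = 1024, so rank 1
        // owns rows iy = 1024 .. 2047; the clamping above additionally drops the
        // global boundary rows 0 and ny-1 from rank 0 and rank size-1.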
    
    // OpenACC Warm-up
    #pragma acc parallel loop present(A, Aref)
        for (int iy = 0; iy < ny; iy++) {
            for (int ix = 0; ix < nx; ix++) {
                Aref[iy * nx + ix] = 0.0;
                A[iy * nx + ix] = 0.0;
            }
        }
    
        // Wait for all processes to finish Warm-up
        MPI_Barrier(MPI_COMM_WORLD);
    
        if (rank == 0) printf("Jacobi relaxation Calculation: %d x %d mesh\n", ny, nx);
    
        double runtime_serial = 0.0;
        if (rank == 0) {
            printf("Calculate reference solution and time serial execution.\n");
            // Timing of MPI rank 0 is used to calculate the speedup; do this in isolation
            double start = MPI_Wtime();
            poisson2d_serial(rank, iter_max, tol, Aref, Anew, nx, ny, rhs);
            runtime_serial = MPI_Wtime() - start;
        }
        MPI_Bcast(Aref, nx * ny, MPI_REAL_TYPE, 0, MPI_COMM_WORLD);
    
        // Wait for all processes to ensure correct timing of the parallel version
        MPI_Barrier(MPI_COMM_WORLD);
        if (rank == 0) printf("Parallel execution.\n");
        // double mpi_time = 0.0;
        double start = MPI_Wtime();
        int iter = 0;
        real error = 1.0;
    
    #pragma acc update device(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx], \
                              rhs [iy_start * nx:(iy_end - iy_start) * nx])
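        // Copy only the rows this rank owns, plus one halo row above and below, to
        // the device before iterating; the update self after the loop copies the
        // same window back to the host.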
        while (error > tol && iter < iter_max) {
            error = 0.0;
    
    #pragma acc parallel loop present(A, Anew, rhs) reduction(max : error) async
            for (int iy = iy_start; iy < iy_end; iy++) {
                for (int ix = ix_start; ix < ix_end; ix++) {
                    Anew[iy * nx + ix] =
                        -0.25 * (rhs[iy * nx + ix] - (A[iy * nx + ix + 1] + A[iy * nx + ix - 1] +
                                                      A[(iy - 1) * nx + ix] + A[(iy + 1) * nx + ix]));
                    error = fmaxr(error, fabsr(Anew[iy * nx + ix] - A[iy * nx + ix]));
                }
            }
    
    #pragma acc wait
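            // The sweep above ran asynchronously, so block here before the host reads
            // error; the allreduce below then combines the per-rank maxima so every
            // rank sees the same global error and leaves the loop in the same iteration.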
    
            real globalerror = 0.0;
            MPI_Allreduce(&error, &globalerror, 1, MPI_REAL_TYPE, MPI_MAX, MPI_COMM_WORLD);
            error = globalerror;
    
    #pragma acc parallel loop present(A, Anew) async
            for (int iy = iy_start; iy < iy_end; iy++) {
                for (int ix = ix_start; ix < ix_end; ix++) {
                    A[iy * nx + ix] = Anew[iy * nx + ix];
                }
            }
    
            // Periodic boundary conditions
            int top = (rank == 0) ? (size - 1) : rank - 1;
            int bottom = (rank == (size - 1)) ? 0 : rank + 1;
            int iy_start_top = top * chunk_size;
            int iy_end_top = iy_start_top + chunk_size;
    
            // Do not process boundaries
            iy_start_top = max(iy_start_top, 1);
            iy_end_top = min(iy_end_top, ny - 1);
    
            int iy_start_bottom = bottom * chunk_size;
            int iy_end_bottom = iy_start_bottom + chunk_size;
    
            // Do not process boundaries
            iy_start_bottom = max(iy_start_bottom, 1);
            iy_end_bottom = min(iy_end_bottom, ny - 1);
    
            // Halo exchange
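            // Inside host_data use_device(A), A resolves to the device pointer that
            // acc_map_data registered, i.e. the NVSHMEM symmetric allocation, so it can
            // be passed directly to nvshmemx_double_put_on_stream (this path assumes
            // real is double). Each rank writes its first owned row into the top
            // neighbor's bottom halo row and its last owned row into the bottom
            // neighbor's top halo row, on the same CUDA stream used by the OpenACC
            // kernels; with the wrap-around neighbors this also gives the periodic
            // boundary in y.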
    #pragma acc host_data use_device(A)
            {
                nvshmemx_double_put_on_stream(
                    (double *)(A + iy_end_top * nx + ix_start),
                    (double *)(A + iy_start * nx + ix_start), (ix_end - ix_start), top,
                    (cudaStream_t)acc_get_cuda_stream(acc_get_default_async()));
                nvshmemx_double_put_on_stream(
                    (double *)(A + (iy_start_bottom - 1) * nx + ix_start),
                    (double *)(A + (iy_end - 1) * nx + ix_start), (ix_end - ix_start), bottom,
                    (cudaStream_t)acc_get_cuda_stream(acc_get_default_async()));
            }
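            // The puts are one-sided; a barrier on the same stream guarantees that all
            // halo rows have arrived on every PE before the next sweep reads them.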
            nvshmemx_barrier_all_on_stream((cudaStream_t)acc_get_cuda_stream(acc_get_default_async()));
    
    // Periodic boundary conditions in x: copy the interior columns into the ghost
    // columns 0 and nx-1
    #pragma acc parallel loop present(A) async
            for (int iy = iy_start; iy < iy_end; iy++) {
                A[iy * nx + 0] = A[iy * nx + (nx - 2)];
                A[iy * nx + (nx - 1)] = A[iy * nx + 1];
            }
    
            if (rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
    
            iter++;
        }
    #pragma acc update self(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx]) wait
        MPI_Barrier(MPI_COMM_WORLD);
        double runtime = MPI_Wtime() - start;
    
        int errors = 0;
        if (check_results(rank, ix_start, ix_end, iy_start, iy_end, tol, A, Aref, nx)) {
            if (rank == 0) {
                printf("Num GPUs: %d.\n", size);
                printf("%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%\n",
                       ny, nx, runtime_serial, size, runtime, runtime_serial / runtime,
                       runtime_serial / (size * runtime) * 100);
                // printf( "MPI time: %8.4f s, inter GPU BW: %8.2f GiB/s\n", mpi_time,
                // (iter*4*(ix_end-ix_start)*sizeof(real))/(1024*1024*1024*mpi_time) );
            }
        } else {
            errors = -1;
        }
    
    // Release the OpenACC device data
    #pragma acc exit data delete (A, Aref, Anew, rhs)

        // NVSHMEM was bootstrapped on top of MPI, so release the symmetric
        // allocation and shut NVSHMEM down before finalizing MPI itself
        nvshmem_free(d_A);
        nvshmem_finalize();

        MPI_Finalize();

        free(rhs);
        free(Anew);
        free(Aref);
        free(A);
        return errors;
    }