Skip to content
Snippets Groups Projects
Commit fb17d31b authored by mathias wagner's avatar mathias wagner
Browse files

master files should be good now

parent 04ebbf3a
Branches
No related tags found
No related merge requests found
Showing
with 1553 additions and 542 deletions
File added
File added
"Device","Kernel","Invocations","Metric Name","Metric Description","Min","Max","Avg"
"Tesla V100-SXM2-16GB (0)","main_98_gpu",3,"gld_efficiency","Global Memory Load Efficiency",90.885714%,91.040204%,90.938132%
"Tesla V100-SXM2-16GB (0)","main_98_gpu",3,"gst_efficiency","Global Memory Store Efficiency",88.956522%,88.956522%,88.956522%
"Tesla V100-SXM2-16GB (0)","main_106_gpu",3,"gld_efficiency","Global Memory Load Efficiency",94.722222%,94.722222%,94.722222%
"Tesla V100-SXM2-16GB (0)","main_106_gpu",3,"gst_efficiency","Global Memory Store Efficiency",88.956522%,88.956522%,88.956522%
"Tesla V100-SXM2-16GB (0)","main_94_gpu__red",3,"gld_efficiency","Global Memory Load Efficiency",99.756335%,99.756335%,99.756335%
"Tesla V100-SXM2-16GB (0)","main_94_gpu__red",3,"gst_efficiency","Global Memory Store Efficiency",25.000000%,25.000000%,25.000000%
"Tesla V100-SXM2-16GB (0)","main_66_gpu",1,"gld_efficiency","Global Memory Load Efficiency",0.000000%,0.000000%,0.000000%
"Tesla V100-SXM2-16GB (0)","main_66_gpu",1,"gst_efficiency","Global Memory Store Efficiency",100.000000%,100.000000%,100.000000%
"Tesla V100-SXM2-16GB (0)","main_88_gpu",3,"gld_efficiency","Global Memory Load Efficiency",91.836772%,91.874827%,91.856345%
"Tesla V100-SXM2-16GB (0)","main_88_gpu",3,"gst_efficiency","Global Memory Store Efficiency",88.845486%,88.845486%,88.845486%
"Tesla V100-SXM2-16GB (0)","main_111_gpu",3,"gld_efficiency","Global Memory Load Efficiency",25.000000%,25.000000%,25.000000%
"Tesla V100-SXM2-16GB (0)","main_111_gpu",3,"gst_efficiency","Global Memory Store Efficiency",25.000000%,25.000000%,25.000000%
File added
File added
File added
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <math.h>
#include <mpi.h>
#include <openacc.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "common.h"
// TODO: Include necessary headers for NVSHMEM
// Registers an existing device allocation as the device copy of a host
// allocation, so OpenACC treats the pair as already present on the device
// (needed when the device buffer comes from NVSHMEM rather than OpenACC).
void map(real* restrict host_arr, real* restrict device_arr, int num_bytes) {
    // num_bytes is forwarded unchanged to acc_map_data (which takes a byte count).
    acc_map_data(host_arr, device_arr, num_bytes);
}
/*
 * Multi-GPU Jacobi relaxation for the 2D Poisson equation (MPI + OpenACC).
 * The domain is decomposed in rows across MPI ranks; halo rows are exchanged
 * with MPI_Sendrecv (to be replaced with NVSHMEM puts per the TODOs).
 * Returns 0 on success, -1 if the parallel result diverges from the serial
 * reference beyond tol.
 */
int main(int argc, char** argv) {
    // Problem size and solver controls; iter_max may be overridden by argv[1].
    int ny = 4096;
    int nx = 4096;
    int iter_max = 1000;
    const real tol = 1.0e-5;
    if (argc == 2) {
        iter_max = atoi(argv[1]);
    }

    int rank = 0;
    int size = 1;
    // Initialize MPI and determine rank and size
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    // TODO: Initialize NVSHMEM with MPI using nvshmemx_init_attr

    // Bind each rank to its own accelerator (one device per rank).
#pragma acc set device_num(rank)

    real* restrict const A = (real*)malloc(nx * ny * sizeof(real));
    real* restrict const Aref = (real*)malloc(nx * ny * sizeof(real));
    real* restrict const Anew = (real*)malloc(nx * ny * sizeof(real));
    real* restrict const rhs = (real*)malloc(nx * ny * sizeof(real));
    // TODO: Allocate symmetric device memory for A
    // real *d_A = ...
    // TODO: For OpenACC we need to map it to A and Anew so that OpenACC knows we already allocated
    // device memory for A and Anew
    // You can use the helper function map(...) above or use acc_map_data directly

    // Set the right-hand side on the interior of the grid.
    for (int iy = 1; iy < ny - 1; iy++) {
        for (int ix = 1; ix < nx - 1; ix++) {
            const real x = -1.0 + (2.0 * ix / (nx - 1));
            const real y = -1.0 + (2.0 * iy / (ny - 1));
            rhs[iy * nx + ix] = expr(-10.0 * (x * x + y * y));
        }
    }

#pragma acc enter data create(A [0:nx * ny], Aref [0:nx * ny], Anew [0:nx * ny], rhs [0:nx * ny])

    int ix_start = 1;
    int ix_end = (nx - 1);

    // Ensure correctness if ny%size != 0
    int chunk_size = ceil((1.0 * ny) / size);
    int iy_start = rank * chunk_size;
    int iy_end = iy_start + chunk_size;
    // Do not process boundaries
    iy_start = max(iy_start, 1);
    iy_end = min(iy_end, ny - 1);

    // OpenACC Warm-up
#pragma acc parallel loop present(A, Aref)
    for (int iy = 0; iy < ny; iy++) {
        for (int ix = 0; ix < nx; ix++) {
            Aref[iy * nx + ix] = 0.0;
            A[iy * nx + ix] = 0.0;
        }
    }

    // TODO: Warming up MPI / CUDA IPC is not needed with NVSHMEM - remove that part
    // MPI Warm-up to establish CUDA IPC connections
    for (int i = 0; i < 2; ++i) {
        int top = (rank == 0) ? (size - 1) : rank - 1;
        int bottom = (rank == (size - 1)) ? 0 : rank + 1;
#pragma acc host_data use_device(A)
        {
            // 1. Send row iy_start (first modified row) to top, receive lower boundary (iy_end)
            // from bottom
            MPI_Sendrecv(A + iy_start * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, top, 0,
                         A + iy_end * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, bottom, 0,
                         MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            // 2. Send row (iy_end-1) (last modified row) to bottom, receive upper boundary
            // (iy_start-1) from top
            MPI_Sendrecv(A + (iy_end - 1) * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE,
                         bottom, 0, A + (iy_start - 1) * nx + ix_start, (ix_end - ix_start),
                         MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        }
    }
    // Wait for all processes to finish Warm-up
    MPI_Barrier(MPI_COMM_WORLD);

    if (rank == 0) printf("Jacobi relaxation Calculation: %d x %d mesh\n", ny, nx);

    double runtime_serial = 0.0;
    if (rank == 0) {
        printf("Calculate reference solution and time serial execution.\n");
        // Timing of MPI rank 0 is used to calculate speedup do this in isolation
        double start = MPI_Wtime();
        poisson2d_serial(rank, iter_max, tol, Aref, Anew, nx, ny, rhs);
        runtime_serial = MPI_Wtime() - start;
    }
    // All ranks need the reference solution for the final correctness check.
    MPI_Bcast(Aref, nx * ny, MPI_REAL_TYPE, 0, MPI_COMM_WORLD);

    // Wait for all processes to ensure correct timing of the parallel version
    MPI_Barrier(MPI_COMM_WORLD);
    if (rank == 0) printf("Parallel execution.\n");

    double mpi_time = 0.0;
    double start = MPI_Wtime();
    int iter = 0;
    real error = 1.0;

    // Upload this rank's chunk plus one halo row on each side.
#pragma acc update device(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx], \
                          rhs [iy_start * nx:(iy_end - iy_start) * nx])
    while (error > tol && iter < iter_max) {
        error = 0.0;

        // Jacobi sweep. Fix: reduction(max:error) is required -- without it,
        // scalars are firstprivate in an OpenACC parallel construct and the
        // device-side updates to error would never reach the host.
#pragma acc parallel loop present(A, Anew, rhs) reduction(max : error)
        for (int iy = iy_start; iy < iy_end; iy++) {
            for (int ix = ix_start; ix < ix_end; ix++) {
                Anew[iy * nx + ix] =
                    -0.25 * (rhs[iy * nx + ix] - (A[iy * nx + ix + 1] + A[iy * nx + ix - 1] +
                                                  A[(iy - 1) * nx + ix] + A[(iy + 1) * nx + ix]));
                error = fmaxr(error, fabsr(Anew[iy * nx + ix] - A[iy * nx + ix]));
            }
        }

        // Combine the per-rank residuals into a global maximum.
        real globalerror = 0.0;
        MPI_Allreduce(&error, &globalerror, 1, MPI_REAL_TYPE, MPI_MAX, MPI_COMM_WORLD);
        error = globalerror;

#pragma acc parallel loop present(A, Anew)
        for (int iy = iy_start; iy < iy_end; iy++) {
            for (int ix = ix_start; ix < ix_end; ix++) {
                A[iy * nx + ix] = Anew[iy * nx + ix];
            }
        }

        // Periodic boundary conditions
        int top = (rank == 0) ? (size - 1) : rank - 1;
        int bottom = (rank == (size - 1)) ? 0 : rank + 1;
#pragma acc host_data use_device(A)
        {
            double start_mpi = MPI_Wtime();
            // TODO: Replace both MPI calls with one-sided nvshmem_<type>_put
            // make sure to put data in the right location on the remote side
            // 1. Send row iy_start (first modified row) to top, receive lower boundary (iy_end)
            // from bottom
            MPI_Sendrecv(A + iy_start * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, top, 0,
                         A + iy_end * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, bottom, 0,
                         MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            // 2. Send row (iy_end-1) (last modified row) to bottom, receive upper boundary
            // (iy_start-1) from top
            MPI_Sendrecv(A + (iy_end - 1) * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE,
                         bottom, 0, A + (iy_start - 1) * nx + ix_start, (ix_end - ix_start),
                         MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            // TODO: Add a barrier to make sure data had arrived from remote
            mpi_time += MPI_Wtime() - start_mpi;
        }

        // Periodic boundary in x: wrap the outermost columns onto the
        // opposite interior columns.
#pragma acc parallel loop present(A)
        for (int iy = iy_start; iy < iy_end; iy++) {
            A[iy * nx + 0] = A[iy * nx + (nx - 2)];
            A[iy * nx + (nx - 1)] = A[iy * nx + 1];
        }

        if (rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);

        iter++;
    }

    // Copy this rank's chunk (plus halos) back to the host for checking.
#pragma acc update self(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx])
    MPI_Barrier(MPI_COMM_WORLD);
    double runtime = MPI_Wtime() - start;

    int errors = 0;
    if (check_results(rank, ix_start, ix_end, iy_start, iy_end, tol, A, Aref, nx)) {
        if (rank == 0) {
            printf("Num GPUs: %d.\n", size);
            // Fix: a literal percent sign requires "%%" -- a lone '%' before
            // '\n' is an incomplete conversion spec and undefined behavior.
            printf("%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%%\n",
                   ny, nx, runtime_serial, size, runtime, runtime_serial / runtime,
                   runtime_serial / (size * runtime) * 100);
            // 4 transfers per iteration: two sends and two receives of one row each.
            printf(
                "MPI time: %8.4f s, inter GPU BW: %8.2f GiB/s\n", mpi_time,
                (iter * 4 * (ix_end - ix_start) * sizeof(real)) / (1024 * 1024 * 1024 * mpi_time));
        }
    } else {
        errors = -1;
    }

#pragma acc exit data delete (A, Aref, Anew, rhs)
    MPI_Finalize();
    free(rhs);
    free(Anew);
    free(Aref);
    free(A);
    // TODO: free shmem memory
    return errors;
}
File added
File added
File added
File added
File added
File added
This diff is collapsed.
4-GPU/HandsOn/.master/resources/Solution3.png

629 KiB

4-GPU/HandsOn/.master/resources/Solution4.png

641 KiB

4-GPU/HandsOn/.master/resources/Solution5.png

570 KiB

4-GPU/HandsOn/.master/resources/Solution6.png

567 KiB

4-GPU/HandsOn/.master/resources/eval.png

621 B

---
Language: Cpp
# BasedOnStyle: Google
AccessModifierOffset: -1
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlines: Left
AlignOperands: true
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: All
AllowShortIfStatementsOnASingleLine: true
AllowShortLoopsOnASingleLine: true
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: Yes
BinPackArguments: true
BinPackParameters: true
BraceWrapping:
AfterClass: false
AfterControlStatement: false
AfterEnum: false
AfterFunction: false
AfterNamespace: false
AfterObjCDeclaration: false
AfterStruct: false
AfterUnion: false
AfterExternBlock: false
BeforeCatch: false
BeforeElse: false
IndentBraces: false
SplitEmptyFunction: true
SplitEmptyRecord: true
SplitEmptyNamespace: true
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Attach
BreakBeforeInheritanceComma: false
BreakInheritanceList: BeforeColon
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
BreakConstructorInitializers: BeforeColon
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: true
ColumnLimit: 100
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerAllOnOneLineOrOnePerLine: true
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DerivePointerAlignment: true
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
ForEachMacros:
- foreach
- Q_FOREACH
- BOOST_FOREACH
IncludeBlocks: Preserve
IncludeCategories:
- Regex: '^<ext/.*\.h>'
Priority: 2
- Regex: '^<.*\.h>'
Priority: 1
- Regex: '^<.*'
Priority: 2
- Regex: '.*'
Priority: 3
IncludeIsMainRegex: '([-_](test|unittest))?$'
IndentCaseLabels: true
IndentPPDirectives: None
IndentWidth: 4
IndentWrappedFunctionNames: false
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBinPackProtocolList: Never
ObjCBlockIndentWidth: 4
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: true
PenaltyBreakAssignment: 2
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyBreakTemplateDeclaration: 10
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Left
RawStringFormats:
- Language: Cpp
Delimiters:
- cc
- CC
- cpp
- Cpp
- CPP
- 'c++'
- 'C++'
CanonicalDelimiter: ''
BasedOnStyle: google
- Language: TextProto
Delimiters:
- pb
- PB
- proto
- PROTO
EnclosingFunctions:
- EqualsProto
- EquivToProto
- PARSE_PARTIAL_TEXT_PROTO
- PARSE_TEST_PROTO
- PARSE_TEXT_PROTO
- ParseTextOrDie
- ParseTextProtoOrDie
CanonicalDelimiter: ''
BasedOnStyle: google
ReflowComments: true
SortIncludes: true
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeCpp11BracedList: false
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeParens: ControlStatements
SpaceBeforeRangeBasedForLoopColon: true
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 2
SpacesInAngles: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Auto
TabWidth: 8
UseTab: Never
...
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please to comment